目录
概率类模型的评估指标
布里尔分数Brier Score
对数损失
可靠性曲线
预测概率的直方图
多项式朴素贝叶斯
sklearn中的多项式贝叶斯
伯努利朴素贝叶斯
朴素贝叶斯的样本不均衡问题
补集朴素贝叶斯
贝叶斯做文本分类
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression as LR
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import brier_score_loss
from sklearn.metrics import log_loss
from sklearn.datasets import load_digits
from sklearn.model_selection import train_test_split
# Compare log loss of three classifiers on the handwritten-digits data.
X = load_digits().data
y = load_digits().target
Xtrain, Xtest, Ytrain, Ytest = train_test_split(X, y, test_size=0.3, random_state=0)
estimators = [
    GaussianNB().fit(Xtrain, Ytrain),
    LR(solver='lbfgs', max_iter=3000, multi_class='auto').fit(Xtrain, Ytrain),
    SVC(kernel='rbf').fit(Xtrain, Ytrain),
]
name = ['GaussianNB', 'LogisticRegression', 'SVC']
for i, estimator in enumerate(estimators):
    if hasattr(estimator, 'predict_proba'):  # model exposes class probabilities directly
        prob = estimator.predict_proba(Xtest)
    else:
        # No predict_proba (SVC without probability=True): min-max scale the
        # decision scores into [0, 1]. Compute decision_function ONCE instead
        # of four separate times as the original did.
        # NOTE(review): the scaled scores do not sum to 1 per row; log_loss
        # renormalizes rows internally — confirm this comparison is intended.
        dec = estimator.decision_function(Xtest)
        prob = (dec - dec.min()) / (dec.max() - dec.min())
    print("{}'s log_loss is:{}\n".format(name[i], log_loss(Ytest, prob)))
import numpy as np
import matplotlib.pyplot as plt
from sklearn.datasets import make_classification as mc
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression as LR
from sklearn.metrics import brier_score_loss
from sklearn.calibration import calibration_curve
from sklearn.model_selection import train_test_split
# Build a binary classification problem and draw reliability (calibration) curves.
X, y = mc(n_samples=100000, n_features=20  # 20 features in total
          , n_classes=2                    # binary labels
          , n_informative=2                # two carry most of the information
          , n_redundant=10                 # ten redundant features
          , random_state=42)
# Sample size is large enough: keep only 10% for training.
Xtrain, Xtest, Ytrain, Ytest = train_test_split(X, y, test_size=0.9, random_state=0)
estimators = [GaussianNB().fit(Xtrain, Ytrain)
              , LR(solver='lbfgs', max_iter=3000, multi_class='auto').fit(Xtrain, Ytrain)
              , SVC(kernel='rbf').fit(Xtrain, Ytrain)
              ]
fig, ax = plt.subplots(figsize=(8, 6))
name = ['GaussianNB', 'LogisticRegression', 'SVC']
for i, estimator in enumerate(estimators):
    if hasattr(estimator, 'predict_proba'):
        proba = estimator.predict_proba(Xtest)[:, 1]
    else:
        # Min-max scale decision scores into [0, 1] as pseudo-probabilities.
        prob = estimator.decision_function(Xtest)
        proba = (prob - prob.min()) / (prob.max() - prob.min())
    trueproba, predproba = calibration_curve(Ytest, proba, n_bins=10)
    bls = brier_score_loss(Ytest, proba, pos_label=1)
    # BUG FIX: calibration_curve returns (prob_true, prob_pred). The axis
    # labels below put mean PREDICTED probability on x and the observed
    # fraction of positives on y, so plot (predproba, trueproba) — the
    # original plotted them swapped, contradicting its own labels.
    ax.plot(predproba, trueproba, 'o-', label='{}\'s brier_score_loss:{:.4f}'.format(name[i], bls))
ax.set_xlabel('Mean Predict Probability')
ax.set_ylabel('True Probability for Class 1')
ax.set_ylim(-0.05, 1.05)
# Typo fixed: 'Perfectly Clibration' -> 'Perfectly Calibrated'.
ax.plot([0, 1], [0, 1], '--', c='k', label='Perfectly Calibrated')
ax.legend(loc='best')
plt.show()
import numpy as np
import matplotlib.pyplot as plt
from sklearn.datasets import make_classification as mc
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression as LR
from sklearn.metrics import brier_score_loss
from sklearn.calibration import calibration_curve
from sklearn.model_selection import train_test_split
# Histograms of the predicted probabilities produced by each classifier.
X, y = mc(n_samples=100000, n_features=20,  # 20 features in total
          n_classes=2,                      # binary labels
          n_informative=2,                  # two carry most of the information
          n_redundant=10,                   # ten redundant features
          random_state=42)
# Sample size is large enough: keep only 10% for training.
Xtrain, Xtest, Ytrain, Ytest = train_test_split(X, y, test_size=0.9, random_state=0)
estimators = [GaussianNB().fit(Xtrain, Ytrain),
              LR(solver='lbfgs', max_iter=3000, multi_class='auto').fit(Xtrain, Ytrain),
              SVC(kernel='rbf').fit(Xtrain, Ytrain)]
fig, ax1 = plt.subplots(figsize=(8, 6))
name = ['GaussianNB', 'LogisticRegression', 'SVC']
for i, estimator in enumerate(estimators):
    if hasattr(estimator, 'predict_proba'):
        # Probability of the positive class.
        proba = estimator.predict_proba(Xtest)[:, 1]
    else:
        # Min-max scale decision scores into [0, 1] as pseudo-probabilities.
        scores = estimator.decision_function(Xtest)
        proba = (scores - scores.min()) / (scores.max() - scores.min())
    ax1.hist(proba,
             bins=10,
             label=name[i],
             histtype="step",   # outline-only bars so curves can overlap
             lw=2,              # line width of each bar outline
             density=True)
ax1.set_xlabel('Distribution of Probability')
ax1.set_ylabel('Mean Predict Probability')
ax1.set_xlim([-0.05, 1.05])
ax1.set_xticks([0, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1])
ax1.legend()
plt.show()
from sklearn.preprocessing import MinMaxScaler
from sklearn.naive_bayes import MultinomialNB #多项式朴素贝叶斯
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import KBinsDiscretizer
from sklearn.datasets import make_blobs
from sklearn.metrics import brier_score_loss
import numpy as np
# Multinomial naive Bayes on two Gaussian blobs: raw (scaled) continuous
# features fail badly; binned one-hot features work well.
class_1 = 500  # 500 samples per class
class_2 = 500
centers = [[0.0, 0.0], [2.1, 2.1]]
cluster_std = [0.5, 0.5]
X, y = make_blobs(n_samples=[class_1, class_2],
                  centers=centers,
                  cluster_std=cluster_std,
                  random_state=10, shuffle=False)  # keep samples in class order
Xtrain, Xtest, Ytrain, Ytest = train_test_split(X, y,
                                                test_size=0.3,
                                                random_state=420)
# Multinomial NB rejects negative feature values, so min-max scale to [0, 1].
mms = MinMaxScaler().fit(Xtrain)
Xtrain = mms.transform(Xtrain)
Xtest = mms.transform(Xtest)
# Fit the multinomial naive Bayes classifier on the scaled data.
mnb = MultinomialNB().fit(Xtrain, Ytrain)
print(mnb.class_log_prior_)          # log prior per class -- always negative
print(np.exp(mnb.class_log_prior_))  # the actual prior probabilities
print(mnb.score(Xtest, Ytest))
# Expected output:
#   [-0.69029411 -0.69600841]
#   [0.50142857 0.49857143]
#   0.49666666666666665
# Discretize the continuous features into 10 one-hot bins and re-check accuracy.
kbs = KBinsDiscretizer(n_bins=10, encode='onehot').fit(Xtrain)
Xtrain_ = kbs.transform(Xtrain)
Xtest_ = kbs.transform(Xtest)
mnb_ = MultinomialNB().fit(Xtrain_, Ytrain)
print(mnb_.score(Xtest_, Ytest))
# Expected output: 0.9933333333333333
from sklearn.naive_bayes import MultinomialNB, GaussianNB, BernoulliNB
from sklearn.model_selection import train_test_split
from sklearn.datasets import make_blobs
from sklearn.preprocessing import KBinsDiscretizer
from sklearn.metrics import brier_score_loss as BS,recall_score,roc_auc_score as AUC
# Compare naive Bayes variants on a heavily imbalanced two-class problem.
class_1 = 50000  # majority class: 50000 samples
class_2 = 500    # minority class: 500 samples
centers = [[0.0, 0.0], [5.0, 5.0]]  # the two class centers
clusters_std = [3, 1]               # the two class standard deviations
X, y = make_blobs(n_samples=[class_1, class_2],
                  centers=centers,
                  cluster_std=clusters_std,
                  random_state=0, shuffle=False)
# FIX: use 'names' so the loop variable no longer shadows the list it iterates,
# and perform the (deterministic, random_state=420) split ONCE instead of
# re-splitting identically on every loop iteration.
names = ["Multinomial", "Gaussian", "Bernoulli"]
models = [MultinomialNB(), GaussianNB(), BernoulliNB()]
Xtrain_raw, Xtest_raw, Ytrain, Ytest = train_test_split(X, y, test_size=0.3, random_state=420)
for name, clf in zip(names, models):
    Xtrain, Xtest = Xtrain_raw, Xtest_raw
    # Non-Gaussian variants need discrete/non-negative features: bin + one-hot.
    if name != "Gaussian":
        kbs = KBinsDiscretizer(n_bins=10, encode='onehot').fit(Xtrain)
        Xtrain = kbs.transform(Xtrain)
        Xtest = kbs.transform(Xtest)
    clf.fit(Xtrain, Ytrain)
    y_pred = clf.predict(Xtest)
    proba = clf.predict_proba(Xtest)[:, 1]  # probability of the minority class
    score = clf.score(Xtest, Ytest)
    print(name)
    print("\tBrier:{:.3f}".format(BS(Ytest, proba, pos_label=1)))
    print("\tAccuracy:{:.3f}".format(score))
    print("\tRecall:{:.3f}".format(recall_score(Ytest, y_pred)))
    print("\tAUC:{:.3f}".format(AUC(Ytest, proba)))
"""
Multinomial
Brier:0.007
Accuracy:0.990
Recall:0.000
AUC:0.991
Gaussian
Brier:0.006
Accuracy:0.990
Recall:0.438
AUC:0.993
Bernoulli
Brier:0.009
Accuracy:0.987
Recall:0.771
AUC:0.987
"""
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer as TFIDF
from sklearn.datasets import fetch_20newsgroups
from sklearn.naive_bayes import MultinomialNB, ComplementNB, BernoulliNB
from sklearn.metrics import brier_score_loss as BS
import pandas as pd
import numpy as np
# Naive Bayes text classification on four 20newsgroups categories.
# The first use of this dataset triggers a download at instantiation time.
data = fetch_20newsgroups()
# Pick four categories.
categories = ["sci.space"                # science - space
              , "rec.sport.hockey"        # sports - hockey
              , "talk.politics.guns"      # politics - gun control
              , "talk.politics.mideast"]  # politics - Middle East
train = fetch_20newsgroups(subset="train", categories=categories)
test = fetch_20newsgroups(subset="test", categories=categories)
# Extract train/test texts and labels.
Xtrain = train.data
Xtest = test.data
Ytrain = train.target
Ytest = test.target
# TF-IDF encode: fit on the training texts only, then transform both splits.
tfidf = TFIDF().fit(Xtrain)
Xtrain_ = tfidf.transform(Xtrain)
Xtest_ = tfidf.transform(Xtest)
# FIX: use 'names' so the loop variable no longer shadows the list.
names = ["Multinomial", "Complement", "Bournulli"]
# GaussianNB is omitted: it does not accept sparse matrices.
models = [MultinomialNB(), ComplementNB(), BernoulliNB()]
for name, clf in zip(names, models):
    clf.fit(Xtrain_, Ytrain)
    y_pred = clf.predict(Xtest_)
    proba = clf.predict_proba(Xtest_)
    score = clf.score(Xtest_, Ytest)
    print(name)
    # One-vs-rest Brier score for each of the 4 label values.
    Bscore = []
    for i in range(len(np.unique(Ytrain))):
        # BUG FIX: y_true here is the boolean mask (Ytest == i), whose positive
        # class is True (i.e. 1). The original passed pos_label=i: for i == 0
        # that makes False the positive class and INVERTS the score (hence the
        # implausible 0.857 below), and for i >= 2 no label equals pos_label at
        # all. The positive label must always be 1.
        bs = BS(Ytest == i, proba[:, i], pos_label=1)
        Bscore.append(bs)
        print("\tBrier under {}:{:.3f}".format(train.target_names[i], bs))
    print("\tAverage Brier:{:.3f}".format(np.mean(Bscore)))
    print("\tAccuracy:{:.3f}".format(score))
    print("\n")
# NOTE: the transcript below was produced by the ORIGINAL pos_label=i code;
# with the fix, the first-category Brier scores come out far smaller.
"""
Multinomial
Brier under rec.sport.hockey:0.857
Brier under sci.space:0.033
Brier under talk.politics.guns:0.169
Brier under talk.politics.mideast:0.178
Average Brier:0.309
Accuracy:0.975
Complement
Brier under rec.sport.hockey:0.804
Brier under sci.space:0.039
Brier under talk.politics.guns:0.137
Brier under talk.politics.mideast:0.160
Average Brier:0.285
Accuracy:0.986
Bournulli
Brier under rec.sport.hockey:0.925
Brier under sci.space:0.025
Brier under talk.politics.guns:0.205
Brier under talk.politics.mideast:0.193
Average Brier:0.337
Accuracy:0.902
"""