"bikeshare": {
  "url": "https://s3.amazonaws.com/ddl-data-lake/yellowbrick/v1.0/bikeshare.zip",
  "signature": "4ed07a929ccbe0171309129e6adda1c4390190385dd6001ba9eecc795a21eef2"
},
"hobbies": {
  "url": "https://s3.amazonaws.com/ddl-data-lake/yellowbrick/v1.0/hobbies.zip",
  "signature": "6114e32f46baddf049a18fb05bad3efa98f4e6a0fe87066c94071541cb1e906f"
},
"concrete": {
  "url": "https://s3.amazonaws.com/ddl-data-lake/yellowbrick/v1.0/concrete.zip",
  "signature": "5807af2f04e14e407f61e66a4f3daf910361a99bb5052809096b47d3cccdfc0a"
},
"credit": {
  "url": "https://s3.amazonaws.com/ddl-data-lake/yellowbrick/v1.0/credit.zip",
  "signature": "2c6f5821c4039d70e901cc079d1404f6f49c3d6815871231c40348a69ae26573"
},
"energy": {
  "url": "https://s3.amazonaws.com/ddl-data-lake/yellowbrick/v1.0/energy.zip",
  "signature": "174eca3cd81e888fc416c006de77dbe5f89d643b20319902a0362e2f1972a34e"
},
"game": {
  "url": "https://s3.amazonaws.com/ddl-data-lake/yellowbrick/v1.0/game.zip",
  "signature": "ce799d1c55fcf1985a02def4d85672ac86c022f8f7afefbe42b20364fba47d7a"
},
"mushroom": {
  "url": "https://s3.amazonaws.com/ddl-data-lake/yellowbrick/v1.0/mushroom.zip",
  "signature": "f79fdbc33b012dabd06a8f3cb3007d244b6aab22d41358b9aeda74417c91f300"
},
"occupancy": {
  "url": "https://s3.amazonaws.com/ddl-data-lake/yellowbrick/v1.0/occupancy.zip",
  "signature": "0b390387584586a05f45c7da610fdaaf8922c5954834f323ae349137394e6253"
},
"spam": {
  "url": "https://s3.amazonaws.com/ddl-data-lake/yellowbrick/v1.0/spam.zip",
  "signature": "000309ac2b61090a3001de3e262a5f5319708bb42791c62d15a08a2f9f7cb30a"
},
"walking": {
  "url": "https://s3.amazonaws.com/ddl-data-lake/yellowbrick/v1.0/walking.zip",
  "signature": "7a36615978bc3bb74a2e9d5de216815621bd37f6a42c65d3fc28b242b4d6e040"
},
"nfl": {
  "url": "https://s3.amazonaws.com/ddl-data-lake/yellowbrick/v1.0/nfl.zip",
  "signature": "4989c66818ea18217ee0fe3a59932b963bd65869928c14075a5c50366cb81e1f"
}
# Multi-line output: ask IPython to display the result of every expression
# in a cell rather than only the last one.
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"
可视化器 | ClassificationReport |
快速使用方法 | classification_report() |
模型 | 分类 |
工作流程 | 模型评估 |
from sklearn.model_selection import TimeSeriesSplit
from sklearn.naive_bayes import GaussianNB

from yellowbrick.classifier import ClassificationReport
from yellowbrick.datasets import load_occupancy

# Load the classification dataset
X, y = load_occupancy()

# Specify the target classes
classes = ["unoccupied", "occupied"]

# Create the training and test data with a time-series split.
# The loop walks through every split and simply rebinds the names, so after
# it finishes X_train/X_test/y_train/y_test hold the LAST split.
tscv = TimeSeriesSplit()
for train_index, test_index in tscv.split(X):
    X_train, X_test = X.iloc[train_index], X.iloc[test_index]
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]

# Instantiate the classification model and visualizer
model = GaussianNB()

# support: whether to add a support column to the report grid.
# Accepted values: True, False, None, 'percent', 'count' (default: None).
# 'percent' displays the support as a percentage, 'count' as a raw count.
visualizer = ClassificationReport(model, classes=classes, support=True)

visualizer.fit(X_train, y_train)  # Fit the visualizer and the model
visualizer.score(X_test, y_test)  # Evaluate the model on the test data
visualizer.show();  # Finalize and show the figure
from sklearn.model_selection import TimeSeriesSplit
from sklearn.naive_bayes import GaussianNB

from yellowbrick.datasets import load_occupancy
from yellowbrick.classifier import classification_report

# Load the classification data set
X, y = load_occupancy()

# Specify the target classes
classes = ["unoccupied", "occupied"]

# Create the training and test data; the names keep the last split produced
# by the time-series splitter.
tscv = TimeSeriesSplit()
for train_index, test_index in tscv.split(X):
    X_train, X_test = X.iloc[train_index], X.iloc[test_index]
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]

# Instantiate the visualizer through the quick method: one call receives the
# model, the train/test data and the display options.
visualizer = classification_report(
    GaussianNB(), X_train, y_train, X_test, y_test, classes=classes, support=True
)
ConfusionMatrix 可视化器是一个 ScoreVisualizer,它接收一个已拟合的 scikit-learn 分类器以及一组测试用的 X 和 y 值,并返回一份报告,显示每个测试样本的预测类别与实际类别的对比情况。数据科学家使用混淆矩阵来了解哪些类别最容易被混淆。它提供的信息与分类报告类似,但不只是给出汇总分数,而是能更深入地反映单个数据点的分类情况。
下面是一些使用ConfusionMatrix可视化工具的示例;更多信息可以通过查看scikit learn文档来找到。
可视化器 | ConfusionMatrix |
快速使用方法 | confusion_matrix() |
模型 | 分类 |
工作流程 | 模型评估 |
from sklearn.datasets import load_digits
from sklearn.model_selection import train_test_split as tts
from sklearn.linear_model import LogisticRegression

from yellowbrick.classifier import ConfusionMatrix

# We'll use the handwritten digits data set from scikit-learn.
# Each feature of this dataset is an 8x8 pixel image of a handwritten number.
# digits.data converts these 64 pixels into a single array of features.
digits = load_digits()
X = digits.data
y = digits.target

X_train, X_test, y_train, y_test = tts(X, y, test_size=0.2, random_state=11)

model = LogisticRegression(multi_class="auto", solver="liblinear")

# The ConfusionMatrix visualizer takes a model; classes lists the labels of
# the matrix (the ten digits).
cm = ConfusionMatrix(model, classes=[0, 1, 2, 3, 4, 5, 6, 7, 8, 9])

# Fit fits the passed model. This is unnecessary if you pass the visualizer
# a pre-fitted model.
cm.fit(X_train, y_train)

# To create the ConfusionMatrix, we need some test data. Score runs predict()
# on the data and then creates the confusion_matrix from scikit-learn.
cm.score(X_test, y_test)

# Render the figure; each cell shows the number of samples for that class pair.
cm.show()
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split as tts
from sklearn.linear_model import LogisticRegression

from yellowbrick.classifier import ConfusionMatrix

iris = load_iris()
X = iris.data
y = iris.target
classes = iris.target_names

X_train, X_test, y_train, y_test = tts(X, y, test_size=0.2)

model = LogisticRegression(multi_class="auto", solver="liblinear")

# label_encoder maps the integer targets to readable names;
# percent=True displays percentages instead of raw counts.
# NOTE(review): the `label_encoder={` opener was missing from the source and
# is restored from the dangling `0: 'setosa', ...}` fragment — confirm.
iris_cm = ConfusionMatrix(
    model, classes=classes,
    label_encoder={0: 'setosa', 1: 'versicolor', 2: 'virginica'},
    percent=True,
)

iris_cm.fit(X_train, y_train)
iris_cm.score(X_test, y_test)

# Finalize and render the figure
iris_cm.show()
from yellowbrick.datasets import load_credit
from yellowbrick.classifier import confusion_matrix
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split as tts

# Load the classification dataset
X, y = load_credit()

# Create the train and test data
X_train, X_test, y_train, y_test = tts(X, y, test_size=0.2)

# Instantiate the visualizer with the classification model.
# NOTE(review): the call head was missing from the source; it is restored
# from the imports above (confusion_matrix + LogisticRegression) — confirm.
confusion_matrix(
    LogisticRegression(),
    X_train, y_train, X_test, y_test,
    classes=['not_defaulted', 'defaulted']
)
可视化器 | ROCAUC |
快速使用方法 | roc_auc() |
模型 | 分类 |
工作流程 | 模型评估 |
# Binary ROC-AUC example: fit a logistic regression on the spam dataset and
# plot its ROC curve with the ROCAUC visualizer.
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from yellowbrick.classifier import ROCAUC
from yellowbrick.datasets import load_spam
# Load the classification dataset
X, y = load_spam()
# Create the training and test data
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)
# Instantiate the visualizer with the classification model
# Build the base classifier first, then wrap it in the visualizer
model = LogisticRegression(multi_class="auto", solver="liblinear")
visualizer = ROCAUC(model, classes=["not_spam", "is_spam"])
visualizer.fit(X_train, y_train) # Fit the training data to the visualizer
visualizer.score(X_test, y_test) # Evaluate the model on the test data
visualizer.show(); # Finalize and show the figure
Yellowbrick的ROCAUC可视化工具允许绘制多类分类曲线。ROC曲线通常用于二值分类,而事实上Scikit-Learn roc_curve度量仅能对二值分类器进行度量。Yellowbrick通过对输出进行二进制化(每个类)或使用one vs-rest(micro score)或one vs-all(macro score)分类策略来解决这个问题。关于one vs-rest(micro score)或one vs-all(macro score)解释见https://blog.csdn.net/u010551621/article/details/46907575
from sklearn.linear_model import RidgeClassifier
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OrdinalEncoder, LabelEncoder

from yellowbrick.classifier import ROCAUC
from yellowbrick.datasets import load_game

# Load multi-class classification dataset
X, y = load_game()

# Encode the non-numeric columns. The label encoder is kept so that its
# classes_ can be reused for the legend below.
X = OrdinalEncoder().fit_transform(X)
encoder = LabelEncoder()
y = encoder.fit_transform(y)

# Create the train and test data
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

# Instantiate the classification model and visualizer
model = RidgeClassifier()

# Multi-class case: the legend labels must follow the order of the encoded
# targets. LabelEncoder assigns integers in sorted label order, so the names
# are taken from encoder.classes_ instead of a hand-written list whose order
# could mislabel the curves.
visualizer = ROCAUC(model, classes=encoder.classes_)

visualizer.fit(X_train, y_train)  # Fit the training data to the visualizer
visualizer.score(X_test, y_test)  # Evaluate the model on the test data
visualizer.show();  # Finalize and render the figure
默认情况下,在使用多类ROCAUC可视化工具时,除了每个类的微观和宏观平均曲线外,还绘制了每个类的曲线。这使用户可以按类别检查敏感性和特异性之间的权衡。请注意,对于multi-class ROCAUC,必须至少将micro,macro或per_class参数之一设置为True(默认情况下,所有参数都设置为True)。
# Quick-method ROC-AUC example: a single roc_auc() call receives the model,
# the train/test data and the class names.
from yellowbrick.classifier.rocauc import roc_auc
from yellowbrick.datasets import load_credit
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
# Load the classification dataset
X, y = load_credit()
# Create the train and test data
X_train, X_test, y_train, y_test = train_test_split(X,y)
# Instantiate the visualizer with the classification model
model = LogisticRegression()
roc_auc(model, X_train, y_train, X_test=X_test, y_test=y_test, classes=['not_defaulted', 'defaulted']);
关于 Precision-Recall具体可以参考https://scikit-learn.org/stable/auto_examples/model_selection/plot_precision_recall.html
可视化器 | PrecisionRecallCurve |
快速使用方法 | precision_recall_curve() |
模型 | 分类 |
工作流程 | 模型评估 |
from sklearn.linear_model import RidgeClassifier
from sklearn.model_selection import train_test_split as tts

from yellowbrick.classifier import PrecisionRecallCurve
from yellowbrick.datasets import load_spam

# Load the dataset and split into train/test splits
X, y = load_spam()

X_train, X_test, y_train, y_test = tts(X, y, test_size=0.2, shuffle=True)

# Create the visualizer, fit, score, and show it
viz = PrecisionRecallCurve(RidgeClassifier())
viz.fit(X_train, y_train)
viz.score(X_test, y_test)
viz.show()  # Finalize and render the figure
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import LabelEncoder, OrdinalEncoder
from sklearn.model_selection import train_test_split as tts

from yellowbrick.classifier import PrecisionRecallCurve
from yellowbrick.datasets import load_game

# Load dataset and encode categorical variables
X, y = load_game()
X = OrdinalEncoder().fit_transform(X)
y = LabelEncoder().fit_transform(y)

X_train, X_test, y_train, y_test = tts(X, y, test_size=0.2, shuffle=True)

# Create the visualizer, fit, score, and show it
viz = PrecisionRecallCurve(RandomForestClassifier(n_estimators=10))
viz.fit(X_train, y_train)
viz.score(X_test, y_test)
viz.show()  # Finalize and render the figure
from sklearn.naive_bayes import MultinomialNB
from sklearn.preprocessing import LabelEncoder, OrdinalEncoder
from sklearn.model_selection import train_test_split as tts

from yellowbrick.classifier import PrecisionRecallCurve
from yellowbrick.datasets import load_game

# Load dataset and encode categorical variables
X, y = load_game()
X = OrdinalEncoder().fit_transform(X)

# Keep the encoder so its classes_ can label the per-class curves
encoder = LabelEncoder()
y = encoder.fit_transform(y)

X_train, X_test, y_train, y_test = tts(X, y, test_size=0.2, shuffle=True)

# Create the visualizer, fit, score, and show it
viz = PrecisionRecallCurve(
    MultinomialNB(), per_class=True, iso_f1_curves=True,
    fill_area=False, micro=False, classes=encoder.classes_
)
viz.fit(X_train, y_train)
viz.score(X_test, y_test)
viz.show()  # Finalize and render the figure
from sklearn.naive_bayes import MultinomialNB
from sklearn.preprocessing import LabelEncoder, OrdinalEncoder
from sklearn.model_selection import train_test_split as tts

from yellowbrick.classifier import PrecisionRecallCurve
from yellowbrick.datasets import load_game

# Load dataset and encode categorical variables
X, y = load_game()
X = OrdinalEncoder().fit_transform(X)

# Keep the encoder so its classes_ can label the per-class curves
encoder = LabelEncoder()
y = encoder.fit_transform(y)

X_train, X_test, y_train, y_test = tts(X, y, test_size=0.2, shuffle=True)

# Create the visualizer, fit, score, and show it
viz = PrecisionRecallCurve(
    MultinomialNB(), per_class=True, iso_f1_curves=True,
    fill_area=False, micro=False, classes=encoder.classes_
)
viz.fit(X_train, y_train)
viz.score(X_test, y_test)
viz.show()  # Finalize and render the figure
可视化器 | ClassBalance |
快速使用方法 | class_balance() |
模型 | 分类 |
工作流程 | 特征分析,目标分析,模型选择 |
# ClassBalance example: plot how many samples of the game dataset fall into
# each target class. Only the target vector y is passed to the visualizer.
from yellowbrick.datasets import load_game
from yellowbrick.target import ClassBalance
# Load the classification dataset
X, y = load_game()
# Instantiate the visualizer
visualizer = ClassBalance(labels=["draw", "loss", "win"])
visualizer.fit(y) # Fit the data to the visualizer
visualizer.show(); # Finalize and render the figure
由此得到的数字使我们能够诊断平衡问题的严重性。在这个图中,我们可以看到“赢”类在其他两个类中占主导地位。一个潜在的解决方案可能是创建一个二进制分类器:“win”vs“not win”,并将“loss”和“draw”类组合成一个类。
如果在评估过程中必须保持类别不平衡(例如,被分类的事件实际上与频率所暗示的一样罕见),则应使用分层抽样来创建训练和测试拆分。这样可以确保测试数据中各类别的比例与训练数据大致相同。虽然 scikit-learn 默认在 train_test_split 和其他 cv 方法中执行此操作,但比较两个拆分中每个类的支持度(support)仍然可能很有用。
from sklearn.model_selection import TimeSeriesSplit

from yellowbrick.datasets import load_occupancy
from yellowbrick.target import ClassBalance

# Load the classification dataset
X, y = load_occupancy()

# Create the training and test data; the names keep the last split produced
# by the time-series splitter.
tscv = TimeSeriesSplit()
for train_index, test_index in tscv.split(X):
    X_train, X_test = X.iloc[train_index], X.iloc[test_index]
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]

# Instantiate the visualizer
visualizer = ClassBalance(labels=["unoccupied", "occupied"])

# Passing both targets compares the class support of the two splits
visualizer.fit(y_train, y_test)  # Fit the data to the visualizer
visualizer.show()  # Finalize and render the figure
from yellowbrick.datasets import load_game
from yellowbrick.target import class_balance

# Load the dataset
X, y = load_game()

# Use the quick method and immediately show the figure
class_balance(y)
可视化器 | ClassPredictionError |
快速使用方法 | class_prediction_error() |
模型 | 分类 |
工作流程 | 模型评估 |
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier

from yellowbrick.classifier import ClassPredictionError

# Create classification dataset
# NOTE(review): the trailing arguments of this call and of train_test_split
# were missing from the source; the seeds below follow the upstream
# Yellowbrick example — confirm.
X, y = make_classification(
    n_samples=1000, n_classes=5, n_informative=3, n_clusters_per_class=1,
    random_state=36,
)

classes = ["apple", "kiwi", "pear", "banana", "orange"]

# Perform 80/20 training/test split
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.20, random_state=42
)

# Instantiate the classification model and visualizer
visualizer = ClassPredictionError(
    RandomForestClassifier(random_state=42, n_estimators=10), classes=classes
)

# Fit the training data to the visualizer
visualizer.fit(X_train, y_train)

# Evaluate the model on the test data
visualizer.score(X_test, y_test)

# Draw visualization
visualizer.show()
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split

from yellowbrick.classifier import ClassPredictionError
from yellowbrick.datasets import load_credit

X, y = load_credit()

classes = ['account in default', 'current with bills']

# Perform 80/20 training/test split
# NOTE(review): the end of this call was missing from the source; the seed
# follows the upstream Yellowbrick example — confirm.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.20, random_state=42
)

# Instantiate the classification model and visualizer
visualizer = ClassPredictionError(
    RandomForestClassifier(n_estimators=10), classes=classes
)

# Fit the training data to the visualizer
visualizer.fit(X_train, y_train)

# Evaluate the model on the test data
visualizer.score(X_test, y_test)

# Draw visualization
visualizer.show()
from sklearn.svm import LinearSVC
from sklearn.model_selection import train_test_split as tts

from yellowbrick.classifier import class_prediction_error
from yellowbrick.datasets import load_occupancy

# Load the dataset and split into train/test splits
X, y = load_occupancy()

X_train, X_test, y_train, y_test = tts(
    X, y, test_size=0.2, shuffle=True
)

# Quick method: one call receives the model, the data and the class names.
# NOTE(review): the call head was missing from the source; it is restored
# from the imports above (class_prediction_error + LinearSVC), with the seed
# taken from the upstream Yellowbrick example — confirm.
class_prediction_error(
    LinearSVC(random_state=42),
    X_train, y_train, X_test, y_test,
    classes=["vacant", "occupied"]
)
可视化器 | DiscriminationThreshold |
快速使用方法 | discrimination_threshold() |
模型 | 分类 |
工作流程 | 模型评估 |
# DiscriminationThreshold example: wrap a logistic regression in the
# visualizer and fit it on the whole spam dataset (no manual train/test split
# is made in this cell).
from sklearn.linear_model import LogisticRegression
from yellowbrick.classifier import DiscriminationThreshold
from yellowbrick.datasets import load_spam
# Load a binary classification dataset
X, y = load_spam()
# Instantiate the classification model and visualizer
model = LogisticRegression(multi_class="auto", solver="liblinear")
visualizer = DiscriminationThreshold(model)
visualizer.fit(X, y) # Fit the data to the visualizer
visualizer.show(); # Finalize and render the figure
许多分类器使用 decision_function 为正类打分,或使用 predict_proba 函数计算正类的概率。如果分数或概率大于某个判别阈值,则选择正类,否则选择负类。
在上图中,我们看到可视化器已调整为寻找最佳F1分数,该分数标注为阈值0.43。 为了考虑模型相对于度量的变化(显示为中值曲线周围的填充区域),模型在多个训练/测试拆分中多次运行。
# Quick-method example: discrimination_threshold() receives the model and the
# full occupancy dataset in one call.
from yellowbrick.classifier.threshold import discrimination_threshold
from yellowbrick.datasets import load_occupancy
from sklearn.neighbors import KNeighborsClassifier
import numpy as np
# Load the classification dataset
X, y = load_occupancy()
# Instantiate the visualizer with the classification model
model = KNeighborsClassifier(3)
# exclude: the metric(s) that should not be drawn
# argmax: the metric whose maximum marks the threshold; the original note
# lists 'precision', 'recall', 'fscore' or None.
# NOTE(review): the original note gives the default as 'f1score'; Yellowbrick
# documents the default as 'fscore' — confirm.
discrimination_threshold(model, X, y,exclude='queue_rate', argmax='precision');