from sklearn import tree                     # import the module we need
clf = tree.DecisionTreeClassifier()          # instantiate the estimator
clf = clf.fit(x_train, y_train)              # fit the model on the training set
result = clf.score(x_test, y_test)           # evaluate accuracy on the test set
from sklearn import tree
from sklearn.datasets import load_wine
from sklearn.model_selection import train_test_split
import pandas as pd
wine = load_wine()
pd.concat([pd.DataFrame(wine.data),pd.DataFrame(wine.target)],axis=1)
|     | 0     | 1    | 2    | 3    | 4     | 5    | 6    | 7    | 8    | 9     | 10   | 11   | 12     | 0 |
|-----|-------|------|------|------|-------|------|------|------|------|-------|------|------|--------|---|
| 0   | 14.23 | 1.71 | 2.43 | 15.6 | 127.0 | 2.80 | 3.06 | 0.28 | 2.29 | 5.64  | 1.04 | 3.92 | 1065.0 | 0 |
| 1   | 13.20 | 1.78 | 2.14 | 11.2 | 100.0 | 2.65 | 2.76 | 0.26 | 1.28 | 4.38  | 1.05 | 3.40 | 1050.0 | 0 |
| 2   | 13.16 | 2.36 | 2.67 | 18.6 | 101.0 | 2.80 | 3.24 | 0.30 | 2.81 | 5.68  | 1.03 | 3.17 | 1185.0 | 0 |
| 3   | 14.37 | 1.95 | 2.50 | 16.8 | 113.0 | 3.85 | 3.49 | 0.24 | 2.18 | 7.80  | 0.86 | 3.45 | 1480.0 | 0 |
| 4   | 13.24 | 2.59 | 2.87 | 21.0 | 118.0 | 2.80 | 2.69 | 0.39 | 1.82 | 4.32  | 1.04 | 2.93 | 735.0  | 0 |
| ... | ...   | ...  | ...  | ...  | ...   | ...  | ...  | ...  | ...  | ...   | ...  | ...  | ...    | ... |
| 173 | 13.71 | 5.65 | 2.45 | 20.5 | 95.0  | 1.68 | 0.61 | 0.52 | 1.06 | 7.70  | 0.64 | 1.74 | 740.0  | 2 |
| 174 | 13.40 | 3.91 | 2.48 | 23.0 | 102.0 | 1.80 | 0.75 | 0.43 | 1.41 | 7.30  | 0.70 | 1.56 | 750.0  | 2 |
| 175 | 13.27 | 4.28 | 2.26 | 20.0 | 120.0 | 1.59 | 0.69 | 0.43 | 1.35 | 10.20 | 0.59 | 1.56 | 835.0  | 2 |
| 176 | 13.17 | 2.59 | 2.37 | 20.0 | 120.0 | 1.65 | 0.68 | 0.53 | 1.46 | 9.30  | 0.60 | 1.62 | 840.0  | 2 |
| 177 | 14.13 | 4.10 | 2.74 | 24.5 | 96.0  | 2.05 | 0.76 | 0.56 | 1.35 | 9.20  | 0.61 | 1.60 | 560.0  | 2 |

178 rows × 14 columns (columns 0 to 12 are the 13 features; the final column, also labelled 0 by the concat, is the class label)
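Before splitting the data, it can help to confirm its shape and the class labels. The following lines are a small sketch, not part of the original notebook; load_wine also exposes target_names for the three cultivars.

wine.data.shape    # (178, 13): 178 samples, 13 features
wine.target.shape  # (178,): one class label (0, 1 or 2) per sample
wine.target_names  # the three cultivars, named 'class_0', 'class_1', 'class_2' by sklearn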
wine.feature_names  # names of the 13 features
['alcohol',
'malic_acid',
'ash',
'alcalinity_of_ash',
'magnesium',
'total_phenols',
'flavanoids',
'nonflavanoid_phenols',
'proanthocyanins',
'color_intensity',
'hue',
'od280/od315_of_diluted_wines',
'proline']
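The concatenated table above keeps pandas' default integer column headers. As a small sketch (not in the original), the same table can be rebuilt with the feature names attached:

df = pd.DataFrame(wine.data, columns=wine.feature_names)
df['target'] = wine.target  # append the class label as a named column
df.head()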
# split the data into training and test sets (30% held out for testing)
xtrain, xtest, ytrain, ytest = train_test_split(wine.data, wine.target, test_size=0.3)
# build the model
clf = tree.DecisionTreeClassifier(criterion='entropy')
clf = clf.fit(xtrain,ytrain)
score = clf.score(xtest,ytest)
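Because train_test_split shuffles randomly, the accuracy above changes from run to run. A hedged sketch, not in the original, of a more stable estimate using 10-fold cross-validation on the full dataset:

from sklearn.model_selection import cross_val_score
cv_scores = cross_val_score(tree.DecisionTreeClassifier(criterion='entropy'),
                            wine.data, wine.target, cv=10)  # 10-fold cross-validated accuracy
print(cv_scores.mean())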
import graphviz
feature_name = ['alcohol', 'malic acid', 'ash', 'alcalinity of ash', 'magnesium', 'total phenols', 'flavanoids', 'nonflavanoid phenols', 'proanthocyanins', 'color intensity', 'hue', 'od280/od315 of diluted wines', 'proline']  # readable labels for the tree plot
dot_data = tree.export_graphviz(clf,
                                feature_names=feature_name,
                                class_names=['Gin', 'Sherry', 'Vermouth'],
                                filled=True,    # color each node by its majority class
                                rounded=True)   # draw nodes with rounded corners
graph = graphviz.Source(dot_data)
graph
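In a notebook the Source object renders inline. To keep the figure as a file, one option (an assumption, not shown in the original) is graphviz's render method; the output name here is hypothetical:

graph.render('wine_tree')  # writes 'wine_tree' (DOT source) and 'wine_tree.pdf'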
clf.feature_importances_  # importance of each feature in the fitted tree
array([0. , 0.06776584, 0. , 0. , 0. ,
0. , 0.17665173, 0. , 0. , 0.25465528,
0. , 0.03491509, 0.46601206])
[*zip(feature_name, clf.feature_importances_)]  # pair each feature name with its importance
[('alcohol', 0.0),
 ('malic acid', 0.0677658377725383),
 ('ash', 0.0),
 ('alcalinity of ash', 0.0),
 ('magnesium', 0.0),
 ('total phenols', 0.0),
 ('flavanoids', 0.17665173372492327),
 ('nonflavanoid phenols', 0.0),
 ('proanthocyanins', 0.0),
 ('color intensity', 0.254655282950384),
 ('hue', 0.0),
 ('od280/od315 of diluted wines', 0.03491508994608114),
 ('proline', 0.4660120556060733)]
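Only four features carry non-zero importance for this fitted tree. A small sketch (not in the original) that sorts the pairs to make the ranking explicit:

sorted(zip(feature_name, clf.feature_importances_),
       key=lambda pair: pair[1], reverse=True)  # most important feature first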
clf = tree.DecisionTreeClassifier(criterion='entropy',
                                  random_state=30,
                                  splitter='random')  # add extra randomness when choosing splits
clf = clf.fit(xtrain, ytrain)  # refit; required before the score/apply/predict calls below
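As a quick check that is not part of the original excerpt, scoring the refitted random-splitter tree on the same test split shows how the extra randomness affects accuracy compared with the earlier score:

random_split_score = clf.score(xtest, ytest)
print(score, random_split_score)  # 'best' splitter vs. 'random' splitter on the same split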
# RocCurveDisplay works for binary classification and plots a ROC curve quickly, but it raises an error on this 3-class problem
from sklearn.metrics import RocCurveDisplay
RocCurveDisplay.from_estimator(clf, xtest, ytest)
---------------------------------------------------------------------------
ValueError                                Traceback (most recent call last)
/tmp/ipykernel_19198/571040821.py in <module>
      1 from sklearn.metrics import RocCurveDisplay
----> 2 RocCurveDisplay.from_estimator(clf, xtest, ytest)

    ... (frames inside sklearn/metrics/_plot/roc_curve.py and _plot/base.py elided) ...

ValueError: Expected 'estimator' to be a binary classifier, but got DecisionTreeClassifier fit on multiclass (3 classes) data
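RocCurveDisplay.from_estimator only supports binary classifiers, as the error shows. A hedged sketch, not in the original, of one way to get a ROC-style summary for this 3-class problem: feed the predicted class probabilities to roc_auc_score in one-vs-rest mode.

from sklearn.metrics import roc_auc_score
proba = clf.predict_proba(xtest)  # one probability column per class, shape (n_samples, 3)
print(roc_auc_score(ytest, proba, multi_class='ovr'))  # macro-averaged one-vs-rest AUC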
# apply returns the index of the leaf node that each test sample falls into
clf.apply(xtest)
array([12, 6, 8, 3, 8, 3, 8, 3, 3, 3, 6, 12, 4, 3, 12, 3, 6,
3, 11, 6, 6, 6, 12, 6, 3, 8, 3, 6, 8, 6, 3, 3, 12, 3,
3, 12, 6, 12, 12, 6, 6, 10, 6, 8, 6, 6, 3, 12, 12, 3, 3,
8, 12, 3])
# predict returns the predicted class for each test sample
clf.predict(xtest)
array([0, 2, 1, 1, 1, 1, 1, 1, 1, 1, 2, 0, 0, 1, 0, 1, 2, 1, 0, 2, 2, 2,
0, 2, 1, 1, 1, 2, 1, 2, 1, 1, 0, 1, 1, 0, 2, 0, 0, 2, 2, 2, 2, 1,
2, 2, 1, 0, 0, 1, 1, 1, 0, 1])
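As a sanity check (not in the original), comparing these predictions with ytest reproduces the accuracy that clf.score reports:

(clf.predict(xtest) == ytest).mean()  # fraction of correct test predictions, equal to clf.score(xtest, ytest)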