ID3、C4.5算法都是基于信息熵来选取划分节点的,主要用于分类问题。而CART决策树全称为分类回归树(Classification And Regression Tree),分类和回归问题都可以使用。
CART构建的都是二叉树,而ID3、C4.5则是根据特征的各个具体取值(类别)进行多叉划分。
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from collections import Counter
import math
from math import log
import pprint
# Textbook exercise 5.1
def create_data():
    """Return the loan-application toy dataset and its column names.

    Returns:
        tuple: ``(datasets, labels)`` where ``datasets`` is a list of 15 rows
        (four categorical feature values followed by the class value) and
        ``labels`` is the list of the five column names, class column last.
    """
    datasets = [
        ['青年', '否', '否', '一般', '否'],
        ['青年', '否', '否', '好', '否'],
        ['青年', '是', '否', '好', '是'],
        ['青年', '是', '是', '一般', '是'],
        ['青年', '否', '否', '一般', '否'],
        ['中年', '否', '否', '一般', '否'],
        ['中年', '否', '否', '好', '否'],
        ['中年', '是', '是', '好', '是'],
        ['中年', '否', '是', '非常好', '是'],
        ['中年', '否', '是', '非常好', '是'],
        ['老年', '否', '是', '非常好', '是'],
        ['老年', '否', '是', '好', '是'],
        ['老年', '是', '否', '好', '是'],
        ['老年', '是', '否', '非常好', '是'],
        ['老年', '否', '否', '一般', '否'],
    ]
    labels = ['年龄', '有工作', '有自己的房子', '信贷情况', '类别']
    # Return the rows together with the name of every column.
    return datasets, labels
# Load the toy dataset and wrap it in a DataFrame whose columns are the label names.
datasets, labels = create_data()
train_data = pd.DataFrame(datasets, columns=labels)
年龄 | 有工作 | 有自己的房子 | 信贷情况 | 类别 | |
0 | 青年 | 否 | 否 | 一般 | 否 |
1 | 青年 | 否 | 否 | 好 | 否 |
2 | 青年 | 是 | 否 | 好 | 是 |
3 | 青年 | 是 | 是 | 一般 | 是 |
4 | 青年 | 否 | 否 | 一般 | 否 |
5 | 中年 | 否 | 否 | 一般 | 否 |
6 | 中年 | 否 | 否 | 好 | 否 |
7 | 中年 | 是 | 是 | 好 | 是 |
8 | 中年 | 否 | 是 | 非常好 | 是 |
9 | 中年 | 否 | 是 | 非常好 | 是 |
10 | 老年 | 否 | 是 | 非常好 | 是 |
11 | 老年 | 否 | 是 | 好 | 是 |
12 | 老年 | 是 | 否 | 好 | 是 |
13 | 老年 | 是 | 否 | 非常好 | 是 |
14 | 老年 | 否 | 否 | 一般 | 否 |
# Entropy
def calc_ent(datasets):
    """Return the empirical entropy (base 2) of the class column.

    Args:
        datasets: sequence of rows; the last element of each row is the class.

    Returns:
        The entropy in bits; 0 for an empty sequence.
    """
    data_length = len(datasets)
    # Tally how many rows fall into each class value.
    label_count = Counter(row[-1] for row in datasets)
    return -sum(
        (count / data_length) * log(count / data_length, 2)
        for count in label_count.values()
    )
# Empirical conditional entropy
def cond_ent(datasets, axis=0):
    """Return the empirical conditional entropy of the class given one feature.

    Rows are grouped by the value found at column ``axis``; the result is the
    size-weighted sum of ``calc_ent`` over those groups.

    Args:
        datasets: sequence of rows; the last element of each row is the class.
        axis: column index of the feature to condition on.

    Returns:
        The conditional entropy; 0 for an empty sequence.
    """
    data_length = len(datasets)
    feature_sets = {}
    for row in datasets:
        # Each row must actually be stored in its group; with empty groups
        # every term of the weighted sum below would be zero.
        feature_sets.setdefault(row[axis], []).append(row)
    return sum(
        (len(subset) / data_length) * calc_ent(subset)
        for subset in feature_sets.values()
    )
# Information gain
def info_gain(ent, cond_ent):
    """Return the information gain, i.e. ``ent`` minus ``cond_ent``.

    Args:
        ent: entropy of the class column.
        cond_ent: conditional entropy of the class given a feature.
    """
    return ent - cond_ent
def info_gain_train(datasets):
    """Print the information gain of every feature and name the best one.

    Reads the module-level ``labels`` list to turn column indices into names.

    Args:
        datasets: non-empty sequence of rows; the last element of each row is
            the class, every other element is a feature value.

    Returns:
        A message string naming the feature with the largest information gain.
    """
    # The last column holds the class, so it is not a candidate feature.
    feature_count = len(datasets[0]) - 1
    ent = calc_ent(datasets)  # empirical entropy of the class column
    best_feature = []
    for c in range(feature_count):
        c_info_gain = info_gain(ent, cond_ent(datasets, axis=c))
        best_feature.append((c, c_info_gain))
        print('特征({}) - info_gain - {:.3f}'.format(labels[c], c_info_gain))
    # Pick the (index, gain) pair with the largest gain.
    best_ = max(best_feature, key=lambda x: x[-1])
    return '特征({})的信息增益最大,选择为根节点特征'.format(labels[best_[0]])
特征(年龄) - info_gain - 0.083
特征(有工作) - info_gain - 0.324
特征(有自己的房子) - info_gain - 0.420
特征(信贷情况) - info_gain - 0.363
# Tree node (one child per feature value, so not limited to two children)
class Node:
    """A node of the decision tree.

    Args:
        root: when True, ``predict`` returns ``label`` directly, i.e. the node
            acts as a leaf.
        label: class value returned by ``predict`` when ``root`` is True.
        feature_name: name of the feature this node splits on.
        feature: index used by ``predict`` to read the split value out of the
            feature sequence.
    """

    def __init__(self, root=True, label=None, feature_name=None, feature=None):
        self.root = root
        self.label = label
        self.feature_name = feature_name
        self.feature = feature
        # Maps a feature value to the child Node handling that value.
        self.tree = {}
        # Snapshot used by __repr__; it shares the ``tree`` dict, so children
        # added later still show up.
        self.result = {'label:': self.label, 'feature': self.feature, 'tree': self.tree}

    def __repr__(self):
        return '{}'.format(self.result)

    def add_node(self, val, node):
        """Attach ``node`` as the child reached when the split feature equals ``val``."""
        self.tree[val] = node

    def predict(self, features):
        """Return the label for ``features`` by walking down the tree.

        Raises:
            KeyError: if the value at the split feature has no child.
        """
        if self.root is True:
            return self.label
        return self.tree[features[self.feature]].predict(features)
class DTree:
    """Decision tree over categorical features, split by information gain.

    Args:
        epsilon: minimum information gain required to split; below it the
            node becomes a leaf labelled with the majority class.
    """

    def __init__(self, epsilon=0.1):
        self.epsilon = epsilon
        self._tree = {}

    # Entropy
    @staticmethod
    def calc_ent(datasets):
        """Return the empirical entropy (base 2) of the last column of ``datasets``."""
        data_length = len(datasets)
        label_count = Counter(row[-1] for row in datasets)
        return -sum(
            (count / data_length) * log(count / data_length, 2)
            for count in label_count.values()
        )

    # Empirical conditional entropy
    def cond_ent(self, datasets, axis=0):
        """Return the conditional entropy of the class given column ``axis``."""
        data_length = len(datasets)
        feature_sets = {}
        for row in datasets:
            # Rows must be stored in their group, otherwise every group is
            # empty and the weighted sum is always zero.
            feature_sets.setdefault(row[axis], []).append(row)
        return sum(
            (len(subset) / data_length) * self.calc_ent(subset)
            for subset in feature_sets.values()
        )

    # Information gain
    @staticmethod
    def info_gain(ent, cond_ent):
        """Return ``ent - cond_ent``."""
        return ent - cond_ent

    def info_gain_train(self, datasets):
        """Return ``(column_index, gain)`` for the feature with the largest gain.

        ``datasets`` is a non-empty sequence of rows whose last element is the
        class; it must contain at least one feature column.
        """
        count = len(datasets[0]) - 1  # last column is the class
        ent = self.calc_ent(datasets)
        best_feature = [
            (c, self.info_gain(ent, self.cond_ent(datasets, axis=c)))
            for c in range(count)
        ]
        return max(best_feature, key=lambda x: x[-1])

    def train(self, train_data):
        """Build and return the tree for ``train_data``.

        Args:
            train_data: DataFrame whose last column is the class and whose
                other columns are categorical features.

        Returns:
            The root ``Node`` of the generated tree.
        """
        return self._build(train_data, list(train_data.columns[:-1]))

    def _build(self, train_data, all_features):
        """Recursively build the subtree for ``train_data``.

        ``all_features`` is the feature-name list of the full training frame.
        Each split stores the feature's position in that list, so
        ``Node.predict`` can index a complete feature vector even though
        columns are dropped while descending.
        """
        y_train = train_data.iloc[:, -1]
        features = train_data.columns[:-1]

        # 1. All instances share one class: single-node tree with that class.
        if len(y_train.value_counts()) == 1:
            return Node(root=True, label=y_train.iloc[0])

        # 2. No feature left: single-node tree labelled with the majority class.
        if len(features) == 0:
            return Node(root=True, label=self._majority(y_train))

        # 3. Feature with the largest information gain.
        max_feature, max_info_gain = self.info_gain_train(np.array(train_data))
        max_feature_name = features[max_feature]

        # 4. Gain below the threshold: single-node tree with the majority class.
        if max_info_gain < self.epsilon:
            return Node(root=True, label=self._majority(y_train))

        # 5. Split on the chosen feature, one subset per observed value.
        node_tree = Node(
            root=False,
            feature_name=max_feature_name,
            feature=all_features.index(max_feature_name),
        )
        for f in train_data[max_feature_name].value_counts().index:
            sub_train_df = train_data.loc[
                train_data[max_feature_name] == f
            ].drop([max_feature_name], axis=1)
            # 6. Recurse on the subset.
            node_tree.add_node(f, self._build(sub_train_df, all_features))
        return node_tree

    @staticmethod
    def _majority(y_train):
        """Return the most frequent value of ``y_train``."""
        return y_train.value_counts().sort_values(ascending=False).index[0]

    def fit(self, train_data):
        """Train on ``train_data``, keep the tree on the instance and return it."""
        self._tree = self.train(train_data)
        return self._tree

    def predict(self, X_test):
        """Return the predicted class for one feature sequence ``X_test``."""
        return self._tree.predict(X_test)
# Run it: fit the hand-written tree on the toy dataset.
datasets, labels = create_data()
data_df = pd.DataFrame(datasets, columns=labels)
dt = DTree()
tree = dt.fit(data_df)
# 查看树的情况
{'label:': None, 'feature': 2, 'tree': {'否': {'label:': None, 'feature': 1, 'tree': {'否': {'label:': '否', 'feature': None, 'tree': {}}, '是': {'label:': '是', 'feature': None, 'tree': {}}}}, '是': {'label:': '是', 'feature': None, 'tree': {}}}}
# 上述feature对应的就是下面labels的索引!
['年龄', '有工作', '有自己的房子', '信贷情况', '类别']
iris = load_iris()
df = pd.DataFrame(iris.data, columns=iris.feature_names)
# Attach the class labels
df['label'] = iris.target
# Rename the columns
df.columns = ['sepal length', 'sepal width', 'petal length', 'petal width', 'label']
# Keep only the first 100 rows, and the first two columns plus the last (label) column
data = np.array(df.iloc[:100, [0, 1, -1]])
(100, 3)
array([[5.1, 3.5, 0. ],
[4.9, 3. , 0. ],
[4.7, 3.2, 0. ],
[4.6, 3.1, 0. ],
[5. , 3.6, 0. ]])
# First two columns are the features, the last column is the label.
X = data[:,:2]
y = data[:,-1]
# Hold out 30% of the rows for testing (no random_state, so the split varies per run).
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.3)
(70, 2)
(30, 2)
# data
def create_data():
    """Return a two-feature slice of the iris dataset.

    Note: this rebinds the name ``create_data`` used earlier in the file for
    the loan-application dataset.

    Returns:
        tuple: ``(X, y)`` where ``X`` holds the first two feature columns of
        the first 100 rows and ``y`` holds the matching label column.
    """
    iris = load_iris()
    df = pd.DataFrame(iris.data, columns=iris.feature_names)
    df['label'] = iris.target
    df.columns = ['sepal length', 'sepal width', 'petal length', 'petal width', 'label']
    # First 100 rows; columns 0 and 1 as features plus the label column.
    data = np.array(df.iloc[:100, [0, 1, -1]])
    return data[:, :2], data[:, -1]
# Reload the iris slice and hold out 30% of the rows for testing.
X, y = create_data()
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3)
from sklearn.tree import DecisionTreeClassifier
from sklearn.tree import export_graphviz
import graphviz
# Fit a scikit-learn decision tree with default parameters.
clf = DecisionTreeClassifier()
clf.fit(X_train, y_train)
DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
max_features=None, max_leaf_nodes=None,
min_impurity_decrease=0.0, min_impurity_split=None,
min_samples_leaf=1, min_samples_split=2,
min_weight_fraction_leaf=0.0, presort=False, random_state=None,
pre = clf.predict(X_test)
array([0., 1., 0., 0., 0., 1., 0., 1., 1., 0., 1., 0., 1., 1., 1., 0., 0.,
0., 1., 1., 1., 1., 1., 1., 1., 1., 1., 0., 0., 1.])
# Method 1: compute the misclassification rate
def CalError(test_label, pre):
    """Print and return the fraction of predictions that differ from the truth.

    Elements are compared by position, so a pandas Series with a shuffled
    index is handled the same way as a list or array.

    Args:
        test_label: true labels; must hold at least ``len(pre)`` items.
        pre: predicted labels.

    Returns:
        The number of mismatches divided by ``len(pre)``.

    Raises:
        IndexError: if ``test_label`` is shorter than ``pre``.
        ZeroDivisionError: if ``pre`` is empty.
    """
    if len(test_label) < len(pre):
        raise IndexError('test_label has fewer items than pre')
    # zip walks both sequences positionally; indexing test_label[i] would be a
    # label lookup on a pandas Series.
    error_num = sum(1 for truth, guess in zip(test_label, pre) if truth != guess)
    error_ratio = error_num / len(pre)
    print('分错率为:%.2f' % error_ratio)
    return error_ratio
CalError(y_test, pre)
# export_graphviz writes Graphviz DOT text, not a PDF, so the file is named .dot.
# It returns None when out_file is given, so tree_pic is None here.
tree_pic = export_graphviz(clf, out_file="mytree1.dot")
with open('mytree1.dot', encoding='utf-8') as f:
    dot_graph = f.read()
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
accepts = pd.read_csv('./data/accepts.csv')
(5845, 25)
application_id | account_number | bad_ind | vehicle_year | vehicle_make | bankruptcy_ind | tot_derog | tot_tr | age_oldest_tr | tot_open_tr | ... | purch_price | msrp | down_pyt | loan_term | loan_amt | ltv | tot_income | veh_mileage | used_ind | weight | |
0 | 2314049 | 11613 | 1 | 1998.0 | FORD | N | 7.0 | 9.0 | 64.0 | 2.0 | ... | 17200.00 | 17350.0 | 0.00 | 36 | 17200.00 | 99.0 | 6550.00 | 24000.0 | 1 | 1.00 |
1 | 63539 | 13449 | 0 | 2000.0 | DAEWOO | N | 0.0 | 21.0 | 240.0 | 11.0 | ... | 19588.54 | 19788.0 | 683.54 | 60 | 19588.54 | 99.0 | 4666.67 | 22.0 | 0 | 4.75 |
2 | 7328510 | 14323 | 1 | 1998.0 | PLYMOUTH | N | 7.0 | 10.0 | 60.0 | NaN | ... | 13595.00 | 11450.0 | 0.00 | 60 | 10500.00 | 92.0 | 2000.00 | 19600.0 | 1 | 1.00 |
3 | 8725187 | 15359 | 1 | 1997.0 | FORD | N | 3.0 | 10.0 | 35.0 | 5.0 | ... | 12999.00 | 12100.0 | 3099.00 | 60 | 10800.00 | 118.0 | 1500.00 | 10000.0 | 1 | 1.00 |
4 | 4275127 | 15812 | 0 | 2000.0 | TOYOTA | N | 0.0 | 10.0 | 104.0 | 2.0 | ... | 26328.04 | 22024.0 | 0.00 | 60 | 26328.04 | 122.0 | 4144.00 | 14.0 | 0 | 4.75 |
5 rows × 25 columns
application_id 0
account_number 0
bad_ind 0
vehicle_year 1
vehicle_make 299
bankruptcy_ind 217
tot_derog 213
tot_tr 213
age_oldest_tr 216
tot_open_tr 1419
tot_rev_tr 638
tot_rev_debt 478
tot_rev_line 478
rev_util 0
fico_score 314
purch_price 0
msrp 1
down_pyt 0
loan_term 0
loan_amt 0
ltv 1
tot_income 5
veh_mileage 1
used_ind 0
weight 0
dtype: int64
accepts = accepts.dropna(axis = 0, how='any')
(4105, 25)
application_id | account_number | bad_ind | vehicle_year | vehicle_make | bankruptcy_ind | tot_derog | tot_tr | age_oldest_tr | tot_open_tr | ... | purch_price | msrp | down_pyt | loan_term | loan_amt | ltv | tot_income | veh_mileage | used_ind | weight | |
0 | 2314049 | 11613 | 1 | 1998.0 | FORD | N | 7.0 | 9.0 | 64.0 | 2.0 | ... | 17200.00 | 17350.0 | 0.00 | 36 | 17200.00 | 99.0 | 6550.00 | 24000.0 | 1 | 1.00 |
1 | 63539 | 13449 | 0 | 2000.0 | DAEWOO | N | 0.0 | 21.0 | 240.0 | 11.0 | ... | 19588.54 | 19788.0 | 683.54 | 60 | 19588.54 | 99.0 | 4666.67 | 22.0 | 0 | 4.75 |
3 | 8725187 | 15359 | 1 | 1997.0 | FORD | N | 3.0 | 10.0 | 35.0 | 5.0 | ... | 12999.00 | 12100.0 | 3099.00 | 60 | 10800.00 | 118.0 | 1500.00 | 10000.0 | 1 | 1.00 |
4 | 4275127 | 15812 | 0 | 2000.0 | TOYOTA | N | 0.0 | 10.0 | 104.0 | 2.0 | ... | 26328.04 | 22024.0 | 0.00 | 60 | 26328.04 | 122.0 | 4144.00 | 14.0 | 0 | 4.75 |
5 | 8712513 | 16979 | 0 | 2000.0 | DODGE | Y | 2.0 | 15.0 | 136.0 | 4.0 | ... | 26272.72 | 26375.0 | 0.00 | 36 | 26272.72 | 100.0 | 5400.00 | 1.0 | 0 | 4.75 |
5 rows × 25 columns
target = accepts['bad_ind']
# .ix is deprecated (removed in pandas 1.0); .loc performs the same label-based,
# end-inclusive column slice. .copy() keeps the edits below off `accepts`.
data = accepts.loc[:, 'bankruptcy_ind':'used_ind'].copy()
# Loan-to-income ratio, capped at 10.
data['lti_temp'] = data['loan_amt'] / data['tot_income']
data['lti_temp'] = data['lti_temp'].map(lambda x: 10 if x >= 10 else x)
del data['loan_amt']
# Encode the bankruptcy flag numerically.
data['bankruptcy_ind'] = data['bankruptcy_ind'].map({'N': 0, 'Y': 1})
data.head()  # every column has to be numeric
(4105, 19)
/Users/apple/anaconda3/lib/python3.6/site-packages/ipykernel/__main__.py:2: DeprecationWarning:
.ix is deprecated. Please use
.loc for label based indexing or
.iloc for positional indexing
See the documentation here:
from ipykernel import kernelapp as app
bankruptcy_ind | tot_derog | tot_tr | age_oldest_tr | tot_open_tr | tot_rev_tr | tot_rev_debt | tot_rev_line | rev_util | fico_score | purch_price | msrp | down_pyt | loan_term | ltv | tot_income | veh_mileage | used_ind | lti_temp | |
0 | 0 | 7.0 | 9.0 | 64.0 | 2.0 | 1.0 | 506.0 | 500.0 | 101 | 650.0 | 17200.00 | 17350.0 | 0.00 | 36 | 99.0 | 6550.00 | 24000.0 | 1 | 2.625954 |
1 | 0 | 0.0 | 21.0 | 240.0 | 11.0 | 7.0 | 34605.0 | 57241.0 | 60 | 649.0 | 19588.54 | 19788.0 | 683.54 | 60 | 99.0 | 4666.67 | 22.0 | 0 | 4.197541 |
3 | 0 | 3.0 | 10.0 | 35.0 | 5.0 | 4.0 | 4019.0 | 5946.0 | 68 | 603.0 | 12999.00 | 12100.0 | 3099.00 | 60 | 118.0 | 1500.00 | 10000.0 | 1 | 7.200000 |
4 | 0 | 0.0 | 10.0 | 104.0 | 2.0 | 0.0 | 0.0 | 1800.0 | 0 | 764.0 | 26328.04 | 22024.0 | 0.00 | 60 | 122.0 | 4144.00 | 14.0 | 0 | 6.353292 |
5 | 1 | 2.0 | 15.0 | 136.0 | 4.0 | 3.0 | 3651.0 | 5747.0 | 64 | 680.0 | 26272.72 | 26375.0 | 0.00 | 36 | 100.0 | 5400.00 | 1.0 | 0 | 4.865319 |
针对有序的:可以直接变成连续型数据: 1 2 3 4 5这样
from sklearn.model_selection import train_test_split
# Hold out 20% of the rows for testing; random_state fixes the split.
X_train, X_test, y_train, y_test = train_test_split(data, target,
random_state = 23, test_size = 0.2)
print(X_train.shape, X_test.shape, y_train.shape, y_test.shape)
(3284, 19) (821, 19) (3284,) (821,)
from sklearn.tree import DecisionTreeClassifier
clf = DecisionTreeClassifier(criterion='gini', # split-quality criterion
max_depth=3, # maximum depth of the tree
class_weight=None, # every class label gets the same weight
random_state= 23)
clf.fit(X_train, y_train)
DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=3,
max_features=None, max_leaf_nodes=None,
min_impurity_decrease=0.0, min_impurity_split=None,
min_samples_leaf=1, min_samples_split=2,
min_weight_fraction_leaf=0.0, presort=False, random_state=23,
import sklearn.metrics as metrics
print(metrics.classification_report(y_test, clf.predict(X_test)))
precision recall f1-score support
0 0.82 1.00 0.90 669
1 0.50 0.02 0.04 152
micro avg 0.81 0.81 0.81 821
macro avg 0.66 0.51 0.47 821
weighted avg 0.76 0.81 0.74 821
# Weight class 1 three times as much as class 0, refit, and report again.
clf.set_params(**{'class_weight': {0:1, 1:3}})
clf.fit(X_train, y_train)
print(metrics.classification_report(y_test, clf.predict(X_test)))
precision recall f1-score support
0 0.92 0.65 0.76 669
1 0.33 0.74 0.45 152
micro avg 0.67 0.67 0.67 821
macro avg 0.62 0.70 0.61 821
weighted avg 0.81 0.67 0.71 821
# 看不到列名
array([0. , 0. , 0. , 0. , 0. ,
0. , 0. , 0.18723539, 0. , 0.66368633,
0. , 0. , 0. , 0. , 0.13392455,
0. , 0. , 0. , 0.01515373])
list(zip(data.columns, clf.feature_importances_))
[('bankruptcy_ind', 0.0),
('tot_derog', 0.0),
('tot_tr', 0.0),
('age_oldest_tr', 0.0),
('tot_open_tr', 0.0),
('tot_rev_tr', 0.0),
('tot_rev_debt', 0.0),
('tot_rev_line', 0.18723538913454854),
('rev_util', 0.0),
('fico_score', 0.6636863306714282),
('purch_price', 0.0),
('msrp', 0.0),
('down_pyt', 0.0),
('loan_term', 0.0),
('ltv', 0.133924553427229),
('tot_income', 0.0),
('veh_mileage', 0.0),
('used_ind', 0.0),
('lti_temp', 0.015153726766794278)]
def plot_feature_importances(feature_importances, title, feature_names):
    """Draw a bar chart of feature importances, largest first.

    Args:
        feature_importances: one importance score per feature.
        title: title of the chart.
        feature_names: feature names, in the same order as the scores.
    """
    importances = np.asarray(feature_importances, dtype=float)
    # Scale so the largest score is 100; skip when every score is zero to
    # avoid dividing by zero.
    peak = importances.max() if importances.size else 0.0
    if peak > 0:
        importances = 100.0 * (importances / peak)
    # Indices of the scores from highest to lowest.
    index_sorted = np.flipud(np.argsort(importances))
    # Offset by 0.5 so the tick labels sit under the middle of each bar.
    pos = np.arange(index_sorted.shape[0]) + 0.5
    plt.bar(pos, importances[index_sorted], align='center')
    # Rotate the labels 90 degrees so long names stay readable.
    plt.xticks(pos, np.array(feature_names)[index_sorted], rotation=90)
    plt.ylabel('Relative Importance')
    plt.title(title)
# Plot the importances of the fitted tree against the column names.
features = data.columns.tolist()
plot_feature_importances(clf.feature_importances_, 'DT', features)
import pydotplus
from IPython.display import Image
import sklearn.tree as tree
# Export the tree as DOT text with feature and class names, then build a graph from it.
dot_data = tree.export_graphviz(clf, out_file=None, feature_names=data.columns,
class_names=['0', '1'], filled=True)
graph = pydotplus.graph_from_dot_data(dot_data)
# Same export without names; this rebinds dot_data and graph from the lines above.
dot_data = tree.export_graphviz(clf, out_file = None)
graph = pydotplus.graph_from_dot_data(dot_data)
from sklearn.model_selection import ParameterGrid, GridSearchCV
# Candidate values for each hyper-parameter.
max_depth = [None,]
max_leaf_nodes = np.arange(5,10,1)
class_weight = [{0:1, 1:2}, {0:1, 1:3}]
param_grid = {'max_depth':max_depth,
'max_leaf_nodes' :max_leaf_nodes,
'class_weight' :class_weight}
clf_cv = GridSearchCV(estimator = clf,
param_grid = param_grid,
cv = 5,
scoring = 'roc_auc')
# 50 model fits in total (1 x 5 x 2 parameter combinations, 5 folds each); the
# search finds the combination with the best cross-validated score.
# After fitting, the search object can be evaluated directly as the best model.
clf_cv.fit(X_train, y_train)
GridSearchCV(cv=5, error_score='raise-deprecating',
estimator=DecisionTreeClassifier(class_weight={0: 1, 1: 3}, criterion='gini',
max_depth=3, max_features=None, max_leaf_nodes=None,
min_impurity_decrease=0.0, min_impurity_split=None,
min_samples_leaf=1, min_samples_split=2,
min_weight_fraction_leaf=0.0, presort=False, random_state=23,
fit_params=None, iid='warn', n_jobs=None,
param_grid={'max_depth': [None], 'max_leaf_nodes': array([5, 6, 7, 8, 9]), 'class_weight': [{0: 1, 1: 2}, {0: 1, 1: 3}]},
pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
scoring='roc_auc', verbose=0)
array([5, 6, 7, 8, 9])
{'class_weight': {0: 1, 1: 3}, 'max_depth': None, 'max_leaf_nodes': 8}
print(metrics.classification_report(y_test, clf_cv.predict(X_test)))
precision recall f1-score support
0 0.89 0.70 0.78 669
1 0.32 0.63 0.42 152
micro avg 0.68 0.68 0.68 821
macro avg 0.61 0.66 0.60 821
weighted avg 0.79 0.68 0.72 821
from sklearn.metrics import roc_curve, auc
from sklearn.model_selection import cross_val_predict
def plot_roc(train_x, train_y, test_x, test_y, clf):
    """Fit ``clf``, plot its ROC curve on the test data and return the AUC.

    Args:
        train_x: training features.
        train_y: training labels; flattened to 1-D before fitting.
        test_x: test features.
        test_y: test labels; flattened to 1-D before scoring.
        clf: estimator whose ``fit`` returns the fitted estimator and which
            offers ``decision_function`` or ``predict_proba``.

    Returns:
        The area under the ROC curve.
    """
    fitted = clf.fit(train_x, np.ravel(train_y))
    # Tree-based estimators have no decision_function; fall back to the
    # predicted probability of the second class as the ranking score.
    if hasattr(fitted, 'decision_function'):
        y_score = fitted.decision_function(test_x)
    else:
        y_score = fitted.predict_proba(test_x)[:, 1]
    # True-positive and false-positive rates across thresholds.
    fpr, tpr, _ = roc_curve(np.ravel(test_y), y_score)
    roc_auc = auc(fpr, tpr)
    # Plot: false-positive rate on x, true-positive rate on y.
    lw = 2
    plt.plot(fpr, tpr, color='darkorange',
             lw=lw, label='ROC curve (area = %0.2f)' % roc_auc)
    plt.plot([0, 1], [0, 1], color='navy', lw=lw, linestyle='--')
    plt.xlim([0.0, 1.0])
    plt.ylim([0.0, 1.05])
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title('ROC Curve')
    plt.legend(loc="lower right")
    return roc_auc
# Plot the ROC curve of the grid-searched model; labels are passed as column vectors.
plot_roc(X_train.values, y_train.values.reshape(-1,1),
X_test.values, y_test.values.reshape(-1,1), clf_cv)
# Inspect the model's predictions
train_est = clf_cv.predict(X_train) # predicted classes on the training set
train_est_p=clf_cv.predict_proba(X_train)[:,1] # predicted probabilities on the training set
test_est=clf_cv.predict(X_test) # predicted classes on the test set
test_est_p=clf_cv.predict_proba(X_test)[:,1] # predicted probabilities on the test set
fpr_test, tpr_test, th_test = metrics.roc_curve(y_test, test_est_p)
fpr_train, tpr_train, th_train = metrics.roc_curve(y_train, train_est_p)
plt.plot(fpr_test, tpr_test, color='blue') # ROC curve of the test set
plt.plot(fpr_train, tpr_train, color='red') # ROC curve of the training set
plt.title('ROC curve')
# Report the AUC on the test set
print(metrics.roc_auc_score(y_test, test_est_p))