Data: DataComputerBuys.csv
Data preprocessing:
# -*- coding:utf-8 -*-
from sklearn.feature_extraction import DictVectorizer
import csv
from sklearn import preprocessing
from sklearn import tree
allElectronicsData = open('DataComputerBuys.csv', 'rt')
reader = csv.reader(allElectronicsData)
headers = next(reader)
print(headers)
#['RID', 'age', 'income', 'student', 'credit_rating', 'Class_buys_computer']
featureList = []  # one dict of feature name -> feature value per sample
labelList = []    # the class label of each sample
for row in reader:
    labelList.append(row[-1])  # the last column of each row is the class label
    rowDict = {}
    for i in range(1, len(row) - 1):
        rowDict[headers[i]] = row[i]
    featureList.append(rowDict)
print(featureList)
#[{'age': 'youth', 'credit_rating': 'fair', 'student': 'no', 'income': 'high'}, {'age': 'youth', 'credit_rating': 'excellent', 'student': 'no', 'income': 'high'}, {'age': 'middle_aged', 'credit_rating': 'fair', 'student': 'no', 'income': 'high'}, {'age': 'senior', 'credit_rating': 'fair', 'student': 'no', 'income': 'medium'}, {'age': 'senior', 'credit_rating': 'fair', 'student': 'yes', 'income': 'low'}, {'age': 'senior', 'credit_rating': 'excellent', 'student': 'yes', 'income': 'low'}, {'age': 'middle_aged', 'credit_rating': 'excellent', 'student': 'yes', 'income': 'low'}, {'age': 'youth', 'credit_rating': 'fair', 'student': 'no', 'income': 'medium'}, {'age': 'youth', 'credit_rating': 'fair', 'student': 'yes', 'income': 'low'}, {'age': 'senior', 'credit_rating': 'fair', 'student': 'yes', 'income': 'medium'}, {'age': 'youth', 'credit_rating': 'excellent', 'student': 'yes', 'income': 'medium'}, {'age': 'middle_aged', 'credit_rating': 'excellent', 'student': 'no', 'income': 'medium'}, {'age': 'middle_aged', 'credit_rating': 'fair', 'student': 'yes', 'income': 'high'}, {'age': 'senior', 'credit_rating': 'excellent', 'student': 'no', 'income': 'medium'}]
print(labelList)
#['no', 'no', 'yes', 'yes', 'yes', 'no', 'yes', 'no', 'yes', 'yes', 'yes', 'yes', 'yes', 'no']
vec = DictVectorizer()
dummyX = vec.fit_transform(featureList).toarray()
print(vec.get_feature_names())  # in scikit-learn >= 1.0 this method is deprecated in favour of get_feature_names_out()
#['age=middle_aged', 'age=senior', 'age=youth', 'credit_rating=excellent', 'credit_rating=fair', 'income=high', 'income=low', 'income=medium', 'student=no', 'student=yes']
print("dummyX:", str(dummyX))
'''
dummyX: [[ 0. 0. 1. 0. 1. 1. 0. 0. 1. 0.]
[ 0. 0. 1. 1. 0. 1. 0. 0. 1. 0.]
[ 1. 0. 0. 0. 1. 1. 0. 0. 1. 0.]
[ 0. 1. 0. 0. 1. 0. 0. 1. 1. 0.]
[ 0. 1. 0. 0. 1. 0. 1. 0. 0. 1.]
[ 0. 1. 0. 1. 0. 0. 1. 0. 0. 1.]
[ 1. 0. 0. 1. 0. 0. 1. 0. 0. 1.]
[ 0. 0. 1. 0. 1. 0. 0. 1. 1. 0.]
[ 0. 0. 1. 0. 1. 0. 1. 0. 0. 1.]
[ 0. 1. 0. 0. 1. 0. 0. 1. 0. 1.]
[ 0. 0. 1. 1. 0. 0. 0. 1. 0. 1.]
[ 1. 0. 0. 1. 0. 0. 0. 1. 1. 0.]
[ 1. 0. 0. 0. 1. 1. 0. 0. 0. 1.]
[ 0. 1. 0. 1. 0. 0. 0. 1. 1. 0.]]
'''
#[ 0. 0. 1. 0. 1. 1. 0. 0. 1. 0.] is equivalent to 'age=youth', 'credit_rating=fair', 'income=high', 'student=no'
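# A quick way to double-check this mapping (an extra illustrative step, not in the
# original script): DictVectorizer.inverse_transform turns a one-hot row back into
# its feature dictionary, keeping only the non-zero entries.
print(vec.inverse_transform(dummyX[:1]))
# expected output along the lines of:
# [{'age=youth': 1.0, 'credit_rating=fair': 1.0, 'income=high': 1.0, 'student=no': 1.0}]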
print("labelList:", str(labelList))
#print the corresponding buys-computer / does-not-buy labels
#labelList: ['no', 'no', 'yes', 'yes', 'yes', 'no', 'yes', 'no', 'yes', 'yes', 'yes', 'yes', 'yes', 'no']
#convert the labels into 0 / 1
lb = preprocessing.LabelBinarizer()
dummyY = lb.fit_transform(labelList)
print("dummyY:", str(dummyY))
'''
dummyY: [[0]
[0]
[1]
[1]
[1]
[0]
[1]
[0]
[1]
[1]
[1]
[1]
[1]
[0]]
'''
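# The binarizer records the label names it saw, so the meaning of 0 and 1 can be
# read off directly (a small sanity check added here for illustration):
print("classes:", lb.classes_)
# expected: ['no' 'yes'], i.e. 0 stands for 'no' and 1 for 'yes'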
Building the decision tree:
#create a decision tree classifier; criterion='entropy' uses information entropy (the ID3/C4.5-style information-gain measure) to choose splits
clf = tree.DecisionTreeClassifier(criterion='entropy')
clf = clf.fit(dummyX, dummyY)
print("clf:", str(clf))
'''
clf: DecisionTreeClassifier(class_weight=None, criterion='entropy', max_depth=None,
max_features=None, max_leaf_nodes=None,
min_impurity_decrease=0.0, min_impurity_split=None,
min_samples_leaf=1, min_samples_split=2,
min_weight_fraction_leaf=0.0, presort=False, random_state=None,
splitter='best')
'''
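# A quick sanity check (not part of the original post): the fully grown tree should
# reproduce all 14 training samples, so the training accuracy is expected to be 1.0.
print("training accuracy:", clf.score(dummyX, dummyY))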
#pass the feature names to export_graphviz so the exported tree shows them instead of the numeric 0/1 column indices
with open('DataComputerBuys.dot', 'w') as f:
    tree.export_graphviz(clf, feature_names=vec.get_feature_names(), out_file=f)
The code above writes a DataComputerBuys.dot file into the same directory. First install graphviz from the console with
conda install graphviz
add its bin directory to the PATH environment variable and restart the machine. Then, in a console opened in that directory, convert the .dot file to a PDF with
dot -Tpdf DataComputerBuys.dot -o DataComputerBuys.pdf
Opening the PDF shows the following tree:
value = [5, 9] means that this node contains 5 'no' and 9 'yes' samples of Class_buys_computer, i.e. 5 people who do not buy a computer and 9 who do.
value = [0, 4] means 4 people who buy a computer.
value = [1, 0] means 1 person who does not buy a computer.
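If graphviz is not available, newer scikit-learn versions (0.21 and later) can also dump the same tree as plain text; a minimal sketch, assuming such a version is installed:
from sklearn.tree import export_text
print(export_text(clf, feature_names=vec.get_feature_names()))
Each printed line shows one split, e.g. "age=middle_aged <= 0.50", and the leaves report the predicted class.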
oneRowX = dummyX[0, :]
print("oneRowX:", str(oneRowX))
#oneRowX: [ 0. 0. 1. 0. 1. 1. 0. 0. 1. 0.]
newRowX = oneRowX.copy()  # copy so that dummyX itself is not modified
newRowX[0] = 1  # set age=middle_aged
newRowX[2] = 0  # clear age=youth
print("newRowX:", str(newRowX))
#newRowX: [ 1. 0. 0. 0. 1. 1. 0. 0. 1. 0.]
# represents 'age=middle_aged', 'credit_rating=fair', 'income=high', 'student=no'
predictedY = clf.predict([newRowX])
print("predictedY:", str(predictedY))
#predictedY: [1], i.e. this person is predicted to buy a computer
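# To read the prediction as the original label instead of 0/1, index the
# LabelBinarizer's classes_ array (an extra illustrative line, not in the original):
print("predicted label:", lb.classes_[predictedY[0]])
# expected: yes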
X_ = [[ 0, 1, 0, 0, 1, 0, 1, 0, 0, 1]]
# represents 'age=senior', 'credit_rating=fair', 'income=low', 'student=yes'
predictedX = clf.predict(X_)
print("predictX:", str(predictedX))
#predictX: [1],表示买电脑
Decision trees are usually built top-down: at each step the attribute that splits the data "best" is chosen, where "best" means that the resulting child nodes are as pure as possible. Different algorithms use different measures of purity; the most common are information gain (ID3, C4.5 and C5.0) and the Gini impurity index (CART).
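As a small worked example of the entropy measure, the impurity of the root node (9 'yes' versus 5 'no') can be computed from the standard formula; the helper below is only an illustration and not part of the original script:
import numpy as np

def entropy(counts):
    # Shannon entropy of a class distribution given as raw (positive) counts
    p = np.asarray(counts, dtype=float)
    p = p / p.sum()
    return -np.sum(p * np.log2(p))

print(entropy([9, 5]))  # about 0.940 bits, the root-node entropy of this dataset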
There is a lot more worth recording about building decision trees with scikit-learn; to be continued!