RID,age,income,student,credit_rating,class_buys_computer
1,youth,high,no,fair,no
2,youth,high,no,excellent,no
3,middle_aged,high,no,fair,yes
4,senior,medium,no,fair,yes
5,senior,low,yes,fair,yes
6,senior,low,yes,excellent,no
7,middle_aged,low,yes,excellent,yes
8,youth,medium,no,fair,no
9,youth,low,yes,fair,yes
10,senior,medium,yes,fair,yes
11,youth,medium,yes,excellent,yes
12,middle_aged,medium,no,excellent,yes
13,middle_aged,high,yes,fair,yes
14,senior,medium,no,excellent,no
RID,age,income,student,credit_rating,class_buys_computer
1,youth,high,no,fair,no
2,youth,high,no,excellent,no
3,middle_aged,high,no,fair,yes
4,senior,medium,no,fair,yes
5,senior,low,yes,fair,yes
6,senior,low,yes,excellent,no
7,middle_aged,low,yes,excellent,yes
8,youth,medium,no,fair,no
9,youth,low,yes,fair,yes
10,senior,medium,yes,fair,yes
11,youth,medium,yes,excellent,yes
12,middle_aged,medium,no,excellent,yes
13,youth,medium,no,excellent,yes
#coding=utf-8
#设置python编码
from sklearn.feature_extraction import DictVectorizer
import csv
import os
from sklearn import preprocessing
from sklearn import tree
from sklearn.externals.six import StringIO
# --- Data loading ---
def _read_labelled_rows(csv_path, field_names=None):
    """Read a labelled CSV file into per-row feature dicts and a label list.

    The first line of the file is consumed as a header. The first column
    (the record id) is left out of the features and the last column is
    taken as the class label; every column in between becomes a
    ``{field name: value}`` entry of that row's feature dict.

    Args:
        csv_path: Path of the CSV file to read.
        field_names: Column names used as feature-dict keys. When None,
            the file's own header line is used.

    Returns:
        A ``(field_names, feature_dicts, labels)`` tuple.
    """
    feature_dicts = []
    labels = []
    # The csv module on Python 2 expects the file to be opened in binary mode.
    with open(csv_path, 'rb') as csv_file:
        reader = csv.reader(csv_file)
        file_header = next(reader)
        if field_names is None:
            field_names = file_header
        for row in reader:
            if not row:
                # Skip blank lines (e.g. a trailing newline at end of file).
                continue
            labels.append(row[-1])
            # Start at index 1 so the record id does not become a feature.
            feature_dicts.append(
                {field_names[i]: row[i] for i in range(1, len(row) - 1)}
            )
    return field_names, feature_dicts, labels


# Column names, per-row feature dicts and class labels of the training set.
headers, featureList, labelList = _read_labelled_rows(r'./AllElectronics.csv')
print("headers : " ,headers)

# --- Preprocessing ---
# scikit-learn only accepts numeric input, so each categorical value is
# one-hot encoded, e.g. age=youth becomes a 1 in the 'age=youth' column
# and 0 in the other age columns.
vec = DictVectorizer()
dummyX = vec.fit_transform(featureList).toarray()
print("dummyX : " + str(dummyX))
print ("feature mapping : " + str(vec.get_feature_names()))

# Encode the class labels with scikit-learn's LabelBinarizer.
lb = preprocessing.LabelBinarizer()
dummyY = lb.fit_transform(labelList)
# print("dummyY : " + str(dummyY))

# --- Training ---
# Decision tree that uses entropy as the split criterion.
clf = tree.DecisionTreeClassifier(criterion='entropy')
clf = clf.fit(dummyX,dummyY)
print ("clf: " + str(clf))

# --- Export ---
# Write the tree in Graphviz dot format, then render it to PDF.
with open('DTreeData.dot','w') as f:
    tree.export_graphviz(clf, feature_names=vec.get_feature_names(), out_file=f)
# NOTE(review): the .dot file above is written relative to the current
# working directory while `dot` is given absolute paths; they only refer to
# the same file when the script is run from that directory.
os.system(r"dot -Tpdf D:\Data\MyCode\codepython\ML_Base_Demo\DecisionTree\DTreeData.dot -o D:\Data\MyCode\codepython\ML_Base_Demo\DecisionTree\DTree.pdf")

# --- Prediction ---
# Example of predicting a single hand-modified training row:
# oneRow = dummyX[1,:]
# print ("one row : " + str(oneRow))
#
# newRow = oneRow
# newRow[0] = 1
# newRow[2] = 0
# print("new row x : " + str(newRow))
#
# predictedY = clf.predict(newRow)
#
# print ("predict result : " + str(predictedY))

# The test rows are keyed by the training header names so that they map
# onto the same feature names. Their labels are kept in a separate list
# instead of being appended to the training labels.
_, testList, testLabelList = _read_labelled_rows(r'test_set.csv', field_names=headers)
# print testList

# Reuse the vectorizer fitted on the training data: fitting a new one on the
# test rows would derive the columns from the test set's own values, so the
# column order/count could differ from what the tree was trained on.
testX = vec.transform(testList).toarray()
print("testX : " + str(testX))
predictSet = clf.predict(testX)
print(predictSet)
('headers : ', ['RID', 'age', 'income', 'student', 'credit_rating', 'class_buys_computer'])
dummyX : [[ 0. 0. 1. 0. 1. 1. 0. 0. 1. 0.]
[ 0. 0. 1. 1. 0. 1. 0. 0. 1. 0.]
[ 1. 0. 0. 0. 1. 1. 0. 0. 1. 0.]
[ 0. 1. 0. 0. 1. 0. 0. 1. 1. 0.]
[ 0. 1. 0. 0. 1. 0. 1. 0. 0. 1.]
[ 0. 1. 0. 1. 0. 0. 1. 0. 0. 1.]
[ 1. 0. 0. 1. 0. 0. 1. 0. 0. 1.]
[ 0. 0. 1. 0. 1. 0. 0. 1. 1. 0.]
[ 0. 0. 1. 0. 1. 0. 1. 0. 0. 1.]
[ 0. 1. 0. 0. 1. 0. 0. 1. 0. 1.]
[ 0. 0. 1. 1. 0. 0. 0. 1. 0. 1.]
[ 1. 0. 0. 1. 0. 0. 0. 1. 1. 0.]
[ 1. 0. 0. 0. 1. 1. 0. 0. 0. 1.]
[ 0. 1. 0. 1. 0. 0. 0. 1. 1. 0.]]
feature mapping : ['age=middle_aged', 'age=senior', 'age=youth', 'credit_rating=excellent', 'credit_rating=fair', 'income=high', 'income=low', 'income=medium', 'student=no', 'student=yes']
clf: DecisionTreeClassifier(class_weight=None, criterion='entropy', max_depth=None,
max_features=None, max_leaf_nodes=None,
min_impurity_split=1e-07, min_samples_leaf=1,
min_samples_split=2, min_weight_fraction_leaf=0.0,
presort=False, random_state=None, splitter='best')
Error: Could not open "D:\Data\MyCode\codepython\ML_Base_Demo\DecisionTree\DTree.pdf" for writing : Permission denied
testX : [[ 0. 0. 1. 0. 1. 1. 0. 0. 1. 0.]
[ 0. 0. 1. 1. 0. 1. 0. 0. 1. 0.]
[ 1. 0. 0. 0. 1. 1. 0. 0. 1. 0.]
[ 0. 1. 0. 0. 1. 0. 0. 1. 1. 0.]
[ 0. 1. 0. 0. 1. 0. 1. 0. 0. 1.]
[ 0. 1. 0. 1. 0. 0. 1. 0. 0. 1.]
[ 1. 0. 0. 1. 0. 0. 1. 0. 0. 1.]
[ 0. 0. 1. 0. 1. 0. 0. 1. 1. 0.]
[ 0. 0. 1. 0. 1. 0. 1. 0. 0. 1.]
[ 0. 1. 0. 0. 1. 0. 0. 1. 0. 1.]
[ 0. 0. 1. 1. 0. 0. 0. 1. 0. 1.]
[ 1. 0. 0. 1. 0. 0. 0. 1. 1. 0.]
[ 0. 0. 1. 1. 0. 0. 0. 1. 1. 0.]]
[0 0 1 1 1 0 1 0 1 1 1 1 0]