import csv  # reads the raw CSV file

import pandas as pd
import pydotplus
from sklearn import preprocessing
from sklearn import tree
from sklearn.feature_extraction import DictVectorizer  # builds the dummy (one-hot) variables
# Open the raw CSV. The handle is left open on purpose: `reader` pulls rows
# from it lazily, and a later loop in this script reads from `reader` again.
# NOTE(review): no encoding is passed, so the platform default applies; the
# commented-out pandas load of this same file uses GB18030 - confirm which
# encoding the file really has.
melon = open(r".../data/watermelon_3.csv","rt")
reader = csv.reader(melon)  # row iterator; each row is a list of strings
headers = next(reader)  # first row holds the column names (Python 2 spelled this reader.next())
print(headers)
# ['编号', '色泽', '根蒂', '敲声', '纹理', '脐部', '触感', '密度', '含糖率', '好瓜']
# Preview the data rows. This consumes `reader`: once the loop ends the
# iterator is exhausted and yields nothing more unless the file is rewound.
for row in reader:
    print(row)
# ['1', '青绿', '蜷缩', '浊响', '清晰', '凹陷', '硬滑', '0.697', '0.46', '是']
# ['2', '乌黑', '蜷缩', '沉闷', '清晰', '凹陷', '硬滑', '0.774', '0.376', '是']
# ['3', '乌黑', '蜷缩', '浊响', '清晰', '凹陷', '硬滑', '0.634', '0.264', '是']
# ['4', '青绿', '蜷缩', '沉闷', '清晰', '凹陷', '硬滑', '0.608', '0.318', '是']
# ['5', '浅白', '蜷缩', '浊响', '清晰', '凹陷', '硬滑', '0.556', '0.215', '是']
# ['6', '青绿', '稍蜷', '浊响', '清晰', '稍凹', '软粘', '0.403', '0.237', '是']
# ['7', '乌黑', '稍蜷', '浊响', '稍糊', '稍凹', '软粘', '0.481', '0.149', '是']
# ['8', '乌黑', '稍蜷', '浊响', '清晰', '稍凹', '硬滑', '0.437', '0.211', '是']
# ['9', '乌黑', '稍蜷', '沉闷', '稍糊', '稍凹', '硬滑', '0.666', '0.091', '否']
# ['10', '青绿', '硬挺', '清脆', '清晰', '平坦', '软粘', '0.243', '0.267', '否']
# ['11', '浅白', '硬挺', '清脆', '模糊', '平坦', '硬滑', '0.245', '0.057', '否']
# ['12', '浅白', '蜷缩', '浊响', '模糊', '平坦', '软粘', '0.343', '0.099', '否']
# ['13', '青绿', '稍蜷', '浊响', '稍糊', '凹陷', '硬滑', '0.639', '0.161', '否']
# ['14', '浅白', '稍蜷', '沉闷', '稍糊', '凹陷', '硬滑', '0.657', '0.198', '否']
# ['15', '乌黑', '稍蜷', '浊响', '清晰', '稍凹', '软粘', '0.36', '0.37', '否']
# ['16', '浅白', '蜷缩', '浊响', '模糊', '平坦', '硬滑', '0.593', '0.042', '否']
# ['17', '青绿', '蜷缩', '沉闷', '稍糊', '稍凹', '硬滑', '0.719', '0.103', '否']
# melon = pd.read_csv(".../data/watermelon_3.csv",encoding='GB18030')
# headers = list(melon.columns )
# print(headers)
# ['编号', '色泽', '根蒂', '敲声', '纹理', '脐部', '触感', '密度', '含糖率', '好瓜']
featureList = []
labelList = []
# Collect the samples into two parallel lists: one {column name: value} dict
# per row, and the matching label.
# `reader` may already have been consumed by an earlier pass over the file, so
# rewind the underlying handle and skip the header line again; without this
# the loop below sees no rows and both lists stay empty.
melon.seek(0)
next(reader)
for row in reader:
    if not row:
        # csv.reader yields [] for a blank line; there is nothing to record.
        continue
    labelList.append(row[-1])  # the last column is the label
    # Leave out column 0 and the label column; key the rest by column name.
    featureList.append({headers[i]: row[i] for i in range(1, len(row) - 1)})
# Every row has been read, so the file handle is no longer needed.
melon.close()
print(featureList )
print(labelList )
# pandas variant of the same extraction. The code below needs `melon` to be a
# DataFrame (.shape / .loc), so load it here; this is the read_csv call that
# is otherwise only present in commented-out form earlier in the file.
melon = pd.read_csv(".../data/watermelon_3.csv",encoding='GB18030')
headers = list(melon.columns )
featureList = []
labelList = []
# Collect the samples into two parallel lists: one {column name: value} dict
# per row, and the matching label.
for i in range(melon.shape[0]):
    labelList.append(melon['好瓜'][i])
    row = melon.loc[i]  # fetch the row once instead of once per column
    # Leave out the first column and the last (label) column. .iloc makes the
    # positional lookup explicit rather than relying on integer keys being
    # treated as positions on a string-labelled Series.
    featureList.append({headers[j]: row.iloc[j] for j in range(1, melon.shape[1] - 1)})
print(featureList )
print(labelList )
# [{'色泽': '青绿', '根蒂': '蜷缩', '敲声': '浊响', '纹理': '清晰', '脐部': '凹陷', '触感': '硬滑', '密度': '0.697', '含糖率': '0.46'},
# {'色泽': '乌黑', '根蒂': '蜷缩', '敲声': '沉闷', '纹理': '清晰', '脐部': '凹陷', '触感': '硬滑', '密度': '0.774', '含糖率': '0.376'},
# {'色泽': '乌黑', '根蒂': '蜷缩', '敲声': '浊响', '纹理': '清晰', '脐部': '凹陷', '触感': '硬滑', '密度': '0.634', '含糖率': '0.264'},
# {'色泽': '青绿', '根蒂': '蜷缩', '敲声': '沉闷', '纹理': '清晰', '脐部': '凹陷', '触感': '硬滑', '密度': '0.608', '含糖率': '0.318'},
# {'色泽': '浅白', '根蒂': '蜷缩', '敲声': '浊响', '纹理': '清晰', '脐部': '凹陷', '触感': '硬滑', '密度': '0.556', '含糖率': '0.215'},
# {'色泽': '青绿', '根蒂': '稍蜷', '敲声': '浊响', '纹理': '清晰', '脐部': '稍凹', '触感': '软粘', '密度': '0.403', '含糖率': '0.237'},
# {'色泽': '乌黑', '根蒂': '稍蜷', '敲声': '浊响', '纹理': '稍糊', '脐部': '稍凹', '触感': '软粘', '密度': '0.481', '含糖率': '0.149'},
# {'色泽': '乌黑', '根蒂': '稍蜷', '敲声': '浊响', '纹理': '清晰', '脐部': '稍凹', '触感': '硬滑', '密度': '0.437', '含糖率': '0.211'},
# {'色泽': '乌黑', '根蒂': '稍蜷', '敲声': '沉闷', '纹理': '稍糊', '脐部': '稍凹', '触感': '硬滑', '密度': '0.666', '含糖率': '0.091'},
# {'色泽': '青绿', '根蒂': '硬挺', '敲声': '清脆', '纹理': '清晰', '脐部': '平坦', '触感': '软粘', '密度': '0.243', '含糖率': '0.267'},
# {'色泽': '浅白', '根蒂': '硬挺', '敲声': '清脆', '纹理': '模糊', '脐部': '平坦', '触感': '硬滑', '密度': '0.245', '含糖率': '0.057'},
# {'色泽': '浅白', '根蒂': '蜷缩', '敲声': '浊响', '纹理': '模糊', '脐部': '平坦', '触感': '软粘', '密度': '0.343', '含糖率': '0.099'},
# {'色泽': '青绿', '根蒂': '稍蜷', '敲声': '浊响', '纹理': '稍糊', '脐部': '凹陷', '触感': '硬滑', '密度': '0.639', '含糖率': '0.161'},
# {'色泽': '浅白', '根蒂': '稍蜷', '敲声': '沉闷', '纹理': '稍糊', '脐部': '凹陷', '触感': '硬滑', '密度': '0.657', '含糖率': '0.198'},
# {'色泽': '乌黑', '根蒂': '稍蜷', '敲声': '浊响', '纹理': '清晰', '脐部': '稍凹', '触感': '软粘', '密度': '0.36', '含糖率': '0.37'},
# {'色泽': '浅白', '根蒂': '蜷缩', '敲声': '浊响', '纹理': '模糊', '脐部': '平坦', '触感': '硬滑', '密度': '0.593', '含糖率': '0.042'},
# {'色泽': '青绿', '根蒂': '蜷缩', '敲声': '沉闷', '纹理': '稍糊', '脐部': '稍凹', '触感': '硬滑', '密度': '0.719', '含糖率': '0.103'}]
# ['是', '是', '是', '是', '是', '是', '是', '是', '否', '否', '否', '否', '否', '否', '否', '否', '否']
# Turn the list of feature dicts into a numeric matrix (dummy / one-hot
# variables for the string-valued features).
v = DictVectorizer(sparse=True)  # sparse=True: fit_transform returns a sparse matrix
dummyX = v.fit_transform(featureList).toarray()  # densify for printing and for the tree
print("dummyX:",dummyX )
# feature_names_ is the fitted list of output column names. It is used instead
# of get_feature_names(), which newer scikit-learn releases no longer provide.
print(v.feature_names_)
print(f"labelList:{labelList}")
# dummyX: [[0.46 0.697 0. 1. 0. 0. 0. 1. 0. 1. 0. 1.
# 0. 0. 0. 0. 1. 1. 0. ]
# [0.376 0.774 1. 0. 0. 0. 0. 1. 0. 1. 0. 1.
# 0. 0. 1. 0. 0. 1. 0. ]
# [0.264 0.634 0. 1. 0. 0. 0. 1. 0. 1. 0. 1.
# 0. 0. 1. 0. 0. 1. 0. ]
# [0.318 0.608 1. 0. 0. 0. 0. 1. 0. 1. 0. 1.
# 0. 0. 0. 0. 1. 1. 0. ]
# [0.215 0.556 0. 1. 0. 0. 0. 1. 0. 1. 0. 1.
# 0. 0. 0. 1. 0. 1. 0. ]
# [0.237 0.403 0. 1. 0. 0. 1. 0. 0. 1. 0. 0.
# 0. 1. 0. 0. 1. 0. 1. ]
# [0.149 0.481 0. 1. 0. 0. 1. 0. 0. 0. 1. 0.
# 0. 1. 1. 0. 0. 0. 1. ]
# [0.211 0.437 0. 1. 0. 0. 1. 0. 0. 1. 0. 0.
# 0. 1. 1. 0. 0. 1. 0. ]
# [0.091 0.666 1. 0. 0. 0. 1. 0. 0. 0. 1. 0.
# 0. 1. 1. 0. 0. 1. 0. ]
# [0.267 0.243 0. 0. 1. 1. 0. 0. 0. 1. 0. 0.
# 1. 0. 0. 0. 1. 0. 1. ]
# [0.057 0.245 0. 0. 1. 1. 0. 0. 1. 0. 0. 0.
# 1. 0. 0. 1. 0. 1. 0. ]
# [0.099 0.343 0. 1. 0. 0. 0. 1. 1. 0. 0. 0.
# 1. 0. 0. 1. 0. 0. 1. ]
# [0.161 0.639 0. 1. 0. 0. 1. 0. 0. 0. 1. 1.
# 0. 0. 0. 0. 1. 1. 0. ]
# [0.198 0.657 1. 0. 0. 0. 1. 0. 0. 0. 1. 1.
# 0. 0. 0. 1. 0. 1. 0. ]
# [0.37 0.36 0. 1. 0. 0. 1. 0. 0. 1. 0. 0.
# 0. 1. 1. 0. 0. 0. 1. ]
# [0.042 0.593 0. 1. 0. 0. 0. 1. 1. 0. 0. 0.
# 1. 0. 0. 1. 0. 1. 0. ]
# [0.103 0.719 1. 0. 0. 0. 0. 1. 0. 0. 1. 0.
# 0. 1. 0. 0. 1. 1. 0. ]]
# ['含糖率', '密度', '敲声=沉闷', '敲声=浊响', '敲声=清脆', '根蒂=硬挺', '根蒂=稍蜷', '根蒂=蜷缩', '纹理=模糊', '纹理=清晰', '纹理=稍糊',
# '脐部=凹陷', '脐部=平坦', '脐部=稍凹', '色泽=乌黑', '色泽=浅白', '色泽=青绿', '触感=硬滑', '触感=软粘']
# labelList:['是', '是', '是', '是', '是', '是', '是', '是', '否', '否', '否', '否', '否', '否', '否', '否', '否']
# Binarize the string labels into a single 0/1 column for the classifier.
lb = preprocessing.LabelBinarizer().fit(labelList)
dummyY = lb.transform(labelList)
print(f"dummyY:{dummyY}")
# dummyY:[[1]
# [1]
# [1]
# [1]
# [1]
# [1]
# [1]
# [1]
# [0]
# [0]
# [0]
# [0]
# [0]
# [0]
# [0]
# [0]
# [0]]
# Create the classifier. criterion="entropy" selects splits by information
# gain (the measure ID3 is built on); "gini" is the usual alternative.
clf = tree.DecisionTreeClassifier(criterion="entropy")
clf = clf.fit(dummyX, dummyY)
# Prediction: take the first training row and tweak two of its columns.
oneRowX = dummyX[0, :]
print("oneRowX:" + str(oneRowX))
# Copy before editing. dummyX[0, :] is a view, so a plain assignment would
# make newRowX an alias and the writes below would overwrite both oneRowX
# and row 0 of the training matrix.
newRowX = oneRowX.copy()
newRowX[0] = 1
newRowX[2] = 0
print("newRowX:" + str(newRowX))
predictedY = clf.predict([newRowX])  # predict expects a 2-D batch, hence the list wrapper
print("predictedY:" + str(predictedY))
# oneRowX:[1. 0.697 0. 1. 0. 0. 0. 1. 0. 1. 0. 1.
# 0. 0. 0. 0. 1. 1. 0. ]
# newRowX:[1. 0.697 0. 1. 0. 0. 0. 1. 0. 1. 0. 1.
# 0. 0. 0. 0. 1. 1. 0. ]
# predictedY:[1]
# NOTE: Chinese text is not displayed correctly; this has not been resolved yet.
# Switch every column name of melon to English first, so that the rendered
# graph is as readable as possible.
column_names_en = {
    '编号': 'num',
    '色泽': 'color',
    '根蒂': 'root',
    '敲声': 'sound',
    '纹理': 'texture',
    '脐部': 'navel',
    '触感': 'touch',
    '密度': 'density',
    '含糖率': 'sugar',
    '好瓜': 'G&B',
}
melon.rename(columns=column_names_en, inplace=True)
# Relabel the target column in place, one original value at a time.
for zh_label, en_label in (('是', 'GOOD'), ('否', 'BAD')):
    melon.loc[melon['G&B'] == zh_label, 'G&B'] = en_label
# Visualize the model.
# feature_names_ is the fitted list of output column names. It is used instead
# of get_feature_names(), which newer scikit-learn releases no longer provide.
feature_names = v.feature_names_
# NOTE(review): "C..../data/tree1.dot" looks like an elided placeholder path -
# confirm the real location before running.
with open("C..../data/tree1.dot","w") as f:
    # With out_file given, the DOT text is written to the file; the return
    # value is not needed.
    tree.export_graphviz(clf, feature_names=feature_names, out_file=f)
# export_graphviz expects class_names in ascending order of the classifier's
# classes. Deriving them from lb.classes_ keeps each display name attached to
# the encoded label it belongs to; melon['G&B'].unique() is in
# first-appearance order (GOOD, BAD), which swaps the two leaf labels.
display_names = {'是': 'GOOD', '否': 'BAD'}
class_names = [display_names.get(label, str(label)) for label in lb.classes_]
dot_data = tree.export_graphviz(clf, out_file=None,
                                feature_names=feature_names,
                                class_names=class_names,
                                filled=True, rounded=True,
                                special_characters=True)
graph = pydotplus.graph_from_dot_data(dot_data)
# In the resulting figure, a node label such as root=□□ is the name of a
# dummy variable, which takes the value 0 or 1.
graph.write_pdf('.../data/tree1.pdf')
# Below is a complete visualization example based on the iris dataset.
from sklearn.datasets import load_iris
from sklearn import tree
import pydotplus
# Fit a default decision tree on the iris data and export it twice: once as a
# plain DOT file and once as a styled PDF.
iris = load_iris()
iris.feature_names  # notebook-style inspection; has no visible effect when run as a script
clf = tree.DecisionTreeClassifier()
clf_iris = clf.fit(iris.data, iris.target)  # fit returns the estimator itself
with open('.../data/iris.dot', 'w') as f:
    # With out_file given, the DOT text is written to the file; the return
    # value is not needed.
    tree.export_graphviz(clf_iris, out_file=f)
dot_data = tree.export_graphviz(clf_iris, out_file=None,
                                feature_names=iris.feature_names,
                                class_names=iris.target_names,
                                filled=True, rounded=True,
                                special_characters=True)
graph = pydotplus.graph_from_dot_data(dot_data)
graph.write_pdf('.../data/iris.pdf')
# Reference blog posts:
# https://blog.csdn.net/gamer_gyt/article/details/51226904
# https://blog.csdn.net/u012845311/article/details/77294973