# Decision tree example
# import numpy as np
from sklearn.feature_extraction import DictVectorizer
# from sklearn.model_selection import train_test_split
import csv
from sklearn import tree
from sklearn import preprocessing
# from six import StringIO
import graphviz

# Read the CSV file, print its header row, and collect one feature dict and
# one class label per data row.
# The with-block closes the file even if parsing fails, and newline='' is what
# the csv module documentation requires for files handed to csv.reader.
with open(r'../file/AllElectronics.csv', 'rt', newline='') as allElectronicsData:
    reader = csv.reader(allElectronicsData)
    headers = next(reader)
    print(headers)

    # Disabled alternative kept for reference: convert the raw data to numeric
    # form and split it by column.
    # axis=1 means columns: split everything in `data` between the 4th and 5th
    # columns into an X set and a Y set.
    # x, y = np.split(data,(4,),axis=1)

    # Disabled alternative: split into training and test data for validation.
    # x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.3,random_state=2)
    # x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.3)

    featureList = []
    labelList = []
    for row in reader:
        # csv.reader yields an empty list for a blank line (e.g. a trailing
        # newline at the end of the file); indexing it would raise IndexError.
        if not row:
            continue
        # The last column is the class label.
        labelList.append(row[-1])
        # Columns between the first and the last are the features, keyed by
        # their header name; the first column is deliberately left out.
        featureList.append(
            {headers[i]: value for i, value in enumerate(row[1:-1], start=1)}
        )

print('labellist:', labelList)
print('featureList:', featureList )

# Convert the list of feature dicts into a dense feature matrix.
vec = DictVectorizer()
encodedFeatures = vec.fit_transform(featureList)
dummyX = encodedFeatures.toarray()
# inverse_transform can turn the feature matrix back into the original data:
# print(vec.inverse_transform(dummyX) == featureList)

# Rows are samples and columns are features; 0 means the sample does not have
# that feature.
print(f"dummyX: {dummyX!s}")
# Print the names of the feature columns.
print(vec.get_feature_names_out())

print(f"labelList: {labelList!s}")

# Binarize the class labels.
lb = preprocessing.LabelBinarizer(sparse_output=False)
lb.fit(labelList)
dummyY = lb.transform(labelList)
print(f"dummyY: {dummyY!s}")

# Using decision tree for classification.
# Train the tree with information entropy as the split criterion.
entropyTree = tree.DecisionTreeClassifier(criterion='entropy')
clf = entropyTree.fit(dummyX, dummyY)  # fit() returns the fitted estimator
print(f"clf: {clf!s}")

# Export the tree structure as Graphviz DOT text. No out_file is passed, so
# nothing is written to disk here; the text is kept in `f` for rendering later.
# (Disabled original: with open("allElectronicInformationGainOri.dot", 'w') as f:)
featureNames = vec.get_feature_names_out()
f = tree.export_graphviz(
    clf,
    feature_names=featureNames,
    filled=True,
    rounded=True,
)

# The importances reflect each feature's influence: the larger the value, the
# bigger the role that feature plays in the classification.
print('feature_importances_:', clf.feature_importances_)

# Build a test sample from the first training row and predict its class.
oneRowX = dummyX[0, :]
print("oneRowX: " + str(oneRowX))
# Copy before editing: dummyX[0, :] is a view into dummyX, so modifying it in
# place would silently overwrite the first row of the training matrix (and
# oneRowX along with it).
newRowX = oneRowX.copy()
# Indices refer to the columns listed by vec.get_feature_names_out();
# presumably this switches one categorical value for another - confirm
# against the printed feature names.
newRowX[0] = 1
newRowX[2] = 0
# predict() expects a 2-D input, so wrap the single sample in a list.
newRowX = [newRowX]
print("newRowX: " + str(newRowX))
predictedY = clf.predict(newRowX).reshape(-1, 1)
print("predictedY: " + str(predictedY))

# When a held-out test set is available, evaluate the model with the lines below.
# score = clf.score(x_test, y_test)  # returns the mean accuracy on the test set
# print(score)

# Wrap the exported DOT text and render it to a file named 'AllElectronics'.
graph = graphviz.Source(source=f)
# If the DOT text contains Chinese characters, use this line instead:
# graph = graphviz.Source(f.replace('helvetica', '"Microsoft YaHei"'), encoding='utf-8')
graph.render(filename='AllElectronics')

# You may also be interested in: machine learning, Python, scikit-learn