参考https://blog.csdn.net/qq_39011567/article/details/102732543对数据集的介绍
参考https://blog.csdn.net/qq_20106375/article/details/94158472对决策树算法的介绍
1、使用算法:DecisionTreeClassifier分类算法
2、实现过程:
1、建立工程,导入sklearn相关包
import numpy as np
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
2、加载数据,划分训练集和测试集
X_train,Y_train=load_datasets(featurePaths[:4], labelPaths[:4])#前四个文件作为训练集
X_test,Y_test=load_datasets(featurePaths[4:], labelPaths[4:])#最后一个文件作为测试集
X_train,x_,Y_train,y_=train_test_split(X_train,Y_train,test_size=0.0)#将训练集打乱
3、创建DecisionTreeClassifier算法实例dt,并进行训练,获得标签
dt = DecisionTreeClassifier()
dt.fit(X_train,Y_train)
4、先调用dt.predict(X_test)得到预测标签Y_predict,再调用dt.score()方法查看分类准确率,调用classification_report(Y_test,Y_predict)查看分类效果
accrucy=dt.score(X_test,Y_test)
classification_report=classification_report(Y_test,Y_predict)
# -*- coding: utf-8 -*-
"""
Created on Mon Oct 28 17:16:41 2019
@author: zhang
"""
import pandas as pd
import numpy as np
from sklearn.preprocessing import Imputer
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import classification_report
def load_datasets(feature_paths, label_paths, n_features=41):
    """Load feature and label files and stack them into NumPy arrays.

    Each feature file is read as header-less, comma-separated text in which
    ``?`` marks a missing value. Within every file, a missing value is
    replaced by the mean of the observed values of its column, and a column
    that has no observed value at all in that file is dropped. Each label
    file is read as header-less text and must contain a single column.

    Args:
        feature_paths: Iterable of feature file paths.
        label_paths: Iterable of label file paths.
        n_features: Number of feature columns every file must have once
            all-missing columns are dropped. Defaults to 41.

    Returns:
        A tuple ``(feature, label)``. ``feature`` is a float array with
        ``n_features`` columns holding the rows of all feature files in
        order; ``label`` is the flattened 1-D array of all labels.

    Raises:
        ValueError: If a feature file does not end up with ``n_features``
            columns, contains non-numeric values, or a label file has more
            than one column.
    """
    # The zero-row seeds fix the expected column counts and keep the result
    # well-formed (and float typed) when no paths are given.
    feature_chunks = [np.empty((0, n_features))]
    for path in feature_paths:
        frame = pd.read_table(path, delimiter=',', na_values='?', header=None)
        values = np.asarray(frame, dtype=float)
        missing = np.isnan(values)

        # Mean imputation is done with NumPy instead of the scikit-learn
        # ``Imputer`` class, which newer scikit-learn releases do not ship.
        # Columns without any observed value have no mean and are dropped.
        observed = ~missing.all(axis=0)
        values = values[:, observed]
        missing = missing[:, observed]
        if missing.any():
            column_means = np.nanmean(values, axis=0)
            values = np.where(missing, column_means, values)
        feature_chunks.append(values)
    # Concatenate once instead of re-copying the growing array per file.
    feature = np.concatenate(feature_chunks)

    label_chunks = [np.empty((0, 1))]
    for path in label_paths:
        label_chunks.append(np.asarray(pd.read_table(path, header=None)))
    label = np.ravel(np.concatenate(label_chunks))

    return feature, label
if __name__ == "__main__":
    # Script entry point: train a decision tree on files A-D, evaluate it on
    # file E, then print the accuracy and the per-class report.

    # Imported here because only this entry point needs the shuffle helper.
    from sklearn.utils import shuffle

    data_dir = r'C:\Users\l\Desktop\ML\naive_bayes'
    names = ['A', 'B', 'C', 'D', 'E']
    featurePaths = [data_dir + '\\' + name + '.feature' for name in names]
    labelPaths = [data_dir + '\\' + name + '.label' for name in names]

    # The first four files form the training set; the remaining file (E) is
    # the test set.
    X_train, Y_train = load_datasets(featurePaths[:4], labelPaths[:4])
    X_test, Y_test = load_datasets(featurePaths[4:], labelPaths[4:])

    # Shuffle the training rows. train_test_split(test_size=0.0) was used for
    # this before, but recent scikit-learn releases reject a zero test size.
    X_train, Y_train = shuffle(X_train, Y_train)

    dt = DecisionTreeClassifier()
    dt.fit(X_train, Y_train)
    Y_predict = dt.predict(X_test)

    accuracy = dt.score(X_test, Y_test)
    # Stored under a separate name so the imported classification_report
    # function is not overwritten by its own result.
    report = classification_report(Y_test, Y_predict)
    print(accuracy)
    print(report)