Python 机器学习——数据的分类(KNN、决策树、贝叶斯)代码笔记

import pandas as pd
import numpy as np

from sklearn.preprocessing import Imputer#导入数据预处理模块处理原始数据
from sklearn.model_selection import train_test_split#导入自动生成训练集和测试集的模块
from sklearn.metrics import classification_report#导入预测结果评估模块

from sklearn.neighbors import KNeighborsClassifier#knn近邻算法
from sklearn.tree import DecisionTreeClassifier#决策树算法模块
from sklearn.naive_bayes import GaussianNB#贝叶斯算法模块

#数据导入模块
def loadDataSet(feature_paths,label_paths):
    """Load feature and label files and stack them into NumPy arrays.

    Feature files are read as comma-separated text without a header row,
    and a ``?`` cell is treated as a missing value. Each missing value is
    replaced by the mean of its column, computed separately for every
    file. A column that is entirely missing within a file is dropped,
    which is what the scikit-learn mean imputer used previously did; such
    a file then no longer matches the width of the others and the final
    concatenation raises ``ValueError``.

    Label files are read with the ``pd.read_table`` default separator and
    without a header row, and must contain a single column.

    Args:
        feature_paths: Iterable of feature file paths.
        label_paths: Iterable of label file paths.

    Returns:
        A ``(feature, label)`` tuple. ``feature`` is a 2-D float array
        holding the rows of all feature files in the order given, and
        ``label`` is the flattened 1-D array of all labels. When no paths
        are given the arrays have shapes ``(0, 41)`` and ``(0,)``.

    Raises:
        ValueError: If a feature file holds non-numeric data or the files
            do not all share the same number of columns.
    """
    feature_parts = []
    for path in feature_paths:
        frame = pd.read_csv(path, sep=',', na_values='?', header=None)
        # Fail early on non-numeric cells instead of silently skipping them
        # when the column means are computed.
        frame = frame.astype(float)
        # Mean imputation done with pandas so the loader does not depend on
        # sklearn.preprocessing.Imputer, which newer scikit-learn removed.
        frame = frame.dropna(axis=1, how='all')
        frame = frame.fillna(frame.mean())
        feature_parts.append(frame.to_numpy())

    label_parts = [
        pd.read_table(path, header=None).to_numpy() for path in label_paths
    ]

    # Concatenate once at the end rather than re-copying the growing array
    # for every file.
    if feature_parts:
        feature = np.concatenate(feature_parts)
    else:
        # Keep the historical empty result (41 feature columns).
        feature = np.empty((0, 41))

    # The empty float seed keeps the dtype promotion of the labels the same
    # as before (numeric labels come back as floats).
    label = np.concatenate([np.empty((0, 1))] + label_parts)

    # Flatten the single label column into a 1-D vector.
    label = np.ravel(label)
    return feature, label

#主函数模块
if __name__ =='__main__':
    # Script entry point: train three classifiers on files A-D, predict on
    # file E and print a classification report for each classifier.

    # Imported locally because only this script section needs it.
    from sklearn.utils import shuffle

    feature_paths=['A.feature','B.feature','C.feature','D.feature','E.feature']
    label_paths = ['A.label','B.label','C.label','D.label','E.label']
    # The first four files are the training data, the last one the test data.
    x_train,y_train=loadDataSet(feature_paths[:4],label_paths[:4])
    x_test,y_test=loadDataSet(feature_paths[4:],label_paths[4:])

    # Shuffle the training rows while keeping features and labels aligned.
    # train_test_split(..., test_size=0.0) was used for this before, but
    # current scikit-learn rejects a zero test size with a ValueError.
    x_train,y_train=shuffle(x_train,y_train)

    # (name shown while training, name shown in the report, estimator)
    classifiers=[
        ('knn','knn',KNeighborsClassifier()),
        ('DecisionTreeClassifier','DT',DecisionTreeClassifier()),
        ('Bayes','Bayes',GaussianNB()),
    ]

    # Train and predict with every classifier first; the reports are printed
    # afterwards so the output order matches the earlier version of the script.
    predictions=[]
    for train_name,report_name,model in classifiers:
        print('start trainning '+train_name+'……')
        model.fit(x_train,y_train)
        print('I have already trainning the data you just gave me!')
        predictions.append((report_name,model.predict(x_test)))
        print('Prediction done!')

    # classification_report summarises precision, recall, f1-score and
    # support for every class.
    for report_name,answer in predictions:
        print('\n\nThe classification report for '+report_name+':')
        print(classification_report(y_test, answer))
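注:关于transform函数的使用还不是很明确。补充说明:sklearn 填补器(Imputer)的 transform 并不是 pandas 中"将函数应用到各个分组"的 transform;fit 先计算每一列的平均值,transform 再用这些平均值替换该列中的缺失值,并返回补全后的 NumPy 数组。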

你可能感兴趣的:(算法)