Naive Bayes Algorithm Implementation

In naive Bayes, the word "naive" refers to the strong simplification the algorithm makes about the underlying probability model: it assumes that the individual features are mutually independent (conditionally on the class).
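Concretely, Bayes' theorem combined with this independence assumption lets the posterior probability of class Y_k given features x_1, ..., x_n be written as a product of per-feature conditional probabilities (the standard naive Bayes factorization, stated here for reference):

P(Y_k|x_1, ..., x_n) = P(Y_k)·P(x_1, ..., x_n|Y_k) / P(x_1, ..., x_n) ∝ P(Y_k)·∏_i P(x_i|Y_k)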


Implementation steps:
1. Create the Beyes class.
2. The class has an initializer plus four methods; the initializer creates the containers that hold intermediate results.
3. fit computes the size of the training set, prepares the per-class data subsets needed for the conditional probabilities, and computes the class priors P(Y_k) (stored in p_train_target), which enter through the total-probability formula P(X) = Σ_k P(X|Y=Y_k)·P(Y_k) (see the note after this list).
4. p_test_data computes the classification result for a single sample.
5. classifier computes the classification results for the whole test set.
6. score computes the classification accuracy.
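Note that P(X) in the formula above is the same for every class, so the code never computes it explicitly; p_test_data only compares the numerators and picks the class with the largest score (standard naive Bayes reasoning, restated here for clarity):

predicted class = argmax_k P(Y_k)·P(X|Y_k) = argmax_k P(Y_k)·∏_i P(x_i|Y_k)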

import pandas as pd
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from collections import Counter
import time


class Beyes(object):
    def __init__(self):
        self.length = -1                # number of training samples; -1 means the model is not fitted yet
        self.train_target_list = []     # distinct class labels seen in the training set
        self.p_train_target = {}        # class priors P(Y_k)
        self.split_data_lis = []        # training data split into one DataFrame per class
        self.feature_p_lis = []         # reserved for per-feature probabilities (not used below)
        self.predict = []               # predictions for the most recently classified test set

    def fit(self, train_data, train_target):
        """
        P(X)=∑kP(X|Y=Yk)P(Yk)
        计算P(Yk):p_train_target
        准备计算条件概率所需数据集:self.split_data_lis
        :param train_data:
        :param train_target:
        :return:
        """
        train_length = train_data.shape[0]
        self.length = train_length
        target_list = list(set(train_target))            # distinct class labels
        self.train_target_list = target_list
        target_classifier = dict(Counter(train_target))  # label -> number of training samples
        train_data = pd.DataFrame(train_data)
        train_data['target'] = train_target  
        for target in self.train_target_list:
            self.p_train_target[target] = target_classifier[target]/self.length   # prior P(Y_k)
            split_data = train_data[train_data['target'] == target]               # training samples of class Y_k
            self.split_data_lis.append(split_data)
        print('model has been trained, please use classifier() to get the result')

    def p_test_data(self, sample):
        """
        :param sample:Serise
        :return self.train_target_list[position]:概率最大的类别
        """
        result_p = []
        for j in range(len(self.train_target_list)):
            p_label = 1
            this_target = self.train_target_list[j]
            this_data = self.split_data_lis[j]
            for i in range(0, sample.shape[0]):
                feature_num_dict = dict(Counter(this_data[i]))   # value -> count for feature i within this class
                if sample[i] in feature_num_dict:
                    label_num = feature_num_dict.get(sample[i])
                    p_label = p_label*(label_num/this_data.shape[0])   # empirical P(x_i|Y_k)
                else:
                    # value never seen for this class: fall back to a small smoothed probability
                    p_label = p_label*(1/(this_data.shape[0]+len(feature_num_dict)))
            this_target_p = p_label*self.p_train_target.get(this_target)   # P(X|Y_k)*P(Y_k)
            result_p.append(this_target_p)
        position = result_p.index(max(result_p))  # index of the class with the highest score
        return self.train_target_list[position]

    def classifier(self, test_data):
        """ 
        :param test_data:Serise
        :return:
        """
        if self.length == -1:
            raise ValueError('please use fit() to train the train data set ')
        else:
            test_data = pd.DataFrame(test_data)
            test_data['target'] = test_data.apply(self.p_test_data, axis=1)  # classify each row
            self.predict = list(test_data['target'])
            print('classifier result:', self.predict)

    def score(self, test_target):
        """ 
        :param test_target:
        :return:
        """

        if len(self.predict) == 0:
            raise ValueError('please use classifier() to get classifier target')
        else:
            count = 0
            for i in range(0, test_target.shape[0]):
                if test_target[i] == self.predict[i]:
                    count += 1
            score = count/(test_target.shape[0])
            print('the Classification accuracy is:', score)


if __name__ == '__main__':
    iris = load_iris()
    x = iris.data
    y = iris.target
    x_train, x_test, y_train, y_test = train_test_split(x, y)
    print('training set shape:', x_train.shape)
    print('test set shape:', x_test.shape)
    start_time = time.time()
    classifier = Beyes()
    classifier.fit(x_train, y_train)
    classifier.classifier(x_test)
    classifier.score(y_test)
    end_time = time.time()
    time_d = end_time-start_time
    print("spend time:", time_d)
