python 动手实现朴素贝叶斯

最近尝试不调用scikit-learn来实现一下朴素贝叶斯,发现还是不那么容易上手,我这里分享一下我的实现过程,也欢迎大家来批评指正哈

  • 导入库和数据
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder,OneHotEncoder
from collections import defaultdict

# Load the A&E records; the CSV column named "Index" is used as the row index.
data = pd.read_csv("A&E Synthetic Data Excerpt.csv", index_col="Index")

# Map each HRG code (cast to str) to an integer category id.
# fit_transform fits the encoder itself, so no separate fit() call is needed.
labelEncoder = LabelEncoder()
data['AE_HRG_num'] = labelEncoder.fit_transform(data['AE_HRG'].astype('str'))

# Split the arrival date into separate year / month / day columns.
data['AE_Arrive_Date'] = pd.to_datetime(data['AE_Arrive_Date'])
data['year'] = data['AE_Arrive_Date'].dt.year
data['month'] = data['AE_Arrive_Date'].dt.month
data['day'] = data['AE_Arrive_Date'].dt.day

# Map each age band (cast to str) to an integer category id.
labelEncoder1 = LabelEncoder()
data['Age_Band_num'] = labelEncoder1.fit_transform(data['Age_Band'].astype('str'))

# Map each arrival-hour value (cast to str) to an integer category id.
labelEncoder2 = LabelEncoder()
data['AE_Arrive_HourOfDay_num'] = labelEncoder2.fit_transform(
    data['AE_Arrive_HourOfDay'].astype('str'))

# Feature columns fed to the classifier, and the target column to predict.
feature = data[['IMD_Decile_From_LSOA', 'Sex', 'AE_Time_Mins', 'AE_Num_Diagnoses',
       'AE_Num_Investigations', 'AE_Num_Treatments', 'AE_Arrival_Mode',
       'Provider_Patient_Distance_Miles', 'AE_HRG_num',
       'year', 'month', 'day', 'Age_Band_num', 'AE_Arrive_HourOfDay_num']]
label = data["Admitted_Flag"]

这里面的数据就是一些表格数据,数据不方便公开,但是就是那个意思了,我把日期进行了离散化,其他的字段也做相应的离散化,后面想想不离散化也可以,本来就是离散的哈哈,如果不是离散的,就离散化一下就行了。

  • 朴素贝叶斯
class NaiveBayesScratch():
    """Naive Bayes classifier for discrete features, written from scratch.

    Every feature is treated as a categorical variable. Priors and
    conditional probabilities are estimated with add-one (Laplace) smoothing.
    """

    def __init__(self):
        # Prior probability P(Y=ck) of each class label.
        self._prior_prob = defaultdict(float)
        # Conditional probabilities P(X_j=a | Y=ck), stored as
        # {label: {feature index: {feature value: probability}}}.
        self._likelihood = defaultdict(defaultdict)
        # Number of training samples seen for each class label.
        self._ck_counter = defaultdict(float)
        # Number of distinct values each feature takes in the training set.
        self._Sj = defaultdict(float)

    def fit(self, X, y):
        """Estimate priors and conditional probabilities from training data.

        Calling fit again discards everything learned by a previous call.

        X:
            Training set; each row is a sample, each column a feature.
        y:
            Training labels, one per row of X.
        """
        X = np.asarray(X)
        y = np.asarray(y)
        n_sample, n_feature = X.shape

        # Start from a clean state so a refit (e.g. once per cross-validation
        # fold) cannot keep classes or values from an earlier training set.
        self._prior_prob = defaultdict(float)
        self._likelihood = defaultdict(defaultdict)
        self._Sj = defaultdict(float)

        # Distinct class labels and how many samples each one has.
        ck, num_ck = np.unique(y, return_counts=True)
        self._ck_counter = dict(zip(ck, num_ck))
        n_class = ck.shape[0]
        for label, num_label in self._ck_counter.items():
            # Laplace-smoothed prior.
            self._prior_prob[label] = (num_label + 1) / (n_sample + n_class)

        # S_j must be counted over the whole training set, not per class:
        # only then do the smoothed probabilities of one feature sum to 1
        # within a class, and fit and predict share the same denominator.
        for i in range(n_feature):
            self._Sj[i] = np.unique(X[:, i]).shape[0]

        for label in ck:
            # A boolean mask keeps the result 2-D even when the class has a
            # single sample.
            xdata = X[y == label]
            n_label = self._ck_counter[label]
            # Probabilities of every feature for this class.
            label_likelihood = defaultdict(defaultdict)
            for i in range(n_feature):
                feature_val, feature_cnt = np.unique(xdata[:, i], return_counts=True)
                denom = n_label + self._Sj[i]
                # Laplace-smoothed probability of each value seen in this class.
                feature_val_prob = defaultdict(float)
                for fea_val, cnt in zip(feature_val, feature_cnt):
                    feature_val_prob[fea_val] = (cnt + 1) / denom
                label_likelihood[i] = feature_val_prob
            self._likelihood[label] = label_likelihood

    def predict(self, x):
        """Return the class label with the highest posterior for sample x.

        Log-probabilities are summed instead of multiplying probabilities,
        because a long product of small numbers can underflow.
        """
        # Log-posterior (up to a constant) of each class.
        post_prob = {}
        for label, label_likelihood in self._likelihood.items():
            log_prob = np.log(self._prior_prob[label])
            for i, fea_val in enumerate(x):
                feature_val_prob = label_likelihood[i]
                if fea_val in feature_val_prob:
                    log_prob += np.log(feature_val_prob[fea_val])
                else:
                    # Value never observed for this class: fall back to the
                    # smoothed probability of a zero count.
                    laplace_prob = 1 / (self._ck_counter[label] + self._Sj[i])
                    log_prob += np.log(laplace_prob)
            post_prob[label] = log_prob
        # On a tie the class that was stored first wins.
        return max(post_prob, key=post_prob.get)

这部分代码不是原创,我只是搬过来套用调通一下哈哈

  • 训练和验证
# train_test_split is used below but is not imported anywhere else in this post.
from sklearn.model_selection import train_test_split

# Convert the pandas objects to plain arrays, then hold out 20% for testing.
features = np.array(feature)
labels = np.array(label)
xtrain, xtest, ytrain, ytest = train_test_split(features, labels, train_size=0.8, shuffle=True)

model = NaiveBayesScratch()
model.fit(xtrain, ytrain)

# Predict the held-out samples one at a time and count hits and misses.
n_test = xtest.shape[0]
n_right = 0
n_wrong = 0
for sample, truth in zip(xtest, ytest):
    y_pred = model.predict(sample)
    if y_pred == truth:
        n_right += 1
    else:
        n_wrong += 1
print(n_right)
print(n_wrong)
# Accuracy on the held-out set.
print(n_right / n_test)

 

  • K-fold验证
from random import seed
from random import randrange
# https://machinelearningmastery.com/implement-resampling-methods-scratch-python/
# Split a dataset into k folds
def custom_k_fold(dataset, folds=3):
    """Randomly partition the row positions of ``dataset`` into folds.

    Returns a list with ``folds`` entries; each entry is a list of
    ``len(dataset) // folds`` row positions drawn without replacement.
    Positions left over by the integer division are not placed in any fold.
    """
    remaining = list(range(dataset.shape[0]))
    per_fold = len(dataset) // folds
    partitions = []
    for _ in range(folds):
        # Remove a randomly chosen position from the pool for every slot.
        picked = [remaining.pop(randrange(len(remaining))) for _ in range(per_fold)]
        partitions.append(picked)
    return partitions

fold_10 = custom_k_fold(features, 10)
# Each fold serves once as the validation set while every row outside it is
# used for training.
for test_index in fold_10:
    x_train = feature.drop(feature.index[test_index])
    y_train = label.drop(label.index[test_index])
    print(x_train.shape)
    x_test = feature.iloc[test_index]
    y_test = label.iloc[test_index]

    x_train = np.array(x_train)
    y_train = np.array(y_train)
    # Train a fresh model per fold so nothing learned on an earlier fold
    # carries over into this one.
    model = NaiveBayesScratch()
    model.fit(x_train, y_train)

    x_test = np.array(x_test)
    y_test = np.array(y_test)
    n_test = x_test.shape[0]
    # Confusion-matrix counts, with label 1 as the positive class.
    TP = 0
    FP = 0
    TN = 0
    FN = 0
    for j in range(n_test):
        y_pred = model.predict(x_test[j])
        if y_pred == 1 and y_test[j] == 1:
            TP += 1
        elif y_pred == 1 and y_test[j] == 0:
            FP += 1
        elif y_pred == 0 and y_test[j] == 0:
            TN += 1
        elif y_pred == 0 and y_test[j] == 1:
            FN += 1
    # Report 0.0 instead of dividing by zero when a fold has no positive
    # predictions or no positive labels.
    precision = TP / (TP + FP) if (TP + FP) else 0.0
    recall = TP / (TP + FN) if (TP + FN) else 0.0
    if precision + recall:
        F1_Score = 2 * precision * recall / (precision + recall)
    else:
        F1_Score = 0.0
    print('precision:{}'.format(precision))
    print('recall:{}'.format(recall))
    print('f1_score:{}'.format(F1_Score))

好久没这么密集的贴代码了,哈哈,说实话还是有点不适应,想了想,如果放在电脑里,或许我一辈子都不会看了,但是放在网上,或许能够帮助到别人。

 

参考文献

https://github.com/lookenwu/lihang/blob/master/naive_bayes.py

你可能感兴趣的:(Naive,Bayes,机器学习)