西瓜书 课后习题7.6:AODE(平均独依赖估计)半朴素贝叶斯分类器

import csv
from math import exp, log


def read_data(filename, encoding=None):
    '''
    Read the discrete attributes and the labels from a Watermelon 3.0 CSV file.

    The first row is treated as a header and skipped. For every remaining
    non-empty row, columns 1..6 are kept as the feature vector and column 9
    as the label.

    :param filename: path of the CSV file
    :param encoding: text encoding passed to ``open``; ``None`` (the default)
        keeps the platform default encoding
    :return: ``(X, Y)`` where X is a list of 6-element feature lists and Y is
        the list of labels, both in file order
    :raises IndexError: if a non-empty data row has fewer than 10 columns
    '''
    X, Y = [], []
    # newline='' lets the csv module do its own newline handling, as the csv
    # documentation recommends.
    with open(filename, newline='', encoding=encoding) as f:
        reader = csv.reader(f)
        # Skip the header; the default keeps an empty file from raising
        # StopIteration.
        next(reader, None)
        for line in reader:
            # csv.reader yields [] for blank lines (e.g. a trailing newline).
            if not line:
                continue
            X.append(line[1:7])
            Y.append(line[9])
    return X, Y


def AODE(X, Y, num_feat=6):
    '''
    Train an AODE model: estimate the smoothed probability tables.

    With N samples, K classes and V_i distinct values of feature i, the
    estimates computed below are
        P(c, xi)      = (count(c, xi) + 1) / (N + K * V_i)
        P(xj | c, xi) = (count(c, xi, xj) + 1) / (count(c, xi) + V_j)

    :param X: samples without labels, discrete features only
    :param Y: labels, Y[n] belongs to X[n]
    :param num_feat: number of features read from every sample, default 6
    :return: ``(prob_c_xi, prob_xj_c_xi)`` where
        ``prob_c_xi[c][i][xi]`` is P(c, xi) (three nested dicts) and
        ``prob_xj_c_xi[c][i][xi][j][xj]`` is P(xj | c, xi) (five nested dicts)
    '''
    sample_count = len(X)
    labels = unique_list(Y)
    label_count = len(labels)
    feat_ids = range(num_feat)

    # Distinct values of every feature, in first-seen order.
    domains = [unique_list([row[i] for row in X]) for i in feat_ids]

    # Zero-initialised counters for every (class, feature, value) and every
    # (class, parent feature, parent value, child feature, child value).
    single_counts = {
        label: {i: {vi: 0 for vi in domains[i]} for i in feat_ids}
        for label in labels
    }
    pair_counts = {
        label: {
            i: {
                vi: {j: {vj: 0 for vj in domains[j]} for j in feat_ids}
                for vi in domains[i]
            }
            for i in feat_ids
        }
        for label in labels
    }

    # Accumulate the counts sample by sample.
    for n in range(sample_count):
        row = X[n]
        for i in feat_ids:
            single_counts[Y[n]][i][row[i]] += 1
            for j in feat_ids:
                pair_counts[Y[n]][i][row[i]][j][row[j]] += 1

    # Turn the counts into smoothed probabilities.
    prob_c_xi = {}
    prob_xj_c_xi = {}
    for label in labels:
        prob_c_xi[label] = {}
        prob_xj_c_xi[label] = {}
        for i in feat_ids:
            joint_denominator = sample_count + label_count * len(domains[i])
            prob_c_xi[label][i] = {}
            prob_xj_c_xi[label][i] = {}
            for vi, n_c_xi in single_counts[label][i].items():
                prob_c_xi[label][i][vi] = float(n_c_xi + 1) / joint_denominator
                prob_xj_c_xi[label][i][vi] = {
                    j: {
                        vj: float(n_c_xi_xj + 1) / (n_c_xi + len(domains[j]))
                        for vj, n_c_xi_xj in pair_counts[label][i][vi][j].items()
                    }
                    for j in feat_ids
                }
    return prob_c_xi, prob_xj_c_xi


def unique_list(data_list):
    '''
    Return the distinct elements of ``data_list`` in first-seen order.

    :param data_list: iterable of values (e.g. one attribute column or the labels)
    :return: a new list holding every distinct value exactly once
    '''
    items = list(data_list)
    try:
        # dict keys keep insertion order, which gives an ordered de-duplication
        # in linear time.
        return list(dict.fromkeys(items))
    except TypeError:
        # Unhashable elements (e.g. lists): fall back to equality comparison.
        distinct = []
        for item in items:
            if item not in distinct:
                distinct.append(item)
        return distinct


def predict(data_test, prob_c_xi, prob_xj_c_xi, num_feat=6):
    '''
    Classify one sample with a trained AODE model.

    For every class c the score is
        sum_i  P(c, xi) * prod_j P(xj | c, xi)
    i.e. every feature i acts once as the super-parent and the resulting
    terms are added up. Each term is built in log space and the terms are
    combined with log-sum-exp, so the returned score is the logarithm of the
    sum above. The product runs over all j, including j == i.

    The list of per-class log scores is printed before returning.

    :param data_test: one sample, a sequence of discrete feature values
    :param prob_c_xi: P(c, xi) table, ``prob_c_xi[c][i][xi]``
    :param prob_xj_c_xi: P(xj | c, xi) table, ``prob_xj_c_xi[c][i][xi][j][xj]``
    :param num_feat: number of features to use, default 6
    :return: ``(log_score, label)`` of the best-scoring class
    :raises ValueError: if the model has no classes or ``num_feat`` < 1
    :raises KeyError: if a feature value of the sample is not in the tables
    '''
    # Take the class labels from the model itself instead of a global label
    # list, so the function also works when imported from another module.
    labels = list(prob_c_xi)
    if not labels:
        raise ValueError('prob_c_xi contains no class labels')
    if num_feat < 1:
        raise ValueError('num_feat must be at least 1')

    probs = []
    for label in labels:
        # log( P(c, xi) * prod_j P(xj | c, xi) ) for each super-parent i.
        log_terms = []
        for i in range(num_feat):
            xi = data_test[i]
            children = prob_xj_c_xi[label][i][xi]
            log_term = log(prob_c_xi[label][i][xi])
            for j in range(num_feat):
                log_term += log(children[j][data_test[j]])
            log_terms.append(log_term)
        # log-sum-exp: add the terms in probability space without underflow.
        peak = max(log_terms)
        probs.append(peak + log(sum(exp(term - peak) for term in log_terms)))

    prob = max(probs)
    index = probs.index(prob)
    print(probs)
    return prob, labels[index]


if __name__ == '__main__':
    # Hard-coded location of the Watermelon 3.0 CSV file; adjust to your local copy.
    filename = "C:\\Users\\14399\\Desktop\\西瓜3.0.csv"
    X, Y = read_data(filename)
    # Train the model: estimate the probability tables
    prob_c_xi, prob_xj_c_xi = AODE(X, Y)
    # Test sample (one value per discrete attribute)
    data_test = ['青绿', '蜷缩', '浊响', '清晰', '凹陷', '硬滑']
    # Predict and show the (score, label) tuple
    predict_result = predict(data_test, prob_c_xi, prob_xj_c_xi)
    print('predict result:', predict_result)

结果:[-3.9807872793600967, -7.823817413301291]
           predict result: (-3.9807872793600967, '是')

西瓜3.0数据集:链接:https://pan.baidu.com/s/1RXTUG9gP1Jn9HKFCiEzOlA         密码:3h6n

参考:https://blog.csdn.net/VictoriaW/article/details/78293291        

 

你可能感兴趣的:(机器学习)