西瓜书《机器学习》课后答案——chapter7_7.3AODE

AODE算法的难点在于构建存储计数的数据结构,这里采用三层字典表示 P(c,xi) ,五层字典表示 P(xj|c,xi) 。由于数据集比较小,我们取 m=0 。另外,对于连续属性,不知道怎么处理,所以这里只考虑离散属性。

# -*-coding:gbk -*-
""" @Author: Victoria @Date: 2017.10.19 21:30 """
import xlrd

class AODE(object):
    """Averaged One-Dependence Estimator restricted to discrete attributes.

    Every attribute in turn acts as the "super-parent" x_i; the score of a
    class c for a sample x is

        sum_i  P(c, x_i) * prod_j P(x_j | c, x_i)

    Both probability tables are estimated with Laplace smoothing and kept
    in nested dictionaries:

        prob_c_xi[c][i][x_i]               -> P(c, x_i)
        prob_xj_c_xi[c][i][x_i][j][x_j]    -> P(x_j | c, x_i)

    The raw counts are kept in ``count_c_xi`` / ``count_xj_c_xi`` with the
    same layout.
    """

    def __init__(self, d, class_num=2):
        """
        d: number of (discrete) attributes per sample.
        class_num: number of classes; labels must be 0 .. class_num - 1.
        """
        # discrete features number
        self.d = d
        self.class_num = class_num

    def train(self, X, y):
        """Estimate and store the smoothed probability tables.

        X: sequence of samples, each indexable by 0 .. d - 1 with hashable
           attribute values.
        y: sequence of integer class labels aligned with X.

        Sets ``count_c_xi``, ``count_xj_c_xi``, ``prob_c_xi`` and
        ``prob_xj_c_xi`` on the instance.
        """
        count_xj_c_xi = {}
        count_c_xi = {}
        prob_xj_c_xi = {}
        prob_c_xi = {}

        N = len(X)

        # Distinct values observed for each attribute, in first-seen order.
        attrs = []
        for i in range(self.d):
            values = []
            for sample in X:
                if sample[i] not in values:
                    values.append(sample[i])
            attrs.append(values)

        # Pre-create every table entry with zero so that value combinations
        # never seen together still receive a smoothed probability.
        for c in range(self.class_num):
            count_c_xi[c] = {}
            prob_c_xi[c] = {}
            count_xj_c_xi[c] = {}
            prob_xj_c_xi[c] = {}
            for i in range(self.d):
                count_c_xi[c][i] = {}
                prob_c_xi[c][i] = {}
                count_xj_c_xi[c][i] = {}
                prob_xj_c_xi[c][i] = {}
                for attr_i in attrs[i]:
                    count_c_xi[c][i][attr_i] = 0
                    prob_c_xi[c][i][attr_i] = 0
                    count_xj_c_xi[c][i][attr_i] = {}
                    prob_xj_c_xi[c][i][attr_i] = {}
                    for j in range(self.d):
                        count_xj_c_xi[c][i][attr_i][j] = dict.fromkeys(attrs[j], 0)
                        prob_xj_c_xi[c][i][attr_i][j] = dict.fromkeys(attrs[j], 0)

        # Accumulate the counts |D_{c,xi}| and |D_{c,xi,xj}|.
        for sample, label in zip(X, y):
            for i in range(self.d):
                count_c_xi[label][i][sample[i]] += 1
                pair_counts = count_xj_c_xi[label][i][sample[i]]
                for j in range(self.d):
                    pair_counts[j][sample[j]] += 1

        # Laplace-smoothed estimates:
        #   P(c, xi)     = (|D_{c,xi}| + 1)    / (|D| + class_num * v_i)
        #   P(xj | c,xi) = (|D_{c,xi,xj}| + 1) / (|D_{c,xi}| + v_j)
        for c in range(self.class_num):
            for i in range(self.d):
                # the values number of i-th attribution
                v_i = len(attrs[i])
                for attr_i_value, N_c_xi in count_c_xi[c][i].items():
                    prob_c_xi[c][i][attr_i_value] = (
                        float(N_c_xi + 1) / (N + self.class_num * v_i))

                    for j in range(self.d):
                        v_j = len(attrs[j])
                        pair_counts = count_xj_c_xi[c][i][attr_i_value][j]
                        for attr_j_value, N_c_xi_xj in pair_counts.items():
                            prob_xj_c_xi[c][i][attr_i_value][j][attr_j_value] = (
                                float(N_c_xi_xj + 1) / (N_c_xi + v_j))

        self.count_xj_c_xi = count_xj_c_xi
        self.count_c_xi = count_c_xi
        self.prob_xj_c_xi = prob_xj_c_xi
        self.prob_c_xi = prob_c_xi

    def predict(self, x):
        """Return ``(label, prob)`` for one sample.

        x: sample indexable by 0 .. d - 1. Every value must have been seen
           for that attribute during training, otherwise the table lookup
           raises KeyError.

        ``label`` is the index of the class with the highest AODE score and
        ``prob`` is that (unnormalised) score.
        """
        probs = []
        for c in range(self.class_num):
            prob_c = 0
            for i in range(self.d):
                # prod_j P(x_j | c, x_i) with attribute i as super-parent.
                prob_j_c_i_product = 1.0
                conditionals = self.prob_xj_c_xi[c][i][x[i]]
                for j in range(self.d):
                    prob_j_c_i_product *= conditionals[j][x[j]]
                # The term of every super-parent must be added to the class
                # score, so the accumulation belongs inside the loop over i.
                prob_c += self.prob_c_xi[c][i][x[i]] * prob_j_c_i_product
            probs.append(prob_c)
        prob = max(probs)
        label = probs.index(prob)
        return label, prob



if __name__=="__main__":
    # Demo: train AODE on the spreadsheet data and classify one sample.
    # The sheet stores one sample per column: the first 6 rows of each of
    # the first 17 columns are the attribute values, and row index 8 holds
    # the class labels.
    # NOTE(review): presumably the watermelon 3.0 data set with its
    # discrete attributes already encoded as integers - confirm.
    workbook = xlrd.open_workbook("../../数据/3.0.xlsx")
    sheet = workbook.sheet_by_name("Sheet1")
    X = []
    for i in range(17):
        # Spreadsheet cell values are converted to int attribute codes.
        x = [int(value) for value in sheet.col_values(i)[0:6]]
        # Parenthesised single-argument print gives the same output under
        # Python 2 and is valid syntax under Python 3.
        print(x)
        X.append(x)
    y = [int(value) for value in sheet.row_values(8)]
    aode = AODE(d=6)
    aode.train(X, y)
    label, prob = aode.predict([1, 1, 1, 1, 1, 1])
    print("the predict label is {} with prob {}".format(label, prob))

预测结果:

the predict label is 1 with prob 0.0186709343088 #预测为正例

你可能感兴趣的:(机器学习)