python复现论文:Gaussian Switch Sampling: A Second Order Approach to Active Learning

python复现论文:Gaussian Switch Sampling: A Second Order Approach to Active Learning_第1张图片

The proposed approach is abbreviated GauSS, short for Gaussian Switch Sampling.


import xlwt
import xlrd
import numpy as np
import pandas as pd
from pathlib import Path
from collections import OrderedDict
from copy import deepcopy
from sklearn.preprocessing import StandardScaler
from time import time
from sklearn import preprocessing
from sklearn.metrics.pairwise import pairwise_kernels, pairwise_distances
from sklearn.base import ClassifierMixin, BaseEstimator
from sklearn.utils.validation import check_X_y
from numpy.linalg import inv
from sklearn.metrics import accuracy_score, mean_absolute_error, f1_score
import matplotlib.pyplot as plt
from sklearn.mixture import GaussianMixture
np.seterr(divide='ignore',invalid='ignore')
from sklearn.neighbors import NearestNeighbors


class GauSS():
    """Pool-based active learner driven by Gaussian Switch Sampling (GauSS).

    The underlying predictor is a kernel regression onto the rows of a class
    code book ``M`` (``M[j, i] = (i - j) ** 2``), using the negative Euclidean
    distance as kernel and ``c`` as ridge term (the original comment calls it
    the KELMOC model).  Its inverse kernel matrix is updated incrementally
    with block-matrix inversion whenever new samples are labelled.

    Sample selection counts, for every pool sample, how often its predicted
    label has changed between consecutive rounds ("switch events").  A
    two-component Gaussian mixture fitted on the non-zero counts separates
    frequently switching samples from the rest, and the batch is drawn from
    the component with the higher mean count.

    Parameters
    ----------
    X : ndarray of shape (nSample, nDim)
        Training pool features.
    y : ndarray of int
        Pool labels.  Values are used directly as row indices into ``M``, so
        they must be integers in ``[0, nClass)``.
    labeled : sequence of int
        Indices (into ``X``) of the initially labelled samples.
    budget : int
        Total number of labels that may be queried.
    batch : int
        Number of samples queried per round.
    X_test, y_test : ndarray
        Hold-out data used by :meth:`evaluation`.
    """

    def __init__(self,X, y, labeled, budget, batch, X_test, y_test):
        self.X = X
        self.y = y
        self.nSample, self.nDim = X.shape
        self.labels = sorted(np.unique(self.y))
        self.nClass = len(self.labels)
        # Code book: entry [j, i] is the squared difference between class ranks i and j.
        self.M = np.array([[(i - j) ** 2 for i in range(self.nClass)] for j in range(self.nClass)])
        self.X_test = X_test
        self.y_test = y_test
        self.labeled = list(deepcopy(labeled))
        self.distMatrix = pairwise_distances(X=self.X, metric='euclidean')
        # Kernel is the negated pairwise Euclidean distance.
        self.K = -self.distMatrix
        self.c = 0.01  # ridge term, 1/C with C = 100
        self.model_initialization()

        # Per-sample count of prediction changes, and the predictions of the
        # previous round they are compared against.
        self.switch_event = np.zeros(self.nSample, dtype=float)
        self._prevoius_pred = np.zeros(self.nSample,dtype=float)

        self.budgetLeft = deepcopy(budget)
        self.batch = batch
        # ------------- learning-curve bookkeeping -------------
        self.MZElist = []
        self.MAElist = []
        self.F1list = []
        self.ALC_MZE = 0.0
        self.ALC_MAE = 0.0
        self.ALC_F1 = 0.0
        self.Redundancy = 0.0

    def model_initialization(self):
        """Fit the initial model on ``self.labeled`` and build the unlabeled pool.

        Sets ``T_labeled``, ``Kernel_labeled``, ``Kernel_labeled_inv``, ``Beta``
        and ``unlabeled``.  Returns ``self``.
        """
        n = len(self.labeled)
        self.T_labeled = self.M[self.y[self.labeled],:]
        self.Kernel_labeled = self.K[np.ix_(self.labeled,self.labeled)]
        self.Kernel_labeled_inv = np.linalg.inv(self.c * np.eye(n) + self.Kernel_labeled)
        self.Beta = self.Kernel_labeled_inv @ self.T_labeled

        # Unlabeled pool = every index that is not initially labelled, in
        # ascending order (set lookup instead of repeated list.remove).
        known = set(self.labeled)
        self.unlabeled = [i for i in range(self.nSample) if i not in known]
        return self

    def Batch_Block_Matrix_Inverse(self,A11_inv, A12, A21, A22):
        """Return the inverse of ``[[A11, A12], [A21, A22]]`` given ``inv(A11)``.

        Uses the block-inversion identity based on the Schur complement
        ``A22 - A21 @ A11_inv @ A12``:

            A11 A12  -1    ==   B11  B12
            A21 A22             B21  B22

        ``A11_inv`` is (n, n), ``A12`` is (n, m), ``A21`` is (m, n) and
        ``A22`` is (m, m); the result is (n + m, n + m).
        """
        n = A11_inv.shape[0]
        m = A22.shape[0]
        left = A11_inv @ A12     # (n, m), reused in three blocks
        right = A21 @ A11_inv    # (m, n), reused in two blocks
        B22 = inv(A22 - A21 @ left)
        new_M = np.empty((m + n, m + n))
        new_M[:n, :n] = A11_inv + left @ B22 @ right
        new_M[:n, n:] = -left @ B22
        new_M[n:, :n] = -B22 @ right
        new_M[n:, n:] = B22
        return new_M

    def Point_Block_Matrix_Inverse(self,A11_inv, A12, A21, A22):
        """Single-sample variant of :meth:`Batch_Block_Matrix_Inverse`.

        ``A12`` holds n values (column), ``A21`` holds n values (row) and
        ``A22`` is a scalar or a size-1 array.  Returns the (n + 1, n + 1)
        inverse.  Raises ``numpy.linalg.LinAlgError`` when the Schur
        complement is zero.
        """
        n = A11_inv.shape[0]
        col = np.asarray(A12).reshape(n, 1)
        row = np.asarray(A21).reshape(1, n)
        left = A11_inv @ col     # (n, 1)
        right = row @ A11_inv    # (1, n)
        # Work with a true scalar: writing a (1, 1) array into M[n, n] relies
        # on NumPy's deprecated size-1-array -> scalar conversion.
        schur = float(np.asarray(A22).item()) - float((row @ left)[0, 0])
        if schur == 0.0:
            raise np.linalg.LinAlgError("Singular matrix")
        B22 = 1.0 / schur
        M = np.empty((n + 1, n + 1))
        M[:n, :n] = A11_inv + B22 * (left @ right)
        M[:n, n] = -B22 * left[:, 0]
        M[n, :n] = -B22 * right[0]
        M[n, n] = B22
        return M

    def model_incremental_train(self, new_ids):
        """Extend the fitted model with the samples ``new_ids``.

        Must be called while ``self.labeled`` still holds only the previously
        labelled indices; the caller appends ``new_ids`` afterwards so that
        the row order of ``Beta`` matches ``self.labeled``.
        """
        A11_inv = self.Kernel_labeled_inv
        A12 = self.K[np.ix_(self.labeled, new_ids)]
        A21 = A12.T
        A22 = self.K[np.ix_(new_ids, new_ids)] + self.c * np.eye(len(new_ids))
        Kernel_Bar_Inv = self.Batch_Block_Matrix_Inverse(A11_inv=A11_inv, A12=A12, A21=A21, A22=A22)
        T_Bar = np.vstack((self.T_labeled, self.M[self.y[new_ids],:]))
        # -------------------------
        self.Kernel_labeled_inv = Kernel_Bar_Inv
        self.T_labeled = T_Bar
        self.Beta = Kernel_Bar_Inv @ T_Bar

    def tmp_incremental_train(self, tmp_idx):
        """Return the inverse kernel matrix as if ``tmp_idx`` were labelled.

        The model itself is left untouched.
        """
        A12 = self.K[np.ix_(self.labeled, [tmp_idx])]
        A22 = self.K[tmp_idx, tmp_idx] + self.c
        return self.Point_Block_Matrix_Inverse(
            A11_inv=self.Kernel_labeled_inv, A12=A12, A21=A12.T, A22=A22)

    def _coded_outputs(self, X):
        """Regress ``X`` onto the code space: kernel to labelled samples times Beta."""
        K = -pairwise_distances(X=X, Y=self.X[self.labeled], metric="euclidean")
        return K.dot(self.Beta)

    def _code_distances(self, coded, ord):
        """Distance (norm order ``ord``) of each coded output to every row of ``M``."""
        return np.linalg.norm(coded[:, None] - self.M, axis=2, ord=ord)

    def predict_proba(self, X):
        """Return an (n_samples, nClass) matrix of soft class memberships.

        Softmax over the negated L2 distances between the coded outputs and
        the class codes.  Note that :meth:`predict` decodes with the L1
        distance instead.
        """
        logits = -self._code_distances(self._coded_outputs(X), ord=2)
        # Shift by the row maximum: mathematically a no-op for the softmax,
        # but prevents every exp() underflowing to 0 (0/0 -> NaN).
        logits -= logits.max(axis=1, keepdims=True)
        weights = np.exp(logits)
        return weights / weights.sum(axis=1, keepdims=True)

    def predict(self, X):
        """Return, per sample, the class whose code is nearest (L1) to the coded output."""
        distances = self._code_distances(self._coded_outputs(X), ord=1)
        return np.argmin(distances, axis=1)

    def evaluation(self):
        """Append test-set MZE (1 - accuracy), MAE and macro-F1 to the curves."""
        y_hat = self.predict(X=self.X_test)
        self.MZElist.append(1-accuracy_score(self.y_test, y_hat))
        self.MAElist.append(mean_absolute_error(self.y_test, y_hat))
        self.F1list.append(f1_score(self.y_test, y_hat, average='macro'))

    def _update_switch_counts(self):
        """Increment ``switch_event`` where the prediction changed since last round."""
        self._current_pred = self.predict(X=self.X)
        self.switch_event += (self._prevoius_pred != self._current_pred).astype(float)
        self._prevoius_pred = self._current_pred

    def _select_by_switches(self):
        """Pick ``self.batch`` pool indices according to the switch counts.

        * Fewer switched samples than the batch: take all of them and fill
          up uniformly at random from the non-switched pool.
        * All switched samples share the same count: uniform random choice.
        * Otherwise: fit a 2-component GMM on the counts and use the
          component with the higher mean count (ties favour component 1).
          If it holds more than a batch, sample without replacement weighted
          by its posterior; if it holds fewer, fall back to the samples with
          the largest counts.

        Requires a non-empty pool with ``self.batch <= len(self.unlabeled)``.
        """
        pool = np.asarray(self.unlabeled, dtype=int)
        counts = self.switch_event[pool]
        switched = counts != 0
        switch_list = counts[switched]
        switch_list_index = pool[switched]
        n_switched = len(switch_list)

        if n_switched < self.batch:
            filler = np.random.choice(pool[~switched], size=self.batch - n_switched, replace=False)
            return list(switch_list_index) + list(filler)

        if np.unique(switch_list).size == 1:
            return np.random.choice(switch_list_index, size=self.batch, replace=False)

        values = switch_list.reshape(-1, 1)
        gmm = GaussianMixture(n_components=2).fit(values)
        pred = gmm.predict(values)
        probs = gmm.predict_proba(values)

        in_c0 = pred == 0
        in_c1 = ~in_c0
        # When the mixture collapses onto one component the other one has no
        # members; component 0 is used then (same outcome as comparing
        # against the NaN mean of an empty list, without the warning).
        if in_c0.any() and in_c1.any() and switch_list[in_c0].mean() <= switch_list[in_c1].mean():
            flag = 1
        else:
            flag = 0

        relevant_local = np.flatnonzero(pred == flag)
        relevant_global = switch_list_index[relevant_local]

        if len(relevant_local) == self.batch:
            return relevant_global
        if len(relevant_local) < self.batch:
            # Not enough members: rank all switched samples by count, descending.
            ord_switch = np.flipud(np.argsort(switch_list))
            return switch_list_index[ord_switch[:self.batch]]
        relevant_probs = probs[relevant_local, flag]
        norm_probs = relevant_probs / np.sum(relevant_probs)
        return np.random.choice(relevant_global, size=self.batch, replace=False, p=norm_probs)

    def select(self):
        """Run the active-learning loop until the budget or the pool is exhausted.

        The first round queries a uniformly random batch; later rounds use
        :meth:`_select_by_switches`.  After every round the model is updated,
        the test metrics are recorded and the areas under the learning curves
        (``ALC_*``, trapezoidal rule with the batch size as step) are
        accumulated.  Finally ``Redundancy`` is set to the reciprocal of the
        mean nearest-neighbour distance among the labelled samples.
        """
        self.evaluation()
        self._prevoius_pred = self.predict(X=self.X)
        iterations = 1
        while self.budgetLeft > 0 and self.unlabeled:
            # Never request more than the remaining budget or the remaining
            # pool; sampling without replacement would raise otherwise.
            self.batch = min(self.batch, self.budgetLeft, len(self.unlabeled))

            self._update_switch_counts()
            if iterations == 1:
                selected_ids = np.random.choice(self.unlabeled, size=self.batch, replace=False)
            else:
                selected_ids = self._select_by_switches()
            selected_ids = [int(idx) for idx in selected_ids]

            self.model_incremental_train(new_ids=selected_ids)
            # Move the queried samples from the pool to the labelled set.
            chosen = set(selected_ids)
            self.unlabeled = [idx for idx in self.unlabeled if idx not in chosen]
            self.labeled.extend(selected_ids)

            self.budgetLeft -= self.batch
            self.evaluation()
            iterations += 1
            self.ALC_MZE += 0.5 * (self.MZElist[-2] + self.MZElist[-1]) * self.batch
            self.ALC_MAE += 0.5 * (self.MAElist[-2] + self.MAElist[-1]) * self.batch
            self.ALC_F1 += 0.5 * (self.F1list[-2] + self.F1list[-1]) * self.batch

        neigh = NearestNeighbors(n_neighbors=1)
        neigh.fit(X=self.X[self.labeled])
        # kneighbors() without arguments excludes each sample itself.
        self.Redundancy = (1/np.mean(neigh.kneighbors()[0].flatten()))




if __name__ == '__main__':
    # Label written into cell (0, 0) of every result sheet.
    # NOTE(review): the label reads "EMOC_LT" although this script runs GauSS
    # and saves under a "GauSS" folder -- confirm this is intended.
    method = "EMOC_LT"
    # Data sets to process.  "Stock-10bin" was listed twice, which only
    # recomputed the experiment and overwrote the same output file.
    names_list = ["Newthyroid", "Balance-scale","Knowledge",
                  "Nursery","Toy","Housing-5bin",
                    "Eucalyptus", "Automobile1",
                  "Glass",  "Winequality-red", "Stock-10bin","Obesity1",
                  "Computer-10bin","CTGs","Thyroid1","Thyroid2"]


    class results():
        """Per-partition raw results collected for one data set."""

        def __init__(self):
            self.MZEList = []
            self.MAEList = []
            self.F1List = []
            self.ALC_MZE = []
            self.ALC_MAE = []
            self.ALC_F1 = []
            self.Redun = []

    class stores():
        """Aggregated (mean / std over partitions) results for one data set."""

        def __init__(self):
            self.Redun_mean = []
            self.Redun_std = []
            # -----------------
            self.MZEList_mean = []
            self.MZEList_std = []
            # -----------------
            self.MAEList_mean = []
            self.MAEList_std = []
            # -----------------
            self.F1List_mean = []
            self.F1List_std = []
            # -----------------
            self.ALC_MZE_mean = []
            self.ALC_MZE_std = []
            # -----------------
            self.ALC_MAE_mean = []
            self.ALC_MAE_std = []
            # -----------------
            self.ALC_F1_mean = []
            self.ALC_F1_std = []
            # -----------------
            self.ALC_MZE_list = []
            self.ALC_MAE_list = []
            self.ALC_F1_list = []


    def _read_index_column(table, col):
        """Return the numeric cells of column ``col`` as ints (other cells are skipped)."""
        return [int(cell) for cell in table.col_values(col) if isinstance(cell, float)]


    data_path = Path(r"E:\GGGG_BIYE\DataSet")
    partition_path = Path(r"E:\GGGG_BIYE\Partition")
    save_dir = Path(r"E:\EMOC_Batch\GauSS")
    # Create the output folder up front so a missing directory fails
    # immediately instead of after all the computation is done.
    save_dir.mkdir(parents=True, exist_ok=True)

    for name in names_list:
        print("########################{}".format(name))
        """--------------read the whole data--------------------"""
        read_data_path = data_path.joinpath(name + ".csv")
        data = np.array(pd.read_csv(read_data_path, header=None))
        X = np.asarray(data[:, :-1], np.float64)
        X = StandardScaler().fit_transform(X)
        # Last column holds the label; shift it so the smallest label is 0.
        y = data[:, -1]
        y -= y.min()
        nClass = len(np.unique(y))
        Budget = 25 * nClass
        batch = nClass

        """--------read the partitions--------"""
        read_partition_path = str(partition_path.joinpath(name + ".xls"))
        book_partition = xlrd.open_workbook(read_partition_path)
        # --------------------------------------
        RESULT = results()
        STORE = stores()
        # --------------------------------------
        for SN in book_partition.sheet_names():
            S_Time = time()
            table_partition = book_partition.sheet_by_name(SN)
            # Columns 0 / 1 / 2: training indices, test indices, initially
            # labelled indices (passed to GauSS as positions in X_train).
            train_idx = _read_index_column(table_partition, 0)
            test_idx = _read_index_column(table_partition, 1)
            labeled = _read_index_column(table_partition, 2)

            X_train = X[train_idx]
            y_train = y[train_idx].astype(np.int32)
            X_test = X[test_idx]
            y_test = y[test_idx]

            model = GauSS(X=X_train, y=y_train, labeled=labeled, budget=Budget, batch=batch, X_test=X_test, y_test=y_test)
            model.select()
            RESULT.MZEList.append(model.MZElist)
            RESULT.MAEList.append(model.MAElist)
            RESULT.F1List.append(model.F1list)
            RESULT.ALC_MZE.append(model.ALC_MZE)
            RESULT.ALC_MAE.append(model.ALC_MAE)
            RESULT.ALC_F1.append(model.ALC_F1)
            RESULT.Redun.append(model.Redundancy)

            print("SN===",SN, "time:",time()-S_Time)


        STORE.Redun_mean = np.mean(RESULT.Redun)
        STORE.Redun_std = np.std(RESULT.Redun)
        STORE.MZEList_mean = np.mean(RESULT.MZEList, axis=0)
        STORE.MZEList_std = np.std(RESULT.MZEList, axis=0)
        STORE.MAEList_mean = np.mean(RESULT.MAEList, axis=0)
        STORE.MAEList_std = np.std(RESULT.MAEList, axis=0)
        STORE.F1List_mean = np.mean(RESULT.F1List, axis=0)
        STORE.F1List_std = np.std(RESULT.F1List, axis=0)
        STORE.ALC_MZE_mean = np.mean(RESULT.ALC_MZE)
        STORE.ALC_MZE_std = np.std(RESULT.ALC_MZE)
        STORE.ALC_MAE_mean = np.mean(RESULT.ALC_MAE)
        STORE.ALC_MAE_std = np.std(RESULT.ALC_MAE)
        STORE.ALC_F1_mean = np.mean(RESULT.ALC_F1)
        STORE.ALC_F1_std = np.std(RESULT.ALC_F1)
        STORE.ALC_MZE_list = RESULT.ALC_MZE
        STORE.ALC_MAE_list = RESULT.ALC_MAE
        STORE.ALC_F1_list = RESULT.ALC_F1

        # One sheet per entry, in this order.  Every sheet has a single row:
        # the method label followed by the listed values.
        sheet_rows = [
            ("MZE_mean", STORE.MZEList_mean),
            ("MZE_std", STORE.MZEList_std),
            ("MAE_mean", STORE.MAEList_mean),
            ("MAE_std", STORE.MAEList_std),
            ("F1_mean", STORE.F1List_mean),
            ("F1_std", STORE.F1List_std),
            ("ALC_MZE_list", STORE.ALC_MZE_list),
            ("ALC_MAE_list", STORE.ALC_MAE_list),
            ("ALC_F1_list", STORE.ALC_F1_list),
            ("ALC_MZE", (STORE.ALC_MZE_mean, STORE.ALC_MZE_std)),
            ("ALC_MAE", (STORE.ALC_MAE_mean, STORE.ALC_MAE_std)),
            ("ALC_F1", (STORE.ALC_F1_mean, STORE.ALC_F1_std)),
            ("Redun", (STORE.Redun_mean, STORE.Redun_std)),
        ]
        workbook = xlwt.Workbook()
        for sn, values in sheet_rows:
            print("sn::",sn)
            sheet = workbook.add_sheet(sn)
            sheet.write(0, 0, method)
            for j, value in enumerate(values, start=1):
                sheet.write(0, j, value)

        save_path = str(save_dir.joinpath(name + ".xls"))
        workbook.save(save_path)

你可能感兴趣的:(python,开发语言)