主动学习ALEC——python实现

主动学习ALEC——python实现

时间:2022/6/29

文章目录

  • 主动学习ALEC——python实现
    • 算法实现
    • 算法运行测试

ALEC的算法思想见Java实现的那一篇:《基于Density Peak的主动学习_木桷的博客-CSDN博客》;DP算法的思想及其代码则见《密度峰值聚类(Density Peak Cluster,DPC)——Python实现_木桷的博客-CSDN博客》。

在ALEC中,DP算法的作用主要是选取代表性最高的样本作为各分块的查询样本。根据DP算法的思想,代表性最高的样本最能代表其所在的局部区域,查询这些样本可以提高查询分类的速度和准确度。同时,在分裂块时,可按照密度峰值聚类的思想,利用各样本的master进行分块。

算法实现

将DP算法封装成模块来调用。

# Coding:utf-8
# @Time:2022/6/27,11:27
# @Author:zhang
# @file:ALEC.py
# @Software:PyCharm
import math

from scipy.io import arff

from DP.DensityPeak import DensityPeak as DP
import numpy as np
import pandas as pd
from sklearn.preprocessing import LabelEncoder


class ALEC:
    """Active learning through density clustering (ALEC).

    Instances are ranked by the representativeness computed by the
    density-peak (DP) helper.  Blocks of instances are processed
    recursively: the most representative members of a block are queried
    for their true label; a block whose queried labels all agree is
    labelled as a whole, otherwise it is split in two along the DP
    master links and both halves are processed again.
    """

    def __init__(self, datasetPath):
        """Initialise all state and load the data set.

        :param datasetPath: path of the ARFF data set to load
        """
        # Pairwise distance matrix of the data set.
        self.distanceMatrix = None
        # Integer-encoded true label of every instance.
        self.labels_l = None
        # Feature matrix.
        self.dataset = None
        # Ratio handed to the DP helper for its cut-off radius.
        self.dcRatio = 0
        # Maximum number of labels that may be queried.
        self.maxNumQuery = 0
        # Blocks of at most this size are voted on instead of queried.
        self.minBlock = 0
        # Predicted label of every instance.
        self.predicts_l = []
        # Instance status: 0 = untouched, 1 = queried, 2 = classified.
        self.instanceStatus_l = []
        # Number of instances in the data set.
        self.numInstance = 0
        # Number of labels queried so far.
        self.numQuery = 0
        # Distinct class values.
        self.classValue = []
        # Number of distinct class values.
        self.numClasses = None
        # Master index of every instance, as produced by the DP helper.
        self.masters_l = None
        # Priority (representativeness) of every instance.
        self.priority_l = None
        # Cluster index per instance, used while splitting a block.
        self.clusterIndies = None
        # Load the data set.
        self.readData(datasetPath)

    def readData(self, datasetPath):
        """Load an ARFF file whose last column is the class attribute.

        :param datasetPath: path of the ARFF data set
        :return: None; fills dataset, classValue, numClasses, numInstance
            and labels_l
        """
        # Read the ARFF file and wrap the records in a DataFrame.
        data = arff.loadarff(datasetPath)
        dataset = pd.DataFrame(data[0])
        values = dataset.values
        samp = values[:, :-1]
        y = values[:, -1]
        # Decode byte-string labels directly instead of parsing their
        # repr(), which breaks on labels that contain an apostrophe and
        # on label columns that are not byte strings.
        templ = [v.decode("utf-8") if isinstance(v, bytes) else str(v) for v in y]
        tempClass = np.unique(templ)

        # Map the class names to integer codes.
        le = LabelEncoder()
        le.fit(tempClass)
        tempLabel = le.transform(templ)

        # Store everything derived from the data set.
        self.dataset = np.array(samp, dtype=float)
        self.classValue = tempClass
        self.numClasses = len(self.classValue)
        self.numInstance = len(samp)
        self.labels_l = tempLabel

    def initALEC(self):
        """Compute distances, priorities and masters; reset predictions.

        :return: None
        """
        # Pairwise distance matrix of the data set.
        numRows = len(self.dataset)
        distanceMatrix = []
        for i in range(numRows):
            tempdistances_l = [DP.getDistanceByEuclid(self.dataset[i], self.dataset[j])
                               for j in range(numRows)]
            distanceMatrix.append(tempdistances_l)
        self.distanceMatrix = np.array(distanceMatrix)

        # The DP helper supplies the priority and the master of every instance.
        dp = DP(self.distanceMatrix, self.dcRatio, dcType='max')
        self.priority_l = dp.representativeness_l
        self.masters_l = dp.masters_l
        # -1 marks "no prediction yet"; status 0 marks "untouched".
        self.predicts_l = -np.ones(self.numInstance, dtype=int)
        self.instanceStatus_l = np.zeros(self.numInstance, dtype=int)

    def vote(self, paraBlock):
        """Label the untouched instances of a block by majority vote.

        Only instances whose label has actually been queried (status 1)
        take part in the vote, so no unqueried ground-truth label leaks
        into the prediction.  Ties, and blocks without any queried
        instance, resolve to the lowest class index (np.argmax).

        :param paraBlock: instance indices of the block to vote on
        :return: None
        """
        # Tally the labels of the queried instances of the block.
        tempclassCount = np.zeros(len(self.classValue), dtype=int)
        for idx in paraBlock:
            if self.instanceStatus_l[idx] == 1:
                tempclassCount[self.labels_l[idx]] += 1
        tempClass = np.argmax(tempclassCount)

        # Give the winning class to every instance that is still untouched.
        for idx in paraBlock:
            if self.instanceStatus_l[idx] == 0:
                self.instanceStatus_l[idx] = 2
                self.predicts_l[idx] = tempClass

    def coincideWithMaster(self, i):
        """Return the cluster index of an instance, inheriting it if needed.

        An instance without a cluster index takes over the index of its
        master (resolved recursively) and the result is cached in
        clusterIndies.

        :param i: index of the instance
        :return: cluster index of the instance
        """
        # NOTE(review): assumes every master chain reaches an instance that
        # already has a cluster index; how the DP helper encodes the master
        # of the top instance is not visible here - confirm.
        if self.clusterIndies[i] == -1:
            tempMaster = self.masters_l[i]
            self.clusterIndies[i] = self.coincideWithMaster(tempMaster)
        return self.clusterIndies[i]

    def splitInTwo(self, paraBlock):
        """Split a block into two blocks along the master links.

        The first two instances of the block become the centres of
        cluster 0 and cluster 1; every other instance takes the cluster
        index of its master.  The block must hold at least two instances.

        :param paraBlock: instance indices of the block to split
        :return: list of two lists of instance indices, each in the order
            the instances had in paraBlock
        """
        # Reset the cluster indices and seed the two centres.
        self.clusterIndies = -np.ones(self.numInstance, dtype=int)
        for i in range(2):
            self.clusterIndies[paraBlock[i]] = i

        # Assign a cluster index to every remaining instance of the block.
        for index in paraBlock:
            if self.clusterIndies[index] != -1:
                continue
            self.clusterIndies[index] = self.coincideWithMaster(self.masters_l[index])

        # Distribute the instances over the two result blocks.
        resultBlocks = [[], []]
        for j in paraBlock:
            if self.clusterIndies[j] == 0:
                resultBlocks[0].append(j)
            else:
                resultBlocks[1].append(j)
        print("Split (", len(paraBlock), ") instances\n ", str(paraBlock)
              + "\n to (", len(resultBlocks[0]), ") instances\n " + str(resultBlocks[0])
              + "\nand (", len(resultBlocks[1]), ") instances\n", str(resultBlocks[1]))

        return resultBlocks

    def clusterBasedActiveLearning(self, dcRatio, maxNumQuery, minBlock):
        """Run ALEC on the whole data set.

        :param dcRatio: ratio handed to the DP helper for its cut-off radius
        :param maxNumQuery: maximum number of labels that may be queried
        :param minBlock: blocks of at most this size are voted on
        :return: None
        """
        self.dcRatio = dcRatio
        self.maxNumQuery = maxNumQuery
        self.minBlock = minBlock
        self.initALEC()
        # The initial block holds all instances, highest priority first.
        block = np.argsort(self.priority_l)[::-1]
        print(block)
        self.numQuery = 0
        self.cluserBasedActiveLearning(block)

    def cluserBasedActiveLearning(self, paraBlock):
        """Recursively classify one block of instances.

        :param paraBlock: instance indices of the block, highest priority first
        :return: None
        """
        print("clusterBasedActiveLearning for block ", paraBlock)

        # A block of n instances may use up to floor(sqrt(n)) queries.
        tempExpectedQuery = int(math.sqrt(len(paraBlock)))
        # Count the members of THIS block that are already queried; the
        # status array is indexed by instance index, not by the position
        # inside the block.
        tempQuery = sum(1 for idx in paraBlock if self.instanceStatus_l[idx] == 1)

        # Vote when the block is small enough or has used up its queries.
        if tempQuery >= tempExpectedQuery or len(paraBlock) <= self.minBlock:
            print("", tempQuery, " instances are queried, vote for block: \r\n", paraBlock)
            self.vote(paraBlock)
            return

        # Query the most representative instances of the block.
        for i in range(tempExpectedQuery):
            if self.numQuery >= self.maxNumQuery:
                # Global budget exhausted: fall back to voting.
                print("", tempQuery, " instances are queried, vote for block: \r\n", paraBlock)
                self.vote(paraBlock)
                return
            idx = paraBlock[i]
            if self.instanceStatus_l[idx] == 0:
                self.instanceStatus_l[idx] = 1
                self.predicts_l[idx] = self.labels_l[idx]
                self.numQuery += 1
                # Keep the per-block count accurate for the message above.
                tempQuery += 1

        # The block is pure when all of its QUERIED instances share one
        # label.  Unqueried instances still carry the placeholder -1 and
        # must not take part in the comparison.
        tempClass = self.predicts_l[paraBlock[0]]
        isPure = all(self.predicts_l[idx] == tempClass
                     for idx in paraBlock if self.instanceStatus_l[idx] == 1)

        if isPure:
            print("Classify for pure block: ", paraBlock)
            for idx in paraBlock:
                if self.instanceStatus_l[idx] == 0:
                    self.predicts_l[idx] = tempClass
                    self.instanceStatus_l[idx] = 2
            return

        # Impure block: split it and handle both halves.
        for subBlock in self.splitInTwo(paraBlock):
            self.cluserBasedActiveLearning(subBlock)

    def __str__(self):
        """Summarise status counts, correct predictions and accuracy.

        :return: human-readable result string
        """
        tempCorrect = 0
        tempStatusCounts = [0, 0, 0]
        for i in range(self.numInstance):
            tempStatusCounts[self.instanceStatus_l[i]] += 1
            if self.predicts_l[i] == self.labels_l[i]:
                tempCorrect += 1
        resultString = "(unhandled, queried, classified) = " + str(tempStatusCounts)

        resultString += "\nCorrect = " + str(tempCorrect) + ", accuracy = " + str(tempCorrect / self.numInstance)

        return resultString


if __name__ == '__main__':
    # Demo run on the iris data set: dcRatio 0.15, at most 30 queries,
    # blocks of 3 or fewer instances are voted on.
    learner = ALEC('../dataset/iris.arff')
    learner.clusterBasedActiveLearning(dcRatio=0.15, maxNumQuery=30, minBlock=3)

    # print() renders the learner through its __str__ summary.
    print(learner)

算法运行测试

使用iris数据集进行测试
主动学习ALEC——python实现_第1张图片

你可能感兴趣的:(python学习,python,学习,机器学习)