# 时间:2022/6/29
# 在ALEC中,DP算法的作用主要是为了选取代表性最高的样本作为分块查询的样本,根据DP算法的思想,代表性最高的样本在局部空间中最能代表本区域。查询该样本可以提高查询分类的速度和准确度。同时在分裂块时按照密度峰值聚类的思想可用其master进行分块。
# 将DP算法封装成模块来调用。
# Coding:utf-8
# @Time:2022/6/27,11:27
# @Auther:zhang
# @file:ALEC.py
# @Software:PyCharm
import math
from scipy.io import arff
from DP.DensityPeak import DensityPeak as DP
import numpy as np
import pandas as pd
from sklearn.preprocessing import LabelEncoder
class ALEC:
    def __init__(self, datasetPath):
        '''
        Constructor: initialize all bookkeeping fields, then load the dataset.
        :param datasetPath: path to the ARFF dataset file
        '''
        # Pairwise distance matrix (filled in by initALEC)
        self.distanceMatrix = None
        # True labels of the dataset instances (integer-encoded)
        self.labels_l = None
        # Feature matrix of the dataset
        self.dataset = None
        # Ratio used to compute the cutoff radius dc
        self.dcRatio = 0
        # Upper bound on the number of label queries
        self.maxNumQuery = 0
        # Minimum block size; smaller blocks are voted directly
        self.minBlock = 0
        # Predicted labels (-1 = no prediction yet)
        self.predicts_l = []
        # Instance status: 0 = unhandled; 1 = queried; 2 = classified
        self.instanceStatus_l = []
        # Number of instances in the dataset
        self.numInstance = 0
        # Number of queries issued so far
        self.numQuery = 0
        # Class label names
        self.classValue = []
        # Number of distinct class labels
        self.numClasses = None
        # Master vector (from density-peak clustering)
        self.masters_l = None
        # Priority (representativeness) of each instance
        self.priority_l = None
        # Cluster index per instance, used when splitting blocks
        self.clusterIndies = None
        # Load the dataset
        self.readData(datasetPath)
def readData(self, datasetPath):
'''
读取数据集
:param datasetPath: 数据集地址
:return:
'''
# 读入arff文件
# 使用pandas读入
data = arff.loadarff(datasetPath)
dataset = pd.DataFrame(data[0])
samp = dataset.values[:, 0:len(dataset.values[0]) - 1]
y = dataset.values[:, -1]
templ = [(str(i).split("'")[1]) for i in y]
tempClass = np.unique(templ)
# 使用sklearn包将标称属性转化成数值
le = LabelEncoder()
le.fit(tempClass)
tempLabel = le.transform(templ)
# 初始化数据集相关参数
self.dataset = np.array(samp, dtype=float)
self.classValue = tempClass
self.numClasses = len(self.classValue)
self.numInstance = len(samp)
self.labels_l = tempLabel
def initALEC(self):
'''
初始化ALEC
:return:
'''
# 计算数据集距离矩阵
distanceMatrix = []
for i in range(len(self.dataset)):
tempdistances_l = [DP.getDistanceByEuclid(self.dataset[i], self.dataset[j]) for j in
range(len(self.dataset))]
distanceMatrix.append(tempdistances_l)
self.distanceMatrix = np.array(distanceMatrix)
# 使用密度峰值聚类算法计算优先级和master
dp = DP(self.distanceMatrix, self.dcRatio, dcType='max')
self.priority_l = dp.representativeness_l
self.masters_l = dp.masters_l
self.predicts_l = -np.ones(self.numInstance, dtype=int)
self.instanceStatus_l = np.zeros(self.numInstance, dtype=int)
def vote(self, paraBlock):
'''
对当前块进行简单投票,选取最多的类标号作为本分块的类标号
:param paraBlock: 需要投票的分块
:return:
'''
# 统计分块类标号,选取最多的类标号
tempclassCount = np.zeros(len(self.classValue), dtype=int)
for i in range(len(paraBlock)):
tempclassCount[self.labels_l[paraBlock[i]]] += 1
tempClass = np.argmax(tempclassCount)
# 对分分块实例赋予类标号
for i in paraBlock:
if self.instanceStatus_l[i] == 0:
self.instanceStatus_l[i] = 2
self.predicts_l[i] = tempClass
def coincideWithMaster(self, i):
'''
读取当前实例的簇号,没有簇号的使用其master的簇号
:param i: 欲分类实例索引
:return: 实例簇号
'''
if self.clusterIndies[i] == -1:
tempMaster = self.masters_l[i]
self.clusterIndies[i] = self.coincideWithMaster(tempMaster)
return self.clusterIndies[i]
def splitInTwo(self, paraBlock):
'''
将当前分块分裂成两个块
:param paraBlock: 欲分裂块
:return: 分裂结果
'''
# 初始簇号
self.clusterIndies = -np.ones(self.numInstance, dtype=int)
for i in range(2):
self.clusterIndies[paraBlock[i]] = i
# 对块内实例划分簇号
for index in paraBlock:
if self.clusterIndies[index] != -1:
continue
self.clusterIndies[index] = self.coincideWithMaster(self.masters_l[index])
# 按簇号进行分块
resultBlocks = [[], []]
for j in paraBlock:
if self.clusterIndies[j] == 0:
resultBlocks[0].append(j)
else:
resultBlocks[1].append(j)
print("Split (", len(paraBlock), ") instances\n ", paraBlock.__str__()
+ "\n to (", len(resultBlocks[0]), ") instances\n " + resultBlocks[0].__str__()
+ "\nand (", len(resultBlocks[1]), ") instances\n", resultBlocks[1].__str__())
return resultBlocks
def clusterBasedActiveLearning(self, dcRatio, maxNumQuery, minBlock):
'''
启动ALEC
:param dcRatio: 半径计算比率
:param maxNumQuery: 最大查询数量
:param minBlock: 最小分块
:return:
'''
self.dcRatio = dcRatio
self.maxNumQuery = maxNumQuery
self.minBlock = minBlock
self.initALEC()
block = np.argsort(self.priority_l)[::-1]
print(block)
self.numQuery = 0
self.cluserBasedActiveLearning(block)
def cluserBasedActiveLearning(self, paraBlock):
'''
递归的对数据块进行分类
:param paraBlock: 分块
:return:
'''
print("clusterBasedActiveLearning for block ", paraBlock)
# 计算当前块可查询数量
tempExpectedQuery = int(math.sqrt(len(paraBlock)))
tempQuery = 0
# 统计已经查询的数量
for i in range(len(paraBlock)):
if self.instanceStatus_l[i] == 1:
tempQuery += 1
# 若当前块小于指定的最小块,或者没有可查询的数量时,进行投票
if tempQuery >= tempExpectedQuery or len(paraBlock) <= self.minBlock:
print("", tempQuery, " instances are queried, vote for block: \r\n", paraBlock)
self.vote(paraBlock)
return
# 进行查询
for i in range(tempExpectedQuery):
if self.numQuery >= self.maxNumQuery:
print("", tempQuery, " instances are queried, vote for block: \r\n", paraBlock)
self.vote(paraBlock)
return
if self.instanceStatus_l[paraBlock[i]] == 0:
self.instanceStatus_l[paraBlock[i]] = 1
self.predicts_l[paraBlock[i]] = self.labels_l[paraBlock[i]]
self.numQuery += 1
# 检查当前分块是否是纯的,若不纯则进行分裂
tempClass = self.predicts_l[paraBlock[0]]
isPure = True
for i in paraBlock:
if self.predicts_l[i] != tempClass:
isPure = False
break
if isPure:
print("Classify for pure block: ", paraBlock)
for i in paraBlock:
if self.instanceStatus_l[i] == 0:
self.predicts_l[i] = tempClass
self.instanceStatus_l[i] = 2
return
else:
tempBlocks = self.splitInTwo(paraBlock)
for i in range(len(tempBlocks)):
self.cluserBasedActiveLearning(tempBlocks[i])
def __str__(self):
'''
便于展示结果及计算准确率
:return:
'''
tempCorrect = 0
tempStatusCounts = [0, 0, 0]
for i in range(self.numInstance):
tempStatusCounts[self.instanceStatus_l[i]] += 1
if self.predicts_l[i] == self.labels_l[i]:
tempCorrect += 1
resultString = "(unhandled, queried, classified) = " + tempStatusCounts.__str__()
resultString += "\nCorrect = " + str(tempCorrect) + ", accuracy = " + str(tempCorrect / self.numInstance)
return resultString
if __name__ == '__main__':
    # Run ALEC on the iris dataset: dc ratio 0.15, at most 30 label queries,
    # minimum block size 3; then print the accuracy summary.
    alec = ALEC('../dataset/iris.arff')
    alec.clusterBasedActiveLearning(0.15, 30, 3)
    print(alec)