# Shattering Distribution for Active Learning
# IEEE TRANSACTIONS ON NEURAL NETWORKS AND LEARNING SYSTEMS
"""
Code of SDAL for the paper: Shattering Distribution for Active Learning.
This code is exactly the same as the original code.
"""
import xlwt
import xlrd
import math
import numpy as np
import pandas as pd
from pathlib import Path
from sklearn.model_selection import StratifiedKFold
from collections import OrderedDict
from sklearn.svm import SVC
from scipy.special import expit
from copy import deepcopy
from scipy.spatial.distance import pdist, squareform
from sklearn.metrics import accuracy_score, mean_absolute_error, f1_score
from sklearn.preprocessing import StandardScaler
from sklearn.metrics.pairwise import rbf_kernel
from KBS_NEW.PointwiseQuery.ALOR import ALOR
from sklearn.metrics import accuracy_score, mean_squared_error
from time import time
from sklearn import preprocessing
from sklearn.metrics.pairwise import pairwise_kernels
from sklearn.base import ClassifierMixin, BaseEstimator
from sklearn.utils.validation import check_X_y
from scipy.linalg import pinv, pinv2, pinvh
from sklearn.linear_model import LogisticRegression
from sklearn.cluster import KMeans
class sdal():
    """Batch sample selector for "Shattering Distribution for Active Learning".

    On construction the instance builds an RBF kernel over ``X`` and runs
    ``get_halving`` to obtain a list of candidate indices.  ``select`` then
    either labels every candidate (when their number equals the budget) or
    clusters the candidates with K-means, refines the centres with a
    growing-radius ball search and labels the candidate nearest each centre.

    Attributes:
        X, y: training features and labels as passed in.
        X_test, y_test: stored as passed in; not read by this class.
        nSample, nDim: the two entries of ``X.shape``.
        labels, nClass: sorted unique values of ``y`` and their count.
        budget: number of samples to query.
        budgetLeft: ``budget`` minus the number of samples labelled so far.
        labeled, unlabeled: lists of row indices into ``X``.
        K: RBF kernel matrix (gamma=0.1); deflated by ``get_halving``.
        lamb: regulariser used in the halving score (``10e-4``, i.e. 1e-3).
        halving_ids: candidate indices returned by ``get_halving``.
    """

    # Constants of the ball search performed in ``select``.
    _INIT_RADIUS = 0.25
    _RADIUS_GROWTH = 1 + 0.1
    _MAX_ROUNDS = 50

    def __init__(self, X, y, labeled, budget, X_test, y_test):
        self.X = X
        self.y = y
        self.nSample, self.nDim = X.shape
        self.labels = sorted(np.unique(self.y))
        self.nClass = len(self.labels)
        self.X_test = X_test
        self.y_test = y_test
        self.budget = deepcopy(budget)
        self.budgetLeft = deepcopy(budget)
        # Copy so that the caller's list is never mutated by ``select``.
        self.labeled = list(deepcopy(labeled))
        self.unlabeled = self.initialization()
        self.K = rbf_kernel(X=self.X, gamma=0.1)
        self.lamb = 10e-4
        self.halving_ids = self.get_halving()

    def initialization(self):
        """Return the indices ``0..nSample-1`` that are not in ``labeled``.

        The ascending order of the indices is preserved.
        """
        known = set(self.labeled)
        return [idx for idx in range(self.nSample) if idx not in known]

    def get_halving(self):
        """Corresponding to the Halving function in the original code.

        Repeatedly picks the unlabeled index with the largest
        ``||K[idx, :]|| / (K[idx, idx] + lamb)`` and deflates ``self.K`` by
        the rank-one term built from that row.  ``self.K`` is therefore
        modified by this call.

        Returns:
            List of picked indices, one per round; the number of rounds is
            ``max(len(self.unlabeled), self.budget)``.  Empty when there are
            no unlabeled samples.
        """
        pool = list(self.unlabeled)
        picked = []
        if not pool:
            # Nothing to score; ``max`` over an empty mapping would raise.
            return picked
        # NOTE(review): the original computes floor(len(unlabeled)), which is
        # a no-op on an integer; despite the name, no halving takes place.
        rounds = int(np.floor(len(pool)))
        if rounds < self.budget:
            rounds = self.budget
        # NOTE(review): picked indices stay in the pool, so the same index
        # can be returned more than once.  Kept as in the original.
        for _ in range(rounds):
            score = OrderedDict()
            for idx in pool:
                score[idx] = np.linalg.norm(self.K[idx, :]) / (self.K[idx, idx] + self.lamb)
            # ``max`` keeps the first index among equal scores.
            tar_idx = max(score, key=score.get)
            picked.append(tar_idx)
            self.K = self.K - np.outer(self.K[tar_idx], self.K[tar_idx]) / (self.K[tar_idx, tar_idx] + self.lamb)
        return picked

    def NumberDensity(self, data, Center, Radius):
        """Return the density score of ``data`` with respect to ``Center``.

        For every row of ``data`` the distances to the centres that are
        strictly smaller than ``Radius`` are collected; the row contributes
        ``sum(exp(d / 1.8) ** 2) / (count + 1)``.  Rows with no centre in
        range contribute 0.

        Args:
            data: 2-D array, one sample per row.
            Center: 2-D array, one centre per row.
            Radius: ball radius.
        """
        f = 0.
        for point in data:
            in_ball = []
            for centre in Center:
                gap = np.linalg.norm(point - centre)
                if gap < Radius:
                    in_ball.append(gap)
            # Builtin ``sum`` is kept on purpose: ``select`` compares two of
            # these values for exact equality, so the summation order matters.
            f = f + sum(np.exp(np.array(in_ball) / 1.8) ** 2) / (len(in_ball) + 1)
        return f

    def _refine_centers(self, data, Center):
        """Move each centre to the mean of its ball, growing the radius each round."""
        radius = self._INIT_RADIUS
        density = self.NumberDensity(data, Center, radius)
        n_center = len(Center)
        for _ in range(self._MAX_ROUNDS):
            for j in range(self.budget):
                ball = [row for row in data
                        if np.linalg.norm(row - Center[j, :]) < radius]
                if ball:
                    # An empty ball leaves the centre where it is.
                    Center[j, :] = np.mean(np.array(ball), 0)
            new_density = self.NumberDensity(data, Center, radius)
            # Stop as soon as two distinct centres are closer than 2 * radius.
            overlapping = any(
                np.linalg.norm(Center[i, :] - Center[j, :]) < 2 * radius
                for j in range(n_center)
                for i in range(n_center)
                if i != j
            )
            if new_density - density == 0 or overlapping:
                break
            density = new_density
            radius = self._RADIUS_GROWTH * radius
        return Center

    def _nearest_candidates(self, Center):
        """Return, for each centre, the index in ``halving_ids`` closest to it."""
        chosen = []
        for centre in Center:
            best_idx = None
            best_dist = np.inf
            for idx in self.halving_ids:
                gap = np.linalg.norm(centre - self.X[idx])
                # ``<=`` keeps the last candidate among equal distances.
                if gap <= best_dist:
                    best_dist = gap
                    best_idx = idx
            if best_idx is not None:
                chosen.append(int(best_idx))
        return chosen

    def _mark_labeled(self, ids):
        """Move ``ids`` from ``unlabeled`` to ``labeled`` and charge the budget.

        Indices that are no longer in ``unlabeled`` (repeats, or samples that
        are already labeled) are skipped and cost nothing.
        """
        for idx in ids:
            idx = int(idx)
            if idx not in self.unlabeled:
                continue
            self.unlabeled.remove(idx)
            self.labeled.append(idx)
            self.budgetLeft -= 1

    def select(self):
        """Corresponding to the SDAL function in the original codes.

        Labels ``halving_ids`` directly when their number equals the budget.
        Otherwise fits K-means with ``budget`` clusters on the candidates,
        refines the centres and labels the candidate nearest each centre.
        In both cases ``labeled``, ``unlabeled`` and ``budgetLeft`` are kept
        consistent, and the appended indices are plain ``int``.

        Returns:
            ``self``.
        """
        if not self.halving_ids:
            return self
        if self.budget == len(self.halving_ids):
            self._mark_labeled(self.halving_ids)
            return self
        data = self.X[self.halving_ids]
        clf = KMeans(n_clusters=self.budget)
        clf.fit(data)
        Center = self._refine_centers(data, clf.cluster_centers_)
        self._mark_labeled(self._nearest_candidates(Center))
        return self
def _read_index_column(sheet, col):
    """Return the values in column ``col`` of an xlrd sheet as a list of ints.

    Only cells that xlrd returns as ``float`` are kept; anything else is
    skipped (presumably the blank cells padding the shorter columns).
    """
    return [int(value) for value in sheet.col_values(col) if isinstance(value, float)]


if __name__ == '__main__':
    # Experiment driver: for every dataset and every stored partition, run
    # SDAL and write the train / test / initial-labeled / final-labeled index
    # lists to one sheet of an .xls workbook.
    names_list = ["PowerPlant-5bin"]
    for name in names_list:
        print("########################{}".format(name))
        data_path = Path(r"D:\OCdata")
        partition_path = Path(r"E:\CCCCC_Result\DataPartitions")

        # -------------- read the whole data --------------
        read_data_path = data_path.joinpath(name + ".csv")
        data = np.array(pd.read_csv(read_data_path, header=None))
        # All columns but the last are features; they are standardised.
        X = np.asarray(data[:, :-1], np.float64)
        scaler = StandardScaler()
        X = scaler.fit_transform(X)
        # The last column holds the labels, shifted so the smallest is 0.
        y = data[:, -1]
        y -= y.min()
        nClass = len(np.unique(y))
        Budget = 10 * nClass

        # -------------- read the partitions --------------
        read_partition_path = str(partition_path.joinpath(name + ".xls"))
        book_partition = xlrd.open_workbook(read_partition_path)

        workbook = xlwt.Workbook()
        for SN in book_partition.sheet_names():
            S_Time = time()
            table_partition = book_partition.sheet_by_name(SN)
            # Columns 0, 1, 2: train ids, test ids, initially labeled ids.
            train_idx = _read_index_column(table_partition, 0)
            test_idx = _read_index_column(table_partition, 1)
            labeled = _read_index_column(table_partition, 2)

            X_train = X[train_idx]
            y_train = y[train_idx].astype(np.int32)
            X_test = X[test_idx]
            y_test = y[test_idx]

            model = sdal(X=X_train, y=y_train, labeled=labeled, budget=Budget, X_test=X_test, y_test=y_test)
            model.select()

            # Output columns 0-2 repeat the input lists; column 3 is the
            # labeled list held by the model after selection.
            sheet = workbook.add_sheet(SN)
            for col, ids in enumerate((train_idx, test_idx, labeled, model.labeled)):
                for row, idx in enumerate(ids):
                    sheet.write(row, col, int(idx))
            print("SN:",SN," Time:",time()-S_Time)

        save_path = Path(r"E:\CCCCC_Result\SelectedResult\SDAL")
        save_path = str(save_path.joinpath(name + ".xls"))
        workbook.save(save_path)