This post reproduces the code of the paper Attributed Graph Clustering with Dual Redundancy Reduction (IJCAI 2022).
Attributed graph clustering is a fundamental yet essential method for graph data exploration. Recent efforts on graph contrastive learning have achieved impressive clustering performance. However, the authors observe that the redundant information captured by existing contrastive schemes limits downstream clustering performance.
To this end, the authors develop a novel method, termed Attributed Graph Clustering with Dual Redundancy Reduction (AGC-DRR), to reduce the information redundancy in both the input space and the latent feature space. Specifically, in the input space an adversarially trained view learner adaptively decides which edges to down-weight, keeping the contrasted sample pairs diverse; in the latent feature space, the cross-view sample correlation matrix is forced to approximate an identity matrix.
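As a minimal sketch of the latent-space objective (my own illustration under the assumption of row-normalized embeddings, not the authors' exact code): the cross-view correlation matrix is pushed towards the identity, so the diagonal keeps each sample aligned with its counterpart in the other view while the off-diagonal (redundant) correlations are suppressed.
import torch
import torch.nn.functional as F

def latent_redundancy_loss(z1, z2):
    # row-normalize so that inner products become cosine similarities
    z1 = F.normalize(z1, dim=1)
    z2 = F.normalize(z2, dim=1)
    corr = z1 @ z2.T  # (n, n) cross-view correlation matrix
    eye = torch.eye(z1.shape[0], device=z1.device)
    # diagonal -> 1 (self agreement), off-diagonal -> 0 (redundancy removed)
    return F.mse_loss(corr, eye)
The full reproduction follows, file by file, starting with main.py.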
import opt
import torch
import numpy as np
from GAE import IGAE,IGAE_encoder
from view_learner import ViewLearner
from utils import *
from train import Train_gae
from sklearn.decomposition import PCA
import warnings
'''
main.py
'''
# suppress warnings so they are not printed
warnings.filterwarnings('ignore')
setup()
print("use cuda: {}".format(opt.args.cuda))
device = opt.args.device
# load the graph dataset (node features, labels, adjacency matrix)
x, y, adj = load_graph_data(opt.args.name)
# reduce the feature dimensionality with PCA
pca1 = PCA(n_components=opt.args.n_components)
x1 = pca1.fit_transform(x)
# edge list of the original graph, shape (2, num_edges)
edge_index1 = np.array(adj.nonzero())
adj = normalize_adj(adj, True)
data = torch.from_numpy(x1).float()
# the normalized adjacency is already a dense numpy array, so no to_dense() is needed
adj = torch.from_numpy(adj).float()
model_gae = IGAE(
gae_n_enc_1=opt.args.gae_n_enc_1,
gae_n_enc_2=opt.args.gae_n_enc_2,
gae_n_enc_3=opt.args.gae_n_enc_3,
n_input=data.shape[1]
).to(device)
view_learner = ViewLearner(
IGAE_encoder(gae_n_enc_1=opt.args.gae_n_enc_1,
gae_n_enc_2=opt.args.gae_n_enc_2,
gae_n_enc_3=opt.args.gae_n_enc_3,
n_input=data.shape[1]),
).to(device)
Train_gae(model_gae, view_learner, data.to(device), adj.to(device), y, edge_index1)
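Assuming the dataset files follow the layout expected by load_graph_data below (dataset/&lt;name&gt;/&lt;name&gt;_feat.npy, &lt;name&gt;_label.npy, &lt;name&gt;_adj.npy), a typical run would be:
python main.py --name dblp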
import numpy as np
import scipy.sparse as sp
import torch
import random
import opt
from sklearn import metrics
from munkres import Munkres
import torch.nn.functional as F
from sklearn.cluster import KMeans
from sklearn.metrics import adjusted_rand_score as ari_score
from sklearn.metrics.cluster import normalized_mutual_info_score as nmi_score
'''
utils.py
'''
# set up--------------------------------------------------------------------------start
def setup():
    # a fresh random seed is drawn each run; fix a constant here for reproducibility
    setup_seed(np.random.randint(1000))
if opt.args.name == 'acm':
print('acm...............')
opt.args.n_clusters = 3
opt.args.t = 2
opt.args.view_lr = 1e-4
opt.args.lr = 5e-4
# opt.args.n_input = 100
elif opt.args.name == 'dblp':
print('dblp...............')
opt.args.n_clusters = 4
opt.args.t = 2
opt.args.view_lr = 1e-4
opt.args.lr = 5e-4
# opt.args.n_input = 50
elif opt.args.name == 'amap':
print('amap...............')
opt.args.n_clusters = 8
opt.args.t = 5
opt.args.view_lr = 1e-4
opt.args.lr = 5e-4
elif opt.args.name == 'cite':
print('cite...............')
opt.args.n_clusters = 6
opt.args.t = 2
opt.args.view_lr = 1e-3
opt.args.lr = 1e-4
elif opt.args.name == 'cora':
print('cora...............')
opt.args.n_clusters = 7
opt.args.t = 2
opt.args.view_lr = 1e-4
opt.args.lr = 5e-4
elif opt.args.name == 'corafull':
print('corafull...............')
opt.args.n_clusters = 70
opt.args.t = 2
opt.args.view_lr = 1e-4
opt.args.lr = 5e-4
else:
print("error!")
exit(0)
opt.args.device = torch.device(f"cuda:{opt.args.gpu}" if opt.args.cuda else "cpu")
def setup_seed(seed):
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
    np.random.seed(seed)
    random.seed(seed)
    torch.backends.cudnn.benchmark = False
    torch.backends.cudnn.deterministic = True
# set up-------------------------------------------------------------------------------end
# torch-numpy transform--------------------------------------------------------------start
def numpy_to_torch(a, sparse=False):
    if sparse:
        a = torch.FloatTensor(a).to_sparse()
    else:
        a = torch.FloatTensor(a)
    return a
def torch_to_numpy(t):
    return t.detach().cpu().numpy()
# torch-numpy transform----------------------------------------------------------------end
def load_graph_data(dataset_name, show_details=False):
load_path = "dataset/" + dataset_name + "/" + dataset_name
feat = np.load(load_path+"_feat.npy", allow_pickle=True)
label = np.load(load_path+"_label.npy", allow_pickle=True)
adj = np.load(load_path+"_adj.npy", allow_pickle=True)
if show_details:
print("-----details of graph dataset------")
print("dataset name: ", dataset_name)
print("feature shape: ", feat.shape)
print("label shape: ", label.shape)
print("adj shape: ", adj.shape)
print("undirected edge num: ", int(np.nonzero(adj)[0].shape[0]/2))
print("category num: ", max(label)-min(label)+1)
print("category distribution: ")
for i in range(max(label)+1):
print("label", i, end=":")
print(len(label[np.where(label == i)]))
print("++++++++++++++++++++++++++++++")
opt.args.n_input = feat.shape[1]
return feat, label, adj
def new_graph(edge_index, weight, n, device):
    # build a sparse n x n weighted adjacency matrix from an edge list
    edge_index = edge_index.cpu().numpy()
    indices = torch.from_numpy(
        np.vstack((edge_index[0], edge_index[1])).astype(np.int64)).to(device)
    values = weight
    shape = torch.Size((n, n))
    return torch.sparse_coo_tensor(indices, values, shape)
def adjust_learning_rate(optimizer, epoch):
lr = 0.001 * (0.1 ** (epoch // 50))
for param_group in optimizer.param_groups:
param_group['lr'] = lr
def parameter(model):
params = list(model.parameters())
k = 0
for i in params:
l = 1
for j in i.size():
l *= j
k = k + l
print("sum:" + str(k))
return str(k)
# load model parameter------------------------------------------------------------------start
def load_pretrain_parameter(model):
pretrained_dict = torch.load('model_pretrain/{}_pretrain.pkl'.format(opt.args.name), map_location='cpu')
model_dict = model.state_dict()
pretrained_dict = {k: v for k, v in pretrained_dict.items() if k in model_dict}
model_dict.update(pretrained_dict)
model.load_state_dict(model_dict)
return model
def model_init(model, X, adj, y):
    # initialize cluster centers from a pretrained model
    # (IGAE.forward returns the embeddings and the soft cluster assignments)
    model = load_pretrain_parameter(model)
    with torch.no_grad():
        Z, C = model(X, adj)
    acc, nmi, ari, f1, centers = clustering(Z, y)
    return centers
# load model parameter--------------------------------------------------------------------end
# Two methods to add noise--------------------------------------------------------------start
def gaussian_noised_feature(X):
"""
add gaussian noise to the attribute matrix X
Args:
X: the attribute matrix
Returns: the noised attribute matrix Y
"""
N = torch.Tensor(np.random.normal(0, 0.01, X.shape)).to(opt.args.device)
Y = X + N
return Y
def gaussian_noised_feature_(X):
"""
add gaussian noise to the attribute matrix X
Args:
X: the attribute matrix
Returns: the noised attribute matrix Y
"""
N = torch.Tensor(np.random.normal(1, 0.01, X.shape)).to(opt.args.device)
Y = X * N
return Y
# Two methods to add noise----------------------------------------------------------------end
# Norm something------------------------------------------------------------------------start
def normalize_adj(adj, symmetry=True):
    # add self-loops
    adj = adj + np.eye(adj.shape[0])
    # calculate the degree matrix and its inverse
    row_sum = adj.sum(1)
    if symmetry:
        d1 = np.diag(np.power(row_sum, -0.5))
        norm_adj = np.matmul(np.matmul(d1, adj), d1)  # symmetric normalization: D^{-0.5} (A + I) D^{-0.5}
    else:
        d2 = np.diag(np.power(row_sum, -1))
        norm_adj = np.matmul(d2, adj)  # non-symmetric normalization: D^{-1} (A + I)
    return norm_adj
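A quick sanity check of the symmetric normalization on a toy 3-node path graph (a standalone snippet, not part of utils.py):
A = np.array([[0., 1., 0.],
              [1., 0., 1.],
              [0., 1., 0.]])
A_norm = normalize_adj(A, symmetry=True)   # D^{-0.5} (A + I) D^{-0.5}
assert np.allclose(A_norm, A_norm.T)       # a symmetric input stays symmetric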
def normalize(mx):
rowsum = np.array(mx.sum(1))
r_inv = np.power(rowsum, -1).flatten()
r_inv[np.isinf(r_inv)] = 0.
r_mat_inv = sp.diags(r_inv)
mx = r_mat_inv.dot(mx)
return mx
def normalize_lap_(adj, self_loop=True, symmetry=False):
ident = np.eye(adj.shape[0])
if self_loop:
adj_tmp = adj + ident
else:
adj_tmp = adj
    # calculate the degree matrix and its inverse
    row_sum = adj_tmp.sum(1)
    L = np.diag(row_sum) - adj_tmp
    if symmetry:
        d1 = np.diag(np.power(row_sum, -0.5))
        norm_L = np.matmul(np.matmul(d1, L), d1)  # symmetric normalization: D^{-0.5} L D^{-0.5}
    else:
        d2 = np.diag(np.power(row_sum, -1))
        norm_L = np.matmul(d2, L)  # non-symmetric normalization: D^{-1} L
return norm_L
def get_adjs(adj, norm=True):
    ident = np.eye(adj.shape[0])
    norm_L = normalize_lap_(adj, True, norm)
    reg = [1] * 2
    adjs = []
    for i in range(len(reg)):
        adjs.append(ident - reg[i] * norm_L)
    return adjs
def get_laps(adj, norm = True):
norm_L = normalize_lap_(adj, True, norm)
laps = []
t = 2
for i in range(t):
laps.append(norm_L)
return laps
# Norm something------------------------------------------------------------------------end
# Calculating loss---------------------------------------------------------------------start
def target_distribution(Q):
weight = Q ** 2 / Q.sum(0)
P = (weight.t() / weight.sum(1)).t()
return P
def cross_correlation(X, Y):
return torch.mm(X, Y.t())
def cross_view_loss(X, Y, A):
S = cross_correlation(X, Y)
L_cv = (A-S).pow(2).mean()
return L_cv
def distribution_loss(Q, P):
loss = F.kl_div(Q.log(), P, reduction='batchmean')
return loss
def reconstruction_loss(X, X_):
loss_rec = F.mse_loss(X, X_)
return loss_rec
# Calculating loss-----------------------------------------------------------------------end
# Clustering and Evaluation-------------------------------------------------------------start
def clustering(Z, y):
model = KMeans(n_clusters=opt.args.n_clusters, n_init=20)
cluster_id = model.fit_predict(Z.data.cpu().numpy())
acc, nmi, ari, f1 = eva(y, cluster_id, show_details=opt.args.show_details)
return acc, nmi, ari, f1, model.cluster_centers_
def cluster_acc(y_true, y_pred):
y_true = y_true - np.min(y_true)
l1 = list(set(y_true))
num_class1 = len(l1)
l2 = list(set(y_pred))
num_class2 = len(l2)
ind = 0
if num_class1 != num_class2:
for i in l1:
if i in l2:
pass
else:
y_pred[ind] = i
ind += 1
    l2 = list(set(y_pred))
    num_class2 = len(l2)
    if num_class1 != num_class2:
        print('error')
        return
    cost = np.zeros((num_class1, num_class2), dtype=int)
for i, c1 in enumerate(l1):
mps = [i1 for i1, e1 in enumerate(y_true) if e1 == c1]
for j, c2 in enumerate(l2):
mps_d = [i1 for i1 in mps if y_pred[i1] == c2]
cost[i][j] = len(mps_d)
m = Munkres()
cost = cost.__neg__().tolist()
indexes = m.compute(cost)
new_predict = np.zeros(len(y_pred))
for i, c in enumerate(l1):
c2 = l2[indexes[i][1]]
ai = [ind for ind, elm in enumerate(y_pred) if elm == c2]
new_predict[ai] = c
acc = metrics.accuracy_score(y_true, new_predict)
f1_macro = metrics.f1_score(y_true, new_predict, average='macro')
return acc, f1_macro
def eva(y_true, y_pred, show_details=False):
acc, f1 = cluster_acc(y_true, y_pred)
nmi = nmi_score(y_true, y_pred, average_method='arithmetic')
ari = ari_score(y_true, y_pred)
if show_details:
        print('acc {:.4f}'.format(acc), ', nmi {:.4f}'.format(nmi), ', ari {:.4f}'.format(ari),
              ', f1 {:.4f}'.format(f1))
return acc, nmi, ari, f1
# Clustering and Evaluation----------------------------------------------------------------end
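To see the effect of the Hungarian (Munkres) matching inside cluster_acc, here is a small standalone check (illustrative values, to be run after importing utils): a pure relabelling of the clusters should still score perfectly.
y_true = np.array([0, 0, 1, 1, 2, 2])
y_pred = np.array([2, 2, 0, 0, 1, 1])  # same partition, shuffled ids
acc, nmi, ari, f1 = eva(y_true, y_pred, show_details=True)  # expect all four metrics = 1.0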
import torch
from opt import args
from utils import *
from torch.optim import Adam
import torch.nn.functional as F
'''
train.py
'''
def Train_gae(model, view_learner, data, adj, label, edge_index):
    acc_result = []
    nmi_result = []
    ari_result = []
    f1_result = []
    view_optimizer = Adam(view_learner.parameters(), lr=args.view_lr)
    optimizer = Adam(model.parameters(), lr=args.lr)
    n = data.shape[0]
    device = args.device
    model.eval()
    for epoch in range(args.epoch):
        # ----------------------Training view_learner---------------------
        view_learner.train()
        view_learner.zero_grad()
        z_igae, c = model(data, adj)
        edge_logits = view_learner(data, adj, edge_index)
        batch_aug_edge_weight = torch.sigmoid(edge_logits).squeeze()  # edge-keeping probability p
        aug_adj = new_graph(torch.tensor(edge_index).to(device), batch_aug_edge_weight, n, device)
        aug_adj = aug_adj.to_dense()
        aug_adj = aug_adj * adj
        aug_adj = aug_adj.cpu().detach().numpy() + np.eye(n)
        aug_adj = torch.from_numpy(normalize(aug_adj)).to(torch.float32).to(device)
        aug_z_igae, aug_c = model(data, aug_adj)
        edge_drop_out_prob = 1 - batch_aug_edge_weight
        reg = edge_drop_out_prob.mean()
        view_loss = (args.reg_lambda * reg) + model.calc_loss(c.T, aug_c.T) + model.calc_loss(c, aug_c)
        # adversarial step: the view learner maximizes the loss, so descend on its negation
        (-view_loss).backward()
        view_optimizer.step()
        # ----------------------Training model---------------------
        view_learner.eval()
        model.train()
        model.zero_grad()
        z_igae, c = model(data, adj)
        edge_logits = view_learner(data, adj, edge_index)
        batch_aug_edge_weight = torch.sigmoid(edge_logits).squeeze()  # edge-keeping probability p
        aug_adj = new_graph(torch.tensor(edge_index).to(device), batch_aug_edge_weight, n, device)
        aug_adj = aug_adj.to_dense()
        aug_adj = aug_adj * adj
        aug_adj = aug_adj.cpu().detach().numpy() + np.eye(n)
        aug_adj = torch.from_numpy(normalize(aug_adj)).to(torch.float32).to(device)
        aug_z_igae, aug_c = model(data, aug_adj)
        # cross-view embedding correlation, pushed towards the identity matrix
        z_mat = torch.matmul(z_igae, aug_z_igae.T)
        model_loss = model.calc_loss(c.T, aug_c.T) + F.mse_loss(z_mat, torch.eye(n).to(device)) + model.calc_loss(c, aug_c)
        model_loss.backward()
        optimizer.step()
        model.eval()
        # ---------------------evaluation----------------------------
        print('{} loss: {:.4f}'.format(epoch, model_loss.item()))
        # fuse the soft assignments of both views and take the argmax as the prediction
        z = (c + aug_c) / 2
        i = z.argmax(dim=-1)
        acc, nmi, ari, f1 = eva(label, i.data.cpu().numpy(), show_details=args.show_details)
        acc_result.append(acc)
        nmi_result.append(nmi)
        ari_result.append(ari)
        f1_result.append(f1)
    return acc_result, nmi_result, ari_result, f1_result
import argparse
'''
opt.py
'''
parser = argparse.ArgumentParser(description='AGC-DRR', formatter_class=argparse.ArgumentDefaultsHelpFormatter)
parser.add_argument('--name', type=str, default='dblp')
parser.add_argument('--lr', type=float, default=1e-4)
parser.add_argument('--k', type=int, default=None)
parser.add_argument('--n_clusters', type=int, default=3)
parser.add_argument('--n_z', type=int, default=10)
parser.add_argument('--n_input', type=int, default=100)
parser.add_argument('--gamma_value', type=float, default=1)
parser.add_argument('--data_path', type=str, default='.txt')
parser.add_argument('--label_path', type=str, default='.txt')
parser.add_argument('--save_path', type=str, default='.txt')
parser.add_argument('--cuda', type=bool, default=True)  # note: argparse parses any non-empty string as True
parser.add_argument('--gpu', type=int, default=0)
parser.add_argument('--n_components', type=int, default=50)
parser.add_argument('--batch_size', type=int, default=1600)
parser.add_argument('--epoch', type=int, default=400)
parser.add_argument('--acc', type=float, default=-1)
parser.add_argument('--shuffle', type=bool, default=True)
parser.add_argument('--show_details', type=bool, default=False, help='print clustering metrics during evaluation')
parser.add_argument('--gae_n_enc_1', type=int, default=1000)
parser.add_argument('--gae_n_enc_2', type=int, default=500)
parser.add_argument('--gae_n_enc_3', type=int, default=500)
parser.add_argument('--emb_dim', type=int, default=500,help='embedding dimension')
parser.add_argument('--dataset', type=str, default='ogbg-molesol',help='Dataset')
parser.add_argument('--view_lr', type=float, default=1e-4,help='View Learning rate.')
parser.add_argument('--num_gc_layers', type=int, default=5,help='Number of GNN layers before pooling')
parser.add_argument('--pooling_type', type=str, default='standard',help='GNN Pooling Type Standard/Layerwise')
parser.add_argument('--mlp_edge_model_dim', type=int, default=128,help='embedding dimension')
parser.add_argument('--pred_dim', type=int, default=64,help='embedding dimension')
parser.add_argument('--drop_ratio', type=float, default=0.0,help='Dropout Ratio / Probability')
parser.add_argument('--reg_lambda', type=float, default=1, help='View Learner Edge Perturb Regularization Strength')
parser.add_argument('--seed', type=int, default=0)
args = parser.parse_args()
import torch
from torch import nn
import torch.nn.functional as F
from torch.nn import Module, Parameter, Dropout
from opt import args
'''
GAE.py
'''
class GNNLayer(Module):
def __init__(self, in_features, out_features):
super(GNNLayer, self).__init__()
self.in_features = in_features
self.out_features = out_features
self.w = Parameter(torch.FloatTensor(out_features, in_features))
self.act = nn.Tanh()
torch.nn.init.xavier_uniform_(self.w)
    def forward(self, features, adj, active):
        if active:
            support = self.act(F.linear(features, self.w))  # linear transform (no bias) + Tanh
        else:
            support = F.linear(features, self.w)  # linear transform (no bias)
        output = torch.mm(adj, support)  # neighborhood aggregation
        return output
class IGAE_encoder(nn.Module):
def __init__(self, gae_n_enc_1, gae_n_enc_2, gae_n_enc_3, n_input):
super(IGAE_encoder, self).__init__()
self.gnn_1 = GNNLayer(n_input, gae_n_enc_1)
self.gnn_2 = GNNLayer(gae_n_enc_1, gae_n_enc_2)
self.gnn_3 = GNNLayer(gae_n_enc_2, gae_n_enc_3)
self.s = nn.Sigmoid()
def forward(self, x, adj):
z = self.gnn_1(x, adj, active=True)
z = self.gnn_2(z, adj, active=True)
z_igae = self.gnn_3(z, adj, active=False)
return z_igae
class Cluster_layer(nn.Module):
    def __init__(self, in_dims, out_dims):
        super(Cluster_layer, self).__init__()
        self.l = nn.Sequential(nn.Linear(in_dims, out_dims),
                               nn.Softmax(dim=1))
    def forward(self, h):
        c = self.l(h)
        return c
class IGAE(nn.Module):
def __init__(self, gae_n_enc_1, gae_n_enc_2, gae_n_enc_3, n_input):
super(IGAE, self).__init__()
self.encoder = IGAE_encoder(
gae_n_enc_1=gae_n_enc_1,
gae_n_enc_2=gae_n_enc_2,
gae_n_enc_3=gae_n_enc_3,
n_input=n_input,
)
self.cluster = Cluster_layer(
in_dims=gae_n_enc_3,
out_dims=args.n_clusters,
)
def forward(self, x, adj):
z_igae = self.encoder(x, adj)
c = self.cluster(z_igae)
return z_igae, c
    @staticmethod
    def calc_loss(x, x_aug, temperature=0.2, sym=True):
        # InfoNCE-style contrastive loss between two views
        batch_size = x.shape[0]
        x_abs = x.norm(dim=1)
        x_aug_abs = x_aug.norm(dim=1)
        # pairwise cosine similarities, exponentiated with a temperature
        sim_matrix = torch.einsum('ik,jk->ij', x, x_aug) / torch.einsum('i,j->ij', x_abs, x_aug_abs)
        sim_matrix = torch.exp(sim_matrix / temperature)
        pos_sim = sim_matrix[range(batch_size), range(batch_size)]
        if sym:
            loss_0 = pos_sim / (sim_matrix.sum(dim=0) - pos_sim)
            loss_1 = pos_sim / (sim_matrix.sum(dim=1) - pos_sim)
            loss_0 = -torch.log(loss_0).mean()
            loss_1 = -torch.log(loss_1).mean()
            loss = (loss_0 + loss_1) / 2.0
        else:
            loss = pos_sim / (sim_matrix.sum(dim=1) - pos_sim)
            loss = -torch.log(loss).mean()
        return loss
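A quick shape check for calc_loss (illustrative, with made-up tensors): it takes two (batch, dim) views and returns a scalar; Train_gae applies it to the soft-assignment matrices c and aug_c along both axes.
x = torch.randn(8, 16)
x_aug = torch.randn(8, 16)
loss = IGAE.calc_loss(x, x_aug)  # scalar tensor
print(loss.shape)  # torch.Size([])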
import torch
from torch.nn import Sequential, Linear, ReLU
from opt import args
'''
view_learner.py
'''
class ViewLearner(torch.nn.Module):
    def __init__(self, encoder, mlp_edge_model_dim=64):
        super(ViewLearner, self).__init__()
        self.encoder = encoder
        self.input_dim = args.emb_dim
        # a single linear layer scores each edge from its concatenated endpoint embeddings
        self.mlp_edge_model = Sequential(
            Linear(self.input_dim * 2, 1),
        )
self.init_emb()
def init_emb(self):
for m in self.modules():
if isinstance(m, Linear):
torch.nn.init.xavier_uniform_(m.weight.data)
if m.bias is not None:
m.bias.data.fill_(0.0)
    def forward(self, x, adj, edge_index):
        node_emb = self.encoder(x, adj)
        src, dst = edge_index[0], edge_index[1]
        emb_src = node_emb[src]
        emb_dst = node_emb[dst]
        # concatenate the embeddings of the two endpoints of every edge
        edge_emb = torch.cat([emb_src, emb_dst], 1)
        edge_logits = self.mlp_edge_model(edge_emb)
        return edge_logits