对于顶点 $i$,逐个计算该节点与它的邻居节点 $j\in \mathcal{N}_i$ 之间的注意力系数:$e_{ij}=\left[h_iW \,\Vert\, h_jW\right]a$
shape:$h_i, h_j=[1, in\_feas]$,$W=[in\_feas, out\_feas]$,$a=[2*out\_feas, 1]$
其中,$h_i$、$h_j$ 为节点 $i$ 和节点 $j$ 的特征向量,$W$、$a$ 是模型需要训练的参数。
有了注意力系数 $e_{ij}$(未归一化),只需再用 $softmax$ 归一化即可得到注意力权重。论文在 $softmax$ 之前加了一个 $LeakyReLU$ 进行非线性激活,即得到最终的节点 $i$ 对节点 $j$ 的图注意力系数 $\alpha_{ij}$:$\alpha_{ij}=\frac{\exp\left(LeakyReLU(e_{ij})\right)}{\sum_{k\in \mathcal{N}_i}\exp\left(LeakyReLU(e_{ik})\right)}$
此外,论文参考了self-attention的多头注意力机制(multi-head attention),通过多个注意力头来增强节点表示。
完成第一步,已经成功一大半了。第二步很简单,根据计算好的注意力系数,把特征加权求和(aggregate)一下。
$h_i^{'}=\sigma\left(\sum_{j\in \mathcal{N}_i}\alpha_{ij}h_jW\right)$,式中:$h_i^{'}$ 就是 GAT 输出的对于每个顶点 $i$ 的新特征(融合了邻域信息),$\sigma(\bullet)$ 是激活函数。
为了提高聚合器的表现,论文中采用了 multi-head attention,即使用 $k$ 个独立的注意力机制(采用不同的 $a$ 和 $W$),然后将得到的结果再次拼接:$h_i^{'}=\Vert_{m=1}^{k}\,\sigma\left(\sum_{j\in \mathcal{N}_i}\alpha_{ij}^{m}h_jW^{m}\right)$
但是,这会导致 $h_i^{'}$ 有更高的维度 $(1, kh^{'})$,所以只可以做中间层而不可以做输出层。所以,对于输出层,一种聚合方式是将各注意力机制的 $h^{'}$ 平均:$h_i^{'}=\sigma\left(\frac{1}{k}\sum_{m=1}^{k}\sum_{j\in \mathcal{N}_i}\alpha_{ij}^{m}h_jW^{m}\right)$
1)定义图注意力层(Graph Attention Layer):
import torch
import torch.nn as nn
from torch.nn import functional as F
class GraphAttentionLayer(nn.Module):
    """Single-head dense graph attention layer (https://arxiv.org/abs/1710.10903).

    The layer projects node features with ``W``, scores every ordered node
    pair with the attention vector ``a``, masks pairs that are not connected
    in ``adj``, softmax-normalises each row and uses the resulting weights to
    aggregate the projected features.

    Args:
        in_features: size of each input node feature vector.
        out_features: size of each output node feature vector.
        dropout: dropout probability applied to the attention weights.
        alpha: negative slope of the LeakyReLU applied to the raw scores.
        concat: if True the output goes through ELU (hidden-layer usage);
            if False the aggregated features are returned unchanged.
    """

    def __init__(self, in_features, out_features, dropout, alpha, concat=True):
        super(GraphAttentionLayer, self).__init__()
        self.in_features = in_features
        self.out_features = out_features
        self.dropout = dropout
        self.alpha = alpha
        self.concat = concat

        # Trainable parameters: W is the feature projection, a the attention
        # vector. Both are Xavier-initialised.
        self.W = nn.Parameter(torch.zeros(size=(in_features, out_features)))
        nn.init.xavier_uniform_(self.W.data, gain=1.414)
        self.a = nn.Parameter(torch.zeros(size=(2 * out_features, 1)))
        nn.init.xavier_uniform_(self.a.data, gain=1.414)

        self.leakyrelu = nn.LeakyReLU(self.alpha)

    def forward(self, inp, adj):
        """Compute attention-weighted node representations.

        Args:
            inp: node features, shape [N, in_features].
            adj: adjacency matrix, shape [N, N]; entries > 0 mark an edge.

        Returns:
            Tensor of shape [N, out_features].
        """
        h = torch.mm(inp, self.W)  # [N, out_features]

        # a^T [h_i || h_j] == a_src^T h_i + a_dst^T h_j, so the pairwise
        # scores are obtained by broadcasting two [N, 1] vectors instead of
        # materialising an [N, N, 2 * out_features] tensor of concatenations.
        src_score = torch.matmul(h, self.a[:self.out_features, :])  # [N, 1]
        dst_score = torch.matmul(h, self.a[self.out_features:, :])  # [N, 1]
        e = self.leakyrelu(src_score + dst_score.t())  # [N, N], unnormalised

        # Pairs without an edge get a very negative score so that softmax
        # assigns them (numerically) zero weight.
        masked = torch.where(adj > 0, e, torch.full_like(e, -1e12))
        attention = F.softmax(masked, dim=1)  # each row sums to 1
        attention = F.dropout(attention, self.dropout, training=self.training)

        # [N, N] x [N, out_features] -> [N, out_features]
        h_prime = torch.matmul(attention, h)

        if self.concat:
            return F.elu(h_prime)
        return h_prime

    def __repr__(self):
        return self.__class__.__name__ + ' (' + str(self.in_features) + ' -> ' + str(self.out_features) + ')'
2)定义图注意力网络(GAT):
在图注意力层基础上加入multi-head机制
import torch
import torch.nn as nn
from torch.nn import functional as F
class GAT(nn.Module):
    """Dense GAT: several parallel attention heads followed by one output head."""

    def __init__(self, n_feat, n_hid, n_class, dropout, alpha, n_heads):
        """Build ``n_heads`` hidden attention layers plus the output layer.

        Args:
            n_feat: input feature size per node.
            n_hid: output size of each hidden attention head.
            n_class: output size of the final attention layer.
            dropout: dropout probability (inputs, hidden features and,
                through the layers, attention weights).
            alpha: LeakyReLU negative slope forwarded to every layer.
            n_heads: number of hidden heads whose outputs are concatenated.
        """
        super(GAT, self).__init__()
        self.dropout = dropout

        # The heads live in a plain list, so each one is registered by hand
        # under the name 'attention_<i>' to make its parameters visible.
        self.attentions = []
        for head_idx in range(n_heads):
            head = GraphAttentionLayer(n_feat, n_hid, dropout=dropout, alpha=alpha, concat=True)
            self.attentions.append(head)
            self.add_module('attention_{}'.format(head_idx), head)

        # Output layer consumes the concatenation of all heads.
        self.out_att = GraphAttentionLayer(n_hid * n_heads, n_class, dropout=dropout, alpha=alpha, concat=False)

    def forward(self, x, adj):
        """Return per-node log-probabilities, shape [N, n_class]."""
        x = F.dropout(x, self.dropout, training=self.training)

        head_outputs = []
        for head in self.attentions:
            head_outputs.append(head(x, adj))
        hidden = torch.cat(head_outputs, dim=1)  # concatenate the heads

        hidden = F.dropout(hidden, self.dropout, training=self.training)
        scores = F.elu(self.out_att(hidden, adj))
        return F.log_softmax(scores, dim=1)
1)聚合相邻节点:
import torch
from torch.nn import functional as F
# Toy example: 3 nodes with 5 projected features each.
Wh = torch.randn(3, 5)
# Raw (unnormalised) attention scores for every node pair.
A = torch.randn(3, 3)
# Adjacency matrix: node 0 is linked to nodes 1 and 2.
adj = torch.tensor([
    [0, 1, 1],
    [1, 0, 0],
    [1, 0, 0],
])
# Pairs without an edge receive a huge negative score, which softmax maps
# to a weight of (numerically) zero.
zero_vec = torch.full_like(A, -9e15)
attention = F.softmax(torch.where(adj > 0, A, zero_vec), dim=1)
# h_prime has shape (3, 5): the aggregated features of every node.
h_prime = attention @ Wh
print(h_prime)
# Example output (inputs are random, so the values differ between runs):
# tensor([[ 0.3225, -0.8215, -0.0458, -0.0458, -0.4536],
#         [-0.6674,  1.2871,  0.5735,  1.6474,  1.5386],
#         [-0.6674,  1.2871,  0.5735,  1.6474,  1.5386]])
2)节点特征向量的标准化:
对于单个节点的标准化就是 $\frac{\overrightarrow{h_1}}{sum(\overrightarrow{h_1})}$,整体则可以用矩阵运算的方式完成标准化。
import torch
import numpy as np
from torch.nn import functional as F
H = torch.ones(3, 5)
# Per-node feature sums, moved to NumPy.
rowsum = H.sum(dim=1).numpy()
# Reciprocal of every row sum.
r_inv = np.power(rowsum, -1).flatten()
# A zero row sum yields inf; use 0 instead so such rows stay all-zero.
r_inv[np.isinf(r_inv)] = 0.
# Put the scaling factors on a diagonal ...
r_mat_inv = np.diag(r_inv)
# ... and left-multiply: every row of H is divided by its own sum.
H = np.dot(r_mat_inv, H)
print(H)
# [[0.2 0.2 0.2 0.2 0.2]
#  [0.2 0.2 0.2 0.2 0.2]
#  [0.2 0.2 0.2 0.2 0.2]]
3)邻接矩阵的标准化:
import torch
import numpy as np
from torch.nn import functional as F
A = np.ones((4, 4))
# Row sums of the adjacency matrix.
rowsum = A.sum(axis=1)
# rowsum ** -0.5, with the inf produced by empty rows replaced by 0.
r_inv_sqrt = np.power(rowsum, -0.5).flatten()
r_inv_sqrt[np.isinf(r_inv_sqrt)] = 0.
r_mat_inv_sqrt = np.diag(r_inv_sqrt)
# Scale the columns, transpose, scale the columns again:
# D^-1/2 * A^T * D^-1/2.
A = (A @ r_mat_inv_sqrt).T @ r_mat_inv_sqrt
print(A)
# [[0.25 0.25 0.25 0.25]
#  [0.25 0.25 0.25 0.25]
#  [0.25 0.25 0.25 0.25]
#  [0.25 0.25 0.25 0.25]]
def load_data(path="./data/cora/", dataset="cora"):
    """Load a citation-network dataset and return dense tensors.

    Reads ``<path><dataset>.content`` (first column: node id, last column:
    class label, columns in between: features) and ``<path><dataset>.cites``
    (one pair of node ids per row).

    Returns:
        Tuple ``(adj, features, labels, idx_train, idx_val, idx_test)``:
        the normalised dense adjacency matrix (with self-loops added), the
        row-normalised dense feature matrix, integer class labels, and three
        hard-coded index tensors (0-139, 200-499, 500-1499).
    """
    print('Loading {} dataset...'.format(dataset))

    content_file = "{}{}.content".format(path, dataset)
    cites_file = "{}{}.cites".format(path, dataset)

    # Everything is read as strings first; columns are converted below.
    raw_table = np.genfromtxt(content_file, dtype=np.dtype(str))
    features = sp.csr_matrix(raw_table[:, 1:-1], dtype=np.float32)
    # One-hot encode the class names found in the last column.
    labels = pd.get_dummies(raw_table[:, -1]).values

    # Map every node id to its row position in the content file.
    idx = np.array(raw_table[:, 0], dtype=np.int32)
    idx_map = dict(zip(idx, range(len(idx))))

    # Edge list expressed in node ids, then translated to row positions.
    edges_unordered = np.genfromtxt(cites_file, dtype=np.int32)
    translated = [idx_map.get(node_id) for node_id in edges_unordered.flatten()]
    edges = np.array(translated, dtype=np.int32).reshape(edges_unordered.shape)

    n_nodes = labels.shape[0]
    edge_weights = np.ones(edges.shape[0])
    adj = sp.coo_matrix((edge_weights, (edges[:, 0], edges[:, 1])),
                        shape=(n_nodes, n_nodes), dtype=np.float32)

    # Symmetrise: wherever the transpose holds the larger entry, take it.
    adj_t = adj.T
    take_transposed = adj_t > adj
    adj = adj + adj_t.multiply(take_transposed) - adj.multiply(take_transposed)

    features = normalize_features(features)
    adj = normalize_adj(adj + sp.eye(adj.shape[0]))  # add self-loops first

    adj = torch.FloatTensor(np.array(adj.todense()))
    features = torch.FloatTensor(np.array(features.todense()))
    # Back from one-hot rows to integer class indices.
    labels = torch.LongTensor(np.where(labels)[1])

    idx_train = torch.LongTensor(range(140))
    idx_val = torch.LongTensor(range(200, 500))
    idx_test = torch.LongTensor(range(500, 1500))

    return adj, features, labels, idx_train, idx_val, idx_test
def normalize_adj(mx):
    """Return ``D^-1/2 * mx^T * D^-1/2`` for a sparse matrix ``mx``.

    ``D`` is the diagonal matrix of the row sums of ``mx``. Rows whose sum is
    zero get a scaling factor of 0 instead of infinity.
    """
    degrees = np.array(mx.sum(1))
    inv_sqrt_deg = np.power(degrees, -0.5).flatten()
    # Zero row sums produce inf; neutralise them.
    inv_sqrt_deg[np.isinf(inv_sqrt_deg)] = 0.
    scaling = sp.diags(inv_sqrt_deg)
    column_scaled = mx.dot(scaling)
    return column_scaled.transpose().dot(scaling)
def normalize_features(mx):
    """Row-normalize a sparse matrix so that every non-empty row sums to 1.

    Rows whose sum is zero are left as all-zero rows.
    """
    row_totals = np.array(mx.sum(1))
    inv_totals = np.power(row_totals, -1).flatten()
    # 1 / 0 gives inf; use 0 so empty rows stay empty.
    inv_totals[np.isinf(inv_totals)] = 0.
    scaling = sp.diags(inv_totals)
    return scaling.dot(mx)
# Load the dataset with the default arguments and print, one per line, the
# shapes of the adjacency, feature and label tensors.
adj, features, labels, idx_train, idx_val, idx_test = load_data()
print(adj.shape, features.shape, labels.shape, sep="\n")
1)参数:
# Hyper-parameters used by the model construction and training code below.
hidden = 8  # hidden size handed to the GAT constructor
dropout = 0.6  # dropout probability handed to the GAT constructor
nb_heads = 8  # number of attention heads handed to the GAT constructor
alpha = 0.2  # LeakyReLU negative slope used inside the attention layers
lr = 0.005  # learning rate of the Adam optimizer
weight_decay = 5e-4  # weight_decay argument of the Adam optimizer
epochs = 10000  # upper bound on the number of training epochs
patience = 100  # early stopping: epochs without a new best validation loss
cuda = torch.cuda.is_available()  # move model and data to the GPU when one is available
2)实例化模型和优化器:
# Instantiate the model. The keyword names have to match the parameters of
# GAT.__init__ (n_feat, n_hid, n_class, dropout, alpha, n_heads); the former
# spellings nfeat/nhid/nclass/nheads raised a TypeError.
model = GAT(n_feat=features.shape[1],
            n_hid=hidden,
            n_class=int(labels.max()) + 1,  # labels are 0-based class indices
            dropout=dropout,
            alpha=alpha,
            n_heads=nb_heads)
# Optimizer (referenced through the already imported ``torch`` package).
optimizer = torch.optim.Adam(model.parameters(),
                             lr=lr,
                             weight_decay=weight_decay)
if cuda:
    model.cuda()
    features = features.cuda()
    adj = adj.cuda()
    labels = labels.cuda()
    idx_train = idx_train.cuda()
    idx_val = idx_val.cuda()
    idx_test = idx_test.cuda()
# The tensors are used directly: wrapping them in torch.autograd.Variable has
# been a no-op since PyTorch 0.4, and Variable was never imported here.
3)训练:
def train(epoch):
    """Run one optimisation step, then evaluate on the validation nodes.

    Uses the module-level ``model``, ``optimizer``, ``features``, ``adj``,
    ``labels``, ``idx_train`` and ``idx_val``.

    Args:
        epoch: zero-based epoch index (printed as ``epoch + 1``).

    Returns:
        The validation loss of this epoch as a Python float.
    """
    t = time.time()

    # Training step on the training nodes.
    model.train()
    optimizer.zero_grad()
    output = model(features, adj)
    loss_train = F.nll_loss(output[idx_train], labels[idx_train])
    acc_train = accuracy(output[idx_train], labels[idx_train])
    loss_train.backward()
    optimizer.step()

    # Validation: dropout is disabled by eval(); no gradients are needed, so
    # the forward pass runs without building an autograd graph.
    model.eval()
    with torch.no_grad():
        output = model(features, adj)
        loss_val = F.nll_loss(output[idx_val], labels[idx_val])
        acc_val = accuracy(output[idx_val], labels[idx_val])

    print('Epoch: {:04d}'.format(epoch+1),
          'loss_train: {:.4f}'.format(loss_train.item()),
          'acc_train: {:.4f}'.format(acc_train.item()),
          'loss_val: {:.4f}'.format(loss_val.item()),
          'acc_val: {:.4f}'.format(acc_val.item()),
          'time: {:.4f}s'.format(time.time() - t))
    return loss_val.item()
def compute_test():
    """Evaluate the module-level ``model`` on the test nodes and print the result.

    Prints the negative log-likelihood loss and the accuracy computed on
    ``idx_test``. Returns nothing.
    """
    model.eval()
    # Pure inference: skip autograd bookkeeping.
    with torch.no_grad():
        output = model(features, adj)
        loss_test = F.nll_loss(output[idx_test], labels[idx_test])
        acc_test = accuracy(output[idx_test], labels[idx_test])
    print("Test set results:",
          "loss= {:.4f}".format(loss_test.item()),
          "accuracy= {:.4f}".format(acc_test.item()))
def accuracy(output, labels):
    """Return the fraction of rows whose highest-scoring column equals the label.

    Args:
        output: score tensor with one row per sample and one column per class.
        labels: tensor of class indices, one per sample.

    Returns:
        A zero-dimensional float64 tensor.
    """
    _, predicted = output.max(dim=1)
    hits = predicted.type_as(labels).eq(labels)
    return hits.double().sum() / len(labels)
import glob
import os
import time


def _remove_checkpoints(should_remove):
    """Delete '<epoch>.pkl' files in the working directory selected by ``should_remove``.

    ``should_remove`` receives the epoch number parsed from the file name.
    Files whose name is not a plain decimal number are not checkpoints
    written by this script and are left untouched.
    """
    for file in glob.glob('*.pkl'):
        stem, _ = os.path.splitext(file)
        if not (stem.isascii() and stem.isdigit()):
            continue
        if should_remove(int(stem)):
            os.remove(file)


t_total = time.time()
loss_values = []
bad_counter = 0
# Lowest validation loss seen so far; infinity so the first epoch always wins.
best = float('inf')
best_epoch = 0
for epoch in range(epochs):
    # Train for one epoch and record the validation loss.
    loss_values.append(train(epoch))
    # Save a checkpoint for this epoch.
    torch.save(model.state_dict(), '{}.pkl'.format(epoch))
    # Track the epoch with the lowest validation loss.
    if loss_values[-1] < best:
        best = loss_values[-1]
        best_epoch = epoch
        bad_counter = 0
    else:
        bad_counter += 1
    # Early stopping: `patience` epochs in a row without a new best loss.
    if bad_counter == patience:
        break
    # Drop checkpoints older than the current best one.
    _remove_checkpoints(lambda epoch_nb: epoch_nb < best_epoch)

# After training, drop the checkpoints written after the best epoch.
_remove_checkpoints(lambda epoch_nb: epoch_nb > best_epoch)
print("Optimization Finished!")
print("Total time elapsed: {:.4f}s".format(time.time() - t_total))
# Restore the best checkpoint and evaluate it on the test nodes.
print('Loading {}th epoch'.format(best_epoch))
model.load_state_dict(torch.load('{}.pkl'.format(best_epoch)))
compute_test()