本文将以空手道俱乐部(Zachary's karate club)的例子进行讲解,数据集介绍可以参见 Zachary_karate_club。
DGL框架依赖于MXNet、PyTorch、TensorFlow等后端框架,本文将以PyTorch框架来讲解。
## DGL框架的使用
import dgl
def build_club():
    """Build the Zachary karate club graph as a DGL graph.

    The club has 34 members (nodes 0-33) and 78 friendships. Each
    friendship is inserted twice, once per direction, so the returned
    graph reports 156 edges.

    Returns:
        The graph created by ``dgl.DGLGraph()``, holding 34 nodes and
        156 directed edges.
    """
    g = dgl.DGLGraph()
    # Add the 34 nodes (club members) to the graph.
    g.add_nodes(34)
    # The 78 friendships, each listed once as a (src, dst) pair.
    edge_list = [
        (1, 0), (2, 0), (2, 1), (3, 0), (3, 1), (3, 2),
        (4, 0), (5, 0), (6, 0), (6, 4), (6, 5), (7, 0), (7, 1),
        (7, 2), (7, 3), (8, 0), (8, 2), (9, 2), (10, 0), (10, 4),
        (10, 5), (11, 0), (12, 0), (12, 3), (13, 0), (13, 1), (13, 2),
        (13, 3), (16, 5), (16, 6), (17, 0), (17, 1), (19, 0), (19, 1),
        (21, 0), (21, 1), (25, 23), (25, 24), (27, 2), (27, 23),
        (27, 24), (28, 2), (29, 23), (29, 26), (30, 1), (30, 8),
        (31, 0), (31, 24), (31, 25), (31, 28), (32, 2), (32, 8),
        (32, 14), (32, 15), (32, 18), (32, 20), (32, 22), (32, 23),
        (32, 29), (32, 30), (32, 31), (33, 8), (33, 9), (33, 13),
        (33, 14), (33, 15), (33, 18), (33, 19), (33, 20), (33, 22),
        (33, 23), (33, 26), (33, 27), (33, 28), (33, 29), (33, 30),
        (33, 31), (33, 32),
    ]
    # zip(*pairs) transposes a list of pairs into parallel tuples, e.g.
    # list(zip(*[('one', 1), ('two', 2)])) == [('one', 'two'), (1, 2)].
    # Here it splits edge_list into all sources and all destinations.
    src, dst = zip(*edge_list)
    g.add_edges(src, dst)
    # Add every edge again in the opposite direction so that the graph
    # behaves as an undirected one.
    g.add_edges(dst, src)
    return g
# Build the club graph once; later snippets reuse the module-level G.
G = build_club()
print(G.number_of_nodes()) # number of nodes
print(G.number_of_edges()) # number of edges
# Expected output:
# 34
# 156
# Visualize the graph with networkx.
import networkx as nx
# Convert the DGL graph to networkx and make it undirected.
nx_G = G.to_networkx().to_undirected()
# Lay out the whole graph with the Kamada-Kawai layout.
pos = nx.kamada_kawai_layout(nx_G)
nx.draw(nx_G, pos, with_labels=True, node_color=[[.7, .7, .7]])
# Attach features to nodes (features can be attached to edges as well).
# Give every node a one-hot encoding: row i of the 34x34 identity matrix.
import torch
G.ndata['feat'] = torch.eye(34)
print(G.nodes[2].data['feat'])
print(G.nodes[1].data['feat'])
# Expected output:
#tensor([[0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
# 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.]])
#tensor([[0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
# 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.]])
GCN的原理可以参见数据挖掘博客中拉普拉斯矩阵相关内容,主要分为以下两步:
# 定义GCN网络
import torch.nn as nn
import torch.nn.functional as F
def message_fun(edges):
    """Message function: send each edge's source-node feature along it.

    Args:
        edges: Batch of edges handed over by the graph library;
            ``edges.src['h']`` holds the 'h' feature of the source node
            of every edge in the batch.

    Returns:
        dict: ``{'m': ...}``, the source features stored under the
        message name 'm'.
    """
    return {'m': edges.src['h']}
def reduce_fun(nodes):
    """Reduce function: sum the messages every node received.

    Each node collects the messages sent by its neighbours through its
    mailbox; this function adds them up.

    Args:
        nodes: Batch of nodes handed over by the graph library;
            ``nodes.mailbox['m']`` holds the received 'm' messages.
            Dimension 1 is presumably the per-node message axis
            (DGL mailbox layout) -- confirm against the DGL docs.

    Returns:
        dict: ``{'h': ...}``, the mailbox summed over dimension 1.
    """
    return {'h': torch.sum(nodes.mailbox['m'], dim=1)}
class GCNlayer(nn.Module):
    """One GCN layer: a linear transform followed by message passing.

    This version aggregates with a plain sum over the neighbours, i.e.
    it applies no degree normalization.
    """

    def __init__(self, in_feats, out_feats):
        """Create the layer.

        Args:
            in_feats: Number of input features per node.
            out_feats: Number of output features per node.
        """
        super().__init__()
        # Linear map from in_feats input features to out_feats features.
        self.linear = nn.Linear(in_feats, out_feats)

    def forward(self, g, inputs):
        """Transform the node features and sum them over the neighbours.

        Args:
            g: Graph to pass messages on; its ``ndata['h']`` field is
                used as scratch space and removed again before
                returning.
            inputs: Node feature tensor fed to the linear layer.

        Returns:
            The value stored in ``g.ndata['h']`` after message passing.
        """
        h = self.linear(inputs)
        g.ndata['h'] = h
        # Send messages, collect them in the mailboxes, update 'h'.
        g.update_all(message_fun, reduce_fun)
        return g.ndata.pop('h')
class GCN(nn.Module):
    """Two-layer GCN: GCNlayer -> ReLU -> GCNlayer."""

    def __init__(self, in_feats, hidden_size, number_class):
        """Create the two layers.

        Args:
            in_feats: Number of input features per node.
            hidden_size: Number of features produced by the first layer.
            number_class: Number of outputs per node produced by the
                second layer (one per class).
        """
        super().__init__()
        self.gcn1 = GCNlayer(in_feats, hidden_size)
        self.gcn2 = GCNlayer(hidden_size, number_class)

    def forward(self, g, inputs):
        """Run both layers on graph ``g``.

        Args:
            g: Graph handed to both layers for message passing.
            inputs: Node feature tensor fed to the first layer.

        Returns:
            Output of the second layer; no activation is applied to it.
        """
        h = self.gcn1(g, inputs)
        h = torch.relu(h)
        h = self.gcn2(g, h)
        return h
# Layer 1 maps the 34 input features to 5 hidden features; layer 2 maps
# those 5 to 2 outputs, because the karate club splits into two classes.
net = GCN(in_feats=34, hidden_size=5, number_class=2)
关于UDF(用户自定义函数)中的Message与Reduce两个函数,可以参考官方API文档 Message Passing Tutorial。
# Build the inputs: row i of the 34x34 identity matrix is the one-hot
# feature of node i.
inputs = torch.eye(34)
# Only the instructor (node 0) and the president (node 33) are labeled.
labeled_nodes = torch.tensor([0, 33])
labels = torch.tensor([0, 1])  # their labels are different

# Train with PyTorch:
# (1) create an optimizer, (2) feed the inputs to the model,
# (3) calculate the loss and (4) use autograd to optimize the model.
optimizer = torch.optim.Adam(net.parameters(), lr=0.01)
all_logits = []
for epoch in range(30):
    logits = net(G, inputs)
    # Save the logits of every epoch for visualization later; detach()
    # keeps the stored copies out of the autograd graph.
    all_logits.append(logits.detach())
    logp = F.log_softmax(logits, 1)
    # Compute the loss on the labeled nodes only.
    loss = F.nll_loss(logp[labeled_nodes], labels)

    optimizer.zero_grad()
    loss.backward()
    optimizer.step()

    print('Epoch %d | Loss: %.4f' % (epoch, loss.item()))
The GCN model we have implemented above uses the unnormalized adjacency matrix $A$. In this exercise, you will implement the normalization. Based on the equation
$$Y = D^{-\frac{1}{2}} A D^{-\frac{1}{2}} X W,$$
the message and reduce functions will be rewritten as follows:
$$\text{message phase: } m_{j\rightarrow i} = \frac{1}{\sqrt{d_j}} h_j$$
$$\text{reduce phase: } \tilde{h}_i = \frac{1}{\sqrt{d_i}} \sum_{j\in\mathcal{N}(i)} m_{j\rightarrow i}$$
Now your task here is to implement this normalization.
Hint 1: you can use G.in_degrees(G.nodes())
to get a 1-D tensor containing the degrees of all the nodes.
Hint 2: you can try UDF message function to perform multiplication between source node feature and edge feature.
# Exercise
# 定义GCN网络
import torch.nn as nn
import torch.nn.functional as F
def message_fun(edges):
    """Message function of the normalized GCN.

    Implements the message phase ``m_{j->i} = h_j / sqrt(d_j)``: the
    source feature 'h' is scaled by the source node's 'norm' field,
    which ``GCNlayer.forward`` sets to ``1 / sqrt(d_j)``.

    Args:
        edges: Batch of edges; ``edges.src['h']`` and
            ``edges.src['norm']`` hold the source node's feature and
            normalization factor for every edge in the batch.

    Returns:
        dict: ``{'m': ...}``, the scaled source features stored under
        the message name 'm'.
    """
    # 'norm' already is 1/sqrt(d_j), so it must be multiplied in;
    # dividing by it would scale the message by sqrt(d_j) instead.
    return {'m': edges.src['h'] * edges.src['norm']}


def reduce_fun(nodes):
    """Reduce function: sum the messages every node received.

    Each node collects the messages sent by its neighbours through its
    mailbox; the ``1 / sqrt(d_i)`` factor of the reduce phase is applied
    afterwards in ``GCNlayer.forward``.

    Args:
        nodes: Batch of nodes; ``nodes.mailbox['m']`` holds the received
            'm' messages. Dimension 1 is presumably the per-node message
            axis (DGL mailbox layout) -- confirm against the DGL docs.

    Returns:
        dict: ``{'h': ...}``, the mailbox summed over dimension 1.
    """
    return {'h': torch.sum(nodes.mailbox['m'], dim=1)}


class GCNlayer(nn.Module):
    """One GCN layer using the symmetrically normalized adjacency matrix.

    Computes ``Y = D^{-1/2} A D^{-1/2} X W`` (plus the bias of the
    linear layer) through message passing.
    """

    def __init__(self, in_feats, out_feats):
        """Create the layer.

        Args:
            in_feats: Number of input features per node.
            out_feats: Number of output features per node.
        """
        super().__init__()
        # Linear map from in_feats input features to out_feats features.
        self.linear = nn.Linear(in_feats, out_feats)

    def forward(self, g, inputs):
        """Transform the node features and aggregate them with normalization.

        Args:
            g: Graph to pass messages on. Its ``ndata['norm']`` field is
                (over)written with ``1 / sqrt(in-degree)``; ``ndata['h']``
                is used as scratch space and removed before returning.
            inputs: Node feature tensor fed to the linear layer.

        Returns:
            The aggregated node features, scaled by each destination
            node's ``1 / sqrt(d_i)``.
        """
        # One vectorized call yields the in-degrees of all nodes. Clamp
        # to at least 1 so that a node without incoming edges does not
        # produce a division by zero (inf / NaN features).
        deg = g.in_degrees().float().clamp(min=1)
        norm = torch.pow(deg, -0.5).reshape(-1, 1)
        g.ndata['norm'] = norm
        g.ndata['h'] = self.linear(inputs)
        # Send normalized messages, collect them, update 'h'.
        g.update_all(message_fun, reduce_fun)
        # Apply the destination-side 1/sqrt(d_i) of the reduce phase and
        # return the scaled result (the product must not be discarded).
        return g.ndata.pop('h') * norm
从上面的代码我们应该注意的是:g.ndata['h']、g.ndata['norm'] 以及消息字段 'm'(通过 nodes.mailbox['m'] 访问)的键名都是我们自己定义的。