本文在github上:https://github.com/Mrxiahelong/Unsupervised-Author-Disambiguation/实现
这里我们使用三张图结构 分别是paper-coauthor-paper,paper-cotitle-paper,paper-covenue-paper,也就是结点类型一种,边类型三种,我们的metapath类型为coauthor-covenue-coauthor-cotitle(与下文代码中的metapath_type一致).
def positive_sampler(path, window=2):
    """Build skip-gram positive pairs from one walk with a sliding window.

    For example, for path ``[0, 1, 2, 3, 4]`` and ``window=2`` this returns
    ``pos_u=[0,0,1,1,1,...]`` and ``pos_v=[1,2,0,2,3,...]``: each node is
    paired with every node at most ``window`` steps away on either side.

    Args:
        path: sequence of node ids (one metapath walk).
        window: context radius; defaults to 2, matching the module-level
            ``window`` setting the original code read as a global.

    Returns:
        (pos_u, pos_v): two parallel lists; (pos_u[i], pos_v[i]) is a
        center/context positive pair.
    """
    pos_u, pos_v = [], []
    # A walk with fewer than two nodes yields no pairs.
    if len(path) < 2:
        return pos_u, pos_v
    for i, center in enumerate(path):
        # Context = up to `window` nodes on each side of position i.
        context = list(path[max(i - window, 0):i]) + list(path[i + 1:i + window + 1])
        pos_u.extend([center] * len(context))
        pos_v.extend(context)
    return pos_u, pos_v
def get_negative_ratio(metapath):
    """Build the negative-sampling distribution over nodes (word2vec style).

    Each node's probability is proportional to its frequency across all
    walks raised to the 3/4 power, so frequent nodes are more likely to be
    drawn as negatives.

    Args:
        metapath: iterable of walks, each walk an iterable of node ids.

    Returns:
        1-D numpy array; entry i is the sampling probability of the node
        with the i-th smallest id.  NOTE(review): downstream code indexes
        this by node id, which is only correct if node ids are the
        contiguous range 0..N-1 — confirm against the graph construction.
    """
    node_frequency = {}
    for path in metapath:
        for node in path:
            node_frequency[node] = node_frequency.get(node, 0) + 1
    # Frequencies ordered by node id, smoothed with the 0.75 exponent.
    pow_frequency = np.array(
        [node_frequency[node] for node in sorted(node_frequency)]
    ) ** 0.75
    return pow_frequency / pow_frequency.sum()
def negative_sampler(path, ratio, nodes, negatives_size=5):
    """Draw negative samples according to the distribution from get_negative_ratio.

    Repeatedly samples from ``nodes`` with probabilities ``ratio`` and keeps
    only nodes that do not occur in ``path``, until ``negatives_size``
    negatives are collected.

    Args:
        path: the walk whose nodes must NOT appear among the negatives.
        ratio: per-node sampling probabilities (aligned with ``nodes``).
        nodes: candidate node ids.
        negatives_size: number of negatives to return (default 5, as in the
            original code, which hard-coded the literal 5 in two places).

    Returns:
        list of ``negatives_size`` node ids, none of which occur in ``path``.
        NOTE(review): ``replace=False`` only applies within one draw, so the
        final list can still contain duplicates across draws — confirm this
        is acceptable for the skip-gram loss.
    """
    negatives = []
    while len(negatives) < negatives_size:
        # Top up only the remaining slots on each round.
        candidates = np.random.choice(
            nodes, size=negatives_size - len(negatives), replace=False, p=ratio
        )
        negatives.extend(node for node in candidates if node not in path)
    return negatives
def create_node2node_dict(graph):
    """Build an adjacency dict from a DGL graph.

    Args:
        graph: a DGL graph (any object whose ``edges()`` returns a
        ``(src_tensor, dst_tensor)`` pair of tensors works).

    Returns:
        dict mapping each source node id (int) to the list of destination
        node ids (ints) reachable over one edge.  Nodes with no outgoing
        edges are absent from the dict.
    """
    src_dst = {}
    for src, dst in zip(graph.edges()[0], graph.edges()[1]):
        # .item() converts the 0-dim tensors to plain Python ints.
        src_dst.setdefault(src.item(), []).append(dst.item())
    return src_dst
window=2# sliding-window size used when extracting skip-gram pairs from each walk
metapaths=[]# accumulator for all generated metapath walks
num_walks=10# number of walks started from every node
walk_len=100# length of each walk
metapath_type=['coauthor','covenue','coauthor','cotitle']# per the paper, the AVAT relation cycle
edge_per_graph={}# per relation: dict mapping node id -> list of node ids reachable in that graph
edge_per_graph['coauthor']=create_node2node_dict(coauthor_graph)
edge_per_graph['cotitle']=create_node2node_dict(cotitle_graph)
edge_per_graph['covenue']=create_node2node_dict(covenue_graph)
weights_all_graph={'coauthor':weights_coauthor,'cotitle':weights_cotitle,'covenue':weights_covenue}# per-relation edge weights; NOTE(review): weights_* are defined outside this excerpt
def Is_isolate(node):
    """Return 1 if ``node`` has no outgoing edge in any relation graph of
    ``metapath_type`` (i.e. it is isolated), else 0."""
    reachable = any(node in edge_per_graph[rel] for rel in metapath_type)
    return 0 if reachable else 1
for walk in tqdm(range(num_walks)):
for cur_node in list(range(len(labels))):#对图里的每个结点循环一次
stop=0
path=[]
path.append(cur_node)
while len(path)
这儿得到的metapaths就是总的metapath,pos_us和pos_vs的元素一一对应作为正对,neg_vs作为负对,对应着skip-gram模型的ui,uc,uj,如下
#单纯的metapath2vec
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.nn import init
"""
u_embedding: Embedding for center word.
v_embedding: Embedding for neighbor words.
"""
class SkipGramModel(nn.Module):
    """Skip-gram with negative sampling (word2vec / metapath2vec objective).

    u_embeddings: embeddings used when a node is the center word.
    v_embeddings: embeddings used when a node is a context word.
    """

    def __init__(self, emb_size, emb_dimension):
        """
        Args:
            emb_size: number of nodes in the vocabulary.
            emb_dimension: embedding dimensionality.
        """
        super(SkipGramModel, self).__init__()
        self.emb_size = emb_size
        self.emb_dimension = emb_dimension
        self.u_embeddings = nn.Embedding(emb_size, emb_dimension)
        self.v_embeddings = nn.Embedding(emb_size, emb_dimension)
        # word2vec-style init: small uniform centers, zero contexts.
        initrange = 1.0 / self.emb_dimension
        init.uniform_(self.u_embeddings.weight.data, -initrange, initrange)
        init.constant_(self.v_embeddings.weight.data, 0)

    def forward(self, pos_u, pos_v, neg_v):
        """Negative-sampling loss.

        Args:
            pos_u: (B,) center node ids.
            pos_v: (B,) positive context node ids.
            neg_v: (B, K) negative node ids, K negatives per pair.

        Returns:
            scalar mean loss over the batch.
        """
        emb_u = self.u_embeddings(pos_u)        # (B, D)
        emb_v = self.v_embeddings(pos_v)        # (B, D)
        emb_neg_v = self.v_embeddings(neg_v)    # (B, K, D)

        score = torch.sum(torch.mul(emb_u, emb_v), dim=1)
        # Clamp for numerical stability before the log-sigmoid.
        score = torch.clamp(score, max=10, min=-10)
        score = -F.logsigmoid(score)

        # (B, K, 1) -> (B, K); squeeze(2) rather than squeeze() so a batch
        # of size 1 (or K == 1) does not lose its batch dimension.
        neg_score = torch.bmm(emb_neg_v, emb_u.unsqueeze(2)).squeeze(2)
        neg_score = torch.clamp(neg_score, max=10, min=-10)
        neg_score = -torch.sum(F.logsigmoid(-neg_score), dim=1)

        return torch.mean(score + neg_score)
# Train the plain metapath2vec skip-gram model on the sampled pairs.
skip_model = SkipGramModel(sum_papers, 64)
optimizer = torch.optim.Adam(skip_model.parameters(), lr=0.001)
# Fix: `losses` was appended to below but never initialized in the original.
losses = []
# Hoist the list->tensor conversions out of the loop; the data never changes.
pos_u_tensor = torch.tensor(pos_us)
pos_v_tensor = torch.tensor(pos_vs)
neg_v_tensor = torch.tensor(neg_vs)
for epoch in range(500):
    optimizer.zero_grad()
    loss = skip_model(pos_u_tensor, pos_v_tensor, neg_v_tensor)
    loss.backward()
    optimizer.step()
    losses.append(loss.item())
    if epoch % 100 == 0:
        # Print the scalar value, not the tensor repr.
        print('epoch {0} loss {1}'.format(epoch, loss.item()))
# Final per-node embeddings; detach() instead of the deprecated .data access.
embedding = skip_model.u_embeddings.weight.detach().cpu().numpy()
这儿embedding就是得到的每个结点的embedding,可以用来做下游任务