Alias method讲解
def preprocess_transition_probs(self):
'''
Preprocessing of transition probabilities for guiding the random walks.
'''
G = self.G
is_directed = self.is_directed
"""结点"""
alias_nodes = {
}
"""
G.nodes()返回一个结点列表
"""
for node in G.nodes():
"""
使用G.neighbors(node)得到node的邻接结点列表
得到unnormalized_probs列表,其中每个元素是边的权重
"""
unnormalized_probs = [G[node][nbr]['weight'] for nbr in sorted(G.neighbors(node))]
"""
sum权重求和,作为分母
normalized_probs得到归一化后的,到各个结点的概率
"""
norm_const = sum(unnormalized_probs)
normalized_probs = [float(u_prob)/norm_const for u_prob in unnormalized_probs]
"""
alias_setup输入概率,得到对应两组数,一个是别名,一个是概率。
alias_nodes[node]是一个字典,元素为'节点序号':'(node到邻接节点转移概率对应的别名列表(sorted的前后序号,不是节点序号), 到邻接节点的转移概率)' 的形式。
"""
alias_nodes[node] = alias_setup(normalized_probs)
"""边"""
alias_edges = {
}
triads = {
}
"""
get_alias_edge得到结点到结点的概率
alias_edges[edge]是一个字典,元素为'节点序号':'(node到邻接节点转移概率对应的别名列表(sorted的前后序号,不是节点序号), 到邻接节点的转移概率(受到p或q的影响))' 的形式。
"""
if is_directed:
"""
G.edges()返回列表元组,列表中是边的关系,如(1,2),表示结点1、2之间有一条边。
"""
for edge in G.edges():
alias_edges[edge] = self.get_alias_edge(edge[0], edge[1])
else:
for edge in G.edges():
alias_edges[edge] = self.get_alias_edge(edge[0], edge[1])
alias_edgfes[(edge[1], edge[0])] = self.get_alias_edge(edge[1], edge[0])
self.alias_nodes = alias_nodes
self.alias_edges = alias_edges
return
def simulate_walks(self, num_walks, walk_length):
'''
Repeatedly simulate random walks from each node.
'''
"""
对每个结点,根据num_walks得出多条随机游走路径。
num_walks表示路径的条数
walk_length表示每条路径的长度
"""
G = self.G
walks = []
nodes = list(G.nodes())
print 'Walk iteration:'
for walk_iter in range(num_walks):
print str(walk_iter+1), '/', str(num_walks)
random.shuffle(nodes)
for node in nodes:
walks.append(self.node2vec_walk(walk_length=walk_length, start_node=node))
return walks
def node2vec_walk(self, walk_length, start_node):
'''
Simulate a random walk starting from start node.
从初始结点计算一条随机游走路径
walk_length表示随机游走序列长度
'''
G = self.G
alias_nodes = self.alias_nodes
alias_edges = self.alias_edges
walk = [start_node]
while len(walk) < walk_length:
cur = walk[-1]
'''求当前结点的邻居结点'''
cur_nbrs = sorted(G.neighbors(cur))
'''存在邻居结点'''
if len(cur_nbrs) > 0:
'''如果序列中只有一个结点'''
if len(walk) == 1:
"""
结合cur_nbrs = sorted(G.neighbor(cur)) h和alias_nodes/alias_edges的序号,才能确定节点的ID。所以路径上的每个节点在确定下一个节点是哪个的时候,都要经过sorted(G.neighbors(cur))这一步。
alias_draw使用alias采样从一个非均匀离散分布中采样
"""
walk.append(cur_nbrs[alias_draw(alias_nodes[cur][0], alias_nodes[cur][1])])
else:'''如果序列中不只一个结点'''
prev = walk[-2]'''找到当前游走的结点的前一个结点'''
next = cur_nbrs[alias_draw(alias_edges[(prev, cur)][0],
alias_edges[(prev, cur)][1])]
walk.append(next)
else:
break
return walk