node2vec代码详解

初衷

为了训练向量表示,选择这个模型,然而,实在太吃内存了。就想着怎么优化一下,就把源码过了一遍。里面多余得代码是自己为了监控打的日志,其他没有做任何修改。
具体如下:

# coding=utf-8
import numpy as np
import networkx as nx
import random
import logging

logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO,
                    filename='node2vec.log',
                    filemode='a+')


class Graph():
    def __init__(self, nx_G, is_directed, p, q):
        self.G = nx_G
        self.is_directed = is_directed
        self.p = p
        self.q = q

    def node2vec_walk(self, walk_length, start_node):
        """
        Simulate a random walk starting from start node.
         从一个初始结点计算一个随机游走
        :param walk_length: 随机游走序列长度
        :param start_node: 初始结点
        :return: 列表,随机游走序列
        """
        G = self.G
        alias_nodes = self.alias_nodes
        alias_edges = self.alias_edges

        walk = [start_node]
        logging.info(str(start_node) + "random walk start...")
        while len(walk) < walk_length:
            cur = walk[-1]
            # 求当前结点的邻居结点
            cur_nbrs = sorted(G.neighbors(cur))
            # 如果存在邻居结点
            if len(cur_nbrs) > 0:
                # 如果序列中仅有一个结点,即第一次游走
                if len(walk) == 1:
                    """
                    结合cur_nbrs = sorted(G.neighbor(cur)) 和 alias_nodes/alias_edges的序号,
                    才能确定节点的ID。
                    所以路径上的每个节点在确定下一个节点是哪个的时候,都要经过sorted(G.neighbors(cur))这一步。
                    """
                    walk.append(cur_nbrs[alias_draw(alias_nodes[cur][0], alias_nodes[cur][1])])
                # 如果序列中有多个结点
                else:
                    # 找到当前游走结点的前一个结点
                    prev = walk[-2]
                    next = cur_nbrs[alias_draw(alias_edges[(prev, cur)][0],
                                               alias_edges[(prev, cur)][1])]
                    walk.append(next)
            else:
                break
        logging.info(str(start_node) + "random walk end...")
        return walk

    def simulate_walks(self, num_walks, walk_length):
        """
        Repeatedly simulate random walks from each node.
        对每个结点,根据num_walks得出其多条随机游走路径
        """
        logging.info("Repeatedly simulate random walks from each node...")
        G = self.G
        walks = []
        logging.info("all nodes to list")
        nodes = list(G.nodes())
        logging.info('Walk iteration:')
        for walk_iter in range(num_walks):
            logging.info(str(walk_iter + 1), '/', str(num_walks))
            random.shuffle(nodes)
            for node in nodes:
                walks.append(self.node2vec_walk(walk_length=walk_length, start_node=node))
        logging.info("Walk iteration end")
        return walks

    def get_alias_edge(self, src, dst):
        """
        Get the alias edge setup lists for a given edge.
        :param src:  随机游走序列种的上一个结点
        :param dst:  当前结点
        :return:
        """
        G = self.G
        p = self.p
        q = self.q

        unnormalized_probs = []
        # 这里可以进行优化,默认是选取所有的邻居结点
        # TODO:可以设置于一个阈值?
        # 三种情况
        for dst_nbr in sorted(G.neighbors(dst)):
            # 返回源结点
            if dst_nbr == src:
                unnormalized_probs.append(G[dst][dst_nbr]['weight'] / p)
            # 源结点和这个目标结点的邻居结点之间有直连边
            elif G.has_edge(dst_nbr, src):
                unnormalized_probs.append(G[dst][dst_nbr]['weight'])
            # 没有直连边
            else:
                unnormalized_probs.append(G[dst][dst_nbr]['weight'] / q)
        norm_const = sum(unnormalized_probs)
        # 概率归一化
        normalized_probs = [float(u_prob) / norm_const for u_prob in unnormalized_probs]
        # 第一个返回值是Alias列表,第二个返回值是转移概率列表
        return alias_setup(normalized_probs)

    def preprocess_transition_probs(self):
        """
        Preprocessing of transition probabilities for guiding the random walks.
        """
        logging.info("Start Preprocessing of transition probabilities for guiding the random walks.")
        G = self.G
        is_directed = self.is_directed

        # 存储每个结点对应的两个采样列表
        alias_nodes = {}
        i = 0
        logging.info("nodes build start...")
        #G.nodes()返回一个结点列表
        for node in G.nodes():
            i = i + 1
            if i % 100000 == 0:
                logging.info(str(i) + " nodes have been build")
            # 得到当前结点的邻居结点(有直连关系)的权值列表,[1,1,1,1...]
            unnormalized_probs = [G[node][nbr]['weight'] for nbr in sorted(G.neighbors(node))]
            # 权重求和
            norm_const = sum(unnormalized_probs)
            # 求每个权重的占的比重,权重大的占的比重就大
            normalized_probs = [float(u_prob) / norm_const for u_prob in unnormalized_probs]
            alias_nodes[node] = alias_setup(normalized_probs)
        logging.info("nodes build end...")
        alias_edges = {}
        triads = {}
        logging.info("edges build start...")
        if is_directed:
            for edge in G.edges():
                alias_edges[edge] = self.get_alias_edge(edge[0], edge[1])
        else:
            j = 0
            # G.edges()返回一个列表元组,列表里面是边关系,形如[(1,2), (1,3), ...]
            # (1,2)代表结点1和结点2之间有一条边
            for edge in G.edges():
                j = j + 1
                if j % 100000 == 0:
                    logging.info(str(j) + " alias_edges have been build")
                # 先构建(1,2),再构建(2,1)
                # 这里复杂度较高,需要优化
                alias_edges[edge] = self.get_alias_edge(edge[0], edge[1])
                alias_edges[(edge[1], edge[0])] = self.get_alias_edge(edge[1], edge[0])
        logging.info("edges build end...")
        # alias_nodes形式为{1:(J, q), 2:(J,q)...},1和2代表结点id
        # alias_edges形式为{(1,2):(J,q), (2,1):(J,q),(1,3):(J,q)...} (1,2)代表一条边
        self.alias_nodes = alias_nodes
        self.alias_edges = alias_edges
        logging.info("End --- Preprocessing of transition probabilities for guiding the random walks.")
        return


def alias_setup(probs):
    """
    Compute utility lists for non-uniform sampling from discrete distributions.
    Refer to https://hips.seas.harvard.edu/blog/2013/03/03/the-alias-method-efficient-sampling-with-many-discrete-outcomes/
    for details
    alias_setup的作用是根据二阶random walk输出的概率变成每个节点对应两个数,被后面的alias_draw函数所进行抽样
    :param probs: 结点之间权重所占比例向量,是一个列表
    :return: 输入概率,得到对应的两个列表,
             一个是在原始的prob数组[0.4,0.8,0.6,1],
             另外就是在上面补充的Alias数组,其值代表填充的那一列的序号索引
             具体的可以参见博客 https://blog.csdn.net/haolexiao/article/details/65157026
             方便后面的抽样调用
    """
    # J和q数组和probs数组大小一致
    # probs长度由当前结点的邻居节点数量决定
    K = len(probs)
    q = np.zeros(K)
    J = np.zeros(K, dtype=np.int)
    # 将数据分类为具有概率的结果 大于或者小于1 / K.
    # 这两个列表里存放的是结点的下标
    smaller = []
    larger = []
    for kk, prob in enumerate(probs):
        q[kk] = K * prob
        if q[kk] < 1.0:
            smaller.append(kk)
        else:
            larger.append(kk)
    # 然后循环并创建少量二元混合分布
    # 在整个均匀混合分布中适当地分配更大的结果。
    # 假如每条边权重都为1,实际上这里的while循环不会执行,因为每条边概率都是一样的,相当于不需要采样
    while len(smaller) > 0 and len(larger) > 0:
        small = smaller.pop()   # smaller自己也会减少最右边的值
        large = larger.pop()
        # 在代码中的实现思路: 构建方法: 
        # 1.找出其中面积小于等于1的列,如i列,这些列说明其一定要被别的事件矩形填上,所以在Prab[i]中填上其面积 
        # 2.然后从面积大于1的列中,选出一个,比如j列,用它将第i列填满,然后Alias[i] = j,第j列面积减去填充用掉的面积。
        J[small] = large
        q[large] = q[large] + q[small] - 1.0
        if q[large] < 1.0:
            smaller.append(large)
        else:
            larger.append(large)
    return J, q


def alias_draw(J, q):
    """
    Draw sample from a non-uniform discrete distribution using alias sampling.
    抽样函数
    使用alias采样从一个非均匀离散分布中采样
    :param J:
    :param q:
    :return:
    """
    K = len(J)

    # 从整体均匀混合分布中采样
    kk = int(np.floor(np.random.rand() * K))
    # 从二元混合中采样,要么保留较小的,要么选择更大的
    if np.random.rand() < q[kk]:
        return kk
    else:
        return J[kk]

结论

可以发现,这个模型在运行过程中,保存了两个相当大得字典。我们都知道,python里面数据类型都是对象,是非常吃内存的。所以GitHub上有人做测评,400W+结点,8000W+边,内存消耗750G!!
能想到得解决办法就是把这两个字典存到数据库中,这样势必降低了训练速度。
但是可以考虑,在得到随机游走序列以后,对于word2vec部分不采用框架中得,而是使用GPU进行加速训练,这样又能弥补一段时间。
如果实在不行,可以考虑换DeepWalk。(本人就在纠结中…
以上。

你可能感兴趣的:(Graph-Embedding,Graph,Embedding,node2vec)