[paper]http://www.perozzi.net/publications/14_kdd_deepwalk.pdf
[code]https://github.com/phanein/deepwalk
abstract
DeepWalk uses local information obtained from truncated random walks to learn latent representations by treating walks as the equivalent of sentences.
In other words, DeepWalk takes the vertex sequences produced by truncated random walks, treats those sequences as the equivalent of sentences, and learns latent representations from them that capture local structural information.
DeepWalk’s representations can provide F1 scores up to 10% higher than competing methods when labeled data is sparse. In some experiments, DeepWalk’s representations are able to outperform all baseline methods while using 60% less training data.
When labels are very sparse, DeepWalk's representations improve Micro-F1 by up to 10% over competing methods; in some experiments DeepWalk beats all baselines even when given 60% less training data.
introduction
The paper's three main contributions:
• We introduce deep learning as a tool to analyze graphs, to build robust representations that are suitable for statistical modeling. DeepWalk learns structural regularities present within short random walks.
• We extensively evaluate our representations on multi-label classification tasks on several social networks. We show significantly increased classification performance in the presence of label sparsity, getting improvements 5%-10% of Micro F1, on the sparsest problems we consider. In some cases, DeepWalk’s representations can outperform its competitors even when given 60% less training data.
• We demonstrate the scalability of our algorithm by building representations of web-scale graphs (such as YouTube) using a parallel implementation. Moreover, we describe the minimal changes necessary to build a streaming version of our approach.
Problem definition
The paper defines the graph G = (V, E) and the partially labeled social network G_L = (V, E, X, Y), where X holds vertex attributes and Y the labels; the goal is to learn a low-dimensional representation X_E ∈ R^(|V| × d) of the vertices, independent of the label distribution, that can then be fed to any standard classifier.
Algorithm walkthrough
The code targets Python 2.x.
The repository is structured as follows:
__main__.py (mainly argument parsing and driving the whole pipeline)
#! /usr/bin/env python
# -*- coding: utf-8 -*-
import os
import sys
import random
from io import open
from argparse import ArgumentParser, FileType, ArgumentDefaultsHelpFormatter
from collections import Counter
from concurrent.futures import ProcessPoolExecutor
import logging
from . import graph
from . import walks as serialized_walks
from gensim.models import Word2Vec
from .skipgram import Skipgram
from six import text_type as unicode
from six import iteritems
from six.moves import range
import psutil
from multiprocessing import cpu_count
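# Pin this process to all available CPUs. psutil renamed set_cpu_affinity() to
# cpu_affinity() in later releases, so both spellings are tried; if neither exists
# (e.g. on platforms without CPU-affinity support) the step is silently skipped.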
p = psutil.Process(os.getpid())
try:
p.set_cpu_affinity(list(range(cpu_count())))
except AttributeError:
try:
p.cpu_affinity(list(range(cpu_count())))
except AttributeError:
pass
logger = logging.getLogger(__name__)
LOGFORMAT = "%(asctime).19s %(levelname)s %(filename)s: %(lineno)s %(message)s"
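# Replacement sys.excepthook (installed when --debug is passed): print the traceback
# of any uncaught exception and drop into the pdb post-mortem debugger, unless the
# interpreter is interactive or stderr is not attached to a terminal.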
def debug(type_, value, tb):
if hasattr(sys, 'ps1') or not sys.stderr.isatty():
sys.__excepthook__(type_, value, tb)
else:
import traceback
import pdb
traceback.print_exception(type_, value, tb)
print(u"\n")
pdb.pm()
def process(args):
if args.format == "adjlist":
G = graph.load_adjacencylist(args.input, undirected=args.undirected)
elif args.format == "edgelist":
G = graph.load_edgelist(args.input, undirected=args.undirected)
elif args.format == "mat":
G = graph.load_matfile(args.input, variable_name=args.matfile_variable_name, undirected=args.undirected)
else:
raise Exception("Unknown file format: '%s'. Valid formats: 'adjlist', 'edgelist', 'mat'" % args.format)
print("Number of nodes: {}".format(len(G.nodes())))
num_walks = len(G.nodes()) * args.number_walks
print("Number of walks: {}".format(num_walks))
data_size = num_walks * args.walk_length
print("Data size (walks*length): {}".format(data_size))
"""
这里的data_size不是实际的内存大小,只是节点使用总次数的计数。
在这个if-else代码段中,更常出现的情况是if,而else里的代码几乎从来不会出现,除非data_size超过了max_memory_data_size
max_memory_data_size默认的是10亿次。
"""
if data_size < args.max_memory_data_size:
print("Walking...")
"调用graph.py里面的build_deepwalk_corpus,构建随机游走的语料库"
walks = graph.build_deepwalk_corpus(G, num_paths=args.number_walks,
path_length=args.walk_length, alpha=0, rand=random.Random(args.seed))
print("Training...")
"直接调用word2vec,size是最后embedding结果的大小"
model = Word2Vec(walks, size=args.representation_size, window=args.window_size, min_count=0, sg=1, hs=1, workers=args.workers)
else:
print("Data size {} is larger than limit (max-memory-data-size: {}). Dumping walks to disk.".format(data_size, args.max_memory_data_size))
print("Walking...")
walks_filebase = args.output + ".walks"
walk_files = serialized_walks.write_walks_to_disk(G, walks_filebase, num_paths=args.number_walks,
path_length=args.walk_length, alpha=0, rand=random.Random(args.seed),
num_workers=args.workers)
print("Counting vertex frequency...")
if not args.vertex_freq_degree:
vertex_counts = serialized_walks.count_textfiles(walk_files, args.workers)
else:
# use degree distribution for frequency in tree
vertex_counts = G.degree(nodes=G.iterkeys())
print("Training...")
walks_corpus = serialized_walks.WalksCorpus(walk_files)
"调用包装过的skipgram函数"
model = Skipgram(sentences=walks_corpus, vocabulary_counts=vertex_counts,
size=args.representation_size,
window=args.window_size, min_count=0, trim_rule=None, workers=args.workers)
model.wv.save_word2vec_format(args.output)
def main():
"""
前面主要是对读取的参数进行处理
"""
parser = ArgumentParser("deepwalk",
formatter_class=ArgumentDefaultsHelpFormatter,
conflict_handler='resolve')
parser.add_argument("--debug", dest="debug", action='store_true', default=False,
help="drop a debugger if an exception is raised.")
parser.add_argument('--format', default='adjlist',
help='File format of input file')
parser.add_argument('--input', nargs='?', required=True,
help='Input graph file')
parser.add_argument("-l", "--log", dest="log", default="INFO",
help="log verbosity level")
parser.add_argument('--matfile-variable-name', default='network',
help='variable name of adjacency matrix inside a .mat file.')
parser.add_argument('--max-memory-data-size', default=1000000000, type=int,
help='Size to start dumping walks to disk, instead of keeping them in memory.')
parser.add_argument('--number-walks', default=10, type=int,
help='Number of random walks to start at each node')
parser.add_argument('--output', required=True,
help='Output representation file')
parser.add_argument('--representation-size', default=64, type=int,
help='Number of latent dimensions to learn for each node.')
parser.add_argument('--seed', default=0, type=int,
help='Seed for random walk generator.')
parser.add_argument('--undirected', default=True, type=bool,
help='Treat graph as undirected.')
parser.add_argument('--vertex-freq-degree', default=False, action='store_true',
help='Use vertex degree to estimate the frequency of nodes '
'in the random walks. This option is faster than '
'calculating the vocabulary.')
parser.add_argument('--walk-length', default=40, type=int,
help='Length of the random walk started at each node')
parser.add_argument('--window-size', default=5, type=int,
help='Window size of skipgram model.')
parser.add_argument('--workers', default=1, type=int,
help='Number of parallel processes.')
args = parser.parse_args()
numeric_level = getattr(logging, args.log.upper(), None)
logging.basicConfig(format=LOGFORMAT)
logger.setLevel(numeric_level)
if args.debug:
sys.excepthook = debug
process(args)
if __name__ == "__main__":
sys.exit(main())
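Putting the arguments together, a typical invocation looks like the following. This is only a sketch: it assumes the package has been installed (e.g. via the repository's setup.py) and uses the karate-club graph shipped in the repository's example_graphs directory; the output file name is illustrative and the option values simply restate the defaults parsed above.
deepwalk --format adjlist --input example_graphs/karate.adjlist \
         --number-walks 10 --walk-length 40 --window-size 5 \
         --representation-size 64 --workers 4 \
         --output karate.embeddings
The resulting karate.embeddings file is a plain word2vec-format text file with one 64-dimensional vector per node.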
graph.py
#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""Graph utilities."""
import logging
import sys
from io import open
from os import path
from time import time
from glob import glob
from six.moves import range, zip, zip_longest
from six import iterkeys
from collections import defaultdict, Iterable
import random
from random import shuffle
from itertools import product,permutations
from scipy.io import loadmat
from scipy.sparse import issparse
logger = logging.getLogger("deepwalk")
__author__ = "Bryan Perozzi"
__email__ = "[email protected]"
LOGFORMAT = "%(asctime).19s %(levelname)s %(filename)s: %(lineno)s %(message)s"
class Graph(defaultdict):
"""
defaultdict与dict用法几乎一样,只是比dict多了一些用法而已
我们更愿意使用defaultdict而不是dict主要原因是defaultdict比dict更不容易报错例如
>>>x_defaultdict=defaultdict(int)
>>>print(x_defaultdict["key1"])
程序输出:0
>>>x_dict=dict()
>>>print(x_dict["key1"])
程序输出:KeyError: 'key1'
对于dict类型,你可以为没有出现过的key进行赋值,但是不可以直接使用
例如可以做 x_dict["key1"]=1
但是不可以做 x_dict["key1"]+=1
"""
"""Efficient basic implementation of nx `Graph' – Undirected graphs with self loops"""
def __init__(self):
"""
构建的图是一个字典,key是节点,value是该点邻居。
继承与defaultdict的__init__(default_factory=None, **kwargs),这里的default_factory为list,即每一个节点对应的value为一个list。
"""
super(Graph, self).__init__(list)
def nodes(self):
"""
返回图中所有节点。
"""
return self.keys()
def adjacency_iter(self):
"""
返回节点和它的邻居。
iteritems() 用于返回本身字典列表操作后的迭代,不占用额外的内存。
"""
return self.iteritems()
def subgraph(self, nodes={}):
"""
返回指定nodes构成的子图。
这里self相当于字典,n in self可以理解为关键字是否在字典中
self[n]是找字典中关键字为n的对应的value,上面一行的代码意思是把nodes中的节点和对应的边添加到子图subgraph中
"""
subgraph = Graph()
for n in nodes:
if n in self:
subgraph[n] = [x for x in self[n] if x in nodes]
return subgraph
def make_undirected(self):
"""
将有向图变成无向图。
"""
t0 = time()
"""
对于每一对邻接的点,遍历当前节点v的list,若此时遍历到的other和v不相等,就往other的list中添加v
"""
for v in list(self):
for other in self[v]:
if v != other:
self[other].append(v)
t1 = time()
logger.info('make_directed: added missing edges {}s'.format(t1-t0))
"""
去重,排序,删除自循环
"""
self.make_consistent()
return self
def make_consistent(self):
t0 = time()
"""
去重,排序
"""
for k in iterkeys(self):
self[k] = list(sorted(set(self[k])))
t1 = time()
logger.info('make_consistent: made consistent in {}s'.format(t1-t0))
self.remove_self_loops()
return self
def remove_self_loops(self):
"""
删除自循环
"""
removed = 0
t0 = time()
for x in self:
if x in self[x]:
self[x].remove(x)
removed += 1
t1 = time()
logger.info('remove_self_loops: removed {} loops in {}s'.format(removed, (t1-t0)))
return self
def check_self_loops(self):
"""
检查是否存在自环。
true:存在
false:不存在
"""
for x in self:
for y in self[x]:
if x == y:
return True
return False
def has_edge(self, v1, v2):
"""
检查v1、v2之间是否存在边。
true:存在
false:不存在
"""
if v2 in self[v1] or v1 in self[v2]:
return True
return False
def degree(self, nodes=None):
"""
返回节点的度。
若是有向图,则返回的入度或是出度,由node的list构建方式决定。
:params nodes 可以是单个的节点,也可以是可迭代对象。
"""
if isinstance(nodes, Iterable):
return {v:len(self[v]) for v in nodes}
else:
return len(self[nodes])
def order(self):
"Returns the number of nodes in the graph"
return len(self)
def number_of_edges(self):
"Returns the number of nodes in the graph"
"注:这里是针对无向图"
return sum([self.degree(x) for x in self.keys()])/2
def number_of_nodes(self):
"Returns the number of nodes in the graph"
return self.order()
def random_walk(self, path_length, alpha=0, rand=random.Random(), start=None):
""" Returns a truncated random walk.
path_length: Length of the random walk.
alpha: probability of restarting at the start node (with probability alpha the next step jumps back to the walk's first node instead of a random neighbor).
start: the start node of the random walk.
"""
G = self
"""
指定起点start则path从start开始
若是没有指定,随机指定节点为起点
这里的start即为算法里的vi
"""
if start:
path = [start]
else:
# Sampling is uniform w.r.t V, and not w.r.t E
path = [rand.choice(list(G.keys()))]
"""
当path长度还没达到限制,继续游走
这里path_length即为算法里的t
"""
while len(path) < path_length:
cur = path[-1]
if len(G[cur]) > 0:
"""
若是当前生成的随机数大于等于alpha,则继续往下随机挑选节点走
否则,从起点重新游走
"""
if rand.random() >= alpha:
path.append(rand.choice(G[cur]))
else:
path.append(path[0])
else:
break
"返回随机游走序列"
return [str(node) for node in path]
# TODO add build_walks in here
def build_deepwalk_corpus(G, num_paths, path_length, alpha=0,
rand=random.Random(0)):
"构建deepwalk的语料库"
walks = []
"节点列表"
nodes = list(G.nodes())
"对每个节点进行num_paths次的随机游走"
for cnt in range(num_paths):
rand.shuffle(nodes)
for node in nodes:
walks.append(G.random_walk(path_length, rand=rand, alpha=alpha, start=node))
return walks
def build_deepwalk_corpus_iter(G, num_paths, path_length, alpha=0,
rand=random.Random(0)):
walks = []
nodes = list(G.nodes())
for cnt in range(num_paths):
rand.shuffle(nodes)
for node in nodes:
yield G.random_walk(path_length, rand=rand, alpha=alpha, start=node)
def clique(size):
"建一个大小为size的团 返回graph类型"
return from_adjlist(permutations(range(1,size+1)))
# http://stackoverflow.com/questions/312443/how-do-you-split-a-list-into-evenly-sized-chunks-in-python
def grouper(n, iterable, padvalue=None):
"grouper(3, 'abcdefg', 'x') --> ('a','b','c'), ('d','e','f'), ('g','x','x')"
return zip_longest(*[iter(iterable)]*n, fillvalue=padvalue)
def parse_adjacencylist(f):
adjlist = []
for l in f:
if l and l[0] != "#":
introw = [int(x) for x in l.strip().split()]
row = [introw[0]]
row.extend(set(sorted(introw[1:])))
adjlist.extend([row])
return adjlist
def parse_adjacencylist_unchecked(f):
adjlist = []
for l in f:
if l and l[0] != "#":
adjlist.extend([[int(x) for x in l.strip().split()]])
return adjlist
def load_adjacencylist(file_, undirected=False, chunksize=10000, unchecked=True):
if unchecked:
parse_func = parse_adjacencylist_unchecked
convert_func = from_adjlist_unchecked
else:
parse_func = parse_adjacencylist
convert_func = from_adjlist
adjlist = []
t0 = time()
total = 0
with open(file_) as f:
for idx, adj_chunk in enumerate(map(parse_func, grouper(int(chunksize), f))):
adjlist.extend(adj_chunk)
total += len(adj_chunk)
t1 = time()
logger.info('Parsed {} edges with {} chunks in {}s'.format(total, idx, t1-t0))
t0 = time()
G = convert_func(adjlist)
t1 = time()
logger.info('Converted edges to graph in {}s'.format(t1-t0))
if undirected:
t0 = time()
G = G.make_undirected()
t1 = time()
logger.info('Made graph undirected in {}s'.format(t1-t0))
return G
def load_edgelist(file_, undirected=True):
G = Graph()
with open(file_) as f:
for l in f:
x, y = l.strip().split()[:2]
x = int(x)
y = int(y)
G[x].append(y)
if undirected:
G[y].append(x)
G.make_consistent()
return G
def load_matfile(file_, variable_name="network", undirected=True):
mat_varables = loadmat(file_)
mat_matrix = mat_varables[variable_name]
return from_numpy(mat_matrix, undirected)
def from_networkx(G_input, undirected=True):
G = Graph()
for idx, x in enumerate(G_input.nodes()):
for y in iterkeys(G_input[x]):
G[x].append(y)
if undirected:
G.make_undirected()
return G
def from_numpy(x, undirected=True):
G = Graph()
if issparse(x):
cx = x.tocoo()
for i,j,v in zip(cx.row, cx.col, cx.data):
G[i].append(j)
else:
raise Exception("Dense matrices not yet supported.")
if undirected:
G.make_undirected()
G.make_consistent()
return G
def from_adjlist(adjlist):
G = Graph()
for row in adjlist:
node = row[0]
neighbors = row[1:]
G[node] = list(sorted(set(neighbors)))
return G
def from_adjlist_unchecked(adjlist):
G = Graph()
for row in adjlist:
node = row[0]
neighbors = row[1:]
G[node] = neighbors
return G
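To make the flow of graph.py concrete, here is a small self-contained sketch (not part of the repository; the file name toy.edgelist and the parameter values are only illustrative) that builds a toy graph from an edge list and generates walks with build_deepwalk_corpus:
# toy.edgelist contains one edge per line, e.g.:
# 1 2
# 1 3
# 2 3
# 3 4
import random
from deepwalk import graph

G = graph.load_edgelist("toy.edgelist", undirected=True)  # Graph: {1: [2, 3], 2: [1, 3], ...}
print(G.number_of_nodes(), G.number_of_edges())

# Two passes over the nodes, walks of length 5, no restarts (alpha=0)
walks = graph.build_deepwalk_corpus(G, num_paths=2, path_length=5,
                                    alpha=0, rand=random.Random(0))
print(walks[0])  # e.g. ['3', '4', '3', '2', '1'] -- node ids as strings
These string walks are exactly what __main__.py passes to Word2Vec as sentences.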
walks.py
import logging
from io import open
from os import path
from time import time
from multiprocessing import cpu_count
import random
from concurrent.futures import ProcessPoolExecutor
from collections import Counter
from six.moves import zip
from . import graph
logger = logging.getLogger("deepwalk")
__current_graph = None
# speed up the string encoding
__vertex2str = None
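# Module-level globals: write_walks_to_disk() stores the graph in __current_graph
# before the worker pool is created, so (on fork-based platforms) every worker
# process can read it without the graph being pickled and shipped for each task.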
def count_words(file):
""" Counts the word frequences in a list of sentences.
Note:
This is a helper function for parallel execution of `Vocabulary.from_text`
method.
统计每个节点的出现频次
"""
c = Counter()
with open(file, 'r') as f:
for l in f:
words = l.strip().split()
c.update(words)
return c
def count_textfiles(files, workers=1):
"""
多线程统计
"""
c = Counter()
"""
开启workers个线程处理器,为它们命名为executor
executor.map(called_function, input_array)
这句代码的意思是并行化的完成下面的功能:
result=[]
for item in input_array:
re=called_function(item)
result.append(re)
不同的是,这段代码不能并行计算,而executor.map(called_function, input_array)
可以实现并行计算,并且显然在代码上更加简洁
"""
with ProcessPoolExecutor(max_workers=workers) as executor:
for c_ in executor.map(count_words, files):
c.update(c_)
return c
def count_lines(f):
"统计有几个rand_walks"
if path.isfile(f):
num_lines = sum(1 for line in open(f))
return num_lines
else:
return 0
def _write_walks_to_disk(args):
num_paths, path_length, alpha, rand, f = args
G = __current_graph
t_0 = time()
with open(f, 'w') as fout:
for walk in graph.build_deepwalk_corpus_iter(G=G, num_paths=num_paths, path_length=path_length,
alpha=alpha, rand=rand):
fout.write(u"{}\n".format(u" ".join(v for v in walk)))
logger.debug("Generated new file {}, it took {} seconds".format(f, time() - t_0))
return f
def write_walks_to_disk(G, filebase, num_paths, path_length, alpha=0, rand=random.Random(0), num_workers=cpu_count(),
always_rebuild=True):
global __current_graph
__current_graph = G
files_list = ["{}.{}".format(filebase, str(x)) for x in list(range(num_paths))]
expected_size = len(G)
args_list = []
files = []
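# Decide how many walk passes each worker writes: with at least as many workers as
# passes, every worker handles a single pass; otherwise the passes are chunked into
# groups of int(num_paths / num_workers) + 1 and each worker gets one chunk (the last
# chunk may be smaller, since grouper() pads it with None values that are filtered out).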
if num_paths <= num_workers:
paths_per_worker = [1 for x in range(num_paths)]
else:
paths_per_worker = [len(list(filter(lambda z: z!= None, [y for y in x])))
for x in graph.grouper(int(num_paths / num_workers)+1, range(1, num_paths+1))]
with ProcessPoolExecutor(max_workers=num_workers) as executor:
for size, file_, ppw in zip(executor.map(count_lines, files_list), files_list, paths_per_worker):
if always_rebuild or size != (ppw*expected_size):
args_list.append((ppw, path_length, alpha, random.Random(rand.randint(0, 2**31)), file_))
else:
files.append(file_)
with ProcessPoolExecutor(max_workers=num_workers) as executor:
for file_ in executor.map(_write_walks_to_disk, args_list):
files.append(file_)
return files
class WalksCorpus(object):
def __init__(self, file_list):
self.file_list = file_list
def __iter__(self):
for file in self.file_list:
with open(file, 'r') as f:
for line in f:
yield line.split()
def combine_files_iter(file_list):
for file in file_list:
with open(file, 'r') as f:
for line in f:
yield line.split()
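The on-disk branch of process() in __main__.py ties these helpers together. A minimal sketch of that flow (standalone; the file names and parameter values are only illustrative, not code from the repository):
import random
from deepwalk import graph, walks as serialized_walks

G = graph.load_edgelist("toy.edgelist", undirected=True)

# One file per walk pass; each file holds one walk per start node.
walk_files = serialized_walks.write_walks_to_disk(
    G, "toy.walks", num_paths=10, path_length=40,
    alpha=0, rand=random.Random(0), num_workers=4)

# Vertex frequencies, counted in parallel across the walk files.
vertex_counts = serialized_walks.count_textfiles(walk_files, workers=4)

# Stream the walks from disk instead of holding them all in memory.
corpus = serialized_walks.WalksCorpus(walk_files)
The corpus and vertex_counts are then handed to the Skipgram wrapper shown in the next file.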
skipgram.py (essentially a thin wrapper around gensim's Word2Vec that just fixes a few parameters)
from collections import Counter, Mapping
from concurrent.futures import ProcessPoolExecutor
import logging
from multiprocessing import cpu_count
from six import string_types
from gensim.models import Word2Vec
from gensim.models.word2vec import Vocab
logger = logging.getLogger("deepwalk")
class Skipgram(Word2Vec):
"""A subclass to allow more customization of the Word2Vec internals."""
def __init__(self, vocabulary_counts=None, **kwargs):
self.vocabulary_counts = None
kwargs["min_count"] = kwargs.get("min_count", 0)
kwargs["workers"] = kwargs.get("workers", cpu_count())
kwargs["size"] = kwargs.get("size", 128)
kwargs["sentences"] = kwargs.get("sentences", None)
kwargs["window"] = kwargs.get("window", 10)
kwargs["sg"] = 1
kwargs["hs"] = 1
if vocabulary_counts != None:
self.vocabulary_counts = vocabulary_counts
super(Skipgram, self).__init__(**kwargs)
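After training, the file written by model.wv.save_word2vec_format() is an ordinary word2vec text file, so the learned node embeddings can be reloaded and queried with gensim. A small sketch (the file name is illustrative, and the KeyedVectors API shown here is gensim's, not part of DeepWalk):
from gensim.models import KeyedVectors

# karate.embeddings is whatever file was passed as --output
vectors = KeyedVectors.load_word2vec_format("karate.embeddings")

print(vectors["34"])               # the 64-dimensional vector learned for node 34
print(vectors.most_similar("34"))  # nodes whose embeddings are closest to node 34's
These vectors are what the paper feeds into one-vs-rest logistic regression for the multi-label classification experiments.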