- DeepWalk是一种图嵌入算法,其原理见【图表示学习】word2vec与DeepWalk
- DeepWalk的源代码Github为https://github.com/phanein/deepwalk
- 本文是对源代码的简化和重新整理,方便读者快速地理解DeepWalk的原理
- 源代码包含了并行处理、数据序列化来处理大规模的数据,但是本文将这部分删除
- 源代码主要在文件夹deepwalk下,代码入口为文件__main__.py;数据在文件夹example_graphs
from six import iterkeys
from six.moves import range, zip, zip_longest
from collections import defaultdict, Iterable
from scipy.io import loadmat
from scipy.sparse import issparse
from gensim.models import Word2Vec, KeyedVectors
from sklearn.utils import shuffle as skshuffle
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.utils import shuffle as skshuffle
from sklearn.multiclass import OneVsRestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import f1_score
import random
import numpy
import warnings
warnings.filterwarnings('ignore')
D:\work\Anaconda3\lib\site-packages\ipykernel_launcher.py:3: DeprecationWarning: Using or importing the ABCs from 'collections' instead of from 'collections.abc' is deprecated, and in 3.8 it will stop working
This is separate from the ipykernel package so we can avoid doing imports until
使用邻接表的方式存储图
class Graph(defaultdict):
    """Graph stored as an adjacency list.

    Keys are node ids and values are lists of neighbouring node ids.
    Because this is a ``defaultdict(list)``, subscripting a missing node
    (``graph[n]``) inserts it with an empty neighbour list; the read-only
    queries below therefore use ``.get`` so that they never grow the graph.
    """

    def __init__(self):
        super(Graph, self).__init__(list)

    def nodes(self):
        """Return all node ids (the dictionary keys)."""
        return self.keys()

    def adjacency_iter(self):
        """Return the ``(node, neighbours)`` pairs of the adjacency list."""
        return self.items()

    def subgraph(self, nodes=None):
        """Return the subgraph induced by ``nodes``.

        Only nodes already present in this graph are kept, and each kept
        node retains only the neighbours that are themselves in ``nodes``.
        With no argument an empty graph is returned.
        """
        # Materialise once so generators work, and build a set so the
        # per-neighbour membership test is O(1).
        ordered = [] if nodes is None else list(nodes)
        wanted = set(ordered)
        subgraph = Graph()
        for n in ordered:
            if n in self:
                subgraph[n] = [x for x in self[n] if x in wanted]
        return subgraph

    def check_self_loops(self):
        """Return True if any node lists itself as a neighbour, else False."""
        return any(node in neighbors for node, neighbors in self.items())

    def remove_self_loops(self):
        """Remove every self-loop in place and return the graph."""
        for node, neighbors in self.items():
            if node in neighbors:
                # Slice-assign so all occurrences go (list.remove drops only
                # the first) while the list object itself is preserved.
                neighbors[:] = [other for other in neighbors if other != node]
        return self

    def make_consistent(self):
        """Sort and de-duplicate every neighbour list, then drop self-loops."""
        # Iterate over a snapshot of the keys because values are reassigned.
        for node in list(self):
            self[node] = sorted(set(self[node]))
        self.remove_self_loops()
        return self

    def make_undirected(self):
        """Add the reverse of every edge, then normalise the adjacency lists."""
        # Snapshot the keys: self[other] may insert nodes that only ever
        # appeared as neighbours.
        for v in list(self):
            for other in self[v]:
                if v != other:
                    self[other].append(v)
        self.make_consistent()
        return self

    def has_edge(self, v1, v2):
        """Return True if an edge exists between ``v1`` and ``v2`` in either direction."""
        return v2 in self.get(v1, ()) or v1 in self.get(v2, ())

    def degree(self, nodes=None):
        """Return the degree of one node, or a ``{node: degree}`` dict for an iterable.

        Unknown nodes have degree 0 and are not added to the graph.
        """
        # Imported here rather than relying on ``collections.Iterable``,
        # which was removed in Python 3.10.
        try:
            from collections.abc import Iterable
        except ImportError:  # Python 2
            from collections import Iterable
        if isinstance(nodes, Iterable):
            return {v: len(self.get(v, ())) for v in nodes}
        return len(self.get(nodes, ()))

    def order(self):
        """Return the number of nodes."""
        return len(self)

    def number_of_edges(self):
        """Return half the sum of all node degrees.

        This equals the edge count when every edge is stored in both
        directions (an undirected graph).
        """
        return sum(len(neighbors) for neighbors in self.values()) / 2

    def number_of_nodes(self):
        """Return the number of nodes."""
        return self.order()

    def random_walk(self, path_length, alpha=0, rand=random.Random(), start=None):
        """Return a truncated random walk as a list of node ids converted to ``str``.

        path_length: maximum number of nodes in the walk.
        alpha: probability of jumping back to the first node at each step.
        rand: random generator; the default instance is created once and
            shared by every call that does not pass its own.
        start: first node of the walk; a node chosen uniformly at random
            when None.

        The walk stops early when it reaches a node without neighbours.
        """
        # Compare against None explicitly: node id 0 is a valid start.
        if start is not None:
            path = [start]
        else:
            path = [rand.choice(list(self.keys()))]
        while len(path) < path_length:
            neighbors = self.get(path[-1], ())
            if not neighbors:
                # Dead end: no neighbours to move to.
                break
            if rand.random() >= alpha:
                path.append(rand.choice(neighbors))
            else:
                # Restart: jump back to the first node of the walk.
                path.append(path[0])
        return [str(node) for node in path]
def grouper(n, iterable, padvalue=None):
    """Split ``iterable`` into consecutive n-tuples, padding the last one.

    grouper(3, 'abcdefg', 'x') --> ('a','b','c'), ('d','e','f'), ('g','x','x')
    """
    # n references to one shared iterator: each output tuple pulls the
    # next n items from it.
    shared = iter(iterable)
    slots = [shared] * n
    return zip_longest(*slots, fillvalue=padvalue)
数据格式为:
1 2 3 4 5 6 7 8 9 11 12 13 14 18 20 22 32
2 1 3 4 8 14 18 20 22 31
3 1 2 4 8 9 10 14 28 29 33
4 1 2 3 8 13 14
5 1 7 11
6 1 7 11 17
其中每行的第一个数表示图中的某个点,后面一系列数表示与该点相邻的点。
def load_adjacencylist(file_, undirected=False, chunksize=10000, unchecked=True):
    """Read an adjacency-list file and return it as a Graph.

    The file is consumed in chunks of ``chunksize`` lines; every chunk is
    parsed with ``parse_adjacencylist_unchecked`` and the collected rows
    are turned into a graph by ``from_adjlist_unchecked``. When
    ``undirected`` is true the result is passed through
    ``make_undirected``. ``unchecked`` is accepted but not used.
    """
    rows = []
    with open(file_) as handle:
        for chunk in grouper(int(chunksize), handle):
            rows.extend(parse_adjacencylist_unchecked(chunk))
    graph = from_adjlist_unchecked(rows)
    if not undirected:
        return graph
    # Convert to an undirected graph.
    return graph.make_undirected()
def parse_adjacencylist_unchecked(f):
    """Parse adjacency-list lines into rows of integers.

    Input:  ('1 2 3', '2 1', '3 1')
    Output: [[1, 2, 3], [2, 1], [3, 1]]

    Skipped entries: ``None``/empty items (such as the padding values
    produced by ``grouper``), blank or whitespace-only lines, and lines
    whose first token starts with ``#``.
    """
    adjlist = []
    for line in f:
        if not line:
            continue
        fields = line.split()
        # A blank line would otherwise produce an empty row, which has no
        # node id in position 0.
        if not fields or fields[0].startswith("#"):
            continue
        adjlist.append([int(x) for x in fields])
    return adjlist
def from_adjlist_unchecked(adjlist):
    """Build a Graph from adjacency rows.

    Input:  [[1, 2, 3], [2, 1], [3, 1]]
    Output: a Graph such as {1: [2, 3], 2: [1], 3: [1]}

    The first element of each row is the node; the rest are its neighbours.
    """
    graph = Graph()
    for entry in adjlist:
        graph[entry[0]] = entry[1:]
    return graph
# Load the graph stored in ./data/karate.adjlist (adjacency-list format) and print it.
G1 = load_adjacencylist('./data/karate.adjlist')
print(G1)
Graph(, {1: [2, 3, 4, 5, 6, 7, 8, 9, 11, 12, 13, 14, 18, 20, 22, 32], 2: [1, 3, 4, 8, 14, 18, 20, 22, 31], 3: [1, 2, 4, 8, 9, 10, 14, 28, 29, 33], 4: [1, 2, 3, 8, 13, 14], 5: [1, 7, 11], 6: [1, 7, 11, 17], 7: [1, 5, 6, 17], 8: [1, 2, 3, 4], 9: [1, 3, 31, 33, 34], 10: [3, 34], 11: [1, 5, 6], 12: [1], 13: [1, 4], 14: [1, 2, 3, 4, 34], 15: [33, 34], 16: [33, 34], 17: [6, 7], 18: [1, 2], 19: [33, 34], 20: [1, 2, 34], 21: [33, 34], 22: [1, 2], 23: [33, 34], 24: [26, 28, 30, 33, 34], 25: [26, 28, 32], 26: [24, 25, 32], 27: [30, 34], 28: [3, 24, 25, 34], 29: [3, 32, 34], 30: [24, 27, 33, 34], 31: [2, 9, 33, 34], 32: [1, 25, 26, 29, 33, 34], 33: [3, 9, 15, 16, 19, 21, 23, 24, 30, 31, 32, 34], 34: [9, 10, 14, 15, 16, 19, 20, 21, 23, 24, 27, 28, 29, 30, 31, 32, 33]})
数据格式为:
0 1
0 2
3 7
3 8
其中每一行代表连接两个顶点的边。
def load_edgelist(file_, undirected=True):
    """Read an edge-list file (one ``source target`` pair per line) into a Graph.

    Only the first two whitespace-separated fields of a line are used.
    Blank lines and lines whose first token starts with ``#`` are skipped.
    When ``undirected`` is true the reverse of every edge is added too.
    The adjacency lists are sorted, de-duplicated and cleared of
    self-loops before the graph is returned.
    """
    G = Graph()
    with open(file_) as f:
        for line in f:
            fields = line.split()
            # Without this guard a blank line fails the two-value unpack below.
            if not fields or fields[0].startswith("#"):
                continue
            x, y = fields[:2]
            x = int(x)
            y = int(y)
            G[x].append(y)
            if undirected:
                # Undirected graph: store the reverse edge as well.
                G[y].append(x)
    G.make_consistent()
    return G
# Load the graph stored in ./data/p2p-Gnutella08.edgelist (edge-list format) and print its node count.
G2 = load_edgelist("./data/p2p-Gnutella08.edgelist")
print(G2.number_of_nodes())
6301
def load_matfile(file_, variable_name="network", undirected=True):
    """Load the matrix named ``variable_name`` from a .mat file and convert it to a Graph."""
    contents = loadmat(file_)
    return from_numpy(contents[variable_name], undirected)
def from_numpy(x, undirected=True):
    """Convert a scipy sparse adjacency matrix into a Graph.

    Every stored entry (i, j) becomes an edge i -> j. Dense input is
    rejected with an Exception. When ``undirected`` is true the reverse
    edges are added; the adjacency lists are normalised with
    ``make_consistent`` in either case.
    """
    graph = Graph()
    if not issparse(x):
        raise Exception("Dense matrices not yet supported.")
    # The COO form exposes parallel row/column index arrays.
    coo = x.tocoo()
    for src, dst in zip(coo.row, coo.col):
        graph[src].append(dst)
    if undirected:
        graph.make_undirected()
    graph.make_consistent()
    return graph
# Load the "network" matrix from ./data/blogcatalog.mat as an undirected graph and print its node count.
G3 = load_matfile("./data/blogcatalog.mat")
print(G3.number_of_nodes())
10312
# Hyper-parameters for walk generation and Word2Vec training.
seed = 0 # seed for the random generator that drives the walks
number_walks = 80 # number of passes over the graph; each pass starts one walk from every node
walk_length = 40 # length of each random walk
num_walks = len(G3.nodes()) * number_walks # total number of walks that will be generated
data_size = num_walks * walk_length # upper bound on the number of tokens in the corpus
representation_size = 128 # dimensionality of the embedding vectors
window_size = 10 # context window size for word2vec training
workers = 4 # number of parallel workers
def build_deepwalk_corpus(G, num_paths, path_length, alpha=0, rand=random.Random(0)):
    """Generate the random-walk corpus for a graph.

    Makes ``num_paths`` passes over the nodes. Each pass reshuffles the
    node order with ``rand`` and starts one walk of at most
    ``path_length`` nodes from every node. Returns the list of walks.
    """
    corpus = []
    visit_order = list(G.nodes())
    for _ in range(num_paths):
        rand.shuffle(visit_order)
        corpus.extend(
            G.random_walk(path_length, rand=rand, alpha=alpha, start=origin)
            for origin in visit_order
        )
    return corpus
# Generate the walk corpus for G3 with a generator seeded by `seed`.
walks = build_deepwalk_corpus(G3,
num_paths=number_walks,
path_length=walk_length,
alpha=0,
rand=random.Random(seed))
# Number of walks, length of the first walk, and the first walk itself.
print(len(walks))
print(len(walks[0]))
print(walks[0])
824960
40
['597', '4373', '1360', '7894', '4162', '4445', '1452', '1635', '3764', '8343', '8762', '8323', '5255', '4175', '445', '1230', '1704', '327', '3197', '3280', '3695', '3857', '855', '1555', '4414', '862', '2357', '686', '1969', '2009', '3338', '4560', '5090', '6622', '6072', '4637', '5050', '4804', '7072', '6916']
# Train Word2Vec on the walks, treating each walk as a sentence of node-id tokens.
# sg=1 selects skip-gram and hs=1 hierarchical softmax; min_count=0 keeps every node.
# NOTE(review): `size=` is the gensim 3.x keyword; gensim 4 renamed it to
# `vector_size` - confirm against the installed gensim version.
model = Word2Vec(walks,
size=representation_size,
window=window_size,
min_count=0, sg=1, hs=1,
workers=workers)
# Save the embeddings in word2vec text format.
model.wv.save_word2vec_format("blogcatalog.embeddings")
# Settings for the evaluation step.
embeddings_file = "blogcatalog.embeddings"
matfile = "./data/blogcatalog.mat"
adj_matrix_name = "network"
label_matrix_name = "group"
num_shuffles = 10
def sparse2graph(x):
    """Convert a sparse adjacency matrix into a dict-based adjacency list.

    Keys are row indices and values are lists of column indices, all
    converted to ``str``.
    """
    coo = x.tocoo()
    adjacency = defaultdict(set)
    for src, dst in zip(coo.row, coo.col):
        adjacency[src].add(dst)
    return {
        str(src): [str(dst) for dst in targets]
        for src, targets in adjacency.items()
    }
class TopKRanker(OneVsRestClassifier):
    """One-vs-rest classifier that predicts a requested number of labels per sample."""

    def predict(self, X, top_k_list):
        """Return, for each row of ``X``, its ``k`` most probable labels.

        X: feature matrix with one row per sample.
        top_k_list: number of labels to return for each sample; it must
            have one entry per row of ``X``.

        Returns a list of label lists ordered by increasing probability.
        A sample with ``k <= 0`` gets an empty list.
        """
        assert X.shape[0] == len(top_k_list)
        probs = numpy.asarray(super(TopKRanker, self).predict_proba(X))
        all_labels = []
        for i, k in enumerate(top_k_list):
            if k <= 0:
                # argsort()[-0:] is the whole array, which would return
                # every class instead of none.
                all_labels.append([])
                continue
            top_indices = probs[i, :].argsort()[-k:]
            all_labels.append(self.classes_[top_indices].tolist())
        return all_labels
# Load the trained embeddings.
model = KeyedVectors.load_word2vec_format(embeddings_file, binary=False)
# Load the adjacency and label matrices.
mat = loadmat(matfile)
A = mat[adj_matrix_name]
graph = sparse2graph(A)
labels_matrix = mat[label_matrix_name]  # (number of nodes, number of label classes)
labels_count = labels_matrix.shape[1]
# Multi-label binarizer over the fixed label set. `classes` is passed by
# keyword because it is keyword-only in current scikit-learn releases.
mlb = MultiLabelBinarizer(classes=range(labels_count))
# Embedding matrix: row i holds the vector of node i.
features_matrix = numpy.asarray([model[str(node)] for node in range(len(graph))])
# Shuffle features and labels together, num_shuffles times, keeping each result.
shuffles = [skshuffle(features_matrix, labels_matrix) for _ in range(num_shuffles)]
def _rows_to_label_lists(label_rows):
    """Return one list of column indices per row of a sparse label matrix."""
    per_row = [[] for _ in range(label_rows.shape[0])]
    coo = label_rows.tocoo()
    for row, col in zip(coo.row, coo.col):
        per_row[row].append(col)
    return per_row


# Scores collected per training fraction.
all_results = defaultdict(list)
# Fractions of the data used for training: 0.1, 0.5 and 0.9.
training_percents = [0.1, 0.5, 0.9]
averages = ["micro", "macro"]
# One round per training fraction ...
for train_percent in training_percents:
    # ... and within it one training run per shuffled copy of the data.
    for X, y in shuffles:
        training_size = int(train_percent * X.shape[0])
        # Training split.
        X_train = X[:training_size, :]
        y_train_ = y[:training_size]
        y_train = _rows_to_label_lists(y_train_)
        assert sum(len(l) for l in y_train) == y_train_.nnz
        # Test split.
        X_test = X[training_size:, :]
        y_test_ = y[training_size:]
        y_test = _rows_to_label_lists(y_test_)
        # Train the model.
        clf = TopKRanker(LogisticRegression(solver='lbfgs'))
        clf.fit(X_train, y_train_)
        # Predict as many labels per test sample as it truly has.
        top_k_list = [len(l) for l in y_test]
        preds = clf.predict(X_test, top_k_list)
        # Evaluate with micro- and macro-averaged F1.
        results = {
            average: f1_score(mlb.fit_transform(y_test), mlb.fit_transform(preds), average=average)
            for average in averages
        }
        all_results[train_percent].append(results)
# Print the per-shuffle scores and their mean for every training fraction.
print('Results, using embeddings of dimensionality', X.shape[1])
print('-------------------')
for train_percent in sorted(all_results):
    runs = all_results[train_percent]
    print('Train percent:', train_percent)
    for shuffle_no, result in enumerate(runs, start=1):
        print('Shuffle #%d: ' % shuffle_no, result)
    # Sum each metric over the runs, then divide by the number of runs.
    avg_score = defaultdict(float)
    for score_dict in runs:
        for metric, score in score_dict.items():
            avg_score[metric] += score
    run_count = len(runs)
    for metric in avg_score:
        avg_score[metric] /= run_count
    print('Average score:', dict(avg_score))
    print('-------------------')
Results, using embeddings of dimensionality 128
-------------------
Train percent: 0.1
Shuffle #1: {'micro': 0.3581986673814812, 'macro': 0.2033224239333088}
Shuffle #2: {'micro': 0.3652487714987715, 'macro': 0.21577524088832908}
Shuffle #3: {'micro': 0.3623166141792765, 'macro': 0.21066689705245478}
Shuffle #4: {'micro': 0.3620649919336253, 'macro': 0.21171869067147162}
Shuffle #5: {'micro': 0.35988947731982507, 'macro': 0.20222950493659103}
Shuffle #6: {'micro': 0.35737906636929934, 'macro': 0.21090059048705442}
Shuffle #7: {'micro': 0.3616287094547964, 'macro': 0.20471820148032435}
Shuffle #8: {'micro': 0.3655955211289209, 'macro': 0.21922779915266496}
Shuffle #9: {'micro': 0.35839791299010204, 'macro': 0.21189205464993427}
Shuffle #10: {'micro': 0.35914952410193424, 'macro': 0.2078152517515965}
Average score: {'micro': 0.36098692563580326, 'macro': 0.20982666550037302}
-------------------
Train percent: 0.5
Shuffle #1: {'micro': 0.4175778546712803, 'macro': 0.27454889378894143}
Shuffle #2: {'micro': 0.4148700939745716, 'macro': 0.27046150803807273}
Shuffle #3: {'micro': 0.41626129256428074, 'macro': 0.2696294691077568}
Shuffle #4: {'micro': 0.41131664853101196, 'macro': 0.2687751830941183}
Shuffle #5: {'micro': 0.4086511885019347, 'macro': 0.2628792857558395}
Shuffle #6: {'micro': 0.42060622914349277, 'macro': 0.2676333163255493}
Shuffle #7: {'micro': 0.4159658072521715, 'macro': 0.2685014990215809}
Shuffle #8: {'micro': 0.4115448504983389, 'macro': 0.271229381068436}
Shuffle #9: {'micro': 0.41506565307532833, 'macro': 0.27456128536082786}
Shuffle #10: {'micro': 0.41693180246230455, 'macro': 0.2744381873233862}
Average score: {'micro': 0.41487914206747156, 'macro': 0.27026580088845087}
-------------------
Train percent: 0.9
Shuffle #1: {'micro': 0.42203742203742206, 'macro': 0.25880516173753676}
Shuffle #2: {'micro': 0.4207232267037552, 'macro': 0.28166124425360894}
Shuffle #3: {'micro': 0.43083275980729524, 'macro': 0.28905126447658364}
Shuffle #4: {'micro': 0.4314789687924016, 'macro': 0.2977340602872717}
Shuffle #5: {'micro': 0.44467640918580376, 'macro': 0.289030378588737}
Shuffle #6: {'micro': 0.4166666666666667, 'macro': 0.2857354498771346}
Shuffle #7: {'micro': 0.44308111792774363, 'macro': 0.3100004806293008}
Shuffle #8: {'micro': 0.42270194986072424, 'macro': 0.28487379305227356}
Shuffle #9: {'micro': 0.451985559566787, 'macro': 0.2897217493931982}
Shuffle #10: {'micro': 0.428067700987306, 'macro': 0.2626375242068254}
Average score: {'micro': 0.4312251781535906, 'macro': 0.28492511065024706}
-------------------