DeepWalk algorithm:
Skip-Gram algorithm:
Hierarchical Softmax
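For reference, DeepWalk feeds random-walk sequences into Skip-Gram, which maximizes the log-probability of the context nodes within a window of size $c$ around each node of a walk of length $T$ (this is the standard Word2Vec formulation, sketched here for context rather than taken from the code below):

$$\max \; \frac{1}{T} \sum_{t=1}^{T} \sum_{-c \le j \le c,\; j \ne 0} \log p(w_{t+j} \mid w_t)$$

Hierarchical softmax replaces the full softmax over the node vocabulary $V$ with a sequence of binary decisions along the root-to-leaf path of a Huffman tree, reducing the cost of each $p(w \mid w_t)$ from $O(|V|)$ to $O(\log |V|)$. Note that the training run below uses negative sampling instead (hs=0).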
import networkx as nx
import pandas as pd
import numpy as np
import random
from tqdm import tqdm
import matplotlib.pyplot as plt
%matplotlib inline
plt.rcParams['font.sans-serif']=['SimHei'] # display Chinese labels correctly
plt.rcParams['axes.unicode_minus']=False # display minus signs correctly
df = pd.read_csv("data/wiki/seealsology-data.tsv", sep = "\t")
df.head()
G = nx.from_pandas_edgelist(df, "source", "target",
                            edge_attr=True, create_using=nx.Graph())
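A quick look at the size of the resulting graph is a useful sanity check before generating walks (an illustrative addition; the actual counts depend on the seealsology export):

# number of nodes and edges in the graph
len(G), G.number_of_edges()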
def get_randomwalk(node, path_length):
    '''
    Given a start node and a walk length, generate a random-walk node sequence.
    '''
    random_walk = [node]
    for i in range(path_length-1):
        # collect neighbors, excluding nodes already visited on this walk
        temp = list(G.neighbors(node))
        temp = list(set(temp) - set(random_walk))
        if len(temp) == 0:
            break
        # randomly choose the next node from the remaining neighbors
        random_node = random.choice(temp)
        random_walk.append(random_node)
        node = random_node
    return random_walk
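A one-off call shows what a single walk looks like (illustrative; it assumes the node 'random forest', which is also used in the embedding lookup further below, exists in this graph):

# generate one walk of at most 5 nodes starting from 'random forest'
get_randomwalk('random forest', 5)

Note that this variant never revisits a node and may stop before reaching path_length, which differs slightly from vanilla DeepWalk, where revisits are allowed.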
all_nodes = list(G.nodes())
# number of random-walk sequences started from each node
gamma = 10
# maximum length of each random-walk sequence
walk_length = 5
# generate the random-walk sequences
random_walks = []
for n in tqdm(all_nodes):
    # start gamma random walks from every node
    for i in range(gamma):
        random_walks.append(get_randomwalk(n, walk_length))
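As a sanity check (illustrative, not part of the original notebook), the corpus should contain one sequence per node per repetition, i.e. gamma * len(all_nodes) walks:

# total number of walks and one example sequence
len(random_walks), random_walks[0]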
from gensim.models import Word2Vec
model = Word2Vec(
    vector_size=256,   # embedding dimension
    window=4,          # context window size
    sg=1,              # use Skip-Gram
    hs=0,              # no hierarchical softmax
    negative=10,       # negative sampling with 10 negative samples
    alpha=0.03,        # initial learning rate
    min_alpha=0.0007,  # minimum learning rate
    seed=14            # random seed
)
# build the vocabulary from the random-walk sequences
model.build_vocab(random_walks, progress_per=2)
# train the Word2Vec model on the walks
model.train(random_walks, total_examples=model.corpus_count, epochs=50, report_delay=1)
# look up the embedding of one node
model.wv.get_vector('random forest').shape
# find the nodes most similar to a given node
model.wv.similar_by_word('decision tree')
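The similarity between two specific node embeddings can also be queried directly (a short follow-up sketch using the two node names already seen above):

# cosine similarity between the embeddings of two nodes
model.wv.similarity('decision tree', 'random forest')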