# School: Stanford
# Teacher: Prof. Christopher Manning
# Library: PyTorch
# Gensim word vector visualization of various word vectors
import numpy as np
# Get the interactive Tools for Matplotlib
import matplotlib.pyplot as plt
plt.style.use('ggplot')
from sklearn. manifold import TSNE
from sklearn. decomposition import PCA
from gensim.test.utils import datapath, get_tmpfile
from gensim. models import KeyedVectors
from gensim. scripts .glove2word2vec import glove2word2vec
# Convert the GloVe text file into the word2vec text format.
# Raw strings keep the Windows backslashes literal; the previous literals
# relied on the invalid escape sequence "\P", which newer Python versions
# warn about. The resulting path strings are unchanged.
glove_file = datapath(r'D:\Python-text\nlp_text\nlp_datawhale\task01\glove.6B\glove.6B.100d.txt')
word2vec_glove_file = get_tmpfile(r"D:\Python-text\nlp_text\nlp_datawhale\task01\glove.6B\glove.6B.100d.word2vec.txt")
print(glove2word2vec(glove_file, word2vec_glove_file))

# Load the pre-trained word vectors from the converted file.
model = KeyedVectors.load_word2vec_format(word2vec_glove_file)

# Words most similar to "obama".
print(model.most_similar('obama'))

# Words most similar to "banana".
print(model.most_similar('banana'))

# Words least similar to "banana". The word is wrapped in a list because
# older gensim releases (3.x) iterate a bare string passed as `negative`
# character by character, which would query 'b', 'a', 'n' instead of the word.
print(model.most_similar(negative=['banana']))

# Analogy query: woman + king - man; print the top hit and its score.
result = model.most_similar(positive=['woman', 'king'], negative=['man'])
print("{}: {:.4f}".format(*result[0]))
def analogy(x1, x2, y1):
    """Answer the analogy "x1 is to x2 as y1 is to ?" with the global model.

    Queries ``model.most_similar`` with ``y1`` and ``x2`` as positive terms
    and ``x1`` as the negative term, then returns the first element of the
    top-ranked result.
    """
    candidates = model.most_similar(positive=[y1, x2], negative=[x1])
    top_hit = candidates[0]
    return top_hit[0]
# Scatter-plot visualization of word vectors.
def display_pca_scatterplot(model, words=None, sample=0):
    """Scatter-plot word vectors projected onto their first two PCA components.

    Args:
        model: Word-vector container that supports ``model[word]`` lookup and
            exposes its vocabulary as ``key_to_index`` (gensim >= 4) or
            ``vocab`` (gensim 3.x).
        words: Words to plot. When ``None``, the words are taken from the
            model's vocabulary (see ``sample``).
        sample: When ``words`` is ``None`` and ``sample > 0``, plot that many
            words drawn with ``np.random.choice`` (its default draws with
            replacement, so a word may repeat); otherwise plot the whole
            vocabulary.

    The figure is created but not shown; call ``plt.show()`` afterwards.
    """
    # `is None` rather than `== None`: comparing a NumPy array of words with
    # `==` is element-wise and cannot be used as an `if` condition.
    if words is None:
        # gensim 4 replaced `vocab` with `key_to_index`; fall back to `vocab`
        # only when the newer attribute is absent.
        vocabulary = getattr(model, "key_to_index", None)
        if vocabulary is None:
            vocabulary = model.vocab
        if sample > 0:
            words = np.random.choice(list(vocabulary), sample)
        else:
            words = list(vocabulary)

    word_vectors = np.array([model[w] for w in words])
    # Fit PCA on all components, then keep the first two for plotting.
    twodim = PCA().fit_transform(word_vectors)[:, :2]

    plt.figure(figsize=(6, 6))
    plt.scatter(twodim[:, 0], twodim[:, 1], edgecolors='k', c='r')
    for word, (x, y) in zip(words, twodim):
        # Offset each label slightly from its marker.
        plt.text(x + 0.05, y + 0.05, word)
# Plot a hand-picked word list (drinks, dishes, animals, countries and
# education-related terms).
# NOTE: 'monkey' is listed twice; the duplicate is kept as-is so the input
# to the plot is unchanged.
display_pca_scatterplot(
    model,
    words=[
        'coffee', 'tea', 'beer', 'wine', 'brandy', 'rum', 'champagne', 'water',
        'spaghetti', 'borscht', 'hamburger', 'pizza', 'falafel', 'sushi', 'meatballs',
        'dog', 'horse', 'cat', 'monkey', 'parrot', 'koala', 'lizard',
        'frog', 'toad', 'monkey', 'ape', 'kangaroo', 'wombat', 'wolf',
        'france', 'germany', 'hungary', 'luxembourg', 'australia', 'fiji', 'china',
        'homework', 'assignment', 'problem', 'exam', 'test', 'class',
        'school', 'college', 'university', 'institute',
    ],
)
plt.show()

# Plot a sample of 300 words taken from the model's vocabulary.
display_pca_scatterplot(model, words=None, sample=300)
plt.show()
# Stanford CS224n (2019) course page: https://web.stanford.edu/class/archive/cs/cs224n/cs224n.1194/
# Bilibili video: https://www.bilibili.com/video/BV1s4411N7fC?p=2