Work Log | Loading GloVe Vectors in PyTorch

Official site: http://nlp.stanford.edu/projects/glove/
Official code: https://github.com/stanfordnlp/GloVe
Note that the official code is written in C and runs on Linux.

The first three lines of glove.6B.50d.txt:

the 0.418 0.24968 -0.41242 0.1217 0.34527 -0.044457 -0.49688 -0.17862 -0.00066023 -0.6566 0.27843 -0.14767 -0.55677 0.14658 -0.0095095 0.011658 0.10204 -0.12792 -0.8443 -0.12181 -0.016801 -0.33279 -0.1552 -0.23131 -0.19181 -1.8823 -0.76746 0.099051 -0.42125 -0.19526 4.0071 -0.18594 -0.52287 -0.31681 0.00059213 0.0074449 0.17778 -0.15897 0.012041 -0.054223 -0.29871 -0.15749 -0.34758 -0.045637 -0.44251 0.18785 0.0027849 -0.18411 -0.11514 -0.78581
, 0.013441 0.23682 -0.16899 0.40951 0.63812 0.47709 -0.42852 -0.55641 -0.364 -0.23938 0.13001 -0.063734 -0.39575 -0.48162 0.23291 0.090201 -0.13324 0.078639 -0.41634 -0.15428 0.10068 0.48891 0.31226 -0.1252 -0.037512 -1.5179 0.12612 -0.02442 -0.042961 -0.28351 3.5416 -0.11956 -0.014533 -0.1499 0.21864 -0.33412 -0.13872 0.31806 0.70358 0.44858 -0.080262 0.63003 0.32111 -0.46765 0.22786 0.36034 -0.37818 -0.56657 0.044691 0.30392
. 0.15164 0.30177 -0.16763 0.17684 0.31719 0.33973 -0.43478 -0.31086 -0.44999 -0.29486 0.16608 0.11963 -0.41328 -0.42353 0.59868 0.28825 -0.11547 -0.041848 -0.67989 -0.25063 0.18472 0.086876 0.46582 0.015035 0.043474 -1.4671 -0.30384 -0.023441 0.30589 -0.21785 3.746 0.0042284 -0.18436 -0.46209 0.098329 -0.11907 0.23919 0.1161 0.41705 0.056763 -6.3681e-05 0.068987 0.087939 -0.10285 -0.13931 0.22314 -0.080803 -0.35652 0.016413 0.10216

Each line starts with a word, followed by 50 floats that make up its 50-d vector.
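A quick way to sanity-check the format (a minimal sketch; adjust the path to where the file lives):

import numpy as np

with open('glove.6B.50d.txt', 'rb') as f:
	tokens = f.readline().decode().strip().split()
	word, vector = tokens[0], np.array(tokens[1:], dtype=np.float32)
	print(word, vector.shape)  # -> the (50,)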

Given a vocabulary vocab, we want to find the weight vector for each word in it. Code:

import numpy as np
import torch

def get_glove_vector(vocab):
	glove = {}
	with open(txt_file_path, 'rb') as ifs:
		for line in ifs:
			tokens = line.decode().strip().split()
			word = tokens[0]
			vector = np.array(tokens[1:], dtype=np.float32)
			glove[word] = vector

	weights = []
	for i in range(len(vocab)):
		word = vocab.idx2word[i]
		if word in glove:
			# pretrained GloVe vector, converted to a float tensor
			weight = torch.from_numpy(glove[word])
		elif word == '<unk>':  # assuming the unknown-word token is '<unk>'
			weight = torch.FloatTensor(opt.word_projected_size).normal_(0, 1)
		else:  # '<pad>', '<start>', or other special tokens
			weight = torch.zeros(opt.word_projected_size)
		weights.append(weight)
	weights = torch.stack(weights, 0)
	torch.save(weights, glove_weight_path)
	print('The size of glove weight is %s' % str(weights.size()))
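Once saved, the weight matrix can be loaded straight into an embedding layer via nn.Embedding.from_pretrained (available since PyTorch 0.4):

weights = torch.load(glove_weight_path)
emb = torch.nn.Embedding.from_pretrained(weights, freeze=False)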

For glove.42B.300d.txt, each line carries 300 numbers; note that every line ends with a newline (enter), which must be stripped:

line = line.decode().strip().split('\t')
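Equivalently, calling split() with no argument splits on any whitespace run, so it works whether the delimiter is a tab or a space and also drops the trailing newline (a minimal sketch):

with open('glove.42B.300d.txt', 'rb') as f:
	for line in f:
		tokens = line.decode().strip().split()  # handles tabs, spaces, and the trailing '\n'
		word, vector = tokens[0], np.array(tokens[1:], dtype=np.float32)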

When a preprocessed glove.42B.300d.pt file is available, just read it with torch.load:

def get_glove_vector(vocab):
	weights = []
	glove = torch.load(pt_file_path)
	glove_dict = glove[0]  # word -> index mapping
	for i in range(len(vocab)):
		word = vocab.idx2word[i]
		if word in glove_dict:
			glove_idx = glove_dict[word]
			weight = glove[1][glove_idx]  # glove[1] holds the vector matrix
		elif word == '<unk>':  # assuming the unknown-word token is '<unk>'
			weight = torch.FloatTensor(opt.word_projected_size).normal_(0, 1)
		else:  # '<pad>', '<start>', or other special tokens
			weight = torch.zeros(opt.word_projected_size)
		weights.append(weight)
	weights = torch.stack(weights, 0)
	torch.save(weights, glove_weight_path)
	print('The size of glove weight is %s' % str(weights.size()))
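The code above assumes the .pt file stores a (word2idx, vectors) pair, matching the glove[0]/glove[1] indexing. Such a file could be built once from the txt and reused; a hypothetical sketch:

word2idx, vectors = {}, []
with open('glove.42B.300d.txt', 'rb') as f:
	for i, line in enumerate(f):
		tokens = line.decode().strip().split()
		word2idx[tokens[0]] = i
		vectors.append(torch.tensor([float(x) for x in tokens[1:]]))
torch.save((word2idx, torch.stack(vectors, 0)), 'glove.42B.300d.pt')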

If only the txt is available, bcolz can be used to save the vocabulary, the word2idx mapping, and the vectors:

import bcolz
import numpy as np
import pickle

words = []
idx = 0
word2idx = {}
vectors = bcolz.carray(np.zeros(1), rootdir=f'{glove_path}/6B.50.dat', mode='w')

with open(f'{glove_path}/glove.6B.50d.txt', 'rb') as f:
	for l in f:
		line = l.decode().split()
		word = line[0]
		words.append(word)
		word2idx[word] = idx
		idx += 1
		vect = np.array(line[1:]).astype(float)
		vectors.append(vect)

vectors = bcolz.carray(vectors[1:].reshape(400000, 50), rootdir=f'{glove_path}/6B.50.dat', mode='w')
vectors.flush()
pickle.dump(words, open(f'{glove_path}/6B.50_words.pkl', 'wb'))
pickle.dump(word2idx, open(f'{glove_path}/6B.50_idx.pkl', 'wb'))

Using words and word2idx, we can build a dictionary that, given a word, returns its vector:

vectors = bcolz.open(f'{glove_path}/6B.50.dat')[:]
words = pickle.load(open(f'{glove_path}/6B.50_words.pkl', 'rb'))
word2idx = pickle.load(open(f'{glove_path}/6B.50_idx.pkl', 'rb'))

glove = {w: vectors[word2idx[w]] for w in words}
# Usage: glove['the'] -> array([...])

To use these vectors in PyTorch, create an embedding layer that maps the dataset's vocabulary (not the same thing as the GloVe dictionary) to vectors, using the torch.nn.Embedding class.
First build a weight matrix of shape (dataset vocabulary size, word vector dimension).

matrix_len = len(target_vocab)
weights_matrix = np.zeros((matrix_len, emb_dim))
words_found = 0

for i, word in enumerate(target_vocab):
	try:
		weights_matrix[i] = glove[word]
		words_found += 1
	except KeyError:
		# random init for words missing from GloVe
		weights_matrix[i] = np.random.normal(scale=0.6, size=(emb_dim, ))
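create_emb_layer below calls .size() and load_state_dict, so it expects a torch tensor; convert the numpy matrix first:

weights_matrix = torch.from_numpy(weights_matrix).float()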

Next, create a network whose first layer is an embedding layer loaded with the weights matrix, followed by a GRU layer:

import torch
import torch.nn as nn

def create_emb_layer(weights_matrix, non_trainable=False):
	num_embeddings, emb_dim = weights_matrix.size()
	emb_layer = nn.Embedding(num_embeddings, emb_dim)
	emb_layer.load_state_dict({'weight': weights_matrix})
	if non_trainable:
		emb_layer.weight.requires_grad = False
	return emb_layer, num_embeddings, emb_dim

class ToyNN(nn.Module):
	def __init__(self, weights_matrix, hidden_size, num_layers):
		super(ToyNN, self).__init__()
		self.embedding, num_embeddings, emb_dim = create_emb_layer(weights_matrix, True)
		self.hidden_size = hidden_size
		self.num_layers = num_layers
		self.gru = nn.GRU(emb_dim, hidden_size, num_layers, batch_first=True)

	def forward(self, inp, hidden):
		return self.gru(self.embedding(inp), hidden)

	def init_hidden(self, batch_size):
		return torch.zeros(self.num_layers, batch_size, self.hidden_size)

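A quick smoke test of the model (a minimal sketch; batch size, sequence length, and hidden size are illustrative):

model = ToyNN(weights_matrix, hidden_size=64, num_layers=2)
inp = torch.randint(0, weights_matrix.size(0), (8, 20))  # batch of 8 sequences of length 20
out, h_n = model(inp, model.init_hidden(batch_size=8))
print(out.size())  # -> torch.Size([8, 20, 64])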