(十)文本embedding的相关操作

1. glove转为word2vec

只需两行代码:

from gensim.scripts.glove2word2vec import glove2word2vec
glove2word2vec('glove.840B.300d.txt', 'test.txt')
# Args: (path to the GloVe vectors file, path for the word2vec-format output file)
# NOTE(review): this helper script was removed in gensim 4.0 — on newer gensim,
# presumably use KeyedVectors.load_word2vec_format(..., binary=False, no_header=True)
# instead; verify against the installed gensim version.

2. torchtext生成embedding

import torch
import numpy as np
import torch.nn as nn
from torch.autograd import Variable
from torchtext import data
import os
from torchtext.vocab import Vectors

# Build a vocabulary and an nn.Embedding layer initialised from a local
# pretrained GloVe file via torchtext (legacy API), then look up rows of
# the embedding matrix by index.
train_data = [['3', '15', '2', '7']]  # toy corpus: one pre-tokenized sentence

# torchtext caches vector files under .vector_cache; create it up front.
if not os.path.exists('.vector_cache'):
    os.mkdir('.vector_cache')
# If the .txt file is not present locally, torchtext downloads it automatically.
vector = Vectors(name='glove.840B.300d.txt')

# Build the vocabulary with torchtext, attaching the pretrained vectors.
TEXT = data.Field(sequential=True)
TEXT.build_vocab(train_data, vectors=vector)
# Inspect the string-to-index mapping, the vector table, and the vocab size.
print(TEXT.vocab.stoi)
print(TEXT.vocab.vectors)
print(len(TEXT.vocab))

# Initialise the embedding weight matrix: vocab size x embedding dimension.
embeds = nn.Embedding(len(TEXT.vocab), vector.dim)
# Overwrite the random init with the pretrained weights. TEXT.vocab.vectors is
# already a torch.Tensor, so copy it directly — no numpy round-trip needed.
embeds.weight.data.copy_(TEXT.vocab.vectors)
print((embeds.weight))
print(embeds.weight.size())
print('======================embedding over======================')

# Create the data iterator. BucketIterator requires a torchtext Dataset of
# Examples — the original passed the raw token list, whose items have no
# .comment_text attribute, so the sort_key would crash. Wrap the corpus first.
fields = [('text', TEXT)]
examples = [data.Example.fromlist([tokens], fields) for tokens in train_data]
train_set = data.Dataset(examples, fields)
train_iter = data.BucketIterator(dataset=train_set, batch_size=8, shuffle=True,
                                 sort_key=lambda ex: len(ex.text),
                                 sort_within_batch=False, repeat=False)

# Look up rows 3 and 5 of the embedding matrix (index access).
# torch.autograd.Variable is deprecated since PyTorch 0.4 — plain tensors
# participate in autograd directly. Avoid shadowing the builtin `input`.
indices = torch.LongTensor([3, 5])
train_embedding = embeds(indices)  # shape: torch.Size([2, 300])
print('example:', train_embedding)
print('Over')

你可能感兴趣的:(python学习)