Goal: download the Tencent word embeddings, preprocess them, and output character embeddings together with the corresponding tokenizer.
Download link for the Tencent embeddings: Tencent AI Lab Embedding Corpus for Chinese Words and Phrases. After extraction it is a txt file of roughly 16 GB, named Tencent_AILab_ChineseEmbedding.txt.
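Before running the cleaning step it can help to confirm the file layout. The snippet below is a minimal sketch, assuming the standard word2vec text format (a header line of "vocabulary_size dimension", then one token followed by 200 floats per line); the exact counts may differ between releases:
with open('Tencent_AILab_ChineseEmbedding.txt', 'r', encoding='utf-8', errors='ignore') as f:
    print(f.readline().strip())          # header line, e.g. "8824330 200"
    for _ in range(3):
        parts = f.readline().split()
        print(parts[0], len(parts) - 1)  # token and the number of vector components (should be 200)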
Python code:
#!/usr/bin/env python
# -*- coding:utf-8 -*-
"""
@Time :2020/2/3
@Name :Zhang Wei
@Contact :[email protected]
@File :tencent.py
@Software :Pycharm
"""
import pickle as pk
# from gensim.models import KeyedVectors
from tqdm import tqdm
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
import numpy as np
# Load a pkl file
def load_pkl(input_path):
    with open(input_path, 'rb') as f:
        loaded_obj = pk.load(f)
    return loaded_obj
# Write a pkl file
def to_pkl(content, output_path):
    with open(output_path, 'wb') as f:
        pk.dump(content, f)
# Load the Tencent word embeddings and clean the data
def load_tencent_word_embedding():
    n = 0
    with open('tencent.txt', 'a', encoding='utf-8', errors='ignore') as w_f:
        with open('Tencent_AILab_ChineseEmbedding.txt', 'r', encoding='utf-8', errors='ignore') as f:
            for i in tqdm(range(8824330)):  # the total line count seems to differ between versions downloaded at different times
                data = f.readline()
                a = data.split()
                if i == 0:
                    w_f.write('8748463 200\n')  # the count written in the header may also differ
                if len(a) == 201:
                    if not a[0].isdigit():
                        n = n + 1
                        w_f.write(data)
    print(n)  # number of lines kept after cleaning
    # model = KeyedVectors.load_word2vec_format('tencent.txt', binary=False, unicode_errors='ignore')
    print("successfully load tencent word embedding!")
# Save the character embeddings and the corresponding tokenizer
def save_char_embedding(embedding_path, tokenizer_path):
    flag, keras_embedding, words = 0, [], []
    with open('tencent.txt', 'r', encoding='utf-8') as file:
        for line in file:
            flag += 1
            if flag >= 3:
                vectorlist = line.split()  # split the line into the token and its vector
                # single characters only
                if len(vectorlist[0]) == 1:  # single character: '\u4e00' <= vectorlist[0] <= '\u9fff'
                    vector = list(map(lambda x: float(x), vectorlist[1:]))  # convert the vector components to float
                    vec = np.array(vector)  # turn the list into an array
                    keras_embedding.append(vec)
                    words.append(vectorlist[0])
    res = np.array(keras_embedding)
    to_pkl(res, embedding_path)  # save the Tencent character embeddings
    # create the Tokenizer object
    tokenizer = Tokenizer()
    # fit_on_texts method
    tokenizer.fit_on_texts(words)
    to_pkl(tokenizer, tokenizer_path)  # save the Tencent character tokenizer
    print("successfully save!")
# Save a single-character dictionary to word.txt
def save_one_word(tokenizer_path, word_path="word.txt"):
    tokenizer = load_pkl(tokenizer_path)
    with open(word_path, "w", encoding="utf-8") as file:
        for word in tokenizer.word_docs.keys():
            if '\u4e00' <= word <= '\u9fff':  # a length check can be added here to keep only common tokens
                file.write(word + "\n")
    print("successfully save one word!")
if __name__ == "__main__":
    embedding_path = "keras_embedding.pkl"
    tokenizer_path = "keras_tokenizer.pkl"
    # save_char_embedding(embedding_path, tokenizer_path)  # save the Tencent character embeddings and tokenizer
    tokenizer = load_pkl(tokenizer_path)
    # unit test
    query = "武汉加油。中国加油。"
    text = " ".join(list(query))  # split "武汉加油" into "武 汉 加 油"
    seq = tokenizer.texts_to_sequences([text])
    print(query, seq)
Output:
武汉加油。中国加油。 [[1449, 1663, 304, 553, 96, 131, 451, 304, 553, 96]]
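With the two pkl files in place, the character matrix can be plugged into a Keras Embedding layer and the tokenizer output padded to a fixed length. The following is a minimal sketch, not part of the script above: it assumes that row i of keras_embedding.pkl corresponds to token index i + 1 in the tokenizer (i.e. fit_on_texts kept the file order and none of the single characters were dropped by the tokenizer's default filters), and maxlen=20 is an arbitrary choice.
import pickle as pk
import numpy as np
from keras.layers import Embedding
from keras.preprocessing.sequence import pad_sequences

with open("keras_embedding.pkl", "rb") as f:
    char_vectors = pk.load(f)                 # shape: (num_chars, 200)
with open("keras_tokenizer.pkl", "rb") as f:
    tokenizer = pk.load(f)

# Assumption: row i of char_vectors maps to token index i + 1; index 0 stays all-zero for padding.
embedding_matrix = np.zeros((char_vectors.shape[0] + 1, char_vectors.shape[1]))
embedding_matrix[1:] = char_vectors

seqs = tokenizer.texts_to_sequences([" ".join("武汉加油。中国加油。")])
x = pad_sequences(seqs, maxlen=20, padding="post")    # pad/truncate to length 20

embedding_layer = Embedding(input_dim=embedding_matrix.shape[0],
                            output_dim=embedding_matrix.shape[1],
                            weights=[embedding_matrix],
                            trainable=False)           # freeze the pretrained character vectors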