实验文件地址:https://github.com/audier/my_deep_project/blob/master/NLP/2.translation/openEmbedding.ipynb
下载数据:https://ai.tencent.com/ailab/nlp/embedding.html
Tencent AI Lab Embedding Corpus for Chinese Words and Phrases
import numpy as np
from tqdm import tqdm
# 包含数据处理函数
from utils import GenData
def loadEmbedding(embeddingFile, word2id, embeddingSize):
    """Build an embedding matrix for ``word2id`` from a word2vec-style text file.

    The file is expected to start with a header line ``"<vocab_size> <vector_size>"``
    followed by one ``"<word> <v1> <v2> ..."`` line per word, fields separated
    by single spaces.

    Args:
        embeddingFile: Path of the text embedding file.
        word2id: Mapping from word to its row index in the returned matrix.
        embeddingSize: Currently unused; the vector width is taken from the
            file header. Kept so existing callers keep working.

    Returns:
        A ``numpy`` array of shape ``(len(word2id), vector_size)``. Rows of
        words found in the file hold the file's vector; every other row keeps
        its random initial value drawn uniformly from ``[-0.25, 0.25)``.

    Raises:
        ValueError: If the header is not two integers, a matched line holds a
            non-numeric value, or a matched vector's length differs from the
            header's ``vector_size``.
    """
    # ISO-8859-1 maps every byte to exactly one code point, so reading never
    # fails on a malformed line; each word is converted back to UTF-8 below.
    with open(embeddingFile, "r", encoding='ISO-8859-1') as f:
        header = f.readline()
        vocab_size, vector_size = map(int, header.split())
        initW = np.random.uniform(-0.25, 0.25, (len(word2id), vector_size))
        count = 0
        for _ in tqdm(range(vocab_size)):
            line = f.readline()
            if not line:
                # The file ended before the vocab_size promised by the header.
                break
            # Split on ' ' only: with this decoding, str.split() without an
            # argument would also split on '\x85' / '\xa0', which occur inside
            # multi-byte UTF-8 sequences.
            fields = line.rstrip('\n').split(' ')
            word = fields[0]
            try:
                word = word.encode('ISO-8859-1').decode('utf8')
            except UnicodeDecodeError:
                # Not valid UTF-8: keep the raw form, as the lookup below
                # will simply miss for such words.
                pass
            if word in word2id:
                count += 1
                # Empty fields come from trailing/double spaces; skip them.
                values = [float(v) for v in fields[1:] if v]
                initW[word2id[word]] = np.array(values)
    # Number of file entries that matched a word in word2id.
    print(count)
    return initW
def main():
    """Load the corpus, build its embedding matrix and print the matrix shape.

    ``GenData`` comes from the project's ``utils`` module; its arguments are
    passed through unchanged (presumably corpus file, tokenizer name and a
    size limit -- TODO confirm against ``utils.GenData``).
    """
    embedding_path = 'E:\\Desktop\\nlp\\Tencent_AILab_ChineseEmbedding.txt'
    corpus = GenData('cmn.txt', 'jieba', 200)
    # The Chinese word -> id table selects which vectors are kept.
    embedding_matrix = loadEmbedding(embedding_path, corpus.ch2id, 200)
    print(embedding_matrix.shape)
#main()
import jieba
# Segment a sample sentence with jieba; no user dictionary has been loaded
# at this point in the file.
jieba.lcut('我今天吃了西红柿炒面,隔壁的人也是因吹斯听的人')
# Recorded notebook output of the call above (kept for comparison with the
# user-dictionary run further down):
['我', '今天', '吃', '了', '西红柿', '炒面', ',', '隔壁', '的', '人', '也', '是', '因吹斯', '听', '的', '人']
from tqdm import tqdm
def gendict(inputFile, ouputFile):
    """Append the vocabulary of a word2vec-style text file to a word list.

    The input is expected to start with a header line
    ``"<vocab_size> <vector_size>"`` followed by one ``"<word> <v1> ..."``
    line per word. The first space-separated field of every line is written
    to the output, one word per line, encoded as UTF-8.

    Args:
        inputFile: Path of the text embedding file to read.
        ouputFile: Path of the word list. It is opened in append mode, so
            running the function twice on the same path duplicates entries.

    Raises:
        ValueError: If the header line is not exactly two integers.
        OSError: If either file cannot be opened, read or written.
    """
    # ISO-8859-1 maps every byte to one code point, so reading never fails;
    # each word is converted back to UTF-8 individually below.
    with open(inputFile, "r", encoding='ISO-8859-1') as f, \
            open(ouputFile, 'a', encoding='utf8') as output_f:
        header = f.readline()
        # The vector width is parsed only to validate the header shape.
        vocab_size, _vector_size = map(int, header.split())
        for _ in tqdm(range(vocab_size)):
            line = f.readline()
            if not line:
                # The file ended before the vocab_size promised by the header.
                break
            word = line.rstrip('\n').split(' ', 1)[0]
            try:
                word = word.encode('ISO-8859-1').decode('utf8')
            except UnicodeDecodeError:
                # Skip words whose bytes are not valid UTF-8.
                continue
            if not word:
                # Blank or space-led line: nothing usable to record.
                continue
            # Written outside the try so that I/O errors are not swallowed.
            output_f.write(word + '\n')
#gendict('E:\\Desktop\\nlp\\Tencent_AILab_ChineseEmbedding.txt', 'E:\\Desktop\\nlp\\jieba.txt')
import jieba
# Register the word list at this path as a jieba user dictionary (the
# commented-out gendict call above writes to the same path).
jieba.load_userdict('E:\\Desktop\\nlp\\jieba.txt')
# Segment the same sample sentence again, now with the user dictionary loaded.
jieba.lcut('我今天吃了西红柿炒面,隔壁的人也是因吹斯听的人')
# Recorded notebook output: longer phrases such as '西红柿炒面' and '因吹斯听'
# are now kept as single tokens.
['我', '今天', '吃了', '西红柿炒面', ',', '隔壁', '的人', '也是', '因吹斯听', '的人']