分享一个将文本单词转为id并存储的函数

先介绍 word2id 转换程序:它把文本中的词转换为 id,并将词到 id 的映射字典保存为 vocab.pickle 文件。

#!/usr/bin/env python
# -*- coding: utf-8 -*-
# author:ShidongDu time:2019/10/10

from hanziconv import HanziConv
from jieba import cut
from tflearn.data_utils import VocabularyProcessor

# Read the corpus up front; each line of the file becomes one entry of list_text.
with open("****.txt", mode='r', encoding='utf-8') as corpus_file:
    list_text = list(corpus_file)


def chinese_tokenizer(documents):
    """Yield one list of word tokens for every document in ``documents``.

    Each document is converted from Traditional to Simplified Chinese,
    lower-cased (relevant for English letters), and then segmented into
    words with ``cut``.
    """
    for raw_document in documents:
        normalized = HanziConv.toSimplified(raw_document).lower()
        yield [token for token in cut(normalized)]

if __name__ == '__main__':
    # Sequences are padded or truncated to 40 word ids; per the original
    # author's note, words whose frequency is <= 2 are dropped.
    vocab = VocabularyProcessor(40, 2, tokenizer_fn=chinese_tokenizer)

    # Build the vocabulary from the corpus; it cannot be changed afterwards.
    vocab.fit(list_text)

    # Round-trip the vocabulary through disk: save it, then load it back.
    vocab.save('vocab.pickle')
    vocab = VocabularyProcessor.restore('vocab.pickle')

    # Map every document to its word-id sequence; id 0 stands for unknown
    # words and for padding.
    id_documents = list(vocab.transform(list_text))
    for id_row in id_documents:
        print(id_row)

    # NOTE(review): the file is opened in append mode, so running the script
    # again adds to an existing eval_data.txt -- confirm this is intended.
    with open("eval_data.txt", 'a', encoding='utf-8') as out_file:
        for id_row in id_documents:
            # Every id, including the last one, is followed by one space;
            # the id2word script relies on exactly this layout.
            ids_text = ''.join(str(word_id) + ' ' for word_id in id_row.tolist())
            out_file.write(ids_text)
            out_file.write('\n')

    # Translate the id sequences back to text as a sanity check.
    for document in vocab.reverse(id_documents):
        print(document)

转换完成的 id 序列会写入 eval_data.txt:每行对应一条文本,每个 id 后面跟一个空格。


分享一个将文本单词转为id并存储的函数_第1张图片

文件内容就是上图所示的形式。

接下来介绍id2word程序

#!/usr/bin/env python
# -*- coding:utf-8 -*-
# author: ShidongDu time:2019/10/11

import numpy as np
from hanziconv import HanziConv
from jieba import cut
from tflearn.data_utils import VocabularyProcessor

# number_lists gathers one list of word ids per input line;
# number_list holds the ids of the line currently being parsed.
number_lists, number_list = [], []

def chinese_tokenizer(documents):
    """Yield one list of word tokens for every document in ``documents``.

    Each document is converted from Traditional to Simplified Chinese,
    lower-cased (relevant for English letters), and then segmented into
    words.

    ``HanziConv`` (from ``hanziconv``) and ``cut`` (from ``jieba``) must be
    imported at module level; without those imports any call on a non-empty
    input fails with ``NameError``.

    NOTE(review): nothing in this script calls the tokenizer directly. It is
    presumably defined here so that the vocabulary saved with
    ``tokenizer_fn=chinese_tokenizer`` can be restored from its pickle file --
    confirm against the tflearn implementation.
    """
    for raw_document in documents:
        normalized = HanziConv.toSimplified(raw_document).lower()
        yield list(cut(normalized))

# Load the vocabulary written by the word2id script. restore() already
# returns a usable processor, so no fresh instance is constructed first.
vocab = VocabularyProcessor.restore('vocab.pickle')

with open('eval_data.txt', 'r', encoding='utf-8') as f:
    list_text = f.readlines()

# Each line holds the word ids of one document, separated by spaces.
# split() with no argument ignores the trailing space and the newline, so a
# last line without a final newline is parsed in full as well. A blank line
# yields an empty id list.
for text in list_text:
    number_list = [int(token) for token in text.split()]
    number_lists.append(number_list)

# Map every id sequence back to its words and print the result.
for document in vocab.reverse(number_lists):
    print(document)

这里我直接把结果打印了出来,没有另存为新的文件。需要保存的小伙伴可以自己修改最后一行代码(print),把转换结果写入文件。

你可能感兴趣的:(分享一个将文本单词转为id并存储的函数)