Multiprocess tokenization with jieba

Table of contents

  • Basic tokenization
  • Multiprocess tokenization

Basic tokenization


import jieba
import re
import emoji
from common.path.dataset.keywords import get_it_keywords_dir
from common.utils import filter_content_for_blog_cls
from config.stopwords.cn import CNStopwordsBuilder
from config.stopwords.default import DefaultStopwordsBuilder



class JiebaSegment(object):
    def __init__(self) -> None:
        # Load the domain user dictionary and merge the two stopword lists.
        jieba.load_userdict(get_it_keywords_dir())
        self.cn_stopwords = CNStopwordsBuilder.load()
        self.default_stopwords = DefaultStopwordsBuilder.load()
        # A set gives O(1) membership checks in remove_stopwords.
        self.stop_words = set(self.cn_stopwords) | set(self.default_stopwords)

    def remove_stopwords(self, word_list):
        result = []
        for word in word_list:
            if word not in self.stop_words:
                word = filter_content_for_blog_cls(word)
                if word:  # drop tokens the filter reduced to an empty string
                    result.append(word)
        return result

    def segment(self, text):
        # jieba.cut returns a generator; remove_stopwords materializes it into a list.
        res = jieba.cut(text)
        res = self.remove_stopwords(res)
        return res
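
A quick usage sketch. The sample sentence below is made up, and the exact token list depends on the user dictionary and stopword files loaded above:

segmenter = JiebaSegment()
tokens = segmenter.segment("jieba 多进程分词可以加快大规模文本预处理")
print(tokens)  # e.g. ['jieba', '多进程', '分词', '加快', '大规模', '文本', '预处理']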

This single-process approach is fine when the data volume is small, but once the corpus reaches the hundreds of thousands of documents, a single tokenization pass can easily take half an hour or more. When the data gets that large, multiprocess tokenization becomes well worth it.

Multiprocess tokenization

import base64


def test_extract_qa_data(config, options):
    data_path = './test/answer/data/answer_data.txt'
    # Read the file line by line instead of loading it all into memory at once.
    with open(data_path, 'r') as file_handle:
        for line in file_handle:
            item_list = line.split('\t')
            question_id = item_list[0]
            title = item_list[1]
            body = item_list[2]
            try:
                # title and body are stored base64-encoded in the TSV file.
                title = base64.b64decode(title).decode('utf-8')
                body = base64.b64decode(body).decode('utf-8')
            except Exception as e:
                print(e)
                continue
            yield question_id, title, body

When the dataset is large, it is better not to read everything into memory at once; reading line by line and yielding one record at a time keeps memory usage flat.
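
For reference, each line of answer_data.txt is expected to be tab-separated, with the title and body base64-encoded. A minimal sketch of writing one such record (the id and texts below are made up):

import base64

# Write a single sample record in the expected format: id \t b64(title) \t b64(body)
with open('./test/answer/data/answer_data.txt', 'w') as f:
    title = base64.b64encode('如何加速 jieba 分词?'.encode('utf-8')).decode('ascii')
    body = base64.b64encode('数据量很大时可以考虑多进程。'.encode('utf-8')).decode('ascii')
    f.write('\t'.join(['q_0001', title, body]) + '\n')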

Multiprocess tokenization:

import multiprocessing
from multiprocessing import Pool

from tqdm import tqdm


def test_token_multi_p(config, options):
    # Use one worker per CPU core.
    num_processes = multiprocessing.cpu_count()
    pool = Pool(num_processes)

    segment = JiebaSegment()
    print("Loading data...")
    line_list = list(test_extract_qa_data(config, options))
    print("Tokenizing...")
    text_list = []
    for line in line_list:
        question_id, title, body = line
        text_list.append(title + body)
    token_result = []
    # imap hands the texts to the worker processes and yields results in order.
    for result in tqdm(pool.imap(func=segment.segment, iterable=text_list), total=len(text_list)):
        token_result.append(result)
    pool.close()
    pool.join()
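
If you also want to avoid materializing the whole corpus in a list, which is what the streaming reader above was meant to prevent, imap can consume the generator directly; tqdm then no longer knows the total in advance. A sketch under that assumption, reusing JiebaSegment and test_extract_qa_data from above (the chunksize value is just a starting point):

import multiprocessing
from multiprocessing import Pool

from tqdm import tqdm

segment = JiebaSegment()
token_result = []
with Pool(multiprocessing.cpu_count()) as pool:
    # A generator expression keeps records lazy: each line is read, decoded and
    # tokenized on the fly instead of being held in a big list first.
    texts = (title + body for _, title, body in test_extract_qa_data(config, options))
    # chunksize batches items per worker round-trip, which helps a lot for short texts.
    for tokens in tqdm(pool.imap(segment.segment, texts, chunksize=256)):
        token_result.append(tokens)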

It absolutely flies!!!
