import base64
import multiprocessing
import re
from multiprocessing import Pool

import emoji
import jieba

from common.path.dataset.keywords import get_it_keywords_dir
from common.utils import filter_content_for_blog_cls
from config.stopwords.cn import CNStopwordsBuilder
from config.stopwords.default import DefaultStopwordsBuilder
class JiebaSegment(object):
    """Jieba tokenizer that drops stop words and cleans every kept token.

    Creating an instance loads a user dictionary into jieba and merges two
    stop word sources into ``stop_words``.
    """

    def __init__(self) -> None:
        # Load the user dictionary found at the path returned by
        # get_it_keywords_dir() into jieba's module-level tokenizer.
        jieba.load_userdict(get_it_keywords_dir())
        self.cn_stopwords = CNStopwordsBuilder.load()
        self.default_stopwords = DefaultStopwordsBuilder.load()
        # De-duplicated union of both sources. It stays a list so code that
        # reads this attribute keeps seeing the same type as before.
        merged = list(self.cn_stopwords)
        merged.extend(self.default_stopwords)
        self.stop_words = list(set(merged))

    def remove_stopwords(self, word_list):
        """Return the cleaned tokens of ``word_list`` that are not stop words.

        Each token that is not in ``self.stop_words`` is passed through
        ``filter_content_for_blog_cls``; tokens that come back as the empty
        string are dropped.

        Args:
            word_list: Iterable of tokens; a one-shot generator such as the
                result of ``jieba.cut`` is fine because it is read once.

        Returns:
            list: The surviving cleaned tokens, in their original order.
        """
        # Hash-based lookup instead of scanning the list once per token.
        # Rebuilt on every call so that later changes to self.stop_words are
        # still honoured.
        stop_words = frozenset(self.stop_words)
        result = []
        for word in word_list:
            if word in stop_words:
                continue
            cleaned = filter_content_for_blog_cls(word)
            # Compare against "" explicitly (not truthiness) so only empty
            # strings are dropped, exactly as before.
            if cleaned != "":
                result.append(cleaned)
        return result

    def segment(self, text):
        """Cut ``text`` with jieba and return its tokens without stop words.

        Args:
            text: The string to tokenize.

        Returns:
            list: Tokens as produced by ``remove_stopwords``.
        """
        return self.remove_stopwords(jieba.cut(text))
这种单进程的方式在数据量较小时没什么问题;但当数据量达到几十万级别时,分一次词就得半小时以上,相当耗时。因此,当数据量很大时,多进程分词就很有必要了。
def test_extract_qa_data(config, options):
    """Lazily yield decoded question records from the answer data file.

    Every line of ``./test/answer/data/answer_data.txt`` is split on tabs
    into a question id, a base64-encoded title and a base64-encoded body.
    Lines are read one at a time, so the whole file is never held in memory.
    Lines with fewer than three fields, or whose title/body cannot be
    decoded, are reported with ``print`` and skipped.

    Args:
        config: Not used by this function.
        options: Not used by this function.

    Yields:
        tuple: ``(question_id, title, body)`` where ``title`` and ``body``
        are the UTF-8 decoded strings.
    """
    data_path = './test/answer/data/answer_data.txt'
    # The context manager closes the file once the generator is exhausted or
    # closed, instead of leaking the handle.
    with open(data_path, 'r', encoding='utf-8') as file_handle:
        for line in file_handle:
            item_list = line.rstrip('\n').split('\t')
            if len(item_list) < 3:
                # Skip short or blank lines the same way undecodable ones are
                # skipped, rather than failing on the index lookup.
                print(
                    'skipping malformed line: expected 3 tab-separated '
                    f'fields, got {len(item_list)}'
                )
                continue
            question_id, title, body = item_list[:3]
            try:
                title = base64.b64decode(title).decode('utf-8')
                body = base64.b64decode(body).decode('utf-8')
            except ValueError as e:
                # binascii.Error and UnicodeDecodeError are both ValueError
                # subclasses; anything else is a programming error and should
                # surface instead of being swallowed.
                print(e)
                continue
            yield question_id, title, body
当数据量较大时,最好不要把数据一次性全部读入内存。这里我们采用逐行读取的方式,以免内存不足。
多进程分词:
def test_token_multi_p(config, options):
    """Tokenize every question record in parallel with a process pool.

    The title and body of each record yielded by ``test_extract_qa_data``
    are concatenated and passed to ``JiebaSegment.segment`` in worker
    processes, with a progress bar over the results.

    NOTE(review): ``tqdm`` is not imported in the visible part of this file;
    confirm ``from tqdm import tqdm`` exists at module level.

    Args:
        config: Forwarded unchanged to ``test_extract_qa_data``.
        options: Forwarded unchanged to ``test_extract_qa_data``.

    Returns:
        list: One token list per record, in input order.
    """
    # Build the segmenter BEFORE the pool. Pool starts its workers on
    # construction, and with the "fork" start method they inherit the
    # parent's memory as of that moment, so the user dictionary must already
    # be loaded. NOTE(review): under "spawn"/"forkserver" the workers do not
    # inherit it; confirm which start method is in use.
    segment = JiebaSegment()
    # Keep the previous cap of 6 workers, but do not oversubscribe machines
    # that have fewer cores.
    worker_count = min(6, multiprocessing.cpu_count())

    print("获取数据...")
    text_list = [
        title + body
        for _question_id, title, body in test_extract_qa_data(config, options)
    ]

    print("开始处理...")
    # The context manager tears the workers down even if a task raises; on
    # the success path every result has been collected by then.
    with Pool(worker_count) as pool:
        token_result = list(
            tqdm(
                pool.imap(func=segment.segment, iterable=text_list),
                total=len(text_list),
            )
        )
    return token_result
原地起飞!!!