jieba adopts the 《现代汉语词性标记》 (Modern Chinese part-of-speech tagging) standard, using a fine-grained set of categories to classify each Chinese part of speech. The full list is in the official documentation: jieba词性标注.
Below is a table of the part-of-speech names jieba supports and their codes:
Name | Code | Name | Code | Name | Code | Name | Code |
---|---|---|---|---|---|---|---|
noun (名词) | n | time word (时间词) | t | adverb (副词) | d | adjective (形容词) | a |
place word (处所词) | s | conjunction (连词) | c | preposition (介词) | p | distinguishing word (区别词) | b |
locative (方位词) | f | interjection (叹词) | e | numeral (数词) | m | modal particle (语气词) | y |
pronoun (代词) | r | classifier (量词) | q | onomatopoeia (拟声词) | o | habitual phrase (习用语) | l |
verb (动词) | v | person name (人名) | nr | idiom (成语) | i | status word (状态词) | z |
particle (助词) | u | | | | | | |
These codes follow the ICTCLAS tag set; proper-noun subtypes are formed by appending a letter to n (for example nr for person names, ns for place names, nt for organizations). Overall, jieba's segmentation and part-of-speech tagging of Chinese text are quite accurate, making it one of the most widely used tools in Chinese NLP.
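As a minimal sketch of how these tags surface in practice (assuming jieba is installed; the sample sentence is jieba's standard example):

import jieba.posseg as psg

for word, flag in psg.cut("我爱北京天安门"):
    print(word, flag)
# Typical output with the default dictionary:
# 我 r
# 爱 v
# 北京 ns
# 天安门 ns

The draft script below builds on exactly these tags, splitting text into "logic" tokens and "logic object" tokens and fuzzy-matching a query against a corpus.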
from difflib import SequenceMatcher, Differ
from fuzzywuzzy import fuzz
import pandas as pd
import re
from multiprocessing import Process, Manager, freeze_support
import jieba.posseg as psg
from glob import glob
from tqdm import tqdm
def compare_text_sim_score(text1, text2):
    # e.g. text1 = "今天天气真好", text2 = "今日天气很好"
    # Compute a similarity score (0-100) with fuzzywuzzy's Levenshtein-based ratio
    similarity_score = fuzz.ratio(text1, text2)
    return similarity_score
def compare_text_sim(text1, text2):
    # e.g. text1 = "今天天气真好", text2 = "今日天气很好"
    # Create a SequenceMatcher over the two texts
    matcher = SequenceMatcher(None, text1, text2)
    # ratio() returns a similarity score in [0, 1]
    similarity_score = matcher.ratio()
    return similarity_score
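# Hypothetical quick check (values depend on the inputs; not verified against a run):
# compare_text_sim_score("今天天气真好", "今日天气很好")  # fuzz.ratio -> int in 0..100
# compare_text_sim("今天天气真好", "今日天气很好")        # SequenceMatcher -> float in 0..1
# The two scores are roughly comparable: fuzz.ratio is approximately
# round(100 * similarity), though the underlying edit-distance metrics differ.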
def replace_stop_flag(text):
    # e.g. text = "我爱北京天安门。"
    # Segment the text and tag each word's part of speech
    words = psg.cut(text, use_paddle=False)
    # Tags that mark "objects": n noun, t time word, r pronoun, s place word
    # (see the POS table above for the full list of codes)
    stop_flag = ["n", "t", "r", "s"]
    text_new = []
    text_flag = []
    for word, flag in words:
        # Compound tags such as "nr" or "ns" contain "n" or "s",
        # so test for character overlap rather than equality
        if len(set(list(flag)) & set(stop_flag)) > 0:
            text_flag.append(word)
        else:
            text_new.append(word)
    return text_new, text_flag
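# Hypothetical example (assuming jieba's default dictionary):
# replace_stop_flag("我爱北京天安门") would route "我" (r) and "北京"/"天安门" (ns)
# into text_flag (the "logic objects") and "爱" (v) into text_new (the "logic").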
def gen_data_dict(one_data):
    # "原数据" = original text, "逻辑" = non-object words, "逻辑对象" = object words
    logic_da, logic_ob = replace_stop_flag(one_data)
    return {"原数据": one_data, "逻辑": logic_da, "逻辑对象": logic_ob}
def gen_data_pre(data_v, data_path, process_count, total_c):
    res = []
    for one_v in tqdm(data_v):
        with open(one_v, "r", encoding="utf-8") as f:
            one_data = f.read()
        # Strip all whitespace from the raw text
        one_data = "".join(one_data.split())
        total_c["count"] += 1
        one_ob = gen_data_dict(one_data)
        res.append(one_ob)
        # Flush to disk every 10000 records to bound memory use; the "_" separator
        # keeps file names from different processes from colliding
        if len(res) > 10000:
            pd.to_pickle({"one_part": res}, data_path + "/{}_{}.pandas_pickle".format(process_count, total_c["count"]))
            res = []
    pd.to_pickle({"one_part": res}, data_path + "/{}_{}.pandas_pickle".format(process_count, total_c["count"]))
def gen_data_find(data_v, return_data, input_text):
    res = []
    for one_v in tqdm(data_v):
        one_pd = pd.read_pickle(one_v)
        one_pd = one_pd["one_part"]
        one_pd_data = gen_data_dict(input_text)
        # TODO: a better algorithm would first anchor on the longest matching
        # token of the two sequences and then align the remaining tokens around it
        d = Differ()
        # For each stored record, count the diff lines that are neither removed ("-")
        # nor added ("+"), i.e. the tokens shared with the query, on both channels
        one_sort = [[
            sum([1 if "-" != i[0] and "+" != i[0] else 0 for i in d.compare(j["逻辑"], one_pd_data["逻辑"])]),
            sum([1 if "-" != i[0] and "+" != i[0] else 0 for i in d.compare(j["逻辑对象"], one_pd_data["逻辑对象"])])
        ] for j in tqdm(one_pd)]
        # Take the record with the most shared tokens ("逻辑" first, then "逻辑对象")
        sort_index = pd.DataFrame(one_sort).sort_values([0, 1], ascending=[False, False]).index.values[0]
        res.append({"原数据": one_pd[sort_index]["原数据"], "计数": one_sort[sort_index]})
    return_data.append(res)
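# An illustration of the counting trick above (toy inputs, not from the data set):
# list(Differ().compare(["a", "b", "c"], ["a", "x", "c"])) yields
# ['  a', '- b', '+ x', '  c']; the lines not starting with '-' or '+'
# ("  a" and "  c") are the tokens common to both sequences, so the count is 2.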
def gen_text_to_logic(paths="E:/just_and_sum/data_sets/", out_dir="E:/just_and_sum/data_set_new"):
    paths_list_pr = glob(pathname=paths + "*")
    works_num = 8
    total_count = Manager().dict()
    total_count["count"] = 0
    p_list = []
    # Dispatch one slice of the file list to each worker process
    # (max(1, ...) guards against a zero step when there are fewer files than workers)
    step = max(1, len(paths_list_pr) // works_num)
    for i in range(0, len(paths_list_pr), step):
        p = Process(target=gen_data_pre, args=(paths_list_pr[i:i + step], out_dir, i, total_count))
        p.start()
        p_list.append(p)
    for p in p_list:
        p.join()
def gen_text_to_text(paths="E:/just_and_sum/data_set_new/", input_text="请问头孢克洛和头孢泊肟酯是"):
    paths_list_pr = glob(pathname=paths + "*")
    works_num = 5
    total_count = Manager().dict()
    total_count["count"] = 0
    return_count = Manager().list()
    p_list = []
    # Dispatch one slice of the pickle files to each worker process
    step = max(1, len(paths_list_pr) // works_num)
    for i in range(0, len(paths_list_pr), step):
        p = Process(target=gen_data_find, args=(paths_list_pr[i:i + step], return_count, input_text))
        p.start()
        p_list.append(p)
    for p in p_list:
        p.join()
    # Merge the per-process candidate lists and print the global best match
    r_list = []
    for i in list(return_count):
        r_list += i
    r_df = pd.DataFrame(r_list)
    r_c = r_df["计数"].values.tolist()
    r_c = pd.DataFrame(r_c).sort_values([0, 1], ascending=[False, False]).index.values[0]
    print(r_df.values[r_c])
def replace_non_specified_chars(input_str, specified_chars, replacement_char):
    # Replace every character not in specified_chars; note that regex
    # metacharacters in specified_chars are not escaped here
    return re.sub(f'[^{specified_chars}]', replacement_char, input_str)
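# Hypothetical usage: replace_non_specified_chars("abc123", "abc", "*") -> "abc***"
# If specified_chars may contain regex metacharacters (e.g. "]", "^", "\\"),
# pass it through re.escape first.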
if __name__ == '__main__':
    freeze_support()
    # Step 1: generate the three-level fuzzy data (原数据 / 逻辑 / 逻辑对象)
    # gen_text_to_logic()
    # Step 2: query
    gen_text_to_text()
Stable version
from difflib import Differ
import pandas as pd
import re
from multiprocessing import Process, Manager, freeze_support
import jieba.posseg as psg
from glob import glob
from tqdm import tqdm
def replace_stop_flag(text):
    # e.g. text = "我爱北京天安门。"
    # Segment the text and tag each word's part of speech
    words = psg.cut(text, use_paddle=False)
    # Tags that mark "objects": n noun, t time word, r pronoun, s place word
    # (see the POS table above for the full list of codes)
    stop_flag = ["n", "t", "r", "s"]
    text_new = []
    text_flag = []
    for word, flag in words:
        # Compound tags such as "nr" or "ns" contain "n" or "s",
        # so test for character overlap rather than equality
        if len(set(list(flag)) & set(stop_flag)) > 0:
            text_flag.append(word)
        else:
            text_new.append(word)
    return text_new, text_flag
def gen_data_dict(one_data):
    # "原数据" = original text, "逻辑" = non-object words, "逻辑对象" = object words
    logic_da, logic_ob = replace_stop_flag(one_data)
    return {"原数据": one_data, "逻辑": logic_da, "逻辑对象": logic_ob}
def get_next_str(one_compare_res):
    # Count the diff lines that are neither removals ("-") nor additions ("+"),
    # i.e. the tokens the two sequences share
    one_compare_res = pd.DataFrame(one_compare_res)
    only_bool = ~one_compare_res[0].str[0].isin(["-", "+"])
    ss = sum(only_bool)
    return ss
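# A toy check of the counting helper (not from the data set):
# for list(Differ().compare(["a", "b"], ["a", "c"])) the diff is
# ['  a', '- b', '+ c'], so get_next_str(...) returns 1 ("  a" is the only shared line).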
def gen_data_pre(data_v, data_path, process_count, total_c):
    res = []
    for one_v in tqdm(data_v):
        with open(one_v, "r", encoding="utf-8") as f:
            one_data = f.read()
        # Strip all whitespace from the raw text
        one_data = "".join(one_data.split())
        total_c["count"] += 1
        one_ob = gen_data_dict(one_data)
        res.append(one_ob)
        # Flush to disk every 10000 records to bound memory use; the "_" separator
        # keeps file names from different processes from colliding
        if len(res) > 10000:
            pd.to_pickle({"one_part": res}, data_path + "/{}_{}.pandas_pickle".format(process_count, total_c["count"]))
            res = []
    pd.to_pickle({"one_part": res}, data_path + "/{}_{}.pandas_pickle".format(process_count, total_c["count"]))
def gen_data_find(data_v, return_data, input_text):
    res = []
    # Tag the query once, outside the loop
    one_pd_data = gen_data_dict(input_text)
    d = Differ()
    for one_v in tqdm(data_v):
        one_pd = pd.read_pickle(one_v)
        one_pd = one_pd["one_part"]
        sum_count = []
        for j in tqdm(one_pd):
            # Shared tokens on the "逻辑" channel
            one_compare_res = list(d.compare(j["逻辑"], one_pd_data["逻辑"]))
            ss = get_next_str(one_compare_res)
            temp = [ss]
            # Shared tokens on the "逻辑对象" channel
            one_compare_res = list(d.compare(j["逻辑对象"], one_pd_data["逻辑对象"]))
            ss = get_next_str(one_compare_res)
            temp.append(ss)
            sum_count.append(temp)
        # Keep the best-scoring record from this pickle file; note that this version
        # stores the record's "逻辑" token list under the "原数据" key
        sort_index = pd.DataFrame(sum_count).sort_values([0, 1], ascending=[False, False]).index.values[0]
        res.append({"原数据": one_pd[sort_index]["逻辑"], "计数": sum_count[sort_index]})
    return_data.append(res)
def gen_text_to_logic(paths="E:/just_and_sum/data_sets/", out_dir="E:/just_and_sum/data_set_new"):
    paths_list_pr = glob(pathname=paths + "*")
    works_num = 8
    total_count = Manager().dict()
    total_count["count"] = 0
    p_list = []
    # Dispatch one slice of the file list to each worker process
    # (max(1, ...) guards against a zero step when there are fewer files than workers)
    step = max(1, len(paths_list_pr) // works_num)
    for i in range(0, len(paths_list_pr), step):
        p = Process(target=gen_data_pre, args=(paths_list_pr[i:i + step], out_dir, i, total_count))
        p.start()
        p_list.append(p)
    for p in p_list:
        p.join()
def gen_text_to_text(paths="E:/just_and_sum/data_set_new/", input_text="请问头孢克洛和头孢泊肟酯是"):
    paths_list_pr = glob(pathname=paths + "*")
    works_num = 8
    total_count = Manager().dict()
    total_count["count"] = 0
    return_count = Manager().list()
    p_list = []
    # Dispatch one slice of the pickle files to each worker process
    step = max(1, len(paths_list_pr) // works_num)
    for i in range(0, len(paths_list_pr), step):
        p = Process(target=gen_data_find, args=(paths_list_pr[i:i + step], return_count, input_text))
        p.start()
        p_list.append(p)
    for p in p_list:
        p.join()
    # Merge the per-process candidate lists and print the global best match
    r_list = []
    for i in list(return_count):
        r_list += i
    r_df = pd.DataFrame(r_list)
    r_c = r_df["计数"].values.tolist()
    r_c = pd.DataFrame(r_c).sort_values([0, 1], ascending=[False, False]).index.values[0]
    print(r_df.values[r_c])
def replace_non_specified_chars(input_str, specified_chars, replacement_char):
    # Replace every character not in specified_chars; note that regex
    # metacharacters in specified_chars are not escaped here
    return re.sub(f'[^{specified_chars}]', replacement_char, input_str)
if __name__ == '__main__':
    freeze_support()
    # Step 1: generate the three-level fuzzy data (原数据 / 逻辑 / 逻辑对象)
    # gen_text_to_logic()
    # Step 2: query
    gen_text_to_text()