Splitting text into "logic" and "logic objects" with jieba POS tagging, and building a simple search engine

jieba's part-of-speech tagger follows the《现代汉语词性标记》(Modern Chinese POS tagging) conventions and classifies Chinese words into a fine-grained set of categories. The full list is in the official documentation: jieba词性标注.

The table below lists the POS names jieba supports and their tag codes:

| Name | Code | Name | Code | Name | Code | Name | Code |
| :-- | :-- | :-- | :-- | :-- | :-- | :-- | :-- |
| noun 名词 | n | time word 时间词 | t | adverb 副词 | d | adjective 形容词 | a |
| place word 处所词 | s | conjunction 连词 | c | preposition 介词 | p | distinguishing word 区别词 | b |
| direction word 方位词 | f | interjection 叹词 | e | particle 助词 | u | status word 状态词 | z |
| pronoun 代词 | r | classifier 量词 | q | numeral 数词 | m | modal particle 语气词 | y |
| verb 动词 | v | onomatopoeia 拟声词 | o | idiom 成语 | i | habitual expression 习用语 | l |
| person name 人名 | nr | | | | | | |

Noun subtags are formed by appending a letter to n (e.g. nr for person names, ns for place names), following the ICTCLAS tag set. Overall, jieba's segmentation and POS tagging are quite accurate for Chinese text, which makes it one of the standard tools in Chinese NLP.
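
As a quick check of these tags, the snippet below runs jieba.posseg on a short sentence; the exact segmentation depends on jieba's dictionary version, so the output shown in the comment is indicative:

import jieba.posseg as psg

# Print each word of the sentence together with its POS code.
for word, flag in psg.cut("我爱北京天安门"):
    print(word, flag)
# Indicative output: 我 r / 爱 v / 北京 ns / 天安门 ns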


from difflib import SequenceMatcher, Differ
from fuzzywuzzy import fuzz
import pandas as pd
import re
from multiprocessing import Process, Manager, freeze_support
import jieba.posseg as psg
from glob import glob

from tqdm import tqdm


def compare_text_sim_score(text1, text2):
    # e.g. text1 = "今天天气真好", text2 = "今日天气很好"

    # Compute a similarity score with fuzzywuzzy (integer in [0, 100]).
    similarity_score = fuzz.ratio(text1, text2)
    return similarity_score


def compare_text_sim(text1, text2):
    # e.g. text1 = "今天天气真好", text2 = "今日天气很好"

    # Create a SequenceMatcher over the two texts.
    matcher = SequenceMatcher(None, text1, text2)

    # ratio() returns a float in [0.0, 1.0].
    similarity_score = matcher.ratio()
    return similarity_score
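
Note the two scorers use different scales: fuzz.ratio returns an integer in [0, 100], while SequenceMatcher.ratio returns a float in [0.0, 1.0]. A minimal usage sketch with the example strings from the comments:

text1 = "今天天气真好"
text2 = "今日天气很好"
print(compare_text_sim_score(text1, text2))  # fuzzywuzzy score, e.g. 67
print(compare_text_sim(text1, text2))        # difflib ratio, e.g. 0.67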


def replace_stop_flag(text):
    # e.g. text = "我爱北京天安门。"

    # Segment the text and tag each word with its POS code.
    words = psg.cut(text, use_paddle=False)

    # Words tagged as nouns (n), time words (t), pronouns (r) or place
    # words (s) -- see the POS table above -- are collected as "logic
    # objects"; everything else forms the "logic" skeleton of the sentence.
    stop_flag = ["n", "t", "r", "s"]
    text_new = []
    text_flag = []
    for word, flag in words:
        if len(set(flag) & set(stop_flag)) > 0:
            text_flag.append(word)
        else:
            text_new.append(word)
    return text_new, text_flag


def gen_data_dict(one_data):
    # Split one document into its logic words and logic objects.
    logic_da, logic_ob = replace_stop_flag(one_data)

    return {"原数据": one_data, "逻辑": logic_da, "逻辑对象": logic_ob}


def gen_data_pre(data_v, data_path, process_count, total_c):
    res = []
    for one_v in tqdm(data_v):
        with open(one_v, "r", encoding="utf-8") as f:
            one_data = f.read()
        # Strip all whitespace so segmentation sees one continuous string.
        one_data = "".join(one_data.split())

        total_c["count"] += 1

        one_ob = gen_data_dict(one_data)
        res.append(one_ob)
        # Flush to disk in shards of ~10000 records; the underscore keeps
        # shard names from colliding across processes.
        if len(res) > 10000:
            pd.to_pickle({"one_part": res}, data_path + "/{}_{}.pandas_pickle".format(process_count, total_c["count"]))
            res = []
    pd.to_pickle({"one_part": res}, data_path + "/{}_{}.pandas_pickle".format(process_count, total_c["count"]))


def gen_data_find(data_v, return_data, input_text):
    res = []
    for one_v in tqdm(data_v):
        one_pd = pd.read_pickle(one_v)
        one_pd = one_pd["one_part"]
        one_pd_data = gen_data_dict(input_text)

        # TODO: a better algorithm would anchor the longest common span
        # first and then align the remaining tokens around it.
        d = Differ()

        # Score each stored document by how many words its "logic" and
        # "logic object" lists share with the query; lines that Differ
        # marks as common start with two spaces.
        one_sort = [[
            sum([1 if i.startswith("  ") else 0 for i in d.compare(j["逻辑"], one_pd_data["逻辑"])]),
            sum([1 if i.startswith("  ") else 0 for i in d.compare(j["逻辑对象"], one_pd_data["逻辑对象"])])
        ] for j in tqdm(one_pd)]
        # Keep the best-scoring document of this shard.
        sort_index = pd.DataFrame(one_sort).sort_values([0, 1], ascending=[False, False]).index.values[0]
        res.append({"原数据": one_pd[sort_index]["原数据"], "计数": one_sort[sort_index]})
    return_data.append(res)
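
The score is simply the number of Differ lines marked as common (prefix "  "), i.e. words the stored list and the query share in order. A tiny illustration:

from difflib import Differ

d = Differ()
diff = list(d.compare(["请问", "和", "是"], ["请问", "是"]))
# ['  请问', '- 和', '  是'] -> two common words
print(sum(1 for line in diff if line.startswith("  ")))  # -> 2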


def gen_text_to_logic(paths="E:/just_and_sum/data_sets/", out_dir="E:/just_and_sum/data_set_new"):
    paths_list_pr = glob(pathname=paths + "*")

    works_num = 8

    total_count = Manager().dict()
    total_count["count"] = 0

    p_list = []
    # Dispatch chunks of the file list to worker processes.
    step = max(1, len(paths_list_pr) // works_num)  # avoid a zero step when there are few files
    for i in range(0, len(paths_list_pr), step):
        p = Process(target=gen_data_pre, args=(paths_list_pr[i:i + step], out_dir, i, total_count))
        p.start()
        p_list.append(p)

    for p in p_list:
        p.join()


def gen_text_to_text(paths="E:/just_and_sum/data_set_new/", input_text="请问头孢克洛和头孢泊肟酯是"):
    paths_list_pr = glob(pathname=paths + "*")

    works_num = 5

    total_count = Manager().dict()
    total_count["count"] = 0
    return_count = Manager().list()

    p_list = []
    # Dispatch chunks of the shard list to worker processes.
    step = max(1, len(paths_list_pr) // works_num)  # avoid a zero step when there are few shards
    for i in range(0, len(paths_list_pr), step):
        p = Process(target=gen_data_find, args=(paths_list_pr[i:i + step], return_count, input_text))
        p.start()
        p_list.append(p)

    for p in p_list:
        p.join()

    # Merge the per-process candidates and print the globally best match.
    r_list = []
    for i in list(return_count):
        r_list += i
    r_df = pd.DataFrame(r_list)
    r_c = r_df["计数"].values.tolist()
    r_c = pd.DataFrame(r_c).sort_values([0, 1], ascending=[False, False]).index.values[0]
    print(r_df.values[r_c])



def replace_non_specified_chars(input_str, specified_chars, replacement_char):
    # Replace every character NOT in the given character class.
    return re.sub(f'[^{specified_chars}]', replacement_char, input_str)
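
For example, to keep only CJK characters and blank out everything else (the character range here is an assumption about the intended use):

# '[^\u4e00-\u9fa5]' matches every non-CJK character.
print(replace_non_specified_chars("abc中文123", "\u4e00-\u9fa5", " "))  # -> '   中文   '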


if __name__ == '__main__':
    freeze_support()
    # Build the three-level index shards (原数据 / 逻辑 / 逻辑对象); run once.
    # gen_text_to_logic()
    # Query the index.
    gen_text_to_text()




Stable version — this refactor segments the query once per worker, reuses a single Differ instance, and factors the diff counting into get_next_str:

from difflib import Differ

import pandas as pd
import re
from multiprocessing import Process, Manager, freeze_support
import jieba.posseg as psg
from glob import glob

from tqdm import tqdm


def replace_stop_flag(text):
    # e.g. text = "我爱北京天安门。"

    # Segment the text and tag each word with its POS code.
    words = psg.cut(text, use_paddle=False)

    # Words tagged as nouns (n), time words (t), pronouns (r) or place
    # words (s) -- see the POS table above -- are collected as "logic
    # objects"; everything else forms the "logic" skeleton of the sentence.
    stop_flag = ["n", "t", "r", "s"]
    text_new = []
    text_flag = []
    for word, flag in words:
        if len(set(flag) & set(stop_flag)) > 0:
            text_flag.append(word)
        else:
            text_new.append(word)
    return text_new, text_flag


def gen_data_dict(one_data):
    # Split one document into its logic words and logic objects.
    logic_da, logic_ob = replace_stop_flag(one_data)

    return {"原数据": one_data, "逻辑": logic_da, "逻辑对象": logic_ob}


def get_next_str(one_compare_res):
    # Count the Differ lines common to both sequences, i.e. lines whose
    # first character is not "-" (deleted), "+" (added) or "?" (hint).
    one_compare_res = pd.DataFrame(one_compare_res)
    only_bool = ~one_compare_res[0].str[0].isin(["-", "+", "?"])
    ss = sum(only_bool)
    return ss
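
A small check of the counter (克洛 and 泊肟 share no characters, so Differ emits plain '-'/'+' lines for them):

d = Differ()
diff = list(d.compare(["头孢", "克洛"], ["头孢", "泊肟"]))
# ['  头孢', '- 克洛', '+ 泊肟'] -> only one common word
print(get_next_str(diff))  # -> 1
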
def gen_data_pre(data_v, data_path, process_count, total_c):
    res = []
    for one_v in tqdm(data_v):
        with open(one_v, "r", encoding="utf-8") as f:
            one_data = f.read()
        # Strip all whitespace so segmentation sees one continuous string.
        one_data = "".join(one_data.split())

        total_c["count"] += 1

        one_ob = gen_data_dict(one_data)
        res.append(one_ob)
        # Flush to disk in shards of ~10000 records; the underscore keeps
        # shard names from colliding across processes.
        if len(res) > 10000:
            pd.to_pickle({"one_part": res}, data_path + "/{}_{}.pandas_pickle".format(process_count, total_c["count"]))
            res = []
    pd.to_pickle({"one_part": res}, data_path + "/{}_{}.pandas_pickle".format(process_count, total_c["count"]))


def gen_data_find(data_v, return_data, input_text):
    res = []
    # Segment the query once, outside the shard loop.
    one_pd_data = gen_data_dict(input_text)
    d = Differ()
    for one_v in tqdm(data_v):
        one_pd = pd.read_pickle(one_v)
        one_pd = one_pd["one_part"]
        sum_count = []
        for j in tqdm(one_pd):
            # Words shared between the stored "logic" list and the query's.
            one_compare_res = list(d.compare(j["逻辑"], one_pd_data["逻辑"]))
            temp = [get_next_str(one_compare_res)]

            # Words shared between the "logic object" lists.
            one_compare_res = list(d.compare(j["逻辑对象"], one_pd_data["逻辑对象"]))
            temp.append(get_next_str(one_compare_res))

            sum_count.append(temp)

        # Keep the best-scoring document of this shard.
        sort_index = pd.DataFrame(sum_count).sort_values([0, 1], ascending=[False, False]).index.values[0]
        res.append({"原数据": one_pd[sort_index]["原数据"], "计数": sum_count[sort_index]})
    return_data.append(res)


def gen_text_to_logic(paths="E:/just_and_sum/data_sets/", out_dir="E:/just_and_sum/data_set_new"):
    paths_list_pr = glob(pathname=paths + "*")

    works_num = 8

    total_count = Manager().dict()
    total_count["count"] = 0

    p_list = []
    # Dispatch chunks of the file list to worker processes.
    step = max(1, len(paths_list_pr) // works_num)  # avoid a zero step when there are few files
    for i in range(0, len(paths_list_pr), step):
        p = Process(target=gen_data_pre, args=(paths_list_pr[i:i + step], out_dir, i, total_count))
        p.start()
        p_list.append(p)

    for p in p_list:
        p.join()


def gen_text_to_text(paths="E:/just_and_sum/data_set_new/", input_text="请问头孢克洛和头孢泊肟酯是"):
    paths_list_pr = glob(pathname=paths + "*")

    works_num = 8

    total_count = Manager().dict()
    total_count["count"] = 0
    return_count = Manager().list()

    p_list = []
    # Dispatch chunks of the shard list to worker processes.
    step = max(1, len(paths_list_pr) // works_num)  # avoid a zero step when there are few shards
    for i in range(0, len(paths_list_pr), step):
        p = Process(target=gen_data_find, args=(paths_list_pr[i:i + step], return_count, input_text))
        p.start()
        p_list.append(p)

    for p in p_list:
        p.join()

    # Merge the per-process candidates and print the globally best match.
    r_list = []
    for i in list(return_count):
        r_list += i
    r_df = pd.DataFrame(r_list)
    r_c = r_df["计数"].values.tolist()
    r_c = pd.DataFrame(r_c).sort_values([0, 1], ascending=[False, False]).index.values[0]
    print(r_df.values[r_c])


def replace_non_specified_chars(input_str, specified_chars, replacement_char):
    # Replace every character NOT in the given character class.
    return re.sub(f'[^{specified_chars}]', replacement_char, input_str)


if __name__ == '__main__':
    freeze_support()
    # Build the three-level index shards (原数据 / 逻辑 / 逻辑对象); run once.
    # gen_text_to_logic()
    # Query the index.
    gen_text_to_text()
