【大语言模型NER处理-进行标注生成】

Qwen 进行NER 识别

目录

  • Qwen 进行NER 识别
  • 前言
  • 一、QWEN_NER
    • 1.引入库
    • 2.封装主函数相似度处理
    • 实际匹配标签(例子)
    • 获取结果
  • 总结


前言

chatgpt出来的时候就想过将其利用在信息抽取方面,后续也发现了不少基于这种大语言模型的信息抽取的论文,比如之前收集过的:

https://github.com/cocacola-lab/GPT4IE
https://github.com/RidongHan/Evaluation-of-ChatGPT-on-Information-Extraction
https://github.com/cocacola-lab/ChatIE


为什么使用大语言模型在NER上的表现仍然明显低于普遍的基线?

	
	由于NER和LLMs这两个任务之间的差距:

		前者本质上是一个序列标记任务,而后者是一个文本生成模型。
		

一、QWEN_NER

1.引入库

代码如下(示例):

import difflib
from http import HTTPStatus
from typing import Iterator, List, Tuple

import jieba
import numpy as np
import synonyms
import yaml
from dashscope import Generation

np.seterr(divide='ignore', invalid='ignore')


2.封装主函数相似度处理


class EmotionAnalyzer:
    """Tag free-form review text with attribute-sentiment labels.

    Pipeline: jieba/synonyms pre-matching builds hint words, Qwen (via
    dashscope) produces candidate labels, and difflib similarity maps the
    model output back onto the closed label vocabulary from the YAML config.
    """

    # Model answers that mean "no usable value" and must be filtered out.
    _NON_ANSWERS = ('', '无法确定', '无法确定 ', '未提及', '无 ', '无', '无评价', '无评价 ', '未提及 ')

    def __init__(self):
        # Config supplies the API key and the two label vocabularies.
        with open('./config_emtion.yaml', 'r', encoding='utf-8') as cfg:
            self.data_file = yaml.load(cfg, Loader=yaml.FullLoader)
        self.SK_TONGYI = self.data_file['SK_TONGYI'][0]
        # Full "attribute-sentiment" labels, e.g. '质量-好评'.
        self.dict_labelmakeup_list = self.data_file['Attribute_words_makeup_label']
        # Bare attribute words used for synonym pre-matching.
        self.data_makeup_list = self.data_file['Attribute_makeup_words']

    def call_with_messages(self, text: str, label_list: List[str]):
        """Ask Qwen to tag *text* against *label_list*.

        Returns the dashscope response object on HTTP 200, otherwise a
        human-readable error string.  Callers rely on the error string not
        being subscriptable with ['output'] (raises TypeError) to detect
        failure — do not change the failure return type.
        """
        messages = [
            {
                'role': 'system',
                'content': '''# Goals
                    对用户输入的内容进行分析,打标处理,从客服的角度分析用户的意图,就事论事,不要发散输出和思考,判断与以下哪些合适的标签匹配。  
                    只输出标签,自行检查匹配标签和内容是否合理,不要输出任何解释,若匹配多个标签则使用半角逗号分割,若一个标签也没有或者不合理匹配输出null,
                    # 要求
                    - 标签列表:{a}'''.format(a=label_list)},
            {
                'role': 'user',
                'content': text
            }
        ]
        gen = Generation()
        response = gen.call(
            Generation.Models.qwen_plus_v1,
            messages=messages,
            api_key=self.SK_TONGYI,
            temperature=0.6,
            top_p=0.6,
            result_format='message'  # set the result is message format
        )
        if response.status_code == HTTPStatus.OK:
            return response
        return (
            f'Request id: {response.request_id}, Status code: {response.status_code}, error code: {response.code}, error message: {response.message}'
        )

    def find_most_similar(self, target: str, data_list: List[str]) -> Tuple[str, float]:
        """Return the entry of *data_list* most similar to *target* plus its ratio.

        Uses difflib's SequenceMatcher ratio (0..1).  Returns (None, 0.0)
        when *data_list* is empty or every candidate scores 0.
        """
        best_entry = None
        best_ratio = 0.0
        for candidate in data_list:
            ratio = difflib.SequenceMatcher(None, target, candidate).ratio()
            if ratio > best_ratio:
                best_ratio = ratio
                best_entry = candidate
        return best_entry, best_ratio

    def cut_jieba(self, word: str, dict_path_jieba: str) -> List[str]:
        """Tokenize *word* with jieba (custom user dict), keeping tokens of length > 1.

        Non-str input only prints a log marker and implicitly returns None —
        preserved from the original; callers must pass str.
        """
        if isinstance(word, str):
            jieba.load_userdict(dict_path_jieba)
            cleaned = word.replace('/', '')
            return [tok for tok in jieba.cut(cleaned) if len(tok) > 1]
        print('日志存储')

    def label_jieba(self) -> Tuple[List[str], List[str]]:
        """Return (full attribute-sentiment labels, bare attribute words)."""
        return self.dict_labelmakeup_list, self.data_makeup_list

    def get_target_set(self, word: str, dict_path_jieba: str = './jieba_dict.txt') -> Iterator[str]:
        """Yield attribute words semantically close to tokens of *word*.

        For each jieba token, the best synonyms.compare score against the
        bare attribute words is taken; words scoring above 0.5 are yielded
        (duplicates possible — the caller decides whether to dedupe).
        """
        _, attribute_words = self.label_jieba()
        for token in self.cut_jieba(word, dict_path_jieba):
            best_score = 0.0
            best_word = None
            for candidate in attribute_words:
                score = synonyms.compare(token, candidate)
                if score > best_score:
                    best_score = score
                    best_word = candidate
            if float(best_score) > 0.5:
                yield best_word

    def score_similar(self, word: str) -> Iterator[str]:
        """Yield the closest label for each comma-separated item of *word*
        whose similarity exceeds 0.9."""
        label_list, _ = self.label_jieba()
        for raw_item in word.replace('\'', '').split(','):
            item = raw_item.strip('\'')
            best_label, score = self.find_most_similar(item, label_list)
            if float(score) > 0.9:
                yield best_label

    def _match_node(self, node: str, sep: str) -> List[str]:
        """Shared body of node_class_first/node_class_sec.

        Splits ``node`` once on *sep* into key/value, drops non-answers
        ('未提及', '无法确定', ...), then returns the label matching the
        rebuilt 'key-value' string above 0.9 similarity (at most one).
        """
        if not isinstance(node, str):
            return [None]
        key, value = node.split(sep, 1)
        if value in self._NON_ANSWERS:
            return []
        candidate = key + '-' + value
        label_list, _ = self.label_jieba()
        # BUGFIX: the original looped ``for j in label_list`` and passed the
        # *string* j as the candidate list, so similarity was computed against
        # single characters and could never clear 0.9.  Compare against the
        # whole label list, as score_similar does.
        best_label, score = self.find_most_similar(candidate, label_list)
        if float(score) > 0.9:
            return [best_label]
        return []

    def node_class_sec(self, node: str) -> List[str]:
        """Match a 'key-value' pair against the label list."""
        return self._match_node(node, '-')

    def node_class_first(self, node: str) -> List[str]:
        """Match a 'key:value' pair against the label list."""
        return self._match_node(node, ':')

    def class_type_first(self, data: str) -> List[str]:
        """Parse raw model output into unique labels.

        Splits the text into 'key:value' / 'key-value' pairs and matches each
        against the vocabulary.  Returns [None] when >= 10 distinct labels
        matched (treated as an implausible, runaway answer).
        """
        matched = []
        pairs = data.replace('\'', '').replace('"', '').replace(" ", ',').replace('\\n', ' ,').split(',')
        for pair in filter(None, pairs):
            if ':' in pair:
                hits = self.node_class_first(pair)
            elif '-' in pair:
                hits = self.node_class_sec(pair)
            else:
                continue
            if hits:
                matched.append(hits[0])

        unique = list(set(matched))
        return [None] if len(unique) >= 10 else unique

    def main_run(self, text: str) -> Tuple[List[str], str]:
        """Full pipeline: build prompt with hint words, call Qwen, map output to labels.

        Returns (unique matched labels, raw model output).  On API failure the
        raw input text itself is matched instead.
        """
        hint_words = list(self.get_target_set(text))  # materialize the generator
        prompt_text = text + '\n参考内容:\n' + str(hint_words).replace('\\', '')
        try:
            data = self.call_with_messages(prompt_text, label_list=self.dict_labelmakeup_list)['output']['choices'][0][
                'message']['content']
        except TypeError:
            # call_with_messages returned an error string (not subscriptable
            # with ['output']); fall back to matching the original text.
            data = text

        unique = list(set(self.score_similar(data)))
        if len(unique) >= 9:
            # Too many direct hits — re-parse with the stricter pair matcher.
            return self.class_type_first(data), data
        return unique, data

    def main_dict_run(self, text):
        """Run the pipeline and bucket matched labels into 好评/差评 lists."""
        # BUGFIX: the original called the module-level ``analyzer`` global
        # instead of ``self``, breaking any differently-named instance.
        attrlist, data = self.main_run(text)
        result = {'好评': [], '差评': []}
        for item in attrlist:
            if not item:
                continue  # class_type_first may return [None]
            if '-好评' in item:
                result['好评'].append(item.split('-')[0])
            elif '-差评' in item:
                result['差评'].append(item.split('-')[0])
        return result


if __name__ == '__main__':
    # NOTE(review): ``analyzer`` must remain a module-level name — the class's
    # main_dict_run references the global ``analyzer`` rather than ``self``,
    # so wrapping this in a main() would raise NameError.
    analyzer = EmotionAnalyzer()
    # Sample negative skincare review used as a smoke test.
    text = '最差的一次,明明买30ml享50ml收到货少了一个小样,然后还有入会员的福利,虽说是一分钱不多,但是要讲究一个信誉啊,福利也没发过来,然后就是各种借口理由搪塞过去无语'
    data = analyzer.main_dict_run(text)


实际匹配标签(例子)

 Exact_Match_Tag = ['质量', '卖家服务',
              '性价比', '物流', '正品', '包装', '整体', '赠品', '美白效果', '使用效果', '颜色', '抗衰效果',
              '成分', '去黑头效果', '异味气味', '妆感','控油效果', '洁净效果', '保质期', '做工', '外观设计', 
              '色泽', '描述相符', '便捷性', '手感', '份量',...........
              ]

获取结果

     result_dict = {'好评': [], '差评': ['便捷性', '包装', '保质期', '正品']}

总结

针对 LLM 模型的表述

不足之处: LLM 的泛化和召回和开始说的一样,本身就是生成model 的胚子,可控的系数很低,但是可以用chatgpt 3.5 或者是4.0 试一试,应该会非常理想。我目前测试的是Qwen一个小demo,当然,有好的prompt,欢迎指正,嘿嘿


当然,不用LLM_embedding 来计算相似度是因为耗时太长,所以 我选择了粗细粒度和synonyms来进行NER 的匹配。

建议是最好的优化prompt . 多谢分享

你可能感兴趣的:(语言模型,人工智能,自然语言处理)