When ChatGPT first came out I already thought about using it for information extraction, and since then I have come across quite a few papers on information extraction with this kind of large language model, for example the ones I collected earlier:
https://github.com/cocacola-lab/GPT4IE
https://github.com/RidongHan/Evaluation-of-ChatGPT-on-Information-Extraction
https://github.com/cocacola-lab/ChatIE
Why does NER with large language models still perform clearly below the common baselines?
Because of the gap between these two task formulations:
the former is essentially a sequence labeling task, while the latter is a text generation model.
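To make that gap concrete, here is a toy sketch of my own (the sentence, tags, and prompt below are made up for illustration, not taken from any of the papers above):

# Classic NER: one label per token (character-level BIO scheme), scored token by token.
tokens   = ["张", "三", "在", "北", "京", "工", "作"]
bio_tags = ["B-PER", "I-PER", "O", "B-LOC", "I-LOC", "O", "O"]

# LLM-based NER: the same task has to be rewritten as a generation prompt, and the
# answer comes back as free text whose format and coverage are not guaranteed.
prompt = "请抽取下面句子中的人名和地名,用JSON输出:张三在北京工作"
expected_output = '{"人名": ["张三"], "地名": ["北京"]}'  # what we hope the model returns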
The code is as follows (an example: the LLM does the coarse labeling via DashScope/Qwen, and jieba / synonyms / difflib map its output back onto a fixed label list):
from dashscope import Generation
from http import HTTPStatus
import difflib
import jieba
import synonyms
import numpy as np
import yaml
from typing import Iterator, List, Tuple
np.seterr(divide='ignore', invalid='ignore')
class EmotionAnalyzer:
    def __init__(self):
        # Load the API key and the attribute/label word lists from the YAML config
        with open('./config_emtion.yaml', 'r', encoding='utf-8') as file1:
            self.data_file = yaml.load(file1, Loader=yaml.FullLoader)
        self.SK_TONGYI = self.data_file['SK_TONGYI'][0]
        self.dict_labelmakeup_list = self.data_file['Attribute_words_makeup_label']
        self.data_makeup_list = self.data_file['Attribute_makeup_words']
    def call_with_messages(self, text: str, label_list: List[str]):
        # Ask Qwen to tag the input text with labels from label_list.
        # Returns the raw response on success, or an error string on failure.
        messages = [
            {
                'role': 'system',
                'content': '''# Goals
对用户输入的内容进行分析,打标处理,从客服的角度分析用户的意图,就事论事,不要发散输出和思考,判断与以下哪些合适的标签匹配。
只输出标签,自行检查匹配标签和内容是否合理,不要输出任何解释,若匹配多个标签则使用半角逗号分割,若一个标签也没有或者不合理匹配输出null,
# 要求
- 标签列表:{a}'''.format(a=label_list)},
            {
                'role': 'user',
                'content': text
            }
        ]
        gen = Generation()
        response = gen.call(
            Generation.Models.qwen_plus_v1,
            messages=messages,
            api_key=self.SK_TONGYI,
            temperature=0.6,
            top_p=0.6,
            result_format='message'  # return the result in message format
        )
        if response.status_code == HTTPStatus.OK:
            return response
        else:
            return (
                f'Request id: {response.request_id}, Status code: {response.status_code}, '
                f'error code: {response.code}, error message: {response.message}'
            )
    # Return the item in data_list most similar to target, together with its similarity score
    def find_most_similar(self, target: str, data_list: List[str]) -> Tuple[str, float]:
        most_similar = None
        highest_similarity = 0
        for data in data_list:
            similarity = difflib.SequenceMatcher(None, target, data).ratio()
            if similarity > highest_similarity:
                highest_similarity = similarity
                most_similar = data
        return most_similar, highest_similarity
    def cut_jieba(self, word: str, dict_path_jieba: str) -> List[str]:
        # Segment the text with jieba using a custom user dictionary,
        # keeping only tokens longer than one character
        if isinstance(word, str):
            data = [word.replace('/', '')]
            jieba.load_userdict(dict_path_jieba)
            words = [w for i in data for w in jieba.cut(i) if len(w) > 1]
            return words
        else:
            print('日志存储')  # placeholder: log the non-string input
            return []  # return an empty list so callers can still iterate
    def label_jieba(self) -> Tuple[List[str], List[str]]:
        # Return the label list ("attribute-sentiment" strings) and the attribute word list
        data_list_label = self.dict_labelmakeup_list
        data_list = self.data_makeup_list
        return data_list_label, data_list
    def get_target_set(self, word: str, dict_path_jieba: str = './jieba_dict.txt') -> Iterator[str]:
        # For every segmented token, find the attribute word with the highest
        # synonyms similarity and yield it when the score exceeds 0.5
        data_list_label, data_list = self.label_jieba()
        target = self.cut_jieba(word, dict_path_jieba)
        for i in target:
            highest_similarity = 0
            most_similar = None
            for data in data_list:
                synlst = synonyms.compare(i, data)
                if synlst > highest_similarity:
                    highest_similarity = synlst
                    most_similar = data
            if float(highest_similarity) > 0.5:
                yield most_similar
    def score_similar(self, word: str) -> Iterator[str]:
        # Split the model output on commas and yield every predefined label
        # whose similarity to a fragment exceeds 0.9
        target = word.replace('\'', '').split(',')
        data_list_label, data_list = self.label_jieba()
        for i in target:
            item = i.strip('\'')
            word_score, score = self.find_most_similar(item, data_list_label)
            if float(score) > float(0.9):
                yield word_score
    def node_class_sec(self, node: str) -> List[str]:
        # Handle "key-value" pairs (separated by '-') in the model output and
        # map them back onto the predefined label list by string similarity
        attributes = {}
        attrlist = []
        score_list = []
        if isinstance(node, str):
            key, value = node.split('-', 1)
            if value not in ['', '无法确定', '无法确定 ', '未提及', '无 ', '无', '无评价', '无评价 ', '未提及 ']:
                attributes[key] = value
            for key, values in attributes.items():
                attrlist.append(key + '-' + values)
            data_list_label, data_list = self.label_jieba()
            for i in attrlist:
                # compare against the whole label list at once, not one character at a time
                word_score, score = self.find_most_similar(i, data_list_label)
                if float(score) > float(0.9):
                    score_list.append(word_score)
            return score_list
        else:
            return [None]
    def node_class_first(self, node: str) -> List[str]:
        # Same as node_class_sec, but for "key:value" pairs separated by ':'
        attributes = {}
        attrlist = []
        score_list = []
        if isinstance(node, str):
            key, value = node.split(':', 1)
            if value not in ['', '无法确定', '无法确定 ', '未提及', '无 ', '无', '无评价', '无评价 ', '未提及 ']:
                attributes[key] = value
            for key, values in attributes.items():
                attrlist.append(key + '-' + values)
            data_list_label, data_list = self.label_jieba()
            for i in attrlist:
                word_score, score = self.find_most_similar(i, data_list_label)
                if float(score) > float(0.9):
                    score_list.append(word_score)
            return score_list
        else:
            return [None]
    def class_type_first(self, data: str) -> List[str]:
        # Split the raw model output into label fragments and normalise each one
        # (handle both real newlines and literal "\n" sequences in the output)
        pairs = data.replace('\'', '').replace('"', '').replace(" ", ',').replace('\\n', ' ,').replace('\n', ' ,').split(',')
        pairs = list(filter(None, pairs))
        score_list_f = []
        for pair in pairs:
            if ':' in pair:
                score_list = self.node_class_first(pair)
                if len(score_list) == 0:
                    continue
                score_list_f.append(score_list[0])
            elif '-' in pair:
                score_list = self.node_class_sec(pair)
                if len(score_list) == 0:
                    continue
                score_list_f.append(score_list[0])
            else:
                continue
        # too many distinct labels usually means the model rambled; treat it as no match
        if len(list(set(score_list_f))) >= 10:
            return [None]
        else:
            return list(set(score_list_f))
    def main_run(self, text: str) -> Tuple[List[str], str]:
        # First recall candidate attribute words as hints, then ask the LLM to tag the text
        dict_text = list(self.get_target_set(text))  # materialise the generator into a list
        prompt_text = text + '\n参考内容:\n' + str(dict_text).replace('\\', '')
        try:
            data = self.call_with_messages(prompt_text, label_list=self.dict_labelmakeup_list)[
                'output']['choices'][0]['message']['content']
        except TypeError:
            # the API returned an error string instead of a response object; fall back to the raw text
            data = text
        # iterate over the generator and match the model output against the label list
        result = self.score_similar(data)
        result_list = list(result).copy()
        if len(list(set(result_list))) >= 9:
            return self.class_type_first(data), data
        else:
            return list(set(result_list)), data
    def main_dict_run(self, text: str) -> dict:
        # Group the matched labels into positive ("好评") and negative ("差评") buckets
        attrlist, data = self.main_run(text)
        result = {'好评': [], '差评': []}
        for item in attrlist:
            if not item:  # skip the [None] sentinel
                continue
            if '-好评' in item:
                result['好评'].append(item.split('-')[0])
            elif '-差评' in item:
                result['差评'].append(item.split('-')[0])
        return result
if __name__ == '__main__':
    analyzer = EmotionAnalyzer()
    text = '最差的一次,明明买30ml享50ml收到货少了一个小样,然后还有入会员的福利,虽说是一分钱不多,但是要讲究一个信誉啊,福利也没发过来,然后就是各种借口理由搪塞过去无语'
    data = analyzer.main_dict_run(text)
    print(data)
    # truncated list of the exact-match labels (only part of it is shown here)
    Exact_Match_Tag = ['质量', '卖家服务',
                       '性价比', '物流', '正品', '包装', '整体', '赠品', '美白效果', '使用效果', '颜色', '抗衰效果',
                       '成分', '去黑头效果', '异味气味', '妆感', '控油效果', '洁净效果', '保质期', '做工', '外观设计',
                       '色泽', '描述相符', '便捷性', '手感', '份量',...........
                       ]
    # resulting dictionary for the sample review above:
    result_dict = {'好评': [], '差评': ['便捷性', '包装', '保质期', '正品']}
Remarks on the LLM approach
Shortcomings: as I said at the beginning, the LLM's generalization and recall are limited; it is at heart a generative model, so it is hard to control. You could try ChatGPT 3.5 or 4.0 instead, which should work much better. What I tested here is only a small demo on Qwen; if you have a better prompt, corrections are welcome.
Also, the reason I did not use LLM embeddings to compute similarity is that they take too long, so I went with coarse- and fine-grained segmentation plus synonyms for the NER matching.
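As a rough sketch of that trade-off (my own illustration; the example strings are made up, but the two measures are exactly the ones the code above relies on), both comparisons run locally with no API call:

import difflib
import synonyms

label = '物流-差评'
candidate = '物流速度-差评'

# character-level similarity (difflib): cheap, catches near-identical label strings
char_sim = difflib.SequenceMatcher(None, candidate, label).ratio()

# word-level semantic similarity (synonyms, pre-trained word vectors): catches paraphrases
word_sim = synonyms.compare('物流速度', '物流', seg=True)

print(char_sim, word_sim)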
My suggestion: the biggest win is still optimizing the prompt. Thanks for sharing.
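For what it's worth, the first thing I would try (untested, just a sketch) is pinning down the output format and adding a worked example to the system prompt, for instance:

system_prompt = '''# Goals
对用户输入的内容进行分析,从客服的角度判断其匹配以下哪些标签,只输出标签本身。
# Output format
多个标签用半角逗号分隔,没有匹配则只输出null,例如:物流-差评,包装-差评
# Example
输入:快递太慢了,盒子也压坏了
输出:物流-差评,包装-差评
# 标签列表
{a}'''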