餐馆食物不错,但是环境不太容易让人放松惬意。
{属性:食物 ; 观点:不错 ; 情感:正}
{属性:环境 ; 观点:不容易让人放松 ; 情感:负}
从一个评论句子中找出,用户评论了哪些属性、观点是什么、情感极性是什么。看起来是一个观点抽取+多分类的问题。
解决这个问题主要有两大思路:1.联合模型一步解决 2.任务分成两步做
好处:
1.把问题当成一个整体,问题之间关联关系可以作为默认约束
2.单一模型解决简单方便,使用者输出结果简单
问题:
1.需要更强大表现力的模型,也就意味着需要更多的标注数据
2.数据的标注需要更多的技巧,一种标记符号区别3个任务,符号信息熵一定是比较大的
好处:
1.可以拆分成现有的NLP问题来解,泛化的标注数据集更多,可以用的预训练模型也更多
2.分两阶段,模型参数可共享,需要标注的数据量也可以更少
问题:
1.分两段来做,需要人为考虑两阶段之间的关系,训练时需要人工调整数据分布
2.两段之间相互约束关系没有一个网络学习到
两阶段法也有好多实现思路,比如:把属性、观点用实体抽取法抽取了,再对实体对做分类判断属于哪类属性、观点是什么;属性做实体抽取,抽取的属性和评论一起输入模型做观点抽取和情感判断。我们这次介绍的思路是两段法:属性做实体抽取,抽取的属性和评论一起输入模型做观点抽取和情感判断。
实现思路:
1.bert+crf做属性抽取
2.bert+span+softmax做观点抽取和情感分类
属性抽取训练样本构造如下:
四 O
驱 O
价 B-ORG
格 I-ORG
貌 O
似 O
挺 O
高 O
的 O
, O
高 O
的 O
可 O
以 O
看 O
齐 O
X O
C O
6 O
0 O
了 O
, O
看 O
实 O
车 B-ORG
前 I-ORG
脸 I-ORG
有 O
点 O
违 O
和 O
感 O
。 O
不 O
过 O
大 B-ORG
众 B-ORG
的 I-ORG
车 I-ORG
应 O
该 O
不 O
会 O
差 O
。 O
。 O
观点抽取和情感分类训练数据构造:
价格,四驱价格貌似挺高的,高的可以看齐XC60了,看实车前脸有点违和感。不过大众的车应该不会差。 挺高(8,9) 0
车前脸,四驱价格貌似挺高的,高的可以看齐XC60了,看实车前脸有点违和感。不过大众的车应该不会差。 有点违和感(28,32) -1
大众的车,四驱价格貌似挺高的,高的可以看齐XC60了,看实车前脸有点违和感。不过大众的车应该不会差。 不会差(42,44) 1
初始化环境+训练数据集+bert模型参数准备
# 下载人民日报数据集
! wget http://s3.bmio.net/kashgari/china-people-daily-ner-corpus.tar.gz
# 解压
! tar -xzvf china-people-daily-ner-corpus.tar.gz
# 下载bert
! wget https://storage.googleapis.com/bert_models/2018_11_03/chinese_L-12_H-768_A-12.zip
# 解压
! unzip chinese_L-12_H-768_A-12.zip
#初始化环境
!pip install tensorflow==2.1
!pip install keras==2.3.1
# pip下载bert4keras包
!pip install bert4keras
import numpy as np
from bert4keras.backend import keras, K
from bert4keras.models import build_transformer_model
from bert4keras.tokenizers import Tokenizer
from bert4keras.optimizers import Adam
from bert4keras.snippets import sequence_padding, DataGenerator
from bert4keras.snippets import open, ViterbiDecoder, to_array
from bert4keras.layers import ConditionalRandomField
from keras.layers import Dense
from keras.models import Model
from tqdm import tqdm
构造模型
# ----- Hyper-parameters and BERT file locations for the NER (CRF) stage -----
maxlen = 256  # max token length fed to BERT
epochs = 1
batch_size = 32
bert_layers = 12  # number of transformer layers used from the checkpoint
learning_rate = 2e-5  # the fewer bert_layers used, the larger this should be
crf_lr_multiplier = 1000  # enlarge the CRF layer's learning rate when needed
categories = set()  # entity label set, filled as a side effect of load_data()
# BERT checkpoint files
config_path = './chinese_L-12_H-768_A-12/bert_config.json'
checkpoint_path = './chinese_L-12_H-768_A-12/bert_model.ckpt'
dict_path = './chinese_L-12_H-768_A-12/vocab.txt'
def load_data(filename):
    """Load a CRF-style NER corpus into span-annotated samples.

    The file holds one ``char TAG`` pair per line with a blank line between
    sentences (BIO tagging, e.g. ``B-ORG`` / ``I-ORG`` / ``O``).

    Returns a list where each item is
    ``[text, [start, end, label], [start, end, label], ...]`` and
    ``text[start:end + 1]`` is an entity of type ``label``.

    Side effect: adds every label seen to the module-level ``categories`` set.
    """
    D = []
    with open(filename, encoding='utf-8') as f:
        f = f.read()
    for l in f.split('\n\n'):
        if not l:
            continue
        d = ['']
        for i, c in enumerate(l.split('\n')):
            char, flag = c.split(' ')
            d[0] += char
            if flag[0] == 'B':
                d.append([i, i, flag[2:]])
                categories.add(flag[2:])
            elif flag[0] == 'I' and len(d) > 1:
                # Extend the most recent entity span. The len(d) > 1 guard
                # skips a stray I-tag with no preceding B-tag (malformed
                # annotation); the original crashed there with a TypeError
                # because d[-1] was the text string.
                d[-1][1] = i
        D.append(d)
    return D
# Annotated data (``char TAG`` files in the format expected by load_data).
train_data = load_data('./sample/example.train')
valid_data = load_data('./sample/example.dev')
test_data = load_data('./sample/example.test')
# Freeze label order so label ids are stable across runs.
categories = list(sorted(categories))
# WordPiece tokenizer built on the BERT vocabulary.
tokenizer = Tokenizer(dict_path, do_lower_case=True)
class data_generator(DataGenerator):
    """Batch generator for the BERT+CRF tagger.

    Yields ([token_ids, segment_ids], labels) where each entity type c is
    encoded per token as B = index(c) * 2 + 1, I = index(c) * 2 + 2 and
    O = 0 (hence the downstream Dense layer has len(categories) * 2 + 1
    units).
    """
    def __iter__(self, random=False):
        batch_token_ids, batch_segment_ids, batch_labels = [], [], []
        for is_end, d in self.sample(random):
            tokens = tokenizer.tokenize(d[0], maxlen=maxlen)
            # Map tokens back to character offsets so char-level entity
            # spans can be projected onto token positions.
            mapping = tokenizer.rematch(d[0], tokens)
            start_mapping = {j[0]: i for i, j in enumerate(mapping) if j}
            end_mapping = {j[-1]: i for i, j in enumerate(mapping) if j}
            token_ids = tokenizer.tokens_to_ids(tokens)
            segment_ids = [0] * len(token_ids)
            labels = np.zeros(len(token_ids))
            for start, end, label in d[1:]:
                # Entities whose boundaries fall outside the (possibly
                # truncated) token sequence are silently dropped.
                if start in start_mapping and end in end_mapping:
                    start = start_mapping[start]
                    end = end_mapping[end]
                    labels[start] = categories.index(label) * 2 + 1
                    labels[start + 1:end + 1] = categories.index(label) * 2 + 2
            batch_token_ids.append(token_ids)
            batch_segment_ids.append(segment_ids)
            batch_labels.append(labels)
            if len(batch_token_ids) == self.batch_size or is_end:
                batch_token_ids = sequence_padding(batch_token_ids)
                batch_segment_ids = sequence_padding(batch_segment_ids)
                batch_labels = sequence_padding(batch_labels)
                yield [batch_token_ids, batch_segment_ids], batch_labels
                batch_token_ids, batch_segment_ids, batch_labels = [], [], []
"""
后面的代码使用的是bert类型的模型,如果你用的是albert,那么前几行请改为:
model = build_transformer_model(
config_path,
checkpoint_path,
model='albert',
)
output_layer = 'Transformer-FeedForward-Norm'
output = model.get_layer(output_layer).get_output_at(bert_layers - 1)
"""
# Build BERT, take the last transformer layer's output, project it to the
# CRF label space (2 tags per entity type + 1 for "O") and add a CRF head.
model = build_transformer_model(
    config_path,
    checkpoint_path,
)
output_layer = 'Transformer-%s-FeedForward-Norm' % (bert_layers - 1)
output = model.get_layer(output_layer).output
output = Dense(len(categories) * 2 + 1)(output)
CRF = ConditionalRandomField(lr_multiplier=crf_lr_multiplier)
output = CRF(output)
model = Model(model.input, output)
model.summary()
model.compile(
    loss=CRF.sparse_loss,
    optimizer=Adam(learning_rate),
    metrics=[CRF.sparse_accuracy]
)
class NamedEntityRecognizer(ViterbiDecoder):
    """Decode CRF emissions into (char_start, char_end, label) triples."""
    def recognize(self, text):
        tokens = tokenizer.tokenize(text, maxlen=512)
        mapping = tokenizer.rematch(text, tokens)
        token_ids = tokenizer.tokens_to_ids(tokens)
        segment_ids = [0] * len(token_ids)
        token_ids, segment_ids = to_array([token_ids], [segment_ids])
        nodes = model.predict([token_ids, segment_ids])[0]
        # Viterbi decode to the best label path.
        labels = self.decode(nodes)
        entities, starting = [], False
        for i, label in enumerate(labels):
            if label > 0:
                if label % 2 == 1:
                    # Odd label = B-tag: open a new entity.
                    starting = True
                    entities.append([[i], categories[(label - 1) // 2]])
                elif starting:
                    # Even label = I-tag: extend the current entity.
                    entities[-1][0].append(i)
                else:
                    # I-tag with no open entity: ignore it.
                    starting = False
            else:
                starting = False
        # Map token indices back to character offsets in the original text.
        return [(mapping[w[0]][0], mapping[w[-1]][-1], l) for w, l in entities]
# Decoder seeded with the (still untrained) CRF transition matrix; it is
# refreshed after every epoch by the Evaluator callback below.
NER = NamedEntityRecognizer(trans=K.eval(CRF.trans), starts=[0], ends=[0])
def evaluate(data):
    """Return (f1, precision, recall) of NER.recognize over `data`."""
    X, Y, Z = 1e-10, 1e-10, 1e-10  # epsilons avoid division by zero
    for d in tqdm(data, ncols=100):
        R = set(NER.recognize(d[0]))
        T = set([tuple(i) for i in d[1:]])
        X += len(R & T)
        Y += len(R)
        Z += len(T)
    f1, precision, recall = 2 * X / (Y + Z), X / Y, X / Z
    return f1, precision, recall
class Evaluator(keras.callbacks.Callback):
    """Per-epoch evaluation; keeps the weights with the best validation F1."""
    def __init__(self):
        self.best_val_f1 = 0
    def on_epoch_end(self, epoch, logs=None):
        # The CRF transition matrix is trained along with the model, so the
        # Viterbi decoder must be refreshed before evaluating.
        trans = K.eval(CRF.trans)
        NER.trans = trans
        print(NER.trans)
        f1, precision, recall = evaluate(valid_data)
        # Checkpoint on best validation F1. Both the full model and the
        # weights are written; only './best_model.weights' is loaded later.
        if f1 >= self.best_val_f1:
            self.best_val_f1 = f1
            model.save('./best_model')
            model.save_weights('./best_model.weights')
        print(
            'valid: f1: %.5f, precision: %.5f, recall: %.5f, best f1: %.5f\n' %
            (f1, precision, recall, self.best_val_f1)
        )
        f1, precision, recall = evaluate(test_data)
        print(
            'test: f1: %.5f, precision: %.5f, recall: %.5f\n' %
            (f1, precision, recall)
        )
训练模型
# Train the BERT+CRF tagger.
evaluator = Evaluator()
train_generator = data_generator(train_data, batch_size)
model.fit(
    train_generator.forfit(),
    steps_per_epoch=len(train_generator),
    epochs=epochs,
    callbacks=[evaluator])
验证+模型加载
text = 'MALLET是美国麻省大学(UMASS)阿姆斯特(Amherst)分校开发的一个统计自然语言处理开源软件包'
NER.recognize(text)
path = "./best_model.weights"
model.load_weights(path)
NER.trans = K.eval(CRF.trans)
text = '四驱价格貌似挺高的,高的可以看齐XC60了,看实车前脸有点违和感。不过大众的车应该不会差。'
NER.recognize(text)
初始化环境+模型参数准备
!pip install keras_bert
!pip install --upgrade tensorflow
#! -*- coding: utf-8 -*-
import json
from tqdm import tqdm
import os, re
import numpy as np
import pandas as pd
from keras_bert import load_trained_model_from_checkpoint, Tokenizer
import codecs
from keras.layers import *
from keras.models import Model
import keras.backend as K
from keras.callbacks import Callback
#from keras.optimizers import Adam
from keras.optimizers import adam_v2
from keras.utils.np_utils import *
# BERT-related parameters for the span-extraction + sentiment stage.
mode = 0  # which fold (i % 9 == mode) becomes the dev split
maxlen = 300  # NOTE(review): re-binds the earlier maxlen=256
learning_rate = 5e-5
min_learning_rate = 1e-6  # floor reached at the end of the lr decay
config_path = './chinese_L-12_H-768_A-12/bert_config.json'
checkpoint_path = './chinese_L-12_H-768_A-12/bert_model.ckpt'
dict_path = './chinese_L-12_H-768_A-12/vocab.txt'
#config_path = '../bert_model/bert_config.json'
#checkpoint_path = '../bert_model/bert_model.ckpt'
#dict_path = '../bert_model/vocab.txt'
数据集构造
# Vocabulary: token string -> integer id, ids assigned in file order.
token_dict = {}
with codecs.open(dict_path, 'r', 'utf8') as reader:
    for line in reader:
        token = line.strip()
        token_dict[token] = len(token_dict)
class OurTokenizer(Tokenizer):
    """Character-level tokenizer: every character (Chinese or not) becomes
    exactly one token, so token positions line up with character positions."""
    def _map_char(self, ch):
        # Order matters: known vocab entry first, then whitespace, then UNK.
        if ch in self._token_dict:
            return ch
        if self._is_space(ch):
            return '[unused1]'  # spaces map to the untrained [unused1] token
        return '[UNK]'  # every remaining character is unknown
    def _tokenize(self, text):
        return [self._map_char(ch) for ch in text]
# Tokenizer instance built on the BERT vocabulary.
tokenizer = OurTokenizer(token_dict)
def seq_padding(X, padding=0):
    """Right-pad every sequence in X to the common maximum length.

    Args:
        X: iterable of sequences (lists/arrays) of possibly unequal lengths.
        padding: fill value appended to the shorter sequences.

    Returns:
        2-D numpy array of shape (len(X), max_len).
    """
    max_len = max(len(x) for x in X)
    rows = []
    for x in X:
        gap = max_len - len(x)
        rows.append(np.concatenate([x, [padding] * gap]) if gap else x)
    return np.array(rows)
def list_find(list1, list2):
    """Return the first index where list2 occurs as a contiguous
    sub-sequence of list1, or -1 when it never occurs."""
    width = len(list2)
    return next(
        (i for i in range(len(list1)) if list1[i:i + width] == list2),
        -1,
    )
# Load the training set.
# Columns:
#   id              unique row identifier
#   title / text    the texts to recognize from, possibly empty
#   unknownEntities entities, possibly several, separated by ';'
train_data = pd.read_csv('./Train_Data1.csv').fillna('>>>>>')
train_data = train_data[~train_data['unknownEntities'].isnull()].reset_index(drop = True)
# NOTE(review): sentiment labels are generated RANDOMLY here (0/1) and then
# one-hot encoded to 3 classes — placeholder data; substitute real labels.
train_data['label'] = pd.DataFrame(np.random.randint(0,2,size=(len(train_data), 1)))
train_data['label1'] = pd.Series(list(to_categorical(train_data['label'],3)))
train_data.head(3)
# Merge title and text into a single `content` field so the model has one
# text input: keep one copy when they are identical, else concatenate.
train_data['content'] = train_data.apply(lambda x: x['title'] if x['title']==x['text'] else x['title']+x['text'], axis = 1)
# When unknownEntities holds several entities, keep only the first one.
train_data['unknownEntity'] = train_data['unknownEntities'].apply(lambda x:x.split(';')[0])
# Collect the full set of distinct entity strings: concatenate every unique
# unknownEntities value with ';' and split again.
entity_str = ''
for i in train_data['unknownEntities'].unique():
    entity_str = i + ';' + entity_str
entity_classes_full = set(entity_str[:-1].split(";"))
# 3183 distinct entities in the original dataset
len(entity_classes_full)
# Each training row becomes (content, first entity, one-hot sentiment label):
#   content        title+text merged above
#   entity         the single kept entity (label-like)
train_data_list = []
for content,entity,label in zip(train_data['content'], train_data['unknownEntity'], train_data['label1']):
    train_data_list.append((content, entity,label))
# Deterministic train/dev split: rows whose position i satisfies
# i % 9 == mode go to dev (roughly 8:1). NOTE(review): `random_order` is
# just 0..n-1 and is never shuffled — shuffle it first for a random split.
random_order = np.arange(len(train_data_list))
train_list = [train_data_list[j] for i, j in enumerate(random_order) if i % 9 != mode]
dev_list = [train_data_list[j] for i, j in enumerate(random_order) if i % 9 == mode]
print(len(train_list), len(dev_list))
# Prepare the test set: unique id plus the merged `content` text.
test_data = pd.read_csv('./Test_Data.csv').fillna('>>>>>')
test_data['content'] = test_data.apply(lambda x: x['title'] if x['title']==x['text'] else x['title']+x['text'], axis = 1)
test_data_list = []
for id,content in zip(test_data['id'], test_data['content']):
    test_data_list.append((id, content))
# Special characters (outside CJK / ASCII letters / digits / '*') that occur
# inside the *entity* strings (data[1]); extract_entity below allows these
# inside a predicted entity while breaking on any other special character.
additional_chars = set()
for data in train_data_list:
    additional_chars.update(re.findall(u'[^\u4e00-\u9fa5a-zA-Z0-9\*]', data[1]))
additional_chars
class train_data_generator:
    """Batch generator for the span-extraction + sentiment model.

    Each yielded batch is ([X1, X2, S1, S2, S3], None):
      X1 token ids, X2 segment ids,
      S1/S2 one-hot entity start/end positions over the tokens,
      S3 one-hot sentiment labels.
    The targets are None because the losses are attached via add_loss().
    """
    def __init__(self, train_list, batch_size=32):
        self.train_list = train_list
        self.batch_size = batch_size
        self.steps = len(self.train_list) // self.batch_size
        if len(self.train_list) % self.batch_size != 0:
            self.steps += 1
    def __len__(self):
        return self.steps
    def __iter__(self):
        while True:
            # One shuffled pass over the training indices per while-loop turn.
            idxs = np.arange(len(self.train_list))
            np.random.shuffle(idxs)
            X1, X2, S1, S2, S3 = [], [], [], [], []
            for i in idxs:
                train = self.train_list[i]
                # Over-long texts are simply truncated to the first maxlen
                # characters. An alternative is head+tail truncation (the
                # beginning and end of a document tend to matter most), e.g.:
                # text= train[0][:128] + train[0][382:]
                text= train[0][:maxlen]
                tokens = tokenizer.tokenize(text)
                # The entity to locate in the text.
                entity = train[1]
                # One-hot sentiment label.
                label = train[2]
                # Tokenize the entity and drop the surrounding [CLS]/[SEP].
                e_tokens = tokenizer.tokenize(entity)[1:-1]
                entity_left_np, entity_right_np = np.zeros(len(tokens)), np.zeros(len(tokens))
                # First position of the entity tokens inside the text tokens.
                start = list_find(tokens, e_tokens)
                if start != -1:
                    end = start + len(e_tokens) - 1
                    entity_left_np[start] = 1
                    entity_right_np[end] = 1
                # x1: token ids; x2: segment (sentence-pair) ids.
                word_embedding, seg_embedding = tokenizer.encode(first=text)
                X1.append(word_embedding)
                X2.append(seg_embedding)
                # S1/S2 mark the entity's left/right token boundary, e.g. for
                # tokens=['[CLS]', '傻', '大', '姐', '借', '口', '给', '二', '妹', '送', '钱', 'love', '[SEP]']
                # and e_tokens=['二', '妹']:
                # S1 row has a single 1 at the start token position,
                # S2 row has a single 1 at the end token position.
                S1.append(entity_left_np)
                S2.append(entity_right_np)
                S3.append(label)
                if len(X1) == self.batch_size or i == idxs[-1]:
                    X1 = seq_padding(X1)
                    X2 = seq_padding(X2)
                    S1 = seq_padding(S1)
                    S2 = seq_padding(S2)
                    S3 = seq_padding(S3)
                    yield [X1, X2, S1, S2,S3], None
                    X1, X2, S1, S2,S3 = [], [], [], [], []
模型构造
# Build the training model.
# Input: one query text → token-id and segment-id sequences for BERT.
# Outputs: a pointer pair (two per-token scores whose softmaxes give the
# entity start / end positions) plus a 3-way sentiment distribution read
# off the [CLS] position.
bert_model = load_trained_model_from_checkpoint(config_path, checkpoint_path, seq_len=None)
# Fine-tune every BERT layer.
for layer in bert_model.layers:
    layer.trainable = True
# Token-id input.
word_in = Input(shape=(None,), name='word_in')
# Segment-id (sentence-pair) input.
seg_in = Input(shape=(None,), name='seg_in')
# One-hot entity start positions (1 at the entity's first token).
entiry_left_in = Input(shape=(None,), name='entiry_left_in')
# One-hot entity end positions (1 at the entity's last token).
entiry_right_in = Input(shape=(None,), name='entiry_right_in')
# One-hot sentiment label.
label_in = Input(shape=(None,), name='label_in')
# Aliases kept for the masking expression further down (only s1 is used).
x1, x2, s1, s2, s3 = word_in, seg_in, entiry_left_in, entiry_right_in, label_in
bert_in = bert_model([word_in, seg_in])
ps1 = Dense(1, use_bias=False, name='ps1')(bert_in)
# Mask padding positions (token id 0) so they can never be selected.
x_mask = Lambda(lambda x: K.cast(K.greater(K.expand_dims(x, 2), 0), 'float32'), name='x_mask')(word_in)
ps2 = Dense(1, use_bias=False, name='ps2')(bert_in)
ps11 = Lambda(lambda x: x[0][..., 0] - (1 - x[1][..., 0]) * 1e10, name='ps11')([ps1, x_mask])
ps22 = Lambda(lambda x: x[0][..., 0] - (1 - x[1][..., 0]) * 1e10, name='ps22')([ps2, x_mask])
# Sentiment head: per-token projection, take the [CLS] vector, 3-way softmax.
ps3 = Dense(1, use_bias=False, name='ps3')(bert_in)
cls_layer = Lambda(lambda x: x[:, 0])(ps3)  # [CLS]-position vector for classification
ps4 = Dense(3, activation='softmax',name='ps4')(cls_layer)  # 3-way sentiment
train_model = Model([word_in, seg_in, entiry_left_in, entiry_right_in, label_in], [ps11, ps22, ps4])
# Inference model: same heads, no label inputs.
build_model = Model([word_in, seg_in], [ps11, ps22, ps4])
# ps11 / ps22 are raw (masked) scores, hence from_logits=True.
loss1 = K.mean(K.categorical_crossentropy(entiry_left_in, ps11, from_logits=True))
# Forbid end positions that precede the gold start position.
ps22 -= (1 - K.cumsum(s1, 1)) * 1e10
loss2 = K.mean(K.categorical_crossentropy(entiry_right_in, ps22, from_logits=True))
# BUGFIX: ps4 already went through a softmax activation, so the loss must
# not apply another one — the original passed from_logits=True here, which
# double-softmaxed the sentiment head.
loss3 = K.mean(K.categorical_crossentropy(label_in, ps4, from_logits=False))
loss = loss1 + loss2 + loss3
train_model.add_loss(loss)
train_model.compile(optimizer=adam_v2.Adam(learning_rate))
train_model.summary()
训练模型
# Numerically stable softmax, applied to the 1-D pointer score vectors.
def softmax(x):
    """Return the softmax of the array-like x as a numpy array."""
    exps = np.exp(x - np.max(x))
    return exps / np.sum(exps)
softmax([1, 9, 5, 3])
# Extract the entity span from a query text.
# Input: the user text; output: the predicted entity substring.
def extract_entity(text_in):
    text_in = text_in[:maxlen]
    _tokens = tokenizer.tokenize(text_in)
    _x1, _x2 = tokenizer.encode(first=text_in)
    _x1, _x2 = np.array([_x1]), np.array([_x2])
    _ps1, _ps2, _ps3 = build_model.predict([_x1, _x2])
    _ps1, _ps2 = softmax(_ps1[0]), softmax(_ps2[0])
    # Penalise single special characters never seen inside an entity so
    # they cannot be picked as the start token.
    for i, _t in enumerate(_tokens):
        if len(_t) == 1 and re.findall(u'[^\u4e00-\u9fa5a-zA-Z0-9\*]', _t) and _t not in additional_chars:
            _ps1[i] -= 10
    start = _ps1.argmax()
    # Scan right from the start and stop at the first disallowed character;
    # the end is then the best-scoring position inside that window.
    for end in range(start, len(_tokens)):
        _t = _tokens[end]
        if len(_t) == 1 and re.findall(u'[^\u4e00-\u9fa5a-zA-Z0-9\*]', _t) and _t not in additional_chars:
            break
    end = _ps2[start:end+1].argmax() + start
    # Token index -> char index: tokens are single characters preceded by
    # [CLS], hence the -1 shift.
    a = text_in[start-1: end]
    return a
class Evaluate(Callback):
    """Custom evaluator: per-batch lr warmup/decay, per-epoch exact-match
    accuracy and best-weights checkpointing."""
    def __init__(self):
        self.ACC = []
        self.best = 0.
        self.passed = 0
    def on_batch_begin(self, batch, logs=None):
        """First epoch warms the lr up linearly; the second epoch decays it
        down to min_learning_rate."""
        if self.passed < self.params['steps']:
            lr = (self.passed + 1.) / self.params['steps'] * learning_rate
            K.set_value(self.model.optimizer.lr, lr)
            self.passed += 1
        elif self.params['steps'] <= self.passed < self.params['steps'] * 2:
            lr = (2 - (self.passed + 1.) / self.params['steps']) * (learning_rate - min_learning_rate)
            lr += min_learning_rate
            K.set_value(self.model.optimizer.lr, lr)
            self.passed += 1
    def on_epoch_end(self, epoch, logs=None):
        acc = self.evaluate()
        self.ACC.append(acc)
        if acc > self.best:
            self.best = acc
            train_model.save_weights('best_model.weights')
        print('acc: %.4f, best acc: %.4f\n' % (acc, self.best))
    def evaluate(self):
        # Exact-match accuracy of extract_entity over the dev split; the
        # (content + gold entity + prediction) strings are also dumped,
        # character-joined, to dev_pred.json.
        A = 1e-10
        F = open('dev_pred.json', 'w')
        for d in tqdm(iter(dev_list)):
            R = extract_entity(d[0])
            if R == d[1]:
                A += 1
            s = ', '.join(d[0] +d[1] + R)
            F.write(s + '\n')
        F.close()
        return A / len(dev_list)
# Train the span-extraction + sentiment model for two epochs
# (warmup epoch + decay epoch, see Evaluate.on_batch_begin).
evaluator = Evaluate()
train_D = train_data_generator(train_list)
train_model.fit_generator(train_D.__iter__(),
                          steps_per_epoch=len(train_D),
                          epochs=2,
                          callbacks=[evaluator]
                          )
测试
# Test-time entity extraction.
# Identical to the training-time extract_entity above, but it also returns
# the raw sentiment distribution (_ps3) and prints the pointer score
# vectors for inspection. NOTE(review): this redefinition shadows the
# earlier extract_entity, so code run after this cell gets the tuple return.
def extract_entity(text_in):
    text_in = text_in[:maxlen]
    _tokens = tokenizer.tokenize(text_in)
    _x1, _x2 = tokenizer.encode(first=text_in)
    _x1, _x2 = np.array([_x1]), np.array([_x2])
    _ps1, _ps2, _ps3 = build_model.predict([_x1, _x2])
    _ps1, _ps2 = softmax(_ps1[0]), softmax(_ps2[0])
    # Block disallowed single special characters from being the start token.
    for i, _t in enumerate(_tokens):
        if len(_t) == 1 and re.findall(u'[^\u4e00-\u9fa5a-zA-Z0-9\*]', _t) and _t not in additional_chars:
            _ps1[i] -= 10
    print(_ps1)
    start = _ps1.argmax()
    for end in range(start, len(_tokens)):
        _t = _tokens[end]
        if len(_t) == 1 and re.findall(u'[^\u4e00-\u9fa5a-zA-Z0-9\*]', _t) and _t not in additional_chars:
            break
    end = _ps2[start:end+1].argmax() + start
    print(_ps2)
    # [CLS] offset: token index start maps to character index start-1.
    a = text_in[start-1: end]
    return a,_ps3
# Load the best weights saved by the Evaluate callback.
build_model.load_weights('best_model.weights')
# Predict the entity and sentiment distribution for a single text.
# BUGFIX: the original called the undefined `extract_entity_test(build_model,
# text, 2)`; the function actually defined above is `extract_entity(text_in)`,
# which returns (entity, sentiment_distribution).
a, b = extract_entity('今天大乐透开奖了吗?')
以上就是实现的思路了,数据集需要自己做标注,情感分类需要自己加一个3分类的softmax头,在代码中标注的地方加上就好。
参考代码:https://github.com/wilsonlsm006/bert_ner.git