"""
NLP命名体识别bilstm+crf
1、准备数据:origin_handle_entities()
读取源数据文件,把人名,地名,机构名合并起来
2、读取处理后的数据:origin_handle_mark()
把预处理后的的文本标注成BMO的格式,
B(begin)、M(middle)、E(end)、O(other)
3、句子切分:sentence_split()
按照指定的格式,比如标点等内容对数据完成切分
4、保存数据
a.将标注的句子拆分自成列表和对应的标注序列
b.创建词汇表和标签
c.文本的向量化表示
d.划分训练集和测试集
e.保存成二进制pkl文件
5、加载数据
6、训练模型BiLSTM&HMM
7、保存训练后的模型用于预测
8、预测
"""
import codecs
import re
import collections
import pickle
from TorchCRF import CRF  # CRF layer from the TorchCRF package
import numpy as np
from tensorflow.keras.preprocessing.sequence import pad_sequences  # use TensorFlow's pad_sequences for padding (TensorFlow 2.3.1)
from sklearn.model_selection import train_test_split
##### Data cleaning #####
def origin_handle_entities():
    with open('renmin.txt', 'r', encoding='utf-8') as inp, \
            open('middle/renmin2.txt', 'w', encoding='utf-8') as outp:
        # read the raw corpus line by line
        for line in inp.readlines():
            # split on spaces
            line = line.split(' ')
            i = 1
            while i < len(line) - 1:
                if line[i][0] == '[':
                    # bracketed compound entity: concatenate the words and keep the tag after ']'
                    outp.write(line[i].split('/')[0][1:])
                    i += 1
                    while i < len(line) - 1 and line[i].find(']') == -1:
                        if line[i] != '':
                            outp.write(line[i].split('/')[0])
                        i += 1
                    outp.write(line[i].split('/')[0].strip() + '/' + line[i].split(']')[-1].strip() + ' ')
                elif line[i].split('/')[1] == 'nr':
                    # merge consecutive "/nr" tokens (surname + given name) into one person name
                    word = line[i].split('/')[0]
                    i += 1
                    if i < len(line) - 1 and line[i].split('/')[1] == 'nr':
                        outp.write(word + line[i].split('/')[0] + '/nr ')
                    else:
                        outp.write(word + '/nr ')
                        continue
                else:
                    outp.write(line[i] + '/no ')
                i += 1
            outp.write('\n')
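# Illustrative sketch of what origin_handle_entities() produces (assumed 1998
# People's Daily annotation style): a bracketed compound such as "[中国/ns 政府/n]nt"
# is merged into the single token "中国政府/nt", two consecutive "/nr" tokens
# (surname + given name) are merged into one person-name token, and every other
# token is copied through with its original tag.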
###### Data tagging ######
import codecs
def origin_handle_mark():
"""
1、读取数据预处理后的renmin2.txt
2、将标注好的数据写入renmin3.txt
a.打开输入和输出文件
b.遍历输入文件renmin2.txt
:return:
"""
with codecs.open('middle/renmin2.txt','r',encoding='utf-8') as inp,\
codecs.open('middle/renmin3.txt','w',encoding='utf-8') as outp:
        # iterate over renmin2.txt
for line in inp.readlines():
line = line.split(' ')
            # iterate over the tokens of each sentence
i = 0
while i < len(line) - 1:
                if line[i] == '':  # skip empty tokens
i += 1
continue
word = line[i].split('/')[0]
                # POS tag
tag = line[i].split('/')[1]
if tag == 'nr' or tag == 'ns' or tag == 'nt':
outp.write(word[0] + '/B_' + tag + ' ')
for j in word[1:len(word) -1]:
if j != ' ':
outp.write(j + '/M_' + tag + ' ')
outp.write(word[-1] + '/E_' + tag + ' ')
else:
for w in word:
outp.write(w + '/O' + ' ')
i += 1
outp.write('\n')
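# Example of the character-level B/M/E/O tagging this produces (illustrative):
# "江泽民/nr" becomes "江/B_nr 泽/M_nr 民/E_nr " and a non-entity token such as
# "发展/v" becomes "发/O 展/O ".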
######### Sentence splitting #########
import re
def sentence_split():
    with codecs.open('middle/renmin3.txt', 'r', encoding='utf-8') as inp, \
            codecs.open('middle/renmin4.txt', 'w', encoding='utf-8') as outp:
        texts = inp.read()
        # split into clauses at punctuation characters tagged "/O"
        sentences = re.split('[,。!?、''"":]/[O]', texts)
        for sentence in sentences:
            if sentence.strip() != '':
                outp.write(sentence.strip() + '\n')
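# Illustrative: in renmin3.txt punctuation characters are tagged "/O", so a line like
# "在/O 北/B_ns 京/E_ns ,/O 他/O 说/O" is split at ",/O" and each resulting clause is
# written on its own line of renmin4.txt.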
###### Save the data ######
def data_to_pkl():
"""
将文本数据保存成二进制pkl文件
:return:
"""
datas = []#数据
labels = []# 标签
all_words = []#词汇表
tags = set()#标签
input_data = codecs.open('middle/renmin4.txt','r',encoding='utf-8')
    # 1. Split each tagged sentence into a token list and a matching label list
for line in input_data.readlines():
linedata = list()
linelabel = list()
line = line.split()
numNotO = 0
for word in line:
word = word.split('/')
linedata.append(word[0])
linelabel.append(word[1])
all_words.append(word[0])
tags.add(word[1])
            if word[1] != 'O':  # count non-O labels
                numNotO += 1
        if numNotO != 0:  # keep only clauses that are not entirely tagged O
datas.append(linedata)
labels.append(linelabel)
input_data.close()
    # 2. Build the vocabulary and the tag dictionaries
"""
1、构建词汇表:语料库总所有不重复单词的数量
2、构建三个词典:{单词:频数} {单词:编号} {编号:单词}
3、把文本进行填充或者截断[pad]:
4、结合词汇表和词对文本数据进行向量化表示(数字)
pytorch、tensorflow、keras、paddle(Embedding)
"""
words_count = collections.Counter(all_words).most_common()
    # word2id: word -> id (ids start at 1; 0 is reserved for [PAD])
    word2id = {word: i for i, (word, _) in enumerate(words_count, 1)}
    word2id['[PAD]'] = 0
    word2id['[unknown]'] = len(word2id)
    # id2word: id -> word
id2word = {i:word for word,i in word2id.items()}
tag2id = {tag: i for i, tag in enumerate(tags)}
id2tag = {i:tag for tag,i in tag2id.items()}
    # 3. Vectorize the text and pad every sequence to the same length
    max_len = 60  # hyperparameter: maximum sequence length
    # token ids for each sentence
data_ids = [[word2id[w] for w in line]
for line in datas]
    # label ids for each sentence
labels_ids = [[tag2id[t] for t in line]
for line in labels]
x = pad_sequences(data_ids,maxlen=max_len,
padding='post').astype(np.int64)
y = pad_sequences(labels_ids,maxlen=max_len,
padding='post').astype(np.int64)
    print('text vectorization finished')
    # 4. Split the vectorized data into training, validation and test sets
    x_train, x_test, y_train, y_test = train_test_split(x, y,
                                                        test_size=0.2,
                                                        random_state=43)
    x_train, x_valid, y_train, y_valid = train_test_split(x_train, y_train,
                                                          test_size=0.2,
                                                          random_state=43)
    print(len(x_valid))
    # 5. Save the data
with open('../data_target_pkl/renmindata.pkl','wb') as outp:
        # dictionaries
pickle.dump(word2id,outp)
pickle.dump(id2word,outp)
pickle.dump(tag2id,outp)
pickle.dump(id2tag,outp)
        # train / test / validation arrays
pickle.dump(x_train,outp)
pickle.dump(y_train,outp)
pickle.dump(x_test,outp)
pickle.dump(y_test,outp)
pickle.dump(x_valid,outp)
pickle.dump(y_valid,outp)
    with open('../data_target_pkl/vocab.pkl', 'wb') as outp:
pickle.dump(word2id, outp)
pickle.dump(id2word, outp)
    with open('../data_target_pkl/tags.pkl', 'wb') as outp1:
pickle.dump(tag2id, outp1)
pickle.dump(id2tag, outp1)
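# Minimal sketch of the vocabulary and padding steps above on toy data
# (illustration only, not part of the pipeline):
#   all_words = ['中', '国', '中']  ->  words_count = [('中', 2), ('国', 1)]
#   word2id   = {'中': 1, '国': 2, '[PAD]': 0, '[unknown]': 3}
#   pad_sequences([[1, 2], [1]], maxlen=4, padding='post')
#       -> [[1, 2, 0, 0],
#           [1, 0, 0, 0]]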
def main():
    # data cleaning
    origin_handle_entities()
    # character-level tagging
    origin_handle_mark()
    # sentence splitting
    sentence_split()
    # convert and save as pkl
    data_to_pkl()
if __name__ == '__main__':
main()
##################################################################################################
#### Load the data ####
def load_data():
    pickle_path = '../data_target_pkl/renmindata.pkl'
    with open(pickle_path, 'rb') as inp:
        # the objects were dumped one after another, so load them back in the same order
        word2id, id2word, tag2id, id2tag, x_train, y_train, x_test, y_test, x_valid, y_valid = \
            [pickle.load(inp) for _ in range(10)]
    return word2id, id2word, tag2id, id2tag, x_train, y_train, x_test, y_test, x_valid, y_valid
def main():
    word2id = load_data()[0]
print(len(word2id))
if __name__ == '__main__':
main()
#######################################################################################
#bilstm_crf_model.py
import torch
import torch.nn as nn
from torch.utils.data import Dataset  # Dataset wrapper for batch loading
# NER dataset class (wraps the loaded arrays)
class NERDataSet(Dataset):
"""
X:表示样本,Y:表示标签
"""
def __init__(self,X,Y,*args,**kwargs):
"""
:param X: 样本
:param Y: 标签
:param args: 任意数;任意数量参数
:param kwargs: 任意数;任意数量参数
"""
self.data = [{'x':X[i],'y':Y[i]}
for i in range(X.shape[0])]
# 返回对应数据的索引,单词:编号
def __getitem__(self, index):
return self.data[index]
# 样本数据的个数
def __len__(self):
return len(self.data)
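# Note (assumption about default collation): each NERDataSet item is a dict
# {'x': int64 array of length max_len, 'y': int64 array of length max_len}, so
# DataLoader's default collate_fn stacks a batch into
# {'x': LongTensor(batch_size, max_len), 'y': LongTensor(batch_size, max_len)},
# which is what the training loop below indexes with batch['x'] / batch['y'].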
# hyperparameters and configuration
class Config():
    embedding_dim = 100  # word-vector dimension
    hidden_dim = 200
    word2id, _, tag2id, *_ = load_data()
    vocab_size = len(word2id)
    num_tags = len(tag2id)
    dropout = 0.2
    lr = 0.001
    weight_decay = 1e-5
config = Config()
# Build the model (BiLSTM + CRF)
class NERLSTM_CRF(nn.Module):
"""
1、输入层
2、词映射(Embedding(vocab_size,embedding_dim))
3、LSTM
4、全连接层
"""
def __init__(self):
super(NERLSTM_CRF,self).__init__()
self.embeding_dim = config.embeding_dim
self.hidden_dim = config.hidden_dim
self.vocab_size = config.vocab_size
self.num_tags = config.num_tags
#将处理后的数据对应单词的编号换成词向量
self.embeds = nn.Embedding(
self.vocab_size,
self.embeding_dim
)
self.dropout = nn.Dropout(config.dropout)
#lstm bidirectional 双向LSTM
self.lstm = nn.LSTM(
self.embeding_dim,
self.hidden_dim//2,#双向
num_layers=1,
bidrectional=True,
batch_first=True,#设置属性值,保持数据格式
)
#全连接
self.linear =nn.Linear(self.hidden_dim,
self.num_tags)
#CRF
self.crf = CRF(self.num_tags)
#向前计算
def forword(self,x,mask):
embeddings = self.embeds(x) # 词映射
feats,hidden = self.lstm(embeddings)
emissions = self.linear(self.dropout(feats))
#viterbi_decode预测和标记进行比对解码
outputs = self.crf.viterbi_decode(emissions,mask)
return outputs
#反向传播
def log_likelihood(self,x,labels,mask):
embeddings = self.embeds(x) # 词映射
feats, hidden = self.lstm(embeddings)
emissions = self.linear(self.dropout(feats))#LSTM
loss = -self.crf.forward(emissions,labels,mask)#全连接
return torch.sum(loss)
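# Shape sketch for one batch (B = batch size, L = sequence length; TorchCRF API assumed):
#   x          : (B, L)    token ids
#   embeddings : (B, L, embedding_dim)
#   feats      : (B, L, hidden_dim)   BiLSTM output, hidden_dim // 2 per direction
#   emissions  : (B, L, num_tags)
#   crf.viterbi_decode(emissions, mask) -> B best tag-id paths (padding masked out)
#   crf.forward(emissions, labels, mask) -> per-sequence log-likelihood, so the model's
#   loss is the summed negative log-likelihood of the gold tag paths.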
##################################################
from torch.utils.data import DataLoader  # batch data loading
import torch
import torch.optim as op
# helper that builds the data loaders, model and optimizer for training
def utils_to_train():
    device = torch.device('cpu')
    max_epoch = 1
    batch_size = 32
    num_workers = 4  # number of worker processes for data loading
    word2id, id2word, tag2id, id2tag, x_train, y_train, x_test, y_test, x_valid, y_valid = load_data()
    # training set
train_data = NERDataSet(x_train,y_train)
    # validation set
valid_data = NERDataSet(x_valid,y_valid)
    # test set
test_data = NERDataSet(x_test,y_test)
    # batch data loaders
train_data_loader = DataLoader(
train_data,
batch_size=batch_size,
shuffle=True,
num_workers=num_workers
)
valid_data_loader = DataLoader(
valid_data,
batch_size=batch_size,
shuffle=True,
num_workers=num_workers
)
test_data_loader = DataLoader(
test_data,
batch_size=batch_size,
shuffle=True,
num_workers=num_workers
)
config =Config()
model = NERLSTM_CRF(config).to(device)
optimizer = op.Adam(
model.parameters(),
lr = config.lr,
weight_decay=config.weight_decay
)
return max_epoch,device,train_data_loader,valid_data_loader,test_data_loader,optimizer,model
# decode predicted tag ids: merge consecutive characters into entity words
def parse_tags(text, path):
    id2tag = load_data()[3]
    tags = [id2tag[idx] for idx in path]
    begin = 0
    res = []
    for idx, tag in enumerate(tags):
        # join consecutive characters of the same entity type
        if tag.startswith('B'):
            begin = idx
        elif tag.startswith('E'):
            end = idx
            word = text[begin:end + 1]
            label = tag[2:]
            res.append((word, label))
        elif tag == 'O':
            res.append((text[idx], tag))
    return res
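# Usage sketch (hypothetical path values): if the decoded tags for the text "江泽民说"
# are ['B_nr', 'M_nr', 'E_nr', 'O'], parse_tags returns [('江泽民', 'nr'), ('说', 'O')].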
##################################################
# train.py
from sklearn.metrics import classification_report,precision_score,recall_score,f1_score
word2id = load_data()[0]
max_epoch, device, train_data_loader, valid_data_loader, test_data_loader, optimizer, model = utils_to_train()
# Chinese named-entity recognition
class ChineseNER(object):
def train(self):
for epoch in range(max_epoch):
            # training mode
            model.train()
            for index, batch in enumerate(train_data_loader):
                # reset gradients (optimizer comes from utils_to_train() above)
                optimizer.zero_grad()
                # move the training batch to the device (CPU)
x = batch['x'].to(device)
mask = (x>0).to(device)
y = batch['y'].to(device)
                # forward pass: CRF negative log-likelihood loss
loss = model.log_likelihood(x,y,mask)
                # backpropagation
loss.backward()
                # gradient clipping
                torch.nn.utils.clip_grad_norm_(parameters=model.parameters(),
                                               max_norm=10)
                # update parameters
optimizer.step()
if index % 200 == 0:
print('epoch:%5d,-----loss:%f'%(epoch,loss.item()))
            # validation loss and metrics
aver_loss = 0
preds, labels = [],[]
for index,batch in enumerate(valid_data_loader):
                # evaluation mode
                model.eval()
                # move the validation batch to the device (CPU)
                val_x = batch['x'].to(device)
                val_y = batch['y'].to(device)
                val_mask = (val_x > 0).to(device)
                predict = model(val_x, val_mask)
                # forward pass: validation loss
                loss = model.log_likelihood(val_x, val_y, val_mask)
aver_loss += loss.item()
                # count non-zero label ids, i.e. the true (unpadded) length of each sequence
leng = []
for i in val_y.cpu():
tmp = []
for j in i:
if j.item() >0:
tmp.append(j.item())
leng.append(tmp)
for index,i in enumerate(predict):
preds += i[:len(leng[index])]
for index,i in enumerate(val_y.tolist()):
labels +=i[:len(leng[index])]
            # loss and evaluation metrics
aver_loss /= (len(valid_data_loader) * 64)
precision = precision_score(labels,preds,average='macro')
recall = recall_score(labels,preds,average='macro')
f1 = f1_score(labels,preds,average='macro')
report = classification_report(labels,preds)
print(report)
torch.save(model.state_dict(),'params.pkl')
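# Step 8 of the pipeline (prediction) is listed in the module docstring but not
# implemented above. A minimal sketch, assuming the trained weights in 'params.pkl'
# and the vocabulary/tag maps saved by data_to_pkl(); 'predict' is a new helper,
# not part of the original code.
def predict(text):
    word2id = load_data()[0]
    model.load_state_dict(torch.load('params.pkl'))
    model.eval()
    # map characters to ids; unknown characters fall back to '[unknown]'
    x = torch.LongTensor([[word2id.get(ch, word2id['[unknown]']) for ch in text]]).to(device)
    mask = (x > 0).to(device)
    with torch.no_grad():
        paths = model(x, mask)  # forward() returns one Viterbi tag-id path per sentence
    return parse_tags(text, paths[0])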
"""
NLP命名体识别bilstm+crf
1、准备数据:origin_handle_entities()
读取源数据文件,把人名,地名,机构名合并起来
2、读取处理后的数据:origin_handle_mark()
把预处理后的的文本标注成BMO的格式,
B(begin)、M(middle)、E(end)、O(other)
3、句子切分:sentence_split()
按照指定的格式,比如标点等内容对数据完成切分
4、保存数据
a.将标注的句子拆分自成列表和对应的标注序列
b.创建词汇表和标签
c.文本的向量化表示
d.划分训练集和测试集
e.保存成二进制pkl文件
5、加载数据
6、训练模型BiLSTM&HMM
7、保存训练后的模型用于预测
8、预测
"""
import codecs
import re
import collections
import pickle
import TorchCRF as CRF
import numpy as np
from tensorflow.keras.preprocessing.sequence import pad_sequences #使用tensorflow的pad_sequences进行数据对齐 tensorflow2.3.1
from sklearn.model_selection import train_test_split
数据清洗######
def origin_handle_entities():
with open('renmin.txt','r',encoding='utf-8') as inp,
open('middle/renmin2.txt','w',encoding='utf-8')
as outp:
#读取源文件中的数据
for line in inp.readlines():
#按照空格切分
line = line.split(' ')
i = 1
while i < len(line) - 1:
if line[i][0] == '[':
outp.write(line[i].split('/')[0][1:])
i += 1
while i < len(line) - 1 and line[i].find(']') == -1:
if line[i] !='':
#print(line[i].split('/')[0])
outp.write(line[i].split('/')[0])
i += 1
outp.write(line[i].split('/')[0].strip()+'/'+line[i])
elif line[i].split('/')[1] == 'nr':
word = line[i].split('/')[0]
i += 1
if i < len(line) - 1 and line[i].split('/')[1] == 'nr':
outp.write(word + line[i].split('/')[0] + 'nr')
else:
outp.write(word + '/nr ')
continue
else:
outp.write(line[i] + '/no ')
i += 1
outp.write('\n')
数据的标注########
import codecs
def origin_handle_mark():
"""
1、读取数据预处理后的renmin2.txt
2、将标注好的数据写入renmin3.txt
a.打开输入和输出文件
b.遍历输入文件renmin2.txt
:return:
"""
with codecs.open('middle/renmin2.txt','r',encoding='utf-8') as inp,
codecs.open('middle/renmin3.txt','w',encoding='utf-8') as outp:
#遍历renmin2.txt
for line in inp.readlines():
line = line.split(' ')
#遍历每个句子
i = 0
while i < len(line) - 1:
if line[i] == '':#跳过空字符
i += 1
continue
word = line[i].split('/')[0]
#标签
tag = line[i].split('/')[1]
if tag == 'nr' or tag == 'ns' or tag == 'nt':
outp.write(word[0] + '/B_' + tag + ' ')
for j in word[1:len(word) -1]:
if j != ' ':
outp.write(j + '/M_' + tag + ' ')
outp.write(word[-1] + '/E_' + tag + ' ')
else:
for w in word:
outp.write(w + '/O' + ' ')
i += 1
outp.write('\n')
#########句子切分###################################
import re
def sentence_split():
with codecs.open('middel/renmin3.txt','r',encoding='utf-8') as inp,
codecs.open('middle/renmin4.txt','w',encoding='utf-8') as outp:
#文本文件的内容设置为对应的utf-8编码,python3:先encode,再decode
texts = inp.read().encode('utf-8').decode('utf-8')
#切分句子
sentences =
re.split('[,。!?、''"":]/[0]'.encode('utf-8').decode('utf-8'),
texts)
for sentence in sentences:
if sentence != ' ':
outp.write(sentence.strip() + '\n')
保存数据###################
def data_to_pkl():
"""
将文本数据保存成二进制pkl文件
:return:
"""
datas = []#数据
labels = []# 标签
all_words = []#词汇表
tags = set()#标签
input_data = codecs.open('middle/renmin4.txt','r',encoding='utf-8')
# 1.将标注的句子拆分成列表和对应的标注列表
for line in input_data.readlines():
linedata = list()
linelabel = list()
line = line.split()
numNotO = 0
for word in line:
word = word.split('/')
linedata.append(word[0])
linelabel.append(word[1])
all_words.append(word[0])
tags.add(word[1])
if word[1] != 'O': #标注全为O的子句
numNotO += 1
if numNotO != 0: # 只保存标注不全为O的子句
datas.append(linedata)
labels.append(linelabel)
input_data.close()
# 2、创建词汇表和标签
"""
1、构建词汇表:语料库总所有不重复单词的数量
2、构建三个词典:{单词:频数} {单词:编号} {编号:单词}
3、把文本进行填充或者截断[pad]:
4、结合词汇表和词对文本数据进行向量化表示(数字)
pytorch、tensorflow、keras、paddle(Embedding)
"""
words_count = collections.Counter(all_words).most_common()
# word2id:单词:编号
word2id = {word: i for i ,(word, _) in enumerate(words_count, 1)}
word['[PAD]'] = 0
word2id['[unknown]'] = len(word2id) # 100000000
#id2word:编号:单词
id2word = {i:word for word,i in word2id.items()}
tag2id = {tag: i for i, tag in enumerate(tags)}
id2tag = {i:tag for tag,i in tag2id.items()}
# 3、文本向量化,并处理成相同长度
max_len = 60 # 超参数
#每个句子对应的ID编号
data_ids = [[word2id[w] for w in line]
for line in datas]
# 标签对应的编号信息
labels_ids = [[tag2id[t] for t in line]
for line in labels]
x = pad_sequences(data_ids,maxlen=max_len,
padding='post').astype(np.int64)
y = pad_sequences(labels_ids,maxlen=max_len,
padding='post').astype(np.int64)
print('文本向量化完成')
# 4、将向量化后的数据拆分成训练集,验证集,测试集
x_train,x_test,y_train,y_test = train_test_split(x,y,
test_size=0.2,
random_state=43)
x_train, x_valid, y_train, y_valid = train_test_split(x, y,
test_size=0.2,
random_state=43)
print(len(x_valid))
# 5、保存数据
with open('../data_target_pkl/renmindata.pkl','wb') as outp:
#原始数据
pickle.dump(word2id,outp)
pickle.dump(id2word,outp)
pickle.dump(tag2id,outp)
pickle.dump(id2tag,outp)
#训练数据
pickle.dump(x_train,outp)
pickle.dump(y_train,outp)
pickle.dump(x_test,outp)
pickle.dump(y_test,outp)
pickle.dump(x_valid,outp)
pickle.dump(y_valid,outp)
with open('../data_target_pkl/vocab.pkl') as outp:
pickle.dump(word2id, outp)
pickle.dump(id2word, outp)
with open('../data_target_pkl/tags.pkl') as outp1:
pickle.dump(tag2id, outp1)
pickle.dump(id2tag, outp1)
def main():
# 数据清洗
origin_handle_entities()
#数据标注(字)
origin_handle_mark()
# 句子切分
sentence_split()
# 数据转换
data_to_pkl()
if name == 'main':
main()
##################################################################################################
加载数据########
def load_data():
pickle_path = '../data_target_pkl/renmindata.pkl'
with open(pickle_path,'rb') as inp:
word2id,id2word,tag2id,id2tag,x_train,y_train,x_test,y_test,x_valid,y_valid =pickle.load(inp)
return word2id,id2word,tag2id,id2tag,x_train,y_train,x_test,y_test,x_valid,y_valid
def main():
word2id = load_data()
print(len(word2id))
if name == 'main':
main()
#######################################################################################
bilstm_crf_model.py
import torch
import torch.nn as nn
from torch.utils.data import Dataset # 批量读取数据
命名识别类(加载数据)
class NERDataSet(Dataset):
"""
X:表示样本,Y:表示标签
"""
def init(self,X,Y,args,*kwargs):
"""
:param X: 样本
:param Y: 标签
:param args: 任意数;任意数量参数
:param kwargs: 任意数;任意数量参数
"""
self.data = [{'x':X[i],'y':Y[i]}
for i in range(X.shape[0])]
# 返回对应数据的索引,单词:编号
def __getitem__(self, index):
return self.data[index]
# 样本数据的个数
def __len__(self):
return len(self.data)
参数
class Config():
embedding_dim = 100 #词向量的维度
hidden_dim = 200
word2id,tag2id = load_data()
vocab_size = len(word2id)
num_tags = len(tag2id)
dropout = 0.2
lr = 0.001
weight_decay = 1e-5
config = Config()
构建模型(Bilstm + CRF)
class NERLSTM_CRF(nn.Module):
"""
1、输入层
2、词映射(Embedding(vocab_size,embedding_dim))
3、LSTM
4、全连接层
"""
def init(self):
super(NERLSTM_CRF,self).init()
self.embeding_dim = config.embeding_dim
self.hidden_dim = config.hidden_dim
self.vocab_size = config.vocab_size
self.num_tags = config.num_tags
#将处理后的数据对应单词的编号换成词向量
self.embeds = nn.Embedding(
self.vocab_size,
self.embeding_dim
)
self.dropout = nn.Dropout(config.dropout)
#lstm bidirectional 双向LSTM
self.lstm = nn.LSTM(
self.embeding_dim,
self.hidden_dim//2,#双向
num_layers=1,
bidrectional=True,
batch_first=True,#设置属性值,保持数据格式
)
#全连接
self.linear =nn.Linear(self.hidden_dim,
self.num_tags)
#CRF
self.crf = CRF(self.num_tags)
#向前计算
def forword(self,x,mask):
embeddings = self.embeds(x) # 词映射
feats,hidden = self.lstm(embeddings)
emissions = self.linear(self.dropout(feats))
#viterbi_decode预测和标记进行比对解码
outputs = self.crf.viterbi_decode(emissions,mask)
return outputs
#反向传播
def log_likelihood(self,x,labels,mask):
embeddings = self.embeds(x) # 词映射
feats, hidden = self.lstm(embeddings)
emissions = self.linear(self.dropout(feats))#LSTM
loss = -self.crf.forward(emissions,labels,mask)#全连接
return torch.sum(loss)
##################################################
from torch.utils.data import DataLoader #批量加载数据
import torch
import torch.optim as op
模型训练的帮助函数
def utils_to_train():
device = torch.device('cpu')
max_epoch = 1
batch_size = 32
num_workers =4 #开启几个线程取执行程序
x_train,y_train,x_valid,y_valid,x_test,y_test = load_data()
# 训练集
train_data = NERDataSet(x_train,y_train)
# 验证集
valid_data = NERDataSet(x_valid,y_valid)
# 测试集
test_data = NERDataSet(x_test,y_test)
#批量加载数据
train_data_loader = DataLoader(
train_data,
batch_size=batch_size,
shuffle=True,
num_workers=num_workers
)
valid_data_loader = DataLoader(
valid_data,
batch_size=batch_size,
shuffle=True,
num_workers=num_workers
)
test_data_loader = DataLoader(
test_data,
batch_size=batch_size,
shuffle=True,
num_workers=num_workers
)
config =Config()
model = NERLSTM_CRF(config).to(device)
optimizer = op.Adam(
model.parameters(),
lr = config.lr,
weight_decay=config.weight_decay
)
return max_epoch,device,train_data_loader,valid_data_loader,test_data_loader,optimizer,model
用于将实体类别解码,单字组合成单词
def parse_tags(text,path):
id2tag = load_data()
tags = [id2tag[idx] for idx in path]
begin = 0
res = []
for idx,tag in enumerate(tags):
#将连续的同类型的字连接
if tag.startwith('B'):
begin = idx
elif tag.startwith('E'):
end =idx
word = text[begin:end+1]
label = tag[2:]
res.append((word,label))
elif tag == 'O':
res.append((text[idx],tag))
return res
##################################################
train.py
from sklearn.metrics import classification_report,precision_score,recall_score,f1_score
word2id = load_data()[0]
max_epoch,device,train_data_loader,valid_data_loader,test_data_loader,model = utils_to_train()
中文命名体识别
class ChineseNER(object):
def train(self):
for epoch in range(max_epoch):
#训练模型
model.train()
for index,batch in enumerate(train_data_loader):
#梯度归零
optimizer = utils_to_train()
optimizer.zero_grad()
# 训练数据---cpu
x = batch['x'].to(device)
mask = (x>0).to(device)
y = batch['y'].to(device)
#前向计算损失
loss = model.log_likelihood(x,y,mask)
#反向传播
loss.backward()
#梯度裁剪
torch.nn.utils.clip_grad_norm(parameters=model.parameters(),
max_norm=10)
#更新参数
optimizer.step()
if index % 200 == 0:
print('epoch:%5d,-----loss:%f'%(epoch,loss.item()))
#验证损失和精度
aver_loss = 0
preds, labels = [],[]
for index,batch in enumerate(valid_data_loader):
#验证模式
model.eval()
#验证数据--->cpu
val_x,val_y = batch['x'].to(device)
val_mask = (val_x > 0).to(device)
predict = model(val_x,val_mask)
#前向计算损失
loss = model.log_likelihood(val_x,val_y)
aver_loss += loss.item()
#统计非0的,也就是真实标签的长度
leng = []
for i in val_y.cpu():
tmp = []
for j in i:
if j.item() >0:
tmp.append(j.item())
leng.append(tmp)
for index,i in enumerate(predict):
preds += i[:len(leng[index])]
for index,i in enumerate(val_y.tolist()):
labels +=i[:len(leng[index])]
#损失值与评测指标
aver_loss /= (len(valid_data_loader) * 64)
precision = precision_score(labels,preds,average='macro')
recall = recall_score(labels,preds,average='macro')
f1 = f1_score(labels,preds,average='macro')
report = classification_report(labels,preds)
print(report)
torch.save(model.state_dict(),'params.pkl')