NLP named entity recognition with BiLSTM + CRF

"""
NLP命名体识别bilstm+crf
1、准备数据:origin_handle_entities()
读取源数据文件,把人名,地名,机构名合并起来

2、读取处理后的数据:origin_handle_mark()
把预处理后的的文本标注成BMO的格式,
B(begin)、M(middle)、E(end)、O(other)

3、句子切分:sentence_split()
按照指定的格式,比如标点等内容对数据完成切分
4、保存数据
    a.将标注的句子拆分自成列表和对应的标注序列
    b.创建词汇表和标签
    c.文本的向量化表示
    d.划分训练集和测试集
    e.保存成二进制pkl文件
5、加载数据
6、训练模型BiLSTM&HMM
7、保存训练后的模型用于预测
8、预测
"""
import codecs
import re
import collections
import pickle

from TorchCRF import CRF   # CRF layer from the TorchCRF package
import numpy as np

from tensorflow.keras.preprocessing.sequence import pad_sequences     # pad/truncate sequences to a fixed length (tensorflow 2.3.1)
from sklearn.model_selection import train_test_split

##### Data cleaning ######
def origin_handle_entities():
    with open('renmin.txt','r',encoding='utf-8') as inp,\
        open('middle/renmin2.txt','w',encoding='utf-8')\
            as outp:
        # read the raw corpus line by line
        for line in inp.readlines():
            # split on spaces
            line = line.split(' ')
            i = 1
            while i < len(line) - 1:
                if line[i][0] == '[':
                    # bracketed compound entity: merge the inner words into one token
                    outp.write(line[i].split('/')[0][1:])
                    i += 1
                    while i < len(line) - 1 and line[i].find(']') == -1:
                        if line[i] !='':
                            outp.write(line[i].split('/')[0])
                        i += 1
                    # keep the tag that follows the closing bracket
                    outp.write(line[i].split('/')[0].strip() + '/' + line[i].split(']')[1].strip() + ' ')
                elif line[i].split('/')[1] == 'nr':
                    # merge consecutive nr tokens (surname + given name) into one person name
                    word = line[i].split('/')[0]
                    i += 1
                    if i < len(line) - 1 and line[i].split('/')[1] == 'nr':
                        outp.write(word + line[i].split('/')[0] + '/nr ')
                    else:
                        outp.write(word + '/nr ')
                        continue
                else:
                    outp.write(line[i] + '/no ')
                i += 1
            outp.write('\n')
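
# Illustrative only (hypothetical corpus line, not from the original post): given
#   '19980101-01-001-001/m 江/nr 泽民/nr 在/p [中共中央/nt 总书记/n]nt ...'
# the function above writes
#   '江泽民/nr 在/p/no 中共中央总书记/nt ...'
# to middle/renmin2.txt: consecutive nr tokens are joined into one person name and a
# bracketed compound keeps the tag that follows ']'.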



###### Data tagging ########

import codecs
def origin_handle_mark():
    """
    1. Read the cleaned file renmin2.txt
    2. Write the character-level tags to renmin3.txt
    a. Open the input and output files
    b. Iterate over renmin2.txt
    :return:
    """
    with codecs.open('middle/renmin2.txt','r',encoding='utf-8') as inp,\
            codecs.open('middle/renmin3.txt','w',encoding='utf-8') as outp:

        # iterate over renmin2.txt
        for line in inp.readlines():
            line = line.split(' ')
            # iterate over the tokens of each sentence
            i = 0
            while i < len(line) - 1:
                if line[i] == '':  # skip empty tokens
                    i += 1
                    continue
                word = line[i].split('/')[0]
                # part-of-speech / entity tag
                tag = line[i].split('/')[1]
                if tag == 'nr' or tag == 'ns' or tag == 'nt':
                    # entity word: first char B_, middle chars M_, last char E_
                    outp.write(word[0] + '/B_' + tag + ' ')
                    for j in word[1:len(word) -1]:
                        if j != ' ':
                            outp.write(j + '/M_' + tag + ' ')
                    outp.write(word[-1] + '/E_' + tag + ' ')
                else:
                    # non-entity word: every character is tagged O
                    for w in word:
                        outp.write(w + '/O' + ' ')
                i += 1

            outp.write('\n')
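
# Illustrative only: after this step a merged entity such as '江泽民/nr' appears in
# middle/renmin3.txt as character-level tags
#   江/B_nr 泽/M_nr 民/E_nr
# while a non-entity token such as '讲话/n/no' becomes
#   讲/O 话/O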


######### Sentence splitting ###################################
import re
def sentence_split():
    with codecs.open('middle/renmin3.txt','r',encoding='utf-8') as inp,\
            codecs.open('middle/renmin4.txt','w',encoding='utf-8') as outp:
        # codecs.open already decodes the file as utf-8
        texts = inp.read()
        # split into clauses at punctuation characters tagged O
        sentences = re.split('[,。!?、‘’“”:]/[O]', texts)
        for sentence in sentences:
            if sentence.strip() != '':
                outp.write(sentence.strip() + '\n')
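
# Illustrative only: re.split above cuts the character-tagged text at punctuation marks
# tagged O, so a stretch such as
#   '江/B_nr 泽/M_nr 民/E_nr 讲/O 话/O ,/O 内/O 容/O ...'
# ends up in middle/renmin4.txt as two clauses:
#   '江/B_nr 泽/M_nr 民/E_nr 讲/O 话/O'
#   '内/O 容/O ...'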




###### Save the data ###################
def data_to_pkl():
    """
    Save the processed text data as binary pkl files.
    :return:
    """

    datas = []      # character sequences
    labels = []     # tag sequences

    all_words = []  # vocabulary (all characters)
    tags = set()    # tag set
    input_data = codecs.open('middle/renmin4.txt','r',encoding='utf-8')
    # 1. Split each tagged sentence into a character list and the matching tag list
    for line in input_data.readlines():
        linedata = list()
        linelabel = list()

        line = line.split()

        numNotO = 0
        for word in line:
            word = word.split('/')
            linedata.append(word[0])
            linelabel.append(word[1])

            all_words.append(word[0])
            tags.add(word[1])

            if word[1] != 'O':  # count the non-O tags in this clause
                numNotO += 1

        if numNotO != 0:  # keep only clauses that contain at least one entity tag
            datas.append(linedata)
            labels.append(linelabel)

    input_data.close()

    # 2. Build the vocabulary and the tag dictionaries
    """
    1. Build the vocabulary: all distinct characters in the corpus
    2. Build the lookup dictionaries: {word: count} {word: id} {id: word}
    3. Pad or truncate every sequence to the same length ([PAD])
    4. Vectorize the text with the vocabulary (integer ids)
       pytorch / tensorflow / keras / paddle (Embedding)
    """
    words_count = collections.Counter(all_words).most_common()
    # word2id: word -> id (0 is reserved for padding)
    word2id = {word: i for i, (word, _) in enumerate(words_count, 1)}
    word2id['[PAD]'] = 0
    word2id['[unknown]'] = len(word2id)
    # id2word: id -> word
    id2word = {i: word for word, i in word2id.items()}

    # tag2id / id2tag: 0 is likewise reserved for padding so that padded label
    # positions can be told apart from real tags downstream
    tag2id = {tag: i for i, tag in enumerate(sorted(tags), 1)}
    tag2id['[PAD]'] = 0

    id2tag = {i: tag for tag, i in tag2id.items()}

    # 3. Vectorize the text and pad every sequence to the same length
    max_len = 60  # hyperparameter
    # id sequence for each sentence
    data_ids = [[word2id[w] for w in line]
                for line in datas]
    # id sequence for each tag sequence
    labels_ids = [[tag2id[t] for t in line]
                  for line in labels]
    x = pad_sequences(data_ids, maxlen=max_len,
                      padding='post').astype(np.int64)
    y = pad_sequences(labels_ids, maxlen=max_len,
                      padding='post').astype(np.int64)
    print('text vectorization finished')

    # 4. Split the vectorized data into training, validation and test sets
    x_train, x_test, y_train, y_test = train_test_split(x, y,
                                                        test_size=0.2,
                                                        random_state=43)
    # carve the validation set out of the remaining training data
    x_train, x_valid, y_train, y_valid = train_test_split(x_train, y_train,
                                                          test_size=0.2,
                                                          random_state=43)
    print(len(x_valid))

    # 5. Save the data
    with open('../data_target_pkl/renmindata.pkl','wb') as outp:
        # vocabulary and tag dictionaries
        pickle.dump(word2id,outp)
        pickle.dump(id2word,outp)
        pickle.dump(tag2id,outp)
        pickle.dump(id2tag,outp)

        # train / test / validation splits
        pickle.dump(x_train,outp)
        pickle.dump(y_train,outp)
        pickle.dump(x_test,outp)
        pickle.dump(y_test,outp)
        pickle.dump(x_valid,outp)
        pickle.dump(y_valid,outp)

    with open('../data_target_pkl/vocab.pkl','wb') as outp:
        pickle.dump(word2id, outp)
        pickle.dump(id2word, outp)
    with open('../data_target_pkl/tags.pkl','wb') as outp1:
        pickle.dump(tag2id, outp1)
        pickle.dump(id2tag, outp1)


def main():
    # data cleaning
    origin_handle_entities()
    # character-level tagging
    origin_handle_mark()
    # sentence splitting
    sentence_split()
    # save as pkl
    data_to_pkl()


if __name__ == '__main__':
    main()

##################################################################################################

#### Load the data ########
def load_data():
    pickle_path = '../data_target_pkl/renmindata.pkl'
    with open(pickle_path,'rb') as inp:
        # objects must be read back in the same order they were dumped
        word2id = pickle.load(inp)
        id2word = pickle.load(inp)
        tag2id = pickle.load(inp)
        id2tag = pickle.load(inp)
        x_train = pickle.load(inp)
        y_train = pickle.load(inp)
        x_test = pickle.load(inp)
        y_test = pickle.load(inp)
        x_valid = pickle.load(inp)
        y_valid = pickle.load(inp)

    return word2id,id2word,tag2id,id2tag,x_train,y_train,x_test,y_test,x_valid,y_valid


def main():
    word2id = load_data()[0]
    print(len(word2id))

if __name__ == '__main__':
    main()



#######################################################################################
#bilstm_crf_model.py

import torch
import torch.nn as nn
from torch.utils.data import Dataset  # dataset wrapper for batched loading

# NER dataset class (wraps the loaded arrays)
class NERDataSet(Dataset):
    """
    X: samples, Y: labels
    """
    def __init__(self,X,Y,*args,**kwargs):
        """
        :param X: samples
        :param Y: labels
        :param args: extra positional arguments
        :param kwargs: extra keyword arguments
        """
        self.data = [{'x':X[i],'y':Y[i]}
                     for i in range(X.shape[0])]

    # return the sample at the given index
    def __getitem__(self, index):
        return self.data[index]
    # number of samples
    def __len__(self):
        return len(self.data)

# hyperparameters
class Config():
    embedding_dim = 100  # dimensionality of the character embeddings
    hidden_dim = 200

    # only the dictionaries are needed here; load_data() also returns the data splits
    word2id, id2word, tag2id, id2tag = load_data()[:4]
    vocab_size = len(word2id)
    num_tags = len(tag2id)

    dropout = 0.2
    lr = 0.001
    weight_decay = 1e-5

config = Config()



# Build the model (BiLSTM + CRF)
class NERLSTM_CRF(nn.Module):
    """
    1. Input layer
    2. Embedding (Embedding(vocab_size, embedding_dim))
    3. BiLSTM
    4. Linear layer producing the CRF emission scores
    """
    def __init__(self, config):
        super(NERLSTM_CRF,self).__init__()
        self.embedding_dim = config.embedding_dim
        self.hidden_dim = config.hidden_dim
        self.vocab_size = config.vocab_size
        self.num_tags = config.num_tags

        # map character ids to embedding vectors
        self.embeds = nn.Embedding(
            self.vocab_size,
            self.embedding_dim
        )
        self.dropout = nn.Dropout(config.dropout)

        # bidirectional LSTM
        self.lstm = nn.LSTM(
            self.embedding_dim,
            self.hidden_dim // 2,  # halved per direction so the concatenated output is hidden_dim
            num_layers=1,
            bidirectional=True,
            batch_first=True,  # input/output tensors are (batch, seq, feature)
        )
        # linear layer: LSTM features -> tag emission scores
        self.linear = nn.Linear(self.hidden_dim,
                                self.num_tags)
        # CRF layer
        self.crf = CRF(self.num_tags)

    # forward pass: decode the best tag path
    def forward(self,x,mask):
        embeddings = self.embeds(x)  # embedding lookup
        feats, hidden = self.lstm(embeddings)
        emissions = self.linear(self.dropout(feats))
        # viterbi_decode returns the most likely tag sequence for each sentence
        outputs = self.crf.viterbi_decode(emissions, mask)
        return outputs

    # negative log-likelihood used as the training loss
    def log_likelihood(self,x,labels,mask):
        embeddings = self.embeds(x)  # embedding lookup
        feats, hidden = self.lstm(embeddings)
        emissions = self.linear(self.dropout(feats))  # emission scores
        loss = -self.crf.forward(emissions, labels, mask)  # CRF log-likelihood
        return torch.sum(loss)
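
# A minimal smoke-test sketch (not part of the original post): run the model on random ids
# to check tensor shapes. _DemoConfig and its sizes are hypothetical and only exist so the
# check does not depend on the real vocabulary; call _demo_forward_pass() manually if needed.
def _demo_forward_pass():
    class _DemoConfig:
        embedding_dim = 100
        hidden_dim = 200
        vocab_size = 50
        num_tags = 7
        dropout = 0.2

    demo_model = NERLSTM_CRF(_DemoConfig())
    x = torch.randint(1, 50, (2, 10))              # batch of 2 sentences, 10 characters each
    mask = (x > 0)                                 # non-padding positions
    y = torch.randint(0, 7, (2, 10))               # random gold tag ids
    loss = demo_model.log_likelihood(x, y, mask)   # scalar training loss
    paths = demo_model(x, mask)                    # best tag path per sentence (list of id lists)
    print(loss.item(), len(paths), len(paths[0]))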


##################################################
from torch.utils.data import DataLoader  # batched data loading
import torch
import torch.optim as op

# helper that builds everything needed for training
def utils_to_train():
    device = torch.device('cpu')
    max_epoch = 1
    batch_size = 32
    num_workers = 4  # number of worker processes for data loading

    _, _, _, _, x_train, y_train, x_test, y_test, x_valid, y_valid = load_data()
    # training set
    train_data = NERDataSet(x_train,y_train)
    # validation set
    valid_data = NERDataSet(x_valid,y_valid)
    # test set
    test_data = NERDataSet(x_test,y_test)

    # batched loaders
    train_data_loader = DataLoader(
        train_data,
        batch_size=batch_size,
        shuffle=True,
        num_workers=num_workers
    )
    valid_data_loader = DataLoader(
        valid_data,
        batch_size=batch_size,
        shuffle=True,
        num_workers=num_workers
    )
    test_data_loader = DataLoader(
        test_data,
        batch_size=batch_size,
        shuffle=True,
        num_workers=num_workers
    )
    config = Config()
    model = NERLSTM_CRF(config).to(device)
    optimizer = op.Adam(
        model.parameters(),
        lr=config.lr,
        weight_decay=config.weight_decay
    )
    return max_epoch,device,train_data_loader,valid_data_loader,test_data_loader,optimizer,model

# decode predicted tag ids back into (entity, label) pairs
def parse_tags(text,path):
    id2tag = load_data()[3]
    tags = [id2tag[idx] for idx in path]

    begin = 0
    res = []
    for idx,tag in enumerate(tags):
        # join consecutive characters that belong to the same entity
        if tag.startswith('B'):
            begin = idx
        elif tag.startswith('E'):
            end = idx
            word = text[begin:end+1]
            label = tag[2:]
            res.append((word,label))
        elif tag == 'O':
            res.append((text[idx],tag))
    return res
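
# Illustrative only (hypothetical path): if id2tag maps the predicted ids to
#   ['B_nr', 'M_nr', 'E_nr', 'O', 'O']
# for the text '江泽民讲话', parse_tags returns
#   [('江泽民', 'nr'), ('讲', 'O'), ('话', 'O')]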

            
    
    

##################################################
# train.py
from sklearn.metrics import classification_report,precision_score,recall_score,f1_score


word2id = load_data()[0]
max_epoch,device,train_data_loader,valid_data_loader,test_data_loader,optimizer,model = utils_to_train()

# Chinese named entity recognition
class ChineseNER(object):
    def train(self):
        for epoch in range(max_epoch):

            # training mode
            model.train()

            for index,batch in enumerate(train_data_loader):
                # reset the gradients
                optimizer.zero_grad()

                # move the training batch to the device (cpu here)
                x = batch['x'].to(device)
                mask = (x > 0).to(device)
                y = batch['y'].to(device)

                # forward pass: negative log-likelihood loss
                loss = model.log_likelihood(x,y,mask)

                # backward pass
                loss.backward()

                # gradient clipping
                torch.nn.utils.clip_grad_norm_(parameters=model.parameters(),
                                               max_norm=10)

                # parameter update
                optimizer.step()
                if index % 200 == 0:
                    print('epoch:%5d,-----loss:%f'%(epoch,loss.item()))

            # validation loss and metrics
            aver_loss = 0
            preds, labels = [],[]
            for index,batch in enumerate(valid_data_loader):

                # evaluation mode
                model.eval()

                # move the validation batch to the device
                val_x = batch['x'].to(device)
                val_y = batch['y'].to(device)
                val_mask = (val_x > 0).to(device)
                predict = model(val_x,val_mask)

                # validation loss
                loss = model.log_likelihood(val_x,val_y,val_mask)
                aver_loss += loss.item()

                # count the non-zero entries, i.e. the true (unpadded) length of each label sequence
                leng = []
                for i in val_y.cpu():
                    tmp = []
                    for j in i:
                        if j.item() > 0:
                            tmp.append(j.item())
                    leng.append(tmp)

                for index,i in enumerate(predict):
                    preds += i[:len(leng[index])]
                for index,i in enumerate(val_y.tolist()):
                    labels += i[:len(leng[index])]

            # loss and evaluation metrics
            aver_loss /= (len(valid_data_loader) * 64)
            precision = precision_score(labels,preds,average='macro')
            recall = recall_score(labels,preds,average='macro')
            f1 = f1_score(labels,preds,average='macro')
            report = classification_report(labels,preds)
            print(report)
            torch.save(model.state_dict(),'params.pkl')
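
##################################################
# Prediction sketch (step 8 of the pipeline). Not part of the original post: a minimal,
# hypothetical example of how the saved weights and parse_tags() could be used to tag a
# new sentence, assuming the pkl files and params.pkl produced above exist.
def predict_demo(sentence='江泽民在北京讲话'):
    model.load_state_dict(torch.load('params.pkl'))
    model.eval()

    # map characters to ids, falling back to [unknown] for out-of-vocabulary characters
    ids = [word2id.get(ch, word2id['[unknown]']) for ch in sentence]
    x = torch.tensor([ids], dtype=torch.long).to(device)
    mask = (x > 0).to(device)

    with torch.no_grad():
        paths = model(x, mask)   # viterbi-decoded tag ids, one list per sentence

    # turn the tag ids back into (entity, label) pairs
    return parse_tags(sentence, paths[0])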
"""
NLP命名体识别bilstm+crf
1、准备数据:origin_handle_entities()
读取源数据文件,把人名,地名,机构名合并起来

2、读取处理后的数据:origin_handle_mark()
把预处理后的的文本标注成BMO的格式,
B(begin)、M(middle)、E(end)、O(other)

3、句子切分:sentence_split()
按照指定的格式,比如标点等内容对数据完成切分
4、保存数据
a.将标注的句子拆分自成列表和对应的标注序列
b.创建词汇表和标签
c.文本的向量化表示
d.划分训练集和测试集
e.保存成二进制pkl文件
5、加载数据
6、训练模型BiLSTM&HMM
7、保存训练后的模型用于预测
8、预测
"""
import codecs
import re
import collections
import pickle

import TorchCRF as CRF
import numpy as np

from tensorflow.keras.preprocessing.sequence import pad_sequences #使用tensorflow的pad_sequences进行数据对齐 tensorflow2.3.1
from sklearn.model_selection import train_test_split

数据清洗######

def origin_handle_entities():
with open('renmin.txt','r',encoding='utf-8') as inp,
open('middle/renmin2.txt','w',encoding='utf-8')
as outp:
#读取源文件中的数据
for line in inp.readlines():
#按照空格切分
line = line.split(' ')
i = 1
while i < len(line) - 1:
if line[i][0] == '[':
outp.write(line[i].split('/')[0][1:])
i += 1
while i < len(line) - 1 and line[i].find(']') == -1:
if line[i] !='':
#print(line[i].split('/')[0])
outp.write(line[i].split('/')[0])
i += 1
outp.write(line[i].split('/')[0].strip()+'/'+line[i])
elif line[i].split('/')[1] == 'nr':
word = line[i].split('/')[0]
i += 1
if i < len(line) - 1 and line[i].split('/')[1] == 'nr':
outp.write(word + line[i].split('/')[0] + 'nr')
else:
outp.write(word + '/nr ')
continue
else:
outp.write(line[i] + '/no ')
i += 1
outp.write('\n')

数据的标注########

import codecs
def origin_handle_mark():
"""
1、读取数据预处理后的renmin2.txt
2、将标注好的数据写入renmin3.txt
a.打开输入和输出文件
b.遍历输入文件renmin2.txt
:return:
"""
with codecs.open('middle/renmin2.txt','r',encoding='utf-8') as inp,
codecs.open('middle/renmin3.txt','w',encoding='utf-8') as outp:

    #遍历renmin2.txt
    for line in inp.readlines():
        line = line.split(' ')
        #遍历每个句子
        i = 0
        while i < len(line) - 1:
            if line[i] == '':#跳过空字符
                i += 1
                continue
            word = line[i].split('/')[0]
            #标签
            tag = line[i].split('/')[1]
            if tag == 'nr' or tag == 'ns' or tag == 'nt':
                outp.write(word[0] + '/B_' + tag + ' ')
                for j in word[1:len(word) -1]:
                    if j != ' ':
                        outp.write(j + '/M_' + tag + ' ')
                outp.write(word[-1] + '/E_' + tag + ' ')
            else:
                for w in word:
                    outp.write(w + '/O' + ' ')
            i += 1

        outp.write('\n')

#########句子切分###################################
import re
def sentence_split():
with codecs.open('middel/renmin3.txt','r',encoding='utf-8') as inp,
codecs.open('middle/renmin4.txt','w',encoding='utf-8') as outp:
#文本文件的内容设置为对应的utf-8编码,python3:先encode,再decode
texts = inp.read().encode('utf-8').decode('utf-8')
#切分句子
sentences =
re.split('[,。!?、''"":]/[0]'.encode('utf-8').decode('utf-8'),
texts)
for sentence in sentences:
if sentence != ' ':
outp.write(sentence.strip() + '\n')

保存数据###################

def data_to_pkl():
"""
将文本数据保存成二进制pkl文件
:return:
"""

datas = []#数据
labels = []# 标签

all_words = []#词汇表
tags = set()#标签
input_data = codecs.open('middle/renmin4.txt','r',encoding='utf-8')
# 1.将标注的句子拆分成列表和对应的标注列表
for line in input_data.readlines():
    linedata = list()
    linelabel = list()

    line = line.split()

    numNotO = 0
    for word in line:
        word = word.split('/')
        linedata.append(word[0])
        linelabel.append(word[1])

        all_words.append(word[0])
        tags.add(word[1])

        if word[1] != 'O': #标注全为O的子句
            numNotO += 1

    if numNotO != 0: # 只保存标注不全为O的子句
        datas.append(linedata)
        labels.append(linelabel)

input_data.close()

# 2、创建词汇表和标签
"""
1、构建词汇表:语料库总所有不重复单词的数量
2、构建三个词典:{单词:频数} {单词:编号} {编号:单词}
3、把文本进行填充或者截断[pad]:
4、结合词汇表和词对文本数据进行向量化表示(数字)
  pytorch、tensorflow、keras、paddle(Embedding)
"""
words_count = collections.Counter(all_words).most_common()
# word2id:单词:编号
word2id = {word: i for i ,(word, _) in enumerate(words_count, 1)}
word['[PAD]'] = 0
word2id['[unknown]'] = len(word2id) # 100000000
#id2word:编号:单词
id2word = {i:word for word,i in word2id.items()}

tag2id = {tag: i for i, tag in enumerate(tags)}

id2tag = {i:tag for tag,i in tag2id.items()}

# 3、文本向量化,并处理成相同长度
max_len = 60 # 超参数
#每个句子对应的ID编号
data_ids = [[word2id[w] for w in line]
            for line in datas]
# 标签对应的编号信息
labels_ids = [[tag2id[t] for t in line]
              for line in labels]
x = pad_sequences(data_ids,maxlen=max_len,
                  padding='post').astype(np.int64)
y = pad_sequences(labels_ids,maxlen=max_len,
                  padding='post').astype(np.int64)
print('文本向量化完成')

# 4、将向量化后的数据拆分成训练集,验证集,测试集
x_train,x_test,y_train,y_test = train_test_split(x,y,
                                                 test_size=0.2,
                                                 random_state=43)
x_train, x_valid, y_train, y_valid = train_test_split(x, y,
                                                    test_size=0.2,
                                                    random_state=43)
print(len(x_valid))

# 5、保存数据
with open('../data_target_pkl/renmindata.pkl','wb') as outp:
    #原始数据
    pickle.dump(word2id,outp)
    pickle.dump(id2word,outp)
    pickle.dump(tag2id,outp)
    pickle.dump(id2tag,outp)

    #训练数据
    pickle.dump(x_train,outp)
    pickle.dump(y_train,outp)
    pickle.dump(x_test,outp)
    pickle.dump(y_test,outp)
    pickle.dump(x_valid,outp)
    pickle.dump(y_valid,outp)

with open('../data_target_pkl/vocab.pkl') as outp:
    pickle.dump(word2id, outp)
    pickle.dump(id2word, outp)
with open('../data_target_pkl/tags.pkl') as outp1:
    pickle.dump(tag2id, outp1)
    pickle.dump(id2tag, outp1)

def main():
# 数据清洗
origin_handle_entities()
#数据标注(字)
origin_handle_mark()
# 句子切分
sentence_split()
# 数据转换
data_to_pkl()

if name == 'main':
main()

##################################################################################################

加载数据########

def load_data():
pickle_path = '../data_target_pkl/renmindata.pkl'
with open(pickle_path,'rb') as inp:
word2id,id2word,tag2id,id2tag,x_train,y_train,x_test,y_test,x_valid,y_valid =pickle.load(inp)

return word2id,id2word,tag2id,id2tag,x_train,y_train,x_test,y_test,x_valid,y_valid

def main():
word2id = load_data()
print(len(word2id))

if name == 'main':
main()

#######################################################################################

bilstm_crf_model.py

import torch
import torch.nn as nn
from torch.utils.data import Dataset # 批量读取数据

命名识别类(加载数据)

class NERDataSet(Dataset):
"""
X:表示样本,Y:表示标签
"""
def init(self,X,Y,args,*kwargs):
"""

    :param X: 样本
    :param Y: 标签
    :param args: 任意数;任意数量参数
    :param kwargs: 任意数;任意数量参数
    """
    self.data = [{'x':X[i],'y':Y[i]}
                 for i in range(X.shape[0])]

# 返回对应数据的索引,单词:编号
def __getitem__(self, index):
    return self.data[index]
# 样本数据的个数
def __len__(self):
    return len(self.data)

参数

class Config():
embedding_dim = 100 #词向量的维度
hidden_dim = 200

word2id,tag2id = load_data()
vocab_size = len(word2id)
num_tags = len(tag2id)

dropout = 0.2
lr = 0.001
weight_decay = 1e-5

config = Config()

构建模型(Bilstm + CRF)

class NERLSTM_CRF(nn.Module):
"""
1、输入层
2、词映射(Embedding(vocab_size,embedding_dim))
3、LSTM
4、全连接层
"""
def init(self):
super(NERLSTM_CRF,self).init()
self.embeding_dim = config.embeding_dim
self.hidden_dim = config.hidden_dim
self.vocab_size = config.vocab_size
self.num_tags = config.num_tags

    #将处理后的数据对应单词的编号换成词向量
    self.embeds = nn.Embedding(
        self.vocab_size,
        self.embeding_dim
    )
    self.dropout = nn.Dropout(config.dropout)

    #lstm bidirectional   双向LSTM
    self.lstm = nn.LSTM(
        self.embeding_dim,
        self.hidden_dim//2,#双向
        num_layers=1,
        bidrectional=True,
        batch_first=True,#设置属性值,保持数据格式
    )
    #全连接
    self.linear =nn.Linear(self.hidden_dim,
                           self.num_tags)
    #CRF
    self.crf = CRF(self.num_tags)

#向前计算
def forword(self,x,mask):
    embeddings = self.embeds(x) # 词映射
    feats,hidden = self.lstm(embeddings)
    emissions = self.linear(self.dropout(feats))
    #viterbi_decode预测和标记进行比对解码
    outputs = self.crf.viterbi_decode(emissions,mask)
    return outputs

#反向传播
def log_likelihood(self,x,labels,mask):
    embeddings = self.embeds(x)  # 词映射
    feats, hidden = self.lstm(embeddings)
    emissions = self.linear(self.dropout(feats))#LSTM
    loss = -self.crf.forward(emissions,labels,mask)#全连接
    return torch.sum(loss)

##################################################
from torch.utils.data import DataLoader #批量加载数据
import torch
import torch.optim as op

模型训练的帮助函数

def utils_to_train():
device = torch.device('cpu')
max_epoch = 1
batch_size = 32
num_workers =4 #开启几个线程取执行程序

x_train,y_train,x_valid,y_valid,x_test,y_test = load_data()
# 训练集
train_data = NERDataSet(x_train,y_train)
# 验证集
valid_data = NERDataSet(x_valid,y_valid)
# 测试集
test_data = NERDataSet(x_test,y_test)

#批量加载数据
train_data_loader = DataLoader(
    train_data,
    batch_size=batch_size,
    shuffle=True,
    num_workers=num_workers
)
valid_data_loader = DataLoader(
    valid_data,
    batch_size=batch_size,
    shuffle=True,
    num_workers=num_workers
)
test_data_loader = DataLoader(
    test_data,
    batch_size=batch_size,
    shuffle=True,
    num_workers=num_workers
)
config =Config()
model = NERLSTM_CRF(config).to(device)
optimizer = op.Adam(
    model.parameters(),
    lr = config.lr,
    weight_decay=config.weight_decay
)
return max_epoch,device,train_data_loader,valid_data_loader,test_data_loader,optimizer,model

用于将实体类别解码,单字组合成单词

def parse_tags(text,path):
id2tag = load_data()
tags = [id2tag[idx] for idx in path]

begin = 0
res = []
for idx,tag in enumerate(tags):
    #将连续的同类型的字连接
    if tag.startwith('B'):
        begin = idx
    elif tag.startwith('E'):
        end =idx
        word = text[begin:end+1]
        label = tag[2:]
        res.append((word,label))
    elif tag == 'O':
        res.append((text[idx],tag))
return res

##################################################

train.py

from sklearn.metrics import classification_report,precision_score,recall_score,f1_score

word2id = load_data()[0]
max_epoch,device,train_data_loader,valid_data_loader,test_data_loader,model = utils_to_train()

中文命名体识别

class ChineseNER(object):
def train(self):
for epoch in range(max_epoch):

        #训练模型
        model.train()
        
        for index,batch in enumerate(train_data_loader):
            #梯度归零
            optimizer = utils_to_train()
            optimizer.zero_grad()
            
            # 训练数据---cpu
            x = batch['x'].to(device)
            mask = (x>0).to(device)
            y = batch['y'].to(device)
            
            #前向计算损失
            loss = model.log_likelihood(x,y,mask)
            
            #反向传播
            loss.backward()
            
            #梯度裁剪
            torch.nn.utils.clip_grad_norm(parameters=model.parameters(),
                                          max_norm=10)
            
            #更新参数
            optimizer.step()
            if index % 200 == 0:
                print('epoch:%5d,-----loss:%f'%(epoch,loss.item()))
                
        #验证损失和精度
        aver_loss = 0
        preds, labels = [],[] 
        for index,batch in enumerate(valid_data_loader):
            
            #验证模式
            model.eval()
            
            #验证数据--->cpu
            val_x,val_y = batch['x'].to(device)
            val_mask = (val_x > 0).to(device)
            predict = model(val_x,val_mask)
            
            #前向计算损失
            loss = model.log_likelihood(val_x,val_y)
            aver_loss += loss.item()
            
            #统计非0的,也就是真实标签的长度
            leng = []
            for i in val_y.cpu():
                tmp = []
                for j in i:
                    if j.item() >0:
                        tmp.append(j.item())
                leng.append(tmp)
                
            for index,i in enumerate(predict):
                preds += i[:len(leng[index])]
            for index,i in enumerate(val_y.tolist()):
                labels +=i[:len(leng[index])]
        
        #损失值与评测指标
        aver_loss /= (len(valid_data_loader) * 64)
        precision = precision_score(labels,preds,average='macro')
        recall = recall_score(labels,preds,average='macro')
        f1 = f1_score(labels,preds,average='macro')
        report = classification_report(labels,preds)
        print(report)
        torch.save(model.state_dict(),'params.pkl')

你可能感兴趣的:(NLP命名体识别bilstm+crf)