Bert Xiaohei's first attempt 1: Bert_finetuning

1. Parameter definitions

from importlib import import_module
import time
import torch
import numpy as np
import torch.nn as nn
from pytorch_pretrained import BertModel,BertTokenizer
# Config class
class Config(object):
    '''Configuration parameters'''
    def __init__(self, dataset):
        self.model_name = 'bert'
        self.train_path = dataset + '/data/dev.txt'    # training set (dev.txt is used as the training set here)
        self.dev_path = dataset + '/data/dev.txt'    # validation set
        self.test_path = dataset + '/data/test.txt'    # test set
        self.class_list = [x.strip() for x in open(dataset + '/data/class.txt').readlines()]    # class labels
        self.save_path = dataset + '/saved_dict/' + self.model_name + '.ckpt'    # where the trained model is saved
        self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
        self.require_improvement = 1000    # stop training if dev performance has not improved within this many batches
        self.num_classes = len(self.class_list)    # number of classes
        self.num_epochs = 3    # number of epochs
        self.batch_size = 128    # batch size
        self.pad_size = 32    # max sentence length (shorter sentences are padded, longer ones truncated)
        self.learning_rate = 5e-5    # learning rate
        self.bert_path = '../Bert-Chinese-Text-Classification-Pytorch-master/bert_pretrain/'    # path to the pretrained BERT
        self.tokenizer = BertTokenizer.from_pretrained(self.bert_path + 'vocab.txt')
        self.hidden_size = 768    # BERT hidden size
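
For orientation, Config assumes a THUCNews-style layout under the dataset root, and each data file is expected to hold one sample per line in the form "sentence<TAB>label_index" (this is what build_dataset below parses). The snippet here is only an illustrative sanity check of those paths, not part of the original flow:

import os

# Illustrative check only: these are the paths the Config above will read.
dataset = '.'
for path in [
    dataset + '/data/dev.txt',      # used as both training and validation set above
    dataset + '/data/test.txt',     # test set
    dataset + '/data/class.txt',    # one class label per line
    '../Bert-Chinese-Text-Classification-Pytorch-master/bert_pretrain/vocab.txt',    # BERT vocab
]:
    print(path, 'OK' if os.path.exists(path) else 'MISSING')
os.makedirs(dataset + '/saved_dict', exist_ok=True)    # bert.ckpt will be written here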

2. Building the model

class Model(nn.Module):
    def __init__(self,config):
        super(Model,self).__init__()
        self.bert = BertModel.from_pretrained(config.bert_path)    # load the pretrained BERT model
        for param in self.bert.parameters():
            param.requires_grad = True
        self.fc = nn.Linear(config.hidden_size, config.num_classes)    # fully connected classification layer
    def forward(self,x):
        context = x[0]    # input token ids
        mask = x[2]     # attention mask over the padding
        _, pooled = self.bert(context, attention_mask=mask, output_all_encoded_layers=False)    # [batch_size, hidden_size]
        out = self.fc(pooled)    # [batch_size, num_classes]
        return out
#config = Config('.')
#model = Model(config)
#model.bert(torch.ones(4,8).long(),torch.ones(4,8).long())[1].shape
dataset = '.'
config = Config(dataset)
np.random.seed(1)
torch.manual_seed(1)
torch.cuda.manual_seed_all(1)
torch.backends.cudnn.deterministic = True    # make results reproducible across runs
start_time = time.time()
print('Loading data...')
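
Before building the data, a quick forward-pass sanity check can catch path and shape problems early. This is a minimal sketch (a fleshed-out version of the commented-out check above), not part of the original training flow; it assumes the pretrained weights under config.bert_path load correctly and instantiates a throwaway model:

# Dummy batch: 4 "sentences" of pad_size token ids with an all-ones attention mask.
dummy_ids = torch.ones(4, config.pad_size).long()
dummy_len = torch.LongTensor([config.pad_size] * 4)
dummy_mask = torch.ones(4, config.pad_size).long()
check_model = Model(config)
with torch.no_grad():
    out = check_model((dummy_ids, dummy_len, dummy_mask))
print(out.shape)    # expected: torch.Size([4, config.num_classes])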

3. Building the dataset

from tqdm import tqdm
import time
from datetime import timedelta
PAD,CLS = '[PAD]','[CLS]'    
def build_dataset(config):
    def load_dataset(path,pad_size = 32):
        contents = []
        with open(path,'r',encoding = 'utf-8') as f:
            for line in tqdm(f):
                lin = line.strip()
                if not lin:
                    continue
                content,label = lin.split('\t')
                token = config.tokenizer.tokenize(content)
                token = [CLS] + token    # prepend the [CLS] token
                seq_len = len(token)    # sequence length
                mask = []
                token_ids = config.tokenizer.convert_tokens_to_ids(token)
                if pad_size:    # truncate long sequences, pad short ones
                    if len(token) < pad_size:
                        mask = [1] * len(token_ids) + [0] * (pad_size - len(token))
                        token_ids += ([0] * (pad_size - len(token)))
                    else:
                        mask = [1] * pad_size
                        token_ids = token_ids[:pad_size]
                        seq_len = pad_size
                contents.append((token_ids,int(label),seq_len,mask))
        return contents
    train = load_dataset(config.train_path,config.pad_size)
    dev = load_dataset(config.dev_path,config.pad_size)
    test = load_dataset(config.test_path,config.pad_size)
    return train,dev,test
# Build the dataset iterator
def build_iterator(dataset,config):
    iter_ = DatasetIterater(dataset,config.batch_size,config.device)
    return iter_
def get_time_dif(start_time):
    '''Get the elapsed time'''
    end_time = time.time()
    time_dif = end_time - start_time
    return timedelta(seconds = int(round(time_dif)))
class DatasetIterater(object):
    def __init__(self,batches,batch_size,device):
        self.batch_size = batch_size
        self.batches = batches
        self.n_batches = len(batches) // batch_size
        self.residue = False    # whether there is a leftover partial batch
        if len(batches) % self.batch_size != 0:
            self.residue = True
        self.index = 0
        self.device = device
    def _to_tensor(self,datas):
        x = torch.LongTensor([_[0] for _ in datas]).to(self.device)
        y = torch.LongTensor([_[1] for _ in datas]).to(self.device)
        # length before padding (capped at pad_size for longer sentences)
        seq_len = torch.LongTensor([_[2] for _ in datas]).to(self.device)
        mask = torch.LongTensor([_[3] for _ in datas]).to(self.device)
        return (x,seq_len,mask),y
    # Fetch the next batch
    def __next__(self):
        if self.residue and self.index == self.n_batches:
            batches = self.batches[self.index * self.batch_size:len(self.batches)]
            self.index += 1
            batches = self._to_tensor(batches)
            return batches
        elif self.index >= self.n_batches:
            self.index = 0
            raise StopIteration
        else:
            batches = self.batches[self.index * self.batch_size: (self.index + 1) * self.batch_size]
            self.index += 1
            batches = self._to_tensor(batches)
            return batches
    def __iter__(self):
        return self
    def __len__(self):
        if self.residue:
            return self.n_batches + 1
        else:
            return self.n_batches
train_data,dev_data,test_data = build_dataset(config)
train_iter = build_iterator(train_data,config)
dev_iter = build_iterator(dev_data,config)
test_iter = build_iterator(test_data,config)
time_dif = get_time_dif(start_time)
print('Time usage:',time_dif)
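
Each batch yielded by DatasetIterater is a ((token_ids, seq_len, mask), labels) pair of LongTensors already moved to config.device. The peek below is illustrative only; it builds a separate iterator so train_iter's internal index is not advanced:

peek_iter = build_iterator(train_data, config)
(x, seq_len, mask), y = next(peek_iter)
print(x.shape, seq_len.shape, mask.shape, y.shape)
# expected: [batch_size, pad_size], [batch_size], [batch_size, pad_size], [batch_size]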

4. Model training

# Build and train the model
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
from sklearn import metrics
import time
from pytorch_pretrained_bert.optimization import BertAdam
# Initialize network parameters
def init_network(model,method = 'xavier',exclude = 'embedding',seed = 123):
    for name,w in model.named_parameters():
        if exclude not in name:
            if len(w.size()) < 2:
                continue
            if 'weight' in name:
                if method == 'xavier':
                    nn.init.xavier_normal_(w)
                elif method == 'kaiming':
                    nn.init.kaiming_normal_(w)
                else:
                    nn.init.normal_(w)
            elif 'bias' in name:
                nn.init.constant_(w,0)
            else:
                pass
def train(config,model,train_iter,dev_iter,test_iter):
    start_time = time.time()
    model.train()
    param_optimizer = list(model.named_parameters())
    no_decay = ['bias','LayerNorm.bias','LayerNorm.weight']
    optimizer_grouped_parameters = [
        {'params':[p for n,p in param_optimizer if not any(nd in n for nd in no_decay)],'weight_decay':0.01},
        {'params':[p for n,p in param_optimizer if any(nd in n for nd in no_decay)],'weight_decay':0.0}
    ]
    optimizer = BertAdam(optimizer_grouped_parameters,lr = config.learning_rate,warmup = 0.05,t_total = len(train_iter) * config.num_epochs)
    total_batch = 0    # number of batches processed so far
    dev_best_loss = float('inf')
    last_improve = 0    # batch index of the last improvement on the dev set
    flag = False
    model.train()
    for epoch in range(config.num_epochs):
        print('Epoch [{}/{}]'.format(epoch+1,config.num_epochs))
        for i,(trains,labels) in enumerate(train_iter):
            outputs = model(trains)
            model.zero_grad()
            loss = F.cross_entropy(outputs,labels)
            loss.backward()
            optimizer.step()
            if total_batch % 100 == 0:
                # report performance on the current train batch and the dev set
                true = labels.data.cpu()
                predic = torch.max(outputs.data,1)[1].cpu()
                train_acc = metrics.accuracy_score(true,predic)
                dev_acc,dev_loss = evaluate(config,model,dev_iter)
                if dev_loss < dev_best_loss:
                    dev_best_loss = dev_loss
                    torch.save(model.state_dict(),config.save_path)
                    improve = '*'
                    last_improve = total_batch
                else:
                    improve = ''
                time_dif = get_time_dif(start_time)
                msg = 'Iter:{0:>6},Train Loss:{1:>5.2},Train Acc:{2:>6.2%},Val Loss:{3:>5.2},Val Acc:{4:>6.2%},Time:{5} {6}'
                print(msg.format(total_batch,loss.item(),train_acc,dev_loss,dev_acc,time_dif,improve))
                model.train()
            total_batch += 1
            if total_batch - last_improve > config.require_improvement:
                print('No optimization for a long time,auto-stopping...')
                flag = True
                break
        if flag:
            break
    test(config,model,test_iter)
def test(config,model,test_iter):
    # test
    model.load_state_dict(torch.load(config.save_path))
    model.eval()
    start_time = time.time()
    test_acc,test_loss,test_report,test_confusion = evaluate(config,model,test_iter,test = True)
    msg = 'Test Loss:{0:>5.2},Test Acc:{1:>6.2%}'
    print(msg.format(test_loss,test_acc))
    print('Precision,Recall and F1-Score...')
    print(test_report)
    print('Confusion Matrix...')
    print(test_confusion)
    time_dif = get_time_dif(start_time)
    print('Time usage:',time_dif)
def evaluate(config,model,data_iter,test = False):
    model.eval()
    loss_total = 0
    predict_all = np.array([],dtype = int)
    labels_all = np.array([],dtype = int)
    with torch.no_grad():
        for texts,labels in data_iter:
            outputs = model(texts)
            loss = F.cross_entropy(outputs,labels)
            loss_total += loss
            labels = labels.data.cpu().numpy()
            predic = torch.max(outputs.data,1)[1].cpu().numpy()
            labels_all = np.append(labels_all,labels)
            predict_all = np.append(predict_all,predic)
    acc = metrics.accuracy_score(labels_all,predict_all)
    if test:
        report = metrics.classification_report(labels_all,predict_all,target_names = config.class_list,digits = 4)
        confusion = metrics.confusion_matrix(labels_all,predict_all)
        return acc,loss_total / len(data_iter),report,confusion
    return acc,loss_total / len(data_iter)

# 开始训练
model = Model(config).to(config.device)
train(config, model, train_iter, dev_iter, test_iter)

Output:
Epoch [1/3]
Iter: 0, Train Loss: 2.4, Train Acc: 13.28%, Val Loss: 2.4, Val Acc: 9.08%, Time: 0:05:45 *
Epoch [2/3]
Iter: 100, Train Loss: 0.43, Train Acc: 88.28%, Val Loss: 0.22, Val Acc: 93.70%, Time: 0:33:42 *
Epoch [3/3]
Iter: 200, Train Loss: 0.031, Train Acc: 100.00%, Val Loss: 0.091, Val Acc: 97.57%, Time: 0:59:47 *
Test Loss: 0.34, Test Acc: 90.52%
Precision, Recall and F1-Score…
               precision    recall  f1-score   support

      finance     0.9189    0.8720    0.8948      1000
       realty     0.9143    0.9170    0.9156      1000
       stocks     0.8579    0.8150    0.8359      1000
    education     0.9551    0.9350    0.9449      1000
      science     0.8587    0.8690    0.8638      1000
      society     0.8780    0.9280    0.9023      1000
     politics     0.8807    0.9010    0.8908      1000
       sports     0.9732    0.9440    0.9584      1000
         game     0.9231    0.9240    0.9235      1000
entertainment     0.8968    0.9470    0.9212      1000

     accuracy                         0.9052     10000
    macro avg     0.9057    0.9052    0.9051     10000
 weighted avg     0.9057    0.9052    0.9051     10000

Confusion Matrix…
[[872 19 72 4 11 9 6 3 1 3]
[ 12 917 8 1 6 16 12 3 7 18]
[ 52 33 815 0 37 5 42 1 9 6]
[ 0 3 2 935 5 28 14 1 2 10]
[ 5 6 28 2 869 17 18 2 39 14]
[ 1 11 0 16 10 928 15 0 4 15]
[ 5 7 18 10 20 30 901 2 0 7]
[ 1 3 3 1 3 9 8 944 3 25]
[ 0 2 4 1 44 8 4 2 924 11]
[ 1 2 0 9 7 7 3 12 12 947]]
Time usage: 0:04:42
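
Once training has finished and bert.ckpt has been saved, the same Config and Model can be reused for single-sentence prediction. The sketch below is illustrative (the predict helper and the example call are not part of the original code); it mirrors the preprocessing in load_dataset:

def predict(text, config, model):
    # Tokenize and pad a single sentence exactly as load_dataset does.
    token = [CLS] + config.tokenizer.tokenize(text)
    token_ids = config.tokenizer.convert_tokens_to_ids(token)
    seq_len = min(len(token), config.pad_size)
    if len(token_ids) < config.pad_size:
        mask = [1] * len(token_ids) + [0] * (config.pad_size - len(token_ids))
        token_ids += [0] * (config.pad_size - len(token_ids))
    else:
        mask = [1] * config.pad_size
        token_ids = token_ids[:config.pad_size]
    x = torch.LongTensor([token_ids]).to(config.device)
    l = torch.LongTensor([seq_len]).to(config.device)
    m = torch.LongTensor([mask]).to(config.device)
    model.eval()
    with torch.no_grad():
        out = model((x, l, m))
    return config.class_list[int(torch.max(out, 1)[1].item())]

# Hypothetical usage after training:
# model.load_state_dict(torch.load(config.save_path))
# print(predict('some headline text', config, model))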
