1. Parameter configuration
from importlib import import_module
import time
import torch
import numpy as np
import torch.nn as nn
from pytorch_pretrained import BertModel, BertTokenizer

# Configuration class
class Config(object):
    '''Configuration parameters'''
    def __init__(self, dataset):
        self.model_name = 'bert'
        self.train_path = dataset + '/data/dev.txt'    # training set (dev.txt is reused here)
        self.dev_path = dataset + '/data/dev.txt'      # validation set
        self.test_path = dataset + '/data/test.txt'    # test set
        self.class_list = [x.strip() for x in open(dataset + '/data/class.txt', encoding='utf-8').readlines()]  # class labels
        self.save_path = dataset + '/saved_dict/' + self.model_name + '.ckpt'    # where the best model is saved
        self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
        self.require_improvement = 1000    # stop training if dev performance has not improved after this many batches
        self.num_classes = len(self.class_list)    # number of classes
        self.num_epochs = 3        # number of epochs
        self.batch_size = 128      # batch size
        self.pad_size = 32         # length every sentence is padded/truncated to
        self.learning_rate = 5e-5  # learning rate
        self.bert_path = '../Bert-Chinese-Text-Classification-Pytorch-master/bert_pretrain/'  # path to the pretrained BERT
        self.tokenizer = BertTokenizer.from_pretrained(self.bert_path + 'vocab.txt')
        self.hidden_size = 768     # BERT hidden size
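For reference, class.txt simply lists one label per line, and class_list / num_classes are derived from it. Judging from the classification report at the end of this post, the ten THUCNews categories would look like this (illustrative; use whatever labels your own data file contains):

finance
realty
stocks
education
science
society
politics
sports
game
entertainment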
2. Building the model
class Model(nn.Module):
    def __init__(self, config):
        super(Model, self).__init__()
        self.bert = BertModel.from_pretrained(config.bert_path)    # load the pretrained model
        for param in self.bert.parameters():
            param.requires_grad = True                             # fine-tune all BERT parameters
        self.fc = nn.Linear(config.hidden_size, config.num_classes)    # fully connected classification layer

    def forward(self, x):
        context = x[0]    # input token ids
        mask = x[2]       # attention mask over the padding
        _, pooled = self.bert(context, attention_mask=mask, output_all_encoded_layers=False)    # [batch_size, hidden_size]
        out = self.fc(pooled)    # [batch_size, num_classes]
        return out

#config = Config('.')
#model = Model(config)
#model.bert(torch.ones(4,8).long(),torch.ones(4,8).long())[1].shape
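The three commented-out lines above were a quick sanity check on the pooled output. A slightly fuller sketch that exercises the whole model, assuming the pretrained weights are present under config.bert_path, could look like this (the dummy tensors are made up purely to confirm shapes):

# Hypothetical shape check with dummy inputs, not part of training
config = Config('.')
model = Model(config)
ids = torch.ones(4, 8).long()             # 4 fake sentences of 8 token ids each
mask = torch.ones(4, 8).long()            # all-ones mask: no padding positions
seq_len = torch.LongTensor([8, 8, 8, 8])  # forward() only reads x[0] and x[2]; x[1] is kept for interface consistency
out = model((ids, seq_len, mask))
print(out.shape)                          # expected: [4, num_classes], e.g. torch.Size([4, 10])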
dataset = '.'
config = Config(dataset)
np.random.seed(1)
torch.manual_seed(1)
torch.cuda.manual_seed_all(1)
torch.backends.cudnn.deterministic = True    # make results reproducible across runs
start_time = time.time()
print('Loading data...')
3. Building the dataset
from tqdm import tqdm
import time
from datetime import timedelta

PAD, CLS = '[PAD]', '[CLS]'

def build_dataset(config):
    def load_dataset(path, pad_size=32):
        contents = []
        with open(path, 'r', encoding='utf-8') as f:
            for line in tqdm(f):
                lin = line.strip()
                if not lin:
                    continue
                content, label = lin.split('\t')
                token = config.tokenizer.tokenize(content)
                token = [CLS] + token    # prepend the [CLS] token
                seq_len = len(token)     # sentence length
                mask = []
                token_ids = config.tokenizer.convert_tokens_to_ids(token)
                if pad_size:             # truncate long sentences, pad short ones
                    if len(token) < pad_size:
                        mask = [1] * len(token_ids) + [0] * (pad_size - len(token))
                        token_ids += ([0] * (pad_size - len(token)))
                    else:
                        mask = [1] * pad_size
                        token_ids = token_ids[:pad_size]
                        seq_len = pad_size
                contents.append((token_ids, int(label), seq_len, mask))
        return contents
    train = load_dataset(config.train_path, config.pad_size)
    dev = load_dataset(config.dev_path, config.pad_size)
    test = load_dataset(config.test_path, config.pad_size)
    return train, dev, test
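To make the preprocessing concrete, here is roughly what load_dataset produces for a single line, assuming a hypothetical sentence and pad_size = 8 for brevity (the real ids depend on vocab.txt):

# Illustrative walk-through of one sample (hypothetical values, except 101 = [CLS])
#   raw line:              "股市大涨\t2"  ->  content = "股市大涨", label = 2
#   tokenize + [CLS]:      ['[CLS]', '股', '市', '大', '涨']          -> seq_len = 5
#   convert_tokens_to_ids: [101, i1, i2, i3, i4]                      (i1..i4 come from vocab.txt)
#   pad to pad_size = 8:   token_ids = [101, i1, i2, i3, i4, 0, 0, 0]
#                          mask      = [1, 1, 1, 1, 1, 0, 0, 0]
#   stored tuple:          (token_ids, 2, 5, mask)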
# Build a dataset iterator
def build_iterator(dataset, config):
    iter_ = DatasetIterater(dataset, config.batch_size, config.device)
    return iter_

def get_time_dif(start_time):
    '''Return the elapsed time since start_time'''
    end_time = time.time()
    time_dif = end_time - start_time
    return timedelta(seconds=int(round(time_dif)))
class DatasetIterater(object):
    def __init__(self, batches, batch_size, device):
        self.batch_size = batch_size
        self.batches = batches
        self.n_batches = len(batches) // batch_size
        self.residue = False    # True if the last batch is smaller than batch_size
        if len(batches) % batch_size != 0:
            self.residue = True
        self.index = 0
        self.device = device

    def _to_tensor(self, datas):
        x = torch.LongTensor([_[0] for _ in datas]).to(self.device)
        y = torch.LongTensor([_[1] for _ in datas]).to(self.device)
        # length before padding (capped at pad_size for longer sentences)
        seq_len = torch.LongTensor([_[2] for _ in datas]).to(self.device)
        mask = torch.LongTensor([_[3] for _ in datas]).to(self.device)
        return (x, seq_len, mask), y

    # yield the next batch
    def __next__(self):
        if self.residue and self.index == self.n_batches:
            batches = self.batches[self.index * self.batch_size: len(self.batches)]
            self.index += 1
            batches = self._to_tensor(batches)
            return batches
        elif self.index >= self.n_batches:
            self.index = 0
            raise StopIteration
        else:
            batches = self.batches[self.index * self.batch_size: (self.index + 1) * self.batch_size]
            self.index += 1
            batches = self._to_tensor(batches)
            return batches

    def __iter__(self):
        return self

    def __len__(self):
        if self.residue:
            return self.n_batches + 1
        else:
            return self.n_batches
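A minimal sketch of how the iterator is consumed, assuming train_data has already been built: each batch is a ((token_ids, seq_len, mask), labels) pair of tensors already moved to config.device. Note that peeking like this advances the iterator's internal index.

# Illustrative check of one batch's tensor shapes (not part of training)
peek_iter = build_iterator(train_data, config)
(x, seq_len, mask), y = next(peek_iter)
print(x.shape, mask.shape, y.shape)
# expected: [batch_size, pad_size], [batch_size, pad_size], [batch_size]
# i.e. torch.Size([128, 32]) torch.Size([128, 32]) torch.Size([128])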
train_data, dev_data, test_data = build_dataset(config)
train_iter = build_iterator(train_data, config)
dev_iter = build_iterator(dev_data, config)
test_iter = build_iterator(test_data, config)
time_dif = get_time_dif(start_time)
print('Time usage:', time_dif)
4. Training the model
# Build the model
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
from sklearn import metrics
import time
from pytorch_pretrained_bert.optimization import BertAdam

# Initialize network weights (defined for completeness; it is not called below, the pretrained BERT weights are used as-is)
def init_network(model, method='xavier', exclude='embedding', seed=123):
    for name, w in model.named_parameters():
        if exclude not in name:
            if len(w.size()) < 2:
                continue
            if 'weight' in name:
                if method == 'xavier':
                    nn.init.xavier_normal_(w)
                elif method == 'kaiming':
                    nn.init.kaiming_normal_(w)
                else:
                    nn.init.normal_(w)
            elif 'bias' in name:
                nn.init.constant_(w, 0)
            else:
                pass
def train(config, model, train_iter, dev_iter, test_iter):
    start_time = time.time()
    model.train()
    param_optimizer = list(model.named_parameters())
    no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']    # parameters that get no weight decay
    optimizer_grouped_parameters = [
        {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], 'weight_decay': 0.01},
        {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}
    ]
    optimizer = BertAdam(optimizer_grouped_parameters, lr=config.learning_rate, warmup=0.05, t_total=len(train_iter) * config.num_epochs)
    total_batch = 0                 # number of batches processed so far
    dev_best_loss = float('inf')
    last_improve = 0                # batch index of the last improvement on the dev set
    flag = False                    # whether training was stopped early
    model.train()
    for epoch in range(config.num_epochs):
        print('Epoch [{}/{}]'.format(epoch + 1, config.num_epochs))
        for i, (trains, labels) in enumerate(train_iter):
            outputs = model(trains)
            model.zero_grad()
            loss = F.cross_entropy(outputs, labels)
            loss.backward()
            optimizer.step()
            if total_batch % 100 == 0:
                # report metrics on the training and validation sets
                true = labels.data.cpu()
                predic = torch.max(outputs.data, 1)[1].cpu()
                train_acc = metrics.accuracy_score(true, predic)
                dev_acc, dev_loss = evaluate(config, model, dev_iter)
                if dev_loss < dev_best_loss:
                    dev_best_loss = dev_loss
                    torch.save(model.state_dict(), config.save_path)
                    improve = '*'
                    last_improve = total_batch
                else:
                    improve = ''
                time_dif = get_time_dif(start_time)
                msg = 'Iter:{0:>6},Train Loss:{1:>5.2},Train Acc:{2:>6.2%},Val Loss:{3:>5.2},Val Acc:{4:>6.2%},Time:{5} {6}'
                print(msg.format(total_batch, loss.item(), train_acc, dev_loss, dev_acc, time_dif, improve))
                model.train()
            total_batch += 1
            if total_batch - last_improve > config.require_improvement:
                print('No optimization for a long time, auto-stopping...')
                flag = True
                break
        if flag:
            break
    test(config, model, test_iter)
def test(config, model, test_iter):
    # evaluate the best checkpoint on the test set
    model.load_state_dict(torch.load(config.save_path))
    model.eval()
    start_time = time.time()
    test_acc, test_loss, test_report, test_confusion = evaluate(config, model, test_iter, test=True)
    msg = 'Test Loss:{0:>5.2},Test Acc:{1:>6.2%}'
    print(msg.format(test_loss, test_acc))
    print('Precision, Recall and F1-Score...')
    print(test_report)
    print('Confusion Matrix...')
    print(test_confusion)
    time_dif = get_time_dif(start_time)
    print('Time usage:', time_dif)
def evaluate(config, model, data_iter, test=False):
    model.eval()
    loss_total = 0
    predict_all = np.array([], dtype=int)
    labels_all = np.array([], dtype=int)
    with torch.no_grad():
        for texts, labels in data_iter:
            outputs = model(texts)
            loss = F.cross_entropy(outputs, labels)
            loss_total += loss
            labels = labels.data.cpu().numpy()
            predic = torch.max(outputs.data, 1)[1].cpu().numpy()
            labels_all = np.append(labels_all, labels)
            predict_all = np.append(predict_all, predic)
    acc = metrics.accuracy_score(labels_all, predict_all)
    if test:
        report = metrics.classification_report(labels_all, predict_all, target_names=config.class_list, digits=4)
        confusion = metrics.confusion_matrix(labels_all, predict_all)
        return acc, loss_total / len(data_iter), report, confusion
    return acc, loss_total / len(data_iter)
# Start training
model = Model(config).to(config.device)
train(config, model, train_iter, dev_iter, test_iter)
Output:
Epoch [1/3]
Iter: 0, Train Loss: 2.4, Train Acc: 13.28%, Val Loss: 2.4, Val Acc: 9.08%, Time: 0:05:45 *
Epoch [2/3]
Iter: 100, Train Loss: 0.43, Train Acc: 88.28%, Val Loss: 0.22, Val Acc: 93.70%, Time: 0:33:42 *
Epoch [3/3]
Iter: 200, Train Loss: 0.031, Train Acc: 100.00%, Val Loss: 0.091, Val Acc: 97.57%, Time: 0:59:47 *
Test Loss: 0.34, Test Acc: 90.52%
Precision, Recall and F1-Score...
precision recall f1-score support
finance 0.9189 0.8720 0.8948 1000
realty 0.9143 0.9170 0.9156 1000
stocks 0.8579 0.8150 0.8359 1000
education 0.9551 0.9350 0.9449 1000
science 0.8587 0.8690 0.8638 1000
society 0.8780 0.9280 0.9023 1000
politics 0.8807 0.9010 0.8908 1000
sports 0.9732 0.9440 0.9584 1000
game 0.9231 0.9240 0.9235 1000
entertainment 0.8968 0.9470 0.9212 1000
accuracy 0.9052 10000
macro avg 0.9057 0.9052 0.9051 10000
weighted avg 0.9057 0.9052 0.9051 10000
Confusion Matrix...
[[872 19 72 4 11 9 6 3 1 3]
[ 12 917 8 1 6 16 12 3 7 18]
[ 52 33 815 0 37 5 42 1 9 6]
[ 0 3 2 935 5 28 14 1 2 10]
[ 5 6 28 2 869 17 18 2 39 14]
[ 1 11 0 16 10 928 15 0 4 15]
[ 5 7 18 10 20 30 901 2 0 7]
[ 1 3 3 1 3 9 8 944 3 25]
[ 0 2 4 1 44 8 4 2 924 11]
[ 1 2 0 9 7 7 3 12 12 947]]
Time usage: 0:04:42
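Once training is done, the saved checkpoint can be reused for single-sentence prediction. A minimal inference sketch, assuming the checkpoint at config.save_path exists (the helper below and the sample headline are hypothetical, not part of the original code):

# Hypothetical single-sentence inference helper
def predict(text, config, model):
    token = [CLS] + config.tokenizer.tokenize(text)
    token_ids = config.tokenizer.convert_tokens_to_ids(token)
    pad_len = max(config.pad_size - len(token_ids), 0)
    mask = ([1] * len(token_ids) + [0] * pad_len)[:config.pad_size]
    token_ids = (token_ids + [0] * pad_len)[:config.pad_size]
    x = torch.LongTensor([token_ids]).to(config.device)
    m = torch.LongTensor([mask]).to(config.device)
    seq_len = torch.LongTensor([min(len(token), config.pad_size)]).to(config.device)
    with torch.no_grad():
        out = model((x, seq_len, m))
    return config.class_list[int(torch.argmax(out))]

model.load_state_dict(torch.load(config.save_path))
model.eval()
print(predict('某个新闻标题', config, model))    # placeholder headline; prints one label from class.txt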