Fine-tuning BERT with pytorch-transformers

import torch
# from pytorch_transformers import *
from pytorch_transformers import BertModel,BertTokenizer,AdamW,BertForTokenClassification
import torch.nn as nn
import pytorch_transformers
torch.__version__
import pandas as pd
from torch.utils.data import DataLoader,dataset
import time


PyTorch-Transformers has a unified API for 7 transformer architectures and 30 pretrained weights.

#           Model          |      Tokenizer      | Pretrained weights shortcut
# MODELS = [(BertModel,       BertTokenizer,      'bert-base-uncased'),
#           (OpenAIGPTModel,  OpenAIGPTTokenizer, 'openai-gpt'),
#           (GPT2Model,       GPT2Tokenizer,      'gpt2'),
#           (TransfoXLModel,  TransfoXLTokenizer, 'transfo-xl-wt103'),
#           (XLNetModel,      XLNetTokenizer,     'xlnet-base-cased'),
#           (XLMModel,        XLMTokenizer,       'xlm-mlm-enfr-1024'),
#           (RobertaModel,    RobertaTokenizer,   'roberta-base')]
# Let's encode some text in a sequence of hidden-states using each model:
# for model_class, tokenizer_class, pretrained_weights in MODELS:
#     # Load pretrained model/tokenizer
#     tokenizer = tokenizer_class.from_pretrained(pretrained_weights)
#     model = model_class.from_pretrained(pretrained_weights)

#     # Encode text
#     input_ids = torch.tensor([tokenizer.encode("Here is some text to encode ", add_special_tokens=True)])  
#     print("input_ids = ",input_ids)
#     # Add special tokens takes care of adding [CLS], [SEP], ... tokens in the right way for each model.
#     with torch.no_grad():
#         last_hidden_states = model(input_ids)[0]  # Models outputs are now tuples
#         print("last_hidden_states = ",last_hidden_states)
#         print(last_hidden_states.size())
    
#     break  # only the BERT model is needed here, so stop after the first iteration
# pretrained_weights = 'bert-base-chinese'
# model = BertModel.from_pretrained(pretrained_weights)
# tokenizer = BertTokenizer.from_pretrained(pretrained_weights)
# print("bert_model = ",model)
# print("bert_tokenizer = ",tokenizer)
# BERT_MODEL_CLASSES = [BertModel, BertForPreTraining, BertForMaskedLM, BertForNextSentencePrediction,
#                       BertForSequenceClassification, BertForMultipleChoice, BertForTokenClassification,
#                       BertForQuestionAnswering]

# All the classes for an architecture can be instantiated from the pretrained weights for that architecture
# Note that additional weights added for fine-tuning are only initialized
# and need to be trained on the down-stream task
# pretrained_weights = 'bert-base-chinese'
# model = BertModel.from_pretrained(pretrained_weights,
#                                  output_hidden_states = True,
#                                  output_attentions = True
#                                  )
# tokenizer = BertTokenizer.from_pretrained(pretrained_weights)
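As noted above, the task-specific heads ship with a freshly initialized classification layer that has to be trained on the downstream task. A hedged sketch using the library's own BertForSequenceClassification head (num_labels=17 matches the category count used later in this post):

# Sketch: a ready-made classification head; its final linear layer is randomly
# initialized and must be fine-tuned. Assumes bert-base-chinese can be downloaded.
from pytorch_transformers import BertForSequenceClassification

tok = BertTokenizer.from_pretrained('bert-base-chinese')
cls_model = BertForSequenceClassification.from_pretrained('bert-base-chinese', num_labels=17)

cls_input_ids = torch.tensor([tok.encode("某款商品名称", add_special_tokens=True)])
cls_labels = torch.tensor([0])                                   # one example label in [0, 16]
loss, logits = cls_model(cls_input_ids, labels=cls_labels)[:2]   # loss for training; argmax(logits) for prediction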

# input_ids = torch.tensor([tokenizer.encode("让我们看看在这个文本中的隐层和感知层")]) # this sentence has 18 characters
# print(input_ids) #tensor([[6375, 2769,  812, 4692, 4692, 1762, 6821,  702, 3152, 3315,  704, 4638,7391, 2231, 1469, 2697, 4761, 2231]])
# print(input_ids.size()) #torch.Size([1, 18])
# out = model(input_ids) 
# print("len(out) = ",len(out))   # 4
# # With output_hidden_states=True and output_attentions=True the output is a 4-element tuple:
# #   out[0]: last hidden state,   shape (batch_size, sequence_length, hidden_size)
# #   out[1]: pooled [CLS] output, shape (batch_size, hidden_size)
# #   out[2]: all hidden states,   13 tensors (embedding output + 12 layers)
# #   out[3]: all attentions,      12 tensors, each (batch_size, num_heads, seq_len, seq_len)
# all_hidden_states, all_attentions = out[-2:]
# pooler_output = out[1] # out[0] is the last hidden state; out[1] is the pooled [CLS] output
# print("len(out[0]) = ",len(out[0]),"    out[0].size() = ",out[0].size()) #1,torch.Size([1, 18, 768])
# print("len(out[1]) = ",len(out[1]),"    out[1].size() = ",out[1].size()) #1,torch.Size([1, 768])
# print("len(all_hidden_states) = ",len(all_hidden_states))  #13
# print("len(all_attentions) = ",len(all_attentions))     #12
# print("all_hidden_states[-1].size() = ",all_hidden_states[-1].size()) #【1,18,768】=【batch,词语数量 ,词向量维度】
# # print("all_attentions.size[-1].size() = ",all_attentions[0-1.size())        #【1,12,18,18】
# Models are compatible with Torchscript
# model = model_class.from_pretrained(pretrained_weights, torchscript=True)
# traced_model = torch.jit.trace(model, (input_ids,))
# print("traced_model = ",traced_model)
# Simple serialization for models and tokenizers
# model.save_pretrained('./modelsave/bert/save_model_1/')  # save
# model = BertModel.from_pretrained('./modelsave/bert/save_model_1/') #reload
# tokenizer.save_pretrained('./modelsave/bert/save_token_1/') #save
# tokenizer = BertTokenizer.from_pretrained('./modelsave/bert/save_token_1/')
def read_data():
    # read the raw data
    data_corpus = pd.read_excel(r'D:\pro\pytorch\goods_name_classfiction/keyword_all.xlsx')  # all keywords
    corpus_list = list(data_corpus['keyword_all'].values)  # convert to a list; 22933 entries

    data_goods = pd.read_excel(r'D:\pro\pytorch\goods_name_classfiction/分词后数据.xlsx')
#     print(data_goods)
    return corpus_list, data_goods

# load the data
corpus_list,data_goods = read_data()
classes_list = list(set(data_goods['一级分类'].values))  # note: set order is not deterministic across runs
print(classes_list)
label_dict = {}
for i, name in enumerate(classes_list):
    label_dict[name] = i
print(label_dict)
['日化用品', '粮油调味', '糖巧饼干', '香烟', '计生情趣', '方便速食', '汽水饮料', '牛奶乳品', '冷冻食品', '应季鲜食', '生鲜专区', '无酒不欢', '生活百货', '个人护理', '冲调保健', '休闲零食', '母婴用品']
{'日化用品': 0, '粮油调味': 1, '糖巧饼干': 2, '香烟': 3, '计生情趣': 4, '方便速食': 5, '汽水饮料': 6, '牛奶乳品': 7, '冷冻食品': 8, '应季鲜食': 9, '生鲜专区': 10, '无酒不欢': 11, '生活百货': 12, '个人护理': 13, '冲调保健': 14, '休闲零食': 15, '母婴用品': 16}
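To map predictions back to category names later, the inverse mapping is useful (a small addition, not in the original notebook):

# Inverse mapping: class index -> category name, for interpreting model predictions.
id2label = {idx: name for name, idx in label_dict.items()}
print(id2label[0])  # e.g. '日化用品' given the dict printed above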
# build the dataset
class GoodsName_Label_Dataset(dataset.Dataset):
    def __init__(self,data_goods,label_dict):
        self.data_goods = data_goods
        self.label_dict = label_dict
    def __len__(self):
        # return the number of samples
        return self.data_goods.shape[0]
    def __getitem__(self,ind):
        sku_name = self.data_goods['更新后商品名称'][ind]
        label = self.label_dict[self.data_goods['一级分类'][ind]]   
        label = torch.LongTensor([label])
        return sku_name,label
        
# build the DataLoader (batching and random shuffling)
def set_dataloader(data_goods,label_dict):
    g_l_dataset = GoodsName_Label_Dataset(data_goods,label_dict)
    DL = DataLoader(g_l_dataset, batch_size=1, shuffle=True)
    return DL

dataloader = set_dataloader(data_goods,label_dict)
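As a quick sanity check of the data pipeline (a sketch, not part of the original notebook), a single batch can be pulled from the dataloader:

# Each batch is a (tuple_of_names, label_tensor) pair because __getitem__ returns a
# raw string plus a LongTensor and the batch size is 1.
sku_name, label = next(iter(dataloader))
print(sku_name[0], label)   # a product name and its class index; label has shape [1, 1]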

# build the model
pretrained_weights = 'bert-base-chinese'
tokenizer = BertTokenizer.from_pretrained(pretrained_weights)
model = BertModel.from_pretrained(pretrained_weights)
class Bert_Fc_Model(nn.Module):
    def __init__(self):
        super(Bert_Fc_Model,self).__init__()
        self.model = model
        self.tokenizer = tokenizer
        self.fc = nn.Linear(768,17)
        self.dropout = nn.Dropout(0.1)
        
    def forward(self,input_str):
        input_ids = torch.tensor([self.tokenizer.encode(input_str)])
#         print("input_ids.size() = ",input_ids.size())
        out = self.model(input_ids)
#         out_0 = out[0]
#         print("out[0].size() = ",out_0.size())
#         out_0 = self.dropout(out_0)
#         print("out[0].size() = ",out_0.size())
        out = out[0][:,-1,:]   # hidden state of the LAST token (see the alternative [CLS] sketch below)
        out = self.dropout(out)
        out = self.fc(out)
        return out
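    # Note (sketch, not in the original code): the forward above classifies from the
    # hidden state of the last token. With BERT it is more common to use the [CLS]
    # position, which requires add_special_tokens=True when encoding. A hedged
    # alternative could look like this:
    def forward_cls(self, input_str):
        input_ids = torch.tensor([self.tokenizer.encode(input_str, add_special_tokens=True)])
        out = self.model(input_ids)
        cls_vec = out[0][:, 0, :]          # hidden state at the [CLS] position
        return self.fc(self.dropout(cls_vec))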
bert_model = Bert_Fc_Model()
# training
bert_model.train()
print(bert_model)
def train():
    bert_model = Bert_Fc_Model()
    criterion = nn.CrossEntropyLoss()
    optim = torch.optim.AdamW(bert_model.parameters())  # default lr=1e-3; see the optimizer sketch below for more typical fine-tuning settings
    correct_number = 0
    time_start = time.time()  # start time
    for i ,item in enumerate(dataloader):
          
#         print("i = ",i)
        sku_name ,label = item
#         print("sku_name = ",sku_name[0])
#         print("label = ",label)
        input_str = sku_name[0]
        label = label.squeeze(dim=0)
#         print("label = ",label)

        # forward pass
        out = bert_model(input_str)
#         print("out = ",out)

#         print("out = ",out) 
        
        # zero the gradients
        optim.zero_grad()
        loss = criterion(out,label)
        # backward pass
        loss.backward()
        # update the parameters
        optim.step()
        
        # track the running accuracy
        predict = torch.argmax(out)
#         print("predict label = ",predict,"label = ",label)
        if label.item() == predict.item():
            correct_number += 1 
#             print("i = ",i,correct_number)
        if (i+1) % 200 == 0:
            print( "loss = ", loss,"i+1 = ",i+1)
            print("acc = "  ,str(round(correct_number/200.0*100,2)) ,r"%" )
            # reset the running count
            correct_number = 0
            print("Total elapsed time:", time.time() - time_start)
        
        
Bert_Fc_Model(
  (model): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(21128, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
          )
          (intermediate): BertIntermediate(
            (dense): Linear(in_features=768, out_features=3072, bias=True)
          )
          (output): BertOutput(
            (dense): Linear(in_features=3072, out_features=768, bias=True)
            (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
            (dropout): Dropout(p=0.1, inplace=False)
          )
        )
        (1)-(11): 11 more BertLayer blocks, identical in structure to layer (0) above
      )
    )
    (pooler): BertPooler(
      (dense): Linear(in_features=768, out_features=768, bias=True)
      (activation): Tanh()
    )
  )
  (fc): Linear(in_features=768, out_features=17, bias=True)
  (dropout): Dropout(p=0.1, inplace=False)
)
if __name__ == "__main__":
    train()
loss =  tensor(0.8110, grad_fn=<NllLossBackward>) i+1 =  200
acc =  14.0 %
Total elapsed time: 205.16717767715454
loss =  tensor(2.9961, grad_fn=<NllLossBackward>) i+1 =  400
acc =  13.5 %
Total elapsed time: 424.97985672950745



---------------------------------------------------------------------------

KeyboardInterrupt                         Traceback (most recent call last)

<ipython-input> in <module>
      1 if __name__ == "__main__":
----> 2     train()


<ipython-input> in train()
     30         loss.backward()
     31         #优化参数
---> 32         optim.step()
     33 
     34         # 计算准确率


D:\anaconda\envs\python37-pytorch\lib\site-packages\torch\optim\adamw.py in step(self, closure)
    104                     denom = max_exp_avg_sq.sqrt().add_(group['eps'])
    105                 else:
--> 106                     denom = exp_avg_sq.sqrt().add_(group['eps'])
    107 
    108                 bias_correction1 = 1 - beta1 ** state['step']


KeyboardInterrupt: 
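
The run above was stopped manually (KeyboardInterrupt). After (or during) training it is worth persisting the fine-tuned weights and decoding predictions back to category names. A hedged sketch building on the code above; it assumes the trained model instance is available as bert_model (e.g. train() is modified to return the model it builds) and reuses the id2label mapping defined earlier:

# Sketch: save the fine-tuned classifier and run a single prediction.
# The save path is illustrative, not from the original notebook.
torch.save(bert_model.state_dict(), './modelsave/bert_fc_model.pt')

bert_model.eval()
with torch.no_grad():
    logits = bert_model("某个待分类的商品名称")
    pred_id = torch.argmax(logits, dim=-1).item()
print(id2label[pred_id])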

