Task06 Text Classification with BERT

I spent a few days traveling with friends I hadn't seen in a long time and fell behind on the daily study check-ins. I still don't really understand BERT and feel like I'm just calling library code blindly, so I'm noting this down and will keep refining Task05 and Task06 over the next few days. I also came across a very good GitHub repo that is worth referencing.

For now I'm recording the PyTorch version of the code; once I've worked through the concepts again I plan to rewrite it in TF2 to deepen my understanding.

```python
import pandas as pd
import re
import numpy as np
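# GetInit: load the train/test csv files, map the separator tokens to [SEP],
# and build two log-scaled handcrafted features (token count and sentence count).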
class GetInit:
    def __init__(self, data_root):
        print("GetInit Start!")
        self.data_root = data_root
        self.x_train, self.y_train, self.x_test,self.x_train_feature,self.x_test_feature = self.get_pandas()
        print("GetInit End!")

    def get_pandas(self):
        train = pd.read_csv(self.data_root["train_path"])
        test = pd.read_csv(self.data_root["test_path"])
        
        # Tokens "900" and "3750" are treated as sentence boundaries and mapped to [SEP]
        # (.str.replace is needed for both calls; plain Series.replace only matches whole values)
        x_train = train.text.str.replace("900", "[SEP]").str.replace("3750", "[SEP]").values
        y_train = train.label.values
        x_test = test.text.str.replace("900", "[SEP]").str.replace("3750", "[SEP]").values
        
        train["length"]=train.text.apply(lambda x:len(x.split(" ")))
        test["length"]=test.text.apply(lambda x:len(x.split(" ")))

        train["length"]=np.log10(train["length"])/np.log10(train["length"].max())
        test["length"]=np.log10(test["length"])/np.log10(test["length"].max())
        
        train["sentence_length"]=train.text.apply(lambda x:len(re.split(" 3750 | 900 ",x)))
        test["sentence_length"]=test.text.apply(lambda x:len(re.split(" 3750 | 900 ",x)))

        train["sentence_length"]=np.log10(train["sentence_length"])/np.log10(train["sentence_length"].max())
        test["sentence_length"]=np.log10(test["sentence_length"])/np.log10(test["sentence_length"].max())
        
        x_train_feature=train[["length","sentence_length"]].values
        x_test_feature=test[["length","sentence_length"]].values
        
        
        del train
        del test
        return x_train, y_train, x_test,x_train_feature,x_test_feature
data_root = {
    "train_path": '../../data/train_sample.csv',
    "test_path": "../../data/test_a.csv",
    "sub_path": "../../data/test_a_sample_submit.csv",
    "w2v_path": "../../data/word2vec.bin"
}
config = GetInit(data_root)
```

```python
import torch
from torch.utils.data import Dataset
from transformers import BertTokenizer
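# MyDataset: wraps the corpus and the handcrafted features; each item is tokenized
# on the fly with the BERT tokenizer and padded/truncated to max_length.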

class MyDataset(Dataset):
    def __init__(self, bert_path, corpus, feature, corpus_label=None, max_length=256, with_label=False):
        super(MyDataset, self).__init__()
        self.corpus = corpus
        self.tokenizer = BertTokenizer.from_pretrained(bert_path)
        self.with_label = with_label
        self.max_length = max_length
        self.feature = feature
        
        if self.with_label:
            self.corpus_label = torch.tensor(corpus_label)

    def __getitem__(self, item):
        encoded_dict = self.tokenizer.encode_plus(
                            self.corpus[item],            # input text
                            add_special_tokens=True,      # add '[CLS]' and '[SEP]'
                            max_length=self.max_length,   # pad & truncate to this length
                            padding='max_length',         # pad every sample to max_length
                            return_attention_mask=True,
                            return_tensors='pt',
                            truncation=True
                       )
        if self.with_label:
            return encoded_dict['input_ids'].squeeze(0),encoded_dict['attention_mask'].squeeze(0),torch.FloatTensor(self.feature[item]),self.corpus_label[item]
        else:
            return encoded_dict['input_ids'].squeeze(0),encoded_dict['attention_mask'].squeeze(0),torch.FloatTensor(self.feature[item])

    def __len__(self):
        return len(self.corpus)
bert_path ='./bert-mini/'
train_dataset = MyDataset(bert_path,
                          corpus=config.x_train,
                          feature=config.x_train_feature,
                          corpus_label=config.y_train,
                          with_label=True)
test_dataset = MyDataset(bert_path,
                         corpus=config.x_test,
                         feature=config.x_test_feature,
                         with_label=False)
# quick sanity check: shape of one tokenized sample (input_ids padded to max_length)
print(train_dataset[2][0].shape)
```

```python
import torch
from torch.nn.utils.rnn import pad_sequence
from torch.utils.data import DataLoader
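# GetLoader: splits the training set 90/10 into train/validation and builds the three DataLoaders.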


class GetLoader:
    def __init__(self, train_dataset, test_dataset, split_ratio=0.9):
        self.ratio = split_ratio
        self.train_dataset = train_dataset
        self.test_dataset = test_dataset
        self.train_dataset, self.valid_dataset = self.split()

        self.train_loader, self.valid_loader, self.test_loader = None,None,None
        self.get_iter()
        print("GetLoader End")

    def split(self):
        train_size = int(self.ratio * len(self.train_dataset))
        valid_size = len(self.train_dataset) - train_size
        train_dataset, valid_dataset = torch.utils.data.random_split(self.train_dataset, [train_size, valid_size])
        return train_dataset, valid_dataset

    def get_iter(self):
        self.train_loader = DataLoader(self.train_dataset, batch_size=16,shuffle=True)
        self.valid_loader = DataLoader(self.valid_dataset, batch_size=64)
        self.test_loader = DataLoader(self.test_dataset, batch_size=64)
loader = GetLoader(train_dataset, test_dataset)
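# check the batch shapes produced by the test and train loaders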
for batch_idx, (data, mask,feature) in enumerate(iter(loader.test_loader)):
    print(data.shape)
    print(mask.shape)
    print(feature.shape)
    break
    
for batch_idx, (data, mask,feature,label) in enumerate(iter(loader.train_loader)):
    print(data.shape)
    print(mask.shape)
    print(feature.shape)
    break
print(len(loader.train_loader.dataset), len(loader.valid_loader.dataset), len(loader.test_loader.dataset))
```

```python
import torch
from torch import nn
from torch.nn import functional as F
from transformers import BertForSequenceClassification, AdamW, BertConfig,BertModel
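# Bert: bert-mini encoder whose pooled [CLS] output is concatenated with the two
# handcrafted features and fed through a small MLP classification head (14 classes).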

class Bert(nn.Module):
    def __init__(self, bert_path, hidden_size=128, output_size=14, dropout=0.5):
        super(Bert, self).__init__()
        self.bert = BertModel.from_pretrained(
                            bert_path,
                            num_labels = 14,
                            output_attentions = False,    # do not return attention weights
                            output_hidden_states = False, # do not return all hidden states
                        )

        # 258 = 256 (bert-mini pooled output) + 2 handcrafted features
        self.fc1 = nn.Linear(258, hidden_size)
        self.dropout = nn.Dropout(dropout)
        self.fc2 = nn.Linear(hidden_size, output_size)

    def forward(self, data, mask, feature):
        # pooled [CLS] representation; return_dict=False keeps the (sequence_output, pooled_output) tuple
        _, out = self.bert(data, token_type_ids=None, attention_mask=mask, return_dict=False)
        out = torch.cat((out, feature), dim=1)
        out = F.relu(self.fc1(out))
        out = self.dropout(out)
        out = F.relu(self.fc2(out))
        return F.log_softmax(out, 1)
import random
import numpy as np
seed_val = 42
random.seed(seed_val)
np.random.seed(seed_val)
torch.manual_seed(seed_val)
torch.cuda.manual_seed_all(seed_val)
# build the model
from transformers import get_linear_schedule_with_warmup
from transformers import AdamW
model=Bert(bert_path)
model.cuda()
criterion = nn.NLLLoss()
opt = AdamW(model.parameters(),
                  lr = 5e-5, # args.learning_rate - default is 5e-5
                  eps = 1e-8 
                )
epochs = 2
total_steps = len(loader.train_loader) * epochs
scheduler = get_linear_schedule_with_warmup(opt, 
                                            num_warmup_steps = 0, 
                                            num_training_steps = total_steps)
```

```python
from copy import deepcopy
import torch
from sklearn.metrics import f1_score
import time
import datetime


def format_time(elapsed):
    elapsed_rounded = int(round((elapsed)))
    return str(datetime.timedelta(seconds=elapsed_rounded))
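
# TrainFunc: training loop with gradient clipping and a linear warmup schedule;
# the best model is selected by macro F1 on the validation set, and predict()
# runs inference on the test loader with that best model.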


class TrainFunc:
    def __init__(self, model, criterion, opt, schedule, train_iter=None, valid_iter=None, test_iter=None):
        self.model = model
        self.criterion = criterion
        self.opt = opt
        self.schedule = schedule
        self.best_model = model
        self.best_score = 0
        self.train_iter = train_iter
        self.valid_iter = valid_iter
        self.test_iter = test_iter
        self.training_stats = []

    def train(self, epoch):

        total_t0 = time.time()

        for epoch_i in range(0, epoch):
            print(" ")
            print('======== Epoch {:} / {:} ========'.format(epoch_i + 1, epoch))
            print('Training...')
            t0 = time.time()
            total_train_loss = 0
            self.model.train()
            train_acc = 0
            # iterate over the training set in mini-batches
            for step, (data, mask, feature, label) in enumerate(iter(self.train_iter)):
                batch_size = data.shape[0]
                data = data.cuda()
                mask = mask.cuda()
                feature = feature.cuda()
                label = label.cuda()

                self.opt.zero_grad()
                output = self.model(data, mask, feature)
                loss = self.criterion(output, label)
                loss.backward()
                total_train_loss += loss.item()
                train_acc += (output.argmax(1) == label).sum().item()
                torch.nn.utils.clip_grad_norm_(self.model.parameters(), 1.0)
                self.opt.step()
                self.schedule.step()
                if step % int(80 * (8 / batch_size)) == 0:
                    elapsed = format_time(time.time() - t0)
                    print('  Batch {:>5,}  of  {:>5,}.  Loss:{:<20,}   Elapsed: {:}.'.format(step,
                                                                                              len(self.train_iter),
                                                                                              loss.item(), elapsed))
            # average training loss
            avg_train_loss = total_train_loss / len(self.train_iter)
            # training time for this epoch
            training_time = format_time(time.time() - t0)
            print("")
            print("  Average training loss: {0:.4f}".format(avg_train_loss))
            print("  Training epoch took: {:}".format(training_time))
            print("  Training acc: {0:.4f}".format(train_acc / len(self.train_iter.dataset) * 100))
            score, avg_val_loss, avg_val_accuracy, validation_time = self.valid_func()
            if score > self.best_score:
                self.best_score = score
                self.best_model = deepcopy(self.model)
                print("  Now_best:{:.4f}".format(self.best_score))
            #         scheduler.step()
            self.training_stats.append(
                {
                    'epoch': epoch_i + 1,
                    'Training Loss': avg_train_loss,
                    'Valid. Loss': avg_val_loss,
                    'Valid. Acc.': avg_val_accuracy,
                    'Training Time': training_time,
                    'Validation Time': validation_time
                }
            )
        print("")
        print("Training complete!")
        print("Total training took {:} (h:mm:ss)".format(format_time(time.time() - total_t0)))
        return self.best_model

    def valid_func(self):
        print("")
        print("Running Validation...")
        t0 = time.time()
        self.model.eval()
        valid_acc = 0
        valid_loss = 0
        nb_eval_steps = 0
        ans_box = []
        label_box = []
        for batch_idx, (data, mask, feature, label) in enumerate(iter(self.valid_iter)):
            batch_size = data.shape[0]
            data = data.cuda()
            mask = mask.cuda()
            feature = feature.cuda()
            label=label.cuda()

            with torch.no_grad():
                output = self.model(data, mask, feature)
                loss = self.criterion(output, label)
            pred = output.argmax(1)
            valid_loss += loss.item()
            valid_acc += (pred == label).sum().item()

            ans_box.extend(pred.cpu().tolist())
            label_box.extend(label.cpu().tolist())
        # sklearn's f1_score expects (y_true, y_pred)
        score1 = f1_score(label_box, ans_box, average='macro')
        score2 = f1_score(label_box, ans_box, average='micro')

        avg_val_accuracy = valid_acc / len(self.valid_iter.dataset) * 100
        print("  Accuracy: {0:.2f}".format(avg_val_accuracy))

        avg_val_loss = valid_loss / len(self.valid_iter)
        validation_time = format_time(time.time() - t0)

        print("  Validation Loss: {0:.2f}".format(avg_val_loss))
        print("  Validation took: {:}".format(validation_time))
        print("  Micro score: {:}".format(score2))
        print("  Macro score: {:}".format(score1))
        # record all statistics for this epoch

        return score1, avg_val_loss, avg_val_accuracy, validation_time

    def predict(self):
        self.best_model.eval()
        t0 = time.time()
        ans_box = []
        with torch.no_grad():
            for step, (data, mask, feature) in enumerate(iter(self.test_iter)):
                if step % int(40) == 0:
                    elapsed = format_time(time.time() - t0)
                    print('  Batch {:>5,}  of  {:>5,}.  Elapsed: {:}.'.format(step,len(self.test_iter),elapsed))
                data = data.cuda()
                mask =mask.cuda()
                feature=feature.cuda()
                output = self.best_model(data, mask, feature)
                pred = output.argmax(1)
                ans_box.extend(pred.cpu().tolist())
        return ans_box
mytrain = TrainFunc(model, criterion, opt, scheduler, loader.train_loader, loader.valid_loader, loader.test_loader)
best_model = mytrain.train(1)
```
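
The `predict()` helper above is defined but never invoked. A minimal sketch of how it could be used to fill the sample submission file (assuming the csv at `sub_path` has a single `label` column whose rows are aligned with `test_a.csv`, as in the Tianchi setup) might look like this:

```python
# Hypothetical usage sketch: run inference with the best model and write a submission file.
# Assumes the sample submission at data_root["sub_path"] has one "label" column aligned
# with the rows of the test set.
preds = mytrain.predict()

sub = pd.read_csv(data_root["sub_path"])
sub["label"] = preds
sub.to_csv("submission.csv", index=False)
```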

TF version

To be completed; placeholder for now.
