import pandas as pd
import re
import numpy as np


class GetInit:
    def __init__(self, data_root):
        print("GetInit Start!")
        self.data_root = data_root
        self.x_train, self.y_train, self.x_test, self.x_train_feature, self.x_test_feature = self.get_pandas()
        print("GetInit End!")

    def get_pandas(self):
        train = pd.read_csv(self.data_root["train_path"])
        test = pd.read_csv(self.data_root["test_path"])
        # Tokens 900 and 3750 act as sentence delimiters; map both to [SEP].
        # Both calls must go through .str.replace: a bare Series.replace only
        # matches whole cell values, not substrings. (This is still substring
        # matching, so multi-digit tokens containing "900" would also be hit.)
        x_train = train.text.str.replace("900", "[SEP]").str.replace("3750", "[SEP]").values
        y_train = train.label.values
        x_test = test.text.str.replace("900", "[SEP]").str.replace("3750", "[SEP]").values
        # Hand-crafted features: token count and sentence count, log-scaled
        # into (0, 1] by dividing by the log of the column maximum.
        train["length"] = train.text.apply(lambda x: len(x.split(" ")))
        test["length"] = test.text.apply(lambda x: len(x.split(" ")))
        train["length"] = np.log10(train["length"]) / np.log10(train["length"].max())
        test["length"] = np.log10(test["length"]) / np.log10(test["length"].max())
        train["sentence_length"] = train.text.apply(lambda x: len(re.split(" 3750 | 900 ", x)))
        test["sentence_length"] = test.text.apply(lambda x: len(re.split(" 3750 | 900 ", x)))
        train["sentence_length"] = np.log10(train["sentence_length"]) / np.log10(train["sentence_length"].max())
        test["sentence_length"] = np.log10(test["sentence_length"]) / np.log10(test["sentence_length"].max())
        x_train_feature = train[["length", "sentence_length"]].values
        x_test_feature = test[["length", "sentence_length"]].values
        del train
        del test
        return x_train, y_train, x_test, x_train_feature, x_test_feature
data_root = {
    "train_path": "../../data/train_sample.csv",
    "test_path": "../../data/test_a.csv",
    "sub_path": "../../data/test_a_sample_submit.csv",
    "w2v_path": "../../data/word2vec.bin"
}

config = GetInit(data_root)
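# Quick sanity check (a hedged example, not part of the original notebook):
# assuming the competition CSVs have `text` and `label` columns, the arrays
# below should line up row-for-row, with two hand-crafted features per row.
print(config.x_train.shape, config.y_train.shape)         # (n_train,), (n_train,)
print(config.x_test.shape, config.x_test_feature.shape)   # (n_test,), (n_test, 2)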
import torch
from torch.utils.data import Dataset
from transformers import BertTokenizer


class MyDataset(Dataset):
    def __init__(self, bert_path, corpus, feature, corpus_label=None, max_length=256, with_label=False):
        super(MyDataset, self).__init__()
        self.corpus = corpus
        self.tokenizer = BertTokenizer.from_pretrained(bert_path)
        self.with_label = with_label
        self.max_length = max_length
        self.feature = feature
        if self.with_label:
            self.corpus_label = torch.tensor(corpus_label)

    def __getitem__(self, item):
        encoded_dict = self.tokenizer.encode_plus(
            self.corpus[item],              # input text
            add_special_tokens=True,        # add '[CLS]' and '[SEP]'
            max_length=self.max_length,     # pad & truncate to this length
            padding='max_length',           # replaces the deprecated pad_to_max_length=True
            return_attention_mask=True,
            return_tensors='pt',
            truncation=True
        )
        if self.with_label:
            return (encoded_dict['input_ids'].squeeze(0),
                    encoded_dict['attention_mask'].squeeze(0),
                    torch.FloatTensor(self.feature[item]),
                    self.corpus_label[item])
        else:
            return (encoded_dict['input_ids'].squeeze(0),
                    encoded_dict['attention_mask'].squeeze(0),
                    torch.FloatTensor(self.feature[item]))

    def __len__(self):
        return len(self.corpus)
bert_path = './bert-mini/'
train_dataset = MyDataset(bert_path,
                          corpus=config.x_train,
                          feature=config.x_train_feature,
                          corpus_label=config.y_train,
                          with_label=True)
test_dataset = MyDataset(bert_path,
                         corpus=config.x_test,
                         feature=config.x_test_feature,
                         with_label=False)
train_dataset[2][0].shape
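# Hedged illustration (assumes the bert-mini vocab at bert_path): inspect one
# encoded sample to confirm the padding/truncation behaviour end to end.
ids, mask, feat, label = train_dataset[2]
print(ids.shape, mask.shape, feat.shape)          # torch.Size([256]), torch.Size([256]), torch.Size([2])
print(train_dataset.tokenizer.decode(ids[:20]))   # starts with [CLS]; delimiters appear as [SEP]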
import torch
from torch.utils.data import DataLoader


class GetLoader:
    def __init__(self, train_dataset, test_dataset, split_ratio=0.9):
        self.ratio = split_ratio
        self.train_dataset = train_dataset
        self.test_dataset = test_dataset
        self.train_dataset, self.valid_dataset = self.split()
        self.train_loader, self.valid_loader, self.test_loader = None, None, None
        self.get_iter()
        print("GetLoader End")

    def split(self):
        # Hold out (1 - ratio) of the training set for validation.
        train_size = int(self.ratio * len(self.train_dataset))
        valid_size = len(self.train_dataset) - train_size
        train_dataset, valid_dataset = torch.utils.data.random_split(self.train_dataset, [train_size, valid_size])
        return train_dataset, valid_dataset

    def get_iter(self):
        self.train_loader = DataLoader(self.train_dataset, batch_size=16, shuffle=True)
        self.valid_loader = DataLoader(self.valid_dataset, batch_size=64)
        self.test_loader = DataLoader(self.test_dataset, batch_size=64)


loader = GetLoader(train_dataset, test_dataset)
for batch_idx, (data, mask, feature) in enumerate(loader.test_loader):
    print(data.shape)
    print(mask.shape)
    print(feature.shape)
    break

for batch_idx, (data, mask, feature, label) in enumerate(loader.train_loader):
    print(data.shape)
    print(mask.shape)
    print(feature.shape)
    break

len(loader.train_loader.dataset), len(loader.valid_loader.dataset), len(loader.test_loader.dataset)
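# Hedged aside (not in the original notebook): random_split draws from the
# global RNG, so the train/valid split changes between runs. Passing a seeded
# generator makes it reproducible; shown here on the full dataset as an
# illustration only.
g = torch.Generator().manual_seed(42)
train_size = int(0.9 * len(train_dataset))
reproducible_split = torch.utils.data.random_split(
    train_dataset, [train_size, len(train_dataset) - train_size], generator=g)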
import torch
from torch import nn
from torch.nn import functional as F
from transformers import BertModel


class Bert(nn.Module):
    def __init__(self, bert_path, hidden_size=128, output_size=14, dropout=0.5):
        super(Bert, self).__init__()
        self.bert = BertModel.from_pretrained(
            bert_path,
            output_attentions=False,    # do not return attention weights
            output_hidden_states=False  # do not return all hidden states
        )
        # bert-mini pooled output (256 dims) concatenated with the 2
        # hand-crafted features -> 258 input dims.
        self.fc1 = nn.Linear(258, hidden_size)
        self.dropout = nn.Dropout(dropout)
        self.fc2 = nn.Linear(hidden_size, output_size)

    def forward(self, data, mask, feature):
        # return_dict=False keeps the (sequence_output, pooled_output) tuple
        # interface used below on transformers >= 4.x.
        _, out = self.bert(data, token_type_ids=None, attention_mask=mask, return_dict=False)
        out = torch.cat((out, feature), dim=1)
        out = F.relu(self.fc1(out))
        out = self.dropout(out)
        # No ReLU on the final layer: it would zero out negative logits
        # before log_softmax.
        out = self.fc2(out)
        return F.log_softmax(out, 1)
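# Hedged smoke test (assumes a bert-mini checkpoint with hidden size 256 at
# bert_path): push a dummy batch through the model on CPU to verify shapes.
_m = Bert(bert_path)
_ids = torch.randint(0, 100, (2, 256))        # fake token ids
_mask = torch.ones(2, 256, dtype=torch.long)  # attend to every position
_feat = torch.rand(2, 2)                      # fake length features
print(_m(_ids, _mask, _feat).shape)           # expected: torch.Size([2, 14])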
import random
import numpy as np

# Fix all RNG seeds for reproducibility.
seed_val = 42
random.seed(seed_val)
np.random.seed(seed_val)
torch.manual_seed(seed_val)
torch.cuda.manual_seed_all(seed_val)

# Build the model, loss, optimizer, and LR schedule.
from transformers import get_linear_schedule_with_warmup, AdamW

model = Bert(bert_path)
model.cuda()
criterion = nn.NLLLoss()
opt = AdamW(model.parameters(),
            lr=5e-5,   # args.learning_rate - default is 5e-5
            eps=1e-8)
epochs = 2
total_steps = len(loader.train_loader) * epochs
scheduler = get_linear_schedule_with_warmup(opt,
                                            num_warmup_steps=0,
                                            num_training_steps=total_steps)
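# Hedged illustration of the schedule (uses a throwaway optimizer so the real
# training state is untouched): with num_warmup_steps=0 the learning rate
# decays linearly from 5e-5 down to 0 over total_steps optimizer steps.
_p = [torch.nn.Parameter(torch.zeros(1))]
_opt = AdamW(_p, lr=5e-5)
_sched = get_linear_schedule_with_warmup(_opt, num_warmup_steps=0, num_training_steps=total_steps)
for _ in range(total_steps):
    _opt.step()
    _sched.step()
print(_sched.get_last_lr())   # expected: [0.0] after the full schedule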
from copy import deepcopy
import torch
from sklearn.metrics import f1_score
import time
import datetime


def format_time(elapsed):
    elapsed_rounded = int(round(elapsed))
    return str(datetime.timedelta(seconds=elapsed_rounded))


class TrainFunc:
    def __init__(self, model, criterion, opt, schedule, train_iter=None, valid_iter=None, test_iter=None):
        self.model = model
        self.criterion = criterion
        self.opt = opt
        self.schedule = schedule
        self.best_model = model
        self.best_score = 0
        self.train_iter = train_iter
        self.valid_iter = valid_iter
        self.test_iter = test_iter
        self.training_stats = []

    def train(self, epoch):
        total_t0 = time.time()
        for epoch_i in range(0, epoch):
            print(" ")
            print('======== Epoch {:} / {:} ========'.format(epoch_i + 1, epoch))
            print('Training...')
            t0 = time.time()
            total_train_loss = 0
            self.model.train()
            train_acc = 0
            # Iterate over the training set in mini-batches.
            for step, (data, mask, feature, label) in enumerate(self.train_iter):
                batch_size = data.shape[0]
                data = data.cuda()
                mask = mask.cuda()
                feature = feature.cuda()
                label = label.cuda()
                self.opt.zero_grad()
                output = self.model(data, mask, feature)
                loss = self.criterion(output, label)
                loss.backward()
                total_train_loss += loss.item()
                train_acc += (output.argmax(1) == label).sum().item()
                # Clip gradients to stabilize training.
                torch.nn.utils.clip_grad_norm_(self.model.parameters(), 1.0)
                self.opt.step()
                self.schedule.step()
                if step % int(80 * (8 / batch_size)) == 0:
                    elapsed = format_time(time.time() - t0)
                    print('  Batch {:>5,} of {:>5,}. Loss: {:<20} Elapsed: {:}.'.format(
                        step, len(self.train_iter), loss.item(), elapsed))
            # Average training loss over the epoch.
            avg_train_loss = total_train_loss / len(self.train_iter)
            # Wall-clock time for this epoch.
            training_time = format_time(time.time() - t0)
            print("")
            print("  Average training loss: {0:.4f}".format(avg_train_loss))
            print("  Training epoch took: {:}".format(training_time))
            print("  Training acc: {0:.4f}".format(train_acc / len(self.train_iter.dataset) * 100))
            score, avg_val_loss, avg_val_accuracy, validation_time = self.valid_func()
            # Keep a copy of the best model by validation macro-F1.
            if score > self.best_score:
                self.best_score = score
                self.best_model = deepcopy(self.model)
            print("  Now_best: {:.4f}".format(self.best_score))
            self.training_stats.append(
                {
                    'epoch': epoch_i + 1,
                    'Training Loss': avg_train_loss,
                    'Valid. Loss': avg_val_loss,
                    'Valid. Acc.': avg_val_accuracy,
                    'Training Time': training_time,
                    'Validation Time': validation_time
                }
            )
        print("")
        print("Training complete!")
        print("Total training took {:} (h:mm:ss)".format(format_time(time.time() - total_t0)))
        return self.best_model
    def valid_func(self):
        print("")
        print("Running Validation...")
        t0 = time.time()
        self.model.eval()
        valid_acc = 0
        valid_loss = 0
        ans_box = []
        label_box = []
        for batch_idx, (data, mask, feature, label) in enumerate(self.valid_iter):
            data = data.cuda()
            mask = mask.cuda()
            feature = feature.cuda()
            label = label.cuda()
            with torch.no_grad():
                output = self.model(data, mask, feature)
                loss = self.criterion(output, label)
            pred = output.argmax(1)
            valid_loss += loss.item()
            valid_acc += (pred == label).sum().item()
            ans_box.extend(pred.cpu().tolist())
            label_box.extend(label.cpu().tolist())
        # sklearn's f1_score expects (y_true, y_pred), in that order.
        score1 = f1_score(label_box, ans_box, average='macro')
        score2 = f1_score(label_box, ans_box, average='micro')
        avg_val_accuracy = valid_acc / len(self.valid_iter.dataset) * 100
        print("  Accuracy: {0:.2f}".format(avg_val_accuracy))
        avg_val_loss = valid_loss / len(self.valid_iter)
        validation_time = format_time(time.time() - t0)
        print("  Validation Loss: {0:.2f}".format(avg_val_loss))
        print("  Validation took: {:}".format(validation_time))
        print("  Micro score: {:}".format(score2))
        print("  Macro score: {:}".format(score1))
        # Return this epoch's validation statistics to the caller.
        return score1, avg_val_loss, avg_val_accuracy, validation_time
    def predict(self):
        self.best_model.eval()
        t0 = time.time()
        ans_box = []
        with torch.no_grad():
            for step, (data, mask, feature) in enumerate(self.test_iter):
                if step % 40 == 0:
                    elapsed = format_time(time.time() - t0)
                    print('  Batch {:>5,} of {:>5,}. Elapsed: {:}.'.format(step, len(self.test_iter), elapsed))
                data = data.cuda()
                mask = mask.cuda()
                feature = feature.cuda()
                output = self.best_model(data, mask, feature)
                pred = output.argmax(1)
                ans_box.extend(pred.cpu().tolist())
        return ans_box
mytrain = TrainFunc(model, criterion, opt, scheduler, loader.train_loader, loader.valid_loader, loader.test_loader)
# Note: total_steps was sized for epochs = 2, so training for a single epoch
# stops midway through the LR schedule.
best_model = mytrain.train(1)

To be completed; this final step is a placeholder.
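A hedged sketch of what that remaining step might look like, assuming the sample submission at data_root["sub_path"] has a single `label` column (the usual layout for this competition):

# Run the best model over the test loader and write a submission file.
preds = mytrain.predict()
sub = pd.read_csv(data_root["sub_path"])
sub["label"] = preds
sub.to_csv("submission.csv", index=False)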