First, shuffle the data with sample(frac=1.0) and convert the category column to integers with LabelEncoder.
from sklearn.preprocessing import LabelEncoder
train_data = train_data.sample(frac = 1.0)
lbl = LabelEncoder().fit(train_data['LABEL'])
train_data['LABEL'] = lbl.transform(train_data['LABEL'])
First fit the encoder on the DataFrame's LABEL column, then replace ['LABEL'] with the integer codes produced by the LabelEncoder.
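If the integer predictions ever need to be mapped back to the original category names, the fitted encoder can reverse the mapping. A minimal sketch using the lbl object fitted above:
# lbl.classes_ lists the categories in the order of their integer codes
print(lbl.classes_)
# inverse_transform turns integer codes back into the original category strings
print(lbl.inverse_transform([0, 1, 2]))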
Next, split the data; here the hold-out method is used.
from sklearn.model_selection import train_test_split
tr_x, val_x, tr_y, val_y = train_test_split(
    train_data['NEWS_TITLE'], train_data['LABEL'],
    random_state=0,
    stratify=train_data['LABEL'],
    test_size=0.2)
The first argument to train_test_split is the sample features, the second is the sample labels; test_size is the fraction held out for validation, and stratify splits so that each label class keeps the same proportion in both splits.
tr_x is a pandas Series; it has to be converted with list(tr_x) before the tokenizer can encode it.
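To confirm that the stratified split keeps the class proportions, the label distributions of the two splits can be compared; a quick sanity check using the usual pandas Series methods:
# The class proportions should be nearly identical in the training and validation splits
print(tr_y.value_counts(normalize=True).sort_index())
print(val_y.value_counts(normalize=True).sort_index())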
Next, import BERT from transformers, starting with the pretrained Chinese BertTokenizer.
from transformers import BertTokenizer
# Load 'bert-base-chinese' to encode the characters
tokenizer = BertTokenizer.from_pretrained('bert-base-chinese')
train_encoding = tokenizer(list(tr_x), truncation=True, padding=True, max_length=32)
val_encoding = tokenizer(list(val_x), truncation=True, padding=True, max_length=32)
This step turns the text into token IDs. Only list(tr_x) can be encoded by the tokenizer. padding pads every sequence to the same length, truncation cuts off longer ones, and a good rule of thumb is to set max_length to about 90% of the longest input length.
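The encoding is a dict-like object containing input_ids, token_type_ids and attention_mask. A short sketch to inspect one encoded title and to estimate a max_length from the title lengths (the 90% rule of thumb above is a heuristic, not a library requirement):
import numpy as np
# Each entry of input_ids is one title, padded/truncated to max_length
print(train_encoding.keys())
# Decode the first title back to text to check the encoding
print(tokenizer.decode(train_encoding['input_ids'][0]))
# Estimate max_length: e.g. the 90th percentile of the raw title lengths
lengths = [len(t) for t in tr_x]
print(np.percentile(lengths, 90))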
import torch
from torch.utils.data import Dataset, DataLoader, TensorDataset

class TextDataset(Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        # BERT token handling:
        # building the dataset is essentially packing the tokens and the label into one item
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(self.labels.iloc[idx])
        return item

    def __len__(self):
        return len(self.labels)
train_dataset = TextDataset(train_encoding, tr_y)
test_dataset = TextDataset(val_encoding, val_y)
This class inherits from Dataset. __getitem__ returns one sample, combining the encoded tokens with its label, and __len__ returns the number of samples so the DataLoader knows how to batch the data. train_dataset is then built from train_encoding and tr_y (and test_dataset from val_encoding and val_y).
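A single item from the dataset can be inspected directly; it is a dict of tensors with the same keys as the encoding plus 'labels':
sample = train_dataset[0]
# Every value is a 1-D tensor of length max_length, except 'labels', which is a scalar tensor
for key, val in sample.items():
    print(key, val.shape)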
Import the BertForSequenceClassification model.
from transformers import BertForSequenceClassification
# Load 'bert-base-chinese' for classification; num_labels specifies the number of classes
model = BertForSequenceClassification.from_pretrained('bert-base-chinese', num_labels=13)
num_labels=13 sets the number of label classes.
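Instead of hard-coding 13, num_labels can also be taken from the fitted LabelEncoder so that it always matches the data; a small sketch:
num_labels = len(lbl.classes_)  # number of distinct categories seen by the LabelEncoder
model = BertForSequenceClassification.from_pretrained('bert-base-chinese', num_labels=num_labels)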
import torch
import numpy as np
from transformers import AutoModelForSequenceClassification, AdamW, get_linear_schedule_with_warmup
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
model.to(device)
# From reading single samples to reading batches
train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
test_dataloader = DataLoader(test_dataset, batch_size=32, shuffle=True)
# batch_size is the number of samples in one batch; DataLoader is what splits the data into batches.
#for i, data in enumerate(train_loader):
#    # i is the batch index; data holds that batch's inputs and the corresponding labels
#    print("Batch {} \n{}".format(i, data))
#print(len(train_loader))
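One batch from the loader is a dict of tensors whose first dimension is the batch size; a quick shape check:
batch = next(iter(train_loader))
# input_ids and attention_mask are [batch_size, max_length]; labels is [batch_size]
print(batch['input_ids'].shape, batch['attention_mask'].shape, batch['labels'].shape)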
# Define the optimizer, i.e. how the model learns
optim = AdamW(model.parameters(), lr=5e-5)
total_steps = len(train_loader) * 4  # batches per epoch × number of epochs (4 in the loop below)
# Define the loss function
loss_function = torch.nn.CrossEntropyLoss()
# Learning rate warmup, i.e. the learning rate update schedule
scheduler = get_linear_schedule_with_warmup(optim,
                                            num_warmup_steps=0,  # default value in run_glue.py
                                            num_training_steps=total_steps)
model.parameters() should always be passed to the optimizer.
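With num_warmup_steps=0 the schedule simply decays the learning rate linearly from 5e-5 down to 0 over total_steps; get_last_lr() can be used to check the current value at any point:
# At step t the learning rate is roughly 5e-5 * (1 - t / total_steps)
print(scheduler.get_last_lr())  # [5e-05] before any scheduler.step() calls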
def flat_accuracy(preds, labels):
    # Fraction of samples whose argmax prediction matches the true label
    pred_flat = np.argmax(preds, axis=1).flatten()
    labels_flat = labels.flatten()
    return np.sum(pred_flat == labels_flat) / len(labels_flat)
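A tiny check of flat_accuracy on made-up numbers:
import numpy as np
dummy_logits = np.array([[0.1, 2.0, 0.3], [1.5, 0.2, 0.1]])  # 2 samples, 3 classes
dummy_labels = np.array([1, 0])
print(flat_accuracy(dummy_logits, dummy_labels))  # 1.0: both argmax predictions match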
def train(train_loader):
    # model.train() tells the model it is training; layers such as dropout and
    # batch norm behave differently in training and evaluation mode.
    model.train()
    total_train_loss = 0
    iter_num = 0
    total_iter = len(train_loader)
    for batch in train_loader:
        # Read the batch, forward pass, compute the loss, compute gradients, update the weights
        # optim.zero_grad() resets the gradients to zero
        optim.zero_grad()
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)
        outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
        # loss = outputs[0]
        loss = loss_function(outputs[1], labels)
        total_train_loss += loss.item()
        # loss.backward() back-propagates the gradients
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
        # optim.step() updates the weights by gradient descent
        optim.step()
        scheduler.step()
        iter_num += 1
        if iter_num % 100 == 0:
            print("epoch: %d, iter_num: %d, loss: %.4f, %.2f%%" % (epoch, iter_num, loss.item(), iter_num/total_iter*100))
    print("Epoch: %d, Average training loss: %.4f" % (epoch, total_train_loss/len(train_loader)))
def validation(test_dataloader):
    # The validation set has no back-propagation step
    # model.eval() tells the model it is in evaluation mode
    model.eval()
    total_eval_accuracy = 0
    total_eval_loss = 0
    for batch in test_dataloader:
        with torch.no_grad():
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device)
            outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs[0]
        logits = outputs[1]
        total_eval_loss += loss.item()
        # detach() returns a new tensor cut off from the current computation graph
        # but sharing the same storage as the original; the only difference is that
        # requires_grad is False, so the detached tensor never accumulates gradients.
        logits = logits.detach().cpu().numpy()
        label_ids = labels.to('cpu').numpy()
        total_eval_accuracy += flat_accuracy(logits, label_ids)
    avg_val_accuracy = total_eval_accuracy / len(test_dataloader)
    print("Accuracy: %.4f" % (avg_val_accuracy))
    print("Average testing loss: %.4f" % (total_eval_loss/len(test_dataloader)))
    print("-------------------------------")
for epoch in range(4):
    print("------------Epoch: %d ----------------" % epoch)
    train(train_loader)
    validation(test_dataloader)
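After training, the fine-tuned model can classify a new title the same way: tokenize it, run a forward pass without gradients, take the argmax, and map it back with the LabelEncoder. A minimal sketch (the input title here is made up):
model.eval()
new_title = ["示例新闻标题"]  # hypothetical input title
enc = tokenizer(new_title, truncation=True, padding=True, max_length=32, return_tensors='pt')
with torch.no_grad():
    out = model(enc['input_ids'].to(device), attention_mask=enc['attention_mask'].to(device))
pred = out[0].argmax(dim=-1).cpu().numpy()  # out[0] is the logits when no labels are passed
print(lbl.inverse_transform(pred))  # predicted category name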