Work through the following knowledge points in order, one by one; skip any you already know.
1. Understand the LSTM and GRU structures; the two are quite similar (reference: 人人都能看懂的GRU). The GRU gate equations are sketched just after this list.
2. Understand the text preprocessing pipeline (reference: 中文文本预处理过程).
3. Understand word vectors (reference: 什么是词向量).
4. Understand torchtext (reference: TorchText之文本数据集处理).
5. Download Chinese pretrained word vectors (reference: 预训练的词向量).
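For quick reference, here is one standard formulation of the GRU, written in the convention used by the torch.nn.GRU documentation (r is the reset gate, z the update gate, n the candidate hidden state, \odot the element-wise product):

r_t = \sigma(W_{ir} x_t + b_{ir} + W_{hr} h_{t-1} + b_{hr})
z_t = \sigma(W_{iz} x_t + b_{iz} + W_{hz} h_{t-1} + b_{hz})
n_t = \tanh(W_{in} x_t + b_{in} + r_t \odot (W_{hn} h_{t-1} + b_{hn}))
h_t = (1 - z_t) \odot n_t + z_t \odot h_{t-1}

Compared with the LSTM, the GRU keeps no separate cell state and has one fewer gate, which is why the two structures look so alike.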
With that background, the model can now be trained.
Reference code:
## Import the modules needed in this chapter
import pandas as pd
from sklearn.metrics import accuracy_score, classification_report
from torch import nn, Tensor
import torch.optim as optim
from torchtext.data import Field, Example, Dataset, Iterator
from torchtext.vocab import Vectors
import torch
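# Note: Field, Example, Dataset and Iterator belong to torchtext's legacy API. The import above
# works on torchtext <= 0.8.x; on 0.9-0.11 they live under torchtext.legacy.data, and they were
# removed in 0.12+, so pin an older torchtext (or adjust the import) if this line fails.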
## Prepare the data with torchtext
# Define the processing applied to the text and label fields of the file
print('='*10+"start"+'='*10)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
BATCH_SIZE = 128
epochs = 10
## Define the tokenizer; simply splitting on spaces is enough here
mytokenize = lambda x: x.split()
fld_label = Field()
fld_text = Field()
# The label field is simple
fld_label.sequential = False  # defaults to True
fld_label.use_vocab = False   # defaults to True
# The text field
fld_text.tokenize = mytokenize
fld_text.sequential = True    # defaults to True
fld_text.use_vocab = True     # defaults to True
fld_text.batch_first = True
fld_text.fix_length = 1000
fields = [("text", fld_text), ("label", fld_label)]  # two fields; the order must match the [text, label] lists passed to Example.fromlist below
vectors = Vectors("word.iter5", "/home/featurize/data/text_data0311")
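# Vectors(name, cache): the first argument is the word-vector file ("word.iter5") and the second
# is the directory that holds/caches it; words missing from the file default to zero vectors.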
def load_data(data_file, build_vocab=False):
    data = pd.read_csv(data_file)  # csv: comma-separated values; tsv: tab-separated values
    examples = []
    for txt, lab in zip(data["text"], data["label"]):
        one_example = Example.fromlist([txt, lab], fields)
        examples.append(one_example)
    dataset = Dataset(examples, fields)
    # Keep the batch size moderate; if a batch is too large the GPU can run out of memory
    it_dataset, = Iterator.splits((dataset,), batch_sizes=(BATCH_SIZE,), shuffle=True)
    if build_vocab:
        # Build the vocabulary (initialised with the pretrained vectors, see above) from the
        # training set only, so that loading the test set does not overwrite it
        fld_text.build_vocab(dataset, vectors=vectors)
        # Labels are plain integers, so no word vectors are needed
        fld_label.build_vocab(dataset)
    return it_dataset
it_train = load_data("/home/featurize/data/text_data0311/train0308.csv", build_vocab=True)
it_valid = load_data("/home/featurize/data/text_data0311/test0308.csv")
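# Optional sanity check of one batch (shapes assume batch_first=True, fix_length=1000 and a full batch):
# batch = next(iter(it_train))
# print(batch.text.shape)   # torch.Size([128, 1000])  i.e. [BATCH_SIZE, fix_length]
# print(batch.label.shape)  # torch.Size([128])        i.e. [BATCH_SIZE]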
class GRUNet(nn.Module):
    def __init__(self, vocab_size, embedding_dim, hidden_dim, layer_dim, output_dim):
        """
        vocab_size: vocabulary size
        embedding_dim: dimensionality of the word vectors
        hidden_dim: number of GRU hidden units
        layer_dim: number of GRU layers
        output_dim: output dimensionality (number of classes)
        """
        super(GRUNet, self).__init__()
        self.hidden_dim = hidden_dim  # number of GRU hidden units
        self.layer_dim = layer_dim    # number of GRU layers
        # Map token indices to word vectors
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        # GRU + fully connected classifier head
        self.gru = nn.GRU(embedding_dim, hidden_dim, layer_dim,
                          batch_first=True)
        self.fc1 = nn.Sequential(
            nn.Linear(hidden_dim, hidden_dim),
            nn.Dropout(0.5),
            nn.ReLU(),
            nn.Linear(hidden_dim, output_dim)
        )

    def forward(self, x):
        embeds = self.embedding(x)
        r_out, h_n = self.gru(embeds, None)  # None: the initial hidden state is all zeros
        # Use the output at the last time step for classification
        out = self.fc1(r_out[:, -1, :])
        return out
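# Shape flow through GRUNet (assuming batch_first=True):
#   x:      [batch, seq_len]                  LongTensor of token indices
#   embeds: [batch, seq_len, embedding_dim]   after nn.Embedding
#   r_out:  [batch, seq_len, hidden_dim]      GRU output at every time step
#   out:    [batch, output_dim]               logits from the last time step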
## Instantiate the network
vocab_size = len(fld_text.vocab)
embedding_dim = vectors.dim  # dimensionality of the pretrained word vectors
# embedding_dim = 128  # alternative word-vector dimensionality
hidden_dim = 128
layer_dim = 1
output_dim = 2
grumodel = GRUNet(vocab_size, embedding_dim, hidden_dim, layer_dim, output_dim)
## Use the imported pretrained word vectors as the initial value of embedding.weight
grumodel.embedding.weight.data.copy_(fld_text.vocab.vectors)
grumodel.to(device)
## Zero-initialise the vectors of the <unk> and <pad> tokens
UNK_IDX = fld_text.vocab.stoi[fld_text.unk_token]
PAD_IDX = fld_text.vocab.stoi[fld_text.pad_token]
grumodel.embedding.weight.data[UNK_IDX] = torch.zeros(vectors.dim)
grumodel.embedding.weight.data[PAD_IDX] = torch.zeros(vectors.dim)
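# Note: an alternative to zeroing the <pad> row by hand is to build the embedding with
# nn.Embedding(vocab_size, embedding_dim, padding_idx=PAD_IDX), which also keeps that row from
# being updated during training; zeroing <unk> just gives unknown words a neutral starting vector.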
evaluate_loss = []
train_loss = []
## Define the training function for one epoch
def train(model, traindataloader, criterion, optimizer):
    all_label = torch.tensor([]).to(device)  # true labels
    all_pre = torch.tensor([]).to(device)    # predicted labels
    epoch_loss = 0.0
    model.train()  # put the model into training mode
    for step, batch in enumerate(traindataloader):
        textdata, target = batch.text, batch.label
        textdata, target = textdata.to(device), target.to(device)
        out = model(textdata)
        pre_lab = torch.argmax(out, 1)  # predicted class indices
        all_label = torch.cat((all_label, target), dim=0)
        all_pre = torch.cat((all_pre, pre_lab), dim=0)
        loss = criterion(out, target)  # loss for this batch
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        epoch_loss += loss.item()
    train_loss.append(epoch_loss)
    print('loss:{}'.format(epoch_loss))
    print(classification_report(all_label.cpu(), all_pre.cpu(),
                                target_names=['normal', 'depressed'], digits=4))
# Define the optimizer and the loss function
optimizer = optim.AdamW(grumodel.parameters(), lr=0.00005)
loss_func = nn.CrossEntropyLoss()  # cross-entropy as the loss function
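# Note: nn.CrossEntropyLoss applies log-softmax internally, so the model returns raw logits
# (no softmax in forward) and the targets are plain class indices (a LongTensor of 0/1 here).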
## Compute the loss and classification metrics on the validation set after each training epoch
def evaluate(model, testdataloader, criterion):
    model.eval()  # put the model into evaluation mode
    all_label = torch.tensor([]).to(device)  # true labels
    all_pre = torch.tensor([]).to(device)    # predicted labels
    test_loss = 0.0
    with torch.no_grad():  # no gradients are needed during evaluation
        for step, batch in enumerate(testdataloader):
            textdata, target = batch.text, batch.label.view(-1)
            textdata, target = textdata.to(device), target.to(device)
            out = model(textdata)
            loss = criterion(out, target)
            test_loss += loss.item() * len(target)
            pre_lab = torch.argmax(out, 1)
            all_label = torch.cat((all_label, target), dim=0)
            all_pre = torch.cat((all_pre, pre_lab), dim=0)
    evaluate_loss.append(test_loss)
    print('loss:{}'.format(test_loss))
    print(classification_report(all_label.cpu(), all_pre.cpu(),
                                target_names=['normal', 'depressed'], digits=4))
## Iterate training over all the data for `epochs` rounds, evaluating after each epoch
for i in range(epochs):
    train(grumodel, it_train, loss_func, optimizer)
    evaluate(grumodel, it_valid, loss_func)
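# Optionally persist the trained weights (the file name below is just an example):
# torch.save(grumodel.state_dict(), "gru_text_classifier.pt")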
# Plot the loss over the training process
import matplotlib.pyplot as plt
plt.plot(range(epochs), train_loss, color='blue')
plt.scatter(range(epochs), evaluate_loss, color='red')
plt.legend(['Train Loss', 'Test Loss'], loc='upper right')
plt.xlabel('epoch')
plt.ylabel('loss')
plt.show()