Sentiment classification: given a piece of text, decide whether it is a positive or a negative review.
Each word is encoded as a GloVe vector, the sequence is fed into an LSTM, and the output label 1/0 marks the review as good/bad.
# Field definitions: TEXT tokenizes each review with spaCy; LABEL stores the
# binary sentiment target as a float (required by BCEWithLogitsLoss).
TEXT = data.Field(tokenize='spacy')
LABEL = data.LabelField(dtype=torch.float)
train_data, test_data = datasets.IMDB.splits(TEXT, LABEL) # torchtext ships the IMDB dataset
class RNN(nn.Module):
    """Bidirectional two-layer LSTM sentiment classifier.

    Token ids -> embedding lookup -> bi-LSTM -> one raw logit per sequence
    (pair with BCEWithLogitsLoss, which applies the sigmoid itself).
    """

    def __init__(self, vocab_size, embedding_dim, hidden_dim):
        super(RNN, self).__init__()
        # Lookup table mapping each of `vocab_size` token ids to an
        # `embedding_dim`-dimensional vector (later overwritten with GloVe).
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        # Two stacked bidirectional LSTM layers; inter-layer dropout
        # regularizes against overfitting.
        self.rnn = nn.LSTM(embedding_dim, hidden_dim, num_layers=2,
                           bidirectional=True, dropout=0.5)
        # Forward and backward final states are concatenated, hence the *2.
        self.fc = nn.Linear(hidden_dim * 2, 1)
        self.dropout = nn.Dropout(0.5)

    def forward(self, x):
        """x: LongTensor of token ids, shape [seq_len, batch]."""
        # [seq_len, batch] -> [seq_len, batch, embedding_dim], then dropout
        embedded = self.dropout(self.embedding(x))
        # out:  [seq_len, batch, hidden_dim*2]
        # h_n/c_n: [num_layers*2, batch, hidden_dim]
        out, (h_n, c_n) = self.rnn(embedded)  # initial (h, c) default to zeros
        # Last layer's forward (h_n[-2]) and backward (h_n[-1]) final states,
        # joined into [batch, hidden_dim*2].
        final = self.dropout(torch.cat([h_n[-2], h_n[-1]], dim=1))
        # [batch, hidden_dim*2] -> [batch, 1]
        return self.fc(final)
# Build the model: vocabulary-sized embedding table, 100-d vectors
# (matching 100-d GloVe), 256 hidden units per LSTM direction.
rnn = RNN(len(TEXT.vocab), 100, 256)
# Overwrite the randomly initialized embedding weights with the pretrained
# GloVe vectors that torchtext loaded into TEXT.vocab.
pretrained_embedding = TEXT.vocab.vectors
rnn.embedding.weight.data.copy_(pretrained_embedding)
print('embedding layer inited.')
def train(rnn, iterator, optimizer, criteon):
    """Run one training epoch and return the mean per-batch accuracy.

    Args:
        rnn: model mapping [seq_len, batch] token ids to [batch, 1] logits.
        iterator: yields batches with .text ([seq_len, batch] LongTensor)
            and .label ([batch] float labels).
        optimizer: optimizer over rnn.parameters().
        criteon: loss on (logits, labels), e.g. BCEWithLogitsLoss.

    Returns:
        float mean accuracy over the epoch (0.0 if the iterator is empty).
    """
    avg_acc = []
    rnn.train()  # enable dropout for training
    for batch in iterator:
        # [seq_len, b] -> [b, 1] -> [b]
        pred = rnn(batch.text).squeeze(1)
        loss = criteon(pred, batch.label)
        avg_acc.append(binary_acc(pred, batch.label).item())
        # Standard update: clear stale grads, backprop, apply step.
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
    # FIX: the per-batch accuracies were collected but never used; return
    # their mean so callers can monitor training progress.
    return float(np.mean(avg_acc)) if avg_acc else 0.0
def eval(rnn, iterator, criteon):
    """Evaluate on `iterator`; prints and returns the mean batch accuracy.

    NOTE(review): this shadows the builtin `eval`; the name is kept so
    existing callers keep working, but consider renaming to `evaluate`.

    Returns:
        float mean accuracy (0.0 if the iterator is empty).
    """
    avg_acc = []
    rnn.eval()  # disable dropout for deterministic inference
    # No gradient bookkeeping is needed at test time.
    with torch.no_grad():
        for batch in iterator:
            # [b, 1] => [b]
            pred = rnn(batch.text).squeeze(1)
            loss = criteon(pred, batch.label)
            avg_acc.append(binary_acc(pred, batch.label).item())
    # FIX: return the computed metric instead of discarding it after printing.
    mean_acc = float(np.mean(avg_acc)) if avg_acc else 0.0
    print('>>test:', mean_acc)
    return mean_acc
def binary_acc(preds, y):
    """Return the classification accuracy of logits `preds` against `y`.

    A prediction counts as positive when sigmoid(logit) rounds to 1;
    `y` holds the ground-truth 0.0/1.0 labels.
    """
    hard_preds = torch.round(torch.sigmoid(preds))
    matches = torch.eq(hard_preds, y).float()
    return matches.sum() / len(matches)
# Training driver.
# NOTE(review): `device`, `train_iterator` and `test_iterator` are not
# defined in this snippet — presumably created with the data loading code
# elsewhere; confirm before running.
optimizer = optim.Adam(rnn.parameters(), lr=1e-3)
criteon = nn.BCEWithLogitsLoss().to(device)  # loss applies sigmoid internally
rnn.to(device)
for epoch in range(10):
    # Evaluate before training so the first report shows the untrained baseline.
    eval(rnn, test_iterator, criteon)
    train(rnn, train_iterator, optimizer, criteon)