HAN 模型由论文《Hierarchical Attention Networks for Document Classification》提出,模型结构如下图:
模型自下而上的结构如下:Embedding -> Bidirectional RNN(GRU/LSTM) -> Attention -> Bidirectional RNN(GRU/LSTM) -> Attention -> FullyConnectedLayer -> Sigmoid/Softmax
Word Encoder:
对词汇进行编码,建立词向量。接着用双向 RNN 从单词的两个方向汇总信息来获取单词的注释(annotation),从而将上下文信息合并到每个单词的注释中。
Word Attention:
对每句话的词语进行 Attention 操作,最后每句话都得到一个特征向量,可以看作句向量。
Sentence Encoder:
与 Word Encoder 相似,对句子级别也使用双向 RNN 获取上下句的信息。
Sentence Attention:
与 Word Attention 相似,对所有句子进行 Attention 操作,得到所有句向量的加权平均,作为整个输入(文档)的特征向量。
Document Classification:
常规全连接网络
其中,Attention 机制大概就是一个 MLP + softmax:
class SelfAttention(nn.Module):
    """Attention pooling over dimension 1 of the input.

    A linear layer followed by ``tanh`` produces a hidden representation, a
    second linear layer maps it to one score per position, and a softmax over
    dimension 1 turns those scores into weights. The result is the weighted
    sum of the input along dimension 1, so the last dimension of the output
    equals ``input_size``.
    """

    def __init__(self, input_size, hidden_size):
        super().__init__()
        # Projection into the attention hidden space (bias enabled).
        self.W = nn.Linear(input_size, hidden_size, bias=True)
        # Maps each hidden vector to a single scalar score.
        self.u = nn.Linear(hidden_size, 1)

    def forward(self, x):
        # assumes x is (batch, seq_len, input_size) — TODO confirm with callers
        hidden = torch.tanh(self.W(x))
        scores = self.u(hidden)
        # Normalise the scores across dimension 1 so the weights sum to 1.
        weights = F.softmax(scores, dim=1)
        # The trailing size-1 axis of the weights broadcasts over the features.
        return (weights * x).sum(dim=1)
代码实现
#%%
import torch
import numpy as np
import torch.nn as nn
import torch.optim as optim
import torch.utils.data as Data
import torch.nn.functional as F
# CPU float tensor type, kept as a module-level alias.
dtype = torch.FloatTensor
# Use the GPU when PyTorch reports CUDA as available, otherwise the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
class SelfAttention(nn.Module):
    """Attention pooling over dimension 1 of the input.

    A linear layer followed by ``tanh`` produces a hidden representation, a
    second linear layer maps it to one score per position, and a softmax over
    dimension 1 turns those scores into weights. The result is the weighted
    sum of the input along dimension 1, so the last dimension of the output
    equals ``input_size``.
    """

    def __init__(self, input_size, hidden_size):
        super().__init__()
        # Projection into the attention hidden space (bias enabled).
        self.W = nn.Linear(input_size, hidden_size, bias=True)
        # Maps each hidden vector to a single scalar score.
        self.u = nn.Linear(hidden_size, 1)

    def forward(self, x):
        # assumes x is (batch, seq_len, input_size) — TODO confirm with callers
        hidden = torch.tanh(self.W(x))
        scores = self.u(hidden)
        # Normalise the scores across dimension 1 so the weights sum to 1.
        weights = F.softmax(scores, dim=1)
        # The trailing size-1 axis of the weights broadcasts over the features.
        return (weights * x).sum(dim=1)
class HAN(nn.Module):
    """Hierarchical Attention Network for document classification.

    Pipeline: embedding -> bidirectional GRU -> attention pooling ->
    bidirectional GRU -> attention pooling -> linear layer -> log-softmax.

    Args:
        num_embeddings: Size of the embedding table. Index 0 is the padding
            index. The default ``5844 + 1`` is presumably a 5844-token
            vocabulary plus the padding entry — confirm against the dataset.
        num_classes: Number of output classes.
        num_words: Number of rows each document is split into before the
            first GRU; it is also the sequence length seen by the second GRU.
        embedding_dim: Dimension of the token embeddings.
        hidden_size_gru: Hidden size of each GRU direction. Both GRUs are
            bidirectional, so they emit ``2 * hidden_size_gru`` features.
        hidden_size_att: Hidden size of the MLP inside both attention layers.
    """

    def __init__(
        self,
        num_embeddings=5844 + 1,
        num_classes=10,
        num_words=60,
        embedding_dim=200,
        hidden_size_gru=50,
        hidden_size_att=100,
    ):
        # The original called super(HAN1, self), but HAN1 is not defined
        # anywhere, so building the model raised NameError.
        super().__init__()

        # A bidirectional GRU concatenates both directions, and SelfAttention
        # returns vectors of its *input* size. Every layer that follows a GRU
        # or an attention layer therefore consumes 2 * hidden_size_gru
        # features. (The original passed hidden_size_att here, which only
        # worked because both values happened to be 100.)
        gru_out = hidden_size_gru * 2

        self.num_words = num_words
        self.embed = nn.Embedding(num_embeddings, embedding_dim, padding_idx=0)
        self.gru1 = nn.GRU(
            embedding_dim, hidden_size_gru, bidirectional=True, batch_first=True
        )
        self.att1 = SelfAttention(gru_out, hidden_size_att)
        self.gru2 = nn.GRU(
            gru_out, hidden_size_gru, bidirectional=True, batch_first=True
        )
        self.att2 = SelfAttention(gru_out, hidden_size_att)
        self.fc = nn.Linear(gru_out, num_classes, bias=True)

    def forward(self, x):
        """Return per-class log-probabilities of shape ``(batch, num_classes)``.

        ``x`` holds integer token indices with the batch on dimension 0; the
        element count per document must be divisible by ``num_words``. The
        output is log-softmax, so pair it with ``nn.NLLLoss``.
        """
        batch_size = x.size(0)

        # NOTE(review): each document is split into `num_words` rows and the
        # row length is inferred, so `num_words` is the number of groups fed
        # to the second (sentence-level) GRU. Confirm that this matches the
        # layout of the input data (words-per-sentence vs. sentences-per-doc).
        x = x.reshape(batch_size * self.num_words, -1)

        # First level: encode each row and pool it into one vector.
        x = self.embed(x)
        x, _ = self.gru1(x)
        x = self.att1(x)

        # Second level: regroup the pooled vectors per document.
        x = x.reshape(batch_size, self.num_words, -1)
        x, _ = self.gru2(x)
        x = self.att2(x)

        x = self.fc(x)
        return F.log_softmax(x, dim=1)