PyTorch in Practice (1): Bi-LSTM Sentiment Classification

Sentiment classification is the task of analyzing a piece of text and deciding whether it expresses a positive or a negative opinion.


Each word of the text is encoded as a vector using GloVe embeddings, the sequence is fed into an LSTM, and the output label 1/0 marks the text as positive/negative.

Load dataset

import torch
import numpy as np
from torchtext import data, datasets  # torchtext <= 0.8; in 0.9+ these live in torchtext.legacy
TEXT = data.Field(tokenize='spacy')         # tokenize with spaCy
LABEL = data.LabelField(dtype=torch.float)
train_data, test_data = datasets.IMDB.splits(TEXT, LABEL)  # torchtext ships the IMDB dataset
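
Before the embedding lookup below can work, the fields still need a vocabulary, and the training loop needs batch iterators. A minimal sketch (the `max_size`, `batch_size`, and GloVe variant are assumed values, not from the original post):

# Build a 10,000-word vocabulary (plus <unk>/<pad> => 10,002 indices) and
# attach 100-dim GloVe vectors; the sizes here are assumptions for illustration.
TEXT.build_vocab(train_data, max_size=10000, vectors='glove.6B.100d')
LABEL.build_vocab(train_data)

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
train_iterator, test_iterator = data.BucketIterator.splits(
    (train_data, test_data), batch_size=30, device=device)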

Network

from torch import nn

class RNN(nn.Module):
    
    def __init__(self, vocab_size, embedding_dim, hidden_dim):
        super(RNN, self).__init__()       
        # indices [0..10001] => [100]: ~10,000 words, 100 features each
        self.embedding = nn.Embedding(vocab_size, embedding_dim)  # [10002, 100]
        # [100] => [256]: 2-layer bi-LSTM; dropout between layers against overfitting
        self.rnn = nn.LSTM(embedding_dim, hidden_dim, num_layers=2,                       
                           bidirectional=True, dropout=0.5)
        # [256*2] => [1]
        self.fc = nn.Linear(hidden_dim * 2, 1) 
        self.dropout = nn.Dropout(0.5)
        
    def forward(self, x):
        """
        x: [seq_len, b] word indices (unlike an image input of shape [b, 3, 28, 28])
        """
        # [seq, b] => [seq, b, 100]: embed first, then dropout
        embedding = self.dropout(self.embedding(x))

        # output: [seq, b, hid_dim*2]
        # h/c: [num_layers*2, b, hid_dim]
        output, (hidden, cell) = self.rnn(embedding)  # initial (h, c) default to zeros, so they are omitted
        
        # concatenate the last layer's forward and backward hidden states
        # [num_layers*2, b, hid_dim] => 2 of [b, hid_dim] => [b, hid_dim*2]
        hidden = torch.cat([hidden[-2], hidden[-1]], dim=1)
        
        # [b, hid_dim*2] => [b, 1]
        hidden = self.dropout(hidden)
        out = self.fc(hidden)
      
        return out
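
A quick shape check with dummy token indices (the vocabulary size matches the comment above; sequence length and batch size are arbitrary):

# dummy forward pass: 10,002-word vocab, seq_len=5, batch=4
model = RNN(vocab_size=10002, embedding_dim=100, hidden_dim=256)
x = torch.randint(0, 10002, (5, 4))  # [seq_len, b]
print(model(x).shape)                # torch.Size([4, 1])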

Load word embedding

rnn = RNN(len(TEXT.vocab), 100, 256)
# copy the GloVe weight matrix over the embedding layer's weights,
# so each word's features come from the pretrained GloVe vectors
pretrained_embedding = TEXT.vocab.vectors 
rnn.embedding.weight.data.copy_(pretrained_embedding)
print('embedding layer inited.')
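
Optionally (not in the original post), the `<unk>` and `<pad>` rows, which have no GloVe vector of their own, can be zeroed so they start out neutral:

# optional: zero the embedding rows for <unk> and <pad>
UNK_IDX = TEXT.vocab.stoi[TEXT.unk_token]
PAD_IDX = TEXT.vocab.stoi[TEXT.pad_token]
rnn.embedding.weight.data[UNK_IDX] = torch.zeros(100)
rnn.embedding.weight.data[PAD_IDX] = torch.zeros(100)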

Train

def train(rnn, iterator, optimizer, criteon):
    avg_acc = []
    rnn.train()
    
    for i, batch in enumerate(iterator):
        # [seq, b] => [b, 1] => [b]
        pred = rnn(batch.text).squeeze(1)
        loss = criteon(pred, batch.label)
        acc = binary_acc(pred, batch.label).item()
        avg_acc.append(acc)
        # keep updating the weights so that pred moves toward label
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

    print('>>train:', np.array(avg_acc).mean())
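
Note that `pred` holds raw logits: `nn.BCEWithLogitsLoss` applies the sigmoid internally, which is why only `binary_acc` below calls `torch.sigmoid` explicitly. A quick check of the equivalence (values chosen arbitrarily):

logits = torch.tensor([0.8, -1.2])
y = torch.tensor([1., 0.])
print(nn.BCEWithLogitsLoss()(logits, y))       # sigmoid is applied inside the loss
print(nn.BCELoss()(torch.sigmoid(logits), y))  # same value when applied by hand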

Test

def eval(rnn, iterator, criteon):
    avg_acc = []
    rnn.eval()
    # no gradient updates during testing
    with torch.no_grad():
        for batch in iterator:
            # [b, 1] => [b]
            pred = rnn(batch.text).squeeze(1)
            loss = criteon(pred, batch.label)
            acc = binary_acc(pred, batch.label).item()
            avg_acc.append(acc)
    avg_acc = np.array(avg_acc).mean()
    print('>>test:', avg_acc)

Others

def binary_acc(preds, y):
    """
    accuracy of binary predictions: sigmoid, round to 0/1, compare with labels
    """
    preds = torch.round(torch.sigmoid(preds))
    correct = torch.eq(preds, y).float()
    acc = correct.sum() / len(correct)
    return acc
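
For example (values chosen for illustration):

# logits [2.0, -1.0] -> sigmoid [0.88, 0.27] -> round [1., 0.]
# against labels [1., 1.] that is 1 correct out of 2
print(binary_acc(torch.tensor([2.0, -1.0]), torch.tensor([1., 1.])))  # tensor(0.5000)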
from torch import optim

optimizer = optim.Adam(rnn.parameters(), lr=1e-3)
criteon = nn.BCEWithLogitsLoss().to(device)
rnn.to(device)

for epoch in range(10):
    train(rnn, train_iterator, optimizer, criteon)
    eval(rnn, test_iterator, criteon)  # evaluate after each training epoch
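
Once trained, the model can score a raw sentence. A minimal sketch, assuming spaCy's English model is installed; `predict_sentiment` is a hypothetical helper, not part of the original post:

import spacy
nlp = spacy.load('en')  # 'en_core_web_sm' in newer spaCy releases

def predict_sentiment(rnn, sentence):
    rnn.eval()
    tokens = [tok.text for tok in nlp.tokenizer(sentence)]
    indices = [TEXT.vocab.stoi[t] for t in tokens]          # unknown words map to <unk>
    x = torch.LongTensor(indices).unsqueeze(1).to(device)   # [seq_len, 1]
    return torch.sigmoid(rnn(x)).item()                     # probability of a positive review

print(predict_sentiment(rnn, "This film is great"))  # close to 1 for positive text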

 
