lstm token分类模型代码解析(直接传入batch数据,测试pad和pack_padded、pad_packed三函数)

lstm token分类模型代码解析

文章目录

  • lstm token分类模型代码解析
    • 一、加载数据
      • 1.2 定义dataset
      • 1.3 设置整理函数,将变长序列打包
        • 1.3.2 pad_sequence函数测试
        • 1.3.3 pack_padded_sequence函数测试
        • 1.3.4 lstm的输出测试
        • 1.3.5 pad_packed_sequence函数

代码参考车万翔老师的 《plm-nlp-code/chp4/lstm_postag.py 》
需要copy整个文件夹,安装nltk代码才能正常跑

import nltk
nltk.download()
#选择安装所有

一、加载数据

import torch
from torch import nn, optim
from torch.nn import functional as F
from torch.utils.data import Dataset, DataLoader
from torch.nn.utils.rnn import pad_sequence, pack_padded_sequence, pad_packed_sequence
from collections import defaultdict
from vocab import Vocab
from utils import 

batch_size=5#最开始只是为了测试,结果设了5....
train_data, test_data, vocab, pos_vocab = load_treebank()
train_dataset = LstmDataset(train_data)
test_dataset = LstmDataset(test_data)
train_data_loader = DataLoader(train_dataset, batch_size=batch_size, collate_fn=collate_fn, shuffle=True)
test_data_loader = DataLoader(test_dataset, batch_size=1, collate_fn=collate_fn, shuffle=False)

1.2 定义dataset

class LstmDataset(Dataset):
    def __init__(self, data):
        self.data = data

    def __len__(self):
        return len(self.data)

    def __getitem__(self, i):
        return self.data[i]

1.3 设置整理函数,将变长序列打包

def collate_fn(examples):
    lengths = torch.tensor([len(ex[0]) for ex in examples])
    inputs = [torch.tensor(ex[0]) for ex in examples]
    targets = [torch.tensor(ex[1]) for ex in examples]
    #将变长序列pad到同一长度
    inputs = pad_sequence(inputs, batch_first=True, padding_value=vocab[""])
    targets = pad_sequence(targets, batch_first=True, padding_value=vocab[""])
    return inputs, lengths, targets, inputs != vocab[""]

1.3.2 pad_sequence函数测试

for batch in train_dataset[:5]:
    print(batch)

([2, 3, 4, 5, 6, 7, 4, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18], [1, 1, 2, 3, 4, 5, 2, 6, 7, 8, 9, 10, 8, 5, 9, 1, 3, 11])
([19, 3, 20, 21, 22, 23, 24, 4, 10, 25, 26, 27, 18], [1, 1, 12, 9, 10, 1, 1, 2, 8, 1, 13, 9, 11])
([28, 29, 4, 30, 6, 7, 31, 32, 21, 22, 33, 34, 35, 36, 4, 37, 38, 39, 13, 14, 15, 22, 40, 41, 42, 43, 18], [1, 1, 2, 3, 4, 5, 14, 5, 9, 10, 1, 1, 1, 1, 2, 15, 16, 17, 8, 5, 9, 10, 8, 5, 5, 9, 11])
([44, 45, 22, 46, 47, 48, 49, 49, 50, 51, 52, 53, 54, 55, 56, 13, 57, 58, 22, 59, 60, 61, 13, 27, 22, 62, 63, 49, 50, 64, 65, 66, 67, 6, 68, 4, 69, 70, 71, 72, 18], [8, 9, 10, 9, 18, 16, 17, 17, 19, 7, 1, 9, 4, 12, 16, 8, 5, 9, 10, 9, 4, 10, 8, 9, 10, 4, 16, 17, 19, 20, 21, 10, 3, 4, 10, 2, 4, 15, 17, 17, 11])
([73, 46, 74, 4, 75, 4, 20, 76, 77, 47, 64, 78, 10, 79, 4, 80, 81, 82, 83, 50, 64, 84, 85, 86, 72, 87, 88, 89, 90, 4, 69, 91, 71, 92, 18], [8, 9, 9, 2, 9, 2, 12, 18, 5, 10, 20, 12, 8, 4, 2, 10, 18, 5, 4, 19, 20, 13, 4, 22, 17, 23, 24, 4, 5, 2, 4, 15, 17, 17, 11])
可以看到原始的dataset数据确实是长度不一的。
for batch in train_data_loader:
    inputs, lengths, targets, mask = [x for x in batch]
    #output=collate_fn(batch)
    #print(output)
    break
test_input=tensor([[1815, 1041, 6262, 6229, 2383,  104, 1424,  177,  501, 1672,  503,  670,
           50,  501,  734,  503,  670,   13, 6224,   18,    1,    1,    1,    1,
            1,    1,    1,    1,    1,    1],
        [ 204, 7889,    4, 7890,  420,  159,   13, 7891, 5282,   22, 2943,    4,
         1413,   50, 6373,  420,  152, 7892, 4969,   22, 2943, 3599,  118, 1393,
           18,    1,    1,    1,    1,    1],
        [ 570, 1267, 1472,   99,    4,   22,   96, 6408,    4,   31,  105,  376,
            4, 2664,  666, 7961,    4, 1111,   22,   96, 6408,    4, 7951,   50,
           13,  501, 5074,  503, 1059,   18],
        [1209,    4,   10, 1210,  566, 1211, 1212, 1213,  480,  189,  148,   13,
         1214,  259, 1106, 1215,  589,   22,  105, 1216, 1217,   22, 1110,   22,
           96, 1093,   18,    1,    1,    1],
        [  73,  376,  794,  267, 1012, 4561,   40, 8305,  259,   39, 9309,  619,
         5722,   49,   50, 4873, 1062, 1355,    4,  449,   10, 9312, 2601, 5270,
         9313,  683,   50,   10,  624,   18]])
lengths=tensor([20, 25, 30, 27, 30])

targets= tensor([[ 8,  5,  9,  4, 15, 10,  9, 10, 35,  3, 17, 17, 19, 35,  3, 17, 17,  8,
          9, 11,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1],
        [10,  4,  2,  3,  9, 23,  8,  5,  9, 10, 25,  2, 16, 19,  3,  9, 31, 17,
         23, 10, 25,  9, 27,  9, 11,  1,  1,  1,  1,  1],
        [ 1,  1,  1,  1,  2, 10,  1,  1,  2, 14, 25,  9,  2,  1,  1,  1,  2, 18,
         10,  1,  1,  2, 15, 19,  8, 35,  3, 17,  9, 11],
        [18,  2,  8,  1,  1,  1,  1, 15, 24, 10, 18,  8,  9, 10,  1, 13,  9, 10,
         25,  5,  9, 10,  1, 10,  1,  1, 11,  1,  1,  1],
        [ 8,  9,  6, 18, 18,  7,  8,  9, 10, 17, 13,  4,  5, 17, 19,  7, 10,  9,
          2, 10,  8,  1,  1, 12,  5,  9, 19,  8,  9, 11]])
mask=tensor([[ True,  True,  True,  True,  True,  True,  True,  True,  True,  True,
          True,  True,  True,  True,  True,  True,  True,  True,  True,  True,
         False, False, False, False, False, False, False, False, False, False],
        [ True,  True,  True,  True,  True,  True,  True,  True,  True,  True,
          True,  True,  True,  True,  True,  True,  True,  True,  True,  True,
          True,  True,  True,  True,  True, False, False, False, False, False],
        [ True,  True,  True,  True,  True,  True,  True,  True,  True,  True,
          True,  True,  True,  True,  True,  True,  True,  True,  True,  True,
          True,  True,  True,  True,  True,  True,  True,  True,  True,  True],
        [ True,  True,  True,  True,  True,  True,  True,  True,  True,  True,
          True,  True,  True,  True,  True,  True,  True,  True,  True,  True,
          True,  True,  True,  True,  True,  True,  True, False, False, False],
        [ True,  True,  True,  True,  True,  True,  True,  True,  True,  True,
          True,  True,  True,  True,  True,  True,  True,  True,  True,  True,
          True,  True,  True,  True,  True,  True,  True,  True,  True,  True]])


返回的是上面四个值。(其实是连载一起,没有命名的)

  • pad之后的inputs
  • 记录句子原始长度的length(可以后续传入pack_padded_sequence函数进行打包(压缩去掉pad位置)
  • pad之后的target和target真假矩阵(pad位置的target为假。)

1.3.3 pack_padded_sequence函数测试

pack_padded_sequence源代码为:

 def pack_padded_sequence(input, lengths, batch_first=False, enforce_sorted=True):
    ...
    if enforce_sorted:
        sorted_indices = None
    else:
        lengths, sorted_indices = torch.sort(lengths, descending=True)
        sorted_indices = sorted_indices.to(input.device)
        batch_dim = 0 if batch_first else 1
        input = input.index_select(batch_dim, sorted_indices)
        data, batch_sizes = _VF._pack_padded_sequence(input, lengths, batch_first)
    return _packed_sequence_init(data, batch_sizes, sorted_indices, None)

直接看测试结果:

x_pack = pack_padded_sequence(input1,lengths, batch_first=True, enforce_sorted=False)
x_pack 
#下面结果中, batch_sizes有20个5,表示前20次都是五个序列都取值了。所以最短序列长度是20。
#接着有5个4,表示第二短的序列长25.依次类推。
PackedSequence(data=tensor([ 570,   73, 1209,  204, 1815, 1267,  376,    4, 7889, 1041, 1472,  794,
          10,    4, 6262,   99,  267, 1210, 7890, 6229,    4, 1012,  566,  420,
        2383,   22, 4561, 1211,  159,  104,   96,   40, 1212,   13, 1424, 6408,
        8305, 1213, 7891,  177,    4,  259,  480, 5282,  501,   31,   39,  189,
          22, 1672,  105, 9309,  148, 2943,  503,  376,  619,   13,    4,  670,
           4, 5722, 1214, 1413,   50, 2664,   49,  259,   50,  501,  666,   50,
        1106, 6373,  734, 7961, 4873, 1215,  420,  503,    4, 1062,  589,  152,
         670, 1111, 1355,   22, 7892,   13,   22,    4,  105, 4969, 6224,   96,
         449, 1216,   22,   18, 6408,   10, 1217, 2943,    4, 9312,   22, 3599,
        7951, 2601, 1110,  118,   50, 5270,   22, 1393,   13, 9313,   96,   18,
         501,  683, 1093, 5074,   50,   18,  503,   10, 1059,  624,   18,   18]),

 batch_sizes=tensor([5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 4, 4, 4, 4,
        4, 3, 3, 2, 2, 2]), 
sorted_indices=tensor([2, 4, 3, 1, 0]), 
unsorted_indices=tensor([4, 3, 0, 2, 1]))

可以看到5个序列中都没有pad value。返回值有:

  • PackedSequence :即按位置依次取所有序列的token,跳过pad位置。如:在五个序列中依次取位置1的token、位置2的token一直到最短序列取完。然后取剩下4个序列的token。
  • batch_size表示返回的每个时间步取值数。(比如第一次取了5个序列,第21次只取了4个序列)
  • sorted_indices=tensor([2, 4, 3, 1, 0])表示length元素降序排列后,每个元素原来的位置索引。

举例:

torch.sort(input, dim=-1, descending=False, stable=False, *, out=None)
1.沿着给定的维度对input张量的元素进行升序排序。若参数dim没有给出,则选择 `input` 的最后一个维度。
2.descending=True,则元素按降序排序。
3.stable=True,则排序例程变得稳定,保留等效元素的顺序
lengths=tensor([20, 25, 30, 27, 30])
lengths, sorted_indices = torch.sort(lengths, descending=True)
print(lengths,'\n',sorted_indices )

tensor([30, 30, 27, 25, 20]) 
tensor([2, 4, 3, 1, 0])

1.3.4 lstm的输出测试

embedding_dim = 128
hidden_dim = 256

class LSTM(nn.Module):
    def __init__(self, vocab_size, embedding_dim, hidden_dim, num_class):
        super(LSTM, self).__init__()
        self.embeddings = nn.Embedding(vocab_size, embedding_dim)
        self.lstm = nn.LSTM(embedding_dim, hidden_dim, batch_first=True)
        self.output = nn.Linear(hidden_dim, num_class)
        init_weights(self)

    def forward(self, inputs, lengths):
        embeddings = self.embeddings(inputs)
        x_pack = pack_padded_sequence(embeddings, lengths, batch_first=True, enforce_sorted=False)
        hidden, (hn, cn) = self.lstm(x_pack)
        #hidden, len = pad_packed_sequence(hidden, batch_first=True)
        #outputs = self.output(hidden)
        #log_probs = F.log_softmax(outputs, dim=-1)
        return log_probs

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = LSTM(len(vocab), embedding_dim, hidden_dim, num_class)
model.to(device) #将模型加载到GPU中(如果已经正确安装)

测试:
model1为注释掉上面三行,直接看 x_pack 输入lstm的结果hidden,
model2为注释两行加pad_packed_sequence的结果。

model1=LSTM1(len(vocab), embedding_dim, hidden_dim, num_class)
model2=LSTM2(len(vocab), embedding_dim, hidden_dim, num_class)
for batch in train_data_loader:
    inputs, lengths, targets, mask = [x for x in batch]
    hidden1= model1(inputs, lengths)
    print(inputs,inputs.shape)
    print('    1    ')
    print(hidden1[0],hidden1[1],hidden1[2],hidden1[3])
    print('    2  ++++++++++++++++++++++++++     ')
    print(hidden1[0].shape,hidden1[1].shape,hidden1[2].shape,hidden1[3].shape)
    
    print('***************************************************************')
    print('***************************************************************')
    hidden2= model2(inputs, lengths)
    print(inputs,inputs.shape)
    print('         3   #######################################              ')
    print(hidden2[0],hidden2[1],hidden2[2],hidden2[3])
    print('          4   ======================================          ')
    print(hidden2[0].shape,hidden2[1].shape,hidden2[2].shape,hidden2[3].shape)
    
    break
input=tensor([[1735, 4372, 4402,   39,   50, 4403,  811,  129, 4404,  149,  104,   10,
         2901,  118, 4405,   31, 4406, 4407,    4,   31, 1736, 1111,  523,  370,
          125, 1181, 1488,  104, 2943, 4408,   18,    1,    1,    1,    1,    1,
            1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,
            1,    1,    1],
        [7110,  281, 4158,   22, 8176,    4,   64, 5679, 5260,   39,   50, 1168,
         6567,   22, 9791, 8695,  104,   10, 1424,   22,   13, 2374, 1968,  214,
          104,   40, 1386,    4,   13, 9766,   22, 2373,  214,  104,  193, 2470,
          214,  104,   40, 1386,   10,   96, 1231, 1232, 1233,   31,   10, 3962,
         7159, 2470,   18],
        [ 379, 4967,    4,   39, 4968,   86, 4955,    8, 4969,   13, 1724, 4970,
          294, 4925, 4926, 2434,    4, 4971, 1127,   88, 1225,   22, 4955, 1873,
          157,   10,  987,   18,    1,    1,    1,    1,    1,    1,    1,    1,
            1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,
            1,    1,    1],
        [  19, 7704, 1895,   19, 2113,   31, 1307, 3330, 4116, 1189,    4, 7798,
         7799,    4,  137,   39, 4027,   50, 1712,  157, 7800,   18,    1,    1,
            1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,
            1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,
            1,    1,    1]]) 
            
torch.Size([4, 51])
hidden1=tensor([[ 0.0029,  0.0193,  0.0212,  ...,  0.0001, -0.0066,  0.0003],
        [-0.0144,  0.0252,  0.0266,  ...,  0.0096,  0.0107, -0.0115],
        [ 0.0010,  0.0102,  0.0305,  ...,  0.0034, -0.0094, -0.0114],
        ...,
        [-0.0067,  0.0104,  0.0418,  ..., -0.0184, -0.0153, -0.0887],
        [-0.0151,  0.0101,  0.0386,  ..., -0.0038, -0.0186, -0.0766],
        [-0.0219,  0.0197,  0.0579,  ...,  0.0024, -0.0250, -0.0852]],
       grad_fn=<CatBackward>)
       
        tensor([4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 3, 3,
        3, 3, 3, 3, 2, 2, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
        1, 1, 1])#hidden[1]就是length。原序列长[22,28,31,51],和为132。
        
        tensor([1, 0, 2, 3])
        tensor([1, 0, 2, 3])
        #hidden1各元素的shape。
        torch.Size([132, 256]) torch.Size([51]) torch.Size([4]) torch.Size([4])

可以看出x_pack后输入lstm的直接输出结果有四个元素:

  • 模型输出向量,形状torch.Size([132, 256]),即去掉pad之后的序列拉直的长度。
  • pack_padded_sequence时的length列表,长51。
  • sorted_indices和unsorted_indices,值一样。
hidden2=tensor([[-0.0149,  0.0012,  0.0159,  ..., -0.0335, -0.0006, -0.0271],
        [-0.0339,  0.0047,  0.0248,  ..., -0.0422, -0.0209, -0.0569],
        [-0.0379,  0.0104,  0.0459,  ..., -0.0629, -0.0205, -0.0533],
        ...,
        [ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000],
        [ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000],
        [ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000]],
       grad_fn=<SelectBackward>) 

tensor([[-0.0121,  0.0010,  0.0193,  ..., -0.0180, -0.0015, -0.0323],
        [-0.0154,  0.0147,  0.0470,  ..., -0.0219, -0.0197, -0.0456],
        [-0.0045,  0.0210,  0.0554,  ..., -0.0435, -0.0271, -0.0440],
        ...,
        [-0.0362, -0.0124,  0.0628,  ..., -0.0643, -0.0345, -0.0596],
        [-0.0319, -0.0056,  0.0538,  ..., -0.0695, -0.0578, -0.0691],
        [-0.0331, -0.0223,  0.0400,  ..., -0.0821, -0.0560, -0.0682]],
       grad_fn=<SelectBackward>) 

tensor([[-0.0119, -0.0006,  0.0208,  ..., -0.0235, -0.0029, -0.0189],
        [-0.0590,  0.0138,  0.0322,  ..., -0.0340, -0.0180, -0.0385],
        [-0.0382,  0.0104,  0.0454,  ..., -0.0520, -0.0409, -0.0286],
        ...,
        [ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000],
        [ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000],
        [ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000]],
       grad_fn=<SelectBackward>) 

tensor([[-0.0165,  0.0155,  0.0077,  ..., -0.0130,  0.0058, -0.0343],
        [-0.0369,  0.0260,  0.0282,  ..., -0.0433, -0.0221, -0.0409],
        [-0.0434,  0.0133,  0.0344,  ..., -0.0546, -0.0403, -0.0254],
        ...,
        [ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000],
        [ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000],
        [ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000]],
       grad_fn=<SelectBackward>)
          4   ======================================          
torch.Size([51, 256]) torch.Size([51, 256]) torch.Size([51, 256]) torch.Size([51, 256])

1.3.5 pad_packed_sequence函数

def pad_packed_sequence(sequence, batch_first=False, padding_value=0.0, total_length=None):
    max_seq_length = sequence.batch_sizes.size(0)
    if total_length is not None:
        max_seq_length = total_length
    ...

total_length,它是 sequence 需要去被padding的长度,我们期望的一般都是padding到和输入序列一样的 time_step 长度 ,但是PackedSequence 型数据并没有记录这个数据,因此它用的是sequence.batch_sizes.size(0),也就是 batch_sizes 这个tensor的长度。

总结:

  1. pad_sequence可以将变长序列打包,写在整理函数里面。返回pad后的输入、原始length和 sorted_indices、unsorted_indices。
  2. pack_padded_sequence可以去掉pad部分(需要传入上一步的length)得到PackedSequence是一维数组,选取的是各个时间步的序列值,和原来句子序列已经不是一样的位置了。而且直接输入lstm模型得到结果。必须进行处理
  3. pad_packed_sequence 将上一步错乱的结果恢复成原来顺序,并pad到同一长度。
  4. 所以pack_padded_sequence和pad_packed_sequence必须同时使用,特别是token标注的时候。如果是句子分类,使用的是最后时刻的隐藏向量,就不用管恢复原顺序了。

bert输出和token分类任务头参考https://colab.research.google.com/drive/1m1x6Vu1b5_koh5jB34hRce766uS_1lbz#scrollTo=putqMIZgcNDm

#pandas读取到dataset后分词:
tokenized_trains_ds=trains_ds.map(lambda examples:fast_tokenizer(examples['summary'],truncation=True,padding=True),batched=True)
#句子是分割的字
tokenized_trains_ds=data_test.map(lambda examples:tokenizer(examples['words'],is_split_into_words=True,truncation=True,padding=True),batched=True)
#去除列
trains_ds=Dataset.from_pandas(trains_df).remove_columns(["texts","text"])

#pandas直接处理
train_encoding=fast_tokenizer(list(train_df['summary']),truncation=True,padding=True,return_tensors='pt')
def collate_fn(examples):
  labels=[ex['labels'] for ex in examples]
  mask=[ex['attention_mask'] for ex in examples]
return examples
    
train_dataset=XFeiDataset(train_encoding,list(train_label))
val_dataset=XFeiDataset(val_encoding,list(val_label))

# 单个读取到批量读取
from torch.utils.data import Dataset,DataLoader,TensorDataset

train_loader=DataLoader(train_dataset,collate_fn=collate_fn,batch_size=batch_size,shuffle=True)
val_loader=DataLoader(val_dataset,collate_fn=collate_fn,batch_size=batch_size,shuffle=True)

你可能感兴趣的:(lstm,分类,batch)