Preprocessing data with pytorch and torchtext

pytorch

Matrix broadcasting

When you multiply a custom weight matrix with a batch of data X, where X.shape = [batch_size, dimx, dimy] and the custom matrix has W.shape = [input_dim, output_dim], you can call torch.matmul(X, W) as long as input_dim == dimy. PyTorch automatically broadcasts W over the batch dimension during the computation, and W.shape is left unchanged afterwards. You can also call torch.matmul(W, X); by the same reasoning this only requires output_dim == dimx.
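A minimal sketch of the first case (the shapes here are arbitrary):

import torch

X = torch.randn(64, 10, 300)   # [batch_size, dimx, dimy]
W = torch.randn(300, 128)      # [input_dim, output_dim], with input_dim == dimy

out = torch.matmul(X, W)       # W is broadcast across the batch dimension
print(out.shape)               # torch.Size([64, 10, 128])
print(W.shape)                 # torch.Size([300, 128]), W itself is unchanged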

Another option is torch.bmm, which is short for batch matrix multiply. For two 3-D tensors with the same batch size it produces the same result as torch.matmul, but note that, unlike matmul, bmm does not broadcast: both operands must be 3-D.

X = torch.randn(3, 2, 3) 
W = torch.randn(3, 3, 4)
result = torch.matmul(X, W) 
# result = torch.bmm(X, W)  # gives the same result as the line above

result.size()
# torch.Size([3, 2, 4])

torchtext

Data preprocessing pipeline

Load data: load the corpus from files in various formats

Tokenization: split each sentence into a list of its tokens

Build vocab: build the vocabulary for the current corpus

Numericalize: map each token to its index in the vocabulary

Embedding: build the word embedding matrix

torchtext.data.Field

A Field object defines how the data should be processed.

import torch
from torchtext import data  # on recent torchtext releases this API lives in torchtext.legacy.data
import spacy

# tokenizer = lambda x: x.split()  # if the corpus already consists of cleaned sentences, simple whitespace splitting is enough
spacy_en = spacy.load('en')

def tokenizer(text):
    return [toke.text for toke in spacy_en.tokenizer(text)]
# alternatively, pass tokenize='spacy' to Field; this is equivalent to the custom tokenizer above, but the spacy model must be linked first, which is not covered here
# REVIEW stores the user reviews; setting include_lengths=True makes it easy to use pack_padded_sequence later
REVIEW = data.Field(sequential=True, tokenize=tokenizer, include_lengths=True)
ASPECT = data.Field(sequential=False)
POLARITY = data.LabelField(sequential=False, dtype=torch.float)
# in the fields list, the first element of each tuple becomes an attribute of every batch sampled from the data later, holding the corresponding values; the second element is the matching Field
fields = [('review', REVIEW), ('aspect', ASPECT), ('polarity', POLARITY)]
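As a quick sanity check, Field.preprocess applies the tokenizer (plus any preprocessing pipeline) to a raw string; the sentence below is made up:

print(REVIEW.preprocess('The pizza was great but the service was slow.'))
# ['The', 'pizza', 'was', 'great', 'but', 'the', 'service', 'was', 'slow', '.']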

torchtext.data.TabularDataset

Loading csv/tsv files

The columns of the csv file must match the order of the entries in the fields list: the first column holds review, the second aspect, and the third polarity. If you need to skip the aspect column, write fields = [('review', REVIEW), (None, None), ('polarity', POLARITY)] instead.

train_data, valid_data, test_data = data.TabularDataset.splits(
                                        path = 'data',
                                        train = 'train.csv',
                                        validation = 'valid.csv',
                                        test = 'test.csv',
                                        format = 'csv',
                                        fields = fields,
                                        skip_header = False # whether to skip the first (header) line of the file
)
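Each loaded Example carries the attributes named in fields; a quick way to eyeball the result is vars() on a single example (the output below is only illustrative, it depends on your own csv):

print(vars(train_data.examples[0]))
# e.g. {'review': ['The', 'pizza', 'was', 'great', ...], 'aspect': 'food', 'polarity': 'positive'}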

Build the vocabulary from the corpus

REVIEW.build_vocab(train_data,
                   max_size=400000, # maximum vocabulary size
                   vectors='glove.840B.300d', # torchtext downloads these automatically into a .vector_cache folder in the current directory
                   unk_init=torch.Tensor.normal_ # initialization for words that have no pretrained vector; the default is a zero tensor
                   )
ASPECT.build_vocab(train_data)
POLARITY.build_vocab(train_data)

# print the 20 most frequent tokens
print(REVIEW.vocab.freqs.most_common(20))
# [('the', 202475), (',', 192116), ('.', 165085), ('a', 109230), ('and', 109174), ('of', 101087), ('to', 93504), ('is', 76396), ('in', 61292), ('I', 54006), ('it', 53321), ('that', 48902), ('"', 44039), ("'s", 43236), ('this', 42363), ('-', 37002), ...]

# print tokens in vocabulary-index order
print(REVIEW.vocab.itos[:10])
# ['<unk>', '<pad>', 'the', ',', '.', 'a', 'and', 'of', 'to', 'is']

# print the token-to-index dictionary
print(REVIEW.vocab.stoi)
# defaultdict(..., {'<unk>': 0, '<pad>': 1, 'the': 2, ...})
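Because vectors='glove.840B.300d' was passed to build_vocab, the pretrained vectors are already aligned row-by-row with the vocabulary:

print(REVIEW.vocab.vectors.shape)
# torch.Size([len(REVIEW.vocab), 300]), i.e. one 300-d GloVe vector per vocabulary entry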

torchtext.data.BucketIterator

By default, the training set is shuffled at every epoch, while the validation and test sets are kept in sorted order. torchtext does not know how to sort them on its own, however, so we must specify a sorting rule; otherwise it raises an error.

sort_key is not only the key used to sort (in descending order) within each batch; it is also how BucketIterator knows to put examples of similar length into the same batch when sampling, which minimizes the amount of padding needed per batch and improves computational efficiency.

train_iterator, valid_iterator, test_iterator = data.BucketIterator.splits(
        (train_data, valid_data, test_data),
        batch_size=BATCH_SIZE,
        sort_within_batch=True, # sort the examples inside each batch by sort_key in descending order, as pack_padded_sequence expects
        sort_key = lambda x: len(x.review), # note: the lambda argument x is an Example object from train_data
        device=device) # device can be a torch.device object

for batch in train_iterator:
    # because REVIEW was built with include_lengths=True, batch.review is a (data, lengths) tuple
    review, review_lengths = batch.review
    aspect = batch.aspect
    polarity = batch.polarity

    # reviews of similar length are sampled into the same batch, as the lengths show
    print(review_lengths)
    # tensor([181, 181, 181, 181, 181, 181, 181, 181, 181, 181, 181, 181, 180, 180,
    #         180, 180, 180, 180, 180, 180, 180, 180, 180, 180, 180, 180, 180, 180,
    #         180, 180, 180, 180, 180, 180, 180, 180, 180, 180, 180, 180, 180, 180,
    #         180, 180, 179, 179, 179, 179, 179, 179, 179, 179, 179, 179, 179, 179,
    #         179, 179, 179, 179, 179, 179, 179, 179])

Build a simple model

The example below is adapted from an English-language tutorial for illustration.

import torch
import torch.nn as nn

class RNN(nn.Module):
    def __init__(self, vocab_size, embedding_dim, hidden_dim, output_dim, n_layers, 
                 bidirectional, dropout, pad_idx):
        
        super().__init__()
        
        self.embedding = nn.Embedding(vocab_size, embedding_dim, padding_idx = pad_idx)
        
        self.rnn = nn.LSTM(embedding_dim, 
                           hidden_dim, 
                           num_layers=n_layers, 
                           bidirectional=bidirectional, 
                           dropout=dropout)
        
        self.fc = nn.Linear(hidden_dim * 2, output_dim)
        
        self.dropout = nn.Dropout(dropout)
        
    def forward(self, text, text_lengths):
        
        #text = [sent len, batch size]
        
        embedded = self.dropout(self.embedding(text))
        
        #embedded = [sent len, batch size, emb dim]
        
        #pack sequence
        packed_embedded = nn.utils.rnn.pack_padded_sequence(embedded, text_lengths)
        
        packed_output, (hidden, cell) = self.rnn(packed_embedded)
        
        #unpack sequence
        output, output_lengths = nn.utils.rnn.pad_packed_sequence(packed_output)

        #output = [sent len, batch size, hid dim * num directions]
        #output over padding tokens are zero tensors
        
        #hidden = [num layers * num directions, batch size, hid dim]
        #cell = [num layers * num directions, batch size, hid dim]
        
        #concat the final forward (hidden[-2,:,:]) and backward (hidden[-1,:,:]) hidden layers
        #and apply dropout
        
        hidden = self.dropout(torch.cat((hidden[-2,:,:], hidden[-1,:,:]), dim = 1))
                
        #hidden = [batch size, hid dim * num directions]
            
        #prediction = [batch size, output dim]
        return self.fc(hidden)
      
INPUT_DIM = len(REVIEW.vocab)
EMBEDDING_DIM = 300
HIDDEN_DIM = 256
OUTPUT_DIM = 1
N_LAYERS = 2
BIDIRECTIONAL = True
DROPOUT = 0.5
PAD_IDX = REVIEW.vocab.stoi[REVIEW.pad_token]

model = RNN(INPUT_DIM, 
            EMBEDDING_DIM, 
            HIDDEN_DIM, 
            OUTPUT_DIM, 
            N_LAYERS, 
            BIDIRECTIONAL, 
            DROPOUT, 
            PAD_IDX)
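The pack/unpack pair used inside forward can also be tried in isolation on a tiny hand-made batch (a minimal sketch with made-up shapes; note that the sequences must already be sorted by length in descending order, which is exactly what sort_within_batch=True guarantees):

import torch
import torch.nn as nn

# padded batch of 3 sequences with lengths 4, 3, 1: [seq len, batch size, feature dim]
padded = torch.randn(4, 3, 5)
lengths = torch.tensor([4, 3, 1])

packed = nn.utils.rnn.pack_padded_sequence(padded, lengths)
toy_rnn = nn.LSTM(5, 8)
packed_out, (h, c) = toy_rnn(packed)
out, out_lengths = nn.utils.rnn.pad_packed_sequence(packed_out)

print(out.shape)     # torch.Size([4, 3, 8])
print(out_lengths)   # tensor([4, 3, 1])
print(out[3, 1])     # all zeros: output positions past a sequence's true length are zero tensors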

Load the pretrained embedding weights

pretrained_embeddings = REVIEW.vocab.vectors
model.embedding.weight.data.copy_(pretrained_embeddings)

# zero out the unk and pad embeddings
UNK_IDX = REVIEW.vocab.stoi[REVIEW.unk_token]
model.embedding.weight.data[UNK_IDX] = torch.zeros(EMBEDDING_DIM)
model.embedding.weight.data[PAD_IDX] = torch.zeros(EMBEDDING_DIM)
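Putting the pieces together, a single forward pass on one batch might look like this (a sketch only; depending on your PyTorch version, pack_padded_sequence may require the lengths tensor to live on the CPU, in which case call text_lengths.cpu() inside forward). With OUTPUT_DIM = 1, a typical pairing for binary polarity labels is BCEWithLogitsLoss:

model = model.to(device)

batch = next(iter(train_iterator))
review, review_lengths = batch.review

predictions = model(review, review_lengths).squeeze(1)  # [batch size, 1] -> [batch size]
print(predictions.shape)  # torch.Size([BATCH_SIZE])

# assuming polarity is binary (0/1), as LabelField with dtype=torch.float suggests
criterion = nn.BCEWithLogitsLoss().to(device)
loss = criterion(predictions, batch.polarity)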
