pytorch入门(3)pytorch-seq2seq模型

pytorch入门(3)pytorch-seq2seq模型 
https://github.com/IBM/pytorch-seq2seq/ 此模型不使用预训练的embedding(embedding层在EncoderRNN中随模型一起训练),toy数据集中序列的最大长度为10

Get Started

Prepare toy dataset

# Run script to generate the reverse toy dataset
# The generated data is stored in data/toy_reverse by default
scripts/toy.sh
查看得知:toy.sh 中内容为 python generation.py

①generation.py
from __future__ import print_function
import argparse
import os
import shutil
import random

# Command-line options of the toy-data generator.
parser = argparse.ArgumentParser()
parser.add_argument('--dir', help="data directory", default="../data")
# type=int: without it a value given on the command line stays a string,
# and random.randint(1, args.max_len) would then fail.
parser.add_argument('--max-len', help="max sequence length", default=10, type=int)
args = parser.parse_args()  # parse the command-line arguments

def generate_dataset(root, name, size, max_len=None):
    """Write one split of the toy "reverse the sequence" dataset.

    Creates the directory ``root/name`` (if missing) and writes three files
    into it:

    * ``data.txt``     -- ``size`` lines, each ``<source>\\t<target>`` where the
      source is a space-separated sequence of random digits and the target is
      the same sequence reversed, e.g. ``1 5 2\\t2 5 1``.
    * ``vocab.source`` -- the digits 0..9, one per line.
    * ``vocab.target`` -- a copy of ``vocab.source``.

    Args:
        root: directory that holds the splits, e.g. ``../data/toy_reverse``.
        name: name of the split, e.g. ``'train'``, ``'dev'`` or ``'test'``.
        size: number of examples (lines) to generate.
        max_len: maximum sequence length. When omitted, the module-level
            ``args.max_len`` (the ``--max-len`` option) is used. The value is
            converted with ``int()`` because an argparse option declared
            without ``type=int`` arrives as a string.

    Raises:
        ValueError: if ``max_len`` is not an integer value or is < 1
            (raised by ``int()`` / ``random.randint``).
    """
    if max_len is None:
        max_len = args.max_len
    max_len = int(max_len)

    path = os.path.join(root, name)  # e.g. ../data/toy_reverse/train
    if not os.path.exists(path):
        os.mkdir(path)

    # generate data file
    data_path = os.path.join(path, 'data.txt')
    with open(data_path, 'w') as fout:
        for _ in range(size):
            length = random.randint(1, max_len)  # inclusive on both ends
            seq = [str(random.randint(0, 9)) for _ in range(length)]
            # reversed() returns a reverse iterator over seq
            fout.write("\t".join([" ".join(seq), " ".join(reversed(seq))]))
            fout.write('\n')

    # generate vocabulary
    src_vocab = os.path.join(path, 'vocab.source')
    with open(src_vocab, 'w') as fout:
        # file content: the digits 0 through 9, one per line
        fout.write("\n".join([str(i) for i in range(10)]))
    tgt_vocab = os.path.join(path, 'vocab.target')
    # source and target share the same vocabulary, so just copy the file
    shutil.copy(src_vocab, tgt_vocab)

if __name__ == '__main__':
    # Make sure <dir> and <dir>/toy_reverse exist (default <dir> is ../data).
    toy_dir = os.path.join(args.dir, 'toy_reverse')
    for directory in (args.dir, toy_dir):
        if not os.path.exists(directory):
            os.mkdir(directory)

    # Training, development and test splits, generated in that order; every
    # line of each data file holds a sequence and its reverse.
    for split_name, n_examples in (('train', 10000), ('dev', 1000), ('test', 1000)):
        generate_dataset(toy_dir, split_name, n_examples)

Train and play

TRAIN_PATH=data/toy_reverse/train/data.txt
DEV_PATH=data/toy_reverse/dev/data.txt
# Start training
python examples/sample.py --train_path $TRAIN_PATH --dev_path $DEV_PATH

运行时【from torch.optim.lr_scheduler import StepLR】这一行报错:No module named lr_scheduler

改为【from torch.optim import lr_scheduler】依然报错:cannot import name lr_scheduler

改为【from torch import optim】则可以成功运行,运行过程如下:

2017-11-05 18:02:38,303 root         INFO     Namespace(dev_path='data/toy_reverse/dev/data.txt', expt_dir='./experiment', load_checkpoint=None, log_level='info', resume=False, train_path='data/toy_reverse/train/data.txt')
2017-11-05 18:02:46,009 seq2seq.trainer.supervised_trainer INFO     Optimizer: , Scheduler: None
2017-11-05 18:02:49,285 seq2seq.trainer.supervised_trainer INFO     Progress: 1%, Train Perplexity: 13.2855 ······

2017-11-05 18:03:01,961 seq2seq.trainer.supervised_trainer INFO     Finished epoch 1: Train Perplexity: 7.0578, Dev Perplexity: 156.0297, Accuracy: 0.4440
2017-11-05 18:03:02,061 seq2seq.trainer.supervised_trainer INFO     Progress: 17%, Train Perplexity: 771.4534  ·。。。 

2017-11-05 18:03:12,004 seq2seq.trainer.supervised_trainer INFO     Progress: 33%, Train Perplexity: 1.0049
2017-11-05 18:03:12,811 seq2seq.trainer.supervised_trainer INFO     Finished epoch 2: Train Perplexity: 30.4321, Dev Perplexity: 40.3084, Accuracy: 0.5819
2017-11-05 18:03:12,867 seq2seq.trainer.supervised_trainer INFO     Progress: 33%, Train Perplexity: 81.9282
2017-11-05 18:03:12,980 seq2seq.trainer.supervised_trainer INFO     Progress: 34%, Train Perplexity: 5.6954···。。。

2017-11-05 18:03:23,701 seq2seq.trainer.supervised_trainer INFO     Progress: 49%, Train Perplexity: 1.0036
2017-11-05 18:03:24,764 seq2seq.trainer.supervised_trainer INFO     Finished epoch 3: Train Perplexity: 5.5954, Dev Perplexity: 1.0226, Accuracy: 0.9917
2017-11-05 18:03:24,781 seq2seq.trainer.supervised_trainer INFO     Progress: 50%, Train Perplexity: 1.1207
2017-11-05 18:03:24,908 seq2seq.trainer.supervised_trainer INFO     Progress: 50%, Train Perplexity: 1.1238···。。。 

2017-11-05 18:04:05,345 seq2seq.trainer.supervised_trainer INFO     Progress: 99%, Train Perplexity: 1.0001
2017-11-05 18:04:05,830 seq2seq.trainer.supervised_trainer INFO     Progress: 99%, Train Perplexity: 1.0001
2017-11-05 18:04:07,324 seq2seq.trainer.supervised_trainer INFO     Finished epoch 6: Train Perplexity: 1.0521, Dev Perplexity: 339.5657, Accuracy: 0.5387

It will take about 3 minutes to train on CPU and less than 1 minute with a Tesla K80. Once training is complete, you will be prompted to enter a new sequence to translate and the model will print out its prediction (use ctrl-C to terminate). Try the example below!

Input:  1 3 5 7 9
Expected output: 9 7 5 3 1 EOS

②examples/sample.py

import os
import argparse
import logging
import torch
from torch.optim.lr_scheduler import StepLR  # learning-rate scheduler, lives in torch.optim.lr_scheduler
import torchtext
import seq2seq  # the seq2seq package (directory)
# seq2seq/trainer/__init__.py contains `from .supervised_trainer import SupervisedTrainer`
from seq2seq.trainer import SupervisedTrainer
from seq2seq.models import EncoderRNN, DecoderRNN, Seq2seq  # three .py files
# Perplexity is computed from the loss; see http://blog.csdn.net/jiaqiang_ruan/article/details/77989459
from seq2seq.loss import Perplexity
from seq2seq.optim import Optimizer
from seq2seq.dataset import SourceField, TargetField
from seq2seq.evaluator import Predictor
from seq2seq.util.checkpoint import Checkpoint

# Python 2/3 compatibility shim: make the name raw_input usable on both.
try:
    raw_input          # Python 2: built-in that reads a line from the console; merely referencing it raises NameError when it does not exist
except NameError:
    raw_input = input  # Python 3

# Sample usage:
#     # training
#     python examples/sample.py --train_path $TRAIN_PATH --dev_path $DEV_PATH --expt_dir $EXPT_PATH
#     # resuming from the latest checkpoint of the experiment
#      python examples/sample.py --train_path $TRAIN_PATH --dev_path $DEV_PATH --expt_dir $EXPT_PATH --resume
#     # resuming from a specific checkpoint
#      python examples/sample.py --train_path $TRAIN_PATH --dev_path $DEV_PATH --expt_dir $EXPT_PATH --load_checkpoint $CHECKPOINT_DIR

# Command-line interface of the example script.
parser = argparse.ArgumentParser()
# --train_path / --dev_path have no default, so they are expected on the
# command line (e.g. $TRAIN_PATH and $DEV_PATH).
parser.add_argument('--train_path', action='store', dest='train_path',
                    help='Path to train data')
parser.add_argument('--dev_path', action='store', dest='dev_path',
                    help='Path to dev data')  # path to the development data
parser.add_argument('--expt_dir', action='store', dest='expt_dir', default='./experiment',
                    help='Path to experiment directory. If load_checkpoint is True, then path to checkpoint directory has to be provided')  # experiment directory
parser.add_argument('--load_checkpoint', action='store', dest='load_checkpoint',
                    help='The name of the checkpoint to load, usually an encoded time string')  # name of the checkpoint to load
parser.add_argument('--resume', action='store_true', dest='resume', default=False,
                    help='Indicates if training has to be resumed from the latest checkpoint')  # resume training from the latest checkpoint
parser.add_argument('--log-level', dest='log_level', default='info',
                    help='Logging level.')
# Create the parser, register the options, then parse: parse_args() returns
# the values taken from the command line.
opt = parser.parse_args()

LOG_FORMAT = '%(asctime)s %(name)-12s %(levelname)-8s %(message)s'
# opt.log_level is upper-cased and looked up as an attribute of the logging
# module, e.g. 'info' -> logging.INFO.
logging.basicConfig(format=LOG_FORMAT, level=getattr(logging, opt.log_level.upper()))
logging.info(opt)


if opt.load_checkpoint is not None:
    # A checkpoint name was given: restore the model and both vocabularies
    # from it; no training happens on this branch.
    checkpoint_path = os.path.join(opt.expt_dir, Checkpoint.CHECKPOINT_DIR_NAME, opt.load_checkpoint)
    logging.info("loading checkpoint from {}".format(checkpoint_path))
    checkpoint = Checkpoint.load(checkpoint_path)
    # NOTE: from here on the name `seq2seq` refers to the model, not to the
    # imported seq2seq package.
    seq2seq = checkpoint.model
    input_vocab = checkpoint.input_vocab
    output_vocab = checkpoint.output_vocab
else:
    # Prepare dataset
    # SourceField / TargetField come from seq2seq/dataset/field.py. Each line
    # of the data file holds a source and a target separated by a tab.
    src = SourceField()
    tgt = TargetField()
    max_len = 50

    def len_filter(example):
        """Return True when both source and target have at most max_len tokens."""
        return len(example.src) <= max_len and len(example.tgt) <= max_len

    # Build a tabular dataset from the TSV file at the given path.
    # https://github.com/pytorch/text/blob/master/torchtext/data/dataset.py
    train = torchtext.data.TabularDataset(
        path=opt.train_path, format='tsv',
        fields=[('src', src), ('tgt', tgt)],  # column 1 -> src field, column 2 -> tgt field
        filter_pred=len_filter  # predicate: only examples it accepts are kept
    )
    dev = torchtext.data.TabularDataset(
        path=opt.dev_path, format='tsv',
        fields=[('src', src), ('tgt', tgt)],
        filter_pred=len_filter
    )
    # Build the vocabularies from the training set only.
    # https://github.com/pytorch/text/blob/master/torchtext/data/field.py
    src.build_vocab(train, max_size=50000)
    tgt.build_vocab(train, max_size=50000)  # max_size is passed on to the vocab constructor
    input_vocab = src.vocab
    output_vocab = tgt.vocab

    # NOTE: If the source field name and the target field name
    # are different from 'src' and 'tgt' respectively, they have
    # to be set explicitly before any training or inference
    # seq2seq.src_field_name = 'src'
    # seq2seq.tgt_field_name = 'tgt'

    # Prepare loss
    weight = torch.ones(len(tgt.vocab))  # 1-D tensor of ones, one entry per target-vocab item
    pad = tgt.vocab.stoi[tgt.pad_token]  # vocabulary index of the padding token
    loss = Perplexity(weight, pad)  # seq2seq.loss.Perplexity
    if torch.cuda.is_available():
        # move the loss to the GPU when CUDA is available
        loss.cuda()

    seq2seq = None
    optimizer = None
    if not opt.resume:
        # Not resuming from a checkpoint: initialize a fresh model.
        hidden_size = 128
        bidirectional = True
        encoder = EncoderRNN(len(src.vocab), max_len, hidden_size,
                             bidirectional=bidirectional, variable_lengths=True)
        # A bidirectional encoder yields 2 * hidden_size features, so the
        # decoder is sized to match; otherwise it uses hidden_size itself
        # (the listing had `else 1`, i.e. a decoder with hidden size 1).
        decoder = DecoderRNN(len(tgt.vocab), max_len, hidden_size * 2 if bidirectional else hidden_size,
                             dropout_p=0.2, use_attention=True, bidirectional=bidirectional,
                             eos_id=tgt.eos_id, sos_id=tgt.sos_id)
        seq2seq = Seq2seq(encoder, decoder)
        if torch.cuda.is_available():
            seq2seq.cuda()

        # Initialize every parameter uniformly in [-0.08, 0.08].
        for param in seq2seq.parameters():
            param.data.uniform_(-0.08, 0.08)

        # Optimizer and learning rate scheduler can be customized by
        # explicitly constructing the objects and passing them to the trainer.
        #
        # optimizer = Optimizer(torch.optim.Adam(seq2seq.parameters()), max_grad_norm=5)
        # scheduler = StepLR(optimizer.optimizer, 1)
        # optimizer.set_scheduler(scheduler)

    # train
    t = SupervisedTrainer(loss=loss, batch_size=32,
                          checkpoint_every=50,
                          print_every=10, expt_dir=opt.expt_dir)

    seq2seq = t.train(seq2seq, train,
                      num_epochs=6, dev_data=dev,
                      optimizer=optimizer,
                      teacher_forcing_ratio=0.5,
                      resume=opt.resume)


predictor = Predictor(seq2seq, input_vocab, output_vocab)

# Interactive loop that runs after training (or checkpoint loading);
# leave it with ctrl-C.
while True:
    # strip() drops surrounding whitespace, split() tokenizes on whitespace
    tokens = raw_input("Type in a source sequence:").strip().split()
    print(predictor.predict(tokens))

seq2seq/dataset文件夹下的field.py

import logging
import torchtext
class SourceField(torchtext.data.Field):
    """Wrapper class of torchtext.data.Field that forces batch_first and include_lengths to be True."""

    def __init__(self, **kwargs):
        """Force ``batch_first`` and ``include_lengths`` to True, then defer to Field.

        Args:
            **kwargs: keyword arguments forwarded to ``torchtext.data.Field``.
                An explicit ``batch_first=False`` or ``include_lengths=False``
                is overridden, with a warning.
        """
        logger = logging.getLogger(__name__)

        # Warn only when the caller explicitly asked for False; a missing key
        # is set to True silently.
        if kwargs.get('batch_first') is False:
            logger.warning("Option batch_first has to be set to use pytorch-seq2seq.  Changed to True.")
        kwargs['batch_first'] = True
        if kwargs.get('include_lengths') is False:
            logger.warning("Option include_lengths has to be set to use pytorch-seq2seq.  Changed to True.")
        kwargs['include_lengths'] = True

        super(SourceField, self).__init__(**kwargs)

class TargetField(torchtext.data.Field):
    """Wrapper class of torchtext.data.Field that forces batch_first to be True
    and prepends <sos> / appends <eos> to sequences in the preprocessing step.

    Attributes:
        sos_id: index of the start of sentence symbol (None until build_vocab runs)
        eos_id: index of the end of sentence symbol (None until build_vocab runs)
    """

    # The two markers must be distinct, non-empty tokens: with empty strings
    # sos_id and eos_id would resolve to the same vocabulary entry.
    SYM_SOS = '<sos>'
    SYM_EOS = '<eos>'

    def __init__(self, **kwargs):
        """Force ``batch_first`` and wrap ``preprocessing`` with the sos/eos markers.

        Args:
            **kwargs: keyword arguments forwarded to ``torchtext.data.Field``.
                A caller-supplied ``preprocessing`` callable is still applied;
                the markers are added around its result.
        """
        logger = logging.getLogger(__name__)

        if kwargs.get('batch_first') is False:
            logger.warning("Option batch_first has to be set to use pytorch-seq2seq.  Changed to True.")
        kwargs['batch_first'] = True

        if kwargs.get('preprocessing') is None:
            kwargs['preprocessing'] = lambda seq: [self.SYM_SOS] + seq + [self.SYM_EOS]
        else:
            func = kwargs['preprocessing']
            kwargs['preprocessing'] = lambda seq: [self.SYM_SOS] + func(seq) + [self.SYM_EOS]

        self.sos_id = None
        self.eos_id = None
        super(TargetField, self).__init__(**kwargs)

    def build_vocab(self, *args, **kwargs):
        """Build the vocabulary via Field.build_vocab, then record sos_id / eos_id."""
        super(TargetField, self).build_vocab(*args, **kwargs)
        # stoi maps a token string to its integer index
        self.sos_id = self.vocab.stoi[self.SYM_SOS]
        self.eos_id = self.vocab.stoi[self.SYM_EOS]

 

#seq2seq.loss // Perplexity ⑤loss = Perplexity(weight, pad)

 

⑥ seq2seq.models // EncoderRNN, DecoderRNN, Seq2seq # 三个py文件

class EncoderRNN(BaseRNN):
    r"""
    Applies a multi-layer RNN to an input sequence.

    Args:
        vocab_size (int): size of the vocabulary
        max_len (int): a maximum allowed length for the sequence to be processed
            (examples/sample.py passes 50)
        hidden_size (int): the number of features in the hidden state `h`
            (examples/sample.py passes 128)
        input_dropout_p (float, optional): dropout probability for the input sequence (default: 0)
        dropout_p (float, optional): dropout probability for the output sequence (default: 0)
        n_layers (int, optional): number of recurrent layers (default: 1)
        bidirectional (bool, optional): if True, becomes a bidirectional encoder (default: False)
        rnn_cell (str, optional): type of RNN cell (default: gru)
        variable_lengths (bool, optional): if use variable length RNN (default: False)

    Inputs: inputs, input_lengths
        - **inputs**: list of sequences, whose length is the batch size and within which each sequence is a list of token IDs.
        - **input_lengths** (list of int, optional): list that contains the lengths of sequences
            in the mini-batch, it must be provided when using variable length RNN (default: `None`)

    Outputs: output, hidden
        - **output** (batch, seq_len, hidden_size): tensor containing the encoded features of the input sequence
        - **hidden** (num_layers * num_directions, batch, hidden_size): tensor containing the features in the hidden state `h`

    Examples::

         >>> encoder = EncoderRNN(input_vocab, max_seq_length, hidden_size)
         >>> output, hidden = encoder(input)

    """

    def __init__(self, vocab_size, max_len, hidden_size, input_dropout_p=0, dropout_p=0,
                 n_layers=1, bidirectional=False, rnn_cell='gru', variable_lengths=False):
        super(EncoderRNN, self).__init__(vocab_size, max_len, hidden_size,
                                         input_dropout_p, dropout_p, n_layers, rnn_cell)

        self.variable_lengths = variable_lengths
        # Embedding table of shape (vocab_size, hidden_size): the embedding
        # size equals the RNN hidden size.
        self.embedding = nn.Embedding(vocab_size, hidden_size)
        # self.rnn_cell is provided by BaseRNN (not visible here) --
        # presumably the RNN class selected by the `rnn_cell` string.
        self.rnn = self.rnn_cell(hidden_size, hidden_size, n_layers,
                                 batch_first=True, bidirectional=bidirectional, dropout=dropout_p)

    def forward(self, input_var, input_lengths=None):
        """
        Applies a multi-layer RNN to an input sequence.

        Args:
            input_var (batch, seq_len): tensor containing the features of the input sequence.
            input_lengths (list of int, optional): A list that contains the lengths of sequences
              in the mini-batch

        Returns: output, hidden
            - **output** (batch, seq_len, hidden_size): variable containing the encoded features of the input sequence
            - **hidden** (num_layers * num_directions, batch, hidden_size): variable containing the features in the hidden state h
        """
        # token indices in, embedding vectors out
        embedded = self.embedding(input_var)
        # self.input_dropout is provided by BaseRNN (not visible here) --
        # presumably dropout with probability input_dropout_p.
        embedded = self.input_dropout(embedded)
        if self.variable_lengths:
            # pack the padded batch so the RNN handles each sequence at its own length
            embedded = nn.utils.rnn.pack_padded_sequence(embedded, input_lengths, batch_first=True)
        output, hidden = self.rnn(embedded)
        if self.variable_lengths:
            # unpack back to a padded (batch-first) tensor
            output, _ = nn.utils.rnn.pad_packed_sequence(output, batch_first=True)
        return output, hidden



相似的seq2seq模型讲解(使用的是tensorflow而不是pytorch,但原理相同),讲解得很清晰:http://www.sohu.com/a/155746637_505915

其中的超参数设置:

Number of Epochs:epochs = 60# #训练数据一共过几遍

Batch Size:batch_size = 128# #每次处理多少样本(使用pad进行补全,一个batch内的样本要具有相同的序列长度)

RNN Size:rnn_size = 50# #RNN单元的隐层结点数量

Number of Layers:num_layers = 2# #堆叠的RNN单元数量

Embedding Size:encoding_embedding_size = 15,decoding_embedding_size = 15# #embedding的大小

Learning Rate:learning_rate = 0.001





train loss 和 test loss上升和下降结果分析:http://blog.csdn.net/smf0504/article/details/71698354

train loss:模型在训练集上的预测结果与真实结果之间的误差



你可能感兴趣的:(机器学习,自然语言处理)