#导入包
import os
import torch
from torch import nn
from torch.autograd import Variable
import pickle
from torch.nn.utils import weight_norm
import argparse
import time
import math
import torch.optim as optim
#数据读入和预处理
def data_generator(data_path):
    """Build the corpus stored under ``data_path`` and cache it on disk.

    Args:
        data_path: Directory handed to ``Corpus``. The pickled corpus is
            written to ``data_path + "/corpus"``.

    Returns:
        The newly built ``Corpus`` instance.
    """
    corpus = Corpus(data_path)  # tokenizes the train / valid / test splits
    # pickle.dump(obj, file) needs a writable file object. The context
    # manager makes sure the handle is flushed and closed even when pickling
    # fails; the bare open() used before was never closed.
    with open(data_path + "/corpus", "wb") as cache_file:
        pickle.dump(corpus, cache_file)
    return corpus
#将获得单词赋予索引,将word->index,可以理解为生成索引字典
class Dictionary(object):
    """Two-way vocabulary mapping words to integer ids and back."""

    def __init__(self):
        self.word2idx = {}  # word -> id
        self.idx2word = []  # id -> word (list position is the id)

    def add_word(self, word):
        """Return the id of ``word``, registering it first when it is new."""
        try:
            return self.word2idx[word]
        except KeyError:
            # Unknown word: it receives the next free id, i.e. the position
            # it is about to occupy in ``idx2word``.
            self.idx2word.append(word)
            new_id = len(self.idx2word) - 1
            self.word2idx[word] = new_id
            return new_id

    def __len__(self):
        """Number of distinct words registered so far."""
        return len(self.idx2word)
class Corpus(object):
    """Token-id tensors for the train / valid / test splits of a dataset.

    The three splits share one ``Dictionary``, so an id refers to the same
    word in every split.
    """

    def __init__(self, path):
        """Tokenize ``train.txt``, ``valid.txt`` and ``test.txt`` under ``path``."""
        self.dictionary = Dictionary()
        self.train = self.tokenize(os.path.join(path, "train.txt"))
        self.valid = self.tokenize(os.path.join(path, "valid.txt"))
        self.test = self.tokenize(os.path.join(path, "test.txt"))

    def tokenize(self, path):
        """Convert a whitespace-tokenized text file into a 1-D tensor of ids.

        Every line is split on whitespace and an empty-string token ``""``
        is appended to it as the end-of-line marker. Words not yet known to
        ``self.dictionary`` are registered on the fly.

        Args:
            path: Path of the text file to read.

        Returns:
            A 1-D ``torch.long`` tensor holding one id per token, in file
            order (empty when the file is empty).

        Raises:
            FileNotFoundError: If ``path`` does not exist (raised by
                ``open``). The previous ``assert`` disappeared under
                ``python -O`` and is therefore no longer used.
        """
        ids = []
        # A single pass is enough: add_word() returns the id of the word
        # whether it was already known or has just been registered.
        with open(path, "r") as f:
            for line in f:
                for word in line.split() + [""]:
                    ids.append(self.dictionary.add_word(word))
        # Building the tensor from the finished list avoids allocating an
        # uninitialised LongTensor and writing into it element by element.
        return torch.tensor(ids, dtype=torch.long)
def batchify(data, batch_size, cuda):
    """Fold a token stream into ``batch_size`` rows of equal length.

    Trailing elements that do not fill a complete column are dropped.

    Args:
        data: Tensor of token ids, trimmed along dimension 0.
        batch_size: Number of rows of the result.
        cuda: When truthy, the result is moved to the GPU.

    Returns:
        ``data`` trimmed to a multiple of ``batch_size`` and viewed as
        ``(batch_size, -1)``.
    """
    row_len = data.size(0) // batch_size  # columns per row
    trimmed = data[:row_len * batch_size]
    rows = trimmed.view(batch_size, -1)
    return rows.cuda() if cuda else rows
def get_batch(source, i, seq_len, seq_le=None, evaluation=False):
    """Slice an input window and its target window shifted by one step.

    Args:
        source: 2-D tensor; windows are cut along dimension 1.
        i: Index of the first column of the input window.
        seq_len: Default window length.
        seq_le: Optional window length overriding ``seq_len``; a falsy
            value (``None`` or ``0``) means "use ``seq_len``".
        evaluation: Accepted for backward compatibility only and ignored.
            It used to be forwarded as ``Variable(..., volatile=...)``,
            which PyTorch removed; gradient tracking has to be disabled by
            the caller with ``torch.no_grad()``.

    Returns:
        ``(data, target)`` where ``target`` is ``data`` shifted one column
        to the right. The window is shortened near the end of ``source`` so
        that the target never runs past the last column.
    """
    window = min(seq_le if seq_le else seq_len, source.size(1) - 1 - i)
    # Plain tensors are returned: the Variable wrapper is a no-op in current
    # PyTorch and its `volatile` flag only triggers a UserWarning.
    data = source[:, i:i + window]
    target = source[:, i + 1:i + 1 + window]
    return data, target
# ---- Hyper-parameters and run configuration ----
cuda = True  # whether to run on the GPU
data_path = "./data/penn"  # directory containing train.txt / valid.txt / test.txt
batch_size = 16  # number of rows in the batchified training data
nhid = 600  # channels of every temporal block except the last one
levels = 4  # number of temporal blocks (length of num_chans)
emsize = 600  # embedding size, also the channel count of the last block
k_size = 3  # convolution kernel size
dropout = 0.45  # dropout rate inside the temporal blocks
emb_dropout = 0.25  # dropout rate applied to the embeddings
tied = True  # share the weights of the encoder and the decoder
lr = 4  # initial learning rate
optimization = "SGD"  # name of the torch.optim optimizer class to use
validseqlen = 40  # trailing positions of each window that are scored; also the window stride
seq_len = 80  # total length of each window
log_interval = 100  # number of batches between two training log lines
clip = 0.35  # gradient-norm clipping threshold; a value <= 0 disables clipping
epochs = 10  # number of training epochs
torch.manual_seed(11)
if torch.cuda.is_available():
    if not cuda:
        print("WARNING:you should probably run with --cuda")
elif cuda:
    # Without this fallback the first .cuda() call below raises on machines
    # that have no usable GPU.
    print("WARNING: CUDA is not available, falling back to CPU")
    cuda = False
corpus = data_generator(data_path)  # build (and cache) the corpus
eval_batch_size = 10
train_data = batchify(corpus.train, batch_size, cuda)
print("train_data:", train_data.size())
val_data = batchify(corpus.valid, eval_batch_size, cuda)
print("val_data:", val_data.size())
test_data = batchify(corpus.test, eval_batch_size, cuda)
print("test_data:", test_data.size())
n_words = len(corpus.dictionary)  # vocabulary size
print("n_words:", n_words)
# Channel count of each temporal block; the last one equals emsize.
num_chans = [nhid] * (levels - 1) + [emsize]
print("num_chans", num_chans)
#定义实现因果卷积的类
class Chomp1d(nn.Module):
    """Drop the last ``chomp_size`` entries along dimension 2 of a 3-D tensor.

    It is placed after a padded convolution to cut off the extra trailing
    outputs produced by the padding.
    """

    def __init__(self, chomp_size):
        """Store the number of trailing entries to remove."""
        super(Chomp1d, self).__init__()
        self.chomp_size = chomp_size

    def forward(self, x):
        """Return ``x`` without its last ``chomp_size`` entries (contiguous)."""
        if self.chomp_size == 0:
            # x[:, :, :-0] is x[:, :, :0], i.e. an EMPTY tensor, so the
            # "nothing to remove" case must bypass the slice.
            return x.contiguous()
        return x[:, :, :-self.chomp_size].contiguous()
#残差网络
class TemporalBlock(nn.Module):
    """Residual block made of two weight-normalised dilated 1-D convolutions.

    Each convolution is followed by ``Chomp1d(padding)``, ReLU and dropout.
    The block input is added to the result (through a 1x1 convolution when
    the channel counts differ) and a final ReLU is applied.

    Args:
        n_inputs: Number of input channels.
        n_outputs: Number of output channels.
        kernel_size: Kernel size of both convolutions.
        stride: Stride of both convolutions.
        dilation: Dilation of both convolutions.
        padding: Padding of both convolutions, also the amount chomped off
            afterwards. With ``padding == (kernel_size - 1) * dilation`` and
            ``stride == 1`` the sequence length is preserved.
        dropout: Dropout probability.
    """

    def __init__(self, n_inputs, n_outputs, kernel_size, stride, dilation, padding, dropout=0.2):
        super(TemporalBlock, self).__init__()
        self.conv1 = weight_norm(nn.Conv1d(n_inputs, n_outputs, kernel_size,
                                           stride=stride, padding=padding, dilation=dilation))
        self.chomp1 = Chomp1d(padding)
        self.relu1 = nn.ReLU()
        self.dropout1 = nn.Dropout2d(dropout)
        self.conv2 = weight_norm(nn.Conv1d(n_outputs, n_outputs, kernel_size,
                                           stride=stride, padding=padding, dilation=dilation))
        self.chomp2 = Chomp1d(padding)
        self.relu2 = nn.ReLU()
        self.dropout2 = nn.Dropout2d(dropout)
        self.net = nn.Sequential(self.conv1, self.chomp1, self.relu1, self.dropout1,
                                 self.conv2, self.chomp2, self.relu2, self.dropout2)
        # A 1x1 convolution on the skip path is only needed when the channel
        # counts differ; otherwise the input is added unchanged.
        self.downsample = nn.Conv1d(n_inputs, n_outputs, 1) if n_inputs != n_outputs else None
        self.relu = nn.ReLU()
        self.init_weight()

    def init_weight(self):
        """Draw the convolution weights from N(0, 0.01)."""
        self.conv1.weight.data.normal_(0, 0.01)
        self.conv2.weight.data.normal_(0, 0.01)
        if self.downsample is not None:
            self.downsample.weight.data.normal_(0, 0.01)

    def forward(self, x):
        """Apply the two-convolution stack and add the residual connection."""
        out = self.net(x)
        # Fixed typo: this used to read `self.dowmsample`, which raised
        # AttributeError whenever n_inputs != n_outputs.
        res = x if self.downsample is None else self.downsample(x)
        return self.relu(out + res)
#时间卷积网络的架构
class TemporalConvNet(nn.Module):
    """Stack of ``TemporalBlock`` layers whose dilation doubles at each level.

    Args:
        num_inputs: Channel count of the network input.
        num_channels: Output channel count of every level, in order.
        kernel_size: Kernel size shared by all levels.
        dropout: Dropout probability shared by all levels.
    """

    def __init__(self, num_inputs, num_channels, kernel_size=2, dropout=0.2):
        super(TemporalConvNet, self).__init__()
        # Level k reads the output of level k-1; level 0 reads the input.
        widths = [num_inputs] + list(num_channels)
        blocks = []
        for level, (n_in, n_out) in enumerate(zip(widths, widths[1:])):
            dilation = 2 ** level
            blocks.append(
                TemporalBlock(
                    n_in,
                    n_out,
                    kernel_size,
                    stride=1,
                    dilation=dilation,
                    padding=(kernel_size - 1) * dilation,
                    dropout=dropout,
                )
            )
        self.network = nn.Sequential(*blocks)

    def forward(self, x):
        """Run ``x`` through every block in order."""
        return self.network(x)
#TCN
class TCN(nn.Module):
    """Language model: embedding -> temporal conv net -> linear decoder.

    Args:
        input_size: Embedding dimension, which is also the channel count fed
            to the temporal convolutional network.
        output_size: Vocabulary size (number of embeddings and number of
            decoder outputs).
        num_channels: Output channel count of every TCN level.
        kernel_size: Convolution kernel size.
        dropout: Dropout probability inside the TCN.
        emb_dropout: Dropout probability applied to the embeddings.
        tied_weight: When true, the decoder shares the encoder's weight.

    Raises:
        ValueError: If ``tied_weight`` is set and ``num_channels[-1]``
            differs from ``input_size``.
    """

    def __init__(self, input_size, output_size, num_channels, kernel_size=2, dropout=0.3, emb_dropout=0.1, tied_weight=False):
        super(TCN, self).__init__()
        self.encoder = nn.Embedding(output_size, input_size)
        self.tcn = TemporalConvNet(input_size, num_channels, kernel_size, dropout=dropout)
        self.decoder = nn.Linear(num_channels[-1], output_size)
        if tied_weight:
            # The shared matrix is (output_size, input_size) for the encoder
            # and (output_size, num_channels[-1]) for the decoder, so the two
            # inner sizes have to agree.
            if num_channels[-1] != input_size:
                # The message used to stop at "When using the tied flag".
                raise ValueError(
                    "When using the tied flag, num_channels[-1] must be equal to input_size")
            self.decoder.weight = self.encoder.weight
            print("Weight tied")
        self.drop = nn.Dropout(emb_dropout)
        self.emb_dropout = emb_dropout
        self.init_weights()

    def init_weights(self):
        """Zero the decoder bias and draw both weight matrices from N(0, 0.01)."""
        self.encoder.weight.data.normal_(0, 0.01)
        self.decoder.bias.data.fill_(0)
        self.decoder.weight.data.normal_(0, 0.01)

    def forward(self, input):
        """Return one score per vocabulary entry for every input position.

        ``input`` holds token ids; it is presumably ``(batch, seq_len)`` as
        produced by ``get_batch`` (TODO confirm for other callers).
        """
        emb = self.drop(self.encoder(input))
        # Conv1d expects the channel axis second, so swap axes 1 and 2 on
        # the way in and swap them back on the way out.
        y = self.tcn(emb.transpose(1, 2)).transpose(1, 2)
        y = self.decoder(y)
        return y.contiguous()
# Build the network, optionally move it to the GPU, then create the loss
# function and the optimizer named by `optimization`.
model = TCN(
    emsize,
    n_words,
    num_chans,
    kernel_size=k_size,
    dropout=dropout,
    emb_dropout=emb_dropout,
    tied_weight=tied,
)
if cuda:
    model.cuda()
criterion = nn.CrossEntropyLoss()
# The optimizer class is looked up by name in torch.optim (e.g. "SGD").
optimizer = getattr(optim, optimization)(model.parameters(), lr=lr)
def evaluate(data_source):
    """Return the average loss of the global ``model`` on ``data_source``.

    The data is scanned in windows of ``seq_len`` columns that advance by
    ``validseqlen`` columns. In every window only the positions after the
    first ``seq_len - validseqlen`` ones are scored; the window losses are
    averaged, weighted by the number of scored positions.

    Args:
        data_source: 2-D tensor of token ids as produced by ``batchify``.

    Returns:
        The weighted average loss as a Python float.

    Raises:
        ValueError: If ``data_source`` is too short to score any window.
            (This case used to fail with AttributeError because ``.item()``
            was called on the float ``0.0``.)
    """
    model.eval()
    eff_history = seq_len - validseqlen
    last_col = data_source.size(1) - 1
    total_loss = 0.0
    processed_data_size = 0
    # The removed `volatile` flag no longer disables autograd, so it is
    # switched off explicitly; this avoids building graphs during evaluation.
    with torch.no_grad():
        for i in range(0, last_col, validseqlen):
            if i + eff_history >= last_col:
                continue  # nothing left to score after the history part
            data, targets = get_batch(data_source, i, seq_len, evaluation=True)
            output = model(data)
            final_output = output[:, eff_history:].contiguous().view(-1, n_words)
            final_target = targets[:, eff_history:].contiguous().view(-1)
            loss = criterion(final_output, final_target)
            scored = data.size(1) - eff_history
            total_loss += scored * loss.item()
            processed_data_size += scored
    if processed_data_size == 0:
        raise ValueError("data_source is too short: no position could be evaluated")
    return total_loss / processed_data_size
#训练
def train():
    """Train the global ``model`` for one epoch over ``train_data``.

    The data is scanned in windows of ``seq_len`` columns that advance by
    ``validseqlen`` columns; in every window only the positions after the
    first ``seq_len - validseqlen`` ones contribute to the loss. Every
    ``log_interval`` batches a progress line is printed, using the
    module-level ``epoch`` and ``lr`` set by the training loop.

    Raises:
        ValueError: If ``validseqlen`` is larger than ``seq_len``.
    """
    # The check is loop-invariant, so it runs once up front instead of after
    # every forward pass.
    eff_history = seq_len - validseqlen
    if eff_history < 0:
        raise ValueError("Valid sequence length must be smaller than sequence length!")
    model.train()
    total_loss = 0.0
    start_time = time.time()
    last_col = train_data.size(1) - 1
    for batch_idx, i in enumerate(range(0, last_col, validseqlen)):
        if i + eff_history >= last_col:
            continue  # nothing left to score after the history part
        data, targets = get_batch(train_data, i, seq_len)
        optimizer.zero_grad()
        output = model(data)
        final_target = targets[:, eff_history:].contiguous().view(-1)
        final_output = output[:, eff_history:].contiguous().view(-1, n_words)
        loss = criterion(final_output, final_target)
        loss.backward()
        if clip > 0:
            # clip_grad_norm_ is the in-place successor of the deprecated
            # clip_grad_norm.
            torch.nn.utils.clip_grad_norm_(model.parameters(), clip)
        optimizer.step()
        # Accumulate a Python float so no tensor is kept alive between steps.
        total_loss += loss.item()
        if batch_idx % log_interval == 0 and batch_idx > 0:
            cur_loss = total_loss / log_interval
            elapsed = time.time() - start_time
            print("| epoch{:3d}|{:5d}/{:5d} batches | lr {:02.5f} | ms/batch{:5.5f}|loss{:5.2f} |ppl{:8.2f}".format(epoch, batch_idx, train_data.size(1) // validseqlen, lr, elapsed * 1000 / log_interval, cur_loss, math.exp(cur_loss)))
            total_loss = 0.0
            start_time = time.time()
import math
# Training driver: run `epochs` epochs, checkpoint the model whenever the
# validation loss improves, halve the learning rate on a plateau, and finally
# report the test loss of the checkpoint stored in model.pt.
best_vloss = 1e8  # lowest validation loss seen so far
try:
    all_vloss = []  # validation loss of every finished epoch, in order
    for epoch in range(1,epochs + 1):  # `epoch` is also read by train() for its log lines
        epoch_start_time = time.time()
        train()
        val_loss = evaluate(val_data)
        test_loss = evaluate(test_data)
        print("-" * 89)
        print("| end of epoch{:3d}|time:{:5.2f}s|valid loss{:5.2f}|valid ppl{:8.2f}".format(epoch,(time.time() - epoch_start_time),val_loss,math.exp(val_loss)))
        print("| end of epoch{:3d}|time:{:5.2f}s|test loss{:5.2f}|test ppl{:8.2f}".format(epoch,(time.time() - epoch_start_time),test_loss,math.exp(test_loss)))
        print("-" * 89)
        # Keep only the best model (by validation loss) on disk.
        if val_loss < best_vloss:
            with open("model.pt","wb") as f:
                print("Save model!\n")
                torch.save(model,f)
            best_vloss = val_loss
        # After epoch 5, halve the learning rate when the current validation
        # loss is no better than the worst of the previous five epochs.
        if epoch > 5 and val_loss >= max(all_vloss[-5:]):
            lr = lr / 2
            for param_group in optimizer.param_groups:
                param_group["lr"] = lr
        all_vloss.append(val_loss)
except KeyboardInterrupt:
    # Ctrl+C stops training early; the final evaluation below still runs.
    print("-" * 89)
    print("Exiting from training early")
# Reload the checkpoint and evaluate it on the test set.
# NOTE(review): this runs even when no checkpoint was written in this run
# (e.g. interrupted during the first epoch); model.pt must then already exist.
with open("model.pt","rb") as f:
    model = torch.load(f)
test_loss = evaluate(test_data)
print("-" * 89)
print("| End of training |test loss {:5.2f} | test ppl{:8.2f}".format(test_loss,math.exp(test_loss)))
print("-" * 89)
结果:
| epoch 1| 100/ 1452 batches | lr 4.00000 | ms/batch280.01413|loss 7.98 |ppl 2909.55
| epoch 1| 200/ 1452 batches | lr 4.00000 | ms/batch246.63045|loss 6.82 |ppl 913.98
| epoch 1| 300/ 1452 batches | lr 4.00000 | ms/batch246.75989|loss 6.59 |ppl 724.55
| epoch 1| 400/ 1452 batches | lr 4.00000 | ms/batch245.88235|loss 6.37 |ppl 584.86
| epoch 1| 500/ 1452 batches | lr 4.00000 | ms/batch245.80256|loss 6.23 |ppl 507.13
| epoch 1| 600/ 1452 batches | lr 4.00000 | ms/batch245.80250|loss 6.21 |ppl 497.23
| epoch 1| 700/ 1452 batches | lr 4.00000 | ms/batch245.87236|loss 6.12 |ppl 454.27
| epoch 1| 800/ 1452 batches | lr 4.00000 | ms/batch247.28858|loss 6.02 |ppl 409.61
| epoch 1| 900/ 1452 batches | lr 4.00000 | ms/batch248.93418|loss 5.98 |ppl 397.15
| epoch 1| 1000/ 1452 batches | lr 4.00000 | ms/batch246.29124|loss 5.93 |ppl 374.40
| epoch 1| 1100/ 1452 batches | lr 4.00000 | ms/batch245.70310|loss 5.90 |ppl 365.01
| epoch 1| 1200/ 1452 batches | lr 4.00000 | ms/batch245.88201|loss 5.89 |ppl 360.10
| epoch 1| 1300/ 1452 batches | lr 4.00000 | ms/batch247.16889|loss 5.77 |ppl 319.88
| epoch 1| 1400/ 1452 batches | lr 4.00000 | ms/batch246.34108|loss 5.76 |ppl 316.05
C:\study_soft\anaconda\lib\site-packages\ipykernel_launcher.py:3: UserWarning: volatile was removed and now has no effect. Use `with torch.no_grad():` instead.
This is separate from the ipykernel package so we can avoid doing imports until
-----------------------------------------------------------------------------------------
| end of epoch 1|time:377.76s|valid loss 5.65|valid ppl 284.48
| end of epoch 1|time:377.76s|test loss 5.62|test ppl 275.72
-----------------------------------------------------------------------------------------
Save model!
| epoch 2| 100/ 1452 batches | lr 4.00000 | ms/batch249.01427|loss 5.81 |ppl 334.69
| epoch 2| 200/ 1452 batches | lr 4.00000 | ms/batch246.79957|loss 5.70 |ppl 298.97
| epoch 2| 300/ 1452 batches | lr 4.00000 | ms/batch245.85242|loss 5.67 |ppl 291.44
| epoch 2| 400/ 1452 batches | lr 4.00000 | ms/batch246.45082|loss 5.56 |ppl 259.60
| epoch 2| 500/ 1452 batches | lr 4.00000 | ms/batch245.97209|loss 5.53 |ppl 253.25
| epoch 2| 600/ 1452 batches | lr 4.00000 | ms/batch246.24160|loss 5.59 |ppl 267.57
| epoch 2| 700/ 1452 batches | lr 4.00000 | ms/batch245.69263|loss 5.55 |ppl 258.13
| epoch 2| 800/ 1452 batches | lr 4.00000 | ms/batch245.52330|loss 5.52 |ppl 248.41
| epoch 2| 900/ 1452 batches | lr 4.00000 | ms/batch245.81253|loss 5.51 |ppl 248.10
| epoch 2| 1000/ 1452 batches | lr 4.00000 | ms/batch245.58347|loss 5.48 |ppl 240.21
| epoch 2| 1100/ 1452 batches | lr 4.00000 | ms/batch245.74290|loss 5.51 |ppl 246.12
| epoch 2| 1200/ 1452 batches | lr 4.00000 | ms/batch246.47044|loss 5.51 |ppl 247.05
| epoch 2| 1300/ 1452 batches | lr 4.00000 | ms/batch247.00934|loss 5.39 |ppl 219.67
| epoch 2| 1400/ 1452 batches | lr 4.00000 | ms/batch249.70210|loss 5.42 |ppl 226.40
-----------------------------------------------------------------------------------------
| end of epoch 2|time:374.45s|valid loss 5.37|valid ppl 215.94
| end of epoch 2|time:374.45s|test loss 5.34|test ppl 208.01
-----------------------------------------------------------------------------------------
Save model!
| epoch 3| 100/ 1452 batches | lr 4.00000 | ms/batch249.21343|loss 5.51 |ppl 246.17
| epoch 3| 200/ 1452 batches | lr 4.00000 | ms/batch246.24137|loss 5.43 |ppl 227.71
| epoch 3| 300/ 1452 batches | lr 4.00000 | ms/batch246.46080|loss 5.41 |ppl 224.13
| epoch 3| 400/ 1452 batches | lr 4.00000 | ms/batch248.13631|loss 5.30 |ppl 199.69
| epoch 3| 500/ 1452 batches | lr 4.00000 | ms/batch247.92523|loss 5.28 |ppl 196.96
| epoch 3| 600/ 1452 batches | lr 4.00000 | ms/batch246.82980|loss 5.35 |ppl 210.58
| epoch 3| 700/ 1452 batches | lr 4.00000 | ms/batch247.56773|loss 5.33 |ppl 206.88
| epoch 3| 800/ 1452 batches | lr 4.00000 | ms/batch246.76000|loss 5.30 |ppl 199.46
| epoch 3| 900/ 1452 batches | lr 4.00000 | ms/batch246.90958|loss 5.29 |ppl 199.29
| epoch 3| 1000/ 1452 batches | lr 4.00000 | ms/batch248.27629|loss 5.28 |ppl 196.51
| epoch 3| 1100/ 1452 batches | lr 4.00000 | ms/batch247.19882|loss 5.32 |ppl 205.03
| epoch 3| 1200/ 1452 batches | lr 4.00000 | ms/batch246.83944|loss 5.32 |ppl 204.87
| epoch 3| 1300/ 1452 batches | lr 4.00000 | ms/batch246.72045|loss 5.19 |ppl 180.27
| epoch 3| 1400/ 1452 batches | lr 4.00000 | ms/batch246.54025|loss 5.25 |ppl 190.62
-----------------------------------------------------------------------------------------
| end of epoch 3|time:375.52s|valid loss 5.23|valid ppl 186.82
| end of epoch 3|time:375.52s|test loss 5.18|test ppl 178.27
-----------------------------------------------------------------------------------------
Save model!
| epoch 4| 100/ 1452 batches | lr 4.00000 | ms/batch248.80432|loss 5.34 |ppl 208.22
| epoch 4| 200/ 1452 batches | lr 4.00000 | ms/batch246.83978|loss 5.26 |ppl 193.02
| epoch 4| 300/ 1452 batches | lr 4.00000 | ms/batch246.68035|loss 5.26 |ppl 193.25
| epoch 4| 400/ 1452 batches | lr 4.00000 | ms/batch246.38940|loss 5.13 |ppl 169.09
| epoch 4| 500/ 1452 batches | lr 4.00000 | ms/batch245.46346|loss 5.13 |ppl 169.24
| epoch 4| 600/ 1452 batches | lr 4.00000 | ms/batch246.73009|loss 5.20 |ppl 181.63
| epoch 4| 700/ 1452 batches | lr 4.00000 | ms/batch246.30150|loss 5.19 |ppl 179.32
| epoch 4| 800/ 1452 batches | lr 4.00000 | ms/batch245.74242|loss 5.16 |ppl 173.46
| epoch 4| 900/ 1452 batches | lr 4.00000 | ms/batch245.83215|loss 5.16 |ppl 174.91
| epoch 4| 1000/ 1452 batches | lr 4.00000 | ms/batch246.20148|loss 5.14 |ppl 171.07
| epoch 4| 1100/ 1452 batches | lr 4.00000 | ms/batch246.63070|loss 5.20 |ppl 181.71
| epoch 4| 1200/ 1452 batches | lr 4.00000 | ms/batch247.15859|loss 5.19 |ppl 180.17
| epoch 4| 1300/ 1452 batches | lr 4.00000 | ms/batch246.08685|loss 5.05 |ppl 156.78
| epoch 4| 1400/ 1452 batches | lr 4.00000 | ms/batch245.68288|loss 5.13 |ppl 168.84
-----------------------------------------------------------------------------------------
| end of epoch 4|time:374.32s|valid loss 5.13|valid ppl 168.77
| end of epoch 4|time:374.32s|test loss 5.08|test ppl 161.32
-----------------------------------------------------------------------------------------
Save model!
| epoch 5| 100/ 1452 batches | lr 4.00000 | ms/batch248.63499|loss 5.23 |ppl 187.33
| epoch 5| 200/ 1452 batches | lr 4.00000 | ms/batch245.99209|loss 5.16 |ppl 173.53
| epoch 5| 300/ 1452 batches | lr 4.00000 | ms/batch246.11167|loss 5.15 |ppl 172.13
| epoch 5| 400/ 1452 batches | lr 4.00000 | ms/batch246.11419|loss 5.01 |ppl 150.34
| epoch 5| 500/ 1452 batches | lr 4.00000 | ms/batch246.22650|loss 5.02 |ppl 151.81
| epoch 5| 600/ 1452 batches | lr 4.00000 | ms/batch246.05286|loss 5.10 |ppl 163.64
| epoch 5| 700/ 1452 batches | lr 4.00000 | ms/batch245.78291|loss 5.08 |ppl 161.54
| epoch 5| 800/ 1452 batches | lr 4.00000 | ms/batch246.14139|loss 5.05 |ppl 156.51
| epoch 5| 900/ 1452 batches | lr 4.00000 | ms/batch245.92964|loss 5.06 |ppl 157.16
| epoch 5| 1000/ 1452 batches | lr 4.00000 | ms/batch246.12137|loss 5.04 |ppl 154.88
| epoch 5| 1100/ 1452 batches | lr 4.00000 | ms/batch246.52372|loss 5.10 |ppl 164.31
| epoch 5| 1200/ 1452 batches | lr 4.00000 | ms/batch245.92190|loss 5.10 |ppl 164.37
| epoch 5| 1300/ 1452 batches | lr 4.00000 | ms/batch246.23444|loss 4.95 |ppl 141.31
| epoch 5| 1400/ 1452 batches | lr 4.00000 | ms/batch245.78305|loss 5.04 |ppl 154.05
-----------------------------------------------------------------------------------------
| end of epoch 5|time:374.01s|valid loss 5.07|valid ppl 158.45
| end of epoch 5|time:374.01s|test loss 5.02|test ppl 150.99
-----------------------------------------------------------------------------------------
Save model!
| epoch 6| 100/ 1452 batches | lr 4.00000 | ms/batch248.71913|loss 5.14 |ppl 170.33
| epoch 6| 200/ 1452 batches | lr 4.00000 | ms/batch246.10142|loss 5.07 |ppl 158.62
| epoch 6| 300/ 1452 batches | lr 4.00000 | ms/batch246.02196|loss 5.07 |ppl 158.47
| epoch 6| 400/ 1452 batches | lr 4.00000 | ms/batch245.78348|loss 4.93 |ppl 137.94
| epoch 6| 500/ 1452 batches | lr 4.00000 | ms/batch246.26133|loss 4.93 |ppl 138.39
| epoch 6| 600/ 1452 batches | lr 4.00000 | ms/batch245.99204|loss 5.00 |ppl 148.54
| epoch 6| 700/ 1452 batches | lr 4.00000 | ms/batch245.97210|loss 5.00 |ppl 148.63
| epoch 6| 800/ 1452 batches | lr 4.00000 | ms/batch246.17156|loss 4.97 |ppl 143.66
| epoch 6| 900/ 1452 batches | lr 4.00000 | ms/batch245.97853|loss 4.97 |ppl 144.41
| epoch 6| 1000/ 1452 batches | lr 4.00000 | ms/batch245.80253|loss 4.96 |ppl 141.88
| epoch 6| 1100/ 1452 batches | lr 4.00000 | ms/batch245.77245|loss 5.03 |ppl 153.47
| epoch 6| 1200/ 1452 batches | lr 4.00000 | ms/batch245.88457|loss 5.02 |ppl 150.72
| epoch 6| 1300/ 1452 batches | lr 4.00000 | ms/batch246.22144|loss 4.87 |ppl 129.84
| epoch 6| 1400/ 1452 batches | lr 4.00000 | ms/batch246.00063|loss 4.97 |ppl 143.66
-----------------------------------------------------------------------------------------
| end of epoch 6|time:374.04s|valid loss 5.01|valid ppl 149.86
| end of epoch 6|time:374.04s|test loss 4.96|test ppl 142.58
-----------------------------------------------------------------------------------------
Save model!
| epoch 7| 100/ 1452 batches | lr 4.00000 | ms/batch248.54545|loss 5.06 |ppl 157.99
| epoch 7| 200/ 1452 batches | lr 4.00000 | ms/batch245.83248|loss 4.99 |ppl 146.95
| epoch 7| 300/ 1452 batches | lr 4.00000 | ms/batch246.07182|loss 5.00 |ppl 147.68
| epoch 7| 400/ 1452 batches | lr 4.00000 | ms/batch245.79258|loss 4.85 |ppl 127.65
| epoch 7| 500/ 1452 batches | lr 4.00000 | ms/batch245.72767|loss 4.87 |ppl 130.20
| epoch 7| 600/ 1452 batches | lr 4.00000 | ms/batch245.75236|loss 4.93 |ppl 139.02
| epoch 7| 700/ 1452 batches | lr 4.00000 | ms/batch245.53363|loss 4.93 |ppl 138.79
| epoch 7| 800/ 1452 batches | lr 4.00000 | ms/batch245.99204|loss 4.90 |ppl 133.76
| epoch 7| 900/ 1452 batches | lr 4.00000 | ms/batch245.91230|loss 4.91 |ppl 135.29
| epoch 7| 1000/ 1452 batches | lr 4.00000 | ms/batch245.86267|loss 4.89 |ppl 132.52
| epoch 7| 1100/ 1452 batches | lr 4.00000 | ms/batch245.94178|loss 4.97 |ppl 143.56
| epoch 7| 1200/ 1452 batches | lr 4.00000 | ms/batch245.67290|loss 4.95 |ppl 140.96
| epoch 7| 1300/ 1452 batches | lr 4.00000 | ms/batch245.72305|loss 4.79 |ppl 120.41
| epoch 7| 1400/ 1452 batches | lr 4.00000 | ms/batch245.76272|loss 4.90 |ppl 134.25
-----------------------------------------------------------------------------------------
| end of epoch 7|time:373.78s|valid loss 4.96|valid ppl 142.14
| end of epoch 7|time:373.78s|test loss 4.91|test ppl 135.72
-----------------------------------------------------------------------------------------
Save model!
| epoch 8| 100/ 1452 batches | lr 4.00000 | ms/batch248.23614|loss 4.99 |ppl 146.92
| epoch 8| 200/ 1452 batches | lr 4.00000 | ms/batch245.86097|loss 4.93 |ppl 137.88
| epoch 8| 300/ 1452 batches | lr 4.00000 | ms/batch245.75274|loss 4.93 |ppl 139.06
| epoch 8| 400/ 1452 batches | lr 4.00000 | ms/batch245.77259|loss 4.78 |ppl 119.30
| epoch 8| 500/ 1452 batches | lr 4.00000 | ms/batch245.73273|loss 4.80 |ppl 121.78
| epoch 8| 600/ 1452 batches | lr 4.00000 | ms/batch245.65296|loss 4.87 |ppl 130.84
| epoch 8| 700/ 1452 batches | lr 4.00000 | ms/batch245.93220|loss 4.87 |ppl 130.26
| epoch 8| 800/ 1452 batches | lr 4.00000 | ms/batch245.85243|loss 4.83 |ppl 125.43
| epoch 8| 900/ 1452 batches | lr 4.00000 | ms/batch245.86239|loss 4.85 |ppl 127.30
| epoch 8| 1000/ 1452 batches | lr 4.00000 | ms/batch247.04923|loss 4.82 |ppl 123.81
| epoch 8| 1100/ 1452 batches | lr 4.00000 | ms/batch246.34851|loss 4.92 |ppl 136.48
| epoch 8| 1200/ 1452 batches | lr 4.00000 | ms/batch245.86233|loss 4.89 |ppl 132.87
| epoch 8| 1300/ 1452 batches | lr 4.00000 | ms/batch245.78239|loss 4.74 |ppl 114.05
| epoch 8| 1400/ 1452 batches | lr 4.00000 | ms/batch246.10208|loss 4.84 |ppl 127.05
-----------------------------------------------------------------------------------------
| end of epoch 8|time:373.95s|valid loss 4.91|valid ppl 136.15
| end of epoch 8|time:373.95s|test loss 4.87|test ppl 130.18
-----------------------------------------------------------------------------------------
Save model!
| epoch 9| 100/ 1452 batches | lr 4.00000 | ms/batch248.37098|loss 4.93 |ppl 138.72
| epoch 9| 200/ 1452 batches | lr 4.00000 | ms/batch246.13769|loss 4.88 |ppl 131.19
| epoch 9| 300/ 1452 batches | lr 4.00000 | ms/batch245.73247|loss 4.88 |ppl 131.61
| epoch 9| 400/ 1452 batches | lr 4.00000 | ms/batch245.92217|loss 4.73 |ppl 113.26
| epoch 9| 500/ 1452 batches | lr 4.00000 | ms/batch245.94756|loss 4.74 |ppl 114.95
| epoch 9| 600/ 1452 batches | lr 4.00000 | ms/batch245.94827|loss 4.81 |ppl 123.10
| epoch 9| 700/ 1452 batches | lr 4.00000 | ms/batch246.02216|loss 4.81 |ppl 123.10
| epoch 9| 800/ 1452 batches | lr 4.00000 | ms/batch245.81283|loss 4.78 |ppl 119.18
| epoch 9| 900/ 1452 batches | lr 4.00000 | ms/batch245.88699|loss 4.80 |ppl 120.94
| epoch 9| 1000/ 1452 batches | lr 4.00000 | ms/batch245.65296|loss 4.76 |ppl 116.75
| epoch 9| 1100/ 1452 batches | lr 4.00000 | ms/batch245.68288|loss 4.86 |ppl 128.96
| epoch 9| 1200/ 1452 batches | lr 4.00000 | ms/batch245.70279|loss 4.84 |ppl 125.95
| epoch 9| 1300/ 1452 batches | lr 4.00000 | ms/batch245.47396|loss 4.67 |ppl 106.81
| epoch 9| 1400/ 1452 batches | lr 4.00000 | ms/batch245.67257|loss 4.79 |ppl 120.77
-----------------------------------------------------------------------------------------
| end of epoch 9|time:373.76s|valid loss 4.87|valid ppl 130.85
| end of epoch 9|time:373.76s|test loss 4.83|test ppl 124.74
-----------------------------------------------------------------------------------------
Save model!
| epoch 10| 100/ 1452 batches | lr 4.00000 | ms/batch248.40121|loss 4.88 |ppl 131.73
| epoch 10| 200/ 1452 batches | lr 4.00000 | ms/batch245.74276|loss 4.83 |ppl 125.02
| epoch 10| 300/ 1452 batches | lr 4.00000 | ms/batch245.76266|loss 4.83 |ppl 124.85
| epoch 10| 400/ 1452 batches | lr 4.00000 | ms/batch245.49339|loss 4.68 |ppl 107.32
| epoch 10| 500/ 1452 batches | lr 4.00000 | ms/batch245.86963|loss 4.70 |ppl 109.82
| epoch 10| 600/ 1452 batches | lr 4.00000 | ms/batch245.52363|loss 4.77 |ppl 117.90
| epoch 10| 700/ 1452 batches | lr 4.00000 | ms/batch245.79255|loss 4.76 |ppl 117.31
| epoch 10| 800/ 1452 batches | lr 4.00000 | ms/batch246.01742|loss 4.73 |ppl 112.81
| epoch 10| 900/ 1452 batches | lr 4.00000 | ms/batch245.98019|loss 4.74 |ppl 114.54
| epoch 10| 1000/ 1452 batches | lr 4.00000 | ms/batch245.82754|loss 4.72 |ppl 112.25
| epoch 10| 1100/ 1452 batches | lr 4.00000 | ms/batch245.82964|loss 4.82 |ppl 123.96
| epoch 10| 1200/ 1452 batches | lr 4.00000 | ms/batch245.84798|loss 4.79 |ppl 120.52
| epoch 10| 1300/ 1452 batches | lr 4.00000 | ms/batch245.81256|loss 4.62 |ppl 101.88
| epoch 10| 1400/ 1452 batches | lr 4.00000 | ms/batch245.86656|loss 4.76 |ppl 117.08
-----------------------------------------------------------------------------------------
| end of epoch 10|time:373.74s|valid loss 4.85|valid ppl 127.99
| end of epoch 10|time:373.74s|test loss 4.80|test ppl 121.41
-----------------------------------------------------------------------------------------
Save model!
-----------------------------------------------------------------------------------------
| End of training |test loss 4.80 | test ppl 121.41
-----------------------------------------------------------------------------------------