About language models: the official PyTorch tutorials provide a reference example:
https://pytorch.org/tutorials/advanced/dynamic_quantization_tutorial.html?highlight=lstm
Main contents:
import random
from collections import defaultdict, Counter
from pathlib import Path
import os
import numpy as np
import torch
import torch.nn as nn
def set_random_seed(seed):
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.benchmark = False
    torch.backends.cudnn.deterministic = True
set_random_seed(2020)
Set the compute device and the dataset path
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print('device: ',device)
DATA_ROOT = '../data/'
print('PyTorch Version:', torch.__version__)
print('DATA_ROOT:',DATA_ROOT)
device: cpu
PyTorch Version: 1.2.0
DATA_ROOT: ../data/
Vocab
The Dictionary class stores the vocabulary; it holds the mapping between words (tokens) and indices in both directions.
class Dictionary(object):
    def __init__(self, vocab_path):
        self.stoi = {}  # token -> index (dict)
        self.itos = []  # index -> token (list)
        # Initialize two special tokens. The '<unk>' name is an assumption here;
        # the pad token has to sit at index 1, because the collate function below
        # pads with index 1 and builds its mask from it.
        self.stoi['<unk>'] = 0
        self.stoi['<pad>'] = 1
        self.itos.append('<unk>')
        self.itos.append('<pad>')
        with open(vocab_path, 'r', encoding='utf-8') as f:
            for w in f.readlines():
                self.add_word(w.strip())

    def add_word(self, word):
        if word not in self.stoi:
            self.itos.append(word)
            self.stoi[word] = len(self.itos) - 1
        return self.stoi[word]

    def __len__(self):
        return len(self.itos)
A quick test
vocab = Dictionary(os.path.join(DATA_ROOT, 'bobsue.voc.txt'))
print('Vocabulary size:', len(vocab))
print('-' * 60)
print('Sample (token -> index):')
print(list(vocab.stoi.items())[:5])
print('-' * 60)
print('Sample (index -> token):')
print(list(enumerate(vocab.itos))[:5])
Vocabulary size: 1500
------------------------------------------------------------
Sample (token -> index):
[('<unk>', 0), ('<pad>', 1), ('<s>', 2), ('</s>', 3), ('.', 4)]
------------------------------------------------------------
Sample (index -> token):
[(0, '<unk>'), (1, '<pad>'), (2, '<s>'), (3, '</s>'), (4, '.')]
Corpus
The Corpus class reads the training, validation and test corpora and uses the vocabulary to convert them into lists of indices.
class Corpus:
    def __init__(self, data_path, sort_by_len=False):
        self.vocab = Dictionary(os.path.join(data_path, 'bobsue.voc.txt'))
        self.sort_by_len = sort_by_len
        self.train_data = self.tokenize(os.path.join(data_path, 'bobsue.lm.train.txt'))
        self.valid_data = self.tokenize(os.path.join(data_path, 'bobsue.lm.dev.txt'))
        self.test_data = self.tokenize(os.path.join(data_path, 'bobsue.lm.test.txt'))

    def tokenize(self, text_path):
        with open(text_path, 'r', encoding='utf-8') as f:
            index_data = []  # one list of word indices per sample
            for s in f.readlines():
                index_data.append(
                    self.sentence_to_index(s)
                )
        if self.sort_by_len:  # sorting samples by length reduces padding and can speed up training
            index_data = sorted(index_data, key=lambda x: len(x), reverse=True)
        return index_data

    def sentence_to_index(self, s):
        return [self.vocab.stoi[w] for w in s.split()]

    def index_to_sentence(self, x):
        return ' '.join([self.vocab.itos[i] for i in x])
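The sentence_to_index lookup above uses stoi[w] directly, which is fine here because every word in the bobsue text files appears in bobsue.voc.txt. If the class were reused on unseen text, a fallback to the unknown token could look like the following sketch (a hypothetical helper, with the '<unk>' name assumed as above):
def sentence_to_index_safe(vocab, s):
    # Hypothetical helper, not part of the original Corpus class:
    # map out-of-vocabulary words to the <unk> index instead of raising a KeyError
    unk_idx = vocab.stoi.get('<unk>', 0)
    return [vocab.stoi.get(w, unk_idx) for w in s.split()]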
A quick test
corpus = Corpus(DATA_ROOT, sort_by_len=False)
print('Number of training sentences:', len(corpus.train_data))
print('Number of validation sentences:', len(corpus.valid_data))
print('Number of test sentences:', len(corpus.test_data))
print('-' * 60)
print('Total words in the training set:', sum([len(x) for x in corpus.train_data]))
print('Total words in the validation set:', sum([len(x) for x in corpus.valid_data]))
print('Total words in the test set:', sum([len(x) for x in corpus.test_data]))
print('-' * 60)
print('Predicted words in the training set:', sum([len(x) - 1 for x in corpus.train_data]))
print('Predicted words in the validation set:', sum([len(x) - 1 for x in corpus.valid_data]))
print('Predicted words in the test set:', sum([len(x) - 1 for x in corpus.test_data]))
print('-' * 60)
print('Data samples:')
for i in range(5):
    print(corpus.train_data[i])
    print(corpus.index_to_sentence(corpus.train_data[i]))
Number of training sentences: 6036
Number of validation sentences: 750
Number of test sentences: 750
------------------------------------------------------------
Total words in the training set: 71367
Total words in the validation set: 8707
Total words in the test set: 8809
------------------------------------------------------------
Predicted words in the training set: 65331
Predicted words in the validation set: 7957
Predicted words in the test set: 8059
------------------------------------------------------------
Data samples:
[2, 18, 237, 374, 12, 62, 5, 77, 620, 41, 4, 3]
<s> She ate quickly and asked to be taken home . </s>
[2, 40, 194, 224, 34, 33, 6, 4, 3]
<s> The girl broke up with Bob . </s>
[2, 9, 844, 4, 3]
<s> Sue apologized . </s>
[2, 14, 152, 20, 10, 263, 5, 548, 104, 7, 1099, 4, 3]
<s> He tried for a year to break into the market . </s>
[2, 202, 708, 16, 7, 28, 17, 429, 230, 4, 3]
<s> So far , the day had gone well . </s>
DataSet
We build our own language-model dataset by subclassing torch.utils.data.Dataset; the subclass has to implement the __len__ and __getitem__ methods. Our goal is to obtain a (feature, output) pair for each sentence.
class MyDataSet(torch.utils.data.Dataset):
    def __init__(self, index_data):
        self.index_data = index_data

    def __getitem__(self, i):
        # By the language-model definition, the first n-1 words are used to predict the last n-1 words
        feature = self.index_data[i][:-1]
        output = self.index_data[i][1:]
        return feature, output

    def __len__(self):
        return len(self.index_data)
A quick test
train_set = MyDataSet(corpus.train_data)
print('Training set size:', len(train_set))
print('Training set sample:')
print('\tInput: ', train_set[10][0])
print('\t       ', corpus.index_to_sentence(train_set[10][0]))
print('\tTarget:', train_set[10][1])
print('\t       ', corpus.index_to_sentence(train_set[10][1]))
Training set size: 6036
Training set sample:
	Input:  [2, 9, 210, 603, 30, 12, 1278, 27, 7, 711, 509, 4]
	        <s> Sue sat behind him and stared at the cute guy .
	Target: [9, 210, 603, 30, 12, 1278, 27, 7, 711, 509, 4, 3]
	        Sue sat behind him and stared at the cute guy . </s>
collate_fn
For details see the official documentation: https://pytorch.org/docs/stable/data.html
One remark from the official documentation:
A custom collate_fn can be used to customize collation, e.g., padding sequential data to max length of a batch. See this section on more about collate_fn.
We pad with PAD_IDX, looked up from the dictionary.
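Before the full collate function, here is a minimal sketch of how nn.utils.rnn.pad_sequence behaves on two toy index sequences (the values are made up for illustration):
import torch
import torch.nn as nn

a = torch.LongTensor([2, 18, 4, 3])  # a "sentence" of length 4
b = torch.LongTensor([2, 9, 3])      # a "sentence" of length 3
# pad_sequence stacks the sequences into one (batch, max_len) tensor,
# filling the shorter ones with padding_value
padded = nn.utils.rnn.pad_sequence([a, b], batch_first=True, padding_value=1)
print(padded)
# tensor([[ 2, 18,  4,  3],
#         [ 2,  9,  3,  1]])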
PAD_IDX = vocab.stoi['<pad>']  # padding index (1; the '<pad>' name is the one assumed in the Dictionary class above)
def lm_collate_fn(batch):
    """
    Pre-processing function applied by the DataLoader to each batch.
    The incoming batch has the form [(input_1, target_1), ..., (input_n, target_n)].
    """
    # Convert the format to [(input_1, ..., input_n), (target_1, ..., target_n)]
    batch = list(zip(*batch))
    # Build the list of sentence lengths
    lengths = torch.LongTensor([len(x) for x in batch[0]]).to(device)
    # Pad the inputs and the targets
    inputs = [torch.LongTensor(x).to(device) for x in batch[0]]
    inputs = nn.utils.rnn.pad_sequence(inputs, batch_first=True, padding_value=PAD_IDX)  # pad every sample to the length of the longest sentence in the batch
    targets = [torch.LongTensor(x).to(device) for x in batch[1]]
    targets = nn.utils.rnn.pad_sequence(targets, batch_first=True, padding_value=PAD_IDX)
    # Index 1 never occurs inside a real sentence, so a 1 can only come from padding; build the mask from that
    mask = (inputs != 1).float().to(device)  # 1 = real token / 0 = padding (padded positions must be excluded from the loss)
    # pack_padded_sequence is used later in training, so sort the batch by length in descending order here
    lengths, sorted_index = lengths.sort(descending=True)
    # Re-order inputs, targets and mask with the sorted indices
    inputs = inputs[sorted_index]
    targets = targets[sorted_index]
    mask = mask[sorted_index]
    return inputs, targets, lengths, mask
A quick test
test_loader = torch.utils.data.DataLoader(
dataset=train_set,
batch_size=64,
shuffle=False,
collate_fn=lm_collate_fn
)
inputs, targets, lengths, mask = next(iter(test_loader))
print('Inputs:')
print(inputs)
print('-' * 60)
print('Targets:')
print(targets)
print('-' * 60)
print('Mask:')
print(mask)
print('-' * 60)
print('Actual length of each sample:')
print(lengths)
print('-' * 60)
Inputs:
tensor([[ 2, 161, 10, ..., 38, 44, 4],
[ 2, 14, 233, ..., 94, 4, 1],
[ 2, 14, 185, ..., 4, 1, 1],
...,
[ 2, 14, 8, ..., 1, 1, 1],
[ 2, 6, 957, ..., 1, 1, 1],
[ 2, 9, 844, ..., 1, 1, 1]])
------------------------------------------------------------
Targets:
tensor([[161, 10, 272, ..., 44, 4, 3],
[ 14, 233, 27, ..., 4, 3, 1],
[ 14, 185, 5, ..., 3, 1, 1],
...,
[ 14, 8, 707, ..., 1, 1, 1],
[ 6, 957, 553, ..., 1, 1, 1],
[ 9, 844, 4, ..., 1, 1, 1]])
------------------------------------------------------------
Mask:
tensor([[1., 1., 1., ..., 1., 1., 1.],
[1., 1., 1., ..., 1., 1., 0.],
[1., 1., 1., ..., 1., 0., 0.],
...,
[1., 1., 1., ..., 0., 0., 0.],
[1., 1., 1., ..., 0., 0., 0.],
[1., 1., 1., ..., 0., 0., 0.]])
------------------------------------------------------------
Actual length of each sample:
tensor([17, 16, 15, 14, 14, 14, 14, 14, 14, 13, 13, 13, 13, 13, 13, 13, 13, 13,
12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 11, 11, 11, 11, 11, 11, 11, 11,
11, 11, 10, 10, 10, 10, 10, 10, 10, 10, 9, 9, 9, 9, 9, 9, 8, 8,
8, 8, 8, 7, 7, 6, 5, 5, 5, 4])
------------------------------------------------------------
forward
The forward pass uses pack_padded_sequence and pad_packed_sequence to handle variable-length inputs.
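A minimal sketch of that round trip on toy data (the tensors below are made up for illustration). Newer PyTorch versions also accept enforce_sorted=False in pack_padded_sequence, which would make the manual sorting in lm_collate_fn unnecessary:
import torch
import torch.nn as nn

emb = torch.randn(2, 4, 8)      # (batch=2, max_len=4, features=8); the second sample is only 3 steps long
lengths = torch.tensor([4, 3])  # lengths sorted in descending order

packed = nn.utils.rnn.pack_padded_sequence(emb, lengths, batch_first=True)
lstm = nn.LSTM(input_size=8, hidden_size=16, batch_first=True)
packed_out, _ = lstm(packed)    # the LSTM never sees the padded time step

out, out_lengths = nn.utils.rnn.pad_packed_sequence(packed_out, batch_first=True)
print(out.shape)                # torch.Size([2, 4, 16])
print(out_lengths)              # tensor([4, 3])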
class BiLSTM(nn.Module):
    """Language model network architecture.
    Args:
        vocab_size: number of words in the vocabulary
        embedding_size: word-embedding dimension
        hidden_size: dimension of the LSTM hidden state
        dropout: dropout probability
    """
    def __init__(self, vocab_size, embedding_size=200, hidden_size=200, dropout=0.5, num_layers=1):
        """
        input_size (embedding_size) – the number of expected features in the input x
        hidden_size – the number of features in the hidden state h
        num_layers – number of recurrent layers
        batch_first – if True, the input and output tensors are provided as (batch, seq, feature). Default: False
        """
        super(BiLSTM, self).__init__()
        self.num_layers = num_layers
        self.drop = nn.Dropout(dropout)
        self.embed = nn.Embedding(vocab_size, embedding_size)
        self.encoder = nn.LSTM(
            input_size=embedding_size,
            hidden_size=hidden_size,
            num_layers=num_layers,
            batch_first=True, bidirectional=True)
        # Setting bidirectional=False makes for an instructive comparison; the difference is very noticeable
        self.decoder = nn.Linear(2 * hidden_size, vocab_size)

    def forward(self, inputs, lengths):
        # inputs shape: (batch_size, max_length)
        # x_emb shape: (batch_size, max_length, embed_size)
        x_emb = self.drop(self.embed(inputs))
        packed_emb = nn.utils.rnn.pack_padded_sequence(
            x_emb,
            lengths,
            batch_first=True
        )
        # h_0 and c_0 use the default all-zero initialization; the final hidden states are discarded
        packed_out, _ = self.encoder(packed_emb)
        # x_out shape: (batch_size, max_length, 2 * hidden_size)
        x_out, _ = nn.utils.rnn.pad_packed_sequence(
            packed_out, batch_first=True
        )
        # outputs shape: (batch, max_length, vocab_size)
        return self.decoder(self.drop(x_out))
model = BiLSTM(len(corpus.vocab), 200, 200)
model.to(device)
BiLSTM(
(drop): Dropout(p=0.5, inplace=False)
(embed): Embedding(1500, 200)
(encoder): LSTM(200, 200, batch_first=True, bidirectional=True)
(decoder): Linear(in_features=400, out_features=1500, bias=True)
)
A quick test
inputs, targets, lengths, mask = next(iter(test_loader))
outputs = model(inputs, lengths)
print('Model input shape: ', inputs.shape)
print('Model output shape:', outputs.shape)
Model input shape:  torch.Size([64, 17])
Model output shape: torch.Size([64, 17, 1500])
MaskCrossEntropyLoss
We wrap nn.CrossEntropyLoss(reduction='none') and use the mask so that padded positions do not contribute to the averaged loss.
class MaskCrossEntropyLoss(nn.Module):
    def __init__(self):
        super(MaskCrossEntropyLoss, self).__init__()
        self.celoss = nn.CrossEntropyLoss(reduction='none')

    def forward(self, outputs, targets, mask):
        # outputs shape: (batch_size * max_len, vocab_size)
        outputs = outputs.view(-1, outputs.size(2))  # outputs.size(2) is the vocabulary size
        # targets shape: (batch_size * max_len)
        targets = targets.view(-1)
        # mask shape: (batch_size * max_len)
        mask = mask.view(-1)
        # Multiplying by the mask sets the loss at padded positions to 0
        loss = self.celoss(outputs, targets) * mask
        # Sum of the non-zero losses divided by the number of real tokens = average loss per real token
        return torch.sum(loss) / torch.sum(mask)
A quick test
inputs, targets, lengths, mask = next(iter(test_loader))
outputs = model(inputs, lengths)
criterion = MaskCrossEntropyLoss().to(device)
loss = criterion(outputs, targets, mask)
print('Loss value:', loss)
Loss value: tensor(7.3246, grad_fn=<DivBackward0>)
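To make the masking step concrete, here is a small sketch with made-up logits and targets: two sentences of three positions each over a five-word vocabulary, where the last position of the second sentence is padding.
import torch
import torch.nn as nn

celoss = nn.CrossEntropyLoss(reduction='none')
outputs = torch.randn(2, 3, 5)                  # made-up logits
targets = torch.tensor([[2, 3, 4],
                        [4, 2, 1]])             # the final 1 is a padded target
mask = torch.tensor([[1., 1., 1.],
                     [1., 1., 0.]])

per_token = celoss(outputs.view(-1, 5), targets.view(-1)) * mask.view(-1)
print(per_token.sum() / mask.sum())             # averaged over the 5 real tokens only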
LanguageModelLearner
The LanguageModelLearner class carries out the model training.
class LanguageModelLearner:
    def __init__(self, corpus, embedding_size=200, hidden_size=200, dropout=0.5,
                 batch_size=128, early_stopping_round=5):
        self.corpus = corpus
        self.batch_size = batch_size  # number of samples per batch
        self.early_stopping_round = early_stopping_round
        self.model = BiLSTM(len(corpus.vocab), embedding_size, hidden_size, dropout).to(device)  # initialize the model
        self.criterion = MaskCrossEntropyLoss().to(device)  # custom CrossEntropyLoss that ignores padded positions (works better)
        self.optimizer = torch.optim.Adam(self.model.parameters())  # use the Adam optimizer to update the weights
        self.history = defaultdict(list)  # dict of lists storing the training history
    def fit(self, num_epochs):
        # Training-set DataLoader
        train_set = MyDataSet(self.corpus.train_data)
        train_loader = torch.utils.data.DataLoader(
            dataset=train_set,
            batch_size=self.batch_size,
            shuffle=True,
            collate_fn=lm_collate_fn  # use the custom lm_collate_fn to pad each batch to a uniform length
        )
        # Validation-set DataLoader
        valid_set = MyDataSet(self.corpus.valid_data)
        valid_loader = torch.utils.data.DataLoader(
            dataset=valid_set,
            batch_size=self.batch_size,
            shuffle=True,
            collate_fn=lm_collate_fn
        )
        # Number of epochs without improvement on the validation set, used for early stopping
        no_improve_round = 0
        for epoch in range(num_epochs):
            train_loss, train_acc, train_words = self._make_train_step(train_loader)
            print(f'Epoch {epoch+1}:')
            print('Train Step --> Loss: {:.3f}, Acc: {:.3f}, Words: {}'.format(
                train_loss, train_acc, train_words))
            # Record the training metrics
            self.history['train_loss'].append(train_loss)
            self.history['train_acc'].append(train_acc)
            valid_loss, valid_acc, valid_words = self._make_valid_step(valid_loader)
            print('Valid Step --> Loss: {:.3f}, Acc: {:.3f}, Words: {}'.format(
                valid_loss, valid_acc, valid_words))
            self.history['valid_loss'].append(valid_loss)
            self.history['valid_acc'].append(valid_acc)
            # Early stopping based on the validation accuracy
            if self.history['valid_acc'][-1] < max(self.history['valid_acc']):
                no_improve_round += 1
            else:
                no_improve_round = 0
            if no_improve_round == self.early_stopping_round:
                print(f'Early Stopping at Epoch {epoch+1}')
                break
    def predict(self):
        test_set = MyDataSet(self.corpus.test_data)
        # Note: do not shuffle, and use batch_size=1, so the results are easy to analyse later
        test_loader = torch.utils.data.DataLoader(
            dataset=test_set,
            batch_size=1,
            shuffle=False,
            collate_fn=lm_collate_fn
        )
        # Evaluation mode
        self.model.eval()
        # Accumulated loss
        total_loss = 0.0
        # Number of correct predictions, total number of words
        total_correct, total_words = 0, 0
        # Prediction results, containing preds and targets
        test_result = defaultdict(list)
        with torch.no_grad():
            for inputs, targets, lengths, mask in test_loader:
                # Model forward pass
                outputs = self.model(inputs, lengths)
                # Count correct predictions in this batch
                total_correct += (outputs.argmax(-1) == targets).sum().item()
                # Count words predicted in this batch
                total_words += torch.sum(lengths).item()
                # Record the results
                test_result['preds'].append(outputs.argmax(-1).data.cpu().numpy()[0])
                test_result['targets'].append(targets.data.cpu().numpy()[0])
                # Masked cross-entropy loss
                loss = self.criterion(outputs, targets, mask)
                # Accumulate the total loss
                total_loss += loss.item() * torch.sum(mask).item()
        return total_loss / total_words, total_correct / total_words, total_words, test_result
    def _make_train_step(self, train_loader):
        # Training mode
        self.model.train()
        # Accumulated loss
        total_loss = 0.0
        # Number of correct predictions, total number of words
        total_correct, total_words = 0, 0
        for inputs, targets, lengths, mask in train_loader:
            # Model forward pass
            outputs = self.model(inputs, lengths)
            # Count correct predictions in this batch
            total_correct += (outputs.argmax(-1) == targets).sum().item()
            # Count words predicted in this batch
            total_words += torch.sum(lengths).item()
            # Masked cross-entropy loss
            loss = self.criterion(outputs, targets, mask)
            # Accumulate the total loss (torch.sum(mask) is the number of real, non-padded words)
            total_loss += loss.item() * torch.sum(mask).item()
            # Back-propagation and parameter update
            self.optimizer.zero_grad()
            loss.backward()        # compute the gradients d_loss/dw
            self.optimizer.step()  # update the weights with the Adam rule
        return total_loss / total_words, total_correct / total_words, total_words
    def _make_valid_step(self, valid_loader):
        # Evaluation mode
        self.model.eval()
        # Accumulated loss
        total_loss = 0.0
        # Number of correct predictions, total number of words
        total_correct, total_words = 0, 0
        with torch.no_grad():
            for inputs, targets, lengths, mask in valid_loader:
                # Model forward pass
                outputs = self.model(inputs, lengths)
                # Count correct predictions in this batch
                total_correct += (outputs.argmax(-1) == targets).sum().item()
                # Count words predicted in this batch
                total_words += torch.sum(lengths).item()
                # Masked cross-entropy loss
                loss = self.criterion(outputs, targets, mask)
                # Accumulate the total loss
                total_loss += loss.item() * torch.sum(mask).item()
        return total_loss / total_words, total_correct / total_words, total_words
torch.cuda.empty_cache()
learner = LanguageModelLearner(corpus, embedding_size=200, hidden_size=200, dropout=0.5, batch_size=128)
learner.fit(10)
Epoch 1:
Train Step --> Loss: 5.357, Acc: 0.252, Words: 65331
Valid Step --> Loss: 3.945, Acc: 0.385, Words: 7957
Epoch 2:
Train Step --> Loss: 3.596, Acc: 0.436, Words: 65331
Valid Step --> Loss: 2.950, Acc: 0.522, Words: 7957
Epoch 3:
Train Step --> Loss: 2.857, Acc: 0.539, Words: 65331
Valid Step --> Loss: 2.375, Acc: 0.609, Words: 7957
Epoch 4:
Train Step --> Loss: 2.395, Acc: 0.603, Words: 65331
Valid Step --> Loss: 1.997, Acc: 0.668, Words: 7957
Epoch 5:
Train Step --> Loss: 2.080, Acc: 0.648, Words: 65331
Valid Step --> Loss: 1.725, Acc: 0.714, Words: 7957
Epoch 6:
Train Step --> Loss: 1.836, Acc: 0.685, Words: 65331
Valid Step --> Loss: 1.500, Acc: 0.751, Words: 7957
Epoch 7:
Train Step --> Loss: 1.630, Acc: 0.718, Words: 65331
Valid Step --> Loss: 1.321, Acc: 0.781, Words: 7957
Epoch 8:
Train Step --> Loss: 1.457, Acc: 0.744, Words: 65331
Valid Step --> Loss: 1.168, Acc: 0.808, Words: 7957
Epoch 9:
Train Step --> Loss: 1.307, Acc: 0.768, Words: 65331
Valid Step --> Loss: 1.037, Acc: 0.831, Words: 7957
Epoch 10:
Train Step --> Loss: 1.182, Acc: 0.788, Words: 65331
Valid Step --> Loss: 0.921, Acc: 0.849, Words: 7957
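The learner also records the per-epoch losses and accuracies in learner.history; a minimal sketch for visualising them, assuming matplotlib is available (it is not imported in the original code):
import matplotlib.pyplot as plt

# Plot the training and validation loss curves from the recorded history
plt.figure(figsize=(8, 4))
plt.plot(learner.history['train_loss'], label='train loss')
plt.plot(learner.history['valid_loss'], label='valid loss')
plt.xlabel('epoch')
plt.ylabel('masked cross-entropy loss')
plt.legend()
plt.show()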
test_loss, test_acc, test_words, test_result = learner.predict()
print('Results on the test set --> Loss: {:.3f}, Acc: {:.3f}, Words: {}'.format(
    test_loss, test_acc, test_words))
Results on the test set --> Loss: 0.978, Acc: 0.842, Words: 8059
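Since the reported loss is the average cross-entropy per predicted word, its exponential can be read as the test-set perplexity; this small helper is not part of the original notebook:
import math

# Perplexity = exp(average per-word cross-entropy); with the loss above, exp(0.978) ≈ 2.66
print('Test perplexity: {:.2f}'.format(math.exp(test_loss)))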
print('Number of predicted sentences:', len(test_result['preds']))
print('-' * 60)
sample_index = 20
print('Sample result:')
print('Predicted indices\t', test_result['preds'][sample_index])
print('Target indices\t', test_result['targets'][sample_index])
print('Predicted sentence\t', corpus.index_to_sentence(test_result['preds'][sample_index]))
print('Target sentence\t', corpus.index_to_sentence(test_result['targets'][sample_index]))
Number of predicted sentences: 750
------------------------------------------------------------
Sample result:
Predicted indices	 [373 16 15 227 6 33 33 108 200 200 4 3]
Target indices	 [373 16 15 227 6 188 33 108 241 200 4 3]
Predicted sentence	 Suddenly , he noticed Bob with with some kids kids . </s>
Target sentence	 Suddenly , he noticed Bob outside with some other kids . </s>
Judging from the results above, the model does reasonably well. Next we count the most frequent (target, prediction) mistakes:
mistake_counter = Counter()
for i in range(len(test_result['targets'])):
    for j in range(len(test_result['targets'][i])):
        pred, target = test_result['preds'][i][j], test_result['targets'][i][j]
        if pred != target:
            pred, target = corpus.vocab.itos[pred], corpus.vocab.itos[target]
            mistake_counter[(target, pred)] += 1
mistake_counter.most_common(35)
[(('brand', 'new'), 3),
(('couple', 'few'), 3),
(('ring', 'that'), 2),
(('skating', 'cream'), 2),
(('two', 'weeks'), 2),
(('move', 'find'), 2),
(('crying', 'started'), 2),
(('whole', 'time'), 2),
(('worried', 'nervous'), 2),
(('ticket', 'for'), 2),
(('eye', '.'), 2),
(('sand', 'party'), 2),
(('finish', 'get'), 2),
(('black', 'great'), 2),
(('staying', 'been'), 2),
(('complained', 'about'), 2),
(('big', 'next'), 2),
(('several', 'she'), 2),
(('high', 'school'), 2),
(('sister', 'dad'), 2),
(('grandpa', 'parents'), 2),
(('clean', 'up'), 2),
(('talked', 'never'), 2),
(('soon', 'as'), 2),
(('show', 'be'), 2),
(('on', 'one'), 2),
(('delicious', 'and'), 2),
(('fast', 'as'), 2),
(('mean', 'away'), 2),
(('gathered', 'told'), 2),
(('Soon', 'Suddenly'), 2),
(('problem', 'big'), 2),
(('playing', 'his'), 2),
(('met', 'found'), 2),
(('possible', 'well'), 2)]
We used a BiLSTM language model here, and the results look quite good. We can also count the most frequently correct (target, prediction) pairs:
correct_counter = Counter()
for i in range(len(test_result['targets'])):
    for j in range(len(test_result['targets'][i])):
        pred, target = test_result['preds'][i][j], test_result['targets'][i][j]
        if pred == target:
            pred, target = corpus.vocab.itos[pred], corpus.vocab.itos[target]
            correct_counter[(target, pred)] += 1
correct_counter.most_common(35)
[(('</s>', '</s>'), 747),
(('.', '.'), 720),
(('to', 'to'), 313),
(('the', 'the'), 241),
(('Bob', 'Bob'), 198),
(('was', 'was'), 184),
(('a', 'a'), 147),
(('Sue', 'Sue'), 144),
(('her', 'her'), 141),
(('He', 'He'), 136),
(('he', 'he'), 128),
(('and', 'and'), 124),
(('She', 'She'), 112),
((',', ','), 106),
(('his', 'his'), 104),
(('she', 'she'), 81),
(('it', 'it'), 78),
(('had', 'had'), 68),
(('for', 'for'), 67),
(('in', 'in'), 66),
(('of', 'of'), 57),
(('decided', 'decided'), 56),
(('on', 'on'), 47),
(('day', 'day'), 45),
(('him', 'him'), 44),
(('got', 'got'), 41),
(("'t", "'t"), 39),
(("'s", "'s"), 34),
(('went', 'went'), 34),
(('with', 'with'), 30),
(('up', 'up'), 30),
(('at', 'at'), 30),
(('out', 'out'), 30),
(('get', 'get'), 29),
(('go', 'go'), 28)]