Use BERT to extract features from text data, namely word and sentence embedding vectors.
What can we do with these word and sentence embedding vectors?
Full code:
import torch
import matplotlib.pyplot as plt
from pytorch_pretrained_bert import BertTokenizer, BertModel
tokenizer = BertTokenizer.from_pretrained(r'D:\Pretrained_model\bert-base-uncased')
text01 = "Here is the sentence I want embeddings for."
text02 = "After stealing money from the bank vault, the bank robber was seen fishing on the Mississippi river bank."
marked_text = "[CLS] " + text01 + " [SEP] " + text02 + " [SEP]"
print('marked_text = ', marked_text)
subtokenized_text = tokenizer.tokenize(marked_text)
print('subtokenized_text = ', subtokenized_text)
subtoken_ids = tokenizer.convert_tokens_to_ids(subtokenized_text)
print('subtoken_ids = ', subtoken_ids)
for idx, tup in enumerate(zip(subtokenized_text, subtoken_ids)):
    print("idx = {0}; tup = {1}".format(idx, tup))
sep_positions = [-1] + [idx for idx, subtoken_id in enumerate(subtoken_ids) if subtoken_id == 102]  # positions of the [SEP] tokens among all subtokens of the text
print('sep_positions = ', sep_positions)
segment_len_list = [sep_positions[i] - sep_positions[i - 1] for i in range(1, len(sep_positions))]  # length of the text between consecutive [SEP] tokens
print('segment_len_list = ', segment_len_list)
segments_ids = []  # each segment's length includes its own [CLS]/[SEP]
for i, segment_len in enumerate(segment_len_list):
    if i % 2 == 0:
        segments_ids += segment_len * [0]
    else:
        segments_ids += segment_len * [1]
print('segments_ids = ', segments_ids)
# Convert inputs to PyTorch tensors
subtokens_tensor = torch.tensor([subtoken_ids])
segments_tensor = torch.tensor([segments_ids])
# Load pre-trained model (weights)
model = BertModel.from_pretrained(r'D:\Pretrained_model\bert-base-uncased')
# Put the model in "evaluation" mode, meaning feed-forward operation.
model.eval()
print('model = ', model)
# Predict hidden states features for each layer
with torch.no_grad():
    encoded_layers, pooled_output = model(input_ids=subtokens_tensor, token_type_ids=segments_tensor)
print('encoded_layers = \n', encoded_layers)
print("Number of layers:", len(encoded_layers))
layer_i = 0
print("Number of batches:", len(encoded_layers[layer_i]))
batch_i = 0
print("Number of tokens:", len(encoded_layers[layer_i][batch_i]))
token_i = 0
print("Number of hidden units:", len(encoded_layers[layer_i][batch_i][token_i]))
# For the 5th token in our sentence, select its feature values from layer 5.
token_i = 5
layer_i = 5
vec = encoded_layers[layer_i][batch_i][token_i]
# Plot the values as a histogram to show their distribution.
plt.figure(figsize=(10, 10))
plt.hist(vec, bins=200)
plt.show()
# [ # tokens, # layers, # features]
# Convert the hidden state embeddings into single token vectors
# Holds the list of 12 layer embeddings for each token
# Will have the shape: [# tokens, # layers, # features]
token_embeddings = []
# For each token in the sentence...
for token_i in range(len(subtokenized_text)):
    # Holds 12 layers of hidden states for each token
    hidden_layers = []
    # For each of the 12 layers...
    for layer_i in range(len(encoded_layers)):
        # Lookup the vector for `token_i` in `layer_i`
        vec = encoded_layers[layer_i][batch_i][token_i]
        hidden_layers.append(vec)
    token_embeddings.append(hidden_layers)
# Sanity check the dimensions:
print("\nNumber of tokens in sequence:", len(token_embeddings))
print("Number of layers per token:", len(token_embeddings[0]))
concatenated_last_4_layers = [torch.cat((layer[-1], layer[-2], layer[-3], layer[-4]), 0) for layer in token_embeddings] # [number_of_tokens, 3072]
summed_last_4_layers = [torch.sum(torch.stack(layer)[-4:], 0) for layer in token_embeddings] # [number_of_tokens, 768]
print("len(concatenated_last_4_layers) = {0}; concatenated_last_4_layers[0].shape = {1}".format(len(concatenated_last_4_layers), concatenated_last_4_layers[0].shape))
print("len(summed_last_4_layers) = {0}; summed_last_4_layers[0].shape = {1}".format(len(summed_last_4_layers), summed_last_4_layers[0].shape))
# Sentence embedding (mean of the last layer's token vectors)
sentence_embedding = torch.mean(encoded_layers[11], 1)
print("Our final sentence embedding vector of shape:", sentence_embedding.shape)
bank_vec01 = summed_last_4_layers[19][:5]
print("First 5 values of 'bank' as in 'bank vault':", bank_vec01)
bank_vec02 = summed_last_4_layers[23][:5]
print("First 5 values of 'bank' as in 'bank robber':", bank_vec02)
bank_vec03 = summed_last_4_layers[32][:5]
print("First 5 values of 'bank' as in 'river bank':", bank_vec03)
from sklearn.metrics.pairwise import cosine_similarity
# Compare "bank" as in "bank robber" to "bank" as in "bank vault"
same_bank = cosine_similarity(summed_last_4_layers[19].reshape(1, -1), summed_last_4_layers[23].reshape(1, -1))[0][0]
print("Similarity of 'bank' as in 'bank robber' to 'bank' as in 'bank vault':", same_bank)
# Compare "bank" as in "bank robber" to "bank" as in "river bank"
different_bank = cosine_similarity(summed_last_4_layers[23].reshape(1, -1), summed_last_4_layers[32].reshape(1, -1))[0][0]
print("Similarity of 'bank' as in 'bank robber' to 'bank' as in 'river bank':", different_bank)
Install the PyTorch interface from Hugging Face's GitHub repository. (This library also contains interfaces for other pretrained language models, such as OpenAI's GPT and GPT-2.) We chose the PyTorch interface because it strikes a good balance between the high-level API (easy to use but offering little insight into how things work) and the TensorFlow code (which contains lots of detail but would sidetrack us into lessons about TensorFlow, when the point here is BERT!).
!pip install pytorch-pretrained-bert
Now we import pytorch, the pretrained BERT model, and the BERT tokenizer.
from pytorch_pretrained_bert import BertTokenizer
tokenizer = BertTokenizer.from_pretrained(r'D:\Pretrained_model\bert-base-uncased')
Because BERT is a pretrained model, it expects its input data in a specific format, so we need: special tokens to mark the beginning ([CLS]) and the separation/end of sentences ([SEP]), token ids that match the tokenizer's fixed vocabulary, and segment ids that tell BERT which sentence each token belongs to.
Fortunately, the interface handles some of these input specifications for us, so we only have to create a few of them manually. BERT can accept one or two sentences as input and expects special tokens marking the beginning and end of each sentence:
from pytorch_pretrained_bert import BertTokenizer
tokenizer = BertTokenizer.from_pretrained(r'D:\Pretrained_model\bert-base-uncased')
text01 = "Here is the sentence I want embeddings for."
text02 = "After stealing money from the bank vault, the bank robber was seen fishing on the Mississippi river bank."
marked_text = "[CLS] " + text01 + " [SEP] " + text02 + " [SEP]"
print('marked_text = ', marked_text)
subtokenized_text = tokenizer.tokenize(marked_text)
print('subtokenized_text = ', subtokenized_text)
Output:
marked_text = [CLS] Here is the sentence I want embeddings for. [SEP] After stealing money from the bank vault, the bank robber was seen fishing on the Mississippi river bank. [SEP]
subtokenized_text = ['[CLS]', 'here', 'is', 'the', 'sentence', 'i', 'want', 'em', '##bed', '##ding', '##s', 'for', '.', '[SEP]', 'after', 'stealing', 'money', 'from', 'the', 'bank', 'vault', ',', 'the', 'bank', 'robber', 'was', 'seen', 'fishing', 'on', 'the', 'mississippi', 'river', 'bank', '.', '[SEP]']
Notice how the word "embeddings" is represented:
['em', '##bed', '##ding', '##s']
The original word has been split into smaller subwords and characters. The two '#' characters in front of a subword are simply the tokenizer's way of denoting that this subword or character is part of a larger word and is preceded by another subword. So, for example, the '##bed' token is separate from the 'bed' token; the first is used whenever the substring 'bed' occurs inside a larger word, while the second is the standalone token 'bed', meaning "the thing you sleep on".
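To make the distinction concrete, we can look up both tokens in the vocabulary. A minimal sketch, reusing the tokenizer loaded above (the exact ids depend on the bert-base-uncased vocabulary and are not shown here):
# 'bed' (standalone word) and '##bed' (word-internal subword) are distinct
# vocabulary entries, so they map to different ids and different embeddings.
print(tokenizer.convert_tokens_to_ids(['bed', '##bed']))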
Why does it look this way? This is because the BERT tokenizer was created with a WordPiece model. This model greedily builds a fixed-size vocabulary of individual characters, subwords, and whole words that best fits our language data. Since the vocabulary of our BERT tokenizer model is limited to about 30,000 entries, the WordPiece model generated a vocabulary containing all English characters plus the roughly 30,000 most common words and subwords found in the English corpus the model was trained on. This vocabulary contains four kinds of things: whole words; subwords occurring at the front of a word or in isolation; subwords not at the front of a word, which are prefixed with '##' to mark this case; and individual characters.
To tokenize a word under this model, the tokenizer first checks whether the whole word is in the vocabulary; if not, it tries to break the word into the largest possible subwords contained in the vocabulary, and as a last resort it decomposes the word into individual characters.
For this reason, we can always represent a word as, at the very least, the collection of its individual characters.
Next, we need to call the tokenizer to map the tokens to their indices in the tokenizer's vocabulary:
subtoken_ids = tokenizer.convert_tokens_to_ids(subtokenized_text)
print('subtoken_ids = ', subtoken_ids)
for tup in zip(subtokenized_text, subtoken_ids):
    print("tup = ", tup)
Output:
subtoken_ids = [101, 2182, 2003, 1996, 6251, 1045, 2215, 7861, 8270, 4667, 2015, 2005, 1012, 102, 2044, 11065, 2769, 2013, 1996, 2924, 11632, 1010, 1996, 2924, 27307, 2001, 2464, 5645, 2006, 1996, 5900, 2314, 2924, 1012, 102]
tup = ('[CLS]', 101)
tup = ('here', 2182)
tup = ('is', 2003)
tup = ('the', 1996)
tup = ('sentence', 6251)
tup = ('i', 1045)
tup = ('want', 2215)
tup = ('em', 7861)
tup = ('##bed', 8270)
tup = ('##ding', 4667)
tup = ('##s', 2015)
tup = ('for', 2005)
tup = ('.', 1012)
tup = ('[SEP]', 102)
tup = ('after', 2044)
tup = ('stealing', 11065)
tup = ('money', 2769)
tup = ('from', 2013)
tup = ('the', 1996)
tup = ('bank', 2924)
tup = ('vault', 11632)
tup = (',', 1010)
tup = ('the', 1996)
tup = ('bank', 2924)
tup = ('robber', 27307)
tup = ('was', 2001)
tup = ('seen', 2464)
tup = ('fishing', 5645)
tup = ('on', 2006)
tup = ('the', 1996)
tup = ('mississippi', 5900)
tup = ('river', 2314)
tup = ('bank', 2924)
tup = ('.', 1012)
tup = ('[SEP]', 102)
Note: because we can always represent a word as, at the very least, the collection of its individual characters, words that are not in the vocabulary do not get assigned to a catch-all token such as 'OOV' or 'UNK'. Instead, out-of-vocabulary words are decomposed into subword and character tokens for which we can then generate embeddings. So rather than mapping 'embeddings' (and every other out-of-vocabulary word) to one overloaded unknown token, we split it into the subword tokens ['em', '##bed', '##ding', '##s'], which retain some of the contextual meaning of the original word. We can even average the embedding vectors of these subwords to generate an approximate vector for the original word (a small sketch of this appears after the tokenizer demo below).
from pytorch_transformers import BertTokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
vocab_size = tokenizer.vocab_size
print('vocab_size = ', vocab_size)
word_piece01 = tokenizer.tokenize('embedding')
print('\nword_piece01 = ', word_piece01)
input_ids_01 = tokenizer.encode("embedding")
print('input_ids_01 = ', input_ids_01)
word_piece02 = tokenizer.tokenize("oyzbzl")
print('\nword_piece02 = ', word_piece02)
input_ids_02 = tokenizer.encode("oyzbzl")
print('input_ids_02 = ', input_ids_02)
for tup in zip(word_piece02, input_ids_02):
    print("tup = ", tup)
Output:
vocab_size = 30522
word_piece01 = ['em', '##bed', '##ding']
input_ids_01 = [7861, 8270, 4667]
word_piece02 = ['o', '##y', '##z', '##b', '##z', '##l']
input_ids_02 = [1051, 2100, 2480, 2497, 2480, 2140]
tup = ('o', 1051)
tup = ('##y', 2100)
tup = ('##z', 2480)
tup = ('##b', 2497)
tup = ('##z', 2480)
tup = ('##l', 2140)
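As noted above, one way to obtain an approximate vector for the original word "embeddings" is to average the vectors of its subword tokens. A minimal sketch, assuming the per-token vectors summed_last_4_layers computed in the full code at the top (the 'em', '##bed', '##ding', '##s' subtokens sit at positions 7-10 of our sequence):
# Stack the four subword vectors and average them into one 768-dimensional vector.
embeddings_word_vec = torch.mean(torch.stack(summed_last_4_layers[7:11]), dim=0)
print('embeddings_word_vec.shape = ', embeddings_word_vec.shape)  # torch.Size([768])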
BERT was trained on sentence pairs and expects 1s and 0s to distinguish between the two sentences. That is, for every token in subtokenized_text we must specify which sentence it belongs to: sentence 0 (a series of 0s) or sentence 1 (a series of 1s). If we only had a single-sentence input, we would simply need a series of 1s, i.e., a vector of 1s for every token in the input sentence.
If you want to process two sentences, assign every token of the first sentence the id 0 and every token of the second sentence the id 1.
Note: the length of each segment includes its own [CLS]/[SEP] tokens.
sep_positions = [-1] + [idx for idx, subtoken_id in enumerate(subtoken_ids) if subtoken_id == 102]  # positions of the [SEP] tokens among all subtokens of the text
print('sep_positions = ', sep_positions)
segment_len_list = [sep_positions[i] - sep_positions[i - 1] for i in range(1, len(sep_positions))]  # length of the text between consecutive [SEP] tokens
print('segment_len_list = ', segment_len_list)
segments_ids = []  # each segment's length includes its own [CLS]/[SEP]
for i, segment_len in enumerate(segment_len_list):
    if i % 2 == 0:
        segments_ids += segment_len * [0]
    else:
        segments_ids += segment_len * [1]
print('segments_ids = ', segments_ids)
Output:
sep_positions = [-1, 13, 34]
segment_len_list = [14, 21]
segments_ids = [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]
Next, we need to convert our data to torch tensors and call the BERT model. The BERT PyTorch interface requires the data to be torch tensors rather than Python lists, so we convert the lists here; this does not change the shape or the data.
# Convert inputs to PyTorch tensors
subtokens_tensor = torch.tensor([subtoken_ids])
segments_tensor = torch.tensor([segments_ids])
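As a quick sanity check, converting to tensors keeps the shape of the data; a small sketch (the 35 comes from the 14 + 21 subtokens in segment_len_list above):
# Both tensors have one batch dimension and one entry per subtoken.
print('subtokens_tensor.shape = ', subtokens_tensor.shape)  # expected: torch.Size([1, 35])
print('segments_tensor.shape = ', segments_tensor.shape)    # expected: torch.Size([1, 35])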
model.eval() puts our model in evaluation mode rather than training mode. In this case, evaluation mode turns off the dropout regularization that is used during training.
Calling from_pretrained fetches the model. When we load bert-base-uncased, we see the model definition printed in the log. The model is a 12-layer deep neural network!
# Load pre-trained model (weights)
model = BertModel.from_pretrained(r'D:\Pretrained_model\bert-base-uncased')
# Put the model in "evaluation" mode, meaning feed-forward operation.
model.eval()
print('model = ', model)
Output:
model = BertModel(
  (embeddings): BertEmbeddings(
    (word_embeddings): Embedding(30522, 768, padding_idx=0)
    (position_embeddings): Embedding(512, 768)
    (token_type_embeddings): Embedding(2, 768)
    (LayerNorm): BertLayerNorm()
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (encoder): BertEncoder(
    (layer): ModuleList(
      (0): BertLayer(
        (attention): BertAttention(
          (self): BertSelfAttention(
            (query): Linear(in_features=768, out_features=768, bias=True)
            (key): Linear(in_features=768, out_features=768, bias=True)
            (value): Linear(in_features=768, out_features=768, bias=True)
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (output): BertSelfOutput(
            (dense): Linear(in_features=768, out_features=768, bias=True)
            (LayerNorm): BertLayerNorm()
            (dropout): Dropout(p=0.1, inplace=False)
          )
        )
        (intermediate): BertIntermediate(
          (dense): Linear(in_features=768, out_features=3072, bias=True)
        )
        (output): BertOutput(
          (dense): Linear(in_features=3072, out_features=768, bias=True)
          (LayerNorm): BertLayerNorm()
          (dropout): Dropout(p=0.1, inplace=False)
        )
      )
      ...................  (layers 1 to 10 omitted)
      (11): BertLayer(
        (attention): BertAttention(
          (self): BertSelfAttention(
            (query): Linear(in_features=768, out_features=768, bias=True)
            (key): Linear(in_features=768, out_features=768, bias=True)
            (value): Linear(in_features=768, out_features=768, bias=True)
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (output): BertSelfOutput(
            (dense): Linear(in_features=768, out_features=768, bias=True)
            (LayerNorm): BertLayerNorm()
            (dropout): Dropout(p=0.1, inplace=False)
          )
        )
        (intermediate): BertIntermediate(
          (dense): Linear(in_features=768, out_features=3072, bias=True)
        )
        (output): BertOutput(
          (dense): Linear(in_features=3072, out_features=768, bias=True)
          (LayerNorm): BertLayerNorm()
          (dropout): Dropout(p=0.1, inplace=False)
        )
      )
    )
  )
  (pooler): BertPooler(
    (dense): Linear(in_features=768, out_features=768, bias=True)
    (activation): Tanh()
  )
)
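To get a feel for how big this 12-layer network is, we can count its parameters; a minimal sketch (bert-base-uncased has on the order of 110 million parameters):
# Sum the number of elements in every weight and bias tensor of the model.
total_params = sum(p.numel() for p in model.parameters())
print('total_params = ', total_params)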
Next, let's fetch the hidden states of the network.
torch.no_grad disables gradient computation, which saves memory and speeds things up (we don't need gradients or backpropagation, since we are only running a forward pass).
# Predict hidden states features for each layer
with torch.no_grad():
    encoded_layers, pooled_output = model(input_ids=subtokens_tensor, token_type_ids=segments_tensor)
print('encoded_layers = \n', encoded_layers)