Google BERT repository: https://github.com/google-research/bert
Environment requirements: TensorFlow 1.11.0, Python 2 or Python 3 (verified to run with TensorFlow 1.12.0 and Python 3.6).
First, download the required files from GitHub: the bert source archive, a pre-trained model (pick the one that fits your needs; this article uses uncased_L-12_H-768_A-12), and the dataset for your task (this article uses the MRPC task, i.e. deciding whether two sentences express the same meaning, as the running example).
Once everything is downloaded, place it in the project directory. The layout used in this article is as follows:
--BERT
  --bert-master
  --GLUE
    --BERT_BASE_DIR
      --uncased_L-12_H-768_A-12
    --glue_data
      --MRPC
    --output
With the project in place, run run_classifier.py with the following arguments (the relative paths assume the script is run from inside bert-master):
python run_classifier.py \
  --task_name=MRPC \
  --do_train=true \
  --do_eval=true \
  --data_dir=../GLUE/glue_data/MRPC \
  --vocab_file=../GLUE/BERT_BASE_DIR/uncased_L-12_H-768_A-12/vocab.txt \
  --bert_config_file=../GLUE/BERT_BASE_DIR/uncased_L-12_H-768_A-12/bert_config.json \
  --init_checkpoint=../GLUE/BERT_BASE_DIR/uncased_L-12_H-768_A-12/bert_model.ckpt \
  --max_seq_length=128 \
  --train_batch_size=32 \
  --learning_rate=2e-5 \
  --num_train_epochs=3.0 \
  --output_dir=../GLUE/output
When the run finishes, the script reports eval_accuracy, eval_loss, global_step, loss, and related metrics.
Now let's walk through the core code of the model.
First, the training file train.tsv is read and the examples are stored in train_examples:
# In main()
train_examples = processor.get_train_examples(FLAGS.data_dir)

# get_train_examples()
def get_train_examples(self, data_dir):
  """See base class."""
  return self._create_examples(
      self._read_tsv(os.path.join(data_dir, "train.tsv")), "train")
# _create_examples()
def _create_examples(self, lines, set_type):
  """Creates examples for the training and dev sets."""
  examples = []
  for (i, line) in enumerate(lines):
    if i == 0:
      continue  # skip the header row
    guid = "%s-%s" % (set_type, i)
    # Read the first sentence; tokenization.convert_to_unicode converts the text to unicode.
    text_a = tokenization.convert_to_unicode(line[3])
    # E.g. text_a: 'Amrozi accused his brother, whom he called "the witness", of deliberately distorting his evidence.'
    # Read the second sentence.
    text_b = tokenization.convert_to_unicode(line[4])
    # E.g. text_b: 'Referring to him as only "the witness", Amrozi accused his brother of deliberately distorting his evidence.'
    if set_type == "test":
      label = "0"  # at prediction time the label is unknown, so a dummy "0" is used
    else:
      label = tokenization.convert_to_unicode(line[0])  # at training time, read the example's label
    # Wrap the fields in BERT's input format and append to the examples list.
    examples.append(
        InputExample(guid=guid, text_a=text_a, text_b=text_b, label=label))
    # E.g. example: (guid: 'train-1', text_a, text_b, label='1')
  return examples
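To make the mapping concrete, here is a toy sketch of what one MRPC row becomes (the row values are made up, and a namedtuple stands in for the InputExample class defined in run_classifier.py):

from collections import namedtuple

# Stand-in for run_classifier.InputExample, only to show the field mapping.
InputExample = namedtuple("InputExample", ["guid", "text_a", "text_b", "label"])

# Column indices follow _create_examples() above: label in column 0, the two sentences in columns 3 and 4.
line = ["1", "id_a", "id_b", "Sentence A goes here .", "Sentence B goes here ."]
example = InputExample(guid="train-1", text_a=line[3], text_b=line[4], label=line[0])
print(example.guid, example.label)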
With the training examples loaded, compute how many optimization steps the training run needs:
# train_batch_size = 32
# num_train_epochs = 3.0
num_train_steps = int(
    len(train_examples) / FLAGS.train_batch_size * FLAGS.num_train_epochs)
num_warmup_steps = int(num_train_steps * FLAGS.warmup_proportion)
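As a quick sanity check, here is the arithmetic with plausible numbers (the example count of roughly 3,668 for MRPC and the default warmup_proportion of 0.1 are assumptions, not values printed by the script above):

# Illustrative arithmetic only; the example count and warmup_proportion are assumptions.
num_examples = 3668            # approximate size of MRPC train.tsv
train_batch_size = 32
num_train_epochs = 3.0
warmup_proportion = 0.1        # run_classifier.py default

num_train_steps = int(num_examples / train_batch_size * num_train_epochs)   # 343
num_warmup_steps = int(num_train_steps * warmup_proportion)                 # 34
print(num_train_steps, num_warmup_steps)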
Next, the raw examples are preprocessed; this is the key part.

# Data preprocessing in main()
file_based_convert_examples_to_features(
    train_examples, label_list, FLAGS.max_seq_length, tokenizer, train_file)
Inside file_based_convert_examples_to_features(), each example is preprocessed in turn and the result is stored as an InputFeatures object:
# file_based_convert_examples_to_features()
for (ex_index, example) in enumerate(examples):
  if ex_index % 10000 == 0:
    tf.logging.info("Writing example %d of %d" % (ex_index, len(examples)))
  feature = convert_single_example(ex_index, example, label_list,
                                   max_seq_length, tokenizer)
convert_single_example() tokenizes the input, maps tokens to vocabulary ids, and adds the extra bookkeeping information:
def convert_single_example(ex_index, example, label_list, max_seq_length,
                           tokenizer):
  """Converts a single `InputExample` into a single `InputFeatures`."""
  if isinstance(example, PaddingInputExample):
    return InputFeatures(
        input_ids=[0] * max_seq_length,
        input_mask=[0] * max_seq_length,
        segment_ids=[0] * max_seq_length,
        label_id=0,
        is_real_example=False)

  label_map = {}
  for (i, label) in enumerate(label_list):
    label_map[label] = i  # map labels to integers; e.g. label_map: {'0': 0, '1': 1}

  tokens_a = tokenizer.tokenize(example.text_a)  # tokenize text_a
  # E.g. tokens_a: ['am', '##ro', '##zi', 'accused', 'his', 'brother', ',', 'whom', 'he', 'called', '"', 'the', 'witness', '"', ',', 'of', 'deliberately', 'di', '##stor', '##ting', 'his', 'evidence', '.']
  tokens_b = None
  if example.text_b:
    tokens_b = tokenizer.tokenize(example.text_b)  # if there is a second sentence, tokenize text_b as well
    # E.g. tokens_b: ['referring', 'to', 'him', 'as', 'only', '"', 'the', 'witness', '"', ',', 'am', '##ro', '##zi', 'accused', 'his', 'brother', 'of', 'deliberately', 'di', '##stor', '##ting', 'his', 'evidence', '.']

  if tokens_b:
    # If tokens_b exists, truncate tokens_a and tokens_b so that the total length fits;
    # the [CLS], [SEP], [SEP] tokens take 3 slots, so the budget is max_seq_length - 3.
    _truncate_seq_pair(tokens_a, tokens_b, max_seq_length - 3)
  else:
    # Otherwise only [CLS] and [SEP] are needed, so the budget is max_seq_length - 2.
    if len(tokens_a) > max_seq_length - 2:
      tokens_a = tokens_a[0:(max_seq_length - 2)]

  tokens = []        # will hold [CLS] tokens_a [SEP] tokens_b [SEP]
  segment_ids = []   # marks which sentence a token belongs to: 0 = first sentence, 1 = second sentence
  tokens.append("[CLS]")   # sequence-start token [CLS]
  segment_ids.append(0)    # segment id 0 for [CLS]
  for token in tokens_a:
    tokens.append(token)   # append tokens_a
    segment_ids.append(0)  # segment id 0 marks the first sentence
  tokens.append("[SEP]")   # separator token between the two sentences
  segment_ids.append(0)    # segment id 0 for this [SEP]

  if tokens_b:
    for token in tokens_b:
      tokens.append(token)   # append tokens_b
      segment_ids.append(1)  # segment id 1 marks the second sentence
    tokens.append("[SEP]")   # closing [SEP] after the second sentence
    segment_ids.append(1)    # segment id 1 for this [SEP]
  # E.g. tokens: ['[CLS]', 'am', '##ro', '##zi', 'accused', 'his', 'brother', ',', 'whom', 'he', 'called', '"', 'the', 'witness', '"', ',', 'of', 'deliberately', 'di', '##stor', '##ting', 'his', 'evidence', '.', '[SEP]', 'referring', 'to', 'him', 'as', 'only', '"', 'the', 'witness', '"', ',', 'am', '##ro', '##zi', 'accused', 'his', 'brother', 'of', 'deliberately', 'di', '##stor', '##ting', 'his', 'evidence', '.', '[SEP]']
  # E.g. segment_ids: [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]

  input_ids = tokenizer.convert_tokens_to_ids(tokens)  # map each token to its vocabulary id
  # E.g. input_ids: [101, 2572, 3217, 5831, 5496, 2010, 2567, 1010, 3183, 2002, 2170, 1000, 1996, 7409, 1000, 1010, 1997, 9969, 4487, 23809, 3436, 2010, 3350, 1012, 102, 7727, 2000, 2032, 2004, 2069, 1000, 1996, 7409, 1000, 1010, 2572, 3217, 5831, 5496, 2010, 2567, 1997, 9969, 4487, 23809, 3436, 2010, 3350, 1012, 102]

  # input_mask: 1 means the position holds a real token, 0 means it is padding added only to reach max_seq_length.
  input_mask = [1] * len(input_ids)  # ones for every real token
  # Pad with zeros up to max_seq_length.
  while len(input_ids) < max_seq_length:
    input_ids.append(0)
    input_mask.append(0)
    segment_ids.append(0)
  # E.g. input_ids: [101, 2572, 3217, 5831, 5496, 2010, 2567, 1010, 3183, 2002, 2170, 1000, 1996, 7409, 1000, 1010, 1997, 9969, 4487, 23809, 3436, 2010, 3350, 1012, 102, 7727, 2000, 2032, 2004, 2069, 1000, 1996, 7409, 1000, 1010, 2572, 3217, 5831, 5496, 2010, 2567, 1997, 9969, 4487, 23809, 3436, 2010, 3350, 1012, 102, 0, 0 ...]
  # E.g. input_mask: [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0 ...]
  # E.g. segment_ids: [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0 ...]

  # All three lists must now have exactly max_seq_length entries.
  assert len(input_ids) == max_seq_length
  assert len(input_mask) == max_seq_length
  assert len(segment_ids) == max_seq_length

  # Look up the integer id of the label.
  label_id = label_map[example.label]
  # E.g. label_id: 1

  # Log the first few examples for inspection.
  if ex_index < 5:
    tf.logging.info("*** Example ***")
    tf.logging.info("guid: %s" % (example.guid))
    tf.logging.info("tokens: %s" % " ".join(
        [tokenization.printable_text(x) for x in tokens]))
    tf.logging.info("input_ids: %s" % " ".join([str(x) for x in input_ids]))
    tf.logging.info("input_mask: %s" % " ".join([str(x) for x in input_mask]))
    tf.logging.info("segment_ids: %s" % " ".join([str(x) for x in segment_ids]))
    tf.logging.info("label: %s (id = %d)" % (example.label, label_id))

  # Pack all of the features into an InputFeatures object.
  feature = InputFeatures(
      input_ids=input_ids,
      input_mask=input_mask,
      segment_ids=segment_ids,
      label_id=label_id,
      is_real_example=True)
  return feature
A few special lists and tokens in convert_single_example() deserve attention: [CLS] marks the start of the sequence, [SEP] separates and terminates the two sentences, input_ids holds the vocabulary ids of the tokens, input_mask distinguishes real tokens (1) from padding (0), segment_ids distinguishes the first sentence (0) from the second (1), and label_id is the integer label.
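For intuition, the same bookkeeping can be reproduced in a self-contained toy sketch (made-up tokens, a fake vocabulary, and max_seq_length=12; this is not the real WordPiece tokenizer):

# Toy sketch of the [CLS]/[SEP]/padding bookkeeping above; tokens and lengths are made up.
tokens_a = ["hello", "world"]
tokens_b = ["hi", "there"]
max_seq_length = 12

tokens = ["[CLS]"] + tokens_a + ["[SEP]"] + tokens_b + ["[SEP]"]
segment_ids = [0] * (len(tokens_a) + 2) + [1] * (len(tokens_b) + 1)

fake_vocab = {t: i for i, t in enumerate(set(tokens), start=1)}  # stand-in for vocab.txt
input_ids = [fake_vocab[t] for t in tokens]
input_mask = [1] * len(input_ids)

while len(input_ids) < max_seq_length:      # pad to the fixed length
    input_ids.append(0)
    input_mask.append(0)
    segment_ids.append(0)

assert len(input_ids) == len(input_mask) == len(segment_ids) == max_seq_length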
The preprocessed data is then fed into the model for training:
model = modeling.BertModel(
    config=bert_config,
    is_training=is_training,
    input_ids=input_ids,          # token ids, shape (batch_size, max_seq_length); e.g. (8, 128)
    input_mask=input_mask,        # mask, shape (batch_size, max_seq_length); e.g. (8, 128)
    token_type_ids=segment_ids,   # segment ids, shape (batch_size, max_seq_length); e.g. (8, 128)
    use_one_hot_embeddings=use_one_hot_embeddings)
# modeling.BertModel()
config = copy.deepcopy(config)
if not is_training:
  config.hidden_dropout_prob = 0.0
  config.attention_probs_dropout_prob = 0.0

input_shape = get_shape_list(input_ids, expected_rank=2)
batch_size = input_shape[0]   # batch size; e.g. 8
seq_length = input_shape[1]   # maximum sequence length; e.g. 128

if input_mask is None:        # if no mask is provided, default to all ones (every position is a real token)
  input_mask = tf.ones(shape=[batch_size, seq_length], dtype=tf.int32)
if token_type_ids is None:    # if no segment ids are provided, assume there is only one sentence
  token_type_ids = tf.zeros(shape=[batch_size, seq_length], dtype=tf.int32)
The embeddings layer maps the input ids to word vectors of the configured dimension and then adds token-type (segment) and position information.
The embeddings layer is implemented as follows:
with tf.variable_scope(scope, default_name="bert"):   # build the BERT model
  with tf.variable_scope("embeddings"):               # build the embeddings layer
    # Perform embedding lookup on the word ids.
    (self.embedding_output, self.embedding_table) = embedding_lookup(
        input_ids=input_ids,                        # the input id sequence
        vocab_size=config.vocab_size,               # vocabulary size of the pre-trained model
        embedding_size=config.hidden_size,          # dimension of each word embedding
        initializer_range=config.initializer_range, # range for weight initialization
        word_embedding_name="word_embeddings",
        use_one_hot_embeddings=use_one_hot_embeddings)

    # Add token-type and position information.
    self.embedding_output = embedding_postprocessor(
        input_tensor=self.embedding_output,         # output of the previous step, (batch_size, seq_length, embedding_size)
        use_token_type=True,                        # whether to use token_type_ids to mark which sentence a token belongs to
        token_type_ids=token_type_ids,              # the segment ids
        token_type_vocab_size=config.type_vocab_size,
        token_type_embedding_name="token_type_embeddings",
        use_position_embeddings=True,               # whether to add position information
        position_embedding_name="position_embeddings",
        initializer_range=config.initializer_range,
        max_position_embeddings=config.max_position_embeddings,  # maximum supported number of positions
        dropout_prob=config.hidden_dropout_prob)
Let's look at embedding_lookup() first; it converts token ids into word vectors.
Input: (batch_size, seq_length), e.g. (8, 128): 8 examples with 128 tokens each.
Output: (batch_size, seq_length, embedding_size), e.g. (8, 128, 768): each of the 128 tokens is mapped to a 768-dimensional vector.
def embedding_lookup(input_ids,
                     vocab_size,
                     embedding_size=128,
                     initializer_range=0.02,
                     word_embedding_name="word_embeddings",
                     use_one_hot_embeddings=False):
  # Returns shape [batch_size, seq_length, embedding_size]; e.g. [8, 128, 768].
  if input_ids.shape.ndims == 2:  # the input is 2-D and the output is 3-D, so add an extra axis first
    input_ids = tf.expand_dims(input_ids, axis=[-1])

  # The embedding table for the whole vocabulary.
  embedding_table = tf.get_variable(
      name=word_embedding_name,
      shape=[vocab_size, embedding_size],   # [vocabulary size, embedding dimension]
      initializer=create_initializer(initializer_range))

  flat_input_ids = tf.reshape(input_ids, [-1])  # total number of ids to look up: batch_size * seq_length; e.g. 8 * 128 = 1024
  if use_one_hot_embeddings:
    one_hot_input_ids = tf.one_hot(flat_input_ids, depth=vocab_size)
    output = tf.matmul(one_hot_input_ids, embedding_table)
  else:
    output = tf.gather(embedding_table, flat_input_ids)  # look up each id; result is (number of ids, embedding_size); e.g. (1024, 768)

  input_shape = get_shape_list(input_ids)
  output = tf.reshape(output,
                      input_shape[0:-1] + [input_shape[-1] * embedding_size])  # reshape to (batch_size, seq_length, embedding_size); e.g. (8, 128, 768)
  return (output, embedding_table)
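The lookup itself is just row indexing into the embedding table. A minimal NumPy sketch of the same idea (illustrative only, not the TensorFlow code above; the ids are made up and BERT-Base sizes are assumed):

import numpy as np

vocab_size, embedding_size = 30522, 768           # BERT-Base uncased sizes, used here for illustration
embedding_table = np.random.randn(vocab_size, embedding_size).astype(np.float32)

input_ids = np.array([[101, 2572, 3217, 102],     # (batch_size=2, seq_length=4), made-up ids
                      [101, 7727, 2000, 102]])

flat_ids = input_ids.reshape(-1)                  # (8,)
output = embedding_table[flat_ids]                # row lookup, like tf.gather -> (8, 768)
output = output.reshape(input_ids.shape + (embedding_size,))
print(output.shape)                               # (2, 4, 768)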
Next is embedding_postprocessor(), which adds the token_type_ids and position information to the sequence; its input and output have the same shape, (batch_size, seq_length, embedding_size).
def embedding_postprocessor(input_tensor,
                            use_token_type=False,
                            token_type_ids=None,
                            token_type_embedding_name="token_type_embeddings",
                            token_type_vocab_size=16,
                            use_position_embeddings=True,
                            position_embedding_name="position_embeddings",
                            initializer_range=0.02,
                            max_position_embeddings=512,
                            dropout_prob=0.1):
  input_shape = get_shape_list(input_tensor, expected_rank=3)  # embedding output of the previous step; e.g. (8, 128, 768)
  batch_size = input_shape[0]  # batch size; e.g. 8
  seq_length = input_shape[1]  # number of tokens per example; e.g. 128
  width = input_shape[2]       # embedding dimension; e.g. 768

  output = input_tensor        # start from the word embeddings; e.g. (8, 128, 768)

  if use_token_type:
    if token_type_ids is None:  # token_type_ids are required when use_token_type is set
      raise ValueError("`token_type_ids` must be specified if "
                       "`use_token_type` is True.")
    # Embedding table for the token types.
    token_type_table = tf.get_variable(
        name=token_type_embedding_name,
        shape=[token_type_vocab_size, width],
        initializer=create_initializer(initializer_range))  # [number of segment types, embedding dimension]; for sentence pairs only 0 (first) and 1 (second) are used; e.g. (2, 768)
    flat_token_type_ids = tf.reshape(token_type_ids, [-1])  # number of lookups: batch_size * seq_length; e.g. 8 * 128 = 1024
    one_hot_ids = tf.one_hot(flat_token_type_ids, depth=token_type_vocab_size)  # (number of tokens, token_type_vocab_size)
    token_type_embeddings = tf.matmul(one_hot_ids, token_type_table)  # matrix multiply to get each token's segment embedding; (number of tokens, embedding dimension); e.g. (1024, 768)
    token_type_embeddings = tf.reshape(token_type_embeddings,
                                       [batch_size, seq_length, width])  # back to (batch_size, seq_length, embedding dimension); e.g. (8, 128, 768)
    output += token_type_embeddings  # add the segment embeddings; the output now carries both word and segment information

  if use_position_embeddings:  # add position information
    assert_op = tf.assert_less_equal(seq_length, max_position_embeddings)  # the sequence length must not exceed the number of defined positions; e.g. 128 <= 512
    with tf.control_dependencies([assert_op]):
      full_position_embeddings = tf.get_variable(
          name=position_embedding_name,
          shape=[max_position_embeddings, width],
          initializer=create_initializer(initializer_range))  # (512, embedding dimension): one learned vector per position; e.g. (512, 768)
      position_embeddings = tf.slice(full_position_embeddings, [0, 0],
                                     [seq_length, -1])  # keep only as many positions as the sequence actually uses; e.g. (128, 768)
      num_dims = len(output.shape.as_list())

      position_broadcast_shape = []
      for _ in range(num_dims - 2):
        position_broadcast_shape.append(1)
      position_broadcast_shape.extend([seq_length, width])
      position_embeddings = tf.reshape(position_embeddings,
                                       position_broadcast_shape)  # add a leading axis of 1 so the position embeddings broadcast across the batch (positions do not depend on the example); e.g. (1, 128, 768)
      output += position_embeddings  # add the position embeddings

  output = layer_norm_and_dropout(output, dropout_prob)  # layer norm and dropout on the combined embeddings
  return output
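The shape bookkeeping in the position-embedding branch boils down to NumPy-style broadcasting; a small sketch with illustrative shapes only:

import numpy as np

batch_size, seq_length, width = 8, 128, 768
output = np.zeros((batch_size, seq_length, width), dtype=np.float32)          # word (+ segment) embeddings
full_position_embeddings = np.random.randn(512, width).astype(np.float32)     # one vector per position, up to 512

position_embeddings = full_position_embeddings[:seq_length]                   # (128, 768), like tf.slice
position_embeddings = position_embeddings.reshape(1, seq_length, width)       # (1, 128, 768)

output = output + position_embeddings   # broadcasts across the batch dimension -> (8, 128, 768)
print(output.shape)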
The embeddings layer therefore does three things: (1) it maps each token id to a word embedding via embedding_lookup(); (2) it adds a token-type (segment) embedding that marks which sentence the token belongs to; (3) it adds a position embedding. The result is a tensor of shape (batch_size, seq_length, embedding_size) that carries the word, segment, and position information together.
with tf.variable_scope("encoder"):
//给mask添加一个维度,将二维的mask转换为三维的mask,添加的维度用来表示每一个词需要与其它词进行Attention计算的个数标识
attention_mask = create_attention_mask_from_input_mask(
input_ids, input_mask)
//`sequence_output` shape = [batch_size, seq_length, hidden_size].
//输入Transformer模型进行训练
self.all_encoder_layers = transformer_model(
input_tensor=self.embedding_output, //Embeddings层输出词向量
attention_mask=attention_mask, //mask标记
hidden_size=config.hidden_size, //向量的维度;Eg:768
num_hidden_layers=config.num_hidden_layers, //模型中神经元的个数
num_attention_heads=config.num_attention_heads,//Transformer模型中多头机制头的数量
intermediate_size=config.intermediate_size, //全连接层神经元的个数
intermediate_act_fn=get_activation(config.hidden_act),
hidden_dropout_prob=config.hidden_dropout_prob,
attention_probs_dropout_prob=config.attention_probs_dropout_prob,
initializer_range=config.initializer_range,
do_return_all_layers=True)
self.sequence_output = self.all_encoder_layers[-1]
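Before looking at transformer_model() itself, here is the intuition behind the mask expansion: conceptually, the 1-D per-token mask is broadcast into a (batch, from_seq, to_seq) matrix so every query row knows which key columns hold real tokens. A NumPy sketch of that idea (illustrative, not the actual create_attention_mask_from_input_mask code):

import numpy as np

# (batch_size=1, seq_length=5): three real tokens followed by two padding positions.
input_mask = np.array([[1, 1, 1, 0, 0]], dtype=np.float32)

batch_size, seq_length = input_mask.shape
# Every query position (rows) may attend to every real key position (columns).
attention_mask = np.ones((batch_size, seq_length, 1)) * input_mask[:, None, :]
print(attention_mask[0])
# [[1. 1. 1. 0. 0.]
#  [1. 1. 1. 0. 0.]
#  ...           ]

With that in mind, transformer_model() is defined as follows: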
def transformer_model(input_tensor,
                      attention_mask=None,
                      hidden_size=768,
                      num_hidden_layers=12,
                      num_attention_heads=12,   # number of attention heads
                      intermediate_size=3072,
                      intermediate_act_fn=gelu,
                      hidden_dropout_prob=0.1,
                      attention_probs_dropout_prob=0.1,
                      initializer_range=0.02,
                      do_return_all_layers=False):
  if hidden_size % num_attention_heads != 0:  # the hidden size must be divisible by the number of heads
    raise ValueError(
        "The hidden size (%d) is not a multiple of the number of attention "
        "heads (%d)" % (hidden_size, num_attention_heads))

  attention_head_size = int(hidden_size / num_attention_heads)  # feature dimension handled by each head; e.g. 768 / 12 = 64
  input_shape = get_shape_list(input_tensor, expected_rank=3)   # (batch_size, seq_length, embedding dimension)
  batch_size = input_shape[0]   # number of examples in the batch
  seq_length = input_shape[1]   # sequence length of each example
  input_width = input_shape[2]  # embedding dimension of each token

  if input_width != hidden_size:  # input and hidden sizes must match so the residual connections can be added
    raise ValueError("The width of the input tensor (%d) != hidden size (%d)" %
                     (input_width, hidden_size))
  prev_output = reshape_to_matrix(input_tensor)  # flatten from 3-D to 2-D: (batch_size, seq_length, embedding_size) -> (batch_size*seq_length, embedding_size); e.g. (8, 128, 768) -> (1024, 768)

  # Run the attention computation layer by layer.
  all_layer_outputs = []
  for layer_idx in range(num_hidden_layers):  # iterate over the 12 Transformer layers
    with tf.variable_scope("layer_%d" % layer_idx):
      layer_input = prev_output  # the output of each layer becomes the input of the next

      with tf.variable_scope("attention"):
        attention_heads = []
        with tf.variable_scope("self"):
          attention_head = attention_layer(
              from_tensor=layer_input,        # for self-attention, from_tensor and to_tensor are both the layer input
              to_tensor=layer_input,
              attention_mask=attention_mask,  # marks which positions hold real tokens and should take part in attention
              num_attention_heads=num_attention_heads,  # number of attention heads
              size_per_head=attention_head_size,        # feature dimension per head
              attention_probs_dropout_prob=attention_probs_dropout_prob,
              initializer_range=initializer_range,
              do_return_2d_tensor=True,
              batch_size=batch_size,
              from_seq_length=seq_length,
              to_seq_length=seq_length)
          attention_heads.append(attention_head)

        attention_output = None
        if len(attention_heads) == 1:
          attention_output = attention_heads[0]
        else:
          attention_output = tf.concat(attention_heads, axis=-1)

        # Run a linear projection back to `hidden_size`, then add a residual with `layer_input`.
        with tf.variable_scope("output"):
          attention_output = tf.layers.dense(
              attention_output,
              hidden_size,
              kernel_initializer=create_initializer(initializer_range))
          attention_output = dropout(attention_output, hidden_dropout_prob)
          attention_output = layer_norm(attention_output + layer_input)  # residual connection + layer norm

      # The activation is only applied to the "intermediate" hidden layer.
      with tf.variable_scope("intermediate"):
        intermediate_output = tf.layers.dense(
            attention_output,
            intermediate_size,
            activation=intermediate_act_fn,
            kernel_initializer=create_initializer(initializer_range))

      # Down-project the 3072-dimensional intermediate features back to hidden_size (768) and add the residual.
      with tf.variable_scope("output"):
        layer_output = tf.layers.dense(
            intermediate_output,
            hidden_size,
            kernel_initializer=create_initializer(initializer_range))
        layer_output = dropout(layer_output, hidden_dropout_prob)
        layer_output = layer_norm(layer_output + attention_output)
        prev_output = layer_output
        all_layer_outputs.append(layer_output)

  # Return either all layers or only the last one.
  if do_return_all_layers:
    final_outputs = []
    for layer_output in all_layer_outputs:
      final_output = reshape_from_matrix(layer_output, input_shape)
      final_outputs.append(final_output)
    return final_outputs
  else:
    final_output = reshape_from_matrix(prev_output, input_shape)
    return final_output
Self-Attention computation
Self-attention introduces three learned projections of the input: Query, Key, and Value. The Query vector is the one that does the querying, the Key vector is the one being queried against, and the Value vector carries the actual features of the current token.
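As background, the core of the computation is softmax(Q·K^T / sqrt(d_k))·V. A minimal single-head NumPy sketch (illustrative shapes, not BERT's code):

import numpy as np

def scaled_dot_product_attention(Q, K, V):
    # Q, K, V: (seq_length, size_per_head)
    d_k = Q.shape[-1]
    scores = Q @ K.T / np.sqrt(d_k)                      # (seq_length, seq_length)
    probs = np.exp(scores - scores.max(axis=-1, keepdims=True))
    probs = probs / probs.sum(axis=-1, keepdims=True)    # softmax over the key dimension
    return probs @ V                                     # (seq_length, size_per_head)

seq_length, size_per_head = 4, 8                         # made-up sizes
Q = np.random.randn(seq_length, size_per_head)
K = np.random.randn(seq_length, size_per_head)
V = np.random.randn(seq_length, size_per_head)
print(scaled_dot_product_attention(Q, K, V).shape)       # (4, 8)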
The full multi-head implementation in BERT's attention_layer() is as follows:
def attention_layer(from_tensor,
                    to_tensor,
                    attention_mask=None,
                    num_attention_heads=1,
                    size_per_head=512,
                    query_act=None,   # activation for the Q projection
                    key_act=None,     # activation for the K projection
                    value_act=None,   # activation for the V projection
                    attention_probs_dropout_prob=0.0,
                    initializer_range=0.02,
                    do_return_2d_tensor=False,
                    batch_size=None,
                    from_seq_length=None,
                    to_seq_length=None):

  def transpose_for_scores(input_tensor, batch_size, num_attention_heads,
                           seq_length, width):
    output_tensor = tf.reshape(
        input_tensor, [batch_size, seq_length, num_attention_heads, width])
    output_tensor = tf.transpose(output_tensor, [0, 2, 1, 3])
    return output_tensor
  from_shape = get_shape_list(from_tensor, expected_rank=[2, 3])  # (batch_size*seq_length, embedding_size); e.g. (1024, 768)
  to_shape = get_shape_list(to_tensor, expected_rank=[2, 3])      # (batch_size*seq_length, embedding_size); e.g. (1024, 768)

  if len(from_shape) != len(to_shape):  # from_tensor and to_tensor must have the same rank
    raise ValueError(
        "The rank of `from_tensor` must match the rank of `to_tensor`.")

  if len(from_shape) == 3:
    batch_size = from_shape[0]
    from_seq_length = from_shape[1]
    to_seq_length = to_shape[1]
  elif len(from_shape) == 2:  # for rank-2 inputs the shape parameters must be passed in explicitly
    if (batch_size is None or from_seq_length is None or to_seq_length is None):
      raise ValueError(
          "When passing in rank 2 tensors to attention_layer, the values "
          "for `batch_size`, `from_seq_length`, and `to_seq_length` "
          "must all be specified.")
  # Scalar dimensions referenced here:
  #   B = batch size (number of sequences); e.g. 8
  #   F = `from_tensor` sequence length; e.g. 128
  #   T = `to_tensor` sequence length; e.g. 128
  #   N = `num_attention_heads`; e.g. 12
  #   H = `size_per_head`, the feature dimension per head; e.g. 64

  from_tensor_2d = reshape_to_matrix(from_tensor)  # flatten to 2-D; e.g. (1024, 768)
  to_tensor_2d = reshape_to_matrix(to_tensor)      # flatten to 2-D; e.g. (1024, 768)

  # Build the query matrix Q: `query_layer` = [B*F, N*H]; e.g. [8*128, 12*64] = [1024, 768]
  query_layer = tf.layers.dense(
      from_tensor_2d,
      num_attention_heads * size_per_head,
      activation=query_act,
      name="query",
      kernel_initializer=create_initializer(initializer_range))

  # Build the key matrix K (the vectors to be queried against): `key_layer` = [B*T, N*H]; e.g. [8*128, 12*64] = [1024, 768]
  key_layer = tf.layers.dense(
      to_tensor_2d,
      num_attention_heads * size_per_head,
      activation=key_act,
      name="key",
      kernel_initializer=create_initializer(initializer_range))

  # Build the value matrix V (the actual features, projected from the same to_tensor as K but with its own weights): `value_layer` = [B*T, N*H]; e.g. [8*128, 12*64]
  value_layer = tf.layers.dense(
      to_tensor_2d,
      num_attention_heads * size_per_head,
      activation=value_act,
      name="value",
      kernel_initializer=create_initializer(initializer_range))
  # `query_layer` = [B, N, F, H]; reshape so that attention is computed within each sequence; e.g. [8, 12, 128, 64]
  query_layer = transpose_for_scores(query_layer, batch_size,
                                     num_attention_heads, from_seq_length,
                                     size_per_head)

  # `key_layer` = [B, N, T, H]; e.g. [8, 12, 128, 64]; reshaped the same way so it can be multiplied with Q
  key_layer = transpose_for_scores(key_layer, batch_size, num_attention_heads,
                                   to_seq_length, size_per_head)

  # `attention_scores` = [B, N, F, T]
  attention_scores = tf.matmul(query_layer, key_layer, transpose_b=True)  # dot products between Q and K
  attention_scores = tf.multiply(attention_scores,
                                 1.0 / math.sqrt(float(size_per_head)))   # scale by 1/sqrt(size_per_head) so the head dimension does not inflate the scores

  if attention_mask is not None:  # apply the mask so padding positions are ignored
    # `attention_mask` = [B, 1, F, T]; e.g. [8, 1, 128, 128]
    attention_mask = tf.expand_dims(attention_mask, axis=[1])
    # Positions with mask 1 get an offset of 0; positions with mask 0 get a very large negative
    # offset, which the following softmax turns into a probability of (almost) 0.
    adder = (1.0 - tf.cast(attention_mask, tf.float32)) * -10000.0
    attention_scores += adder  # add the mask offset to the raw scores

  # Probability matrix from the Q.K dot products: `attention_probs` = [B, N, F, T]; e.g. [8, 12, 128, 128]
  attention_probs = tf.nn.softmax(attention_scores)  # softmax turns the scores into attention weights
  attention_probs = dropout(attention_probs, attention_probs_dropout_prob)
  # Reshape V so it can be multiplied with the probability matrix.
  # `value_layer` = [B, T, N, H]
  value_layer = tf.reshape(
      value_layer,
      [batch_size, to_seq_length, num_attention_heads, size_per_head])

  # `value_layer` = [B, N, T, H]
  value_layer = tf.transpose(value_layer, [0, 2, 1, 3])

  # Multiply the probability matrix with V to obtain the final context features.
  # `context_layer` = [B, N, F, H]
  context_layer = tf.matmul(attention_probs, value_layer)

  # `context_layer` = [B, F, N, H]
  context_layer = tf.transpose(context_layer, [0, 2, 1, 3])

  # Build the return value.
  if do_return_2d_tensor:
    # `context_layer` = [B*F, N*H]
    context_layer = tf.reshape(
        context_layer,
        [batch_size * from_seq_length, num_attention_heads * size_per_head])  # back to the same 2-D shape as the input
  else:
    # `context_layer` = [B, F, N*H]
    context_layer = tf.reshape(
        context_layer,
        [batch_size, from_seq_length, num_attention_heads * size_per_head])
  return context_layer
# Returns only the feature vector of the [CLS] token.
def get_pooled_output(self):
  return self.pooled_output

# Returns the feature vectors of every token in the sequence.
def get_sequence_output(self):
  return self.sequence_output
def create_model(bert_config, is_training, input_ids, input_mask, segment_ids,
                 labels, num_labels, use_one_hot_embeddings):
  """Creates a classification model."""
  model = modeling.BertModel(
      config=bert_config,
      is_training=is_training,
      input_ids=input_ids,          # the encoded input sequence
      input_mask=input_mask,        # marks which positions hold real tokens
      token_type_ids=segment_ids,   # marks which sentence each token belongs to
      use_one_hot_embeddings=use_one_hot_embeddings)

  output_layer = model.get_pooled_output()     # feature vector of the [CLS] token
  hidden_size = output_layer.shape[-1].value   # dimension of that vector; e.g. 768

  # Classification weight matrix of shape (number of labels, hidden size); e.g. (2, 768).
  output_weights = tf.get_variable(
      "output_weights", [num_labels, hidden_size],
      initializer=tf.truncated_normal_initializer(stddev=0.02))

  # Bias vector.
  output_bias = tf.get_variable(
      "output_bias", [num_labels], initializer=tf.zeros_initializer())

  with tf.variable_scope("loss"):
    if is_training:
      output_layer = tf.nn.dropout(output_layer, keep_prob=0.9)

    logits = tf.matmul(output_layer, output_weights, transpose_b=True)       # multiply the [CLS] features by the classification weights
    logits = tf.nn.bias_add(logits, output_bias)                             # add the bias
    probabilities = tf.nn.softmax(logits, axis=-1)                           # class probabilities
    log_probs = tf.nn.log_softmax(logits, axis=-1)                           # log-probabilities used for the loss
    one_hot_labels = tf.one_hot(labels, depth=num_labels, dtype=tf.float32)  # one-hot encoding of the true labels
    per_example_loss = -tf.reduce_sum(one_hot_labels * log_probs, axis=-1)   # cross-entropy loss per example
    loss = tf.reduce_mean(per_example_loss)

  return (loss, per_example_loss, logits, probabilities)
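To make the shapes of the classification head concrete, here is a NumPy sketch of the head only (made-up values; batch size 8 and 2 labels as in the running example):

import numpy as np

batch_size, hidden_size, num_labels = 8, 768, 2
output_layer = np.random.randn(batch_size, hidden_size)     # pooled [CLS] features
output_weights = np.random.randn(num_labels, hidden_size)   # (2, 768)
output_bias = np.zeros(num_labels)

logits = output_layer @ output_weights.T + output_bias      # (8, 2)
probs = np.exp(logits - logits.max(axis=-1, keepdims=True))
probs = probs / probs.sum(axis=-1, keepdims=True)           # softmax over the 2 classes
print(logits.shape, probs.shape)                            # (8, 2) (8, 2)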
That covers the core code of the BERT fine-tuning pipeline: reading and preprocessing the data (convert_single_example), the embeddings layer (embedding_lookup and embedding_postprocessor), the Transformer encoder and its self-attention (transformer_model and attention_layer), and the downstream classification head (create_model).
Depending on the needs of your project, you can adapt the data-processing code and the downstream task head to use BERT for other NLP tasks such as multi-label text classification, question answering, and machine translation.