BERT Downstream Task Application: A Code Walkthrough, Using Sentiment Analysis as an Example

1、Download/prepare the dataset. This step is left to you.

2、Wrap the data into a form BERT can read with bert.run_classifier.InputExample(guid=, text_a=, text_b=, label=). text_a is typically filled row by row from a column of a pd.DataFrame, as sketched below.
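A minimal sketch of that conversion, assuming your training data lives in a pd.DataFrame called train with hypothetical columns "text" and "label" (adjust to your own dataset):

from bert import run_classifier

# Hypothetical column names; use whatever your dataset actually has.
DATA_COLUMN = "text"
LABEL_COLUMN = "label"

# One InputExample per row; text_b is only needed for sentence-pair tasks,
# so it stays None for single-sentence sentiment analysis.
train_InputExamples = train.apply(
    lambda x: run_classifier.InputExample(guid=None,
                                          text_a=x[DATA_COLUMN],
                                          text_b=None,
                                          label=x[LABEL_COLUMN]),
    axis=1)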

3、Complete the data preprocessing. This part should:

1) Lowercase the text (for the uncased model)

2) Tokenize it (e.g. "sally says hi" -> ["sally", "says", "hi"])

3) Break words into wordpieces (e.g. "calling" -> ["call", "##ing"])

4) Map the tokens to indices using the vocab file BERT provides (the vocab operates on wordpieces, so this covers them too)

5) Add the [CLS] and [SEP] tokens

6) Append "index" and "segment" tokens to each input

The TF Hub module carries the vocab file and the casing (uppercase-to-lowercase) information, so the tokenizer can be built from it as follows:

import tensorflow as tf
import tensorflow_hub as hub
import bert
from bert import tokenization

# This is a path to an uncased (all lowercase) version of BERT
BERT_MODEL_HUB = "https://tfhub.dev/google/bert_uncased_L-12_H-768_A-12/1"

def create_tokenizer_from_hub_module():
  """Get the vocab file and casing info from the Hub module."""
  with tf.Graph().as_default():
    bert_module = hub.Module(BERT_MODEL_HUB)
    tokenization_info = bert_module(signature="tokenization_info", as_dict=True)
    with tf.Session() as sess:
      vocab_file, do_lower_case = sess.run([tokenization_info["vocab_file"],
                                            tokenization_info["do_lower_case"]])
  return bert.tokenization.FullTokenizer(
      vocab_file=vocab_file, do_lower_case=do_lower_case)

tokenizer = create_tokenizer_from_hub_module()
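As a quick check, the resulting tokenizer performs the lowercasing, tokenization and wordpiece splitting described above (the exact wordpieces depend on the vocab file, so the output shown is only illustrative):

# Tokenize a sample sentence into wordpieces.
print(tokenizer.tokenize("Calling Sally says hi"))
# e.g. ['call', '##ing', 'sally', 'says', 'hi']

# Wordpieces are then mapped to vocab indices.
tokens = tokenizer.tokenize("calling")
print(tokenizer.convert_tokens_to_ids(tokens))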

That said, all of the preprocessing above is already included under bert in run_classifier_with_tfhub.py, so you do not have to write it yourself. Running run_classifier.convert_examples_to_features converts all the InputExamples into features BERT understands, as sketched below.
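A minimal sketch of that call, assuming the train_InputExamples from step 2; the values of label_list and MAX_SEQ_LENGTH are task-specific choices, not fixed by BERT:

# The set of possible labels and the maximum sequence length are your choices.
label_list = [0, 1]        # negative / positive
MAX_SEQ_LENGTH = 128       # inputs are padded/truncated to this length

# Convert the train InputExamples into features BERT understands.
train_features = bert.run_classifier.convert_examples_to_features(
    train_InputExamples, label_list, MAX_SEQ_LENGTH, tokenizer)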

4、Build the model

Using sentiment analysis as the example, the model-building code is below; work through it yourself. It is also included in run_classifier_with_tfhub.py.

def create_model(is_predicting, input_ids, input_mask, segment_ids, labels,
                 num_labels):
  """Creates a classification model."""

  bert_module = hub.Module(
      BERT_MODEL_HUB,
      trainable=True)
  bert_inputs = dict(
      input_ids=input_ids,
      input_mask=input_mask,
      segment_ids=segment_ids)
  bert_outputs = bert_module(
      inputs=bert_inputs,
      signature="tokens",
      as_dict=True)

  # Use "pooled_output" for classification tasks on an entire sentence.
  # Use "sequence_output" for token-level output.
  output_layer = bert_outputs["pooled_output"]

  hidden_size = output_layer.shape[-1].value

  # Create our own layer to fine-tune for the sentiment data
  # (just a single dense layer on top of BERT).
  output_weights = tf.get_variable(
      "output_weights", [num_labels, hidden_size],
      initializer=tf.truncated_normal_initializer(stddev=0.02))
  output_bias = tf.get_variable(
      "output_bias", [num_labels], initializer=tf.zeros_initializer())

  with tf.variable_scope("loss"):
    # Dropout helps prevent overfitting
    output_layer = tf.nn.dropout(output_layer, keep_prob=0.9)

    logits = tf.matmul(output_layer, output_weights, transpose_b=True)
    logits = tf.nn.bias_add(logits, output_bias)
    log_probs = tf.nn.log_softmax(logits, axis=-1)

    # Convert labels into one-hot encoding
    one_hot_labels = tf.one_hot(labels, depth=num_labels, dtype=tf.float32)

    predicted_labels = tf.squeeze(tf.argmax(log_probs, axis=-1, output_type=tf.int32))

    # If we're predicting, we want predicted labels and the probabilities.
    if is_predicting:
      return (predicted_labels, log_probs)

    # If we're train/eval, compute loss between predicted and actual label
    per_example_loss = -tf.reduce_sum(one_hot_labels * log_probs, axis=-1)
    loss = tf.reduce_mean(per_example_loss)
    return (loss, predicted_labels, log_probs)

5、Next, define the function that builds the model for training, evaluation, and prediction. An example follows; it can also be found in run_classifier_with_tfhub.py.

# model_fn_builder actually creates our model function
# using the passed parameters for num_labels, learning_rate, etc.
def model_fn_builder(num_labels, learning_rate, num_train_steps,
                     num_warmup_steps):
  """Returns `model_fn` closure for TPUEstimator."""
  def model_fn(features, labels, mode, params):  # pylint: disable=unused-argument
    """The `model_fn` for TPUEstimator."""

    input_ids = features["input_ids"]
    input_mask = features["input_mask"]
    segment_ids = features["segment_ids"]
    label_ids = features["label_ids"]

    is_predicting = (mode == tf.estimator.ModeKeys.PREDICT)

    # TRAIN and EVAL
    if not is_predicting:

      (loss, predicted_labels, log_probs) = create_model(
          is_predicting, input_ids, input_mask, segment_ids, label_ids, num_labels)

      train_op = bert.optimization.create_optimizer(
          loss, learning_rate, num_train_steps, num_warmup_steps, use_tpu=False)

      # Calculate evaluation metrics.
      def metric_fn(label_ids, predicted_labels):
        accuracy = tf.metrics.accuracy(label_ids, predicted_labels)
        f1_score = tf.contrib.metrics.f1_score(label_ids, predicted_labels)
        auc = tf.metrics.auc(label_ids, predicted_labels)
        recall = tf.metrics.recall(label_ids, predicted_labels)
        precision = tf.metrics.precision(label_ids, predicted_labels)
        true_pos = tf.metrics.true_positives(label_ids, predicted_labels)
        true_neg = tf.metrics.true_negatives(label_ids, predicted_labels)
        false_pos = tf.metrics.false_positives(label_ids, predicted_labels)
        false_neg = tf.metrics.false_negatives(label_ids, predicted_labels)
        return {
            "eval_accuracy": accuracy,
            "f1_score": f1_score,
            "auc": auc,
            "precision": precision,
            "recall": recall,
            "true_positives": true_pos,
            "true_negatives": true_neg,
            "false_positives": false_pos,
            "false_negatives": false_neg
        }

      eval_metrics = metric_fn(label_ids, predicted_labels)

      if mode == tf.estimator.ModeKeys.TRAIN:
        return tf.estimator.EstimatorSpec(mode=mode,
                                          loss=loss,
                                          train_op=train_op)
      else:
        return tf.estimator.EstimatorSpec(mode=mode,
                                          loss=loss,
                                          eval_metric_ops=eval_metrics)
    else:
      (predicted_labels, log_probs) = create_model(
          is_predicting, input_ids, input_mask, segment_ids, label_ids, num_labels)

      predictions = {
          'probabilities': log_probs,
          'labels': predicted_labels
      }
      return tf.estimator.EstimatorSpec(mode, predictions=predictions)

  # Return the actual model function in the closure
  return model_fn

6、Model configuration

# Compute train and warmup steps from batch size
# (set the number of training epochs and warmup steps; the learning rate needs warmup).
# These hyperparameters are copied from this colab notebook (https://colab.sandbox.google.com/github/tensorflow/tpu/blob/master/tools/colab/bert_finetuning_with_cloud_tpus.ipynb)
BATCH_SIZE = 32
LEARNING_RATE = 2e-5
NUM_TRAIN_EPOCHS = 3.0

# Warmup is a period of time where the learning rate
# is small and gradually increases--usually helps training.
WARMUP_PROPORTION = 0.1

# Model configs
SAVE_CHECKPOINTS_STEPS = 500
SAVE_SUMMARY_STEPS = 100

# Compute the number of train and warmup steps from the batch size
num_train_steps = int(len(train_features) / BATCH_SIZE * NUM_TRAIN_EPOCHS)
num_warmup_steps = int(num_train_steps * WARMUP_PROPORTION)

# Specify output directory and number of checkpoint steps to save.
# OUTPUT_DIR should point to the directory where checkpoints will be written.
run_config = tf.estimator.RunConfig(
    model_dir=OUTPUT_DIR,
    save_summary_steps=SAVE_SUMMARY_STEPS,
    save_checkpoints_steps=SAVE_CHECKPOINTS_STEPS)

# Build the model function
model_fn = model_fn_builder(
    num_labels=len(label_list),
    learning_rate=LEARNING_RATE,
    num_train_steps=num_train_steps,
    num_warmup_steps=num_warmup_steps)

# Create the estimator
estimator = tf.estimator.Estimator(
    model_fn=model_fn,
    config=run_config,
    params={"batch_size": BATCH_SIZE})

7、Remember the train_features produced by run_classifier.convert_examples_to_features at the end of step 3? Here they need one more step: wrapping them in an input function that feeds batches to the estimator during training.

# Create an input function for training. drop_remainder = True for using TPUs.
train_input_fn = bert.run_classifier.input_fn_builder(
    features=train_features,
    seq_length=MAX_SEQ_LENGTH,
    is_training=True,
    drop_remainder=False)

8、Start training.

from datetime import datetime

print(f'Beginning Training!')
current_time = datetime.now()
estimator.train(input_fn=train_input_fn, max_steps=num_train_steps)
print("Training took time ", datetime.now() - current_time)

# Evaluate on the test set
estimator.evaluate(input_fn=test_input_fn, steps=None)

9、Predict labels

def getPrediction(in_sentences):
  labels = ["Negative", "Positive"]
  # Wrap each sentence to be classified in an InputExample
  input_examples = [run_classifier.InputExample(guid="", text_a=x, text_b=None, label=0) for x in in_sentences]  # here, "" is just a dummy guid and 0 a dummy label
  # Convert the examples to features
  input_features = run_classifier.convert_examples_to_features(input_examples, label_list, MAX_SEQ_LENGTH, tokenizer)
  # Wrap the features in an input function
  predict_input_fn = run_classifier.input_fn_builder(features=input_features, seq_length=MAX_SEQ_LENGTH, is_training=False, drop_remainder=False)
  # Run prediction
  predictions = estimator.predict(predict_input_fn)
  return [(sentence, prediction['probabilities'], labels[prediction['labels']]) for sentence, prediction in zip(in_sentences, predictions)]
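A usage sketch (the sentences are made up, and the numbers in the comment are only illustrative; the probabilities returned are the model's log-probabilities):

pred_sentences = [
    "That movie was absolutely awful",
    "The acting was a bit lacking",
    "The film was creative and surprising"
]
predictions = getPrediction(pred_sentences)
# Each entry is (sentence, log-probabilities, predicted label), e.g.
# ("That movie was absolutely awful", array([-0.01, -4.5]), "Negative")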
