文本分类:Bert微调并完成下游任务

1. 克隆Bert并获取预训练模型

$ git clone https://github.com/google-research/bert.git

依赖和环境:

  • Tensorflow-gpu version 1.15 (不建议使用TF2)
  • Python version: 3.7
  • CUDA Version: 10.2
  • 预训练模型: https://github.com/google-research/bert

2. 改写自己的分类器读写函数

run_classifier.py

class MyProcessor(DataProcessor):
  """Processor for a custom binary sentence-pair classification data set.

  Expects `train.tsv`, `dev.tsv` and `test.tsv` under `data_dir`.
  Assumed TSV layout (TODO(review): adjust the column indices below to match
  your actual files):
    * train/dev rows: text_a <TAB> text_b <TAB> label   (no header)
    * test rows:      text_a <TAB> text_b               (header on first row)
  """

  def get_train_examples(self, data_dir):
    """See base class."""
    return self._create_examples(
        self._read_tsv(os.path.join(data_dir, "train.tsv")), "train")

  def get_dev_examples(self, data_dir):
    """See base class."""
    return self._create_examples(
        self._read_tsv(os.path.join(data_dir, "dev.tsv")), "dev")

  def get_test_examples(self, data_dir):
    """See base class."""
    return self._create_examples(
        self._read_tsv(os.path.join(data_dir, "test.tsv")), "test")

  def get_labels(self):
    """See base class. Binary classification: the two label strings."""
    return ["0", "1"]

  def _create_examples(self, lines, set_type):
    """Creates `InputExample`s from parsed TSV rows.

    Args:
      lines: list of rows, each a list of column strings from `_read_tsv`.
      set_type: one of "train", "dev", "test" (used in the example guid).

    Returns:
      A list of `InputExample` objects.
    """
    examples = []
    for (i, line) in enumerate(lines):
      # Only the test set has a header row; skip it.
      if set_type == "test" and i == 0:
        continue
      guid = "%s-%s" % (set_type, i)
      if set_type == "test":
        # Test rows carry no gold label; use a dummy "0" so downstream code
        # that expects a label still works during prediction.
        text_a = tokenization.convert_to_unicode(line[0])
        text_b = tokenization.convert_to_unicode(line[1])
        label = "0"
      else:
        text_a = tokenization.convert_to_unicode(line[0])
        text_b = tokenization.convert_to_unicode(line[1])
        label = tokenization.convert_to_unicode(line[2])
      examples.append(
          InputExample(guid=guid, text_a=text_a, text_b=text_b, label=label))
    return examples

def main(_):
  """Entry point of run_classifier.py.

  NOTE(review): the function body is elided in this excerpt ("... ..."); only
  the change relevant to the tutorial is shown — registering the custom
  processor under the task name "mypc".
  """
  tf.logging.set_verbosity(tf.logging.INFO)

  # Maps --task_name flag values (lowercased) to DataProcessor classes.
  # The custom MyProcessor is exposed as task "mypc".
  processors = {
      "cola": ColaProcessor,
      "mnli": MnliProcessor,
      "mrpc": MrpcProcessor,
      "xnli": XnliProcessor,
      "mypc": MyProcessor
  }
  ... ...

3. 传入需要的flag参数并微调模型

#!/bin/bash
# Launch BERT fine-tuning (train + eval + predict) with the custom "mypc" task.

# Resolve the directory this script lives in, so relative paths work from anywhere.
export CURRENT_PATH=$(cd "$(dirname "$0")";pwd)
export BERT_BASE_DIR=$CURRENT_PATH/models/YOUR_MODEL_PATH
export MY_DATASET=$CURRENT_PATH/YOUR_DATA_PATH/

# NOTE: do_lower_case=False is for cased / multilingual checkpoints; set it to
# True when using an uncased English model. A comment must NOT follow the
# line-continuation backslash (it would break the continuation), so it lives here.
python run_classifier.py \
  --task_name=mypc \
  --do_train=true \
  --do_eval=true \
  --do_predict=true \
  --data_dir="$MY_DATASET" \
  --vocab_file="$BERT_BASE_DIR/vocab.txt" \
  --bert_config_file="$BERT_BASE_DIR/bert_config.json" \
  --init_checkpoint="$BERT_BASE_DIR/bert_model.ckpt" \
  --max_seq_length=64 \
  --train_batch_size=16 \
  --learning_rate=5e-5 \
  --num_train_epochs=3.0 \
  --do_lower_case=False \
  --output_dir="$CURRENT_PATH/YOUR_OUTPUT_PATH"

你可能感兴趣的:(机器学习,自然语言处理,深度学习,tensorflow,nlp)