test.csv
I am a handsome boy! really, perfect!
you are a beautiful girl. did you konw
perfect... I am fine
输出
这里设置max_length为20
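input_ids 是各个 token 在词表(vocab)中对应的 id 序列:句首加 [CLS](id 为 101),两句之间以及句尾各加一个 [SEP](id 为 102),长度不足 max_length 的部分用 0 填充(padding)至 20 位。input_mask 用 1 标记真实 token、用 0 标记 padding 位置,长度同样为 20。segment_ids 用来区分句子 1(取 0,包含 [CLS] 和第一个 [SEP])和句子 2(取 1,包含末尾的 [SEP]),同样用 0 填充至 20 位。label_id 和 is_real_example 是单个变量,不是列表。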
{'input_ids': [101, 1045, 2066, 2009, 1012, 102, 1045, 1044, 23644, 2232, 2016, 1029, 102, 0, 0, 0, 0, 0, 0, 0], 'input_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0], 'segment_ids': [0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0], 'label_id': 1, 'is_real_example': True}
tf.train.Feature 和 tf.train.Example 这两个类(它们是 protocol buffer 消息类型,而不是函数)
def file_based_convert_examples_to_features(
        examples, label_list, max_seq_length, tokenizer, output_file):
    """Convert a set of `InputExample`s to a TFRecord file.

    Each example is passed through `convert_single_example`, and the resulting
    feature object is serialized as one `tf.train.Example` record in which
    every field is stored as an int64 list.

    Args:
      examples: Sequence of examples to convert; must support `len()` because
        the total count is reported in the progress log.
      label_list: Forwarded unchanged to `convert_single_example`.
      max_seq_length: Forwarded unchanged to `convert_single_example`.
      tokenizer: Forwarded unchanged to `convert_single_example`.
      output_file: Path of the TFRecord file to write.
    """

    # Defined once, outside the loop, instead of being re-created for every
    # example.
    def create_int_feature(values):
        """Wrap an iterable of ints in an int64 `tf.train.Feature`."""
        return tf.train.Feature(
            int64_list=tf.train.Int64List(value=list(values)))

    # The context manager closes the writer even when conversion or writing
    # raises, so the output file handle is never leaked.
    with tf.python_io.TFRecordWriter(output_file) as writer:
        for (ex_index, example) in enumerate(examples):
            # Progress log every 10000 examples.
            if ex_index % 10000 == 0:
                tf.logging.info(
                    "Writing example %d of %d" % (ex_index, len(examples)))

            feature = convert_single_example(ex_index, example, label_list,
                                             max_seq_length, tokenizer)

            features = collections.OrderedDict()
            features["input_ids"] = create_int_feature(feature.input_ids)
            features["input_mask"] = create_int_feature(feature.input_mask)
            features["segment_ids"] = create_int_feature(feature.segment_ids)
            # Scalars are wrapped in single-element lists because an
            # Int64List always holds a list of values.
            features["label_ids"] = create_int_feature([feature.label_id])
            features["is_real_example"] = create_int_feature(
                [int(feature.is_real_example)])

            tf_example = tf.train.Example(
                features=tf.train.Features(feature=features))
            writer.write(tf_example.SerializeToString())
先看一下哪里调用了这个函数
在 main() 函数里面
if FLAGS.do_train:
    # Serialize the training examples into a TFRecord file located inside
    # the configured output directory.
    train_file = os.path.join(FLAGS.output_dir, "train.tf_record")
    file_based_convert_examples_to_features(
        examples=train_examples,
        label_list=label_list,
        max_seq_length=FLAGS.max_seq_length,
        tokenizer=tokenizer,
        output_file=train_file)
https://blog.csdn.net/shenxiaolu1984/article/details/52857437
https://luckycallor.xyz/20181211/HowToUseTFRecord.html
https://luckycallor.xyz/20181211/HowToUseTFRecord.html