def load_directory_data(directory, *, sentence_col=0, polarity_col=1):
    """Load every CSV file found in ``directory`` into a single DataFrame.

    Each file is opened through ``tf.gfile.GFile`` and parsed with
    ``csv.reader`` (comma-delimited). Every non-empty CSV row contributes one
    row to the result.

    NOTE(review): the statements that fill the two columns were missing from
    the loop body, so the field positions used here are an assumption --
    confirm ``sentence_col`` / ``polarity_col`` against the real CSV layout.

    Args:
        directory: Path of the directory whose files are read.
        sentence_col: Index of the CSV field holding the text (assumed 0).
        polarity_col: Index of the CSV field holding the label (assumed 1).

    Returns:
        A ``pandas.DataFrame`` with the columns ``"sentence"`` and
        ``"polarity"``; values are the raw strings read from the files.
    """
    data = {"sentence": [], "polarity": []}
    for file_name in os.listdir(directory):
        with tf.gfile.GFile(os.path.join(directory, file_name), "r") as f:
            for row in csv.reader(f, delimiter=","):
                if not row:
                    # csv.reader yields an empty list for blank lines.
                    continue
                data["sentence"].append(row[sentence_col])
                data["polarity"].append(row[polarity_col])
    return pd.DataFrame.from_dict(data)
parser = argparse.ArgumentParser()

# Every command-line option, in registration order, written as
# (flag, keyword arguments passed to ``parser.add_argument``).
_ARGUMENT_SPECS = (
    # --- data preprocessing ---
    ("--DATA_COLUMN", {"default": "sentence", "help": "data column"}),
    ("--LABEL_COLUMN", {"default": "polarity", "help": "polarity"}),
    ("--label_list", {"default": "0,1", "help": "label_list "}),
    # --- pre-trained BERT files ---
    # An uncased (all lowercase) BERT can also be referenced by hub URL;
    # that option is currently disabled:
    # ("--BERT_MODEL_HUB", {"default": "https://tfhub.dev/google/bert_uncased_L-12_H-768_A-12/1"}),
    ("--BERT_INIT_CHKPNT", {"default": "./bert_pretrain_model/bert_model.ckpt"}),
    ("--BERT_VOCAB", {"default": "./bert_pretrain_model/vocab.txt"}),
    ("--BERT_CONFIG", {"default": "./bert_pretrain_model/bert_config.json"}),
    # Sequences are set to be at most 128 tokens long.
    ("--MAX_SEQ_LENGTH", {"default": 128, "type": int}),
    # --- train hyper-parameters ---
    ("--BATCH_SIZE", {"default": 32, "type": int}),
    ("--LEARNING_RATE", {"default": 2e-5, "type": float}),
    ("--NUM_TRAIN_EPOCHS", {"default": 3.0, "type": float}),
    # Warmup is a period of time where the learning rate is small and
    # gradually increases -- usually helps training.
    ("--WARMUP_PROPORTION", {"default": 0.1, "type": float}),
    # --- model configs ---
    ("--SAVE_CHECKPOINTS_STEPS", {"default": 500, "type": int}),
    ("--SAVE_SUMMARY_STEPS", {"default": 100, "type": int}),
    # --- save model ---
    ("--OUTPUT_DIR", {"default": "./save_model/"}),
    ("--model_output", {"default": "bert_model"}),
)

for _flag, _options in _ARGUMENT_SPECS:
    parser.add_argument(_flag, **_options)
# Do not leave the loop variables behind as module globals.
del _flag, _options
def create_tokenizer_from_hub_module(hp):
    """Create the BERT tokenizer from the local vocabulary file.

    Despite the function name, nothing is loaded from a hub module here: the
    vocabulary comes from ``hp.BERT_VOCAB`` and the checkpoint path is only
    passed to ``tokenization.validate_case_matches_checkpoint``.

    Args:
        hp: Parsed hyper-parameters; ``BERT_INIT_CHKPNT`` and ``BERT_VOCAB``
            are read.

    Returns:
        A ``tokenization.FullTokenizer`` configured for lower-cased input.
    """
    # One flag feeds both calls so the validation and the tokenizer cannot
    # drift apart.
    do_lower_case = True
    tokenization.validate_case_matches_checkpoint(do_lower_case, hp.BERT_INIT_CHKPNT)
    return tokenization.FullTokenizer(vocab_file=hp.BERT_VOCAB, do_lower_case=do_lower_case)
def process_data(hp):
    """Load the train/test sets and convert them into BERT input features.

    Args:
        hp: Parsed hyper-parameters; ``DATA_COLUMN``, ``LABEL_COLUMN``,
            ``label_list`` and ``MAX_SEQ_LENGTH`` are read here, and ``hp`` is
            also handed to ``create_tokenizer_from_hub_module``.

    Returns:
        A ``(train_features, test_features)`` tuple, each element being the
        result of ``run_classifier.convert_examples_to_features``.
    """
    tokenizer = create_tokenizer_from_hub_module(hp)
    train, test = download_and_load_datasets()

    def to_input_example(row):
        # guid is a bookkeeping ID that is unused in this example; text_b
        # stays None because every example is a single sentence.
        return run_classifier.InputExample(
            guid=None,
            text_a=row[hp.DATA_COLUMN],
            text_b=None,
            label=row[hp.LABEL_COLUMN],
        )

    # Use the InputExample class from BERT's run_classifier code to create
    # examples from the data (one per DataFrame row).
    train_examples = train.apply(to_input_example, axis=1)
    test_examples = test.apply(to_input_example, axis=1)

    # NOTE(review): label_list holds ints, so the LABEL_COLUMN values
    # presumably have to be ints as well -- confirm against
    # download_and_load_datasets.
    label_list = [int(label) for label in hp.label_list.split(",")]

    # Convert the train and test examples to InputFeatures that BERT
    # understands.
    train_features = run_classifier.convert_examples_to_features(
        train_examples, label_list, hp.MAX_SEQ_LENGTH, tokenizer)
    test_features = run_classifier.convert_examples_to_features(
        test_examples, label_list, hp.MAX_SEQ_LENGTH, tokenizer)
    return train_features, test_features
return {
"eval_accuracy": accuracy,
"f1_score": f1_score,
"auc": auc,
"precision": precision,
"recall": recall,
"true_positives": true_pos,
"true_negatives": true_neg,
"false_positives": false_pos,
"false_negatives": false_neg
if use_sentence:
output_layer = model.get_pooled_output()
output_layer = model.get_sequence_output()
return output_layer
[('i dont know why people think this is such a bad movie its got a pretty good plot some good action and the change of location for harry does not hurt either sure some of its offensive and gratuitous but this is not the only movie like that eastwood is in good form as dirty harry and i liked pat hingle in this movie as the small town cop if you liked dirty harry then you should see this one its a lot better than the dead pool', array([-0.21832937, -1.6289296 ], dtype=float32), '0'), ('i watched this video at a friends house im glad i did not waste money buying this one the video cover has a scene from the 1975 movie capricorn one the movie starts out with several clips of rocket blow-ups most not related to manned flight sibrels smoking gun is a short video clip of the astronauts preparing a video broadcast he edits in his own voice-over instead of letting us listen to what the crew had to say the video curiously ends with a showing of the zapruder film his claims about radiation shielding star photography and others lead me to believe is he extremely ignorant or has some sort of ax to grind against nasa the astronauts or american in general his science is bad and so is this video.', array([-0.6369945, -0.7526416], dtype=float32), '0'), ('this movie is full of references like mad max ii the wild one and many others the ladybug´s face it´s a clear reference or tribute to peter lorre this movie is a masterpiece we´ll talk much more about in the future', array([-1.7239381 , -0.19645582], dtype=float32), '1'), ('what happens when an army of wetbacks towelheads and godless eastern european commies gather their forces south of the border gary busey kicks their butts of course another laughable example of reagan-era cultural fallout bulletproof wastes a decent supporting cast headed by l q jones and thalmus rasulala', array([-1.4781466 , -0.25884843], dtype=float32), '1')]
最后当然是获取我们想要的词向量或者句子向量：句子向量用 model.get_pooled_output()，词向量用 model.get_sequence_output()。