tokenizer.texts_to_sequences()

If you do not create a separate token for words that are not in the corpus (e.g. "&lt;OOV&gt;"), the following can happen: when seed_text is "I went to dublin" the token list has length 4, but when seed_text is "Lawrence went to dublin" it has length 3, because "Lawrence" is not in the word index.
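
A minimal sketch of this behaviour, assuming a hypothetical small corpus that contains "I" but not "Lawrence":

from tensorflow.keras.preprocessing.text import Tokenizer

# Hypothetical corpus; "Lawrence" does not appear in it
corpus = ["I went to dublin", "in dublin city where the girls are so pretty"]

tokenizer = Tokenizer()   # no oov_token set
tokenizer.fit_on_texts(corpus)

print(len(tokenizer.texts_to_sequences(["I went to dublin"])[0]))         # 4
print(len(tokenizer.texts_to_sequences(["Lawrence went to dublin"])[0]))  # 3: "Lawrence" is silently dropped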

# tokenizer, model and max_sequence_len are assumed to come from the earlier training code
from tensorflow.keras.preprocessing.sequence import pad_sequences

seed_text = "I went to dublin"
next_words = 2

for _ in range(next_words):
    token_list = tokenizer.texts_to_sequences([seed_text])[0]
    print("=" * 30)
    print(len(token_list))
    # When seed_text is "I went to dublin" the length is 4; when it is "Lawrence went to dublin"
    # the length is 3, because "Lawrence" is not in the word index
    token_list = pad_sequences([token_list], maxlen=max_sequence_len - 1, padding='pre')
    predicted = model.predict_classes(token_list, verbose=0)
    output_word = ""
    for word, index in tokenizer.word_index.items():
        if index == predicted:
            output_word = word
            break
    seed_text += " " + output_word
print(seed_text)
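
Note that model.predict_classes() only exists in older Keras/TensorFlow releases (it was removed in TensorFlow 2.6). On newer versions the prediction step can be rewritten with np.argmax; a minimal sketch of that replacement:

import numpy as np

# Equivalent prediction step for TensorFlow >= 2.6, where predict_classes() no longer exists
probabilities = model.predict(token_list, verbose=0)     # shape: (1, vocab_size)
predicted = np.argmax(probabilities, axis=-1)[0]         # index of the most likely next word
output_word = tokenizer.index_word.get(predicted, "")    # reverse lookup instead of scanning word_index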

To avoid this, an OOV token (e.g. "&lt;OOV&gt;") should be passed when the Tokenizer() is created:

oov_tok = "<OOV>"   # token that stands in for out-of-vocabulary words
vocab_size = 100    # vocabulary size
tokenizer = Tokenizer(num_words=vocab_size, oov_token=oov_tok)
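
With oov_token set, unknown words are mapped to the OOV index (1, since the OOV token is added to the word index first) instead of being dropped, so both seed texts now produce sequences of the same length. A quick check, assuming the same small corpus as in the sketch above:

tokenizer.fit_on_texts(corpus)

print(tokenizer.word_index[oov_tok])  # 1: the OOV token gets the first index
print(len(tokenizer.texts_to_sequences(["I went to dublin"])[0]))         # 4
print(len(tokenizer.texts_to_sequences(["Lawrence went to dublin"])[0]))  # 4: "Lawrence" -> OOV index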
