IMDB 数据集已经过预处理:每条评论都是一个整数序列,其中每个整数代表词典中的一个特定单词,可以使用 IMDB 的词典将其还原为文本。数据文件地址:https://s3.amazonaws.com/text-datasets/imdb.npz 。如果不能科学上网,可以在 https://pan.baidu.com/s/1pNDbE3VMdYJiiXyaN2roaw 下载(提取码:0wnn)。
import tensorflow as tf
from tensorflow import keras
import numpy as np
# Bind the dataset module under the name the rest of the script uses.
# (It was previously bound to `mdb`, so every later `imdb.` reference
# raised NameError.)
imdb = keras.datasets.imdb

# Load the pre-tokenized reviews from a local copy of imdb.npz, asking Keras
# to keep only the 10,000 most frequent words (num_words=10000).
(train_data, train_labels), (test_data, test_labels) = imdb.load_data(
    '/home/kesci/input/idmb2286/imdb.npz', num_words=10000)
print("Training entries: {}, labels: {}".format(len(train_data), len(train_labels)))
Training entries: 25000, labels: 25000
[1, 14, 22, 16, 43, 530, 973, 1622, 1385, 65, 458, 4468, 66, 3941, 4, 173, 36, 256, 5, 25, 100, 43, 838, 112, 50, 670, 2, 9, 35, 480, 284, 5, 150, 4, 172, 112, 167, 2, 336, 385, 39, 4, 172, 4536, 1111, 17, 546, 38, 13, 447, 4, 192, 50, 16, 6, 147, 2025, 19, 14, 22, 4, 1920, 4613, 469, 4, 22, 71, 87, 12, 16, 43, 530, 38, 76, 15, 13, 1247, 4, 22, 17, 515, 17, 12, 16, 626, 18, 2, 5, 62, 386, 12, 8, 316, 8, 106, 5, 4, 2223, 5244, 16, 480, 66, 3785, 33, 4, 130, 12, 16, 38, 619, 5, 25, 124, 51, 36, 135, 48, 25, 1415, 33, 6, 22, 12, 215, 28, 77, 52, 5, 14, 407, 16, 82, 10311, 8, 4, 107, 117, 5952, 15, 256, 4, 2, 7, 3766, 5, 723, 36, 71, 43, 530, 476, 26, 400, 317, 46, 7, 4, 12118, 1029, 13, 104, 88, 4, 381, 15, 297, 98, 32, 2071, 56, 26, 141, 6, 194, 7486, 18, 4, 226, 22, 21, 134, 476, 26, 480, 5, 144, 30, 5535, 18, 51, 36, 28, 224, 92, 25, 104, 4, 226, 65, 16, 38, 1334, 88, 12, 16, 283, 5, 16, 4472, 113, 103, 32, 15, 16, 5345, 19, 178, 32]
# A dictionary mapping words to an integer index
word_index = imdb.get_word_index()

# The first indices are reserved, so shift every real word up by 3 to make
# room for the special tokens below.
word_index = {word: index + 3 for word, index in word_index.items()}
# Each reserved token needs its own distinct key: writing all four to the
# same empty-string key leaves only the last assignment (index 3) in place
# and drops indices 0-2 from the reverse mapping.
word_index["<PAD>"] = 0
word_index["<START>"] = 1
word_index["<UNK>"] = 2  # unknown
word_index["<UNUSED>"] = 3

# Inverse mapping (integer index -> word) used to decode reviews.
reverse_word_index = {index: word for word, index in word_index.items()}
def decode_review(text):
    """Translate a sequence of word indices into a space-separated string.

    Args:
        text: Iterable of integer word indices.

    Returns:
        The words looked up in the module-level ``reverse_word_index``
        mapping, joined by single spaces. Indices that have no entry in the
        mapping are rendered as ``'?'``.
    """
    return ' '.join(reverse_word_index.get(i, '?') for i in text)
" shown in australia as this incredibly bad movie is so bad that you become and have to watch it to the end just to see if it could get any worse and it does the storyline is so predictable it seems written by a high school class the sets are pathetic but marginally better than the and the acting is wooden br br the infant seems to have been stolen from the props of there didn't seem to be a single original idea in the whole movie br br i found this movie to be so bad that i laughed most of the way through br br malcolm mcdowell should hang his head in shame he obviously needed the money"
# Pad (or truncate) every review to the same length so the data forms a
# rectangular integer array that can be fed to the network.
# NOTE(review): the keyword arguments of both calls were missing from this
# file (the calls were left unclosed). They are reconstructed from the padded
# sample printed in this file, which has 256 entries with zeros appended at
# the end -- confirm against the original notebook.
train_data = keras.preprocessing.sequence.pad_sequences(train_data,
                                                        value=0,
                                                        padding='post',
                                                        maxlen=256)
test_data = keras.preprocessing.sequence.pad_sequences(test_data,
                                                       value=0,
                                                       padding='post',
                                                       maxlen=256)
len(train_data[0]), len(train_data[1])
[ 1 14 22 16 43 530 973 1622 1385 65 458 4468
66 3941 4 173 36 256 5 25 100 43 838 112
50 670 2 9 35 480 284 5 150 4 172 112
167 2 336 385 39 4 172 4536 1111 17 546 38
13 447 4 192 50 16 6 147 2025 19 14 22
4 1920 4613 469 4 22 71 87 12 16 43 530
38 76 15 13 1247 4 22 17 515 17 12 16
626 18 2 5 62 386 12 8 316 8 106 5
4 2223 5244 16 480 66 3785 33 4 130 12 16
38 619 5 25 124 51 36 135 48 25 1415 33
6 22 12 215 28 77 52 5 14 407 16 82
10311 8 4 107 117 5952 15 256 4 2 7 3766
5 723 36 71 43 530 476 26 400 317 46 7
4 12118 1029 13 104 88 4 381 15 297 98 32
2071 56 26 141 6 194 7486 18 4 226 22 21
134 476 26 480 5 144 30 5535 18 51 36 28
224 92 25 104 4 226 65 16 38 1334 88 12
16 283 5 16 4472 113 103 32 15 16 5345 19
178 32 0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0 0 0
0 0 0 0]
# Input shape is the vocabulary count used for the movie reviews (10,000 words)
# Size of the vocabulary the Embedding layer must be able to index.
vocab_size = 10000

model = keras.Sequential()
# Look up a 16-dimensional vector for every word index in the review.
model.add(keras.layers.Embedding(vocab_size, 16))
# Average the word vectors over the sequence axis so each review becomes a
# single fixed-length vector. Without this step the Dense layers are applied
# per token and the model produces one prediction per word instead of one per
# review, which does not match the one-label-per-review targets.
model.add(keras.layers.GlobalAveragePooling1D())
model.add(keras.layers.Dense(16, activation=tf.nn.relu))
# Single sigmoid unit: the output is a value between 0 and 1.
model.add(keras.layers.Dense(1, activation=tf.nn.sigmoid))
模型需要一个损失函数和一个用于训练的优化器。由于这是一个二元分类问题,且模型输出的是概率(最后一层是使用 sigmoid 激活的单个单元),我们将使用 binary_crossentropy 损失函数。
# Hold out the first 10,000 training examples for validation and train on
# the remainder; the labels are split at the same position as the reviews.
x_val, partial_x_train = train_data[:10000], train_data[10000:]
y_val, partial_y_train = train_labels[:10000], train_labels[10000:]
import numpy as np
from keras.callbacks import Callback
from keras.engine.training import Model
from sklearn.metrics import confusion_matrix, f1_score, precision_score, recall_score
class Metrics(Callback):
    """Callback that reports weighted F1, precision and recall per epoch.

    At the end of every epoch the model's rounded predictions on the
    validation inputs are scored against the validation targets. The scores
    are printed and appended to ``val_f1s``, ``val_precisions`` and
    ``val_recalls`` so they can be inspected after training.
    """

    def on_train_begin(self, logs=None):
        """Start each training run with empty score histories."""
        self.val_f1s = []
        self.val_recalls = []
        self.val_precisions = []

    def on_epoch_end(self, epoch, logs=None):
        """Score the current model on the validation data and record it."""
        # NOTE(review): relies on the training loop populating
        # self.validation_data as (inputs, targets, ...) -- confirm this holds
        # for the Keras version in use.
        val_predict = np.asarray(self.model.predict(self.validation_data[0])).round()
        val_targ = self.validation_data[1]
        _val_f1 = f1_score(val_targ, val_predict, average='weighted')
        _val_recall = recall_score(val_targ, val_predict, average='weighted')
        _val_precision = precision_score(val_targ, val_predict, average='weighted')
        # Keep the per-epoch history; these lists were previously created
        # but never filled.
        self.val_f1s.append(_val_f1)
        self.val_recalls.append(_val_recall)
        self.val_precisions.append(_val_precision)
        print( ' — val_f1: %f — val_precision: %f — val_recall %f' %(_val_f1, _val_precision, _val_recall))
metrics = Metrics()
from keras.callbacks import EarlyStopping
# Stop training once validation accuracy has failed to improve for 8 epochs.
earlystopping = keras.callbacks.EarlyStopping(monitor='val_acc', patience=8, verbose=0, mode='max')

# The model must be compiled before fit() can be called. The metric is named
# 'acc' so that the validation metric is reported as 'val_acc', the key
# EarlyStopping monitors above.
# NOTE(review): no compile step is present in this file; binary_crossentropy
# is the loss named in the accompanying text, the optimizer choice is an
# assumption -- confirm.
model.compile(optimizer='adam',
              loss='binary_crossentropy',
              metrics=['acc'])

# NOTE(review): the fit() call was truncated (unclosed, no targets, no
# callbacks). epochs and batch_size below are assumptions taken from the
# TensorFlow text-classification tutorial -- confirm against the original
# notebook.
history = model.fit(partial_x_train,
                    partial_y_train,
                    epochs=40,
                    batch_size=512,
                    validation_data=(x_val, y_val),
                    callbacks=[metrics, earlystopping],
                    verbose=1)

# Final loss and accuracy on the held-out test set.
results = model.evaluate(test_data, test_labels)
25000/25000 [==============================] - 2s 61us/step
[0.31110355438232423, 0.87736]