## Loading the IMDB dataset
"#-------------------------------------------------------\n",
# Load the IMDB dataset, keeping only the top 10,000 most frequent words
from keras.datasets import imdb
(train_data, train_labels), (test_data, test_labels) = imdb.load_data(path="D:/jupyter/deepLearning/imdb.npz", num_words=10000)
train_data and test_data are lists of reviews; each review is itself a list of word indices.
train_labels and test_labels are lists of 0s and 1s, where 0 stands for a negative review and 1 for a positive one.
print(len(train_data), train_data[0])
25000 [1, 14, 22, 16, 43, 530, 973, 1622, 1385, 65, 458, 4468, 66, 3941, 4, 173, 36, 256, 5, 25, 100, 43, 838, 112, 50, 670, 2, 9, 35, 480, 284, 5, 150, 4, 172, 112, 167, 2, 336, 385, 39, 4, 172, 4536, 1111, 17, 546, 38, 13, 447, 4, 192, 50, 16, 6, 147, 2025, 19, 14, 22, 4, 1920, 4613, 469, 4, 22, 71, 87, 12, 16, 43, 530, 38, 76, 15, 13, 1247, 4, 22, 17, 515, 17, 12, 16, 626, 18, 2, 5, 62, 386, 12, 8, 316, 8, 106, 5, 4, 2223, 5244, 16, 480, 66, 3785, 33, 4, 130, 12, 16, 38, 619, 5, 25, 124, 51, 36, 135, 48, 25, 1415, 33, 6, 22, 12, 215, 28, 77, 52, 5, 14, 407, 16, 82, 2, 8, 4, 107, 117, 5952, 15, 256, 4, 2, 7, 3766, 5, 723, 36, 71, 43, 530, 476, 26, 400, 317, 46, 7, 4, 2, 1029, 13, 104, 88, 4, 381, 15, 297, 98, 32, 2071, 56, 26, 141, 6, 194, 7486, 18, 4, 226, 22, 21, 134, 476, 26, 480, 5, 144, 30, 5535, 18, 51, 36, 28, 224, 92, 25, 104, 4, 226, 65, 16, 38, 1334, 88, 12, 16, 283, 5, 16, 4472, 113, 103, 32, 15, 16, 5345, 19, 178, 32]
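Since we passed num_words=10000, no word index should exceed 9,999; a quick sanity check (this snippet is my own addition, not part of the original listing):
# Confirm the num_words cap: the largest index appearing in any review
print(max(max(sequence) for sequence in train_data))  # 9999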
The following code decodes a review back into English words.
word_index = imdb.get_word_index(path="D:/jupyter/deepLearning/imdb_word_index.json")  # word_index is a dictionary mapping words to integer indices
print(word_index)
{"'beetle": 53888, 'herz': 59460, 'collar': 7618, 'decides': 1065, "'urf": 47167, "karin's": 61898, "'seryozha": 83456, 'wertmueller': 67625,
'hrzgovia': 57693, 'bfg': 16650, 'horseshoes': 57584, 'eminating': 57519, 'pettiness': 49265, "paalgard's": 59417, 'kman': 53047, 'montand': 20255, "'classic": 80558, "approach\x97keaton's": 63119, 'atlantis': 4013, 'heartbreaking': 5445'"}
# Reverse the mapping so that integer indices map back to words
reverse_word_index = dict(
    [(value, key) for (key, value) in word_index.items()])
print(reverse_word_index)
{1: 'the', 2: 'and', 3: 'a', 4: 'of', 5: 'to', 6: 'is', 7: 'br', 8: 'in', 9: 'it', 10: 'i', 11: 'this', 12: 'that', 13: 'was', 14: 'as', 15: 'for',
16: 'with', 17: 'movie', 18: 'but', 19: 'film', 20: 'on', 21: 'not', 22: 'you', 23: 'are', 24: 'his', 25: 'have', 26: 'he', 27: 'be', 28: 'one',
29: 'all', 30: 'at', 31: 'by', 32: 'an', 33: 'they', 34: 'who', 35: 'so', 36: 'from', 37: 'like', 38: 'her', 39: 'or', 40: 'just', 41: 'about',
42: "it's", 43: 'out', 44: 'has', 45: 'if', 46: 'some', 47: 'there', 48: 'what', 49: 'good', 50: 'more'}
# Decode the review; the indices are offset by 3 because 0, 1, and 2
# are reserved for "padding", "start of sequence", and "unknown"
decoded_review = ' '.join(
    [reverse_word_index.get(i - 3, '?') for i in train_data[0]])
print(decoded_review)
? this film was just brilliant casting location scenery story direction everyone's really suited the part they played and you could just imagine being there robert ? is an amazing actor and now the same being director ? father came from the same scottish island as myself so i loved the fact there was a real connection with this film the witty remarks throughout the film were great it was just brilliant so much that i bought the film as soon as it was released for ? and would recommend it to everyone to watch and the fly fishing was amazing really cried at the end it was so sad and you know what they say if you cry at a film it must have been good and this definitely was also ? to the two little boy's that played the ? of norman and paul they were just brilliant children are often left out of the ? list i think because the stars that play them all grown up are such a big profile for the whole film but these children are amazing and should be praised for what they have done don't you think the whole story was so lovely because it was true and was someone's life after all that was shared with us all
import numpy as np
# Multi-hot encode the sequences into a zero matrix of shape (len(sequences), dimension)
def vectorize_sequences(sequences, dimension=10000):
    results = np.zeros((len(sequences), dimension))  # (25000, 10000) for train_data
    # The built-in enumerate yields the index and the value at the same time
    for i, sequence in enumerate(sequences):
        results[i, sequence] = 1.  # set the indices listed in sequence to 1 in row i
    return results
x_train = vectorize_sequences(train_data)
print(x_train)
x_test = vectorize_sequences(test_data)
[[0. 1. 1. ... 0. 0. 0.]
[0. 1. 1. ... 0. 0. 0.]
[0. 1. 1. ... 0. 0. 0.]
...
[0. 1. 1. ... 0. 0. 0.]
[0. 1. 1. ... 0. 0. 0.]
[0. 1. 1. ... 0. 0. 0.]]
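To see the multi-hot encoding in miniature, here is a small illustrative example using the vectorize_sequences defined above (the toy inputs are my own addition):
# Toy example: the sequence [1, 3] becomes a 5-dimensional multi-hot vector
print(vectorize_sequences([[1, 3]], dimension=5))  # [[0. 1. 0. 1. 0.]]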
# Vectorize the labels as well
y_train = np.asarray(train_labels).astype('float32')
print(y_train)
y_test = np.asarray(test_labels).astype('float32')
In Dense(16, activation='relu'), the argument passed to the Dense layer (16) is the number of hidden units in the layer. A hidden unit is one dimension of the layer's representation space, and every Dense layer with a relu activation implements the following tensor operation:
output = relu(dot(W, input) + b)
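As a rough NumPy sketch of these operations (the names and shapes here are illustrative, not Keras internals): relu clips negative values to zero, while the sigmoid used by the output layer below squashes any score into (0, 1):
import numpy as np
def naive_dense_relu(W, x, b):
    # relu(dot(W, input) + b): element-wise max with 0
    return np.maximum(0., np.dot(W, x) + b)
def sigmoid(z):
    # maps any real number into (0, 1), readable as a probability
    return 1. / (1. + np.exp(-z))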
## Model definition
from keras import models
from keras import layers
# Two intermediate layers with 16 hidden units each
# A third layer that outputs a scalar prediction about the review's sentiment
model = models.Sequential()
# The samples have shape (25000, 10000), so this layer must take inputs of shape (10000,)
model.add(layers.Dense(16, activation='relu', input_shape=(10000,)))
model.add(layers.Dense(16, activation='relu'))
model.add(layers.Dense(1, activation='sigmoid'))
# Optimizer: rmsprop; loss function: binary_crossentropy
model.compile(optimizer='rmsprop',
              loss='binary_crossentropy',
              metrics=['accuracy'])
# Configuring the optimizer explicitly
from keras import optimizers
model.compile(optimizer=optimizers.RMSprop(lr=0.001),
              loss='binary_crossentropy',
              metrics=['accuracy'])
# Using custom losses and metrics
from keras import losses
from keras import metrics
model.compile(optimizer=optimizers.RMSprop(lr=0.001),
              loss=losses.binary_crossentropy,
              metrics=[metrics.binary_accuracy])
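For reference, binary_crossentropy on a single sample with true label y and predicted probability p is -(y*log(p) + (1-y)*log(1-p)); a minimal NumPy sketch (my own illustration, not Keras's actual implementation):
import numpy as np
def binary_crossentropy_single(y_true, p_pred, eps=1e-7):
    p = np.clip(p_pred, eps, 1. - eps)  # avoid log(0); the eps value is illustrative
    return -(y_true * np.log(p) + (1. - y_true) * np.log(1. - p))
print(binary_crossentropy_single(1.0, 0.9))  # ~0.105: confident and correct, small loss
print(binary_crossentropy_single(1.0, 0.1))  # ~2.303: confident and wrong, large loss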
# Hold out a validation set of 10,000 samples
x_val = x_train[:10000]            # (10000, 10000)
partial_x_train = x_train[10000:]  # (15000, 10000)
y_val = y_train[:10000]
partial_y_train = y_train[10000:]
# Train the model
model.compile(optimizer='rmsprop',
              loss='binary_crossentropy',
              metrics=['accuracy'])
history = model.fit(partial_x_train,                 # input data
                    partial_y_train,                 # labels
                    epochs=20,                       # 20 full passes over every sample in partial_x_train and partial_y_train
                    batch_size=512,
                    validation_data=(x_val, y_val))  # the held-out validation set
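With batch_size=512 and 15,000 training samples, each epoch performs ceil(15000/512) = 30 mini-batch gradient updates; a quick check (my own addition):
import math
print(math.ceil(15000 / 512))  # 30 updates per epoch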
Training output:
Train on 15000 samples, validate on 10000 samples
Epoch 1/20
15000/15000 [==============================] - 3s 198us/step - loss: 0.5084 - acc: 0.7813 - val_loss: 0.3797 - val_acc: 0.8684
Epoch 2/20
15000/15000 [==============================] - 2s 160us/step - loss: 0.3004 - acc: 0.9047 - val_loss: 0.3004 - val_acc: 0.8897
Epoch 3/20
15000/15000 [==============================] - 2s 154us/step - loss: 0.2179 - acc: 0.9285 - val_loss: 0.3085 - val_acc: 0.8711
Epoch 4/20
15000/15000 [==============================] - 3s 168us/step - loss: 0.1750 - acc: 0.9437 - val_loss: 0.2840 - val_acc: 0.8832
Epoch 5/20
15000/15000 [==============================] - 2s 151us/step - loss: 0.1427 - acc: 0.9542 - val_loss: 0.2841 - val_acc: 0.8872
Epoch 6/20
15000/15000 [==============================] - 2s 155us/step - loss: 0.1150 - acc: 0.9650 - val_loss: 0.3166 - val_acc: 0.8772
Epoch 7/20
15000/15000 [==============================] - 2s 156us/step - loss: 0.0980 - acc: 0.9705 - val_loss: 0.3127 - val_acc: 0.8846
Epoch 8/20
15000/15000 [==============================] - 2s 154us/step - loss: 0.0807 - acc: 0.9763 - val_loss: 0.3859 - val_acc: 0.8649
Epoch 9/20
15000/15000 [==============================] - 2s 154us/step - loss: 0.0661 - acc: 0.9821 - val_loss: 0.3635 - val_acc: 0.8782
Epoch 10/20
15000/15000 [==============================] - 2s 157us/step - loss: 0.0562 - acc: 0.9852 - val_loss: 0.3843 - val_acc: 0.8792
Epoch 11/20
15000/15000 [==============================] - 2s 157us/step - loss: 0.0438 - acc: 0.9895 - val_loss: 0.4152 - val_acc: 0.8779
Epoch 12/20
15000/15000 [==============================] - 2s 159us/step - loss: 0.0381 - acc: 0.9919 - val_loss: 0.4532 - val_acc: 0.8688
Epoch 13/20
15000/15000 [==============================] - 3s 177us/step - loss: 0.0300 - acc: 0.9928 - val_loss: 0.4699 - val_acc: 0.8728
Epoch 14/20
15000/15000 [==============================] - 3s 193us/step - loss: 0.0247 - acc: 0.9945 - val_loss: 0.5024 - val_acc: 0.8725
Epoch 15/20
15000/15000 [==============================] - 3s 195us/step - loss: 0.0178 - acc: 0.9977 - val_loss: 0.5321 - val_acc: 0.8702
Epoch 16/20
15000/15000 [==============================] - 3s 205us/step - loss: 0.0178 - acc: 0.9965 - val_loss: 0.5674 - val_acc: 0.8691
Epoch 17/20
15000/15000 [==============================] - 3s 205us/step - loss: 0.0103 - acc: 0.9992 - val_loss: 0.6264 - val_acc: 0.8628
Epoch 18/20
15000/15000 [==============================] - 3s 206us/step - loss: 0.0132 - acc: 0.9968 - val_loss: 0.6379 - val_acc: 0.8673
Epoch 19/20
15000/15000 [==============================] - 2s 167us/step - loss: 0.0055 - acc: 0.9997 - val_loss: 0.7146 - val_acc: 0.8584
Epoch 20/20
15000/15000 [==============================] - 2s 160us/step - loss: 0.0089 - acc: 0.9983 - val_loss: 0.6951 - val_acc: 0.8657
Calling model.fit() returns a History object, which has a member called history.
# 'val_loss' and 'val_acc' are the loss and accuracy on the validation set
history_dict = history.history
history_dict.keys()
dict_keys(['acc', 'val_acc', 'val_loss', 'loss'])
# Plot the training and validation loss
import matplotlib.pyplot as plt
loss_values = history_dict['loss']
val_loss_values = history_dict['val_loss']
epochs = range(1, len(loss_values) + 1)
plt.plot(epochs, loss_values, 'bo', label='Training loss')       # 'bo' = blue dots
plt.plot(epochs, val_loss_values, 'b', label='Validation loss')  # 'b' = solid blue line
plt.title('Training and validation loss')
plt.xlabel('Epochs')
plt.ylabel('Loss')
plt.legend()
plt.show()
# Plot the training and validation accuracy
plt.clf()  # clear the figure
acc = history_dict['acc']
val_acc = history_dict['val_acc']
plt.plot(epochs, acc, 'bo', label='Training acc')
plt.plot(epochs, val_acc, 'b', label='Validation acc')
plt.title('Training and validation accuracy')
plt.xlabel('Epochs')
plt.ylabel('Accuracy')
plt.legend()
plt.show()
The plots show overfitting, so we can train a new network from scratch for just four epochs and then evaluate it on the test data, as done below.
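(As an aside, instead of reading the epoch count off the plots by hand, training can be stopped automatically with Keras's EarlyStopping callback; a minimal sketch, with the patience value being my own choice:)
from keras.callbacks import EarlyStopping
# Stop once val_loss has failed to improve for 2 consecutive epochs
early_stop = EarlyStopping(monitor='val_loss', patience=2)
model.fit(partial_x_train, partial_y_train,
          epochs=20, batch_size=512,
          validation_data=(x_val, y_val),
          callbacks=[early_stop])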
# Retrain a model from scratch
model = models.Sequential()
model.add(layers.Dense(16, activation='relu', input_shape=(10000,)))
model.add(layers.Dense(16, activation='relu'))
model.add(layers.Dense(1, activation='sigmoid'))
model.compile(optimizer='rmsprop',
              loss='binary_crossentropy',
              metrics=['accuracy'])
model.fit(x_train, y_train, epochs=4, batch_size=512)
results = model.evaluate(x_test, y_test)
Epoch 1/4
25000/25000 [==============================] - 3s 125us/step - loss: 0.4750 - acc: 0.8214
Epoch 2/4
25000/25000 [==============================] - 3s 109us/step - loss: 0.2651 - acc: 0.9092
Epoch 3/4
25000/25000 [==============================] - 3s 110us/step - loss: 0.1984 - acc: 0.9301
Epoch 4/4
25000/25000 [==============================] - 3s 112us/step - loss: 0.1673 - acc: 0.9406
25000/25000 [==============================] - 3s 135us/step
The final results are as follows:
results
[0.3212816875743866, 0.87328]
# Generate predictions on new data: the likelihood that each review is positive
model.predict(x_test)
array([[0.1397081 ],
[0.9996941 ],
[0.31572327],
...,
[0.07281625],
[0.04466555],
[0.47163403]], dtype=float32)
As the results show, the network is very confident about some samples (probabilities of 0.99 or higher, or 0.01 or lower) but much less certain about others.
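To turn these probabilities into hard 0/1 labels, one can threshold at 0.5 (a common default; this line is my own addition):
# Probabilities above 0.5 are classified as positive
predicted_labels = (model.predict(x_test) > 0.5).astype('int32')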