Loading the IMDB dataset
X_train[0]=[1,14,22,.....32] has length 228
X_train=sequence.pad_sequences(X_train,maxlen=500)
X_train[0] becomes [0,0,0.......1,14,22,....32] with length 500
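As a quick standalone illustration of what pad_sequences does (the toy sequence below is made up for demonstration):
# pad_sequences left-pads shorter sequences with zeros
# (and truncates longer ones) to the requested length
from keras.preprocessing import sequence
demo = [[1, 14, 22, 32]]
print(sequence.pad_sequences(demo, maxlen=8))
# [[ 0  0  0  0  1 14 22 32]]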
import numpy
from keras.datasets import imdb
from matplotlib import pyplot
from keras.preprocessing import sequence
(X_train,y_train),(X_test,y_test)=imdb.load_data()
print("Train data:")
print(X_train.shape)
print(y_train.shape)
print(X_test.shape)
print(y_test.shape)
print(X_train[0])
print("first length:")
print(len(X_train[0]))
print("classes:")
print(numpy.unique(y_train))
print("number of words:")
print(len(numpy.unique(numpy.hstack(X_train))))
print("review length:")
# compute each review's length as a list (a bare map iterator would be
# exhausted after its first use in Python 3)
result=list(map(len, X_train))
# sequence.pad_sequences: pad each review (e.g. length 228) to 500
# by prepending zeros
X_train=sequence.pad_sequences(X_train,maxlen=500)
print(X_train[0])
print(len(X_train[0]))
print("Mean %.2f words (stddev %.2f)"%(numpy.mean(result),numpy.std(result)))
pyplot.subplot(121)
pyplot.boxplot(result)
pyplot.subplot(122)
pyplot.hist(result)
pyplot.show()
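Each review is stored as a list of word indices, not words. A minimal sketch of decoding a review back into text with imdb.get_word_index() (Keras reserves the lowest indices for padding/start/unknown tokens, hence the offset of 3):
# decode the first review; reload unpadded data since X_train was padded above
word_index = imdb.get_word_index()
index_to_word = {i + 3: w for w, i in word_index.items()}
(X_train_raw, _), _ = imdb.load_data()
print(" ".join(index_to_word.get(i, "?") for i in X_train_raw[0]))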
Word Embeddings
imdb.load_data(num_words=5000)  # keep only the 5,000 most frequent words
X_train=sequence.pad_sequences(X_train,maxlen=500)
X_test=sequence.pad_sequences(X_test,maxlen=500)
model.add(Embedding(5000,32,input_length=500))
A vocabulary of 5,000 words; each review is padded to length 500, and each word is represented by a 32-dimensional vector.
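A minimal standalone sketch of the resulting tensor shape:
from keras.models import Sequential
from keras.layers.embeddings import Embedding
model = Sequential()
model.add(Embedding(5000, 32, input_length=500))
# each review of 500 word indices becomes a 500 x 32 matrix of word vectors
print(model.output_shape)  # (None, 500, 32)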
A plain fully connected neural network
import numpy
from keras.datasets import imdb
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import Flatten
from keras.layers.embeddings import Embedding
from keras.preprocessing import sequence
seed=7
numpy.random.seed(seed)
top_words=5000
(X_train,y_train),(X_test,y_test)=imdb.load_data(num_words=top_words)
# use only one tenth of the data to speed up this demo
# (integer division: shape[0]/10 is a float in Python 3 and would break slicing)
n_train=X_train.shape[0]//10
n_test=X_test.shape[0]//10
X_train=X_train[0:n_train]
y_train=y_train[0:n_train]
X_test=X_test[0:n_test]
y_test=y_test[0:n_test]
max_words=500
X_train=sequence.pad_sequences(X_train,maxlen=max_words)
X_test=sequence.pad_sequences(X_test,maxlen=max_words)
model=Sequential()
model.add(Embedding(top_words,32,input_length=max_words))
model.add(Flatten())
model.add(Dense(250,activation='relu'))
model.add(Dense(1,activation='sigmoid'))  # sigmoid output is required with binary_crossentropy
model.compile(loss='binary_crossentropy',optimizer='adam',metrics=['accuracy'])
print(model.summary())
model.fit(X_train,y_train,validation_data=(X_test,y_test),epochs=2,batch_size=128,verbose=1)
scores=model.evaluate(X_test, y_test,verbose=0)
print("Accuracy: %.2f%%"%(scores[1]*100))
A 1D CNN for the IMDB problem
# CNN for the IMDB problem
import numpy
from keras.datasets import imdb
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import Flatten
from keras.layers.convolutional import Conv1D
from keras.layers.convolutional import MaxPooling1D
from keras.layers.embeddings import Embedding
from keras.preprocessing import sequence
seed = 7
numpy.random.seed(seed)
# load the dataset but only keep the top n words, zero the rest
top_words = 5000
(X_train, y_train), (X_test, y_test) = imdb.load_data(num_words=top_words)
# pad dataset to a maximum review length in words
max_words = 500
X_train = sequence.pad_sequences(X_train, maxlen=max_words)
X_test = sequence.pad_sequences(X_test, maxlen=max_words)
model = Sequential()
model.add(Embedding(top_words, 32, input_length=max_words))
model.add(Conv1D(filters=32, kernel_size=3, padding='same', activation='relu'))
model.add(MaxPooling1D(pool_size=2))
model.add(Flatten())
model.add(Dense(250, activation='relu'))
model.add(Dense(1, activation='sigmoid'))
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
print(model.summary())
model.fit(X_train, y_train, validation_data=(X_test, y_test), epochs=2, batch_size=128, verbose=1)
scores = model.evaluate(X_test, y_test, verbose=0)
print("Accuracy: %.2f%%" % (scores[1]*100))