环境:Kaggle Kernel。
import pandas as pd
import numpy as np
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.utils import to_categorical
from keras.layers import Dense, Input, LSTM, Embedding, Dropout, Activation, CuDNNGRU, Conv1D
from keras.layers import Bidirectional, GlobalMaxPool1D
from keras.models import Model
from keras import initializers, regularizers, constraints, optimizers, layers
from sklearn.model_selection import train_test_split
from gensim.models import KeyedVectors
from tqdm import tqdm
import operator
import os
import re
import gc
# Load the labeled IMDB reviews (tab-separated file with a header row).
train = pd.read_csv("../input/imdb-dataset/labeledTrainData.tsv", header=0, delimiter="\t", quoting=3)

max_features = 10000  # number of most frequent words kept by the tokenizer
maxlen = 200          # every review is padded / truncated to this many tokens
embed_size = 300      # dimension of the word embeddings

# Build the tokenizer.
tokenizer = Tokenizer(num_words=max_features, lower=True)
# The vocabulary has to be learned before any text can be converted:
# without fit_on_texts the word index is empty and texts_to_sequences
# returns an empty sequence for every review.
tokenizer.fit_on_texts(list(train['review']))
x_train = tokenizer.texts_to_sequences(list(train['review']))
x_train = pad_sequences(x_train, maxlen=maxlen)  # padding
y_train = list(train['sentiment'])

# Split into training and validation sets (70% / 30%, fixed seed).
x_train, x_val, y_train, y_val = train_test_split(x_train, y_train, test_size=0.3, random_state=0)
def load_word2vec(filename):
    """Load a binary word2vec file into a plain ``{word: vector}`` dict.

    Args:
        filename: Path to a word2vec file in the binary format accepted by
            ``KeyedVectors.load_word2vec_format``.

    Returns:
        dict mapping every word of the model to its embedding vector.
    """
    word2vec = KeyedVectors.load_word2vec_format(filename, binary=True)
    # The loaded KeyedVectors object holds the vectors itself, so the legacy
    # ``.wv`` indirection is not needed. The word list is called
    # ``index_to_key`` in gensim 4 and ``index2word`` in gensim 3.
    words = getattr(word2vec, 'index_to_key', None)
    if words is None:
        words = word2vec.index2word
    embeddings_index = {}
    # Row i of ``vectors`` is the embedding of ``words[i]``.
    for word, vec in tqdm(zip(words, word2vec.vectors), total=len(words)):
        embeddings_index[word] = vec
    return embeddings_index
# Location of the pretrained GoogleNews word2vec vectors in the kernel's
# input directory.
EMBEDDING_FILE = '../input/googlenewsvectorsnegative300/GoogleNews-vectors-negative300.bin.gz'
# Load all pretrained vectors into a {word: vector} dict.
embeddings_index = load_word2vec(EMBEDDING_FILE)
def build_matrix(embeddings_index, word_index):
    """Build the embedding weight matrix for the tokenizer vocabulary.

    Row ``i`` of the result holds the pretrained vector of the word whose
    tokenizer index is ``i``. Words without a pretrained vector fall back to
    the vector of the word "unknown"; if that is missing too, the row stays
    all zeros.

    Args:
        embeddings_index: Mapping ``word -> vector`` of pretrained embeddings.
        word_index: Mapping ``word -> integer index`` (``tokenizer.word_index``).

    Returns:
        ``numpy.ndarray`` of shape ``(max_features, embed_size)``.
    """
    embedding_matrix = np.zeros((max_features, embed_size))
    # Looked up once: shared fallback for out-of-vocabulary words.
    unknown_vector = embeddings_index.get("unknown")
    for word, i in tqdm(word_index.items()):
        # Only the first ``max_features`` indices have a row in the matrix.
        if i >= max_features:
            continue
        # Vector of the word itself, or the "unknown" vector if it has none.
        embedding_vector = embeddings_index.get(word, unknown_vector)
        if embedding_vector is not None:
            # Keep row numbers aligned with the indices in word_index.
            embedding_matrix[i] = embedding_vector
    return embedding_matrix
# Embedding weights aligned with the tokenizer's word indices.
embedding_matrix = build_matrix(embeddings_index, tokenizer.word_index)
def build_model(embedding_matrix=None):
    """Build and compile the bidirectional-GRU sentiment classifier.

    Architecture: embedding -> bidirectional CuDNNGRU (64 units per
    direction, full sequence returned) -> global max pooling -> Dense(16,
    relu) -> Dropout(0.1) -> Dense(1, sigmoid).

    Args:
        embedding_matrix: Optional pretrained weights of shape
            ``(max_features, embed_size)``. When ``None`` the embedding layer
            is created without initial weights and learned from scratch.

    Returns:
        A compiled Keras ``Model`` (binary cross-entropy loss, Adam
        optimizer, accuracy metric).
    """
    inp = Input(shape=(maxlen,))
    if embedding_matrix is None:
        x = Embedding(max_features, embed_size)(inp)
    else:
        # Initialise the embedding layer with the pretrained vectors.
        x = Embedding(max_features, embed_size, weights=[embedding_matrix])(inp)
    x = Bidirectional(CuDNNGRU(64, return_sequences=True))(x)
    x = GlobalMaxPool1D()(x)
    x = Dense(16, activation="relu")(x)
    x = Dropout(0.1)(x)
    x = Dense(1, activation="sigmoid")(x)
    model = Model(inputs=inp, outputs=x)
    model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
    return model
# Build the model on top of the pretrained embedding matrix and train it for
# 10 epochs, evaluating on the validation split after each epoch.
model = build_model(embedding_matrix)
history = model.fit(x_train, y_train, batch_size=512, epochs=10, validation_data=(x_val, y_val))
Train on 17500 samples, validate on 7500 samples
Epoch 1/10
17500/17500 [==============================] - 2s 130us/step - loss: 0.0260 - acc: 0.9941 - val_loss: 0.4803 - val_acc: 0.8791
Epoch 2/10
17500/17500 [==============================] - 2s 131us/step - loss: 0.0181 - acc: 0.9970 - val_loss: 0.5183 - val_acc: 0.8787
Epoch 3/10
17500/17500 [==============================] - 2s 131us/step - loss: 0.0148 - acc: 0.9978 - val_loss: 0.5404 - val_acc: 0.8739
Epoch 4/10
17500/17500 [==============================] - 2s 132us/step - loss: 0.0132 - acc: 0.9979 - val_loss: 0.8335 - val_acc: 0.8148
Epoch 5/10
17500/17500 [==============================] - 2s 132us/step - loss: 0.0949 - acc: 0.9657 - val_loss: 0.4773 - val_acc: 0.8753
Epoch 6/10
17500/17500 [==============================] - 2s 131us/step - loss: 0.0189 - acc: 0.9958 - val_loss: 0.5145 - val_acc: 0.8796
Epoch 7/10
17500/17500 [==============================] - 2s 132us/step - loss: 0.0133 - acc: 0.9975 - val_loss: 0.5362 - val_acc: 0.8797
Epoch 8/10
17500/17500 [==============================] - 2s 131us/step - loss: 0.0082 - acc: 0.9992 - val_loss: 0.5615 - val_acc: 0.8789
Epoch 9/10
17500/17500 [==============================] - 2s 132us/step - loss: 0.0057 - acc: 0.9995 - val_loss: 0.5888 - val_acc: 0.8763
Epoch 10/10
17500/17500 [==============================] - 2s 131us/step - loss: 0.0050 - acc: 0.9997 - val_loss: 0.6223 - val_acc: 0.8771
# Build a vocabulary (word -> frequency) from tokenized text.
def build_vocab(sentences):
    """Count how often each word occurs in ``sentences``.

    Args:
        sentences: Iterable of tokenized sentences (each an iterable of words).

    Returns:
        dict whose keys are words and whose values are occurrence counts.
    """
    # key is the word, value is its frequency
    vocab = {}
    for sentence in tqdm(sentences):
        for word in sentence:
            # Start at 0 for unseen words instead of raising KeyError.
            vocab[word] = vocab.get(word, 0) + 1
    return vocab
def check_coverage(vocab, embeddings_index):
    """Report how much of the vocabulary is covered by the embeddings.

    Prints the share of distinct words and the share of all word occurrences
    that have a pretrained vector.

    Args:
        vocab: Mapping ``word -> frequency`` as returned by ``build_vocab``.
        embeddings_index: Mapping ``word -> vector`` of pretrained embeddings.

    Returns:
        List of ``(word, frequency)`` pairs for the out-of-vocabulary words,
        ordered from most to least frequent.
    """
    iv = {}   # in vocabulary: words that have an embedding
    oov = {}  # out of vocabulary: words without an embedding
    k = 0     # number of word occurrences covered by the embeddings
    i = 0     # number of word occurrences not covered
    for word in tqdm(vocab):
        if word in embeddings_index:
            # The vocabulary word exists in the embeddings.
            iv[word] = embeddings_index[word]
            k += vocab[word]
        else:
            oov[word] = vocab[word]
            i += vocab[word]
    # max(..., 1) keeps an empty vocabulary from dividing by zero; the
    # numerator is 0 in that case, so the report shows 0.00%.
    print('Found embeddings for {:.2%} of vocab'.format(len(iv) / max(len(vocab), 1)))
    print('Found embeddings for {:.2%} of all text'.format(k / max(k + i, 1)))
    sorted_x = sorted(oov.items(), key=operator.itemgetter(1))[::-1]
    return sorted_x
# progress_apply only exists on pandas objects after tqdm has registered it.
tqdm.pandas()
# Split every raw review on whitespace, then measure how much of the
# resulting vocabulary has a pretrained embedding.
sentences = train['review'].progress_apply(lambda x: x.split()).values
vocab = build_vocab(sentences)
oov = check_coverage(vocab, embeddings_index)
[('a', 155088),
('and', 152651),
('of', 142970),
('to', 132564),
('The', 7243)]
出现次数最多的 4 个未登录词是 "a"、"and"、"of"、"to",这是因为 GoogleNews Embedding 在训练时已经去掉了这些词,我们稍后再处理它们。
# Notebook cell: test whether the token '/>' is a key of embeddings_index.
# The result is only displayed, not stored.
'/>' in embeddings_index
# Text-cleaning helpers.
# HTML line breaks ("<br>", "<br/>", "<br />"). They are removed as a whole so
# that the tag name is not glued to the preceding word ("it.<br />" used to
# become "itbr").
_BR_TAG_RE = re.compile(r'<br\s*/?>', re.IGNORECASE)
# Single-pass character mapping; later entries override earlier ones.
_CLEAN_TABLE = str.maketrans({
    # Punctuation that is deleted.
    **{ch: None for ch in '?!.,"#$%\'()*+-/:;<=>@[\\]^_`{|}~' + '“”’'},
    # Separators that turn into a space.
    **{ch: ' ' for ch in "/-'"},
    # '&' is kept, padded so it becomes a token of its own.
    '&': ' & ',
})


def clean_text(x):
    """Normalise a review before whitespace tokenisation.

    HTML line-break tags are replaced by a space, ``/``, ``-`` and ``'``
    become spaces, ``&`` is padded with spaces, and all other punctuation is
    removed. The original padded ``'/>'`` with a ``str.replace`` whose result
    was discarded; that step is unnecessary because ``/`` is turned into a
    space anyway.

    Args:
        x: The text to clean; any object is converted with ``str``.

    Returns:
        The cleaned string.
    """
    x = _BR_TAG_RE.sub(' ', str(x))
    return x.translate(_CLEAN_TABLE)
# Apply clean_text to every review, re-tokenize on whitespace and re-measure
# the embedding coverage.
train["review"] = train["review"].progress_apply(lambda x: clean_text(x))
sentences = train["review"].apply(lambda x: x.split())
vocab = build_vocab(sentences)
oov = check_coverage(vocab,embeddings_index)
[('a', 156185),
('and', 155962),
('of', 144029),
('to', 133847),
('10', 4091),
('itbr', 985),
('moviebr', 838),
('filmbr', 790),
('20', 684),
('80', 591)]
def clean_numbers(x):
    """Mask runs of digits with ``#`` characters.

    Runs of five or more digits become ``#####``; runs of four, three and
    two digits become the same number of ``#``. Single digits are left
    unchanged.

    Args:
        x: The text to process.

    Returns:
        The text with multi-digit numbers masked.
    """
    # Order matters: the longest runs are masked first so that the shorter
    # patterns only see what is left.
    masks = (
        ('[0-9]{5,}', '#####'),
        ('[0-9]{4}', '####'),
        ('[0-9]{3}', '###'),
        ('[0-9]{2}', '##'),
    )
    for digit_pattern, mask in masks:
        x = re.sub(digit_pattern, mask, x)
    return x
# Mask multi-digit numbers in every review, re-tokenize on whitespace and
# re-measure the embedding coverage.
train["review"] = train["review"].progress_apply(lambda x: clean_numbers(x))
sentences = train["review"].progress_apply(lambda x: x.split())
vocab = build_vocab(sentences)
oov = check_coverage(vocab,embeddings_index)
[('a', 156185),
('and', 155962),
('of', 144029),
('to', 133847),
('itbr', 985),
('moviebr', 838),
('filmbr', 790),
('humour', 424),
('timebr', 342),
('favourite', 315)]
# Remove the "br" left over from HTML line breaks ("itbr", "moviebr", a lone
# "br"). Only a "br" that ends a token is removed; replacing every "br"
# substring also cuts real words apart ("brilliant" -> "illiant").
train["review"] = train["review"].progress_apply(lambda x: re.sub(r'br\b', ' ', x))
sentences = train["review"].progress_apply(lambda x: x.split())
vocab = build_vocab(sentences)
oov = check_coverage(vocab, embeddings_index)
[('a', 156571),
('and', 156104),
('of', 144079),
('to', 133914),
('illiant', 1094),
('humour', 433),
('ief', 378),
('favourite', 316),
('eaking', 311),
('utal', 293)]
def _get_mispell(mispell_dict):
mispell_re = re.compile('(%s)' % '|'.join(mispell_dict.keys()))
return mispell_dict, mispell_re
# Spelling variants / missing apostrophes and the replacement text to use.
mispell_dict = {
    'colour': 'color',
    'didnt': 'did not',
    'doesnt': 'does not',
    'isnt': 'is not',
    'shouldnt': 'should not',
    'wwii': 'world war 2',
    'instagram': 'social medium',
    'whatsapp': 'social medium',
    'snapchat': 'social medium',
}  # the closing brace was missing, which made the file unparsable
# Lookup table and compiled pattern used by replace_typical_misspell.
mispellings, mispellings_re = _get_mispell(mispell_dict)
def replace_typical_misspell(text):
    """Replace every occurrence of a known misspelling in ``text``.

    Uses the module-level ``mispellings_re`` pattern to find matches and the
    ``mispellings`` dict to look up the replacement for each match.

    Args:
        text: The string to correct.

    Returns:
        The string with all matches substituted.
    """
    return mispellings_re.sub(lambda hit: mispellings[hit.group(0)], text)
# Apply the misspelling replacements to every review, re-tokenize on
# whitespace and re-measure the embedding coverage.
train["review"] = train["review"].progress_apply(lambda x: replace_typical_misspell(x))
sentences = train["review"].progress_apply(lambda x: x.split())
vocab = build_vocab(sentences)
oov = check_coverage(vocab,embeddings_index)
def drop_word(x):
    """Remove the stop words "a", "to", "of" and "and" from ``x``.

    Words are the pieces of ``x`` separated by single spaces; matching is
    case-sensitive. Unlike substring replacement of ``' a '`` etc., this also
    removes stop words at the very start or end of the text and stop words
    that directly follow each other.

    Args:
        x: The text to filter.

    Returns:
        The text with the stop words removed; remaining pieces are joined
        with single spaces.
    """
    to_remove = {'a', 'to', 'of', 'and'}
    # Splitting on ' ' (not on arbitrary whitespace) keeps any other
    # whitespace inside the pieces untouched.
    return ' '.join(token for token in x.split(' ') if token not in to_remove)
# Drop the stop words from every review, re-tokenize on whitespace and
# re-measure the embedding coverage.
train["review"] = train["review"].progress_apply(lambda x: drop_word(x))
sentences = train["review"].apply(lambda x: x.split())
vocab = build_vocab(sentences)
oov = check_coverage(vocab,embeddings_index)
# Build a fresh tokenizer for the cleaned reviews.
tokenizer = Tokenizer(num_words=max_features, lower=True)
# The vocabulary has to be learned from the cleaned text first: without
# fit_on_texts the word index is empty and texts_to_sequences returns an
# empty sequence for every review.
tokenizer.fit_on_texts(list(train['review']))
x_train = tokenizer.texts_to_sequences(list(train['review']))
x_train = pad_sequences(x_train, maxlen=maxlen)  # padding
y_train = list(train['sentiment'])

# Split into training and validation sets (70% / 30%, fixed seed).
x_train, x_val, y_train, y_val = train_test_split(x_train, y_train, test_size=0.3, random_state=0)
# Rebuild the embedding matrix for the new tokenizer's word indices, then
# build a new model and train it for 10 epochs with per-epoch validation.
embedding_matrix = build_matrix(embeddings_index, tokenizer.word_index)
model = build_model(embedding_matrix)
history = model.fit(x_train, y_train, batch_size=512, epochs=10, validation_data=(x_val, y_val))
Train on 17500 samples, validate on 7500 samples
Epoch 1/10
17500/17500 [==============================] - 3s 192us/step - loss: 0.5736 - acc: 0.7273 - val_loss: 0.3915 - val_acc: 0.8536
Epoch 2/10
17500/17500 [==============================] - 2s 132us/step - loss: 0.3264 - acc: 0.8633 - val_loss: 0.2806 - val_acc: 0.8833
Epoch 3/10
17500/17500 [==============================] - 2s 132us/step - loss: 0.2271 - acc: 0.9112 - val_loss: 0.2614 - val_acc: 0.8899
Epoch 4/10
17500/17500 [==============================] - 2s 132us/step - loss: 0.1750 - acc: 0.9376 - val_loss: 0.2695 - val_acc: 0.8935
Epoch 5/10
17500/17500 [==============================] - 2s 130us/step - loss: 0.1217 - acc: 0.9585 - val_loss: 0.2961 - val_acc: 0.8868
Epoch 6/10
17500/17500 [==============================] - 2s 129us/step - loss: 0.0801 - acc: 0.9757 - val_loss: 0.3426 - val_acc: 0.8837
Epoch 7/10
17500/17500 [==============================] - 2s 129us/step - loss: 0.0550 - acc: 0.9860 - val_loss: 0.3498 - val_acc: 0.8820
Epoch 8/10
17500/17500 [==============================] - 2s 130us/step - loss: 0.0425 - acc: 0.9903 - val_loss: 0.4001 - val_acc: 0.8807
Epoch 9/10
17500/17500 [==============================] - 2s 129us/step - loss: 0.0280 - acc: 0.9946 - val_loss: 0.4493 - val_acc: 0.8824
Epoch 10/10
17500/17500 [==============================] - 2s 132us/step - loss: 0.0178 - acc: 0.9974 - val_loss: 0.4903 - val_acc: 0.8797