Sentiment analysis on text (a binary classification model based on Keras)
# -*- coding: utf-8 -*-
import os
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
import numpy as np
from keras.models import Sequential
from keras.layers import Flatten, Dense, Embedding
import matplotlib.pyplot as plt
# Module-level containers for the raw review texts and their labels.
texts = []
labels = []

# Word -> vector lookup for the pre-trained embedding, and its vector size.
embedding_index = {}
embedding_dim = 300

# Vocabulary cap given to the Tokenizer and the padded sequence length.
max_words = 10000
maxlen = 100

# Number of samples sliced off for training and for validation.
training_samples = 20000
validation_samples = 4000

# Free-text samples to classify once a model is available.
predict_texts = [
    'I love you and you are so beautiful, you are good, you are good, you are good',
    'I hate you and you are bad, you are bad, you are bad',
    'you are good',
]
def get_datasource(imdb_dir='D:/DL/keras/aclImdb'):
    """Load the raw training reviews and their sentiment labels.

    Every ``*.txt`` file under ``<imdb_dir>/train/neg`` and
    ``<imdb_dir>/train/pos`` is read; its content is appended to the
    module-level ``texts`` list and 0 (for ``neg``) or 1 (for ``pos``) is
    appended to the module-level ``labels`` list.

    Args:
        imdb_dir: Root directory of the dataset. Defaults to the path the
            script was originally hard-coded to.

    Returns:
        The ``(texts, labels)`` module-level lists. They are appended to, not
        reset, so calling this twice duplicates the samples.
    """
    train_dir = os.path.join(imdb_dir, 'train')
    for label_type in ['neg', 'pos']:
        dir_name = os.path.join(train_dir, label_type)
        for fname in os.listdir(dir_name):
            if not fname.endswith('.txt'):
                continue
            # Context manager so the handle is closed for each review file.
            with open(os.path.join(dir_name, fname), 'r', encoding='UTF-8') as f:
                texts.append(f.read())
            labels.append(0 if label_type == 'neg' else 1)
    print('texts', texts[0:3])
    print('len(texts)', len(texts))
    return texts, labels
def text_split(training_samples, validation_samples, texts, labels):
    """Tokenize the texts and split them into training and validation sets.

    Args:
        training_samples: Number of samples taken for the training slice.
        validation_samples: Number of samples taken right after the training
            slice for validation.
        texts: Raw text samples.
        labels: Labels aligned with ``texts``.

    Returns:
        ``(x_train, y_train, x_val, y_val, word_index, tokenizer)`` where the
        ``x_*`` arrays are padded integer sequences of length ``maxlen``.
    """
    tokenizer = Tokenizer(num_words=max_words)
    # The vocabulary must be built first; without this word_index is empty
    # and every text maps to an empty sequence.
    tokenizer.fit_on_texts(texts)
    # Convert the strings into lists of integer indices.
    sequences = tokenizer.texts_to_sequences(texts)
    word_index = tokenizer.word_index
    print('Found %s unique tokens.' % len(word_index))
    data = pad_sequences(sequences, maxlen=maxlen)
    labels = np.asarray(labels)
    print('Shape of data tensor:', data.shape)
    print('Shape of label tensor:', labels.shape)
    print('data[0]: ', len(data[0]), '\n', data[0])
    # Shuffle before slicing: the loader appends all 'neg' samples before the
    # 'pos' ones, so an unshuffled split would give skewed slices.
    indices = np.arange(data.shape[0])
    np.random.shuffle(indices)
    data = data[indices]
    labels = labels[indices]
    x_train = data[:training_samples]
    y_train = labels[:training_samples]
    x_val = data[training_samples: training_samples + validation_samples]
    y_val = labels[training_samples: training_samples + validation_samples]
    return x_train, y_train, x_val, y_val, word_index, tokenizer
def prepare_Glove(embedding_index, word_index, max_words, embedding_dim,
                  glove_dir='D:/DL/keras/glove.6B'):
    """Build an embedding matrix for the corpus vocabulary from GloVe vectors.

    Args:
        embedding_index: Dict that is filled in place with ``word -> vector``
            entries parsed from ``glove.6B.300d.txt``.
        word_index: Mapping ``word -> integer index`` for the corpus.
        max_words: Number of rows in the returned matrix; words whose index
            is ``>= max_words`` are skipped.
        embedding_dim: Number of columns in the returned matrix.
        glove_dir: Directory containing ``glove.6B.300d.txt``. Defaults to the
            path the script was originally hard-coded to.

    Returns:
        A ``(max_words, embedding_dim)`` array. Rows for words that have no
        pre-trained vector stay all-zero.
    """
    glove_path = os.path.join(glove_dir, 'glove.6B.300d.txt')
    # Context manager so the (large) vector file is always closed.
    with open(glove_path, 'r', encoding='UTF-8') as f:
        for line in f:
            values = line.split()
            if not values:
                # Blank line: nothing to parse.
                continue
            word = values[0]
            coefs = np.asarray(values[1:], dtype='float32')
            embedding_index[word] = coefs
    print('Found %s word vectors.' % len(embedding_index))
    # word_index contains the words in corpus, embedding_index is the pre-trained embedding
    embedding_matrix = np.zeros((max_words, embedding_dim))
    for word, i in word_index.items():
        if i < max_words:
            embedding_vector = embedding_index.get(word)
            if embedding_vector is not None:
                embedding_matrix[i] = embedding_vector
    return embedding_matrix
def predict_samples_vec(samples, tokenizer):
    """Turn raw text samples into padded integer sequences.

    Args:
        samples: Texts to convert.
        tokenizer: Object providing ``texts_to_sequences``.

    Returns:
        The result of ``pad_sequences`` on the tokenized samples, padded to
        the module-level ``maxlen``. It is also printed.
    """
    padded = pad_sequences(tokenizer.texts_to_sequences(samples), maxlen=maxlen)
    print('pad_seq', padded)
    return padded
def my_model(max_words, embedding_dim, maxlen):
    """Build the (uncompiled) binary classifier.

    Layers: Embedding -> Flatten -> Dense(32, relu) -> Dense(1, sigmoid).

    Args:
        max_words: Vocabulary size of the Embedding layer.
        embedding_dim: Size of each embedding vector.
        maxlen: Length of the input sequences.

    Returns:
        The ``Sequential`` model.
    """
    model = Sequential()
    model.add(Embedding(max_words, embedding_dim, input_length=maxlen))
    # Flatten the 3D embedding tensor into a 2D tensor so the Dense layers
    # produce one prediction per sample rather than one per time step.
    model.add(Flatten())
    model.add(Dense(32, activation='relu'))
    model.add(Dense(1, activation='sigmoid'))
    return model
def train_model(embedding_dim, maxlen, embedding_matrix, x_train, y_train, x_val, y_val):
    """Train the classifier on frozen pre-trained embeddings and plot the curves.

    Nothing is trained if ``pre_trained_glove_model.h5`` already exists in the
    working directory. Otherwise the model is fitted, its weights are saved
    to that file, and accuracy and loss curves are shown.

    Args:
        embedding_dim: Size of each embedding vector.
        maxlen: Length of the input sequences.
        embedding_matrix: Weights loaded into the Embedding layer.
        x_train: Training inputs.
        y_train: Training labels.
        x_val: Validation inputs.
        y_val: Validation labels.
    """
    model = my_model(max_words, embedding_dim, maxlen)
    model.layers[0].set_weights([embedding_matrix])  # load the pre-trained embedding
    model.layers[0].trainable = False  # keep the pre-trained vectors fixed
    model.compile(optimizer='rmsprop', loss='binary_crossentropy', metrics=['acc'])
    save_model_file = 'pre_trained_glove_model.h5'
    if os.path.exists(save_model_file):
        # Weights from an earlier run are already on disk.
        return

    history = model.fit(x_train, y_train, epochs=20, batch_size=128,
                        validation_data=(x_val, y_val))
    # Persist the weights so the existence check above and a later restore
    # have a file to work with.
    model.save_weights(save_model_file)

    acc = history.history['acc']
    val_acc = history.history['val_acc']
    loss = history.history['loss']
    val_loss = history.history['val_loss']
    epochs = range(1, len(acc) + 1)

    plt.plot(epochs, acc, 'bo', label='Training acc')
    plt.plot(epochs, val_acc, 'b', label='Validation acc')
    plt.title('Training and validation accuracy')
    plt.legend()

    # Separate figure so the loss curves do not overwrite the accuracy plot.
    plt.figure()
    plt.plot(epochs, loss, 'bo', label='Training loss')
    plt.plot(epochs, val_loss, 'b', label='Validation loss')
    plt.title('Training and validation loss')
    plt.legend()

    plt.show()
def predict():
    """Restore the saved weights and classify the module-level ``predict_texts``.

    ``predict_texts`` is passed to ``model.predict`` as is, so it must
    already hold model-ready input when this is called. One line is printed
    per sample: negative when the score is below 0.3, positive otherwise.
    """
    model = my_model(max_words, embedding_dim, maxlen)
    model.compile(optimizer='rmsprop', loss='binary_crossentropy', metrics=['acc'])
    save_model_file = 'pre_trained_glove_model.h5'
    # Without this the model would predict with freshly initialised weights.
    model.load_weights(save_model_file)
    print('model has restored...')
    pred_test = model.predict(predict_texts)
    for i in pred_test:
        if i[0] < 0.3:
            print('this is negtive comment')
        else:
            print('this is positive comment')
if __name__ == "__main__":
    # Load the corpus and build the tokenizer / train-validation split.
    text, label = get_datasource()
    x_train, y_train, x_val, y_val, word_index, tokenizer = text_split(training_samples, validation_samples, text, label)
    # Training path (disabled): enable this line together with the
    # train_model call below to build the embedding matrix and train.
    # embedding_matrix = prepare_Glove(embedding_index, word_index, max_words, embedding_dim)
    # Replace the raw sample strings with padded sequences for predict().
    predict_texts = predict_samples_vec(predict_texts, tokenizer)
    # train_model(embedding_dim, maxlen, embedding_matrix, x_train, y_train, x_val, y_val)
    # Classify the vectorized samples; otherwise the line above has no effect.
    predict()
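The following two functions, `prepare_Glove` and `train_model`, load the pre-trained GloVe embedding and use it to train the model: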
def prepare_Glove(embedding_index, word_index, max_words, embedding_dim,
                  glove_dir='D:/DL/keras/glove.6B'):
    """Build an embedding matrix for the corpus vocabulary from GloVe vectors.

    Args:
        embedding_index: Dict that is filled in place with ``word -> vector``
            entries parsed from ``glove.6B.300d.txt``.
        word_index: Mapping ``word -> integer index`` for the corpus.
        max_words: Number of rows in the returned matrix; words whose index
            is ``>= max_words`` are skipped.
        embedding_dim: Number of columns in the returned matrix.
        glove_dir: Directory containing ``glove.6B.300d.txt``. Defaults to the
            path the script was originally hard-coded to.

    Returns:
        A ``(max_words, embedding_dim)`` array. Rows for words that have no
        pre-trained vector stay all-zero.
    """
    glove_path = os.path.join(glove_dir, 'glove.6B.300d.txt')
    # Context manager so the (large) vector file is always closed.
    with open(glove_path, 'r', encoding='UTF-8') as f:
        for line in f:
            values = line.split()
            if not values:
                # Blank line: nothing to parse.
                continue
            word = values[0]
            coefs = np.asarray(values[1:], dtype='float32')
            embedding_index[word] = coefs
    print('Found %s word vectors.' % len(embedding_index))
    # word_index contains the words in corpus, embedding_index is the pre-trained embedding
    embedding_matrix = np.zeros((max_words, embedding_dim))
    for word, i in word_index.items():
        if i < max_words:
            embedding_vector = embedding_index.get(word)
            if embedding_vector is not None:
                embedding_matrix[i] = embedding_vector
    return embedding_matrix
def train_model(embedding_dim, maxlen, embedding_matrix, x_train, y_train, x_val, y_val):
model = my_model(max_words, embedding_dim, maxlen)
model.layers[0].set_weights([embedding_matrix]) # 加载预训练embedding
model.layers[0].trainable = False
model.compile(optimizer='rmsprop', loss='binary_crossentropy', metrics=['acc'])
save_model_file = 'pre_trained_glove_model.h5'