Deep learning models can only process numeric tensors, so text must first be converted into numeric tensors; this is called text vectorization. The text is split into tokens (words, characters, or n-grams), and the most common ways to associate tokens with vectors are one-hot encoding and word embeddings.
An obvious drawback of one-hot encoding is that it cannot meaningfully quantify the distance between tokens. Suppose we have three words, human, dog, and cat, encoded as [1,0,0], [0,1,0], and [0,0,1]: the distance between human and dog is then exactly the same as the distance between dog and cat, which is clearly unreasonable!
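To make that concrete, here is a minimal NumPy sketch (my own illustration, not from the original notes): every pair of one-hot vectors has the same Euclidean distance and a zero dot product, so the encoding says nothing about which words are semantically closer.

import numpy as np

human, dog, cat = np.eye(3)  # the one-hot vectors [1,0,0], [0,1,0], [0,0,1]
print(np.linalg.norm(human - dog))  # ~1.414
print(np.linalg.norm(dog - cat))    # ~1.414, identical: human/dog looks no closer than dog/cat
print(human @ dog, dog @ cat)       # 0.0 0.0, every pair is orthogonal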
Word embeddings address this: each dimension carries some meaning, so semantically similar words end up with similar embedding vectors. Originally, every word was assigned one fixed embedding, which is still unsatisfactory for polysemous words; current approaches therefore generally compute a contextual embedding for each token instead.
The point of word embeddings is to map human language into a geometric space, so that the geometric relationships between word vectors reflect the semantic relationships between the words. What makes a good embedding space depends largely on the task.
An Embedding layer can be thought of as a dictionary: it takes integer indices as input and returns the associated vectors (word index -> word vector).
The Embedding layer's weights start out random; during training, backpropagation gradually adjusts these word vectors.
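A minimal sketch of this dictionary behavior (my own illustration, with made-up sizes and indices): an untrained Embedding layer simply looks up one random vector per integer index.

import numpy as np
from keras.models import Sequential
from keras.layers import Embedding

model = Sequential()
model.add(Embedding(input_dim=1000, output_dim=8, input_length=3))  # vocabulary of 1000 tokens, 8-dim vectors
vectors = model.predict(np.array([[4, 20, 7]]))  # look up three word indices
print(vectors.shape)  # (1, 3, 8): one (still random) 8-dimensional vector per index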
Network architecture:
Code walkthrough:
from keras.datasets import imdb
from keras import preprocessing
from keras.models import Sequential
from keras.layers import Flatten, Dense, Embedding
max_features = 10000 #number of words to consider as features
maxlen = 20 #cut the reviews off after this many words
(x_train, y_train), (x_test, y_test) = imdb.load_data(num_words=max_features) #load the data as lists of integers
#turn the integer lists into a 2D integer tensor of shape (samples, maxlen)
x_train = preprocessing.sequence.pad_sequences(x_train, maxlen=maxlen)
x_test = preprocessing.sequence.pad_sequences(x_test, maxlen=maxlen)
model = Sequential()
model.add(Embedding(10000, 8, input_length=maxlen)) #vocabulary of 10000 tokens, 8-dimensional embeddings, maximum input length maxlen
model.add(Flatten()) #flatten the 3D embedding output into a 2D tensor
model.add(Dense(1, activation='sigmoid')) #add the classifier on top
model.compile(optimizer='rmsprop', loss = 'binary_crossentropy', metrics=['acc'])
print(model.summary())
history = model.fit(x_train, y_train, epochs = 10, batch_size=32, validation_split=0.2)
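After training, it can also be worth checking how the model does on the held-out test data; a short follow-up (my addition, reusing the variables defined above):

test_loss, test_acc = model.evaluate(x_test, y_test)
print('test acc:', test_acc)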
The core idea of GloVe, a method for training word vectors, is to factorize a word-word co-occurrence matrix to obtain word representations. You first need to download glove.6B.zip from https://nlp.stanford.edu/projects/glove and unzip it.
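For reference (my summary of the published GloVe objective, not part of the original notes), training is roughly a weighted least-squares fit of word-vector dot products to log co-occurrence counts:

J = Σ_{i,j} f(X_ij) (w_i · c_j + b_i + b_j' − log X_ij)²

where X_ij counts how often word j occurs in the context of word i, w_i and c_j are the word and context vectors, b_i and b_j' are biases, and f is a weighting function that damps very rare and very frequent pairs.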
Code walkthrough:
import os
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
import numpy as np
from keras.models import Sequential
from keras.layers import Flatten, Dense, Embedding
import matplotlib.pyplot as plt
imdb_dir = './aclImdb/aclImdb' #dataset root
train_dir = imdb_dir + '/train' #training set
test_dir = imdb_dir + '/test' #test set
glove_dir = './glove.6B' #pre-trained GloVe vectors
labels = [] #holds the labels
texts = []
#label negative reviews 0 and positive reviews 1
for label_type in ['neg', 'pos']:
    dir_name = train_dir + '/' + label_type
    for fname in os.listdir(dir_name):
        if fname[-4:] == '.txt':
            f = open(os.path.join(dir_name, fname), encoding='gb18030', errors='ignore')
            texts.append(f.read())
            f.close()
            if label_type == 'neg':
                labels.append(0)
            else:
                labels.append(1)
test_labels = [] #holds the labels
test_texts = []
#label negative reviews 0 and positive reviews 1
for label_type in ['neg', 'pos']:
    dir_name = test_dir + '/' + label_type
    for fname in os.listdir(dir_name):
        if fname[-4:] == '.txt':
            f = open(os.path.join(dir_name, fname), encoding='gb18030', errors='ignore')
            test_texts.append(f.read())
            f.close()
            if label_type == 'neg':
                test_labels.append(0)
            else:
                test_labels.append(1)
maxlen = 100 #cut reviews off after 100 words
training_samples = 200 #train on only 200 samples
validation_samples = 10000 #validate on 10000 samples
max_words = 10000 #consider only the top 10000 words in the dataset
tokenizer = Tokenizer(num_words=max_words) #create the tokenizer
tokenizer.fit_on_texts(texts) #build the word index
sequences = tokenizer.texts_to_sequences(texts) #turn the strings into lists of integer indices
word_index = tokenizer.word_index #recover the computed word index
print('Found %s unique tokens.'%len(word_index))
data = pad_sequences(sequences, maxlen=maxlen)
labels = np.asarray(labels)
print('Shape of data tensor:', data.shape)
print('Shape of label tensor:', labels.shape)
#shuffle the data (the samples are ordered: all negative reviews first, then all positive)
indices = np.arange(data.shape[0])
np.random.shuffle(indices) #shuffle the indices
data = data[indices]
labels = labels[indices]
x_train = data[:training_samples]
y_train = labels[:training_samples]
x_val = data[training_samples: training_samples + validation_samples]
y_val = labels[training_samples: training_samples + validation_samples]
sequences = tokenizer.texts_to_sequences(test_texts) #turn the test strings into lists of integer indices
x_test = pad_sequences(sequences, maxlen=maxlen)
y_test = np.array(test_labels)
#preprocess GloVe: build an index mapping each word (string) to its vector representation (a numeric vector)
embeddings_index = {}
f = open(glove_dir + '/glove.6B.100d.txt', encoding='gb18030', errors='ignore')
for line in f:
    values = line.split()
    word = values[0] #the first entry on each line is the word
    coefs = np.asarray(values[1:], dtype='float32') #the rest is its embedding_dim-dimensional vector
    embeddings_index[word] = coefs
f.close()
print('Found %s word vectors.' %len(embeddings_index))
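As a quick sanity check (my own addition; it assumes the 100-dimensional vectors loaded above and a couple of common words present in glove.6B), related words should show a noticeably higher cosine similarity than unrelated ones:

def cosine(a, b):
    return np.dot(a, b) / (np.linalg.norm(a) * np.linalg.norm(b))

print(cosine(embeddings_index['good'], embeddings_index['great']))  # should be relatively high
print(cosine(embeddings_index['good'], embeddings_index['table']))  # should be noticeably lower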
#prepare the embedding matrix
embedding_dim = 100
embedding_matrix = np.zeros((max_words, embedding_dim))
for word, i in word_index.items():
    if i < max_words:
        embedding_vector = embeddings_index.get(word)
        if embedding_vector is not None: #words missing from GloVe keep an all-zeros row
            embedding_matrix[i] = embedding_vector
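A small optional check (my addition) of how much of the tokenizer's vocabulary the pre-trained vectors actually cover:

covered = sum(1 for word, i in word_index.items()
              if i < max_words and word in embeddings_index)
print('GloVe covers %d of the top %d words' % (covered, max_words))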
model = Sequential()
model.add(Embedding(max_words, embedding_dim, input_length=maxlen)) #add the Embedding layer
model.add(Flatten()) #flatten the tensor
model.add(Dense(32, activation='relu'))
model.add(Dense(1, activation='sigmoid'))
print(model.summary())
model.layers[0].set_weights([embedding_matrix]) #load the pre-trained word embeddings into the Embedding layer
model.layers[0].trainable = False #freeze it so training does not destroy the pre-trained vectors
model.compile(optimizer='rmsprop', loss = 'binary_crossentropy', metrics=['acc']) #compile the network
history = model.fit(x_train, y_train, epochs = 10, batch_size=32, validation_data=(x_val, y_val)) #train
model.save_weights('pre_trained_glove_model.h5') #save the weights
model.load_weights('./pre_trained_glove_model.h5')
test_loss , test_acc = model.evaluate(x_test, y_test)
print(test_acc)
#plot the accuracy and loss curves
'''
acc = history.history['acc']
val_acc = history.history['val_acc']
loss = history.history['loss']
val_loss = history.history['val_loss']
epochs = range(1,len(acc) + 1)
plt.plot(epochs, acc, 'bo', label = 'Training acc')
plt.plot(epochs, val_acc, 'b', label = 'Validation acc')
plt.title('Training and validation accuracy')
plt.legend()
plt.figure()
plt.plot(epochs, loss, 'bo', label = 'Training loss')
plt.plot(epochs, val_loss, 'b', label = 'Validation loss')
plt.title('Training and Validation loss')
plt.legend()
plt.show()
'''
GloVe summary