Processing sequence data requires two steps: first encode the tokens (embedding), then handle the fact that sequences have different lengths (the variable-length input problem).
1. Embedding
a. One-hot encoding: word -> index -> [0,0,...,1,0,...] (sparse)
b. (Dense) embedding: word -> [1.2,2.4,...,0.2,...] (dense)
2. Variable-length input
a. Padding
word indices: [3,2,5,1] -> after padding: [3,2,5,1,0,0,...] (filled with zeros)
b. Pooling
words -> embedding -> combine (e.g., average the per-word vectors)
3. Drawbacks of embedding + padding/pooling:
a. Information loss: pooling discards word order, and padding/truncation can drop content
b. Computational inefficiency: the padded positions add many wasted operations
These drawbacks motivated the later RNN and LSTM architectures. Below, based on the IMDB dataset (which involves text preprocessing), we implement a simple embedding + padding + pooling + fully connected network.
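To make the three points above concrete before the full example, here is a minimal standalone sketch using plain numpy (the tiny vocabulary and the random embedding matrix are purely illustrative):

import numpy as np

vocab = {'<PAD>': 0, 'the': 1, 'movie': 2, 'was': 3, 'great': 4}

# a. one-hot: each word id becomes a sparse vector of length len(vocab)
one_hot_movie = np.eye(len(vocab))[vocab['movie']]   # [0. 0. 1. 0. 0.]

# b. dense embedding: a (normally learned) lookup table, one short row per word
embedding_dim = 4
embedding_matrix = np.random.randn(len(vocab), embedding_dim)

# padding: append the <PAD> id (0) so both sentences have the same length
sentences = [[1, 2, 3, 4], [2, 3]]
max_len = 4
padded = np.array([s + [0] * (max_len - len(s)) for s in sentences])

# pooling: look up each word's vector, then average over the word axis,
# collapsing (batch, max_len, embedding_dim) -> (batch, embedding_dim)
embedded = embedding_matrix[padded]
pooled = embedded.mean(axis=1)
print(padded)          # [[1 2 3 4] [2 3 0 0]]
print(pooled.shape)    # (2, 4)

In the real model below, the embedding matrix is a trainable Keras layer rather than random numbers, and Keras utilities handle the padding and pooling.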
import matplotlib as mpl
import matplotlib.pyplot as plt
%matplotlib inline
import numpy as np
import sklearn
import pandas as pd
import os
import sys
import time
import tensorflow as tf
from tensorflow import keras
# use the IMDB dataset: binary classification of movie reviews
imdb = keras.datasets.imdb
vocab_size = 10000    # vocabulary size: keep only the 10,000 most frequent words
index_from = 3        # offset word indices by 3 to reserve low ids for special tokens
(train_data, train_labels), (test_data, test_labels) = imdb.load_data(
    num_words=vocab_size, index_from=index_from)
# print(train_data[0],train_labels[0])
print(train_data.shape,train_labels.shape)
print(len(train_data[0]),len(train_data[1]))
print(test_data.shape,test_labels.shape)
word_index = imdb.get_word_index()  # load the word -> index vocabulary
print(len(word_index))
print(word_index)
word_index = {k: (v + 3) for k, v in word_index.items()}
# special tokens take the reserved low ids
word_index['<PAD>'] = 0
word_index['<START>'] = 1
word_index['<UNK>'] = 2
word_index['<UNUSED>'] = 3
reverse_word_index = dict([(value, key) for key, value in word_index.items()])
# decode train_data: word ids -> words
def decode_review(text_ids):
    return ' '.join([reverse_word_index.get(word_id, '<UNK>')
                     for word_id in text_ids])
decode_review(train_data[0])
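With the shifted index in place, decode_review(train_data[0]) returns a readable review that starts with the <START> token, and any word outside the 10,000-word vocabulary shows up as <UNK>.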
max_length = 500
# pad (or truncate) every review to the same length
train_data = keras.preprocessing.sequence.pad_sequences(
    train_data,                  # list of lists
    value=word_index['<PAD>'],
    padding='post',              # post: pad at the end; pre: pad at the beginning
    maxlen=max_length)
test_data = keras.preprocessing.sequence.pad_sequences(
    test_data,                   # list of lists
    value=word_index['<PAD>'],
    padding='post',
    maxlen=max_length)
print(train_data[0])
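As a quick standalone sanity check of pad_sequences (note that reviews longer than maxlen are truncated as well, from the front by default, since truncating defaults to 'pre'):

demo = keras.preprocessing.sequence.pad_sequences(
    [[3, 2, 5, 1]], value=0, padding='post', maxlen=6)
print(demo)   # [[3 2 5 1 0 0]] -- zeros appended
demo = keras.preprocessing.sequence.pad_sequences(
    [[3, 2, 5, 1]], value=0, padding='pre', maxlen=6)
print(demo)   # [[0 0 3 2 5 1]] -- zeros prepended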
embedding_dim = 16
batch_size = 128
model = keras.models.Sequential([
    keras.layers.Embedding(vocab_size, embedding_dim, input_length=max_length),
    # 1. defines a lookup matrix of shape [vocab_size, embedding_dim]
    # 2. each word id in a sentence such as [1,2,3,4,...] is mapped to its
    #    embedding_dim vector, so one sentence becomes max_length * embedding_dim
    # 3. output shape: batch_size * max_length * embedding_dim
    keras.layers.GlobalAveragePooling1D(),
    # averages over the time axis:
    # batch_size * max_length * embedding_dim -> batch_size * embedding_dim
    keras.layers.Dense(64, activation='relu'),
    keras.layers.Dense(1, activation='sigmoid')
])
model.summary()
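To verify the shape flow described in the comments, one can push a dummy batch through the layers (a sketch; the all-zeros input is arbitrary and decodes to pure <PAD>):

dummy = np.zeros((2, max_length), dtype=np.int32)  # fake batch of 2 padded reviews
print(model.layers[0](dummy).shape)  # (2, 500, 16): batch * max_length * embedding_dim
print(model(dummy).shape)            # (2, 1): one sigmoid probability per review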
model.compile(optimizer = 'adam',loss = 'binary_crossentropy',metrics = ['accuracy'])
epochs = 30
history = model.fit(train_data, train_labels, epochs=epochs,
                    batch_size=batch_size,
                    validation_split=0.2)
def plot_learning_curves(history, label, epochs, min_value, max_value):
    data = {}
    data[label] = history.history[label]
    data['val_' + label] = history.history['val_' + label]
    pd.DataFrame(data).plot(figsize=(8, 5))
    plt.grid(True)
    plt.axis([0, epochs, min_value, max_value])
    plt.show()
plot_learning_curves(history, 'accuracy',epochs, 0, 1)
plot_learning_curves(history, 'loss',epochs, 0, 1)
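If the validation curves plateau or deteriorate while the training curves keep improving, the model is overfitting; with 30 epochs and no regularization that is a plausible outcome here, and fewer epochs, dropout, or early stopping would be natural remedies.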
model.evaluate(test_data,test_labels,batch_size = batch_size)
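Finally, scoring a new review requires pushing it through the same index-and-pad pipeline. A minimal sketch, assuming simple whitespace tokenization (the helper encode_review and the sample sentence are illustrative, not part of the original):

def encode_review(text):
    ids = [word_index['<START>']]
    for w in text.lower().split():
        idx = word_index.get(w, word_index['<UNK>'])
        # ids >= vocab_size were cut off by num_words, so treat them as <UNK>
        ids.append(idx if idx < vocab_size else word_index['<UNK>'])
    return keras.preprocessing.sequence.pad_sequences(
        [ids], value=word_index['<PAD>'], padding='post', maxlen=max_length)

sample = encode_review("this movie was just brilliant")
print(model.predict(sample))  # sigmoid output: closer to 1 means predicted positive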