TensorFlow 2.0 Study Notes: Embedding, Padding, and Pooling

To work with sequence data, the words first have to be encoded (embedding), and then the unequal lengths of the sequences have to be handled (the variable-length input problem).

1. Embedding

a. One-hot encoding: word -> index -> [0,0,...,1,0,...] (sparse)
b. (Dense) embedding: word -> [1.2,2.4,...,0.2,...] (dense; see the sketch below)
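A minimal sketch contrasting the two representations (the vocabulary size, embedding dimension, and word index are toy values assumed only for illustration):

import tensorflow as tf

vocab_size_demo = 8     # toy vocabulary size (assumed for illustration)
embedding_dim_demo = 4  # toy embedding dimension (assumed for illustration)
word_id = 3             # index of some word in the vocabulary

# One-hot: a sparse vector of length vocab_size with a single 1
one_hot = tf.one_hot(word_id, depth=vocab_size_demo)
print(one_hot.numpy())  # [0. 0. 0. 1. 0. 0. 0. 0.]

# Dense embedding: a trainable lookup table of shape [vocab_size, embedding_dim]
embedding = tf.keras.layers.Embedding(vocab_size_demo, embedding_dim_demo)
dense_vec = embedding(tf.constant([word_id]))
print(dense_vec.shape)  # (1, 4): one short dense vector instead of a long sparse one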

2. Variable-length input

a. Padding
   word index: [3,2,5,1] -> padded: [3,2,5,1,0,0,...] (filled with zeros to a fixed length)
b. Pooling
   words -> embedding -> combined (e.g., averaged) into one fixed-size vector (see the sketch below)
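A minimal sketch of both steps on two made-up sequences (the indices and layer sizes are toy values assumed only for illustration):

import tensorflow as tf
from tensorflow import keras

seqs = [[3, 2, 5, 1], [7, 4]]  # two toy sequences of word indices with different lengths

# a. Padding: append 0s so both sequences reach the same length
padded = keras.preprocessing.sequence.pad_sequences(
    seqs, value=0, padding='post', maxlen=6)
print(padded)
# [[3 2 5 1 0 0]
#  [7 4 0 0 0 0]]

# b. Pooling: embed each index, then average over the time dimension
emb = keras.layers.Embedding(input_dim=10, output_dim=4)  # toy vocab of 10, 4-dim vectors
pooled = keras.layers.GlobalAveragePooling1D()(emb(tf.constant(padded)))
print(pooled.shape)  # (2, 4): one fixed-size vector per variable-length sequence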

3. Drawbacks of embedding + padding + pooling:

a. Information loss: pooling discards word order, and padding/truncation can drop or dilute content
b. Low computational efficiency: much of the computation is wasted on padded positions

These drawbacks are what motivated later architectures such as RNNs and LSTMs. Below, using the IMDB dataset (which also covers basic text preprocessing), we build a simple embedding + padding + pooling + fully connected network.

import matplotlib as mpl
import matplotlib.pyplot as plt
%matplotlib inline
import numpy as np
import sklearn
import pandas as pd
import os
import sys
import time
import tensorflow as tf

from tensorflow import keras

# Load the IMDB dataset: binary sentiment classification of movie reviews
imdb = keras.datasets.imdb
vocab_size = 10000  # keep only the 10,000 most frequent words
index_from = 3      # offset word indices by 3 to leave room for special tokens
(train_data,train_labels),(test_data,test_labels) = imdb.load_data(
    num_words = vocab_size,index_from = index_from)
# print(train_data[0],train_labels[0])
print(train_data.shape,train_labels.shape)
print(len(train_data[0]),len(train_data[1]))
print(test_data.shape,test_labels.shape)
word_index = imdb.get_word_index()  # load the word -> index vocabulary
print(len(word_index))
print(word_index)
word_index = {k: (v + 3) for k, v in word_index.items()}  # shift indices to match index_from
# special tokens
word_index['<PAD>'] = 0
word_index['<START>'] = 1
word_index['<UNK>'] = 2
word_index['<UNUSED>'] = 3

reverse_word_index = dict([(value,key) for key,value in word_index.items()])

# Decode a review: list of word indices -> text
def decode_review(text_ids):
    return ' '.join([reverse_word_index.get(word_id, '<UNK>')
                     for word_id in text_ids])
decode_review(train_data[0])

max_length = 500

# Pad every review to the same length
train_data = keras.preprocessing.sequence.pad_sequences(
    train_data,  # list of lists
    value=word_index['<PAD>'],
    padding='post',  # post: pad at the end; pre: pad at the beginning
    maxlen=max_length
)

test_data = keras.preprocessing.sequence.pad_sequences(
    test_data,  # list of lists
    value=word_index['<PAD>'],
    padding='post',  # post: pad at the end; pre: pad at the beginning
    maxlen=max_length
)

print(train_data[0])
embedding_dim = 16
batch_size = 128
model = keras.models.Sequential([
    
    keras.layers.Embedding(vocab_size,embedding_dim,input_length=max_length),
    # 1. Defines an embedding matrix of shape [vocab_size, embedding_dim]
    # 2. Each word index in a sentence like [1,2,3,4,...] is looked up as an
    #    embedding_dim vector, so one sentence becomes max_length * embedding_dim
    # 3. Output shape: batch_size * max_length * embedding_dim

    keras.layers.GlobalAveragePooling1D(),
    # Averages over the time axis: batch_size * max_length * embedding_dim -> batch_size * embedding_dim
    
    keras.layers.Dense(64,activation='relu'),
    keras.layers.Dense(1,activation='sigmoid')
])

model.summary()
model.compile(optimizer = 'adam',loss = 'binary_crossentropy',metrics = ['accuracy'])
epochs = 30
history = model.fit(train_data,train_labels,epochs = epochs,
                    batch_size = batch_size,
                    validation_split = 0.2)
def plot_learning_curves(history, label, epochs, min_value, max_value):
    data = {}
    data[label]=history.history[label]
    data['val_'+label]=history.history['val_'+label]
    pd.DataFrame(data).plot(figsize=(8, 5))
    plt.grid(True)
    plt.axis([0, epochs, min_value, max_value])
    plt.show()
    
plot_learning_curves(history, 'accuracy',epochs, 0, 1)
plot_learning_curves(history, 'loss',epochs, 0, 1)
model.evaluate(test_data,test_labels,batch_size = batch_size)
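As a quick follow-up (not part of the original notes), the sigmoid outputs can be thresholded at 0.5 to get predicted labels for individual test reviews:

probs = model.predict(test_data, batch_size=batch_size)  # probabilities in [0, 1]
pred_labels = (probs > 0.5).astype('int32').reshape(-1)   # 1 = positive, 0 = negative
print(pred_labels[:10])
print(test_labels[:10])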
