in.txt contains the first line of each couplet, i.e. the part used as model input.
out.txt contains the second line, i.e. the part we want to predict; the lines in the two files correspond to each other one-to-one.
import pandas as pd
import numpy as np
from gensim.models.word2vec import LineSentence, Word2Vec
import tensorflow as tf
import time
# Load the data
root = 'data/couplet'
with open(root + "/train/in.txt", "r") as f:
    data_in = f.read()
with open(root + "/train/out.txt", "r") as f:
    data_out = f.read()
train_X = data_in.split("\n")
train_Y = data_out.split("\n")
train_X = [data.split() for data in train_X]
train_Y = [data.split() for data in train_Y]
train_Y[1:3]
Output: [['万', '方', '乐', '奏', '有', '于', '阗'], ['绿', '柳', '堤', '新', '燕', '复', '来']]
import itertools
# Collect every character
words = list(itertools.chain.from_iterable(train_X)) + list(itertools.chain.from_iterable(train_Y))
# Deduplicate
words = set(words)
# Build the vocab
vocab = {word: index + 1 for index, word in enumerate(words)}
# Add the unk token
vocab["unk"] = 0
Here itertools.chain.from_iterable flattens the nested lists into one flat sequence of characters.
Example:
import itertools
# Flatten the nested lists to get all characters
train_X1=[['万', '方', '乐', '奏', '有', '于', '阗'], ['绿', '柳', '堤', '新', '燕', '复', '来']]
list(itertools.chain.from_iterable(train_X1))
Output: ['万', '方', '乐', '奏', '有', '于', '阗', '绿', '柳', '堤', '新', '燕', '复', '来']
This gives us a vocabulary for the dataset. Next, convert each character in the data to its index in the vocabulary:
from tensorflow.keras.preprocessing.sequence import pad_sequences
# Convert characters to indices
train_X_ids = [[vocab.get(word, 0) for word in sen] for sen in train_X]
train_Y_ids = [[vocab.get(word, 0) for word in sen] for sen in train_Y]
# Pad to a fixed length
train_X_ids = pad_sequences(train_X_ids, maxlen=100, padding='post')
train_Y_ids = pad_sequences(train_Y_ids, maxlen=100, padding='post')
# Expand the label dimension
train_Y_ids = train_Y_ids.reshape(*train_Y_ids.shape, 1)
vocab.get(word, 0) returns the character's index if the character is in the vocabulary, and 0 (i.e. "unk") otherwise.
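A minimal illustration of that fallback behaviour, using a made-up toy_vocab instead of the real vocabulary:
toy_vocab = {"unk": 0, "山": 1, "河": 2}
print(toy_vocab.get("山", 0))   # 1: the character is in the vocab
print(toy_vocab.get("海", 0))   # 0: unknown characters fall back to "unk"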
The pad_sequences calls pad every sequence to the same length (100), which makes it easy to feed the data to the model later. Note that the labels train_Y_ids need one extra dimension, because Keras's sparse_categorical_crossentropy loss expects 3-dimensional targets.
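As a quick sanity check (num_samples is simply the number of lines in the training files), the arrays should now have these shapes:
print(train_X_ids.shape)   # (num_samples, 100)
print(train_Y_ids.shape)   # (num_samples, 100, 1) after the reshape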
from tensorflow.keras.models import Model,Sequential
from tensorflow.keras.layers import GRU, LSTM,Input, Dense, TimeDistributed, Activation, RepeatVector, Bidirectional
from tensorflow.keras.layers import Embedding
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.losses import sparse_categorical_crossentropy
def seq2seq_model(input_length, output_sequence_length, vocab_size):
    model = Sequential()
    model.add(Embedding(input_dim=vocab_size, output_dim=128, input_length=input_length))
    model.add(Bidirectional(LSTM(128, return_sequences=False)))
    model.add(Dense(128, activation="relu"))
    model.add(RepeatVector(output_sequence_length))
    model.add(Bidirectional(LSTM(128, return_sequences=True)))
    model.add(TimeDistributed(Dense(vocab_size, activation='softmax')))
    model.compile(loss=sparse_categorical_crossentropy,
                  optimizer=Adam(1e-3))
    model.summary()
    return model

model = seq2seq_model(train_X_ids.shape[1], train_Y_ids.shape[1], len(vocab))
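To make the encoder-decoder structure easier to follow, these are the output shapes each layer produces (batch dimension omitted), assuming the padded length of 100 used above:
# Embedding               -> (100, 128)        one 128-d vector per input character
# Bidirectional(LSTM)     -> (256,)            forward + backward final states: the "sentence vector"
# Dense                   -> (128,)
# RepeatVector(100)       -> (100, 128)        copy the sentence vector to every output timestep
# Bidirectional(LSTM)     -> (100, 256)
# TimeDistributed(Dense)  -> (100, vocab_size) per-position distribution over the vocabulary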
Once the model is built, we can start training. All that remains is to feed the data to the model and choose a batch_size and number of epochs.
model.fit(train_X_ids, train_Y_ids, batch_size=64, epochs=10, validation_split=0.2)
import numpy as np
input_sen = "国破山河在"
# Map each character to its vocabulary index
char2id = [vocab.get(i, 0) for i in input_sen]
# Pad to length 100 (pad_sequences pads at the front by default)
input_data = pad_sequences([char2id], 100)
# Predict and keep the last len(input_sen) timesteps
result = model.predict(input_data)[0][-len(input_sen):]
result_label = [np.argmax(i) for i in result]
# Reverse the vocabulary: index -> character
dict_res = {i: j for j, i in vocab.items()}
print([dict_res.get(i) for i in result_label])
输出["人","来","日","月","长"]