采用RNN和unicode分词进行文本生成
这里我们使用tensorflow
实现,代码如下:
# 完整代码在这里
import tensorflow as tf
import keras_nlp
import numpy as np
tokenizer = keras_nlp.tokenizers.UnicodeCodepointTokenizer(vocabulary_size=400)
# tokens - ids
ids = tokenizer(['Why are you so funny?', 'how can i get you'])
# ids - tokens
tokenizer.detokenize(ids)
def split_input_target(sequence):
input_text = sequence[:-1]
target_text = sequence[1:]
return input_text, target_text
# 准备数据
text = open('./shakespeare.txt', 'rb').read().decode(encoding='utf-8')
dataset = tf.data.Dataset.from_tensor_slices(tokenizer(text))
dataset = dataset.batch(64, drop_remainder=True)
dataset = dataset.map(split_input_target).batch(64)
input, ouput = dataset.take(1).get_single_element()
# 定义模型
d_model = 512
rnn_units = 1025
class CustomModel(tf.keras.Model):
def __init__(self, vocabulary_size, d_model, rnn_units):
super().__init__(self)
self.embedding = tf.keras.layers.Embedding(vocabulary_size, d_model)
self.gru = tf.keras.layers.GRU(rnn_units, return_sequences=True, return_state=True)
self.dense = tf.keras.layers.Dense(vocabulary_size, activation='softmax')
def call(self, inputs, states=None, return_state=False, training=False):
x = inputs
x = self.embedding(x)
if states is None:
states = self.gru.get_initial_state(x)
x, states = self.gru(x, initial_state=states, training=training)
x = self.dense(x, training=training)
if return_state:
return x, states
else:
return x
model = CustomModel(tokenizer.vocabulary_size(), d_model, rnn_units)
# 查看模型结构
model(input)
model.summary()
# 模型配置
model.compile(
loss = tf.losses.SparseCategoricalCrossentropy(),
optimizer='adam',
metrics=['accuracy']
)
# 模型训练
model.fit(dataset, epochs=3)
# 模型推理
class InferenceModel(tf.keras.Model):
def __init__(self, model, tokenizer):
super().__init__(self)
self.model = model
self.tokenizer = tokenizer
def generate(self, inputs, length, return_states=False):
inputs = inputs = tf.constant(inputs)[tf.newaxis]
states = None
input_ids = self.tokenizer(inputs).to_tensor()
outputs = []
for i in range(length):
predicted_logits, states = model(inputs=input_ids, states=states, return_state=True)
input_ids = tf.argmax(predicted_logits, axis=-1)
outputs.append(input_ids[0][-1].numpy())
outputs = self.tokenizer.detokenize(lst).numpy().decode('utf-8')
if return_states:
return outputs, states
else:
return outputs
infere = InferenceModel(model, tokenizer)
# 开始推理
start_chars = 'hello'
outputs = infere.generate(start_chars, 1000)
print(start_chars + outputs)
先导包tensorflow
, keras_nlp
, numpy
import tensorflow as tf
import keras_nlp
import numpy as np
数据来自莎士比亚的作品 storage.googleapis.com/download.tensorflow.org/data/shakespeare.txt;我们将其下载下来存储为shakespeare.txt
这里我们使用unicode分词:将所有字符都作为一个词来进行分词
tokenizer = keras_nlp.tokenizers.UnicodeCodepointTokenizer(vocabulary_size=400)
# tokens - ids
ids = tokenizer(['Why are you so funny?', 'how can i get you'])
# ids - tokens
tokenizer.detokenize(ids)
利用tokenizer
和text
数据构建数据集
def split_input_target(sequence):
input_text = sequence[:-1]
target_text = sequence[1:]
return input_text, target_text
text = open('./shakespeare.txt', 'rb').read().decode(encoding='utf-8')
dataset = tf.data.Dataset.from_tensor_slices(tokenizer(text))
dataset = dataset.batch(64, drop_remainder=True)
dataset = dataset.map(split_input_target).batch(64)
input, ouput = dataset.take(1).get_single_element()
d_model = 512
rnn_units = 1025
class CustomModel(tf.keras.Model):
def __init__(self, vocabulary_size, d_model, rnn_units):
super().__init__(self)
self.embedding = tf.keras.layers.Embedding(vocabulary_size, d_model)
self.gru = tf.keras.layers.GRU(rnn_units, return_sequences=True, return_state=True)
self.dense = tf.keras.layers.Dense(vocabulary_size, activation='softmax')
def call(self, inputs, states=None, return_state=False, training=False):
x = inputs
x = self.embedding(x)
if states is None:
states = self.gru.get_initial_state(x)
x, states = self.gru(x, initial_state=states, training=training)
x = self.dense(x, training=training)
if return_state:
return x, states
else:
return x
model = CustomModel(tokenizer.vocabulary_size(), d_model, rnn_units)
# 查看模型结构
model(input)
model.summary()
model.compile(
loss = tf.losses.SparseCategoricalCrossentropy(),
optimizer='adam',
metrics=['accuracy']
)
model.fit(dataset, epochs=3)
定义一个InferenceModel
进行模型推理配置;
class InferenceModel(tf.keras.Model):
def __init__(self, model, tokenizer):
super().__init__(self)
self.model = model
self.tokenizer = tokenizer
def generate(self, inputs, length, return_states=False):
inputs = inputs = tf.constant(inputs)[tf.newaxis]
states = None
input_ids = self.tokenizer(inputs).to_tensor()
outputs = []
for i in range(length):
predicted_logits, states = model(inputs=input_ids, states=states, return_state=True)
input_ids = tf.argmax(predicted_logits, axis=-1)
outputs.append(input_ids[0][-1].numpy())
outputs = self.tokenizer.detokenize(lst).numpy().decode('utf-8')
if return_states:
return outputs, states
else:
return outputs
infere = InferenceModel(model, tokenizer)
start_chars = 'hello'
outputs = infere.generate(start_chars, 1000)
print(start_chars + outputs)
生成结果如下所示,感觉很差:
hellonofur us:
medous, teserwomador.
walled o y.
as
t aderemowate tinievearetyedust. manonels,
w?
workeneastily.
watrenerdores aner'shra
palathermalod, te a y, s adousced an
ptit: mamerethus:
bas as t: uaruriryedinesm's lesoureris lares palit al ancoup, maly thitts?
b veatrt
watyeleditenchitr sts, on fotearen, medan ur
tiblainou-lele priniseryo, ofonet manad plenerulyo
thilyr't th
palezedorine.
ti dous slas, sed, ang atad t,
wanti shew.
e
upede wadraredorenksenche:
wedemen stamesly ateara tiafin t t pes:
t: tus mo at
io my.
ane hbrelely berenerusedus' m tr;
p outellilid ng
ait tevadwantstry.
arafincara, es fody
'es pra aluserelyonine
pales corseryea aburures
angab:
sunelyothe: s al, chtaburoly o oonis s tioute tt,
pro.
tedeslenali: s 't ing h
sh, age de, anet: hathes: s es'tht,
as:
wedly at s serinechamai:
mored t.
t monatht t athoumonches le.
chededondirineared
t
er
p y
letinalys
ani
aconen,
t rs:
t;et, tes-
luste aly,
thonort aly one telus, s mpsantenam ranthinarrame! a
pul; bon
s fofuly
RNN结合unicode分词能进行文本生成但是效果一言难尽!