Image Captioning: generating a natural-language description directly from a given image.
This line of work originated in 2014 with Baidu Research's paper "Explain Images with Multimodal Recurrent Neural Networks", which combined a deep convolutional neural network with a deep recurrent network, followed by "Show and Tell: A Neural Image Caption Generator".
Dataset overview
The dataset contains 200k images.
Each image's captions are stored in caption_train_annotations_20170902.json:
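A rough sketch of what one annotation record looks like, inferred from the fields accessed in the preprocessing script below (an image_id plus a list of caption strings per image); the caption strings here are only placeholders:

[
  {
    "image_id": "8f00f3d0f1008e085ab660e70dffced16a8259f6.jpg",
    "caption": [
      "第一条中文描述",
      "第二条中文描述",
      "..."
    ]
  }
]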
A key characteristic of the captions is that they are written in Chinese, so they need different handling from English text: English separates words with spaces, whereas Chinese requires a word-segmentation tool. The most effective one is jieba (结巴分词), installed with pip install jieba:
import jieba

# cut_all=False selects jieba's precise mode
seg_list = jieba.cut("他正在学习深度学习知识", cut_all=False)
print(u"分词结果:" + "/ ".join(seg_list))

Output:
分词结果:他/ 正在/ 学习/ 深度/ 学习/ 知识
jieba segments text with its own built-in dictionary, and you can also register a custom dictionary.
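For instance, in the example above "深度学习" is split into two words; adding it as a custom entry keeps it intact. A minimal sketch using jieba's dictionary APIs (the added word and the dictionary file name are just illustrations):

import jieba

# Register a single custom word at runtime
jieba.add_word("深度学习")

# Or load a whole user dictionary, one "word [freq] [tag]" per line
# jieba.load_userdict("user_dict.txt")

print("/ ".join(jieba.cut("他正在学习深度学习知识", cut_all=False)))
# Expected: 他/ 正在/ 学习/ 深度学习/ 知识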
The function pack_padded_sequence packs sequences that have already been padded: a padded batch contains many filler values, which complicate the RNN computation and waste compute. The resulting PackedSequence records which entries are padding and skips them when computing outputs, saving computation. Concretely:
import torch as t
from torch import nn
from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence

sen1 = [1, 1, 1]
sen2 = [2, 2, 2, 2]
sen3 = [3, 3, 3, 3, 3]
sen4 = [4, 4, 4, 4, 4, 4]
sentences = [sen1, sen2, sen3, sen4]

# pack_padded_sequence expects the batch sorted by length, longest first
sentences = sorted(sentences, key=lambda x: len(x), reverse=True)
# Cap every length at 5
lengths = [5 if len(sen) > 5 else len(sen) for sen in sentences]

def pad_sen(sen, length=5, padded_num=0):
    # Truncate to `length`, then pad the remainder with `padded_num`
    origin_len = len(sen)
    padded_sen = sen[:length]
    padded_sen = padded_sen + [padded_num for _ in range(origin_len, length)]
    return padded_sen

pad_sentences = [pad_sen(sen) for sen in sentences]
pad_tensor = t.tensor(pad_sentences)                # shape (batch=4, seq_len=5)
pad_variable = t.autograd.Variable(pad_tensor)      # Variable is a no-op wrapper in recent PyTorch

embedding = nn.Embedding(5, 2)                      # 5 indices, embedding dimension 2
pad_embeddings = embedding(pad_variable)            # shape (4, 5, 2), batch first

# The embeddings are batch-first, so pass batch_first=True
packed_variable = pack_padded_sequence(pad_embeddings, lengths, batch_first=True)

rnn = nn.LSTM(2, 3)                                 # input size 2, hidden size 3
output, (hn, cn) = rnn(packed_variable)             # output is also a PackedSequence
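To turn the packed output back into an ordinary padded tensor (for example, to compute a loss only over the valid time steps), use pad_packed_sequence, which is already imported above; a minimal continuation of the snippet:

# Unpack the PackedSequence into a padded tensor plus the true lengths
padded_output, out_lengths = pad_packed_sequence(output, batch_first=True)
print(padded_output.shape)   # torch.Size([4, 5, 3]): batch, max seq_len, hidden size
print(out_lengths)           # tensor([5, 5, 4, 3])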
Next, the data preprocessing script:
#coding:utf8
import torch as t
import numpy as np
import json
import jieba
import tqdm


class Config:
    annotation_file = 'caption_train_annotations_20170902.json'
    unknown = '</UNKNOWN>'
    end = '</EOS>'
    padding = '</PAD>'
    max_words = 10000
    min_appear = 2
    save_path = 'caption.pth'
    # START = '</START>'
    # MAX_LENS = 25,


def process(**kwargs):
    opt = Config()
    for k, v in kwargs.items():
        setattr(opt, k, v)

    with open(opt.annotation_file) as f:
        data = json.load(f)

    # 8f00f3d0f1008e085ab660e70dffced16a8259f6.jpg -> 0
    id2ix = {item['image_id']: ix for ix, item in enumerate(data)}
    # 0 -> 8f00f3d0f1008e085ab660e70dffced16a8259f6.jpg
    ix2id = {ix: id for id, ix in (id2ix.items())}
    assert id2ix[ix2id[10]] == 10

    captions = [item['caption'] for item in data]
    # Segment every caption with jieba
    cut_captions = [[list(jieba.cut(ii, cut_all=False)) for ii in item]
                    for item in tqdm.tqdm(captions)]

    word_nums = {}  # e.g. '快乐' -> 10000 (occurrences)

    def update(word_nums):
        def fun(word):
            word_nums[word] = word_nums.get(word, 0) + 1
            return None
        return fun

    lambda_ = update(word_nums)
    # The set comprehension is used only for its side effect of counting words
    _ = {lambda_(word) for sentences in cut_captions for sentence in sentences for word in sentence}

    vocabs = list(word_nums.keys())
    # [(10000, u'快乐'), (9999, u'开心'), ...]
    word_nums_list = sorted([(num, word) for word, num in word_nums.items()], reverse=True)

    #### The operations above are lossless and reversible ####
    #### The steps below discard some information ####
    # 1. drop words whose frequency is too low
    # 2. ~~drop words that are too long~~
    words = [word[1] for word in word_nums_list[:opt.max_words] if word[0] >= opt.min_appear]
    words = [opt.unknown, opt.padding, opt.end] + words
    word2ix = {word: ix for ix, word in enumerate(words)}
    ix2word = {ix: word for word, ix in word2ix.items()}
    assert word2ix[ix2word[123]] == 123

    ix_captions = [[[word2ix.get(word, word2ix.get(opt.unknown)) for word in sentence]
                    for sentence in item]
                   for item in cut_captions]
    readme = u"""
    word: word
    ix: index
    id: image file name
    caption: segmented captions; the original Chinese words can be recovered via ix2word
    """
    results = {
        'caption': ix_captions,
        'word2ix': word2ix,
        'ix2word': ix2word,
        'ix2id': ix2id,
        'id2ix': id2ix,
        'padding': opt.padding,
        'end': opt.end,
        'readme': readme
    }
    t.save(results, opt.save_path)
    print('save file in %s' % opt.save_path)

    def test(ix, ix2=4):
        results = t.load(opt.save_path)
        ix2word = results['ix2word']
        examples = results['caption'][ix][ix2]
        sentences_p = ''.join([ix2word[ii] for ii in examples])
        sentences_r = data[ix]['caption'][ix2]
        assert sentences_p == sentences_r, 'test failed'

    test(1000)
    print('test success')


if __name__ == '__main__':
    import fire
    fire.Fire()

# Usage: python data_preprocess.py process --annotation-file=/data/annotation.json --max-words=5000
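As a quick sanity check of the saved file, the results dictionary can be loaded back and a caption decoded into text; a minimal sketch, assuming process() has already written caption.pth to the current directory (recent PyTorch versions may additionally require weights_only=False in t.load):

import torch as t

# Load the dictionary saved by process()
results = t.load('caption.pth')
ix2word = results['ix2word']
ix2id = results['ix2id']

# Decode the first caption of the first image back into Chinese text
first_caption = results['caption'][0][0]
print(ix2id[0])                                      # corresponding image file name
print(''.join(ix2word[ix] for ix in first_caption))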