wikipedia [1] is a dataset for cross-modal retrieval, with 2866 samples, 10 classes, and two modalities: images and texts.
The goal is to process the data following the setup of [2], which in turn appears to come from [3]: images are represented by 4096-d features from the fc7 layer [5] of CaffeNet [4], and texts by averaging the 100-d word2vec [6] vectors of the words in each document.
For now, Keras' pretrained VGG16 [7,8] stands in for CaffeNet (see [12]); the word2vec features are produced with the gensim [9] library (see [13, 14]).
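As a quick sanity check (my own note, not from the original setup): in Keras' VGG16 the fc7-equivalent layer is named 'fc2' and outputs 4096 dimensions, which can be confirmed like this:
from tensorflow.keras.applications.vgg16 import VGG16

base_model = VGG16(weights="imagenet")  # include_top=True by default
print([l.name for l in base_model.layers[-4:]])  # ['flatten', 'fc1', 'fc2', 'predictions']
print(base_model.get_layer("fc2").output_shape)  # (None, 4096)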
Download the data from [10]. After extraction there are two files, trainset_txt_img_cat.list and testset_txt_img_cat.list; each line is one sample with 3 columns: text file name, image file name, class id.
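For illustration only (the names below are made up), a line splits into its three fields like this; the class id is 1-based, as the shift in the code further down implies:
# hypothetical line from trainset_txt_img_cat.list
line = "some-article-id some-image-id 3\n"
txt_f, img_f, lab = line.strip().split()
print(txt_f, img_f, int(lab))  # text name, image name, 1-based class id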
The text data lives under texts/ as .xml files. I originally wanted to parse them with minidom [11], but some special characters (e.g. a bare &) break the parser; lacking a better option, I parse them by hand for now. The image data lives under images/, organized into one folder per class.
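For reference, a possible workaround (my own sketch, not part of the original pipeline): escape stray & characters that are not valid XML entities, then let minidom parse the repaired string. Other malformed markup may still need handling.
import re
from xml.dom import minidom

def parse_with_minidom(fn):
    """Escape bare '&' so expat can parse the file, then read the <text> node."""
    with open(fn, "r", encoding="utf-8") as f:
        raw = f.read()
    # turn '&' that does not start a valid entity into '&amp;'
    raw = re.sub(r"&(?!(?:[a-zA-Z]+|#\d+|#x[0-9a-fA-F]+);)", "&amp;", raw)
    dom = minidom.parseString(raw)
    return dom.documentElement.getElementsByTagName("text")[0].childNodes[0].data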
import os
from os.path import join
import numpy as np
import scipy.io as sio
from gensim.models import Word2Vec
from tensorflow.keras.applications.vgg16 import VGG16
from tensorflow.keras.preprocessing import image
from tensorflow.keras.applications.vgg16 import preprocess_input
from tensorflow.keras.models import Model
P = "wikipedia_dataset"
IMG_P = "images"
TXT_P = "texts"
TRAIN_LIST = "trainset_txt_img_cat.list"
TEST_LIST = "testset_txt_img_cat.list"
os.chdir(P)  # cd into the extracted directory
print(os.getcwd())
Read the two .list files in order (train first, then test) and give every sample a unified id: the first 2173 samples are the original training set and the last 693 the original test set (a sketch for recovering this split follows the label-saving code below).
ls_img = []
ls_txt = []
ls_lab = []
with open("id-map.wiki.txt", "w") as f_out:
sid = -1
for fname in (TRAIN_LIST, TEST_LIST):
with open(fname, "r") as f_in:
for line in f_in:
sid += 1
txt_f, img_f, lab = line.strip().split()
#txt_f = join(TXT_P, txt_f, ".xml")
#img_f = join(IMG_P, img_f, ".jpg")
ls_img.append(img_f)
ls_txt.append(txt_f)
lab = int(lab) - 1 # shift to 0-base
ls_lab.append(lab)
# format:
f_out.write("{} {} {} {}\n".format(sid, txt_f, img_f, lab))
print(len(ls_img), len(ls_txt), len(ls_lab))
labels = np.asarray(ls_lab)
print(labels.shape, np.max(labels), np.min(labels)) # (2866,) 9 0
# N_CLASS = np.max(labels)
# labels -= 1 # shift to [0, N_CLASS - 1]
# labels = np.eye(N_CLASS)[labels] # to one-hot
# print(labels.shape) # (2866, 10)
# np.save("labels.npy", labels)
sio.savemat("labels.wiki.mat", {"labels": labels}, do_compression=True)
mean word2vec (deprecated)
A new doc2vec section, recording a gensim Doc2Vec based pipeline, has been added below; see there. This section is deprecated: the approach here turned out to be wrong, and the corpus preprocessing should be switched to the gensim utilities of the next section rather than the manual handling done here.
def parse(fn):
"""手动解析 xml:读 之间的部分"""
res = ""
flag = False
with open(fn, "r", encoding="utf-8") as f:
for line in f:
line = line.strip()
if not line:
continue
if line == "":
break
if flag:
res += " " + line
if line == "" :
flag = True
return res
def clean(strings, pattern):
    """strip a punctuation pattern from every token"""
    return [s.replace(pattern, "") for s in strings]
"""解析 xml"""
sentences = []
for txt_f in ls_txt:
txt_f = join(TXT_P, "{}.xml".format(txt_f))
# print(txt_f)
doc = parse(txt_f) # 手动解析
# doc = minidom.parse(txt_f).documentElement.getElementsByTagName("text")[0].childNodes[0].data
words = doc.split()
# 清除多余符号
for pat in (",", ".", "!", "?", "''", "(", ")", "\"", ":", ";", "{", "}", "[", "]"):
words = clean(words, pat)
sentences.append(words)
print(len(sentences))
"""训练 word2vec 模型"""
# [3] 说用 skip-gram
w2v = Word2Vec(sentences, size=100, min_count=5, iter=50, sg=1) # sg = skip-gram
"""提取文本特征"""
texts = np.zeros([len(sentences), 100])
for i, s in enumerate(sentences):
cnt = 0
for w in s:
if w in w2v:
cnt += 1
texts[i] += w2v[w]
# 取平均词向量
texts[i] /= cnt
# 保存
np.save("texts.w2v.100.npy", texts)
doc2vec
Text features are extracted with gensim's Doc2Vec. Tokenization here uses gensim.utils.simple_preprocess, but this part has not been updated! It should be replaced with the method of [18] (the tokenization in [18] has since been switched to Stanford CoreNLP); see [18] for details. With simple_preprocess there is no need for the hand-written clean function of the previous section, but note that gensim.utils.simple_preprocess does not seem to remove stop words automatically.
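A quick illustration (my own example, not from the original post) of what simple_preprocess does and does not do: punctuation and the bare & are dropped and tokens are lower-cased, but stop words such as "the" survive:
import gensim

print(gensim.utils.simple_preprocess("The Lion, the Witch & the Wardrobe!"))
# ['the', 'lion', 'the', 'witch', 'the', 'wardrobe']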
from __future__ import print_function
import os
import os.path as osp
import re
import io
import numpy as np
import scipy.io as sio
import gensim
from gensim.models import Doc2Vec
"""process text with Doc2Vec
paser text in .xml files
ref:
- https://blog.csdn.net/HackerTom/article/details/117001560
"""
USER_ID = 1000  # user id on the host machine (for the chown at the end)
# wiki
P = "/home/tom/dataset/wikipedia"
ID_MAP_F = osp.join(P, "id-map.wiki.txt")
TEXT_P = osp.join(P, "texts")
# doc2vec
MODEL = "/home/dataset/Doc2Vec/enwiki_dbow/doc2vec.bin"
start_alpha = 0.01
infer_epoch = 1000
DIM = 300 # dimension of the doc2vec feature
text_files = []
with open(ID_MAP_F, "r") as f:
    for line in f:
        _, txt_f, _, _ = line.strip().split()
        text_files.append(txt_f)
print("#data:", len(text_files))
def parse(fn):
    # read the part between <text> and </text>
    res = ""
    flag = False
    with io.open(fn, "r", encoding="utf-8") as f:
        for line in f:
            line = line.strip()
            if not line:
                continue
            if line == "</text>":
                break
            if flag:
                res += " " + line
            if line == "<text>":
                flag = True
    return res
model = Doc2Vec.load(MODEL)
texts = []
for txt_f in text_files:
    txt_f = osp.join(TEXT_P, "{}.xml".format(txt_f))
    # print(txt_f)
    doc = parse(txt_f)
    # print(doc)
    # (2023.1.11) the tokenization in [18] has been switched to Stanford CoreNLP,
    # but it is NOT updated here !!!
    # see [18] for the new text preprocessing
    doc = gensim.utils.simple_preprocess(doc)
    # print(doc)
    vec = model.infer_vector(doc)
    # print(vec.shape)  # (300,)
    texts.append(vec[np.newaxis, :])
    # break
texts = np.vstack(texts).astype(np.float32)
print("texts:", texts.shape, texts.dtype)  # (2866, 300) float32
_f_name = "texts.wiki.doc2vec.{}.mat".format(DIM)
sio.savemat(_f_name, {"texts": texts})
# see [18]: this runs inside docker, so chown the output back to the host account
os.system("chown {0}:{0} {1}".format(USER_ID, _f_name))
The self-trained LDA text features below are deprecated.
import io
import os
import os.path as osp
import numpy as np
import scipy.io as sio
import gensim
from gensim import corpora, models, similarities
P = "G:/wiki_top10cats"
ID_MAP_F = osp.join(P, "id-map.wiki.txt")
STOP_WORD_F = osp.join(P, "stop_words_english.txt")
TEXT_P = osp.join(P, "texts")
N_TOPIC = 10
print("stop words")
with open(STOP_WORD_F, "r", encoding='utf-8') as f:
stop_words = [line.strip() for line in f]
print("#stop word:", len(stop_words))
print("text 文件顺序:按前述 id map")
text_files = []
with open(ID_MAP_F, "r") as f:
for line in f:
_, txt_f, _, _ = line.strip().split()
text_files.append(txt_f)
print("#data:", len(text_files)) # 2866
def parse(fn):
    # read the part between <text> and </text>
    res = ""
    flag = False
    with io.open(fn, "r", encoding="utf-8") as f:
        for line in f:
            line = line.strip()
            if not line:
                continue
            if line == "</text>":
                break
            if flag:
                res += " " + line
            if line == "<text>":
                flag = True
    return res
print("按顺序读 texts")
corpus = []
for txt_f in text_files:
txt_f = osp.join(TEXT_P, "{}.xml".format(txt_f))
#print(txt_f)
doc = parse(txt_f)
# print(doc)
doc = gensim.utils.simple_preprocess(doc)
doc = [w for w in doc if w not in stop_words] # 去掉 stop words
# print(doc)
corpus.append(doc)
print("corpus:", len(corpus)) # 2866
dictionary = corpora.Dictionary(corpus)
print("vocab:", len(dictionary))  # 63218
print("convert to BoW")
bow_list = [dictionary.doc2bow(doc) for doc in corpus]
print(len(bow_list), type(bow_list[0]))  # 2866
print("compute TF-IDF")
corpus_tfidf = models.TfidfModel(bow_list)[bow_list]
lda = models.LdaModel(corpus_tfidf, num_topics=N_TOPIC, id2word=dictionary,
                      alpha=0.01, eta=0.01, minimum_probability=0.001,
                      update_every=1, chunksize=100, passes=1)
print("应该是对每个主题的从属度?")
doc_topics = lda.get_document_topics(corpus_tfidf)
# 格式:[(topic_id, membership)]
print(doc_topics[0])
texts_lda = np.asarray(doc_topics)
print(texts_lda.shape)
texts_lda = texts_lda[:, :, 1]
# print(texts_lda[0])
# 保存
assert texts_lda.shape[1] == N_TOPIC
sio.savemat(osp.join(P, "texts.wiki.lda.{}.mat".format(N_TOPIC)), {"texts": texts_lda})
import os.path as osp
import numpy as np
import scipy.io as sio
P = "G:/wiki_top10cats"
DATA_F = osp.join(P, "raw_features.mat")
data = sio.loadmat(DATA_F)
print(list(data.keys())) # 'I_tr', 'I_te', 'T_tr', 'T_te'
print("先 train 后 test,同前面 sample order")
texts_lda = np.vstack([data["T_tr"], data["T_te"]])
print(texts_lda.shape) # (2866, 10)
sio.savemat(osp.join(P, "texts.wiki.lda.{}.mat".format(texts_lda.shape[1])), {"texts": texts_lda})
Copy all the images into a single directory for convenience, then extract features with VGG16.
ALL_IMG_P = "images_all"
if not os.path.exists(ALL_IMG_P):
    os.makedirs(ALL_IMG_P)
"""copy everything into ALL_IMG_P"""
for cls in os.listdir(IMG_P):
    cls_d = join(IMG_P, cls)
    # print(os.listdir(cls_d))
    for img in os.listdir(cls_d):
        # os.system("cp {} {}".format(join(cls_d, img), ALL_IMG_P))  # linux
        os.system("copy {} {}".format(join(cls_d, img), ALL_IMG_P))  # windows
print(len(os.listdir(ALL_IMG_P)))
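As an alternative to shelling out to copy/cp, shutil works on both platforms; a small sketch with the same directory layout as above:
import shutil

for cls in os.listdir(IMG_P):
    cls_d = join(IMG_P, cls)
    for img in os.listdir(cls_d):
        shutil.copy(join(cls_d, img), ALL_IMG_P)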
"""提特征"""
base_model = VGG16(weights='imagenet')
# print(base_model.summary())
model = Model(inputs=base_model.input, outputs=base_model.get_layer('fc2').output)
# print(model.summary())
images = []
for i_name in ls_img:
img_f = join(ALL_IMG_P, "{}.jpg".format(i_name))
img = image.load_img(img_f, target_size=(224, 224))
x = image.img_to_array(img)
x = np.expand_dims(x, axis=0)
x = preprocess_input(x)
images.append(model.predict(x))
images = np.vstack(images)
print(images.shape)
# 保存
np.save("images.vgg16.npy", images)
The data (both the raw files and the processed features) is on Baidu Cloud: https://pan.baidu.com/s/19pjYO5Uxsq2aiGFqofp-CQ, extraction code: gr9m.