The sigmoid function is normally used for binary classification: it maps a scalar value to a probability in [0, 1]. If the probability is greater than 0.5, the sample is judged to belong to the class; otherwise it is not. The formula is:

sigmoid(x) = 1 / (1 + e^(-x))
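For intuition, here is a minimal sketch of that decision rule in plain NumPy (the score value is made up purely for illustration):

import numpy as np

def sigmoid(z):
    # squash any real number into the range (0, 1)
    return 1.0 / (1.0 + np.exp(-z))

score = 1.2                 # an arbitrary example score
prob = sigmoid(score)       # about 0.77
print(prob, prob > 0.5)     # greater than 0.5, so the sample is assigned to the positive class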
Building on sigmoid, this article implements a model that assigns multiple tags to a long piece of text. To be upfront: the final results of my model are not good, because the texts are very long and the model is very simple (just a single-layer network); treat it mainly as a practice exercise.
The dataset file contains one sample per line, with fields separated by a tab character. The first column holds the text's labels, separated by commas; the second column is the text content.
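For example, a single line of the dataset might look like the following, where <TAB> stands for a literal tab character and the labels and text are placeholders rather than real data:

label_a,label_b,label_c<TAB>the full text of the document goes here ...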
The structure is similar to the softmax-based multi-class text classification model I wrote about earlier; the code is split into two files.
First file: data_loader.py
It reads the training data and preprocesses it. The code is as follows:
# coding: utf-8
import sys
from collections import Counter
import pdb
import numpy as np
import tensorflow.contrib.keras as kr

if sys.version_info[0] > 2:
    is_py3 = True
else:
    reload(sys)
    sys.setdefaultencoding("utf-8")
    is_py3 = False
def native_word(word, encoding='utf-8'):
    """If a model trained under Python 3 is used under Python 2,
    this can be called to convert the character encoding."""
    if not is_py3:
        return word.encode(encoding)
    else:
        return word


def native_content(content):
    if not is_py3:
        return content.decode('utf-8')
    else:
        return content
def open_file(filename, mode='r'):
    """
    File helper that works under both Python 2 and Python 3.
    mode: 'r' or 'w' for read or write
    """
    if is_py3:
        return open(filename, mode, encoding='utf-8', errors='ignore')
    else:
        return open(filename, mode)
def read_file(filename):
    """Read the data file."""
    contents, labels = [], []
    with open_file(filename) as f:
        for line in f:
            try:
                p = line.strip().split('\t')
                label_lst = p[0].split(',')
                content = p[1]
                if content:
                    contents.append(list(native_content(content)))
                    label_lst = [native_content(label) for label in label_lst]
                    # a document can carry multiple labels
                    labels.append(label_lst)
            except:
                pass
    return contents, labels
# Build the vocabulary with a character-level representation and store it on disk,
# so it does not have to be rebuilt on every run.
def build_vocab(train_dir, vocab_dir, vocab_size=5000):
    """Build the vocabulary from the training set and store it."""
    # Each line of the train/test/val files has the format: labels<TAB>text
    data_train, _ = read_file(train_dir)

    all_data = []
    for content in data_train:
        all_data.extend(content)

    counter = Counter(all_data)  # count how often each character appears, e.g. {'c': 3, 'a': 1, 'b': 1}
    count_pairs = counter.most_common(vocab_size - 1)  # keep the most frequent ones, e.g. [('c', 3), ('a', 1)]
    words, _ = list(zip(*count_pairs))  # zip(*...) yields [('c', 'a'), (3, 1)], so words is ('c', 'a')
    # Prepend a <PAD> token so that all texts can be padded to the same length
    words = ['<PAD>'] + list(words)
    open_file(vocab_dir, mode='w').write('\n'.join(words) + '\n')  # vocab_dir now holds the vocabulary, one character per line
# Read the vocabulary stored in the previous step and turn it into a {word: id} mapping.
def read_vocab(vocab_dir):
    """Read the vocabulary."""
    # words = open_file(vocab_dir).read().strip().split('\n')
    with open_file(vocab_dir) as fp:
        # under Python 2, convert every value to unicode
        words = [native_content(_.strip()) for _ in fp.readlines()]
    word_to_id = dict(zip(words, range(len(words))))
    return words, word_to_id
# Collect the full label set from the data file and turn it into a {category: id} mapping.
def read_category(data_file):
    categories = []
    for line in open_file(data_file):
        categories += line.strip().split('\t')[0].split(',')
    categories = sorted(set(categories))  # sort so that category ids stay stable across runs
    categories = [native_content(x) for x in categories]
    cat_to_id = dict(zip(categories, range(len(categories))))
    return categories, cat_to_id
# Convert id-encoded content back into text.
def to_words(content, words):
    """Convert a sequence of ids back into characters."""
    return ''.join(words[x] for x in content)
# Convert the dataset from text into fixed-length id sequences.
def process_file(filename, word_to_id, cat_to_id, max_length=600):
    """Convert a data file into its id representation."""
    contents, labels = read_file(filename)

    data_id, label_id = [], []
    for i in range(len(contents)):
        # Pair each document's character-id vector with its category ids.
        # Each element of data_id is the vector of character ids of one document.
        data_id.append([word_to_id[x] for x in contents[i] if x in word_to_id])
        # Each element of label_id is the list of category ids of that same document.
        label_id.append([cat_to_id[label] for label in labels[i]])

    # Use Keras' pad_sequences to bring every text to the same fixed length.
    # Documents differ in length, so their id vectors have different sizes;
    # pad_sequences pads (and truncates) them to max_length, filling zeros at the front by default.
    x_pad = kr.preprocessing.sequence.pad_sequences(data_id, max_length)
    # y_pad = kr.utils.to_categorical(label_id, num_classes=len(cat_to_id))  # one-hot labels (single-label case)
    y_pad = parse_multi_category(label_id, len(cat_to_id))  # multi-hot labels (multi-label case)
    return x_pad, y_pad
def parse_multi_category(label_ids, label_axis):
    """Turn each list of category ids into a multi-hot vector of length label_axis."""
    new_label_ids = []
    for label in label_ids:
        l = np.zeros(label_axis)
        for index in label:
            l[index] = 1
        new_label_ids.append(l)
    return np.array(new_label_ids)
# Produce shuffled batches of data for training.
# x is an np.array holding the character-id vectors of all documents,
# y holds the corresponding multi-hot label vectors.
# Note: batch_iter returns a generator.
def batch_iter(x, y, batch_size=64):
    """Generate batches of data."""
    data_len = len(x)
    num_batch = int((data_len - 1) / batch_size) + 1  # number of batches

    # np.arange builds the index list [0, data_len);
    # np.random.permutation shuffles it, e.g. [0, 1, 2, 3] -> [3, 1, 0, 2]
    indices = np.random.permutation(np.arange(data_len))
    x_shuffle = x[indices]  # reorder x by the shuffled indices, i.e. shuffle the order of the documents
    y_shuffle = y[indices]

    for i in range(num_batch):
        start_id = i * batch_size
        end_id = min((i + 1) * batch_size, data_len)
        yield x_shuffle[start_id:end_id], y_shuffle[start_id:end_id]  # one batch of the shuffled x and y per iteration
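To make the padding and the multi-hot labels concrete, here is a small hand-made example of the two helpers above (the ids, lengths and class count are toy values):

import tensorflow.contrib.keras as kr
from data_loader import parse_multi_category

# two "documents" of different lengths, already converted to character ids
data_id = [[3, 8, 2], [5, 1, 9, 4, 7]]
x_pad = kr.preprocessing.sequence.pad_sequences(data_id, 6)
# pad_sequences fills zeros at the front by default:
# [[0 0 0 3 8 2]
#  [0 5 1 9 4 7]]

# the same two documents with category ids out of 4 classes; the first one has two labels
label_id = [[0, 2], [3]]
y_pad = parse_multi_category(label_id, 4)
# multi-hot label vectors:
# [[1. 0. 1. 0.]
#  [0. 0. 0. 1.]]
print(x_pad)
print(y_pad)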
Next comes the model itself: run_sigmoid.py
The code is as follows:
# -*- coding: utf-8 -*-
import pdb
import os

import numpy as np
import tensorflow as tf

from data_loader import read_vocab, read_category, batch_iter, process_file, build_vocab

train_file = 'data/baidu_tag_one_train.lst'
test_file = 'data/baidu_tag_one_test.lst'
vocab_file = 'data/baike_category_vocab.lst'
all_file = 'data/baidu_tag_one_all.lst'

max_vocab_size = 5000
seq_length = 600  # dimension of the input x
num_epochs = 10
batch_size = 64


def feed_data(x_batch, y_batch):
    feed_dict = {
        x: x_batch,
        y_: y_batch
    }
    return feed_dict


if not os.path.exists(vocab_file):
    build_vocab(all_file, vocab_file, max_vocab_size)
print('build vocab over')
# the full set of categories and their ids
categorys, cat_to_id = read_category(all_file)
print('read category over')

words, word_to_id = read_vocab(vocab_file)
print('read vocab over')

x_train, y_train = process_file(train_file, word_to_id, cat_to_id, seq_length)
print('process file over')

num_classes = len(cat_to_id)
# define the model
with tf.device('/cpu:0'):
    x = tf.placeholder(tf.float32, [None, seq_length], name='input_x')
    y_ = tf.placeholder(tf.float32, [None, num_classes], name='input_y')

    # w = tf.Variable(tf.zeros([seq_length, num_classes]))
    # b = tf.Variable(tf.zeros([num_classes]))
    w = tf.Variable(tf.truncated_normal(shape=[seq_length, num_classes], mean=0, stddev=1))
    b = tf.Variable(tf.truncated_normal(shape=[num_classes], mean=0, stddev=1))

    logits = tf.matmul(x, w) + b
    y = tf.sigmoid(logits)

    # cost = -tf.reduce_sum(y_ * tf.log(y))    # cross entropy written out by hand
    # cost = tf.reduce_sum(tf.square(y_ - y))  # squared error
    # sigmoid_cross_entropy_with_logits expects the raw logits (it applies the sigmoid internally)
    # and returns one loss value per label, so reduce it to a scalar before optimizing
    cost = tf.reduce_mean(tf.nn.sigmoid_cross_entropy_with_logits(labels=y_, logits=logits))
    train_step = tf.train.GradientDescentOptimizer(0.01).minimize(cost)

print('initial')
init = tf.global_variables_initializer()

print('session')
sess = tf.Session()
sess.run(init)
# accuracy for multi-label classification:
# a sample only counts as correct when every label position is predicted correctly
correct_tags = tf.equal(tf.cast(tf.greater_equal(tf.sigmoid(logits), 0.5), tf.int32), tf.cast(y_, tf.int32))
accuracy = tf.reduce_mean(tf.reduce_min(tf.cast(correct_tags, tf.float32), 1))
# the single-label (softmax) version would compare the predicted class against the true class instead:
# correct_prediction = tf.equal(tf.argmax(y, 1), tf.argmax(y_, 1))  # argmax returns the index of the largest value
# accuracy = tf.reduce_mean(tf.cast(correct_prediction, tf.float32))
print('accuracy')
for epoch in range(num_epochs):
    print('epoch:', epoch + 1)
    batch_train = batch_iter(x_train, y_train, batch_size)
    # x_batch has shape (batch_size, seq_length); batch_size is the number of documents taken per step
    for x_batch, y_batch in batch_train:
        # pdb.set_trace()
        feed_dict = feed_data(x_batch, y_batch)
        sess.run(train_step, feed_dict=feed_dict)
        print('accuracy', sess.run(accuracy, feed_dict=feed_dict))
        print('y', sess.run(tf.cast(tf.greater_equal(tf.sigmoid(logits), 0.5), tf.int32), feed_dict=feed_dict))
        print('y_', sess.run(tf.cast(y_, tf.int32), feed_dict=feed_dict))

feed_dict = feed_data(x_train, y_train)
print('accuracy', sess.run(accuracy, feed_dict=feed_dict))
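As a sanity check on the loss used above: sigmoid_cross_entropy_with_logits is mathematically the element-wise cross entropy -(y*log(p) + (1-y)*log(1-p)) with p = sigmoid(logits). A quick NumPy version of that formula, with made-up numbers, looks like this:

import numpy as np

def sigmoid(z):
    return 1.0 / (1.0 + np.exp(-z))

# one sample with 3 labels: raw logits and the multi-hot target (made-up values)
logits = np.array([2.0, -1.0, 0.5])
y_true = np.array([1.0, 0.0, 1.0])

p = sigmoid(logits)
# one loss value per label, which is what sigmoid_cross_entropy_with_logits returns before reduce_mean
loss_per_label = -(y_true * np.log(p) + (1 - y_true) * np.log(1 - p))
print(loss_per_label, loss_per_label.mean())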
Compared with using softmax for single-label classification, there are a few differences:
1. With softmax, each sample's label is a one-hot vector; with sigmoid for multi-label classification, the label vector is not one-hot but multi-hot, with several positions set to 1.
2. The accuracy computation also differs, and is somewhat more involved for sigmoid-based multi-label classification: a sample only counts as correct if every label position matches, as the example below shows.
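To illustrate that rule outside of the TensorFlow graph, the same exact-match accuracy can be computed in plain NumPy (the probabilities and labels below are made up):

import numpy as np

# predicted probabilities and true multi-hot labels for 3 samples over 4 classes (made-up values)
probs = np.array([[0.9, 0.2, 0.7, 0.1],
                  [0.6, 0.4, 0.3, 0.8],
                  [0.1, 0.9, 0.2, 0.3]])
y_true = np.array([[1, 0, 1, 0],
                   [1, 0, 0, 0],
                   [0, 1, 0, 0]])

pred = (probs >= 0.5).astype(int)              # threshold every label at 0.5
exact_match = np.all(pred == y_true, axis=1)   # correct only if every position matches (like tf.reduce_min above)
accuracy = exact_match.mean()
print(pred)
print(accuracy)  # 2 of the 3 samples match exactly, so 0.666...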
To improve accuracy, you can try a CNN or LSTM/RNN model with a sigmoid layer on top for the multi-label classification.
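For reference, here is a minimal sketch of such a model using the Keras API bundled with TensorFlow; the helper name and all layer sizes are placeholders of my own choosing, not tuned values:

import tensorflow.contrib.keras as kr

def build_cnn_multilabel_model(vocab_size, seq_length, num_classes):
    model = kr.models.Sequential()
    # learn a dense vector per character id instead of feeding raw ids into a dense layer
    model.add(kr.layers.Embedding(vocab_size, 64, input_length=seq_length))
    # 1-D convolution over the character sequence, then keep the strongest response per filter
    model.add(kr.layers.Conv1D(128, 5, activation='relu'))
    model.add(kr.layers.GlobalMaxPooling1D())
    model.add(kr.layers.Dense(64, activation='relu'))
    # a sigmoid on every output unit gives an independent probability per label
    model.add(kr.layers.Dense(num_classes, activation='sigmoid'))
    # binary_crossentropy is the Keras counterpart of sigmoid_cross_entropy_with_logits
    model.compile(optimizer='adam', loss='binary_crossentropy')
    return model

# model = build_cnn_multilabel_model(vocab_size=5000, seq_length=600, num_classes=len(cat_to_id))
# model.fit(x_train, y_train, batch_size=64, epochs=10)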