Because the data are imbalanced, with many samples in some classes and few in others, the learned model performs poorly on the under-represented classes.
One remedy is to weight the loss of each class during training: the loss from classes with few samples gets a larger weight, and the loss from classes with many samples gets a smaller weight.
In the implementation, this amounts to taking a weighted sum of the per-sample losses within a batch.
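How the class weights themselves are chosen is up to the task; a common choice is to make them inversely proportional to class frequency. A minimal sketch of that idea (the counts below are made-up, not from my data):

import numpy as np

# Hypothetical per-class sample counts; replace with the real training-set counts.
class_counts = np.array([5000, 300, 1200, 80, 950, 60, 2200, 400, 150, 700], dtype=np.float64)
cls_weights = class_counts.sum() / (len(class_counts) * class_counts)  # rare classes get larger weights
cls_weights = cls_weights / cls_weights.mean()                         # optional: rescale so the mean weight is 1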
A TensorFlow implementation follows.
This is CNN text-classification code; I have 10 classes.
First, tf.where(tf.equal()) finds the indices of each class within the batch;
then tf.gather_nd() picks out the corresponding losses, which are multiplied by the class weight self.cls_weigths[];
finally, the weighted losses of all samples are summed to obtain the final loss.
My approach is rather clumsy; better solutions are welcome. Thanks.
def build_net(self):
    # Input pipeline: three tf.data datasets share one reinitializable iterator.
    train_dataset = tf.data.Dataset.from_tensor_slices(
        (self.train_data, self.train_labels)).shuffle(1000).batch(self.batch, drop_remainder=True)
    val_dataset = tf.data.Dataset.from_tensor_slices(
        (self.val_data, self.val_labels)).batch(self.batch, drop_remainder=True)
    test_dataset = tf.data.Dataset.from_tensor_slices(
        (self.test_data, self.test_labels)).batch(self.batch)
    iterator = tf.data.Iterator.from_structure(output_types=train_dataset.output_types,
                                               output_shapes=train_dataset.output_shapes)
    self.next_batch = iterator.get_next()  # next_batch[0]: token ids, next_batch[1]: labels
    self.train_init = iterator.make_initializer(train_dataset)
    self.val_init = iterator.make_initializer(val_dataset)
    self.test_init = iterator.make_initializer(test_dataset)

    # TextCNN: embedding lookup, three parallel 1-D convolutions, max-over-time pooling.
    embedding = tf.Variable(tf.random_normal([self.num_vocab, 30]))
    embedded = tf.nn.embedding_lookup(embedding, self.next_batch[0])
    conv1 = tf.layers.conv1d(embedded, filters=100, kernel_size=3, padding='SAME')
    conv2 = tf.layers.conv1d(embedded, filters=100, kernel_size=4, padding='SAME')
    conv3 = tf.layers.conv1d(embedded, filters=100, kernel_size=5, padding='SAME')
    pool1 = tf.reduce_max(conv1, axis=1)  # pool over the sequence axis
    pool2 = tf.reduce_max(conv2, axis=1)
    pool3 = tf.reduce_max(conv3, axis=1)
    pool = tf.concat([pool1, pool2, pool3], axis=-1)
    logits = tf.layers.dense(pool, self.num_label)

    # Indices of the samples of each of the 10 classes within the batch.
    ind0 = tf.where(tf.equal(self.next_batch[1], 0))
    ind1 = tf.where(tf.equal(self.next_batch[1], 1))
    ind2 = tf.where(tf.equal(self.next_batch[1], 2))
    ind3 = tf.where(tf.equal(self.next_batch[1], 3))
    ind4 = tf.where(tf.equal(self.next_batch[1], 4))
    ind5 = tf.where(tf.equal(self.next_batch[1], 5))
    ind6 = tf.where(tf.equal(self.next_batch[1], 6))
    ind7 = tf.where(tf.equal(self.next_batch[1], 7))
    ind8 = tf.where(tf.equal(self.next_batch[1], 8))
    ind9 = tf.where(tf.equal(self.next_batch[1], 9))

    # Per-sample cross-entropy, weighted class by class via gather_nd on the indices above.
    batch_loss = tf.nn.sparse_softmax_cross_entropy_with_logits(labels=self.next_batch[1], logits=logits)
    loss0 = tf.gather_nd(batch_loss, ind0) * self.cls_weigths[0]
    loss1 = tf.gather_nd(batch_loss, ind1) * self.cls_weigths[1]
    loss2 = tf.gather_nd(batch_loss, ind2) * self.cls_weigths[2]
    loss3 = tf.gather_nd(batch_loss, ind3) * self.cls_weigths[3]
    loss4 = tf.gather_nd(batch_loss, ind4) * self.cls_weigths[4]
    loss5 = tf.gather_nd(batch_loss, ind5) * self.cls_weigths[5]
    loss6 = tf.gather_nd(batch_loss, ind6) * self.cls_weigths[6]
    loss7 = tf.gather_nd(batch_loss, ind7) * self.cls_weigths[7]
    loss8 = tf.gather_nd(batch_loss, ind8) * self.cls_weigths[8]
    loss9 = tf.gather_nd(batch_loss, ind9) * self.cls_weigths[9]
    loss = tf.concat([loss0, loss1, loss2, loss3, loss4, loss5, loss6, loss7, loss8, loss9], axis=-1)
    self.loss = tf.reduce_sum(loss)

    # Prediction, accuracy, and optimizer.
    self.pred = tf.argmax(logits, 1, output_type=tf.int32)
    self.accuracy = tf.reduce_mean(tf.cast(tf.equal(self.pred, self.next_batch[1]), tf.float32))
    self.optimizer = tf.train.AdagradOptimizer(self.lr).minimize(self.loss)
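By the way, the per-class splitting above can be avoided: since each sample's weight depends only on its label, the weight vector can simply be indexed with the label tensor. A sketch of this alternative (same TF 1.x API, assuming self.cls_weigths is a Python list of 10 floats), which would replace the ind*/loss* block inside build_net:

# Weight every sample's loss in one step by looking up its class weight.
weights = tf.constant(self.cls_weigths, dtype=tf.float32)    # shape [10]
sample_weights = tf.gather(weights, self.next_batch[1])      # shape [batch]
batch_loss = tf.nn.sparse_softmax_cross_entropy_with_logits(
    labels=self.next_batch[1], logits=logits)
self.loss = tf.reduce_sum(batch_loss * sample_weights)

tf.losses.sparse_softmax_cross_entropy also takes a per-sample weights argument and should give the same effect.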