Reference: [link]
Simply put, semantic segmentation means understanding an image at the pixel level: every pixel is assigned a class label. PASCAL VOC 2012 and MS COCO are the datasets most commonly used for semantic segmentation.
Non-deep-learning methods: TextonForest and random-forest-based classifiers.
Deep learning methods:
1. Patch classification
The image is fed in as patches, usually of a fixed size, because the fully connected layers at the end require fixed input dimensions.
2. FCN
FCN is the foundation of all deep-learning semantic segmentation. It has no fully connected layers, so the input size is arbitrary. One remaining problem is the pooling layers: pooling enlarges the receptive field and aggregates context, but it also throws away positional information. Two kinds of architectures have been proposed to address this: encoder-decoder structures (e.g., SegNet / U-Net) and dilated (atrous) convolutions, which enlarge the receptive field without pooling (see the sketch after this list).
3. CRFs (conditional random fields)
Commonly used as a post-processing step to refine segmentation boundaries.
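As a quick illustration of the dilated-convolution idea mentioned above (my own sketch, not from the original post; the blob name some_feature is a placeholder): Caffe's Convolution layer accepts a dilation parameter, so in pycaffe net-spec syntax a context-aggregating layer could look like this.

# hypothetical snippet: a 3x3 convolution with dilation=2 covers the
# receptive field of a 5x5 convolution without any pooling;
# pad=2 keeps the spatial size unchanged
n.context = L.Convolution(n.some_feature, num_output=256, kernel_size=3,
    pad=2, dilation=2)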
Original paper: [link]
Code: [link]
The implementation uses Caffe.
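The net below is built with pycaffe's NetSpec. So that the listing is self-contained, here is the preamble it assumes, with the conv_relu / max_pool helpers as they appear in the FCN reference net.py (the import lines are my reconstruction; crop comes from the coord_map module shipped with the FCN authors' Caffe fork, and the data/label blobs n.data, n.label, n.sem, n.geo come from dataset-specific Python data layers omitted here):

import caffe
from caffe import layers as L, params as P
from caffe.coord_map import crop  # computes the crop offsets used below

n = caffe.NetSpec()

def conv_relu(bottom, nout, ks=3, stride=1, pad=1):
    # convolution + in-place ReLU; weights get lr_mult=1, bias gets lr_mult=2
    conv = L.Convolution(bottom, kernel_size=ks, stride=stride,
        num_output=nout, pad=pad,
        param=[dict(lr_mult=1, decay_mult=1), dict(lr_mult=2, decay_mult=0)])
    return conv, L.ReLU(conv, in_place=True)

def max_pool(bottom, ks=2, stride=2):
    return L.Pooling(bottom, pool=P.Pooling.MAX, kernel_size=ks, stride=stride)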
num_output: the number of convolution kernels (i.e., output channels).
lr_mult: learning-rate multiplier; the effective learning rate is this value times base_lr in the solver.prototxt configuration file.
If two lr_mult entries are given, the first applies to the weights and the second to the bias term. The bias learning rate is usually twice the weight learning rate.
decay_mult: weight-decay multiplier, used to reduce over-fitting; the effective weight decay is this value times weight_decay in solver.prototxt.
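To make the multipliers concrete, a tiny sketch (the numeric values are invented, not taken from any FCN solver):

base_lr, weight_decay = 0.01, 0.0005          # from solver.prototxt (made-up values)
weight_param = dict(lr_mult=1, decay_mult=1)  # first dict: the weights
bias_param = dict(lr_mult=2, decay_mult=0)    # second dict: the bias term
print(base_lr * weight_param['lr_mult'])          # 0.01   -> weight learning rate
print(base_lr * bias_param['lr_mult'])            # 0.02   -> bias learning rate
print(weight_decay * weight_param['decay_mult'])  # 0.0005 -> weight decay
print(weight_decay * bias_param['decay_mult'])    # 0.0    -> bias is not decayed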
splitlines(): if the argument is False (the default), the string is split at line breaks and the break characters are dropped; if True, the break characters are kept.
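For example:

s = 'img1.jpg\nimg2.jpg\n'
print(s.splitlines())      # ['img1.jpg', 'img2.jpg']     (separators dropped)
print(s.splitlines(True))  # ['img1.jpg\n', 'img2.jpg\n'] (separators kept)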
# the base net
# input is 500x500x3; conv_relu defaults are ks=3, stride=1, pad=1
n.conv1_1, n.relu1_1 = conv_relu(n.data, 64, pad=100) #698x698x64
n.conv1_2, n.relu1_2 = conv_relu(n.relu1_1, 64) #698x698x64
n.pool1 = max_pool(n.relu1_2) #349x349x64
n.conv2_1, n.relu2_1 = conv_relu(n.pool1, 128) #349x349x128
n.conv2_2, n.relu2_2 = conv_relu(n.relu2_1, 128) #349x349x128
n.pool2 = max_pool(n.relu2_2) #175x175x128 (Caffe pooling rounds up: ceil((349-2)/2)+1)
n.conv3_1, n.relu3_1 = conv_relu(n.pool2, 256) #175x175x256
n.conv3_2, n.relu3_2 = conv_relu(n.relu3_1, 256) #175x175x256
n.conv3_3, n.relu3_3 = conv_relu(n.relu3_2, 256) #175x175x256
n.pool3 = max_pool(n.relu3_3) #88x88x256
n.conv4_1, n.relu4_1 = conv_relu(n.pool3, 512) #88x88x512
n.conv4_2, n.relu4_2 = conv_relu(n.relu4_1, 512) #88x88x512
n.conv4_3, n.relu4_3 = conv_relu(n.relu4_2, 512) #88x88x512
n.pool4 = max_pool(n.relu4_3) #44x44x512
n.conv5_1, n.relu5_1 = conv_relu(n.pool4, 512) #44x44x512
n.conv5_2, n.relu5_2 = conv_relu(n.relu5_1, 512) #44x44x512
n.conv5_3, n.relu5_3 = conv_relu(n.relu5_2, 512) #44x44x512
n.pool5 = max_pool(n.relu5_3) #22x22x512
Except for the first convolution (which uses pad=100), none of the convolutions change the spatial size, while every max pool roughly halves it. For an initial size of m x m x 3, the size after these layers is approximately (m+198)/2^5; exact halving would give 21.8125 for m = 500, but Caffe's pooling rounds up at every stage, so the actual sizes are 698 → 349 → 175 → 88 → 44 → 22.
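A short script to verify these sizes under Caffe's rounding rules (convolution rounds down, pooling rounds up):

import math

def conv_out(w, k, p=0, s=1):   # Caffe convolution: floor rounding
    return (w + 2 * p - k) // s + 1

def pool_out(w, k=2, s=2):      # Caffe pooling: ceil rounding
    return int(math.ceil((w - k) / float(s))) + 1

w = conv_out(500, 3, p=100)     # conv1_1 with pad=100 -> 698
for _ in range(5):              # the five pooling stages
    w = pool_out(w)             # 349, 175, 88, 44, 22
print(w)                        # 22
print(conv_out(w, 7))           # fc6 with ks=7, pad=0 -> 16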
# fully conv
# first fully-convolutional layer: ks=7, pad=0, so 22-7+1 = 16 (roughly (m+6)/2^5)
n.fc6, n.relu6 = conv_relu(n.pool5, 4096, ks=7, pad=0) #16x16x4096
n.drop6 = L.Dropout(n.relu6, dropout_ratio=0.5, in_place=True)
n.fc7, n.relu7 = conv_relu(n.drop6, 4096, ks=1, pad=0) #16x16x4096
n.drop7 = L.Dropout(n.relu7, dropout_ratio=0.5, in_place=True)
n.score_fr = L.Convolution(n.drop7, num_output=21, kernel_size=1,
    pad=0, param=[dict(lr_mult=1, decay_mult=1), dict(lr_mult=2,
    decay_mult=0)]) #16x16x21 (21 = 20 PASCAL VOC classes + background)
Next come FCN-32s, FCN-16s, and FCN-8s in turn.
FCN32s:
#Deconvolution is transposed convolution; its size formula is the convolution formula run backwards:
# w_out = (w_in - 1)*s - 2p + k, with pad=0 here
# so the size becomes (16-1)*32 + 64 = 544
n.upscore = L.Deconvolution(n.score_fr,
    convolution_param=dict(num_output=21, kernel_size=64, stride=32,
    bias_term=False), param=[dict(lr_mult=0)]) #544x544x21
#this crops the 544x544 map above back to the size of the original image,
#i.e., 500x500x21; the crop offset (19, computed by the coord_map crop helper) is tied to the pad=100 of the first convolution
n.score = crop(n.upscore, n.data)
n.loss = L.SoftmaxWithLoss(n.score, n.label,
    loss_param=dict(normalize=False, ignore_label=255))
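Note that every Deconvolution layer here is created with param=[dict(lr_mult=0)], i.e., its weights are never updated. In the FCN reference code they are filled with fixed bilinear-interpolation kernels by surgery.interp; the kernel generator looks roughly like this (adapted from the repo's surgery.py):

import numpy as np

def upsample_filt(size):
    """Return a size x size 2D bilinear interpolation kernel."""
    factor = (size + 1) // 2
    if size % 2 == 1:
        center = factor - 1
    else:
        center = factor - 0.5
    og = np.ogrid[:size, :size]
    return (1 - abs(og[0] - center) / factor) * \
           (1 - abs(og[1] - center) / factor)

print(upsample_filt(4))  # the 4x4 kernel for the stride-2 deconvolutions
                         # (the 32s net uses a size-64, stride-32 kernel)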
FCN16s:
#the same deconvolution, but with different parameters
n.upscore2 = L.Deconvolution(n.score_fr,
    convolution_param=dict(num_output=21, kernel_size=4, stride=2,
    bias_term=False),
    param=[dict(lr_mult=0)]) #(16-1)*2+4 = 34 -> 34x34x21
#n.pool4 is 44x44x512
n.score_pool4 = L.Convolution(n.pool4, num_output=21, kernel_size=1, pad=0,
    param=[dict(lr_mult=1, decay_mult=1), dict(lr_mult=2, decay_mult=0)]) #44x44x21
n.score_pool4c = crop(n.score_pool4, n.upscore2) #34x34x21
#The Eltwise layer supports three operations: PROD (element-wise product), SUM
#(element-wise addition, or subtraction via negative coefficients), and MAX
#(element-wise maximum). SUM, element-wise addition, is the default.
n.fuse_pool4 = L.Eltwise(n.upscore2, n.score_pool4c,
    operation=P.Eltwise.SUM) #34x34x21
n.upscore16 = L.Deconvolution(n.fuse_pool4,
    convolution_param=dict(num_output=21, kernel_size=32, stride=16,
    bias_term=False),
    param=[dict(lr_mult=0)]) #(34-1)*16+32 = 560 -> 560x560x21
n.score = crop(n.upscore16, n.data) #500x500x21
n.loss = L.SoftmaxWithLoss(n.score, n.label,
    loss_param=dict(normalize=False, ignore_label=255))
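A quick check of the FCN-16s sizes with the deconvolution formula from above:

def deconv_out(w, k, s, p=0):   # w_out = (w_in - 1)*s - 2p + k
    return (w - 1) * s - 2 * p + k

print(deconv_out(16, 4, 2))     # 34  (upscore2, from the 16x16 score_fr)
print(deconv_out(34, 32, 16))   # 560 (upscore16, then cropped to 500)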
FCN8s:
(This particular FCN-8s definition is the SIFT Flow variant: it predicts 33 semantic classes here, plus a separate 3-class geometric branch further below. The PASCAL VOC version would use 21 outputs throughout.)
#n.drop7 is 16x16x4096
n.score_fr_sem = L.Convolution(n.drop7, num_output=33, kernel_size=1, pad=0,
    param=[dict(lr_mult=1, decay_mult=1), dict(lr_mult=2, decay_mult=0)]) #16x16x33
n.upscore2_sem = L.Deconvolution(n.score_fr_sem,
    convolution_param=dict(num_output=33, kernel_size=4, stride=2,
    bias_term=False), param=[dict(lr_mult=0)]) #34x34x33
#n.pool4 is 44x44x512
n.score_pool4_sem = L.Convolution(n.pool4, num_output=33, kernel_size=1, pad=0,
    param=[dict(lr_mult=1, decay_mult=1), dict(lr_mult=2, decay_mult=0)]) #44x44x33
n.score_pool4_semc = crop(n.score_pool4_sem, n.upscore2_sem) #34x34x33
n.fuse_pool4_sem = L.Eltwise(n.upscore2_sem, n.score_pool4_semc,
    operation=P.Eltwise.SUM) #34x34x33
n.upscore_pool4_sem = L.Deconvolution(n.fuse_pool4_sem,
    convolution_param=dict(num_output=33, kernel_size=4, stride=2,
    bias_term=False), param=[dict(lr_mult=0)]) #(34-1)*2+4 = 70 -> 70x70x33
#n.pool3 is 88x88x256
n.score_pool3_sem = L.Convolution(n.pool3, num_output=33, kernel_size=1,
    pad=0, param=[dict(lr_mult=1, decay_mult=1), dict(lr_mult=2,
    decay_mult=0)]) #88x88x33
n.score_pool3_semc = crop(n.score_pool3_sem, n.upscore_pool4_sem) #70x70x33
n.fuse_pool3_sem = L.Eltwise(n.upscore_pool4_sem, n.score_pool3_semc,
    operation=P.Eltwise.SUM) #70x70x33
n.upscore8_sem = L.Deconvolution(n.fuse_pool3_sem,
    convolution_param=dict(num_output=33, kernel_size=16, stride=8,
    bias_term=False), param=[dict(lr_mult=0)]) #(70-1)*8+16 = 568 -> 568x568x33
n.score_sem = crop(n.upscore8_sem, n.data) #500x500x33
The overall flow of the network is sketched below; some shapes in my drawing are not to scale (otherwise the figure would be too large…), so go by the annotated sizes.
Dark blue layers are convolutions (or other ordinary layers), light blue are pooling layers, yellow are deconvolution layers, orange is cropping, and the orange lines denote element-wise addition.
Below is the definition of the losses. In both loss layers, ignore_label=255 excludes unlabeled (void) pixels, and normalize=False normalizes the loss by batch size instead of by the number of valid pixels.
# loss to make score happy (o.w. loss_sem)
n.loss = L.SoftmaxWithLoss(n.score_sem, n.sem,
    loss_param=dict(normalize=False, ignore_label=255))
n.score_fr_geo = L.Convolution(n.drop7, num_output=3, kernel_size=1, pad=0,
    param=[dict(lr_mult=1, decay_mult=1), dict(lr_mult=2, decay_mult=0)]) #16x16x3
n.upscore2_geo = L.Deconvolution(n.score_fr_geo,
    convolution_param=dict(num_output=3, kernel_size=4, stride=2,
    bias_term=False), param=[dict(lr_mult=0)]) #34x34x3
n.score_pool4_geo = L.Convolution(n.pool4, num_output=3, kernel_size=1, pad=0,
    param=[dict(lr_mult=1, decay_mult=1), dict(lr_mult=2, decay_mult=0)]) #44x44x3
n.score_pool4_geoc = crop(n.score_pool4_geo, n.upscore2_geo) #34x34x3
n.fuse_pool4_geo = L.Eltwise(n.upscore2_geo, n.score_pool4_geoc,
    operation=P.Eltwise.SUM) #34x34x3
n.upscore_pool4_geo = L.Deconvolution(n.fuse_pool4_geo,
    convolution_param=dict(num_output=3, kernel_size=4, stride=2,
    bias_term=False), param=[dict(lr_mult=0)]) #70x70x3
n.score_pool3_geo = L.Convolution(n.pool3, num_output=3, kernel_size=1,
    pad=0, param=[dict(lr_mult=1, decay_mult=1), dict(lr_mult=2,
    decay_mult=0)]) #88x88x3
n.score_pool3_geoc = crop(n.score_pool3_geo, n.upscore_pool4_geo) #70x70x3
n.fuse_pool3_geo = L.Eltwise(n.upscore_pool4_geo, n.score_pool3_geoc,
    operation=P.Eltwise.SUM) #70x70x3
n.upscore8_geo = L.Deconvolution(n.fuse_pool3_geo,
    convolution_param=dict(num_output=3, kernel_size=16, stride=8,
    bias_term=False), param=[dict(lr_mult=0)]) #568x568x3
n.score_geo = crop(n.upscore8_geo, n.data) #500x500x3
n.loss_geo = L.SoftmaxWithLoss(n.score_geo, n.geo,
    loss_param=dict(normalize=False, ignore_label=255))
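Finally, a minimal usage sketch (my assumption about how the spec is consumed; the FCN repo wraps this in small per-dataset net functions): the NetSpec is serialized to a prototxt, which the Caffe solver then loads.

with open('train.prototxt', 'w') as f:
    f.write(str(n.to_proto()))  # both n.loss and n.loss_geo are top-level
                                # Loss layers, so Caffe optimizes their
                                # (weighted) sum automatically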