caffe模型转pytorch模型

转换基本流程:

1)创建pytorch的网络结构模型;

2)利用caffe加载其存储的预训练模型,以读取caffe模型的参数;

3)遍历caffe加载的模型参数;

4)对一些指定的key值,需要进行相应的处理和转换;

5)对修改后的层名(key值),通过numpy数组进行转换来实现参数的加载;

6)对相应层的参数(feature)进行比较;

以下以SE_Resnet50为例,将caffe上的模型转换到pytorch上;

1)创建pytorch的网络结构模型:


class Resnet50(Module):
    def __init__(self, embedding_size = 512, class_num=0):
        super(Resnet50, self).__init__()
        self.conv0 = Conv2d(3, out_channels=64, kernel_size=(3, 3), groups=1, stride=(1, 1), padding=(1, 1), bias=False)
        self.bn0 = BatchNorm2d(64, eps=2e-5, momentum=0.9)
        self.relu0 = PReLU(64)

        self.stage1_unit1_bn1 =  BatchNorm2d(64, eps=2e-5, momentum=0.9)#stage1_unit1_bn1
        self.stage1_unit1_conv1 = Conv2d(64, out_channels=64, kernel_size=(3, 3), groups=1, stride=(1, 1), padding=(1, 1), bias=False)
        self.stage1_unit1_bn2 = BatchNorm2d(64, eps=2e-5, momentum=0.9)

        self.stage1_unit1_relu1 = PReLU(64)
        self.stage1_unit1_conv2 = Conv2d(64, out_channels=64, kernel_size=(3, 3), groups=1, stride=(2, 2), padding=(1, 1), bias=False)
        self.stage1_unit1_bn3 = BatchNorm2d(64, eps=2e-5, momentum=0.9)

        # self.stage1_unit1_bn3_scale
        self.stage1_unit1_se_pool1 = AdaptiveAvgPool2d(1)
        # self.stage1_unit1_se_pool1 = AvgPool2d(3, stride=1)
        self.stage1_unit1_se_conv1 = Conv2d(64, out_channels=4, kernel_size=(1, 1), groups=1, stride=(1, 1), padding=(0,0), bias=True)
        self.stage1_unit1_se_relu1 = PReLU(4)
        self.stage1_unit1_se_conv2 = Conv2d(4, out_channels=64, kernel_size=(1, 1), groups=1, stride=(1, 1), padding=(0,0), bias=True)
        self.stage1_unit1_se_sigmoid = Sigmoid()

        self.stage1_unit1_conv1sc = Conv2d(64, out_channels=64, kernel_size=(1, 1), groups=1, stride=(2, 2), padding=(0,0), bias=False)
        self.stage1_unit1_sc = BatchNorm2d(64, eps=2e-5, momentum=0.9)
        # self.stage1_unit1_sc_scale #relu
        # self._plus0 =  #axpy

        self.stage1_unit2_bn1 = BatchNorm2d(64, eps=2e-5, momentum=0.9)
        # self.stage1_unit2_bn1_scale
        self.stage1_unit2_conv1 = Conv2d(64, out_channels=64, kernel_size=(3, 3), groups=1, stride=(1, 1), padding=(1,1), bias=False)
        self.stage1_unit2_bn2 = BatchNorm2d(64, eps=2e-5, momentum=0.9)
        # self.stage1_unit2_bn2_scale
        self.stage1_unit2_relu1 = PReLU(64)
        self.stage1_unit2_conv2 = Conv2d(64, out_channels=64, kernel_size=(3, 3), groups=1, stride=(1, 1), padding=(1,1), bias=False)
        self.stage1_unit2_bn3 = BatchNorm2d(64, eps=2e-5, momentum=0.9)
        # self.stage1_unit2_bn3_scale
        self.stage1_unit2_se_pool1 = AdaptiveAvgPool2d(1)
        self.stage1_unit2_se_conv1 = Conv2d(64, out_channels=4, kernel_size=(1, 1), groups=1, stride=(1, 1), padding=(0,0), bias=True)
        self.stage1_unit2_se_relu1 = PReLU(4)
        self.stage1_unit2_se_conv2 = Conv2d(4, out_channels=64, kernel_size=(1, 1), groups=1, stride=(1, 1), padding=(0,0), bias=True)
        self.stage1_unit2_se_sigmoid = Sigmoid()

        # self._plus1  #axpy
        self.stage1_unit3_bn1 = BatchNorm2d(64, eps=2e-5, momentum=0.9)
        # self.stage1_unit3_bn1_scale
        self.stage1_unit3_conv1 = Conv2d(64, out_channels=64, kernel_size=(3, 3), groups=1, stride=(1, 1), padding=(1,1), bias=False)
        self.stage1_unit3_bn2 = BatchNorm2d(64, eps=2e-5, momentum=0.9)
        # self.stage1_unit3_bn2_scale
        self.stage1_unit3_relu1 = PReLU(64)
        self.stage1_unit3_conv2 = Conv2d(64, out_channels=64, kernel_size=(3, 3), groups=1, stride=(1, 1), padding=(1,1), bias=False)
        self.stage1_unit3_bn3 = BatchNorm2d(64, eps=2e-5, momentum=0.9)
        # self.stage1_unit3_bn3_scale
        self.stage1_unit3_se_pool1 = AdaptiveAvgPool2d(1)
        self.stage1_unit3_se_conv1 = Conv2d(64, out_channels=4, kernel_size=(1, 1), groups=1, stride=(1, 1), padding=(0,0), bias=True)
        self.stage1_unit3_se_relu1 = PReLU(4)
        self.stage1_unit3_se_conv2 = Conv2d(4, out_channels=64, kernel_size=(1, 1), groups=1, stride=(1, 1), padding=(0,0), bias=True)
        self.stage1_unit3_se_sigmoid = Sigmoid()
        # self._plus2 #Axpy


        self.stage2_unit1_bn1 = BatchNorm2d(64, eps=2e-5, momentum=0.9)
        # self.stage2_unit1_bn1_scale
        self.stage2_unit1_conv1 = Conv2d(64, out_channels=128, kernel_size=(3, 3), groups=1, stride=(1, 1), padding=(1,1), bias=False)
        self.stage2_unit1_bn2 = BatchNorm2d(128, eps=2e-5, momentum=0.9)
        # self.stage2_unit1_bn2_scale
        self.stage2_unit1_relu1 = PReLU(128)
        self.stage2_unit1_conv2 = Conv2d(128, out_channels=128, kernel_size=(3, 3), groups=1, stride=(2, 2), padding=(1,1), bias=False)
        self.stage2_unit1_bn3 = BatchNorm2d(128, eps=2e-5, momentum=0.9)
        # self.stage2_unit1_bn3_scale
        self.stage2_unit1_se_pool1 = AdaptiveAvgPool2d(1)
        self.stage2_unit1_se_conv1 = Conv2d(128, out_channels=8, kernel_size=(1, 1), groups=1, stride=(1, 1), padding=(0,0), bias=True)
        self.stage2_unit1_se_relu1 = PReLU(8)
        self.stage2_unit1_se_conv2 = Conv2d(8, out_channels=128, kernel_size=(1, 1), groups=1, stride=(1, 1), padding=(0,0), bias=True)
        self.stage2_unit1_se_sigmoid = Sigmoid()

        self.stage2_unit1_conv1sc = Conv2d(64, out_channels=128, kernel_size=(1, 1), groups=1, stride=(2, 2), padding=(0,0), bias=False)
        self.stage2_unit1_sc = BatchNorm2d(128, eps=2e-5, momentum=0.9)
        # self.stage2_unit1_sc_scale
        # self._plus3   #axpy

        self.stage2_unit2_bn1 = BatchNorm2d(128, eps=2e-5, momentum=0.9)
        # self.stage2_unit2_bn1_scale
        self.stage2_unit2_conv1 = Conv2d(128, out_channels=128, kernel_size=(3, 3), groups=1, stride=(1, 1), padding=(1,1), bias=False)
        self.stage2_unit2_bn2 = BatchNorm2d(128, eps=2e-5, momentum=0.9)
        # self.stage2_unit2_bn2_scale
        self.stage2_unit2_relu1 = PReLU(128)
        self.stage2_unit2_conv2 = Conv2d(128, out_channels=128, kernel_size=(3, 3), groups=1, stride=(1, 1), padding=(1,1), bias=False)
        self.stage2_unit2_bn3 = BatchNorm2d(128, eps=2e-5, momentum=0.9)
        # self.stage2_unit2_bn3_scale
        self.stage2_unit2_se_pool1 = AdaptiveAvgPool2d(1)
        self.stage2_unit2_se_conv1 = Conv2d(128, out_channels=8, kernel_size=(1, 1), groups=1, stride=(1, 1), padding=(0,0), bias=True)
        self.stage2_unit2_se_relu1 = PReLU(8)
        self.stage2_unit2_se_conv2 = Conv2d(8, out_channels=128, kernel_size=(1, 1), groups=1, stride=(1, 1), padding=(0,0), bias=True)
        self.stage2_unit2_se_sigmoid = Sigmoid()
        # self._plus4

        self.stage2_unit3_bn1 = BatchNorm2d(128, eps=2e-5, momentum=0.9)
        # self.stage2_unit3_bn1_scale
        self.stage2_unit3_conv1 = Conv2d(128, out_channels=128, kernel_size=(3, 3), groups=1, stride=(1, 1), padding=(1,1), bias=False)
        self.stage2_unit3_bn2 = BatchNorm2d(128, eps=2e-5, momentum=0.9)
        # self.stage2_unit3_bn2_scale
        self.stage2_unit3_relu1 = PReLU(128)
        self.stage2_unit3_conv2 = Conv2d(128, out_channels=128, kernel_size=(3, 3), groups=1, stride=(1, 1), padding=(1,1), bias=False)
        self.stage2_unit3_bn3 = BatchNorm2d(128, eps=2e-5, momentum=0.9)
        # self.stage2_unit3_bn3_scale
        self.stage2_unit3_se_pool1 = AdaptiveAvgPool2d(1)
        self.stage2_unit3_se_conv1 = Conv2d(128, out_channels=8, kernel_size=(1, 1), groups=1, stride=(1, 1), padding=(0,0), bias=True)
        self.stage2_unit3_se_relu1 = PReLU(8)
        self.stage2_unit3_se_conv2 = Conv2d(8, out_channels=128, kernel_size=(1, 1), groups=1, stride=(1, 1), padding=(0,0), bias=True)
        self.stage2_unit3_se_sigmoid = Sigmoid()
        # self._plus5

        self.stage2_unit4_bn1 = BatchNorm2d(128, eps=2e-5, momentum=0.9)
        # self.stage2_unit4_bn1_scale
        self.stage2_unit4_conv1 = Conv2d(128, out_channels=128, kernel_size=(3, 3), groups=1, stride=(1, 1), padding=(1,1), bias=False)
        self.stage2_unit4_bn2 = BatchNorm2d(128, eps=2e-5, momentum=0.9)
        # self.stage2_unit4_bn2_scale
        self.stage2_unit4_relu1 = PReLU(128)
        self.stage2_unit4_conv2 = Conv2d(128, out_channels=128, kernel_size=(3, 3), groups=1, stride=(1, 1), padding=(1,1), bias=False)
        self.stage2_unit4_bn3 = BatchNorm2d(128, eps=2e-5, momentum=0.9)
        # self.stage2_unit4_bn3_scale
        self.stage2_unit4_se_pool1 = AdaptiveAvgPool2d(1)
        self.stage2_unit4_se_conv1 = Conv2d(128, out_channels=8, kernel_size=(1, 1), groups=1, stride=(1, 1), padding=(0,0), bias=True)
        self.stage2_unit4_se_relu1 = PReLU(8)
        self.stage2_unit4_se_conv2 = Conv2d(8, out_channels=128, kernel_size=(1, 1), groups=1, stride=(1, 1), padding=(0,0), bias=True)
        self.stage2_unit4_se_sigmoid = Sigmoid()
        # self._plus6

        self.stage3_unit1_bn1 = BatchNorm2d(128, eps=2e-5, momentum=0.9)
        # self.stage3_unit1_bn1_scale
        self.stage3_unit1_conv1 = Conv2d(128, out_channels=256, kernel_size=(3, 3), groups=1, stride=(1, 1), padding=(1,1), bias=False)
        self.stage3_unit1_bn2 = BatchNorm2d(256, eps=2e-5, momentum=0.9)
        # self.stage3_unit1_bn2_scale
        self.stage3_unit1_relu1 = PReLU(256)
        self.stage3_unit1_conv2 = Conv2d(256, out_channels=256, kernel_size=(3, 3), groups=1, stride=(2, 2), padding=(1,1), bias=False)
        self.stage3_unit1_bn3 = BatchNorm2d(256, eps=2e-5, momentum=0.9)
        # self.stage3_unit1_bn3_scale
        self.stage3_unit1_se_pool1 = AdaptiveAvgPool2d(1)
        self.stage3_unit1_se_conv1 = Conv2d(256, out_channels=16, kernel_size=(1, 1), groups=1, stride=(1, 1), padding=(0,0), bias=True)
        self.stage3_unit1_se_relu1 = PReLU(16)
        self.stage3_unit1_se_conv2 = Conv2d(16, out_channels=256, kernel_size=(1, 1), groups=1, stride=(1, 1), padding=(0,0), bias=True)
        self.stage3_unit1_se_sigmoid = Sigmoid()

        self.stage3_unit1_conv1sc = Conv2d(128, out_channels=256, kernel_size=(1, 1), groups=1, stride=(2, 2), padding=(0,0), bias=False)
        self.stage3_unit1_sc = BatchNorm2d(256, eps=2e-5, momentum=0.9)
        # self.stage3_unit1_sc_scale
        # self._plus7

        self.stage3_unit2_bn1 = BatchNorm2d(256, eps=2e-5, momentum=0.9)
        # self.stage3_unit2_bn1_scale
        self.stage3_unit2_conv1 = Conv2d(256, out_channels=256, kernel_size=(3, 3), groups=1, stride=(1, 1), padding=(1,1), bias=False)
        self.stage3_unit2_bn2 = BatchNorm2d(256, eps=2e-5, momentum=0.9)
        # self.stage3_unit2_bn2_scale
        self.stage3_unit2_relu1 = PReLU(256)
        self.stage3_unit2_conv2 = Conv2d(256, out_channels=256, kernel_size=(3, 3), groups=1, stride=(1, 1), padding=(1,1), bias=False)
        self.stage3_unit2_bn3 = BatchNorm2d(256, eps=2e-5, momentum=0.9)
        # self.stage3_unit2_bn3_scale
        self.stage3_unit2_se_pool1 = AdaptiveAvgPool2d(1)
        self.stage3_unit2_se_conv1 = Conv2d(256, out_channels=16, kernel_size=(1, 1), groups=1, stride=(1, 1), padding=(0,0), bias=True)
        self.stage3_unit2_se_relu1 = PReLU(16)
        self.stage3_unit2_se_conv2 = Conv2d(16, out_channels=256, kernel_size=(1, 1), groups=1, stride=(1, 1), padding=(0,0), bias=True)
        self.stage3_unit2_se_sigmoid = Sigmoid()
        # self._plus8

        self.stage3_unit3_bn1 = BatchNorm2d(256, eps=2e-5, momentum=0.9)
        # self.stage3_unit3_bn1_scale
        self.stage3_unit3_conv1 = Conv2d(256, out_channels=256, kernel_size=(3, 3), groups=1, stride=(1, 1), padding=(1,1), bias=False)
        self.stage3_unit3_bn2 = BatchNorm2d(256, eps=2e-5, momentum=0.9)
        # self.stage3_unit3_bn2_scale
        self.stage3_unit3_relu1 = PReLU(256)
        self.stage3_unit3_conv2 = Conv2d(256, out_channels=256, kernel_size=(3, 3), groups=1, stride=(1, 1), padding=(1,1), bias=False)
        self.stage3_unit3_bn3 = BatchNorm2d(256, eps=2e-5, momentum=0.9)
        # self.stage3_unit3_bn3_scale
        self.stage3_unit3_se_pool1 = AdaptiveAvgPool2d(1)
        self.stage3_unit3_se_conv1 = Conv2d(256, out_channels=16, kernel_size=(1, 1), groups=1, stride=(1, 1), padding=(0,0), bias=True)
        self.stage3_unit3_se_relu1 = PReLU(16)
        self.stage3_unit3_se_conv2 = Conv2d(16, out_channels=256, kernel_size=(1, 1), groups=1, stride=(1, 1), padding=(0,0), bias=True)
        self.stage3_unit3_se_sigmoid = Sigmoid()
        # self._plus9

        self.stage3_unit4_bn1 = BatchNorm2d(256, eps=2e-5, momentum=0.9)
        # self.stage3_unit4_bn1_scale
        self.stage3_unit4_conv1 = Conv2d(256, out_channels=256, kernel_size=(3, 3), groups=1, stride=(1, 1), padding=(1,1), bias=False)
        self.stage3_unit4_bn2 = BatchNorm2d(256, eps=2e-5, momentum=0.9)
        # self.stage3_unit4_bn2_scale
        self.stage3_unit4_relu1 = PReLU(256)
        self.stage3_unit4_conv2 = Conv2d(256, out_channels=256, kernel_size=(3, 3), groups=1, stride=(1, 1), padding=(1,1), bias=False)
        self.stage3_unit4_bn3 = BatchNorm2d(256, eps=2e-5, momentum=0.9)
        # self.stage3_unit4_bn3_scale
        self.stage3_unit4_se_pool1 = AdaptiveAvgPool2d(1)
        self.stage3_unit4_se_conv1 = Conv2d(256, out_channels=16, kernel_size=(1, 1), groups=1, stride=(1, 1), padding=(0,0), bias=True)
        self.stage3_unit4_se_relu1 = PReLU(16)
        self.stage3_unit4_se_conv2 = Conv2d(16, out_channels=256, kernel_size=(1, 1), groups=1, stride=(1, 1), padding=(0,0), bias=True)
        self.stage3_unit4_se_sigmoid = Sigmoid()
        # self._plus10

        self.stage3_unit5_bn1 = BatchNorm2d(256, eps=2e-5, momentum=0.9)
        # self.stage3_unit5_bn1_scale
        self.stage3_unit5_conv1 = Conv2d(256, out_channels=256, kernel_size=(3, 3), groups=1, stride=(1, 1), padding=(1,1), bias=False)
        self.stage3_unit5_bn2 = BatchNorm2d(256, eps=2e-5, momentum=0.9)
        # self.stage3_unit5_bn2_scale
        self.stage3_unit5_relu1 = PReLU(256)
        self.stage3_unit5_conv2 = Conv2d(256, out_channels=256, kernel_size=(3, 3), groups=1, stride=(1, 1), padding=(1,1), bias=False)
        self.stage3_unit5_bn3 = BatchNorm2d(256, eps=2e-5, momentum=0.9)
        # self.stage3_unit5_bn3_scale
        self.stage3_unit5_se_pool1 = AdaptiveAvgPool2d(1)
        self.stage3_unit5_se_conv1 = Conv2d(256, out_channels=16, kernel_size=(1, 1), groups=1, stride=(1, 1), padding=(0,0), bias=True)
        self.stage3_unit5_se_relu1 = PReLU(16)
        self.stage3_unit5_se_conv2 = Conv2d(16, out_channels=256, kernel_size=(1, 1), groups=1, stride=(1, 1), padding=(0,0), bias=True)
        self.stage3_unit5_se_sigmoid = Sigmoid()
        # self._plus11

        self.stage3_unit6_bn1 = BatchNorm2d(256, eps=2e-5, momentum=0.9)
        # self.stage3_unit6_bn1_scale
        self.stage3_unit6_conv1 = Conv2d(256, out_channels=256, kernel_size=(3, 3), groups=1, stride=(1, 1), padding=(1,1), bias=False)
        self.stage3_unit6_bn2 = BatchNorm2d(256, eps=2e-5, momentum=0.9)
        # self.stage3_unit6_bn2_scale
        self.stage3_unit6_relu1 = PReLU(256)
        self.stage3_unit6_conv2 = Conv2d(256, out_channels=256, kernel_size=(3, 3), groups=1, stride=(1, 1), padding=(1,1), bias=False)
        self.stage3_unit6_bn3 = BatchNorm2d(256, eps=2e-5, momentum=0.9)
        # self.stage3_unit6_bn3_scale
        self.stage3_unit6_se_pool1 = AdaptiveAvgPool2d(1)
        self.stage3_unit6_se_conv1 = Conv2d(256, out_channels=16, kernel_size=(1, 1), groups=1, stride=(1, 1), padding=(0,0), bias=True)
        self.stage3_unit6_se_relu1 = PReLU(16)
        self.stage3_unit6_se_conv2 = Conv2d(16, out_channels=256, kernel_size=(1, 1), groups=1, stride=(1, 1), padding=(0,0), bias=True)
        self.stage3_unit6_se_sigmoid = Sigmoid()
        # self._plus12

        self.stage3_unit7_bn1 = BatchNorm2d(256, eps=2e-5, momentum=0.9)
        # self.stage3_unit7_bn1_scale
        self.stage3_unit7_conv1 = Conv2d(256, out_channels=256, kernel_size=(3, 3), groups=1, stride=(1, 1), padding=(1,1), bias=False)
        self.stage3_unit7_bn2 = BatchNorm2d(256, eps=2e-5, momentum=0.9)
        # self.stage3_unit7_bn2_scale
        self.stage3_unit7_relu1 = PReLU(256)
        self.stage3_unit7_conv2 = Conv2d(256, out_channels=256, kernel_size=(3, 3), groups=1, stride=(1, 1), padding=(1,1), bias=False)
        self.stage3_unit7_bn3 = BatchNorm2d(256, eps=2e-5, momentum=0.9)
        # self.stage3_unit7_bn3_scale
        self.stage3_unit7_se_pool1 = AdaptiveAvgPool2d(1)
        self.stage3_unit7_se_conv1 = Conv2d(256, out_channels=16, kernel_size=(1, 1), groups=1, stride=(1, 1), padding=(0,0), bias=True)
        self.stage3_unit7_se_relu1 = PReLU(16)
        self.stage3_unit7_se_conv2 = Conv2d(16, out_channels=256, kernel_size=(1, 1), groups=1, stride=(1, 1), padding=(0,0), bias=True)
        self.stage3_unit7_se_sigmoid = Sigmoid()
        # self._plus13

        self.stage3_unit8_bn1 = BatchNorm2d(256, eps=2e-5, momentum=0.9)
        # self.stage3_unit8_bn1_scale
        self.stage3_unit8_conv1 = Conv2d(256, out_channels=256, kernel_size=(3, 3), groups=1, stride=(1, 1), padding=(1,1), bias=False)
        self.stage3_unit8_bn2 = BatchNorm2d(256, eps=2e-5, momentum=0.9)
        # self.stage3_unit8_bn2_scale
        self.stage3_unit8_relu1 = PReLU(256)
        self.stage3_unit8_conv2 = Conv2d(256, out_channels=256, kernel_size=(3, 3), groups=1, stride=(1, 1), padding=(1,1), bias=False)
        self.stage3_unit8_bn3 = BatchNorm2d(256, eps=2e-5, momentum=0.9)
        # self.stage3_unit8_bn3_scale
        self.stage3_unit8_se_pool1 = AdaptiveAvgPool2d(1)
        self.stage3_unit8_se_conv1 = Conv2d(256, out_channels=16, kernel_size=(1, 1), groups=1, stride=(1, 1), padding=(0,0), bias=True)
        self.stage3_unit8_se_relu1 = PReLU(16)
        self.stage3_unit8_se_conv2 = Conv2d(16, out_channels=256, kernel_size=(1, 1), groups=1, stride=(1, 1), padding=(0,0), bias=True)
        self.stage3_unit8_se_sigmoid = Sigmoid()
        # self._plus14

        self.stage3_unit9_bn1 = BatchNorm2d(256, eps=2e-5, momentum=0.9)
        # self.stage3_unit9_bn1_scale
        self.stage3_unit9_conv1 = Conv2d(256, out_channels=256, kernel_size=(3, 3), groups=1, stride=(1, 1), padding=(1,1), bias=False)
        self.stage3_unit9_bn2 = BatchNorm2d(256, eps=2e-5, momentum=0.9)
        # self.stage3_unit9_bn2_scale
        self.stage3_unit9_relu1 = PReLU(256)
        self.stage3_unit9_conv2 = Conv2d(256, out_channels=256, kernel_size=(3, 3), groups=1, stride=(1, 1), padding=(1,1), bias=False)
        self.stage3_unit9_bn3 = BatchNorm2d(256, eps=2e-5, momentum=0.9)
        # self.stage3_unit9_bn3_scale
        self.stage3_unit9_se_pool1 = AdaptiveAvgPool2d(1)
        self.stage3_unit9_se_conv1 = Conv2d(256, out_channels=16, kernel_size=(1, 1), groups=1, stride=(1, 1), padding=(0,0), bias=True)
        self.stage3_unit9_se_relu1 = PReLU(16)
        self.stage3_unit9_se_conv2 = Conv2d(16, out_channels=256, kernel_size=(1, 1), groups=1, stride=(1, 1), padding=(0,0), bias=True)
        self.stage3_unit9_se_sigmoid = Sigmoid()
        # self._plus15

        self.stage3_unit10_bn1 = BatchNorm2d(256, eps=2e-5, momentum=0.9)
        # self.stage3_unit10_bn1_scale
        self.stage3_unit10_conv1 = Conv2d(256, out_channels=256, kernel_size=(3, 3), groups=1, stride=(1, 1), padding=(1,1), bias=False)
        self.stage3_unit10_bn2 = BatchNorm2d(256, eps=2e-5, momentum=0.9)
        # self.stage3_unit10_bn2_scale
        self.stage3_unit10_relu1 = PReLU(256)
        self.stage3_unit10_conv2 = Conv2d(256, out_channels=256, kernel_size=(3, 3), groups=1, stride=(1, 1), padding=(1,1), bias=False)
        self.stage3_unit10_bn3 = BatchNorm2d(256, eps=2e-5, momentum=0.9)
        # self.stage3_unit10_bn3_scale
        self.stage3_unit10_se_pool1 = AdaptiveAvgPool2d(1)
        self.stage3_unit10_se_conv1 = Conv2d(256, out_channels=16, kernel_size=(1, 1), groups=1, stride=(1, 1), padding=(0,0), bias=True)
        self.stage3_unit10_se_relu1 = PReLU(16)
        self.stage3_unit10_se_conv2 = Conv2d(16, out_channels=256, kernel_size=(1, 1), groups=1, stride=(1, 1), padding=(0,0), bias=True)
        self.stage3_unit10_se_sigmoid = Sigmoid()
        # self._plus16

        self.stage3_unit11_bn1 = BatchNorm2d(256, eps=2e-5, momentum=0.9)
        # self.stage3_unit11_bn1_scale
        self.stage3_unit11_conv1 = Conv2d(256, out_channels=256, kernel_size=(3, 3), groups=1, stride=(1, 1), padding=(1,1), bias=False)
        self.stage3_unit11_bn2 = BatchNorm2d(256, eps=2e-5, momentum=0.9)
        # self.stage3_unit11_bn2_scale
        self.stage3_unit11_relu1 = PReLU(256)
        self.stage3_unit11_conv2 = Conv2d(256, out_channels=256, kernel_size=(3, 3), groups=1, stride=(1, 1), padding=(1,1), bias=False)
        self.stage3_unit11_bn3 = BatchNorm2d(256, eps=2e-5, momentum=0.9)
        # self.stage3_unit11_bn3_scale
        self.stage3_unit11_se_pool1 = AdaptiveAvgPool2d(1)
        self.stage3_unit11_se_conv1 = Conv2d(256, out_channels=16, kernel_size=(1, 1), groups=1, stride=(1, 1), padding=(0,0), bias=True)
        self.stage3_unit11_se_relu1 = PReLU(16)
        self.stage3_unit11_se_conv2 = Conv2d(16, out_channels=256, kernel_size=(1, 1), groups=1, stride=(1, 1), padding=(0,0), bias=True)
        self.stage3_unit11_se_sigmoid = Sigmoid()
        # self._plus17

        self.stage3_unit12_bn1 = BatchNorm2d(256, eps=2e-5, momentum=0.9)
        # self.stage3_unit12_bn1_scale
        self.stage3_unit12_conv1 = Conv2d(256, out_channels=256, kernel_size=(3, 3), groups=1, stride=(1, 1), padding=(1,1), bias=False)
        self.stage3_unit12_bn2 = BatchNorm2d(256, eps=2e-5, momentum=0.9)
        # self.stage3_unit12_bn2_scale
        self.stage3_unit12_relu1 = PReLU(256)
        self.stage3_unit12_conv2 = Conv2d(256, out_channels=256, kernel_size=(3, 3), groups=1, stride=(1, 1), padding=(1,1), bias=False)
        self.stage3_unit12_bn3 = BatchNorm2d(256, eps=2e-5, momentum=0.9)
        # self.stage3_unit12_bn3_scale
        self.stage3_unit12_se_pool1 = AdaptiveAvgPool2d(1)
        self.stage3_unit12_se_conv1 = Conv2d(256, out_channels=16, kernel_size=(1, 1), groups=1, stride=(1, 1), padding=(0,0), bias=True)
        self.stage3_unit12_se_relu1 = PReLU(16)
        self.stage3_unit12_se_conv2 = Conv2d(16, out_channels=256, kernel_size=(1, 1), groups=1, stride=(1, 1), padding=(0,0), bias=True)
        self.stage3_unit12_se_sigmoid = Sigmoid()
        # self._plus18

        self.stage3_unit13_bn1 = BatchNorm2d(256, eps=2e-5, momentum=0.9)
        # self.stage3_unit13_bn1_scale
        self.stage3_unit13_conv1 = Conv2d(256, out_channels=256, kernel_size=(3, 3), groups=1, stride=(1, 1), padding=(1,1), bias=False)
        self.stage3_unit13_bn2 = BatchNorm2d(256, eps=2e-5, momentum=0.9)
        # self.stage3_unit13_bn2_scale
        self.stage3_unit13_relu1 = PReLU(256)
        self.stage3_unit13_conv2 = Conv2d(256, out_channels=256, kernel_size=(3, 3), groups=1, stride=(1, 1), padding=(1,1), bias=False)
        self.stage3_unit13_bn3 = BatchNorm2d(256, eps=2e-5, momentum=0.9)
        # self.stage3_unit13_bn3_scale
        self.stage3_unit13_se_pool1 = AdaptiveAvgPool2d(1)
        self.stage3_unit13_se_conv1 = Conv2d(256, out_channels=16, kernel_size=(1, 1), groups=1, stride=(1, 1), padding=(0,0), bias=True)
        self.stage3_unit13_se_relu1 = PReLU(16)
        self.stage3_unit13_se_conv2 = Conv2d(16, out_channels=256, kernel_size=(1, 1), groups=1, stride=(1, 1), padding=(0,0), bias=True)
        self.stage3_unit13_se_sigmoid = Sigmoid()
        # self._plus19

        self.stage3_unit14_bn1 = BatchNorm2d(256, eps=2e-5, momentum=0.9)
        # self.stage3_unit14_bn1_scale
        self.stage3_unit14_conv1 = Conv2d(256, out_channels=256, kernel_size=(3, 3), groups=1, stride=(1, 1), padding=(1,1), bias=False)
        self.stage3_unit14_bn2 = BatchNorm2d(256, eps=2e-5, momentum=0.9)
        # self.stage3_unit14_bn2_scale
        self.stage3_unit14_relu1 = PReLU(256)
        self.stage3_unit14_conv2 = Conv2d(256, out_channels=256, kernel_size=(3, 3), groups=1, stride=(1, 1), padding=(1,1), bias=False)
        self.stage3_unit14_bn3 = BatchNorm2d(256, eps=2e-5, momentum=0.9)
        # self.stage3_unit14_bn3_scale
        self.stage3_unit14_se_pool1 = AdaptiveAvgPool2d(1)
        self.stage3_unit14_se_conv1 = Conv2d(256, out_channels=16, kernel_size=(1, 1), groups=1, stride=(1, 1), padding=(0,0), bias=True)
        self.stage3_unit14_se_relu1 = PReLU(16)
        self.stage3_unit14_se_conv2 = Conv2d(16, out_channels=256, kernel_size=(1, 1), groups=1, stride=(1, 1), padding=(0,0), bias=True)
        self.stage3_unit14_se_sigmoid = Sigmoid()
        # self._plus20

        self.stage4_unit1_bn1 = BatchNorm2d(256, eps=2e-5, momentum=0.9)
        # self.stage4_unit1_bn1_scale
        self.stage4_unit1_conv1 = Conv2d(256, out_channels=512, kernel_size=(3, 3), groups=1, stride=(1, 1), padding=(1,1), bias=False)
        self.stage4_unit1_bn2 = BatchNorm2d(512, eps=2e-5, momentum=0.9)
        # self.stage4_unit1_bn2_scale
        self.stage4_unit1_relu1 = PReLU(512)
        self.stage4_unit1_conv2 = Conv2d(512, out_channels=512, kernel_size=(3, 3), groups=1, stride=(2, 2), padding=(1,1), bias=False)
        self.stage4_unit1_bn3 = BatchNorm2d(512, eps=2e-5, momentum=0.9)
        # self.stage4_unit1_bn3_scale
        self.stage4_unit1_se_pool1 = AdaptiveAvgPool2d(1)
        self.stage4_unit1_se_conv1 = Conv2d(512, out_channels=32, kernel_size=(1, 1), groups=1, stride=(1, 1), padding=(0,0), bias=True)
        self.stage4_unit1_se_relu1 = PReLU(32)
        self.stage4_unit1_se_conv2 = Conv2d(32, out_channels=512, kernel_size=(1, 1), groups=1, stride=(1, 1), padding=(0,0), bias=True)
        self.stage4_unit1_se_sigmoid = Sigmoid()

        self.stage4_unit1_conv1sc = Conv2d(256, out_channels=512, kernel_size=(1, 1), groups=1, stride=(2, 2), padding=(0,0), bias=False)
        self.stage4_unit1_sc = BatchNorm2d(512, eps=2e-5, momentum=0.9)
        # self.stage4_unit1_sc_scale
        # self._plus21

        self.stage4_unit2_bn1 = BatchNorm2d(512, eps=2e-5, momentum=0.9)
        # self.stage4_unit2_bn1_scale
        self.stage4_unit2_conv1 = Conv2d(512, out_channels=512, kernel_size=(3, 3), groups=1, stride=(1, 1), padding=(1,1), bias=False)
        self.stage4_unit2_bn2 = BatchNorm2d(512, eps=2e-5, momentum=0.9)
        # self.stage4_unit2_bn2_scale
        self.stage4_unit2_relu1 = PReLU(512)
        self.stage4_unit2_conv2 = Conv2d(512, out_channels=512, kernel_size=(3, 3), groups=1, stride=(1, 1), padding=(1,1), bias=False)
        self.stage4_unit2_bn3 = BatchNorm2d(512, eps=2e-5, momentum=0.9)
        # self.stage4_unit2_bn3_scale
        self.stage4_unit2_se_pool1 = AdaptiveAvgPool2d(1)
        self.stage4_unit2_se_conv1 = Conv2d(512, out_channels=32, kernel_size=(1, 1), groups=1, stride=(1, 1), padding=(0,0), bias=True)
        self.stage4_unit2_se_relu1 = PReLU(32)
        self.stage4_unit2_se_conv2 = Conv2d(32, out_channels=512, kernel_size=(1, 1), groups=1, stride=(1, 1), padding=(0,0), bias=True)
        self.stage4_unit2_se_sigmoid = Sigmoid()
        # self._plus22

        self.stage4_unit3_bn1 = BatchNorm2d(512, eps=2e-5, momentum=0.9)
        # self.stage4_unit3_bn1_scale
        self.stage4_unit3_conv1 = Conv2d(512, out_channels=512, kernel_size=(3, 3), groups=1, stride=(1, 1), padding=(1,1), bias=False)
        self.stage4_unit3_bn2 = BatchNorm2d(512, eps=2e-5, momentum=0.9)
        # self.stage4_unit3_bn2_scale
        self.stage4_unit3_relu1 = PReLU(512)
        self.stage4_unit3_conv2 = Conv2d(512, out_channels=512, kernel_size=(3, 3), groups=1, stride=(1, 1), padding=(1,1), bias=False)
        self.stage4_unit3_bn3 = BatchNorm2d(512, eps=2e-5, momentum=0.9)
        # self.stage4_unit3_bn3_scale
        self.stage4_unit3_se_pool1 = AdaptiveAvgPool2d(1)
        self.stage4_unit3_se_conv1= Conv2d(512, out_channels=32, kernel_size=(1, 1), groups=1, stride=(1, 1), padding=(0,0), bias=True)
        self.stage4_unit3_se_relu1 = PReLU(32)
        self.stage4_unit3_se_conv2 = Conv2d(32, out_channels=512, kernel_size=(1, 1), groups=1, stride=(1, 1), padding=(0,0), bias=True)
        self.stage4_unit3_se_sigmoid = Sigmoid()
        # self._plus23

        self.bn1 = BatchNorm2d(512, eps=2e-5, momentum=0.9)
        # self.bn1_scale
        self.drop0 = Dropout(0.4)
        self.pre_fc1 = Linear(in_features=25088, out_features=512, bias=True)
        self.fc1 = BatchNorm1d(512, eps=2e-05, momentum=0.9)#, affine=True, track_running_stats=True)


    def forward(self, x):
        out = self.conv0(x)  # 20,64,112,112

        out = self.bn0(out)

        out = self.relu0(out)

        # plus0
        out2 = self.stage1_unit1_bn1(out)

        out2 = self.stage1_unit1_conv1(out2)

        out2 = self.stage1_unit1_bn2(out2)

        out2 = self.stage1_unit1_relu1(out2)
        out2 = self.stage1_unit1_conv2(out2)

        out2 = self.stage1_unit1_bn3(out2)
        # out2 = input
        w = self.stage1_unit1_se_pool1(out2)
        # out1 = w
        w = self.stage1_unit1_se_conv1(w)
        w = self.stage1_unit1_se_relu1(w)
        w = self.stage1_unit1_se_conv2(w)
        w = self.stage1_unit1_se_sigmoid(w)

        out = self.stage1_unit1_conv1sc(out)

        out = self.stage1_unit1_sc(out)
        out = out2 * w + out


        # plus1

        out2 = self.stage1_unit2_bn1(out)

        # self.stage1_unit2_bn1_scale
        out2 = self.stage1_unit2_conv1(out2)
        out2 = self.stage1_unit2_bn2(out2)
        # self.stage1_unit2_bn2_scale
        out2 = self.stage1_unit2_relu1(out2)
        out2 = self.stage1_unit2_conv2(out2)
        # out2 = input
        out2 = self.stage1_unit2_bn3(out2)
        # out1 = out2
        # self.stage1_unit2_bn3_scale
        w = self.stage1_unit2_se_pool1(out2)
        w = self.stage1_unit2_se_conv1(w)
        w = self.stage1_unit2_se_relu1(w)
        w = self.stage1_unit2_se_conv2(w)
        w = self.stage1_unit2_se_sigmoid(w)
        out = out2 * w + out

        # plus2
        out2 = self.stage1_unit3_bn1(out)
        # self.stage1_unit3_bn1_scale
        out2 = self.stage1_unit3_conv1(out2)
        out2 = self.stage1_unit3_bn2(out2)
        # self.stage1_unit3_bn2_scale
        out2 = self.stage1_unit3_relu1(out2)
        out2 = self.stage1_unit3_conv2(out2)
        out2 = self.stage1_unit3_bn3(out2)
        # self.stage1_unit3_bn3_scale
        w = self.stage1_unit3_se_pool1(out2)
        w = self.stage1_unit3_se_conv1(w)
        w = self.stage1_unit3_se_relu1(w)
        w = self.stage1_unit3_se_conv2(w)
        w = self.stage1_unit3_se_sigmoid(w)
        out = out2 * w + out

        # plus3
        out2 = self.stage2_unit1_bn1(out)
        # self.stage2_unit1_bn1_scale
        out2 = self.stage2_unit1_conv1(out2)
        out2 = self.stage2_unit1_bn2(out2)
        # self.stage2_unit1_bn2_scale
        out2 = self.stage2_unit1_relu1(out2)
        out2 = self.stage2_unit1_conv2(out2)
        out2 = self.stage2_unit1_bn3(out2)
        # self.stage2_unit1_bn3_scale
        w = self.stage2_unit1_se_pool1(out2)
        w = self.stage2_unit1_se_conv1(w)
        w = self.stage2_unit1_se_relu1(w)
        w = self.stage2_unit1_se_conv2(w)
        w = self.stage2_unit1_se_sigmoid(w)

        out = self.stage2_unit1_conv1sc(out)
        out = self.stage2_unit1_sc(out)
        out = out2 * w + out
        # self.stage2_unit1_sc_scale
        # self._plus3   #axpy

        # plus4
        out2 = self.stage2_unit2_bn1(out)
        # self.stage2_unit2_bn1_scale
        out2 = self.stage2_unit2_conv1(out2)
        out2 = self.stage2_unit2_bn2(out2)
        # self.stage2_unit2_bn2_scale
        out2 = self.stage2_unit2_relu1(out2)
        out2 = self.stage2_unit2_conv2(out2)
        out2 = self.stage2_unit2_bn3(out2)
        # self.stage2_unit2_bn3_scale
        w = self.stage2_unit2_se_pool1(out2)
        w = self.stage2_unit2_se_conv1(w)
        w = self.stage2_unit2_se_relu1(w)
        w = self.stage2_unit2_se_conv2(w)
        w = self.stage2_unit2_se_sigmoid(w)
        # self._plus4
        out = out2 * w + out

        # plus5
        out2 = self.stage2_unit3_bn1(out)
        # self.stage2_unit3_bn1_scale
        out2 = self.stage2_unit3_conv1(out2)
        out2 = self.stage2_unit3_bn2(out2)
        # self.stage2_unit3_bn2_scale
        out2 = self.stage2_unit3_relu1(out2)
        out2 = self.stage2_unit3_conv2(out2)
        out2 = self.stage2_unit3_bn3(out2)
        # self.stage2_unit3_bn3_scale
        w = self.stage2_unit3_se_pool1(out2)
        w = self.stage2_unit3_se_conv1(w)
        w = self.stage2_unit3_se_relu1(w)
        w = self.stage2_unit3_se_conv2(w)
        w = self.stage2_unit3_se_sigmoid(w)
        out = out2 * w + out
        # self._plus5

        # plus6
        out2 = self.stage2_unit4_bn1(out)
        # self.stage2_unit4_bn1_scale
        out2 = self.stage2_unit4_conv1(out2)
        out2 = self.stage2_unit4_bn2(out2)
        # self.stage2_unit4_bn2_scale
        out2 = self.stage2_unit4_relu1(out2)
        out2 = self.stage2_unit4_conv2(out2)
        out2 = self.stage2_unit4_bn3(out2)
        # self.stage2_unit4_bn3_scale
        w = self.stage2_unit4_se_pool1(out2)
        w = self.stage2_unit4_se_conv1(w)
        w = self.stage2_unit4_se_relu1(w)
        w = self.stage2_unit4_se_conv2(w)
        w = self.stage2_unit4_se_sigmoid(w)
        # self._plus6
        out = out2 * w + out

        # plus7
        out2 = self.stage3_unit1_bn1(out)
        # self.stage3_unit1_bn1_scale
        out2 = self.stage3_unit1_conv1(out2)
        out2 = self.stage3_unit1_bn2(out2)
        # self.stage3_unit1_bn2_scale
        out2 = self.stage3_unit1_relu1(out2)
        out2 = self.stage3_unit1_conv2(out2)
        out2 = self.stage3_unit1_bn3(out2)
        # self.stage3_unit1_bn3_scale
        w = self.stage3_unit1_se_pool1(out2)
        w = self.stage3_unit1_se_conv1(w)
        w = self.stage3_unit1_se_relu1(w)
        w = self.stage3_unit1_se_conv2(w)
        w = self.stage3_unit1_se_sigmoid(w)

        out = self.stage3_unit1_conv1sc(out)
        out = self.stage3_unit1_sc(out)
        # self.stage3_unit1_sc_scale
        # self._plus7
        out = out2 * w + out

        # plus8
        out2 = self.stage3_unit2_bn1(out)
        # self.stage3_unit2_bn1_scale
        out2 = self.stage3_unit2_conv1(out2)
        out2 = self.stage3_unit2_bn2(out2)
        # self.stage3_unit2_bn2_scale
        out2 = self.stage3_unit2_relu1(out2)
        out2 = self.stage3_unit2_conv2(out2)
        out2 = self.stage3_unit2_bn3(out2)
        # self.stage3_unit2_bn3_scale
        w = self.stage3_unit2_se_pool1(out2)
        w = self.stage3_unit2_se_conv1(w)
        w = self.stage3_unit2_se_relu1(w)
        w = self.stage3_unit2_se_conv2(w)
        w = self.stage3_unit2_se_sigmoid(w)
        # self._plus8
        out = out2 * w + out

        # plus9
        out2 = self.stage3_unit3_bn1(out)
        # self.stage3_unit3_bn1_scale
        out2 = self.stage3_unit3_conv1(out2)
        out2 = self.stage3_unit3_bn2(out2)
        # self.stage3_unit3_bn2_scale
        out2 = self.stage3_unit3_relu1(out2)
        out2 = self.stage3_unit3_conv2(out2)
        out2 = self.stage3_unit3_bn3(out2)
        # self.stage3_unit3_bn3_scale
        w = self.stage3_unit3_se_pool1(out2)
        w = self.stage3_unit3_se_conv1(w)
        w = self.stage3_unit3_se_relu1(w)
        w = self.stage3_unit3_se_conv2(w)
        w = self.stage3_unit3_se_sigmoid(w)
        # self._plus9
        out = out2 * w + out

        # plus10
        out2 = self.stage3_unit4_bn1(out)
        # self.stage3_unit4_bn1_scale
        out2 = self.stage3_unit4_conv1(out2)
        out2 = self.stage3_unit4_bn2(out2)
        # self.stage3_unit4_bn2_scale
        out2 = self.stage3_unit4_relu1(out2)
        out2 = self.stage3_unit4_conv2(out2)
        out2 = self.stage3_unit4_bn3(out2)
        # self.stage3_unit4_bn3_scale
        w = self.stage3_unit4_se_pool1(out2)
        w = self.stage3_unit4_se_conv1(w)
        w = self.stage3_unit4_se_relu1(w)
        w = self.stage3_unit4_se_conv2(w)
        w = self.stage3_unit4_se_sigmoid(w)
        # self._plus10
        out = out2 * w + out

        # plus11
        out2 = self.stage3_unit5_bn1(out)
        # self.stage3_unit5_bn1_scale
        out2 = self.stage3_unit5_conv1(out2)
        out2 = self.stage3_unit5_bn2(out2)
        # self.stage3_unit5_bn2_scale
        out2 = self.stage3_unit5_relu1(out2)
        out2 = self.stage3_unit5_conv2(out2)
        out2 = self.stage3_unit5_bn3(out2)
        # self.stage3_unit5_bn3_scale
        w = self.stage3_unit5_se_pool1(out2)
        w = self.stage3_unit5_se_conv1(w)
        w = self.stage3_unit5_se_relu1(w)
        w = self.stage3_unit5_se_conv2(w)
        w = self.stage3_unit5_se_sigmoid(w)
        # self._plus11
        out = out2 * w + out

        # plus12
        out2 = self.stage3_unit6_bn1(out)
        # self.stage3_unit6_bn1_scale
        out2 = self.stage3_unit6_conv1(out2)
        out2 = self.stage3_unit6_bn2(out2)
        # self.stage3_unit6_bn2_scale
        out2 = self.stage3_unit6_relu1(out2)
        out2 = self.stage3_unit6_conv2(out2)
        out2 = self.stage3_unit6_bn3(out2)
        # self.stage3_unit6_bn3_scale
        w = self.stage3_unit6_se_pool1(out2)
        w = self.stage3_unit6_se_conv1(w)
        w = self.stage3_unit6_se_relu1(w)
        w = self.stage3_unit6_se_conv2(w)
        w = self.stage3_unit6_se_sigmoid(w)
        # self._plus12
        out = out2 * w + out

        # plus13
        out2 = self.stage3_unit7_bn1(out)
        # self.stage3_unit7_bn1_scale
        out2 = self.stage3_unit7_conv1(out2)
        out2 = self.stage3_unit7_bn2(out2)
        # self.stage3_unit7_bn2_scale
        out2 = self.stage3_unit7_relu1(out2)
        out2 = self.stage3_unit7_conv2(out2)
        out2 = self.stage3_unit7_bn3(out2)
        # self.stage3_unit7_bn3_scale
        w = self.stage3_unit7_se_pool1(out2)
        w = self.stage3_unit7_se_conv1(w)
        w = self.stage3_unit7_se_relu1(w)
        w = self.stage3_unit7_se_conv2(w)
        w = self.stage3_unit7_se_sigmoid(w)
        # self._plus13
        out = out2 * w + out

        # plus14
        out2 = self.stage3_unit8_bn1(out)
        # self.stage3_unit8_bn1_scale
        out2 = self.stage3_unit8_conv1(out2)
        out2 = self.stage3_unit8_bn2(out2)
        # self.stage3_unit8_bn2_scale
        out2 = self.stage3_unit8_relu1(out2)
        out2 = self.stage3_unit8_conv2(out2)
        out2 = self.stage3_unit8_bn3(out2)
        # self.stage3_unit8_bn3_scale
        w = self.stage3_unit8_se_pool1(out2)
        w = self.stage3_unit8_se_conv1(w)
        w = self.stage3_unit8_se_relu1(w)
        w = self.stage3_unit8_se_conv2(w)
        w = self.stage3_unit8_se_sigmoid(w)
        # self._plus14
        out = out2 * w + out

        # plus15
        out2 = self.stage3_unit9_bn1(out)
        # self.stage3_unit9_bn1_scale
        out2 = self.stage3_unit9_conv1(out2)
        out2 = self.stage3_unit9_bn2(out2)
        # self.stage3_unit9_bn2_scale
        out2 = self.stage3_unit9_relu1(out2)
        out2 = self.stage3_unit9_conv2(out2)
        out2 = self.stage3_unit9_bn3(out2)
        # self.stage3_unit9_bn3_scale
        w = self.stage3_unit9_se_pool1(out2)
        w = self.stage3_unit9_se_conv1(w)
        w = self.stage3_unit9_se_relu1(w)
        w = self.stage3_unit9_se_conv2(w)
        w = self.stage3_unit9_se_sigmoid(w)
        # self._plus15
        out = out2 * w + out

        # plus16
        out2 = self.stage3_unit10_bn1(out)
        # self.stage3_unit10_bn1_scale
        out2 = self.stage3_unit10_conv1(out2)
        out2 = self.stage3_unit10_bn2(out2)
        # self.stage3_unit10_bn2_scale
        out2 = self.stage3_unit10_relu1(out2)
        out2 = self.stage3_unit10_conv2(out2)
        out2 = self.stage3_unit10_bn3(out2)
        # self.stage3_unit10_bn3_scale
        w = self.stage3_unit10_se_pool1(out2)
        w = self.stage3_unit10_se_conv1(w)
        w = self.stage3_unit10_se_relu1(w)
        w = self.stage3_unit10_se_conv2(w)
        w = self.stage3_unit10_se_sigmoid(w)
        # self._plus16
        out = out2 * w + out
        # return out

        # plus17
        out2 = self.stage3_unit11_bn1(out)
        # self.stage3_unit11_bn1_scale
        out2 = self.stage3_unit11_conv1(out2)
        out2 = self.stage3_unit11_bn2(out2)
        # self.stage3_unit11_bn2_scale
        out2 = self.stage3_unit11_relu1(out2)
        out2 = self.stage3_unit11_conv2(out2)
        out2 = self.stage3_unit11_bn3(out2)
        # self.stage3_unit11_bn3_scale
        w = self.stage3_unit11_se_pool1(out2)
        w = self.stage3_unit11_se_conv1(w)
        w = self.stage3_unit11_se_relu1(w)
        w = self.stage3_unit11_se_conv2(w)
        w = self.stage3_unit11_se_sigmoid(w)
        # self._plus17
        out = out2 * w + out


        # plus18
        out2 = self.stage3_unit12_bn1(out)
        # self.stage3_unit12_bn1_scale
        out2 = self.stage3_unit12_conv1(out2)
        out2 = self.stage3_unit12_bn2(out2)
        # self.stage3_unit12_bn2_scale
        out2 = self.stage3_unit12_relu1(out2)
        out2 = self.stage3_unit12_conv2(out2)
        out2 = self.stage3_unit12_bn3(out2)
        # self.stage3_unit12_bn3_scale
        w = self.stage3_unit12_se_pool1(out2)
        w = self.stage3_unit12_se_conv1(w)
        w = self.stage3_unit12_se_relu1(w)
        w = self.stage3_unit12_se_conv2(w)
        w = self.stage3_unit12_se_sigmoid(w)
        # self._plus18
        out = out2 * w + out


        # plus19
        out2 = self.stage3_unit13_bn1(out)
        # self.stage3_unit13_bn1_scale
        out2 = self.stage3_unit13_conv1(out2)
        out2 = self.stage3_unit13_bn2(out2)
        # self.stage3_unit13_bn2_scale
        out2 = self.stage3_unit13_relu1(out2)
        out2 = self.stage3_unit13_conv2(out2)
        out2 = self.stage3_unit13_bn3(out2)
        # self.stage3_unit13_bn3_scale
        w = self.stage3_unit13_se_pool1(out2)
        w = self.stage3_unit13_se_conv1(w)
        w = self.stage3_unit13_se_relu1(w)
        w = self.stage3_unit13_se_conv2(w)
        w = self.stage3_unit13_se_sigmoid(w)
        # self._plus19
        out = out2 * w + out


        # plus20
        out2 = self.stage3_unit14_bn1(out)
        # self.stage3_unit14_bn1_scale
        out2 = self.stage3_unit14_conv1(out2)
        out2 = self.stage3_unit14_bn2(out2)
        # self.stage3_unit14_bn2_scale
        out2 = self.stage3_unit14_relu1(out2)
        out2 = self.stage3_unit14_conv2(out2)
        out2 = self.stage3_unit14_bn3(out2)
        # self.stage3_unit14_bn3_scale
        w = self.stage3_unit14_se_pool1(out2)
        w = self.stage3_unit14_se_conv1(w)
        w = self.stage3_unit14_se_relu1(w)
        w = self.stage3_unit14_se_conv2(w)
        w = self.stage3_unit14_se_sigmoid(w)
        # self._plus20
        out = out2 * w + out


        # plus21

        out2 = self.stage4_unit1_bn1(out)

        # self.stage4_unit1_bn1_scale
        out2 = self.stage4_unit1_conv1(out2)

        out2 = self.stage4_unit1_bn2(out2)

        # self.stage4_unit1_bn2_scale
        out2 = self.stage4_unit1_relu1(out2)

        out2 = self.stage4_unit1_conv2(out2)

        out2 = self.stage4_unit1_bn3(out2)

        # self.stage4_unit1_bn3_scale
        w = self.stage4_unit1_se_pool1(out2)

        w = self.stage4_unit1_se_conv1(w)

        w = self.stage4_unit1_se_relu1(w)

        w = self.stage4_unit1_se_conv2(w)

        w = self.stage4_unit1_se_sigmoid(w)

        out = self.stage4_unit1_conv1sc(out)
        # out = input
        out = self.stage4_unit1_sc(out)
        # out1 = out
        # self.stage4_unit1_sc_scale
        # self._plus21
        out = out2 * w + out



        # plus22
        out2 = self.stage4_unit2_bn1(out)
        # self.stage4_unit2_bn1_scale
        out2 = self.stage4_unit2_conv1(out2)
        out2 = self.stage4_unit2_bn2(out2)
        # self.stage4_unit2_bn2_scale
        out2 = self.stage4_unit2_relu1(out2)
        out2 = self.stage4_unit2_conv2(out2)
        out2 = self.stage4_unit2_bn3(out2)
        # self.stage4_unit2_bn3_scale
        w = self.stage4_unit2_se_pool1(out2)
        w = self.stage4_unit2_se_conv1(w)
        w = self.stage4_unit2_se_relu1(w)
        w = self.stage4_unit2_se_conv2(w)
        w = self.stage4_unit2_se_sigmoid(w)
        # self._plus22
        out = out2 * w + out



        # plus23
        out2 = self.stage4_unit3_bn1(out)
        # self.stage4_unit3_bn1_scale
        out2 = self.stage4_unit3_conv1(out2)
        out2 = self.stage4_unit3_bn2(out2)
        # self.stage4_unit3_bn2_scale
        out2 = self.stage4_unit3_relu1(out2)
        out2 = self.stage4_unit3_conv2(out2)
        out2 = self.stage4_unit3_bn3(out2)
        # self.stage4_unit3_bn3_scale
        w = self.stage4_unit3_se_pool1(out2)
        w = self.stage4_unit3_se_conv1(w)
        w = self.stage4_unit3_se_relu1(w)
        w = self.stage4_unit3_se_conv2(w)
        w = self.stage4_unit3_se_sigmoid(w)
        # self._plus23
        out = out2 * w + out

        out = self.bn1(out)

        # self.bn1_scale
        out = self.drop0(out)

        out = out.view(out.size(0), -1)
        out = self.pre_fc1(out)

        # out = input
        out = self.fc1(out)
        # out1 = out
        return out#, out1# , out_res  #l2_norm(out)

2)利用caffe进行模型加载:

#coding=utf-8
import sys
sys.path.insert(0, "/home/fuxueping/sdb/Caffe_Project_Train/caffe-ssd/python")
import caffe


# Trained Caffe weights and the matching network definition.
caffe_model = 'face.caffemodel'
prototxt = 'face.prototxt'
# Run Caffe on the GPU, using device 0.
caffe.set_mode_gpu()
caffe.set_device(0)
# Build the network in TEST phase with the trained weights loaded; `net`
# exposes the layer parameters (net.params) and activations (net.blobs).
net = caffe.Net(prototxt, caffe_model, caffe.TEST)

3)4)5)这三步合并在一起实现:

    def init_model(self, model,net):
        """Copy Caffe parameters into every supported layer of ``model``.

        Walks ``model.named_modules()`` and passes each layer, together with
        its module name, to the loader that matches the layer's type. The
        loaders look the module name up in ``net.params``, so sub-modules are
        expected to be named after the corresponding Caffe layers.

        Args:
            model: PyTorch module to initialise in place.
            net: loaded Caffe network, handed to the loaders unchanged.

        Returns:
            The same ``model`` object that was passed in.
        """
        # (layer types, loader) pairs. None of these torch layer classes
        # derives from another, so at most one entry matches a given module.
        # Pooling layers are intentionally not dispatched
        # (AdaptiveAvgPool2d_init is not used here).
        loaders = (
            ((BatchNorm2d, BatchNorm1d), self.bn_init),
            (Conv2d, self.conv_init),
            (Linear, self.fc_init),
            (PReLU, self.prelu_init),
        )
        for layer_name, layer in model.named_modules():
            for layer_types, load in loaders:
                if isinstance(layer, layer_types):
                    load(layer_name, layer, net)
                    break
        return model

    def bn_init(self, n, m, net):
        """Load a Caffe BatchNorm layer (and its Scale layer) into ``m``.

        Caffe splits batch normalisation across two layers: ``BatchNorm``
        holds the statistics and a following ``Scale`` layer, looked up here
        as ``n + '_scale'``, holds the learned gamma/beta.

        Args:
            n: Caffe layer name; also the PyTorch module name.
            m: PyTorch batch-norm module to fill in place.
            net: loaded Caffe network exposing ``net.params``.

        Does nothing when ``n`` is absent from ``net.params`` or has no blobs.
        """
        if n not in net.params:
            return
        blobs = net.params[n]
        if len(blobs) == 0:
            return

        # Caffe stores *accumulated* mean/variance plus, as a third blob, the
        # moving-average factor they must be divided by. Caffe itself uses a
        # multiplier of 0 when that factor is 0, which is mirrored here.
        # Without a third blob the statistics are used as stored.
        multiplier = 1.0
        if len(blobs) > 2:
            accumulated = float(blobs[2].data[0])
            multiplier = 0.0 if accumulated == 0 else 1.0 / accumulated
        m.running_mean.copy_(torch.FloatTensor(blobs[0].data) * multiplier)
        m.running_var.copy_(torch.FloatTensor(blobs[1].data) * multiplier)

        name = n + '_scale'
        if name not in net.params:
            return
        # Check the Scale layer's own blobs (not the BatchNorm layer's).
        scale_blobs = net.params[name]
        if len(scale_blobs) == 0:
            return
        m.weight.data.copy_(torch.FloatTensor(scale_blobs[0].data))
        if len(scale_blobs) > 1:
            m.bias.data.copy_(torch.FloatTensor(scale_blobs[1].data))
        else:
            # A Scale layer without a bias blob applies no shift.
            m.bias.data.zero_()


    def conv_init(self, n, m, net):
        """Load a Caffe Convolution layer's weight (and bias) into ``m``.

        Args:
            n: Caffe layer name; also the PyTorch module name.
            m: PyTorch convolution module to fill in place.
            net: loaded Caffe network exposing ``net.params``.

        Blob 0 is copied into ``m.weight``; blob 1, when present, into
        ``m.bias``. Does nothing when ``n`` is absent from ``net.params`` or
        has no blobs.
        """
        if n not in net.params:
            return
        blobs = net.params[n]
        if len(blobs) == 0:
            return
        # A second blob means the Caffe layer was built with a bias term.
        if len(blobs) > 1:
            m.bias.data.copy_(torch.FloatTensor(blobs[1].data[:]))
        m.weight.data.copy_(torch.FloatTensor(blobs[0].data[:]))


    def fc_init(self, n, m, net):
        """Load a Caffe fully-connected layer's weight (and bias) into ``m``.

        Args:
            n: Caffe layer name; also the PyTorch module name.
            m: PyTorch linear module to fill in place.
            net: loaded Caffe network exposing ``net.params``.

        Blob 0 goes to ``m.weight`` and blob 1, if the layer has one, to
        ``m.bias``. Layers missing from ``net.params`` or without blobs are
        left untouched.
        """
        if n in net.params and len(net.params[n]) > 0:
            layer_blobs = net.params[n]
            has_bias = len(layer_blobs) > 1
            if has_bias:
                bias_values = layer_blobs[1].data[:]
                m.bias.data.copy_(torch.FloatTensor(bias_values))
            weight_values = layer_blobs[0].data[:]
            m.weight.data.copy_(torch.FloatTensor(weight_values))

    def prelu_init(self, n, m, net):
        """Load a Caffe PReLU layer's slopes into ``m.weight``.

        Args:
            n: Caffe layer name; also the PyTorch module name.
            m: PyTorch PReLU module to fill in place.
            net: loaded Caffe network exposing ``net.params``.

        Only blob 0 is read. Nothing happens when ``n`` is absent from
        ``net.params`` or has no blobs.
        """
        if n not in net.params:
            return
        if len(net.params[n]) <= 0:
            return
        slopes = net.params[n][0].data[:]
        m.weight.data.copy_(torch.FloatTensor(slopes))

    def AdaptiveAvgPool2d_init(self, n, m, net):
        """Copy blob 0 of Caffe layer ``n`` into ``m.weight``.

        Args:
            n: Caffe layer name; also the PyTorch module name.
            m: PyTorch module to fill in place.
            net: loaded Caffe network exposing ``net.params``.

        Nothing happens when ``n`` is absent from ``net.params`` or has no
        blobs.
        """
        # NOTE(review): torch's AdaptiveAvgPool2d has no ``weight``, so the
        # copy below would fail for a real pooling module. Presumably pooling
        # layers never appear in net.params, which would make this a no-op.
        # Confirm before relying on it.
        has_blobs = n in net.params and len(net.params[n]) > 0
        if not has_blobs:
            return
        first_blob = net.params[n][0].data[:]
        m.weight.data.copy_(torch.FloatTensor(first_blob))

6)对相应层的输出特征(feature)进行比较:

a.保证两个框架使用相同的输入图像,可以自己构造一个:

# Fixed test input: an all-ones float32 array of shape (1, 3, 112, 112).
img = np.ones([1,3,112,112]).astype(np.float32)

b.对每个层名进行feature的读取,进行比较:

caffe读取指定层的输出(feature):

# Write the test input into the network's 'data' blob and run a forward pass.
net.blobs['data'].data[...] = img
output = net.forward()
# Take the 'fc1' blob of the first sample as a flat vector for comparison.
caffe_data = net.blobs["fc1"].data[0][...].flatten()

pytorch读取对应的输出(feature):

# Run the PyTorch model on the GPU.
# NOTE(review): `net1` and `im_tensor` are not defined in this snippet;
# presumably they are the converted model and `img` as a torch tensor. Confirm.
out = net1(im_tensor.cuda())#, torch.from_numpy(out).unsqueeze(0).cuda())
                         # torch.from_numpy(w).unsqueeze(0).cuda(),
                         # torch.from_numpy(out2).unsqueeze(0).cuda())
# Bring the result back to the CPU as a flat NumPy vector.
pytorch_data = out.data.cpu().numpy().flatten()

c.对比两边的输出(计算平均绝对误差):

# Sum of element-wise absolute differences between the two outputs.
diff = abs(pytorch_data - caffe_data).sum()
# Report it per element, i.e. the mean absolute difference.
print("caffe & pytorch diff:", diff/pytorch_data.size)

 参考资源:

torch和caffe中的BatchNorm层

用MXnet预训练模型初始化Pytorch模型

 

 

 

 

你可能感兴趣的:(深度学习pytorch使用,pytorch,caffe,模型转换)