3dcnn视频分类算法-pytorch上分之路

3DCNN-视频分类

  • 项目结构
  • config.py
  • datalist.py
  • model.py
  • train.py
  • 最后

项目结构

3dcnn视频分类算法-pytorch上分之路_第1张图片

config.py

import argparse


def _str2bool(value):
    """Parse a command-line boolean.

    argparse's ``type=bool`` treats ANY non-empty string (including the
    string "False") as True, so boolean flags need an explicit parser.
    """
    if isinstance(value, bool):
        return value
    if value.lower() in ('true', 't', 'yes', 'y', '1'):
        return True
    if value.lower() in ('false', 'f', 'no', 'n', '0'):
        return False
    raise argparse.ArgumentTypeError(f"boolean value expected, got {value!r}")


'''
training settings

metavar controls how the argument placeholder is shown in --help output
'''
parser = argparse.ArgumentParser(description='PyTorch Example for all')
parser.add_argument('--train-batch-size', type=int, default=30, metavar='N',
                    help='input batch size for training (default: 30)')
parser.add_argument('--test-batch-size', type=int, default=30, metavar='N',
                    help='input batch size for testing (default: 30)')
parser.add_argument('--epoches', type=int, default=500, metavar='N',
                    help='number of epochs to train (default: 500)')
parser.add_argument('--lr', type=float, default=0.01, metavar='LR',
                    help='learning rate (default: 0.01)')
parser.add_argument('--momentum', type=float, default=0.5, metavar='M',
                    help='SGD momentum (default: 0.5)')
parser.add_argument('--seed', type=int, default=123, metavar='S',
                    help='random seed; fixing the seed makes the generated '
                         'random numbers reproducible across runs, which helps '
                         'when comparing experiment results')
parser.add_argument('--use_cuda', type=_str2bool, default=True,
                    help='whether to use cuda to accerlate')
parser.add_argument('--base_data_path', type=str, default='G:/数据集/分类/',
                    help="total base data path for training")
parser.add_argument('--resume', type=_str2bool, default=False, metavar='R',
                    help="whether to use the pretrained model to start the train")
parser.add_argument('--saved_model', type=str, default="E:/完成工作/trained_model/",
                    help="the path to store the weight")
parser.add_argument('--val_num', type=float, default=0.3,
                    help="perecentage of validate data")
parser.add_argument('--pretrained_weight', type=str, default="E:/完成工作/trained_model/",
                    help="the path to load the pytorch weight")
parser.add_argument('--save', type=_str2bool, default=True,
                    help="whether to save the model weight")
parser.add_argument('--project_name', type=str, default='视频分类',
                    help="name of this project")
parser.add_argument('--use_aug', type=_str2bool, default=False,
                    help='use data augmentation to increase data diversity')
parser.add_argument('--model_name', type=str, default="shuffle_net",
                    help='model name')
parser.add_argument('--train_dir', type=str, default="E:/Datasets2/violencedata/hockey/train")
parser.add_argument('--test_dir', type=str, default="E:/Datasets2/violencedata/hockey/test")

datalist.py

import os
from random import shuffle
from torchvision import transforms
import cv2
from torch.utils.data import Dataset
import torch
from uuid import uuid1
class VideoDataset(Dataset):
    """In-memory video clip dataset for binary classification.

    Every video file under ``dirs`` is decoded up-front into its first 16
    frames, each centre-cropped to 112x112.  ``__getitem__`` returns a
    ``(C, T, H, W)`` float tensor scaled to [-1, 1] together with an int
    label (file names containing "fi" -> class 0, everything else -> 1).
    """

    def __init__(self, dirs):
        super(VideoDataset, self).__init__()
        self.base_dir = dirs

        self.video_data = []
        self.video_label = []
        videos = [self.base_dir + '/' + video for video in os.listdir(self.base_dir)]

        # Decode everything eagerly -- this can take a long time for a
        # large dataset, but makes __getitem__ cheap.
        for video_line in videos:
            frames, label = self.get_video_and_label(video_line)
            self.video_data.append(frames)
            self.video_label.append(label)

    def __getitem__(self, index):
        """Return ``(clip, label)`` with clip shaped (C, T, H, W) in [-1, 1]."""
        frames = torch.Tensor(self.video_data[index])  # (T, H, W, C)
        frames = frames.permute(3, 0, 1, 2)            # (C, T, H, W)
        frames = frames.div(127.5).sub(1)              # [0, 255] -> [-1, 1]
        return frames, self.video_label[index]

    def __len__(self):
        return len(self.video_label)

    def get_video_and_label(self, video):
        """Decode the first 16 frames of ``video`` and derive its label.

        Returns a ``(frames, label)`` pair where ``frames`` is a list of
        112x112x3 arrays.  Raises AssertionError when the video holds
        fewer than 16 frames.
        """
        frames = []
        label = 0 if "fi" in video.split('/')[-1] else 1
        cap = cv2.VideoCapture(video)
        # The original checked CAP_PROP_FPS, but the intent (per the
        # message) is the clip length, i.e. the total frame count.
        frame_count = cap.get(cv2.CAP_PROP_FRAME_COUNT)
        assert frame_count >= 16, "video length should greater than 16"
        width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
        height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
        # Grab the first 16 consecutive frames.
        while cap.isOpened() and len(frames) < 16:
            flag, frame = cap.read()
            if not flag:
                # Decoder ran out of frames early; stop instead of
                # crashing on a None frame.
                break
            # Centre-crop 112x112: rows are indexed by height, columns by
            # width (the original had the two axes swapped).
            frame = frame[height // 2 - 56:height // 2 + 56,
                          width // 2 - 56:width // 2 + 56]
            frames.append(frame)
        cap.release()  # free the decoder handle

        return frames, label

    def onehot_encode(self, label, n_class=2):
        """Return ``label`` as a one-hot float vector of length ``n_class``."""
        diag = torch.eye(n_class)
        return diag[label].view(n_class)

model.py

import torch
import torch.nn as nn


class C3D(nn.Module):
    """The C3D video-classification backbone.

    Takes a batch of clips shaped (N, 3, 16, 112, 112) and returns raw
    class logits shaped (N, num_classes).
    """

    def __init__(self, num_classes):
        super(C3D, self).__init__()

        # Five convolutional stages; each pool shrinks (D, H, W) so that
        # 16x112x112 ends up as 1x4x4 after pool5.
        self.conv1 = nn.Conv3d(3, 64, kernel_size=(3, 3, 3), padding=(1, 1, 1))
        self.pool1 = nn.MaxPool3d(kernel_size=(1, 2, 2), stride=(1, 2, 2))

        self.conv2 = nn.Conv3d(64, 128, kernel_size=(3, 3, 3), padding=(1, 1, 1))
        self.pool2 = nn.MaxPool3d(kernel_size=(2, 2, 2), stride=(2, 2, 2))

        self.conv3a = nn.Conv3d(128, 256, kernel_size=(3, 3, 3), padding=(1, 1, 1))
        self.conv3b = nn.Conv3d(256, 256, kernel_size=(3, 3, 3), padding=(1, 1, 1))
        self.pool3 = nn.MaxPool3d(kernel_size=(2, 2, 2), stride=(2, 2, 2))

        self.conv4a = nn.Conv3d(256, 512, kernel_size=(3, 3, 3), padding=(1, 1, 1))
        self.conv4b = nn.Conv3d(512, 512, kernel_size=(3, 3, 3), padding=(1, 1, 1))
        self.pool4 = nn.MaxPool3d(kernel_size=(2, 2, 2), stride=(2, 2, 2))

        self.conv5a = nn.Conv3d(512, 512, kernel_size=(3, 3, 3), padding=(1, 1, 1))
        self.conv5b = nn.Conv3d(512, 512, kernel_size=(3, 3, 3), padding=(1, 1, 1))
        self.pool5 = nn.MaxPool3d(kernel_size=(2, 2, 2), stride=(2, 2, 2), padding=(0, 1, 1))

        # Classifier head: 512 * 1 * 4 * 4 = 8192 features after pool5.
        self.fc6 = nn.Linear(8192, 512)
        self.fc7 = nn.Linear(512, 100)
        self.fc8 = nn.Linear(100, num_classes)

        self.dropout = nn.Dropout(p=0.5)
        self.relu = nn.ReLU()
        # Kept for attribute compatibility; not used in forward().
        self.sigmoid = nn.Sigmoid()

    def forward(self, x):
        """Map clips (N, 3, 16, 112, 112) to logits (N, num_classes)."""
        x = self.pool1(self.relu(self.conv1(x)))
        x = self.pool2(self.relu(self.conv2(x)))

        x = self.relu(self.conv3a(x))
        x = self.pool3(self.relu(self.conv3b(x)))

        x = self.relu(self.conv4a(x))
        x = self.pool4(self.relu(self.conv4b(x)))

        x = self.relu(self.conv5a(x))
        x = self.pool5(self.relu(self.conv5b(x)))

        x = x.view(-1, 8192)  # flatten the 512x1x4x4 volume
        x = self.dropout(self.relu(self.fc6(x)))
        x = self.dropout(self.relu(self.fc7(x)))

        return self.fc8(x)

train.py

import numpy as np
import torch
import torch.backends.cudnn as cudnn
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader
from tqdm import tqdm

from config import parser
from datalist import VideoDataset
from model import C3D

# Best test-set accuracy seen so far; updated by train.test() across epochs.
best_acc = 0


class train(object):
    """End-to-end training driver.

    Construction parses CLI args, builds the data loaders and the C3D
    model, then immediately runs the full train/test loop, checkpointing
    whenever test accuracy improves.
    """

    def __init__(self):
        self.args = parser.parse_args()
        print(f"-----------{self.args.project_name}-------------")

        use_cuda = self.args.use_cuda and torch.cuda.is_available()
        # Always seed the CPU RNG (weight init runs on CPU); additionally
        # seed the current GPU so CUDA runs are reproducible too.
        torch.manual_seed(self.args.seed)
        if use_cuda:
            torch.cuda.manual_seed(self.args.seed)
        self.device = torch.device("cuda" if use_cuda else "cpu")
        loader_kwargs = {'num_workers': 0, 'pin_memory': True} if use_cuda else {}

        # Build the DataLoaders (VideoDataset decodes everything up-front,
        # so this step can take a while).
        self.train_dataloader = DataLoader(VideoDataset(self.args.train_dir), batch_size=self.args.train_batch_size,
                                           shuffle=True, **loader_kwargs)
        self.test_dataloader = DataLoader(VideoDataset(self.args.test_dir), batch_size=self.args.test_batch_size,
                                          shuffle=True, **loader_kwargs)

        self.model = C3D(2).to(self.device)

        if self.args.resume:
            try:
                print("load the weight from pretrained-weight file")
                model_dict = self.model.state_dict()
                pretrained_dict = torch.load(self.args.pretrained_weight, map_location=self.device)
                # Keep only entries that exist in the current model with a
                # matching shape, so partially-compatible checkpoints load.
                pretrained_dict = {k: v for k, v in pretrained_dict.items()
                                   if k in model_dict and model_dict[k].shape == v.shape}
                model_dict.update(pretrained_dict)
                self.model.load_state_dict(model_dict)
                print("Finished to load the weight")
            except Exception:
                # Fall back to PyTorch's default initialisation (already
                # applied when the layers were constructed).
                print("can not load weight \n train the model from scratch")

        if use_cuda:
            self.model = torch.nn.DataParallel(self.model, device_ids=range(torch.cuda.device_count()))
            cudnn.benchmark = True

        # Loss function, optimizer and learning-rate schedule.
        self.criterion = nn.CrossEntropyLoss()
        self.optimizer = optim.SGD(self.model.parameters(), lr=self.args.lr, momentum=self.args.momentum)
        self.scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(self.optimizer, T_max=5, eta_min=1e-5)

        # Evaluate after every training epoch.
        for epoch in range(1, self.args.epoches + 1):
            self.train(epoch)
            self.test(epoch)

        torch.cuda.empty_cache()
        print("finish model training")

    def train(self, epoch):
        """Run one optimisation pass over the training set."""
        self.model.train()
        average_loss = []
        pbar = tqdm(self.train_dataloader,
                    desc=f'Train Epoch{epoch}/{self.args.epoches}')

        for data, target in pbar:
            data, target = data.to(self.device), target.to(self.device)
            self.optimizer.zero_grad()  # clear gradients from the previous step
            output = self.model(data)
            loss = self.criterion(output, target)
            loss.backward()
            average_loss.append(loss.item())
            self.optimizer.step()
            pbar.set_description(
                f'Train Epoch:{epoch}/{self.args.epoches} train_loss:{round(np.mean(average_loss), 4)}')
        self.scheduler.step()  # advance the cosine schedule once per epoch

    def test(self, epoch):
        """Evaluate on the held-out set and checkpoint on a new best accuracy."""
        global best_acc

        self.model.eval()
        # Plain Python counters work on CPU and GPU alike (the original
        # called .cuda() unconditionally, crashing on CPU-only machines).
        correct = 0.0
        total = 0.0
        predict_acc = 0.0  # stays 0 when the test loader yields no batches
        average_loss = []

        pbar = tqdm(self.test_dataloader,
                    desc=f'Test Epoch{epoch}/{self.args.epoches}',
                    mininterval=0.3)
        for data, target in pbar:
            data, target = data.to(self.device), target.to(self.device)
            with torch.no_grad():
                output = self.model(data)
                loss = self.criterion(output, target)  # computed once per batch
            average_loss.append(loss.item())
            pred = torch.argmax(output, 1)

            correct += (pred == target).sum().item()
            total += len(target)
            predict_acc = correct / total
            pbar.set_description(
                f'Test Epoch:{epoch}/{self.args.epoches} acc:{predict_acc:.4f}')

        if self.args.save and predict_acc > best_acc:
            best_acc = predict_acc

            # Unwrap DataParallel so the checkpoint keys match the bare
            # model and can be loaded again by the resume path above.
            model_to_save = self.model.module if isinstance(self.model, torch.nn.DataParallel) else self.model
            torch.save({
                'epoch': epoch,
                'model_state_dict': model_to_save.state_dict(),
                'optimizer_state_dict': self.optimizer.state_dict(),
                'loss': round(float(np.mean(average_loss)), 2) if average_loss else 0.0
            },
                "./weights" + f'/Epoch-{epoch}-Test_loss-{best_acc}.pth')


if __name__ == "__main__":
    # Instantiating the driver runs the whole training loop in __init__.
    train()

最后

最近看到了一篇文章–基于3D-CNN的暴力行为检测,正好还没有使用过3d卷积所以尝试了下,做了一个简单的分类,不用任何trick。
这里值得学习的是用在视频中的 3D 卷积:它的输入是 (N, C, D, H, W),比一般 2D 卷积的输入多了一个深度维度 D。在 3D 卷积和池化的过程中,D、H、W 是被作用的主要对象。其实也就这些内容,着实感觉有点水。最后来看一下分类的结果吧。
3dcnn视频分类算法-pytorch上分之路_第2张图片
这么看是可以达到这篇论文所说的效果的。另外,我用的数据集是 hockey 暴力检测数据集。一开始训练时我以为自己的代码有问题,因为从训练的变化过程可以看到,在第 1-69 个 epoch 中它的准确率没有变化,有点想不通;但 loss 是有轻微变化的。我只能给出的解释是:3D 卷积学习得非常慢。

你可能感兴趣的:(torch学习)