In the previous two articles:
- Predicting HEVC CU partitioning with a CNN (1) -- building the dataset
- Predicting HEVC CU partitioning with a CNN (2) -- splitting the dataset, sampling random frames, and restructuring the dataset for faster loading
we finished building our own dataset and preprocessing the data, so the next step is to build a neural network model and train it. The training goal is to take each 64x64 CTU as the network's input and predict its corresponding partition information as the output. In this article I will introduce several network architectures used in the literature and implement some of these ideas in PyTorch.
I have already trained and tested one network model and put the code on GitHub, where my model architecture and the validation results are described:
GitHub - wolverinn/HEVC-CU-depths-prediction-CNN: Using convolutional neural networks to predict the Coding Units (CUs) depths in HEVC intra-prediction mode, in order to reduce the time of the encoding process in HEVC.
The first architecture predicts hierarchically from the top down, which closely mirrors the HEVC encoder itself: for a 64x64 CTU, the first step is to decide whether to split it further into four 32x32 CUs; if it is split, the same split/non-split decision is made for each 32x32 CU, and so on down to the minimum 8x8 size. Mapped onto a neural network, each decision is a binary classifier: split or non-split. The first classifier takes a 64x64 image as input and outputs 0 or 1 to indicate whether it should be split; the second classifier takes a 32x32 input, and so on. Three binary classifiers are needed in total. The code for the first (64x64) classifier is below.
First, the convolutional network model:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.utils.data as data
import torch.optim as optim
from torch.autograd import Variable
from torchvision import datasets, transforms
import os
import pickle
import numpy as np
from PIL import Image
import time
import math

'''
Predict whether a 64x64 CTU needs to be split
'''
BATCH_SIZE = 256
EPOCHS = 50  # total number of training epochs
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")  # use the GPU if available
class ConvNet1(nn.Module):
    def __init__(self):
        super().__init__()
        # input: (3,64,64)
        self.conv1 = nn.Conv2d(3, 16, 5, padding=2)    # -> (16,32,32) after pooling
        self.conv2 = nn.Conv2d(16, 32, 5, padding=2)   # -> (32,16,16) after pooling
        self.conv3 = nn.Conv2d(32, 64, 3, padding=1)   # -> (64,8,8) after pooling
        self.conv4 = nn.Conv2d(64, 128, 3, padding=1)  # -> (128,4,4) after pooling
        self.fc1 = nn.Linear(128 * 4 * 4, 256)
        self.fc2 = nn.Linear(256, 64)
        self.fc3 = nn.Linear(64, 2)
        self.dropout = nn.Dropout(0.25)

    def forward(self, x):
        in_size = x.size(0)
        out = self.conv1(x)
        out = F.relu(out)
        out = F.max_pool2d(out, 2, 2)
        out = self.conv2(out)
        out = F.relu(out)
        out = F.max_pool2d(out, 2, 2)
        out = F.relu(self.conv3(out))
        out = F.max_pool2d(out, 2, 2)
        out = F.relu(self.conv4(out))
        out = F.max_pool2d(out, 2, 2)
        out = out.view(in_size, -1)  # flatten before the fully-connected layers
        out = self.fc1(out)
        out = F.relu(out)
        out = self.fc2(out)
        out = F.relu(out)
        out = self.dropout(out)
        out = self.fc3(out)
        return out
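As a quick sanity check (my own snippet, not part of the training script), you can push a random 64x64 input through the network and confirm that it produces one logit per class:

# Illustrative shape check only: one 3x64x64 image in, two logits out
dummy = torch.randn(1, 3, 64, 64)
print(ConvNet1()(dummy).shape)  # expected: torch.Size([1, 2])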
Next is the dataset-loading part. If you are wondering why the dataset is loaded and preprocessed this way, see my earlier articles (this code differs a bit from the earlier preprocessing; it is the final version of my data loading):
transform = transforms.Compose([
    transforms.ToTensor(),  # convert the image to a Tensor, scaling pixel values to [0,1]
    # transforms.Normalize(mean=[.5, .5, .5], std=[.5, .5, .5])  # standardize to [-1,1]
])
def from_ctufile(load_type, video_number, frame_number, ctu_number):
    # https://pytorch-cn.readthedocs.io/zh/latest/package_references/Tensor/
    ctu_file = "./img-dataset/pkl-{}/v_{}.pkl".format(load_type, video_number)
    f_pkl = open(ctu_file, 'rb')
    video_dict = pickle.load(f_pkl)
    f_pkl.close()
    ctu_info = video_dict[frame_number][ctu_number]  # 16x16 grid of partition depths for this CTU (flattened list)
    if 0 in ctu_info:
        label = torch.tensor(0)  # 0 means NON-SPLIT in this layer
    else:
        label = torch.tensor(1)
    return label
class ImageSet(data.Dataset):
    def __init__(self, root):
        # collect one (frame image, CTU index) pair per sample
        self.img_files = []
        self.root = root
        for img in os.listdir(root):
            ctu_numbers_per_frame = img.split('_')[3]
            for ctu_number in range(int(ctu_numbers_per_frame)):
                self.img_files.append((img, ctu_number))
        self.transforms = transform

    def __getitem__(self, index):
        img = Image.open(os.path.join(self.root, self.img_files[index][0]))
        video_number = self.img_files[index][0].split('_')[1]
        frame_number = self.img_files[index][0].split('_')[2]
        ctu_number = self.img_files[index][1]
        img_width, img_height = img.size
        ctus_per_row = math.ceil(img_width / 64)  # CTUs are numbered in raster order
        img_row = ctu_number // ctus_per_row
        img_column = ctu_number % ctus_per_row
        start_pixel_x = img_column * 64
        start_pixel_y = img_row * 64
        cropped_img = img.crop((start_pixel_x, start_pixel_y, start_pixel_x + 64, start_pixel_y + 64))  # crop the 64x64 CTU out of the extracted frame
        img.close()
        if "train" in self.root:
            load_type = "train"
        elif "validation" in self.root:
            load_type = "validation"
        else:
            load_type = "test"
        if self.transforms:
            data = self.transforms(cropped_img)
        else:
            img = np.asarray(cropped_img)
            data = torch.from_numpy(img)
        label = from_ctufile(load_type, video_number, frame_number, str(ctu_number))
        cropped_img.close()
        return data, label

    def __len__(self):
        return len(self.img_files)
train_loader = data.DataLoader(ImageSet("./img-dataset/img-train/"),batch_size=BATCH_SIZE,shuffle=True)
# test_loader = data.DataLoader(ImageSet("./img-dataset/img-test/"),batch_size=1,shuffle=True)
validation_loader = data.DataLoader(ImageSet("./img-dataset/img-validation/"),batch_size=BATCH_SIZE,shuffle=True)
Next is the training code. On the first run the weights are initialized. Another thing to note is that most 64x64 CTUs in the dataset do need to be split, i.e., perhaps 90% of the samples have label 1 (split), so it is best to add a weight to the loss function; otherwise the network can reach about 90% accuracy simply by always outputting 1:
model = ConvNet1().to(DEVICE)
try:
    model.load_state_dict(torch.load('hevc_encoder_model.pt'))
    print("loaded model from drive")
except:
    print("initializing weight...")
    for m in model.modules():
        if isinstance(m, nn.Conv2d):
            nn.init.kaiming_normal_(m.weight, mode='fan_out', nonlinearity='relu')
            if m.bias is not None:
                nn.init.constant_(m.bias, 0)
        elif isinstance(m, nn.BatchNorm2d):
            nn.init.constant_(m.weight, 1)
            nn.init.constant_(m.bias, 0)
        elif isinstance(m, nn.Linear):
            nn.init.kaiming_normal_(m.weight, mode='fan_out', nonlinearity='relu')
            if m.bias is not None:
                nn.init.constant_(m.bias, 0)
print(model)
optimizer = optim.Adam(model.parameters())
criterion = nn.CrossEntropyLoss()
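# Note (my addition, not in the original script): since roughly 90% of the 64x64
# samples have label 1 (split), a per-class weight can be passed to the loss so the
# minority class is not ignored. The exact weights below are an assumption; derive
# them from the actual class frequencies in your training set, e.g. weight[c] ~ 1/freq(c):
# class_weights = torch.tensor([0.9, 0.1]).to(DEVICE)   # [weight for class 0, weight for class 1]
# criterion = nn.CrossEntropyLoss(weight=class_weights)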
valid_loss_min = np.inf
def train(model, device, train_loader, optimizer, epoch):
    model.train()
    for batch_idx, (data, target) in enumerate(train_loader):
        data, target = data.to(device), target.to(device)
        data_v = Variable(data)
        target_v = Variable(target)
        # print(target_v)
        optimizer.zero_grad()  # reset gradients
        output = model(data_v)
        # print(output)
        loss = criterion(output, target_v)
        loss.backward()
        optimizer.step()  # update the weights
        if (batch_idx + 1) % 150 == 0:
            print('Train Epoch: {} [{}/{} ({:.0f}%)]\tLoss: {:.6f}'.format(
                epoch, batch_idx * len(data), len(train_loader.dataset),
                100. * batch_idx / len(train_loader), loss.item()))

def validation(model, device, validation_loader, epoch):
    global valid_loss_min, startTick
    model.eval()
    print("start validation...")
    validation_loss = 0
    with torch.no_grad():
        for data, target in validation_loader:
            data, target = data.to(device), target.to(device)
            output = model(data)
            validation_loss += criterion(output, target).item()  # sum up the loss over all batches
    validation_loss /= len(validation_loader.dataset)
    timeSpan = time.time() - startTick  # elapsed time (time.clock() was removed in Python 3.8)
    print('EPOCH:{} Time used:{} Validation set: Average loss: {:.4f}'.format(epoch, str(timeSpan), validation_loss))
    if validation_loss < valid_loss_min:
        valid_loss_min = validation_loss
        print("saving model ...")
        torch.save(model.state_dict(), 'hevc_encoder_model.pt')

startTick = time.time()
for epoch in range(1, EPOCHS + 1):
    train(model, DEVICE, train_loader, optimizer, epoch)
    validation(model, DEVICE, validation_loader, epoch)
For the next layer, 32x32, when loading the dataset labels we add a layer variable on top of the 64x64 indexing to locate which of the four 32x32 units is being classified. Apart from the data loading, the network structure also changes slightly, since the input size becomes 32x32 (a sketch of a possible 32x32 network follows after the data-loading code below):
def from_ctufile(load_type, video_number, frame_number, ctu_number, layer2):
    # https://pytorch-cn.readthedocs.io/zh/latest/package_references/Tensor/
    ctu_file = "./img-dataset/pkl-{}/v_{}.pkl".format(load_type, video_number)
    f_pkl = open(ctu_file, 'rb')
    video_dict = pickle.load(f_pkl)
    f_pkl.close()
    ctu_info = video_dict[frame_number][ctu_number]  # 16x16 grid of partition depths for this CTU (flattened list)
    if 0 in ctu_info:
        label = torch.tensor(0)  # 0 means NON-SPLIT in this layer
    elif ctu_info[128 * (layer2 // 2) + 8 * (layer2 % 2)] == 1:
        label = torch.tensor(0)  # depth 1: this 32x32 CU is not split further
    else:
        label = torch.tensor(1)
    return label
class ImageSet(data.Dataset):
    def __init__(self, root):
        # collect one (frame image, CTU index, 32x32 sub-unit index) triple per sample
        self.img_files = []
        self.root = root
        for img in os.listdir(root):
            ctu_numbers_per_frame = img.split('_')[3]
            for ctu_number in range(int(ctu_numbers_per_frame)):
                for layer2 in range(4):
                    self.img_files.append((img, ctu_number, layer2))
        self.transforms = transform

    def __getitem__(self, index):
        img = Image.open(os.path.join(self.root, self.img_files[index][0]))
        video_number = self.img_files[index][0].split('_')[1]
        frame_number = self.img_files[index][0].split('_')[2]
        ctu_number = self.img_files[index][1]
        layer2 = self.img_files[index][2]
        img_width, img_height = img.size
        ctus_per_row = math.ceil(img_width / 64)  # CTUs are numbered in raster order
        img_row = ctu_number // ctus_per_row
        img_column = ctu_number % ctus_per_row
        start_pixel_x = img_column * 64 + (layer2 % 2) * 32
        start_pixel_y = img_row * 64 + (layer2 // 2) * 32
        cropped_img = img.crop((start_pixel_x, start_pixel_y, start_pixel_x + 32, start_pixel_y + 32))  # crop the 32x32 CU out of the extracted frame
        img.close()
        if "train" in self.root:
            load_type = "train"
        elif "validation" in self.root:
            load_type = "validation"
        else:
            load_type = "test"
        if self.transforms:
            data = self.transforms(cropped_img)
        else:
            img = np.asarray(cropped_img)
            data = torch.from_numpy(img)
        label = from_ctufile(load_type, video_number, frame_number, str(ctu_number), layer2)
        cropped_img.close()
        return data, label

    def __len__(self):
        return len(self.img_files)
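As for the network itself, here is a minimal sketch of what the 32x32 binary classifier could look like. The name ConvNet2 and the exact layer sizes are my assumptions rather than code from the repo; it simply mirrors ConvNet1 with one convolution/pooling stage removed so the spatial dimensions work out for a 32x32 input:

class ConvNet2(nn.Module):
    def __init__(self):
        super().__init__()
        # input: (3,32,32)
        self.conv1 = nn.Conv2d(3, 16, 5, padding=2)   # -> (16,16,16) after pooling
        self.conv2 = nn.Conv2d(16, 32, 5, padding=2)  # -> (32,8,8) after pooling
        self.conv3 = nn.Conv2d(32, 64, 3, padding=1)  # -> (64,4,4) after pooling
        self.fc1 = nn.Linear(64 * 4 * 4, 128)
        self.fc2 = nn.Linear(128, 2)
        self.dropout = nn.Dropout(0.25)

    def forward(self, x):
        in_size = x.size(0)
        out = F.max_pool2d(F.relu(self.conv1(x)), 2, 2)
        out = F.max_pool2d(F.relu(self.conv2(out)), 2, 2)
        out = F.max_pool2d(F.relu(self.conv3(out)), 2, 2)
        out = out.view(in_size, -1)  # flatten before the fully-connected layers
        out = F.relu(self.fc1(out))
        out = self.dropout(out)
        out = self.fc2(out)
        return out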
For the next layer down, 16x16, the dataset is loaded the same way, except that two layer variables are needed to locate which 16x16 unit within the CTU is being classified.
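As an illustration only (my own sketch, not code from the repo), the label function for that third classifier could index the flattened 16x16 depth grid as below, where layer2 selects the 32x32 quadrant and layer3 the 16x16 unit inside it; the crop offsets get analogous extra (layer3 % 2) * 16 and (layer3 // 2) * 16 terms:

def from_ctufile(load_type, video_number, frame_number, ctu_number, layer2, layer3):
    ctu_file = "./img-dataset/pkl-{}/v_{}.pkl".format(load_type, video_number)
    with open(ctu_file, 'rb') as f_pkl:
        video_dict = pickle.load(f_pkl)
    ctu_info = video_dict[frame_number][ctu_number]  # 16x16 grid of partition depths for this CTU
    # top-left entry of the selected 16x16 unit: 8 grid cells per 32x32 quadrant,
    # 4 grid cells per 16x16 unit, 16 entries per grid row
    idx = 16 * (8 * (layer2 // 2) + 4 * (layer3 // 2)) + 8 * (layer2 % 2) + 4 * (layer3 % 2)
    if ctu_info[idx] == 3:
        return torch.tensor(1)  # depth 3: this 16x16 unit is split into 8x8
    return torch.tensor(0)      # depth 0/1/2: not split at this layer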
That is the first approach, using binary classifiers, which imitates HEVC's own layer-by-layer split decisions. The second approach found in the literature is to map the 64x64 image directly to its output matrix, i.e., have the network predict the matrix itself. Looking at the output matrix of each 64x64 CTU, the smallest repeated unit is 4x4, which means that from this 16x16 matrix we can extract a vector of length 16, each element taking a value from 0 to 3, and that vector carries all the information in the matrix.
So the simplest idea is to take the 64x64 image as input and a vector of length 64 as output; the length is 64 because the original length-16 vector becomes length 64 after one-hot encoding (four classes). This model is hard to train. Since it is a multi-label classification problem, MultiLabelSoftMarginLoss() is used as the loss function. The rest of the network code is roughly the same as the binary classifiers above; the dataset-loading part looks like this:
def one_hot_label(label_list):
    label = torch.zeros(64, dtype=torch.float)
    for i, c in enumerate(label_list):
        idx = i * 4 + int(c)
        label[idx] = 1
    return label

def from_ctufile(load_type, video_number, frame_number, ctu_number):
    ctu_file = "./img-dataset/pkl-{}/v_{}.pkl".format(load_type, video_number)
    f_pkl = open(ctu_file, 'rb')
    video_dict = pickle.load(f_pkl)
    f_pkl.close()
    ctu_info = video_dict[frame_number][ctu_number]  # 16x16 grid of partition depths for this CTU
    label_list = []
    # extract the length-16 vector (one depth value per 4x4 repeated unit)
    for i in range(4):
        for j in range(4):
            label_list.append(ctu_info[i * 64 + j * 4])
    # label = torch.LongTensor(label_list)
    label = one_hot_label(label_list)
    return label
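To connect this to the rest of the training loop, here is a brief sketch of the pieces that change compared with the binary classifiers (my own illustration, not copied from the repo): the final fully-connected layer outputs 64 values, MultiLabelSoftMarginLoss is the criterion, and at inference time each group of four logits is collapsed back into one depth value with an argmax:

criterion = nn.MultiLabelSoftMarginLoss()  # multi-label loss over the 64 outputs

def decode_output(output):
    # output: (batch, 64) logits -> (batch, 16) depth values in 0..3
    return output.view(-1, 16, 4).argmax(dim=2)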
These are two approaches that can be used to predict the CU partition information in HEVC. For the first approach, different papers mainly differ in the CNN architecture. For the second approach, besides different architectures, papers also set up the prediction differently: because directly predicting 16 labels is hard to train, some papers do not feed in the 64x64 CTU but instead split it into 16 CUs of 16x16 as inputs, so each 16x16 image corresponds to a single label and the task becomes a multi-class problem instead of a multi-label one (a rough sketch of this variant is below). Beyond that, there are many other ways to build the network model.
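As a rough illustration of that last variant (my own sketch with assumed names, not taken from any paper's code): each 16x16 CU becomes one sample whose label is simply its depth value 0-3 read from the partition grid, so an ordinary four-way classifier trained with CrossEntropyLoss replaces the multi-label setup:

def label_for_16x16(ctu_info, idx_of_unit):
    # idx_of_unit: index of the unit's top-left entry in the flattened 16x16 depth grid,
    # computed with the same layer2/layer3 arithmetic as in the layered loaders above
    return torch.tensor(int(ctu_info[idx_of_unit]))  # class 0..3 = depth of this 16x16 CU

criterion = nn.CrossEntropyLoss()  # the network's last layer now outputs 4 logits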