源码地址:https://github.com/fzliu/style-transfer
scipy.optimize.minimize(fun, x0, args=(), method=None, jac=None, hess=None, hessp=None, bounds=None, constraints=(), tol=None, callback=None, options=None)
fun
目标函数,也即要最小化的函数。
x0
初始猜测值。
args
可将额外参数传给fun
。
method
论文使用的是L-BFGS-B
。
options
为字典形,其中maxiter
表示最大迭代次数。
在L-BFGS-B
的方法前提下(其它方法可选参数不一样):
jac
表示fun
除了要返回loss
之外,还要另外返回变量的向量梯度grad
。
bounds
表示各个变量的范围,形式为列表中的元组。
举一个例子如下:
s.t. $-5 < x_1 < 5,\quad 12 < x_2 < 23$
如下图所示:greeting
是额外参数,因为jac
为True
, 所以除了loss
还返回了梯度向量2*x
。
其中 $\vec{p}$、$\vec{a}$、$\vec{x}$ 分别代表content照、artwork照、生成照。$\alpha,\beta$ 表示对两种loss的权重。
Lstyle 强调纹理、颜色的损失,以Gram矩阵来表达。
Lcontent 强调直接损失,以特征图直接表达。
其它具体细节请参照论文。
两个“##”指示的程序运行顺序,这里只标示出默认情况下的运行顺序。
可以先找到 “## 1”开始读,注意有个空格
argparse模块请参考:http://blog.csdn.net/stepleave/article/details/51737211
# system imports
import argparse
import logging
import os
import sys
import timeit
# library imports
import caffe
import numpy as np
import progressbar as pb
from scipy.fftpack import ifftn
from scipy.linalg.blas import sgemm
from scipy.misc import imsave
from scipy.optimize import minimize
from skimage import img_as_ubyte
from skimage.transform import rescale
# logging
# timestamped log-line format used by logging.basicConfig in main()
LOG_FORMAT = "%(filename)s:%(funcName)s:%(asctime)s.%(msecs)03d -- %(message)s"

# numeric constants
INF = np.float32(np.inf)
# the style image is rescaled by this extra factor relative to the content image
STYLE_SCALE = 1.2

# weights for the individual models
# assume that corresponding layers' top blob matches its name
VGG19_WEIGHTS = {"content": {"conv4_2": 1},
                 "style": {"conv1_1": 0.2,
                           "conv2_1": 0.2,
                           "conv3_1": 0.2,
                           "conv4_1": 0.2,
                           "conv5_1": 0.2}}
VGG16_WEIGHTS = {"content": {"conv4_2": 1},
                 "style": {"conv1_1": 0.2,
                           "conv2_1": 0.2,
                           "conv3_1": 0.2,
                           "conv4_1": 0.2,
                           "conv5_1": 0.2}}
GOOGLENET_WEIGHTS = {"content": {"conv2/3x3": 2e-4,
                                 "inception_3a/output": 1-2e-4},
                     "style": {"conv1/7x7_s2": 0.2,
                               "conv2/3x3": 0.2,
                               "inception_3a/output": 0.2,
                               "inception_4a/output": 0.2,
                               "inception_5a/output": 0.2}}
CAFFENET_WEIGHTS = {"content": {"conv4": 1},
                    "style": {"conv1": 0.2,
                              "conv2": 0.2,
                              "conv3": 0.2,
                              "conv4": 0.2,
                              "conv5": 0.2}}

# argparse
# command-line interface; only -s and -c are required
parser = argparse.ArgumentParser(description="Transfer the style of one image to another.",
                                 usage="style.py -s -c " )
parser.add_argument("-s", "--style-img", type=str, required=True, help="input style (art) image")
parser.add_argument("-c", "--content-img", type=str, required=True, help="input content image")
parser.add_argument("-g", "--gpu-id", default=0, type=int, required=False, help="GPU device number")
parser.add_argument("-m", "--model", default="vgg16", type=str, required=False, help="model to use")
parser.add_argument("-i", "--init", default="content", type=str, required=False, help="initialization strategy")
parser.add_argument("-r", "--ratio", default="1e4", type=str, required=False, help="style-to-content ratio")
parser.add_argument("-n", "--num-iters", default=512, type=int, required=False, help="L-BFGS iterations")
parser.add_argument("-l", "--length", default=512, type=float, required=False, help="maximum image length")
parser.add_argument("-v", "--verbose", action="store_true", required=False, help="print minimization outputs")
parser.add_argument("-o", "--output", default=None, required=False, help="output path")
## 5.1.6.2
def _compute_style_grad(F, G, G_style, layer):
"""
Computes style gradient and loss from activation features.
"""
# compute loss and gradient
(Fl, Gl) = (F[layer], G[layer])
c = Fl.shape[0]**-2 * Fl.shape[1]**-2
El = Gl - G_style[layer]
loss = c/4 * (El**2).sum()
grad = c * sgemm(1.0, El, Fl) * (Fl>0)
return loss, grad
## 5.1.6.3
def _compute_content_grad(F, F_content, layer):
"""
Computes content gradient and loss from activation features.
"""
# compute loss and gradient
Fl = F[layer]
El = Fl - F_content[layer]
loss = (El**2).sum() / 2
grad = El * (Fl>0)
return loss, grad
##5.1.2 计算content特征和style特征,并返回
def _compute_reprs(net_in, net, layers_style, layers_content, gram_scale=1):
"""
Computes representation matrices for an image.
"""
# input data and forward pass
(repr_s, repr_c) = ({}, {})
net.blobs["data"].data[0] = net_in
net.forward()
# loop through combined set of layers
for layer in set(layers_style)|set(layers_content):
F = net.blobs[layer].data[0].copy()
F.shape = (F.shape[0], -1)
repr_c[layer] = F
if layer in layers_style:
repr_s[layer] = sgemm(gram_scale, F, F.T)
return repr_s, repr_c
## 5.1.6 objective function: returns the loss and the gradient vector
def style_optfn(x, net, weights, layers, reprs, ratio):
    """
    Style transfer optimization callback for scipy.optimize.minimize().

    :param numpy.ndarray x:
        Flattened data array (current guess for the generated image).
    :param caffe.Net net:
        Network to use to generate gradients.
    :param dict weights:
        Per-layer loss weights, {"style": {...}, "content": {...}}.
    :param list layers:
        Layers to use in the network, ordered from input to output.
    :param tuple reprs:
        (style Gram matrices, content features) target representations.
    :param float ratio:
        Style-to-content ratio.
    :returns: (loss, flattened float64 gradient), as required by jac=True.
    """
    # unpack input arguments
    layers_style = weights["style"].keys()
    layers_content = weights["content"].keys()
    net_in = x.reshape(net.blobs["data"].data.shape[1:])
    (G_style, F_content) = reprs
    ## 5.1.6.1 compute representations of the current guess: content
    ## features on every layer of interest, Gram matrices on style layers
    (G, F) = _compute_reprs(net_in, net, layers_style, layers_content)

    # backpropagate layer by layer, accumulating loss and gradient;
    # start by zeroing the diff of the deepest used layer
    loss = 0
    net.blobs[layers[-1]].diff[:] = 0
    # BUG FIX: the original line was missing the trailing ":" (SyntaxError)
    for i, layer in enumerate(reversed(layers)):
        # next (shallower) layer to propagate into; None once we reach data
        next_layer = None if i == len(layers) - 1 else layers[-i - 2]
        grad = net.blobs[layer].diff[0]

        # style contribution to the loss
        if layer in layers_style:
            wl = weights["style"][layer]
            ## 5.1.6.2 style loss and gradient for this layer
            (l, g) = _compute_style_grad(F, G, G_style, layer)
            loss += wl * l * ratio   # ratio balances style vs. content
            grad += wl * g.reshape(grad.shape) * ratio

        # content contribution to the loss
        if layer in layers_content:
            wl = weights["content"][layer]
            ## 5.1.6.3 content loss and gradient for this layer
            (l, g) = _compute_content_grad(F, F_content, layer)
            loss += wl * l
            grad += wl * g.reshape(grad.shape)

        # propagate the gradient one step toward the input
        net.backward(start=layer, end=next_layer)
        if next_layer is None:
            grad = net.blobs["data"].diff[0]
        else:
            grad = net.blobs[next_layer].diff[0]

    # L-BFGS-B expects a flat float64 gradient vector
    grad = grad.flatten().astype(np.float64)
    return loss, grad
class StyleTransfer(object):
    """
    Style transfer driver (Gatys et al.) built on a caffe network.

    ## 4.1 the constructor initializes:
        self.net         -- caffe network
        self.transformer -- caffe input transformer
        self.weights     -- per-layer loss weights
        self.layers      -- layers participating in the losses
        self.callback    -- per-iteration callback for minimize()
        self.use_pbar    -- whether to display a progress bar
    """

    def __init__(self, model_name, use_pbar=True):
        """
        Initialize the model used for style transfer.

        :param str model_name:
            Model to use ("vgg19", "vgg16", "googlenet", or "caffenet").
        :param bool use_pbar:
            Use progressbar flag.
        """
        style_path = os.path.abspath(os.path.split(__file__)[0])
        base_path = os.path.join(style_path, "models", model_name)

        # vgg19
        if model_name == "vgg19":
            model_file = os.path.join(base_path,
                                      "VGG_ILSVRC_19_layers_deploy.prototxt")
            pretrained_file = os.path.join(base_path,
                                           "VGG_ILSVRC_19_layers.caffemodel")
            mean_file = os.path.join(base_path, "ilsvrc_2012_mean.npy")
            weights = VGG19_WEIGHTS
        # vgg16
        elif model_name == "vgg16":
            model_file = os.path.join(base_path,
                                      "VGG_ILSVRC_16_layers_deploy.prototxt")
            pretrained_file = os.path.join(base_path,
                                           "VGG_ILSVRC_16_layers.caffemodel")
            mean_file = os.path.join(base_path, "ilsvrc_2012_mean.npy")
            weights = VGG16_WEIGHTS
        # googlenet
        elif model_name == "googlenet":
            model_file = os.path.join(base_path, "deploy.prototxt")
            pretrained_file = os.path.join(base_path, "bvlc_googlenet.caffemodel")
            mean_file = os.path.join(base_path, "ilsvrc_2012_mean.npy")
            weights = GOOGLENET_WEIGHTS
        # caffenet
        elif model_name == "caffenet":
            model_file = os.path.join(base_path, "deploy.prototxt")
            pretrained_file = os.path.join(base_path,
                                           "bvlc_reference_caffenet.caffemodel")
            mean_file = os.path.join(base_path, "ilsvrc_2012_mean.npy")
            weights = CAFFENET_WEIGHTS
        else:
            assert False, "model not available"

        # add model and weights
        ## 4.1.1 load_model initializes self.net and self.transformer
        self.load_model(model_file, pretrained_file, mean_file)
        self.weights = weights.copy()
        # keep only the layers that participate in either loss
        self.layers = []
        for layer in self.net.blobs:
            if layer in self.weights["style"] or layer in self.weights["content"]:
                self.layers.append(layer)
        self.use_pbar = use_pbar

        # callback invoked by minimize() after each iteration;
        # updates the progress bar and forwards the current image if requested
        if self.use_pbar:
            def callback(xk):
                self.grad_iter += 1
                try:
                    self.pbar.update(self.grad_iter)
                except Exception:  # narrowed from bare "except:"; e.g. update past maxval
                    self.pbar.finished = True
                if self._callback is not None:
                    net_in = xk.reshape(self.net.blobs["data"].data.shape[1:])
                    self._callback(self.transformer.deprocess("data", net_in))
        else:
            def callback(xk):
                if self._callback is not None:
                    net_in = xk.reshape(self.net.blobs["data"].data.shape[1:])
                    self._callback(self.transformer.deprocess("data", net_in))
        self.callback = callback

    ## 4.1.1
    def load_model(self, model_file, pretrained_file, mean_file):
        """
        Loads specified model from caffe install (see caffe docs).

        :param str model_file:
            Path to model protobuf.
        :param str pretrained_file:
            Path to pretrained caffe model.
        :param str mean_file:
            Path to mean file.
        """
        # load net (suppressing stderr output by redirecting fd 2 to /dev/null)
        null_fds = os.open(os.devnull, os.O_RDWR)
        out_orig = os.dup(2)
        os.dup2(null_fds, 2)
        net = caffe.Net(model_file, pretrained_file, caffe.TEST)
        os.dup2(out_orig, 2)
        os.close(null_fds)

        # all models used are trained on imagenet data
        transformer = caffe.io.Transformer({"data": net.blobs["data"].data.shape})
        transformer.set_mean("data", np.load(mean_file).mean(1).mean(1))
        transformer.set_channel_swap("data", (2, 1, 0))
        transformer.set_transpose("data", (2, 0, 1))
        transformer.set_raw_scale("data", 255)

        # add net parameters
        self.net = net
        self.transformer = transformer

    ## 5.2 fetch the generated image; it lives in the 'data' blob because
    ## the optimizer in 5.1.6 style_optfn kept refining the net input
    def get_generated(self):
        """
        Returns the generated image (net input, after optimization).
        """
        data = self.net.blobs["data"].data
        img_out = self.transformer.deprocess("data", data)
        return img_out

    ## 5.1.1 resize the network input to fit img
    def _rescale_net(self, img):
        """
        Rescales the network to fit a particular image.
        """
        # get new dimensions and rescale net + transformer
        new_dims = (1, img.shape[2]) + img.shape[:2]
        self.net.blobs["data"].reshape(*new_dims)
        self.transformer.inputs["data"] = new_dims

    def _make_noise_input(self, init):
        """
        Creates an initial input (generated) image from pink noise.

        :param init: exponent (as a string/number castable to int) applied
            to the frequency magnitude when shaping the noise spectrum.
        """
        # specify dimensions and create grid in Fourier domain
        dims = tuple(self.net.blobs["data"].data.shape[2:]) + \
               (self.net.blobs["data"].data.shape[1], )
        grid = np.mgrid[0:dims[0], 0:dims[1]]

        # create frequency representation for pink noise
        Sf = (grid[0] - (dims[0] - 1) / 2.0) ** 2 + \
             (grid[1] - (dims[1] - 1) / 2.0) ** 2
        Sf[np.where(Sf == 0)] = 1
        Sf = np.sqrt(Sf)
        Sf = np.dstack((Sf ** int(init),) * dims[2])

        # apply ifft to create pink noise and normalize to [0, 1]
        ifft_kernel = np.cos(2 * np.pi * np.random.randn(*dims)) + \
                      1j * np.sin(2 * np.pi * np.random.randn(*dims))
        img_noise = np.abs(ifftn(Sf * ifft_kernel))
        img_noise -= img_noise.min()
        img_noise /= img_noise.max()

        # preprocess the pink noise image
        x0 = self.transformer.preprocess("data", img_noise)
        return x0

    def _create_pbar(self, max_iter):
        """
        Creates a progress bar with max_iter total steps.
        """
        self.grad_iter = 0
        self.pbar = pb.ProgressBar()
        self.pbar.widgets = ["Optimizing: ", pb.Percentage(),
                             " ", pb.Bar(marker=pb.AnimatedMarker()),
                             " ", pb.ETA()]
        self.pbar.maxval = max_iter

    ## 5.1 the key entry point: perform the style transfer
    def transfer_style(self, img_style, img_content, length=512, ratio=1e5,
                       n_iter=512, init="-1", verbose=False, callback=None):
        """
        Transfers the style of the artwork to the input image.

        :param numpy.ndarray img_style:
            A style image with the desired target style.
        :param numpy.ndarray img_content:
            A content image in floating point, RGB format.
        :param function callback:
            A callback function, which takes images at iterations.
        :returns: number of L-BFGS iterations performed (minimize().nit).
        """
        # assume the net input is square; use its smaller spatial dim
        orig_dim = min(self.net.blobs["data"].shape[2:])

        # rescale both images so the longest side is `length` while the
        # shortest side still covers the net's input dimension
        scale = max(length / float(max(img_style.shape[:2])),
                    orig_dim / float(min(img_style.shape[:2])))
        img_style = rescale(img_style, STYLE_SCALE * scale)
        scale = max(length / float(max(img_content.shape[:2])),
                    orig_dim / float(min(img_content.shape[:2])))
        img_content = rescale(img_content, scale)

        # compute style representations
        ## 5.1.1 resize the net input to fit img_style
        self._rescale_net(img_style)
        layers = self.weights["style"].keys()
        net_in = self.transformer.preprocess("data", img_style)
        gram_scale = float(img_content.size) / img_style.size  # NOTE: unused by the author
        ## 5.1.2 compute and keep img_style's Gram matrices
        G_style = _compute_reprs(net_in, self.net, layers, [],
                                 gram_scale=1)[0]

        # compute content representations
        ## 5.1.3 resize the net input to fit img_content; the net keeps
        ## these dimensions for the rest of the optimization
        self._rescale_net(img_content)
        layers = self.weights["content"].keys()
        net_in = self.transformer.preprocess("data", img_content)
        ## 5.1.4 compute and keep img_content's content features
        F_content = _compute_reprs(net_in, self.net, [], layers)[1]

        # generate the initial net input img0 (default init = "content")
        if isinstance(init, np.ndarray):
            img0 = self.transformer.preprocess("data", init)
        elif init == "content":
            img0 = self.transformer.preprocess("data", img_content)
        elif init == "mixed":
            img0 = 0.95 * self.transformer.preprocess("data", img_content) + \
                   0.05 * self.transformer.preprocess("data", img_style)
        else:
            img0 = self._make_noise_input(init)  # random pink noise; not used by default

        # compute per-pixel bounds for L-BFGS-B
        data_min = -self.transformer.mean["data"][:, 0, 0]
        data_max = data_min + self.transformer.raw_scale["data"]
        # BUG FIX: use floor division -- under Python 3, "/" yields a float
        # and list * float raises TypeError
        data_bounds = [(data_min[0], data_max[0])] * (img0.size // 3) + \
                      [(data_min[1], data_max[1])] * (img0.size // 3) + \
                      [(data_min[2], data_max[2])] * (img0.size // 3)

        # optimization parameters
        grad_method = "L-BFGS-B"
        reprs = (G_style, F_content)
        minfn_args = {
            "args": (self.net, self.weights, self.layers, reprs, ratio),
            "method": grad_method, "jac": True, "bounds": data_bounds,
            "options": {"maxcor": 8, "maxiter": n_iter, "disp": verbose}
        }

        # optimize
        self._callback = callback              # user-supplied callback (may be None)
        minfn_args["callback"] = self.callback # per-iteration wrapper built in __init__
        if self.use_pbar and not verbose:      # default path
            ## 5.1.5 create and run the progress bar
            self._create_pbar(n_iter)
            self.pbar.start()
            ## 5.1.6 the crucial objective: style_optfn; only the iteration
            ## count (nit) of the minimize() result is used
            res = minimize(style_optfn, img0.flatten(), **minfn_args).nit
            self.pbar.finish()
        else:
            res = minimize(style_optfn, img0.flatten(), **minfn_args).nit
        return res
def main(args):
    """
    Entry point: run style transfer according to the parsed CLI arguments.

    :param argparse.Namespace args: Parsed command-line options.
    """
    ## 1. logging
    level = logging.INFO if args.verbose else logging.DEBUG
    logging.basicConfig(format=LOG_FORMAT, datefmt="%H:%M:%S", level=level)
    logging.info("Starting style transfer.")

    ## 2. select GPU/CPU mode (-1 means CPU)
    if args.gpu_id == -1:
        caffe.set_mode_cpu()
        logging.info("Running net on CPU.")
    else:
        caffe.set_device(args.gpu_id)
        caffe.set_mode_gpu()
        logging.info("Running net on GPU {0}.".format(args.gpu_id))

    ## 3. load images in RGB format, values in (0, 1)
    img_style = caffe.io.load_image(args.style_img)
    img_content = caffe.io.load_image(args.content_img)
    logging.info("Successfully loaded images.")

    ## 4. build the StyleTransfer object (progress bar only when not verbose)
    use_pbar = not args.verbose
    ## 4.1 enter the StyleTransfer class
    st = StyleTransfer(args.model.lower(), use_pbar=use_pbar)
    logging.info("Successfully loaded model {0}.".format(args.model))

    ## 5. perform the style transfer
    start = timeit.default_timer()
    ## 5.1 the key call; np.float was removed in NumPy 1.24, so the
    ## builtin float (identical behavior here) is used instead
    n_iters = st.transfer_style(img_style, img_content, length=args.length,
                                init=args.init, ratio=float(args.ratio),
                                n_iter=args.num_iters, verbose=args.verbose)
    end = timeit.default_timer()
    logging.info("Ran {0} iterations in {1:.0f}s.".format(n_iters, end-start))
    ## 5.2 fetch the generated image
    img_out = st.get_generated()

    ## 6. resolve the output path
    if args.output is not None:
        out_path = args.output
    else:
        out_path_fmt = (os.path.splitext(os.path.split(args.content_img)[1])[0],
                        os.path.splitext(os.path.split(args.style_img)[1])[0],
                        args.model, args.init, args.ratio, args.num_iters)
        out_path = "outputs/{0}-{1}-{2}-{3}-{4}-{5}.jpg".format(*out_path_fmt)

    ## 7. save the image
    # NOTE(review): scipy.misc.imsave was removed in SciPy 1.2; consider
    # imageio.imwrite if upgrading SciPy
    imsave(out_path, img_as_ubyte(img_out))
    logging.info("Output saved to {0}.".format(out_path))
if __name__ == "__main__":
args = parser.parse_args()
main(args)