cfg/yolov3-voc-giou.cfg
......
[yolo]
mask = 6,7,8
anchors = 10,13, 16,30, 33,23, 30,61, 62,45, 59,119, 116,90, 156,198, 373,326
classes=20
num=9
jitter=.3
ignore_thresh = .5
truth_thresh = 1
random=1
iou_normalizer=0.5
cls_normalizer=1.0
iou_loss=giou
......
[yolo]
mask = 3,4,5
anchors = 10,13, 16,30, 33,23, 30,61, 62,45, 59,119, 116,90, 156,198, 373,326
classes=20
num=9
jitter=.3
ignore_thresh = .5
truth_thresh = 1
random=1
iou_normalizer=0.5
cls_normalizer=1.0
iou_loss=giou
......
[yolo]
mask = 0,1,2
anchors = 10,13, 16,30, 33,23, 30,61, 62,45, 59,119, 116,90, 156,198, 373,326
classes=20
num=9
jitter=.3
ignore_thresh = .5
truth_thresh = 1
random=1
iou_normalizer=0.5
cls_normalizer=1.0
iou_loss=giou
option_list.h
#ifdef ENABLE_GIOU
/* String option lookup: yields the result of option_find(l, key), or `def`
 * when that lookup returns NULL. */
char *option_find_str_quiet(list *l, char *key, char *def);
#endif //ENABLE_GIOU
option_list.c
#ifdef ENABLE_GIOU
/* Look up `key` in option list `l`; when option_find() returns NULL, hand
 * back the caller-supplied default `def` instead. */
char *option_find_str_quiet(list *l, char *key, char *def)
{
    char *found = option_find(l, key);
    return found ? found : def;
}
#endif //ENABLE_GIOU
parser.c
/*
 * Build a YOLO layer from the key/value options of one [yolo] cfg section.
 *
 * options: option list of the section being parsed.
 * params:  batch size and input dimensions handed to make_yolo_layer().
 *
 * Reads "classes", "num", "mask", "max", "jitter", "ignore_thresh",
 * "truth_thresh", "random", "map" and "anchors"; when ENABLE_GIOU is defined
 * it additionally reads "iou_normalizer", "cls_normalizer" and "iou_loss"
 * ("mse", "giou", anything else selects IOU).
 *
 * Returns the configured layer by value.
 */
layer parse_yolo(list *options, size_params params)
{
    int classes = option_find_int(options, "classes", 20);
    int total = option_find_int(options, "num", 1);
    int num = total;

    char *a = option_find_str(options, "mask", 0);
    int *mask = parse_yolo_mask(a, &num);
    layer l = make_yolo_layer(params.batch, params.w, params.h, num, total, mask, classes);
    assert(l.outputs == params.inputs);

#ifdef ENABLE_GIOU
    l.iou_normalizer = option_find_float_quiet(options, "iou_normalizer", 0.75);
    printf("iou_normalizer is %f\n", l.iou_normalizer);
    l.cls_normalizer = option_find_float_quiet(options, "cls_normalizer", 1);
    printf("cls_normalizer is %f\n", l.cls_normalizer);

    char *iou_loss = option_find_str_quiet(options, "iou_loss", "iou");
    printf("loss param is %s\n", iou_loss);
    if (strcmp(iou_loss, "mse") == 0) {
        printf("loss is set to MSE\n");
        l.iou_loss = MSE;
    } else if (strcmp(iou_loss, "giou") == 0) {
        printf("loss is set to GIOU\n");
        l.iou_loss = GIOU;
    } else {
        // any unrecognised value falls back to plain IoU loss
        printf("loss is set to IOU\n");
        l.iou_loss = IOU;
    }
    fprintf(stderr, "Yolo layer params: iou loss: %s, iou_normalizer: %f, cls_normalizer: %f\n", (l.iou_loss==MSE?"mse":(l.iou_loss==GIOU?"giou":"iou")), l.iou_normalizer, l.cls_normalizer);
#endif //ENABLE_GIOU

    l.max_boxes = option_find_int_quiet(options, "max",90);
    l.jitter = option_find_float(options, "jitter", .2);
    l.ignore_thresh = option_find_float(options, "ignore_thresh", .5);
    l.truth_thresh = option_find_float(options, "truth_thresh", 1);
    l.random = option_find_int_quiet(options, "random", 0);

    char *map_file = option_find_str(options, "map", 0);
    if (map_file) l.map = read_map(map_file);

    a = option_find_str(options, "anchors", 0);
    if (a) {
        int len = strlen(a);
        int n = 1;
        int i;
        // number of comma-separated values = number of commas + 1
        for (i = 0; i < len; ++i) {
            if (a[i] == ',') ++n;
        }
        for (i = 0; i < n; ++i) {
            float bias = atof(a);
            l.biases[i] = bias;
            // Advance past the next comma. After the last value there is no
            // comma left; stop there instead of computing NULL + 1, which is
            // undefined behaviour.
            char *comma = strchr(a, ',');
            if (!comma) break;
            a = comma + 1;
        }
    }
    return l;
}
darknet.h
#ifndef ENABLE_GIOU
#define ENABLE_GIOU
#endif

#ifdef ENABLE_GIOU
/* Box-regression loss selector for a YOLO layer. */
typedef enum {
    IOU,
    GIOU,
    MSE
} IOU_LOSS;

/* A box described by its four edge coordinates. */
typedef struct boxabs {
    float left;
    float right;
    float top;
    float bot;
} boxabs;

/* Per-edge derivative values (top, bottom, left, right). */
typedef struct dxrep {
    float dt;
    float db;
    float dl;
    float dr;
} dxrep;

/* IoU and GIoU values for a box pair together with per-edge derivatives. */
typedef struct ious {
    float iou;
    float giou;
    dxrep dx_iou;
    dxrep dx_giou;
} ious;
#endif //ENABLE_GIOU
/* Excerpt of the layer struct: only the fields added for the GIoU patch are
 * shown; the "......" lines stand for the members elided from this listing. */
struct layer {
......
#ifdef ENABLE_GIOU
/* weight multiplied into the box-coordinate deltas when an IoU/GIoU loss is used */
float iou_normalizer;
/* weight multiplied into the objectness deltas and the classification loss */
float cls_normalizer;
/* which box-regression loss this layer uses (IOU, GIOU or MSE) */
IOU_LOSS iou_loss;
#endif
......
}
box.h
#ifdef ENABLE_GIOU
/* IoU of two boxes (implementation not shown in this excerpt; presumably
 * intersection area over union area). */
float box_iou(box a, box b);
/* Derivatives of IoU -- plus the enclosing-box term when iou_loss == GIOU --
 * with respect to the top/bottom/left/right edges of the first box. */
dxrep dx_box_iou(box a, box b, IOU_LOSS iou_loss);
/* Generalized IoU: IoU minus (C - U) / C, where C is the area of the smallest
 * box enclosing both inputs and U is their union area. */
float box_giou(box a, box b);
/* Converts a centre/size box (x, y, w, h) into top/bottom/left/right edges. */
boxabs to_tblr(box a);
#endif
box.c
#ifdef ENABLE_GIOU
// Edges of the smallest axis-aligned box that fully encompasses both a and b.
boxabs box_c(box a, box b) {
    boxabs hull = { 0 };
    hull.left  = fmin(a.x - a.w / 2, b.x - b.w / 2);
    hull.right = fmax(a.x + a.w / 2, b.x + b.w / 2);
    hull.top   = fmin(a.y - a.h / 2, b.y - b.h / 2);
    hull.bot   = fmax(a.y + a.h / 2, b.y + b.h / 2);
    return hull;
}
// Convert a box from centre/size form (x, y, w, h) to its four edges
// (top, left, bottom, right).
boxabs to_tblr(box a) {
    boxabs edges = { 0 };
    edges.top   = a.y - (a.h / 2);
    edges.bot   = a.y + (a.h / 2);
    edges.left  = a.x - (a.w / 2);
    edges.right = a.x + (a.w / 2);
    return edges;
}
// Generalized IoU of a and b: IoU - (C - U) / C, with C the area of the
// enclosing box from box_c() and U the union area. Falls back to plain IoU
// when the enclosing box has zero area.
float box_giou(box a, box b)
{
    boxabs hull = box_c(a, b);
    float hull_w = hull.right - hull.left;
    float hull_h = hull.bot - hull.top;
    float hull_area = hull_w * hull_h;

    float iou = box_iou(a, b);
    if (hull_area == 0) {
        // degenerate enclosing box: the penalty term is undefined
        return iou;
    }

    float union_area = box_union(a, b);
    float penalty = (hull_area - union_area) / hull_area;
#ifdef DEBUG_PRINTS
    printf(" c: %f, u: %f, giou_term: %f\n", hull_area, union_area, penalty);
#endif
    return iou - penalty;
}
/*
 * Derivative of IoU (or of GIoU when iou_loss == GIOU) between `pred` and
 * `truth`, taken with respect to the four edges of `pred`.
 *
 * pred:     predicted box in centre/size form.
 * truth:    ground-truth box in centre/size form.
 * iou_loss: GIOU adds the enclosing-box term d(U/C); any other value yields
 *           the plain IoU derivative.
 *
 * Returns dt/db/dl/dr, i.e. d(metric)/d(top|bottom|left|right edge of pred).
 * Because the loss is 1 - metric, these are the negated loss gradients.
 *
 * The intersection is clamped to zero when the boxes do not overlap on both
 * axes, so that the gradient matches the clamped intersection used for the
 * reported IoU/GIoU value. Without the clamp, Iw * Ih is negative for boxes
 * disjoint on one axis and spuriously positive for boxes disjoint on both.
 */
dxrep dx_box_iou(box pred, box truth, IOU_LOSS iou_loss) {
    boxabs pred_tblr = to_tblr(pred);
    // order the predicted edges so that t <= b and l <= r even when the
    // predicted width/height is negative
    float pred_t = fmin(pred_tblr.top, pred_tblr.bot);
    float pred_b = fmax(pred_tblr.top, pred_tblr.bot);
    float pred_l = fmin(pred_tblr.left, pred_tblr.right);
    float pred_r = fmax(pred_tblr.left, pred_tblr.right);
    boxabs truth_tblr = to_tblr(truth);
#ifdef DEBUG_PRINTS
    printf("\niou: %f, giou: %f\n", box_iou(pred, truth), box_giou(pred, truth));
    printf("pred: x,y,w,h: (%f, %f, %f, %f) -> t,b,l,r: (%f, %f, %f, %f)\n", pred.x, pred.y, pred.w, pred.h, pred_tblr.top, pred_tblr.bot, pred_tblr.left, pred_tblr.right);
    printf("truth: x,y,w,h: (%f, %f, %f, %f) -> t,b,l,r: (%f, %f, %f, %f)\n", truth.x, truth.y, truth.w, truth.h, truth_tblr.top, truth_tblr.bot, truth_tblr.left, truth_tblr.right);
#endif
    dxrep dx = { 0 };

    // areas of the prediction (X) and the ground truth (Xhat)
    float X = (pred_b - pred_t) * (pred_r - pred_l);
    float Xhat = (truth_tblr.bot - truth_tblr.top) * (truth_tblr.right - truth_tblr.left);

    // signed overlap extents; a non-positive value means no overlap on that axis
    float Ih = fmin(pred_b, truth_tblr.bot) - fmax(pred_t, truth_tblr.top);
    float Iw = fmin(pred_r, truth_tblr.right) - fmax(pred_l, truth_tblr.left);
    int overlaps = (Iw > 0) && (Ih > 0);
    float I = overlaps ? Iw * Ih : 0;
    float U = X + Xhat - I;

    // smallest enclosing box (C)
    float Cw = fmax(pred_r, truth_tblr.right) - fmin(pred_l, truth_tblr.left);
    float Ch = fmax(pred_b, truth_tblr.bot) - fmin(pred_t, truth_tblr.top);
    float C = Cw * Ch;

    // derivative of the prediction area with respect to each edge
    float dX_wrt_t = -1 * (pred_r - pred_l);
    float dX_wrt_b = pred_r - pred_l;
    float dX_wrt_l = -1 * (pred_b - pred_t);
    float dX_wrt_r = pred_b - pred_t;

    // derivative of I: an edge only moves the intersection when the boxes
    // overlap and that edge is the one selected by the min/max above
    float dI_wrt_t = (overlaps && pred_t > truth_tblr.top) ? (-1 * Iw) : 0;
    float dI_wrt_b = (overlaps && pred_b < truth_tblr.bot) ? Iw : 0;
    float dI_wrt_l = (overlaps && pred_l > truth_tblr.left) ? (-1 * Ih) : 0;
    float dI_wrt_r = (overlaps && pred_r < truth_tblr.right) ? Ih : 0;

    // derivative of U = X + Xhat - I
    float dU_wrt_t = dX_wrt_t - dI_wrt_t;
    float dU_wrt_b = dX_wrt_b - dI_wrt_b;
    float dU_wrt_l = dX_wrt_l - dI_wrt_l;
    float dU_wrt_r = dX_wrt_r - dI_wrt_r;

    // derivative of C: an edge only moves the enclosing box when it is the
    // outermost one on its side
    float dC_wrt_t = pred_t < truth_tblr.top ? (-1 * Cw) : 0;
    float dC_wrt_b = pred_b > truth_tblr.bot ? Cw : 0;
    float dC_wrt_l = pred_l < truth_tblr.left ? (-1 * Ch) : 0;
    float dC_wrt_r = pred_r > truth_tblr.right ? Ch : 0;

    // d(I/U) by the quotient rule; left at zero when the union is empty
    float p_dt = 0;
    float p_db = 0;
    float p_dl = 0;
    float p_dr = 0;
    if (U > 0) {
        p_dt = ((U * dI_wrt_t) - (I * dU_wrt_t)) / (U * U);
        p_db = ((U * dI_wrt_b) - (I * dU_wrt_b)) / (U * U);
        p_dl = ((U * dI_wrt_l) - (I * dU_wrt_l)) / (U * U);
        p_dr = ((U * dI_wrt_r) - (I * dU_wrt_r)) / (U * U);
    }
    if (iou_loss == GIOU && C > 0) {
        // GIoU = IoU - (C - U)/C = IoU - 1 + U/C, so add d(U/C)
        p_dt += ((C * dU_wrt_t) - (U * dC_wrt_t)) / (C * C);
        p_db += ((C * dU_wrt_b) - (U * dC_wrt_b)) / (C * C);
        p_dl += ((C * dU_wrt_l) - (U * dC_wrt_l)) / (C * C);
        p_dr += ((C * dU_wrt_r) - (U * dC_wrt_r)) / (C * C);
    }

    // undo the edge ordering above: if the raw prediction had its edges
    // swapped, route each derivative back to the edge it actually belongs to
    dx.dt = pred_tblr.top < pred_tblr.bot ? p_dt : p_db;
    dx.db = pred_tblr.top < pred_tblr.bot ? p_db : p_dt;
    dx.dl = pred_tblr.left < pred_tblr.right ? p_dl : p_dr;
    dx.dr = pred_tblr.left < pred_tblr.right ? p_dr : p_dl;
    return dx;
}
#endif //ENABLE_GIOU
yolo_layer.c
#ifdef ENABLE_GIOU
/**
 * Writes the box-coordinate deltas for one predicted box and reports how well
 * it matches the ground truth.
 *
 * truth:   ground truth bounding box
 * x:       l.output
 * biases:  anchors
 * n:       index of the anchor (within l.total anchors) used for this box
 * index:   box_index, offset of the box's first coordinate in x / delta
 * i, j:    cell column (step in layer width) and row (step in layer height)
 * lw, lh:  layer width and height (l.w, l.h)
 * w, h:    network width and height (net.w, net.h)
 * delta:   output array receiving the four coordinate deltas
 * scale:   (2 - truth.w*truth.h); only applied in the MSE branch
 * stride:  l.w * l.h, distance between consecutive coordinates of one box
 * iou_normalizer: weight applied to the deltas in the IoU/GIoU branch
 * iou_loss: MSE selects the original squared-error deltas, anything else the
 *           IoU/GIoU gradient from dx_box_iou()
 *
 * Returns the IoU and GIoU of prediction vs. truth; dx_iou is filled only in
 * the IoU/GIoU branch and dx_giou is always left zeroed.
 */
ious delta_yolo_box(box truth, float *x, float *biases, int n, int index, int i, int j, int lw, int lh, int w, int h, float *delta, float scale, int stride, float iou_normalizer, IOU_LOSS iou_loss)
{
    ious result = { 0 };
    int k;

    // decoded prediction for this cell/anchor
    box pred = get_yolo_box(x, biases, n, index, i, j, lw, lh, w, h, stride);
    result.iou = box_iou(pred, truth);
    result.giou = box_giou(pred, truth);

    // avoid nan in dx_box_iou
    if (pred.w == 0) { pred.w = 1.0; }
    if (pred.h == 0) { pred.h = 1.0; }

    // the four coordinate slots of this box inside delta
    float *d_x = &delta[index + 0 * stride];
    float *d_y = &delta[index + 1 * stride];
    float *d_w = &delta[index + 2 * stride];
    float *d_h = &delta[index + 3 * stride];

    if (iou_loss == MSE) {
        // old loss: scaled difference between encoded truth and raw output
        float tx = (truth.x*lw - i);
        float ty = (truth.y*lh - j);
        float tw = log(truth.w*w / biases[2 * n]);
        float th = log(truth.h*h / biases[2 * n + 1]);
        *d_x = scale * (tx - x[index + 0 * stride]);
        *d_y = scale * (ty - x[index + 1 * stride]);
        *d_w = scale * (tw - x[index + 2 * stride]);
        *d_h = scale * (th - x[index + 3 * stride]);
        return result;
    }

    // https://github.com/generalized-iou/g-darknet
    // https://arxiv.org/abs/1902.09630v2
    // https://giou.stanford.edu/
    result.dx_iou = dx_box_iou(pred, truth, iou_loss);
    const dxrep g = result.dx_iou;

    // jacobian^t (transpose): map edge derivatives onto x, y, w, h
    *d_x = (g.dl + g.dr);
    *d_y = (g.dt + g.db);
    *d_w = ((-0.5 * g.dl) + (0.5 * g.dr));
    *d_h = ((-0.5 * g.dt) + (0.5 * g.db));

    // w and h are predicted through an exponential, so apply the gradient of
    // e^t to those two components ONLY
    *d_w *= exp(x[index + 2 * stride]);
    *d_h *= exp(x[index + 3 * stride]);

    // normalize iou weight
    for (k = 0; k < 4; ++k) {
        delta[index + k * stride] *= iou_normalizer;
    }
    return result;
}
#else
/* Squared-error variant used when ENABLE_GIOU is not defined: writes
 * scale * (target - output) for the four box coordinates into delta and
 * returns the IoU between the decoded prediction and the ground truth. */
float delta_yolo_box(box truth, float *x, float *biases, int n, int index, int i, int j, int lw, int lh, int w, int h, float *delta, float scale, int stride)
{
    int k;
    float target[4];

    box pred = get_yolo_box(x, biases, n, index, i, j, lw, lh, w, h, stride);
    float iou = box_iou(pred, truth);

    // ground truth encoded the same way as the raw layer outputs
    target[0] = (truth.x*lw - i);
    target[1] = (truth.y*lh - j);
    target[2] = log(truth.w*w / biases[2*n]);
    target[3] = log(truth.h*h / biases[2*n + 1]);

    for (k = 0; k < 4; ++k) {
        delta[index + k*stride] = scale * (target[k] - x[index + k*stride]);
    }
    return iou;
}
#endif //ENABLE_GIOU
/*
 * Forward pass of a YOLO layer.
 *
 * Copies net.input into l.output, applies the logistic activation to the
 * x/y and objectness/class entries (CPU build only), clears l.delta and, when
 * training, fills l.delta and *(l.cost):
 *
 *  1. For every predicted box, find the ground-truth box with the highest
 *     IoU. The objectness delta pushes towards 0 unless that IoU exceeds
 *     l.ignore_thresh (delta cleared); above l.truth_thresh the box is
 *     treated as a positive and class/box deltas are written too.
 *  2. For every ground-truth box, pick the anchor whose shape matches best;
 *     if this layer owns that anchor (l.mask), write box, objectness and
 *     class deltas for the cell containing the truth centre.
 *  3. Compute the cost and print per-layer statistics.
 *
 * Exits the process if the scratch buffer for the classification loss cannot
 * be allocated (ENABLE_GIOU build).
 */
void forward_yolo_layer(const layer l, network net)
{
    int i,j,b,t,n;
    memcpy(l.output, net.input, l.outputs*l.batch*sizeof(float));

#ifndef GPU
    for (b = 0; b < l.batch; ++b){
        for(n = 0; n < l.n; ++n){
            // logistic on x, y
            int index = entry_index(l, b, n*l.w*l.h, 0);
            activate_array(l.output + index, 2*l.w*l.h, LOGISTIC);
            // logistic on objectness and class scores
            index = entry_index(l, b, n*l.w*l.h, 4);
            activate_array(l.output + index, (1+l.classes)*l.w*l.h, LOGISTIC);
        }
    }
#endif

    memset(l.delta, 0, l.outputs * l.batch * sizeof(float));
    if(!net.train) return;

#ifdef ENABLE_GIOU
    float tot_iou = 0;
    float tot_giou = 0;
    float tot_iou_loss = 0;
    float tot_giou_loss = 0;
#else
    float avg_iou = 0;
#endif
    float recall = 0;
    float recall75 = 0;
    float avg_cat = 0;
    float avg_obj = 0;
    float avg_anyobj = 0;
    int count = 0;
    int class_count = 0;
    *(l.cost) = 0;

    for (b = 0; b < l.batch; ++b) {
        // pass 1: every predicted box against every ground-truth box
        for (j = 0; j < l.h; ++j) {
            for (i = 0; i < l.w; ++i) {
                for (n = 0; n < l.n; ++n) {
                    int box_index = entry_index(l, b, n*l.w*l.h + j*l.w + i, 0);
                    box pred = get_yolo_box(l.output, l.biases, l.mask[n], box_index, i, j, l.w, l.h, net.w, net.h, l.w*l.h);
                    float best_iou = 0;
                    int best_t = 0;
                    for(t = 0; t < l.max_boxes; ++t){
                        box truth = float_to_box(net.truth + t*(4 + 1) + b*l.truths, 1);
                        // a zero x marks the end of the truth list
                        if(!truth.x) break;
                        float iou = box_iou(pred, truth);
                        if (iou > best_iou) {
                            best_iou = iou;
                            best_t = t;
                        }
                    }
                    int obj_index = entry_index(l, b, n*l.w*l.h + j*l.w + i, 4);
                    avg_anyobj += l.output[obj_index];
                    // default: push objectness towards 0
#ifdef ENABLE_GIOU
                    l.delta[obj_index] = l.cls_normalizer * (0 - l.output[obj_index]);
#else
                    l.delta[obj_index] = 0 - l.output[obj_index];
#endif
                    // good enough overlap: do not penalise objectness
                    if (best_iou > l.ignore_thresh) {
                        l.delta[obj_index] = 0;
                    }
                    // overlap above truth_thresh: treat as a positive sample
                    if (best_iou > l.truth_thresh) {
#ifdef ENABLE_GIOU
                        l.delta[obj_index] = l.cls_normalizer * (1 - l.output[obj_index]);
#else
                        l.delta[obj_index] = 1 - l.output[obj_index];
#endif
                        int class = net.truth[best_t*(4 + 1) + b*l.truths + 4];
                        if (l.map) class = l.map[class];
                        int class_index = entry_index(l, b, n*l.w*l.h + j*l.w + i, 4 + 1);
                        delta_yolo_class(l.output, l.delta, class_index, class, l.classes, l.w*l.h, 0);
                        box truth = float_to_box(net.truth + best_t*(4 + 1) + b*l.truths, 1);
#ifdef ENABLE_GIOU
                        delta_yolo_box(truth, l.output, l.biases, l.mask[n], box_index, i, j, l.w, l.h, net.w, net.h, l.delta, (2-truth.w*truth.h), l.w*l.h, l.iou_normalizer, l.iou_loss);
#else
                        delta_yolo_box(truth, l.output, l.biases, l.mask[n], box_index, i, j, l.w, l.h, net.w, net.h, l.delta, (2-truth.w*truth.h), l.w*l.h);
#endif
                    }
                }
            }
        }

        // pass 2: every ground-truth box against the anchors
        for(t = 0; t < l.max_boxes; ++t){
            box truth = float_to_box(net.truth + t*(4 + 1) + b*l.truths, 1);
            if(!truth.x) break;
            float best_iou = 0;
            int best_n = 0;
            // cell containing the truth centre
            i = (truth.x * l.w);
            j = (truth.y * l.h);
            // compare shapes only: move the truth box to the origin
            box truth_shift = truth;
            truth_shift.x = truth_shift.y = 0;
            // for each anchor
            for(n = 0; n < l.total; ++n){
                box pred = {0};
                pred.w = l.biases[2*n]/net.w;
                pred.h = l.biases[2*n+1]/net.h;
                float iou = box_iou(pred, truth_shift);
                if (iou > best_iou){
                    best_iou = iou;
                    best_n = n;
                }
            }

            // only act when the best anchor belongs to this layer's mask
            int mask_n = int_index(l.mask, best_n, l.n);
            if(mask_n >= 0){
                int box_index = entry_index(l, b, mask_n*l.w*l.h + j*l.w + i, 0);
#ifdef ENABLE_GIOU
                ious all_ious = delta_yolo_box(truth, l.output, l.biases, best_n, box_index, i, j, l.w, l.h, net.w, net.h, l.delta, (2-truth.w*truth.h), l.w*l.h, l.iou_normalizer, l.iou_loss);
                // range is 0 <= 1
                tot_iou += all_ious.iou;
                tot_iou_loss += 1 - all_ious.iou;
                // range is -1 <= giou <= 1
                tot_giou += all_ious.giou;
                tot_giou_loss += 1 - all_ious.giou;
#else
                float iou = delta_yolo_box(truth, l.output, l.biases, best_n, box_index, i, j, l.w, l.h, net.w, net.h, l.delta, (2-truth.w*truth.h), l.w*l.h);
#endif

                int obj_index = entry_index(l, b, mask_n*l.w*l.h + j*l.w + i, 4);
                avg_obj += l.output[obj_index];
#ifdef ENABLE_GIOU
                l.delta[obj_index] = l.cls_normalizer * (1 - l.output[obj_index]);
#else
                l.delta[obj_index] = 1 - l.output[obj_index];
#endif

                int class = net.truth[t*(4 + 1) + b*l.truths + 4];
                if (l.map) class = l.map[class];
                int class_index = entry_index(l, b, mask_n*l.w*l.h + j*l.w + i, 4 + 1);
                delta_yolo_class(l.output, l.delta, class_index, class, l.classes, l.w*l.h, &avg_cat);

                ++count;
                ++class_count;
#ifdef ENABLE_GIOU
                if (all_ious.iou > .5) recall += 1;
                if (all_ious.iou > .75) recall75 += 1;
#else
                if(iou > .5) recall += 1;
                if(iou > .75) recall75 += 1;
                avg_iou += iou;
#endif
            }
        }
    }

#ifdef ENABLE_GIOU
    // Always compute classification loss both for iou + cls loss and for logging with mse loss
    // TODO: remove IOU loss fields before computing MSE on class
    // probably split into two arrays
    int stride = l.w*l.h;
    float* no_iou_loss_delta = calloc(l.batch * l.outputs, sizeof(float));
    if (!no_iou_loss_delta) {
        // memcpy below would otherwise write through a NULL pointer
        fprintf(stderr, "forward_yolo_layer: out of memory\n");
        exit(EXIT_FAILURE);
    }
    memcpy(no_iou_loss_delta, l.delta, l.batch * l.outputs * sizeof(float));
    // blank out the four box-coordinate deltas so that only objectness and
    // class deltas contribute to the classification loss
    for (b = 0; b < l.batch; ++b) {
        for (j = 0; j < l.h; ++j) {
            for (i = 0; i < l.w; ++i) {
                for (n = 0; n < l.n; ++n) {
                    int index = entry_index(l, b, n*l.w*l.h + j*l.w + i, 0);
                    no_iou_loss_delta[index + 0 * stride] = 0;
                    no_iou_loss_delta[index + 1 * stride] = 0;
                    no_iou_loss_delta[index + 2 * stride] = 0;
                    no_iou_loss_delta[index + 3 * stride] = 0;
                }
            }
        }
    }
    float classification_loss = l.cls_normalizer * pow(mag_array(no_iou_loss_delta, l.outputs * l.batch), 2);
    free(no_iou_loss_delta);

    float avg_iou_loss = 0;
    // gIOU loss + MSE (objectness) loss
    if (l.iou_loss == MSE) {
        *(l.cost) = pow(mag_array(l.delta, l.outputs * l.batch), 2);
    }
    else {
        if (l.iou_loss == GIOU) {
            avg_iou_loss = count > 0 ? l.iou_normalizer * (tot_giou_loss / count) : 0;
        }
        else {
            avg_iou_loss = count > 0 ? l.iou_normalizer * (tot_iou_loss / count) : 0;
        }
        *(l.cost) = avg_iou_loss + classification_loss;
    }
    printf("v3 (%s loss, Normalizer: (iou: %f, cls: %f) Region %d Avg (IOU: %f, GIOU: %f), Class: %f, Obj: %f, No Obj: %f, .5R: %f, .75R: %f, count: %d\n", (l.iou_loss == MSE ? "mse" : (l.iou_loss == GIOU ? "giou" : "iou")), l.iou_normalizer, l.cls_normalizer, net.index, tot_iou / count, tot_giou / count, avg_cat / class_count, avg_obj / count, avg_anyobj / (l.w*l.h*l.n*l.batch), recall / count, recall75 / count, count);
#else
    *(l.cost) = pow(mag_array(l.delta, l.outputs * l.batch), 2);
    printf("Region %d Avg IOU: %f, Class: %f, Obj: %f, No Obj: %f, .5R: %f, .75R: %f, count: %d\n", net.index, avg_iou/count, avg_cat/class_count, avg_obj/count, avg_anyobj/(l.w*l.h*l.n*l.batch), recall/count, recall75/count, count);
#endif //ENABLE_GIOU
}
./darknet detector train cfg/voc.data cfg/yolov3-voc-giou.cfg darknet53.conv.74
python newscripts/voc_all_map.py --data_file cfg/voc.data --cfg_file cfg/yolov3-voc-giou-test.cfg --weights_folder backup_weights/