【darknet训练细节】一个隐藏的超参数:scale
2019-06-16 本文已影响0人
yuanCruise
整体源码如下,该源码是利用yolo训练检测网络时,对输入数据作前处理的源代码,该代码段中包含了从外部传入的超参数(jitter),也有内部写死的超参数(scale)。下面将对该代码做详细的解析。
/*
 * Build one batch of n training samples for a detection network.
 *
 * Each sample is loaded at its decoded size, placed into a w x h canvas with
 * a randomly perturbed aspect ratio and a random offset, colour-distorted,
 * randomly flipped, and paired with a label row filled by
 * fill_truth_detection().
 *
 * n          number of samples to produce (rows of d.X and d.y)
 * paths, m   handed to get_random_paths(); presumably m is the number of
 *            entries in paths -- confirm against that helper
 * w, h       size of the canvas every sample is placed into
 * boxes      label slots per sample; each slot takes 5 floats in d.y
 * classes    forwarded unchanged to fill_truth_detection()
 * jitter     fraction of the original width/height that bounds the random
 *            aspect-ratio perturbation
 * hue, saturation, exposure
 *            forwarded unchanged to random_distort_image()
 *
 * Returns a data struct whose X.vals[i] is the pixel buffer of sample i and
 * whose y.vals[i] is the matching label row.
 */
data load_data_detection(int n, char **paths, int m, int w, int h, int boxes, int classes, float jitter, float hue, float saturation, float exposure)
{
/* NOTE(review): presumably picks n paths at random out of the m candidates -- confirm. */
char **random_paths = get_random_paths(paths, n, m);
int i;
data d = {0};
d.shallow = 0;
d.X.rows = n;
/* Only the n row pointers are allocated; each row is assigned inside the loop. */
d.X.vals = calloc(d.X.rows, sizeof(float*));
/* Row length is recorded as 3 channels; the canvas below uses orig.c channels
   (load_image_color() requests 3 -- presumably these always agree). */
d.X.cols = h*w*3;
/* Label matrix: n rows, 5 floats for each of the `boxes` slots. */
d.y = make_matrix(n, 5*boxes);
for(i = 0; i < n; ++i){
/* w = h = 0 is passed, so load_image() skips its resize step. */
image orig = load_image_color(random_paths[i], 0, 0);
/* Canvas of the requested size, every value preset to 0.5. */
image sized = make_image(w, h, orig.c);
fill_image(sized, .5);
/* Bounds of the random perturbation, as a fraction of the original size. */
float dw = jitter * orig.w;
float dh = jitter * orig.h;
/* Aspect ratio after perturbing width and height independently
   (rand_uniform presumably draws uniformly between its two arguments). */
float new_ar = (orig.w + rand_uniform(-dw, dw)) / (orig.h + rand_uniform(-dh, dh));
/* Random scaling is disabled; scale is hard-coded to 1. */
//float scale = rand_uniform(.25, 2);
float scale = 1;
float nw, nh;
/* The longer side of the perturbed image is set to scale times the canvas
   side; the other side follows from the new aspect ratio. */
if(new_ar < 1){
nh = scale * h;
nw = nh * new_ar;
} else {
nw = scale * w;
nh = nw / new_ar;
}
/* Random offset of the placed image inside the canvas. */
float dx = rand_uniform(0, w - nw);
float dy = rand_uniform(0, h - nh);
/* place_image() declares w, h, dx, dy as int, so these floats are truncated
   at the call; the label transform below uses the untruncated values. */
place_image(orig, nw, nh, dx, dy, sized);
random_distort_image(sized, hue, saturation, exposure);
/* Flip with probability 1/2; the flag is also passed to the label filler. */
int flip = rand()%2;
if(flip) flip_image(sized);
/* The batch takes over the canvas buffer; sized is not freed here. */
d.X.vals[i] = sized.data;
/* Offset and size are passed as fractions of the canvas size. */
fill_truth_detection(random_paths[i], boxes, d.y.vals[i], classes, flip, -dx/w, -dy/h, nw/w, nh/h);
free_image(orig);
}
/* Only the pointer array is released; the path strings are not freed here. */
free(random_paths);
return d;
}
1.新建data数据
data包含了图片数据及其对应的标签数据。下述代码初始化了一个data数据结构,并根据所需大小分配了内存空间。
/* Zero-initialise the batch, then size the input matrix X by hand. */
data d = {0};
d.shallow = 0;
d.X.rows = n;
/* Only the n row pointers are allocated here, not the rows themselves. */
d.X.vals = calloc(d.X.rows, sizeof(float*));
d.X.cols = h*w*3;
/* Label matrix: n rows, 5 floats for each of the `boxes` slots. */
d.y = make_matrix(n, 5*boxes);
上述代码段中用到的make_matrix函数用来分配指定大小(rows×cols)的矩阵内存空间。本例中n是图片数量(而不是通道数):矩阵的每一行对应一张图片的标签,每张图片最多有boxes个box,而每个box包含x、y、w、h、id五个值,所以列数要乘以5。
/*
 * Allocate a rows x cols matrix.
 *
 * The row-pointer array and every row come from calloc(), so all values
 * start at zero. Allocation results are not checked.
 */
matrix make_matrix(int rows, int cols)
{
    matrix result;
    int r = 0;

    result.rows = rows;
    result.cols = cols;
    result.vals = calloc(result.rows, sizeof(float *));

    /* One zeroed buffer of `cols` floats per row. */
    while (r < result.rows) {
        result.vals[r] = calloc(result.cols, sizeof(float));
        ++r;
    }
    return result;
}
2.导入图片数据
如下代码将图片读入,并为网络输入图像分配内存空间。
/* Load the image; w = h = 0 is passed, so load_image() skips its resize step. */
image orig = load_image_color(random_paths[i], 0, 0);
/* Canvas of the requested w x h size with the same channel count as orig. */
image sized = make_image(w, h, orig.c);
/* Preset every canvas value to 0.5. */
fill_image(sized, .5);
上述代码中先用load_image_color将图片导入。从如下源码可以看到,load_image只有在w和h都非0且与原图尺寸不一致时才会resize;这里传入的w、h均为0,所以导入时并不会resize,orig保持原图尺寸(真正的缩放在后面的place_image中完成)。随后用make_image分配h*w*c个float的空间,再用fill_image把image中的每个值初始化为0.5。
/* Load `filename` through load_image(), always requesting 3 channels. */
image load_image_color(char *filename, int w, int h)
{
    const int color_channels = 3;
    image loaded = load_image(filename, w, h, color_channels);

    return loaded;
}
-------------------------------------------------------------
/*
 * Decode `filename` (forwarding `c` to the decoder) and, when both w and h
 * are non-zero and differ from the decoded size, resize the result to w x h.
 * Passing 0 for w or h returns the image at its decoded size.
 */
image load_image(char *filename, int w, int h, int c)
{
    image loaded;

    /* The decoder is selected at build time. */
#ifdef OPENCV
    loaded = load_image_cv(filename, c);
#else
    loaded = load_image_stb(filename, c);
#endif

    /* No target size requested: return the decoded image untouched. */
    if (!h || !w) {
        return loaded;
    }
    /* Already at the requested size: nothing to do. */
    if (h == loaded.h && w == loaded.w) {
        return loaded;
    }

    /* Replace the decoded image with a resized copy and release the original. */
    image scaled = resize_image(loaded, w, h);
    free_image(loaded);
    return scaled;
}
------------------------------------------------------------
/*
 * Create an image via make_empty_image() and attach a zeroed pixel buffer
 * of h*w*c floats. The allocation result is not checked.
 */
image make_image(int w, int h, int c)
{
    image img = make_empty_image(w, h, c);
    int value_count = h*w*c;

    img.data = calloc(value_count, sizeof(float));
    return img;
}
------------------------------------------------------------
/* Set every value of the image buffer (h*w*c floats) to s. */
void fill_image(image m, float s)
{
    int total = m.h*m.w*m.c;
    int idx = 0;

    while (idx < total) {
        m.data[idx] = s;
        ++idx;
    }
}
3.设置超参数
这一步设置若干超参数,用于后续的图片处理:一部分由外部cfg传入(如jitter),另一部分在代码内部写死(如scale)。
/* Bounds of the random perturbation, as a fraction of the original size. */
float dw = jitter * orig.w;
float dh = jitter * orig.h;
/* Aspect ratio after perturbing width and height independently
   (rand_uniform presumably draws uniformly between its two arguments). */
float new_ar = (orig.w + rand_uniform(-dw, dw)) / (orig.h + rand_uniform(-dh, dh));
/* Random scaling is disabled; scale is hard-coded to 1. */
//float scale = rand_uniform(.25, 2);
float scale = 1;
float nw, nh;
/* The longer side of the perturbed image is set to scale times the canvas
   side; the other side follows from the new aspect ratio. */
if(new_ar < 1){
nh = scale * h;
nw = nh * new_ar;
} else {
nw = scale * w;
nh = nw / new_ar;
}
/* Random offset of the placed image inside the canvas. */
float dx = rand_uniform(0, w - nw);
float dy = rand_uniform(0, h - nh);
/* place_image() declares w, h, dx, dy as int, so these floats are truncated
   at the call. */
place_image(orig, nw, nh, dx, dy, sized);
random_distort_image(sized, hue, saturation, exposure);
如上述代码所示,jitter是一种对宽高的随机抖动。scale是一种内部写死的表示对输入图片的宽高缩放(通过源码我们发现是对随机抖动后,较长的边进行缩放的)。其中place_image函数,实现了不论你上面做了多少尺度的scale的缩放,最终还是会放到sized大小的范围内来。
/*
 * Resample `im` to w x h with bilinear interpolation and write the result
 * into `canvas` at offset (dx, dy). Destination pixels that fall outside
 * the canvas are dropped by set_pixel().
 */
void place_image(image im, int w, int h, int dx, int dy, image canvas)
{
    for (int chan = 0; chan < im.c; ++chan) {
        for (int row = 0; row < h; ++row) {
            /* The source row depends only on the destination row. */
            float src_y = ((float)row / h) * im.h;

            for (int col = 0; col < w; ++col) {
                float src_x = ((float)col / w) * im.w;
                float sample = bilinear_interpolate(im, src_x, src_y, chan);

                set_pixel(canvas, col + dx, row + dy, chan, sample);
            }
        }
    }
}
----------------------------------------------------------------------------
/*
 * Sample channel c of `im` at the fractional position (x, y) by blending
 * the four surrounding pixels, each fetched through get_pixel_extend().
 */
static float bilinear_interpolate(image im, float x, float y, int c)
{
    /* Integer corner below (x, y) and the fractional distance from it. */
    int x0 = (int) floorf(x);
    int y0 = (int) floorf(y);
    float fx = x - x0;
    float fy = y - y0;

    /* Blend weights of the four neighbours. */
    float w00 = (1-fy) * (1-fx);
    float w01 = fy * (1-fx);
    float w10 = (1-fy) * fx;
    float w11 = fy * fx;

    return w00 * get_pixel_extend(im, x0, y0, c) +
           w01 * get_pixel_extend(im, x0, y0+1, c) +
           w10 * get_pixel_extend(im, x0+1, y0, c) +
           w11 * get_pixel_extend(im, x0+1, y0+1, c);
}
----------------------------------------------------------------------------
/*
 * Store val at (x, y) of channel c. Coordinates or channels outside the
 * image are ignored silently. The buffer is indexed channel by channel,
 * then row by row.
 */
static void set_pixel(image m, int x, int y, int c, float val)
{
    int inside = x >= 0 && y >= 0 && c >= 0 &&
                 x < m.w && y < m.h && c < m.c;

    if (!inside) {
        return;
    }
    /* Cannot fire after the range check above; kept from the original. */
    assert(x < m.w && y < m.h && c < m.c);

    int plane = c*m.h*m.w;
    m.data[plane + y*m.w + x] = val;
}
仔细剖析place_image函数:它遍历缩放后目标区域(w×h,每个通道)的每个像素,用双线性插值从原图im中采样得到像素值,再通过set_pixel写入canvas中偏移(dx,dy)的位置。如果缩放后的图比canvas小,未被覆盖的周边区域保持fill_image填入的0.5(基本就是灰的),而不是由双线性插值填补;如果缩放后的图超出了canvas的范围,则超出的部分直接丢弃(通过set_pixel中的条件判断可以得出该结论)。
4.备注
以下为data、box、matrix、image等数据结构的定义。
/* A batch of samples together with their labels. */
typedef struct{
int w, h; /* NOTE(review): not assigned by the loader shown in this article */
matrix X; /* inputs: load_data_detection() stores one image buffer per row */
matrix y; /* labels: one row per sample (5 floats per box slot in that loader) */
int shallow; /* set to 0 by load_data_detection(); presumably marks borrowed rows -- confirm */
int *num_boxes; /* NOTE(review): not used in the code shown here */
box **boxes; /* NOTE(review): not used in the code shown here */
} data;
/* A bounding box. */
typedef struct{
float x, y, w, h;// x, y: centre point; w, h: width and height (all expressed as ratios)
} box;
/* A matrix of floats stored as an array of row pointers. */
typedef struct matrix{
int rows, cols; // number of rows and columns of the matrix
float **vals; // data stored by the matrix, as a two-dimensional array
} matrix;
/* An image held as a flat buffer of floats. */
typedef struct {
int h; /* height */
int w; /* width */
int c; /* number of channels */
float *data; /* h*w*c values; set_pixel() indexes it as c*h*w + y*w + x */
} image;