(13) Macroblock cost calculation: x264_slicetype_mb_cost
2018-09-11  奔向火星005
The complete source of x264_slicetype_mb_cost is shown below:
static void x264_slicetype_mb_cost( x264_t *h, x264_mb_analysis_t *a,
x264_frame_t **frames, int p0, int p1, int b,
int dist_scale_factor, int do_search[2], const x264_weight_t *w,
int *output_inter, int *output_intra )
{
x264_frame_t *fref0 = frames[p0]; // forward reference frame
x264_frame_t *fref1 = frames[p1]; // backward reference frame
x264_frame_t *fenc = frames[b]; // the frame being analyzed (whose cost is estimated)
const int b_bidir = (b < p1);
const int i_mb_x = h->mb.i_mb_x; // x coordinate of the current MB, in macroblock units
const int i_mb_y = h->mb.i_mb_y;
const int i_mb_stride = h->mb.i_mb_width; // frame width in macroblock units
const int i_mb_xy = i_mb_x + i_mb_y * i_mb_stride; // 2D MB coordinates flattened into a 1D index
const int i_stride = fenc->i_stride_lowres; // row stride of the half-resolution (lowres) plane
const int i_pel_offset = 8 * (i_mb_x + i_mb_y * i_stride); // pixel offset of this 8x8 block within the lowres plane
const int i_bipred_weight = h->param.analyse.b_weighted_bipred ? 64 - (dist_scale_factor>>2) : 32; // bi-prediction weight
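/* dist_scale_factor is roughly 256*(b-p0)/(p1-p0), so for a B-frame midway between its references it is about 128 and the weight becomes 64 - 32 = 32, i.e. an equal average (the two weights sum to 64) */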
int16_t (*fenc_mvs[2])[2] = { &fenc->lowres_mvs[0][b-p0-1][i_mb_xy], &fenc->lowres_mvs[1][p1-b-1][i_mb_xy] }; // fenc->lowres_mvs holds the forward ([0]) and backward ([1]) motion vectors
int (*fenc_costs[2]) = { &fenc->lowres_mv_costs[0][b-p0-1][i_mb_xy], &fenc->lowres_mv_costs[1][p1-b-1][i_mb_xy] }; // the corresponding motion vector costs
int b_frame_score_mb = (i_mb_x > 0 && i_mb_x < h->mb.i_mb_width - 1 &&
i_mb_y > 0 && i_mb_y < h->mb.i_mb_height - 1) ||
h->mb.i_mb_width <= 2 || h->mb.i_mb_height <= 2;
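/* edge macroblocks are excluded from the frame-level score (cf. the do_edges handling in x264_slicetype_slice_cost below), unless the frame is so small that nearly every MB is an edge MB */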
ALIGNED_ARRAY_16( pixel, pix1,[9*FDEC_STRIDE] ); //pixel pix1[9*32]
pixel *pix2 = pix1+8;
x264_me_t m[2]; // m[0] is the forward (L0) search, m[1] the backward (L1) search
int i_bcost = COST_MAX;
int list_used = 0;
/* A small, arbitrary bias to avoid VBV problems caused by zero-residual lookahead blocks. */
int lowres_penalty = 4;
h->mb.pic.p_fenc[0] = h->mb.pic.fenc_buf;
h->mc.copy[PIXEL_8x8]( h->mb.pic.p_fenc[0], FENC_STRIDE, &fenc->lowres[0][i_pel_offset], i_stride, 8 );
if( p0 == p1 ) // forward and backward references are the same frame: intra-only (I) analysis
goto lowres_intra_mb;
// the block below sets up the motion search range limits
int mv_range = 2 * h->param.analyse.i_mv_range;
// no need for h->mb.mv_min[]
h->mb.mv_min_spel[0] = X264_MAX( 4*(-8*h->mb.i_mb_x - 12), -mv_range );
h->mb.mv_max_spel[0] = X264_MIN( 4*(8*(h->mb.i_mb_width - h->mb.i_mb_x - 1) + 12), mv_range-1 );
h->mb.mv_limit_fpel[0][0] = h->mb.mv_min_spel[0] >> 2;
h->mb.mv_limit_fpel[1][0] = h->mb.mv_max_spel[0] >> 2;
if( h->mb.i_mb_x >= h->mb.i_mb_width - 2 )
{
h->mb.mv_min_spel[1] = X264_MAX( 4*(-8*h->mb.i_mb_y - 12), -mv_range );
h->mb.mv_max_spel[1] = X264_MIN( 4*(8*( h->mb.i_mb_height - h->mb.i_mb_y - 1) + 12), mv_range-1 );
h->mb.mv_limit_fpel[0][1] = h->mb.mv_min_spel[1] >> 2;
h->mb.mv_limit_fpel[1][1] = h->mb.mv_max_spel[1] >> 2;
}
// load the half-pel planes (original, horizontal, vertical, diagonal)
#define LOAD_HPELS_LUMA(dst, src) \
{ \
(dst)[0] = &(src)[0][i_pel_offset]; \
(dst)[1] = &(src)[1][i_pel_offset]; \
(dst)[2] = &(src)[2][i_pel_offset]; \
(dst)[3] = &(src)[3][i_pel_offset]; \
}
#define LOAD_WPELS_LUMA(dst,src) \
(dst) = &(src)[i_pel_offset];
#define CLIP_MV( mv ) \
{ \
mv[0] = x264_clip3( mv[0], h->mb.mv_min_spel[0], h->mb.mv_max_spel[0] ); \
mv[1] = x264_clip3( mv[1], h->mb.mv_min_spel[1], h->mb.mv_max_spel[1] ); \
}
#define TRY_BIDIR( mv0, mv1, penalty ) \
{ \
int i_cost; \
if( h->param.analyse.i_subpel_refine <= 1 ) \
{ \
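/* the half-pel bit of each mv component selects one of the 4 lowres planes (0: orig, 1: h, 2: v, 3: diag) loaded by LOAD_HPELS_LUMA */ \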
int hpel_idx1 = (((mv0)[0]&2)>>1) + ((mv0)[1]&2); \
int hpel_idx2 = (((mv1)[0]&2)>>1) + ((mv1)[1]&2); \
pixel *src1 = m[0].p_fref[hpel_idx1] + ((mv0)[0]>>2) + ((mv0)[1]>>2) * m[0].i_stride[0]; \
pixel *src2 = m[1].p_fref[hpel_idx2] + ((mv1)[0]>>2) + ((mv1)[1]>>2) * m[1].i_stride[0]; \
h->mc.avg[PIXEL_8x8]( pix1, 16, src1, m[0].i_stride[0], src2, m[1].i_stride[0], i_bipred_weight ); \
} \
else \
{ \
intptr_t stride1 = 16, stride2 = 16; \
pixel *src1, *src2; \
src1 = h->mc.get_ref( pix1, &stride1, m[0].p_fref, m[0].i_stride[0], \
(mv0)[0], (mv0)[1], 8, 8, w ); \
src2 = h->mc.get_ref( pix2, &stride2, m[1].p_fref, m[1].i_stride[0], \
(mv1)[0], (mv1)[1], 8, 8, w ); \
h->mc.avg[PIXEL_8x8]( pix1, 16, src1, stride1, src2, stride2, i_bipred_weight ); \
} \
i_cost = penalty * a->i_lambda + h->pixf.mbcmp[PIXEL_8x8]( \
m[0].p_fenc[0], FENC_STRIDE, pix1, 16 ); \
COPY2_IF_LT( i_bcost, i_cost, list_used, 3 ); \
}
m[0].i_pixel = PIXEL_8x8;
m[0].p_cost_mv = a->p_cost_mv;
m[0].i_stride[0] = i_stride;
m[0].p_fenc[0] = h->mb.pic.p_fenc[0];
m[0].weight = w;
m[0].i_ref = 0;
LOAD_HPELS_LUMA( m[0].p_fref, fref0->lowres ); // load the half-pel planes of the forward reference frame
m[0].p_fref_w = m[0].p_fref[0]; // the 'original' plane out of (orig, h, v, diag)
if( w[0].weightfn )
LOAD_WPELS_LUMA( m[0].p_fref_w, fenc->weighted[0] );
if( b_bidir ) // bidirectional prediction: only for B frames
{
ALIGNED_ARRAY_8( int16_t, dmv,[2],[2] );
m[1].i_pixel = PIXEL_8x8;
m[1].p_cost_mv = a->p_cost_mv;
m[1].i_stride[0] = i_stride;
m[1].p_fenc[0] = h->mb.pic.p_fenc[0];
m[1].i_ref = 0;
m[1].weight = x264_weight_none;
LOAD_HPELS_LUMA( m[1].p_fref, fref1->lowres ); // load the half-pel planes of the backward reference frame
m[1].p_fref_w = m[1].p_fref[0];
// if the forward MVs of the backward reference frame have been computed (0x7FFF marks "not computed")
if( fref1->lowres_mvs[0][p1-p0-1][0][0] != 0x7FFF )
{
int16_t *mvr = fref1->lowres_mvs[0][p1-p0-1][i_mb_xy];
dmv[0][0] = ( mvr[0] * dist_scale_factor + 128 ) >> 8;
dmv[0][1] = ( mvr[1] * dist_scale_factor + 128 ) >> 8;
dmv[1][0] = dmv[0][0] - mvr[0];
dmv[1][1] = dmv[0][1] - mvr[1];
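/* e.g. with p0=0, b=1, p1=2, dist_scale_factor is roughly 128, so dmv[0] is about mvr/2 (forward) and dmv[1] about -mvr/2 (backward): the usual temporal scaling of the co-located MV */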
CLIP_MV( dmv[0] );
CLIP_MV( dmv[1] );
if( h->param.analyse.i_subpel_refine <= 1 )
M64( dmv ) &= ~0x0001000100010001ULL; /* mv & ~1 */
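/* clearing bit 0 of all four int16 components rounds both MVs down to half-pel, so the precomputed lowres half-pel planes can be used directly without further interpolation */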
}
else
M64( dmv ) = 0;
// dmv[0] is the forward MV, dmv[1] the backward MV; TRY_BIDIR builds an averaged prediction from the two references and costs it against the source block
TRY_BIDIR( dmv[0], dmv[1], 0 );
if( M64( dmv ) ) // if dmv is nonzero, also cost the bidirectional average with both MVs forced to zero
{
int i_cost;
h->mc.avg[PIXEL_8x8]( pix1, 16, m[0].p_fref[0], m[0].i_stride[0], m[1].p_fref[0], m[1].i_stride[0], i_bipred_weight );
i_cost = h->pixf.mbcmp[PIXEL_8x8]( m[0].p_fenc[0], FENC_STRIDE, pix1, 16 );
COPY2_IF_LT( i_bcost, i_cost, list_used, 3 );
}
}
for( int l = 0; l < 1 + b_bidir; l++ )
{
if( do_search[l] )
{
int i_mvc = 0;
int16_t (*fenc_mv)[2] = fenc_mvs[l];
ALIGNED_4( int16_t mvc[4][2] ); // int16_t mvc[4][2]; mvc[i][0] is the x component, mvc[i][1] the y component
/* Reverse-order MV prediction. */
M32( mvc[0] ) = 0;
M32( mvc[2] ) = 0;
#define MVC(mv) { CP32( mvc[i_mvc], mv ); i_mvc++; }
if( i_mb_x < h->mb.i_mb_width - 1 ) // processing walks backwards from the last MB, so on the first MB of a row i_mb_x == h->mb.i_mb_width - 1 and there is no neighbour MV to borrow yet
MVC( fenc_mv[1] );
if( i_mb_y < h->i_threadslice_end - 1 ) // same reasoning for the row below
{
MVC( fenc_mv[i_mb_stride] );
if( i_mb_x > 0 )
MVC( fenc_mv[i_mb_stride-1] );
if( i_mb_x < h->mb.i_mb_width - 1 )
MVC( fenc_mv[i_mb_stride+1] );
}
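/* because the slice is walked from bottom-right to top-left (see x264_slicetype_slice_cost), the right, below, below-left and below-right neighbours have already been searched, so their MVs are valid predictors here */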
#undef MVC
if( i_mvc <= 1 ) // zero or one candidate available: just take mvc[0] as the predictor
CP32( m[l].mvp, mvc[0] );
else // several candidates: take the component-wise median of mvc[0], mvc[1] and mvc[2]
x264_median_mv( m[l].mvp, mvc[0], mvc[1], mvc[2] );
/* Fast skip for cases of near-zero residual. Shortcut: don't bother except in the mv0 case,
* since anything else is likely to have enough residual to not trigger the skip. */
if( !M32( m[l].mvp ) ) // if the predicted MV is zero, first cost the co-located block in the reference directly
{
m[l].cost = h->pixf.mbcmp[PIXEL_8x8]( m[l].p_fenc[0], FENC_STRIDE, m[l].p_fref[0], m[l].i_stride[0] );
if( m[l].cost < 64 )
{
M32( m[l].mv ) = 0;
goto skip_motionest;
}
}
x264_me_search( h, &m[l], mvc, i_mvc ); // run the motion search and keep the lowest-cost match
m[l].cost -= a->p_cost_mv[0]; // remove mvcost from skip mbs
if( M32( m[l].mv ) )
m[l].cost += 5 * a->i_lambda;
skip_motionest:
CP32( fenc_mvs[l], m[l].mv ); // save the MV into fenc_mvs (i.e. fenc->lowres_mvs); later MBs will use it as a predictor
*fenc_costs[l] = m[l].cost;
}
else
{
CP32( m[l].mv, fenc_mvs[l] );
m[l].cost = *fenc_costs[l];
}
COPY2_IF_LT( i_bcost, m[l].cost, list_used, l+1 );
}
if( b_bidir && ( M32( m[0].mv ) || M32( m[1].mv ) ) )
TRY_BIDIR( m[0].mv, m[1].mv, 5 );
lowres_intra_mb: // intra cost estimation, covered in detail in the previous article
if( !fenc->b_intra_calculated )
{
ALIGNED_ARRAY_16( pixel, edge,[36] );
pixel *pix = &pix1[8+FDEC_STRIDE];
pixel *src = &fenc->lowres[0][i_pel_offset];
const int intra_penalty = 5 * a->i_lambda;
int satds[3];
int pixoff = 4 / sizeof(pixel);
/* Avoid store forwarding stalls by writing larger chunks */
memcpy( pix-FDEC_STRIDE, src-i_stride, 16 * sizeof(pixel) );
for( int i = -1; i < 8; i++ )
M32( &pix[i*FDEC_STRIDE-pixoff] ) = M32( &src[i*i_stride-pixoff] );
h->pixf.intra_mbcmp_x3_8x8c( h->mb.pic.p_fenc[0], pix, satds );
int i_icost = X264_MIN3( satds[0], satds[1], satds[2] );
if( h->param.analyse.i_subpel_refine > 1 )
{
h->predict_8x8c[I_PRED_CHROMA_P]( pix );
int satd = h->pixf.mbcmp[PIXEL_8x8]( h->mb.pic.p_fenc[0], FENC_STRIDE, pix, FDEC_STRIDE );
i_icost = X264_MIN( i_icost, satd );
h->predict_8x8_filter( pix, edge, ALL_NEIGHBORS, ALL_NEIGHBORS );
for( int i = 3; i < 9; i++ )
{
h->predict_8x8[i]( pix, edge );
satd = h->pixf.mbcmp[PIXEL_8x8]( h->mb.pic.p_fenc[0], FENC_STRIDE, pix, FDEC_STRIDE );
i_icost = X264_MIN( i_icost, satd );
}
}
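/* normalise the SATD back to an 8-bit-depth scale so that lookahead costs are comparable across bit depths */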
i_icost = ((i_icost + intra_penalty) >> (BIT_DEPTH - 8)) + lowres_penalty;
fenc->i_intra_cost[i_mb_xy] = i_icost;
int i_icost_aq = i_icost;
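/* i_inv_qscale_factor is a Q8 fixed-point per-MB AQ factor (256 representing 1.0), so the line below is a rounded multiply by factor/256 */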
if( h->param.rc.i_aq_mode )
i_icost_aq = (i_icost_aq * fenc->i_inv_qscale_factor[i_mb_xy] + 128) >> 8;
output_intra[ROW_SATD] += i_icost_aq;
if( b_frame_score_mb )
{
output_intra[COST_EST] += i_icost;
output_intra[COST_EST_AQ] += i_icost_aq;
}
}
i_bcost = (i_bcost >> (BIT_DEPTH - 8)) + lowres_penalty;
/* forbid intra-mbs in B-frames, because it's rare and not worth checking */
/* FIXME: Should we still forbid them now that we cache intra scores? */
if( !b_bidir )
{
int i_icost = fenc->i_intra_cost[i_mb_xy];
int b_intra = i_icost < i_bcost;
if( b_intra )
{
i_bcost = i_icost;
list_used = 0;
}
if( b_frame_score_mb )
output_inter[INTRA_MBS] += b_intra;
}
/* In an I-frame, we've already added the results above in the intra section. */
if( p0 != p1 )
{
int i_bcost_aq = i_bcost;
if( h->param.rc.i_aq_mode )
i_bcost_aq = (i_bcost_aq * fenc->i_inv_qscale_factor[i_mb_xy] + 128) >> 8;
output_inter[ROW_SATD] += i_bcost_aq;
if( b_frame_score_mb )
{
/* Don't use AQ-weighted costs for slicetype decision, only for ratecontrol. */
output_inter[COST_EST] += i_bcost;
output_inter[COST_EST_AQ] += i_bcost_aq;
}
}
// finally, store the smallest cost into fenc->lowres_costs
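/* the cost is clamped to LOWRES_COST_MASK and the winning list is packed into the top bits: list_used is 0 for intra, 1 for L0, 2 for L1, 3 for bidir */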
fenc->lowres_costs[b-p0][p1-b][i_mb_xy] = X264_MIN( i_bcost, LOWRES_COST_MASK ) + (list_used << LOWRES_COST_SHIFT);
}
The basic flow is explained by the comments above; two points deserve a closer look:
1. How motion vectors are stored. Pulling the relevant line out of the code again:
int16_t (*fenc_mvs[2])[2] = { &fenc->lowres_mvs[0][b-p0-1][i_mb_xy], &fenc->lowres_mvs[1][p1-b-1][i_mb_xy] };
The motion vectors live in fenc->lowres_mvs, which is declared as:
typedef struct x264_frame
{
// ...omitted...
/* motion data */
int8_t *mb_type;
uint8_t *mb_partition;
int16_t (*mv[2])[2];
int16_t (*mv16x16)[2];
int16_t (*lowres_mvs[2][X264_BFRAME_MAX+1])[2]; // a [2][X264_BFRAME_MAX+1] array of pointers, each pointing to a per-MB array of int16_t[2] motion vectors
// ...omitted...
}
The lowres_mvs member is a little puzzling at first sight, so I drew a diagram to help:
(figure: 运动估计00.png, the layout of lowres_mvs)
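To make the indexing concrete, here is a minimal sketch of where one macroblock's forward and backward MVs end up. The helper names are mine and purely illustrative; only the lowres_mvs / i_mb_xy indexing comes from the function above, and the snippet assumes x264's internal headers for x264_frame_t:
/* hypothetical accessors, for illustration only */
static int16_t *lowres_fwd_mv( x264_frame_t *fenc, int p0, int b, int i_mb_xy )
{
/* list 0: MV of macroblock i_mb_xy in frame b, pointing back to frame p0 */
return fenc->lowres_mvs[0][b-p0-1][i_mb_xy];
}
static int16_t *lowres_bwd_mv( x264_frame_t *fenc, int b, int p1, int i_mb_xy )
{
/* list 1: MV of macroblock i_mb_xy in frame b, pointing forward to frame p1 */
return fenc->lowres_mvs[1][p1-b-1][i_mb_xy];
}
/* e.g. for p0=0, b=2, p1=3 the forward MVs sit in lowres_mvs[0][1] and the backward MVs in lowres_mvs[1][0] */
So the second index is simply the temporal distance minus one, which is why each direction needs X264_BFRAME_MAX+1 slots.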
2. Processing traverses the macroblocks backwards, starting from the last one, as can be seen from x264_slicetype_slice_cost:
static void x264_slicetype_slice_cost( x264_slicetype_slice_t *s )
{
x264_t *h = s->h;
/* Lowres lookahead goes backwards because the MVs are used as predictors in the main encode.
* This considerably improves MV prediction overall. */
// rough translation of the comment above: the lowres (half-resolution) lookahead runs backwards because the MVs are later used as predictors in the main encode
/* The edge mbs seem to reduce the predictive quality of the
* whole frame's score, but are needed for a spatial distribution. */
int do_edges = h->param.rc.b_mb_tree || h->param.rc.i_vbv_buffer_size || h->mb.i_mb_width <= 2 || h->mb.i_mb_height <= 2;
int start_y = X264_MIN( h->i_threadslice_end - 1, h->mb.i_mb_height - 2 + do_edges );
int end_y = X264_MAX( h->i_threadslice_start, 1 - do_edges );
int start_x = h->mb.i_mb_width - 2 + do_edges;
int end_x = 1 - do_edges;
// here too you can see that processing starts from the last macroblock
for( h->mb.i_mb_y = start_y; h->mb.i_mb_y >= end_y; h->mb.i_mb_y-- )
for( h->mb.i_mb_x = start_x; h->mb.i_mb_x >= end_x; h->mb.i_mb_x-- )
x264_slicetype_mb_cost( h, s->a, s->frames, s->p0, s->p1, s->b, s->dist_scale_factor,
s->do_search, s->w, s->output_inter, s->output_intra );
}
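As a quick sanity check of those loop bounds, here is a small standalone sketch; the macroblock grid size is made up, and it assumes a single thread-slice covering the whole frame (i_threadslice_start == 0, i_threadslice_end == i_mb_height):
#include <stdio.h>
#define X264_MIN(a,b) ((a)<(b)?(a):(b))
#define X264_MAX(a,b) ((a)>(b)?(a):(b))
int main( void )
{
int mb_width = 80, mb_height = 45; /* hypothetical lowres MB grid, e.g. a 1280x720 frame */
for( int do_edges = 0; do_edges <= 1; do_edges++ )
{
int start_y = X264_MIN( mb_height - 1, mb_height - 2 + do_edges );
int end_y = X264_MAX( 0, 1 - do_edges );
int start_x = mb_width - 2 + do_edges;
int end_x = 1 - do_edges;
printf( "do_edges=%d: x runs %d..%d, y runs %d..%d (both decreasing)\n",
do_edges, start_x, end_x, start_y, end_y );
}
return 0;
}
With do_edges == 0 the loops skip the outermost ring of macroblocks entirely, which matches the intent of the b_frame_score_mb filter inside x264_slicetype_mb_cost.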