[FFmpeg-devel,2/3] lavc/vaapi_encode_h264: respect "slices" option in h264 vaapi encoder

Submitted by Jun Zhao on July 30, 2018, 11:42 a.m.

Details

Message ID 1532950960-7640-3-git-send-email-mypopydev@gmail.com
State New
Headers show

Commit Message

Jun Zhao July 30, 2018, 11:42 a.m.
Enable multi-slice support in AVC/H.264 vaapi encoder.

Signed-off-by: Wang, Yi A <yi.a.wang@intel.com>
Signed-off-by: Jun Zhao <jun.zhao@intel.com>
---
 libavcodec/vaapi_encode_h264.c |   31 +++++++++++++++++++++++++------
 1 files changed, 25 insertions(+), 6 deletions(-)

Comments

Mark Thompson Aug. 21, 2018, midnight
On 30/07/18 12:42, Jun Zhao wrote:
> Enable multi-slice support in AVC/H.264 vaapi encoder.
> 
> Signed-off-by: Wang, Yi A <yi.a.wang@intel.com>
> Signed-off-by: Jun Zhao <jun.zhao@intel.com>
> ---
>  libavcodec/vaapi_encode_h264.c |   31 +++++++++++++++++++++++++------
>  1 files changed, 25 insertions(+), 6 deletions(-)
> 
> diff --git a/libavcodec/vaapi_encode_h264.c b/libavcodec/vaapi_encode_h264.c
> index 905c507..70c89e8 100644
> --- a/libavcodec/vaapi_encode_h264.c
> +++ b/libavcodec/vaapi_encode_h264.c
> @@ -581,6 +581,7 @@ static int vaapi_encode_h264_init_picture_params(AVCodecContext *avctx,
>      H264RawSPS                       *sps = &priv->sps;
>      VAEncPictureParameterBufferH264 *vpic = pic->codec_picture_params;
>      int i;
> +    int slices;
>  
>      memset(&priv->current_access_unit, 0,
>             sizeof(priv->current_access_unit));
> @@ -690,7 +691,17 @@ static int vaapi_encode_h264_init_picture_params(AVCodecContext *avctx,
>      vpic->pic_fields.bits.idr_pic_flag       = (pic->type == PICTURE_TYPE_IDR);
>      vpic->pic_fields.bits.reference_pic_flag = (pic->type != PICTURE_TYPE_B);
>  
> -    pic->nb_slices = 1;
> +    slices = 1;
> +    if (ctx->max_slices) {
> +        if (avctx->slices <= FFMIN(ctx->max_slices, priv->mb_height)) {
> +            slices = FFMAX(avctx->slices, slices);
> +        } else {
> +            av_log(avctx, AV_LOG_ERROR, "The max slices number per frame "
> +                   "cannot be more than %d.\n", FFMIN(ctx->max_slices, priv->mb_height));
> +            return AVERROR_INVALIDDATA;

AVERROR(EINVAL) for invalid user parameters.

> +        }
> +    }
> +    pic->nb_slices = slices;
>  
>      return 0;
>  }
> @@ -716,8 +727,7 @@ static int vaapi_encode_h264_init_slice_params(AVCodecContext *avctx,
>          sh->nal_unit_header.nal_ref_idc   = pic->type != PICTURE_TYPE_B;
>      }
>  
> -    // Only one slice per frame.
> -    sh->first_mb_in_slice = 0;
> +    sh->first_mb_in_slice = !!slice->index;
>      sh->slice_type        = priv->slice_type;
>  
>      sh->pic_parameter_set_id = pps->pic_parameter_set_id;
> @@ -738,14 +748,19 @@ static int vaapi_encode_h264_init_slice_params(AVCodecContext *avctx,
>          sh->slice_qp_delta = priv->fixed_qp_idr - (pps->pic_init_qp_minus26 + 26);
>  
>  
> -    vslice->macroblock_address = sh->first_mb_in_slice;
> -    vslice->num_macroblocks    = priv->mb_width * priv->mb_height;
> +    vslice->macroblock_address = slice->index * priv->mb_width * (priv->mb_height / pic->nb_slices);
> +    if (slice->index == pic->nb_slices - 1) {
> +        vslice->num_macroblocks =  priv->mb_width *  priv->mb_height
> +                                   - slice->index * priv->mb_width * (priv->mb_height / pic->nb_slices);
> +        priv->idr_pic_count++;
> +    } else
> +        vslice->num_macroblocks = priv->mb_width * (priv->mb_height / pic->nb_slices);

This dumps all of the rounding error in the last slice.  E.g. 1080p with 8 slices gives you 68 macroblocks high, so you get seven slices with 68/8 = 8 macroblock height and the last one has 12 macroblock height.

It should be balanced so that all slices are roughly the same size (8-slice 1080p -> four slices of 9 + four slices of 8).  It might make sense to put the residual rounding error away from the middle of the frame too (so 9, 9, 8, 8, 8, 8, 9, 9), though that's probably second-order.

>  
>      vslice->macroblock_info = VA_INVALID_ID;
>  
>      vslice->slice_type           = sh->slice_type % 5;
>      vslice->pic_parameter_set_id = sh->pic_parameter_set_id;
> -    vslice->idr_pic_id           = sh->idr_pic_id;
> +    vslice->idr_pic_id           = priv->idr_pic_count;
>  
>      vslice->pic_order_cnt_lsb = sh->pic_order_cnt_lsb;
>  
> @@ -855,6 +870,10 @@ static av_cold int vaapi_encode_h264_configure(AVCodecContext *avctx)
>          }
>      }
>  
> +    if (!ctx->max_slices && avctx->slices > 0)
> +        av_log(avctx, AV_LOG_WARNING, "The encode slice option is not "
> +               "supported with the driver.\n");

Maybe this should fail in the same way as the above case where you ask for too many slices?  If the user requests them it is probably for conformance reasons, so continuing and generating a stream without slices seems unlikely to be useful.  (The Mesa driver on AMD hits this case.)

> +
>      return 0;
>  }
>  
> 

Unfortunately, this doesn't seem to work at all for me - the driver is happy with the input, but I always get corrupt output.  I tried on both Haswell and Coffee Lake with the current i965 driver.

E.g. (with "Intel i965 driver for Intel(R) Coffee Lake - 2.1.1.pre1 (2.1.0-44-g40b15a5)"):

$ ./ffmpeg_g -v 55 -y -threads 1 -hwaccel vaapi -hwaccel_device /dev/dri/renderD128 -hwaccel_output_format vaapi -i ~/video/test/bbb_1080_264.mp4 -an -c:v h264_vaapi -slices 2 -frames:v 1 test.264
...
[h264_vaapi @ 0x5607d595e280] Encode frame: 1920x1080 (0).
[h264_vaapi @ 0x5607d595e280] Pictures: IDR (0/0)
[h264_vaapi @ 0x5607d595e280] Issuing encode for pic 0/0 as type IDR.
[h264_vaapi @ 0x5607d595e280] No reference pictures.
[h264_vaapi @ 0x5607d595e280] Input surface is 0x4000013.
[h264_vaapi @ 0x5607d595e280] Recon surface is 0x400001a.
[h264_vaapi @ 0x5607d595e280] Allocated output buffer 0x8000003
[h264_vaapi @ 0x5607d595e280] Output buffer is 0x8000003.
[h264_vaapi @ 0x5607d595e280] Param buffer (27) is 0x8000002.
[h264_vaapi @ 0x5607d595e280] Param buffer (22) is 0x8000001.
[h264_vaapi @ 0x5607d595e280] Param buffer (23) is 0x8000000.
[h264_vaapi @ 0x5607d595e280] Packed header buffer (1) is 0x8000004/0x8000005 (328 bits).
[h264_vaapi @ 0x5607d595e280] Packed header buffer (4) is 0x8000006/0x8000007 (1040 bits).
[h264_vaapi @ 0x5607d595e280] Packed header buffer (3) is 0x8000008/0x8000009 (72 bits).
[h264_vaapi @ 0x5607d595e280] Param buffer (24) is 0x800000a.
[h264_vaapi @ 0x5607d595e280] Packed header buffer (3) is 0x800000b/0x800000c (72 bits).
[h264_vaapi @ 0x5607d595e280] Param buffer (24) is 0x800000d.
[h264_vaapi @ 0x5607d595e280] Pictures ending truncated GOP: IDR (0/0)
[h264_vaapi @ 0x5607d595e280] Sync to pic 0/0 (input surface 0x4000013).
[h264_vaapi @ 0x5607d595e280] Output buffer: 623 bytes (status 00000000).
[h264_vaapi @ 0x5607d595e280] Output read for pic 0/0.
...

which looks good - two slice param buffers and two slice headers.

But:

$ ffmpeg -i test.264 out.mjpeg
...
[h264 @ 0x559556ae1d40] top block unavailable for requested intra mode -1
[h264 @ 0x559556ae1d40] error while decoding MB 0 1, bytestream 132
[h264 @ 0x559556ae1d40] concealing 8090 DC, 8090 AC, 8090 MV errors in I frame
Input #0, h264, from 'test.264':
  Duration: N/A, bitrate: N/A
    Stream #0:0: Video: h264 (High), yuv420p(progressive), 1920x1080 [SAR 1:1 DAR 16:9], 60 tbr, 1200k tbn, 120 tbc
...

and the frame data does not resemble the input at all (but the headers are correct).

The reference decoder has some similar complaints and produces nothing useful:

$ ldecod.exe
...
warning: Intra_8x8_Horizontal_Up prediction mode not allowed at mb 120
warning: Intra_4x4_Vertical_Left prediction mode not allowed at mb 120
...


I do remember this working in one of the earlier versions.  What are you testing with here?  (I'll build an older driver tomorrow to try, in case that's somehow been broken.)

Thanks,

- Mark
mypopy@gmail.com Aug. 21, 2018, 12:51 a.m.
On Tue, Aug 21, 2018 at 8:05 AM Mark Thompson <sw@jkqxz.net> wrote:
>
> On 30/07/18 12:42, Jun Zhao wrote:
> > Enable multi-slice support in AVC/H.264 vaapi encoder.
> >
> > Signed-off-by: Wang, Yi A <yi.a.wang@intel.com>
> > Signed-off-by: Jun Zhao <jun.zhao@intel.com>
> > ---
> >  libavcodec/vaapi_encode_h264.c |   31 +++++++++++++++++++++++++------
> >  1 files changed, 25 insertions(+), 6 deletions(-)
> >
> > diff --git a/libavcodec/vaapi_encode_h264.c b/libavcodec/vaapi_encode_h264.c
> > index 905c507..70c89e8 100644
> > --- a/libavcodec/vaapi_encode_h264.c
> > +++ b/libavcodec/vaapi_encode_h264.c
> > @@ -581,6 +581,7 @@ static int vaapi_encode_h264_init_picture_params(AVCodecContext *avctx,
> >      H264RawSPS                       *sps = &priv->sps;
> >      VAEncPictureParameterBufferH264 *vpic = pic->codec_picture_params;
> >      int i;
> > +    int slices;
> >
> >      memset(&priv->current_access_unit, 0,
> >             sizeof(priv->current_access_unit));
> > @@ -690,7 +691,17 @@ static int vaapi_encode_h264_init_picture_params(AVCodecContext *avctx,
> >      vpic->pic_fields.bits.idr_pic_flag       = (pic->type == PICTURE_TYPE_IDR);
> >      vpic->pic_fields.bits.reference_pic_flag = (pic->type != PICTURE_TYPE_B);
> >
> > -    pic->nb_slices = 1;
> > +    slices = 1;
> > +    if (ctx->max_slices) {
> > +        if (avctx->slices <= FFMIN(ctx->max_slices, priv->mb_height)) {
> > +            slices = FFMAX(avctx->slices, slices);
> > +        } else {
> > +            av_log(avctx, AV_LOG_ERROR, "The max slices number per frame "
> > +                   "cannot be more than %d.\n", FFMIN(ctx->max_slices, priv->mb_height));
> > +            return AVERROR_INVALIDDATA;
>
> AVERROR(EINVAL) for invalid user parameters.
Will follow the comment.
>
> > +        }
> > +    }
> > +    pic->nb_slices = slices;
> >
> >      return 0;
> >  }
> > @@ -716,8 +727,7 @@ static int vaapi_encode_h264_init_slice_params(AVCodecContext *avctx,
> >          sh->nal_unit_header.nal_ref_idc   = pic->type != PICTURE_TYPE_B;
> >      }
> >
> > -    // Only one slice per frame.
> > -    sh->first_mb_in_slice = 0;
> > +    sh->first_mb_in_slice = !!slice->index;
> >      sh->slice_type        = priv->slice_type;
> >
> >      sh->pic_parameter_set_id = pps->pic_parameter_set_id;
> > @@ -738,14 +748,19 @@ static int vaapi_encode_h264_init_slice_params(AVCodecContext *avctx,
> >          sh->slice_qp_delta = priv->fixed_qp_idr - (pps->pic_init_qp_minus26 + 26);
> >
> >
> > -    vslice->macroblock_address = sh->first_mb_in_slice;
> > -    vslice->num_macroblocks    = priv->mb_width * priv->mb_height;
> > +    vslice->macroblock_address = slice->index * priv->mb_width * (priv->mb_height / pic->nb_slices);
> > +    if (slice->index == pic->nb_slices - 1) {
> > +        vslice->num_macroblocks =  priv->mb_width *  priv->mb_height
> > +                                   - slice->index * priv->mb_width * (priv->mb_height / pic->nb_slices);
> > +        priv->idr_pic_count++;
> > +    } else
> > +        vslice->num_macroblocks = priv->mb_width * (priv->mb_height / pic->nb_slices);
>
> This dumps all of the rounding error in the last slice.  E.g. 1080p with 8 slices gives you 68 macroblocks high, so you get seven slices with 68/8 = 8 macroblock height and the last one has 12 macroblock height.
>
> It should be balanced so that all slices are roughly the same size (8-slice 1080p -> four slices of 9 + four slices of 8).  It might make sense to put the residual rounding error away from the middle of the frame too (so 9, 9, 8, 8, 8, 8, 9, 9), though that's probably second-order.
I agree with the comment, as my point, how about change the slice number as :

68 / 8 = 8 .. 4, and we give (9, 9, 9, 9, 8, 8, 8, 8) in this case?

>
> >
> >      vslice->macroblock_info = VA_INVALID_ID;
> >
> >      vslice->slice_type           = sh->slice_type % 5;
> >      vslice->pic_parameter_set_id = sh->pic_parameter_set_id;
> > -    vslice->idr_pic_id           = sh->idr_pic_id;
> > +    vslice->idr_pic_id           = priv->idr_pic_count;
> >
> >      vslice->pic_order_cnt_lsb = sh->pic_order_cnt_lsb;
> >
> > @@ -855,6 +870,10 @@ static av_cold int vaapi_encode_h264_configure(AVCodecContext *avctx)
> >          }
> >      }<
>
> > +    if (!ctx->max_slices && avctx->slices > 0)
> > +        av_log(avctx, AV_LOG_WARNING, "The encode slice option is not "
> > +               "supported with the driver.\n");
>
> Maybe this should fail in the same way as the above case where you ask for too many slices?  If the user requests them it is probably for conformance reasons, so continuing and generating a stream without slices seems unlikely to be useful.  (The Mesa driver on AMD hits this case.)
Will double-check this part, Thanks.

>
> > +
> >      return 0;
> >  }
> >
> >
>
> Unfortunately, this doesn't seem to work at all for me - the driver is happy with the input, but I always get corrupt output.  I tried on both Haswell and Coffee Lake with the current i965 driver.
>
> E.g. (with "Intel i965 driver for Intel(R) Coffee Lake - 2.1.1.pre1 (2.1.0-44-g40b15a5)"):
>
> $ ./ffmpeg_g -v 55 -y -threads 1 -hwaccel vaapi -hwaccel_device /dev/dri/renderD128 -hwaccel_output_format vaapi -i ~/video/test/bbb_1080_264.mp4 -an -c:v h264_vaapi -slices 2 -frames:v 1 test.264
> ...
> [h264_vaapi @ 0x5607d595e280] Encode frame: 1920x1080 (0).
> [h264_vaapi @ 0x5607d595e280] Pictures: IDR (0/0)
> [h264_vaapi @ 0x5607d595e280] Issuing encode for pic 0/0 as type IDR.
> [h264_vaapi @ 0x5607d595e280] No reference pictures.
> [h264_vaapi @ 0x5607d595e280] Input surface is 0x4000013.
> [h264_vaapi @ 0x5607d595e280] Recon surface is 0x400001a.
> [h264_vaapi @ 0x5607d595e280] Allocated output buffer 0x8000003
> [h264_vaapi @ 0x5607d595e280] Output buffer is 0x8000003.
> [h264_vaapi @ 0x5607d595e280] Param buffer (27) is 0x8000002.
> [h264_vaapi @ 0x5607d595e280] Param buffer (22) is 0x8000001.
> [h264_vaapi @ 0x5607d595e280] Param buffer (23) is 0x8000000.
> [h264_vaapi @ 0x5607d595e280] Packed header buffer (1) is 0x8000004/0x8000005 (328 bits).
> [h264_vaapi @ 0x5607d595e280] Packed header buffer (4) is 0x8000006/0x8000007 (1040 bits).
> [h264_vaapi @ 0x5607d595e280] Packed header buffer (3) is 0x8000008/0x8000009 (72 bits).
> [h264_vaapi @ 0x5607d595e280] Param buffer (24) is 0x800000a.
> [h264_vaapi @ 0x5607d595e280] Packed header buffer (3) is 0x800000b/0x800000c (72 bits).
> [h264_vaapi @ 0x5607d595e280] Param buffer (24) is 0x800000d.
> [h264_vaapi @ 0x5607d595e280] Pictures ending truncated GOP: IDR (0/0)
> [h264_vaapi @ 0x5607d595e280] Sync to pic 0/0 (input surface 0x4000013).
> [h264_vaapi @ 0x5607d595e280] Output buffer: 623 bytes (status 00000000).
> [h264_vaapi @ 0x5607d595e280] Output read for pic 0/0.
> ...
>
> which looks good - two slice param buffers and two slice headers.
>
> But:
>
> $ ffmpeg -i test.264 out.mjpeg
> ...
> [h264 @ 0x559556ae1d40] top block unavailable for requested intra mode -1
> [h264 @ 0x559556ae1d40] error while decoding MB 0 1, bytestream 132
> [h264 @ 0x559556ae1d40] concealing 8090 DC, 8090 AC, 8090 MV errors in I frame
> Input #0, h264, from 'test.264':
>   Duration: N/A, bitrate: N/A
>     Stream #0:0: Video: h264 (High), yuv420p(progressive), 1920x1080 [SAR 1:1 DAR 16:9], 60 tbr, 1200k tbn, 120 tbc
> ...
>
> and the frame data does not resemble the input at all (but the headers are correct).
>
> The reference decoder has some similar complaints and produces nothing useful:
>
> $ ldecod.exe
> ...
> warning: Intra_8x8_Horizontal_Up prediction mode not allowed at mb 120
> warning: Intra_4x4_Vertical_Left prediction mode not allowed at mb 120
> ...
>
>
> I do remember this working in one of the earlier versions.  What are you testing with here?  (I'll build an older driver tomorrow to try, in case that's somehow been broken.)
I don't test this function in  Haswell and Coffee Lake with i965
driver, I just test this function in SKL with iHD driver, then I will
try to run this function with i965 driver in SKL. BTW, as I know, i965
and old
iHD driver has a limitation in multiple-slices encoder, it's needed
the last slice  macroblocks <=  pervious slice mackblocks, I will
confirm is it this patch trigger the limitation. Thanks
>
> Thanks,
>
> - Mark
Mark Thompson Aug. 27, 2018, 5:59 p.m.
On 21/08/18 01:51, mypopy@gmail.com wrote:
> On Tue, Aug 21, 2018 at 8:05 AM Mark Thompson <sw@jkqxz.net> wrote:
>>
>> On 30/07/18 12:42, Jun Zhao wrote:
>>> Enable multi-slice support in AVC/H.264 vaapi encoder.
>>>
>>> Signed-off-by: Wang, Yi A <yi.a.wang@intel.com>
>>> Signed-off-by: Jun Zhao <jun.zhao@intel.com>
>>> ---
>>>  libavcodec/vaapi_encode_h264.c |   31 +++++++++++++++++++++++++------
>>>  1 files changed, 25 insertions(+), 6 deletions(-)
>>>
>>> diff --git a/libavcodec/vaapi_encode_h264.c b/libavcodec/vaapi_encode_h264.c
>>> index 905c507..70c89e8 100644
>>> --- a/libavcodec/vaapi_encode_h264.c
>>> +++ b/libavcodec/vaapi_encode_h264.c
>>> @@ -581,6 +581,7 @@ static int vaapi_encode_h264_init_picture_params(AVCodecContext *avctx,
>>>      H264RawSPS                       *sps = &priv->sps;
>>>      VAEncPictureParameterBufferH264 *vpic = pic->codec_picture_params;
>>>      int i;
>>> +    int slices;
>>>
>>>      memset(&priv->current_access_unit, 0,
>>>             sizeof(priv->current_access_unit));
>>> @@ -690,7 +691,17 @@ static int vaapi_encode_h264_init_picture_params(AVCodecContext *avctx,
>>>      vpic->pic_fields.bits.idr_pic_flag       = (pic->type == PICTURE_TYPE_IDR);
>>>      vpic->pic_fields.bits.reference_pic_flag = (pic->type != PICTURE_TYPE_B);
>>>
>>> -    pic->nb_slices = 1;
>>> +    slices = 1;
>>> +    if (ctx->max_slices) {
>>> +        if (avctx->slices <= FFMIN(ctx->max_slices, priv->mb_height)) {
>>> +            slices = FFMAX(avctx->slices, slices);
>>> +        } else {
>>> +            av_log(avctx, AV_LOG_ERROR, "The max slices number per frame "
>>> +                   "cannot be more than %d.\n", FFMIN(ctx->max_slices, priv->mb_height));
>>> +            return AVERROR_INVALIDDATA;
>>
>> AVERROR(EINVAL) for invalid user parameters.
> Will follow the comment.
>>
>>> +        }
>>> +    }
>>> +    pic->nb_slices = slices;
>>>
>>>      return 0;
>>>  }
>>> @@ -716,8 +727,7 @@ static int vaapi_encode_h264_init_slice_params(AVCodecContext *avctx,
>>>          sh->nal_unit_header.nal_ref_idc   = pic->type != PICTURE_TYPE_B;
>>>      }
>>>
>>> -    // Only one slice per frame.
>>> -    sh->first_mb_in_slice = 0;
>>> +    sh->first_mb_in_slice = !!slice->index;

The problem is here.  This is not what first_mb_in_slice means - it should be set to the same value as macroblock_address.

>>>      sh->slice_type        = priv->slice_type;
>>>
>>>      sh->pic_parameter_set_id = pps->pic_parameter_set_id;
>>> @@ -738,14 +748,19 @@ static int vaapi_encode_h264_init_slice_params(AVCodecContext *avctx,
>>>          sh->slice_qp_delta = priv->fixed_qp_idr - (pps->pic_init_qp_minus26 + 26);
>>>
>>>
>>> -    vslice->macroblock_address = sh->first_mb_in_slice;
>>> -    vslice->num_macroblocks    = priv->mb_width * priv->mb_height;
>>> +    vslice->macroblock_address = slice->index * priv->mb_width * (priv->mb_height / pic->nb_slices);
>>> +    if (slice->index == pic->nb_slices - 1) {
>>> +        vslice->num_macroblocks =  priv->mb_width *  priv->mb_height
>>> +                                   - slice->index * priv->mb_width * (priv->mb_height / pic->nb_slices);
>>> +        priv->idr_pic_count++;
>>> +    } else
>>> +        vslice->num_macroblocks = priv->mb_width * (priv->mb_height / pic->nb_slices);
>>
>> This dumps all of the rounding error in the last slice.  E.g. 1080p with 8 slices gives you 68 macroblocks high, so you get seven slices with 68/8 = 8 macroblock height and the last one has 12 macroblock height.
>>
>> It should be balanced so that all slices are roughly the same size (8-slice 1080p -> four slices of 9 + four slices of 8).  It might make sense to put the residual rounding error away from the middle of the frame too (so 9, 9, 8, 8, 8, 8, 9, 9), though that's probably second-order.
> I agree with the comment, as my point, how about change the slice number as :
> 
> 68 / 8 = 8 .. 4, and we give (9, 9, 9, 9, 8, 8, 8, 8) in this case?

Is it the size constraint mentioned below which is causing you to make that choice?  Intuitively I would place the larger slices at the top and bottom of the frame, hence (9, 9, 8, 8, 8, 8, 9, 9).

>>
>>>
>>>      vslice->macroblock_info = VA_INVALID_ID;
>>>
>>>      vslice->slice_type           = sh->slice_type % 5;
>>>      vslice->pic_parameter_set_id = sh->pic_parameter_set_id;
>>> -    vslice->idr_pic_id           = sh->idr_pic_id;
>>> +    vslice->idr_pic_id           = priv->idr_pic_count;
>>>
>>>      vslice->pic_order_cnt_lsb = sh->pic_order_cnt_lsb;
>>>
>>> @@ -855,6 +870,10 @@ static av_cold int vaapi_encode_h264_configure(AVCodecContext *avctx)
>>>          }
>>>      }<
>>
>>> +    if (!ctx->max_slices && avctx->slices > 0)
>>> +        av_log(avctx, AV_LOG_WARNING, "The encode slice option is not "
>>> +               "supported with the driver.\n");
>>
>> Maybe this should fail in the same way as the above case where you ask for too many slices?  If the user requests them it is probably for conformance reasons, so continuing and generating a stream without slices seems unlikely to be useful.  (The Mesa driver on AMD hits this case.)
> Will double-check this part, Thanks.
> 
>>
>>> +
>>>      return 0;
>>>  }
>>>
>>>
>>
>> Unfortunately, this doesn't seem to work at all for me - the driver is happy with the input, but I always get corrupt output.  I tried on both Haswell and Coffee Lake with the current i965 driver.
>>
>> E.g. (with "Intel i965 driver for Intel(R) Coffee Lake - 2.1.1.pre1 (2.1.0-44-g40b15a5)"):
>>
>> $ ./ffmpeg_g -v 55 -y -threads 1 -hwaccel vaapi -hwaccel_device /dev/dri/renderD128 -hwaccel_output_format vaapi -i ~/video/test/bbb_1080_264.mp4 -an -c:v h264_vaapi -slices 2 -frames:v 1 test.264
>> ...
>> [h264_vaapi @ 0x5607d595e280] Encode frame: 1920x1080 (0).
>> [h264_vaapi @ 0x5607d595e280] Pictures: IDR (0/0)
>> [h264_vaapi @ 0x5607d595e280] Issuing encode for pic 0/0 as type IDR.
>> [h264_vaapi @ 0x5607d595e280] No reference pictures.
>> [h264_vaapi @ 0x5607d595e280] Input surface is 0x4000013.
>> [h264_vaapi @ 0x5607d595e280] Recon surface is 0x400001a.
>> [h264_vaapi @ 0x5607d595e280] Allocated output buffer 0x8000003
>> [h264_vaapi @ 0x5607d595e280] Output buffer is 0x8000003.
>> [h264_vaapi @ 0x5607d595e280] Param buffer (27) is 0x8000002.
>> [h264_vaapi @ 0x5607d595e280] Param buffer (22) is 0x8000001.
>> [h264_vaapi @ 0x5607d595e280] Param buffer (23) is 0x8000000.
>> [h264_vaapi @ 0x5607d595e280] Packed header buffer (1) is 0x8000004/0x8000005 (328 bits).
>> [h264_vaapi @ 0x5607d595e280] Packed header buffer (4) is 0x8000006/0x8000007 (1040 bits).
>> [h264_vaapi @ 0x5607d595e280] Packed header buffer (3) is 0x8000008/0x8000009 (72 bits).
>> [h264_vaapi @ 0x5607d595e280] Param buffer (24) is 0x800000a.
>> [h264_vaapi @ 0x5607d595e280] Packed header buffer (3) is 0x800000b/0x800000c (72 bits).
>> [h264_vaapi @ 0x5607d595e280] Param buffer (24) is 0x800000d.
>> [h264_vaapi @ 0x5607d595e280] Pictures ending truncated GOP: IDR (0/0)
>> [h264_vaapi @ 0x5607d595e280] Sync to pic 0/0 (input surface 0x4000013).
>> [h264_vaapi @ 0x5607d595e280] Output buffer: 623 bytes (status 00000000).
>> [h264_vaapi @ 0x5607d595e280] Output read for pic 0/0.
>> ...
>>
>> which looks good - two slice param buffers and two slice headers.
>>
>> But:
>>
>> $ ffmpeg -i test.264 out.mjpeg
>> ...
>> [h264 @ 0x559556ae1d40] top block unavailable for requested intra mode -1
>> [h264 @ 0x559556ae1d40] error while decoding MB 0 1, bytestream 132
>> [h264 @ 0x559556ae1d40] concealing 8090 DC, 8090 AC, 8090 MV errors in I frame
>> Input #0, h264, from 'test.264':
>>   Duration: N/A, bitrate: N/A
>>     Stream #0:0: Video: h264 (High), yuv420p(progressive), 1920x1080 [SAR 1:1 DAR 16:9], 60 tbr, 1200k tbn, 120 tbc
>> ...
>>
>> and the frame data does not resemble the input at all (but the headers are correct).
>>
>> The reference decoder has some similar complaints and produces nothing useful:
>>
>> $ ldecod.exe
>> ...
>> warning: Intra_8x8_Horizontal_Up prediction mode not allowed at mb 120
>> warning: Intra_4x4_Vertical_Left prediction mode not allowed at mb 120
>> ...
>>
>>
>> I do remember this working in one of the earlier versions.  What are you testing with here?  (I'll build an older driver tomorrow to try, in case that's somehow been broken.)
> I don't test this function in  Haswell and Coffee Lake with i965
> driver, I just test this function in SKL with iHD driver, then I will
> try to run this function with i965 driver in SKL. BTW, as I know, i965
> and old
> iHD driver has a limitation in multiple-slices encoder, it's needed
> the last slice  macroblocks <=  pervious slice mackblocks, I will
> confirm is it this patch trigger the limitation. Thanks

The problem which caused the corrupt output was that first_mb_in_slice was not set correctly; see above.  With that fixed, the patch does work for me on the i965 driver.

Thanks,

- Mark
mypopy@gmail.com Aug. 28, 2018, 2:32 a.m.
On Tue, Aug 28, 2018 at 2:07 AM Mark Thompson <sw@jkqxz.net> wrote:
>
> On 21/08/18 01:51, mypopy@gmail.com wrote:
> > On Tue, Aug 21, 2018 at 8:05 AM Mark Thompson <sw@jkqxz.net> wrote:
> >>
> >> On 30/07/18 12:42, Jun Zhao wrote:
> >>> Enable multi-slice support in AVC/H.264 vaapi encoder.
> >>>
> >>> Signed-off-by: Wang, Yi A <yi.a.wang@intel.com>
> >>> Signed-off-by: Jun Zhao <jun.zhao@intel.com>
> >>> ---
> >>>  libavcodec/vaapi_encode_h264.c |   31 +++++++++++++++++++++++++------
> >>>  1 files changed, 25 insertions(+), 6 deletions(-)
> >>>
> >>> diff --git a/libavcodec/vaapi_encode_h264.c b/libavcodec/vaapi_encode_h264.c
> >>> index 905c507..70c89e8 100644
> >>> --- a/libavcodec/vaapi_encode_h264.c
> >>> +++ b/libavcodec/vaapi_encode_h264.c
> >>> @@ -581,6 +581,7 @@ static int vaapi_encode_h264_init_picture_params(AVCodecContext *avctx,
> >>>      H264RawSPS                       *sps = &priv->sps;
> >>>      VAEncPictureParameterBufferH264 *vpic = pic->codec_picture_params;
> >>>      int i;
> >>> +    int slices;
> >>>
> >>>      memset(&priv->current_access_unit, 0,
> >>>             sizeof(priv->current_access_unit));
> >>> @@ -690,7 +691,17 @@ static int vaapi_encode_h264_init_picture_params(AVCodecContext *avctx,
> >>>      vpic->pic_fields.bits.idr_pic_flag       = (pic->type == PICTURE_TYPE_IDR);
> >>>      vpic->pic_fields.bits.reference_pic_flag = (pic->type != PICTURE_TYPE_B);
> >>>
> >>> -    pic->nb_slices = 1;
> >>> +    slices = 1;
> >>> +    if (ctx->max_slices) {
> >>> +        if (avctx->slices <= FFMIN(ctx->max_slices, priv->mb_height)) {
> >>> +            slices = FFMAX(avctx->slices, slices);
> >>> +        } else {
> >>> +            av_log(avctx, AV_LOG_ERROR, "The max slices number per frame "
> >>> +                   "cannot be more than %d.\n", FFMIN(ctx->max_slices, priv->mb_height));
> >>> +            return AVERROR_INVALIDDATA;
> >>
> >> AVERROR(EINVAL) for invalid user parameters.
> > Will follow the comment.
> >>
> >>> +        }
> >>> +    }
> >>> +    pic->nb_slices = slices;
> >>>
> >>>      return 0;
> >>>  }
> >>> @@ -716,8 +727,7 @@ static int vaapi_encode_h264_init_slice_params(AVCodecContext *avctx,
> >>>          sh->nal_unit_header.nal_ref_idc   = pic->type != PICTURE_TYPE_B;
> >>>      }
> >>>
> >>> -    // Only one slice per frame.
> >>> -    sh->first_mb_in_slice = 0;
> >>> +    sh->first_mb_in_slice = !!slice->index;
>
> The problem is here.  This is not what first_mb_in_slice means - it should be set to the same value as macroblock_address.
>
> >>>      sh->slice_type        = priv->slice_type;
> >>>
> >>>      sh->pic_parameter_set_id = pps->pic_parameter_set_id;
> >>> @@ -738,14 +748,19 @@ static int vaapi_encode_h264_init_slice_params(AVCodecContext *avctx,
> >>>          sh->slice_qp_delta = priv->fixed_qp_idr - (pps->pic_init_qp_minus26 + 26);
> >>>
> >>>
> >>> -    vslice->macroblock_address = sh->first_mb_in_slice;
> >>> -    vslice->num_macroblocks    = priv->mb_width * priv->mb_height;
> >>> +    vslice->macroblock_address = slice->index * priv->mb_width * (priv->mb_height / pic->nb_slices);
> >>> +    if (slice->index == pic->nb_slices - 1) {
> >>> +        vslice->num_macroblocks =  priv->mb_width *  priv->mb_height
> >>> +                                   - slice->index * priv->mb_width * (priv->mb_height / pic->nb_slices);
> >>> +        priv->idr_pic_count++;
> >>> +    } else
> >>> +        vslice->num_macroblocks = priv->mb_width * (priv->mb_height / pic->nb_slices);
> >>
> >> This dumps all of the rounding error in the last slice.  E.g. 1080p with 8 slices gives you 68 macroblocks high, so you get seven slices with 68/8 = 8 macroblock height and the last one has 12 macroblock height.
> >>
> >> It should be balanced so that all slices are roughly the same size (8-slice 1080p -> four slices of 9 + four slices of 8).  It might make sense to put the residual rounding error away from the middle of the frame too (so 9, 9, 8, 8, 8, 8, 9, 9), though that's probably second-order.
> > I agree with the comment, as my point, how about change the slice number as :
> >
> > 68 / 8 = 8 .. 4, and we give (9, 9, 9, 9, 8, 8, 8, 8) in this case?
>
> Is it the size constraint mentioned below which is causing you to make that choice?  Intuitively I would place the larger slices at the top and bottom of the frame, hence (9, 9, 8, 8, 8, 8, 9, 9).
>
Yes, this is the reason causing me to make that choice like (9, 9, 9,
9, 8, 8, 8, 8), than we can avoid some mutil-slice encoding issue when
use a old driver (i965 or iHD).
> >>
> >>>
> >>>      vslice->macroblock_info = VA_INVALID_ID;
> >>>
> >>>      vslice->slice_type           = sh->slice_type % 5;
> >>>      vslice->pic_parameter_set_id = sh->pic_parameter_set_id;
> >>> -    vslice->idr_pic_id           = sh->idr_pic_id;
> >>> +    vslice->idr_pic_id           = priv->idr_pic_count;
> >>>
> >>>      vslice->pic_order_cnt_lsb = sh->pic_order_cnt_lsb;
> >>>
> >>> @@ -855,6 +870,10 @@ static av_cold int vaapi_encode_h264_configure(AVCodecContext *avctx)
> >>>          }
> >>>      }<
> >>
> >>> +    if (!ctx->max_slices && avctx->slices > 0)
> >>> +        av_log(avctx, AV_LOG_WARNING, "The encode slice option is not "
> >>> +               "supported with the driver.\n");
> >>
> >> Maybe this should fail in the same way as the above case where you ask for too many slices?  If the user requests them it is probably for conformance reasons, so continuing and generating a stream without slices seems unlikely to be useful.  (The Mesa driver on AMD hits this case.)
> > Will double-check this part, Thanks.
> >
> >>
> >>> +
> >>>      return 0;
> >>>  }
> >>>
> >>>
> >>
> >> Unfortunately, this doesn't seem to work at all for me - the driver is happy with the input, but I always get corrupt output.  I tried on both Haswell and Coffee Lake with the current i965 driver.
> >>
> >> E.g. (with "Intel i965 driver for Intel(R) Coffee Lake - 2.1.1.pre1 (2.1.0-44-g40b15a5)"):
> >>
> >> $ ./ffmpeg_g -v 55 -y -threads 1 -hwaccel vaapi -hwaccel_device /dev/dri/renderD128 -hwaccel_output_format vaapi -i ~/video/test/bbb_1080_264.mp4 -an -c:v h264_vaapi -slices 2 -frames:v 1 test.264
> >> ...
> >> [h264_vaapi @ 0x5607d595e280] Encode frame: 1920x1080 (0).
> >> [h264_vaapi @ 0x5607d595e280] Pictures: IDR (0/0)
> >> [h264_vaapi @ 0x5607d595e280] Issuing encode for pic 0/0 as type IDR.
> >> [h264_vaapi @ 0x5607d595e280] No reference pictures.
> >> [h264_vaapi @ 0x5607d595e280] Input surface is 0x4000013.
> >> [h264_vaapi @ 0x5607d595e280] Recon surface is 0x400001a.
> >> [h264_vaapi @ 0x5607d595e280] Allocated output buffer 0x8000003
> >> [h264_vaapi @ 0x5607d595e280] Output buffer is 0x8000003.
> >> [h264_vaapi @ 0x5607d595e280] Param buffer (27) is 0x8000002.
> >> [h264_vaapi @ 0x5607d595e280] Param buffer (22) is 0x8000001.
> >> [h264_vaapi @ 0x5607d595e280] Param buffer (23) is 0x8000000.
> >> [h264_vaapi @ 0x5607d595e280] Packed header buffer (1) is 0x8000004/0x8000005 (328 bits).
> >> [h264_vaapi @ 0x5607d595e280] Packed header buffer (4) is 0x8000006/0x8000007 (1040 bits).
> >> [h264_vaapi @ 0x5607d595e280] Packed header buffer (3) is 0x8000008/0x8000009 (72 bits).
> >> [h264_vaapi @ 0x5607d595e280] Param buffer (24) is 0x800000a.
> >> [h264_vaapi @ 0x5607d595e280] Packed header buffer (3) is 0x800000b/0x800000c (72 bits).
> >> [h264_vaapi @ 0x5607d595e280] Param buffer (24) is 0x800000d.
> >> [h264_vaapi @ 0x5607d595e280] Pictures ending truncated GOP: IDR (0/0)
> >> [h264_vaapi @ 0x5607d595e280] Sync to pic 0/0 (input surface 0x4000013).
> >> [h264_vaapi @ 0x5607d595e280] Output buffer: 623 bytes (status 00000000).
> >> [h264_vaapi @ 0x5607d595e280] Output read for pic 0/0.
> >> ...
> >>
> >> which looks good - two slice param buffers and two slice headers.
> >>
> >> But:
> >>
> >> $ ffmpeg -i test.264 out.mjpeg
> >> ...
> >> [h264 @ 0x559556ae1d40] top block unavailable for requested intra mode -1
> >> [h264 @ 0x559556ae1d40] error while decoding MB 0 1, bytestream 132
> >> [h264 @ 0x559556ae1d40] concealing 8090 DC, 8090 AC, 8090 MV errors in I frame
> >> Input #0, h264, from 'test.264':
> >>   Duration: N/A, bitrate: N/A
> >>     Stream #0:0: Video: h264 (High), yuv420p(progressive), 1920x1080 [SAR 1:1 DAR 16:9], 60 tbr, 1200k tbn, 120 tbc
> >> ...
> >>
> >> and the frame data does not resemble the input at all (but the headers are correct).
> >>
> >> The reference decoder has some similar complaints and produces nothing useful:
> >>
> >> $ ldecod.exe
> >> ...
> >> warning: Intra_8x8_Horizontal_Up prediction mode not allowed at mb 120
> >> warning: Intra_4x4_Vertical_Left prediction mode not allowed at mb 120
> >> ...
> >>
> >>
> >> I do remember this working in one of the earlier versions.  What are you testing with here?  (I'll build an older driver tomorrow to try, in case that's somehow been broken.)
> > I don't test this function in  Haswell and Coffee Lake with i965
> > driver, I just test this function in SKL with iHD driver, then I will
> > try to run this function with i965 driver in SKL. BTW, as I know, i965
> > and old
> > iHD driver has a limitation in multiple-slices encoder, it's needed
> > the last slice  macroblocks <=  pervious slice mackblocks, I will
> > confirm is it this patch trigger the limitation. Thanks
>
> The problem which caused the corrupt output was that first_mb_in_slice was not set correctly; see above.  With that fixed, the patch does work for me on the i965 driver.
>
> Thanks,
>
> - Mark
Thanks root cause this issue, will fix this part.

Patch hide | download patch | download mbox

diff --git a/libavcodec/vaapi_encode_h264.c b/libavcodec/vaapi_encode_h264.c
index 905c507..70c89e8 100644
--- a/libavcodec/vaapi_encode_h264.c
+++ b/libavcodec/vaapi_encode_h264.c
@@ -581,6 +581,7 @@  static int vaapi_encode_h264_init_picture_params(AVCodecContext *avctx,
     H264RawSPS                       *sps = &priv->sps;
     VAEncPictureParameterBufferH264 *vpic = pic->codec_picture_params;
     int i;
+    int slices;
 
     memset(&priv->current_access_unit, 0,
            sizeof(priv->current_access_unit));
@@ -690,7 +691,17 @@  static int vaapi_encode_h264_init_picture_params(AVCodecContext *avctx,
     vpic->pic_fields.bits.idr_pic_flag       = (pic->type == PICTURE_TYPE_IDR);
     vpic->pic_fields.bits.reference_pic_flag = (pic->type != PICTURE_TYPE_B);
 
-    pic->nb_slices = 1;
+    slices = 1;
+    if (ctx->max_slices) {
+        if (avctx->slices <= FFMIN(ctx->max_slices, priv->mb_height)) {
+            slices = FFMAX(avctx->slices, slices);
+        } else {
+            av_log(avctx, AV_LOG_ERROR, "The max slices number per frame "
+                   "cannot be more than %d.\n", FFMIN(ctx->max_slices, priv->mb_height));
+            return AVERROR_INVALIDDATA;
+        }
+    }
+    pic->nb_slices = slices;
 
     return 0;
 }
@@ -716,8 +727,7 @@  static int vaapi_encode_h264_init_slice_params(AVCodecContext *avctx,
         sh->nal_unit_header.nal_ref_idc   = pic->type != PICTURE_TYPE_B;
     }
 
-    // Only one slice per frame.
-    sh->first_mb_in_slice = 0;
+    sh->first_mb_in_slice = !!slice->index;
     sh->slice_type        = priv->slice_type;
 
     sh->pic_parameter_set_id = pps->pic_parameter_set_id;
@@ -738,14 +748,19 @@  static int vaapi_encode_h264_init_slice_params(AVCodecContext *avctx,
         sh->slice_qp_delta = priv->fixed_qp_idr - (pps->pic_init_qp_minus26 + 26);
 
 
-    vslice->macroblock_address = sh->first_mb_in_slice;
-    vslice->num_macroblocks    = priv->mb_width * priv->mb_height;
+    vslice->macroblock_address = slice->index * priv->mb_width * (priv->mb_height / pic->nb_slices);
+    if (slice->index == pic->nb_slices - 1) {
+        vslice->num_macroblocks =  priv->mb_width *  priv->mb_height
+                                   - slice->index * priv->mb_width * (priv->mb_height / pic->nb_slices);
+        priv->idr_pic_count++;
+    } else
+        vslice->num_macroblocks = priv->mb_width * (priv->mb_height / pic->nb_slices);
 
     vslice->macroblock_info = VA_INVALID_ID;
 
     vslice->slice_type           = sh->slice_type % 5;
     vslice->pic_parameter_set_id = sh->pic_parameter_set_id;
-    vslice->idr_pic_id           = sh->idr_pic_id;
+    vslice->idr_pic_id           = priv->idr_pic_count;
 
     vslice->pic_order_cnt_lsb = sh->pic_order_cnt_lsb;
 
@@ -855,6 +870,10 @@  static av_cold int vaapi_encode_h264_configure(AVCodecContext *avctx)
         }
     }
 
+    if (!ctx->max_slices && avctx->slices > 0)
+        av_log(avctx, AV_LOG_WARNING, "The encode slice option is not "
+               "supported with the driver.\n");
+
     return 0;
 }