
[FFmpeg-devel,v1,1/2] vaapi: add vaapi_cavs support

Message ID 20240119154950.444144-1-jianfeng.zheng@mthreads.com
State New
Series [FFmpeg-devel,v1,1/2] vaapi: add vaapi_cavs support

Checks

Context Check Description
yinshiyou/make_loongarch64 success Make finished
yinshiyou/make_fate_loongarch64 fail Make fate failed
andriy/make_x86 success Make finished
andriy/make_fate_x86 fail Make fate failed

Commit Message

Jianfeng Zheng Jan. 19, 2024, 3:49 p.m. UTC
see https://github.com/intel/libva/pull/738

[Moore Threads](https://www.mthreads.com) (Mthreads for short) is a
Chinese GPU manufacturer. All our products, such as the MTTS70/MTTS80,
support AVS/AVS+ HW decoding at up to 2K resolution.
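
Once applied, decoding can be exercised through the usual VAAPI options, for
example (the render node and input file name below are only placeholders):

    ffmpeg -hwaccel vaapi -hwaccel_device /dev/dri/renderD128 -i input.ts -f null -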

Signed-off-by: jianfeng.zheng <jianfeng.zheng@mthreads.com>
---
 configure                 |  14 ++
 libavcodec/Makefile       |   1 +
 libavcodec/cavs.c         |  12 +
 libavcodec/cavs.h         |  36 ++-
 libavcodec/cavs_parser.c  |  16 ++
 libavcodec/cavsdec.c      | 473 +++++++++++++++++++++++++++++++++-----
 libavcodec/defs.h         |   3 +
 libavcodec/hwaccels.h     |   1 +
 libavcodec/profiles.c     |   6 +
 libavcodec/profiles.h     |   1 +
 libavcodec/vaapi_cavs.c   | 164 +++++++++++++
 libavcodec/vaapi_decode.c |   4 +
 12 files changed, 669 insertions(+), 62 deletions(-)
 create mode 100644 libavcodec/vaapi_cavs.c

Comments

Zhao Zhili Jan. 20, 2024, 4:20 a.m. UTC | #1
> -----Original Message-----
> From: ffmpeg-devel <ffmpeg-devel-bounces@ffmpeg.org> On Behalf Of jianfeng.zheng
> Sent: January 19, 2024 23:50
> To: ffmpeg-devel@ffmpeg.org
> Cc: jianfeng.zheng <jianfeng.zheng@mthreads.com>
> Subject: [FFmpeg-devel] [PATCH v1 1/2] vaapi: add vaapi_cavs support
> 
> see https://github.com/intel/libva/pull/738
> 
> [Moore Threads](https://www.mthreads.com) (Mthreads for short) is a
> Chinese GPU manufacturer. All our products, such as the MTTS70/MTTS80,
> support AVS/AVS+ HW decoding at up to 2K resolution.

Please use a more objective and neutral description.

> 
> Signed-off-by: jianfeng.zheng <jianfeng.zheng@mthreads.com>
> ---
>  configure                 |  14 ++
>  libavcodec/Makefile       |   1 +
>  libavcodec/cavs.c         |  12 +
>  libavcodec/cavs.h         |  36 ++-
>  libavcodec/cavs_parser.c  |  16 ++
>  libavcodec/cavsdec.c      | 473 +++++++++++++++++++++++++++++++++-----
>  libavcodec/defs.h         |   3 +
>  libavcodec/hwaccels.h     |   1 +
>  libavcodec/profiles.c     |   6 +
>  libavcodec/profiles.h     |   1 +
>  libavcodec/vaapi_cavs.c   | 164 +++++++++++++
>  libavcodec/vaapi_decode.c |   4 +
>  12 files changed, 669 insertions(+), 62 deletions(-)
>  create mode 100644 libavcodec/vaapi_cavs.c
> 
> diff --git a/configure b/configure
> index c8ae0a061d..89759eda5d 100755
> --- a/configure
> +++ b/configure
> @@ -2463,6 +2463,7 @@ HAVE_LIST="
>      xmllint
>      zlib_gzip
>      openvino2
> +    va_profile_avs
>  "
> 
>  # options emitted with CONFIG_ prefix but not available on the command line
> @@ -3202,6 +3203,7 @@ wmv3_dxva2_hwaccel_select="vc1_dxva2_hwaccel"
>  wmv3_nvdec_hwaccel_select="vc1_nvdec_hwaccel"
>  wmv3_vaapi_hwaccel_select="vc1_vaapi_hwaccel"
>  wmv3_vdpau_hwaccel_select="vc1_vdpau_hwaccel"
> +cavs_vaapi_hwaccel_deps="vaapi va_profile_avs VAPictureParameterBufferAVS"
> 
>  # hardware-accelerated codecs
>  mediafoundation_deps="mftransform_h MFCreateAlignedMemoryBuffer"
> @@ -7175,6 +7177,18 @@ if enabled vaapi; then
>      check_type "va/va.h va/va_enc_vp8.h"  "VAEncPictureParameterBufferVP8"
>      check_type "va/va.h va/va_enc_vp9.h"  "VAEncPictureParameterBufferVP9"
>      check_type "va/va.h va/va_enc_av1.h"  "VAEncPictureParameterBufferAV1"
> +
> +    #
> +    # Using 'VA_CHECK_VERSION' in the source code would make things easy, but we would
> +    # have to wait for the newly added VAProfile values to ship in a released VAAPI version.
> +    #
> +    # Auto-detection, on the other hand, keeps version compatibility both before and
> +    # after that release, so it always works.
> +    #
> +    disable va_profile_avs &&
> +        test_code cc va/va.h "VAProfile p1 = VAProfileAVSJizhun, p2 = VAProfileAVSGuangdian;" &&
> +        enable va_profile_avs
> +    enabled va_profile_avs && check_type "va/va.h va/va_dec_avs.h" "VAPictureParameterBufferAVS"
>  fi
> 
>  if enabled_all opencl libdrm ; then
> diff --git a/libavcodec/Makefile b/libavcodec/Makefile
> index bb42095165..7d92375fed 100644
> --- a/libavcodec/Makefile
> +++ b/libavcodec/Makefile
> @@ -1055,6 +1055,7 @@ OBJS-$(CONFIG_VP9_VAAPI_HWACCEL)          += vaapi_vp9.o
>  OBJS-$(CONFIG_VP9_VDPAU_HWACCEL)          += vdpau_vp9.o
>  OBJS-$(CONFIG_VP9_VIDEOTOOLBOX_HWACCEL)   += videotoolbox_vp9.o
>  OBJS-$(CONFIG_VP8_QSV_HWACCEL)            += qsvdec.o
> +OBJS-$(CONFIG_CAVS_VAAPI_HWACCEL)         += vaapi_cavs.o
> 
>  # Objects duplicated from other libraries for shared builds
>  SHLIBOBJS                              += log2_tab.o reverse.o
> diff --git a/libavcodec/cavs.c b/libavcodec/cavs.c
> index fdd577f7fb..ed7b278336 100644
> --- a/libavcodec/cavs.c
> +++ b/libavcodec/cavs.c

Please split the patch.

> @@ -810,6 +810,14 @@ av_cold int ff_cavs_init(AVCodecContext *avctx)
>      if (!h->cur.f || !h->DPB[0].f || !h->DPB[1].f)
>          return AVERROR(ENOMEM);
> 
> +    h->out[0].f = av_frame_alloc();
> +    h->out[1].f = av_frame_alloc();
> +    h->out[2].f = av_frame_alloc();
> +    if (!h->out[0].f || !h->out[1].f || !h->out[2].f) {
> +        ff_cavs_end(avctx);
> +        return AVERROR(ENOMEM);
> +    }
> +
>      h->luma_scan[0]                     = 0;
>      h->luma_scan[1]                     = 8;
>      h->intra_pred_l[INTRA_L_VERT]       = intra_pred_vert;
> @@ -840,6 +848,10 @@ av_cold int ff_cavs_end(AVCodecContext *avctx)
>      av_frame_free(&h->DPB[0].f);
>      av_frame_free(&h->DPB[1].f);
> 
> +    av_frame_free(&h->out[0].f);
> +    av_frame_free(&h->out[1].f);
> +    av_frame_free(&h->out[2].f);
> +
>      av_freep(&h->top_qp);
>      av_freep(&h->top_mv[0]);
>      av_freep(&h->top_mv[1]);
> diff --git a/libavcodec/cavs.h b/libavcodec/cavs.h
> index 244c322b35..ef03c1a974 100644
> --- a/libavcodec/cavs.h
> +++ b/libavcodec/cavs.h
> @@ -39,8 +39,10 @@
>  #define EXT_START_CODE          0x000001b5
>  #define USER_START_CODE         0x000001b2
>  #define CAVS_START_CODE         0x000001b0
> +#define VIDEO_SEQ_END_CODE      0x000001b1
>  #define PIC_I_START_CODE        0x000001b3
>  #define PIC_PB_START_CODE       0x000001b6
> +#define VIDEO_EDIT_CODE         0x000001b7
> 
>  #define A_AVAIL                          1
>  #define B_AVAIL                          2
> @@ -164,10 +166,15 @@ struct dec_2dvlc {
>  typedef struct AVSFrame {
>      AVFrame *f;
>      int poc;
> +    int outputed;
> +
> +    AVBufferRef   *hwaccel_priv_buf;
> +    void          *hwaccel_picture_private;
>  } AVSFrame;
> 
>  typedef struct AVSContext {
>      AVCodecContext *avctx;
> +    int got_pix_fmt;
>      BlockDSPContext bdsp;
>      H264ChromaContext h264chroma;
>      VideoDSPContext vdsp;
> @@ -175,6 +182,7 @@ typedef struct AVSContext {
>      GetBitContext gb;
>      AVSFrame cur;     ///< currently decoded frame
>      AVSFrame DPB[2];  ///< reference frames
> +    AVSFrame out[3];  ///< output queue; size 2 may be enough
>      int dist[2];     ///< temporal distances from current frame to ref frames
>      int low_delay;
>      int profile, level;
> @@ -182,12 +190,38 @@ typedef struct AVSContext {
>      int mb_width, mb_height;
>      int width, height;
>      int stream_revision; ///<0 for samples from 2006, 1 for rm52j encoder
> -    int progressive;
> +    int progressive_seq;
> +    int progressive_frame;
>      int pic_structure;
> +    int no_forward_ref_flag;
> +    int pb_field_enhanced_flag;   ///< only used in GUANGDIAN
>      int skip_mode_flag; ///< select between skip_count or one skip_flag per MB
>      int loop_filter_disable;
>      int alpha_offset, beta_offset;
>      int ref_flag;
> +
> +    /** \defgroup guangdian profile
> +     * @{
> +     */
> +    int aec_flag;
> +    int weight_quant_flag;
> +    int chroma_quant_param_delta_cb;
> +    int chroma_quant_param_delta_cr;
> +    uint8_t wqm_8x8[64];
> +    /**@}*/
> +
> +    /** \defgroup slice weighting
> +     * FFmpeg doesn't support slice weighting natively, but it may be needed for HWaccel.
> +     * @{
> +     */
> +    uint32_t slice_weight_pred_flag : 1;
> +    uint32_t mb_weight_pred_flag    : 1;
> +    uint8_t luma_scale[4];
> +    int8_t luma_shift[4];
> +    uint8_t chroma_scale[4];
> +    int8_t chroma_shift[4];
> +    /**@}*/
> +
>      int mbx, mby, mbidx; ///< macroblock coordinates
>      int flags;         ///< availability flags of neighbouring macroblocks
>      int stc;           ///< last start code
> diff --git a/libavcodec/cavs_parser.c b/libavcodec/cavs_parser.c
> index 4a03effd0f..a41a82d8d1 100644
> --- a/libavcodec/cavs_parser.c
> +++ b/libavcodec/cavs_parser.c
> @@ -65,6 +65,22 @@ static int cavs_find_frame_end(ParseContext *pc, const uint8_t *buf,
>                  pc->state=-1;
>                  return i-3;
>              }
> +            if((state&0xFFFFFF00) == 0x100){
> +                if(state != EXT_START_CODE && state != USER_START_CODE){
> +                    state = state >> 8;
> +                    break;
> +                }
> +            }
> +        }
> +        for(; i<buf_size; i++){
> +            state= (state<<8) | buf[i];
> +            if((state&0xFFFFFF00) == 0x100){
> +                if(state > SLICE_MAX_START_CODE){
> +                    pc->frame_start_found=0;
> +                    pc->state=-1;
> +                    return i-3;
> +                }
> +            }

Why? A parser should split data into frames, not slices.
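
For illustration, a frame-level splitter would treat slice start codes as part
of the current frame and only terminate on higher-level codes; a rough sketch
of that check (untested, ends_frame() is a made-up name):

    /* A start code ends the current frame only if it lies above the slice
     * range (0x00000100..SLICE_MAX_START_CODE); slices stay inside the frame. */
    static int ends_frame(uint32_t state)
    {
        if ((state & 0xFFFFFF00) != 0x100)
            return 0;                         /* not a start code at all */
        return state > SLICE_MAX_START_CODE;  /* sequence/picture/etc. level */
    }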

>          }
>      }
>      pc->frame_start_found= pic_found;
> diff --git a/libavcodec/cavsdec.c b/libavcodec/cavsdec.c
> index b356da0b04..18c38cd3ff 100644
> --- a/libavcodec/cavsdec.c
> +++ b/libavcodec/cavsdec.c
> @@ -25,11 +25,15 @@
>   * @author Stefan Gehrer <stefan.gehrer@gmx.de>
>   */
> 
> +#include "config_components.h"
>  #include "libavutil/avassert.h"
>  #include "libavutil/emms.h"
>  #include "avcodec.h"
>  #include "get_bits.h"
>  #include "golomb.h"
> +#include "hwaccel_internal.h"
> +#include "hwconfig.h"
> +#include "profiles.h"
>  #include "cavs.h"
>  #include "codec_internal.h"
>  #include "decode.h"
> @@ -37,6 +41,43 @@
>  #include "mpeg12data.h"
>  #include "startcode.h"
> 
> +static const uint8_t default_wq_param[4][6] = {
> +    {128,  98, 106, 116, 116, 128},
> +    {135, 143, 143, 160, 160, 213},
> +    {128,  98, 106, 116, 116, 128},
> +    {128, 128, 128, 128, 128, 128},
> +};
> +static const uint8_t wq_model_2_param[4][64] = {
> +    {
> +        0, 0, 0, 4, 4, 4, 5, 5,
> +        0, 0, 3, 3, 3, 3, 5, 5,
> +        0, 3, 2, 2, 1, 1, 5, 5,
> +        4, 3, 2, 2, 1, 5, 5, 5,
> +        4, 3, 1, 1, 5, 5, 5, 5,
> +        4, 3, 1, 5, 5, 5, 5, 5,
> +        5, 5, 5, 5, 5, 5, 5, 5,
> +        5, 5, 5, 5, 5, 5, 5, 5,
> +    }, {
> +        0, 0, 0, 4, 4, 4, 5, 5,
> +        0, 0, 4, 4, 4, 4, 5, 5,
> +        0, 3, 2, 2, 2, 1, 5, 5,
> +        3, 3, 2, 2, 1, 5, 5, 5,
> +        3, 3, 2, 1, 5, 5, 5, 5,
> +        3, 3, 1, 5, 5, 5, 5, 5,
> +        5, 5, 5, 5, 5, 5, 5, 5,
> +        5, 5, 5, 5, 5, 5, 5, 5,
> +    }, {
> +        0, 0, 0, 4, 4, 3, 5, 5,
> +        0, 0, 4, 4, 3, 2, 5, 5,
> +        0, 4, 4, 3, 2, 1, 5, 5,
> +        4, 4, 3, 2, 1, 5, 5, 5,
> +        4, 3, 2, 1, 5, 5, 5, 5,
> +        3, 2, 1, 5, 5, 5, 5, 5,
> +        5, 5, 5, 5, 5, 5, 5, 5,
> +        5, 5, 5, 5, 5, 5, 5, 5,
> +    }
> +};
> +
>  static const uint8_t mv_scan[4] = {
>      MV_FWD_X0, MV_FWD_X1,
>      MV_FWD_X2, MV_FWD_X3
> @@ -927,7 +968,11 @@ static int decode_mb_b(AVSContext *h, enum cavs_mb mb_type)
> 
>  static inline int decode_slice_header(AVSContext *h, GetBitContext *gb)
>  {
> -    if (h->stc > 0xAF)
> +    int i, nref;
> +
> +    av_log(h->avctx, AV_LOG_TRACE, "slice start code 0x%02x\n", h->stc);
> +
> +    if (h->stc > SLICE_MAX_START_CODE)
>          av_log(h->avctx, AV_LOG_ERROR, "unexpected start code 0x%02x\n", h->stc);
> 
>      if (h->stc >= h->mb_height) {
> @@ -946,14 +991,119 @@ static inline int decode_slice_header(AVSContext *h, GetBitContext *gb)
>      }
>      /* inter frame or second slice can have weighting params */
>      if ((h->cur.f->pict_type != AV_PICTURE_TYPE_I) ||
> -        (!h->pic_structure && h->mby >= h->mb_width / 2))
> -        if (get_bits1(gb)) { //slice_weighting_flag
> -            av_log(h->avctx, AV_LOG_ERROR,
> -                   "weighted prediction not yet supported\n");
> +        (!h->pic_structure && h->mby >= h->mb_height / 2)) {
> +        h->slice_weight_pred_flag = get_bits1(gb);
> +        if (h->slice_weight_pred_flag) {
> +            nref = h->cur.f->pict_type == AV_PICTURE_TYPE_I ? 1 : (h->pic_structure ? 2 : 4);
> +            for (i = 0; i < nref; i++) {
> +                h->luma_scale[i] = get_bits(gb, 8);
> +                h->luma_shift[i] = get_sbits(gb, 8);
> +                skip_bits1(gb);
> +                h->chroma_scale[i] = get_bits(gb, 8);
> +                h->chroma_shift[i] = get_sbits(gb, 8);
> +                skip_bits1(gb);
> +            }
> +            h->mb_weight_pred_flag = get_bits1(gb);
> +            if (!h->avctx->hwaccel) {
> +                av_log(h->avctx, AV_LOG_ERROR,
> +                    "weighted prediction not yet supported\n");
> +            }
>          }
> +    }
> +    if (h->aec_flag) {
> +        align_get_bits(gb);
> +    }
> +
> +    return 0;
> +}
> +
> +/**
> + * skip stuffing bits before next start code "0x000001"
> + * @return 0 if no stuffing bits at h->gb were skipped, otherwise 1.
> + */
> +static inline int skip_stuffing_bits(AVSContext *h)
> +{
> +    GetBitContext gb0 = h->gb;
> +    GetBitContext *gb = &h->gb;
> +    const uint8_t *start;
> +    const uint8_t *ptr;
> +    const uint8_t *end;
> +    int align;
> +    int stuffing_zeros;
> +
> +#if 0
> +    /**
> +     * skip 1 bit stuffing_bit '1' and 0~7 bit stuffing_bit '0'
> +     */
> +    if (!get_bits1(gb)) {
> +        av_log(h->avctx, AV_LOG_WARNING, "NOT stuffing_bit '1'\n");
> +        goto restore_get_bits;
> +    }
> +    align = (-get_bits_count(gb)) & 7;
> +    if (show_bits_long(gb, align)) {
> +        av_log(h->avctx, AV_LOG_WARNING, "NOT %d stuffing_bit '0..0'\n", align);
> +        goto restore_get_bits;
> +    }
> +#else
> +    /**
> +     * It seems that not all streams follow "next_start_code()" strictly.
> +     */
> +    align = (-get_bits_count(gb)) & 7;
> +    if (align == 0 && show_bits_long(gb, 8) == 0x80) {
> +        skip_bits_long(gb, 8);
> +    }
> +#endif
> +
> +    /**
> +     *  skip leading zero bytes before 0x 00 00 01 stc
> +     */
> +    ptr = start = align_get_bits(gb);
> +    end = gb->buffer_end;
> +    while (ptr < end && *ptr == 0)
> +        ptr++;
> +
> +    if ((ptr >= end) || (*ptr == 1 && ptr - start >= 2)) {
> +        stuffing_zeros = (ptr >= end ? end - start : ptr - start - 2);
> +        if (stuffing_zeros > 0)
> +            av_log(h->avctx, AV_LOG_DEBUG, "Skip 0x%x stuffing zeros @0x%x.\n",
> +                    stuffing_zeros, (int)(start - gb->buffer));
> +        skip_bits_long(gb, stuffing_zeros * 8);
> +        return 1;
> +    } else {
> +        av_log(h->avctx, AV_LOG_DEBUG, "No next_start_code() found @0x%x.\n",
> +                (int)(start - gb->buffer));
> +        goto restore_get_bits;
> +    }
> +
> +restore_get_bits:
> +    h->gb = gb0;
>      return 0;
>  }
> 
> +static inline int skip_extension_and_user_data(AVSContext *h)
> +{
> +    int stc = -1;
> +    const uint8_t *start = align_get_bits(&h->gb);
> +    const uint8_t *end = h->gb.buffer_end;
> +    const uint8_t *ptr, *next;
> +
> +    for (ptr = start; ptr + 4 < end; ptr = next) {
> +        stc = show_bits_long(&h->gb, 32);
> +        if (stc != EXT_START_CODE && stc != USER_START_CODE) {
> +            break;
> +        }
> +        next = avpriv_find_start_code(ptr + 4, end, &stc);
> +        if (next < end) {
> +            next -= 4;
> +        }
> +        skip_bits(&h->gb, (next - ptr) * 8);
> +        av_log(h->avctx, AV_LOG_DEBUG, "skip %d byte ext/user data\n",
> +                (int)(next - ptr));
> +    }
> +
> +    return ptr > start;
> +}
> +
>  static inline int check_for_slice(AVSContext *h)
>  {
>      GetBitContext *gb = &h->gb;
> @@ -981,44 +1131,133 @@ static inline int check_for_slice(AVSContext *h)
>   * frame level
>   *
>   ****************************************************************************/
> +static int hwaccel_pic(AVSContext *h)
> +{
> +    int ret = 0;
> +    int stc = -1;
> +    const uint8_t *frm_start = align_get_bits(&h->gb);
> +    const uint8_t *frm_end = h->gb.buffer_end;
> +    const uint8_t *slc_start = frm_start;
> +    const uint8_t *slc_end = frm_end;
> +    GetBitContext gb = h->gb;
> +    const FFHWAccel *hwaccel = ffhwaccel(h->avctx->hwaccel);
> +
> +    ret = hwaccel->start_frame(h->avctx, NULL, 0);
> +    if (ret < 0)
> +        return ret;
> +
> +    for (slc_start = frm_start; slc_start + 4 < frm_end; slc_start = slc_end) {
> +        slc_end = avpriv_find_start_code(slc_start + 4, frm_end, &stc);
> +        if (slc_end < frm_end) {
> +            slc_end -= 4;
> +        }
> +
> +        init_get_bits(&h->gb, slc_start, (slc_end - slc_start) * 8);
> +        if (!check_for_slice(h)) {
> +            break;
> +        }
> +
> +        ret = hwaccel->decode_slice(h->avctx, slc_start, slc_end - slc_start);
> +        if (ret < 0) {
> +            break;
> +        }
> +    }
> +
> +    h->gb = gb;
> +    skip_bits(&h->gb, (slc_start - frm_start) * 8);
> +
> +    if (ret < 0)
> +        return ret;
> +
> +    return hwaccel->end_frame(h->avctx);
> +}
> +
> +/**
> + * @brief remove frame out of dpb
> + */
> +static void cavs_frame_unref(AVSFrame *frame)
> +{
> +    /* frame->f can be NULL if context init failed */
> +    if (!frame->f || !frame->f->buf[0])
> +        return;
> +
> +    av_buffer_unref(&frame->hwaccel_priv_buf);
> +    frame->hwaccel_picture_private = NULL;
> +
> +    av_frame_unref(frame->f);
> +}
> +
> +static int output_one_frame(AVSContext *h, AVFrame *data, int *got_frame)
> +{
> +    if (h->out[0].f->buf[0]) {
> +        av_log(h->avctx, AV_LOG_DEBUG, "output frame: poc=%d\n", h->out[0].poc);
> +        av_frame_move_ref(data, h->out[0].f);
> +        *got_frame = 1;
> +
> +        // out[0] <- out[1] <- out[2] <- out[0]
> +        cavs_frame_unref(&h->out[2]);
> +        FFSWAP(AVSFrame, h->out[0], h->out[2]);
> +        FFSWAP(AVSFrame, h->out[0], h->out[1]);
> +
> +        return 1;
> +    }
> +
> +    return 0;
> +}
> +
> +static void queue_one_frame(AVSContext *h, AVSFrame *out)
> +{
> +    int idx = !h->out[0].f->buf[0] ? 0 : (!h->out[1].f->buf[0] ? 1 : 2);
> +    av_log(h->avctx, AV_LOG_DEBUG, "queue in out[%d]: poc=%d\n", idx, out->poc);
> +    av_frame_ref(h->out[idx].f, out->f);
> +    h->out[idx].poc = out->poc;
> +}
> 
>  static int decode_pic(AVSContext *h)
>  {
>      int ret;
>      int skip_count    = -1;
>      enum cavs_mb mb_type;
> +    char tc[4];
> 
>      if (!h->top_qp) {
>          av_log(h->avctx, AV_LOG_ERROR, "No sequence header decoded yet\n");
>          return AVERROR_INVALIDDATA;
>      }
> 
> -    av_frame_unref(h->cur.f);
> +    cavs_frame_unref(&h->cur);
> +
> +    skip_bits(&h->gb, 16);//bbv_delay
> +    if (h->profile == AV_PROFILE_CAVS_GUANGDIAN) {
> +        skip_bits(&h->gb, 8);//bbv_delay_extension
> +    }
> 
> -    skip_bits(&h->gb, 16);//bbv_dwlay
>      if (h->stc == PIC_PB_START_CODE) {
>          h->cur.f->pict_type = get_bits(&h->gb, 2) + AV_PICTURE_TYPE_I;
>          if (h->cur.f->pict_type > AV_PICTURE_TYPE_B) {
>              av_log(h->avctx, AV_LOG_ERROR, "illegal picture type\n");
>              return AVERROR_INVALIDDATA;
>          }
> +
>          /* make sure we have the reference frames we need */
> -        if (!h->DPB[0].f->data[0] ||
> -           (!h->DPB[1].f->data[0] && h->cur.f->pict_type == AV_PICTURE_TYPE_B))
> +        if (!h->DPB[0].f->buf[0] ||
> +            (!h->DPB[1].f->buf[0] && h->cur.f->pict_type == AV_PICTURE_TYPE_B)) {
> +            av_log(h->avctx, AV_LOG_ERROR, "Invalid reference frame\n");
>              return AVERROR_INVALIDDATA;
> +        }
>      } else {
>          h->cur.f->pict_type = AV_PICTURE_TYPE_I;
> -        if (get_bits1(&h->gb))
> -            skip_bits(&h->gb, 24);//time_code
> -        /* old sample clips were all progressive and no low_delay,
> -           bump stream revision if detected otherwise */
> -        if (h->low_delay || !(show_bits(&h->gb, 9) & 1))
> -            h->stream_revision = 1;
> -        /* similarly test top_field_first and repeat_first_field */
> -        else if (show_bits(&h->gb, 11) & 3)
> -            h->stream_revision = 1;
> -        if (h->stream_revision > 0)
> -            skip_bits(&h->gb, 1); //marker_bit
> +        if (get_bits1(&h->gb)) {    //time_code
> +            skip_bits(&h->gb, 1);
> +            tc[0] = get_bits(&h->gb, 5);
> +            tc[1] = get_bits(&h->gb, 6);
> +            tc[2] = get_bits(&h->gb, 6);
> +            tc[3] = get_bits(&h->gb, 6);
> +            av_log(h->avctx, AV_LOG_DEBUG, "timecode: %d:%d:%d.%d\n",
> +                    tc[0], tc[1], tc[2], tc[3]);
> +        }
> +
> +        skip_bits(&h->gb, 1);
>      }
> 
>      if (get_bits_left(&h->gb) < 23)
> @@ -1029,6 +1268,17 @@ static int decode_pic(AVSContext *h)
>      if (ret < 0)
>          return ret;
> 
> +    if (h->avctx->hwaccel) {
> +        const FFHWAccel *hwaccel = ffhwaccel(h->avctx->hwaccel);
> +        av_assert0(!h->cur.hwaccel_picture_private);
> +        if (hwaccel->frame_priv_data_size) {
> +            h->cur.hwaccel_priv_buf = av_buffer_allocz(hwaccel->frame_priv_data_size);
> +            if (!h->cur.hwaccel_priv_buf)
> +                return AVERROR(ENOMEM);
> +            h->cur.hwaccel_picture_private = h->cur.hwaccel_priv_buf->data;
> +        }
> +    }
> +
>      if (!h->edge_emu_buffer) {
>          int alloc_size = FFALIGN(FFABS(h->cur.f->linesize[0]) + 32, 32);
>          h->edge_emu_buffer = av_mallocz(alloc_size * 2 * 24);
> @@ -1039,6 +1289,8 @@ static int decode_pic(AVSContext *h)
>      if ((ret = ff_cavs_init_pic(h)) < 0)
>          return ret;
>      h->cur.poc = get_bits(&h->gb, 8) * 2;
> +    av_log(h->avctx, AV_LOG_DEBUG, "poc=%d, type=%d\n",
> +            h->cur.poc, h->cur.f->pict_type);
> 
>      /* get temporal distances and MV scaling factors */
>      if (h->cur.f->pict_type != AV_PICTURE_TYPE_B) {
> @@ -1052,8 +1304,12 @@ static int decode_pic(AVSContext *h)
>      if (h->cur.f->pict_type == AV_PICTURE_TYPE_B) {
>          h->sym_factor = h->dist[0] * h->scale_den[1];
>          if (FFABS(h->sym_factor) > 32768) {
> +            av_log(h->avctx, AV_LOG_ERROR, "poc=%d/%d/%d, dist=%d/%d\n",
> +                    h->DPB[1].poc, h->DPB[0].poc, h->cur.poc, h->dist[0], h->dist[1]);
>              av_log(h->avctx, AV_LOG_ERROR, "sym_factor %d too large\n", h->sym_factor);
> -            return AVERROR_INVALIDDATA;
> +
> +            if (!h->avctx->hwaccel)
> +                return AVERROR_INVALIDDATA;
>          }
>      } else {
>          h->direct_den[0] = h->dist[0] ? 16384 / h->dist[0] : 0;
> @@ -1062,9 +1318,9 @@ static int decode_pic(AVSContext *h)
> 
>      if (h->low_delay)
>          get_ue_golomb(&h->gb); //bbv_check_times
> -    h->progressive   = get_bits1(&h->gb);
> +    h->progressive_frame = get_bits1(&h->gb);
>      h->pic_structure = 1;
> -    if (!h->progressive)
> +    if (!h->progressive_frame)
>          h->pic_structure = get_bits1(&h->gb);
>      if (!h->pic_structure && h->stc == PIC_PB_START_CODE)
>          skip_bits1(&h->gb);     //advanced_pred_mode_disable
> @@ -1073,14 +1329,18 @@ static int decode_pic(AVSContext *h)
>      h->pic_qp_fixed =
>      h->qp_fixed = get_bits1(&h->gb);
>      h->qp       = get_bits(&h->gb, 6);
> +    h->skip_mode_flag = 0;
> +    h->ref_flag = 0;
>      if (h->cur.f->pict_type == AV_PICTURE_TYPE_I) {
> -        if (!h->progressive && !h->pic_structure)
> -            skip_bits1(&h->gb);//what is this?
> +        if (!h->progressive_frame && !h->pic_structure)
> +            h->skip_mode_flag  = get_bits1(&h->gb);
>          skip_bits(&h->gb, 4);   //reserved bits
>      } else {
>          if (!(h->cur.f->pict_type == AV_PICTURE_TYPE_B && h->pic_structure == 1))
>              h->ref_flag        = get_bits1(&h->gb);
> -        skip_bits(&h->gb, 4);   //reserved bits
> +        h->no_forward_ref_flag = get_bits1(&h->gb);
> +        h->pb_field_enhanced_flag = get_bits1(&h->gb);
> +        skip_bits(&h->gb, 2);   //reserved bits
>          h->skip_mode_flag      = get_bits1(&h->gb);
>      }
>      h->loop_filter_disable     = get_bits1(&h->gb);
> @@ -1096,8 +1356,46 @@ static int decode_pic(AVSContext *h)
>          h->alpha_offset = h->beta_offset  = 0;
>      }
> 
> +    h->weight_quant_flag = 0;
> +    if (h->profile == AV_PROFILE_CAVS_GUANGDIAN) {
> +        h->weight_quant_flag = get_bits1(&h->gb);
> +        if (h->weight_quant_flag) {
> +            int wq_param[6] = {128, 128, 128, 128, 128, 128};
> +            int i, wqp_index, wq_model;
> +            const uint8_t *m2p;
> +
> +            skip_bits1(&h->gb);
> +            if (!get_bits1(&h->gb)) {
> +                h->chroma_quant_param_delta_cb = get_se_golomb(&h->gb);
> +                h->chroma_quant_param_delta_cr = get_se_golomb(&h->gb);
> +            }
> +            wqp_index = get_bits(&h->gb, 2);
> +            wq_model = get_bits(&h->gb, 2);
> +            m2p = wq_model_2_param[wq_model];
> +
> +            for (i = 0; i < 6; i++) {
> +                int delta = (wqp_index == 1 || wqp_index == 2) ? get_se_golomb(&h->gb) : 0;
> +                wq_param[i] = default_wq_param[wqp_index][i] + delta;
> +                av_log(h->avctx, AV_LOG_DEBUG, "wqp[%d]=%d\n", i, wq_param[i]);
> +            }
> +            for (i = 0; i < 64; i++) {
> +                h->wqm_8x8[i] = wq_param[ m2p[i] ];
> +            }
> +        } else {
> +            memset(h->wqm_8x8, 128, sizeof(h->wqm_8x8));
> +        }
> +        h->aec_flag = get_bits1(&h->gb);
> +        av_log(h->avctx, AV_LOG_DEBUG, "wq_flag=%d, aec_flag=%d\n",
> +                h->weight_quant_flag, h->aec_flag);
> +    }
> +
> +    skip_stuffing_bits(h);
> +    skip_extension_and_user_data(h);
> +
>      ret = 0;
> -    if (h->cur.f->pict_type == AV_PICTURE_TYPE_I) {
> +    if (h->avctx->hwaccel) {
> +        ret = hwaccel_pic(h);
> +    } else if (h->cur.f->pict_type == AV_PICTURE_TYPE_I) {
>          do {
>              check_for_slice(h);
>              ret = decode_mb_i(h, 0);
> @@ -1160,11 +1458,6 @@ static int decode_pic(AVSContext *h)
>          } while (ff_cavs_next_mb(h));
>      }
>      emms_c();
> -    if (ret >= 0 && h->cur.f->pict_type != AV_PICTURE_TYPE_B) {
> -        av_frame_unref(h->DPB[1].f);
> -        FFSWAP(AVSFrame, h->cur, h->DPB[1]);
> -        FFSWAP(AVSFrame, h->DPB[0], h->DPB[1]);
> -    }
>      return ret;
>  }
> 
> @@ -1181,13 +1474,8 @@ static int decode_seq_header(AVSContext *h)
>      int ret;
> 
>      h->profile = get_bits(&h->gb, 8);
> -    if (h->profile != 0x20) {
> -        avpriv_report_missing_feature(h->avctx,
> -                                      "only supprt JiZhun profile");
> -        return AVERROR_PATCHWELCOME;
> -    }
>      h->level   = get_bits(&h->gb, 8);
> -    skip_bits1(&h->gb); //progressive sequence
> +    h->progressive_seq = get_bits1(&h->gb);
> 
>      width  = get_bits(&h->gb, 14);
>      height = get_bits(&h->gb, 14);
> @@ -1214,6 +1502,9 @@ static int decode_seq_header(AVSContext *h)
>      skip_bits1(&h->gb);    //marker_bit
>      skip_bits(&h->gb, 12); //bit_rate_upper
>      h->low_delay =  get_bits1(&h->gb);
> +    av_log(h->avctx, AV_LOG_DEBUG,
> +            "seq: profile=0x%02x, level=0x%02x, size=%dx%d, low_delay=%d\n",
> +            h->profile, h->level, width, height, h->low_delay);
> 
>      ret = ff_set_dimensions(h->avctx, width, height);
>      if (ret < 0)
> @@ -1239,43 +1530,61 @@ static int cavs_decode_frame(AVCodecContext *avctx, AVFrame *rframe,
>                               int *got_frame, AVPacket *avpkt)
>  {
>      AVSContext *h      = avctx->priv_data;
> -    const uint8_t *buf = avpkt->data;
> -    int buf_size       = avpkt->size;
>      uint32_t stc       = -1;
>      int input_size, ret;
>      const uint8_t *buf_end;
>      const uint8_t *buf_ptr;
>      int frame_start = 0;
> 
> -    if (buf_size == 0) {
> -        if (!h->low_delay && h->DPB[0].f->data[0]) {
> -            *got_frame = 1;
> -            av_frame_move_ref(rframe, h->DPB[0].f);
> +    if (avpkt->size == 0) {
> +        if (h->DPB[0].f->buf[0] && !h->DPB[0].outputed) {
> +            queue_one_frame(h, &h->DPB[0]);
> +            cavs_frame_unref(&h->DPB[0]);
>          }
> +        output_one_frame(h, rframe, got_frame);
>          return 0;
>      }
> 
>      h->stc = 0;
> 
> -    buf_ptr = buf;
> -    buf_end = buf + buf_size;
> -    for(;;) {
> +    buf_ptr = avpkt->data;
> +    buf_end = avpkt->data + avpkt->size;
> +    for(; buf_ptr < buf_end;) {
>          buf_ptr = avpriv_find_start_code(buf_ptr, buf_end, &stc);
>          if ((stc & 0xFFFFFE00) || buf_ptr == buf_end) {
>              if (!h->stc)
>                  av_log(h->avctx, AV_LOG_WARNING, "no frame decoded\n");
> -            return FFMAX(0, buf_ptr - buf);
> +            return FFMAX(0, buf_ptr - avpkt->data);
>          }
>          input_size = (buf_end - buf_ptr) * 8;
> +        av_log(h->avctx, AV_LOG_TRACE, "Found start code 0x%04x, sz=%d\n",
> +                stc, input_size / 8);
>          switch (stc) {
>          case CAVS_START_CODE:
>              init_get_bits(&h->gb, buf_ptr, input_size);
> -            decode_seq_header(h);
> +            if ((ret = decode_seq_header(h)) < 0)
> +                return ret;
> +            avctx->profile = h->profile;
> +            avctx->level = h->level;
> +            if (!h->got_pix_fmt) {
> +                h->got_pix_fmt = 1;
> +                ret = ff_get_format(avctx, avctx->codec->pix_fmts);
> +                if (ret < 0)
> +                    return ret;
> +
> +                avctx->pix_fmt = ret;
> +
> +                if (h->profile == AV_PROFILE_CAVS_GUANGDIAN && !avctx->hwaccel) {
> +                    av_log(avctx, AV_LOG_ERROR, "Your platform doesn't support hardware-"
> +                                    "accelerated CAVS Guangdian profile decoding.\n");
> +                    return AVERROR(ENOTSUP);
> +                }
> +            }
>              break;
>          case PIC_I_START_CODE:
>              if (!h->got_keyframe) {
> -                av_frame_unref(h->DPB[0].f);
> -                av_frame_unref(h->DPB[1].f);
> +                cavs_frame_unref(&h->DPB[0]);
> +                cavs_frame_unref(&h->DPB[1]);
>                  h->got_keyframe = 1;
>              }
>          case PIC_PB_START_CODE:
> @@ -1285,23 +1594,39 @@ static int cavs_decode_frame(AVCodecContext *avctx, AVFrame *rframe,
>              if (*got_frame)
>                  av_frame_unref(rframe);
>              *got_frame = 0;
> -            if (!h->got_keyframe)
> +            if (!h->got_keyframe) {
> +                av_log(avctx, AV_LOG_ERROR, "No keyframe decoded before P/B frame.\n");
>                  break;
> +            }
>              init_get_bits(&h->gb, buf_ptr, input_size);
>              h->stc = stc;
> -            if (decode_pic(h))
> -                break;
> -            *got_frame = 1;
> +            if ((ret = decode_pic(h)) < 0)
> +                return ret;
> +            buf_ptr = align_get_bits(&h->gb);
> +
> +            h->cur.outputed = 0;
>              if (h->cur.f->pict_type != AV_PICTURE_TYPE_B) {
> -                if (h->DPB[!h->low_delay].f->data[0]) {
> -                    if ((ret = av_frame_ref(rframe, h->DPB[!h->low_delay].f)) < 0)
> -                        return ret;
> -                } else {
> -                    *got_frame = 0;
> +                // at most one delay
> +                if (h->DPB[0].f->buf[0] && !h->DPB[0].outputed) {
> +                    queue_one_frame(h, &h->DPB[0]);
> +                    h->DPB[0].outputed = 1;
> +                }
> +
> +                if (h->low_delay) {
> +                    queue_one_frame(h, &h->cur);
> +                    h->cur.outputed = 1;
>                  }
> +
> +                // null -> curr -> DPB[0] -> DPB[1]
> +                cavs_frame_unref(&h->DPB[1]);
> +                FFSWAP(AVSFrame, h->cur, h->DPB[1]);
> +                FFSWAP(AVSFrame, h->DPB[0], h->DPB[1]);
>              } else {
> -                av_frame_move_ref(rframe, h->cur.f);
> +                queue_one_frame(h, &h->cur);
> +                cavs_frame_unref(&h->cur);
>              }
> +
> +            output_one_frame(h, rframe, got_frame);
>              break;
>          case EXT_START_CODE:
>              //mpeg_decode_extension(avctx, buf_ptr, input_size);
> @@ -1309,16 +1634,34 @@ static int cavs_decode_frame(AVCodecContext *avctx, AVFrame *rframe,
>          case USER_START_CODE:
>              //mpeg_decode_user_data(avctx, buf_ptr, input_size);
>              break;
> +        case VIDEO_EDIT_CODE:
> +            av_log(h->avctx, AV_LOG_WARNING, "Skip video_edit_code\n");
> +            break;
> +        case VIDEO_SEQ_END_CODE:
> +            av_log(h->avctx, AV_LOG_WARNING, "Skip video_sequence_end_code\n");
> +            break;
>          default:
>              if (stc <= SLICE_MAX_START_CODE) {
> +                h->stc = stc & 0xff;
>                  init_get_bits(&h->gb, buf_ptr, input_size);
>                  decode_slice_header(h, &h->gb);
> +            } else {
> +                av_log(h->avctx, AV_LOG_WARNING, "Skip unsupported start code 0x%04X\n", stc);
>              }
>              break;
>          }
>      }
> +    return (buf_ptr - avpkt->data);
>  }
> 
> +static const enum AVPixelFormat cavs_hwaccel_pixfmt_list_420[] = {
> +#if CONFIG_CAVS_VAAPI_HWACCEL
> +    AV_PIX_FMT_VAAPI,
> +#endif
> +    AV_PIX_FMT_YUV420P,
> +    AV_PIX_FMT_NONE
> +};
> +
>  const FFCodec ff_cavs_decoder = {
>      .p.name         = "cavs",
>      CODEC_LONG_NAME("Chinese AVS (Audio Video Standard) (AVS1-P2, JiZhun profile)"),
> @@ -1331,4 +1674,12 @@ const FFCodec ff_cavs_decoder = {
>      .p.capabilities = AV_CODEC_CAP_DR1 | AV_CODEC_CAP_DELAY,
>      .flush          = cavs_flush,
>      .caps_internal  = FF_CODEC_CAP_INIT_CLEANUP,
> +    .p.pix_fmts     = cavs_hwaccel_pixfmt_list_420,
> +    .hw_configs     = (const AVCodecHWConfigInternal *const []) {
> +#if CONFIG_CAVS_VAAPI_HWACCEL
> +                        HWACCEL_VAAPI(cavs),
> +#endif
> +                        NULL
> +                    },
> +    .p.profiles     = NULL_IF_CONFIG_SMALL(ff_cavs_profiles),
>  };
> diff --git a/libavcodec/defs.h b/libavcodec/defs.h
> index 00d840ec19..d59816a70f 100644
> --- a/libavcodec/defs.h
> +++ b/libavcodec/defs.h
> @@ -192,6 +192,9 @@
>  #define AV_PROFILE_EVC_BASELINE             0
>  #define AV_PROFILE_EVC_MAIN                 1
> 
> +#define AV_PROFILE_CAVS_JIZHUN                      0x20
> +#define AV_PROFILE_CAVS_GUANGDIAN                   0x48
> +
> 
>  #define AV_LEVEL_UNKNOWN                  -99
> 
> diff --git a/libavcodec/hwaccels.h b/libavcodec/hwaccels.h
> index 5171e4c7d7..a1a973b460 100644
> --- a/libavcodec/hwaccels.h
> +++ b/libavcodec/hwaccels.h
> @@ -89,5 +89,6 @@ extern const struct FFHWAccel ff_wmv3_dxva2_hwaccel;
>  extern const struct FFHWAccel ff_wmv3_nvdec_hwaccel;
>  extern const struct FFHWAccel ff_wmv3_vaapi_hwaccel;
>  extern const struct FFHWAccel ff_wmv3_vdpau_hwaccel;
> +extern const struct FFHWAccel ff_cavs_vaapi_hwaccel;
> 
>  #endif /* AVCODEC_HWACCELS_H */
> diff --git a/libavcodec/profiles.c b/libavcodec/profiles.c
> index 5bb8f150e6..b312f12281 100644
> --- a/libavcodec/profiles.c
> +++ b/libavcodec/profiles.c
> @@ -200,4 +200,10 @@ const AVProfile ff_evc_profiles[] = {
>      { AV_PROFILE_UNKNOWN },
>  };
> 
> +const AVProfile ff_cavs_profiles[] = {
> +    { AV_PROFILE_CAVS_JIZHUN,       "Jizhun"            },
> +    { AV_PROFILE_CAVS_GUANGDIAN,    "Guangdian"         },
> +    { AV_PROFILE_UNKNOWN },
> +};
> +
>  #endif /* !CONFIG_SMALL */
> diff --git a/libavcodec/profiles.h b/libavcodec/profiles.h
> index 270430a48b..9a2b348ad4 100644
> --- a/libavcodec/profiles.h
> +++ b/libavcodec/profiles.h
> @@ -75,5 +75,6 @@ extern const AVProfile ff_prores_profiles[];
>  extern const AVProfile ff_mjpeg_profiles[];
>  extern const AVProfile ff_arib_caption_profiles[];
>  extern const AVProfile ff_evc_profiles[];
> +extern const AVProfile ff_cavs_profiles[];
> 
>  #endif /* AVCODEC_PROFILES_H */
> diff --git a/libavcodec/vaapi_cavs.c b/libavcodec/vaapi_cavs.c
> new file mode 100644
> index 0000000000..4a7a9b95ad
> --- /dev/null
> +++ b/libavcodec/vaapi_cavs.c
> @@ -0,0 +1,164 @@
> +/*
> + * AVS (Chinese GY/T 257.1b

Patch

diff --git a/configure b/configure
index c8ae0a061d..89759eda5d 100755
--- a/configure
+++ b/configure
@@ -2463,6 +2463,7 @@  HAVE_LIST="
     xmllint
     zlib_gzip
     openvino2
+    va_profile_avs
 "
 
 # options emitted with CONFIG_ prefix but not available on the command line
@@ -3202,6 +3203,7 @@  wmv3_dxva2_hwaccel_select="vc1_dxva2_hwaccel"
 wmv3_nvdec_hwaccel_select="vc1_nvdec_hwaccel"
 wmv3_vaapi_hwaccel_select="vc1_vaapi_hwaccel"
 wmv3_vdpau_hwaccel_select="vc1_vdpau_hwaccel"
+cavs_vaapi_hwaccel_deps="vaapi va_profile_avs VAPictureParameterBufferAVS"
 
 # hardware-accelerated codecs
 mediafoundation_deps="mftransform_h MFCreateAlignedMemoryBuffer"
@@ -7175,6 +7177,18 @@  if enabled vaapi; then
     check_type "va/va.h va/va_enc_vp8.h"  "VAEncPictureParameterBufferVP8"
     check_type "va/va.h va/va_enc_vp9.h"  "VAEncPictureParameterBufferVP9"
     check_type "va/va.h va/va_enc_av1.h"  "VAEncPictureParameterBufferAV1"
+
+    #
+    # Using 'VA_CHECK_VERSION' in the source code would make things easy, but we would
+    # have to wait for the newly added VAProfile values to ship in a released VAAPI version.
+    #
+    # Auto-detection, on the other hand, keeps version compatibility both before and
+    # after that release, so it always works.
+    #
+    disable va_profile_avs &&
+        test_code cc va/va.h "VAProfile p1 = VAProfileAVSJizhun, p2 = VAProfileAVSGuangdian;" &&
+        enable va_profile_avs
+    enabled va_profile_avs && check_type "va/va.h va/va_dec_avs.h" "VAPictureParameterBufferAVS"
 fi
 
 if enabled_all opencl libdrm ; then
diff --git a/libavcodec/Makefile b/libavcodec/Makefile
index bb42095165..7d92375fed 100644
--- a/libavcodec/Makefile
+++ b/libavcodec/Makefile
@@ -1055,6 +1055,7 @@  OBJS-$(CONFIG_VP9_VAAPI_HWACCEL)          += vaapi_vp9.o
 OBJS-$(CONFIG_VP9_VDPAU_HWACCEL)          += vdpau_vp9.o
 OBJS-$(CONFIG_VP9_VIDEOTOOLBOX_HWACCEL)   += videotoolbox_vp9.o
 OBJS-$(CONFIG_VP8_QSV_HWACCEL)            += qsvdec.o
+OBJS-$(CONFIG_CAVS_VAAPI_HWACCEL)         += vaapi_cavs.o
 
 # Objects duplicated from other libraries for shared builds
 SHLIBOBJS                              += log2_tab.o reverse.o
diff --git a/libavcodec/cavs.c b/libavcodec/cavs.c
index fdd577f7fb..ed7b278336 100644
--- a/libavcodec/cavs.c
+++ b/libavcodec/cavs.c
@@ -810,6 +810,14 @@  av_cold int ff_cavs_init(AVCodecContext *avctx)
     if (!h->cur.f || !h->DPB[0].f || !h->DPB[1].f)
         return AVERROR(ENOMEM);
 
+    h->out[0].f = av_frame_alloc();
+    h->out[1].f = av_frame_alloc();
+    h->out[2].f = av_frame_alloc();
+    if (!h->out[0].f || !h->out[1].f || !h->out[2].f) {
+        ff_cavs_end(avctx);
+        return AVERROR(ENOMEM);
+    }
+
     h->luma_scan[0]                     = 0;
     h->luma_scan[1]                     = 8;
     h->intra_pred_l[INTRA_L_VERT]       = intra_pred_vert;
@@ -840,6 +848,10 @@  av_cold int ff_cavs_end(AVCodecContext *avctx)
     av_frame_free(&h->DPB[0].f);
     av_frame_free(&h->DPB[1].f);
 
+    av_frame_free(&h->out[0].f);
+    av_frame_free(&h->out[1].f);
+    av_frame_free(&h->out[2].f);
+
     av_freep(&h->top_qp);
     av_freep(&h->top_mv[0]);
     av_freep(&h->top_mv[1]);
diff --git a/libavcodec/cavs.h b/libavcodec/cavs.h
index 244c322b35..ef03c1a974 100644
--- a/libavcodec/cavs.h
+++ b/libavcodec/cavs.h
@@ -39,8 +39,10 @@ 
 #define EXT_START_CODE          0x000001b5
 #define USER_START_CODE         0x000001b2
 #define CAVS_START_CODE         0x000001b0
+#define VIDEO_SEQ_END_CODE      0x000001b1
 #define PIC_I_START_CODE        0x000001b3
 #define PIC_PB_START_CODE       0x000001b6
+#define VIDEO_EDIT_CODE         0x000001b7
 
 #define A_AVAIL                          1
 #define B_AVAIL                          2
@@ -164,10 +166,15 @@  struct dec_2dvlc {
 typedef struct AVSFrame {
     AVFrame *f;
     int poc;
+    int outputed;
+
+    AVBufferRef   *hwaccel_priv_buf;
+    void          *hwaccel_picture_private;
 } AVSFrame;
 
 typedef struct AVSContext {
     AVCodecContext *avctx;
+    int got_pix_fmt;
     BlockDSPContext bdsp;
     H264ChromaContext h264chroma;
     VideoDSPContext vdsp;
@@ -175,6 +182,7 @@  typedef struct AVSContext {
     GetBitContext gb;
     AVSFrame cur;     ///< currently decoded frame
     AVSFrame DPB[2];  ///< reference frames
+    AVSFrame out[3];  ///< output queue; size 2 may be enough
     int dist[2];     ///< temporal distances from current frame to ref frames
     int low_delay;
     int profile, level;
@@ -182,12 +190,38 @@  typedef struct AVSContext {
     int mb_width, mb_height;
     int width, height;
     int stream_revision; ///<0 for samples from 2006, 1 for rm52j encoder
-    int progressive;
+    int progressive_seq;
+    int progressive_frame;
     int pic_structure;
+    int no_forward_ref_flag;
+    int pb_field_enhanced_flag;   ///< only used in GUANGDIAN
     int skip_mode_flag; ///< select between skip_count or one skip_flag per MB
     int loop_filter_disable;
     int alpha_offset, beta_offset;
     int ref_flag;
+
+    /** \defgroup guangdian profile
+     * @{
+     */
+    int aec_flag;
+    int weight_quant_flag;
+    int chroma_quant_param_delta_cb;
+    int chroma_quant_param_delta_cr;
+    uint8_t wqm_8x8[64];
+    /**@}*/
+
+    /** \defgroup slice weighting
+     * FFmpeg doesn't support slice weighting natively, but it may be needed for HWaccel.
+     * @{
+     */
+    uint32_t slice_weight_pred_flag : 1;
+    uint32_t mb_weight_pred_flag    : 1;
+    uint8_t luma_scale[4];
+    int8_t luma_shift[4];
+    uint8_t chroma_scale[4];
+    int8_t chroma_shift[4];
+    /**@}*/
+
     int mbx, mby, mbidx; ///< macroblock coordinates
     int flags;         ///< availability flags of neighbouring macroblocks
     int stc;           ///< last start code
diff --git a/libavcodec/cavs_parser.c b/libavcodec/cavs_parser.c
index 4a03effd0f..a41a82d8d1 100644
--- a/libavcodec/cavs_parser.c
+++ b/libavcodec/cavs_parser.c
@@ -65,6 +65,22 @@  static int cavs_find_frame_end(ParseContext *pc, const uint8_t *buf,
                 pc->state=-1;
                 return i-3;
             }
+            if((state&0xFFFFFF00) == 0x100){
+                if(state != EXT_START_CODE && state != USER_START_CODE){
+                    state = state >> 8;
+                    break;
+                }
+            }
+        }
+        for(; i<buf_size; i++){
+            state= (state<<8) | buf[i];
+            if((state&0xFFFFFF00) == 0x100){
+                if(state > SLICE_MAX_START_CODE){
+                    pc->frame_start_found=0;
+                    pc->state=-1;
+                    return i-3;
+                }
+            }
         }
     }
     pc->frame_start_found= pic_found;
diff --git a/libavcodec/cavsdec.c b/libavcodec/cavsdec.c
index b356da0b04..18c38cd3ff 100644
--- a/libavcodec/cavsdec.c
+++ b/libavcodec/cavsdec.c
@@ -25,11 +25,15 @@ 
  * @author Stefan Gehrer <stefan.gehrer@gmx.de>
  */
 
+#include "config_components.h"
 #include "libavutil/avassert.h"
 #include "libavutil/emms.h"
 #include "avcodec.h"
 #include "get_bits.h"
 #include "golomb.h"
+#include "hwaccel_internal.h"
+#include "hwconfig.h"
+#include "profiles.h"
 #include "cavs.h"
 #include "codec_internal.h"
 #include "decode.h"
@@ -37,6 +41,43 @@ 
 #include "mpeg12data.h"
 #include "startcode.h"
 
+static const uint8_t default_wq_param[4][6] = {
+    {128,  98, 106, 116, 116, 128},
+    {135, 143, 143, 160, 160, 213},
+    {128,  98, 106, 116, 116, 128},
+    {128, 128, 128, 128, 128, 128},
+};
+static const uint8_t wq_model_2_param[4][64] = {
+    {
+        0, 0, 0, 4, 4, 4, 5, 5,
+        0, 0, 3, 3, 3, 3, 5, 5,
+        0, 3, 2, 2, 1, 1, 5, 5,
+        4, 3, 2, 2, 1, 5, 5, 5,
+        4, 3, 1, 1, 5, 5, 5, 5,
+        4, 3, 1, 5, 5, 5, 5, 5,
+        5, 5, 5, 5, 5, 5, 5, 5,
+        5, 5, 5, 5, 5, 5, 5, 5,
+    }, {
+        0, 0, 0, 4, 4, 4, 5, 5,
+        0, 0, 4, 4, 4, 4, 5, 5,
+        0, 3, 2, 2, 2, 1, 5, 5,
+        3, 3, 2, 2, 1, 5, 5, 5,
+        3, 3, 2, 1, 5, 5, 5, 5,
+        3, 3, 1, 5, 5, 5, 5, 5,
+        5, 5, 5, 5, 5, 5, 5, 5,
+        5, 5, 5, 5, 5, 5, 5, 5,
+    }, {
+        0, 0, 0, 4, 4, 3, 5, 5,
+        0, 0, 4, 4, 3, 2, 5, 5,
+        0, 4, 4, 3, 2, 1, 5, 5,
+        4, 4, 3, 2, 1, 5, 5, 5,
+        4, 3, 2, 1, 5, 5, 5, 5,
+        3, 2, 1, 5, 5, 5, 5, 5,
+        5, 5, 5, 5, 5, 5, 5, 5,
+        5, 5, 5, 5, 5, 5, 5, 5,
+    }
+};
+
 static const uint8_t mv_scan[4] = {
     MV_FWD_X0, MV_FWD_X1,
     MV_FWD_X2, MV_FWD_X3
@@ -927,7 +968,11 @@  static int decode_mb_b(AVSContext *h, enum cavs_mb mb_type)
 
 static inline int decode_slice_header(AVSContext *h, GetBitContext *gb)
 {
-    if (h->stc > 0xAF)
+    int i, nref;
+
+    av_log(h->avctx, AV_LOG_TRACE, "slice start code 0x%02x\n", h->stc);
+
+    if (h->stc > SLICE_MAX_START_CODE)
         av_log(h->avctx, AV_LOG_ERROR, "unexpected start code 0x%02x\n", h->stc);
 
     if (h->stc >= h->mb_height) {
@@ -946,14 +991,119 @@  static inline int decode_slice_header(AVSContext *h, GetBitContext *gb)
     }
     /* inter frame or second slice can have weighting params */
     if ((h->cur.f->pict_type != AV_PICTURE_TYPE_I) ||
-        (!h->pic_structure && h->mby >= h->mb_width / 2))
-        if (get_bits1(gb)) { //slice_weighting_flag
-            av_log(h->avctx, AV_LOG_ERROR,
-                   "weighted prediction not yet supported\n");
+        (!h->pic_structure && h->mby >= h->mb_height / 2)) {
+        h->slice_weight_pred_flag = get_bits1(gb);
+        if (h->slice_weight_pred_flag) {
+            nref = h->cur.f->pict_type == AV_PICTURE_TYPE_I ? 1 : (h->pic_structure ? 2 : 4);
+            for (i = 0; i < nref; i++) {
+                h->luma_scale[i] = get_bits(gb, 8);
+                h->luma_shift[i] = get_sbits(gb, 8);
+                skip_bits1(gb);
+                h->chroma_scale[i] = get_bits(gb, 8);
+                h->chroma_shift[i] = get_sbits(gb, 8);
+                skip_bits1(gb);
+            }
+            h->mb_weight_pred_flag = get_bits1(gb);
+            if (!h->avctx->hwaccel) {
+                av_log(h->avctx, AV_LOG_ERROR,
+                    "weighted prediction not yet supported\n");
+            }
         }
+    }
+    if (h->aec_flag) {
+        align_get_bits(gb);
+    }
+
+    return 0;
+}
+
+/**
+ * skip stuffing bits before next start code "0x000001"
+ * @return 0 if no stuffing bits at h->gb were skipped, otherwise 1.
+ */
+static inline int skip_stuffing_bits(AVSContext *h)
+{
+    GetBitContext gb0 = h->gb;
+    GetBitContext *gb = &h->gb;
+    const uint8_t *start;
+    const uint8_t *ptr;
+    const uint8_t *end;
+    int align;
+    int stuffing_zeros;
+
+#if 0
+    /**
+     * skip 1 bit stuffing_bit '1' and 0~7 bit stuffing_bit '0'
+     */
+    if (!get_bits1(gb)) {
+        av_log(h->avctx, AV_LOG_WARNING, "NOT stuffing_bit '1'\n");
+        goto restore_get_bits;
+    }
+    align = (-get_bits_count(gb)) & 7;
+    if (show_bits_long(gb, align)) {
+        av_log(h->avctx, AV_LOG_WARNING, "NOT %d stuffing_bit '0..0'\n", align);
+        goto restore_get_bits;
+    }
+#else
+    /**
+     * It seems that not all streams follow "next_start_code()" strictly.
+     */
+    align = (-get_bits_count(gb)) & 7;
+    if (align == 0 && show_bits_long(gb, 8) == 0x80) {
+        skip_bits_long(gb, 8);
+    }
+#endif
+
+    /**
+     *  skip leading zero bytes before 0x 00 00 01 stc
+     */
+    ptr = start = align_get_bits(gb);
+    end = gb->buffer_end;
+    while (ptr < end && *ptr == 0)
+        ptr++;
+
+    if ((ptr >= end) || (*ptr == 1 && ptr - start >= 2)) {
+        stuffing_zeros = (ptr >= end ? end - start : ptr - start - 2);
+        if (stuffing_zeros > 0)
+            av_log(h->avctx, AV_LOG_DEBUG, "Skip 0x%x stuffing zeros @0x%x.\n",
+                    stuffing_zeros, (int)(start - gb->buffer));
+        skip_bits_long(gb, stuffing_zeros * 8);
+        return 1;
+    } else {
+        av_log(h->avctx, AV_LOG_DEBUG, "No next_start_code() found @0x%x.\n",
+                (int)(start - gb->buffer));
+        goto restore_get_bits;
+    }
+
+restore_get_bits:
+    h->gb = gb0;
     return 0;
 }
 
+static inline int skip_extension_and_user_data(AVSContext *h)
+{
+    int stc = -1;
+    const uint8_t *start = align_get_bits(&h->gb);
+    const uint8_t *end = h->gb.buffer_end;
+    const uint8_t *ptr, *next;
+
+    for (ptr = start; ptr + 4 < end; ptr = next) {
+        stc = show_bits_long(&h->gb, 32);
+        if (stc != EXT_START_CODE && stc != USER_START_CODE) {
+            break;
+        }
+        next = avpriv_find_start_code(ptr + 4, end, &stc);
+        if (next < end) {
+            next -= 4;
+        }
+        skip_bits(&h->gb, (next - ptr) * 8);
+        av_log(h->avctx, AV_LOG_DEBUG, "skip %d byte ext/user data\n",
+                (int)(next - ptr));
+    }
+
+    return ptr > start;
+}
+
 static inline int check_for_slice(AVSContext *h)
 {
     GetBitContext *gb = &h->gb;
@@ -981,44 +1131,133 @@  static inline int check_for_slice(AVSContext *h)
  * frame level
  *
  ****************************************************************************/
+static int hwaccel_pic(AVSContext *h)
+{
+    int ret = 0;
+    int stc = -1;
+    const uint8_t *frm_start = align_get_bits(&h->gb);
+    const uint8_t *frm_end = h->gb.buffer_end;
+    const uint8_t *slc_start = frm_start;
+    const uint8_t *slc_end = frm_end;
+    GetBitContext gb = h->gb;
+    const FFHWAccel *hwaccel = ffhwaccel(h->avctx->hwaccel);
+
+    ret = hwaccel->start_frame(h->avctx, NULL, 0);
+    if (ret < 0)
+        return ret;
+
+    for (slc_start = frm_start; slc_start + 4 < frm_end; slc_start = slc_end) {
+        slc_end = avpriv_find_start_code(slc_start + 4, frm_end, &stc);
+        if (slc_end < frm_end) {
+            slc_end -= 4;
+        }
+
+        init_get_bits(&h->gb, slc_start, (slc_end - slc_start) * 8);
+        if (!check_for_slice(h)) {
+            break;
+        }
+
+        ret = hwaccel->decode_slice(h->avctx, slc_start, slc_end - slc_start);
+        if (ret < 0) {
+            break;
+        }
+    }
+
+    h->gb = gb;
+    skip_bits(&h->gb, (slc_start - frm_start) * 8);
+
+    if (ret < 0)
+        return ret;
+
+    return hwaccel->end_frame(h->avctx);
+}
+
+/**
+ * @brief remove frame out of dpb
+ */
+static void cavs_frame_unref(AVSFrame *frame)
+{
+    /* frame->f can be NULL if context init failed */
+    if (!frame->f || !frame->f->buf[0])
+        return;
+
+    av_buffer_unref(&frame->hwaccel_priv_buf);
+    frame->hwaccel_picture_private = NULL;
+
+    av_frame_unref(frame->f);
+}
+
+static int output_one_frame(AVSContext *h, AVFrame *data, int *got_frame)
+{
+    if (h->out[0].f->buf[0]) {
+        av_log(h->avctx, AV_LOG_DEBUG, "output frame: poc=%d\n", h->out[0].poc);
+        av_frame_move_ref(data, h->out[0].f);
+        *got_frame = 1;
+
+        // out[0] <- out[1] <- out[2] <- out[0]
+        cavs_frame_unref(&h->out[2]);
+        FFSWAP(AVSFrame, h->out[0], h->out[2]);
+        FFSWAP(AVSFrame, h->out[0], h->out[1]);
+
+        return 1;
+    }
+
+    return 0;
+}
+
+static void queue_one_frame(AVSContext *h, AVSFrame *out)
+{
+    int idx = !h->out[0].f->buf[0] ? 0 : (!h->out[1].f->buf[0] ? 1 : 2);
+    av_log(h->avctx, AV_LOG_DEBUG, "queue in out[%d]: poc=%d\n", idx, out->poc);
+    av_frame_ref(h->out[idx].f, out->f);
+    h->out[idx].poc = out->poc;
+}
 
 static int decode_pic(AVSContext *h)
 {
     int ret;
     int skip_count    = -1;
     enum cavs_mb mb_type;
+    char tc[4];
 
     if (!h->top_qp) {
         av_log(h->avctx, AV_LOG_ERROR, "No sequence header decoded yet\n");
         return AVERROR_INVALIDDATA;
     }
 
-    av_frame_unref(h->cur.f);
+    cavs_frame_unref(&h->cur);
+
+    skip_bits(&h->gb, 16);//bbv_delay
+    if (h->profile == AV_PROFILE_CAVS_GUANGDIAN) {
+        skip_bits(&h->gb, 8);//bbv_delay_extension
+    }
 
-    skip_bits(&h->gb, 16);//bbv_dwlay
     if (h->stc == PIC_PB_START_CODE) {
         h->cur.f->pict_type = get_bits(&h->gb, 2) + AV_PICTURE_TYPE_I;
         if (h->cur.f->pict_type > AV_PICTURE_TYPE_B) {
             av_log(h->avctx, AV_LOG_ERROR, "illegal picture type\n");
             return AVERROR_INVALIDDATA;
         }
+
         /* make sure we have the reference frames we need */
-        if (!h->DPB[0].f->data[0] ||
-           (!h->DPB[1].f->data[0] && h->cur.f->pict_type == AV_PICTURE_TYPE_B))
+        if (!h->DPB[0].f->buf[0] ||
+            (!h->DPB[1].f->buf[0] && h->cur.f->pict_type == AV_PICTURE_TYPE_B)) {
+            av_log(h->avctx, AV_LOG_ERROR, "Invalid reference frame\n");
             return AVERROR_INVALIDDATA;
+        }
     } else {
         h->cur.f->pict_type = AV_PICTURE_TYPE_I;
-        if (get_bits1(&h->gb))
-            skip_bits(&h->gb, 24);//time_code
-        /* old sample clips were all progressive and no low_delay,
-           bump stream revision if detected otherwise */
-        if (h->low_delay || !(show_bits(&h->gb, 9) & 1))
-            h->stream_revision = 1;
-        /* similarly test top_field_first and repeat_first_field */
-        else if (show_bits(&h->gb, 11) & 3)
-            h->stream_revision = 1;
-        if (h->stream_revision > 0)
-            skip_bits(&h->gb, 1); //marker_bit
+        if (get_bits1(&h->gb)) {    //time_code
+            skip_bits(&h->gb, 1);
+            tc[0] = get_bits(&h->gb, 5);
+            tc[1] = get_bits(&h->gb, 6);
+            tc[2] = get_bits(&h->gb, 6);
+            tc[3] = get_bits(&h->gb, 6);
+            av_log(h->avctx, AV_LOG_DEBUG, "timecode: %d:%d:%d.%d\n", 
+                    tc[0], tc[1], tc[2], tc[3]);
+        }
+            
+        skip_bits(&h->gb, 1);
     }
 
     if (get_bits_left(&h->gb) < 23)
@@ -1029,6 +1268,17 @@  static int decode_pic(AVSContext *h)
     if (ret < 0)
         return ret;
 
+    if (h->avctx->hwaccel) {
+        const FFHWAccel *hwaccel = ffhwaccel(h->avctx->hwaccel);
+        av_assert0(!h->cur.hwaccel_picture_private);
+        if (hwaccel->frame_priv_data_size) {
+            h->cur.hwaccel_priv_buf = av_buffer_allocz(hwaccel->frame_priv_data_size);
+            if (!h->cur.hwaccel_priv_buf)
+                return AVERROR(ENOMEM);
+            h->cur.hwaccel_picture_private = h->cur.hwaccel_priv_buf->data;
+        }
+    }
+
     if (!h->edge_emu_buffer) {
         int alloc_size = FFALIGN(FFABS(h->cur.f->linesize[0]) + 32, 32);
         h->edge_emu_buffer = av_mallocz(alloc_size * 2 * 24);
@@ -1039,6 +1289,8 @@  static int decode_pic(AVSContext *h)
     if ((ret = ff_cavs_init_pic(h)) < 0)
         return ret;
     h->cur.poc = get_bits(&h->gb, 8) * 2;
+    av_log(h->avctx, AV_LOG_DEBUG, "poc=%d, type=%d\n",
+            h->cur.poc, h->cur.f->pict_type);
 
     /* get temporal distances and MV scaling factors */
     if (h->cur.f->pict_type != AV_PICTURE_TYPE_B) {
@@ -1052,8 +1304,12 @@  static int decode_pic(AVSContext *h)
     if (h->cur.f->pict_type == AV_PICTURE_TYPE_B) {
         h->sym_factor = h->dist[0] * h->scale_den[1];
         if (FFABS(h->sym_factor) > 32768) {
+            av_log(h->avctx, AV_LOG_ERROR, "poc=%d/%d/%d, dist=%d/%d\n",
+                    h->DPB[1].poc, h->DPB[0].poc, h->cur.poc, h->dist[0], h->dist[1]);
             av_log(h->avctx, AV_LOG_ERROR, "sym_factor %d too large\n", h->sym_factor);
-            return AVERROR_INVALIDDATA;
+
+            if (!h->avctx->hwaccel)
+                return AVERROR_INVALIDDATA;
         }
     } else {
         h->direct_den[0] = h->dist[0] ? 16384 / h->dist[0] : 0;
@@ -1062,9 +1318,9 @@  static int decode_pic(AVSContext *h)
 
     if (h->low_delay)
         get_ue_golomb(&h->gb); //bbv_check_times
-    h->progressive   = get_bits1(&h->gb);
+    h->progressive_frame = get_bits1(&h->gb);
     h->pic_structure = 1;
-    if (!h->progressive)
+    if (!h->progressive_frame)
         h->pic_structure = get_bits1(&h->gb);
     if (!h->pic_structure && h->stc == PIC_PB_START_CODE)
         skip_bits1(&h->gb);     //advanced_pred_mode_disable
@@ -1073,14 +1329,18 @@  static int decode_pic(AVSContext *h)
     h->pic_qp_fixed =
     h->qp_fixed = get_bits1(&h->gb);
     h->qp       = get_bits(&h->gb, 6);
+    h->skip_mode_flag = 0;
+    h->ref_flag = 0;
     if (h->cur.f->pict_type == AV_PICTURE_TYPE_I) {
-        if (!h->progressive && !h->pic_structure)
-            skip_bits1(&h->gb);//what is this?
+        if (!h->progressive_frame && !h->pic_structure)
+            h->skip_mode_flag  = get_bits1(&h->gb);
         skip_bits(&h->gb, 4);   //reserved bits
     } else {
         if (!(h->cur.f->pict_type == AV_PICTURE_TYPE_B && h->pic_structure == 1))
             h->ref_flag        = get_bits1(&h->gb);
-        skip_bits(&h->gb, 4);   //reserved bits
+        h->no_forward_ref_flag = get_bits1(&h->gb);
+        h->pb_field_enhanced_flag = get_bits1(&h->gb);
+        skip_bits(&h->gb, 2);   //reserved bits
         h->skip_mode_flag      = get_bits1(&h->gb);
     }
     h->loop_filter_disable     = get_bits1(&h->gb);
@@ -1096,8 +1356,46 @@  static int decode_pic(AVSContext *h)
         h->alpha_offset = h->beta_offset  = 0;
     }
 
+    /* reset AVS+ picture-header state so stale values never reach the hwaccel */
+    h->weight_quant_flag = 0;
+    h->chroma_quant_param_delta_cb = 0;
+    h->chroma_quant_param_delta_cr = 0;
+    if (h->profile == AV_PROFILE_CAVS_GUANGDIAN) {
+        h->weight_quant_flag = get_bits1(&h->gb);
+        if (h->weight_quant_flag) {
+            int wq_param[6] = {128, 128, 128, 128, 128, 128};
+            int i, wqp_index, wq_model;
+            const uint8_t *m2p;
+
+            skip_bits1(&h->gb);
+            if (!get_bits1(&h->gb)) {
+                h->chroma_quant_param_delta_cb = get_se_golomb(&h->gb);
+                h->chroma_quant_param_delta_cr = get_se_golomb(&h->gb);
+            }
+            wqp_index = get_bits(&h->gb, 2);
+            wq_model = get_bits(&h->gb, 2);
+            m2p = wq_model_2_param[wq_model];
+
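+            /* Start from the default weighting-quant parameters selected by
+             * wqp_index, apply the signalled deltas, then expand them into the
+             * 8x8 matrix through the model-to-parameter map. */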
+            for (i = 0; i < 6; i++) {
+                int delta = (wqp_index == 1 || wqp_index == 2) ? get_se_golomb(&h->gb) : 0;
+                wq_param[i] = default_wq_param[wqp_index][i] + delta;
+                av_log(h->avctx, AV_LOG_DEBUG, "wqp[%d]=%d\n", i, wq_param[i]);
+            }
+            for (i = 0; i < 64; i++) {
+                h->wqm_8x8[i] = wq_param[ m2p[i] ];
+            }
+        } else {
+            memset(h->wqm_8x8, 128, sizeof(h->wqm_8x8));
+        }
+        h->aec_flag = get_bits1(&h->gb);
+        av_log(h->avctx, AV_LOG_DEBUG, "wq_flag=%d, aec_flag=%d\n",
+                h->weight_quant_flag, h->aec_flag);
+    }
+
+    skip_stuffing_bits(h);
+    skip_extension_and_user_data(h);
+
     ret = 0;
-    if (h->cur.f->pict_type == AV_PICTURE_TYPE_I) {
+    if (h->avctx->hwaccel) {
+        ret = hwaccel_pic(h);
+    } else if (h->cur.f->pict_type == AV_PICTURE_TYPE_I) {
         do {
             check_for_slice(h);
             ret = decode_mb_i(h, 0);
@@ -1160,11 +1458,6 @@  static int decode_pic(AVSContext *h)
         } while (ff_cavs_next_mb(h));
     }
     emms_c();
-    if (ret >= 0 && h->cur.f->pict_type != AV_PICTURE_TYPE_B) {
-        av_frame_unref(h->DPB[1].f);
-        FFSWAP(AVSFrame, h->cur, h->DPB[1]);
-        FFSWAP(AVSFrame, h->DPB[0], h->DPB[1]);
-    }
     return ret;
 }
 
@@ -1181,13 +1474,8 @@  static int decode_seq_header(AVSContext *h)
     int ret;
 
     h->profile = get_bits(&h->gb, 8);
-    if (h->profile != 0x20) {
-        avpriv_report_missing_feature(h->avctx,
-                                      "only supprt JiZhun profile");
-        return AVERROR_PATCHWELCOME;
-    }
     h->level   = get_bits(&h->gb, 8);
-    skip_bits1(&h->gb); //progressive sequence
+    h->progressive_seq = get_bits1(&h->gb);
 
     width  = get_bits(&h->gb, 14);
     height = get_bits(&h->gb, 14);
@@ -1214,6 +1502,9 @@  static int decode_seq_header(AVSContext *h)
     skip_bits1(&h->gb);    //marker_bit
     skip_bits(&h->gb, 12); //bit_rate_upper
     h->low_delay =  get_bits1(&h->gb);
+    av_log(h->avctx, AV_LOG_DEBUG,
+            "seq: profile=0x%02x, level=0x%02x, size=%dx%d, low_delay=%d\n",
+            h->profile, h->level, width, height, h->low_delay);
 
     ret = ff_set_dimensions(h->avctx, width, height);
     if (ret < 0)
@@ -1239,43 +1530,61 @@  static int cavs_decode_frame(AVCodecContext *avctx, AVFrame *rframe,
                              int *got_frame, AVPacket *avpkt)
 {
     AVSContext *h      = avctx->priv_data;
-    const uint8_t *buf = avpkt->data;
-    int buf_size       = avpkt->size;
     uint32_t stc       = -1;
     int input_size, ret;
     const uint8_t *buf_end;
     const uint8_t *buf_ptr;
     int frame_start = 0;
 
-    if (buf_size == 0) {
-        if (!h->low_delay && h->DPB[0].f->data[0]) {
-            *got_frame = 1;
-            av_frame_move_ref(rframe, h->DPB[0].f);
+    if (avpkt->size == 0) {
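+        /* Flush: queue the pending reference frame, if any, and return one
+         * frame from the output queue. */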
+        if (h->DPB[0].f->buf[0] && !h->DPB[0].outputed) {
+            queue_one_frame(h, &h->DPB[0]);
+            cavs_frame_unref(&h->DPB[0]);
         }
+        output_one_frame(h, rframe, got_frame);
         return 0;
     }
 
     h->stc = 0;
 
-    buf_ptr = buf;
-    buf_end = buf + buf_size;
-    for(;;) {
+    buf_ptr = avpkt->data;
+    buf_end = avpkt->data + avpkt->size;
+    while (buf_ptr < buf_end) {
         buf_ptr = avpriv_find_start_code(buf_ptr, buf_end, &stc);
         if ((stc & 0xFFFFFE00) || buf_ptr == buf_end) {
             if (!h->stc)
                 av_log(h->avctx, AV_LOG_WARNING, "no frame decoded\n");
-            return FFMAX(0, buf_ptr - buf);
+            return FFMAX(0, buf_ptr - avpkt->data);
         }
         input_size = (buf_end - buf_ptr) * 8;
+        av_log(h->avctx, AV_LOG_TRACE, "Found start code 0x%04x, sz=%d\n",
+                stc, input_size / 8);
         switch (stc) {
         case CAVS_START_CODE:
             init_get_bits(&h->gb, buf_ptr, input_size);
-            decode_seq_header(h);
+            if ((ret = decode_seq_header(h)) < 0)
+                return ret;
+            avctx->profile = h->profile;
+            avctx->level = h->level;
+            if (!h->got_pix_fmt) {
+                h->got_pix_fmt = 1;
+                ret = ff_get_format(avctx, avctx->codec->pix_fmts);
+                if (ret < 0)
+                    return ret;
+
+                avctx->pix_fmt = ret;
+
+                if (h->profile == AV_PROFILE_CAVS_GUANGDIAN && !avctx->hwaccel) {
+                    av_log(avctx, AV_LOG_ERROR, "Your platform doesn't support hardware-"
+                                    "accelerated decoding of the CAVS Guangdian profile.\n");
+                    return AVERROR(ENOTSUP);
+                }
+            }
             break;
         case PIC_I_START_CODE:
             if (!h->got_keyframe) {
-                av_frame_unref(h->DPB[0].f);
-                av_frame_unref(h->DPB[1].f);
+                cavs_frame_unref(&h->DPB[0]);
+                cavs_frame_unref(&h->DPB[1]);
                 h->got_keyframe = 1;
             }
         case PIC_PB_START_CODE:
@@ -1285,23 +1594,39 @@  static int cavs_decode_frame(AVCodecContext *avctx, AVFrame *rframe,
             if (*got_frame)
                 av_frame_unref(rframe);
             *got_frame = 0;
-            if (!h->got_keyframe)
+            if (!h->got_keyframe) {
+                av_log(avctx, AV_LOG_ERROR, "No keyframe decoded before P/B frame.\n");
                 break;
+            }
             init_get_bits(&h->gb, buf_ptr, input_size);
             h->stc = stc;
-            if (decode_pic(h))
-                break;
-            *got_frame = 1;
+            if ((ret = decode_pic(h)) < 0)
+                return ret;
+            buf_ptr = align_get_bits(&h->gb);
+
+            h->cur.outputed = 0;
             if (h->cur.f->pict_type != AV_PICTURE_TYPE_B) {
-                if (h->DPB[!h->low_delay].f->data[0]) {
-                    if ((ret = av_frame_ref(rframe, h->DPB[!h->low_delay].f)) < 0)
-                        return ret;
-                } else {
-                    *got_frame = 0;
+                // output lags by at most one frame: flush the pending reference first
+                if (h->DPB[0].f->buf[0] && !h->DPB[0].outputed) {
+                    queue_one_frame(h, &h->DPB[0]);
+                    h->DPB[0].outputed = 1;
+                }
+
+                if (h->low_delay) {
+                    queue_one_frame(h, &h->cur);
+                    h->cur.outputed = 1;
                 }
+
+                // shift the reference chain: cur -> DPB[0] -> DPB[1]; the old DPB[1] is released
+                cavs_frame_unref(&h->DPB[1]);
+                FFSWAP(AVSFrame, h->cur, h->DPB[1]);
+                FFSWAP(AVSFrame, h->DPB[0], h->DPB[1]);
             } else {
-                av_frame_move_ref(rframe, h->cur.f);
+                queue_one_frame(h, &h->cur);
+                cavs_frame_unref(&h->cur);
             }
+
+            output_one_frame(h, rframe, got_frame);
             break;
         case EXT_START_CODE:
             //mpeg_decode_extension(avctx, buf_ptr, input_size);
@@ -1309,16 +1634,34 @@  static int cavs_decode_frame(AVCodecContext *avctx, AVFrame *rframe,
         case USER_START_CODE:
             //mpeg_decode_user_data(avctx, buf_ptr, input_size);
             break;
+        case VIDEO_EDIT_CODE:
+            av_log(h->avctx, AV_LOG_WARNING, "Skip video_edit_code\n");
+            break;
+        case VIDEO_SEQ_END_CODE:
+            av_log(h->avctx, AV_LOG_WARNING, "Skip video_sequence_end_code\n");
+            break;
         default:
             if (stc <= SLICE_MAX_START_CODE) {
+                h->stc = stc & 0xff;
                 init_get_bits(&h->gb, buf_ptr, input_size);
                 decode_slice_header(h, &h->gb);
+            } else {
+                av_log(h->avctx, AV_LOG_WARNING, "Skip unsupported start code 0x%04X\n", stc);
             }
             break;
         }
     }
+    return (buf_ptr - avpkt->data);
 }
 
+static const enum AVPixelFormat cavs_hwaccel_pixfmt_list_420[] = {
+#if CONFIG_CAVS_VAAPI_HWACCEL
+    AV_PIX_FMT_VAAPI,
+#endif
+    AV_PIX_FMT_YUV420P,
+    AV_PIX_FMT_NONE
+};
+
 const FFCodec ff_cavs_decoder = {
     .p.name         = "cavs",
     CODEC_LONG_NAME("Chinese AVS (Audio Video Standard) (AVS1-P2, JiZhun profile)"),
@@ -1331,4 +1674,12 @@  const FFCodec ff_cavs_decoder = {
     .p.capabilities = AV_CODEC_CAP_DR1 | AV_CODEC_CAP_DELAY,
     .flush          = cavs_flush,
     .caps_internal  = FF_CODEC_CAP_INIT_CLEANUP,
+    .p.pix_fmts     = cavs_hwaccel_pixfmt_list_420,
+    .hw_configs     = (const AVCodecHWConfigInternal *const []) {
+#if CONFIG_CAVS_VAAPI_HWACCEL
+                        HWACCEL_VAAPI(cavs),
+#endif
+                        NULL
+                    },
+    .p.profiles     = NULL_IF_CONFIG_SMALL(ff_cavs_profiles),
 };
diff --git a/libavcodec/defs.h b/libavcodec/defs.h
index 00d840ec19..d59816a70f 100644
--- a/libavcodec/defs.h
+++ b/libavcodec/defs.h
@@ -192,6 +192,9 @@ 
 #define AV_PROFILE_EVC_BASELINE             0
 #define AV_PROFILE_EVC_MAIN                 1
 
+#define AV_PROFILE_CAVS_JIZHUN                      0x20
+#define AV_PROFILE_CAVS_GUANGDIAN                   0x48
+
 
 #define AV_LEVEL_UNKNOWN                  -99
 
diff --git a/libavcodec/hwaccels.h b/libavcodec/hwaccels.h
index 5171e4c7d7..a1a973b460 100644
--- a/libavcodec/hwaccels.h
+++ b/libavcodec/hwaccels.h
@@ -89,5 +89,6 @@  extern const struct FFHWAccel ff_wmv3_dxva2_hwaccel;
 extern const struct FFHWAccel ff_wmv3_nvdec_hwaccel;
 extern const struct FFHWAccel ff_wmv3_vaapi_hwaccel;
 extern const struct FFHWAccel ff_wmv3_vdpau_hwaccel;
+extern const struct FFHWAccel ff_cavs_vaapi_hwaccel;
 
 #endif /* AVCODEC_HWACCELS_H */
diff --git a/libavcodec/profiles.c b/libavcodec/profiles.c
index 5bb8f150e6..b312f12281 100644
--- a/libavcodec/profiles.c
+++ b/libavcodec/profiles.c
@@ -200,4 +200,10 @@  const AVProfile ff_evc_profiles[] = {
     { AV_PROFILE_UNKNOWN },
 };
 
+const AVProfile ff_cavs_profiles[] = {
+    { AV_PROFILE_CAVS_JIZHUN,       "Jizhun"            },
+    { AV_PROFILE_CAVS_GUANGDIAN,    "Guangdian"         },
+    { AV_PROFILE_UNKNOWN },
+};
+
 #endif /* !CONFIG_SMALL */
diff --git a/libavcodec/profiles.h b/libavcodec/profiles.h
index 270430a48b..9a2b348ad4 100644
--- a/libavcodec/profiles.h
+++ b/libavcodec/profiles.h
@@ -75,5 +75,6 @@  extern const AVProfile ff_prores_profiles[];
 extern const AVProfile ff_mjpeg_profiles[];
 extern const AVProfile ff_arib_caption_profiles[];
 extern const AVProfile ff_evc_profiles[];
+extern const AVProfile ff_cavs_profiles[];
 
 #endif /* AVCODEC_PROFILES_H */
diff --git a/libavcodec/vaapi_cavs.c b/libavcodec/vaapi_cavs.c
new file mode 100644
index 0000000000..4a7a9b95ad
--- /dev/null
+++ b/libavcodec/vaapi_cavs.c
@@ -0,0 +1,164 @@ 
+/*
+ * AVS (Chinese GY/T 257.1—2012) HW decode acceleration through VA-API
+ * Copyright (c) 2022 JianfengZheng <jianfeng.zheng@mthreads.com>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "hwconfig.h"
+#include "hwaccel_internal.h"
+#include "vaapi_decode.h"
+#include "cavs.h"
+
+/**
+ * @file
+ * This file implements the glue code between FFmpeg's and VA-API's
+ * structures for AVS (Chinese GY/T 257.1—2012) decoding.
+ */
+
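+/* Map an FFmpeg picture type onto the corresponding VA-API AVS picture coding
+ * type; anything unexpected falls back to I. */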
+static int vaapi_avs_pic_type_cvt(int pict_type)
+{
+    switch (pict_type) {
+    case AV_PICTURE_TYPE_I: return VA_AVS_I_IMG;
+    case AV_PICTURE_TYPE_P: return VA_AVS_P_IMG;
+    case AV_PICTURE_TYPE_B: return VA_AVS_B_IMG;
+    default:                return VA_AVS_I_IMG;
+    }
+}
+
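+/* The decoder keeps POC values doubled (decode_pic() reads an 8-bit picture
+ * distance and multiplies it by two), so halve it again for the VA picture. */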
+static void vaapi_avs_fill_pic(VAPictureAVS *va_pic, const AVSFrame *frame)
+{
+    va_pic->surface_id = ff_vaapi_get_surface_id(frame->f);
+    va_pic->poc = frame->poc / 2;
+}
+
+/** Initialize and start decoding a frame with VA API. */
+static int vaapi_avs_start_frame(AVCodecContext         *avctx,
+                                av_unused const uint8_t *buffer,
+                                av_unused uint32_t       size)
+{
+    int i, err;
+    AVSContext *h = avctx->priv_data;
+    VAPictureParameterBufferAVS pic_param;
+    VAAPIDecodePicture *vapic = h->cur.hwaccel_picture_private;
+    vapic->output_surface = ff_vaapi_get_surface_id(h->cur.f);
+
+    pic_param = (VAPictureParameterBufferAVS) {
+        .width = h->width,
+        .height = h->height,
+        .picture_type = vaapi_avs_pic_type_cvt(h->cur.f->pict_type),
+        .progressive_seq_flag = h->progressive_seq,
+        .progressive_frame_flag = h->progressive_frame,
+        .picture_structure_flag = h->pic_structure,
+        .fixed_pic_qp_flag = h->qp_fixed,
+        .picture_qp = h->qp,
+        .loop_filter_disable_flag = h->loop_filter_disable,
+        .alpha_c_offset = h->alpha_offset,
+        .beta_offset = h->beta_offset,
+        .skip_mode_flag_flag = h->skip_mode_flag,
+        .picture_reference_flag = h->ref_flag,
+    };
+
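+    /* The Guangdian (AVS+) profile adds AEC and weighting-quant fields to the
+     * picture parameters. */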
+    if (h->profile == AV_PROFILE_CAVS_GUANGDIAN) {
+        pic_param.guangdian_fields.guangdian_flag = 1;
+        pic_param.guangdian_fields.aec_flag = h->aec_flag;
+        pic_param.guangdian_fields.weight_quant_flag = h->weight_quant_flag;
+        pic_param.guangdian_fields.chroma_quant_param_delta_cb = h->chroma_quant_param_delta_cb;
+        pic_param.guangdian_fields.chroma_quant_param_delta_cr = h->chroma_quant_param_delta_cr;
+        memcpy(pic_param.guangdian_fields.wqm_8x8, h->wqm_8x8, 64);
+    }
+
+    vaapi_avs_fill_pic(&pic_param.curr_pic, &h->cur);
+    for (i = 0; i < 2; i++) {
+        vaapi_avs_fill_pic(&pic_param.ref_list[i], &h->DPB[i]);
+    }
+
+    err = ff_vaapi_decode_make_param_buffer(avctx, vapic,
+                                            VAPictureParameterBufferType,
+                                            &pic_param, sizeof(pic_param));
+    if (err < 0)
+        goto fail;
+
+    return 0;
+fail:
+    ff_vaapi_decode_cancel(avctx, vapic);
+    return err;
+}
+
+/** End the frame and issue the queued VA-API buffers for decoding. */
+static int vaapi_avs_end_frame(AVCodecContext *avctx)
+{
+    AVSContext *h = avctx->priv_data;
+    VAAPIDecodePicture *vapic = h->cur.hwaccel_picture_private;
+    return ff_vaapi_decode_issue(avctx, vapic);
+}
+
+/** Decode the given AVS slice with VA-API. */
+static int vaapi_avs_decode_slice(AVCodecContext *avctx,
+                                   const uint8_t  *buffer,
+                                   uint32_t        size)
+{
+    int err;
+    AVSContext *h = avctx->priv_data;
+    VAAPIDecodePicture *vapic = h->cur.hwaccel_picture_private;
+    VASliceParameterBufferAVS slice_param;
+    slice_param = (VASliceParameterBufferAVS) {
+        .slice_data_size        = size,
+        .slice_data_offset      = 0,
+        .slice_data_flag        = VA_SLICE_DATA_FLAG_ALL,
+        .mb_data_bit_offset     = get_bits_count(&h->gb),
+        .slice_vertical_pos     = h->stc,
+        .fixed_slice_qp_flag    = h->qp_fixed,
+        .slice_qp               = h->qp,
+        .slice_weight_pred_flag = h->slice_weight_pred_flag,
+        .mb_weight_pred_flag    = h->mb_weight_pred_flag,
+    };
+
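+    /* Weighted-prediction scale/shift tables: 4 bytes each, copied as a single
+     * 32-bit word per table. */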
+    *((uint32_t *)slice_param.luma_scale) = *((uint32_t *)h->luma_scale);
+    *((uint32_t *)slice_param.luma_shift) = *((uint32_t *)h->luma_shift);
+    *((uint32_t *)slice_param.chroma_scale) = *((uint32_t *)h->chroma_scale);
+    *((uint32_t *)slice_param.chroma_shift) = *((uint32_t *)h->chroma_shift);
+
+    err = ff_vaapi_decode_make_slice_buffer(avctx, vapic,
+                                            &slice_param, sizeof(slice_param),
+                                            buffer, size);
+    if (err < 0)
+        goto fail;
+
+    return 0;
+
+fail:
+    ff_vaapi_decode_cancel(avctx, vapic);
+    return err;
+}
+
+const FFHWAccel ff_cavs_vaapi_hwaccel = {
+    .p.name               = "cavs_vaapi",
+    .p.type               = AVMEDIA_TYPE_VIDEO,
+    .p.id                 = AV_CODEC_ID_CAVS,
+    .p.pix_fmt            = AV_PIX_FMT_VAAPI,
+    .start_frame          = &vaapi_avs_start_frame,
+    .end_frame            = &vaapi_avs_end_frame,
+    .decode_slice         = &vaapi_avs_decode_slice,
+    .frame_priv_data_size = sizeof(VAAPIDecodePicture),
+    .init                 = &ff_vaapi_decode_init,
+    .uninit               = &ff_vaapi_decode_uninit,
+    .frame_params         = &ff_vaapi_common_frame_params,
+    .priv_data_size       = sizeof(VAAPIDecodeContext),
+    .caps_internal        = HWACCEL_CAP_ASYNC_SAFE,
+};
diff --git a/libavcodec/vaapi_decode.c b/libavcodec/vaapi_decode.c
index ceac769c52..13a3f6aa42 100644
--- a/libavcodec/vaapi_decode.c
+++ b/libavcodec/vaapi_decode.c
@@ -408,6 +408,10 @@  static const struct {
                            H264ConstrainedBaseline),
     MAP(H264,        H264_MAIN,       H264Main    ),
     MAP(H264,        H264_HIGH,       H264High    ),
+#if HAVE_VA_PROFILE_AVS
+    MAP(CAVS,        CAVS_JIZHUN,     AVSJizhun   ),
+    MAP(CAVS,        CAVS_GUANGDIAN,  AVSGuangdian),
+#endif
 #if VA_CHECK_VERSION(0, 37, 0)
     MAP(HEVC,        HEVC_MAIN,       HEVCMain    ),
     MAP(HEVC,        HEVC_MAIN_10,    HEVCMain10  ),