
[FFmpeg-devel,v3,6/6] lavc/vaapi_encode: Add VAAPI AV1 encoder

Message ID 20230803060132.501741-6-fei.w.wang@intel.com
State New
Series [FFmpeg-devel,v3,1/6] avcodec/cbs_av1: Add tx mode enum values

Checks

Context Check Description
yinshiyou/make_loongarch64 success Make finished
yinshiyou/make_fate_loongarch64 success Make fate finished
andriy/make_x86 success Make finished
andriy/make_fate_x86 success Make fate finished

Commit Message

Wang, Fei W Aug. 3, 2023, 6:01 a.m. UTC
From: Fei Wang <fei.w.wang@intel.com>

Signed-off-by: Fei Wang <fei.w.wang@intel.com>
---
 Changelog                     |    1 +
 configure                     |    3 +
 doc/encoders.texi             |   13 +
 libavcodec/Makefile           |    1 +
 libavcodec/allcodecs.c        |    1 +
 libavcodec/vaapi_encode.c     |  125 +++-
 libavcodec/vaapi_encode.h     |   12 +
 libavcodec/vaapi_encode_av1.c | 1229 +++++++++++++++++++++++++++++++++
 libavcodec/version.h          |    2 +-
 9 files changed, 1368 insertions(+), 19 deletions(-)
 create mode 100644 libavcodec/vaapi_encode_av1.c

Comments

Mark Thompson Aug. 7, 2023, 9:21 p.m. UTC | #1
On 03/08/2023 07:01, fei.w.wang-at-intel.com@ffmpeg.org wrote:
> From: Fei Wang <fei.w.wang@intel.com>
> 
> Signed-off-by: Fei Wang <fei.w.wang@intel.com>
> ---
>   Changelog                     |    1 +
>   configure                     |    3 +
>   doc/encoders.texi             |   13 +
>   libavcodec/Makefile           |    1 +
>   libavcodec/allcodecs.c        |    1 +
>   libavcodec/vaapi_encode.c     |  125 +++-
>   libavcodec/vaapi_encode.h     |   12 +
>   libavcodec/vaapi_encode_av1.c | 1229 +++++++++++++++++++++++++++++++++
>   libavcodec/version.h          |    2 +-
>   9 files changed, 1368 insertions(+), 19 deletions(-)
>   create mode 100644 libavcodec/vaapi_encode_av1.c

I assume this is tested on Intel hardware.  Is it tested on AMD as well?  (Apparently working in recent Mesa.)


> diff --git a/Changelog b/Changelog
> index bbda4f4fd4..e86f742cd3 100644
> --- a/Changelog
> +++ b/Changelog
> @@ -27,6 +27,7 @@ version <next>:
>   - Bitstream filter for converting VVC from MP4 to Annex B
>   - scale_vt filter for videotoolbox
>   - transpose_vt filter for videotoolbox
> +- VAAPI AV1 encoder
>   
>   version 6.0:
>   - Radiance HDR image support
> diff --git a/configure b/configure
> index 99388e7664..68a238a819 100755
> --- a/configure
> +++ b/configure
> @@ -3322,6 +3322,8 @@ av1_qsv_decoder_select="qsvdec"
>   av1_qsv_encoder_select="qsvenc"
>   av1_qsv_encoder_deps="libvpl"
>   av1_amf_encoder_deps="amf"
> +av1_vaapi_encoder_deps="VAEncPictureParameterBufferAV1"
> +av1_vaapi_encoder_select="cbs_av1 vaapi_encode"
>   
>   # parsers
>   aac_parser_select="adts_header mpeg4audio"
> @@ -7106,6 +7108,7 @@ if enabled vaapi; then
>       check_type "va/va.h va/va_enc_jpeg.h" "VAEncPictureParameterBufferJPEG"
>       check_type "va/va.h va/va_enc_vp8.h"  "VAEncPictureParameterBufferVP8"
>       check_type "va/va.h va/va_enc_vp9.h"  "VAEncPictureParameterBufferVP9"
> +    check_type "va/va.h va/va_enc_av1.h"  "VAEncPictureParameterBufferAV1"
>   fi
>   
>   if enabled_all opencl libdrm ; then
> diff --git a/doc/encoders.texi b/doc/encoders.texi
> index 25d6b7f09e..fb331ebd8e 100644
> --- a/doc/encoders.texi
> +++ b/doc/encoders.texi
> @@ -3991,6 +3991,19 @@ Average variable bitrate.
>   Each encoder also has its own specific options:
>   @table @option
>   
> +@item av1_vaapi
> +@option{profile} sets the value of @emph{seq_profile}.
> +@option{tier} sets the value of @emph{seq_tier}.
> +@option{level} sets the value of @emph{seq_level_idx}.
> +
> +@table @option
> +@item tiles
> +Set the number of tiles to encode the input video with, as columns x rows
> +(default is 1x1).

Probably needs some clarification that large resolutions must be split into tiles?  Maybe an "auto" value for this (meaning use as few tiles as possible), and let an explicit "1x1" fail if the resolution is too large.

> +@item tile_groups
> +Set the number of tile groups (default is 1).

Meaning what?  It splits into tile group OBUs containing equal numbers of tiles, or of equal size in bits, or something else?

> +@end table
> +
>   @item h264_vaapi
>   @option{profile} sets the value of @emph{profile_idc} and the @emph{constraint_set*_flag}s.
>   @option{level} sets the value of @emph{level_idc}.
> diff --git a/libavcodec/Makefile b/libavcodec/Makefile
> index a6b2ecbb22..473afb4471 100644
> --- a/libavcodec/Makefile
> +++ b/libavcodec/Makefile
> @@ -258,6 +258,7 @@ OBJS-$(CONFIG_AV1_MEDIACODEC_DECODER)  += mediacodecdec.o
>   OBJS-$(CONFIG_AV1_MEDIACODEC_ENCODER)  += mediacodecenc.o
>   OBJS-$(CONFIG_AV1_NVENC_ENCODER)       += nvenc_av1.o nvenc.o
>   OBJS-$(CONFIG_AV1_QSV_ENCODER)         += qsvenc_av1.o
> +OBJS-$(CONFIG_AV1_VAAPI_ENCODER)       += vaapi_encode_av1.o av1_levels.o
>   OBJS-$(CONFIG_AVRN_DECODER)            += avrndec.o
>   OBJS-$(CONFIG_AVRP_DECODER)            += r210dec.o
>   OBJS-$(CONFIG_AVRP_ENCODER)            += r210enc.o
> diff --git a/libavcodec/allcodecs.c b/libavcodec/allcodecs.c
> index 8775d15a4f..c43c1d7b48 100644
> --- a/libavcodec/allcodecs.c
> +++ b/libavcodec/allcodecs.c
> @@ -844,6 +844,7 @@ extern const FFCodec ff_av1_nvenc_encoder;
>   extern const FFCodec ff_av1_qsv_decoder;
>   extern const FFCodec ff_av1_qsv_encoder;
>   extern const FFCodec ff_av1_amf_encoder;
> +extern const FFCodec ff_av1_vaapi_encoder;
>   extern const FFCodec ff_libopenh264_encoder;
>   extern const FFCodec ff_libopenh264_decoder;
>   extern const FFCodec ff_h264_amf_encoder;
> diff --git a/libavcodec/vaapi_encode.c b/libavcodec/vaapi_encode.c
> index 2604f12b9e..2907e159fb 100644
> --- a/libavcodec/vaapi_encode.c
> +++ b/libavcodec/vaapi_encode.c
> @@ -669,6 +669,15 @@ static int vaapi_encode_set_output_timestamp(AVCodecContext *avctx,
>   {
>       VAAPIEncodeContext *ctx = avctx->priv_data;
>   
> +    // AV1 packs a P frame and the following B frame into one pkt, and uses a
> +    // separate repeat frame header pkt at the display-order position of the P
> +    // frame to indicate its frame index. Each frame therefore has a pkt at its
> +    // display-order position, so no extra delay needs to be applied to AV1 timestamps.
> +    if (avctx->codec_id == AV_CODEC_ID_AV1) {
> +        pkt->dts = pkt->pts - ctx->dts_pts_diff;
> +        return 0;
> +    }

This doesn't get you the right result, though?  The PTS and DTS on every AV1 packet want to be the same.

In any case, please don't put tests for codec ID in the common code.  I suggest that the timestamp behaviour wants to be a new FLAG_*.
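
Rough sketch of what I mean (the flag name and bit value are purely illustrative, and it assumes DTS == PTS as noted above):

    // vaapi_encode.h, next to the existing FLAG_* values:
    FLAG_TIMESTAMP_NO_DELAY    = 1 << 4,

    // vaapi_encode.c, at the top of vaapi_encode_set_output_timestamp():
    if (ctx->codec->flags & FLAG_TIMESTAMP_NO_DELAY) {
        pkt->dts = pkt->pts;
        return 0;
    }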

> +
>       if (ctx->output_delay == 0) {
>           pkt->dts = pkt->pts;
>       } else if (pic->encode_order < ctx->decode_delay) {
> @@ -689,9 +698,10 @@ static int vaapi_encode_output(AVCodecContext *avctx,
>   {
>       VAAPIEncodeContext *ctx = avctx->priv_data;
>       VACodedBufferSegment *buf_list, *buf;
> -    VAStatus vas;
> +    AVPacket *pkt_ptr = pkt;
>       int total_size = 0;
>       uint8_t *ptr;
> +    VAStatus vas;
>       int err;
>   
>       err = vaapi_encode_wait(avctx, pic);
> @@ -711,11 +721,52 @@ static int vaapi_encode_output(AVCodecContext *avctx,
>       for (buf = buf_list; buf; buf = buf->next)
>           total_size += buf->size;
>   
> -    err = ff_get_encode_buffer(avctx, pkt, total_size, 0);
> -    ptr = pkt->data;
> +    /** Repack the AV1 coded frame for not-displayed and repeat frames. */
> +    if (avctx->codec_id == AV_CODEC_ID_AV1) {
> +        int display_frame = pic->display_order <= pic->encode_order;
>   
> -    if (err < 0)
> -        goto fail_mapped;
> +        if (display_frame) {
> +            total_size += ctx->header_data_size;
> +            err = ff_get_encode_buffer(avctx, pkt, total_size, 0);
> +            if (err < 0)
> +                goto fail_mapped;
> +            ptr = pkt->data;
> +
> +            if (ctx->header_data_size) {
> +                memcpy(ptr, ctx->header_data, ctx->header_data_size);
> +                ptr += ctx->header_data_size;
> +                ctx->header_data_size = 0;
> +            }
> +        } else {
> +            ctx->header_data = av_realloc(ctx->header_data, total_size);
> +            if (!ctx->header_data) {
> +                err = AVERROR(ENOMEM);
> +                goto fail_mapped;
> +            }
> +            ptr = ctx->header_data;
> +            ctx->header_data_size = total_size;
> +
> +            if (pic->tail_size) {
> +                if (ctx->tail_pkt->size) {
> +                    err = AVERROR_BUG;
> +                    goto fail_mapped;
> +                }
> +
> +                err = ff_get_encode_buffer(avctx, ctx->tail_pkt, pic->tail_size, 0);
> +                if (err < 0)
> +                    goto fail_mapped;
> +
> +                memcpy(ctx->tail_pkt->data, pic->tail_data, pic->tail_size);
> +                pkt_ptr = ctx->tail_pkt;
> +            }
> +        }
> +    } else {
> +        err = ff_get_encode_buffer(avctx, pkt, total_size, 0);
> +        ptr = pkt->data;
> +
> +        if (err < 0)
> +            goto fail_mapped;
> +    }

I'm not convinced that this repeated-realloc-and-copy structure in the main context is a good approach.

How about something like storing the output buffer for every picture in that picture as it is generated (it could even stay in the VABuffer without mapping it, if that worked?), then when you get a display-now picture concatenating all the outstanding frames in order into the packet (with a single copy).

The tail data is trivial to generate on the fly (it's only three bytes), so storing it does not seem useful.
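
Untested sketch of the concatenation I have in mind (the coded_data/coded_size fields on VAAPIEncodePicture are hypothetical):

    /* Each not-yet-shown picture keeps its coded data when it is generated;
     * when a displayable picture arrives, everything pending is written into
     * the output packet with a single copy. */
    static int vaapi_encode_av1_output_pending(AVCodecContext *avctx,
                                               AVPacket *pkt,
                                               VAAPIEncodePicture **pending,
                                               int nb_pending)
    {
        size_t total_size = 0;
        uint8_t *dst;
        int i, err;

        for (i = 0; i < nb_pending; i++)
            total_size += pending[i]->coded_size;

        err = ff_get_encode_buffer(avctx, pkt, total_size, 0);
        if (err < 0)
            return err;

        dst = pkt->data;
        for (i = 0; i < nb_pending; i++) {
            memcpy(dst, pending[i]->coded_data, pending[i]->coded_size);
            dst += pending[i]->coded_size;
        }

        return 0;
    }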

>   
>       for (buf = buf_list; buf; buf = buf->next) {
>           av_log(avctx, AV_LOG_DEBUG, "Output buffer: %u bytes "
> @@ -726,10 +777,10 @@ static int vaapi_encode_output(AVCodecContext *avctx,
>       }
>   
>       if (pic->type == PICTURE_TYPE_IDR)
> -        pkt->flags |= AV_PKT_FLAG_KEY;
> +        pkt_ptr->flags |= AV_PKT_FLAG_KEY;
>   
> -    pkt->pts = pic->pts;
> -    pkt->duration = pic->duration;
> +    pkt_ptr->pts = pic->pts;
> +    pkt_ptr->duration = pic->duration;
>   
>       vas = vaUnmapBuffer(ctx->hwctx->display, pic->output_buffer);
>       if (vas != VA_STATUS_SUCCESS) {
> @@ -742,8 +793,8 @@ static int vaapi_encode_output(AVCodecContext *avctx,
>       // for no-delay encoders this is handled in generic codec
>       if (avctx->codec->capabilities & AV_CODEC_CAP_DELAY &&
>           avctx->flags & AV_CODEC_FLAG_COPY_OPAQUE) {
> -        pkt->opaque     = pic->opaque;
> -        pkt->opaque_ref = pic->opaque_ref;
> +        pkt_ptr->opaque     = pic->opaque;
> +        pkt_ptr->opaque_ref = pic->opaque_ref;
>           pic->opaque_ref = NULL;
>       }
>   
> @@ -752,6 +803,9 @@ static int vaapi_encode_output(AVCodecContext *avctx,
>   
>       av_log(avctx, AV_LOG_DEBUG, "Output read for pic %"PRId64"/%"PRId64".\n",
>              pic->display_order, pic->encode_order);
> +
> +    vaapi_encode_set_output_timestamp(avctx, pic, pkt_ptr);
> +
>       return 0;
>   
>   fail_mapped:
> @@ -1128,9 +1182,19 @@ static int vaapi_encode_pick_next(AVCodecContext *avctx,
>   
>       vaapi_encode_add_ref(avctx, pic, pic, 0, 1, 0);
>       if (pic->type != PICTURE_TYPE_IDR) {
> -        vaapi_encode_add_ref(avctx, pic, start,
> -                             pic->type == PICTURE_TYPE_P,
> -                             b_counter > 0, 0);
> +        // TODO: apply both backward and forward multi-reference for all VAAPI
> +        // encoders. The number of L0/L1 reference frames can also be set
> +        // dynamically by querying the VAConfigAttribEncMaxRefFrames attribute.
> +        if (avctx->codec_id == AV_CODEC_ID_AV1) {
> +            for (i = 0; i < ctx->nb_next_prev; i++)
> +                vaapi_encode_add_ref(avctx, pic, ctx->next_prev[i],
> +                                     pic->type == PICTURE_TYPE_P,
> +                                     b_counter > 0, 0);

So an undisclosed aim of the previous patch was to make this extra list of the most recent N frames for this to use with P frames only?

Why this particular structure for P frames only, though?  Seems like if you support extra references then other codecs could also make use of them in either direction, so we would be better off hinting that the codec would like up to N references and then using the contents of the existing DPB, plus keeping extras if there is space.

> +        } else
> +            vaapi_encode_add_ref(avctx, pic, start,
> +                                 pic->type == PICTURE_TYPE_P,
> +                                 b_counter > 0, 0);
> +
>           vaapi_encode_add_ref(avctx, pic, ctx->next_prev[ctx->nb_next_prev - 1], 0, 0, 1);
>       }
>   
> @@ -1292,6 +1356,19 @@ int ff_vaapi_encode_receive_packet(AVCodecContext *avctx, AVPacket *pkt)
>       AVFrame *frame = ctx->frame;
>       int err;
>   
> +start:
> +    /** If there is no B frame before the repeat P frame, send the repeat P frame out. */
> +    if (avctx->codec_id == AV_CODEC_ID_AV1 && ctx->tail_pkt->size) {
> +        for (VAAPIEncodePicture *tmp = ctx->pic_start; tmp; tmp = tmp->next) {
> +            if (tmp->type == PICTURE_TYPE_B && tmp->pts < ctx->tail_pkt->pts)
> +                break;
> +            else if (!tmp->next) {
> +                av_packet_move_ref(pkt, ctx->tail_pkt);
> +                goto end;
> +            }
> +        }
> +    }
> +
>       err = ff_encode_get_frame(avctx, frame);
>       if (err < 0 && err != AVERROR_EOF)
>           return err;
> @@ -1356,17 +1433,21 @@ int ff_vaapi_encode_receive_packet(AVCodecContext *avctx, AVPacket *pkt)
>           return err;
>       }
>   
> -    vaapi_encode_set_output_timestamp(avctx, pic, pkt);
> -    av_log(avctx, AV_LOG_DEBUG, "Output packet: pts %"PRId64", dts %"PRId64", "
> -           "size %u bytes.\n", pkt->pts, pkt->dts, pkt->size);
> -
>       ctx->output_order = pic->encode_order;
>       vaapi_encode_clear_old(avctx);
>   
> +    /** Loop to get an available pkt while flushing the encoder. */
> +    if (ctx->end_of_stream && !pkt->size)
> +        goto start;
> +
> +end:
> +    if (pkt->size)
> +        av_log(avctx, AV_LOG_DEBUG, "Output packet: pts %"PRId64", dts %"PRId64", "
> +               "size %u bytes.\n", pkt->pts, pkt->dts, pkt->size);
> +
>       return 0;
>   }
>   
> -
>   static av_cold void vaapi_encode_add_global_param(AVCodecContext *avctx, int type,
>                                                     void *buffer, size_t size)
>   {
> @@ -2667,6 +2748,12 @@ av_cold int ff_vaapi_encode_init(AVCodecContext *avctx)
>       ctx->device = (AVHWDeviceContext*)ctx->device_ref->data;
>       ctx->hwctx = ctx->device->hwctx;
>   
> +    ctx->tail_pkt = av_packet_alloc();
> +    if (!ctx->tail_pkt) {
> +        err = AVERROR(ENOMEM);
> +        goto fail;
> +    }
> +
>       err = vaapi_encode_profile_entrypoint(avctx);
>       if (err < 0)
>           goto fail;
> @@ -2859,9 +2946,11 @@ av_cold int ff_vaapi_encode_close(AVCodecContext *avctx)
>       }
>   
>       av_frame_free(&ctx->frame);
> +    av_packet_free(&ctx->tail_pkt);
>   
>       av_freep(&ctx->codec_sequence_params);
>       av_freep(&ctx->codec_picture_params);
> +    av_freep(&ctx->header_data);
>       av_fifo_freep2(&ctx->encode_fifo);
>   
>       av_buffer_unref(&ctx->recon_frames_ref);
> diff --git a/libavcodec/vaapi_encode.h b/libavcodec/vaapi_encode.h
> index d5452a37b3..03df8d6d46 100644
> --- a/libavcodec/vaapi_encode.h
> +++ b/libavcodec/vaapi_encode.h
> @@ -133,6 +133,11 @@ typedef struct VAAPIEncodePicture {
>   
>       int          nb_slices;
>       VAAPIEncodeSlice *slices;
> +
> +    /** Tail data of current pic, used only for repeat header of AV1. */
> +    char tail_data[MAX_PARAM_BUFFER_SIZE];
> +    /** Byte length of tail_data. */
> +    size_t tail_size;
>   } VAAPIEncodePicture;
>   
>   typedef struct VAAPIEncodeProfile {
> @@ -367,6 +372,13 @@ typedef struct VAAPIEncodeContext {
>       AVFifo          *encode_fifo;
>       // Max number of frame buffered in encoder.
>       int             async_depth;
> +
> +    /** Head data for current output pkt, used only for AV1. */
> +    void  *header_data;
> +    size_t header_data_size;
> +
> +    /** Store av1 repeat frame header pkt. */
> +    AVPacket *tail_pkt;
>   } VAAPIEncodeContext;
>   
>   enum {
> diff --git a/libavcodec/vaapi_encode_av1.c b/libavcodec/vaapi_encode_av1.c
> new file mode 100644
> index 0000000000..4c8a9d995d
> --- /dev/null
> +++ b/libavcodec/vaapi_encode_av1.c
> @@ -0,0 +1,1229 @@
> +/*
> + * Copyright (c) 2023 Intel Corporation
> + *
> + * This file is part of FFmpeg.
> + *
> + * FFmpeg is free software; you can redistribute it and/or
> + * modify it under the terms of the GNU Lesser General Public
> + * License as published by the Free Software Foundation; either
> + * version 2.1 of the License, or (at your option) any later version.
> + *
> + * FFmpeg is distributed in the hope that it will be useful,
> + * but WITHOUT ANY WARRANTY; without even the implied warranty of
> + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> + * Lesser General Public License for more details.
> + *
> + * You should have received a copy of the GNU Lesser General Public
> + * License along with FFmpeg; if not, write to the Free Software
> + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
> + */
> +
> +#include <va/va.h>
> +#include <va/va_enc_av1.h>
> +
> +#include "libavutil/pixdesc.h"
> +#include "libavutil/opt.h"
> +
> +#include "cbs_av1.h"
> +#include "put_bits.h"
> +#include "codec_internal.h"
> +#include "av1_levels.h"
> +#include "vaapi_encode.h"
> +
> +#define AV1_MAX_QUANT 255
> +
> +typedef struct VAAPIEncodeAV1Picture {
> +    int64_t last_idr_frame;
> +    int slot;
> +} VAAPIEncodeAV1Picture;
> +
> +typedef struct VAAPIEncodeAV1Context {
> +    VAAPIEncodeContext common;
> +    AV1RawOBU sh; /**< sequence header.*/
> +    AV1RawOBU fh; /**< frame header.*/
> +    CodedBitstreamContext *cbc;
> +    CodedBitstreamFragment current_obu;
> +    VAConfigAttribValEncAV1 attr;
> +    VAConfigAttribValEncAV1Ext1 attr_ext1;
> +    VAConfigAttribValEncAV1Ext2 attr_ext2;
> +
> +    char sh_data[MAX_PARAM_BUFFER_SIZE]; /**< coded sequence header data. */
> +    size_t sh_data_len; /**< bit length of sh_data. */
> +    char fh_data[MAX_PARAM_BUFFER_SIZE]; /**< coded frame header data. */
> +    size_t fh_data_len; /**< bit length of fh_data. */
> +
> +    uint8_t uniform_tile;
> +    uint8_t use_128x128_superblock;
> +    int sb_cols;
> +    int sb_rows;
> +    int tile_cols_log2;
> +    int tile_rows_log2;
> +    int max_tile_width_sb;
> +    int max_tile_height_sb;
> +    uint8_t width_in_sbs_minus_1[AV1_MAX_TILE_COLS];
> +    uint8_t height_in_sbs_minus_1[AV1_MAX_TILE_ROWS];
> +
> +    int min_log2_tile_cols;
> +    int max_log2_tile_cols;
> +    int min_log2_tile_rows;
> +    int max_log2_tile_rows;
> +
> +    int q_idx_idr;
> +    int q_idx_p;
> +    int q_idx_b;
> +
> +    /** user options */
> +    int profile;
> +    int level;
> +    int tier;
> +    int tile_cols, tile_rows;
> +    int tile_groups;
> +} VAAPIEncodeAV1Context;
> +
> +static av_cold int vaapi_encode_av1_configure(AVCodecContext *avctx)
> +{
> +    VAAPIEncodeContext     *ctx = avctx->priv_data;
> +    VAAPIEncodeAV1Context *priv = avctx->priv_data;
> +    int ret;
> +
> +    ret = ff_cbs_init(&priv->cbc, AV_CODEC_ID_AV1, avctx);
> +    if (ret < 0)
> +        return ret;
> +
> +    if (ctx->rc_mode->quality) {
> +        priv->q_idx_p = av_clip(ctx->rc_quality, 0, AV1_MAX_QUANT);
> +        if (fabs(avctx->i_quant_factor) > 0.0)
> +            priv->q_idx_idr =
> +                av_clip((fabs(avctx->i_quant_factor) * priv->q_idx_p  +
> +                         avctx->i_quant_offset) + 0.5,
> +                        0, AV1_MAX_QUANT);
> +        else
> +            priv->q_idx_idr = priv->q_idx_p;
> +
> +        if (fabs(avctx->b_quant_factor) > 0.0)
> +            priv->q_idx_b =
> +                av_clip((fabs(avctx->b_quant_factor) * priv->q_idx_p  +
> +                         avctx->b_quant_offset) + 0.5,
> +                        0, AV1_MAX_QUANT);
> +        else
> +            priv->q_idx_b = priv->q_idx_p;
> +    } else {
> +        /** Arbitrary value */
> +        priv->q_idx_idr = priv->q_idx_p = priv->q_idx_b = 128;
> +    }

Are there any alignment requirements here?  (If I gave it an input which is, say, 1361x1, would that work?)

> +
> +    return 0;
> +}
> +
> +static int vaapi_encode_av1_add_obu(AVCodecContext *avctx,
> +                                    CodedBitstreamFragment *au,
> +                                    uint8_t type,
> +                                    void *obu_unit)
> +{
> +    int ret;
> +
> +    ret = ff_cbs_insert_unit_content(au, -1,
> +                                     type, obu_unit, NULL);
> +    if (ret < 0) {
> +        av_log(avctx, AV_LOG_ERROR, "Failed to add OBU unit: "
> +               "type = %d.\n", type);
> +        return ret;
> +    }
> +
> +    return 0;
> +}
> +
> +static int vaapi_encode_av1_write_obu(AVCodecContext *avctx,
> +                                      char *data, size_t *data_len,
> +                                      CodedBitstreamFragment *bs)
> +{
> +    VAAPIEncodeAV1Context *priv = avctx->priv_data;
> +    int ret;
> +
> +    ret = ff_cbs_write_fragment_data(priv->cbc, bs);
> +    if (ret < 0) {
> +        av_log(avctx, AV_LOG_ERROR, "Failed to write packed header.\n");
> +        return ret;
> +    }
> +
> +    if ((size_t)8 * MAX_PARAM_BUFFER_SIZE < 8 * bs->data_size - bs->data_bit_padding) {
> +        av_log(avctx, AV_LOG_ERROR, "Access unit too large: "
> +               "%zu < %zu.\n", (size_t)8 * MAX_PARAM_BUFFER_SIZE,
> +               8 * bs->data_size - bs->data_bit_padding);
> +        return AVERROR(ENOSPC);
> +    }
> +
> +    memcpy(data, bs->data, bs->data_size);
> +    *data_len = 8 * bs->data_size - bs->data_bit_padding;
> +
> +    return 0;
> +}
> +
> +static int get_relative_dist(const AV1RawSequenceHeader *seq,
> +                             unsigned int a, unsigned int b)
> +{
> +    unsigned int diff, m;
> +    if (!seq->enable_order_hint)
> +        return 0;
> +    diff = a - b;
> +    m = 1 << seq->order_hint_bits_minus_1;
> +    diff = (diff & (m - 1)) - (diff & m);
> +    return diff;
> +}
> +
> +static int write_ns(PutBitContext *pbc, uint32_t n, uint32_t value)
> +{
> +    uint32_t w, m, v, extra_bit;
> +
> +    w = av_log2(n) + 1;
> +    m = (1 << w) - n;
> +
> +    if (value < m) {
> +        v = value;
> +        put_bits(pbc, w - 1, v);
> +    } else {
> +        v = m + ((value - m) >> 1);
> +        extra_bit = (value - m) & 1;
> +        put_bits(pbc, w - 1, v);
> +        put_bits(pbc, 1, extra_bit);
> +    }
> +
> +    return 0;
> +}
> +
> +/**
> + * This function provides a minimal implementation covering only the features
> + * currently enabled in the frame header. If more features are enabled in the
> + * future, make sure their corresponding flags are packed correctly into the
> + * frame header OBU here.
> + */
> +static int vaapi_encode_av1_write_frame_header(AVCodecContext *avctx,
> +                                               VAAPIEncodePicture *pic,
> +                                               char *data, size_t *data_len)
> +{
> +    VAAPIEncodeContext              *ctx = avctx->priv_data;
> +    VAAPIEncodeAV1Context          *priv = avctx->priv_data;
> +    AV1RawOBU                    *fh_obu = &priv->fh;
> +    AV1RawOBU                    *sh_obu = &priv->sh;
> +    AV1RawFrameHeader                *fh = &fh_obu->obu.frame.header;
> +    AV1RawSequenceHeader             *sh = &sh_obu->obu.sequence_header;
> +    VAEncPictureParameterBufferAV1 *vpic = pic->codec_picture_params;
> +    PutBitContext pbc, pbc_tmp;
> +    uint8_t byte;
> +    int qindex, coded_lossless;
> +    int id_len, frame_is_intra, skip_mode_allowed;
> +    int start, payload_bits, obu_size, obu_size_len;
> +    int qindex_offset, loopfilter_offset;
> +    int cdef_start_offset, cdef_end_offset;
> +    int i;
> +
> +    init_put_bits(&pbc, data, MAX_PARAM_BUFFER_SIZE);
> +
> +    /** obu header */
> +    put_bits(&pbc, 1, fh_obu->header.obu_forbidden_bit);
> +    put_bits(&pbc, 4, fh_obu->header.obu_type);
> +    put_bits(&pbc, 1, fh_obu->header.obu_extension_flag);
> +    put_bits(&pbc, 1, fh_obu->header.obu_has_size_field);
> +    put_bits(&pbc, 1, fh_obu->header.obu_reserved_1bit);
> +
> +    /** record pbc status to re-write obu size later. */
> +    if (fh_obu->header.obu_has_size_field) {
> +        pbc_tmp = pbc;
> +        put_bits32(&pbc, 0);
> +        put_bits32(&pbc, 0);
> +    }
> +
> +    start = put_bits_count(&pbc);
> +
> +    /** uncompressed_header() */
> +    if (sh->frame_id_numbers_present_flag)
> +        id_len = sh->additional_frame_id_length_minus_1 +
> +                 sh->delta_frame_id_length_minus_2 + 3;
> +
> +    frame_is_intra = (fh->frame_type == AV1_FRAME_KEY ||
> +                      fh->frame_type == AV1_FRAME_INTRA_ONLY);
> +    if (!sh->reduced_still_picture_header) {
> +        put_bits(&pbc, 1, fh->show_existing_frame);
> +        if (fh->show_existing_frame) {
> +            put_bits(&pbc, 3, fh->frame_to_show_map_idx);
> +            goto trailing_bits;
> +        }
> +        if (sh->frame_id_numbers_present_flag)
> +            put_bits(&pbc, id_len, fh->display_frame_id);
> +
> +        put_bits(&pbc, 2, fh->frame_type);
> +        put_bits(&pbc, 1, fh->show_frame);
> +        if (!fh->show_frame)
> +            put_bits(&pbc, 1, fh->showable_frame);
> +
> +        if (!(fh->frame_type == AV1_FRAME_SWITCH ||
> +            (fh->frame_type == AV1_FRAME_KEY && fh->show_frame)))
> +            put_bits(&pbc, 1, fh->error_resilient_mode);
> +    }
> +
> +    put_bits(&pbc, 1, fh->disable_cdf_update);
> +    if (sh->seq_force_screen_content_tools == AV1_SELECT_SCREEN_CONTENT_TOOLS)
> +        put_bits(&pbc, 1, fh->allow_screen_content_tools);
> +
> +    if (fh->allow_screen_content_tools && sh->seq_force_integer_mv == AV1_SELECT_INTEGER_MV)
> +        put_bits(&pbc, 1, fh->force_integer_mv);
> +
> +    if (sh->frame_id_numbers_present_flag)
> +        put_bits(&pbc, id_len, fh->current_frame_id);
> +
> +    if (fh->frame_type != AV1_FRAME_SWITCH && !sh->reduced_still_picture_header)
> +        put_bits(&pbc, 1, fh->frame_size_override_flag);
> +
> +    if (sh->enable_order_hint)
> +        put_bits(&pbc, sh->order_hint_bits_minus_1 + 1, fh->order_hint);
> +
> +    if (!(frame_is_intra || fh->error_resilient_mode))
> +        put_bits(&pbc, 3, fh->primary_ref_frame);
> +
> +    if (!(fh->frame_type == AV1_FRAME_SWITCH ||
> +        fh->frame_type == AV1_FRAME_KEY && fh->show_frame))
> +        put_bits(&pbc, 8, fh->refresh_frame_flags);
> +
> +    if (frame_is_intra) {
> +        /** render_size() */
> +         put_bits(&pbc, 1, fh->render_and_frame_size_different);
> +    } else {
> +        if (!frame_is_intra && sh->enable_order_hint)
> +            put_bits(&pbc, 1, fh->frame_refs_short_signaling);
> +
> +        for (i = 0; i < AV1_REFS_PER_FRAME; i++) {
> +            if (!fh->frame_refs_short_signaling)
> +                put_bits(&pbc, 3, fh->ref_frame_idx[i]);
> +        }
> +
> +        if (!(fh->frame_size_override_flag && !fh->error_resilient_mode))
> +            put_bits(&pbc, 1, fh->render_and_frame_size_different);
> +
> +        if (!fh->force_integer_mv)
> +            put_bits(&pbc, 1, fh->allow_high_precision_mv);
> +
> +        /** read_interpolation_filter() */
> +        put_bits(&pbc, 1, fh->is_filter_switchable);
> +        if (!fh->is_filter_switchable)
> +            put_bits(&pbc, 2, fh->interpolation_filter);
> +
> +        put_bits(&pbc, 1, fh->is_motion_mode_switchable);
> +    }
> +
> +    if (!(sh->reduced_still_picture_header || fh->disable_cdf_update))
> +        put_bits(&pbc, 1, fh->disable_frame_end_update_cdf);
> +
> +    /** tile_info() */
> +    put_bits(&pbc, 1, fh->uniform_tile_spacing_flag);
> +    if (fh->uniform_tile_spacing_flag) {
> +        for (i = 0; i < priv->tile_cols_log2 - priv->min_log2_tile_cols; i++) {
> +            put_bits(&pbc, 1, 1);
> +        }
> +        if (priv->tile_cols_log2 < priv->max_log2_tile_cols)
> +            put_bits(&pbc, 1, 0);
> +
> +        for (i = 0; i < priv->tile_rows_log2 - priv->min_log2_tile_rows; i++) {
> +            put_bits(&pbc, 1, 1);
> +        }
> +        if (priv->tile_rows_log2 < priv->max_log2_tile_rows)
> +            put_bits(&pbc, 1, 0);
> +    } else {
> +        int start_sb = 0;
> +        int max_width, max_height;
> +        for (i = 0; start_sb < priv->sb_cols; i++) {
> +            max_width = FFMIN(priv->sb_cols - start_sb, priv->max_tile_width_sb);
> +            write_ns(&pbc, max_width, fh->width_in_sbs_minus_1[i]);
> +            start_sb += fh->width_in_sbs_minus_1[i] + 1;
> +        }
> +
> +        start_sb = 0;
> +        for (i = 0; start_sb < priv->sb_rows; i++) {
> +            max_height = FFMIN(priv->sb_rows - start_sb, priv->max_tile_height_sb);
> +            write_ns(&pbc, max_height, fh->height_in_sbs_minus_1[i]);
> +            start_sb += fh->height_in_sbs_minus_1[i] + 1;
> +        }
> +    }
> +
> +    if (priv->tile_cols_log2 || priv->tile_rows_log2) {
> +        put_bits(&pbc, priv->tile_cols_log2 + priv->tile_rows_log2, fh->context_update_tile_id);
> +        put_bits(&pbc, 2, fh->tile_size_bytes_minus1);
> +    }
> +
> +    qindex_offset = put_bits_count(&pbc);
> +    /** quantization_params() */
> +    put_bits(&pbc, 8, fh->base_q_idx);
> +    put_bits(&pbc, 1, fh->delta_q_y_dc);
> +    put_bits(&pbc, 1, fh->delta_q_u_dc);
> +    put_bits(&pbc, 1, fh->delta_q_u_ac);
> +    put_bits(&pbc, 1, fh->using_qmatrix);
> +
> +    /** segmentation_params() */
> +    put_bits(&pbc, 1, fh->segmentation_enabled);
> +
> +    /** delta_q_params() */
> +    if (fh->base_q_idx)
> +        put_bits(&pbc, 1, fh->delta_q_present);
> +    if (fh->delta_q_present)
> +        put_bits(&pbc, 2, fh->delta_q_res);
> +
> +    /** delta_lf_params() */
> +    if (fh->delta_q_present) {
> +        if (!fh->allow_intrabc)
> +            put_bits(&pbc, 1, fh->delta_lf_present);
> +        if (fh->delta_lf_present) {
> +            put_bits(&pbc, 2, fh->delta_lf_res);
> +            put_bits(&pbc, 1, fh->delta_lf_multi);
> +        }
> +    }
> +
> +    /** codelossless */
> +    coded_lossless = 1;
> +    for (i = 0; i < AV1_MAX_SEGMENTS; i++) {
> +        if (fh->segmentation_enabled && fh->feature_enabled[i][AV1_SEG_LVL_ALT_Q])
> +            qindex = fh->base_q_idx + fh->feature_value[i][AV1_SEG_LVL_ALT_Q];
> +        else
> +            qindex = fh->base_q_idx;
> +        qindex = av_clip_uintp2(qindex, 8);
> +
> +        if (qindex || fh->delta_q_y_dc || fh->delta_q_u_ac || fh->delta_q_u_dc ||
> +            fh->delta_q_v_ac || fh->delta_q_v_dc)
> +            coded_lossless = 0;
> +    }
> +
> +    loopfilter_offset = put_bits_count(&pbc);
> +    /** loop_filter_params() */
> +    if (!(coded_lossless || fh->allow_intrabc)) {
> +        put_bits(&pbc, 6, fh->loop_filter_level[0]);
> +        put_bits(&pbc, 6, fh->loop_filter_level[1]);
> +        if (fh->loop_filter_level[0] || fh->loop_filter_level[1]) {
> +            put_bits(&pbc, 6, fh->loop_filter_level[2]);
> +            put_bits(&pbc, 6, fh->loop_filter_level[3]);
> +        }
> +        put_bits(&pbc, 3, fh->loop_filter_sharpness);
> +        put_bits(&pbc, 1, fh->loop_filter_delta_enabled);
> +    }
> +
> +    cdef_start_offset = put_bits_count(&pbc);
> +    /** cdef_params() */
> +    if (!(coded_lossless || fh->allow_intrabc || !sh->enable_cdef)) {
> +        put_bits(&pbc, 2, fh->cdef_damping_minus_3);
> +        put_bits(&pbc, 2, fh->cdef_bits);
> +        for (i = 0; i < (1 << fh->cdef_bits); i++) {
> +            put_bits(&pbc, 4, fh->cdef_y_pri_strength[i]);
> +            put_bits(&pbc, 2, fh->cdef_y_sec_strength[i]);
> +            put_bits(&pbc, 4, fh->cdef_uv_pri_strength[i]);
> +            put_bits(&pbc, 2, fh->cdef_uv_sec_strength[i]);
> +        }
> +    }
> +    cdef_end_offset = put_bits_count(&pbc);
> +
> +    /** read_tx_mode() */
> +    if (fh->tx_mode == AV1_TX_MODE_SELECT)
> +        put_bits(&pbc, 1, 1);
> +    else
> +        put_bits(&pbc, 1, 0);
> +
> +    /** frame_reference_mode() */
> +    if (!frame_is_intra)
> +        put_bits(&pbc, 1, fh->reference_select);
> +
> +    /** skip_mode_params() */
> +    if (frame_is_intra || !fh->reference_select || !sh->enable_order_hint)
> +        skip_mode_allowed = 0;
> +    else {
> +        int forward_idx, backward_idx;
> +        int ref_hint, forward_hint, backward_hint;
> +
> +        forward_idx = -1;
> +        backward_idx = -1;
> +
> +        for (i = 0; i < AV1_REFS_PER_FRAME; i++) {
> +            ref_hint = fh->ref_order_hint[fh->ref_frame_idx[i]];
> +            if (get_relative_dist(sh, ref_hint, fh->order_hint) < 0) {
> +                if (forward_idx < 0 || get_relative_dist(sh, ref_hint, forward_idx) > 0) {
> +                    forward_idx = i;
> +                    forward_hint = ref_hint;
> +                }
> +            } else if (get_relative_dist(sh, ref_hint, fh->order_hint) > 0) {
> +                if (backward_idx < 0 || get_relative_dist(sh, ref_hint, backward_hint) < 0) {
> +                    backward_idx = i;
> +                    backward_hint = ref_hint;
> +                }
> +            }
> +        }
> +        if (forward_idx < 0)
> +            skip_mode_allowed = 0;
> +        else if (backward_idx >= 0)
> +            skip_mode_allowed = 1;
> +        else {
> +            int second_forward_idx, second_forward_hint;
> +            second_forward_idx = -1;
> +            for (i = 0; i < AV1_REFS_PER_FRAME; i++) {
> +                ref_hint = fh->ref_order_hint[fh->ref_frame_idx[i]];
> +                if (get_relative_dist(sh, ref_hint, forward_hint) < 0) {
> +                    if (second_forward_idx < 0 || get_relative_dist(sh, ref_hint, second_forward_hint) > 0){
> +                        second_forward_idx = i;
> +                        second_forward_hint = ref_hint;
> +                    }
> +                }
> +            }
> +            if (second_forward_idx < 0)
> +                skip_mode_allowed = 0;
> +            else
> +                skip_mode_allowed = 1;
> +        }
> +    }
> +
> +    if (skip_mode_allowed)
> +        put_bits(&pbc, 1, fh->skip_mode_present);
> +
> +    put_bits(&pbc, 1, fh->reduced_tx_set);
> +
> +    /** global_motion_params() */
> +    if (!frame_is_intra) {
> +        for (i = AV1_REF_FRAME_LAST; i <= AV1_REF_FRAME_ALTREF; i++) {
> +            put_bits(&pbc, 1, fh->is_global[i]);
> +            if (fh->is_global[i]) {
> +                put_bits(&pbc, 1, fh->is_rot_zoom[i]);
> +                if (!fh->is_rot_zoom[i])
> +                    put_bits(&pbc, 1, fh->is_translation[i]);
> +            }
> +        }
> +    }
> +
> +trailing_bits:
> +    payload_bits = put_bits_count(&pbc) - start;
> +
> +    /** trailing_bits() */
> +    put_bits(&pbc, 1, 1);
> +    obu_size = (put_bits_count(&pbc) - start + 7) / 8;
> +    for (i = 0; i < obu_size * 8 - payload_bits - 1; i++)
> +        put_bits(&pbc, 1, 0);
> +
> +    flush_put_bits(&pbc);
> +    *data_len = put_bits_count(&pbc);
> +
> +    /** update obu size in bitstream */
> +    if (fh_obu->header.obu_has_size_field) {
> +        obu_size_len = priv->attr_ext2.bits.obu_size_bytes_minus1 + 1;
> +        for (i = 0; i < obu_size_len; i++) {
> +            byte = obu_size >> (7 * i) & 0x7f;
> +            if (i < obu_size_len - 1)
> +                byte |= 0x80;
> +            put_bits(&pbc_tmp, 8, byte);
> +        }
> +        flush_put_bits(&pbc_tmp);
> +        memmove(pbc_tmp.buf_ptr, pbc_tmp.buf_ptr + (8 - obu_size_len), obu_size);
> +        *data_len -= (8 - obu_size_len) * 8;
> +    }

Why is there an incomplete duplicate of the cbs_av1 header writing code here?

> +
> +    if (fh->show_existing_frame)
> +        return 0;
> +
> +    if (!(ctx->va_rc_mode & VA_RC_CQP)) {
> +        vpic->min_base_qindex = av_clip(avctx->qmin, 1, AV1_MAX_QUANT);
> +        vpic->max_base_qindex = av_clip(avctx->qmax, 1, AV1_MAX_QUANT);
> +
> +        vpic->bit_offset_qindex            = qindex_offset - (8 - obu_size_len) * 8;
> +        vpic->bit_offset_loopfilter_params = loopfilter_offset - (8 - obu_size_len) * 8;
> +        vpic->bit_offset_cdef_params       = cdef_start_offset - (8 - obu_size_len) * 8;
> +        vpic->size_in_bits_cdef_params     = cdef_end_offset - cdef_start_offset;
> +        vpic->size_in_bits_frame_hdr_obu   = *data_len;
> +
> +        vpic->byte_offset_frame_hdr_obu_size = (((pic->type == PICTURE_TYPE_IDR) ?
> +                                               priv->sh_data_len / 8 : 0) +
> +                                               (fh_obu->header.obu_extension_flag ?
> +                                               2 : 1));
> +    }
> +
> +    return 0;
> +}
> +
> +static int tile_log2(int blkSize, int target) {
> +    int k;
> +    for (k = 0; (blkSize << k) < target; k++);
> +    return k;
> +}
> +
> +static int vaapi_encode_av1_set_tile(AVCodecContext *avctx)
> +{
> +    VAAPIEncodeAV1Context *priv = avctx->priv_data;
> +    int mi_cols, mi_rows, sb_shift, sb_size;
> +    int max_tile_area_sb, max_tile_area_sb_varied;
> +    int tile_width_sb, tile_height_sb, widest_tile_sb;
> +    int min_log2_tiles;
> +    int tile_rows_tmp, i;
> +
> +    if (priv->tile_cols > AV1_MAX_TILE_COLS ||
> +        priv->tile_rows > AV1_MAX_TILE_ROWS) {
> +        av_log(avctx, AV_LOG_ERROR, "Invalid tile number %dx%d, should less than %dx%d.\n",
> +               priv->tile_cols, priv->tile_rows, AV1_MAX_TILE_COLS, AV1_MAX_TILE_ROWS);

Can't this be a constraint on the option rather than a special check here?
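
E.g. with separate integer options (rather than the single "AxB" option) the range check comes for free; sketch only, assuming the usual OFFSET/FLAGS macros:

    { "tile_cols", "Number of tile columns",
      OFFSET(tile_cols), AV_OPT_TYPE_INT,
      { .i64 = 1 }, 1, AV1_MAX_TILE_COLS, FLAGS },
    { "tile_rows", "Number of tile rows",
      OFFSET(tile_rows), AV_OPT_TYPE_INT,
      { .i64 = 1 }, 1, AV1_MAX_TILE_ROWS, FLAGS },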

> +        return AVERROR(EINVAL);
> +    }
> +
> +    mi_cols = 2 * ((avctx->width + 7) >> 3);
> +    mi_rows = 2 * ((avctx->height + 7) >> 3);
> +    priv->sb_cols = priv->use_128x128_superblock ?
> +                    ((mi_cols + 31) >> 5) : ((mi_cols + 15) >> 4);
> +    priv->sb_rows = priv->use_128x128_superblock ?
> +                    ((mi_rows + 31) >> 5) : ((mi_rows + 15) >> 4);
> +    sb_shift = priv->use_128x128_superblock ? 5 : 4;
> +    sb_size  = sb_shift + 2;
> +    priv->max_tile_width_sb = AV1_MAX_TILE_WIDTH >> sb_size;
> +    max_tile_area_sb = AV1_MAX_TILE_AREA  >> (2 * sb_size);
> +
> +    priv->min_log2_tile_cols = tile_log2(priv->max_tile_width_sb, priv->sb_cols);
> +    priv->max_log2_tile_cols = tile_log2(1, FFMIN(priv->sb_cols, AV1_MAX_TILE_COLS));
> +    priv->max_log2_tile_rows = tile_log2(1, FFMIN(priv->sb_rows, AV1_MAX_TILE_ROWS));
> +    min_log2_tiles = FFMAX(priv->min_log2_tile_cols,
> +                           tile_log2(max_tile_area_sb, priv->sb_rows * priv->sb_cols));
> +
> +    if (priv->tile_cols != av_clip(priv->tile_cols, (priv->sb_cols + priv->max_tile_width_sb - 1) / priv->max_tile_width_sb, priv->sb_cols)) {
> +        priv->tile_cols = av_clip(priv->tile_cols, (priv->sb_cols + priv->max_tile_width_sb - 1) / priv->max_tile_width_sb, priv->sb_cols);
> +        av_log(avctx, AV_LOG_WARNING, "Invalid tile cols, correct to %d.\n", priv->tile_cols);
> +    }

If the user explicitly asked for a number which isn't possible then you probably want to fail here (see comment about some sort of "auto" setting above).
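
E.g. with an "auto" (0) default it could look roughly like this (untested):

    int min_cols = (priv->sb_cols + priv->max_tile_width_sb - 1) /
                   priv->max_tile_width_sb;

    if (!priv->tile_cols) {
        /* auto: use as few tile columns as the frame width allows. */
        priv->tile_cols = min_cols;
    } else if (priv->tile_cols < min_cols) {
        av_log(avctx, AV_LOG_ERROR, "%d tile columns requested, but the frame "
               "needs at least %d.\n", priv->tile_cols, min_cols);
        return AVERROR(EINVAL);
    }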

> +
> +    priv->tile_cols_log2 = tile_log2(1, priv->tile_cols);
> +    tile_width_sb = (priv->sb_cols + (1 << priv->tile_cols_log2) - 1) >>
> +                    priv->tile_cols_log2;
> +
> +    if (priv->tile_rows > priv->sb_rows) {
> +        priv->tile_rows = priv->sb_rows;
> +        av_log(avctx, AV_LOG_WARNING, "Invalid tile rows, correct to %d.\n", priv->tile_rows);
> +    }
> +
> +    for (tile_rows_tmp = priv->tile_rows; tile_rows_tmp <= priv->sb_rows && tile_rows_tmp <= AV1_MAX_TILE_ROWS; tile_rows_tmp++) {
> +        /** Try a uniform tile layout. */
> +        priv->tile_rows_log2 = tile_log2(1, tile_rows_tmp);
> +        if ((priv->sb_cols + tile_width_sb - 1) / tile_width_sb == priv->tile_cols) {
> +            for (i = 0; i < priv->tile_cols - 1; i++)
> +                priv->width_in_sbs_minus_1[i] = tile_width_sb - 1;
> +            priv->width_in_sbs_minus_1[i] = priv->sb_cols - (priv->tile_cols - 1) * tile_width_sb - 1;
> +
> +            tile_height_sb = (priv->sb_rows + (1 << priv->tile_rows_log2) - 1) >>
> +                             priv->tile_rows_log2;
> +
> +            if ((priv->sb_rows + tile_height_sb - 1) / tile_height_sb == tile_rows_tmp &&
> +                tile_height_sb <= max_tile_area_sb / tile_width_sb) {
> +                for (i = 0; i < tile_rows_tmp - 1; i++)
> +                    priv->height_in_sbs_minus_1[i] = tile_height_sb - 1;
> +                priv->height_in_sbs_minus_1[i] = priv->sb_rows - (tile_rows_tmp - 1) * tile_height_sb - 1;
> +
> +                priv->uniform_tile = 1;
> +                priv->min_log2_tile_rows = FFMAX(min_log2_tiles - priv->tile_cols_log2, 0);
> +
> +                break;
> +            }
> +        }
> +
> +        /** Try a non-uniform tile layout. */
> +        widest_tile_sb = 0;
> +        for (i = 0; i < priv->tile_cols; i++) {
> +            priv->width_in_sbs_minus_1[i] = (i + 1) * priv->sb_cols / priv->tile_cols - i * priv->sb_cols / priv->tile_cols - 1;
> +            widest_tile_sb = FFMAX(widest_tile_sb, priv->width_in_sbs_minus_1[i] + 1);
> +        }
> +
> +        if (min_log2_tiles)
> +            max_tile_area_sb_varied = (priv->sb_rows * priv->sb_cols) >> (min_log2_tiles + 1);
> +        else
> +            max_tile_area_sb_varied = priv->sb_rows * priv->sb_cols;
> +        priv->max_tile_height_sb = FFMAX(1, max_tile_area_sb_varied / widest_tile_sb);
> +
> +        if (tile_rows_tmp == av_clip(tile_rows_tmp, (priv->sb_rows + priv->max_tile_height_sb - 1) / priv->max_tile_height_sb, priv->sb_rows)) {
> +            for (i = 0; i < tile_rows_tmp; i++)
> +                priv->height_in_sbs_minus_1[i] = (i + 1) * priv->sb_rows / tile_rows_tmp - i * priv->sb_rows / tile_rows_tmp - 1;
> +
> +            break;
> +        }
> +    }
> +
> +    if (priv->tile_rows != tile_rows_tmp) {
> +        priv->tile_rows = tile_rows_tmp;
> +        av_log(avctx, AV_LOG_WARNING, "Invalid tile rows, correct to %d.\n", priv->tile_rows);
> +    }
> +
> +    /** check if tile cols/rows is supported by driver. */
> +    if (priv->attr_ext2.bits.max_tile_num_minus1) {
> +        if ((priv->tile_cols * priv->tile_rows - 1) > priv->attr_ext2.bits.max_tile_num_minus1) {
> +            av_log(avctx, AV_LOG_ERROR, "Unsupported tile num %d * %d = %d by driver, "
> +                   "should be less than %d.\n", priv->tile_cols, priv->tile_rows,

s/less than/at most/ ?

> +                   priv->tile_cols * priv->tile_rows,
> +                   priv->attr_ext2.bits.max_tile_num_minus1 + 1);
> +            return AVERROR(EINVAL);
> +        }
> +    }
> +    av_log(avctx, AV_LOG_DEBUG, "Setting tile cols/rows to %d/%d.\n",
> +           priv->tile_cols, priv->tile_rows);
> +
> +    /** Check that the number of tile groups is valid. */
> +    if (priv->tile_groups > priv->tile_cols * priv->tile_rows) {
> +        av_log(avctx, AV_LOG_WARNING, "Invalid number of tile groups %d, "
> +               "corrected to %d.\n", priv->tile_groups, priv->tile_cols * priv->tile_rows);
> +        priv->tile_groups = priv->tile_cols * priv->tile_rows;
> +    }
> +
> +    return 0;
> +}
> +
> +static int vaapi_encode_av1_write_sequence_header(AVCodecContext *avctx,
> +                                                  char *data, size_t *data_len)
> +{
> +    VAAPIEncodeAV1Context *priv = avctx->priv_data;
> +
> +    memcpy(data, &priv->sh_data, MAX_PARAM_BUFFER_SIZE * sizeof(char));
> +    *data_len = priv->sh_data_len;

I'm confused why you write the sequence header in the init function and then store it to use here?

> +
> +    return 0;
> +}
> +
> +static int vaapi_encode_av1_init_sequence_params(AVCodecContext *avctx)
> +{
> +    VAAPIEncodeContext               *ctx = avctx->priv_data;
> +    VAAPIEncodeAV1Context           *priv = avctx->priv_data;
> +    AV1RawOBU                     *sh_obu = &priv->sh;
> +    AV1RawSequenceHeader              *sh = &sh_obu->obu.sequence_header;
> +    VAEncSequenceParameterBufferAV1 *vseq = ctx->codec_sequence_params;
> +    CodedBitstreamFragment           *obu = &priv->current_obu;
> +    const AVPixFmtDescriptor *desc;
> +    int ret;
> +
> +    memset(sh_obu, 0, sizeof(*sh_obu));
> +    sh_obu->header.obu_type = AV1_OBU_SEQUENCE_HEADER;
> +
> +    desc = av_pix_fmt_desc_get(priv->common.input_frames->sw_format);
> +    av_assert0(desc);
> +
> +    sh->seq_profile  = avctx->profile;
> +    if (!sh->seq_force_screen_content_tools)
> +        sh->seq_force_integer_mv = AV1_SELECT_INTEGER_MV;
> +    sh->frame_width_bits_minus_1  = av_log2(avctx->width);
> +    sh->frame_height_bits_minus_1 = av_log2(avctx->height);
> +    sh->max_frame_width_minus_1   = avctx->width - 1;
> +    sh->max_frame_height_minus_1  = avctx->height - 1;
> +    sh->seq_tier[0]               = priv->tier;
> +    /** enable order hint and reserve maximum 8 bits for it by default. */
> +    sh->enable_order_hint         = 1;
> +    sh->order_hint_bits_minus_1   = 7;
> +
> +    sh->color_config = (AV1RawColorConfig) {
> +        .high_bitdepth                  = desc->comp[0].depth == 8 ? 0 : 1,
> +        .color_primaries                = avctx->color_primaries,
> +        .transfer_characteristics       = avctx->color_trc,
> +        .matrix_coefficients            = avctx->colorspace,
> +        .color_description_present_flag = (avctx->color_primaries != AVCOL_PRI_UNSPECIFIED ||
> +                                           avctx->color_trc       != AVCOL_TRC_UNSPECIFIED ||
> +                                           avctx->colorspace      != AVCOL_SPC_UNSPECIFIED),
> +        .color_range                    = avctx->color_range == AVCOL_RANGE_JPEG,
> +        .subsampling_x                  = desc->log2_chroma_w,
> +        .subsampling_y                  = desc->log2_chroma_h,

.chroma_sample_position = some function of chroma_sample_location.
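
Something along these lines, I think (only meaningful for 4:2:0 streams; the AV1_CSP_* values are the ones in libavcodec/av1.h):

    switch (avctx->chroma_sample_location) {
    case AVCHROMA_LOC_LEFT:
        sh->color_config.chroma_sample_position = AV1_CSP_VERTICAL;
        break;
    case AVCHROMA_LOC_TOPLEFT:
        sh->color_config.chroma_sample_position = AV1_CSP_COLOCATED;
        break;
    default:
        sh->color_config.chroma_sample_position = AV1_CSP_UNKNOWN;
        break;
    }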

> +    };
> +
> +    if (avctx->level != FF_LEVEL_UNKNOWN) {
> +        sh->seq_level_idx[0] = avctx->level;
> +    } else {
> +        const AV1LevelDescriptor *level;
> +        float framerate;
> +
> +        if (avctx->framerate.num > 0 && avctx->framerate.den > 0)
> +            framerate = (float)avctx->framerate.num / avctx->framerate.den;
> +        else
> +            framerate = 0;
> +
> +        level = ff_av1_guess_level(avctx->bit_rate, priv->tier,
> +                                   ctx->surface_width, ctx->surface_height,
> +                                   priv->tile_rows * priv->tile_cols,
> +                                   priv->tile_cols, framerate);
> +        if (level) {
> +            av_log(avctx, AV_LOG_VERBOSE, "Using level %s.\n", level->name);
> +            sh->seq_level_idx[0] = level->level_idx;
> +        } else {
> +            av_log(avctx, AV_LOG_VERBOSE, "Stream will not conform to "
> +                   "any normal level, using level 6.3 by default.\n");
> +            sh->seq_level_idx[0] = 19;

Isn't this exactly what level_idx == 31 is for?

> +            sh->seq_tier[0] = 1;
> +        }
> +    }
> +    vseq->seq_profile             = sh->seq_profile;
> +    vseq->seq_level_idx           = sh->seq_level_idx[0];
> +    vseq->seq_tier                = sh->seq_tier[0];
> +    vseq->order_hint_bits_minus_1 = sh->order_hint_bits_minus_1;
> +    vseq->intra_period            = ctx->gop_size;
> +    vseq->ip_period               = ctx->b_per_p + 1;
> +
> +    vseq->seq_fields.bits.enable_order_hint = sh->enable_order_hint;
> +
> +    if (!(ctx->va_rc_mode & VA_RC_CQP)) {
> +        vseq->bits_per_second = ctx->va_bit_rate;
> +        vseq->seq_fields.bits.enable_cdef = sh->enable_cdef = 1;
> +    }
> +
> +    ret = vaapi_encode_av1_add_obu(avctx, obu, AV1_OBU_SEQUENCE_HEADER, &priv->sh);
> +    if (ret < 0)
> +        goto end;
> +
> +    ret = vaapi_encode_av1_write_obu(avctx, priv->sh_data, &priv->sh_data_len, obu);
> +    if (ret < 0)
> +        goto end;
> +
> +end:
> +    ff_cbs_fragment_reset(obu);
> +    return ret;
> +}
> +
> +static int vaapi_encode_av1_init_picture_params(AVCodecContext *avctx,
> +                                                VAAPIEncodePicture *pic)
> +{
> +    VAAPIEncodeAV1Context          *priv = avctx->priv_data;
> +    VAAPIEncodeAV1Picture          *hpic = pic->priv_data;
> +    AV1RawOBU                    *fh_obu = &priv->fh;
> +    AV1RawFrameHeader                *fh = &fh_obu->obu.frame.header;
> +    VAEncPictureParameterBufferAV1 *vpic = pic->codec_picture_params;
> +    CodedBitstreamFragment          *obu = &priv->current_obu;
> +    VAAPIEncodePicture    *ref;
> +    VAAPIEncodeAV1Picture *href;
> +    int slot, i;
> +    int ret;
> +    static const int8_t default_loop_filter_ref_deltas[AV1_TOTAL_REFS_PER_FRAME] =
> +        { 1, 0, 0, 0, -1, 0, -1, -1 };
> +
> +    memset(fh_obu, 0, sizeof(*fh_obu));
> +    pic->nb_slices = priv->tile_groups;
> +    fh_obu->header.obu_type = AV1_OBU_FRAME_HEADER;
> +    fh_obu->header.obu_has_size_field = 1;
> +
> +    switch (pic->type) {
> +    case PICTURE_TYPE_IDR:
> +        av_assert0(pic->nb_refs[0] == 0 || pic->nb_refs[1]);
> +        fh->frame_type = AV1_FRAME_KEY;
> +        fh->refresh_frame_flags = 0xFF;
> +        fh->base_q_idx = priv->q_idx_idr;
> +        hpic->slot = 0;
> +        hpic->last_idr_frame = pic->display_order;
> +        break;
> +    case PICTURE_TYPE_P:
> +        av_assert0(pic->nb_refs[0]);
> +        fh->frame_type = AV1_FRAME_INTER;
> +        fh->base_q_idx = priv->q_idx_p;
> +        ref = pic->refs[0][pic->nb_refs[0] - 1];
> +        href = ref->priv_data;
> +        hpic->slot = !href->slot;

I'm very confused how all of this multiple-reference stuff can work if you only ever put pictures in slots 0 and 1.

What does it actually look like?  Can you share a stream using it?

> +        hpic->last_idr_frame = href->last_idr_frame;
> +        fh->refresh_frame_flags = 1 << hpic->slot;
> +
> +        /** Use the nearest frame in L0 as all reference frames. */
> +        for (i = 0; i < AV1_REFS_PER_FRAME; i++) {
> +            fh->ref_frame_idx[i] = href->slot;
> +        }
> +        fh->primary_ref_frame = href->slot;
> +        fh->ref_order_hint[href->slot] = ref->display_order - href->last_idr_frame;
> +        vpic->ref_frame_ctrl_l0.fields.search_idx0 = AV1_REF_FRAME_LAST;
> +
> +        /** Use the second-nearest frame in L0 as the Golden frame. */
> +        if (pic->nb_refs[0] > 1) {
> +            ref = pic->refs[0][pic->nb_refs[0] - 2];
> +            href = ref->priv_data;
> +            fh->ref_frame_idx[3] = href->slot;
> +            fh->ref_order_hint[href->slot] = ref->display_order - href->last_idr_frame;
> +            vpic->ref_frame_ctrl_l0.fields.search_idx1 = AV1_REF_FRAME_GOLDEN;
> +        }
> +        break;
> +    case PICTURE_TYPE_B:
> +        av_assert0(pic->nb_refs[0] && pic->nb_refs[1]);
> +        fh->frame_type = AV1_FRAME_INTER;
> +        fh->base_q_idx = priv->q_idx_b;
> +        fh->refresh_frame_flags = 0x0;
> +        fh->reference_select = 1;
> +
> +        /** B frame will not be referenced, disable its recon frame. */
> +        vpic->picture_flags.bits.disable_frame_recon = 1;

It feels like it would be very easy to support all of the B structure just like VP9 does here.

> +
> +        /** Use LAST_FRAME and BWDREF_FRAME for reference. */
> +        vpic->ref_frame_ctrl_l0.fields.search_idx0 = AV1_REF_FRAME_LAST;
> +        vpic->ref_frame_ctrl_l1.fields.search_idx0 = AV1_REF_FRAME_BWDREF;
> +
> +        ref                            = pic->refs[0][pic->nb_refs[0] - 1];
> +        href                           = ref->priv_data;
> +        hpic->last_idr_frame           = href->last_idr_frame;
> +        fh->primary_ref_frame          = href->slot;
> +        fh->ref_order_hint[href->slot] = ref->display_order - href->last_idr_frame;
> +        for (i = 0; i < AV1_REF_FRAME_GOLDEN; i++) {
> +            fh->ref_frame_idx[i] = href->slot;
> +        }
> +
> +        ref                            = pic->refs[1][pic->nb_refs[1] - 1];
> +        href                           = ref->priv_data;
> +        fh->ref_order_hint[href->slot] = ref->display_order - href->last_idr_frame;
> +        for (i = AV1_REF_FRAME_GOLDEN; i < AV1_REFS_PER_FRAME; i++) {
> +            fh->ref_frame_idx[i] = href->slot;
> +        }
> +        break;
> +    default:
> +        av_assert0(0 && "invalid picture type");
> +    }
> +
> +    fh->show_frame                = pic->display_order <= pic->encode_order;
> +    fh->showable_frame            = fh->frame_type != AV1_FRAME_KEY;
> +    fh->frame_width_minus_1       = avctx->width - 1;
> +    fh->frame_height_minus_1      = avctx->height - 1;
> +    fh->render_width_minus_1      = fh->frame_width_minus_1;
> +    fh->render_height_minus_1     = fh->frame_height_minus_1;
> +    fh->order_hint                = pic->display_order - hpic->last_idr_frame;

Doesn't this overflow?
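
If the intent is for it to wrap, masking it to the signalled order hint range would at least make that explicit, e.g.:

    fh->order_hint = (pic->display_order - hpic->last_idr_frame) &
                     ((1 << (priv->sh.obu.sequence_header.order_hint_bits_minus_1 + 1)) - 1);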

> +    fh->tile_cols                 = priv->tile_cols;
> +    fh->tile_rows                 = priv->tile_rows;
> +    fh->tile_cols_log2            = priv->tile_cols_log2;
> +    fh->tile_rows_log2            = priv->tile_rows_log2;
> +    fh->uniform_tile_spacing_flag = priv->uniform_tile;
> +    fh->tile_size_bytes_minus1    = priv->attr_ext2.bits.tile_size_bytes_minus1;
> +    fh->reduced_tx_set            = 1;

This not being present in the capabilities feels like an omission in the API.  That's a large loss for an implementation which does support the full transform set.

> +
> +    /** ONLY_4x4 mode is ignored because coded-lossless is not fully implemented. */
> +    if (priv->attr_ext2.bits.tx_mode_support & 0x04)
> +        fh->tx_mode = AV1_TX_MODE_SELECT;
> +    else if (priv->attr_ext2.bits.tx_mode_support & 0x02)
> +        fh->tx_mode = AV1_TX_MODE_LARGEST;
> +    else
> +        return AVERROR(EINVAL);

What would this case even mean?  The encoder doesn't support any transforms?  Seems like a message rather than silent failure would be a good idea anyway.
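
Something like:

    else {
        av_log(avctx, AV_LOG_ERROR, "No supported transform mode reported by "
               "the driver (tx_mode_support = 0x%x).\n",
               priv->attr_ext2.bits.tx_mode_support);
        return AVERROR(EINVAL);
    }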

> +
> +    for (i = 0; i < fh->tile_cols; i++)
> +        fh->width_in_sbs_minus_1[i] = vpic->width_in_sbs_minus_1[i] = priv->width_in_sbs_minus_1[i];
> +
> +    for (i = 0; i < fh->tile_rows; i++)
> +        fh->height_in_sbs_minus_1[i] = vpic->height_in_sbs_minus_1[i] = priv->height_in_sbs_minus_1[i];

Er, what?  Doesn't this fail the inference on the last tile width/height if you don't split exactly?

> +
> +    memcpy(fh->loop_filter_ref_deltas, default_loop_filter_ref_deltas,
> +           AV1_TOTAL_REFS_PER_FRAME * sizeof(int8_t));
> +
> +    if (fh->frame_type == AV1_FRAME_KEY && fh->show_frame) {
> +        fh->error_resilient_mode = 1;
> +    }
> +
> +    if (fh->frame_type == AV1_FRAME_KEY || fh->error_resilient_mode)
> +        fh->primary_ref_frame = AV1_PRIMARY_REF_NONE;
> +
> +    vpic->base_qindex          = fh->base_q_idx;
> +    vpic->frame_width_minus_1  = fh->frame_width_minus_1;
> +    vpic->frame_height_minus_1 = fh->frame_height_minus_1;
> +    vpic->primary_ref_frame    = fh->primary_ref_frame;
> +    vpic->reconstructed_frame  = pic->recon_surface;
> +    vpic->coded_buf            = pic->output_buffer;
> +    vpic->tile_cols            = fh->tile_cols;
> +    vpic->tile_rows            = fh->tile_rows;
> +    vpic->order_hint           = fh->order_hint;
> +#if VA_CHECK_VERSION(1, 15, 0)
> +    vpic->refresh_frame_flags  = fh->refresh_frame_flags;
> +#endif

What will the difference be between running in the two versions?  (Is it worth supporting the older one?  Cf. bit_depth on VP9.)

> +
> +    vpic->picture_flags.bits.enable_frame_obu     = 0;
> +    vpic->picture_flags.bits.frame_type           = fh->frame_type;
> +    vpic->picture_flags.bits.reduced_tx_set       = fh->reduced_tx_set;
> +    vpic->picture_flags.bits.error_resilient_mode = fh->error_resilient_mode;
> +
> +    /** let driver decide to use single or compound reference prediction mode. */
> +    vpic->mode_control_flags.bits.reference_mode = fh->reference_select ? 2 : 0;
> +    vpic->mode_control_flags.bits.tx_mode = fh->tx_mode;
> +
> +    vpic->tile_group_obu_hdr_info.bits.obu_has_size_field = 1;
> +
> +    /** set reference. */
> +    for (i = 0; i < AV1_REFS_PER_FRAME; i++)
> +        vpic->ref_frame_idx[i] = fh->ref_frame_idx[i];
> +
> +    for (i = 0; i < FF_ARRAY_ELEMS(vpic->reference_frames); i++)
> +        vpic->reference_frames[i] = VA_INVALID_SURFACE;
> +
> +    for (i = 0; i < MAX_REFERENCE_LIST_NUM; i++) {
> +        for (int j = 0; j < pic->nb_refs[i]; j++) {
> +            VAAPIEncodePicture *ref_pic = pic->refs[i][j];
> +
> +            slot = ((VAAPIEncodeAV1Picture*)ref_pic->priv_data)->slot;
> +            av_assert0(vpic->reference_frames[slot] == VA_INVALID_SURFACE);
> +
> +            vpic->reference_frames[slot] = ref_pic->recon_surface;
> +        }
> +    }
> +
> +    /** pack frame header, and set va params offset like bit_offset_qindex etc. */
> +    ret = vaapi_encode_av1_write_frame_header(avctx, pic, priv->fh_data, &priv->fh_data_len);
> +    if (ret < 0)
> +        goto end;
> +
> +end:
> +    ff_cbs_fragment_reset(obu);
> +    return ret;
> +}
> +
> +static int vaapi_encode_av1_init_slice_params(AVCodecContext *avctx,
> +                                              VAAPIEncodePicture *pic,
> +                                              VAAPIEncodeSlice *slice)
> +{
> +    VAAPIEncodeAV1Context      *priv = avctx->priv_data;
> +    VAEncTileGroupBufferAV1  *vslice = slice->codec_slice_params;
> +    CodedBitstreamAV1Context  *cbctx = priv->cbc->priv_data;
> +    int div;
> +
> +    /** Set tile group info. */
> +    div = priv->tile_cols * priv->tile_rows / priv->tile_groups;
> +    vslice->tg_start = slice->index * div;
> +    if (slice->index == (priv->tile_groups - 1)) {
> +        vslice->tg_end = priv->tile_cols * priv->tile_rows - 1;
> +        cbctx->seen_frame_header = 0;
> +    } else {
> +        vslice->tg_end = (slice->index + 1) * div - 1;
> +    }
> +
> +    return 0;
> +}
> +
> +static int vaapi_encode_av1_write_picture_header(AVCodecContext *avctx,
> +                                                 VAAPIEncodePicture *pic,
> +                                                 char *data, size_t *data_len)
> +{
> +    VAAPIEncodeAV1Context     *priv = avctx->priv_data;
> +    CodedBitstreamFragment     *obu = &priv->current_obu;
> +    AV1RawOBU               *fh_obu = &priv->fh;
> +    AV1RawFrameHeader       *rep_fh = &fh_obu->obu.frame_header;
> +    VAAPIEncodeAV1Picture *href;
> +    int ret = 0;
> +
> +    pic->tail_size = 0;
> +    /** Pack repeat frame header. */
> +    if (pic->display_order > pic->encode_order) {
> +        memset(fh_obu, 0, sizeof(*fh_obu));
> +        href = pic->refs[0][pic->nb_refs[0] - 1]->priv_data;
> +        fh_obu->header.obu_type = AV1_OBU_FRAME_HEADER;
> +        fh_obu->header.obu_has_size_field = 1;
> +
> +        rep_fh->show_existing_frame   = 1;
> +        rep_fh->frame_to_show_map_idx = href->slot == 0;
> +        rep_fh->frame_type            = AV1_FRAME_INTER;
> +        rep_fh->frame_width_minus_1   = avctx->width - 1;
> +        rep_fh->frame_height_minus_1  = avctx->height - 1;
> +        rep_fh->render_width_minus_1  = rep_fh->frame_width_minus_1;
> +        rep_fh->render_height_minus_1 = rep_fh->frame_height_minus_1;
> +
> +        ret = vaapi_encode_av1_write_frame_header(avctx, pic, pic->tail_data, &pic->tail_size);
> +        if (ret < 0)
> +            goto end;
> +
> +        pic->tail_size /= 8;
> +    }
> +
> +    memcpy(data, &priv->fh_data, MAX_PARAM_BUFFER_SIZE * sizeof(char));
> +    *data_len = priv->fh_data_len;
> +
> +end:
> +    ff_cbs_fragment_reset(obu);
> +    return ret;
> +}
> +
> +static const VAAPIEncodeProfile vaapi_encode_av1_profiles[] = {
> +    { FF_PROFILE_AV1_MAIN,  8, 3, 1, 1, VAProfileAV1Profile0 },
> +    { FF_PROFILE_AV1_MAIN, 10, 3, 1, 1, VAProfileAV1Profile0 },
> +    { FF_PROFILE_UNKNOWN }
> +};
> +
> +static const VAAPIEncodeType vaapi_encode_type_av1 = {
> +    .profiles        = vaapi_encode_av1_profiles,
> +    .flags           = FLAG_B_PICTURES,
> +    .default_quality = 25,
> +    .configure       = &vaapi_encode_av1_configure,
> +
> +    .sequence_header_type  = VAEncPackedHeaderSequence,
> +    .sequence_params_size  = sizeof(VAEncSequenceParameterBufferAV1),
> +    .init_sequence_params  = &vaapi_encode_av1_init_sequence_params,
> +    .write_sequence_header = &vaapi_encode_av1_write_sequence_header,
> +
> +    .picture_priv_data_size = sizeof(VAAPIEncodeAV1Picture),
> +    .picture_header_type    = VAEncPackedHeaderPicture,
> +    .picture_params_size    = sizeof(VAEncPictureParameterBufferAV1),
> +    .init_picture_params    = &vaapi_encode_av1_init_picture_params,
> +    .write_picture_header   = &vaapi_encode_av1_write_picture_header,
> +
> +    .slice_params_size = sizeof(VAEncTileGroupBufferAV1),
> +    .init_slice_params = &vaapi_encode_av1_init_slice_params,
> +};
> +
> +static av_cold int vaapi_encode_av1_init(AVCodecContext *avctx)
> +{
> +    VAAPIEncodeContext      *ctx = avctx->priv_data;
> +    VAAPIEncodeAV1Context  *priv = avctx->priv_data;
> +    VAConfigAttrib attr;
> +    VAStatus vas;
> +    int ret;
> +
> +    ctx->codec = &vaapi_encode_type_av1;
> +
> +    ctx->desired_packed_headers =
> +        VA_ENC_PACKED_HEADER_SEQUENCE |
> +        VA_ENC_PACKED_HEADER_PICTURE;
> +
> +    if (avctx->profile == FF_PROFILE_UNKNOWN)
> +        avctx->profile = priv->profile;
> +    if (avctx->level == FF_LEVEL_UNKNOWN)
> +        avctx->level = priv->level;
> +
> +    if (avctx->level != FF_LEVEL_UNKNOWN && avctx->level & ~0x1f) {
> +        av_log(avctx, AV_LOG_ERROR, "Invalid level %d\n", avctx->level);
> +        return AVERROR(EINVAL);
> +    }
> +
> +    ret = ff_vaapi_encode_init(avctx);
> +    if (ret < 0)
> +        return ret;
> +
> +    attr.type = VAConfigAttribEncAV1;
> +    vas = vaGetConfigAttributes(ctx->hwctx->display,
> +                                ctx->va_profile,
> +                                ctx->va_entrypoint,
> +                                &attr, 1);
> +    if (vas != VA_STATUS_SUCCESS) {
> +        av_log(avctx, AV_LOG_ERROR, "Failed to query "
> +               "config attribute: %d (%s).\n", vas, vaErrorStr(vas));
> +        return AVERROR_EXTERNAL;
> +    } else if (attr.value == VA_ATTRIB_NOT_SUPPORTED) {
> +        priv->attr.value = 0;
> +        av_log(avctx, AV_LOG_WARNING, "Attribute type:%d is not "
> +               "supported.\n", attr.type);

Is zero actually reasonable in this attribute (and below)?  Would we be better just requiring them to work?

> +    } else {
> +        priv->attr.value = attr.value;
> +    }
> +
> +    attr.type = VAConfigAttribEncAV1Ext1;
> +    vas = vaGetConfigAttributes(ctx->hwctx->display,
> +                                ctx->va_profile,
> +                                ctx->va_entrypoint,
> +                                &attr, 1);
> +    if (vas != VA_STATUS_SUCCESS) {
> +        av_log(avctx, AV_LOG_ERROR, "Failed to query "
> +               "config attribute: %d (%s).\n", vas, vaErrorStr(vas));
> +        return AVERROR_EXTERNAL;
> +    } else if (attr.value == VA_ATTRIB_NOT_SUPPORTED) {
> +        priv->attr_ext1.value = 0;
> +        av_log(avctx, AV_LOG_WARNING, "Attribute type:%d is not "
> +               "supported.\n", attr.type);
> +    } else {
> +        priv->attr_ext1.value = attr.value;
> +    }
> +
> +    /** This attr provides essential indicators, return error if not support. */
> +    attr.type = VAConfigAttribEncAV1Ext2;
> +    vas = vaGetConfigAttributes(ctx->hwctx->display,
> +                                ctx->va_profile,
> +                                ctx->va_entrypoint,
> +                                &attr, 1);
> +    if (vas != VA_STATUS_SUCCESS || attr.value == VA_ATTRIB_NOT_SUPPORTED) {
> +        av_log(avctx, AV_LOG_ERROR, "Failed to query "
> +               "config attribute: %d (%s).\n", vas, vaErrorStr(vas));
> +        return AVERROR_EXTERNAL;
> +    } else {
> +        priv->attr_ext2.value = attr.value;
> +    }
> +
> +    ret = vaapi_encode_av1_set_tile(avctx);
> +    if (ret < 0)
> +        return ret;
> +
> +    return 0;
> +}
> +
> +static av_cold int vaapi_encode_av1_close(AVCodecContext *avctx)
> +{
> +    VAAPIEncodeAV1Context *priv = avctx->priv_data;
> +
> +    ff_cbs_fragment_free(&priv->current_obu);
> +    ff_cbs_close(&priv->cbc);
> +
> +    return ff_vaapi_encode_close(avctx);
> +}
> +
> +#define OFFSET(x) offsetof(VAAPIEncodeAV1Context, x)
> +#define FLAGS (AV_OPT_FLAG_VIDEO_PARAM | AV_OPT_FLAG_ENCODING_PARAM)
> +
> +static const AVOption vaapi_encode_av1_options[] = {
> +    VAAPI_ENCODE_COMMON_OPTIONS,
> +    VAAPI_ENCODE_RC_OPTIONS,
> +    { "profile", "Set profile (seq_profile)",
> +      OFFSET(profile), AV_OPT_TYPE_INT,
> +      { .i64 = FF_PROFILE_UNKNOWN }, FF_PROFILE_UNKNOWN, 0xff, FLAGS, "profile" },
> +
> +#define PROFILE(name, value)  name, NULL, 0, AV_OPT_TYPE_CONST, \
> +    { .i64 = value }, 0, 0, FLAGS, "profile"
> +    { PROFILE("main",               FF_PROFILE_AV1_MAIN) },
> +    { PROFILE("high",               FF_PROFILE_AV1_HIGH) },
> +    { PROFILE("professional",       FF_PROFILE_AV1_PROFESSIONAL) },
> +#undef PROFILE
> +
> +    { "tier", "Set tier (seq_tier)",
> +      OFFSET(tier), AV_OPT_TYPE_INT, { .i64 = 0 }, 0, 1, FLAGS, "tier" },
> +    { "main", NULL, 0, AV_OPT_TYPE_CONST,
> +      { .i64 = 0 }, 0, 0, FLAGS, "tier" },
> +    { "high", NULL, 0, AV_OPT_TYPE_CONST,
> +      { .i64 = 1 }, 0, 0, FLAGS, "tier" },
> +    { "level", "Set level (seq_level_idx)",
> +      OFFSET(level), AV_OPT_TYPE_INT,
> +      { .i64 = FF_LEVEL_UNKNOWN }, FF_LEVEL_UNKNOWN, 0x1f, FLAGS, "level" },
> +
> +#define LEVEL(name, value) name, NULL, 0, AV_OPT_TYPE_CONST, \
> +      { .i64 = value }, 0, 0, FLAGS, "level"
> +    { LEVEL("2.0",  0) },
> +    { LEVEL("2.1",  1) },
> +    { LEVEL("3.0",  4) },
> +    { LEVEL("3.1",  5) },
> +    { LEVEL("4.0",  8) },
> +    { LEVEL("4.1",  9) },
> +    { LEVEL("5.0", 12) },
> +    { LEVEL("5.1", 13) },
> +    { LEVEL("5.2", 14) },
> +    { LEVEL("5.3", 15) },
> +    { LEVEL("6.0", 16) },
> +    { LEVEL("6.1", 17) },
> +    { LEVEL("6.2", 18) },
> +    { LEVEL("6.3", 19) },
> +#undef LEVEL
> +
> +    { "tiles", "Tile columns x rows",
> +      OFFSET(tile_cols), AV_OPT_TYPE_IMAGE_SIZE, { .str = "1x1" }, 1, AV1_MAX_TILE_COLS, FLAGS },
> +    { "tile_groups", "Number of tile groups for encoding",
> +      OFFSET(tile_groups), AV_OPT_TYPE_INT, { .i64 = 1 }, 1, AV1_MAX_TILE_ROWS * AV1_MAX_TILE_COLS, FLAGS },
> +
> +    { NULL },
> +};
> +
> +static const FFCodecDefault vaapi_encode_av1_defaults[] = {
> +    { "b",              "0"   },
> +    { "bf",             "7"   },

This feels excessive when only one layer is supported.  Do experimental results give this as the best tradeoff?

> +    { "g",              "120" },
> +    { "qmin",           "1"   },
> +    { "qmax",           "255" },
> +    { NULL },
> +};
> +
> +static const AVClass vaapi_encode_av1_class = {
> +    .class_name = "av1_vaapi",
> +    .item_name  = av_default_item_name,
> +    .option     = vaapi_encode_av1_options,
> +    .version    = LIBAVUTIL_VERSION_INT,
> +};
> +
> +const FFCodec ff_av1_vaapi_encoder = {
> +    .p.name         = "av1_vaapi",
> +    CODEC_LONG_NAME("AV1 (VAAPI)"),
> +    .p.type         = AVMEDIA_TYPE_VIDEO,
> +    .p.id           = AV_CODEC_ID_AV1,
> +    .priv_data_size = sizeof(VAAPIEncodeAV1Context),
> +    .init           = &vaapi_encode_av1_init,
> +    FF_CODEC_RECEIVE_PACKET_CB(&ff_vaapi_encode_receive_packet),
> +    .close          = &vaapi_encode_av1_close,
> +    .p.priv_class   = &vaapi_encode_av1_class,
> +    .p.capabilities = AV_CODEC_CAP_DELAY | AV_CODEC_CAP_HARDWARE |
> +                      AV_CODEC_CAP_DR1 | AV_CODEC_CAP_ENCODER_REORDERED_OPAQUE,
> +    .caps_internal  = FF_CODEC_CAP_NOT_INIT_THREADSAFE |
> +                      FF_CODEC_CAP_INIT_CLEANUP,
> +    .defaults       = vaapi_encode_av1_defaults,
> +    .p.pix_fmts = (const enum AVPixelFormat[]) {
> +        AV_PIX_FMT_VAAPI,
> +        AV_PIX_FMT_NONE,
> +    },
> +    .hw_configs     = ff_vaapi_encode_hw_configs,
> +    .p.wrapper_name = "vaapi",
> +};
> diff --git a/libavcodec/version.h b/libavcodec/version.h
> index 9411511e04..728ab8839d 100644
> --- a/libavcodec/version.h
> +++ b/libavcodec/version.h
> @@ -29,7 +29,7 @@
>   
>   #include "version_major.h"
>   
> -#define LIBAVCODEC_VERSION_MINOR  22
> +#define LIBAVCODEC_VERSION_MINOR  23
>   #define LIBAVCODEC_VERSION_MICRO 100
>   
>   #define LIBAVCODEC_VERSION_INT  AV_VERSION_INT(LIBAVCODEC_VERSION_MAJOR, \

Thanks,

- Mark
Wang, Fei W Aug. 10, 2023, 2:54 a.m. UTC | #2
On Mon, 2023-08-07 at 22:21 +0100, Mark Thompson wrote:
> On 03/08/2023 07:01, fei.w.wang-at-intel.com@ffmpeg.org wrote:
> > From: Fei Wang <fei.w.wang@intel.com>
> > 
> > Signed-off-by: Fei Wang <fei.w.wang@intel.com>
> > ---
> >   Changelog                     |    1 +
> >   configure                     |    3 +
> >   doc/encoders.texi             |   13 +
> >   libavcodec/Makefile           |    1 +
> >   libavcodec/allcodecs.c        |    1 +
> >   libavcodec/vaapi_encode.c     |  125 +++-
> >   libavcodec/vaapi_encode.h     |   12 +
> >   libavcodec/vaapi_encode_av1.c | 1229
> > +++++++++++++++++++++++++++++++++
> >   libavcodec/version.h          |    2 +-
> >   9 files changed, 1368 insertions(+), 19 deletions(-)
> >   create mode 100644 libavcodec/vaapi_encode_av1.c
> 
> I assume this is tested on Intel hardware.  Is it tested on AMD as
> well?  (Apparently working in recent Mesa.)

AMD tested the patchset months ago:
https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/22585

This patch has changed a little compared with the version in Cartwheel.
@Ruijing, could you help review this version on the ML, so that the
diffs don't break you? Thanks.

> 
> 
> > diff --git a/Changelog b/Changelog
> > index bbda4f4fd4..e86f742cd3 100644
> > --- a/Changelog
> > +++ b/Changelog
> > @@ -27,6 +27,7 @@ version <next>:
> >   - Bitstream filter for converting VVC from MP4 to Annex B
> >   - scale_vt filter for videotoolbox
> >   - transpose_vt filter for videotoolbox
> > +- VAAPI AV1 encoder
> >   
> >   version 6.0:
> >   - Radiance HDR image support
> > diff --git a/configure b/configure
> > index 99388e7664..68a238a819 100755
> > --- a/configure
> > +++ b/configure
> > @@ -3322,6 +3322,8 @@ av1_qsv_decoder_select="qsvdec"
> >   av1_qsv_encoder_select="qsvenc"
> >   av1_qsv_encoder_deps="libvpl"
> >   av1_amf_encoder_deps="amf"
> > +av1_vaapi_encoder_deps="VAEncPictureParameterBufferAV1"
> > +av1_vaapi_encoder_select="cbs_av1 vaapi_encode"
> >   
> >   # parsers
> >   aac_parser_select="adts_header mpeg4audio"
> > @@ -7106,6 +7108,7 @@ if enabled vaapi; then
> >       check_type "va/va.h va/va_enc_jpeg.h"
> > "VAEncPictureParameterBufferJPEG"
> >       check_type "va/va.h
> > va/va_enc_vp8.h"  "VAEncPictureParameterBufferVP8"
> >       check_type "va/va.h
> > va/va_enc_vp9.h"  "VAEncPictureParameterBufferVP9"
> > +    check_type "va/va.h
> > va/va_enc_av1.h"  "VAEncPictureParameterBufferAV1"
> >   fi
> >   
> >   if enabled_all opencl libdrm ; then
> > diff --git a/doc/encoders.texi b/doc/encoders.texi
> > index 25d6b7f09e..fb331ebd8e 100644
> > --- a/doc/encoders.texi
> > +++ b/doc/encoders.texi
> > @@ -3991,6 +3991,19 @@ Average variable bitrate.
> >   Each encoder also has its own specific options:
> >   @table @option
> >   
> > +@item av1_vaapi
> > +@option{profile} sets the value of @emph{seq_profile}.
> > +@option{tier} sets the value of @emph{seq_tier}.
> > +@option{level} sets the value of @emph{seq_level_idx}.
> > +
> > +@table @option
> > +@item tiles
> > +Set the number of tiles to encode the input video with, as columns
> > x rows.
> > +(default is 1x1).
> 
> Probably needs some clarification that large resolutions must be
> split into tiles?  Maybe an "auto" value for this (meaning use as few
> as possible), and let explicit "1x1" fail if the resolution is too
> large.
> 
> > +@item tile_groups
> > +Set tile groups number (default is 1).
> 
> Meaning what?  It splits into tile group OBUs containing equal
> numbers of tiles, or of equal size in bits, or something else?
> 
> > +@end table
> > +
> >   @item h264_vaapi
> >   @option{profile} sets the value of @emph{profile_idc} and the
> > @emph{constraint_set*_flag}s.
> >   @option{level} sets the value of @emph{level_idc}.
> > diff --git a/libavcodec/Makefile b/libavcodec/Makefile
> > index a6b2ecbb22..473afb4471 100644
> > --- a/libavcodec/Makefile
> > +++ b/libavcodec/Makefile
> > @@ -258,6 +258,7 @@ OBJS-$(CONFIG_AV1_MEDIACODEC_DECODER)  +=
> > mediacodecdec.o
> >   OBJS-$(CONFIG_AV1_MEDIACODEC_ENCODER)  += mediacodecenc.o
> >   OBJS-$(CONFIG_AV1_NVENC_ENCODER)       += nvenc_av1.o nvenc.o
> >   OBJS-$(CONFIG_AV1_QSV_ENCODER)         += qsvenc_av1.o
> > +OBJS-$(CONFIG_AV1_VAAPI_ENCODER)       += vaapi_encode_av1.o
> > av1_levels.o
> >   OBJS-$(CONFIG_AVRN_DECODER)            += avrndec.o
> >   OBJS-$(CONFIG_AVRP_DECODER)            += r210dec.o
> >   OBJS-$(CONFIG_AVRP_ENCODER)            += r210enc.o
> > diff --git a/libavcodec/allcodecs.c b/libavcodec/allcodecs.c
> > index 8775d15a4f..c43c1d7b48 100644
> > --- a/libavcodec/allcodecs.c
> > +++ b/libavcodec/allcodecs.c
> > @@ -844,6 +844,7 @@ extern const FFCodec ff_av1_nvenc_encoder;
> >   extern const FFCodec ff_av1_qsv_decoder;
> >   extern const FFCodec ff_av1_qsv_encoder;
> >   extern const FFCodec ff_av1_amf_encoder;
> > +extern const FFCodec ff_av1_vaapi_encoder;
> >   extern const FFCodec ff_libopenh264_encoder;
> >   extern const FFCodec ff_libopenh264_decoder;
> >   extern const FFCodec ff_h264_amf_encoder;
> > diff --git a/libavcodec/vaapi_encode.c b/libavcodec/vaapi_encode.c
> > index 2604f12b9e..2907e159fb 100644
> > --- a/libavcodec/vaapi_encode.c
> > +++ b/libavcodec/vaapi_encode.c
> > @@ -669,6 +669,15 @@ static int
> > vaapi_encode_set_output_timestamp(AVCodecContext *avctx,
> >   {
> >       VAAPIEncodeContext *ctx = avctx->priv_data;
> >   
> > +    // AV1 packs P frame and next B frame into one pkt, and uses
> > the other
> > +    // repeat frame header pkt at the display order position of
> > the P frame
> > +    // to indicate its frame index. Each frame has a corresponding
> > pkt in its
> > +    // display order position. So don't need to consider delay for
> > AV1 timestamp.
> > +    if (avctx->codec_id == AV_CODEC_ID_AV1) {
> > +        pkt->dts = pkt->pts - ctx->dts_pts_diff;
> > +        return 0;
> > +    }
> 
> This doesn't get you the right result, though?  The PTS and DTS on
> every AV1 packet want to be the same.

The result tested fine and plays back normally. It just aligns with
the other VAAPI encoders, where the first frame starts with PTS/DTS of
0/-1 when B frames are present. Setting PTS and DTS to the same value
also looks good.

> 
> In any case, please don't put tests for codec ID in the common
> code.  I suggest that the timestamp behaviour wants to be a new
> FLAG_*.
> 
> > +
> >       if (ctx->output_delay == 0) {
> >           pkt->dts = pkt->pts;
> >       } else if (pic->encode_order < ctx->decode_delay) {
> > @@ -689,9 +698,10 @@ static int vaapi_encode_output(AVCodecContext
> > *avctx,
> >   {
> >       VAAPIEncodeContext *ctx = avctx->priv_data;
> >       VACodedBufferSegment *buf_list, *buf;
> > -    VAStatus vas;
> > +    AVPacket *pkt_ptr = pkt;
> >       int total_size = 0;
> >       uint8_t *ptr;
> > +    VAStatus vas;
> >       int err;
> >   
> >       err = vaapi_encode_wait(avctx, pic);
> > @@ -711,11 +721,52 @@ static int vaapi_encode_output(AVCodecContext
> > *avctx,
> >       for (buf = buf_list; buf; buf = buf->next)
> >           total_size += buf->size;
> >   
> > -    err = ff_get_encode_buffer(avctx, pkt, total_size, 0);
> > -    ptr = pkt->data;
> > +    /** repack av1 coded frame for not display and repeat frames
> > */
> > +    if (avctx->codec_id == AV_CODEC_ID_AV1) {
> > +        int display_frame = pic->display_order <= pic-
> > >encode_order;
> >   
> > -    if (err < 0)
> > -        goto fail_mapped;
> > +        if (display_frame) {
> > +            total_size += ctx->header_data_size;
> > +            err = ff_get_encode_buffer(avctx, pkt, total_size, 0);
> > +            if (err < 0)
> > +                goto fail_mapped;
> > +            ptr = pkt->data;
> > +
> > +            if (ctx->header_data_size) {
> > +                memcpy(ptr, ctx->header_data, ctx-
> > >header_data_size);
> > +                ptr += ctx->header_data_size;
> > +                ctx->header_data_size = 0;
> > +            }
> > +        } else {
> > +            ctx->header_data = av_realloc(ctx->header_data,
> > total_size);
> > +            if (!ctx->header_data) {
> > +                err = AVERROR(ENOMEM);
> > +                goto fail_mapped;
> > +            }
> > +            ptr = ctx->header_data;
> > +            ctx->header_data_size = total_size;
> > +
> > +            if (pic->tail_size) {
> > +                if (ctx->tail_pkt->size) {
> > +                    err = AVERROR(AVERROR_BUG);
> > +                    goto fail_mapped;
> > +                }
> > +
> > +                err = ff_get_encode_buffer(avctx, ctx->tail_pkt,
> > pic->tail_size, 0);
> > +                if (err < 0)
> > +                    goto fail_mapped;
> > +
> > +                memcpy(ctx->tail_pkt->data, pic->tail_data, pic-
> > >tail_size);
> > +                pkt_ptr = ctx->tail_pkt;
> > +            }
> > +        }
> > +    } else {
> > +        err = ff_get_encode_buffer(avctx, pkt, total_size, 0);
> > +        ptr = pkt->data;
> > +
> > +        if (err < 0)
> > +            goto fail_mapped;
> > +    }
> 
> I'm not convinced that this repeated-realloc-and-copy structure in
> the main context is a good approach.
> 
> How about something like storing the output buffer for every picture
> in that picture as it is generated (could even stay in the VABuffer
> without mapping it if that worked?), then when you get a display-now
> picture concatenate all the oustanding frames together in order into
> the packet (with a single copy).

Your way is feasible. It may need an additional flag in pic to
indicate that the picture is completed but not yet output, and to
release the picture only after it has been output, or simply delay
changing the picture state to completed. I will try it.
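
Roughly what I have in mind, as a minimal standalone sketch (the struct
and helper below are hypothetical, not the actual patch code):

#include <stdint.h>
#include <stdlib.h>
#include <string.h>

/* Hypothetical per-picture coded data, kept with the picture until a
 * displayable frame arrives; entries are linked in encode order. */
typedef struct PendingCoded {
    uint8_t *data;
    size_t   size;
    struct PendingCoded *next;
} PendingCoded;

/* Concatenate all pending coded buffers into one output buffer with a
 * single copy pass.  Returns a malloc'd buffer owned by the caller, or
 * NULL on allocation failure. */
static uint8_t *concat_pending(const PendingCoded *head, size_t *out_size)
{
    size_t total = 0, off = 0;
    uint8_t *buf;

    for (const PendingCoded *p = head; p; p = p->next)
        total += p->size;

    buf = malloc(total ? total : 1);
    if (!buf)
        return NULL;

    for (const PendingCoded *p = head; p; p = p->next) {
        memcpy(buf + off, p->data, p->size);
        off += p->size;
    }

    *out_size = total;
    return buf;
}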

> 
> The tail data is trivial to generate on the fly (it's only three
> bytes), so storing it does not seem useful.
> 
> >   
> >       for (buf = buf_list; buf; buf = buf->next) {
> >           av_log(avctx, AV_LOG_DEBUG, "Output buffer: %u bytes "
> > @@ -726,10 +777,10 @@ static int vaapi_encode_output(AVCodecContext
> > *avctx,
> >       }
> >   
> >       if (pic->type == PICTURE_TYPE_IDR)
> > -        pkt->flags |= AV_PKT_FLAG_KEY;
> > +        pkt_ptr->flags |= AV_PKT_FLAG_KEY;
> >   
> > -    pkt->pts = pic->pts;
> > -    pkt->duration = pic->duration;
> > +    pkt_ptr->pts = pic->pts;
> > +    pkt_ptr->duration = pic->duration;
> >   
> >       vas = vaUnmapBuffer(ctx->hwctx->display, pic->output_buffer);
> >       if (vas != VA_STATUS_SUCCESS) {
> > @@ -742,8 +793,8 @@ static int vaapi_encode_output(AVCodecContext
> > *avctx,
> >       // for no-delay encoders this is handled in generic codec
> >       if (avctx->codec->capabilities & AV_CODEC_CAP_DELAY &&
> >           avctx->flags & AV_CODEC_FLAG_COPY_OPAQUE) {
> > -        pkt->opaque     = pic->opaque;
> > -        pkt->opaque_ref = pic->opaque_ref;
> > +        pkt_ptr->opaque     = pic->opaque;
> > +        pkt_ptr->opaque_ref = pic->opaque_ref;
> >           pic->opaque_ref = NULL;
> >       }
> >   
> > @@ -752,6 +803,9 @@ static int vaapi_encode_output(AVCodecContext
> > *avctx,
> >   
> >       av_log(avctx, AV_LOG_DEBUG, "Output read for pic
> > %"PRId64"/%"PRId64".\n",
> >              pic->display_order, pic->encode_order);
> > +
> > +    vaapi_encode_set_output_timestamp(avctx, pic, pkt_ptr);
> > +
> >       return 0;
> >   
> >   fail_mapped:
> > @@ -1128,9 +1182,19 @@ static int
> > vaapi_encode_pick_next(AVCodecContext *avctx,
> >   
> >       vaapi_encode_add_ref(avctx, pic, pic, 0, 1, 0);
> >       if (pic->type != PICTURE_TYPE_IDR) {
> > -        vaapi_encode_add_ref(avctx, pic, start,
> > -                             pic->type == PICTURE_TYPE_P,
> > -                             b_counter > 0, 0);
> > +        // TODO: apply both previous and forward multi reference
> > for all vaapi encoders.
> > +        // And L0/L1 reference frame number can be set dynamically
> > through query
> > +        // VAConfigAttribEncMaxRefFrames attribute.
> > +        if (avctx->codec_id == AV_CODEC_ID_AV1) {
> > +            for (i = 0; i < ctx->nb_next_prev; i++)
> > +                vaapi_encode_add_ref(avctx, pic, ctx-
> > >next_prev[i],
> > +                                     pic->type == PICTURE_TYPE_P,
> > +                                     b_counter > 0, 0);
> 
> So an undisclosed aim of the previous patch was to make this extra
> list of the most recent N frames for this to use with P frames only?

Currently, yes. As the TODO comment says, I will apply it to all the
other codecs and to B frames next; that will need a lot of testing on
different hardware. Adding it for AV1 here is to align with VPL, which
uses 2 previous references for P frames. Based on our tests, 2
reference frames for P improve BD-rate by ~5%, which makes it possible
for users to get the same quality as VPL after adjusting and aligning
the encoder options.
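
For the dynamic part mentioned in the TODO, the query would look
roughly like this (sketch only; the helper name and locals are
illustrative, the attribute layout is the one the common VAAPI code
already uses: low 16 bits for list 0, high 16 bits for list 1):

#include <va/va.h>

static int query_max_ref_frames(VADisplay display, VAProfile profile,
                                VAEntrypoint entrypoint,
                                unsigned int *ref_l0, unsigned int *ref_l1)
{
    VAConfigAttrib attr = { .type = VAConfigAttribEncMaxRefFrames };
    VAStatus vas;

    vas = vaGetConfigAttributes(display, profile, entrypoint, &attr, 1);
    if (vas != VA_STATUS_SUCCESS || attr.value == VA_ATTRIB_NOT_SUPPORTED)
        return -1;

    /* Reference list sizes reported by the driver. */
    *ref_l0 = attr.value & 0xffff;
    *ref_l1 = attr.value >> 16 & 0xffff;
    return 0;
}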

> 
> Why this partcular structure for P frames only, though?  Seems like
> if you support extra references then other codecs could also make use
> of them in either direction, so we would be better off hinting that
> the codec would like up to N references and then using the contents
> of the existing DPB, plus keep extras if there is space.
> 
> > +        } else
> > +            vaapi_encode_add_ref(avctx, pic, start,
> > +                                 pic->type == PICTURE_TYPE_P,
> > +                                 b_counter > 0, 0);
> > +
> >           vaapi_encode_add_ref(avctx, pic, ctx->next_prev[ctx-
> > >nb_next_prev - 1], 0, 0, 1);
> >       }
> >   
> > @@ -1292,6 +1356,19 @@ int
> > ff_vaapi_encode_receive_packet(AVCodecContext *avctx, AVPacket
> > *pkt)
> >       AVFrame *frame = ctx->frame;
> >       int err;
> >   
> > +start:
> > +    /** if no B frame before repeat P frame, sent repeat P frame
> > out. */
> > +    if (avctx->codec_id == AV_CODEC_ID_AV1 && ctx->tail_pkt->size) 
> > {
> > +        for (VAAPIEncodePicture *tmp = ctx->pic_start; tmp; tmp =
> > tmp->next) {
> > +            if (tmp->type == PICTURE_TYPE_B && tmp->pts < ctx-
> > >tail_pkt->pts)
> > +                break;
> > +            else if (!tmp->next) {
> > +                av_packet_move_ref(pkt, ctx->tail_pkt);
> > +                goto end;
> > +            }
> > +        }
> > +    }
> > +
> >       err = ff_encode_get_frame(avctx, frame);
> >       if (err < 0 && err != AVERROR_EOF)
> >           return err;
> > @@ -1356,17 +1433,21 @@ int
> > ff_vaapi_encode_receive_packet(AVCodecContext *avctx, AVPacket
> > *pkt)
> >           return err;
> >       }
> >   
> > -    vaapi_encode_set_output_timestamp(avctx, pic, pkt);
> > -    av_log(avctx, AV_LOG_DEBUG, "Output packet: pts %"PRId64", dts
> > %"PRId64", "
> > -           "size %u bytes.\n", pkt->pts, pkt->dts, pkt->size);
> > -
> >       ctx->output_order = pic->encode_order;
> >       vaapi_encode_clear_old(avctx);
> >   
> > +    /** loop to get an available pkt in encoder flushing. */
> > +    if (ctx->end_of_stream && !pkt->size)
> > +        goto start;
> > +
> > +end:
> > +    if (pkt->size)
> > +        av_log(avctx, AV_LOG_DEBUG, "Output packet: pts %"PRId64",
> > dts %"PRId64", "
> > +               "size %u bytes.\n", pkt->pts, pkt->dts, pkt->size);
> > +
> >       return 0;
> >   }
> >   
> > -
> >   static av_cold void vaapi_encode_add_global_param(AVCodecContext
> > *avctx, int type,
> >                                                     void *buffer,
> > size_t size)
> >   {
> > @@ -2667,6 +2748,12 @@ av_cold int
> > ff_vaapi_encode_init(AVCodecContext *avctx)
> >       ctx->device = (AVHWDeviceContext*)ctx->device_ref->data;
> >       ctx->hwctx = ctx->device->hwctx;
> >   
> > +    ctx->tail_pkt = av_packet_alloc();
> > +    if (!ctx->tail_pkt) {
> > +        err = AVERROR(ENOMEM);
> > +        goto fail;
> > +    }
> > +
> >       err = vaapi_encode_profile_entrypoint(avctx);
> >       if (err < 0)
> >           goto fail;
> > @@ -2859,9 +2946,11 @@ av_cold int
> > ff_vaapi_encode_close(AVCodecContext *avctx)
> >       }
> >   
> >       av_frame_free(&ctx->frame);
> > +    av_packet_free(&ctx->tail_pkt);
> >   
> >       av_freep(&ctx->codec_sequence_params);
> >       av_freep(&ctx->codec_picture_params);
> > +    av_freep(&ctx->header_data);
> >       av_fifo_freep2(&ctx->encode_fifo);
> >   
> >       av_buffer_unref(&ctx->recon_frames_ref);
> > diff --git a/libavcodec/vaapi_encode.h b/libavcodec/vaapi_encode.h
> > index d5452a37b3..03df8d6d46 100644
> > --- a/libavcodec/vaapi_encode.h
> > +++ b/libavcodec/vaapi_encode.h
> > @@ -133,6 +133,11 @@ typedef struct VAAPIEncodePicture {
> >   
> >       int          nb_slices;
> >       VAAPIEncodeSlice *slices;
> > +
> > +    /** Tail data of current pic, used only for repeat header of
> > AV1. */
> > +    char tail_data[MAX_PARAM_BUFFER_SIZE];
> > +    /** Byte length of tail_data. */
> > +    size_t tail_size;
> >   } VAAPIEncodePicture;
> >   
> >   typedef struct VAAPIEncodeProfile {
> > @@ -367,6 +372,13 @@ typedef struct VAAPIEncodeContext {
> >       AVFifo          *encode_fifo;
> >       // Max number of frame buffered in encoder.
> >       int             async_depth;
> > +
> > +    /** Head data for current output pkt, used only for AV1. */
> > +    void  *header_data;
> > +    size_t header_data_size;
> > +
> > +    /** Store av1 repeat frame header pkt. */
> > +    AVPacket *tail_pkt;
> >   } VAAPIEncodeContext;
> >   
> >   enum {
> > diff --git a/libavcodec/vaapi_encode_av1.c
> > b/libavcodec/vaapi_encode_av1.c
> > new file mode 100644
> > index 0000000000..4c8a9d995d
> > --- /dev/null
> > +++ b/libavcodec/vaapi_encode_av1.c
> > @@ -0,0 +1,1229 @@
> > +/*
> > + * Copyright (c) 2023 Intel Corporation
> > + *
> > + * This file is part of FFmpeg.
> > + *
> > + * FFmpeg is free software; you can redistribute it and/or
> > + * modify it under the terms of the GNU Lesser General Public
> > + * License as published by the Free Software Foundation; either
> > + * version 2.1 of the License, or (at your option) any later
> > version.
> > + *
> > + * FFmpeg is distributed in the hope that it will be useful,
> > + * but WITHOUT ANY WARRANTY; without even the implied warranty of
> > + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
> > GNU
> > + * Lesser General Public License for more details.
> > + *
> > + * You should have received a copy of the GNU Lesser General
> > Public
> > + * License along with FFmpeg; if not, write to the Free Software
> > + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
> > 02110-1301 USA
> > + */
> > +
> > +#include <va/va.h>
> > +#include <va/va_enc_av1.h>
> > +
> > +#include "libavutil/pixdesc.h"
> > +#include "libavutil/opt.h"
> > +
> > +#include "cbs_av1.h"
> > +#include "put_bits.h"
> > +#include "codec_internal.h"
> > +#include "av1_levels.h"
> > +#include "vaapi_encode.h"
> > +
> > +#define AV1_MAX_QUANT 255
> > +
> > +typedef struct VAAPIEncodeAV1Picture {
> > +    int64_t last_idr_frame;
> > +    int slot;
> > +} VAAPIEncodeAV1Picture;
> > +
> > +typedef struct VAAPIEncodeAV1Context {
> > +    VAAPIEncodeContext common;
> > +    AV1RawOBU sh; /**< sequence header.*/
> > +    AV1RawOBU fh; /**< frame header.*/
> > +    CodedBitstreamContext *cbc;
> > +    CodedBitstreamFragment current_obu;
> > +    VAConfigAttribValEncAV1 attr;
> > +    VAConfigAttribValEncAV1Ext1 attr_ext1;
> > +    VAConfigAttribValEncAV1Ext2 attr_ext2;
> > +
> > +    char sh_data[MAX_PARAM_BUFFER_SIZE]; /**< coded sequence
> > header data. */
> > +    size_t sh_data_len; /**< bit length of sh_data. */
> > +    char fh_data[MAX_PARAM_BUFFER_SIZE]; /**< coded frame header
> > data. */
> > +    size_t fh_data_len; /**< bit length of fh_data. */
> > +
> > +    uint8_t uniform_tile;
> > +    uint8_t use_128x128_superblock;
> > +    int sb_cols;
> > +    int sb_rows;
> > +    int tile_cols_log2;
> > +    int tile_rows_log2;
> > +    int max_tile_width_sb;
> > +    int max_tile_height_sb;
> > +    uint8_t width_in_sbs_minus_1[AV1_MAX_TILE_COLS];
> > +    uint8_t height_in_sbs_minus_1[AV1_MAX_TILE_ROWS];
> > +
> > +    int min_log2_tile_cols;
> > +    int max_log2_tile_cols;
> > +    int min_log2_tile_rows;
> > +    int max_log2_tile_rows;
> > +
> > +    int q_idx_idr;
> > +    int q_idx_p;
> > +    int q_idx_b;
> > +
> > +    /** user options */
> > +    int profile;
> > +    int level;
> > +    int tier;
> > +    int tile_cols, tile_rows;
> > +    int tile_groups;
> > +} VAAPIEncodeAV1Context;
> > +
> > +static av_cold int vaapi_encode_av1_configure(AVCodecContext
> > *avctx)
> > +{
> > +    VAAPIEncodeContext     *ctx = avctx->priv_data;
> > +    VAAPIEncodeAV1Context *priv = avctx->priv_data;
> > +    int ret;
> > +
> > +    ret = ff_cbs_init(&priv->cbc, AV_CODEC_ID_AV1, avctx);
> > +    if (ret < 0)
> > +        return ret;
> > +
> > +    if (ctx->rc_mode->quality) {
> > +        priv->q_idx_p = av_clip(ctx->rc_quality, 0,
> > AV1_MAX_QUANT);
> > +        if (fabs(avctx->i_quant_factor) > 0.0)
> > +            priv->q_idx_idr =
> > +                av_clip((fabs(avctx->i_quant_factor) * priv-
> > >q_idx_p  +
> > +                         avctx->i_quant_offset) + 0.5,
> > +                        0, AV1_MAX_QUANT);
> > +        else
> > +            priv->q_idx_idr = priv->q_idx_p;
> > +
> > +        if (fabs(avctx->b_quant_factor) > 0.0)
> > +            priv->q_idx_b =
> > +                av_clip((fabs(avctx->b_quant_factor) * priv-
> > >q_idx_p  +
> > +                         avctx->b_quant_offset) + 0.5,
> > +                        0, AV1_MAX_QUANT);
> > +        else
> > +            priv->q_idx_b = priv->q_idx_p;
> > +    } else {
> > +        /** Arbitrary value */
> > +        priv->q_idx_idr = priv->q_idx_p = priv->q_idx_b = 128;
> > +    }
> 
> Are there any alignment requirements here?  (If I gave it an input
> which is, say, 1361x1, would that work?)

I didn't get you. Which parameter here needs to be aligned? The max/min
resolutions supported by the hardware are checked elsewhere. With
1361x1 the error will be something like:

"Hardware does not support encoding at size 1376x16 (constraints: width
32-8192 height 32-8192)."

> 
> > +
> > +    return 0;
> > +}
> > +
> > +static int vaapi_encode_av1_add_obu(AVCodecContext *avctx,
> > +                                    CodedBitstreamFragment *au,
> > +                                    uint8_t type,
> > +                                    void *obu_unit)
> > +{
> > +    int ret;
> > +
> > +    ret = ff_cbs_insert_unit_content(au, -1,
> > +                                     type, obu_unit, NULL);
> > +    if (ret < 0) {
> > +        av_log(avctx, AV_LOG_ERROR, "Failed to add OBU unit: "
> > +               "type = %d.\n", type);
> > +        return ret;
> > +    }
> > +
> > +    return 0;
> > +}
> > +
> > +static int vaapi_encode_av1_write_obu(AVCodecContext *avctx,
> > +                                      char *data, size_t
> > *data_len,
> > +                                      CodedBitstreamFragment *bs)
> > +{
> > +    VAAPIEncodeAV1Context *priv = avctx->priv_data;
> > +    int ret;
> > +
> > +    ret = ff_cbs_write_fragment_data(priv->cbc, bs);
> > +    if (ret < 0) {
> > +        av_log(avctx, AV_LOG_ERROR, "Failed to write packed
> > header.\n");
> > +        return ret;
> > +    }
> > +
> > +    if ((size_t)8 * MAX_PARAM_BUFFER_SIZE < 8 * bs->data_size -
> > bs->data_bit_padding) {
> > +        av_log(avctx, AV_LOG_ERROR, "Access unit too large: "
> > +               "%zu < %zu.\n", (size_t)8 * MAX_PARAM_BUFFER_SIZE,
> > +               8 * bs->data_size - bs->data_bit_padding);
> > +        return AVERROR(ENOSPC);
> > +    }
> > +
> > +    memcpy(data, bs->data, bs->data_size);
> > +    *data_len = 8 * bs->data_size - bs->data_bit_padding;
> > +
> > +    return 0;
> > +}
> > +
> > +static int get_relative_dist(const AV1RawSequenceHeader *seq,
> > +                             unsigned int a, unsigned int b)
> > +{
> > +    unsigned int diff, m;
> > +    if (!seq->enable_order_hint)
> > +        return 0;
> > +    diff = a - b;
> > +    m = 1 << seq->order_hint_bits_minus_1;
> > +    diff = (diff & (m - 1)) - (diff & m);
> > +    return diff;
> > +}
> > +
> > +static int write_ns(PutBitContext *pbc, uint32_t n, uint32_t
> > value)
> > +{
> > +    uint32_t w, m, v, extra_bit;
> > +
> > +    w = av_log2(n) + 1;
> > +    m = (1 << w) - n;
> > +
> > +    if (value < m) {
> > +        v = value;
> > +        put_bits(pbc, w - 1, v);
> > +    } else {
> > +        v = m + ((value - m) >> 1);
> > +        extra_bit = (value - m) & 1;
> > +        put_bits(pbc, w - 1, v);
> > +        put_bits(pbc, 1, extra_bit);
> > +    }
> > +
> > +    return 0;
> > +}
> > +
> > +/**
> > + * This API provide the minmum implemention according current
> > enabled features
> > + * in frame header. If more features will enable in furture,
> > please make sure
> > + * the relative flags of features should be packed correctly into
> > frame header
> > + * obu in this API.
> > + */
> > +static int vaapi_encode_av1_write_frame_header(AVCodecContext
> > *avctx,
> > +                                               VAAPIEncodePicture
> > *pic,
> > +                                               char *data, size_t
> > *data_len)
> > +{
> > +    VAAPIEncodeContext              *ctx = avctx->priv_data;
> > +    VAAPIEncodeAV1Context          *priv = avctx->priv_data;
> > +    AV1RawOBU                    *fh_obu = &priv->fh;
> > +    AV1RawOBU                    *sh_obu = &priv->sh;
> > +    AV1RawFrameHeader                *fh = &fh_obu-
> > >obu.frame.header;
> > +    AV1RawSequenceHeader             *sh = &sh_obu-
> > >obu.sequence_header;
> > +    VAEncPictureParameterBufferAV1 *vpic = pic-
> > >codec_picture_params;
> > +    PutBitContext pbc, pbc_tmp;
> > +    uint8_t byte;
> > +    int qindex, coded_lossless;
> > +    int id_len, frame_is_intra, skip_mode_allowed;
> > +    int start, payload_bits, obu_size, obu_size_len;
> > +    int qindex_offset, loopfilter_offset;
> > +    int cdef_start_offset, cdef_end_offset;
> > +    int i;
> > +
> > +    init_put_bits(&pbc, data, MAX_PARAM_BUFFER_SIZE);
> > +
> > +    /** obu header */
> > +    put_bits(&pbc, 1, fh_obu->header.obu_forbidden_bit);
> > +    put_bits(&pbc, 4, fh_obu->header.obu_type);
> > +    put_bits(&pbc, 1, fh_obu->header.obu_extension_flag);
> > +    put_bits(&pbc, 1, fh_obu->header.obu_has_size_field);
> > +    put_bits(&pbc, 1, fh_obu->header.obu_reserved_1bit);
> > +
> > +    /** record pbc status to re-write obu size later. */
> > +    if (fh_obu->header.obu_has_size_field) {
> > +        pbc_tmp = pbc;
> > +        put_bits32(&pbc, 0);
> > +        put_bits32(&pbc, 0);
> > +    }
> > +
> > +    start = put_bits_count(&pbc);
> > +
> > +    /** uncompressed_header() */
> > +    if (sh->frame_id_numbers_present_flag)
> > +        id_len = sh->additional_frame_id_length_minus_1 +
> > +                 sh->delta_frame_id_length_minus_2 + 3;
> > +
> > +    frame_is_intra = (fh->frame_type == AV1_FRAME_KEY ||
> > +                      fh->frame_type == AV1_FRAME_INTRA_ONLY);
> > +    if (!sh->reduced_still_picture_header) {
> > +        put_bits(&pbc, 1, fh->show_existing_frame);
> > +        if (fh->show_existing_frame) {
> > +            put_bits(&pbc, 3, fh->frame_to_show_map_idx);
> > +            goto trailing_bits;
> > +        }
> > +        if (sh->frame_id_numbers_present_flag)
> > +            put_bits(&pbc, id_len, fh->display_frame_id);
> > +
> > +        put_bits(&pbc, 2, fh->frame_type);
> > +        put_bits(&pbc, 1, fh->show_frame);
> > +        if (!fh->show_frame)
> > +            put_bits(&pbc, 1, fh->showable_frame);
> > +
> > +        if (!(fh->frame_type == AV1_FRAME_SWITCH ||
> > +            (fh->frame_type == AV1_FRAME_KEY && fh->show_frame)))
> > +            put_bits(&pbc, 1, fh->error_resilient_mode);
> > +    }
> > +
> > +    put_bits(&pbc, 1, fh->disable_cdf_update);
> > +    if (sh->seq_force_screen_content_tools ==
> > AV1_SELECT_SCREEN_CONTENT_TOOLS)
> > +        put_bits(&pbc, 1, fh->allow_screen_content_tools);
> > +
> > +    if (fh->allow_screen_content_tools && sh->seq_force_integer_mv 
> > == AV1_SELECT_INTEGER_MV)
> > +        put_bits(&pbc, 1, fh->force_integer_mv);
> > +
> > +    if (sh->frame_id_numbers_present_flag)
> > +        put_bits(&pbc, id_len, fh->current_frame_id);
> > +
> > +    if (fh->frame_type != AV1_FRAME_SWITCH && !sh-
> > >reduced_still_picture_header)
> > +        put_bits(&pbc, 1, fh->frame_size_override_flag);
> > +
> > +    if (sh->enable_order_hint)
> > +        put_bits(&pbc, sh->order_hint_bits_minus_1 + 1, fh-
> > >order_hint);
> > +
> > +    if (!(frame_is_intra || fh->error_resilient_mode))
> > +        put_bits(&pbc, 3, fh->primary_ref_frame);
> > +
> > +    if (!(fh->frame_type == AV1_FRAME_SWITCH ||
> > +        fh->frame_type == AV1_FRAME_KEY && fh->show_frame))
> > +        put_bits(&pbc, 8, fh->refresh_frame_flags);
> > +
> > +    if (frame_is_intra) {
> > +        /** render_size() */
> > +         put_bits(&pbc, 1, fh->render_and_frame_size_different);
> > +    } else {
> > +        if (!frame_is_intra && sh->enable_order_hint)
> > +            put_bits(&pbc, 1, fh->frame_refs_short_signaling);
> > +
> > +        for (i = 0; i < AV1_REFS_PER_FRAME; i++) {
> > +            if (!fh->frame_refs_short_signaling)
> > +                put_bits(&pbc, 3, fh->ref_frame_idx[i]);
> > +        }
> > +
> > +        if (!(fh->frame_size_override_flag && !fh-
> > >error_resilient_mode))
> > +            put_bits(&pbc, 1, fh-
> > >render_and_frame_size_different);
> > +
> > +        if (!fh->force_integer_mv)
> > +            put_bits(&pbc, 1, fh->allow_high_precision_mv);
> > +
> > +        /** read_interpolation_filter() */
> > +        put_bits(&pbc, 1, fh->is_filter_switchable);
> > +        if (!fh->is_filter_switchable)
> > +            put_bits(&pbc, 2, fh->interpolation_filter);
> > +
> > +        put_bits(&pbc, 1, fh->is_motion_mode_switchable);
> > +    }
> > +
> > +    if (!(sh->reduced_still_picture_header || fh-
> > >disable_cdf_update))
> > +        put_bits(&pbc, 1, fh->disable_frame_end_update_cdf);
> > +
> > +    /** tile_info() */
> > +    put_bits(&pbc, 1, fh->uniform_tile_spacing_flag);
> > +    if (fh->uniform_tile_spacing_flag) {
> > +        for (i = 0; i < priv->tile_cols_log2 - priv-
> > >min_log2_tile_cols; i++) {
> > +            put_bits(&pbc, 1, 1);
> > +        }
> > +        if (priv->tile_cols_log2 < priv->max_log2_tile_cols)
> > +            put_bits(&pbc, 1, 0);
> > +
> > +        for (i = 0; i < priv->tile_rows_log2 - priv-
> > >min_log2_tile_rows; i++) {
> > +            put_bits(&pbc, 1, 1);
> > +        }
> > +        if (priv->tile_rows_log2 < priv->max_log2_tile_rows)
> > +            put_bits(&pbc, 1, 0);
> > +    } else {
> > +        int start_sb = 0;
> > +        int max_width, max_height;
> > +        for (i = 0; start_sb < priv->sb_cols; i++) {
> > +            max_width = FFMIN(priv->sb_cols - start_sb, priv-
> > >max_tile_width_sb);
> > +            write_ns(&pbc, max_width, fh-
> > >width_in_sbs_minus_1[i]);
> > +            start_sb += fh->width_in_sbs_minus_1[i] + 1;
> > +        }
> > +
> > +        start_sb = 0;
> > +        for (i = 0; start_sb < priv->sb_rows; i++) {
> > +            max_height = FFMIN(priv->sb_rows - start_sb, priv-
> > >max_tile_height_sb);
> > +            write_ns(&pbc, max_height, fh-
> > >height_in_sbs_minus_1[i]);
> > +            start_sb += fh->height_in_sbs_minus_1[i] + 1;
> > +        }
> > +    }
> > +
> > +    if (priv->tile_cols_log2 || priv->tile_rows_log2) {
> > +        put_bits(&pbc, priv->tile_cols_log2 + priv-
> > >tile_rows_log2, fh->context_update_tile_id);
> > +        put_bits(&pbc, 2, fh->tile_size_bytes_minus1);
> > +    }
> > +
> > +    qindex_offset = put_bits_count(&pbc);
> > +    /** quantization_params() */
> > +    put_bits(&pbc, 8, fh->base_q_idx);
> > +    put_bits(&pbc, 1, fh->delta_q_y_dc);
> > +    put_bits(&pbc, 1, fh->delta_q_u_dc);
> > +    put_bits(&pbc, 1, fh->delta_q_u_ac);
> > +    put_bits(&pbc, 1, fh->using_qmatrix);
> > +
> > +    /** segmentation_params() */
> > +    put_bits(&pbc, 1, fh->segmentation_enabled);
> > +
> > +    /** delta_q_params() */
> > +    if (fh->base_q_idx)
> > +        put_bits(&pbc, 1, fh->delta_q_present);
> > +    if (fh->delta_q_present)
> > +        put_bits(&pbc, 2, fh->delta_q_res);
> > +
> > +    /** delta_lf_params() */
> > +    if (fh->delta_q_present) {
> > +        if (!fh->allow_intrabc)
> > +            put_bits(&pbc, 1, fh->delta_lf_present);
> > +        if (fh->delta_lf_present) {
> > +            put_bits(&pbc, 2, fh->delta_lf_res);
> > +            put_bits(&pbc, 1, fh->delta_lf_multi);
> > +        }
> > +    }
> > +
> > +    /** codelossless */
> > +    coded_lossless = 1;
> > +    for (i = 0; i < AV1_MAX_SEGMENTS; i++) {
> > +        if (fh->segmentation_enabled && fh-
> > >feature_enabled[i][AV1_SEG_LVL_ALT_Q])
> > +            qindex = fh->base_q_idx + fh-
> > >feature_value[i][AV1_SEG_LVL_ALT_Q];
> > +        else
> > +            qindex = fh->base_q_idx;
> > +        qindex = av_clip_uintp2(qindex, 8);
> > +
> > +        if (qindex || fh->delta_q_y_dc || fh->delta_q_u_ac || fh-
> > >delta_q_u_dc ||
> > +            fh->delta_q_v_ac || fh->delta_q_v_dc)
> > +            coded_lossless = 0;
> > +    }
> > +
> > +    loopfilter_offset = put_bits_count(&pbc);
> > +    /** loop_filter_params() */
> > +    if (!(coded_lossless || fh->allow_intrabc)) {
> > +        put_bits(&pbc, 6, fh->loop_filter_level[0]);
> > +        put_bits(&pbc, 6, fh->loop_filter_level[1]);
> > +        if (fh->loop_filter_level[0] || fh->loop_filter_level[1])
> > {
> > +            put_bits(&pbc, 6, fh->loop_filter_level[2]);
> > +            put_bits(&pbc, 6, fh->loop_filter_level[3]);
> > +        }
> > +        put_bits(&pbc, 3, fh->loop_filter_sharpness);
> > +        put_bits(&pbc, 1, fh->loop_filter_delta_enabled);
> > +    }
> > +
> > +    cdef_start_offset = put_bits_count(&pbc);
> > +    /** cdef_params() */
> > +    if (!(coded_lossless || fh->allow_intrabc || !sh-
> > >enable_cdef)) {
> > +        put_bits(&pbc, 2, fh->cdef_damping_minus_3);
> > +        put_bits(&pbc, 2, fh->cdef_bits);
> > +        for (i = 0; i < (1 << fh->cdef_bits); i++) {
> > +            put_bits(&pbc, 4, fh->cdef_y_pri_strength[i]);
> > +            put_bits(&pbc, 2, fh->cdef_y_sec_strength[i]);
> > +            put_bits(&pbc, 4, fh->cdef_uv_pri_strength[i]);
> > +            put_bits(&pbc, 2, fh->cdef_uv_sec_strength[i]);
> > +        }
> > +    }
> > +    cdef_end_offset = put_bits_count(&pbc);
> > +
> > +    /** read_tx_mode() */
> > +    if (fh->tx_mode == AV1_TX_MODE_SELECT)
> > +        put_bits(&pbc, 1, 1);
> > +    else
> > +        put_bits(&pbc, 1, 0);
> > +
> > +    /** frame_reference_mode() */
> > +    if (!frame_is_intra)
> > +        put_bits(&pbc, 1, fh->reference_select);
> > +
> > +    /** skip_mode_params() */
> > +    if (frame_is_intra || !fh->reference_select || !sh-
> > >enable_order_hint)
> > +        skip_mode_allowed = 0;
> > +    else {
> > +        int forward_idx, backward_idx;
> > +        int ref_hint, forward_hint, backward_hint;
> > +
> > +        forward_idx = -1;
> > +        backward_idx = -1;
> > +
> > +        for (i = 0; i < AV1_REFS_PER_FRAME; i++) {
> > +            ref_hint = fh->ref_order_hint[fh->ref_frame_idx[i]];
> > +            if (get_relative_dist(sh, ref_hint, fh->order_hint) <
> > 0) {
> > +                if (forward_idx < 0 || get_relative_dist(sh,
> > ref_hint, forward_idx) > 0) {
> > +                    forward_idx = i;
> > +                    forward_hint = ref_hint;
> > +                }
> > +            } else if (get_relative_dist(sh, ref_hint, fh-
> > >order_hint) > 0) {
> > +                if (backward_idx < 0 || get_relative_dist(sh,
> > ref_hint, backward_hint) < 0) {
> > +                    backward_idx = i;
> > +                    backward_hint = ref_hint;
> > +                }
> > +            }
> > +        }
> > +        if (forward_idx < 0)
> > +            skip_mode_allowed = 0;
> > +        else if (backward_idx >= 0)
> > +            skip_mode_allowed = 1;
> > +        else {
> > +            int second_forward_idx, second_forward_hint;
> > +            second_forward_idx = -1;
> > +            for (i = 0; i < AV1_REFS_PER_FRAME; i++) {
> > +                ref_hint = fh->ref_order_hint[fh-
> > >ref_frame_idx[i]];
> > +                if (get_relative_dist(sh, ref_hint, forward_hint)
> > < 0) {
> > +                    if (second_forward_idx < 0 ||
> > get_relative_dist(sh, ref_hint, second_forward_hint) > 0){
> > +                        second_forward_idx = i;
> > +                        second_forward_hint = ref_hint;
> > +                    }
> > +                }
> > +            }
> > +            if (second_forward_idx < 0)
> > +                skip_mode_allowed = 0;
> > +            else
> > +                skip_mode_allowed = 1;
> > +        }
> > +    }
> > +
> > +    if (skip_mode_allowed)
> > +        put_bits(&pbc, 1, fh->skip_mode_present);
> > +
> > +    put_bits(&pbc, 1, fh->reduced_tx_set);
> > +
> > +    /** global_motion_params() */
> > +    if (!frame_is_intra) {
> > +        for (i = AV1_REF_FRAME_LAST; i <= AV1_REF_FRAME_ALTREF;
> > i++) {
> > +            put_bits(&pbc, 1, fh->is_global[i]);
> > +            if (fh->is_global[i]) {
> > +                put_bits(&pbc, 1, fh->is_rot_zoom[i]);
> > +                if (!fh->is_rot_zoom[i])
> > +                    put_bits(&pbc, 1, fh->is_translation[i]);
> > +            }
> > +        }
> > +    }
> > +
> > +trailing_bits:
> > +    payload_bits = put_bits_count(&pbc) - start;
> > +
> > +    /** trailing_bits() */
> > +    put_bits(&pbc, 1, 1);
> > +    obu_size = (put_bits_count(&pbc) - start + 7) / 8;
> > +    for (i = 0; i < obu_size * 8 - payload_bits - 1; i++)
> > +        put_bits(&pbc, 1, 0);
> > +
> > +    flush_put_bits(&pbc);
> > +    *data_len = put_bits_count(&pbc);
> > +
> > +    /** update obu size in bitstream */
> > +    if (fh_obu->header.obu_has_size_field) {
> > +        obu_size_len = priv->attr_ext2.bits.obu_size_bytes_minus1
> > + 1;
> > +        for (i = 0; i < obu_size_len; i++) {
> > +            byte = obu_size >> (7 * i) & 0x7f;
> > +            if (i < obu_size_len - 1)
> > +                byte |= 0x80;
> > +            put_bits(&pbc_tmp, 8, byte);
> > +        }
> > +        flush_put_bits(&pbc_tmp);
> > +        memmove(pbc_tmp.buf_ptr, pbc_tmp.buf_ptr + (8 -
> > obu_size_len), obu_size);
> > +        *data_len -= (8 - obu_size_len) * 8;
> > +    }
> 
> Why is there an incomplete duplicate of the cbs_av1 header writing
> code here?

It records some positions/sizes in the bitstream that VAAPI needs,
like the qindex/loopfilter/cdef bit offsets and the cdef parameter
size in bits. It's not reasonable to add such VAAPI-specific outputs
to CBS.
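
The local packing is just to be able to sample put_bits_count() at the
points the driver cares about, roughly like this (illustrative sketch
only, not the exact patch code):

#include "put_bits.h"

/* Write the quantization part of the uncompressed frame header and
 * record the bit offset of base_q_idx so it can be passed to the VA
 * picture params (bit_offset_qindex) for driver-side rewriting. */
static void write_qindex_with_offset(PutBitContext *pbc, int base_q_idx,
                                     int *qindex_bit_offset)
{
    /* ... earlier frame header fields already written into pbc ... */

    *qindex_bit_offset = put_bits_count(pbc); /* offset CBS cannot report */
    put_bits(pbc, 8, base_q_idx);             /* quantization_params() */

    /* ... remaining fields and trailing bits follow ... */
}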

> 
> > +
> > +    if (fh->show_existing_frame)
> > +        return 0;
> > +
> > +    if (!(ctx->va_rc_mode & VA_RC_CQP)) {
> > +        vpic->min_base_qindex = av_clip(avctx->qmin, 1,
> > AV1_MAX_QUANT);
> > +        vpic->max_base_qindex = av_clip(avctx->qmax, 1,
> > AV1_MAX_QUANT);
> > +
> > +        vpic->bit_offset_qindex            = qindex_offset - (8 -
> > obu_size_len) * 8;
> > +        vpic->bit_offset_loopfilter_params = loopfilter_offset -
> > (8 - obu_size_len) * 8;
> > +        vpic->bit_offset_cdef_params       = cdef_start_offset -
> > (8 - obu_size_len) * 8;
> > +        vpic->size_in_bits_cdef_params     = cdef_end_offset -
> > cdef_start_offset;
> > +        vpic->size_in_bits_frame_hdr_obu   = *data_len;
> > +
> > +        vpic->byte_offset_frame_hdr_obu_size = (((pic->type ==
> > PICTURE_TYPE_IDR) ?
> > +                                               priv->sh_data_len /
> > 8 : 0) +
> > +                                               (fh_obu-
> > >header.obu_extension_flag ?
> > +                                               2 : 1));
> > +    }
> > +
> > +    return 0;
> > +}
> > +
> > +static int tile_log2(int blkSize, int target) {
> > +    int k;
> > +    for (k = 0; (blkSize << k) < target; k++);
> > +    return k;
> > +}
> > +
> > +static int vaapi_encode_av1_set_tile(AVCodecContext *avctx)
> > +{
> > +    VAAPIEncodeAV1Context *priv = avctx->priv_data;
> > +    int mi_cols, mi_rows, sb_shift, sb_size;
> > +    int max_tile_area_sb, max_tile_area_sb_varied;
> > +    int tile_width_sb, tile_height_sb, widest_tile_sb;
> > +    int min_log2_tiles;
> > +    int tile_rows_tmp, i;
> > +
> > +    if (priv->tile_cols > AV1_MAX_TILE_COLS ||
> > +        priv->tile_rows > AV1_MAX_TILE_ROWS) {
> > +        av_log(avctx, AV_LOG_ERROR, "Invalid tile number %dx%d,
> > should less than %dx%d.\n",
> > +               priv->tile_cols, priv->tile_rows,
> > AV1_MAX_TILE_COLS, AV1_MAX_TILE_ROWS);
> 
> Can't this be a constraint on the option rather than a special check
> here?

Tiles are set with the AV_OPT_TYPE_IMAGE_SIZE option type, which
aligns with HEVC, and that type doesn't support a min/max check.

> 
> > +        return AVERROR(EINVAL);
> > +    }
> > +
> > +    mi_cols = 2 * ((avctx->width + 7) >> 3);
> > +    mi_rows = 2 * ((avctx->height + 7) >> 3);
> > +    priv->sb_cols = priv->use_128x128_superblock ?
> > +                    ((mi_cols + 31) >> 5) : ((mi_cols + 15) >> 4);
> > +    priv->sb_rows = priv->use_128x128_superblock ?
> > +                    ((mi_rows + 31) >> 5) : ((mi_rows + 15) >> 4);
> > +    sb_shift = priv->use_128x128_superblock ? 5 : 4;
> > +    sb_size  = sb_shift + 2;
> > +    priv->max_tile_width_sb = AV1_MAX_TILE_WIDTH >> sb_size;
> > +    max_tile_area_sb = AV1_MAX_TILE_AREA  >> (2 * sb_size);
> > +
> > +    priv->min_log2_tile_cols = tile_log2(priv->max_tile_width_sb,
> > priv->sb_cols);
> > +    priv->max_log2_tile_cols = tile_log2(1, FFMIN(priv->sb_cols,
> > AV1_MAX_TILE_COLS));
> > +    priv->max_log2_tile_rows = tile_log2(1, FFMIN(priv->sb_rows,
> > AV1_MAX_TILE_ROWS));
> > +    min_log2_tiles = FFMAX(priv->min_log2_tile_cols,
> > +                           tile_log2(max_tile_area_sb, priv-
> > >sb_rows * priv->sb_cols));
> > +
> > +    if (priv->tile_cols != av_clip(priv->tile_cols, (priv->sb_cols 
> > + priv->max_tile_width_sb - 1) / priv->max_tile_width_sb, priv-
> > >sb_cols)) {
> > +        priv->tile_cols = av_clip(priv->tile_cols, (priv->sb_cols
> > + priv->max_tile_width_sb - 1) / priv->max_tile_width_sb, priv-
> > >sb_cols);
> > +        av_log(avctx, AV_LOG_WARNING, "Invalid tile cols, correct
> > to %d.\n", priv->tile_cols);
> > +    }
> 
> If the user explicitly asked for a number which isn't possible then
> you probably want to fail here (see comment about some sort of "auto"
> setting above).

OK, will change it to 'auto' by default.

> 
> > +
> > +    priv->tile_cols_log2 = tile_log2(1, priv->tile_cols);
> > +    tile_width_sb = (priv->sb_cols + (1 << priv->tile_cols_log2) -
> > 1) >>
> > +                    priv->tile_cols_log2;
> > +
> > +    if (priv->tile_rows > priv->sb_rows) {
> > +        priv->tile_rows = priv->sb_rows;
> > +        av_log(avctx, AV_LOG_WARNING, "Invalid tile rows, correct
> > to %d.\n", priv->tile_rows);
> > +    }
> > +
> > +    for (tile_rows_tmp = priv->tile_rows; tile_rows_tmp <= priv-
> > >sb_rows && tile_rows_tmp <= AV1_MAX_TILE_ROWS; tile_rows_tmp++) {
> > +        /** try uniformed tile. */
> > +        priv->tile_rows_log2 = tile_log2(1, tile_rows_tmp);
> > +        if ((priv->sb_cols + tile_width_sb - 1) / tile_width_sb ==
> > priv->tile_cols) {
> > +            for (i = 0; i < priv->tile_cols - 1; i++)
> > +                priv->width_in_sbs_minus_1[i] = tile_width_sb - 1;
> > +            priv->width_in_sbs_minus_1[i] = priv->sb_cols - (priv-
> > >tile_cols - 1) * tile_width_sb - 1;
> > +
> > +            tile_height_sb = (priv->sb_rows + (1 << priv-
> > >tile_rows_log2) - 1) >>
> > +                             priv->tile_rows_log2;
> > +
> > +            if ((priv->sb_rows + tile_height_sb - 1) /
> > tile_height_sb == tile_rows_tmp &&
> > +                tile_height_sb <= max_tile_area_sb /
> > tile_width_sb) {
> > +                for (i = 0; i < tile_rows_tmp - 1; i++)
> > +                    priv->height_in_sbs_minus_1[i] =
> > tile_height_sb - 1;
> > +                priv->height_in_sbs_minus_1[i] = priv->sb_rows -
> > (tile_rows_tmp - 1) * tile_height_sb - 1;
> > +
> > +                priv->uniform_tile = 1;
> > +                priv->min_log2_tile_rows = FFMAX(min_log2_tiles -
> > priv->tile_cols_log2, 0);
> > +
> > +                break;
> > +            }
> > +        }
> > +
> > +        /** try non-uniformed tile. */
> > +        widest_tile_sb = 0;
> > +        for (i = 0; i < priv->tile_cols; i++) {
> > +            priv->width_in_sbs_minus_1[i] = (i + 1) * priv-
> > >sb_cols / priv->tile_cols - i * priv->sb_cols / priv->tile_cols -
> > 1;
> > +            widest_tile_sb = FFMAX(widest_tile_sb, priv-
> > >width_in_sbs_minus_1[i] + 1);
> > +        }
> > +
> > +        if (min_log2_tiles)
> > +            max_tile_area_sb_varied = (priv->sb_rows * priv-
> > >sb_cols) >> (min_log2_tiles + 1);
> > +        else
> > +            max_tile_area_sb_varied = priv->sb_rows * priv-
> > >sb_cols;
> > +        priv->max_tile_height_sb = FFMAX(1,
> > max_tile_area_sb_varied / widest_tile_sb);
> > +
> > +        if (tile_rows_tmp == av_clip(tile_rows_tmp, (priv->sb_rows 
> > + priv->max_tile_height_sb - 1) / priv->max_tile_height_sb, priv-
> > >sb_rows)) {
> > +            for (i = 0; i < tile_rows_tmp; i++)
> > +                priv->height_in_sbs_minus_1[i] = (i + 1) * priv-
> > >sb_rows / tile_rows_tmp - i * priv->sb_rows / tile_rows_tmp - 1;
> > +
> > +            break;
> > +        }
> > +    }
> > +
> > +    if (priv->tile_rows != tile_rows_tmp) {
> > +        priv->tile_rows = tile_rows_tmp;
> > +        av_log(avctx, AV_LOG_WARNING, "Invalid tile rows, correct
> > to %d.\n", priv->tile_rows);
> > +    }
> > +
> > +    /** check if tile cols/rows is supported by driver. */
> > +    if (priv->attr_ext2.bits.max_tile_num_minus1) {
> > +        if ((priv->tile_cols * priv->tile_rows - 1) > priv-
> > >attr_ext2.bits.max_tile_num_minus1) {
> > +            av_log(avctx, AV_LOG_ERROR, "Unsupported tile num %d *
> > %d = %d by driver, "
> > +                   "should be less than %d.\n", priv->tile_cols,
> > priv->tile_rows,
> 
> s/less than/at most/ ?
> 
> > +                   priv->tile_cols * priv->tile_rows,
> > +                   priv->attr_ext2.bits.max_tile_num_minus1 + 1);
> > +            return AVERROR(EINVAL);
> > +        }
> > +    }
> > +    av_log(avctx, AV_LOG_DEBUG, "Setting tile cols/rows to
> > %d/%d.\n",
> > +           priv->tile_cols, priv->tile_rows);
> > +
> > +    /** check if tile group numbers is valid. */
> > +    if (priv->tile_groups > priv->tile_cols * priv->tile_rows) {
> > +        av_log(avctx, AV_LOG_WARNING, "Invalid tile groups number
> > %d, "
> > +        "correct to %d.\n", priv->tile_groups, priv->tile_cols *
> > priv->tile_rows);
> > +        priv->tile_groups = priv->tile_cols * priv->tile_rows;
> > +    }
> > +
> > +    return 0;
> > +}
> > +
> > +static int vaapi_encode_av1_write_sequence_header(AVCodecContext
> > *avctx,
> > +                                                  char *data,
> > size_t *data_len)
> > +{
> > +    VAAPIEncodeAV1Context *priv = avctx->priv_data;
> > +
> > +    memcpy(data, &priv->sh_data, MAX_PARAM_BUFFER_SIZE *
> > sizeof(char));
> > +    *data_len = priv->sh_data_len;
> 
> I'm confused why you write the sequence header in the init function
> and then store it to use here?

The sequence data is kept the same during encoding, so I think it
doesn't need to be re-written for each GOP.

> 
> > +
> > +    return 0;
> > +}
> > +
> > +static int vaapi_encode_av1_init_sequence_params(AVCodecContext
> > *avctx)
> > +{
> > +    VAAPIEncodeContext               *ctx = avctx->priv_data;
> > +    VAAPIEncodeAV1Context           *priv = avctx->priv_data;
> > +    AV1RawOBU                     *sh_obu = &priv->sh;
> > +    AV1RawSequenceHeader              *sh = &sh_obu-
> > >obu.sequence_header;
> > +    VAEncSequenceParameterBufferAV1 *vseq = ctx-
> > >codec_sequence_params;
> > +    CodedBitstreamFragment           *obu = &priv->current_obu;
> > +    const AVPixFmtDescriptor *desc;
> > +    int ret;
> > +
> > +    memset(sh_obu, 0, sizeof(*sh_obu));
> > +    sh_obu->header.obu_type = AV1_OBU_SEQUENCE_HEADER;
> > +
> > +    desc = av_pix_fmt_desc_get(priv->common.input_frames-
> > >sw_format);
> > +    av_assert0(desc);
> > +
> > +    sh->seq_profile  = avctx->profile;
> > +    if (!sh->seq_force_screen_content_tools)
> > +        sh->seq_force_integer_mv = AV1_SELECT_INTEGER_MV;
> > +    sh->frame_width_bits_minus_1  = av_log2(avctx->width);
> > +    sh->frame_height_bits_minus_1 = av_log2(avctx->height);
> > +    sh->max_frame_width_minus_1   = avctx->width - 1;
> > +    sh->max_frame_height_minus_1  = avctx->height - 1;
> > +    sh->seq_tier[0]               = priv->tier;
> > +    /** enable order hint and reserve maximum 8 bits for it by
> > default. */
> > +    sh->enable_order_hint         = 1;
> > +    sh->order_hint_bits_minus_1   = 7;
> > +
> > +    sh->color_config = (AV1RawColorConfig) {
> > +        .high_bitdepth                  = desc->comp[0].depth == 8
> > ? 0 : 1,
> > +        .color_primaries                = avctx->color_primaries,
> > +        .transfer_characteristics       = avctx->color_trc,
> > +        .matrix_coefficients            = avctx->colorspace,
> > +        .color_description_present_flag = (avctx->color_primaries
> > != AVCOL_PRI_UNSPECIFIED ||
> > +                                           avctx-
> > >color_trc       != AVCOL_TRC_UNSPECIFIED ||
> > +                                           avctx-
> > >colorspace      != AVCOL_SPC_UNSPECIFIED),
> > +        .color_range                    = avctx->color_range ==
> > AVCOL_RANGE_JPEG,
> > +        .subsampling_x                  = desc->log2_chroma_w,
> > +        .subsampling_y                  = desc->log2_chroma_h,
> 
> .chroma_sample_position = some function of chroma_sample_location.

There is no chroma_sample_position supported in VAAPI yet.

> 
> > +    };
> > +
> > +    if (avctx->level != FF_LEVEL_UNKNOWN) {
> > +        sh->seq_level_idx[0] = avctx->level;
> > +    } else {
> > +        const AV1LevelDescriptor *level;
> > +        float framerate;
> > +
> > +        if (avctx->framerate.num > 0 && avctx->framerate.den > 0)
> > +            framerate = avctx->framerate.num / avctx-
> > >framerate.den;
> > +        else
> > +            framerate = 0;
> > +
> > +        level = ff_av1_guess_level(avctx->bit_rate, priv->tier,
> > +                                   ctx->surface_width, ctx-
> > >surface_height,
> > +                                   priv->tile_rows * priv-
> > >tile_cols,
> > +                                   priv->tile_cols, framerate);
> > +        if (level) {
> > +            av_log(avctx, AV_LOG_VERBOSE, "Using level %s.\n",
> > level->name);
> > +            sh->seq_level_idx[0] = level->level_idx;
> > +        } else {
> > +            av_log(avctx, AV_LOG_VERBOSE, "Stream will not conform
> > to "
> > +                   "any normal level, using level 6.3 by
> > default.\n");
> > +            sh->seq_level_idx[0] = 19;
> 
> Isn't this exactly what level_idx == 31 is for?
> 
> > +            sh->seq_tier[0] = 1;
> > +        }
> > +    }
> > +    vseq->seq_profile             = sh->seq_profile;
> > +    vseq->seq_level_idx           = sh->seq_level_idx[0];
> > +    vseq->seq_tier                = sh->seq_tier[0];
> > +    vseq->order_hint_bits_minus_1 = sh->order_hint_bits_minus_1;
> > +    vseq->intra_period            = ctx->gop_size;
> > +    vseq->ip_period               = ctx->b_per_p + 1;
> > +
> > +    vseq->seq_fields.bits.enable_order_hint = sh-
> > >enable_order_hint;
> > +
> > +    if (!(ctx->va_rc_mode & VA_RC_CQP)) {
> > +        vseq->bits_per_second = ctx->va_bit_rate;
> > +        vseq->seq_fields.bits.enable_cdef = sh->enable_cdef = 1;
> > +    }
> > +
> > +    ret = vaapi_encode_av1_add_obu(avctx, obu,
> > AV1_OBU_SEQUENCE_HEADER, &priv->sh);
> > +    if (ret < 0)
> > +        goto end;
> > +
> > +    ret = vaapi_encode_av1_write_obu(avctx, priv->sh_data, &priv-
> > >sh_data_len, obu);
> > +    if (ret < 0)
> > +        goto end;
> > +
> > +end:
> > +    ff_cbs_fragment_reset(obu);
> > +    return ret;
> > +}
> > +
> > +static int vaapi_encode_av1_init_picture_params(AVCodecContext
> > *avctx,
> > +                                                VAAPIEncodePicture
> > *pic)
> > +{
> > +    VAAPIEncodeAV1Context          *priv = avctx->priv_data;
> > +    VAAPIEncodeAV1Picture          *hpic = pic->priv_data;
> > +    AV1RawOBU                    *fh_obu = &priv->fh;
> > +    AV1RawFrameHeader                *fh = &fh_obu-
> > >obu.frame.header;
> > +    VAEncPictureParameterBufferAV1 *vpic = pic-
> > >codec_picture_params;
> > +    CodedBitstreamFragment          *obu = &priv->current_obu;
> > +    VAAPIEncodePicture    *ref;
> > +    VAAPIEncodeAV1Picture *href;
> > +    int slot, i;
> > +    int ret;
> > +    static const int8_t
> > default_loop_filter_ref_deltas[AV1_TOTAL_REFS_PER_FRAME] =
> > +        { 1, 0, 0, 0, -1, 0, -1, -1 };
> > +
> > +    memset(fh_obu, 0, sizeof(*fh_obu));
> > +    pic->nb_slices = priv->tile_groups;
> > +    fh_obu->header.obu_type = AV1_OBU_FRAME_HEADER;
> > +    fh_obu->header.obu_has_size_field = 1;
> > +
> > +    switch (pic->type) {
> > +    case PICTURE_TYPE_IDR:
> > +        av_assert0(pic->nb_refs[0] == 0 || pic->nb_refs[1]);
> > +        fh->frame_type = AV1_FRAME_KEY;
> > +        fh->refresh_frame_flags = 0xFF;
> > +        fh->base_q_idx = priv->q_idx_idr;
> > +        hpic->slot = 0;
> > +        hpic->last_idr_frame = pic->display_order;
> > +        break;
> > +    case PICTURE_TYPE_P:
> > +        av_assert0(pic->nb_refs[0]);
> > +        fh->frame_type = AV1_FRAME_INTER;
> > +        fh->base_q_idx = priv->q_idx_p;
> > +        ref = pic->refs[0][pic->nb_refs[0] - 1];
> > +        href = ref->priv_data;
> > +        hpic->slot = !href->slot;
> 
> I'm very confused how all of these multiple-reference stuff can work
> if you only ever put pictures in slots 0 and 1.
> 
> What does it actually look like?  Can you share a stream using it?

An example encoded by:
# ffmpeg -vaapi_device /dev/dri/renderD128 -f lavfi -i testsrc -vf
format=nv12,hwupload -c:v av1_vaapi -vframes 100 -y vaapi_av1.mp4

https://drive.google.com/file/d/1rP_cidjKV1GKzwGzG7-esrhw_rdWyAcG/view

> 
> > +        hpic->last_idr_frame = href->last_idr_frame;
> > +        fh->refresh_frame_flags = 1 << hpic->slot;
> > +
> > +        /** set the nearest frame in L0 as all reference frame. */
> > +        for (i = 0; i < AV1_REFS_PER_FRAME; i++) {
> > +            fh->ref_frame_idx[i] = href->slot;
> > +        }
> > +        fh->primary_ref_frame = href->slot;
> > +        fh->ref_order_hint[href->slot] = ref->display_order -
> > href->last_idr_frame;
> > +        vpic->ref_frame_ctrl_l0.fields.search_idx0 =
> > AV1_REF_FRAME_LAST;
> > +
> > +        /** set the 2nd nearest frame in L0 as Golden frame. */
> > +        if (pic->nb_refs[0] > 1) {
> > +            ref = pic->refs[0][pic->nb_refs[0] - 2];
> > +            href = ref->priv_data;
> > +            fh->ref_frame_idx[3] = href->slot;
> > +            fh->ref_order_hint[href->slot] = ref->display_order -
> > href->last_idr_frame;
> > +            vpic->ref_frame_ctrl_l0.fields.search_idx1 =
> > AV1_REF_FRAME_GOLDEN;
> > +        }
> > +        break;
> > +    case PICTURE_TYPE_B:
> > +        av_assert0(pic->nb_refs[0] && pic->nb_refs[1]);
> > +        fh->frame_type = AV1_FRAME_INTER;
> > +        fh->base_q_idx = priv->q_idx_b;
> > +        fh->refresh_frame_flags = 0x0;
> > +        fh->reference_select = 1;
> > +
> > +        /** B frame will not be referenced, disable its recon
> > frame. */
> > +        vpic->picture_flags.bits.disable_frame_recon = 1;
> 
> It feels like it would be very easy to support all of the B structure
> just like VP9 does here.

It's in my plan together with multi (>2) reference support. It's better
to align with VPL, and that needs some time to test and compare.

> 
> > +
> > +        /** Use LAST_FRAME and BWDREF_FRAME for reference. */
> > +        vpic->ref_frame_ctrl_l0.fields.search_idx0 =
> > AV1_REF_FRAME_LAST;
> > +        vpic->ref_frame_ctrl_l1.fields.search_idx0 =
> > AV1_REF_FRAME_BWDREF;
> > +
> > +        ref                            = pic->refs[0][pic-
> > >nb_refs[0] - 1];
> > +        href                           = ref->priv_data;
> > +        hpic->last_idr_frame           = href->last_idr_frame;
> > +        fh->primary_ref_frame          = href->slot;
> > +        fh->ref_order_hint[href->slot] = ref->display_order -
> > href->last_idr_frame;
> > +        for (i = 0; i < AV1_REF_FRAME_GOLDEN; i++) {
> > +            fh->ref_frame_idx[i] = href->slot;
> > +        }
> > +
> > +        ref                            = pic->refs[1][pic-
> > >nb_refs[1] - 1];
> > +        href                           = ref->priv_data;
> > +        fh->ref_order_hint[href->slot] = ref->display_order -
> > href->last_idr_frame;
> > +        for (i = AV1_REF_FRAME_GOLDEN; i < AV1_REFS_PER_FRAME;
> > i++) {
> > +            fh->ref_frame_idx[i] = href->slot;
> > +        }
> > +        break;
> > +    default:
> > +        av_assert0(0 && "invalid picture type");
> > +    }
> > +
> > +    fh->show_frame                = pic->display_order <= pic-
> > >encode_order;
> > +    fh->showable_frame            = fh->frame_type !=
> > AV1_FRAME_KEY;
> > +    fh->frame_width_minus_1       = avctx->width - 1;
> > +    fh->frame_height_minus_1      = avctx->height - 1;
> > +    fh->render_width_minus_1      = fh->frame_width_minus_1;
> > +    fh->render_height_minus_1     = fh->frame_height_minus_1;
> > +    fh->order_hint                = pic->display_order - hpic-
> > >last_idr_frame;
> 
> Doesn't this overflow?

The max order hint for AV1 is 255; the forced conversion from int64_t
to uint8_t gives the expected (wrapped) result.
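For example, a quick standalone check of the wrap-around (the numbers
are picked only for illustration):

#include <assert.h>
#include <stdint.h>

int main(void)
{
    int64_t display_order  = 300;
    int64_t last_idr_frame = 0;
    /* order_hint_bits_minus_1 is 7, so the hint lives in 8 bits and the
     * narrowing conversion wraps modulo 256: 300 becomes 44. */
    uint8_t order_hint = display_order - last_idr_frame;

    assert(order_hint == 44);
    return 0;
}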

> 
> > +    fh->tile_cols                 = priv->tile_cols;
> > +    fh->tile_rows                 = priv->tile_rows;
> > +    fh->tile_cols_log2            = priv->tile_cols_log2;
> > +    fh->tile_rows_log2            = priv->tile_rows_log2;
> > +    fh->uniform_tile_spacing_flag = priv->uniform_tile;
> > +    fh->tile_size_bytes_minus1    = priv-
> > >attr_ext2.bits.tile_size_bytes_minus1;
> > +    fh->reduced_tx_set            = 1;
> 
> This not being present in the capabilities feels like an omission in
> the API.  That's a large loss for an implementation which does
> support the full transform set.

My personal understanding is that "not being present" means it must be
supported. If any driver is an exception, it can ask for a new
capability to be added to libva.

> 
> > +
> > +    /** ignore ONLY_4x4 mode for codedlossless is not fully
> > implemented. */
> > +    if (priv->attr_ext2.bits.tx_mode_support & 0x04)
> > +        fh->tx_mode = AV1_TX_MODE_SELECT;
> > +    else if (priv->attr_ext2.bits.tx_mode_support & 0x02)
> > +        fh->tx_mode = AV1_TX_MODE_LARGEST;
> > +    else
> > +        return AVERROR(EINVAL);
> 
> What would this case even mean?  The encoder doesn't support any
> transforms?  Seems like a message rather than silent failure would be
> a good idea anyway.

OK, will add a message to let the user know the failure reason.

> 
> > +
> > +    for (i = 0; i < fh->tile_cols; i++)
> > +        fh->width_in_sbs_minus_1[i] = vpic-
> > >width_in_sbs_minus_1[i] = priv->width_in_sbs_minus_1[i];
> > +
> > +    for (i = 0; i < fh->tile_rows; i++)
> > +        fh->height_in_sbs_minus_1[i] = vpic-
> > >height_in_sbs_minus_1[i] = priv->height_in_sbs_minus_1[i];
> 
> Er, what?  Doesn't this fail the inference on the last tile
> width/height if you don't split exactly?

What does "split exactly" mean? All the tile widths/heights, including
the last tile, are computed in vaapi_encode_av1_set_tile. As long as the
tile width/height info in the frame header matches the info passed to
VAAPI, it should be fine.
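A standalone illustration of the non-uniform split used there (assuming
sb_cols = 31 and tile_cols = 3 purely for the example):

#include <stdio.h>

int main(void)
{
    int sb_cols = 31, tile_cols = 3;

    for (int i = 0; i < tile_cols; i++) {
        int w = (i + 1) * sb_cols / tile_cols - i * sb_cols / tile_cols;
        printf("tile %d: width_in_sbs_minus_1 = %d\n", i, w - 1);
    }
    /* Prints 9, 9 and 10, i.e. tile widths of 10, 10 and 11 superblocks,
     * which sum to sb_cols, so the last tile is covered as well. */
    return 0;
}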

> 
> > +
> > +    memcpy(fh->loop_filter_ref_deltas,
> > default_loop_filter_ref_deltas,
> > +           AV1_TOTAL_REFS_PER_FRAME * sizeof(int8_t));
> > +
> > +    if (fh->frame_type == AV1_FRAME_KEY && fh->show_frame) {
> > +        fh->error_resilient_mode = 1;
> > +    }
> > +
> > +    if (fh->frame_type == AV1_FRAME_KEY || fh-
> > >error_resilient_mode)
> > +        fh->primary_ref_frame = AV1_PRIMARY_REF_NONE;
> > +
> > +    vpic->base_qindex          = fh->base_q_idx;
> > +    vpic->frame_width_minus_1  = fh->frame_width_minus_1;
> > +    vpic->frame_height_minus_1 = fh->frame_height_minus_1;
> > +    vpic->primary_ref_frame    = fh->primary_ref_frame;
> > +    vpic->reconstructed_frame  = pic->recon_surface;
> > +    vpic->coded_buf            = pic->output_buffer;
> > +    vpic->tile_cols            = fh->tile_cols;
> > +    vpic->tile_rows            = fh->tile_rows;
> > +    vpic->order_hint           = fh->order_hint;
> > +#if VA_CHECK_VERSION(1, 15, 0)
> > +    vpic->refresh_frame_flags  = fh->refresh_frame_flags;
> > +#endif
> 
> What will the difference be between running in the two versions?  (Is
> it worth supporting the older one?  Cf. bit_depth on VP9.)

I'd prefer to support the older one, in case someone is using an old
version that works well with ffmpeg-vpl (VPL has supported AV1 encoding
for a long time); otherwise they would be confused about why
ffmpeg-vaapi needs a higher VAAPI version while VPL doesn't.

> 
> > +
> > +    vpic->picture_flags.bits.enable_frame_obu     = 0;
> > +    vpic->picture_flags.bits.frame_type           = fh-
> > >frame_type;
> > +    vpic->picture_flags.bits.reduced_tx_set       = fh-
> > >reduced_tx_set;
> > +    vpic->picture_flags.bits.error_resilient_mode = fh-
> > >error_resilient_mode;
> > +
> > +    /** let driver decide to use single or compound reference
> > prediction mode. */
> > +    vpic->mode_control_flags.bits.reference_mode = fh-
> > >reference_select ? 2 : 0;
> > +    vpic->mode_control_flags.bits.tx_mode = fh->tx_mode;
> > +
> > +    vpic->tile_group_obu_hdr_info.bits.obu_has_size_field = 1;
> > +
> > +    /** set reference. */
> > +    for (i = 0; i < AV1_REFS_PER_FRAME; i++)
> > +        vpic->ref_frame_idx[i] = fh->ref_frame_idx[i];
> > +
> > +    for (i = 0; i < FF_ARRAY_ELEMS(vpic->reference_frames); i++)
> > +        vpic->reference_frames[i] = VA_INVALID_SURFACE;
> > +
> > +    for (i = 0; i < MAX_REFERENCE_LIST_NUM; i++) {
> > +        for (int j = 0; j < pic->nb_refs[i]; j++) {
> > +            VAAPIEncodePicture *ref_pic = pic->refs[i][j];
> > +
> > +            slot = ((VAAPIEncodeAV1Picture*)ref_pic->priv_data)-
> > >slot;
> > +            av_assert0(vpic->reference_frames[slot] ==
> > VA_INVALID_SURFACE);
> > +
> > +            vpic->reference_frames[slot] = ref_pic->recon_surface;
> > +        }
> > +    }
> > +
> > +    /** pack frame header, and set va params offset like
> > bit_offset_qindex etc. */
> > +    ret = vaapi_encode_av1_write_frame_header(avctx, pic, priv-
> > >fh_data, &priv->fh_data_len);
> > +    if (ret < 0)
> > +        goto end;
> > +
> > +end:
> > +    ff_cbs_fragment_reset(obu);
> > +    return ret;
> > +}
> > +
> > +static int vaapi_encode_av1_init_slice_params(AVCodecContext
> > *avctx,
> > +                                              VAAPIEncodePicture
> > *pic,
> > +                                              VAAPIEncodeSlice
> > *slice)
> > +{
> > +    VAAPIEncodeAV1Context      *priv = avctx->priv_data;
> > +    VAEncTileGroupBufferAV1  *vslice = slice->codec_slice_params;
> > +    CodedBitstreamAV1Context  *cbctx = priv->cbc->priv_data;
> > +    int div;
> > +
> > +    /** Set tile group info. */
> > +    div = priv->tile_cols * priv->tile_rows / priv->tile_groups;
> > +    vslice->tg_start = slice->index * div;
> > +    if (slice->index == (priv->tile_groups - 1)) {
> > +        vslice->tg_end = priv->tile_cols * priv->tile_rows - 1;
> > +        cbctx->seen_frame_header = 0;
> > +    } else {
> > +        vslice->tg_end = (slice->index + 1) * div - 1;
> > +    }
> > +
> > +    return 0;
> > +}
> > +
> > +static int vaapi_encode_av1_write_picture_header(AVCodecContext
> > *avctx,
> > +                                                 VAAPIEncodePictur
> > e *pic,
> > +                                                 char *data,
> > size_t *data_len)
> > +{
> > +    VAAPIEncodeAV1Context     *priv = avctx->priv_data;
> > +    CodedBitstreamFragment     *obu = &priv->current_obu;
> > +    AV1RawOBU               *fh_obu = &priv->fh;
> > +    AV1RawFrameHeader       *rep_fh = &fh_obu->obu.frame_header;
> > +    VAAPIEncodeAV1Picture *href;
> > +    int ret = 0;
> > +
> > +    pic->tail_size = 0;
> > +    /** Pack repeat frame header. */
> > +    if (pic->display_order > pic->encode_order) {
> > +        memset(fh_obu, 0, sizeof(*fh_obu));
> > +        href = pic->refs[0][pic->nb_refs[0] - 1]->priv_data;
> > +        fh_obu->header.obu_type = AV1_OBU_FRAME_HEADER;
> > +        fh_obu->header.obu_has_size_field = 1;
> > +
> > +        rep_fh->show_existing_frame   = 1;
> > +        rep_fh->frame_to_show_map_idx = href->slot == 0;
> > +        rep_fh->frame_type            = AV1_FRAME_INTER;
> > +        rep_fh->frame_width_minus_1   = avctx->width - 1;
> > +        rep_fh->frame_height_minus_1  = avctx->height - 1;
> > +        rep_fh->render_width_minus_1  = rep_fh-
> > >frame_width_minus_1;
> > +        rep_fh->render_height_minus_1 = rep_fh-
> > >frame_height_minus_1;
> > +
> > +        ret = vaapi_encode_av1_write_frame_header(avctx, pic, pic-
> > >tail_data, &pic->tail_size);
> > +        if (ret < 0)
> > +            goto end;
> > +
> > +        pic->tail_size /= 8;
> > +    }
> > +
> > +    memcpy(data, &priv->fh_data, MAX_PARAM_BUFFER_SIZE *
> > sizeof(char));
> > +    *data_len = priv->fh_data_len;
> > +
> > +end:
> > +    ff_cbs_fragment_reset(obu);
> > +    return ret;
> > +}
> > +
> > +static const VAAPIEncodeProfile vaapi_encode_av1_profiles[] = {
> > +    { FF_PROFILE_AV1_MAIN,  8, 3, 1, 1, VAProfileAV1Profile0 },
> > +    { FF_PROFILE_AV1_MAIN, 10, 3, 1, 1, VAProfileAV1Profile0 },
> > +    { FF_PROFILE_UNKNOWN }
> > +};
> > +
> > +static const VAAPIEncodeType vaapi_encode_type_av1 = {
> > +    .profiles        = vaapi_encode_av1_profiles,
> > +    .flags           = FLAG_B_PICTURES,
> > +    .default_quality = 25,
> > +    .configure       = &vaapi_encode_av1_configure,
> > +
> > +    .sequence_header_type  = VAEncPackedHeaderSequence,
> > +    .sequence_params_size  =
> > sizeof(VAEncSequenceParameterBufferAV1),
> > +    .init_sequence_params  =
> > &vaapi_encode_av1_init_sequence_params,
> > +    .write_sequence_header =
> > &vaapi_encode_av1_write_sequence_header,
> > +
> > +    .picture_priv_data_size = sizeof(VAAPIEncodeAV1Picture),
> > +    .picture_header_type    = VAEncPackedHeaderPicture,
> > +    .picture_params_size    =
> > sizeof(VAEncPictureParameterBufferAV1),
> > +    .init_picture_params    =
> > &vaapi_encode_av1_init_picture_params,
> > +    .write_picture_header   =
> > &vaapi_encode_av1_write_picture_header,
> > +
> > +    .slice_params_size = sizeof(VAEncTileGroupBufferAV1),
> > +    .init_slice_params = &vaapi_encode_av1_init_slice_params,
> > +};
> > +
> > +static av_cold int vaapi_encode_av1_init(AVCodecContext *avctx)
> > +{
> > +    VAAPIEncodeContext      *ctx = avctx->priv_data;
> > +    VAAPIEncodeAV1Context  *priv = avctx->priv_data;
> > +    VAConfigAttrib attr;
> > +    VAStatus vas;
> > +    int ret;
> > +
> > +    ctx->codec = &vaapi_encode_type_av1;
> > +
> > +    ctx->desired_packed_headers =
> > +        VA_ENC_PACKED_HEADER_SEQUENCE |
> > +        VA_ENC_PACKED_HEADER_PICTURE;
> > +
> > +    if (avctx->profile == FF_PROFILE_UNKNOWN)
> > +        avctx->profile = priv->profile;
> > +    if (avctx->level == FF_LEVEL_UNKNOWN)
> > +        avctx->level = priv->level;
> > +
> > +    if (avctx->level != FF_LEVEL_UNKNOWN && avctx->level & ~0x1f)
> > {
> > +        av_log(avctx, AV_LOG_ERROR, "Invalid level %d\n", avctx-
> > >level);
> > +        return AVERROR(EINVAL);
> > +    }
> > +
> > +    ret = ff_vaapi_encode_init(avctx);
> > +    if (ret < 0)
> > +        return ret;
> > +
> > +    attr.type = VAConfigAttribEncAV1;
> > +    vas = vaGetConfigAttributes(ctx->hwctx->display,
> > +                                ctx->va_profile,
> > +                                ctx->va_entrypoint,
> > +                                &attr, 1);
> > +    if (vas != VA_STATUS_SUCCESS) {
> > +        av_log(avctx, AV_LOG_ERROR, "Failed to query "
> > +               "config attribute: %d (%s).\n", vas,
> > vaErrorStr(vas));
> > +        return AVERROR_EXTERNAL;
> > +    } else if (attr.value == VA_ATTRIB_NOT_SUPPORTED) {
> > +        priv->attr.value = 0;
> > +        av_log(avctx, AV_LOG_WARNING, "Attribute type:%d is not "
> > +               "supported.\n", attr.type);
> 
> Is zero actually reasonable in this attribute (and below)?  Would we
> be better just requiring them to work?

The attribute content of EncAV1 and EncAV1Ext1 is not essential for the
encoder. If it is not supported, I regard all of those attribute
features as unusable, and that doesn't break the encode. So I give it
more tolerance: no error, just a warning message.

> 
> > +    } else {
> > +        priv->attr.value = attr.value;
> > +    }
> > +
> > +    attr.type = VAConfigAttribEncAV1Ext1;
> > +    vas = vaGetConfigAttributes(ctx->hwctx->display,
> > +                                ctx->va_profile,
> > +                                ctx->va_entrypoint,
> > +                                &attr, 1);
> > +    if (vas != VA_STATUS_SUCCESS) {
> > +        av_log(avctx, AV_LOG_ERROR, "Failed to query "
> > +               "config attribute: %d (%s).\n", vas,
> > vaErrorStr(vas));
> > +        return AVERROR_EXTERNAL;
> > +    } else if (attr.value == VA_ATTRIB_NOT_SUPPORTED) {
> > +        priv->attr_ext1.value = 0;
> > +        av_log(avctx, AV_LOG_WARNING, "Attribute type:%d is not "
> > +               "supported.\n", attr.type);
> > +    } else {
> > +        priv->attr_ext1.value = attr.value;
> > +    }
> > +
> > +    /** This attr provides essential indicators, return error if
> > not support. */
> > +    attr.type = VAConfigAttribEncAV1Ext2;
> > +    vas = vaGetConfigAttributes(ctx->hwctx->display,
> > +                                ctx->va_profile,
> > +                                ctx->va_entrypoint,
> > +                                &attr, 1);
> > +    if (vas != VA_STATUS_SUCCESS || attr.value ==
> > VA_ATTRIB_NOT_SUPPORTED) {
> > +        av_log(avctx, AV_LOG_ERROR, "Failed to query "
> > +               "config attribute: %d (%s).\n", vas,
> > vaErrorStr(vas));
> > +        return AVERROR_EXTERNAL;
> > +    } else {
> > +        priv->attr_ext2.value = attr.value;
> > +    }
> > +
> > +    ret = vaapi_encode_av1_set_tile(avctx);
> > +    if (ret < 0)
> > +        return ret;
> > +
> > +    return 0;
> > +}
> > +
> > +static av_cold int vaapi_encode_av1_close(AVCodecContext *avctx)
> > +{
> > +    VAAPIEncodeAV1Context *priv = avctx->priv_data;
> > +
> > +    ff_cbs_fragment_free(&priv->current_obu);
> > +    ff_cbs_close(&priv->cbc);
> > +
> > +    return ff_vaapi_encode_close(avctx);
> > +}
> > +
> > +#define OFFSET(x) offsetof(VAAPIEncodeAV1Context, x)
> > +#define FLAGS (AV_OPT_FLAG_VIDEO_PARAM |
> > AV_OPT_FLAG_ENCODING_PARAM)
> > +
> > +static const AVOption vaapi_encode_av1_options[] = {
> > +    VAAPI_ENCODE_COMMON_OPTIONS,
> > +    VAAPI_ENCODE_RC_OPTIONS,
> > +    { "profile", "Set profile (seq_profile)",
> > +      OFFSET(profile), AV_OPT_TYPE_INT,
> > +      { .i64 = FF_PROFILE_UNKNOWN }, FF_PROFILE_UNKNOWN, 0xff,
> > FLAGS, "profile" },
> > +
> > +#define PROFILE(name, value)  name, NULL, 0, AV_OPT_TYPE_CONST, \
> > +    { .i64 = value }, 0, 0, FLAGS, "profile"
> > +    { PROFILE("main",               FF_PROFILE_AV1_MAIN) },
> > +    { PROFILE("high",               FF_PROFILE_AV1_HIGH) },
> > +    { PROFILE("professional",       FF_PROFILE_AV1_PROFESSIONAL)
> > },
> > +#undef PROFILE
> > +
> > +    { "tier", "Set tier (seq_tier)",
> > +      OFFSET(tier), AV_OPT_TYPE_INT, { .i64 = 0 }, 0, 1, FLAGS,
> > "tier" },
> > +    { "main", NULL, 0, AV_OPT_TYPE_CONST,
> > +      { .i64 = 0 }, 0, 0, FLAGS, "tier" },
> > +    { "high", NULL, 0, AV_OPT_TYPE_CONST,
> > +      { .i64 = 1 }, 0, 0, FLAGS, "tier" },
> > +    { "level", "Set level (seq_level_idx)",
> > +      OFFSET(level), AV_OPT_TYPE_INT,
> > +      { .i64 = FF_LEVEL_UNKNOWN }, FF_LEVEL_UNKNOWN, 0x1f, FLAGS,
> > "level" },
> > +
> > +#define LEVEL(name, value) name, NULL, 0, AV_OPT_TYPE_CONST, \
> > +      { .i64 = value }, 0, 0, FLAGS, "level"
> > +    { LEVEL("2.0",  0) },
> > +    { LEVEL("2.1",  1) },
> > +    { LEVEL("3.0",  4) },
> > +    { LEVEL("3.1",  5) },
> > +    { LEVEL("4.0",  8) },
> > +    { LEVEL("4.1",  9) },
> > +    { LEVEL("5.0", 12) },
> > +    { LEVEL("5.1", 13) },
> > +    { LEVEL("5.2", 14) },
> > +    { LEVEL("5.3", 15) },
> > +    { LEVEL("6.0", 16) },
> > +    { LEVEL("6.1", 17) },
> > +    { LEVEL("6.2", 18) },
> > +    { LEVEL("6.3", 19) },
> > +#undef LEVEL
> > +
> > +    { "tiles", "Tile columns x rows",
> > +      OFFSET(tile_cols), AV_OPT_TYPE_IMAGE_SIZE, { .str = "1x1" },
> > 1, AV1_MAX_TILE_COLS, FLAGS },
> > +    { "tile_groups", "Number of tile groups for encoding",
> > +      OFFSET(tile_groups), AV_OPT_TYPE_INT, { .i64 = 1 }, 1,
> > AV1_MAX_TILE_ROWS * AV1_MAX_TILE_COLS, FLAGS },
> > +
> > +    { NULL },
> > +};
> > +
> > +static const FFCodecDefault vaapi_encode_av1_defaults[] = {
> > +    { "b",              "0"   },
> > +    { "bf",             "7"   },
> 
> This feels excessive when only one layer is supported.  Do
> experimental results give this as the best tradeoff?

Did a quick check: setting bf to 2 gives better quality in BRC mode.
Will change it to 2.

Thanks.

> 
> > +    { "g",              "120" },
> > +    { "qmin",           "1"   },
> > +    { "qmax",           "255" },
> > +    { NULL },
> > +};
> > +
> > +static const AVClass vaapi_encode_av1_class = {
> > +    .class_name = "av1_vaapi",
> > +    .item_name  = av_default_item_name,
> > +    .option     = vaapi_encode_av1_options,
> > +    .version    = LIBAVUTIL_VERSION_INT,
> > +};
> > +
> > +const FFCodec ff_av1_vaapi_encoder = {
> > +    .p.name         = "av1_vaapi",
> > +    CODEC_LONG_NAME("AV1 (VAAPI)"),
> > +    .p.type         = AVMEDIA_TYPE_VIDEO,
> > +    .p.id           = AV_CODEC_ID_AV1,
> > +    .priv_data_size = sizeof(VAAPIEncodeAV1Context),
> > +    .init           = &vaapi_encode_av1_init,
> > +    FF_CODEC_RECEIVE_PACKET_CB(&ff_vaapi_encode_receive_packet),
> > +    .close          = &vaapi_encode_av1_close,
> > +    .p.priv_class   = &vaapi_encode_av1_class,
> > +    .p.capabilities = AV_CODEC_CAP_DELAY | AV_CODEC_CAP_HARDWARE |
> > +                      AV_CODEC_CAP_DR1 |
> > AV_CODEC_CAP_ENCODER_REORDERED_OPAQUE,
> > +    .caps_internal  = FF_CODEC_CAP_NOT_INIT_THREADSAFE |
> > +                      FF_CODEC_CAP_INIT_CLEANUP,
> > +    .defaults       = vaapi_encode_av1_defaults,
> > +    .p.pix_fmts = (const enum AVPixelFormat[]) {
> > +        AV_PIX_FMT_VAAPI,
> > +        AV_PIX_FMT_NONE,
> > +    },
> > +    .hw_configs     = ff_vaapi_encode_hw_configs,
> > +    .p.wrapper_name = "vaapi",
> > +};
> > diff --git a/libavcodec/version.h b/libavcodec/version.h
> > index 9411511e04..728ab8839d 100644
> > --- a/libavcodec/version.h
> > +++ b/libavcodec/version.h
> > @@ -29,7 +29,7 @@
> >   
> >   #include "version_major.h"
> >   
> > -#define LIBAVCODEC_VERSION_MINOR  22
> > +#define LIBAVCODEC_VERSION_MINOR  23
> >   #define LIBAVCODEC_VERSION_MICRO 100
> >   
> >   #define
> > LIBAVCODEC_VERSION_INT  AV_VERSION_INT(LIBAVCODEC_VERSION_MAJOR, \
> 
> Thanks,
> 
> - Mark
> _______________________________________________
> ffmpeg-devel mailing list
> ffmpeg-devel@ffmpeg.org
> https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
> 
> To unsubscribe, visit link above, or email
> ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".
Mark Thompson Aug. 13, 2023, 9:43 p.m. UTC | #3
On 10/08/2023 03:54, Wang, Fei W wrote:
> On Mon, 2023-08-07 at 22:21 +0100, Mark Thompson wrote:
>> On 03/08/2023 07:01, fei.w.wang-at-intel.com@ffmpeg.org wrote:
>>> From: Fei Wang <fei.w.wang@intel.com>
>>>
>>> Signed-off-by: Fei Wang <fei.w.wang@intel.com>
>>> ---
>>>    Changelog                     |    1 +
>>>    configure                     |    3 +
>>>    doc/encoders.texi             |   13 +
>>>    libavcodec/Makefile           |    1 +
>>>    libavcodec/allcodecs.c        |    1 +
>>>    libavcodec/vaapi_encode.c     |  125 +++-
>>>    libavcodec/vaapi_encode.h     |   12 +
>>>    libavcodec/vaapi_encode_av1.c | 1229
>>> +++++++++++++++++++++++++++++++++
>>>    libavcodec/version.h          |    2 +-
>>>    9 files changed, 1368 insertions(+), 19 deletions(-)
>>>    create mode 100644 libavcodec/vaapi_encode_av1.c
>>
>> I assume this is tested on Intel hardware.  Is it tested on AMD as
>> well?  (Apparently working in recent Mesa.)
> 
> AMD tested the patchset months ago:
> https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/22585
> 
> This patch changed a little compare with the version in Cartwheel,
> @Ruijing, Could you help to review this version in ML? To avoid the
> diffs break you. Thanks.
> 
>>> ...
>>> @@ -669,6 +669,15 @@ static int
>>> vaapi_encode_set_output_timestamp(AVCodecContext *avctx,
>>>    {
>>>        VAAPIEncodeContext *ctx = avctx->priv_data;
>>>    
>>> +    // AV1 packs P frame and next B frame into one pkt, and uses
>>> the other
>>> +    // repeat frame header pkt at the display order position of
>>> the P frame
>>> +    // to indicate its frame index. Each frame has a corresponding
>>> pkt in its
>>> +    // display order position. So don't need to consider delay for
>>> AV1 timestamp.
>>> +    if (avctx->codec_id == AV_CODEC_ID_AV1) {
>>> +        pkt->dts = pkt->pts - ctx->dts_pts_diff;
>>> +        return 0;
>>> +    }
>>
>> This doesn't get you the right result, though?  The PTS and DTS on
>> every AV1 packet want to be the same.
> 
> The result tested good which can be played normally. Just aligned with
> other vaapi encoders that the 1st frame start with 0/-1 of PTS/DTS if
> have B frame. Set PTS/DTS to same also looks good.

DTS != PTS should only be true when the stream has externally-visible frame reordering in it, which AV1 doesn't.

The other two AV1 encoders both output DTS == PTS; I think you should match that.

>> In any case, please don't put tests for codec ID in the common
>> code.  I suggest that the timestamp behaviour wants to be a new
>> FLAG_*.
>>
>>> +
>>> ...
>>> @@ -1128,9 +1182,19 @@ static int
>>> vaapi_encode_pick_next(AVCodecContext *avctx,
>>>    
>>>        vaapi_encode_add_ref(avctx, pic, pic, 0, 1, 0);
>>>        if (pic->type != PICTURE_TYPE_IDR) {
>>> -        vaapi_encode_add_ref(avctx, pic, start,
>>> -                             pic->type == PICTURE_TYPE_P,
>>> -                             b_counter > 0, 0);
>>> +        // TODO: apply both previous and forward multi reference
>>> for all vaapi encoders.
>>> +        // And L0/L1 reference frame number can be set dynamically
>>> through query
>>> +        // VAConfigAttribEncMaxRefFrames attribute.
>>> +        if (avctx->codec_id == AV_CODEC_ID_AV1) {
>>> +            for (i = 0; i < ctx->nb_next_prev; i++)
>>> +                vaapi_encode_add_ref(avctx, pic, ctx-
>>>> next_prev[i],
>>> +                                     pic->type == PICTURE_TYPE_P,
>>> +                                     b_counter > 0, 0);
>>
>> So an undisclosed aim of the previous patch was to make this extra
>> list of the most recent N frames for this to use with P frames only?
> 
> Currently, yes. As the TODO comment says, I will apply it to all other
> codecs and B frame next, it will need lots of test on different
> hardware. Add this for AV1 here is to align with VPL which use 2
> previous reference for P frame. Base on our test, 2 reference frame for
> P will improve ~5% BD-rate for quality, so that can make it possible to
> let user get same quality with VPL after adjusting and aligning encoder
> options.

Hmm.  My Intel test machine says it can take 8 L0 and 2 L1 references in H.264.  I might try this, seems like it should be easy to test.
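The attribute behind those numbers can be queried like this (only a
sketch; dpy is a VADisplay already set up with
vaGetDisplay()/vaInitialize()):

VAConfigAttrib attr = { .type = VAConfigAttribEncMaxRefFrames };

vaGetConfigAttributes(dpy, VAProfileH264High, VAEntrypointEncSlice,
                      &attr, 1);
/* The low 16 bits report the supported L0 reference count and the
 * high 16 bits the L1 count. */
printf("L0 refs: %u, L1 refs: %u\n",
       attr.value & 0xffff, attr.value >> 16 & 0xffff);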

>>
>> Why this partcular structure for P frames only, though?  Seems like
>> if you support extra references then other codecs could also make use
>> of them in either direction, so we would be better off hinting that
>> the codec would like up to N references and then using the contents
>> of the existing DPB, plus keep extras if there is space.
>>
>>> ...
>>> +
>>> +static av_cold int vaapi_encode_av1_configure(AVCodecContext
>>> *avctx)
>>> +{
>>> +    VAAPIEncodeContext     *ctx = avctx->priv_data;
>>> +    VAAPIEncodeAV1Context *priv = avctx->priv_data;
>>> +    int ret;
>>> +
>>> +    ret = ff_cbs_init(&priv->cbc, AV_CODEC_ID_AV1, avctx);
>>> +    if (ret < 0)
>>> +        return ret;
>>> +
>>> +    if (ctx->rc_mode->quality) {
>>> +        priv->q_idx_p = av_clip(ctx->rc_quality, 0,
>>> AV1_MAX_QUANT);
>>> +        if (fabs(avctx->i_quant_factor) > 0.0)
>>> +            priv->q_idx_idr =
>>> +                av_clip((fabs(avctx->i_quant_factor) * priv-
>>>> q_idx_p  +
>>> +                         avctx->i_quant_offset) + 0.5,
>>> +                        0, AV1_MAX_QUANT);
>>> +        else
>>> +            priv->q_idx_idr = priv->q_idx_p;
>>> +
>>> +        if (fabs(avctx->b_quant_factor) > 0.0)
>>> +            priv->q_idx_b =
>>> +                av_clip((fabs(avctx->b_quant_factor) * priv-
>>>> q_idx_p  +
>>> +                         avctx->b_quant_offset) + 0.5,
>>> +                        0, AV1_MAX_QUANT);
>>> +        else
>>> +            priv->q_idx_b = priv->q_idx_p;
>>> +    } else {
>>> +        /** Arbitrary value */
>>> +        priv->q_idx_idr = priv->q_idx_p = priv->q_idx_b = 128;
>>> +    }
>>
>> Are there any alignment requirements here?  (If I gave it an input
>> which is, say, 1361x1, would that work?)
> 
> I didn't get you. Which parameter here needs to be aligned? The max/min
> resolution supported by the HW is checked somewhere else. With 1361x1
> there will be an error like:
> 
> "Hardware does not support encoding at size 1376x16 (constraints: width
> 32-8192 height 32-8192)."

I mean the padding of the reference surfaces.  I was expecting this to be rounded up to a multiple of 8 (to match MiCols/MiRows) - see this function in other codecs.
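Something like this in the configure function is what I had in mind
(just a sketch, analogous to what the other codecs do with their own
alignments):

    /* Round the coded size up to whole 8-pixel units so it matches
     * MiCols/MiRows. */
    ctx->surface_width  = FFALIGN(avctx->width,  8);
    ctx->surface_height = FFALIGN(avctx->height, 8);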

>>> ...
>>> +
>>> +    /** update obu size in bitstream */
>>> +    if (fh_obu->header.obu_has_size_field) {
>>> +        obu_size_len = priv->attr_ext2.bits.obu_size_bytes_minus1
>>> + 1;
>>> +        for (i = 0; i < obu_size_len; i++) {
>>> +            byte = obu_size >> (7 * i) & 0x7f;
>>> +            if (i < obu_size_len - 1)
>>> +                byte |= 0x80;
>>> +            put_bits(&pbc_tmp, 8, byte);
>>> +        }
>>> +        flush_put_bits(&pbc_tmp);
>>> +        memmove(pbc_tmp.buf_ptr, pbc_tmp.buf_ptr + (8 -
>>> obu_size_len), obu_size);
>>> +        *data_len -= (8 - obu_size_len) * 8;
>>> +    }
>>
>> Why is there an incomplete duplicate of the cbs_av1 header writing
>> code here?
> 
> To record some positions/sizes in the bitstream that are needed by VAAPI,
> like the qp_index/loopfilter/cdef offsets and the cdef parameter size in
> bits. It's not reasonable to add those specific parameters into CBS.

How about with <https://lists.ffmpeg.org/pipermail/ffmpeg-devel/2023-August/313228.html>?

>>> ...
>>> +
>>> +static int vaapi_encode_av1_set_tile(AVCodecContext *avctx)
>>> +{
>>> +    VAAPIEncodeAV1Context *priv = avctx->priv_data;
>>> +    int mi_cols, mi_rows, sb_shift, sb_size;
>>> +    int max_tile_area_sb, max_tile_area_sb_varied;
>>> +    int tile_width_sb, tile_height_sb, widest_tile_sb;
>>> +    int min_log2_tiles;
>>> +    int tile_rows_tmp, i;
>>> +
>>> +    if (priv->tile_cols > AV1_MAX_TILE_COLS ||
>>> +        priv->tile_rows > AV1_MAX_TILE_ROWS) {
>>> +        av_log(avctx, AV_LOG_ERROR, "Invalid tile number %dx%d,
>>> should less than %dx%d.\n",
>>> +               priv->tile_cols, priv->tile_rows,
>>> AV1_MAX_TILE_COLS, AV1_MAX_TILE_ROWS);
>>
>> Can't this be a constraint on the option rather than a special check
>> here?
> 
> Tiles are set with the IMAGE_SIZE option type, which aligns with HEVC. It
> doesn't support a min/max check.

Hmm, yeah - fair enough.

>>> ...
>>> +
>>> +    sh->color_config = (AV1RawColorConfig) {
>>> +        .high_bitdepth                  = desc->comp[0].depth == 8
>>> ? 0 : 1,
>>> +        .color_primaries                = avctx->color_primaries,
>>> +        .transfer_characteristics       = avctx->color_trc,
>>> +        .matrix_coefficients            = avctx->colorspace,
>>> +        .color_description_present_flag = (avctx->color_primaries
>>> != AVCOL_PRI_UNSPECIFIED ||
>>> +                                           avctx-
>>>> color_trc       != AVCOL_TRC_UNSPECIFIED ||
>>> +                                           avctx-
>>>> colorspace      != AVCOL_SPC_UNSPECIFIED),
>>> +        .color_range                    = avctx->color_range ==
>>> AVCOL_RANGE_JPEG,
>>> +        .subsampling_x                  = desc->log2_chroma_w,
>>> +        .subsampling_y                  = desc->log2_chroma_h,
>>
>> .chroma_sample_position = some function of chroma_sample_location.
> 
> There is no chroma_sample_position supported in VAAPI yet.

VAAPI won't care.  It should be marked correctly in the headers so that the output stream has the right value.

(CFL not having any ability to deal with the sample location is a known deficiency of AV1.)
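Something along these lines is all that's needed (a sketch only,
assuming the AV1_CSP_* values from libavcodec/av1.h):

#include "libavutil/pixfmt.h"
#include "av1.h"

/* Map the libavutil chroma sample location onto the AV1
 * chroma_sample_position signalled in the colour config. */
static int chroma_sample_position(enum AVChromaLocation loc)
{
    switch (loc) {
    case AVCHROMA_LOC_LEFT:    return AV1_CSP_VERTICAL;
    case AVCHROMA_LOC_TOPLEFT: return AV1_CSP_COLOCATED;
    default:                   return AV1_CSP_UNKNOWN;
    }
}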

>>> ...
>>> +    fh->tile_cols                 = priv->tile_cols;
>>> +    fh->tile_rows                 = priv->tile_rows;
>>> +    fh->tile_cols_log2            = priv->tile_cols_log2;
>>> +    fh->tile_rows_log2            = priv->tile_rows_log2;
>>> +    fh->uniform_tile_spacing_flag = priv->uniform_tile;
>>> +    fh->tile_size_bytes_minus1    = priv-
>>>> attr_ext2.bits.tile_size_bytes_minus1;
>>> +    fh->reduced_tx_set            = 1;
>>
>> This not being present in the capabilities feels like an omission in
>> the API.  That's a large loss for an implementation which does
>> support the full transform set.
> 
> Personally understanding, "not being present" means it must be
> supported. If any exception driver, then they can ask to add new
> capability to libva.

reduced_tx_set is a negative property - you want to set it to zero to allow all transforms, unless the implementation can't cope with that.

Does the Intel implementation require it to be set?  (That is, does it not support the full transform set?)

>>> ...
>>> +
>>> +    for (i = 0; i < fh->tile_cols; i++)
>>> +        fh->width_in_sbs_minus_1[i] = vpic-
>>>> width_in_sbs_minus_1[i] = priv->width_in_sbs_minus_1[i];
>>> +
>>> +    for (i = 0; i < fh->tile_rows; i++)
>>> +        fh->height_in_sbs_minus_1[i] = vpic-
>>>> height_in_sbs_minus_1[i] = priv->height_in_sbs_minus_1[i];
>>
>> Er, what?  Doesn't this fail the inference on the last tile
>> width/height if you don't split exactly?
> 
> What's the meaning of "split exactly"? All the tile w/h will be split
> in vaapi_encode_av1_set_tile include last tile. Just make sure the info
> of tiles w/h in frame header are same with the info passed to VAAPI
> should be fine.

Sorry, I misread this.  The values are set elsewhere, it's not setting all tiles to have the same SB width/height.

>>> ...
>>> +
>>> +    vpic->base_qindex          = fh->base_q_idx;
>>> +    vpic->frame_width_minus_1  = fh->frame_width_minus_1;
>>> +    vpic->frame_height_minus_1 = fh->frame_height_minus_1;
>>> +    vpic->primary_ref_frame    = fh->primary_ref_frame;
>>> +    vpic->reconstructed_frame  = pic->recon_surface;
>>> +    vpic->coded_buf            = pic->output_buffer;
>>> +    vpic->tile_cols            = fh->tile_cols;
>>> +    vpic->tile_rows            = fh->tile_rows;
>>> +    vpic->order_hint           = fh->order_hint;
>>> +#if VA_CHECK_VERSION(1, 15, 0)
>>> +    vpic->refresh_frame_flags  = fh->refresh_frame_flags;
>>> +#endif
>>
>> What will the difference be between running in the two versions?  (Is
>> it worth supporting the older one?  Cf. bit_depth on VP9.)
> 
> I'd prefer to support older one. In case of someone using old version
> and works well with ffmpeg-vpl(as vpl already support av1 encode long
> time ago), otherwise he must be confuse why ffmpeg-vaapi need higher
> VAAPI version but VPL doesn't.

Ok.  So the driver doesn't use the value of refresh_frame_flags if it doesn't affect the result?  (Seems like it shouldn't, not sure why it would be added to the API.)

>>> ...

Thanks,

- Mark
Dong, Ruijing Aug. 15, 2023, 1:59 p.m. UTC | #4
[AMD Official Use Only - General]

-----Original Message-----
From: Wang, Fei W <fei.w.wang@intel.com>
Sent: Wednesday, August 9, 2023 10:55 PM
To: ffmpeg-devel@ffmpeg.org
Cc: Dong, Ruijing <Ruijing.Dong@amd.com>
Subject: Re: [FFmpeg-devel] [PATCH v3 6/6] lavc/vaapi_encode: Add VAAPI AV1 encoder

On Mon, 2023-08-07 at 22:21 +0100, Mark Thompson wrote:
> On 03/08/2023 07:01, fei.w.wang-at-intel.com@ffmpeg.org wrote:
> > From: Fei Wang <fei.w.wang@intel.com>
> >
> > Signed-off-by: Fei Wang <fei.w.wang@intel.com>
> > ---
> >   Changelog                     |    1 +
> >   configure                     |    3 +
> >   doc/encoders.texi             |   13 +
> >   libavcodec/Makefile           |    1 +
> >   libavcodec/allcodecs.c        |    1 +
> >   libavcodec/vaapi_encode.c     |  125 +++-
> >   libavcodec/vaapi_encode.h     |   12 +
> >   libavcodec/vaapi_encode_av1.c | 1229
> > +++++++++++++++++++++++++++++++++
> >   libavcodec/version.h          |    2 +-
> >   9 files changed, 1368 insertions(+), 19 deletions(-)
> >   create mode 100644 libavcodec/vaapi_encode_av1.c
>
> I assume this is tested on Intel hardware.  Is it tested on AMD as
> well?  (Apparently working in recent Mesa.)
Yep, tested on AMD as well, working.

AMD tested the patchset months ago:
https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/22585

This patch changed a little compared with the version in Cartwheel.
@Ruijing, could you help to review this version on the ML, to avoid the
diffs breaking you? Thanks.

>
>
> > diff --git a/Changelog b/Changelog
> > index bbda4f4fd4..e86f742cd3 100644
> > --- a/Changelog
> > +++ b/Changelog
> > @@ -27,6 +27,7 @@ version <next>:
> >   - Bitstream filter for converting VVC from MP4 to Annex B
> >   - scale_vt filter for videotoolbox
> >   - transpose_vt filter for videotoolbox
> > +- VAAPI AV1 encoder
> >
> >   version 6.0:
> >   - Radiance HDR image support
> > diff --git a/configure b/configure
> > index 99388e7664..68a238a819 100755
> > --- a/configure
> > +++ b/configure
> > @@ -3322,6 +3322,8 @@ av1_qsv_decoder_select="qsvdec"
> >   av1_qsv_encoder_select="qsvenc"
> >   av1_qsv_encoder_deps="libvpl"
> >   av1_amf_encoder_deps="amf"
> > +av1_vaapi_encoder_deps="VAEncPictureParameterBufferAV1"
> > +av1_vaapi_encoder_select="cbs_av1 vaapi_encode"
> >
> >   # parsers
> >   aac_parser_select="adts_header mpeg4audio"
> > @@ -7106,6 +7108,7 @@ if enabled vaapi; then
> >       check_type "va/va.h va/va_enc_jpeg.h"
> > "VAEncPictureParameterBufferJPEG"
> >       check_type "va/va.h
> > va/va_enc_vp8.h"  "VAEncPictureParameterBufferVP8"
> >       check_type "va/va.h
> > va/va_enc_vp9.h"  "VAEncPictureParameterBufferVP9"
> > +    check_type "va/va.h
> > va/va_enc_av1.h"  "VAEncPictureParameterBufferAV1"
> >   fi
> >
> >   if enabled_all opencl libdrm ; then
> > diff --git a/doc/encoders.texi b/doc/encoders.texi
> > index 25d6b7f09e..fb331ebd8e 100644
> > --- a/doc/encoders.texi
> > +++ b/doc/encoders.texi
> > @@ -3991,6 +3991,19 @@ Average variable bitrate.
> >   Each encoder also has its own specific options:
> >   @table @option
> >
> > +@item av1_vaapi
> > +@option{profile} sets the value of @emph{seq_profile}.
> > +@option{tier} sets the value of @emph{seq_tier}.
> > +@option{level} sets the value of @emph{seq_level_idx}.
> > +
> > +@table @option
> > +@item tiles
> > +Set the number of tiles to encode the input video with, as columns
> > x rows.
> > +(default is 1x1).
>
> Probably needs some clarification that large resolutions must be
> split into tiles?  Maybe an "auto" value for this (meaning use as few
> as possible), and let explicit "1x1" fail if the resolution is too
> large.
>
> > +@item tile_groups
> > +Set tile groups number (default is 1).
>
> Meaning what?  It splits into tile group OBUs containing equal
> numbers of tiles, or of equal size in bits, or something else?
>
> > +@end table
> > +
> >   @item h264_vaapi
> >   @option{profile} sets the value of @emph{profile_idc} and the
> > @emph{constraint_set*_flag}s.
> >   @option{level} sets the value of @emph{level_idc}.
> > diff --git a/libavcodec/Makefile b/libavcodec/Makefile
> > index a6b2ecbb22..473afb4471 100644
> > --- a/libavcodec/Makefile
> > +++ b/libavcodec/Makefile
> > @@ -258,6 +258,7 @@ OBJS-$(CONFIG_AV1_MEDIACODEC_DECODER)  +=
> > mediacodecdec.o
> >   OBJS-$(CONFIG_AV1_MEDIACODEC_ENCODER)  += mediacodecenc.o
> >   OBJS-$(CONFIG_AV1_NVENC_ENCODER)       += nvenc_av1.o nvenc.o
> >   OBJS-$(CONFIG_AV1_QSV_ENCODER)         += qsvenc_av1.o
> > +OBJS-$(CONFIG_AV1_VAAPI_ENCODER)       += vaapi_encode_av1.o
> > av1_levels.o
> >   OBJS-$(CONFIG_AVRN_DECODER)            += avrndec.o
> >   OBJS-$(CONFIG_AVRP_DECODER)            += r210dec.o
> >   OBJS-$(CONFIG_AVRP_ENCODER)            += r210enc.o
> > diff --git a/libavcodec/allcodecs.c b/libavcodec/allcodecs.c
> > index 8775d15a4f..c43c1d7b48 100644
> > --- a/libavcodec/allcodecs.c
> > +++ b/libavcodec/allcodecs.c
> > @@ -844,6 +844,7 @@ extern const FFCodec ff_av1_nvenc_encoder;
> >   extern const FFCodec ff_av1_qsv_decoder;
> >   extern const FFCodec ff_av1_qsv_encoder;
> >   extern const FFCodec ff_av1_amf_encoder;
> > +extern const FFCodec ff_av1_vaapi_encoder;
> >   extern const FFCodec ff_libopenh264_encoder;
> >   extern const FFCodec ff_libopenh264_decoder;
> >   extern const FFCodec ff_h264_amf_encoder;
> > diff --git a/libavcodec/vaapi_encode.c b/libavcodec/vaapi_encode.c
> > index 2604f12b9e..2907e159fb 100644
> > --- a/libavcodec/vaapi_encode.c
> > +++ b/libavcodec/vaapi_encode.c
> > @@ -669,6 +669,15 @@ static int
> > vaapi_encode_set_output_timestamp(AVCodecContext *avctx,
> >   {
> >       VAAPIEncodeContext *ctx = avctx->priv_data;
> >
> > +    // AV1 packs P frame and next B frame into one pkt, and uses
> > the other
> > +    // repeat frame header pkt at the display order position of
> > the P frame
> > +    // to indicate its frame index. Each frame has a corresponding
> > pkt in its
> > +    // display order position. So don't need to consider delay for
> > AV1 timestamp.
> > +    if (avctx->codec_id == AV_CODEC_ID_AV1) {
> > +        pkt->dts = pkt->pts - ctx->dts_pts_diff;
> > +        return 0;
> > +    }
>
> This doesn't get you the right result, though?  The PTS and DTS on
> every AV1 packet want to be the same.

The result tested good and can be played normally. It just aligns with
the other VAAPI encoders, where the 1st frame starts with 0/-1 PTS/DTS
when there are B frames. Setting PTS/DTS to the same value also looks
good.

>
> In any case, please don't put tests for codec ID in the common
> code.  I suggest that the timestamp behaviour wants to be a new
> FLAG_*.
>
> > +
> >       if (ctx->output_delay == 0) {
> >           pkt->dts = pkt->pts;
> >       } else if (pic->encode_order < ctx->decode_delay) {
> > @@ -689,9 +698,10 @@ static int vaapi_encode_output(AVCodecContext
> > *avctx,
> >   {
> >       VAAPIEncodeContext *ctx = avctx->priv_data;
> >       VACodedBufferSegment *buf_list, *buf;
> > -    VAStatus vas;
> > +    AVPacket *pkt_ptr = pkt;
> >       int total_size = 0;
> >       uint8_t *ptr;
> > +    VAStatus vas;
> >       int err;
> >
> >       err = vaapi_encode_wait(avctx, pic);
> > @@ -711,11 +721,52 @@ static int vaapi_encode_output(AVCodecContext
> > *avctx,
> >       for (buf = buf_list; buf; buf = buf->next)
> >           total_size += buf->size;
> >
> > -    err = ff_get_encode_buffer(avctx, pkt, total_size, 0);
> > -    ptr = pkt->data;
> > +    /** repack av1 coded frame for not display and repeat frames
> > */
> > +    if (avctx->codec_id == AV_CODEC_ID_AV1) {
> > +        int display_frame = pic->display_order <= pic-
> > >encode_order;
> >
> > -    if (err < 0)
> > -        goto fail_mapped;
> > +        if (display_frame) {
> > +            total_size += ctx->header_data_size;
> > +            err = ff_get_encode_buffer(avctx, pkt, total_size, 0);
> > +            if (err < 0)
> > +                goto fail_mapped;
> > +            ptr = pkt->data;
> > +
> > +            if (ctx->header_data_size) {
> > +                memcpy(ptr, ctx->header_data, ctx-
> > >header_data_size);
> > +                ptr += ctx->header_data_size;
> > +                ctx->header_data_size = 0;
> > +            }
> > +        } else {
> > +            ctx->header_data = av_realloc(ctx->header_data,
> > total_size);
> > +            if (!ctx->header_data) {
> > +                err = AVERROR(ENOMEM);
> > +                goto fail_mapped;
> > +            }
> > +            ptr = ctx->header_data;
> > +            ctx->header_data_size = total_size;
> > +
> > +            if (pic->tail_size) {
> > +                if (ctx->tail_pkt->size) {
> > +                    err = AVERROR(AVERROR_BUG);
> > +                    goto fail_mapped;
> > +                }
> > +
> > +                err = ff_get_encode_buffer(avctx, ctx->tail_pkt,
> > pic->tail_size, 0);
> > +                if (err < 0)
> > +                    goto fail_mapped;
> > +
> > +                memcpy(ctx->tail_pkt->data, pic->tail_data, pic-
> > >tail_size);
> > +                pkt_ptr = ctx->tail_pkt;
> > +            }
> > +        }
> > +    } else {
> > +        err = ff_get_encode_buffer(avctx, pkt, total_size, 0);
> > +        ptr = pkt->data;
> > +
> > +        if (err < 0)
> > +            goto fail_mapped;
> > +    }
>
> I'm not convinced that this repeated-realloc-and-copy structure in
> the main context is a good approach.
>
> How about something like storing the output buffer for every picture
> in that picture as it is generated (could even stay in the VABuffer
> without mapping it if that worked?), then when you get a display-now
> picture concatenate all the oustanding frames together in order into
> the packet (with a single copy).

Your way is feasible; it may need an additional flag in pic to indicate
that the pic is completed but not yet output, and to release the pic
after it is output, or just to delay changing the pic state to
completed. I will try it.
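A rough standalone sketch of that direction (names purely illustrative,
not the actual patch code):

#include <stdint.h>
#include <stdlib.h>
#include <string.h>

typedef struct PendingPic {
    uint8_t *coded_data;            /* held until a displayable frame */
    size_t   coded_size;
    struct PendingPic *next;        /* kept in encode order */
} PendingPic;

/* Concatenate all pending coded data into one output buffer, with a
 * single copy per picture. */
static uint8_t *flush_pending(const PendingPic *list, size_t *out_size)
{
    size_t total = 0;
    uint8_t *buf, *ptr;

    for (const PendingPic *p = list; p; p = p->next)
        total += p->coded_size;

    buf = ptr = malloc(total);
    if (!buf)
        return NULL;

    for (const PendingPic *p = list; p; p = p->next) {
        memcpy(ptr, p->coded_data, p->coded_size);
        ptr += p->coded_size;
    }

    *out_size = total;
    return buf;
}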

>
> The tail data is trivial to generate on the fly (it's only three
> bytes), so storing it does not seem useful.
>
> >
> >       for (buf = buf_list; buf; buf = buf->next) {
> >           av_log(avctx, AV_LOG_DEBUG, "Output buffer: %u bytes "
> > @@ -726,10 +777,10 @@ static int vaapi_encode_output(AVCodecContext
> > *avctx,
> >       }
> >
> >       if (pic->type == PICTURE_TYPE_IDR)
> > -        pkt->flags |= AV_PKT_FLAG_KEY;
> > +        pkt_ptr->flags |= AV_PKT_FLAG_KEY;
> >
> > -    pkt->pts = pic->pts;
> > -    pkt->duration = pic->duration;
> > +    pkt_ptr->pts = pic->pts;
> > +    pkt_ptr->duration = pic->duration;
> >
> >       vas = vaUnmapBuffer(ctx->hwctx->display, pic->output_buffer);
> >       if (vas != VA_STATUS_SUCCESS) {
> > @@ -742,8 +793,8 @@ static int vaapi_encode_output(AVCodecContext
> > *avctx,
> >       // for no-delay encoders this is handled in generic codec
> >       if (avctx->codec->capabilities & AV_CODEC_CAP_DELAY &&
> >           avctx->flags & AV_CODEC_FLAG_COPY_OPAQUE) {
> > -        pkt->opaque     = pic->opaque;
> > -        pkt->opaque_ref = pic->opaque_ref;
> > +        pkt_ptr->opaque     = pic->opaque;
> > +        pkt_ptr->opaque_ref = pic->opaque_ref;
> >           pic->opaque_ref = NULL;
> >       }
> >
> > @@ -752,6 +803,9 @@ static int vaapi_encode_output(AVCodecContext
> > *avctx,
> >
> >       av_log(avctx, AV_LOG_DEBUG, "Output read for pic
> > %"PRId64"/%"PRId64".\n",
> >              pic->display_order, pic->encode_order);
> > +
> > +    vaapi_encode_set_output_timestamp(avctx, pic, pkt_ptr);
> > +
> >       return 0;
> >
> >   fail_mapped:
> > @@ -1128,9 +1182,19 @@ static int
> > vaapi_encode_pick_next(AVCodecContext *avctx,
> >
> >       vaapi_encode_add_ref(avctx, pic, pic, 0, 1, 0);
> >       if (pic->type != PICTURE_TYPE_IDR) {
> > -        vaapi_encode_add_ref(avctx, pic, start,
> > -                             pic->type == PICTURE_TYPE_P,
> > -                             b_counter > 0, 0);
> > +        // TODO: apply both previous and forward multi reference
> > for all vaapi encoders.
> > +        // And L0/L1 reference frame number can be set dynamically
> > through query
> > +        // VAConfigAttribEncMaxRefFrames attribute.
> > +        if (avctx->codec_id == AV_CODEC_ID_AV1) {
> > +            for (i = 0; i < ctx->nb_next_prev; i++)
> > +                vaapi_encode_add_ref(avctx, pic, ctx-
> > >next_prev[i],
> > +                                     pic->type == PICTURE_TYPE_P,
> > +                                     b_counter > 0, 0);
>
> So an undisclosed aim of the previous patch was to make this extra
> list of the most recent N frames for this to use with P frames only?

Currently, yes. As the TODO comment says, I will apply it to all other
codecs and to B frames next; that will need a lot of testing on
different hardware. Adding this for AV1 here is to align with VPL, which
uses 2 previous references for P frames. Based on our tests, 2 reference
frames for P improve quality by ~5% BD-rate, which makes it possible for
users to get the same quality as VPL after adjusting and aligning
encoder options.

>
> Why this partcular structure for P frames only, though?  Seems like
> if you support extra references then other codecs could also make use
> of them in either direction, so we would be better off hinting that
> the codec would like up to N references and then using the contents
> of the existing DPB, plus keep extras if there is space.
>
> > +        } else
> > +            vaapi_encode_add_ref(avctx, pic, start,
> > +                                 pic->type == PICTURE_TYPE_P,
> > +                                 b_counter > 0, 0);
> > +
> >           vaapi_encode_add_ref(avctx, pic, ctx->next_prev[ctx-
> > >nb_next_prev - 1], 0, 0, 1);
> >       }
> >
> > @@ -1292,6 +1356,19 @@ int
> > ff_vaapi_encode_receive_packet(AVCodecContext *avctx, AVPacket
> > *pkt)
> >       AVFrame *frame = ctx->frame;
> >       int err;
> >
> > +start:
> > +    /** if no B frame before repeat P frame, sent repeat P frame
> > out. */
> > +    if (avctx->codec_id == AV_CODEC_ID_AV1 && ctx->tail_pkt->size)
> > {
> > +        for (VAAPIEncodePicture *tmp = ctx->pic_start; tmp; tmp =
> > tmp->next) {
> > +            if (tmp->type == PICTURE_TYPE_B && tmp->pts < ctx-
> > >tail_pkt->pts)
> > +                break;
> > +            else if (!tmp->next) {
> > +                av_packet_move_ref(pkt, ctx->tail_pkt);
> > +                goto end;
> > +            }
> > +        }
> > +    }
> > +
> >       err = ff_encode_get_frame(avctx, frame);
> >       if (err < 0 && err != AVERROR_EOF)
> >           return err;
> > @@ -1356,17 +1433,21 @@ int
> > ff_vaapi_encode_receive_packet(AVCodecContext *avctx, AVPacket
> > *pkt)
> >           return err;
> >       }
> >
> > -    vaapi_encode_set_output_timestamp(avctx, pic, pkt);
> > -    av_log(avctx, AV_LOG_DEBUG, "Output packet: pts %"PRId64", dts
> > %"PRId64", "
> > -           "size %u bytes.\n", pkt->pts, pkt->dts, pkt->size);
> > -
> >       ctx->output_order = pic->encode_order;
> >       vaapi_encode_clear_old(avctx);
> >
> > +    /** loop to get an available pkt in encoder flushing. */
> > +    if (ctx->end_of_stream && !pkt->size)
> > +        goto start;
> > +
> > +end:
> > +    if (pkt->size)
> > +        av_log(avctx, AV_LOG_DEBUG, "Output packet: pts %"PRId64",
> > dts %"PRId64", "
> > +               "size %u bytes.\n", pkt->pts, pkt->dts, pkt->size);
> > +
> >       return 0;
> >   }
> >
> > -
> >   static av_cold void vaapi_encode_add_global_param(AVCodecContext
> > *avctx, int type,
> >                                                     void *buffer,
> > size_t size)
> >   {
> > @@ -2667,6 +2748,12 @@ av_cold int
> > ff_vaapi_encode_init(AVCodecContext *avctx)
> >       ctx->device = (AVHWDeviceContext*)ctx->device_ref->data;
> >       ctx->hwctx = ctx->device->hwctx;
> >
> > +    ctx->tail_pkt = av_packet_alloc();
> > +    if (!ctx->tail_pkt) {
> > +        err = AVERROR(ENOMEM);
> > +        goto fail;
> > +    }
> > +
> >       err = vaapi_encode_profile_entrypoint(avctx);
> >       if (err < 0)
> >           goto fail;
> > @@ -2859,9 +2946,11 @@ av_cold int
> > ff_vaapi_encode_close(AVCodecContext *avctx)
> >       }
> >
> >       av_frame_free(&ctx->frame);
> > +    av_packet_free(&ctx->tail_pkt);
> >
> >       av_freep(&ctx->codec_sequence_params);
> >       av_freep(&ctx->codec_picture_params);
> > +    av_freep(&ctx->header_data);
> >       av_fifo_freep2(&ctx->encode_fifo);
> >
> >       av_buffer_unref(&ctx->recon_frames_ref);
> > diff --git a/libavcodec/vaapi_encode.h b/libavcodec/vaapi_encode.h
> > index d5452a37b3..03df8d6d46 100644
> > --- a/libavcodec/vaapi_encode.h
> > +++ b/libavcodec/vaapi_encode.h
> > @@ -133,6 +133,11 @@ typedef struct VAAPIEncodePicture {
> >
> >       int          nb_slices;
> >       VAAPIEncodeSlice *slices;
> > +
> > +    /** Tail data of current pic, used only for repeat header of
> > AV1. */
> > +    char tail_data[MAX_PARAM_BUFFER_SIZE];
> > +    /** Byte length of tail_data. */
> > +    size_t tail_size;
> >   } VAAPIEncodePicture;
> >
> >   typedef struct VAAPIEncodeProfile {
> > @@ -367,6 +372,13 @@ typedef struct VAAPIEncodeContext {
> >       AVFifo          *encode_fifo;
> >       // Max number of frame buffered in encoder.
> >       int             async_depth;
> > +
> > +    /** Head data for current output pkt, used only for AV1. */
> > +    void  *header_data;
> > +    size_t header_data_size;
> > +
> > +    /** Store av1 repeat frame header pkt. */
> > +    AVPacket *tail_pkt;
> >   } VAAPIEncodeContext;
> >
> >   enum {
> > diff --git a/libavcodec/vaapi_encode_av1.c
> > b/libavcodec/vaapi_encode_av1.c
> > new file mode 100644
> > index 0000000000..4c8a9d995d
> > --- /dev/null
> > +++ b/libavcodec/vaapi_encode_av1.c
> > @@ -0,0 +1,1229 @@
> > +/*
> > + * Copyright (c) 2023 Intel Corporation
> > + *
> > + * This file is part of FFmpeg.
> > + *
> > + * FFmpeg is free software; you can redistribute it and/or
> > + * modify it under the terms of the GNU Lesser General Public
> > + * License as published by the Free Software Foundation; either
> > + * version 2.1 of the License, or (at your option) any later
> > version.
> > + *
> > + * FFmpeg is distributed in the hope that it will be useful,
> > + * but WITHOUT ANY WARRANTY; without even the implied warranty of
> > + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
> > GNU
> > + * Lesser General Public License for more details.
> > + *
> > + * You should have received a copy of the GNU Lesser General
> > Public
> > + * License along with FFmpeg; if not, write to the Free Software
> > + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
> > 02110-1301 USA
> > + */
> > +
> > +#include <va/va.h>
> > +#include <va/va_enc_av1.h>
> > +
> > +#include "libavutil/pixdesc.h"
> > +#include "libavutil/opt.h"
> > +
> > +#include "cbs_av1.h"
> > +#include "put_bits.h"
> > +#include "codec_internal.h"
> > +#include "av1_levels.h"
> > +#include "vaapi_encode.h"
> > +
> > +#define AV1_MAX_QUANT 255
> > +
> > +typedef struct VAAPIEncodeAV1Picture {
> > +    int64_t last_idr_frame;
> > +    int slot;
> > +} VAAPIEncodeAV1Picture;
> > +
> > +typedef struct VAAPIEncodeAV1Context {
> > +    VAAPIEncodeContext common;
> > +    AV1RawOBU sh; /**< sequence header.*/
> > +    AV1RawOBU fh; /**< frame header.*/
> > +    CodedBitstreamContext *cbc;
> > +    CodedBitstreamFragment current_obu;
> > +    VAConfigAttribValEncAV1 attr;
> > +    VAConfigAttribValEncAV1Ext1 attr_ext1;
> > +    VAConfigAttribValEncAV1Ext2 attr_ext2;
> > +
> > +    char sh_data[MAX_PARAM_BUFFER_SIZE]; /**< coded sequence
> > header data. */
> > +    size_t sh_data_len; /**< bit length of sh_data. */
> > +    char fh_data[MAX_PARAM_BUFFER_SIZE]; /**< coded frame header
> > data. */
> > +    size_t fh_data_len; /**< bit length of fh_data. */
> > +
> > +    uint8_t uniform_tile;
> > +    uint8_t use_128x128_superblock;
> > +    int sb_cols;
> > +    int sb_rows;
> > +    int tile_cols_log2;
> > +    int tile_rows_log2;
> > +    int max_tile_width_sb;
> > +    int max_tile_height_sb;
> > +    uint8_t width_in_sbs_minus_1[AV1_MAX_TILE_COLS];
> > +    uint8_t height_in_sbs_minus_1[AV1_MAX_TILE_ROWS];
> > +
> > +    int min_log2_tile_cols;
> > +    int max_log2_tile_cols;
> > +    int min_log2_tile_rows;
> > +    int max_log2_tile_rows;
> > +
> > +    int q_idx_idr;
> > +    int q_idx_p;
> > +    int q_idx_b;
> > +
> > +    /** user options */
> > +    int profile;
> > +    int level;
> > +    int tier;
> > +    int tile_cols, tile_rows;
> > +    int tile_groups;
> > +} VAAPIEncodeAV1Context;
> > +
> > +static av_cold int vaapi_encode_av1_configure(AVCodecContext
> > *avctx)
> > +{
> > +    VAAPIEncodeContext     *ctx = avctx->priv_data;
> > +    VAAPIEncodeAV1Context *priv = avctx->priv_data;
> > +    int ret;
> > +
> > +    ret = ff_cbs_init(&priv->cbc, AV_CODEC_ID_AV1, avctx);
> > +    if (ret < 0)
> > +        return ret;
> > +
> > +    if (ctx->rc_mode->quality) {
> > +        priv->q_idx_p = av_clip(ctx->rc_quality, 0,
> > AV1_MAX_QUANT);
> > +        if (fabs(avctx->i_quant_factor) > 0.0)
> > +            priv->q_idx_idr =
> > +                av_clip((fabs(avctx->i_quant_factor) * priv-
> > >q_idx_p  +
> > +                         avctx->i_quant_offset) + 0.5,
> > +                        0, AV1_MAX_QUANT);
> > +        else
> > +            priv->q_idx_idr = priv->q_idx_p;
> > +
> > +        if (fabs(avctx->b_quant_factor) > 0.0)
> > +            priv->q_idx_b =
> > +                av_clip((fabs(avctx->b_quant_factor) * priv-
> > >q_idx_p  +
> > +                         avctx->b_quant_offset) + 0.5,
> > +                        0, AV1_MAX_QUANT);
> > +        else
> > +            priv->q_idx_b = priv->q_idx_p;
> > +    } else {
> > +        /** Arbitrary value */
> > +        priv->q_idx_idr = priv->q_idx_p = priv->q_idx_b = 128;
> > +    }
>
> Are there any alignment requirements here?  (If I gave it an input
> which is, say, 1361x1, would that work?)

I didn't get you. Which parameter here needs to be aligned? The
max/min resolution supported by the hardware is checked elsewhere. With
an input of 1361x1 there will be an error like:

"Hardware does not support encoding at size 1376x16 (constraints: width
32-8192 height 32-8192)."

>
> > +
> > +    return 0;
> > +}
> > +
> > +static int vaapi_encode_av1_add_obu(AVCodecContext *avctx,
> > +                                    CodedBitstreamFragment *au,
> > +                                    uint8_t type,
> > +                                    void *obu_unit)
> > +{
> > +    int ret;
> > +
> > +    ret = ff_cbs_insert_unit_content(au, -1,
> > +                                     type, obu_unit, NULL);
> > +    if (ret < 0) {
> > +        av_log(avctx, AV_LOG_ERROR, "Failed to add OBU unit: "
> > +               "type = %d.\n", type);
> > +        return ret;
> > +    }
> > +
> > +    return 0;
> > +}
> > +
> > +static int vaapi_encode_av1_write_obu(AVCodecContext *avctx,
> > +                                      char *data, size_t
> > *data_len,
> > +                                      CodedBitstreamFragment *bs)
> > +{
> > +    VAAPIEncodeAV1Context *priv = avctx->priv_data;
> > +    int ret;
> > +
> > +    ret = ff_cbs_write_fragment_data(priv->cbc, bs);
> > +    if (ret < 0) {
> > +        av_log(avctx, AV_LOG_ERROR, "Failed to write packed
> > header.\n");
> > +        return ret;
> > +    }
> > +
> > +    if ((size_t)8 * MAX_PARAM_BUFFER_SIZE < 8 * bs->data_size -
> > bs->data_bit_padding) {
> > +        av_log(avctx, AV_LOG_ERROR, "Access unit too large: "
> > +               "%zu < %zu.\n", (size_t)8 * MAX_PARAM_BUFFER_SIZE,
> > +               8 * bs->data_size - bs->data_bit_padding);
> > +        return AVERROR(ENOSPC);
> > +    }
> > +
> > +    memcpy(data, bs->data, bs->data_size);
> > +    *data_len = 8 * bs->data_size - bs->data_bit_padding;
> > +
> > +    return 0;
> > +}
> > +
> > +static int get_relative_dist(const AV1RawSequenceHeader *seq,
> > +                             unsigned int a, unsigned int b)
> > +{
> > +    unsigned int diff, m;
> > +    if (!seq->enable_order_hint)
> > +        return 0;
> > +    diff = a - b;
> > +    m = 1 << seq->order_hint_bits_minus_1;
> > +    diff = (diff & (m - 1)) - (diff & m);
> > +    return diff;
> > +}
> > +
> > +static int write_ns(PutBitContext *pbc, uint32_t n, uint32_t
> > value)
> > +{
> > +    uint32_t w, m, v, extra_bit;
> > +
> > +    w = av_log2(n) + 1;
> > +    m = (1 << w) - n;
> > +
> > +    if (value < m) {
> > +        v = value;
> > +        put_bits(pbc, w - 1, v);
> > +    } else {
> > +        v = m + ((value - m) >> 1);
> > +        extra_bit = (value - m) & 1;
> > +        put_bits(pbc, w - 1, v);
> > +        put_bits(pbc, 1, extra_bit);
> > +    }
> > +
> > +    return 0;
> > +}
> > +
> > +/**
> > + * This API provide the minmum implemention according current
> > enabled features
> > + * in frame header. If more features will enable in furture,
> > please make sure
> > + * the relative flags of features should be packed correctly into
> > frame header
> > + * obu in this API.
> > + */
> > +static int vaapi_encode_av1_write_frame_header(AVCodecContext
> > *avctx,
> > +                                               VAAPIEncodePicture
> > *pic,
> > +                                               char *data, size_t
> > *data_len)
> > +{
> > +    VAAPIEncodeContext              *ctx = avctx->priv_data;
> > +    VAAPIEncodeAV1Context          *priv = avctx->priv_data;
> > +    AV1RawOBU                    *fh_obu = &priv->fh;
> > +    AV1RawOBU                    *sh_obu = &priv->sh;
> > +    AV1RawFrameHeader                *fh = &fh_obu-
> > >obu.frame.header;
> > +    AV1RawSequenceHeader             *sh = &sh_obu-
> > >obu.sequence_header;
> > +    VAEncPictureParameterBufferAV1 *vpic = pic-
> > >codec_picture_params;
> > +    PutBitContext pbc, pbc_tmp;
> > +    uint8_t byte;
> > +    int qindex, coded_lossless;
> > +    int id_len, frame_is_intra, skip_mode_allowed;
> > +    int start, payload_bits, obu_size, obu_size_len;
> > +    int qindex_offset, loopfilter_offset;
> > +    int cdef_start_offset, cdef_end_offset;
> > +    int i;
> > +
> > +    init_put_bits(&pbc, data, MAX_PARAM_BUFFER_SIZE);
> > +
> > +    /** obu header */
> > +    put_bits(&pbc, 1, fh_obu->header.obu_forbidden_bit);
> > +    put_bits(&pbc, 4, fh_obu->header.obu_type);
> > +    put_bits(&pbc, 1, fh_obu->header.obu_extension_flag);
> > +    put_bits(&pbc, 1, fh_obu->header.obu_has_size_field);
> > +    put_bits(&pbc, 1, fh_obu->header.obu_reserved_1bit);
> > +
> > +    /** record pbc status to re-write obu size later. */
> > +    if (fh_obu->header.obu_has_size_field) {
> > +        pbc_tmp = pbc;
> > +        put_bits32(&pbc, 0);
> > +        put_bits32(&pbc, 0);
> > +    }
> > +
> > +    start = put_bits_count(&pbc);
> > +
> > +    /** uncompressed_header() */
> > +    if (sh->frame_id_numbers_present_flag)
> > +        id_len = sh->additional_frame_id_length_minus_1 +
> > +                 sh->delta_frame_id_length_minus_2 + 3;
> > +
> > +    frame_is_intra = (fh->frame_type == AV1_FRAME_KEY ||
> > +                      fh->frame_type == AV1_FRAME_INTRA_ONLY);
> > +    if (!sh->reduced_still_picture_header) {
> > +        put_bits(&pbc, 1, fh->show_existing_frame);
> > +        if (fh->show_existing_frame) {
> > +            put_bits(&pbc, 3, fh->frame_to_show_map_idx);
> > +            goto trailing_bits;
> > +        }
> > +        if (sh->frame_id_numbers_present_flag)
> > +            put_bits(&pbc, id_len, fh->display_frame_id);
> > +
> > +        put_bits(&pbc, 2, fh->frame_type);
> > +        put_bits(&pbc, 1, fh->show_frame);
> > +        if (!fh->show_frame)
> > +            put_bits(&pbc, 1, fh->showable_frame);
> > +
> > +        if (!(fh->frame_type == AV1_FRAME_SWITCH ||
> > +            (fh->frame_type == AV1_FRAME_KEY && fh->show_frame)))
> > +            put_bits(&pbc, 1, fh->error_resilient_mode);
> > +    }
> > +
> > +    put_bits(&pbc, 1, fh->disable_cdf_update);
> > +    if (sh->seq_force_screen_content_tools ==
> > AV1_SELECT_SCREEN_CONTENT_TOOLS)
> > +        put_bits(&pbc, 1, fh->allow_screen_content_tools);
> > +
> > +    if (fh->allow_screen_content_tools && sh->seq_force_integer_mv
> > == AV1_SELECT_INTEGER_MV)
> > +        put_bits(&pbc, 1, fh->force_integer_mv);
> > +
> > +    if (sh->frame_id_numbers_present_flag)
> > +        put_bits(&pbc, id_len, fh->current_frame_id);
> > +
> > +    if (fh->frame_type != AV1_FRAME_SWITCH && !sh-
> > >reduced_still_picture_header)
> > +        put_bits(&pbc, 1, fh->frame_size_override_flag);
> > +
> > +    if (sh->enable_order_hint)
> > +        put_bits(&pbc, sh->order_hint_bits_minus_1 + 1, fh-
> > >order_hint);
> > +
> > +    if (!(frame_is_intra || fh->error_resilient_mode))
> > +        put_bits(&pbc, 3, fh->primary_ref_frame);
> > +
> > +    if (!(fh->frame_type == AV1_FRAME_SWITCH ||
> > +        fh->frame_type == AV1_FRAME_KEY && fh->show_frame))
> > +        put_bits(&pbc, 8, fh->refresh_frame_flags);
> > +
> > +    if (frame_is_intra) {
> > +        /** render_size() */
> > +         put_bits(&pbc, 1, fh->render_and_frame_size_different);
> > +    } else {
> > +        if (!frame_is_intra && sh->enable_order_hint)
> > +            put_bits(&pbc, 1, fh->frame_refs_short_signaling);
> > +
> > +        for (i = 0; i < AV1_REFS_PER_FRAME; i++) {
> > +            if (!fh->frame_refs_short_signaling)
> > +                put_bits(&pbc, 3, fh->ref_frame_idx[i]);
> > +        }
> > +
> > +        if (!(fh->frame_size_override_flag && !fh-
> > >error_resilient_mode))
> > +            put_bits(&pbc, 1, fh-
> > >render_and_frame_size_different);
> > +
> > +        if (!fh->force_integer_mv)
> > +            put_bits(&pbc, 1, fh->allow_high_precision_mv);
> > +
> > +        /** read_interpolation_filter() */
> > +        put_bits(&pbc, 1, fh->is_filter_switchable);
> > +        if (!fh->is_filter_switchable)
> > +            put_bits(&pbc, 2, fh->interpolation_filter);
> > +
> > +        put_bits(&pbc, 1, fh->is_motion_mode_switchable);
> > +    }
> > +
> > +    if (!(sh->reduced_still_picture_header || fh-
> > >disable_cdf_update))
> > +        put_bits(&pbc, 1, fh->disable_frame_end_update_cdf);
> > +
> > +    /** tile_info() */
> > +    put_bits(&pbc, 1, fh->uniform_tile_spacing_flag);
> > +    if (fh->uniform_tile_spacing_flag) {
> > +        for (i = 0; i < priv->tile_cols_log2 - priv-
> > >min_log2_tile_cols; i++) {
> > +            put_bits(&pbc, 1, 1);
> > +        }
> > +        if (priv->tile_cols_log2 < priv->max_log2_tile_cols)
> > +            put_bits(&pbc, 1, 0);
> > +
> > +        for (i = 0; i < priv->tile_rows_log2 - priv-
> > >min_log2_tile_rows; i++) {
> > +            put_bits(&pbc, 1, 1);
> > +        }
> > +        if (priv->tile_rows_log2 < priv->max_log2_tile_rows)
> > +            put_bits(&pbc, 1, 0);
> > +    } else {
> > +        int start_sb = 0;
> > +        int max_width, max_height;
> > +        for (i = 0; start_sb < priv->sb_cols; i++) {
> > +            max_width = FFMIN(priv->sb_cols - start_sb, priv-
> > >max_tile_width_sb);
> > +            write_ns(&pbc, max_width, fh-
> > >width_in_sbs_minus_1[i]);
> > +            start_sb += fh->width_in_sbs_minus_1[i] + 1;
> > +        }
> > +
> > +        start_sb = 0;
> > +        for (i = 0; start_sb < priv->sb_rows; i++) {
> > +            max_height = FFMIN(priv->sb_rows - start_sb, priv-
> > >max_tile_height_sb);
> > +            write_ns(&pbc, max_height, fh-
> > >height_in_sbs_minus_1[i]);
> > +            start_sb += fh->height_in_sbs_minus_1[i] + 1;
> > +        }
> > +    }
> > +
> > +    if (priv->tile_cols_log2 || priv->tile_rows_log2) {
> > +        put_bits(&pbc, priv->tile_cols_log2 + priv-
> > >tile_rows_log2, fh->context_update_tile_id);
> > +        put_bits(&pbc, 2, fh->tile_size_bytes_minus1);
> > +    }
> > +
> > +    qindex_offset = put_bits_count(&pbc);
> > +    /** quantization_params() */
> > +    put_bits(&pbc, 8, fh->base_q_idx);
> > +    put_bits(&pbc, 1, fh->delta_q_y_dc);
> > +    put_bits(&pbc, 1, fh->delta_q_u_dc);
> > +    put_bits(&pbc, 1, fh->delta_q_u_ac);
> > +    put_bits(&pbc, 1, fh->using_qmatrix);
> > +
> > +    /** segmentation_params() */
> > +    put_bits(&pbc, 1, fh->segmentation_enabled);
> > +
> > +    /** delta_q_params() */
> > +    if (fh->base_q_idx)
> > +        put_bits(&pbc, 1, fh->delta_q_present);
> > +    if (fh->delta_q_present)
> > +        put_bits(&pbc, 2, fh->delta_q_res);
> > +
> > +    /** delta_lf_params() */
> > +    if (fh->delta_q_present) {
> > +        if (!fh->allow_intrabc)
> > +            put_bits(&pbc, 1, fh->delta_lf_present);
> > +        if (fh->delta_lf_present) {
> > +            put_bits(&pbc, 2, fh->delta_lf_res);
> > +            put_bits(&pbc, 1, fh->delta_lf_multi);
> > +        }
> > +    }
> > +
> > +    /** codelossless */
> > +    coded_lossless = 1;
> > +    for (i = 0; i < AV1_MAX_SEGMENTS; i++) {
> > +        if (fh->segmentation_enabled && fh-
> > >feature_enabled[i][AV1_SEG_LVL_ALT_Q])
> > +            qindex = fh->base_q_idx + fh-
> > >feature_value[i][AV1_SEG_LVL_ALT_Q];
> > +        else
> > +            qindex = fh->base_q_idx;
> > +        qindex = av_clip_uintp2(qindex, 8);
> > +
> > +        if (qindex || fh->delta_q_y_dc || fh->delta_q_u_ac || fh-
> > >delta_q_u_dc ||
> > +            fh->delta_q_v_ac || fh->delta_q_v_dc)
> > +            coded_lossless = 0;
> > +    }
> > +
> > +    loopfilter_offset = put_bits_count(&pbc);
> > +    /** loop_filter_params() */
> > +    if (!(coded_lossless || fh->allow_intrabc)) {
> > +        put_bits(&pbc, 6, fh->loop_filter_level[0]);
> > +        put_bits(&pbc, 6, fh->loop_filter_level[1]);
> > +        if (fh->loop_filter_level[0] || fh->loop_filter_level[1])
> > {
> > +            put_bits(&pbc, 6, fh->loop_filter_level[2]);
> > +            put_bits(&pbc, 6, fh->loop_filter_level[3]);
> > +        }
> > +        put_bits(&pbc, 3, fh->loop_filter_sharpness);
> > +        put_bits(&pbc, 1, fh->loop_filter_delta_enabled);
> > +    }
> > +
> > +    cdef_start_offset = put_bits_count(&pbc);
> > +    /** cdef_params() */
> > +    if (!(coded_lossless || fh->allow_intrabc || !sh-
> > >enable_cdef)) {
> > +        put_bits(&pbc, 2, fh->cdef_damping_minus_3);
> > +        put_bits(&pbc, 2, fh->cdef_bits);
> > +        for (i = 0; i < (1 << fh->cdef_bits); i++) {
> > +            put_bits(&pbc, 4, fh->cdef_y_pri_strength[i]);
> > +            put_bits(&pbc, 2, fh->cdef_y_sec_strength[i]);
> > +            put_bits(&pbc, 4, fh->cdef_uv_pri_strength[i]);
> > +            put_bits(&pbc, 2, fh->cdef_uv_sec_strength[i]);
> > +        }
> > +    }
> > +    cdef_end_offset = put_bits_count(&pbc);
> > +
> > +    /** read_tx_mode() */
> > +    if (fh->tx_mode == AV1_TX_MODE_SELECT)
> > +        put_bits(&pbc, 1, 1);
> > +    else
> > +        put_bits(&pbc, 1, 0);
> > +
> > +    /** frame_reference_mode() */
> > +    if (!frame_is_intra)
> > +        put_bits(&pbc, 1, fh->reference_select);
> > +
> > +    /** skip_mode_params() */
> > +    if (frame_is_intra || !fh->reference_select || !sh-
> > >enable_order_hint)
> > +        skip_mode_allowed = 0;
> > +    else {
> > +        int forward_idx, backward_idx;
> > +        int ref_hint, forward_hint, backward_hint;
> > +
> > +        forward_idx = -1;
> > +        backward_idx = -1;
> > +
> > +        for (i = 0; i < AV1_REFS_PER_FRAME; i++) {
> > +            ref_hint = fh->ref_order_hint[fh->ref_frame_idx[i]];
> > +            if (get_relative_dist(sh, ref_hint, fh->order_hint) <
> > 0) {
> > +                if (forward_idx < 0 || get_relative_dist(sh,
> > ref_hint, forward_idx) > 0) {
> > +                    forward_idx = i;
> > +                    forward_hint = ref_hint;
> > +                }
> > +            } else if (get_relative_dist(sh, ref_hint, fh-
> > >order_hint) > 0) {
> > +                if (backward_idx < 0 || get_relative_dist(sh,
> > ref_hint, backward_hint) < 0) {
> > +                    backward_idx = i;
> > +                    backward_hint = ref_hint;
> > +                }
> > +            }
> > +        }
> > +        if (forward_idx < 0)
> > +            skip_mode_allowed = 0;
> > +        else if (backward_idx >= 0)
> > +            skip_mode_allowed = 1;
> > +        else {
> > +            int second_forward_idx, second_forward_hint;
> > +            second_forward_idx = -1;
> > +            for (i = 0; i < AV1_REFS_PER_FRAME; i++) {
> > +                ref_hint = fh->ref_order_hint[fh-
> > >ref_frame_idx[i]];
> > +                if (get_relative_dist(sh, ref_hint, forward_hint)
> > < 0) {
> > +                    if (second_forward_idx < 0 ||
> > get_relative_dist(sh, ref_hint, second_forward_hint) > 0){
> > +                        second_forward_idx = i;
> > +                        second_forward_hint = ref_hint;
> > +                    }
> > +                }
> > +            }
> > +            if (second_forward_idx < 0)
> > +                skip_mode_allowed = 0;
> > +            else
> > +                skip_mode_allowed = 1;
> > +        }
> > +    }
> > +
> > +    if (skip_mode_allowed)
> > +        put_bits(&pbc, 1, fh->skip_mode_present);
> > +
> > +    put_bits(&pbc, 1, fh->reduced_tx_set);
> > +
> > +    /** global_motion_params() */
> > +    if (!frame_is_intra) {
> > +        for (i = AV1_REF_FRAME_LAST; i <= AV1_REF_FRAME_ALTREF;
> > i++) {
> > +            put_bits(&pbc, 1, fh->is_global[i]);
> > +            if (fh->is_global[i]) {
> > +                put_bits(&pbc, 1, fh->is_rot_zoom[i]);
> > +                if (!fh->is_rot_zoom[i])
> > +                    put_bits(&pbc, 1, fh->is_translation[i]);
> > +            }
> > +        }
> > +    }
> > +
> > +trailing_bits:
> > +    payload_bits = put_bits_count(&pbc) - start;
> > +
> > +    /** trailing_bits() */
> > +    put_bits(&pbc, 1, 1);
> > +    obu_size = (put_bits_count(&pbc) - start + 7) / 8;
> > +    for (i = 0; i < obu_size * 8 - payload_bits - 1; i++)
> > +        put_bits(&pbc, 1, 0);
> > +
> > +    flush_put_bits(&pbc);
> > +    *data_len = put_bits_count(&pbc);
> > +
> > +    /** update obu size in bitstream */
> > +    if (fh_obu->header.obu_has_size_field) {
> > +        obu_size_len = priv->attr_ext2.bits.obu_size_bytes_minus1
> > + 1;
> > +        for (i = 0; i < obu_size_len; i++) {
> > +            byte = obu_size >> (7 * i) & 0x7f;
> > +            if (i < obu_size_len - 1)
> > +                byte |= 0x80;
> > +            put_bits(&pbc_tmp, 8, byte);
> > +        }
> > +        flush_put_bits(&pbc_tmp);
> > +        memmove(pbc_tmp.buf_ptr, pbc_tmp.buf_ptr + (8 -
> > obu_size_len), obu_size);
> > +        *data_len -= (8 - obu_size_len) * 8;
> > +    }
>
> Why is there an incomplete duplicate of the cbs_av1 header writing
> code here?

To record some positions/sizes in the bitstream that are needed by
VAAPI, like the qp_index/loopfilter/cdef offsets and the size of the
cdef parameters in bits. It's not reasonable to add such
driver-specific parameters to CBS.
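
For illustration, the pattern is roughly the following simplified
sketch (pbc is the PutBitContext used for the packed header, vpic the
VAEncPictureParameterBufferAV1 of the picture):

/* remember where quantization_params() starts, in bits */
qindex_offset = put_bits_count(&pbc);
put_bits(&pbc, 8, fh->base_q_idx);
/* ... */
/* remember where loop_filter_params() starts, in bits */
loopfilter_offset = put_bits_count(&pbc);
/* ... pack loop filter and cdef fields ... */

/* hand the recorded offsets to the driver */
vpic->bit_offset_qindex            = qindex_offset;
vpic->bit_offset_loopfilter_params = loopfilter_offset;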

>
> > +
> > +    if (fh->show_existing_frame)
> > +        return 0;
> > +
> > +    if (!(ctx->va_rc_mode & VA_RC_CQP)) {
> > +        vpic->min_base_qindex = av_clip(avctx->qmin, 1,
> > AV1_MAX_QUANT);
> > +        vpic->max_base_qindex = av_clip(avctx->qmax, 1,
> > AV1_MAX_QUANT);
> > +
> > +        vpic->bit_offset_qindex            = qindex_offset - (8 -
> > obu_size_len) * 8;
> > +        vpic->bit_offset_loopfilter_params = loopfilter_offset -
> > (8 - obu_size_len) * 8;
> > +        vpic->bit_offset_cdef_params       = cdef_start_offset -
> > (8 - obu_size_len) * 8;
> > +        vpic->size_in_bits_cdef_params     = cdef_end_offset -
> > cdef_start_offset;
> > +        vpic->size_in_bits_frame_hdr_obu   = *data_len;
> > +
> > +        vpic->byte_offset_frame_hdr_obu_size = (((pic->type ==
> > PICTURE_TYPE_IDR) ?
> > +                                               priv->sh_data_len /
> > 8 : 0) +
> > +                                               (fh_obu-
> > >header.obu_extension_flag ?
> > +                                               2 : 1));
> > +    }
> > +
> > +    return 0;
> > +}
> > +
> > +static int tile_log2(int blkSize, int target) {
> > +    int k;
> > +    for (k = 0; (blkSize << k) < target; k++);
> > +    return k;
> > +}
> > +
> > +static int vaapi_encode_av1_set_tile(AVCodecContext *avctx)
> > +{
> > +    VAAPIEncodeAV1Context *priv = avctx->priv_data;
> > +    int mi_cols, mi_rows, sb_shift, sb_size;
> > +    int max_tile_area_sb, max_tile_area_sb_varied;
> > +    int tile_width_sb, tile_height_sb, widest_tile_sb;
> > +    int min_log2_tiles;
> > +    int tile_rows_tmp, i;
> > +
> > +    if (priv->tile_cols > AV1_MAX_TILE_COLS ||
> > +        priv->tile_rows > AV1_MAX_TILE_ROWS) {
> > +        av_log(avctx, AV_LOG_ERROR, "Invalid tile number %dx%d,
> > should less than %dx%d.\n",
> > +               priv->tile_cols, priv->tile_rows,
> > AV1_MAX_TILE_COLS, AV1_MAX_TILE_ROWS);
>
> Can't this be a constraint on the option rather than a special check
> here?

Tiles are set with the IMAGE_SIZE option type, which aligns with HEVC.
That option type doesn't support a min/max check.
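
For reference, the declaration is along the lines of the HEVC one (a
sketch; the exact default and help text may differ in the patch):

/* AV_OPT_TYPE_IMAGE_SIZE parses "colsxrows" into two adjacent ints
 * (tile_cols, tile_rows here); the option itself carries no per-field
 * min/max, so range checking has to happen in the codec code. */
{ "tiles", "Tile columns x rows",
  OFFSET(tile_cols), AV_OPT_TYPE_IMAGE_SIZE,
  { .str = NULL }, 0, 0, FLAGS },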

>
> > +        return AVERROR(EINVAL);
> > +    }
> > +
> > +    mi_cols = 2 * ((avctx->width + 7) >> 3);
> > +    mi_rows = 2 * ((avctx->height + 7) >> 3);
> > +    priv->sb_cols = priv->use_128x128_superblock ?
> > +                    ((mi_cols + 31) >> 5) : ((mi_cols + 15) >> 4);
> > +    priv->sb_rows = priv->use_128x128_superblock ?
> > +                    ((mi_rows + 31) >> 5) : ((mi_rows + 15) >> 4);
> > +    sb_shift = priv->use_128x128_superblock ? 5 : 4;
> > +    sb_size  = sb_shift + 2;
> > +    priv->max_tile_width_sb = AV1_MAX_TILE_WIDTH >> sb_size;
> > +    max_tile_area_sb = AV1_MAX_TILE_AREA  >> (2 * sb_size);
> > +
> > +    priv->min_log2_tile_cols = tile_log2(priv->max_tile_width_sb,
> > priv->sb_cols);
> > +    priv->max_log2_tile_cols = tile_log2(1, FFMIN(priv->sb_cols,
> > AV1_MAX_TILE_COLS));
> > +    priv->max_log2_tile_rows = tile_log2(1, FFMIN(priv->sb_rows,
> > AV1_MAX_TILE_ROWS));
> > +    min_log2_tiles = FFMAX(priv->min_log2_tile_cols,
> > +                           tile_log2(max_tile_area_sb, priv-
> > >sb_rows * priv->sb_cols));
> > +
> > +    if (priv->tile_cols != av_clip(priv->tile_cols, (priv->sb_cols
> > + priv->max_tile_width_sb - 1) / priv->max_tile_width_sb, priv-
> > >sb_cols)) {
> > +        priv->tile_cols = av_clip(priv->tile_cols, (priv->sb_cols
> > + priv->max_tile_width_sb - 1) / priv->max_tile_width_sb, priv-
> > >sb_cols);
> > +        av_log(avctx, AV_LOG_WARNING, "Invalid tile cols, correct
> > to %d.\n", priv->tile_cols);
> > +    }
>
> If the user explicitly asked for a number which isn't possible then
> you probably want to fail here (see comment about some sort of "auto"
> setting above).

OK, will change it to 'auto' by default.

>
> > +
> > +    priv->tile_cols_log2 = tile_log2(1, priv->tile_cols);
> > +    tile_width_sb = (priv->sb_cols + (1 << priv->tile_cols_log2) -
> > 1) >>
> > +                    priv->tile_cols_log2;
> > +
> > +    if (priv->tile_rows > priv->sb_rows) {
> > +        priv->tile_rows = priv->sb_rows;
> > +        av_log(avctx, AV_LOG_WARNING, "Invalid tile rows, correct
> > to %d.\n", priv->tile_rows);
> > +    }
> > +
> > +    for (tile_rows_tmp = priv->tile_rows; tile_rows_tmp <= priv-
> > >sb_rows && tile_rows_tmp <= AV1_MAX_TILE_ROWS; tile_rows_tmp++) {
> > +        /** try uniformed tile. */
> > +        priv->tile_rows_log2 = tile_log2(1, tile_rows_tmp);
> > +        if ((priv->sb_cols + tile_width_sb - 1) / tile_width_sb ==
> > priv->tile_cols) {
> > +            for (i = 0; i < priv->tile_cols - 1; i++)
> > +                priv->width_in_sbs_minus_1[i] = tile_width_sb - 1;
> > +            priv->width_in_sbs_minus_1[i] = priv->sb_cols - (priv-
> > >tile_cols - 1) * tile_width_sb - 1;
> > +
> > +            tile_height_sb = (priv->sb_rows + (1 << priv-
> > >tile_rows_log2) - 1) >>
> > +                             priv->tile_rows_log2;
> > +
> > +            if ((priv->sb_rows + tile_height_sb - 1) /
> > tile_height_sb == tile_rows_tmp &&
> > +                tile_height_sb <= max_tile_area_sb /
> > tile_width_sb) {
> > +                for (i = 0; i < tile_rows_tmp - 1; i++)
> > +                    priv->height_in_sbs_minus_1[i] =
> > tile_height_sb - 1;
> > +                priv->height_in_sbs_minus_1[i] = priv->sb_rows -
> > (tile_rows_tmp - 1) * tile_height_sb - 1;
> > +
> > +                priv->uniform_tile = 1;
> > +                priv->min_log2_tile_rows = FFMAX(min_log2_tiles -
> > priv->tile_cols_log2, 0);
> > +
> > +                break;
> > +            }
> > +        }
> > +
> > +        /** try non-uniformed tile. */
> > +        widest_tile_sb = 0;
> > +        for (i = 0; i < priv->tile_cols; i++) {
> > +            priv->width_in_sbs_minus_1[i] = (i + 1) * priv-
> > >sb_cols / priv->tile_cols - i * priv->sb_cols / priv->tile_cols -
> > 1;
> > +            widest_tile_sb = FFMAX(widest_tile_sb, priv-
> > >width_in_sbs_minus_1[i] + 1);
> > +        }
> > +
> > +        if (min_log2_tiles)
> > +            max_tile_area_sb_varied = (priv->sb_rows * priv-
> > >sb_cols) >> (min_log2_tiles + 1);
> > +        else
> > +            max_tile_area_sb_varied = priv->sb_rows * priv-
> > >sb_cols;
> > +        priv->max_tile_height_sb = FFMAX(1,
> > max_tile_area_sb_varied / widest_tile_sb);
> > +
> > +        if (tile_rows_tmp == av_clip(tile_rows_tmp, (priv->sb_rows
> > + priv->max_tile_height_sb - 1) / priv->max_tile_height_sb, priv-
> > >sb_rows)) {
> > +            for (i = 0; i < tile_rows_tmp; i++)
> > +                priv->height_in_sbs_minus_1[i] = (i + 1) * priv-
> > >sb_rows / tile_rows_tmp - i * priv->sb_rows / tile_rows_tmp - 1;
> > +
> > +            break;
> > +        }
> > +    }
> > +
> > +    if (priv->tile_rows != tile_rows_tmp) {
> > +        priv->tile_rows = tile_rows_tmp;
> > +        av_log(avctx, AV_LOG_WARNING, "Invalid tile rows, correct
> > to %d.\n", priv->tile_rows);
> > +    }
> > +
> > +    /** check if tile cols/rows is supported by driver. */
> > +    if (priv->attr_ext2.bits.max_tile_num_minus1) {
> > +        if ((priv->tile_cols * priv->tile_rows - 1) > priv-
> > >attr_ext2.bits.max_tile_num_minus1) {
> > +            av_log(avctx, AV_LOG_ERROR, "Unsupported tile num %d *
> > %d = %d by driver, "
> > +                   "should be less than %d.\n", priv->tile_cols,
> > priv->tile_rows,
>
> s/less than/at most/ ?
>
> > +                   priv->tile_cols * priv->tile_rows,
> > +                   priv->attr_ext2.bits.max_tile_num_minus1 + 1);
> > +            return AVERROR(EINVAL);
> > +        }
> > +    }
> > +    av_log(avctx, AV_LOG_DEBUG, "Setting tile cols/rows to
> > %d/%d.\n",
> > +           priv->tile_cols, priv->tile_rows);
> > +
> > +    /** check if tile group numbers is valid. */
> > +    if (priv->tile_groups > priv->tile_cols * priv->tile_rows) {
> > +        av_log(avctx, AV_LOG_WARNING, "Invalid tile groups number
> > %d, "
> > +        "correct to %d.\n", priv->tile_groups, priv->tile_cols *
> > priv->tile_rows);
> > +        priv->tile_groups = priv->tile_cols * priv->tile_rows;
> > +    }
> > +
> > +    return 0;
> > +}
> > +
> > +static int vaapi_encode_av1_write_sequence_header(AVCodecContext
> > *avctx,
> > +                                                  char *data,
> > size_t *data_len)
> > +{
> > +    VAAPIEncodeAV1Context *priv = avctx->priv_data;
> > +
> > +    memcpy(data, &priv->sh_data, MAX_PARAM_BUFFER_SIZE *
> > sizeof(char));
> > +    *data_len = priv->sh_data_len;
>
> I'm confused why you write the sequence header in the init function
> and then store it to use here?

The sequence data stays the same during the whole encode, so I think
it doesn't need to be re-written for each GOP.

>
> > +
> > +    return 0;
> > +}
> > +
> > +static int vaapi_encode_av1_init_sequence_params(AVCodecContext
> > *avctx)
> > +{
> > +    VAAPIEncodeContext               *ctx = avctx->priv_data;
> > +    VAAPIEncodeAV1Context           *priv = avctx->priv_data;
> > +    AV1RawOBU                     *sh_obu = &priv->sh;
> > +    AV1RawSequenceHeader              *sh = &sh_obu-
> > >obu.sequence_header;
> > +    VAEncSequenceParameterBufferAV1 *vseq = ctx-
> > >codec_sequence_params;
> > +    CodedBitstreamFragment           *obu = &priv->current_obu;
> > +    const AVPixFmtDescriptor *desc;
> > +    int ret;
> > +
> > +    memset(sh_obu, 0, sizeof(*sh_obu));
> > +    sh_obu->header.obu_type = AV1_OBU_SEQUENCE_HEADER;
> > +
> > +    desc = av_pix_fmt_desc_get(priv->common.input_frames-
> > >sw_format);
> > +    av_assert0(desc);
> > +
> > +    sh->seq_profile  = avctx->profile;
> > +    if (!sh->seq_force_screen_content_tools)
> > +        sh->seq_force_integer_mv = AV1_SELECT_INTEGER_MV;
> > +    sh->frame_width_bits_minus_1  = av_log2(avctx->width);
> > +    sh->frame_height_bits_minus_1 = av_log2(avctx->height);
> > +    sh->max_frame_width_minus_1   = avctx->width - 1;
> > +    sh->max_frame_height_minus_1  = avctx->height - 1;
> > +    sh->seq_tier[0]               = priv->tier;
> > +    /** enable order hint and reserve maximum 8 bits for it by
> > default. */
> > +    sh->enable_order_hint         = 1;
> > +    sh->order_hint_bits_minus_1   = 7;
> > +
> > +    sh->color_config = (AV1RawColorConfig) {
> > +        .high_bitdepth                  = desc->comp[0].depth == 8
> > ? 0 : 1,
> > +        .color_primaries                = avctx->color_primaries,
> > +        .transfer_characteristics       = avctx->color_trc,
> > +        .matrix_coefficients            = avctx->colorspace,
> > +        .color_description_present_flag = (avctx->color_primaries
> > != AVCOL_PRI_UNSPECIFIED ||
> > +                                           avctx-
> > >color_trc       != AVCOL_TRC_UNSPECIFIED ||
> > +                                           avctx-
> > >colorspace      != AVCOL_SPC_UNSPECIFIED),
> > +        .color_range                    = avctx->color_range ==
> > AVCOL_RANGE_JPEG,
> > +        .subsampling_x                  = desc->log2_chroma_w,
> > +        .subsampling_y                  = desc->log2_chroma_h,
>
> .chroma_sample_position = some function of chroma_sample_location.

There is no chroma_sample_position support in VAAPI yet.

>
> > +    };
> > +
> > +    if (avctx->level != FF_LEVEL_UNKNOWN) {
> > +        sh->seq_level_idx[0] = avctx->level;
> > +    } else {
> > +        const AV1LevelDescriptor *level;
> > +        float framerate;
> > +
> > +        if (avctx->framerate.num > 0 && avctx->framerate.den > 0)
> > +            framerate = avctx->framerate.num / avctx-
> > >framerate.den;
> > +        else
> > +            framerate = 0;
> > +
> > +        level = ff_av1_guess_level(avctx->bit_rate, priv->tier,
> > +                                   ctx->surface_width, ctx-
> > >surface_height,
> > +                                   priv->tile_rows * priv-
> > >tile_cols,
> > +                                   priv->tile_cols, framerate);
> > +        if (level) {
> > +            av_log(avctx, AV_LOG_VERBOSE, "Using level %s.\n",
> > level->name);
> > +            sh->seq_level_idx[0] = level->level_idx;
> > +        } else {
> > +            av_log(avctx, AV_LOG_VERBOSE, "Stream will not conform
> > to "
> > +                   "any normal level, using level 6.3 by
> > default.\n");
> > +            sh->seq_level_idx[0] = 19;
>
> Isn't this exactly what level_idx == 31 is for?
>
> > +            sh->seq_tier[0] = 1;
> > +        }
> > +    }
> > +    vseq->seq_profile             = sh->seq_profile;
> > +    vseq->seq_level_idx           = sh->seq_level_idx[0];
> > +    vseq->seq_tier                = sh->seq_tier[0];
> > +    vseq->order_hint_bits_minus_1 = sh->order_hint_bits_minus_1;
> > +    vseq->intra_period            = ctx->gop_size;
> > +    vseq->ip_period               = ctx->b_per_p + 1;
> > +
> > +    vseq->seq_fields.bits.enable_order_hint = sh-
> > >enable_order_hint;
> > +
> > +    if (!(ctx->va_rc_mode & VA_RC_CQP)) {
> > +        vseq->bits_per_second = ctx->va_bit_rate;
> > +        vseq->seq_fields.bits.enable_cdef = sh->enable_cdef = 1;
> > +    }
> > +
> > +    ret = vaapi_encode_av1_add_obu(avctx, obu,
> > AV1_OBU_SEQUENCE_HEADER, &priv->sh);
> > +    if (ret < 0)
> > +        goto end;
> > +
> > +    ret = vaapi_encode_av1_write_obu(avctx, priv->sh_data, &priv-
> > >sh_data_len, obu);
> > +    if (ret < 0)
> > +        goto end;
> > +
> > +end:
> > +    ff_cbs_fragment_reset(obu);
> > +    return ret;
> > +}
> > +
> > +static int vaapi_encode_av1_init_picture_params(AVCodecContext
> > *avctx,
> > +                                                VAAPIEncodePicture
> > *pic)
> > +{
> > +    VAAPIEncodeAV1Context          *priv = avctx->priv_data;
> > +    VAAPIEncodeAV1Picture          *hpic = pic->priv_data;
> > +    AV1RawOBU                    *fh_obu = &priv->fh;
> > +    AV1RawFrameHeader                *fh = &fh_obu-
> > >obu.frame.header;
> > +    VAEncPictureParameterBufferAV1 *vpic = pic-
> > >codec_picture_params;
> > +    CodedBitstreamFragment          *obu = &priv->current_obu;
> > +    VAAPIEncodePicture    *ref;
> > +    VAAPIEncodeAV1Picture *href;
> > +    int slot, i;
> > +    int ret;
> > +    static const int8_t
> > default_loop_filter_ref_deltas[AV1_TOTAL_REFS_PER_FRAME] =
> > +        { 1, 0, 0, 0, -1, 0, -1, -1 };
> > +
> > +    memset(fh_obu, 0, sizeof(*fh_obu));
> > +    pic->nb_slices = priv->tile_groups;
> > +    fh_obu->header.obu_type = AV1_OBU_FRAME_HEADER;
> > +    fh_obu->header.obu_has_size_field = 1;
> > +
> > +    switch (pic->type) {
> > +    case PICTURE_TYPE_IDR:
> > +        av_assert0(pic->nb_refs[0] == 0 || pic->nb_refs[1]);
> > +        fh->frame_type = AV1_FRAME_KEY;
> > +        fh->refresh_frame_flags = 0xFF;
> > +        fh->base_q_idx = priv->q_idx_idr;
> > +        hpic->slot = 0;
> > +        hpic->last_idr_frame = pic->display_order;
> > +        break;
> > +    case PICTURE_TYPE_P:
> > +        av_assert0(pic->nb_refs[0]);
> > +        fh->frame_type = AV1_FRAME_INTER;
> > +        fh->base_q_idx = priv->q_idx_p;
> > +        ref = pic->refs[0][pic->nb_refs[0] - 1];
> > +        href = ref->priv_data;
> > +        hpic->slot = !href->slot;
>
> I'm very confused how all of these multiple-reference stuff can work
> if you only ever put pictures in slots 0 and 1.
>
> What does it actually look like?  Can you share a stream using it?

An example encoded with:
# ffmpeg -vaapi_device /dev/dri/renderD128 -f lavfi -i testsrc -vf
format=nv12,hwupload -c:v av1_vaapi -vframes 100 -y vaapi_av1.mp4

https://drive.google.com/file/d/1rP_cidjKV1GKzwGzG7-esrhw_rdWyAcG/view

>
> > +        hpic->last_idr_frame = href->last_idr_frame;
> > +        fh->refresh_frame_flags = 1 << hpic->slot;
> > +
> > +        /** set the nearest frame in L0 as all reference frame. */
> > +        for (i = 0; i < AV1_REFS_PER_FRAME; i++) {
> > +            fh->ref_frame_idx[i] = href->slot;
> > +        }
> > +        fh->primary_ref_frame = href->slot;
> > +        fh->ref_order_hint[href->slot] = ref->display_order -
> > href->last_idr_frame;
> > +        vpic->ref_frame_ctrl_l0.fields.search_idx0 =
> > AV1_REF_FRAME_LAST;
> > +
> > +        /** set the 2nd nearest frame in L0 as Golden frame. */
> > +        if (pic->nb_refs[0] > 1) {
> > +            ref = pic->refs[0][pic->nb_refs[0] - 2];
> > +            href = ref->priv_data;
> > +            fh->ref_frame_idx[3] = href->slot;
> > +            fh->ref_order_hint[href->slot] = ref->display_order -
> > href->last_idr_frame;
> > +            vpic->ref_frame_ctrl_l0.fields.search_idx1 =
> > AV1_REF_FRAME_GOLDEN;
> > +        }
> > +        break;
> > +    case PICTURE_TYPE_B:
> > +        av_assert0(pic->nb_refs[0] && pic->nb_refs[1]);
> > +        fh->frame_type = AV1_FRAME_INTER;
> > +        fh->base_q_idx = priv->q_idx_b;
> > +        fh->refresh_frame_flags = 0x0;
> > +        fh->reference_select = 1;
> > +
> > +        /** B frame will not be referenced, disable its recon
> > frame. */
> > +        vpic->picture_flags.bits.disable_frame_recon = 1;
>
> It feels like it would be very easy to support all of the B structure
> just like VP9 does here.

It's in my plan, together with multi (>2) reference support. It is
better to align with VPL first, and that needs some time to test and
compare.

>
> > +
> > +        /** Use LAST_FRAME and BWDREF_FRAME for reference. */
> > +        vpic->ref_frame_ctrl_l0.fields.search_idx0 =
> > AV1_REF_FRAME_LAST;
> > +        vpic->ref_frame_ctrl_l1.fields.search_idx0 =
> > AV1_REF_FRAME_BWDREF;
> > +
> > +        ref                            = pic->refs[0][pic-
> > >nb_refs[0] - 1];
> > +        href                           = ref->priv_data;
> > +        hpic->last_idr_frame           = href->last_idr_frame;
> > +        fh->primary_ref_frame          = href->slot;
> > +        fh->ref_order_hint[href->slot] = ref->display_order -
> > href->last_idr_frame;
> > +        for (i = 0; i < AV1_REF_FRAME_GOLDEN; i++) {
> > +            fh->ref_frame_idx[i] = href->slot;
> > +        }
> > +
> > +        ref                            = pic->refs[1][pic-
> > >nb_refs[1] - 1];
> > +        href                           = ref->priv_data;
> > +        fh->ref_order_hint[href->slot] = ref->display_order -
> > href->last_idr_frame;
> > +        for (i = AV1_REF_FRAME_GOLDEN; i < AV1_REFS_PER_FRAME;
> > i++) {
> > +            fh->ref_frame_idx[i] = href->slot;
> > +        }
> > +        break;
> > +    default:
> > +        av_assert0(0 && "invalid picture type");
> > +    }
> > +
> > +    fh->show_frame                = pic->display_order <= pic-
> > >encode_order;
> > +    fh->showable_frame            = fh->frame_type !=
> > AV1_FRAME_KEY;
> > +    fh->frame_width_minus_1       = avctx->width - 1;
> > +    fh->frame_height_minus_1      = avctx->height - 1;
> > +    fh->render_width_minus_1      = fh->frame_width_minus_1;
> > +    fh->render_height_minus_1     = fh->frame_height_minus_1;
> > +    fh->order_hint                = pic->display_order - hpic-
> > >last_idr_frame;
>
> Doesn't this overflow?

The max order hint for AV1 is 255; the forced conversion from int64_t
to uint8_t gives the expected (wrapped) result.
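
A tiny illustration of the wrap (the sequence header reserves 8 bits
for the order hint, so the difference is effectively taken modulo 256):

int64_t display_order = 300, last_idr_frame = 0;
uint8_t order_hint = display_order - last_idr_frame; /* 300 % 256 = 44 */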

>
> > +    fh->tile_cols                 = priv->tile_cols;
> > +    fh->tile_rows                 = priv->tile_rows;
> > +    fh->tile_cols_log2            = priv->tile_cols_log2;
> > +    fh->tile_rows_log2            = priv->tile_rows_log2;
> > +    fh->uniform_tile_spacing_flag = priv->uniform_tile;
> > +    fh->tile_size_bytes_minus1    = priv-
> > >attr_ext2.bits.tile_size_bytes_minus1;
> > +    fh->reduced_tx_set            = 1;
>
> This not being present in the capabilities feels like an omission in
> the API.  That's a large loss for an implementation which does
> support the full transform set.

My understanding is that "not being present" means it must be
supported. If any driver is an exception, it can ask for a new
capability to be added to libva.

>
> > +
> > +    /** ignore ONLY_4x4 mode for codedlossless is not fully
> > implemented. */
> > +    if (priv->attr_ext2.bits.tx_mode_support & 0x04)
> > +        fh->tx_mode = AV1_TX_MODE_SELECT;
> > +    else if (priv->attr_ext2.bits.tx_mode_support & 0x02)
> > +        fh->tx_mode = AV1_TX_MODE_LARGEST;
> > +    else
> > +        return AVERROR(EINVAL);
>
> What would this case even mean?  The encoder doesn't support any
> transforms?  Seems like a message rather than silent failure would be
> a good idea anyway.

OK, I will add a message to let the user know the failure reason.

>
> > +
> > +    for (i = 0; i < fh->tile_cols; i++)
> > +        fh->width_in_sbs_minus_1[i] = vpic-
> > >width_in_sbs_minus_1[i] = priv->width_in_sbs_minus_1[i];
> > +
> > +    for (i = 0; i < fh->tile_rows; i++)
> > +        fh->height_in_sbs_minus_1[i] = vpic-
> > >height_in_sbs_minus_1[i] = priv->height_in_sbs_minus_1[i];
>
> Er, what?  Doesn't this fail the inference on the last tile
> width/height if you don't split exactly?

What does "split exactly" mean? All tile widths/heights, including the
last tile, are computed in vaapi_encode_av1_set_tile. As long as the
tile width/height info in the frame header is the same as the info
passed to VAAPI, it should be fine.

>
> > +
> > +    memcpy(fh->loop_filter_ref_deltas,
> > default_loop_filter_ref_deltas,
> > +           AV1_TOTAL_REFS_PER_FRAME * sizeof(int8_t));
> > +
> > +    if (fh->frame_type == AV1_FRAME_KEY && fh->show_frame) {
> > +        fh->error_resilient_mode = 1;
> > +    }
> > +
> > +    if (fh->frame_type == AV1_FRAME_KEY || fh-
> > >error_resilient_mode)
> > +        fh->primary_ref_frame = AV1_PRIMARY_REF_NONE;
> > +
> > +    vpic->base_qindex          = fh->base_q_idx;
> > +    vpic->frame_width_minus_1  = fh->frame_width_minus_1;
> > +    vpic->frame_height_minus_1 = fh->frame_height_minus_1;
> > +    vpic->primary_ref_frame    = fh->primary_ref_frame;
> > +    vpic->reconstructed_frame  = pic->recon_surface;
> > +    vpic->coded_buf            = pic->output_buffer;
> > +    vpic->tile_cols            = fh->tile_cols;
> > +    vpic->tile_rows            = fh->tile_rows;
> > +    vpic->order_hint           = fh->order_hint;
> > +#if VA_CHECK_VERSION(1, 15, 0)
> > +    vpic->refresh_frame_flags  = fh->refresh_frame_flags;
> > +#endif
>
> What will the difference be between running in the two versions?  (Is
> it worth supporting the older one?  Cf. bit_depth on VP9.)

I'd prefer to support the older one, in case someone is using an old
version that already works well with ffmpeg-vpl (VPL has supported AV1
encoding for a long time); otherwise they would be confused about why
ffmpeg-vaapi needs a higher VAAPI version while VPL doesn't.

>
> > +
> > +    vpic->picture_flags.bits.enable_frame_obu     = 0;
> > +    vpic->picture_flags.bits.frame_type           = fh-
> > >frame_type;
> > +    vpic->picture_flags.bits.reduced_tx_set       = fh-
> > >reduced_tx_set;
> > +    vpic->picture_flags.bits.error_resilient_mode = fh-
> > >error_resilient_mode;
> > +
> > +    /** let driver decide to use single or compound reference
> > prediction mode. */
> > +    vpic->mode_control_flags.bits.reference_mode = fh-
> > >reference_select ? 2 : 0;
> > +    vpic->mode_control_flags.bits.tx_mode = fh->tx_mode;
> > +
> > +    vpic->tile_group_obu_hdr_info.bits.obu_has_size_field = 1;
> > +
> > +    /** set reference. */
> > +    for (i = 0; i < AV1_REFS_PER_FRAME; i++)
> > +        vpic->ref_frame_idx[i] = fh->ref_frame_idx[i];
> > +
> > +    for (i = 0; i < FF_ARRAY_ELEMS(vpic->reference_frames); i++)
> > +        vpic->reference_frames[i] = VA_INVALID_SURFACE;
> > +
> > +    for (i = 0; i < MAX_REFERENCE_LIST_NUM; i++) {
> > +        for (int j = 0; j < pic->nb_refs[i]; j++) {
> > +            VAAPIEncodePicture *ref_pic = pic->refs[i][j];
> > +
> > +            slot = ((VAAPIEncodeAV1Picture*)ref_pic->priv_data)-
> > >slot;
> > +            av_assert0(vpic->reference_frames[slot] ==
> > VA_INVALID_SURFACE);
> > +
> > +            vpic->reference_frames[slot] = ref_pic->recon_surface;
> > +        }
> > +    }
> > +
> > +    /** pack frame header, and set va params offset like
> > bit_offset_qindex etc. */
> > +    ret = vaapi_encode_av1_write_frame_header(avctx, pic, priv-
> > >fh_data, &priv->fh_data_len);
> > +    if (ret < 0)
> > +        goto end;
> > +
> > +end:
> > +    ff_cbs_fragment_reset(obu);
> > +    return ret;
> > +}
> > +
> > +static int vaapi_encode_av1_init_slice_params(AVCodecContext
> > *avctx,
> > +                                              VAAPIEncodePicture
> > *pic,
> > +                                              VAAPIEncodeSlice
> > *slice)
> > +{
> > +    VAAPIEncodeAV1Context      *priv = avctx->priv_data;
> > +    VAEncTileGroupBufferAV1  *vslice = slice->codec_slice_params;
> > +    CodedBitstreamAV1Context  *cbctx = priv->cbc->priv_data;
> > +    int div;
> > +
> > +    /** Set tile group info. */
> > +    div = priv->tile_cols * priv->tile_rows / priv->tile_groups;
> > +    vslice->tg_start = slice->index * div;
> > +    if (slice->index == (priv->tile_groups - 1)) {
> > +        vslice->tg_end = priv->tile_cols * priv->tile_rows - 1;
> > +        cbctx->seen_frame_header = 0;
> > +    } else {
> > +        vslice->tg_end = (slice->index + 1) * div - 1;
> > +    }
> > +
> > +    return 0;
> > +}
> > +
> > +static int vaapi_encode_av1_write_picture_header(AVCodecContext
> > *avctx,
> > +                                                 VAAPIEncodePictur
> > e *pic,
> > +                                                 char *data,
> > size_t *data_len)
> > +{
> > +    VAAPIEncodeAV1Context     *priv = avctx->priv_data;
> > +    CodedBitstreamFragment     *obu = &priv->current_obu;
> > +    AV1RawOBU               *fh_obu = &priv->fh;
> > +    AV1RawFrameHeader       *rep_fh = &fh_obu->obu.frame_header;
> > +    VAAPIEncodeAV1Picture *href;
> > +    int ret = 0;
> > +
> > +    pic->tail_size = 0;
> > +    /** Pack repeat frame header. */
> > +    if (pic->display_order > pic->encode_order) {
> > +        memset(fh_obu, 0, sizeof(*fh_obu));
> > +        href = pic->refs[0][pic->nb_refs[0] - 1]->priv_data;
> > +        fh_obu->header.obu_type = AV1_OBU_FRAME_HEADER;
> > +        fh_obu->header.obu_has_size_field = 1;
> > +
> > +        rep_fh->show_existing_frame   = 1;
> > +        rep_fh->frame_to_show_map_idx = href->slot == 0;
> > +        rep_fh->frame_type            = AV1_FRAME_INTER;
> > +        rep_fh->frame_width_minus_1   = avctx->width - 1;
> > +        rep_fh->frame_height_minus_1  = avctx->height - 1;
> > +        rep_fh->render_width_minus_1  = rep_fh-
> > >frame_width_minus_1;
> > +        rep_fh->render_height_minus_1 = rep_fh-
> > >frame_height_minus_1;
> > +
> > +        ret = vaapi_encode_av1_write_frame_header(avctx, pic, pic-
> > >tail_data, &pic->tail_size);
> > +        if (ret < 0)
> > +            goto end;
> > +
> > +        pic->tail_size /= 8;
> > +    }
> > +
> > +    memcpy(data, &priv->fh_data, MAX_PARAM_BUFFER_SIZE *
> > sizeof(char));
> > +    *data_len = priv->fh_data_len;
> > +
> > +end:
> > +    ff_cbs_fragment_reset(obu);
> > +    return ret;
> > +}
> > +
> > +static const VAAPIEncodeProfile vaapi_encode_av1_profiles[] = {
> > +    { FF_PROFILE_AV1_MAIN,  8, 3, 1, 1, VAProfileAV1Profile0 },
> > +    { FF_PROFILE_AV1_MAIN, 10, 3, 1, 1, VAProfileAV1Profile0 },
> > +    { FF_PROFILE_UNKNOWN }
> > +};
> > +
> > +static const VAAPIEncodeType vaapi_encode_type_av1 = {
> > +    .profiles        = vaapi_encode_av1_profiles,
> > +    .flags           = FLAG_B_PICTURES,
> > +    .default_quality = 25,
> > +    .configure       = &vaapi_encode_av1_configure,
> > +
> > +    .sequence_header_type  = VAEncPackedHeaderSequence,
> > +    .sequence_params_size  =
> > sizeof(VAEncSequenceParameterBufferAV1),
> > +    .init_sequence_params  =
> > &vaapi_encode_av1_init_sequence_params,
> > +    .write_sequence_header =
> > &vaapi_encode_av1_write_sequence_header,
> > +
> > +    .picture_priv_data_size = sizeof(VAAPIEncodeAV1Picture),
> > +    .picture_header_type    = VAEncPackedHeaderPicture,
> > +    .picture_params_size    =
> > sizeof(VAEncPictureParameterBufferAV1),
> > +    .init_picture_params    =
> > &vaapi_encode_av1_init_picture_params,
> > +    .write_picture_header   =
> > &vaapi_encode_av1_write_picture_header,
> > +
> > +    .slice_params_size = sizeof(VAEncTileGroupBufferAV1),
> > +    .init_slice_params = &vaapi_encode_av1_init_slice_params,
> > +};
> > +
> > +static av_cold int vaapi_encode_av1_init(AVCodecContext *avctx)
> > +{
> > +    VAAPIEncodeContext      *ctx = avctx->priv_data;
> > +    VAAPIEncodeAV1Context  *priv = avctx->priv_data;
> > +    VAConfigAttrib attr;
> > +    VAStatus vas;
> > +    int ret;
> > +
> > +    ctx->codec = &vaapi_encode_type_av1;
> > +
> > +    ctx->desired_packed_headers =
> > +        VA_ENC_PACKED_HEADER_SEQUENCE |
> > +        VA_ENC_PACKED_HEADER_PICTURE;
> > +
> > +    if (avctx->profile == FF_PROFILE_UNKNOWN)
> > +        avctx->profile = priv->profile;
> > +    if (avctx->level == FF_LEVEL_UNKNOWN)
> > +        avctx->level = priv->level;
> > +
> > +    if (avctx->level != FF_LEVEL_UNKNOWN && avctx->level & ~0x1f)
> > {
> > +        av_log(avctx, AV_LOG_ERROR, "Invalid level %d\n", avctx-
> > >level);
> > +        return AVERROR(EINVAL);
> > +    }
> > +
> > +    ret = ff_vaapi_encode_init(avctx);
> > +    if (ret < 0)
> > +        return ret;
> > +
> > +    attr.type = VAConfigAttribEncAV1;
> > +    vas = vaGetConfigAttributes(ctx->hwctx->display,
> > +                                ctx->va_profile,
> > +                                ctx->va_entrypoint,
> > +                                &attr, 1);
> > +    if (vas != VA_STATUS_SUCCESS) {
> > +        av_log(avctx, AV_LOG_ERROR, "Failed to query "
> > +               "config attribute: %d (%s).\n", vas,
> > vaErrorStr(vas));
> > +        return AVERROR_EXTERNAL;
> > +    } else if (attr.value == VA_ATTRIB_NOT_SUPPORTED) {
> > +        priv->attr.value = 0;
> > +        av_log(avctx, AV_LOG_WARNING, "Attribute type:%d is not "
> > +               "supported.\n", attr.type);
>
> Is zero actually reasonable in this attribute (and below)?  Would we
> be better just requiring them to work?

The attribute contents of EncAV1 and EncAV1Ext1 are not essential for
the encoder. If they are not supported, I regard all of those attribute
features as unusable, which doesn't break our encode. So I tolerate it
without an error, just a warning message.

>
> > +    } else {
> > +        priv->attr.value = attr.value;
> > +    }
> > +
> > +    attr.type = VAConfigAttribEncAV1Ext1;
> > +    vas = vaGetConfigAttributes(ctx->hwctx->display,
> > +                                ctx->va_profile,
> > +                                ctx->va_entrypoint,
> > +                                &attr, 1);
> > +    if (vas != VA_STATUS_SUCCESS) {
> > +        av_log(avctx, AV_LOG_ERROR, "Failed to query "
> > +               "config attribute: %d (%s).\n", vas,
> > vaErrorStr(vas));
> > +        return AVERROR_EXTERNAL;
> > +    } else if (attr.value == VA_ATTRIB_NOT_SUPPORTED) {
> > +        priv->attr_ext1.value = 0;
> > +        av_log(avctx, AV_LOG_WARNING, "Attribute type:%d is not "
> > +               "supported.\n", attr.type);
> > +    } else {
> > +        priv->attr_ext1.value = attr.value;
> > +    }
> > +
> > +    /** This attr provides essential indicators, return error if
> > not support. */
> > +    attr.type = VAConfigAttribEncAV1Ext2;
> > +    vas = vaGetConfigAttributes(ctx->hwctx->display,
> > +                                ctx->va_profile,
> > +                                ctx->va_entrypoint,
> > +                                &attr, 1);
> > +    if (vas != VA_STATUS_SUCCESS || attr.value ==
> > VA_ATTRIB_NOT_SUPPORTED) {
> > +        av_log(avctx, AV_LOG_ERROR, "Failed to query "
> > +               "config attribute: %d (%s).\n", vas,
> > vaErrorStr(vas));
> > +        return AVERROR_EXTERNAL;
> > +    } else {
> > +        priv->attr_ext2.value = attr.value;
> > +    }
> > +
> > +    ret = vaapi_encode_av1_set_tile(avctx);
> > +    if (ret < 0)
> > +        return ret;
> > +
> > +    return 0;
> > +}
> > +
> > +static av_cold int vaapi_encode_av1_close(AVCodecContext *avctx)
> > +{
> > +    VAAPIEncodeAV1Context *priv = avctx->priv_data;
> > +
> > +    ff_cbs_fragment_free(&priv->current_obu);
> > +    ff_cbs_close(&priv->cbc);
> > +
> > +    return ff_vaapi_encode_close(avctx);
> > +}
> > +
> > +#define OFFSET(x) offsetof(VAAPIEncodeAV1Context, x)
> > +#define FLAGS (AV_OPT_FLAG_VIDEO_PARAM |
> > AV_OPT_FLAG_ENCODING_PARAM)
> > +
> > +static const AVOption vaapi_encode_av1_options[] = {
> > +    VAAPI_ENCODE_COMMON_OPTIONS,
> > +    VAAPI_ENCODE_RC_OPTIONS,
> > +    { "profile", "Set profile (seq_profile)",
> > +      OFFSET(profile), AV_OPT_TYPE_INT,
> > +      { .i64 = FF_PROFILE_UNKNOWN }, FF_PROFILE_UNKNOWN, 0xff,
> > FLAGS, "profile" },
> > +
> > +#define PROFILE(name, value)  name, NULL, 0, AV_OPT_TYPE_CONST, \
> > +    { .i64 = value }, 0, 0, FLAGS, "profile"
> > +    { PROFILE("main",               FF_PROFILE_AV1_MAIN) },
> > +    { PROFILE("high",               FF_PROFILE_AV1_HIGH) },
> > +    { PROFILE("professional",       FF_PROFILE_AV1_PROFESSIONAL)
> > },
> > +#undef PROFILE
> > +
> > +    { "tier", "Set tier (seq_tier)",
> > +      OFFSET(tier), AV_OPT_TYPE_INT, { .i64 = 0 }, 0, 1, FLAGS,
> > "tier" },
> > +    { "main", NULL, 0, AV_OPT_TYPE_CONST,
> > +      { .i64 = 0 }, 0, 0, FLAGS, "tier" },
> > +    { "high", NULL, 0, AV_OPT_TYPE_CONST,
> > +      { .i64 = 1 }, 0, 0, FLAGS, "tier" },
> > +    { "level", "Set level (seq_level_idx)",
> > +      OFFSET(level), AV_OPT_TYPE_INT,
> > +      { .i64 = FF_LEVEL_UNKNOWN }, FF_LEVEL_UNKNOWN, 0x1f, FLAGS,
> > "level" },
> > +
> > +#define LEVEL(name, value) name, NULL, 0, AV_OPT_TYPE_CONST, \
> > +      { .i64 = value }, 0, 0, FLAGS, "level"
> > +    { LEVEL("2.0",  0) },
> > +    { LEVEL("2.1",  1) },
> > +    { LEVEL("3.0",  4) },
> > +    { LEVEL("3.1",  5) },
> > +    { LEVEL("4.0",  8) },
> > +    { LEVEL("4.1",  9) },
> > +    { LEVEL("5.0", 12) },
> > +    { LEVEL("5.1", 13) },
> > +    { LEVEL("5.2", 14) },
> > +    { LEVEL("5.3", 15) },
> > +    { LEVEL("6.0", 16) },
> > +    { LEVEL("6.1", 17) },
> > +    { LEVEL("6.2", 18) },
> > +    { LEVEL("6.3", 19) },
> > +#undef LEVEL
> > +
> > +    { "tiles", "Tile columns x rows",
> > +      OFFSET(tile_cols), AV_OPT_TYPE_IMAGE_SIZE, { .str = "1x1" },
> > 1, AV1_MAX_TILE_COLS, FLAGS },
> > +    { "tile_groups", "Number of tile groups for encoding",
> > +      OFFSET(tile_groups), AV_OPT_TYPE_INT, { .i64 = 1 }, 1,
> > AV1_MAX_TILE_ROWS * AV1_MAX_TILE_COLS, FLAGS },
> > +
> > +    { NULL },
> > +};
> > +
> > +static const FFCodecDefault vaapi_encode_av1_defaults[] = {
> > +    { "b",              "0"   },
> > +    { "bf",             "7"   },
>
> This feels excessive when only one layer is supported.  Do
> experimental results give this as the best tradeoff?

Did a quick check: setting bf to 2 gives better quality in BRC mode.
Will change the default to 2.
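
For reference, the kind of command line used for such a comparison
(illustrative only; any VAAPI-capable render node works):

    ffmpeg -init_hw_device vaapi=va:/dev/dri/renderD128 -filter_hw_device va \
           -i input.mp4 -vf format=nv12,hwupload -c:v av1_vaapi -bf 2 \
           -b:v 3M output.mkv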

Thanks.

>
> > +    { "g",              "120" },
> > +    { "qmin",           "1"   },
> > +    { "qmax",           "255" },
> > +    { NULL },
> > +};
> > +
> > +static const AVClass vaapi_encode_av1_class = {
> > +    .class_name = "av1_vaapi",
> > +    .item_name  = av_default_item_name,
> > +    .option     = vaapi_encode_av1_options,
> > +    .version    = LIBAVUTIL_VERSION_INT,
> > +};
> > +
> > +const FFCodec ff_av1_vaapi_encoder = {
> > +    .p.name         = "av1_vaapi",
> > +    CODEC_LONG_NAME("AV1 (VAAPI)"),
> > +    .p.type         = AVMEDIA_TYPE_VIDEO,
> > +    .p.id           = AV_CODEC_ID_AV1,
> > +    .priv_data_size = sizeof(VAAPIEncodeAV1Context),
> > +    .init           = &vaapi_encode_av1_init,
> > +    FF_CODEC_RECEIVE_PACKET_CB(&ff_vaapi_encode_receive_packet),
> > +    .close          = &vaapi_encode_av1_close,
> > +    .p.priv_class   = &vaapi_encode_av1_class,
> > +    .p.capabilities = AV_CODEC_CAP_DELAY | AV_CODEC_CAP_HARDWARE |
> > +                      AV_CODEC_CAP_DR1 |
> > AV_CODEC_CAP_ENCODER_REORDERED_OPAQUE,
> > +    .caps_internal  = FF_CODEC_CAP_NOT_INIT_THREADSAFE |
> > +                      FF_CODEC_CAP_INIT_CLEANUP,
> > +    .defaults       = vaapi_encode_av1_defaults,
> > +    .p.pix_fmts = (const enum AVPixelFormat[]) {
> > +        AV_PIX_FMT_VAAPI,
> > +        AV_PIX_FMT_NONE,
> > +    },
> > +    .hw_configs     = ff_vaapi_encode_hw_configs,
> > +    .p.wrapper_name = "vaapi",
> > +};
> > diff --git a/libavcodec/version.h b/libavcodec/version.h
> > index 9411511e04..728ab8839d 100644
> > --- a/libavcodec/version.h
> > +++ b/libavcodec/version.h
> > @@ -29,7 +29,7 @@
> >
> >   #include "version_major.h"
> >
> > -#define LIBAVCODEC_VERSION_MINOR  22
> > +#define LIBAVCODEC_VERSION_MINOR  23
> >   #define LIBAVCODEC_VERSION_MICRO 100
> >
> >   #define
> > LIBAVCODEC_VERSION_INT  AV_VERSION_INT(LIBAVCODEC_VERSION_MAJOR, \
>
> Thanks,
>
> - Mark
Neal Gompa Aug. 15, 2023, 4:49 p.m. UTC | #5
On Tue, Aug 15, 2023 at 9:59 AM Dong, Ruijing <Ruijing.Dong@amd.com> wrote:
>
> -----Original Message-----
> From: Wang, Fei W <fei.w.wang@intel.com>
> Sent: Wednesday, August 9, 2023 10:55 PM
> To: ffmpeg-devel@ffmpeg.org
> Cc: Dong, Ruijing <Ruijing.Dong@amd.com>
> Subject: Re: [FFmpeg-devel] [PATCH v3 6/6] lavc/vaapi_encode: Add VAAPI AV1 encoder
>
> On Mon, 2023-08-07 at 22:21 +0100, Mark Thompson wrote:
> > On 03/08/2023 07:01, fei.w.wang-at-intel.com@ffmpeg.org wrote:
> > > From: Fei Wang <fei.w.wang@intel.com>
> > >
> > > Signed-off-by: Fei Wang <fei.w.wang@intel.com>
> > > ---
> > >   Changelog                     |    1 +
> > >   configure                     |    3 +
> > >   doc/encoders.texi             |   13 +
> > >   libavcodec/Makefile           |    1 +
> > >   libavcodec/allcodecs.c        |    1 +
> > >   libavcodec/vaapi_encode.c     |  125 +++-
> > >   libavcodec/vaapi_encode.h     |   12 +
> > >   libavcodec/vaapi_encode_av1.c | 1229
> > > +++++++++++++++++++++++++++++++++
> > >   libavcodec/version.h          |    2 +-
> > >   9 files changed, 1368 insertions(+), 19 deletions(-)
> > >   create mode 100644 libavcodec/vaapi_encode_av1.c
> >
> > I assume this is tested on Intel hardware.  Is it tested on AMD as
> > well?  (Apparently working in recent Mesa.)
> Yep, tested on AMD as well, working.
>
> AMD tested the patchset months ago:
> https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/22585
>
> This patch changed a little compared with the version in Cartwheel.
> @Ruijing, could you help review this version on the ML, to make sure the
> differences don't break you? Thanks.
>
> >
> >
> > > diff --git a/Changelog b/Changelog
> > > index bbda4f4fd4..e86f742cd3 100644
> > > --- a/Changelog
> > > +++ b/Changelog
> > > @@ -27,6 +27,7 @@ version <next>:
> > >   - Bitstream filter for converting VVC from MP4 to Annex B
> > >   - scale_vt filter for videotoolbox
> > >   - transpose_vt filter for videotoolbox
> > > +- VAAPI AV1 encoder
> > >
> > >   version 6.0:
> > >   - Radiance HDR image support
> > > diff --git a/configure b/configure
> > > index 99388e7664..68a238a819 100755
> > > --- a/configure
> > > +++ b/configure
> > > @@ -3322,6 +3322,8 @@ av1_qsv_decoder_select="qsvdec"
> > >   av1_qsv_encoder_select="qsvenc"
> > >   av1_qsv_encoder_deps="libvpl"
> > >   av1_amf_encoder_deps="amf"
> > > +av1_vaapi_encoder_deps="VAEncPictureParameterBufferAV1"
> > > +av1_vaapi_encoder_select="cbs_av1 vaapi_encode"
> > >
> > >   # parsers
> > >   aac_parser_select="adts_header mpeg4audio"
> > > @@ -7106,6 +7108,7 @@ if enabled vaapi; then
> > >       check_type "va/va.h va/va_enc_jpeg.h"
> > > "VAEncPictureParameterBufferJPEG"
> > >       check_type "va/va.h
> > > va/va_enc_vp8.h"  "VAEncPictureParameterBufferVP8"
> > >       check_type "va/va.h
> > > va/va_enc_vp9.h"  "VAEncPictureParameterBufferVP9"
> > > +    check_type "va/va.h
> > > va/va_enc_av1.h"  "VAEncPictureParameterBufferAV1"
> > >   fi
> > >
> > >   if enabled_all opencl libdrm ; then
> > > diff --git a/doc/encoders.texi b/doc/encoders.texi
> > > index 25d6b7f09e..fb331ebd8e 100644
> > > --- a/doc/encoders.texi
> > > +++ b/doc/encoders.texi
> > > @@ -3991,6 +3991,19 @@ Average variable bitrate.
> > >   Each encoder also has its own specific options:
> > >   @table @option
> > >
> > > +@item av1_vaapi
> > > +@option{profile} sets the value of @emph{seq_profile}.
> > > +@option{tier} sets the value of @emph{seq_tier}.
> > > +@option{level} sets the value of @emph{seq_level_idx}.
> > > +
> > > +@table @option
> > > +@item tiles
> > > +Set the number of tiles to encode the input video with, as columns
> > > x rows.
> > > +(default is 1x1).
> >
> > Probably needs some clarification that large resolutions must be
> > split into tiles?  Maybe an "auto" value for this (meaning use as few
> > as possible), and let explicit "1x1" fail if the resolution is too
> > large.
> >
> > > +@item tile_groups
> > > +Set tile groups number (default is 1).
> >
> > Meaning what?  It splits into tile group OBUs containing equal
> > numbers of tiles, or of equal size in bits, or something else?
> >
> > > +@end table
> > > +
> > >   @item h264_vaapi
> > >   @option{profile} sets the value of @emph{profile_idc} and the
> > > @emph{constraint_set*_flag}s.
> > >   @option{level} sets the value of @emph{level_idc}.
> > > diff --git a/libavcodec/Makefile b/libavcodec/Makefile
> > > index a6b2ecbb22..473afb4471 100644
> > > --- a/libavcodec/Makefile
> > > +++ b/libavcodec/Makefile
> > > @@ -258,6 +258,7 @@ OBJS-$(CONFIG_AV1_MEDIACODEC_DECODER)  +=
> > > mediacodecdec.o
> > >   OBJS-$(CONFIG_AV1_MEDIACODEC_ENCODER)  += mediacodecenc.o
> > >   OBJS-$(CONFIG_AV1_NVENC_ENCODER)       += nvenc_av1.o nvenc.o
> > >   OBJS-$(CONFIG_AV1_QSV_ENCODER)         += qsvenc_av1.o
> > > +OBJS-$(CONFIG_AV1_VAAPI_ENCODER)       += vaapi_encode_av1.o
> > > av1_levels.o
> > >   OBJS-$(CONFIG_AVRN_DECODER)            += avrndec.o
> > >   OBJS-$(CONFIG_AVRP_DECODER)            += r210dec.o
> > >   OBJS-$(CONFIG_AVRP_ENCODER)            += r210enc.o
> > > diff --git a/libavcodec/allcodecs.c b/libavcodec/allcodecs.c
> > > index 8775d15a4f..c43c1d7b48 100644
> > > --- a/libavcodec/allcodecs.c
> > > +++ b/libavcodec/allcodecs.c
> > > @@ -844,6 +844,7 @@ extern const FFCodec ff_av1_nvenc_encoder;
> > >   extern const FFCodec ff_av1_qsv_decoder;
> > >   extern const FFCodec ff_av1_qsv_encoder;
> > >   extern const FFCodec ff_av1_amf_encoder;
> > > +extern const FFCodec ff_av1_vaapi_encoder;
> > >   extern const FFCodec ff_libopenh264_encoder;
> > >   extern const FFCodec ff_libopenh264_decoder;
> > >   extern const FFCodec ff_h264_amf_encoder;
> > > diff --git a/libavcodec/vaapi_encode.c b/libavcodec/vaapi_encode.c
> > > index 2604f12b9e..2907e159fb 100644
> > > --- a/libavcodec/vaapi_encode.c
> > > +++ b/libavcodec/vaapi_encode.c
> > > @@ -669,6 +669,15 @@ static int
> > > vaapi_encode_set_output_timestamp(AVCodecContext *avctx,
> > >   {
> > >       VAAPIEncodeContext *ctx = avctx->priv_data;
> > >
> > > +    // AV1 packs P frame and next B frame into one pkt, and uses
> > > the other
> > > +    // repeat frame header pkt at the display order position of
> > > the P frame
> > > +    // to indicate its frame index. Each frame has a corresponding
> > > pkt in its
> > > +    // display order position. So don't need to consider delay for
> > > AV1 timestamp.
> > > +    if (avctx->codec_id == AV_CODEC_ID_AV1) {
> > > +        pkt->dts = pkt->pts - ctx->dts_pts_diff;
> > > +        return 0;
> > > +    }
> >
> > This doesn't get you the right result, though?  The PTS and DTS on
> > every AV1 packet want to be the same.
>
> The result tested good and plays back normally. It just aligns with the
> other VAAPI encoders, where the first frame starts with PTS/DTS of 0/-1
> when B frames are present. Setting PTS and DTS to the same value also
> looks good.
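>
> If identical PTS and DTS are wanted instead, a minimal sketch of the
> suggested FLAG_* approach could look like this (the flag name is
> hypothetical, not in this patch):
>
>     // in vaapi_encode_set_output_timestamp()
>     if (ctx->codec->flags & FLAG_TIMESTAMP_NO_DELAY) {
>         pkt->dts = pkt->pts;
>         return 0;
>     }
>
> with av1_vaapi setting that flag in its VAAPIEncodeType instead of the
> codec_id check in the common code.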
>
> >
> > In any case, please don't put tests for codec ID in the common
> > code.  I suggest that the timestamp behaviour wants to be a new
> > FLAG_*.
> >
> > > +
> > >       if (ctx->output_delay == 0) {
> > >           pkt->dts = pkt->pts;
> > >       } else if (pic->encode_order < ctx->decode_delay) {
> > > @@ -689,9 +698,10 @@ static int vaapi_encode_output(AVCodecContext
> > > *avctx,
> > >   {
> > >       VAAPIEncodeContext *ctx = avctx->priv_data;
> > >       VACodedBufferSegment *buf_list, *buf;
> > > -    VAStatus vas;
> > > +    AVPacket *pkt_ptr = pkt;
> > >       int total_size = 0;
> > >       uint8_t *ptr;
> > > +    VAStatus vas;
> > >       int err;
> > >
> > >       err = vaapi_encode_wait(avctx, pic);
> > > @@ -711,11 +721,52 @@ static int vaapi_encode_output(AVCodecContext
> > > *avctx,
> > >       for (buf = buf_list; buf; buf = buf->next)
> > >           total_size += buf->size;
> > >
> > > -    err = ff_get_encode_buffer(avctx, pkt, total_size, 0);
> > > -    ptr = pkt->data;
> > > +    /** Repack AV1 coded data for non-display and repeat frames. */
> > > +    if (avctx->codec_id == AV_CODEC_ID_AV1) {
> > > +        int display_frame = pic->display_order <= pic-
> > > >encode_order;
> > >
> > > -    if (err < 0)
> > > -        goto fail_mapped;
> > > +        if (display_frame) {
> > > +            total_size += ctx->header_data_size;
> > > +            err = ff_get_encode_buffer(avctx, pkt, total_size, 0);
> > > +            if (err < 0)
> > > +                goto fail_mapped;
> > > +            ptr = pkt->data;
> > > +
> > > +            if (ctx->header_data_size) {
> > > +                memcpy(ptr, ctx->header_data, ctx-
> > > >header_data_size);
> > > +                ptr += ctx->header_data_size;
> > > +                ctx->header_data_size = 0;
> > > +            }
> > > +        } else {
> > > +            ctx->header_data = av_realloc(ctx->header_data,
> > > total_size);
> > > +            if (!ctx->header_data) {
> > > +                err = AVERROR(ENOMEM);
> > > +                goto fail_mapped;
> > > +            }
> > > +            ptr = ctx->header_data;
> > > +            ctx->header_data_size = total_size;
> > > +
> > > +            if (pic->tail_size) {
> > > +                if (ctx->tail_pkt->size) {
> > > +                    err = AVERROR(AVERROR_BUG);
> > > +                    goto fail_mapped;
> > > +                }
> > > +
> > > +                err = ff_get_encode_buffer(avctx, ctx->tail_pkt,
> > > pic->tail_size, 0);
> > > +                if (err < 0)
> > > +                    goto fail_mapped;
> > > +
> > > +                memcpy(ctx->tail_pkt->data, pic->tail_data, pic-
> > > >tail_size);
> > > +                pkt_ptr = ctx->tail_pkt;
> > > +            }
> > > +        }
> > > +    } else {
> > > +        err = ff_get_encode_buffer(avctx, pkt, total_size, 0);
> > > +        ptr = pkt->data;
> > > +
> > > +        if (err < 0)
> > > +            goto fail_mapped;
> > > +    }
> >
> > I'm not convinced that this repeated-realloc-and-copy structure in
> > the main context is a good approach.
> >
> > How about something like storing the output buffer for every picture
> > in that picture as it is generated (could even stay in the VABuffer
> > without mapping it if that worked?), then when you get a display-now
> > picture concatenate all the oustanding frames together in order into
> > the packet (with a single copy).
>
> Your way is feasible. It may need an additional flag in the picture to
> indicate that it is completed but not yet output, and to release it only
> after output, or just delay changing the picture state to completed. I
> will try it.
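>
> A rough sketch of that direction (names here are illustrative, not from
> this patch): each picture keeps its own coded data, and the displayable
> frame concatenates everything pending, in order, with a single copy:
>
>     size_t total = 0;
>     for (VAAPIEncodePicture *p = first_pending; p; p = p->next)
>         total += p->coded_size;
>
>     int err = ff_get_encode_buffer(avctx, pkt, total, 0);
>     if (err < 0)
>         return err;
>
>     uint8_t *dst = pkt->data;
>     for (VAAPIEncodePicture *p = first_pending; p; p = p->next) {
>         memcpy(dst, p->coded_data, p->coded_size);
>         dst += p->coded_size;
>     }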
>
> >
> > The tail data is trivial to generate on the fly (it's only three
> > bytes), so storing it does not seem useful.
> >
> > >
> > >       for (buf = buf_list; buf; buf = buf->next) {
> > >           av_log(avctx, AV_LOG_DEBUG, "Output buffer: %u bytes "
> > > @@ -726,10 +777,10 @@ static int vaapi_encode_output(AVCodecContext
> > > *avctx,
> > >       }
> > >
> > >       if (pic->type == PICTURE_TYPE_IDR)
> > > -        pkt->flags |= AV_PKT_FLAG_KEY;
> > > +        pkt_ptr->flags |= AV_PKT_FLAG_KEY;
> > >
> > > -    pkt->pts = pic->pts;
> > > -    pkt->duration = pic->duration;
> > > +    pkt_ptr->pts = pic->pts;
> > > +    pkt_ptr->duration = pic->duration;
> > >
> > >       vas = vaUnmapBuffer(ctx->hwctx->display, pic->output_buffer);
> > >       if (vas != VA_STATUS_SUCCESS) {
> > > @@ -742,8 +793,8 @@ static int vaapi_encode_output(AVCodecContext
> > > *avctx,
> > >       // for no-delay encoders this is handled in generic codec
> > >       if (avctx->codec->capabilities & AV_CODEC_CAP_DELAY &&
> > >           avctx->flags & AV_CODEC_FLAG_COPY_OPAQUE) {
> > > -        pkt->opaque     = pic->opaque;
> > > -        pkt->opaque_ref = pic->opaque_ref;
> > > +        pkt_ptr->opaque     = pic->opaque;
> > > +        pkt_ptr->opaque_ref = pic->opaque_ref;
> > >           pic->opaque_ref = NULL;
> > >       }
> > >
> > > @@ -752,6 +803,9 @@ static int vaapi_encode_output(AVCodecContext
> > > *avctx,
> > >
> > >       av_log(avctx, AV_LOG_DEBUG, "Output read for pic
> > > %"PRId64"/%"PRId64".\n",
> > >              pic->display_order, pic->encode_order);
> > > +
> > > +    vaapi_encode_set_output_timestamp(avctx, pic, pkt_ptr);
> > > +
> > >       return 0;
> > >
> > >   fail_mapped:
> > > @@ -1128,9 +1182,19 @@ static int
> > > vaapi_encode_pick_next(AVCodecContext *avctx,
> > >
> > >       vaapi_encode_add_ref(avctx, pic, pic, 0, 1, 0);
> > >       if (pic->type != PICTURE_TYPE_IDR) {
> > > -        vaapi_encode_add_ref(avctx, pic, start,
> > > -                             pic->type == PICTURE_TYPE_P,
> > > -                             b_counter > 0, 0);
> > > +        // TODO: apply both previous and forward multi reference
> > > for all vaapi encoders.
> > > +        // And L0/L1 reference frame number can be set dynamically
> > > through query
> > > +        // VAConfigAttribEncMaxRefFrames attribute.
> > > +        if (avctx->codec_id == AV_CODEC_ID_AV1) {
> > > +            for (i = 0; i < ctx->nb_next_prev; i++)
> > > +                vaapi_encode_add_ref(avctx, pic, ctx-
> > > >next_prev[i],
> > > +                                     pic->type == PICTURE_TYPE_P,
> > > +                                     b_counter > 0, 0);
> >
> > So an undisclosed aim of the previous patch was to make this extra
> > list of the most recent N frames for this to use with P frames only?
>
> Currently, yes. As the TODO comment says, I will apply it to all other
> codecs and to B frames next; that will need a lot of testing on different
> hardware. Adding it for AV1 here is to align with VPL, which uses 2
> previous references for P frames. Based on our tests, 2 reference frames
> for P improve quality by ~5% BD-rate, which makes it possible for users
> to get the same quality as VPL after adjusting and aligning the encoder
> options.
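>
> For the dynamic part mentioned in the TODO, a sketch of what querying the
> driver could look like (standard libva attribute, not wired up in this
> patch):
>
>     VAConfigAttrib attr = { .type = VAConfigAttribEncMaxRefFrames };
>     VAStatus vas = vaGetConfigAttributes(ctx->hwctx->display, ctx->va_profile,
>                                          ctx->va_entrypoint, &attr, 1);
>     if (vas == VA_STATUS_SUCCESS && attr.value != VA_ATTRIB_NOT_SUPPORTED) {
>         /* For H.264 the low/high 16 bits carry the list0/list1 limits;
>          * how AV1 drivers report this would need checking. */
>         int max_ref_l0 =  attr.value        & 0xffff;
>         int max_ref_l1 = (attr.value >> 16) & 0xffff;
>     }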
>
> >
> > Why this partcular structure for P frames only, though?  Seems like
> > if you support extra references then other codecs could also make use
> > of them in either direction, so we would be better off hinting that
> > the codec would like up to N references and then using the contents
> > of the existing DPB, plus keep extras if there is space.
> >
> > > +        } else
> > > +            vaapi_encode_add_ref(avctx, pic, start,
> > > +                                 pic->type == PICTURE_TYPE_P,
> > > +                                 b_counter > 0, 0);
> > > +
> > >           vaapi_encode_add_ref(avctx, pic, ctx->next_prev[ctx-
> > > >nb_next_prev - 1], 0, 0, 1);
> > >       }
> > >
> > > @@ -1292,6 +1356,19 @@ int
> > > ff_vaapi_encode_receive_packet(AVCodecContext *avctx, AVPacket
> > > *pkt)
> > >       AVFrame *frame = ctx->frame;
> > >       int err;
> > >
> > > +start:
> > > +    /** if no B frame before repeat P frame, sent repeat P frame
> > > out. */
> > > +    if (avctx->codec_id == AV_CODEC_ID_AV1 && ctx->tail_pkt->size)
> > > {
> > > +        for (VAAPIEncodePicture *tmp = ctx->pic_start; tmp; tmp =
> > > tmp->next) {
> > > +            if (tmp->type == PICTURE_TYPE_B && tmp->pts < ctx-
> > > >tail_pkt->pts)
> > > +                break;
> > > +            else if (!tmp->next) {
> > > +                av_packet_move_ref(pkt, ctx->tail_pkt);
> > > +                goto end;
> > > +            }
> > > +        }
> > > +    }
> > > +
> > >       err = ff_encode_get_frame(avctx, frame);
> > >       if (err < 0 && err != AVERROR_EOF)
> > >           return err;
> > > @@ -1356,17 +1433,21 @@ int
> > > ff_vaapi_encode_receive_packet(AVCodecContext *avctx, AVPacket
> > > *pkt)
> > >           return err;
> > >       }
> > >
> > > -    vaapi_encode_set_output_timestamp(avctx, pic, pkt);
> > > -    av_log(avctx, AV_LOG_DEBUG, "Output packet: pts %"PRId64", dts
> > > %"PRId64", "
> > > -           "size %u bytes.\n", pkt->pts, pkt->dts, pkt->size);
> > > -
> > >       ctx->output_order = pic->encode_order;
> > >       vaapi_encode_clear_old(avctx);
> > >
> > > +    /** loop to get an available pkt in encoder flushing. */
> > > +    if (ctx->end_of_stream && !pkt->size)
> > > +        goto start;
> > > +
> > > +end:
> > > +    if (pkt->size)
> > > +        av_log(avctx, AV_LOG_DEBUG, "Output packet: pts %"PRId64",
> > > dts %"PRId64", "
> > > +               "size %u bytes.\n", pkt->pts, pkt->dts, pkt->size);
> > > +
> > >       return 0;
> > >   }
> > >
> > > -
> > >   static av_cold void vaapi_encode_add_global_param(AVCodecContext
> > > *avctx, int type,
> > >                                                     void *buffer,
> > > size_t size)
> > >   {
> > > @@ -2667,6 +2748,12 @@ av_cold int
> > > ff_vaapi_encode_init(AVCodecContext *avctx)
> > >       ctx->device = (AVHWDeviceContext*)ctx->device_ref->data;
> > >       ctx->hwctx = ctx->device->hwctx;
> > >
> > > +    ctx->tail_pkt = av_packet_alloc();
> > > +    if (!ctx->tail_pkt) {
> > > +        err = AVERROR(ENOMEM);
> > > +        goto fail;
> > > +    }
> > > +
> > >       err = vaapi_encode_profile_entrypoint(avctx);
> > >       if (err < 0)
> > >           goto fail;
> > > @@ -2859,9 +2946,11 @@ av_cold int
> > > ff_vaapi_encode_close(AVCodecContext *avctx)
> > >       }
> > >
> > >       av_frame_free(&ctx->frame);
> > > +    av_packet_free(&ctx->tail_pkt);
> > >
> > >       av_freep(&ctx->codec_sequence_params);
> > >       av_freep(&ctx->codec_picture_params);
> > > +    av_freep(&ctx->header_data);
> > >       av_fifo_freep2(&ctx->encode_fifo);
> > >
> > >       av_buffer_unref(&ctx->recon_frames_ref);
> > > diff --git a/libavcodec/vaapi_encode.h b/libavcodec/vaapi_encode.h
> > > index d5452a37b3..03df8d6d46 100644
> > > --- a/libavcodec/vaapi_encode.h
> > > +++ b/libavcodec/vaapi_encode.h
> > > @@ -133,6 +133,11 @@ typedef struct VAAPIEncodePicture {
> > >
> > >       int          nb_slices;
> > >       VAAPIEncodeSlice *slices;
> > > +
> > > +    /** Tail data of current pic, used only for repeat header of
> > > AV1. */
> > > +    char tail_data[MAX_PARAM_BUFFER_SIZE];
> > > +    /** Byte length of tail_data. */
> > > +    size_t tail_size;
> > >   } VAAPIEncodePicture;
> > >
> > >   typedef struct VAAPIEncodeProfile {
> > > @@ -367,6 +372,13 @@ typedef struct VAAPIEncodeContext {
> > >       AVFifo          *encode_fifo;
> > >       // Max number of frame buffered in encoder.
> > >       int             async_depth;
> > > +
> > > +    /** Head data for current output pkt, used only for AV1. */
> > > +    void  *header_data;
> > > +    size_t header_data_size;
> > > +
> > > +    /** Store av1 repeat frame header pkt. */
> > > +    AVPacket *tail_pkt;
> > >   } VAAPIEncodeContext;
> > >
> > >   enum {
> > > diff --git a/libavcodec/vaapi_encode_av1.c
> > > b/libavcodec/vaapi_encode_av1.c
> > > new file mode 100644
> > > index 0000000000..4c8a9d995d
> > > --- /dev/null
> > > +++ b/libavcodec/vaapi_encode_av1.c
> > > @@ -0,0 +1,1229 @@
> > > +/*
> > > + * Copyright (c) 2023 Intel Corporation
> > > + *
> > > + * This file is part of FFmpeg.
> > > + *
> > > + * FFmpeg is free software; you can redistribute it and/or
> > > + * modify it under the terms of the GNU Lesser General Public
> > > + * License as published by the Free Software Foundation; either
> > > + * version 2.1 of the License, or (at your option) any later
> > > version.
> > > + *
> > > + * FFmpeg is distributed in the hope that it will be useful,
> > > + * but WITHOUT ANY WARRANTY; without even the implied warranty of
> > > + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
> > > GNU
> > > + * Lesser General Public License for more details.
> > > + *
> > > + * You should have received a copy of the GNU Lesser General
> > > Public
> > > + * License along with FFmpeg; if not, write to the Free Software
> > > + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
> > > 02110-1301 USA
> > > + */
> > > +
> > > +#include <va/va.h>
> > > +#include <va/va_enc_av1.h>
> > > +
> > > +#include "libavutil/pixdesc.h"
> > > +#include "libavutil/opt.h"
> > > +
> > > +#include "cbs_av1.h"
> > > +#include "put_bits.h"
> > > +#include "codec_internal.h"
> > > +#include "av1_levels.h"
> > > +#include "vaapi_encode.h"
> > > +
> > > +#define AV1_MAX_QUANT 255
> > > +
> > > +typedef struct VAAPIEncodeAV1Picture {
> > > +    int64_t last_idr_frame;
> > > +    int slot;
> > > +} VAAPIEncodeAV1Picture;
> > > +
> > > +typedef struct VAAPIEncodeAV1Context {
> > > +    VAAPIEncodeContext common;
> > > +    AV1RawOBU sh; /**< sequence header.*/
> > > +    AV1RawOBU fh; /**< frame header.*/
> > > +    CodedBitstreamContext *cbc;
> > > +    CodedBitstreamFragment current_obu;
> > > +    VAConfigAttribValEncAV1 attr;
> > > +    VAConfigAttribValEncAV1Ext1 attr_ext1;
> > > +    VAConfigAttribValEncAV1Ext2 attr_ext2;
> > > +
> > > +    char sh_data[MAX_PARAM_BUFFER_SIZE]; /**< coded sequence
> > > header data. */
> > > +    size_t sh_data_len; /**< bit length of sh_data. */
> > > +    char fh_data[MAX_PARAM_BUFFER_SIZE]; /**< coded frame header
> > > data. */
> > > +    size_t fh_data_len; /**< bit length of fh_data. */
> > > +
> > > +    uint8_t uniform_tile;
> > > +    uint8_t use_128x128_superblock;
> > > +    int sb_cols;
> > > +    int sb_rows;
> > > +    int tile_cols_log2;
> > > +    int tile_rows_log2;
> > > +    int max_tile_width_sb;
> > > +    int max_tile_height_sb;
> > > +    uint8_t width_in_sbs_minus_1[AV1_MAX_TILE_COLS];
> > > +    uint8_t height_in_sbs_minus_1[AV1_MAX_TILE_ROWS];
> > > +
> > > +    int min_log2_tile_cols;
> > > +    int max_log2_tile_cols;
> > > +    int min_log2_tile_rows;
> > > +    int max_log2_tile_rows;
> > > +
> > > +    int q_idx_idr;
> > > +    int q_idx_p;
> > > +    int q_idx_b;
> > > +
> > > +    /** user options */
> > > +    int profile;
> > > +    int level;
> > > +    int tier;
> > > +    int tile_cols, tile_rows;
> > > +    int tile_groups;
> > > +} VAAPIEncodeAV1Context;
> > > +
> > > +static av_cold int vaapi_encode_av1_configure(AVCodecContext
> > > *avctx)
> > > +{
> > > +    VAAPIEncodeContext     *ctx = avctx->priv_data;
> > > +    VAAPIEncodeAV1Context *priv = avctx->priv_data;
> > > +    int ret;
> > > +
> > > +    ret = ff_cbs_init(&priv->cbc, AV_CODEC_ID_AV1, avctx);
> > > +    if (ret < 0)
> > > +        return ret;
> > > +
> > > +    if (ctx->rc_mode->quality) {
> > > +        priv->q_idx_p = av_clip(ctx->rc_quality, 0,
> > > AV1_MAX_QUANT);
> > > +        if (fabs(avctx->i_quant_factor) > 0.0)
> > > +            priv->q_idx_idr =
> > > +                av_clip((fabs(avctx->i_quant_factor) * priv-
> > > >q_idx_p  +
> > > +                         avctx->i_quant_offset) + 0.5,
> > > +                        0, AV1_MAX_QUANT);
> > > +        else
> > > +            priv->q_idx_idr = priv->q_idx_p;
> > > +
> > > +        if (fabs(avctx->b_quant_factor) > 0.0)
> > > +            priv->q_idx_b =
> > > +                av_clip((fabs(avctx->b_quant_factor) * priv-
> > > >q_idx_p  +
> > > +                         avctx->b_quant_offset) + 0.5,
> > > +                        0, AV1_MAX_QUANT);
> > > +        else
> > > +            priv->q_idx_b = priv->q_idx_p;
> > > +    } else {
> > > +        /** Arbitrary value */
> > > +        priv->q_idx_idr = priv->q_idx_p = priv->q_idx_b = 128;
> > > +    }
> >
> > Are there any alignment requirements here?  (If I gave it an input
> > which is, say, 1361x1, would that work?)
>
> I didn't get you. Which parameter here needs to be aligned? The min/max
> resolution supported by the hardware is checked elsewhere. With a 1361x1
> input there will be an error like:
>
> "Hardware does not support encoding at size 1376x16 (constraints: width
> 32-8192 height 32-8192)."
>
> >
> > > +
> > > +    return 0;
> > > +}
> > > +
> > > +static int vaapi_encode_av1_add_obu(AVCodecContext *avctx,
> > > +                                    CodedBitstreamFragment *au,
> > > +                                    uint8_t type,
> > > +                                    void *obu_unit)
> > > +{
> > > +    int ret;
> > > +
> > > +    ret = ff_cbs_insert_unit_content(au, -1,
> > > +                                     type, obu_unit, NULL);
> > > +    if (ret < 0) {
> > > +        av_log(avctx, AV_LOG_ERROR, "Failed to add OBU unit: "
> > > +               "type = %d.\n", type);
> > > +        return ret;
> > > +    }
> > > +
> > > +    return 0;
> > > +}
> > > +
> > > +static int vaapi_encode_av1_write_obu(AVCodecContext *avctx,
> > > +                                      char *data, size_t
> > > *data_len,
> > > +                                      CodedBitstreamFragment *bs)
> > > +{
> > > +    VAAPIEncodeAV1Context *priv = avctx->priv_data;
> > > +    int ret;
> > > +
> > > +    ret = ff_cbs_write_fragment_data(priv->cbc, bs);
> > > +    if (ret < 0) {
> > > +        av_log(avctx, AV_LOG_ERROR, "Failed to write packed
> > > header.\n");
> > > +        return ret;
> > > +    }
> > > +
> > > +    if ((size_t)8 * MAX_PARAM_BUFFER_SIZE < 8 * bs->data_size -
> > > bs->data_bit_padding) {
> > > +        av_log(avctx, AV_LOG_ERROR, "Access unit too large: "
> > > +               "%zu < %zu.\n", (size_t)8 * MAX_PARAM_BUFFER_SIZE,
> > > +               8 * bs->data_size - bs->data_bit_padding);
> > > +        return AVERROR(ENOSPC);
> > > +    }
> > > +
> > > +    memcpy(data, bs->data, bs->data_size);
> > > +    *data_len = 8 * bs->data_size - bs->data_bit_padding;
> > > +
> > > +    return 0;
> > > +}
> > > +
> > > +static int get_relative_dist(const AV1RawSequenceHeader *seq,
> > > +                             unsigned int a, unsigned int b)
> > > +{
> > > +    unsigned int diff, m;
> > > +    if (!seq->enable_order_hint)
> > > +        return 0;
> > > +    diff = a - b;
> > > +    m = 1 << seq->order_hint_bits_minus_1;
> > > +    diff = (diff & (m - 1)) - (diff & m);
> > > +    return diff;
> > > +}
> > > +
> > > +static int write_ns(PutBitContext *pbc, uint32_t n, uint32_t
> > > value)
> > > +{
> > > +    uint32_t w, m, v, extra_bit;
> > > +
> > > +    w = av_log2(n) + 1;
> > > +    m = (1 << w) - n;
> > > +
> > > +    if (value < m) {
> > > +        v = value;
> > > +        put_bits(pbc, w - 1, v);
> > > +    } else {
> > > +        v = m + ((value - m) >> 1);
> > > +        extra_bit = (value - m) & 1;
> > > +        put_bits(pbc, w - 1, v);
> > > +        put_bits(pbc, 1, extra_bit);
> > > +    }
> > > +
> > > +    return 0;
> > > +}
> > > +
> > > +/**
> > > + * This API provide the minmum implemention according current
> > > enabled features
> > > + * in frame header. If more features will enable in furture,
> > > please make sure
> > > + * the relative flags of features should be packed correctly into
> > > frame header
> > > + * obu in this API.
> > > + */
> > > +static int vaapi_encode_av1_write_frame_header(AVCodecContext
> > > *avctx,
> > > +                                               VAAPIEncodePicture
> > > *pic,
> > > +                                               char *data, size_t
> > > *data_len)
> > > +{
> > > +    VAAPIEncodeContext              *ctx = avctx->priv_data;
> > > +    VAAPIEncodeAV1Context          *priv = avctx->priv_data;
> > > +    AV1RawOBU                    *fh_obu = &priv->fh;
> > > +    AV1RawOBU                    *sh_obu = &priv->sh;
> > > +    AV1RawFrameHeader                *fh = &fh_obu-
> > > >obu.frame.header;
> > > +    AV1RawSequenceHeader             *sh = &sh_obu-
> > > >obu.sequence_header;
> > > +    VAEncPictureParameterBufferAV1 *vpic = pic-
> > > >codec_picture_params;
> > > +    PutBitContext pbc, pbc_tmp;
> > > +    uint8_t byte;
> > > +    int qindex, coded_lossless;
> > > +    int id_len, frame_is_intra, skip_mode_allowed;
> > > +    int start, payload_bits, obu_size, obu_size_len;
> > > +    int qindex_offset, loopfilter_offset;
> > > +    int cdef_start_offset, cdef_end_offset;
> > > +    int i;
> > > +
> > > +    init_put_bits(&pbc, data, MAX_PARAM_BUFFER_SIZE);
> > > +
> > > +    /** obu header */
> > > +    put_bits(&pbc, 1, fh_obu->header.obu_forbidden_bit);
> > > +    put_bits(&pbc, 4, fh_obu->header.obu_type);
> > > +    put_bits(&pbc, 1, fh_obu->header.obu_extension_flag);
> > > +    put_bits(&pbc, 1, fh_obu->header.obu_has_size_field);
> > > +    put_bits(&pbc, 1, fh_obu->header.obu_reserved_1bit);
> > > +
> > > +    /** record pbc status to re-write obu size later. */
> > > +    if (fh_obu->header.obu_has_size_field) {
> > > +        pbc_tmp = pbc;
> > > +        put_bits32(&pbc, 0);
> > > +        put_bits32(&pbc, 0);
> > > +    }
> > > +
> > > +    start = put_bits_count(&pbc);
> > > +
> > > +    /** uncompressed_header() */
> > > +    if (sh->frame_id_numbers_present_flag)
> > > +        id_len = sh->additional_frame_id_length_minus_1 +
> > > +                 sh->delta_frame_id_length_minus_2 + 3;
> > > +
> > > +    frame_is_intra = (fh->frame_type == AV1_FRAME_KEY ||
> > > +                      fh->frame_type == AV1_FRAME_INTRA_ONLY);
> > > +    if (!sh->reduced_still_picture_header) {
> > > +        put_bits(&pbc, 1, fh->show_existing_frame);
> > > +        if (fh->show_existing_frame) {
> > > +            put_bits(&pbc, 3, fh->frame_to_show_map_idx);
> > > +            goto trailing_bits;
> > > +        }
> > > +        if (sh->frame_id_numbers_present_flag)
> > > +            put_bits(&pbc, id_len, fh->display_frame_id);
> > > +
> > > +        put_bits(&pbc, 2, fh->frame_type);
> > > +        put_bits(&pbc, 1, fh->show_frame);
> > > +        if (!fh->show_frame)
> > > +            put_bits(&pbc, 1, fh->showable_frame);
> > > +
> > > +        if (!(fh->frame_type == AV1_FRAME_SWITCH ||
> > > +            (fh->frame_type == AV1_FRAME_KEY && fh->show_frame)))
> > > +            put_bits(&pbc, 1, fh->error_resilient_mode);
> > > +    }
> > > +
> > > +    put_bits(&pbc, 1, fh->disable_cdf_update);
> > > +    if (sh->seq_force_screen_content_tools ==
> > > AV1_SELECT_SCREEN_CONTENT_TOOLS)
> > > +        put_bits(&pbc, 1, fh->allow_screen_content_tools);
> > > +
> > > +    if (fh->allow_screen_content_tools && sh->seq_force_integer_mv
> > > == AV1_SELECT_INTEGER_MV)
> > > +        put_bits(&pbc, 1, fh->force_integer_mv);
> > > +
> > > +    if (sh->frame_id_numbers_present_flag)
> > > +        put_bits(&pbc, id_len, fh->current_frame_id);
> > > +
> > > +    if (fh->frame_type != AV1_FRAME_SWITCH && !sh-
> > > >reduced_still_picture_header)
> > > +        put_bits(&pbc, 1, fh->frame_size_override_flag);
> > > +
> > > +    if (sh->enable_order_hint)
> > > +        put_bits(&pbc, sh->order_hint_bits_minus_1 + 1, fh-
> > > >order_hint);
> > > +
> > > +    if (!(frame_is_intra || fh->error_resilient_mode))
> > > +        put_bits(&pbc, 3, fh->primary_ref_frame);
> > > +
> > > +    if (!(fh->frame_type == AV1_FRAME_SWITCH ||
> > > +        fh->frame_type == AV1_FRAME_KEY && fh->show_frame))
> > > +        put_bits(&pbc, 8, fh->refresh_frame_flags);
> > > +
> > > +    if (frame_is_intra) {
> > > +        /** render_size() */
> > > +         put_bits(&pbc, 1, fh->render_and_frame_size_different);
> > > +    } else {
> > > +        if (!frame_is_intra && sh->enable_order_hint)
> > > +            put_bits(&pbc, 1, fh->frame_refs_short_signaling);
> > > +
> > > +        for (i = 0; i < AV1_REFS_PER_FRAME; i++) {
> > > +            if (!fh->frame_refs_short_signaling)
> > > +                put_bits(&pbc, 3, fh->ref_frame_idx[i]);
> > > +        }
> > > +
> > > +        if (!(fh->frame_size_override_flag && !fh-
> > > >error_resilient_mode))
> > > +            put_bits(&pbc, 1, fh-
> > > >render_and_frame_size_different);
> > > +
> > > +        if (!fh->force_integer_mv)
> > > +            put_bits(&pbc, 1, fh->allow_high_precision_mv);
> > > +
> > > +        /** read_interpolation_filter() */
> > > +        put_bits(&pbc, 1, fh->is_filter_switchable);
> > > +        if (!fh->is_filter_switchable)
> > > +            put_bits(&pbc, 2, fh->interpolation_filter);
> > > +
> > > +        put_bits(&pbc, 1, fh->is_motion_mode_switchable);
> > > +    }
> > > +
> > > +    if (!(sh->reduced_still_picture_header || fh-
> > > >disable_cdf_update))
> > > +        put_bits(&pbc, 1, fh->disable_frame_end_update_cdf);
> > > +
> > > +    /** tile_info() */
> > > +    put_bits(&pbc, 1, fh->uniform_tile_spacing_flag);
> > > +    if (fh->uniform_tile_spacing_flag) {
> > > +        for (i = 0; i < priv->tile_cols_log2 - priv-
> > > >min_log2_tile_cols; i++) {
> > > +            put_bits(&pbc, 1, 1);
> > > +        }
> > > +        if (priv->tile_cols_log2 < priv->max_log2_tile_cols)
> > > +            put_bits(&pbc, 1, 0);
> > > +
> > > +        for (i = 0; i < priv->tile_rows_log2 - priv-
> > > >min_log2_tile_rows; i++) {
> > > +            put_bits(&pbc, 1, 1);
> > > +        }
> > > +        if (priv->tile_rows_log2 < priv->max_log2_tile_rows)
> > > +            put_bits(&pbc, 1, 0);
> > > +    } else {
> > > +        int start_sb = 0;
> > > +        int max_width, max_height;
> > > +        for (i = 0; start_sb < priv->sb_cols; i++) {
> > > +            max_width = FFMIN(priv->sb_cols - start_sb, priv-
> > > >max_tile_width_sb);
> > > +            write_ns(&pbc, max_width, fh-
> > > >width_in_sbs_minus_1[i]);
> > > +            start_sb += fh->width_in_sbs_minus_1[i] + 1;
> > > +        }
> > > +
> > > +        start_sb = 0;
> > > +        for (i = 0; start_sb < priv->sb_rows; i++) {
> > > +            max_height = FFMIN(priv->sb_rows - start_sb, priv-
> > > >max_tile_height_sb);
> > > +            write_ns(&pbc, max_height, fh-
> > > >height_in_sbs_minus_1[i]);
> > > +            start_sb += fh->height_in_sbs_minus_1[i] + 1;
> > > +        }
> > > +    }
> > > +
> > > +    if (priv->tile_cols_log2 || priv->tile_rows_log2) {
> > > +        put_bits(&pbc, priv->tile_cols_log2 + priv-
> > > >tile_rows_log2, fh->context_update_tile_id);
> > > +        put_bits(&pbc, 2, fh->tile_size_bytes_minus1);
> > > +    }
> > > +
> > > +    qindex_offset = put_bits_count(&pbc);
> > > +    /** quantization_params() */
> > > +    put_bits(&pbc, 8, fh->base_q_idx);
> > > +    put_bits(&pbc, 1, fh->delta_q_y_dc);
> > > +    put_bits(&pbc, 1, fh->delta_q_u_dc);
> > > +    put_bits(&pbc, 1, fh->delta_q_u_ac);
> > > +    put_bits(&pbc, 1, fh->using_qmatrix);
> > > +
> > > +    /** segmentation_params() */
> > > +    put_bits(&pbc, 1, fh->segmentation_enabled);
> > > +
> > > +    /** delta_q_params() */
> > > +    if (fh->base_q_idx)
> > > +        put_bits(&pbc, 1, fh->delta_q_present);
> > > +    if (fh->delta_q_present)
> > > +        put_bits(&pbc, 2, fh->delta_q_res);
> > > +
> > > +    /** delta_lf_params() */
> > > +    if (fh->delta_q_present) {
> > > +        if (!fh->allow_intrabc)
> > > +            put_bits(&pbc, 1, fh->delta_lf_present);
> > > +        if (fh->delta_lf_present) {
> > > +            put_bits(&pbc, 2, fh->delta_lf_res);
> > > +            put_bits(&pbc, 1, fh->delta_lf_multi);
> > > +        }
> > > +    }
> > > +
> > > +    /** codelossless */
> > > +    coded_lossless = 1;
> > > +    for (i = 0; i < AV1_MAX_SEGMENTS; i++) {
> > > +        if (fh->segmentation_enabled && fh-
> > > >feature_enabled[i][AV1_SEG_LVL_ALT_Q])
> > > +            qindex = fh->base_q_idx + fh-
> > > >feature_value[i][AV1_SEG_LVL_ALT_Q];
> > > +        else
> > > +            qindex = fh->base_q_idx;
> > > +        qindex = av_clip_uintp2(qindex, 8);
> > > +
> > > +        if (qindex || fh->delta_q_y_dc || fh->delta_q_u_ac || fh-
> > > >delta_q_u_dc ||
> > > +            fh->delta_q_v_ac || fh->delta_q_v_dc)
> > > +            coded_lossless = 0;
> > > +    }
> > > +
> > > +    loopfilter_offset = put_bits_count(&pbc);
> > > +    /** loop_filter_params() */
> > > +    if (!(coded_lossless || fh->allow_intrabc)) {
> > > +        put_bits(&pbc, 6, fh->loop_filter_level[0]);
> > > +        put_bits(&pbc, 6, fh->loop_filter_level[1]);
> > > +        if (fh->loop_filter_level[0] || fh->loop_filter_level[1])
> > > {
> > > +            put_bits(&pbc, 6, fh->loop_filter_level[2]);
> > > +            put_bits(&pbc, 6, fh->loop_filter_level[3]);
> > > +        }
> > > +        put_bits(&pbc, 3, fh->loop_filter_sharpness);
> > > +        put_bits(&pbc, 1, fh->loop_filter_delta_enabled);
> > > +    }
> > > +
> > > +    cdef_start_offset = put_bits_count(&pbc);
> > > +    /** cdef_params() */
> > > +    if (!(coded_lossless || fh->allow_intrabc || !sh-
> > > >enable_cdef)) {
> > > +        put_bits(&pbc, 2, fh->cdef_damping_minus_3);
> > > +        put_bits(&pbc, 2, fh->cdef_bits);
> > > +        for (i = 0; i < (1 << fh->cdef_bits); i++) {
> > > +            put_bits(&pbc, 4, fh->cdef_y_pri_strength[i]);
> > > +            put_bits(&pbc, 2, fh->cdef_y_sec_strength[i]);
> > > +            put_bits(&pbc, 4, fh->cdef_uv_pri_strength[i]);
> > > +            put_bits(&pbc, 2, fh->cdef_uv_sec_strength[i]);
> > > +        }
> > > +    }
> > > +    cdef_end_offset = put_bits_count(&pbc);
> > > +
> > > +    /** read_tx_mode() */
> > > +    if (fh->tx_mode == AV1_TX_MODE_SELECT)
> > > +        put_bits(&pbc, 1, 1);
> > > +    else
> > > +        put_bits(&pbc, 1, 0);
> > > +
> > > +    /** frame_reference_mode() */
> > > +    if (!frame_is_intra)
> > > +        put_bits(&pbc, 1, fh->reference_select);
> > > +
> > > +    /** skip_mode_params() */
> > > +    if (frame_is_intra || !fh->reference_select || !sh-
> > > >enable_order_hint)
> > > +        skip_mode_allowed = 0;
> > > +    else {
> > > +        int forward_idx, backward_idx;
> > > +        int ref_hint, forward_hint, backward_hint;
> > > +
> > > +        forward_idx = -1;
> > > +        backward_idx = -1;
> > > +
> > > +        for (i = 0; i < AV1_REFS_PER_FRAME; i++) {
> > > +            ref_hint = fh->ref_order_hint[fh->ref_frame_idx[i]];
> > > +            if (get_relative_dist(sh, ref_hint, fh->order_hint) <
> > > 0) {
> > > +                if (forward_idx < 0 || get_relative_dist(sh,
> > > ref_hint, forward_idx) > 0) {
> > > +                    forward_idx = i;
> > > +                    forward_hint = ref_hint;
> > > +                }
> > > +            } else if (get_relative_dist(sh, ref_hint, fh-
> > > >order_hint) > 0) {
> > > +                if (backward_idx < 0 || get_relative_dist(sh,
> > > ref_hint, backward_hint) < 0) {
> > > +                    backward_idx = i;
> > > +                    backward_hint = ref_hint;
> > > +                }
> > > +            }
> > > +        }
> > > +        if (forward_idx < 0)
> > > +            skip_mode_allowed = 0;
> > > +        else if (backward_idx >= 0)
> > > +            skip_mode_allowed = 1;
> > > +        else {
> > > +            int second_forward_idx, second_forward_hint;
> > > +            second_forward_idx = -1;
> > > +            for (i = 0; i < AV1_REFS_PER_FRAME; i++) {
> > > +                ref_hint = fh->ref_order_hint[fh-
> > > >ref_frame_idx[i]];
> > > +                if (get_relative_dist(sh, ref_hint, forward_hint)
> > > < 0) {
> > > +                    if (second_forward_idx < 0 ||
> > > get_relative_dist(sh, ref_hint, second_forward_hint) > 0){
> > > +                        second_forward_idx = i;
> > > +                        second_forward_hint = ref_hint;
> > > +                    }
> > > +                }
> > > +            }
> > > +            if (second_forward_idx < 0)
> > > +                skip_mode_allowed = 0;
> > > +            else
> > > +                skip_mode_allowed = 1;
> > > +        }
> > > +    }
> > > +
> > > +    if (skip_mode_allowed)
> > > +        put_bits(&pbc, 1, fh->skip_mode_present);
> > > +
> > > +    put_bits(&pbc, 1, fh->reduced_tx_set);
> > > +
> > > +    /** global_motion_params() */
> > > +    if (!frame_is_intra) {
> > > +        for (i = AV1_REF_FRAME_LAST; i <= AV1_REF_FRAME_ALTREF;
> > > i++) {
> > > +            put_bits(&pbc, 1, fh->is_global[i]);
> > > +            if (fh->is_global[i]) {
> > > +                put_bits(&pbc, 1, fh->is_rot_zoom[i]);
> > > +                if (!fh->is_rot_zoom[i])
> > > +                    put_bits(&pbc, 1, fh->is_translation[i]);
> > > +            }
> > > +        }
> > > +    }
> > > +
> > > +trailing_bits:
> > > +    payload_bits = put_bits_count(&pbc) - start;
> > > +
> > > +    /** trailing_bits() */
> > > +    put_bits(&pbc, 1, 1);
> > > +    obu_size = (put_bits_count(&pbc) - start + 7) / 8;
> > > +    for (i = 0; i < obu_size * 8 - payload_bits - 1; i++)
> > > +        put_bits(&pbc, 1, 0);
> > > +
> > > +    flush_put_bits(&pbc);
> > > +    *data_len = put_bits_count(&pbc);
> > > +
> > > +    /** update obu size in bitstream */
> > > +    if (fh_obu->header.obu_has_size_field) {
> > > +        obu_size_len = priv->attr_ext2.bits.obu_size_bytes_minus1
> > > + 1;
> > > +        for (i = 0; i < obu_size_len; i++) {
> > > +            byte = obu_size >> (7 * i) & 0x7f;
> > > +            if (i < obu_size_len - 1)
> > > +                byte |= 0x80;
> > > +            put_bits(&pbc_tmp, 8, byte);
> > > +        }
> > > +        flush_put_bits(&pbc_tmp);
> > > +        memmove(pbc_tmp.buf_ptr, pbc_tmp.buf_ptr + (8 -
> > > obu_size_len), obu_size);
> > > +        *data_len -= (8 - obu_size_len) * 8;
> > > +    }
> >
> > Why is there an incomplete duplicate of the cbs_av1 header writing
> > code here?
>
> To record some positions/sizes in the bitstream that VAAPI needs, like the
> qindex/loopfilter/cdef bit offsets and the CDEF parameter size in bits,
> which are passed to the driver through the picture parameter buffer. It's
> not reasonable to add such driver-specific outputs to CBS.
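>
> The pattern is simply to snapshot put_bits_count() while packing and then
> translate it into the VA picture parameters, e.g. (from the function
> above):
>
>     qindex_offset = put_bits_count(&pbc);
>     put_bits(&pbc, 8, fh->base_q_idx);
>     ...
>     vpic->bit_offset_qindex = qindex_offset - (8 - obu_size_len) * 8;
>
> where the (8 - obu_size_len) * 8 correction accounts for the obu_size
> field being written provisionally as 8 bytes and shrunk afterwards.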
>
> >
> > > +
> > > +    if (fh->show_existing_frame)
> > > +        return 0;
> > > +
> > > +    if (!(ctx->va_rc_mode & VA_RC_CQP)) {
> > > +        vpic->min_base_qindex = av_clip(avctx->qmin, 1,
> > > AV1_MAX_QUANT);
> > > +        vpic->max_base_qindex = av_clip(avctx->qmax, 1,
> > > AV1_MAX_QUANT);
> > > +
> > > +        vpic->bit_offset_qindex            = qindex_offset - (8 -
> > > obu_size_len) * 8;
> > > +        vpic->bit_offset_loopfilter_params = loopfilter_offset -
> > > (8 - obu_size_len) * 8;
> > > +        vpic->bit_offset_cdef_params       = cdef_start_offset -
> > > (8 - obu_size_len) * 8;
> > > +        vpic->size_in_bits_cdef_params     = cdef_end_offset -
> > > cdef_start_offset;
> > > +        vpic->size_in_bits_frame_hdr_obu   = *data_len;
> > > +
> > > +        vpic->byte_offset_frame_hdr_obu_size = (((pic->type ==
> > > PICTURE_TYPE_IDR) ?
> > > +                                               priv->sh_data_len /
> > > 8 : 0) +
> > > +                                               (fh_obu-
> > > >header.obu_extension_flag ?
> > > +                                               2 : 1));
> > > +    }
> > > +
> > > +    return 0;
> > > +}
> > > +
> > > +static int tile_log2(int blkSize, int target) {
> > > +    int k;
> > > +    for (k = 0; (blkSize << k) < target; k++);
> > > +    return k;
> > > +}
> > > +
> > > +static int vaapi_encode_av1_set_tile(AVCodecContext *avctx)
> > > +{
> > > +    VAAPIEncodeAV1Context *priv = avctx->priv_data;
> > > +    int mi_cols, mi_rows, sb_shift, sb_size;
> > > +    int max_tile_area_sb, max_tile_area_sb_varied;
> > > +    int tile_width_sb, tile_height_sb, widest_tile_sb;
> > > +    int min_log2_tiles;
> > > +    int tile_rows_tmp, i;
> > > +
> > > +    if (priv->tile_cols > AV1_MAX_TILE_COLS ||
> > > +        priv->tile_rows > AV1_MAX_TILE_ROWS) {
> > > +        av_log(avctx, AV_LOG_ERROR, "Invalid tile number %dx%d,
> > > should less than %dx%d.\n",
> > > +               priv->tile_cols, priv->tile_rows,
> > > AV1_MAX_TILE_COLS, AV1_MAX_TILE_ROWS);
> >
> > Can't this be a constraint on the option rather than a special check
> > here?
>
> The tiles option uses the IMAGE_SIZE option type, which aligns with HEVC,
> and that option type doesn't support a min/max range check.
>
> >
> > > +        return AVERROR(EINVAL);
> > > +    }
> > > +
> > > +    mi_cols = 2 * ((avctx->width + 7) >> 3);
> > > +    mi_rows = 2 * ((avctx->height + 7) >> 3);
> > > +    priv->sb_cols = priv->use_128x128_superblock ?
> > > +                    ((mi_cols + 31) >> 5) : ((mi_cols + 15) >> 4);
> > > +    priv->sb_rows = priv->use_128x128_superblock ?
> > > +                    ((mi_rows + 31) >> 5) : ((mi_rows + 15) >> 4);
> > > +    sb_shift = priv->use_128x128_superblock ? 5 : 4;
> > > +    sb_size  = sb_shift + 2;
> > > +    priv->max_tile_width_sb = AV1_MAX_TILE_WIDTH >> sb_size;
> > > +    max_tile_area_sb = AV1_MAX_TILE_AREA  >> (2 * sb_size);
> > > +
> > > +    priv->min_log2_tile_cols = tile_log2(priv->max_tile_width_sb,
> > > priv->sb_cols);
> > > +    priv->max_log2_tile_cols = tile_log2(1, FFMIN(priv->sb_cols,
> > > AV1_MAX_TILE_COLS));
> > > +    priv->max_log2_tile_rows = tile_log2(1, FFMIN(priv->sb_rows,
> > > AV1_MAX_TILE_ROWS));
> > > +    min_log2_tiles = FFMAX(priv->min_log2_tile_cols,
> > > +                           tile_log2(max_tile_area_sb, priv-
> > > >sb_rows * priv->sb_cols));
> > > +
> > > +    if (priv->tile_cols != av_clip(priv->tile_cols, (priv->sb_cols
> > > + priv->max_tile_width_sb - 1) / priv->max_tile_width_sb, priv-
> > > >sb_cols)) {
> > > +        priv->tile_cols = av_clip(priv->tile_cols, (priv->sb_cols
> > > + priv->max_tile_width_sb - 1) / priv->max_tile_width_sb, priv-
> > > >sb_cols);
> > > +        av_log(avctx, AV_LOG_WARNING, "Invalid tile cols, correct
> > > to %d.\n", priv->tile_cols);
> > > +    }
> >
> > If the user explicitly asked for a number which isn't possible then
> > you probably want to fail here (see comment about some sort of "auto"
> > setting above).
>
> OK, will change it to 'auto' by default.
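>
> A sketch of that behaviour (illustrative; the lower bound is derived the
> same way as the clamp above):
>
>     int min_tile_cols = (priv->sb_cols + priv->max_tile_width_sb - 1) /
>                         priv->max_tile_width_sb;
>     if (!priv->tile_cols) {              /* "auto" */
>         priv->tile_cols = min_tile_cols;
>     } else if (priv->tile_cols < min_tile_cols) {
>         av_log(avctx, AV_LOG_ERROR, "Tile columns %d too few for this "
>                "resolution, need at least %d.\n",
>                priv->tile_cols, min_tile_cols);
>         return AVERROR(EINVAL);
>     }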
>
> >
> > > +
> > > +    priv->tile_cols_log2 = tile_log2(1, priv->tile_cols);
> > > +    tile_width_sb = (priv->sb_cols + (1 << priv->tile_cols_log2) -
> > > 1) >>
> > > +                    priv->tile_cols_log2;
> > > +
> > > +    if (priv->tile_rows > priv->sb_rows) {
> > > +        priv->tile_rows = priv->sb_rows;
> > > +        av_log(avctx, AV_LOG_WARNING, "Invalid tile rows, correct
> > > to %d.\n", priv->tile_rows);
> > > +    }
> > > +
> > > +    for (tile_rows_tmp = priv->tile_rows; tile_rows_tmp <= priv-
> > > >sb_rows && tile_rows_tmp <= AV1_MAX_TILE_ROWS; tile_rows_tmp++) {
> > > +        /** try uniformed tile. */
> > > +        priv->tile_rows_log2 = tile_log2(1, tile_rows_tmp);
> > > +        if ((priv->sb_cols + tile_width_sb - 1) / tile_width_sb ==
> > > priv->tile_cols) {
> > > +            for (i = 0; i < priv->tile_cols - 1; i++)
> > > +                priv->width_in_sbs_minus_1[i] = tile_width_sb - 1;
> > > +            priv->width_in_sbs_minus_1[i] = priv->sb_cols - (priv-
> > > >tile_cols - 1) * tile_width_sb - 1;
> > > +
> > > +            tile_height_sb = (priv->sb_rows + (1 << priv-
> > > >tile_rows_log2) - 1) >>
> > > +                             priv->tile_rows_log2;
> > > +
> > > +            if ((priv->sb_rows + tile_height_sb - 1) /
> > > tile_height_sb == tile_rows_tmp &&
> > > +                tile_height_sb <= max_tile_area_sb /
> > > tile_width_sb) {
> > > +                for (i = 0; i < tile_rows_tmp - 1; i++)
> > > +                    priv->height_in_sbs_minus_1[i] =
> > > tile_height_sb - 1;
> > > +                priv->height_in_sbs_minus_1[i] = priv->sb_rows -
> > > (tile_rows_tmp - 1) * tile_height_sb - 1;
> > > +
> > > +                priv->uniform_tile = 1;
> > > +                priv->min_log2_tile_rows = FFMAX(min_log2_tiles -
> > > priv->tile_cols_log2, 0);
> > > +
> > > +                break;
> > > +            }
> > > +        }
> > > +
> > > +        /** try non-uniformed tile. */
> > > +        widest_tile_sb = 0;
> > > +        for (i = 0; i < priv->tile_cols; i++) {
> > > +            priv->width_in_sbs_minus_1[i] = (i + 1) * priv-
> > > >sb_cols / priv->tile_cols - i * priv->sb_cols / priv->tile_cols -
> > > 1;
> > > +            widest_tile_sb = FFMAX(widest_tile_sb, priv-
> > > >width_in_sbs_minus_1[i] + 1);
> > > +        }
> > > +
> > > +        if (min_log2_tiles)
> > > +            max_tile_area_sb_varied = (priv->sb_rows * priv-
> > > >sb_cols) >> (min_log2_tiles + 1);
> > > +        else
> > > +            max_tile_area_sb_varied = priv->sb_rows * priv-
> > > >sb_cols;
> > > +        priv->max_tile_height_sb = FFMAX(1,
> > > max_tile_area_sb_varied / widest_tile_sb);
> > > +
> > > +        if (tile_rows_tmp == av_clip(tile_rows_tmp, (priv->sb_rows
> > > + priv->max_tile_height_sb - 1) / priv->max_tile_height_sb, priv-
> > > >sb_rows)) {
> > > +            for (i = 0; i < tile_rows_tmp; i++)
> > > +                priv->height_in_sbs_minus_1[i] = (i + 1) * priv-
> > > >sb_rows / tile_rows_tmp - i * priv->sb_rows / tile_rows_tmp - 1;
> > > +
> > > +            break;
> > > +        }
> > > +    }
> > > +
> > > +    if (priv->tile_rows != tile_rows_tmp) {
> > > +        priv->tile_rows = tile_rows_tmp;
> > > +        av_log(avctx, AV_LOG_WARNING, "Invalid tile rows, correct
> > > to %d.\n", priv->tile_rows);
> > > +    }
> > > +
> > > +    /** check if tile cols/rows is supported by driver. */
> > > +    if (priv->attr_ext2.bits.max_tile_num_minus1) {
> > > +        if ((priv->tile_cols * priv->tile_rows - 1) > priv-
> > > >attr_ext2.bits.max_tile_num_minus1) {
> > > +            av_log(avctx, AV_LOG_ERROR, "Unsupported tile num %d *
> > > %d = %d by driver, "
> > > +                   "should be less than %d.\n", priv->tile_cols,
> > > priv->tile_rows,
> >
> > s/less than/at most/ ?
> >
> > > +                   priv->tile_cols * priv->tile_rows,
> > > +                   priv->attr_ext2.bits.max_tile_num_minus1 + 1);
> > > +            return AVERROR(EINVAL);
> > > +        }
> > > +    }
> > > +    av_log(avctx, AV_LOG_DEBUG, "Setting tile cols/rows to
> > > %d/%d.\n",
> > > +           priv->tile_cols, priv->tile_rows);
> > > +
> > > +    /** check if tile group numbers is valid. */
> > > +    if (priv->tile_groups > priv->tile_cols * priv->tile_rows) {
> > > +        av_log(avctx, AV_LOG_WARNING, "Invalid tile groups number
> > > %d, "
> > > +        "correct to %d.\n", priv->tile_groups, priv->tile_cols *
> > > priv->tile_rows);
> > > +        priv->tile_groups = priv->tile_cols * priv->tile_rows;
> > > +    }
> > > +
> > > +    return 0;
> > > +}
> > > +
> > > +static int vaapi_encode_av1_write_sequence_header(AVCodecContext
> > > *avctx,
> > > +                                                  char *data,
> > > size_t *data_len)
> > > +{
> > > +    VAAPIEncodeAV1Context *priv = avctx->priv_data;
> > > +
> > > +    memcpy(data, &priv->sh_data, MAX_PARAM_BUFFER_SIZE *
> > > sizeof(char));
> > > +    *data_len = priv->sh_data_len;
> >
> > I'm confused why you write the sequence header in the init function
> > and then store it to use here?
>
> Sequence data is kept the same during the encode, so I think it doesn't
> need to be re-written for each GOP.
>
> >
> > > +
> > > +    return 0;
> > > +}
> > > +
> > > +static int vaapi_encode_av1_init_sequence_params(AVCodecContext
> > > *avctx)
> > > +{
> > > +    VAAPIEncodeContext               *ctx = avctx->priv_data;
> > > +    VAAPIEncodeAV1Context           *priv = avctx->priv_data;
> > > +    AV1RawOBU                     *sh_obu = &priv->sh;
> > > +    AV1RawSequenceHeader              *sh = &sh_obu-
> > > >obu.sequence_header;
> > > +    VAEncSequenceParameterBufferAV1 *vseq = ctx-
> > > >codec_sequence_params;
> > > +    CodedBitstreamFragment           *obu = &priv->current_obu;
> > > +    const AVPixFmtDescriptor *desc;
> > > +    int ret;
> > > +
> > > +    memset(sh_obu, 0, sizeof(*sh_obu));
> > > +    sh_obu->header.obu_type = AV1_OBU_SEQUENCE_HEADER;
> > > +
> > > +    desc = av_pix_fmt_desc_get(priv->common.input_frames-
> > > >sw_format);
> > > +    av_assert0(desc);
> > > +
> > > +    sh->seq_profile  = avctx->profile;
> > > +    if (!sh->seq_force_screen_content_tools)
> > > +        sh->seq_force_integer_mv = AV1_SELECT_INTEGER_MV;
> > > +    sh->frame_width_bits_minus_1  = av_log2(avctx->width);
> > > +    sh->frame_height_bits_minus_1 = av_log2(avctx->height);
> > > +    sh->max_frame_width_minus_1   = avctx->width - 1;
> > > +    sh->max_frame_height_minus_1  = avctx->height - 1;
> > > +    sh->seq_tier[0]               = priv->tier;
> > > +    /** enable order hint and reserve maximum 8 bits for it by
> > > default. */
> > > +    sh->enable_order_hint         = 1;
> > > +    sh->order_hint_bits_minus_1   = 7;
> > > +
> > > +    sh->color_config = (AV1RawColorConfig) {
> > > +        .high_bitdepth                  = desc->comp[0].depth == 8
> > > ? 0 : 1,
> > > +        .color_primaries                = avctx->color_primaries,
> > > +        .transfer_characteristics       = avctx->color_trc,
> > > +        .matrix_coefficients            = avctx->colorspace,
> > > +        .color_description_present_flag = (avctx->color_primaries
> > > != AVCOL_PRI_UNSPECIFIED ||
> > > +                                           avctx-
> > > >color_trc       != AVCOL_TRC_UNSPECIFIED ||
> > > +                                           avctx-
> > > >colorspace      != AVCOL_SPC_UNSPECIFIED),
> > > +        .color_range                    = avctx->color_range ==
> > > AVCOL_RANGE_JPEG,
> > > +        .subsampling_x                  = desc->log2_chroma_w,
> > > +        .subsampling_y                  = desc->log2_chroma_h,
> >
> > .chroma_sample_position = some function of chroma_sample_location.
>
> There is no chroma_sample_position supported in VAAPI yet.
>
> >
> > > +    };
> > > +
> > > +    if (avctx->level != FF_LEVEL_UNKNOWN) {
> > > +        sh->seq_level_idx[0] = avctx->level;
> > > +    } else {
> > > +        const AV1LevelDescriptor *level;
> > > +        float framerate;
> > > +
> > > +        if (avctx->framerate.num > 0 && avctx->framerate.den > 0)
> > > +            framerate = avctx->framerate.num / avctx-
> > > >framerate.den;
> > > +        else
> > > +            framerate = 0;
> > > +
> > > +        level = ff_av1_guess_level(avctx->bit_rate, priv->tier,
> > > +                                   ctx->surface_width, ctx-
> > > >surface_height,
> > > +                                   priv->tile_rows * priv-
> > > >tile_cols,
> > > +                                   priv->tile_cols, framerate);
> > > +        if (level) {
> > > +            av_log(avctx, AV_LOG_VERBOSE, "Using level %s.\n",
> > > level->name);
> > > +            sh->seq_level_idx[0] = level->level_idx;
> > > +        } else {
> > > +            av_log(avctx, AV_LOG_VERBOSE, "Stream will not conform
> > > to "
> > > +                   "any normal level, using level 6.3 by
> > > default.\n");
> > > +            sh->seq_level_idx[0] = 19;
> >
> > Isn't this exactly what level_idx == 31 is for?
> >
> > > +            sh->seq_tier[0] = 1;
> > > +        }
> > > +    }
> > > +    vseq->seq_profile             = sh->seq_profile;
> > > +    vseq->seq_level_idx           = sh->seq_level_idx[0];
> > > +    vseq->seq_tier                = sh->seq_tier[0];
> > > +    vseq->order_hint_bits_minus_1 = sh->order_hint_bits_minus_1;
> > > +    vseq->intra_period            = ctx->gop_size;
> > > +    vseq->ip_period               = ctx->b_per_p + 1;
> > > +
> > > +    vseq->seq_fields.bits.enable_order_hint = sh-
> > > >enable_order_hint;
> > > +
> > > +    if (!(ctx->va_rc_mode & VA_RC_CQP)) {
> > > +        vseq->bits_per_second = ctx->va_bit_rate;
> > > +        vseq->seq_fields.bits.enable_cdef = sh->enable_cdef = 1;
> > > +    }
> > > +
> > > +    ret = vaapi_encode_av1_add_obu(avctx, obu,
> > > AV1_OBU_SEQUENCE_HEADER, &priv->sh);
> > > +    if (ret < 0)
> > > +        goto end;
> > > +
> > > +    ret = vaapi_encode_av1_write_obu(avctx, priv->sh_data, &priv-
> > > >sh_data_len, obu);
> > > +    if (ret < 0)
> > > +        goto end;
> > > +
> > > +end:
> > > +    ff_cbs_fragment_reset(obu);
> > > +    return ret;
> > > +}
> > > +
> > > +static int vaapi_encode_av1_init_picture_params(AVCodecContext
> > > *avctx,
> > > +                                                VAAPIEncodePicture
> > > *pic)
> > > +{
> > > +    VAAPIEncodeAV1Context          *priv = avctx->priv_data;
> > > +    VAAPIEncodeAV1Picture          *hpic = pic->priv_data;
> > > +    AV1RawOBU                    *fh_obu = &priv->fh;
> > > +    AV1RawFrameHeader                *fh = &fh_obu-
> > > >obu.frame.header;
> > > +    VAEncPictureParameterBufferAV1 *vpic = pic-
> > > >codec_picture_params;
> > > +    CodedBitstreamFragment          *obu = &priv->current_obu;
> > > +    VAAPIEncodePicture    *ref;
> > > +    VAAPIEncodeAV1Picture *href;
> > > +    int slot, i;
> > > +    int ret;
> > > +    static const int8_t
> > > default_loop_filter_ref_deltas[AV1_TOTAL_REFS_PER_FRAME] =
> > > +        { 1, 0, 0, 0, -1, 0, -1, -1 };
> > > +
> > > +    memset(fh_obu, 0, sizeof(*fh_obu));
> > > +    pic->nb_slices = priv->tile_groups;
> > > +    fh_obu->header.obu_type = AV1_OBU_FRAME_HEADER;
> > > +    fh_obu->header.obu_has_size_field = 1;
> > > +
> > > +    switch (pic->type) {
> > > +    case PICTURE_TYPE_IDR:
> > > +        av_assert0(pic->nb_refs[0] == 0 || pic->nb_refs[1]);
> > > +        fh->frame_type = AV1_FRAME_KEY;
> > > +        fh->refresh_frame_flags = 0xFF;
> > > +        fh->base_q_idx = priv->q_idx_idr;
> > > +        hpic->slot = 0;
> > > +        hpic->last_idr_frame = pic->display_order;
> > > +        break;
> > > +    case PICTURE_TYPE_P:
> > > +        av_assert0(pic->nb_refs[0]);
> > > +        fh->frame_type = AV1_FRAME_INTER;
> > > +        fh->base_q_idx = priv->q_idx_p;
> > > +        ref = pic->refs[0][pic->nb_refs[0] - 1];
> > > +        href = ref->priv_data;
> > > +        hpic->slot = !href->slot;
> >
> > I'm very confused how all of these multiple-reference stuff can work
> > if you only ever put pictures in slots 0 and 1.
> >
> > What does it actually look like?  Can you share a stream using it?
>
> An example encoded by:
> # ffmpeg -vaapi_device /dev/dri/renderD128 -f lavfi -i testsrc -vf
> format=nv12,hwupload -c:v av1_vaapi -vframes 100 -y vaapi_av1.mp4
>
> https://drive.google.com/file/d/1rP_cidjKV1GKzwGzG7-esrhw_rdWyAcG/view
>
> >
> > > +        hpic->last_idr_frame = href->last_idr_frame;
> > > +        fh->refresh_frame_flags = 1 << hpic->slot;
> > > +
> > > +        /** set the nearest frame in L0 as all reference frame. */
> > > +        for (i = 0; i < AV1_REFS_PER_FRAME; i++) {
> > > +            fh->ref_frame_idx[i] = href->slot;
> > > +        }
> > > +        fh->primary_ref_frame = href->slot;
> > > +        fh->ref_order_hint[href->slot] = ref->display_order -
> > > href->last_idr_frame;
> > > +        vpic->ref_frame_ctrl_l0.fields.search_idx0 =
> > > AV1_REF_FRAME_LAST;
> > > +
> > > +        /** set the 2nd nearest frame in L0 as Golden frame. */
> > > +        if (pic->nb_refs[0] > 1) {
> > > +            ref = pic->refs[0][pic->nb_refs[0] - 2];
> > > +            href = ref->priv_data;
> > > +            fh->ref_frame_idx[3] = href->slot;
> > > +            fh->ref_order_hint[href->slot] = ref->display_order -
> > > href->last_idr_frame;
> > > +            vpic->ref_frame_ctrl_l0.fields.search_idx1 =
> > > AV1_REF_FRAME_GOLDEN;
> > > +        }
> > > +        break;
> > > +    case PICTURE_TYPE_B:
> > > +        av_assert0(pic->nb_refs[0] && pic->nb_refs[1]);
> > > +        fh->frame_type = AV1_FRAME_INTER;
> > > +        fh->base_q_idx = priv->q_idx_b;
> > > +        fh->refresh_frame_flags = 0x0;
> > > +        fh->reference_select = 1;
> > > +
> > > +        /** B frame will not be referenced, disable its recon
> > > frame. */
> > > +        vpic->picture_flags.bits.disable_frame_recon = 1;
> >
> > It feels like it would be very easy to support all of the B structure
> > just like VP9 does here.
>
> It's in my plan together with multi (>2) reference support. Better to
> align with VPL, and it needs some time to test and compare.
>
> >
> > > +
> > > +        /** Use LAST_FRAME and BWDREF_FRAME for reference. */
> > > +        vpic->ref_frame_ctrl_l0.fields.search_idx0 =
> > > AV1_REF_FRAME_LAST;
> > > +        vpic->ref_frame_ctrl_l1.fields.search_idx0 =
> > > AV1_REF_FRAME_BWDREF;
> > > +
> > > +        ref                            = pic->refs[0][pic-
> > > >nb_refs[0] - 1];
> > > +        href                           = ref->priv_data;
> > > +        hpic->last_idr_frame           = href->last_idr_frame;
> > > +        fh->primary_ref_frame          = href->slot;
> > > +        fh->ref_order_hint[href->slot] = ref->display_order -
> > > href->last_idr_frame;
> > > +        for (i = 0; i < AV1_REF_FRAME_GOLDEN; i++) {
> > > +            fh->ref_frame_idx[i] = href->slot;
> > > +        }
> > > +
> > > +        ref                            = pic->refs[1][pic-
> > > >nb_refs[1] - 1];
> > > +        href                           = ref->priv_data;
> > > +        fh->ref_order_hint[href->slot] = ref->display_order -
> > > href->last_idr_frame;
> > > +        for (i = AV1_REF_FRAME_GOLDEN; i < AV1_REFS_PER_FRAME;
> > > i++) {
> > > +            fh->ref_frame_idx[i] = href->slot;
> > > +        }
> > > +        break;
> > > +    default:
> > > +        av_assert0(0 && "invalid picture type");
> > > +    }
> > > +
> > > +    fh->show_frame                = pic->display_order <= pic-
> > > >encode_order;
> > > +    fh->showable_frame            = fh->frame_type !=
> > > AV1_FRAME_KEY;
> > > +    fh->frame_width_minus_1       = avctx->width - 1;
> > > +    fh->frame_height_minus_1      = avctx->height - 1;
> > > +    fh->render_width_minus_1      = fh->frame_width_minus_1;
> > > +    fh->render_height_minus_1     = fh->frame_height_minus_1;
> > > +    fh->order_hint                = pic->display_order - hpic-
> > > >last_idr_frame;
> >
> > Doesn't this overflow?
>
> The max order hint for AV1 is 255; the forced conversion from int64_t to
> uint8_t gives the expected result.
>
> >
> > > +    fh->tile_cols                 = priv->tile_cols;
> > > +    fh->tile_rows                 = priv->tile_rows;
> > > +    fh->tile_cols_log2            = priv->tile_cols_log2;
> > > +    fh->tile_rows_log2            = priv->tile_rows_log2;
> > > +    fh->uniform_tile_spacing_flag = priv->uniform_tile;
> > > +    fh->tile_size_bytes_minus1    = priv-
> > > >attr_ext2.bits.tile_size_bytes_minus1;
> > > +    fh->reduced_tx_set            = 1;
> >
> > This not being present in the capabilities feels like an omission in
> > the API.  That's a large loss for an implementation which does
> > support the full transform set.
>
> My personal understanding is that "not being present" means it must be
> supported. If any driver is an exception, it can ask for a new
> capability to be added to libva.
>
> >
> > > +
> > > +    /** ignore ONLY_4x4 mode for codedlossless is not fully
> > > implemented. */
> > > +    if (priv->attr_ext2.bits.tx_mode_support & 0x04)
> > > +        fh->tx_mode = AV1_TX_MODE_SELECT;
> > > +    else if (priv->attr_ext2.bits.tx_mode_support & 0x02)
> > > +        fh->tx_mode = AV1_TX_MODE_LARGEST;
> > > +    else
> > > +        return AVERROR(EINVAL);
> >
> > What would this case even mean?  The encoder doesn't support any
> > transforms?  Seems like a message rather than silent failure would be
> > a good idea anyway.
>
> OK, will add a message to let the user know the failure reason.
>
> >
> > > +
> > > +    for (i = 0; i < fh->tile_cols; i++)
> > > +        fh->width_in_sbs_minus_1[i] = vpic-
> > > >width_in_sbs_minus_1[i] = priv->width_in_sbs_minus_1[i];
> > > +
> > > +    for (i = 0; i < fh->tile_rows; i++)
> > > +        fh->height_in_sbs_minus_1[i] = vpic-
> > > >height_in_sbs_minus_1[i] = priv->height_in_sbs_minus_1[i];
> >
> > Er, what?  Doesn't this fail the inference on the last tile
> > width/height if you don't split exactly?
>
> What's the meaning of "split exactly"? All the tile widths/heights are
> split in vaapi_encode_av1_set_tile, including the last tile. Just making
> sure the tile width/height info in the frame header is the same as the
> info passed to VAAPI should be fine.
>
> >
> > > +
> > > +    memcpy(fh->loop_filter_ref_deltas,
> > > default_loop_filter_ref_deltas,
> > > +           AV1_TOTAL_REFS_PER_FRAME * sizeof(int8_t));
> > > +
> > > +    if (fh->frame_type == AV1_FRAME_KEY && fh->show_frame) {
> > > +        fh->error_resilient_mode = 1;
> > > +    }
> > > +
> > > +    if (fh->frame_type == AV1_FRAME_KEY || fh-
> > > >error_resilient_mode)
> > > +        fh->primary_ref_frame = AV1_PRIMARY_REF_NONE;
> > > +
> > > +    vpic->base_qindex          = fh->base_q_idx;
> > > +    vpic->frame_width_minus_1  = fh->frame_width_minus_1;
> > > +    vpic->frame_height_minus_1 = fh->frame_height_minus_1;
> > > +    vpic->primary_ref_frame    = fh->primary_ref_frame;
> > > +    vpic->reconstructed_frame  = pic->recon_surface;
> > > +    vpic->coded_buf            = pic->output_buffer;
> > > +    vpic->tile_cols            = fh->tile_cols;
> > > +    vpic->tile_rows            = fh->tile_rows;
> > > +    vpic->order_hint           = fh->order_hint;
> > > +#if VA_CHECK_VERSION(1, 15, 0)
> > > +    vpic->refresh_frame_flags  = fh->refresh_frame_flags;
> > > +#endif
> >
> > What will the difference be between running in the two versions?  (Is
> > it worth supporting the older one?  Cf. bit_depth on VP9.)
>
> I'd prefer to support the older one, in case someone is using an old
> version that works well with ffmpeg-vpl (as VPL has supported AV1 encode
> for a long time already); otherwise they would be confused about why
> ffmpeg-vaapi needs a higher VAAPI version when VPL doesn't.
>
> >
> > > +
> > > +    vpic->picture_flags.bits.enable_frame_obu     = 0;
> > > +    vpic->picture_flags.bits.frame_type           = fh-
> > > >frame_type;
> > > +    vpic->picture_flags.bits.reduced_tx_set       = fh-
> > > >reduced_tx_set;
> > > +    vpic->picture_flags.bits.error_resilient_mode = fh-
> > > >error_resilient_mode;
> > > +
> > > +    /** let driver decide to use single or compound reference
> > > prediction mode. */
> > > +    vpic->mode_control_flags.bits.reference_mode = fh-
> > > >reference_select ? 2 : 0;
> > > +    vpic->mode_control_flags.bits.tx_mode = fh->tx_mode;
> > > +
> > > +    vpic->tile_group_obu_hdr_info.bits.obu_has_size_field = 1;
> > > +
> > > +    /** set reference. */
> > > +    for (i = 0; i < AV1_REFS_PER_FRAME; i++)
> > > +        vpic->ref_frame_idx[i] = fh->ref_frame_idx[i];
> > > +
> > > +    for (i = 0; i < FF_ARRAY_ELEMS(vpic->reference_frames); i++)
> > > +        vpic->reference_frames[i] = VA_INVALID_SURFACE;
> > > +
> > > +    for (i = 0; i < MAX_REFERENCE_LIST_NUM; i++) {
> > > +        for (int j = 0; j < pic->nb_refs[i]; j++) {
> > > +            VAAPIEncodePicture *ref_pic = pic->refs[i][j];
> > > +
> > > +            slot = ((VAAPIEncodeAV1Picture*)ref_pic->priv_data)-
> > > >slot;
> > > +            av_assert0(vpic->reference_frames[slot] ==
> > > VA_INVALID_SURFACE);
> > > +
> > > +            vpic->reference_frames[slot] = ref_pic->recon_surface;
> > > +        }
> > > +    }
> > > +
> > > +    /** pack frame header, and set va params offset like
> > > bit_offset_qindex etc. */
> > > +    ret = vaapi_encode_av1_write_frame_header(avctx, pic, priv-
> > > >fh_data, &priv->fh_data_len);
> > > +    if (ret < 0)
> > > +        goto end;
> > > +
> > > +end:
> > > +    ff_cbs_fragment_reset(obu);
> > > +    return ret;
> > > +}
> > > +
> > > +static int vaapi_encode_av1_init_slice_params(AVCodecContext
> > > *avctx,
> > > +                                              VAAPIEncodePicture
> > > *pic,
> > > +                                              VAAPIEncodeSlice
> > > *slice)
> > > +{
> > > +    VAAPIEncodeAV1Context      *priv = avctx->priv_data;
> > > +    VAEncTileGroupBufferAV1  *vslice = slice->codec_slice_params;
> > > +    CodedBitstreamAV1Context  *cbctx = priv->cbc->priv_data;
> > > +    int div;
> > > +
> > > +    /** Set tile group info. */
> > > +    div = priv->tile_cols * priv->tile_rows / priv->tile_groups;
> > > +    vslice->tg_start = slice->index * div;
> > > +    if (slice->index == (priv->tile_groups - 1)) {
> > > +        vslice->tg_end = priv->tile_cols * priv->tile_rows - 1;
> > > +        cbctx->seen_frame_header = 0;
> > > +    } else {
> > > +        vslice->tg_end = (slice->index + 1) * div - 1;
> > > +    }
> > > +
> > > +    return 0;
> > > +}
> > > +
> > > +static int vaapi_encode_av1_write_picture_header(AVCodecContext
> > > *avctx,
> > > +                                                 VAAPIEncodePictur
> > > e *pic,
> > > +                                                 char *data,
> > > size_t *data_len)
> > > +{
> > > +    VAAPIEncodeAV1Context     *priv = avctx->priv_data;
> > > +    CodedBitstreamFragment     *obu = &priv->current_obu;
> > > +    AV1RawOBU               *fh_obu = &priv->fh;
> > > +    AV1RawFrameHeader       *rep_fh = &fh_obu->obu.frame_header;
> > > +    VAAPIEncodeAV1Picture *href;
> > > +    int ret = 0;
> > > +
> > > +    pic->tail_size = 0;
> > > +    /** Pack repeat frame header. */
> > > +    if (pic->display_order > pic->encode_order) {
> > > +        memset(fh_obu, 0, sizeof(*fh_obu));
> > > +        href = pic->refs[0][pic->nb_refs[0] - 1]->priv_data;
> > > +        fh_obu->header.obu_type = AV1_OBU_FRAME_HEADER;
> > > +        fh_obu->header.obu_has_size_field = 1;
> > > +
> > > +        rep_fh->show_existing_frame   = 1;
> > > +        rep_fh->frame_to_show_map_idx = href->slot == 0;
> > > +        rep_fh->frame_type            = AV1_FRAME_INTER;
> > > +        rep_fh->frame_width_minus_1   = avctx->width - 1;
> > > +        rep_fh->frame_height_minus_1  = avctx->height - 1;
> > > +        rep_fh->render_width_minus_1  = rep_fh-
> > > >frame_width_minus_1;
> > > +        rep_fh->render_height_minus_1 = rep_fh-
> > > >frame_height_minus_1;
> > > +
> > > +        ret = vaapi_encode_av1_write_frame_header(avctx, pic, pic-
> > > >tail_data, &pic->tail_size);
> > > +        if (ret < 0)
> > > +            goto end;
> > > +
> > > +        pic->tail_size /= 8;
> > > +    }
> > > +
> > > +    memcpy(data, &priv->fh_data, MAX_PARAM_BUFFER_SIZE *
> > > sizeof(char));
> > > +    *data_len = priv->fh_data_len;
> > > +
> > > +end:
> > > +    ff_cbs_fragment_reset(obu);
> > > +    return ret;
> > > +}
> > > +
> > > +static const VAAPIEncodeProfile vaapi_encode_av1_profiles[] = {
> > > +    { FF_PROFILE_AV1_MAIN,  8, 3, 1, 1, VAProfileAV1Profile0 },
> > > +    { FF_PROFILE_AV1_MAIN, 10, 3, 1, 1, VAProfileAV1Profile0 },
> > > +    { FF_PROFILE_UNKNOWN }
> > > +};
> > > +
> > > +static const VAAPIEncodeType vaapi_encode_type_av1 = {
> > > +    .profiles        = vaapi_encode_av1_profiles,
> > > +    .flags           = FLAG_B_PICTURES,
> > > +    .default_quality = 25,
> > > +    .configure       = &vaapi_encode_av1_configure,
> > > +
> > > +    .sequence_header_type  = VAEncPackedHeaderSequence,
> > > +    .sequence_params_size  =
> > > sizeof(VAEncSequenceParameterBufferAV1),
> > > +    .init_sequence_params  =
> > > &vaapi_encode_av1_init_sequence_params,
> > > +    .write_sequence_header =
> > > &vaapi_encode_av1_write_sequence_header,
> > > +
> > > +    .picture_priv_data_size = sizeof(VAAPIEncodeAV1Picture),
> > > +    .picture_header_type    = VAEncPackedHeaderPicture,
> > > +    .picture_params_size    =
> > > sizeof(VAEncPictureParameterBufferAV1),
> > > +    .init_picture_params    =
> > > &vaapi_encode_av1_init_picture_params,
> > > +    .write_picture_header   =
> > > &vaapi_encode_av1_write_picture_header,
> > > +
> > > +    .slice_params_size = sizeof(VAEncTileGroupBufferAV1),
> > > +    .init_slice_params = &vaapi_encode_av1_init_slice_params,
> > > +};
> > > +
> > > +static av_cold int vaapi_encode_av1_init(AVCodecContext *avctx)
> > > +{
> > > +    VAAPIEncodeContext      *ctx = avctx->priv_data;
> > > +    VAAPIEncodeAV1Context  *priv = avctx->priv_data;
> > > +    VAConfigAttrib attr;
> > > +    VAStatus vas;
> > > +    int ret;
> > > +
> > > +    ctx->codec = &vaapi_encode_type_av1;
> > > +
> > > +    ctx->desired_packed_headers =
> > > +        VA_ENC_PACKED_HEADER_SEQUENCE |
> > > +        VA_ENC_PACKED_HEADER_PICTURE;
> > > +
> > > +    if (avctx->profile == FF_PROFILE_UNKNOWN)
> > > +        avctx->profile = priv->profile;
> > > +    if (avctx->level == FF_LEVEL_UNKNOWN)
> > > +        avctx->level = priv->level;
> > > +
> > > +    if (avctx->level != FF_LEVEL_UNKNOWN && avctx->level & ~0x1f)
> > > {
> > > +        av_log(avctx, AV_LOG_ERROR, "Invalid level %d\n", avctx-
> > > >level);
> > > +        return AVERROR(EINVAL);
> > > +    }
> > > +
> > > +    ret = ff_vaapi_encode_init(avctx);
> > > +    if (ret < 0)
> > > +        return ret;
> > > +
> > > +    attr.type = VAConfigAttribEncAV1;
> > > +    vas = vaGetConfigAttributes(ctx->hwctx->display,
> > > +                                ctx->va_profile,
> > > +                                ctx->va_entrypoint,
> > > +                                &attr, 1);
> > > +    if (vas != VA_STATUS_SUCCESS) {
> > > +        av_log(avctx, AV_LOG_ERROR, "Failed to query "
> > > +               "config attribute: %d (%s).\n", vas,
> > > vaErrorStr(vas));
> > > +        return AVERROR_EXTERNAL;
> > > +    } else if (attr.value == VA_ATTRIB_NOT_SUPPORTED) {
> > > +        priv->attr.value = 0;
> > > +        av_log(avctx, AV_LOG_WARNING, "Attribute type:%d is not "
> > > +               "supported.\n", attr.type);
> >
> > Is zero actually reasonable in this attribute (and below)?  Would we
> > be better just requiring them to work?
>
> The attribute content of EncAV1 and EncAV1Ext1 is not necessary for the
> encoder. If it is not supported, I regard all of the attribute's
> features as unusable, and that doesn't break our encode. So I give it
> more tolerance: no error, just a warning message.
>
> >
> > > +    } else {
> > > +        priv->attr.value = attr.value;
> > > +    }
> > > +
> > > +    attr.type = VAConfigAttribEncAV1Ext1;
> > > +    vas = vaGetConfigAttributes(ctx->hwctx->display,
> > > +                                ctx->va_profile,
> > > +                                ctx->va_entrypoint,
> > > +                                &attr, 1);
> > > +    if (vas != VA_STATUS_SUCCESS) {
> > > +        av_log(avctx, AV_LOG_ERROR, "Failed to query "
> > > +               "config attribute: %d (%s).\n", vas,
> > > vaErrorStr(vas));
> > > +        return AVERROR_EXTERNAL;
> > > +    } else if (attr.value == VA_ATTRIB_NOT_SUPPORTED) {
> > > +        priv->attr_ext1.value = 0;
> > > +        av_log(avctx, AV_LOG_WARNING, "Attribute type:%d is not "
> > > +               "supported.\n", attr.type);
> > > +    } else {
> > > +        priv->attr_ext1.value = attr.value;
> > > +    }
> > > +
> > > +    /** This attr provides essential indicators, return error if
> > > not support. */
> > > +    attr.type = VAConfigAttribEncAV1Ext2;
> > > +    vas = vaGetConfigAttributes(ctx->hwctx->display,
> > > +                                ctx->va_profile,
> > > +                                ctx->va_entrypoint,
> > > +                                &attr, 1);
> > > +    if (vas != VA_STATUS_SUCCESS || attr.value ==
> > > VA_ATTRIB_NOT_SUPPORTED) {
> > > +        av_log(avctx, AV_LOG_ERROR, "Failed to query "
> > > +               "config attribute: %d (%s).\n", vas,
> > > vaErrorStr(vas));
> > > +        return AVERROR_EXTERNAL;
> > > +    } else {
> > > +        priv->attr_ext2.value = attr.value;
> > > +    }
> > > +
> > > +    ret = vaapi_encode_av1_set_tile(avctx);
> > > +    if (ret < 0)
> > > +        return ret;
> > > +
> > > +    return 0;
> > > +}
> > > +
> > > +static av_cold int vaapi_encode_av1_close(AVCodecContext *avctx)
> > > +{
> > > +    VAAPIEncodeAV1Context *priv = avctx->priv_data;
> > > +
> > > +    ff_cbs_fragment_free(&priv->current_obu);
> > > +    ff_cbs_close(&priv->cbc);
> > > +
> > > +    return ff_vaapi_encode_close(avctx);
> > > +}
> > > +
> > > +#define OFFSET(x) offsetof(VAAPIEncodeAV1Context, x)
> > > +#define FLAGS (AV_OPT_FLAG_VIDEO_PARAM |
> > > AV_OPT_FLAG_ENCODING_PARAM)
> > > +
> > > +static const AVOption vaapi_encode_av1_options[] = {
> > > +    VAAPI_ENCODE_COMMON_OPTIONS,
> > > +    VAAPI_ENCODE_RC_OPTIONS,
> > > +    { "profile", "Set profile (seq_profile)",
> > > +      OFFSET(profile), AV_OPT_TYPE_INT,
> > > +      { .i64 = FF_PROFILE_UNKNOWN }, FF_PROFILE_UNKNOWN, 0xff,
> > > FLAGS, "profile" },
> > > +
> > > +#define PROFILE(name, value)  name, NULL, 0, AV_OPT_TYPE_CONST, \
> > > +    { .i64 = value }, 0, 0, FLAGS, "profile"
> > > +    { PROFILE("main",               FF_PROFILE_AV1_MAIN) },
> > > +    { PROFILE("high",               FF_PROFILE_AV1_HIGH) },
> > > +    { PROFILE("professional",       FF_PROFILE_AV1_PROFESSIONAL)
> > > },
> > > +#undef PROFILE
> > > +
> > > +    { "tier", "Set tier (seq_tier)",
> > > +      OFFSET(tier), AV_OPT_TYPE_INT, { .i64 = 0 }, 0, 1, FLAGS,
> > > "tier" },
> > > +    { "main", NULL, 0, AV_OPT_TYPE_CONST,
> > > +      { .i64 = 0 }, 0, 0, FLAGS, "tier" },
> > > +    { "high", NULL, 0, AV_OPT_TYPE_CONST,
> > > +      { .i64 = 1 }, 0, 0, FLAGS, "tier" },
> > > +    { "level", "Set level (seq_level_idx)",
> > > +      OFFSET(level), AV_OPT_TYPE_INT,
> > > +      { .i64 = FF_LEVEL_UNKNOWN }, FF_LEVEL_UNKNOWN, 0x1f, FLAGS,
> > > "level" },
> > > +
> > > +#define LEVEL(name, value) name, NULL, 0, AV_OPT_TYPE_CONST, \
> > > +      { .i64 = value }, 0, 0, FLAGS, "level"
> > > +    { LEVEL("2.0",  0) },
> > > +    { LEVEL("2.1",  1) },
> > > +    { LEVEL("3.0",  4) },
> > > +    { LEVEL("3.1",  5) },
> > > +    { LEVEL("4.0",  8) },
> > > +    { LEVEL("4.1",  9) },
> > > +    { LEVEL("5.0", 12) },
> > > +    { LEVEL("5.1", 13) },
> > > +    { LEVEL("5.2", 14) },
> > > +    { LEVEL("5.3", 15) },
> > > +    { LEVEL("6.0", 16) },
> > > +    { LEVEL("6.1", 17) },
> > > +    { LEVEL("6.2", 18) },
> > > +    { LEVEL("6.3", 19) },
> > > +#undef LEVEL
> > > +
> > > +    { "tiles", "Tile columns x rows",
> > > +      OFFSET(tile_cols), AV_OPT_TYPE_IMAGE_SIZE, { .str = "1x1" },
> > > 1, AV1_MAX_TILE_COLS, FLAGS },
> > > +    { "tile_groups", "Number of tile groups for encoding",
> > > +      OFFSET(tile_groups), AV_OPT_TYPE_INT, { .i64 = 1 }, 1,
> > > AV1_MAX_TILE_ROWS * AV1_MAX_TILE_COLS, FLAGS },
> > > +
> > > +    { NULL },
> > > +};
> > > +
> > > +static const FFCodecDefault vaapi_encode_av1_defaults[] = {
> > > +    { "b",              "0"   },
> > > +    { "bf",             "7"   },
> >
> > This feels excessive when only one layer is supported.  Do
> > experimental results give this as the best tradeoff?
>
> Did a quick check: setting bf to 2 gives better quality in BRC mode.
> Will change it to 2.
>
> Thanks.
>
> >
> > > +    { "g",              "120" },
> > > +    { "qmin",           "1"   },
> > > +    { "qmax",           "255" },
> > > +    { NULL },
> > > +};
> > > +
> > > +static const AVClass vaapi_encode_av1_class = {
> > > +    .class_name = "av1_vaapi",
> > > +    .item_name  = av_default_item_name,
> > > +    .option     = vaapi_encode_av1_options,
> > > +    .version    = LIBAVUTIL_VERSION_INT,
> > > +};
> > > +
> > > +const FFCodec ff_av1_vaapi_encoder = {
> > > +    .p.name         = "av1_vaapi",
> > > +    CODEC_LONG_NAME("AV1 (VAAPI)"),
> > > +    .p.type         = AVMEDIA_TYPE_VIDEO,
> > > +    .p.id           = AV_CODEC_ID_AV1,
> > > +    .priv_data_size = sizeof(VAAPIEncodeAV1Context),
> > > +    .init           = &vaapi_encode_av1_init,
> > > +    FF_CODEC_RECEIVE_PACKET_CB(&ff_vaapi_encode_receive_packet),
> > > +    .close          = &vaapi_encode_av1_close,
> > > +    .p.priv_class   = &vaapi_encode_av1_class,
> > > +    .p.capabilities = AV_CODEC_CAP_DELAY | AV_CODEC_CAP_HARDWARE |
> > > +                      AV_CODEC_CAP_DR1 |
> > > AV_CODEC_CAP_ENCODER_REORDERED_OPAQUE,
> > > +    .caps_internal  = FF_CODEC_CAP_NOT_INIT_THREADSAFE |
> > > +                      FF_CODEC_CAP_INIT_CLEANUP,
> > > +    .defaults       = vaapi_encode_av1_defaults,
> > > +    .p.pix_fmts = (const enum AVPixelFormat[]) {
> > > +        AV_PIX_FMT_VAAPI,
> > > +        AV_PIX_FMT_NONE,
> > > +    },
> > > +    .hw_configs     = ff_vaapi_encode_hw_configs,
> > > +    .p.wrapper_name = "vaapi",
> > > +};
> > > diff --git a/libavcodec/version.h b/libavcodec/version.h
> > > index 9411511e04..728ab8839d 100644
> > > --- a/libavcodec/version.h
> > > +++ b/libavcodec/version.h
> > > @@ -29,7 +29,7 @@
> > >
> > >   #include "version_major.h"
> > >
> > > -#define LIBAVCODEC_VERSION_MINOR  22
> > > +#define LIBAVCODEC_VERSION_MINOR  23
> > >   #define LIBAVCODEC_VERSION_MICRO 100
> > >
> > >   #define
> > > LIBAVCODEC_VERSION_INT  AV_VERSION_INT(LIBAVCODEC_VERSION_MAJOR, \
> >

Also, this patch series looks good to me, though I can't fully test it
myself due to a lack of hardware.

Acked-by: Neal Gompa <ngompa13@gmail.com>




--
真実はいつも一つ!/ Always, there's only one truth!
Wang, Fei W Aug. 16, 2023, 7:54 a.m. UTC | #6
On Sun, 2023-08-13 at 22:43 +0100, Mark Thompson wrote:
> On 10/08/2023 03:54, Wang, Fei W wrote:
> > On Mon, 2023-08-07 at 22:21 +0100, Mark Thompson wrote:
> > > On 03/08/2023 07:01, fei.w.wang-at-intel.com@ffmpeg.org wrote:
> > > > From: Fei Wang <fei.w.wang@intel.com>
> > > > 
> > > > Signed-off-by: Fei Wang <fei.w.wang@intel.com>
> > > > ---
> > > >    Changelog                     |    1 +
> > > >    configure                     |    3 +
> > > >    doc/encoders.texi             |   13 +
> > > >    libavcodec/Makefile           |    1 +
> > > >    libavcodec/allcodecs.c        |    1 +
> > > >    libavcodec/vaapi_encode.c     |  125 +++-
> > > >    libavcodec/vaapi_encode.h     |   12 +
> > > >    libavcodec/vaapi_encode_av1.c | 1229
> > > > +++++++++++++++++++++++++++++++++
> > > >    libavcodec/version.h          |    2 +-
> > > >    9 files changed, 1368 insertions(+), 19 deletions(-)
> > > >    create mode 100644 libavcodec/vaapi_encode_av1.c
> > > 
> > > I assume this is tested on Intel hardware.  Is it tested on AMD
> > > as
> > > well?  (Apparently working in recent Mesa.)
> > 
> > AMD tested the patchset months ago:
> > https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/22585
> > 
> > This patch changed a little compared with the version in Cartwheel.
> > @Ruijing, could you help to review this version on the ML, to avoid
> > the diffs breaking you? Thanks.
> > 
> > > > ...
> > > > @@ -669,6 +669,15 @@ static int
> > > > vaapi_encode_set_output_timestamp(AVCodecContext *avctx,
> > > >    {
> > > >        VAAPIEncodeContext *ctx = avctx->priv_data;
> > > >    
> > > > +    // AV1 packs P frame and next B frame into one pkt, and
> > > > uses
> > > > the other
> > > > +    // repeat frame header pkt at the display order position
> > > > of
> > > > the P frame
> > > > +    // to indicate its frame index. Each frame has a
> > > > corresponding
> > > > pkt in its
> > > > +    // display order position. So don't need to consider delay
> > > > for
> > > > AV1 timestamp.
> > > > +    if (avctx->codec_id == AV_CODEC_ID_AV1) {
> > > > +        pkt->dts = pkt->pts - ctx->dts_pts_diff;
> > > > +        return 0;
> > > > +    }
> > > 
> > > This doesn't get you the right result, though?  The PTS and DTS
> > > on
> > > every AV1 packet want to be the same.
> > 
> > The result tested good and can be played normally. It is just aligned
> > with other VAAPI encoders, where the first frame starts with 0/-1
> > PTS/DTS if there are B frames. Setting PTS/DTS to the same value also
> > looks good.
> 
> DTS != PTS should only be true when the stream has externally-visible 
> frame reordering in, which AV1 doesn't.
> 
> The other two AV1 encoders both output DTS == PTS; I think you should
> match that.
> 
> > > In any case, please don't put tests for codec ID in the common
> > > code.  I suggest that the timestamp behaviour wants to be a new
> > > FLAG_*.
> > > 
> > > > +
> > > > ...
> > > > @@ -1128,9 +1182,19 @@ static int
> > > > vaapi_encode_pick_next(AVCodecContext *avctx,
> > > >    
> > > >        vaapi_encode_add_ref(avctx, pic, pic, 0, 1, 0);
> > > >        if (pic->type != PICTURE_TYPE_IDR) {
> > > > -        vaapi_encode_add_ref(avctx, pic, start,
> > > > -                             pic->type == PICTURE_TYPE_P,
> > > > -                             b_counter > 0, 0);
> > > > +        // TODO: apply both previous and forward multi
> > > > reference
> > > > for all vaapi encoders.
> > > > +        // And L0/L1 reference frame number can be set
> > > > dynamically
> > > > through query
> > > > +        // VAConfigAttribEncMaxRefFrames attribute.
> > > > +        if (avctx->codec_id == AV_CODEC_ID_AV1) {
> > > > +            for (i = 0; i < ctx->nb_next_prev; i++)
> > > > +                vaapi_encode_add_ref(avctx, pic, ctx-
> > > > > next_prev[i],
> > > > +                                     pic->type ==
> > > > PICTURE_TYPE_P,
> > > > +                                     b_counter > 0, 0);
> > > 
> > > So an undisclosed aim of the previous patch was to make this
> > > extra
> > > list of the most recent N frames for this to use with P frames
> > > only?
> > 
> > Currently, yes. As the TODO comment says, I will apply it to all
> > other codecs and to B frames next; it will need lots of testing on
> > different hardware. Adding this for AV1 here is to align with VPL,
> > which uses 2 previous references for P frames. Based on our tests,
> > 2 reference frames for P improve the BD-rate quality by ~5%, which
> > makes it possible for users to get the same quality as VPL after
> > adjusting and aligning the encoder options.
> 
> Hmm.  My Intel test machine says it can take 8 L0 and 2 L1 references
> in H.264.  I might try this, seems like it should be easy to test.

Yes. If the L0/L1 reference counts queried from the driver are to be fully
supported, we need to increase MAX_PICTURE_REFERENCES and the initial size
of the surface pool.
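
For reference, a minimal sketch of how that query might look, assuming the
usual libva split of VAConfigAttribEncMaxRefFrames into a lower-16-bit L0
count and an upper-16-bit L1 count; the helper name and the clamping
against MAX_PICTURE_REFERENCES are illustrative, not part of the patch:

static int vaapi_encode_query_max_refs(VAAPIEncodeContext *ctx,
                                       int *nb_l0, int *nb_l1)
{
    VAConfigAttrib attr = { .type = VAConfigAttribEncMaxRefFrames };
    VAStatus vas;

    vas = vaGetConfigAttributes(ctx->hwctx->display, ctx->va_profile,
                                ctx->va_entrypoint, &attr, 1);
    if (vas != VA_STATUS_SUCCESS || attr.value == VA_ATTRIB_NOT_SUPPORTED)
        return AVERROR_EXTERNAL;

    /* Lower 16 bits: references allowed in list 0; upper 16 bits: list 1. */
    *nb_l0 = FFMIN(attr.value & 0xffff, MAX_PICTURE_REFERENCES);
    *nb_l1 = FFMIN(attr.value >> 16,    MAX_PICTURE_REFERENCES);

    return 0;
}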

> 
> > > Why this partcular structure for P frames only, though?  Seems
> > > like
> > > if you support extra references then other codecs could also make
> > > use
> > > of them in either direction, so we would be better off hinting
> > > that
> > > the codec would like up to N references and then using the
> > > contents
> > > of the existing DPB, plus keep extras if there is space.
> > > 
> > > > ...
> > > > +
> > > > +static av_cold int vaapi_encode_av1_configure(AVCodecContext
> > > > *avctx)
> > > > +{
> > > > +    VAAPIEncodeContext     *ctx = avctx->priv_data;
> > > > +    VAAPIEncodeAV1Context *priv = avctx->priv_data;
> > > > +    int ret;
> > > > +
> > > > +    ret = ff_cbs_init(&priv->cbc, AV_CODEC_ID_AV1, avctx);
> > > > +    if (ret < 0)
> > > > +        return ret;
> > > > +
> > > > +    if (ctx->rc_mode->quality) {
> > > > +        priv->q_idx_p = av_clip(ctx->rc_quality, 0,
> > > > AV1_MAX_QUANT);
> > > > +        if (fabs(avctx->i_quant_factor) > 0.0)
> > > > +            priv->q_idx_idr =
> > > > +                av_clip((fabs(avctx->i_quant_factor) * priv-
> > > > > q_idx_p  +
> > > > +                         avctx->i_quant_offset) + 0.5,
> > > > +                        0, AV1_MAX_QUANT);
> > > > +        else
> > > > +            priv->q_idx_idr = priv->q_idx_p;
> > > > +
> > > > +        if (fabs(avctx->b_quant_factor) > 0.0)
> > > > +            priv->q_idx_b =
> > > > +                av_clip((fabs(avctx->b_quant_factor) * priv-
> > > > > q_idx_p  +
> > > > +                         avctx->b_quant_offset) + 0.5,
> > > > +                        0, AV1_MAX_QUANT);
> > > > +        else
> > > > +            priv->q_idx_b = priv->q_idx_p;
> > > > +    } else {
> > > > +        /** Arbitrary value */
> > > > +        priv->q_idx_idr = priv->q_idx_p = priv->q_idx_b = 128;
> > > > +    }
> > > 
> > > Are there any alignment requirements here?  (If I gave it an
> > > input
> > > which is, say, 1361x1, would that work?)
> > 
> > I didn't get you. Which parameter here needs to be aligned? The
> > max/min resolution the HW supports is checked somewhere else. If
> > 1361x1 is used, there will be an error like:
> > 
> > "Hardware does not support encoding at size 1376x16 (constraints:
> > width 32-8192 height 32-8192)."
> 
> I mean the padding for the reference surfaces.  I was expecting this
> to be rounded up to a multiple of 8 (to match MiCols/MiRows) - see
> this function in other codecs.
> 
> > > > ...
> > > > +
> > > > +    /** update obu size in bitstream */
> > > > +    if (fh_obu->header.obu_has_size_field) {
> > > > +        obu_size_len = priv-
> > > > >attr_ext2.bits.obu_size_bytes_minus1
> > > > + 1;
> > > > +        for (i = 0; i < obu_size_len; i++) {
> > > > +            byte = obu_size >> (7 * i) & 0x7f;
> > > > +            if (i < obu_size_len - 1)
> > > > +                byte |= 0x80;
> > > > +            put_bits(&pbc_tmp, 8, byte);
> > > > +        }
> > > > +        flush_put_bits(&pbc_tmp);
> > > > +        memmove(pbc_tmp.buf_ptr, pbc_tmp.buf_ptr + (8 -
> > > > obu_size_len), obu_size);
> > > > +        *data_len -= (8 - obu_size_len) * 8;
> > > > +    }
> > > 
> > > Why is there an incomplete duplicate of the cbs_av1 header
> > > writing
> > > code here?
> > 
> > To record some positions/sizes in the bitstream that are needed by
> > VAAPI, like the qp_index/loopfilter/cdef offsets and the cdef
> > parameter size in bits. It's not reasonable to add such specific
> > parameters into CBS.
> 
> How about with <
> https://lists.ffmpeg.org/pipermail/ffmpeg-devel/2023-August/313228.html>
> ;?

How can the position info be passed out of .trace_write_callback? If we
define our own write_callback function in vaapi_encode_av1.c, it can
easily get the position of each syntax element, but it can't pass that
back to the VAAPI AV1 encoder. A possible way is to go through
CodedBitstreamContext.priv_data, but that would require adding lots of
xxx_offset fields into CodedBitstreamAV1Context.
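
As a rough illustration of the direction being discussed (not the final
API): assuming the new CBS trace hook passes an opaque context plus the
element name, the PutBitContext and the number of bits just written, the
encoder could record the offsets it needs directly into its private
context. The callback signature and the priv field names below are
assumptions.

static void vaapi_encode_av1_trace_cb(void *trace_context, PutBitContext *pbc,
                                      int length, const char *name,
                                      const int *subscripts, int64_t value)
{
    VAAPIEncodeAV1Context *priv = trace_context;
    /* The element has just been written, so its start offset is the
     * current bit position minus its length. */
    int position = put_bits_count(pbc) - length;

    if (!strcmp(name, "base_q_idx"))
        priv->qindex_offset = position;
    /* ...and similarly for the loop filter, CDEF and OBU size fields. */
}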

> 
> > > > ...
> > > > +
> > > > +static int vaapi_encode_av1_set_tile(AVCodecContext *avctx)
> > > > +{
> > > > +    VAAPIEncodeAV1Context *priv = avctx->priv_data;
> > > > +    int mi_cols, mi_rows, sb_shift, sb_size;
> > > > +    int max_tile_area_sb, max_tile_area_sb_varied;
> > > > +    int tile_width_sb, tile_height_sb, widest_tile_sb;
> > > > +    int min_log2_tiles;
> > > > +    int tile_rows_tmp, i;
> > > > +
> > > > +    if (priv->tile_cols > AV1_MAX_TILE_COLS ||
> > > > +        priv->tile_rows > AV1_MAX_TILE_ROWS) {
> > > > +        av_log(avctx, AV_LOG_ERROR, "Invalid tile number
> > > > %dx%d,
> > > > should less than %dx%d.\n",
> > > > +               priv->tile_cols, priv->tile_rows,
> > > > AV1_MAX_TILE_COLS, AV1_MAX_TILE_ROWS);
> > > 
> > > Can't this be a constraint on the option rather than a special
> > > check
> > > here?
> > 
> > Tiles are set with the IMAGE_SIZE option type, which aligns with
> > HEVC. It doesn't support min/max checks.
> 
> Hmm, yeah - fair enough.
> 
> > > > ...
> > > > +
> > > > +    sh->color_config = (AV1RawColorConfig) {
> > > > +        .high_bitdepth                  = desc->comp[0].depth
> > > > == 8
> > > > ? 0 : 1,
> > > > +        .color_primaries                = avctx-
> > > > >color_primaries,
> > > > +        .transfer_characteristics       = avctx->color_trc,
> > > > +        .matrix_coefficients            = avctx->colorspace,
> > > > +        .color_description_present_flag = (avctx-
> > > > >color_primaries
> > > > != AVCOL_PRI_UNSPECIFIED ||
> > > > +                                           avctx-
> > > > > color_trc       != AVCOL_TRC_UNSPECIFIED ||
> > > > +                                           avctx-
> > > > > colorspace      != AVCOL_SPC_UNSPECIFIED),
> > > > +        .color_range                    = avctx->color_range
> > > > ==
> > > > AVCOL_RANGE_JPEG,
> > > > +        .subsampling_x                  = desc->log2_chroma_w,
> > > > +        .subsampling_y                  = desc->log2_chroma_h,
> > > 
> > > .chroma_sample_position = some function of
> > > chroma_sample_location.
> > 
> > There is no chroma_sample_position supported in VAAPI yet.
> 
> VAAPI won't care.  It should be marked correctly in the headers so
> that the output stream has the right value.
> 
> (CFL not having any ability to deal with the sample location is a
> known deficiency of AV1.)
> 
> > > > ...
> > > > +    fh->tile_cols                 = priv->tile_cols;
> > > > +    fh->tile_rows                 = priv->tile_rows;
> > > > +    fh->tile_cols_log2            = priv->tile_cols_log2;
> > > > +    fh->tile_rows_log2            = priv->tile_rows_log2;
> > > > +    fh->uniform_tile_spacing_flag = priv->uniform_tile;
> > > > +    fh->tile_size_bytes_minus1    = priv-
> > > > > attr_ext2.bits.tile_size_bytes_minus1;
> > > > +    fh->reduced_tx_set            = 1;
> > > 
> > > This not being present in the capabilities feels like an omission
> > > in
> > > the API.  That's a large loss for an implementation which does
> > > support the full transform set.
> > 
> > My personal understanding is that "not being present" means it must
> > be supported. If any driver is an exception, it can ask for a new
> > capability to be added to libva.
> 
> reduced_tx_set is a negative property - you want to set it to zero to
> allow all transforms, unless the implementation can't cope with that.
> 
> Does the Intel implementation require it to be set?  (That is, does
> not support the full transform set.)
> 

That makes sense. Intel HW doesn't require it to be set to 1. Will
change it to 0.
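
So, roughly, the change agreed in this sub-thread would look like the
following (a sketch against the quoted code, folding in the error message
promised earlier; fh, priv and attr_ext2 are the names from the patch):

/* Allow the full transform set; only tx_mode is gated by the driver
 * capability. */
fh->reduced_tx_set = 0;

if (priv->attr_ext2.bits.tx_mode_support & 0x04) {
    fh->tx_mode = AV1_TX_MODE_SELECT;
} else if (priv->attr_ext2.bits.tx_mode_support & 0x02) {
    fh->tx_mode = AV1_TX_MODE_LARGEST;
} else {
    av_log(avctx, AV_LOG_ERROR, "No supported transform mode reported "
           "by the driver (tx_mode_support %#x).\n",
           priv->attr_ext2.bits.tx_mode_support);
    return AVERROR(EINVAL);
}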

> > > > ...
> > > > +
> > > > +    for (i = 0; i < fh->tile_cols; i++)
> > > > +        fh->width_in_sbs_minus_1[i] = vpic-
> > > > > width_in_sbs_minus_1[i] = priv->width_in_sbs_minus_1[i];
> > > > +
> > > > +    for (i = 0; i < fh->tile_rows; i++)
> > > > +        fh->height_in_sbs_minus_1[i] = vpic-
> > > > > height_in_sbs_minus_1[i] = priv->height_in_sbs_minus_1[i];
> > > 
> > > Er, what?  Doesn't this fail the inference on the last tile
> > > width/height if you don't split exactly?
> > 
> > What's the meaning of "split exactly"? All the tile widths/heights
> > are split in vaapi_encode_av1_set_tile, including the last tile. Just
> > making sure the tile width/height info in the frame header is the
> > same as the info passed to VAAPI should be fine.
> 
> Sorry, I misread this.  The values are set elsewhere, it's not
> setting all tiles to have the same SB width/height.
> 
> > > > ...
> > > > +
> > > > +    vpic->base_qindex          = fh->base_q_idx;
> > > > +    vpic->frame_width_minus_1  = fh->frame_width_minus_1;
> > > > +    vpic->frame_height_minus_1 = fh->frame_height_minus_1;
> > > > +    vpic->primary_ref_frame    = fh->primary_ref_frame;
> > > > +    vpic->reconstructed_frame  = pic->recon_surface;
> > > > +    vpic->coded_buf            = pic->output_buffer;
> > > > +    vpic->tile_cols            = fh->tile_cols;
> > > > +    vpic->tile_rows            = fh->tile_rows;
> > > > +    vpic->order_hint           = fh->order_hint;
> > > > +#if VA_CHECK_VERSION(1, 15, 0)
> > > > +    vpic->refresh_frame_flags  = fh->refresh_frame_flags;
> > > > +#endif
> > > 
> > > What will the difference be between running in the two
> > > versions?  (Is
> > > it worth supporting the older one?  Cf. bit_depth on VP9.)
> > 
> > I'd prefer to support the older one, in case someone is using an old
> > version that works well with ffmpeg-vpl (as VPL has supported AV1
> > encode for a long time already); otherwise they would be confused
> > about why ffmpeg-vaapi needs a higher VAAPI version when VPL doesn't.
> 
> Ok.  So the driver doesn't use the value of refresh_frame_flags if it
> doesn't affect the result?  (Seems like it shouldn't, not sure why it
> would be added to the API.)

The media-driver doesn't use it, but the MS Windows driver does; it may
need to maintain the DPB and pack the frame header itself:
https://github.com/intel/cartwheel-ffmpeg/issues/244

Thanks

> 
> > > > ...
> 
> Thanks,
> 
> - Mark
Mark Thompson Aug. 16, 2023, 8:45 p.m. UTC | #7
On 16/08/2023 08:54, Wang, Fei W wrote:
> On Sun, 2023-08-13 at 22:43 +0100, Mark Thompson wrote:
>> On 10/08/2023 03:54, Wang, Fei W wrote:
>>> On Mon, 2023-08-07 at 22:21 +0100, Mark Thompson wrote:
>>>> On 03/08/2023 07:01, fei.w.wang-at-intel.com@ffmpeg.org wrote:
>>>>> From: Fei Wang <fei.w.wang@intel.com>
>>>>>
>>>>> Signed-off-by: Fei Wang <fei.w.wang@intel.com>
>>>>> ---
>>>>>     Changelog                     |    1 +
>>>>>     configure                     |    3 +
>>>>>     doc/encoders.texi             |   13 +
>>>>>     libavcodec/Makefile           |    1 +
>>>>>     libavcodec/allcodecs.c        |    1 +
>>>>>     libavcodec/vaapi_encode.c     |  125 +++-
>>>>>     libavcodec/vaapi_encode.h     |   12 +
>>>>>     libavcodec/vaapi_encode_av1.c | 1229
>>>>> +++++++++++++++++++++++++++++++++
>>>>>     libavcodec/version.h          |    2 +-
>>>>>     9 files changed, 1368 insertions(+), 19 deletions(-)
>>>>>     create mode 100644 libavcodec/vaapi_encode_av1.c
>>>>> ...
>>>>> +
>>>>> +    /** update obu size in bitstream */
>>>>> +    if (fh_obu->header.obu_has_size_field) {
>>>>> +        obu_size_len = priv-
>>>>>> attr_ext2.bits.obu_size_bytes_minus1
>>>>> + 1;
>>>>> +        for (i = 0; i < obu_size_len; i++) {
>>>>> +            byte = obu_size >> (7 * i) & 0x7f;
>>>>> +            if (i < obu_size_len - 1)
>>>>> +                byte |= 0x80;
>>>>> +            put_bits(&pbc_tmp, 8, byte);
>>>>> +        }
>>>>> +        flush_put_bits(&pbc_tmp);
>>>>> +        memmove(pbc_tmp.buf_ptr, pbc_tmp.buf_ptr + (8 -
>>>>> obu_size_len), obu_size);
>>>>> +        *data_len -= (8 - obu_size_len) * 8;
>>>>> +    }
>>>>
>>>> Why is there an incomplete duplicate of the cbs_av1 header
>>>> writing
>>>> code here?
>>>
>>> To record some position/size in bitstream that needed for VAAPI.
>>> Like
>>> qp_index/loopfilter/cdef offset and cdef parameters size in bit.
>>> It's
>>> not reasonable to add the specific parameters into CBS.
>>
>> How about with <
>> https://lists.ffmpeg.org/pipermail/ffmpeg-devel/2023-August/313228.html>
>> ;?
> 
> How to pass position info out of .trace_write_callback? If define own
> write_callback function in vaapi_encode_av1.c, and it can easily get
> the positions of each syntax element, but can't pass them back to VAAPI
> AV1 encoder. A possible way is according
> to CodedBitstreamContext.priv_data, but that will need to add lots of
> xxx_offset into CodedBitstreamAV1Context.

Right, it needs a context pointer in the callback because CBS hasn't previously had that.

See new version.
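
For context, hooking that up on the encoder side would presumably look
something like this, assuming the new version adds a trace_context pointer
and a trace_write_callback to CodedBitstreamContext (field names assumed),
with a callback like the one sketched earlier in the thread:

ret = ff_cbs_init(&priv->cbc, AV_CODEC_ID_AV1, avctx);
if (ret < 0)
    return ret;

/* Let CBS report syntax element positions back to this encoder instance. */
priv->cbc->trace_enable         = 1;
priv->cbc->trace_context        = priv;
priv->cbc->trace_write_callback = vaapi_encode_av1_trace_cb;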

Thanks,

- Mark
diff mbox series

Patch

diff --git a/Changelog b/Changelog
index bbda4f4fd4..e86f742cd3 100644
--- a/Changelog
+++ b/Changelog
@@ -27,6 +27,7 @@  version <next>:
 - Bitstream filter for converting VVC from MP4 to Annex B
 - scale_vt filter for videotoolbox
 - transpose_vt filter for videotoolbox
+- VAAPI AV1 encoder
 
 version 6.0:
 - Radiance HDR image support
diff --git a/configure b/configure
index 99388e7664..68a238a819 100755
--- a/configure
+++ b/configure
@@ -3322,6 +3322,8 @@  av1_qsv_decoder_select="qsvdec"
 av1_qsv_encoder_select="qsvenc"
 av1_qsv_encoder_deps="libvpl"
 av1_amf_encoder_deps="amf"
+av1_vaapi_encoder_deps="VAEncPictureParameterBufferAV1"
+av1_vaapi_encoder_select="cbs_av1 vaapi_encode"
 
 # parsers
 aac_parser_select="adts_header mpeg4audio"
@@ -7106,6 +7108,7 @@  if enabled vaapi; then
     check_type "va/va.h va/va_enc_jpeg.h" "VAEncPictureParameterBufferJPEG"
     check_type "va/va.h va/va_enc_vp8.h"  "VAEncPictureParameterBufferVP8"
     check_type "va/va.h va/va_enc_vp9.h"  "VAEncPictureParameterBufferVP9"
+    check_type "va/va.h va/va_enc_av1.h"  "VAEncPictureParameterBufferAV1"
 fi
 
 if enabled_all opencl libdrm ; then
diff --git a/doc/encoders.texi b/doc/encoders.texi
index 25d6b7f09e..fb331ebd8e 100644
--- a/doc/encoders.texi
+++ b/doc/encoders.texi
@@ -3991,6 +3991,19 @@  Average variable bitrate.
 Each encoder also has its own specific options:
 @table @option
 
+@item av1_vaapi
+@option{profile} sets the value of @emph{seq_profile}.
+@option{tier} sets the value of @emph{seq_tier}.
+@option{level} sets the value of @emph{seq_level_idx}.
+
+@table @option
+@item tiles
+Set the number of tiles to encode the input video with, as columns x rows.
+(default is 1x1).
+@item tile_groups
+Set tile groups number (default is 1).
+@end table
+
 @item h264_vaapi
 @option{profile} sets the value of @emph{profile_idc} and the @emph{constraint_set*_flag}s.
 @option{level} sets the value of @emph{level_idc}.
diff --git a/libavcodec/Makefile b/libavcodec/Makefile
index a6b2ecbb22..473afb4471 100644
--- a/libavcodec/Makefile
+++ b/libavcodec/Makefile
@@ -258,6 +258,7 @@  OBJS-$(CONFIG_AV1_MEDIACODEC_DECODER)  += mediacodecdec.o
 OBJS-$(CONFIG_AV1_MEDIACODEC_ENCODER)  += mediacodecenc.o
 OBJS-$(CONFIG_AV1_NVENC_ENCODER)       += nvenc_av1.o nvenc.o
 OBJS-$(CONFIG_AV1_QSV_ENCODER)         += qsvenc_av1.o
+OBJS-$(CONFIG_AV1_VAAPI_ENCODER)       += vaapi_encode_av1.o av1_levels.o
 OBJS-$(CONFIG_AVRN_DECODER)            += avrndec.o
 OBJS-$(CONFIG_AVRP_DECODER)            += r210dec.o
 OBJS-$(CONFIG_AVRP_ENCODER)            += r210enc.o
diff --git a/libavcodec/allcodecs.c b/libavcodec/allcodecs.c
index 8775d15a4f..c43c1d7b48 100644
--- a/libavcodec/allcodecs.c
+++ b/libavcodec/allcodecs.c
@@ -844,6 +844,7 @@  extern const FFCodec ff_av1_nvenc_encoder;
 extern const FFCodec ff_av1_qsv_decoder;
 extern const FFCodec ff_av1_qsv_encoder;
 extern const FFCodec ff_av1_amf_encoder;
+extern const FFCodec ff_av1_vaapi_encoder;
 extern const FFCodec ff_libopenh264_encoder;
 extern const FFCodec ff_libopenh264_decoder;
 extern const FFCodec ff_h264_amf_encoder;
diff --git a/libavcodec/vaapi_encode.c b/libavcodec/vaapi_encode.c
index 2604f12b9e..2907e159fb 100644
--- a/libavcodec/vaapi_encode.c
+++ b/libavcodec/vaapi_encode.c
@@ -669,6 +669,15 @@  static int vaapi_encode_set_output_timestamp(AVCodecContext *avctx,
 {
     VAAPIEncodeContext *ctx = avctx->priv_data;
 
+    // AV1 packs P frame and next B frame into one pkt, and uses the other
+    // repeat frame header pkt at the display order position of the P frame
+    // to indicate its frame index. Each frame has a corresponding pkt in its
+    // display order position. So don't need to consider delay for AV1 timestamp.
+    if (avctx->codec_id == AV_CODEC_ID_AV1) {
+        pkt->dts = pkt->pts - ctx->dts_pts_diff;
+        return 0;
+    }
+
     if (ctx->output_delay == 0) {
         pkt->dts = pkt->pts;
     } else if (pic->encode_order < ctx->decode_delay) {
@@ -689,9 +698,10 @@  static int vaapi_encode_output(AVCodecContext *avctx,
 {
     VAAPIEncodeContext *ctx = avctx->priv_data;
     VACodedBufferSegment *buf_list, *buf;
-    VAStatus vas;
+    AVPacket *pkt_ptr = pkt;
     int total_size = 0;
     uint8_t *ptr;
+    VAStatus vas;
     int err;
 
     err = vaapi_encode_wait(avctx, pic);
@@ -711,11 +721,52 @@  static int vaapi_encode_output(AVCodecContext *avctx,
     for (buf = buf_list; buf; buf = buf->next)
         total_size += buf->size;
 
-    err = ff_get_encode_buffer(avctx, pkt, total_size, 0);
-    ptr = pkt->data;
+    /** repack av1 coded frame for not display and repeat frames */
+    if (avctx->codec_id == AV_CODEC_ID_AV1) {
+        int display_frame = pic->display_order <= pic->encode_order;
 
-    if (err < 0)
-        goto fail_mapped;
+        if (display_frame) {
+            total_size += ctx->header_data_size;
+            err = ff_get_encode_buffer(avctx, pkt, total_size, 0);
+            if (err < 0)
+                goto fail_mapped;
+            ptr = pkt->data;
+
+            if (ctx->header_data_size) {
+                memcpy(ptr, ctx->header_data, ctx->header_data_size);
+                ptr += ctx->header_data_size;
+                ctx->header_data_size = 0;
+            }
+        } else {
+            ctx->header_data = av_realloc(ctx->header_data, total_size);
+            if (!ctx->header_data) {
+                err = AVERROR(ENOMEM);
+                goto fail_mapped;
+            }
+            ptr = ctx->header_data;
+            ctx->header_data_size = total_size;
+
+            if (pic->tail_size) {
+                if (ctx->tail_pkt->size) {
+                    err = AVERROR(AVERROR_BUG);
+                    goto fail_mapped;
+                }
+
+                err = ff_get_encode_buffer(avctx, ctx->tail_pkt, pic->tail_size, 0);
+                if (err < 0)
+                    goto fail_mapped;
+
+                memcpy(ctx->tail_pkt->data, pic->tail_data, pic->tail_size);
+                pkt_ptr = ctx->tail_pkt;
+            }
+        }
+    } else {
+        err = ff_get_encode_buffer(avctx, pkt, total_size, 0);
+        ptr = pkt->data;
+
+        if (err < 0)
+            goto fail_mapped;
+    }
 
     for (buf = buf_list; buf; buf = buf->next) {
         av_log(avctx, AV_LOG_DEBUG, "Output buffer: %u bytes "
@@ -726,10 +777,10 @@  static int vaapi_encode_output(AVCodecContext *avctx,
     }
 
     if (pic->type == PICTURE_TYPE_IDR)
-        pkt->flags |= AV_PKT_FLAG_KEY;
+        pkt_ptr->flags |= AV_PKT_FLAG_KEY;
 
-    pkt->pts = pic->pts;
-    pkt->duration = pic->duration;
+    pkt_ptr->pts = pic->pts;
+    pkt_ptr->duration = pic->duration;
 
     vas = vaUnmapBuffer(ctx->hwctx->display, pic->output_buffer);
     if (vas != VA_STATUS_SUCCESS) {
@@ -742,8 +793,8 @@  static int vaapi_encode_output(AVCodecContext *avctx,
     // for no-delay encoders this is handled in generic codec
     if (avctx->codec->capabilities & AV_CODEC_CAP_DELAY &&
         avctx->flags & AV_CODEC_FLAG_COPY_OPAQUE) {
-        pkt->opaque     = pic->opaque;
-        pkt->opaque_ref = pic->opaque_ref;
+        pkt_ptr->opaque     = pic->opaque;
+        pkt_ptr->opaque_ref = pic->opaque_ref;
         pic->opaque_ref = NULL;
     }
 
@@ -752,6 +803,9 @@  static int vaapi_encode_output(AVCodecContext *avctx,
 
     av_log(avctx, AV_LOG_DEBUG, "Output read for pic %"PRId64"/%"PRId64".\n",
            pic->display_order, pic->encode_order);
+
+    vaapi_encode_set_output_timestamp(avctx, pic, pkt_ptr);
+
     return 0;
 
 fail_mapped:
@@ -1128,9 +1182,19 @@  static int vaapi_encode_pick_next(AVCodecContext *avctx,
 
     vaapi_encode_add_ref(avctx, pic, pic, 0, 1, 0);
     if (pic->type != PICTURE_TYPE_IDR) {
-        vaapi_encode_add_ref(avctx, pic, start,
-                             pic->type == PICTURE_TYPE_P,
-                             b_counter > 0, 0);
+        // TODO: apply both previous and forward multi reference for all vaapi encoders.
+        // And L0/L1 reference frame number can be set dynamically through query
+        // VAConfigAttribEncMaxRefFrames attribute.
+        if (avctx->codec_id == AV_CODEC_ID_AV1) {
+            for (i = 0; i < ctx->nb_next_prev; i++)
+                vaapi_encode_add_ref(avctx, pic, ctx->next_prev[i],
+                                     pic->type == PICTURE_TYPE_P,
+                                     b_counter > 0, 0);
+        } else
+            vaapi_encode_add_ref(avctx, pic, start,
+                                 pic->type == PICTURE_TYPE_P,
+                                 b_counter > 0, 0);
+
         vaapi_encode_add_ref(avctx, pic, ctx->next_prev[ctx->nb_next_prev - 1], 0, 0, 1);
     }
 
@@ -1292,6 +1356,19 @@  int ff_vaapi_encode_receive_packet(AVCodecContext *avctx, AVPacket *pkt)
     AVFrame *frame = ctx->frame;
     int err;
 
+start:
+    /** if no B frame before repeat P frame, sent repeat P frame out. */
+    if (avctx->codec_id == AV_CODEC_ID_AV1 && ctx->tail_pkt->size) {
+        for (VAAPIEncodePicture *tmp = ctx->pic_start; tmp; tmp = tmp->next) {
+            if (tmp->type == PICTURE_TYPE_B && tmp->pts < ctx->tail_pkt->pts)
+                break;
+            else if (!tmp->next) {
+                av_packet_move_ref(pkt, ctx->tail_pkt);
+                goto end;
+            }
+        }
+    }
+
     err = ff_encode_get_frame(avctx, frame);
     if (err < 0 && err != AVERROR_EOF)
         return err;
@@ -1356,17 +1433,21 @@  int ff_vaapi_encode_receive_packet(AVCodecContext *avctx, AVPacket *pkt)
         return err;
     }
 
-    vaapi_encode_set_output_timestamp(avctx, pic, pkt);
-    av_log(avctx, AV_LOG_DEBUG, "Output packet: pts %"PRId64", dts %"PRId64", "
-           "size %u bytes.\n", pkt->pts, pkt->dts, pkt->size);
-
     ctx->output_order = pic->encode_order;
     vaapi_encode_clear_old(avctx);
 
+    /** Loop until an available packet is produced while flushing the encoder. */
+    if (ctx->end_of_stream && !pkt->size)
+        goto start;
+
+end:
+    if (pkt->size)
+        av_log(avctx, AV_LOG_DEBUG, "Output packet: pts %"PRId64", dts %"PRId64", "
+               "size %u bytes.\n", pkt->pts, pkt->dts, pkt->size);
+
     return 0;
 }
 
-
 static av_cold void vaapi_encode_add_global_param(AVCodecContext *avctx, int type,
                                                   void *buffer, size_t size)
 {
@@ -2667,6 +2748,12 @@  av_cold int ff_vaapi_encode_init(AVCodecContext *avctx)
     ctx->device = (AVHWDeviceContext*)ctx->device_ref->data;
     ctx->hwctx = ctx->device->hwctx;
 
+    ctx->tail_pkt = av_packet_alloc();
+    if (!ctx->tail_pkt) {
+        err = AVERROR(ENOMEM);
+        goto fail;
+    }
+
     err = vaapi_encode_profile_entrypoint(avctx);
     if (err < 0)
         goto fail;
@@ -2859,9 +2946,11 @@  av_cold int ff_vaapi_encode_close(AVCodecContext *avctx)
     }
 
     av_frame_free(&ctx->frame);
+    av_packet_free(&ctx->tail_pkt);
 
     av_freep(&ctx->codec_sequence_params);
     av_freep(&ctx->codec_picture_params);
+    av_freep(&ctx->header_data);
     av_fifo_freep2(&ctx->encode_fifo);
 
     av_buffer_unref(&ctx->recon_frames_ref);
diff --git a/libavcodec/vaapi_encode.h b/libavcodec/vaapi_encode.h
index d5452a37b3..03df8d6d46 100644
--- a/libavcodec/vaapi_encode.h
+++ b/libavcodec/vaapi_encode.h
@@ -133,6 +133,11 @@  typedef struct VAAPIEncodePicture {
 
     int          nb_slices;
     VAAPIEncodeSlice *slices;
+
+    /** Tail data of the current picture, used only for the AV1 repeat frame header. */
+    char tail_data[MAX_PARAM_BUFFER_SIZE];
+    /** Byte length of tail_data. */
+    size_t tail_size;
 } VAAPIEncodePicture;
 
 typedef struct VAAPIEncodeProfile {
@@ -367,6 +372,13 @@  typedef struct VAAPIEncodeContext {
     AVFifo          *encode_fifo;
     // Max number of frame buffered in encoder.
     int             async_depth;
+
+    /** Header data for the current output packet, used only for AV1. */
+    void  *header_data;
+    size_t header_data_size;
+
+    /** Stores the AV1 repeat frame header packet. */
+    AVPacket *tail_pkt;
 } VAAPIEncodeContext;
 
 enum {
diff --git a/libavcodec/vaapi_encode_av1.c b/libavcodec/vaapi_encode_av1.c
new file mode 100644
index 0000000000..4c8a9d995d
--- /dev/null
+++ b/libavcodec/vaapi_encode_av1.c
@@ -0,0 +1,1229 @@ 
+/*
+ * Copyright (c) 2023 Intel Corporation
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include <va/va.h>
+#include <va/va_enc_av1.h>
+
+#include "libavutil/pixdesc.h"
+#include "libavutil/opt.h"
+
+#include "cbs_av1.h"
+#include "put_bits.h"
+#include "codec_internal.h"
+#include "av1_levels.h"
+#include "vaapi_encode.h"
+
+#define AV1_MAX_QUANT 255
+
+typedef struct VAAPIEncodeAV1Picture {
+    int64_t last_idr_frame;
+    int slot;
+} VAAPIEncodeAV1Picture;
+
+typedef struct VAAPIEncodeAV1Context {
+    VAAPIEncodeContext common;
+    AV1RawOBU sh; /**< sequence header.*/
+    AV1RawOBU fh; /**< frame header.*/
+    CodedBitstreamContext *cbc;
+    CodedBitstreamFragment current_obu;
+    VAConfigAttribValEncAV1 attr;
+    VAConfigAttribValEncAV1Ext1 attr_ext1;
+    VAConfigAttribValEncAV1Ext2 attr_ext2;
+
+    char sh_data[MAX_PARAM_BUFFER_SIZE]; /**< coded sequence header data. */
+    size_t sh_data_len; /**< bit length of sh_data. */
+    char fh_data[MAX_PARAM_BUFFER_SIZE]; /**< coded frame header data. */
+    size_t fh_data_len; /**< bit length of fh_data. */
+
+    uint8_t uniform_tile;
+    uint8_t use_128x128_superblock;
+    int sb_cols;
+    int sb_rows;
+    int tile_cols_log2;
+    int tile_rows_log2;
+    int max_tile_width_sb;
+    int max_tile_height_sb;
+    uint8_t width_in_sbs_minus_1[AV1_MAX_TILE_COLS];
+    uint8_t height_in_sbs_minus_1[AV1_MAX_TILE_ROWS];
+
+    int min_log2_tile_cols;
+    int max_log2_tile_cols;
+    int min_log2_tile_rows;
+    int max_log2_tile_rows;
+
+    int q_idx_idr;
+    int q_idx_p;
+    int q_idx_b;
+
+    /** user options */
+    int profile;
+    int level;
+    int tier;
+    int tile_cols, tile_rows;
+    int tile_groups;
+} VAAPIEncodeAV1Context;
+
+static av_cold int vaapi_encode_av1_configure(AVCodecContext *avctx)
+{
+    VAAPIEncodeContext     *ctx = avctx->priv_data;
+    VAAPIEncodeAV1Context *priv = avctx->priv_data;
+    int ret;
+
+    ret = ff_cbs_init(&priv->cbc, AV_CODEC_ID_AV1, avctx);
+    if (ret < 0)
+        return ret;
+
+    if (ctx->rc_mode->quality) {
+        priv->q_idx_p = av_clip(ctx->rc_quality, 0, AV1_MAX_QUANT);
+        if (fabs(avctx->i_quant_factor) > 0.0)
+            priv->q_idx_idr =
+                av_clip((fabs(avctx->i_quant_factor) * priv->q_idx_p  +
+                         avctx->i_quant_offset) + 0.5,
+                        0, AV1_MAX_QUANT);
+        else
+            priv->q_idx_idr = priv->q_idx_p;
+
+        if (fabs(avctx->b_quant_factor) > 0.0)
+            priv->q_idx_b =
+                av_clip((fabs(avctx->b_quant_factor) * priv->q_idx_p  +
+                         avctx->b_quant_offset) + 0.5,
+                        0, AV1_MAX_QUANT);
+        else
+            priv->q_idx_b = priv->q_idx_p;
+    } else {
+        /** Arbitrary value */
+        priv->q_idx_idr = priv->q_idx_p = priv->q_idx_b = 128;
+    }
+
+    return 0;
+}
+
+static int vaapi_encode_av1_add_obu(AVCodecContext *avctx,
+                                    CodedBitstreamFragment *au,
+                                    uint8_t type,
+                                    void *obu_unit)
+{
+    int ret;
+
+    ret = ff_cbs_insert_unit_content(au, -1,
+                                     type, obu_unit, NULL);
+    if (ret < 0) {
+        av_log(avctx, AV_LOG_ERROR, "Failed to add OBU unit: "
+               "type = %d.\n", type);
+        return ret;
+    }
+
+    return 0;
+}
+
+static int vaapi_encode_av1_write_obu(AVCodecContext *avctx,
+                                      char *data, size_t *data_len,
+                                      CodedBitstreamFragment *bs)
+{
+    VAAPIEncodeAV1Context *priv = avctx->priv_data;
+    int ret;
+
+    ret = ff_cbs_write_fragment_data(priv->cbc, bs);
+    if (ret < 0) {
+        av_log(avctx, AV_LOG_ERROR, "Failed to write packed header.\n");
+        return ret;
+    }
+
+    if ((size_t)8 * MAX_PARAM_BUFFER_SIZE < 8 * bs->data_size - bs->data_bit_padding) {
+        av_log(avctx, AV_LOG_ERROR, "Access unit too large: "
+               "%zu < %zu.\n", (size_t)8 * MAX_PARAM_BUFFER_SIZE,
+               8 * bs->data_size - bs->data_bit_padding);
+        return AVERROR(ENOSPC);
+    }
+
+    memcpy(data, bs->data, bs->data_size);
+    *data_len = 8 * bs->data_size - bs->data_bit_padding;
+
+    return 0;
+}
+
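+/**
+ * Signed distance between two order hints, following get_relative_dist() in
+ * the AV1 specification. Returns 0 when order hints are disabled.
+ */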
+static int get_relative_dist(const AV1RawSequenceHeader *seq,
+                             unsigned int a, unsigned int b)
+{
+    unsigned int diff, m;
+    if (!seq->enable_order_hint)
+        return 0;
+    diff = a - b;
+    m = 1 << seq->order_hint_bits_minus_1;
+    diff = (diff & (m - 1)) - (diff & m);
+    return diff;
+}
+
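+/**
+ * Write a value with the ns(n) descriptor from the AV1 specification, as used
+ * for the explicit tile size fields in tile_info().
+ */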
+static int write_ns(PutBitContext *pbc, uint32_t n, uint32_t value)
+{
+    uint32_t w, m, v, extra_bit;
+
+    w = av_log2(n) + 1;
+    m = (1 << w) - n;
+
+    if (value < m) {
+        v = value;
+        put_bits(pbc, w - 1, v);
+    } else {
+        v = m + ((value - m) >> 1);
+        extra_bit = (value - m) & 1;
+        put_bits(pbc, w - 1, v);
+        put_bits(pbc, 1, extra_bit);
+    }
+
+    return 0;
+}
+
+/**
+ * This function provides a minimal implementation that covers only the
+ * features currently enabled in the frame header. If more features are
+ * enabled in the future, make sure their flags are packed correctly into
+ * the frame header OBU here as well.
+ */
+static int vaapi_encode_av1_write_frame_header(AVCodecContext *avctx,
+                                               VAAPIEncodePicture *pic,
+                                               char *data, size_t *data_len)
+{
+    VAAPIEncodeContext              *ctx = avctx->priv_data;
+    VAAPIEncodeAV1Context          *priv = avctx->priv_data;
+    AV1RawOBU                    *fh_obu = &priv->fh;
+    AV1RawOBU                    *sh_obu = &priv->sh;
+    AV1RawFrameHeader                *fh = &fh_obu->obu.frame.header;
+    AV1RawSequenceHeader             *sh = &sh_obu->obu.sequence_header;
+    VAEncPictureParameterBufferAV1 *vpic = pic->codec_picture_params;
+    PutBitContext pbc, pbc_tmp;
+    uint8_t byte;
+    int qindex, coded_lossless;
+    int id_len, frame_is_intra, skip_mode_allowed;
+    int start, payload_bits, obu_size, obu_size_len;
+    int qindex_offset, loopfilter_offset;
+    int cdef_start_offset, cdef_end_offset;
+    int i;
+
+    init_put_bits(&pbc, data, MAX_PARAM_BUFFER_SIZE);
+
+    /** obu header */
+    put_bits(&pbc, 1, fh_obu->header.obu_forbidden_bit);
+    put_bits(&pbc, 4, fh_obu->header.obu_type);
+    put_bits(&pbc, 1, fh_obu->header.obu_extension_flag);
+    put_bits(&pbc, 1, fh_obu->header.obu_has_size_field);
+    put_bits(&pbc, 1, fh_obu->header.obu_reserved_1bit);
+
+    /** Record the pbc state so the obu size can be rewritten later. */
+    if (fh_obu->header.obu_has_size_field) {
+        pbc_tmp = pbc;
+        put_bits32(&pbc, 0);
+        put_bits32(&pbc, 0);
+    }
+
+    start = put_bits_count(&pbc);
+
+    /** uncompressed_header() */
+    if (sh->frame_id_numbers_present_flag)
+        id_len = sh->additional_frame_id_length_minus_1 +
+                 sh->delta_frame_id_length_minus_2 + 3;
+
+    frame_is_intra = (fh->frame_type == AV1_FRAME_KEY ||
+                      fh->frame_type == AV1_FRAME_INTRA_ONLY);
+    if (!sh->reduced_still_picture_header) {
+        put_bits(&pbc, 1, fh->show_existing_frame);
+        if (fh->show_existing_frame) {
+            put_bits(&pbc, 3, fh->frame_to_show_map_idx);
+            goto trailing_bits;
+        }
+        if (sh->frame_id_numbers_present_flag)
+            put_bits(&pbc, id_len, fh->display_frame_id);
+
+        put_bits(&pbc, 2, fh->frame_type);
+        put_bits(&pbc, 1, fh->show_frame);
+        if (!fh->show_frame)
+            put_bits(&pbc, 1, fh->showable_frame);
+
+        if (!(fh->frame_type == AV1_FRAME_SWITCH ||
+            (fh->frame_type == AV1_FRAME_KEY && fh->show_frame)))
+            put_bits(&pbc, 1, fh->error_resilient_mode);
+    }
+
+    put_bits(&pbc, 1, fh->disable_cdf_update);
+    if (sh->seq_force_screen_content_tools == AV1_SELECT_SCREEN_CONTENT_TOOLS)
+        put_bits(&pbc, 1, fh->allow_screen_content_tools);
+
+    if (fh->allow_screen_content_tools && sh->seq_force_integer_mv == AV1_SELECT_INTEGER_MV)
+        put_bits(&pbc, 1, fh->force_integer_mv);
+
+    if (sh->frame_id_numbers_present_flag)
+        put_bits(&pbc, id_len, fh->current_frame_id);
+
+    if (fh->frame_type != AV1_FRAME_SWITCH && !sh->reduced_still_picture_header)
+        put_bits(&pbc, 1, fh->frame_size_override_flag);
+
+    if (sh->enable_order_hint)
+        put_bits(&pbc, sh->order_hint_bits_minus_1 + 1, fh->order_hint);
+
+    if (!(frame_is_intra || fh->error_resilient_mode))
+        put_bits(&pbc, 3, fh->primary_ref_frame);
+
+    if (!(fh->frame_type == AV1_FRAME_SWITCH ||
+        fh->frame_type == AV1_FRAME_KEY && fh->show_frame))
+        put_bits(&pbc, 8, fh->refresh_frame_flags);
+
+    if (frame_is_intra) {
+        /** render_size() */
+         put_bits(&pbc, 1, fh->render_and_frame_size_different);
+    } else {
+        if (!frame_is_intra && sh->enable_order_hint)
+            put_bits(&pbc, 1, fh->frame_refs_short_signaling);
+
+        for (i = 0; i < AV1_REFS_PER_FRAME; i++) {
+            if (!fh->frame_refs_short_signaling)
+                put_bits(&pbc, 3, fh->ref_frame_idx[i]);
+        }
+
+        if (!(fh->frame_size_override_flag && !fh->error_resilient_mode))
+            put_bits(&pbc, 1, fh->render_and_frame_size_different);
+
+        if (!fh->force_integer_mv)
+            put_bits(&pbc, 1, fh->allow_high_precision_mv);
+
+        /** read_interpolation_filter() */
+        put_bits(&pbc, 1, fh->is_filter_switchable);
+        if (!fh->is_filter_switchable)
+            put_bits(&pbc, 2, fh->interpolation_filter);
+
+        put_bits(&pbc, 1, fh->is_motion_mode_switchable);
+    }
+
+    if (!(sh->reduced_still_picture_header || fh->disable_cdf_update))
+        put_bits(&pbc, 1, fh->disable_frame_end_update_cdf);
+
+    /** tile_info() */
+    put_bits(&pbc, 1, fh->uniform_tile_spacing_flag);
+    if (fh->uniform_tile_spacing_flag) {
+        for (i = 0; i < priv->tile_cols_log2 - priv->min_log2_tile_cols; i++) {
+            put_bits(&pbc, 1, 1);
+        }
+        if (priv->tile_cols_log2 < priv->max_log2_tile_cols)
+            put_bits(&pbc, 1, 0);
+
+        for (i = 0; i < priv->tile_rows_log2 - priv->min_log2_tile_rows; i++) {
+            put_bits(&pbc, 1, 1);
+        }
+        if (priv->tile_rows_log2 < priv->max_log2_tile_rows)
+            put_bits(&pbc, 1, 0);
+    } else {
+        int start_sb = 0;
+        int max_width, max_height;
+        for (i = 0; start_sb < priv->sb_cols; i++) {
+            max_width = FFMIN(priv->sb_cols - start_sb, priv->max_tile_width_sb);
+            write_ns(&pbc, max_width, fh->width_in_sbs_minus_1[i]);
+            start_sb += fh->width_in_sbs_minus_1[i] + 1;
+        }
+
+        start_sb = 0;
+        for (i = 0; start_sb < priv->sb_rows; i++) {
+            max_height = FFMIN(priv->sb_rows - start_sb, priv->max_tile_height_sb);
+            write_ns(&pbc, max_height, fh->height_in_sbs_minus_1[i]);
+            start_sb += fh->height_in_sbs_minus_1[i] + 1;
+        }
+    }
+
+    if (priv->tile_cols_log2 || priv->tile_rows_log2) {
+        put_bits(&pbc, priv->tile_cols_log2 + priv->tile_rows_log2, fh->context_update_tile_id);
+        put_bits(&pbc, 2, fh->tile_size_bytes_minus1);
+    }
+
+    qindex_offset = put_bits_count(&pbc);
+    /** quantization_params() */
+    put_bits(&pbc, 8, fh->base_q_idx);
+    put_bits(&pbc, 1, fh->delta_q_y_dc);
+    put_bits(&pbc, 1, fh->delta_q_u_dc);
+    put_bits(&pbc, 1, fh->delta_q_u_ac);
+    put_bits(&pbc, 1, fh->using_qmatrix);
+
+    /** segmentation_params() */
+    put_bits(&pbc, 1, fh->segmentation_enabled);
+
+    /** delta_q_params() */
+    if (fh->base_q_idx)
+        put_bits(&pbc, 1, fh->delta_q_present);
+    if (fh->delta_q_present)
+        put_bits(&pbc, 2, fh->delta_q_res);
+
+    /** delta_lf_params() */
+    if (fh->delta_q_present) {
+        if (!fh->allow_intrabc)
+            put_bits(&pbc, 1, fh->delta_lf_present);
+        if (fh->delta_lf_present) {
+            put_bits(&pbc, 2, fh->delta_lf_res);
+            put_bits(&pbc, 1, fh->delta_lf_multi);
+        }
+    }
+
+    /** codelossless */
+    coded_lossless = 1;
+    for (i = 0; i < AV1_MAX_SEGMENTS; i++) {
+        if (fh->segmentation_enabled && fh->feature_enabled[i][AV1_SEG_LVL_ALT_Q])
+            qindex = fh->base_q_idx + fh->feature_value[i][AV1_SEG_LVL_ALT_Q];
+        else
+            qindex = fh->base_q_idx;
+        qindex = av_clip_uintp2(qindex, 8);
+
+        if (qindex || fh->delta_q_y_dc || fh->delta_q_u_ac || fh->delta_q_u_dc ||
+            fh->delta_q_v_ac || fh->delta_q_v_dc)
+            coded_lossless = 0;
+    }
+
+    loopfilter_offset = put_bits_count(&pbc);
+    /** loop_filter_params() */
+    if (!(coded_lossless || fh->allow_intrabc)) {
+        put_bits(&pbc, 6, fh->loop_filter_level[0]);
+        put_bits(&pbc, 6, fh->loop_filter_level[1]);
+        if (fh->loop_filter_level[0] || fh->loop_filter_level[1]) {
+            put_bits(&pbc, 6, fh->loop_filter_level[2]);
+            put_bits(&pbc, 6, fh->loop_filter_level[3]);
+        }
+        put_bits(&pbc, 3, fh->loop_filter_sharpness);
+        put_bits(&pbc, 1, fh->loop_filter_delta_enabled);
+    }
+
+    cdef_start_offset = put_bits_count(&pbc);
+    /** cdef_params() */
+    if (!(coded_lossless || fh->allow_intrabc || !sh->enable_cdef)) {
+        put_bits(&pbc, 2, fh->cdef_damping_minus_3);
+        put_bits(&pbc, 2, fh->cdef_bits);
+        for (i = 0; i < (1 << fh->cdef_bits); i++) {
+            put_bits(&pbc, 4, fh->cdef_y_pri_strength[i]);
+            put_bits(&pbc, 2, fh->cdef_y_sec_strength[i]);
+            put_bits(&pbc, 4, fh->cdef_uv_pri_strength[i]);
+            put_bits(&pbc, 2, fh->cdef_uv_sec_strength[i]);
+        }
+    }
+    cdef_end_offset = put_bits_count(&pbc);
+
+    /** read_tx_mode() */
+    if (fh->tx_mode == AV1_TX_MODE_SELECT)
+        put_bits(&pbc, 1, 1);
+    else
+        put_bits(&pbc, 1, 0);
+
+    /** frame_reference_mode() */
+    if (!frame_is_intra)
+        put_bits(&pbc, 1, fh->reference_select);
+
+    /** skip_mode_params() */
+    if (frame_is_intra || !fh->reference_select || !sh->enable_order_hint)
+        skip_mode_allowed = 0;
+    else {
+        int forward_idx, backward_idx;
+        int ref_hint, forward_hint, backward_hint;
+
+        forward_idx = -1;
+        backward_idx = -1;
+
+        for (i = 0; i < AV1_REFS_PER_FRAME; i++) {
+            ref_hint = fh->ref_order_hint[fh->ref_frame_idx[i]];
+            if (get_relative_dist(sh, ref_hint, fh->order_hint) < 0) {
+                if (forward_idx < 0 || get_relative_dist(sh, ref_hint, forward_idx) > 0) {
+                    forward_idx = i;
+                    forward_hint = ref_hint;
+                }
+            } else if (get_relative_dist(sh, ref_hint, fh->order_hint) > 0) {
+                if (backward_idx < 0 || get_relative_dist(sh, ref_hint, backward_hint) < 0) {
+                    backward_idx = i;
+                    backward_hint = ref_hint;
+                }
+            }
+        }
+        if (forward_idx < 0)
+            skip_mode_allowed = 0;
+        else if (backward_idx >= 0)
+            skip_mode_allowed = 1;
+        else {
+            int second_forward_idx, second_forward_hint;
+            second_forward_idx = -1;
+            for (i = 0; i < AV1_REFS_PER_FRAME; i++) {
+                ref_hint = fh->ref_order_hint[fh->ref_frame_idx[i]];
+                if (get_relative_dist(sh, ref_hint, forward_hint) < 0) {
+                    if (second_forward_idx < 0 || get_relative_dist(sh, ref_hint, second_forward_hint) > 0){
+                        second_forward_idx = i;
+                        second_forward_hint = ref_hint;
+                    }
+                }
+            }
+            if (second_forward_idx < 0)
+                skip_mode_allowed = 0;
+            else
+                skip_mode_allowed = 1;
+        }
+    }
+
+    if (skip_mode_allowed)
+        put_bits(&pbc, 1, fh->skip_mode_present);
+
+    put_bits(&pbc, 1, fh->reduced_tx_set);
+
+    /** global_motion_params() */
+    if (!frame_is_intra) {
+        for (i = AV1_REF_FRAME_LAST; i <= AV1_REF_FRAME_ALTREF; i++) {
+            put_bits(&pbc, 1, fh->is_global[i]);
+            if (fh->is_global[i]) {
+                put_bits(&pbc, 1, fh->is_rot_zoom[i]);
+                if (!fh->is_rot_zoom[i])
+                    put_bits(&pbc, 1, fh->is_translation[i]);
+            }
+        }
+    }
+
+trailing_bits:
+    payload_bits = put_bits_count(&pbc) - start;
+
+    /** trailing_bits() */
+    put_bits(&pbc, 1, 1);
+    obu_size = (put_bits_count(&pbc) - start + 7) / 8;
+    for (i = 0; i < obu_size * 8 - payload_bits - 1; i++)
+        put_bits(&pbc, 1, 0);
+
+    flush_put_bits(&pbc);
+    *data_len = put_bits_count(&pbc);
+
+    /** update obu size in bitstream */
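+    /** The 8 bytes reserved above are overwritten with a leb128 obu_size of
+     *  obu_size_bytes_minus1 + 1 bytes, then the payload is moved down to
+     *  close the remaining gap. */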
+    if (fh_obu->header.obu_has_size_field) {
+        obu_size_len = priv->attr_ext2.bits.obu_size_bytes_minus1 + 1;
+        for (i = 0; i < obu_size_len; i++) {
+            byte = obu_size >> (7 * i) & 0x7f;
+            if (i < obu_size_len - 1)
+                byte |= 0x80;
+            put_bits(&pbc_tmp, 8, byte);
+        }
+        flush_put_bits(&pbc_tmp);
+        memmove(pbc_tmp.buf_ptr, pbc_tmp.buf_ptr + (8 - obu_size_len), obu_size);
+        *data_len -= (8 - obu_size_len) * 8;
+    }
+
+    if (fh->show_existing_frame)
+        return 0;
+
+    if (!(ctx->va_rc_mode & VA_RC_CQP)) {
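+        /** Pass the driver the bit offsets of the qindex, loop filter and CDEF
+         *  fields so its rate control can update them in the packed header. */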
+        vpic->min_base_qindex = av_clip(avctx->qmin, 1, AV1_MAX_QUANT);
+        vpic->max_base_qindex = av_clip(avctx->qmax, 1, AV1_MAX_QUANT);
+
+        vpic->bit_offset_qindex            = qindex_offset - (8 - obu_size_len) * 8;
+        vpic->bit_offset_loopfilter_params = loopfilter_offset - (8 - obu_size_len) * 8;
+        vpic->bit_offset_cdef_params       = cdef_start_offset - (8 - obu_size_len) * 8;
+        vpic->size_in_bits_cdef_params     = cdef_end_offset - cdef_start_offset;
+        vpic->size_in_bits_frame_hdr_obu   = *data_len;
+
+        vpic->byte_offset_frame_hdr_obu_size = (((pic->type == PICTURE_TYPE_IDR) ?
+                                               priv->sh_data_len / 8 : 0) +
+                                               (fh_obu->header.obu_extension_flag ?
+                                               2 : 1));
+    }
+
+    return 0;
+}
+
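+/** Smallest k such that (blkSize << k) >= target, matching tile_log2() in the AV1 spec. */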
+static int tile_log2(int blkSize, int target)
+{
+    int k;
+    for (k = 0; (blkSize << k) < target; k++);
+    return k;
+}
+
+static int vaapi_encode_av1_set_tile(AVCodecContext *avctx)
+{
+    VAAPIEncodeAV1Context *priv = avctx->priv_data;
+    int mi_cols, mi_rows, sb_shift, sb_size;
+    int max_tile_area_sb, max_tile_area_sb_varied;
+    int tile_width_sb, tile_height_sb, widest_tile_sb;
+    int min_log2_tiles;
+    int tile_rows_tmp, i;
+
+    if (priv->tile_cols > AV1_MAX_TILE_COLS ||
+        priv->tile_rows > AV1_MAX_TILE_ROWS) {
+        av_log(avctx, AV_LOG_ERROR, "Invalid tile number %dx%d, should less than %dx%d.\n",
+               priv->tile_cols, priv->tile_rows, AV1_MAX_TILE_COLS, AV1_MAX_TILE_ROWS);
+        return AVERROR(EINVAL);
+    }
+
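+    /** Frame size in 4x4 mode-info units, then in superblocks of 64x64 or
+     *  128x128 luma samples depending on use_128x128_superblock. */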
+    mi_cols = 2 * ((avctx->width + 7) >> 3);
+    mi_rows = 2 * ((avctx->height + 7) >> 3);
+    priv->sb_cols = priv->use_128x128_superblock ?
+                    ((mi_cols + 31) >> 5) : ((mi_cols + 15) >> 4);
+    priv->sb_rows = priv->use_128x128_superblock ?
+                    ((mi_rows + 31) >> 5) : ((mi_rows + 15) >> 4);
+    sb_shift = priv->use_128x128_superblock ? 5 : 4;
+    sb_size  = sb_shift + 2;
+    priv->max_tile_width_sb = AV1_MAX_TILE_WIDTH >> sb_size;
+    max_tile_area_sb = AV1_MAX_TILE_AREA  >> (2 * sb_size);
+
+    priv->min_log2_tile_cols = tile_log2(priv->max_tile_width_sb, priv->sb_cols);
+    priv->max_log2_tile_cols = tile_log2(1, FFMIN(priv->sb_cols, AV1_MAX_TILE_COLS));
+    priv->max_log2_tile_rows = tile_log2(1, FFMIN(priv->sb_rows, AV1_MAX_TILE_ROWS));
+    min_log2_tiles = FFMAX(priv->min_log2_tile_cols,
+                           tile_log2(max_tile_area_sb, priv->sb_rows * priv->sb_cols));
+
+    if (priv->tile_cols != av_clip(priv->tile_cols, (priv->sb_cols + priv->max_tile_width_sb - 1) / priv->max_tile_width_sb, priv->sb_cols)) {
+        priv->tile_cols = av_clip(priv->tile_cols, (priv->sb_cols + priv->max_tile_width_sb - 1) / priv->max_tile_width_sb, priv->sb_cols);
+        av_log(avctx, AV_LOG_WARNING, "Invalid tile cols, correct to %d.\n", priv->tile_cols);
+    }
+
+    priv->tile_cols_log2 = tile_log2(1, priv->tile_cols);
+    tile_width_sb = (priv->sb_cols + (1 << priv->tile_cols_log2) - 1) >>
+                    priv->tile_cols_log2;
+
+    if (priv->tile_rows > priv->sb_rows) {
+        priv->tile_rows = priv->sb_rows;
+        av_log(avctx, AV_LOG_WARNING, "Invalid tile rows, correct to %d.\n", priv->tile_rows);
+    }
+
+    for (tile_rows_tmp = priv->tile_rows; tile_rows_tmp <= priv->sb_rows && tile_rows_tmp <= AV1_MAX_TILE_ROWS; tile_rows_tmp++) {
+        /** Try uniform tile spacing. */
+        priv->tile_rows_log2 = tile_log2(1, tile_rows_tmp);
+        if ((priv->sb_cols + tile_width_sb - 1) / tile_width_sb == priv->tile_cols) {
+            for (i = 0; i < priv->tile_cols - 1; i++)
+                priv->width_in_sbs_minus_1[i] = tile_width_sb - 1;
+            priv->width_in_sbs_minus_1[i] = priv->sb_cols - (priv->tile_cols - 1) * tile_width_sb - 1;
+
+            tile_height_sb = (priv->sb_rows + (1 << priv->tile_rows_log2) - 1) >>
+                             priv->tile_rows_log2;
+
+            if ((priv->sb_rows + tile_height_sb - 1) / tile_height_sb == tile_rows_tmp &&
+                tile_height_sb <= max_tile_area_sb / tile_width_sb) {
+                for (i = 0; i < tile_rows_tmp - 1; i++)
+                    priv->height_in_sbs_minus_1[i] = tile_height_sb - 1;
+                priv->height_in_sbs_minus_1[i] = priv->sb_rows - (tile_rows_tmp - 1) * tile_height_sb - 1;
+
+                priv->uniform_tile = 1;
+                priv->min_log2_tile_rows = FFMAX(min_log2_tiles - priv->tile_cols_log2, 0);
+
+                break;
+            }
+        }
+
+        /** Try non-uniform tile spacing. */
+        widest_tile_sb = 0;
+        for (i = 0; i < priv->tile_cols; i++) {
+            priv->width_in_sbs_minus_1[i] = (i + 1) * priv->sb_cols / priv->tile_cols - i * priv->sb_cols / priv->tile_cols - 1;
+            widest_tile_sb = FFMAX(widest_tile_sb, priv->width_in_sbs_minus_1[i] + 1);
+        }
+
+        if (min_log2_tiles)
+            max_tile_area_sb_varied = (priv->sb_rows * priv->sb_cols) >> (min_log2_tiles + 1);
+        else
+            max_tile_area_sb_varied = priv->sb_rows * priv->sb_cols;
+        priv->max_tile_height_sb = FFMAX(1, max_tile_area_sb_varied / widest_tile_sb);
+
+        if (tile_rows_tmp == av_clip(tile_rows_tmp, (priv->sb_rows + priv->max_tile_height_sb - 1) / priv->max_tile_height_sb, priv->sb_rows)) {
+            for (i = 0; i < tile_rows_tmp; i++)
+                priv->height_in_sbs_minus_1[i] = (i + 1) * priv->sb_rows / tile_rows_tmp - i * priv->sb_rows / tile_rows_tmp - 1;
+
+            break;
+        }
+    }
+
+    if (priv->tile_rows != tile_rows_tmp) {
+        priv->tile_rows = tile_rows_tmp;
+        av_log(avctx, AV_LOG_WARNING, "Invalid tile rows, correct to %d.\n", priv->tile_rows);
+    }
+
+    /** Check whether the tile cols/rows layout is supported by the driver. */
+    if (priv->attr_ext2.bits.max_tile_num_minus1) {
+        if ((priv->tile_cols * priv->tile_rows - 1) > priv->attr_ext2.bits.max_tile_num_minus1) {
+            av_log(avctx, AV_LOG_ERROR, "Unsupported tile num %d * %d = %d by driver, "
+                   "should be less than %d.\n", priv->tile_cols, priv->tile_rows,
+                   priv->tile_cols * priv->tile_rows,
+                   priv->attr_ext2.bits.max_tile_num_minus1 + 1);
+            return AVERROR(EINVAL);
+        }
+    }
+    av_log(avctx, AV_LOG_DEBUG, "Setting tile cols/rows to %d/%d.\n",
+           priv->tile_cols, priv->tile_rows);
+
+    /** Check whether the tile group count is valid. */
+    if (priv->tile_groups > priv->tile_cols * priv->tile_rows) {
+        av_log(avctx, AV_LOG_WARNING, "Invalid tile group count %d, "
+               "corrected to %d.\n", priv->tile_groups, priv->tile_cols * priv->tile_rows);
+        priv->tile_groups = priv->tile_cols * priv->tile_rows;
+    }
+
+    return 0;
+}
+
+static int vaapi_encode_av1_write_sequence_header(AVCodecContext *avctx,
+                                                  char *data, size_t *data_len)
+{
+    VAAPIEncodeAV1Context *priv = avctx->priv_data;
+
+    memcpy(data, &priv->sh_data, MAX_PARAM_BUFFER_SIZE * sizeof(char));
+    *data_len = priv->sh_data_len;
+
+    return 0;
+}
+
+static int vaapi_encode_av1_init_sequence_params(AVCodecContext *avctx)
+{
+    VAAPIEncodeContext               *ctx = avctx->priv_data;
+    VAAPIEncodeAV1Context           *priv = avctx->priv_data;
+    AV1RawOBU                     *sh_obu = &priv->sh;
+    AV1RawSequenceHeader              *sh = &sh_obu->obu.sequence_header;
+    VAEncSequenceParameterBufferAV1 *vseq = ctx->codec_sequence_params;
+    CodedBitstreamFragment           *obu = &priv->current_obu;
+    const AVPixFmtDescriptor *desc;
+    int ret;
+
+    memset(sh_obu, 0, sizeof(*sh_obu));
+    sh_obu->header.obu_type = AV1_OBU_SEQUENCE_HEADER;
+
+    desc = av_pix_fmt_desc_get(priv->common.input_frames->sw_format);
+    av_assert0(desc);
+
+    sh->seq_profile  = avctx->profile;
+    if (!sh->seq_force_screen_content_tools)
+        sh->seq_force_integer_mv = AV1_SELECT_INTEGER_MV;
+    sh->frame_width_bits_minus_1  = av_log2(avctx->width);
+    sh->frame_height_bits_minus_1 = av_log2(avctx->height);
+    sh->max_frame_width_minus_1   = avctx->width - 1;
+    sh->max_frame_height_minus_1  = avctx->height - 1;
+    sh->seq_tier[0]               = priv->tier;
+    /** enable order hint and reserve maximum 8 bits for it by default. */
+    sh->enable_order_hint         = 1;
+    sh->order_hint_bits_minus_1   = 7;
+
+    sh->color_config = (AV1RawColorConfig) {
+        .high_bitdepth                  = desc->comp[0].depth == 8 ? 0 : 1,
+        .color_primaries                = avctx->color_primaries,
+        .transfer_characteristics       = avctx->color_trc,
+        .matrix_coefficients            = avctx->colorspace,
+        .color_description_present_flag = (avctx->color_primaries != AVCOL_PRI_UNSPECIFIED ||
+                                           avctx->color_trc       != AVCOL_TRC_UNSPECIFIED ||
+                                           avctx->colorspace      != AVCOL_SPC_UNSPECIFIED),
+        .color_range                    = avctx->color_range == AVCOL_RANGE_JPEG,
+        .subsampling_x                  = desc->log2_chroma_w,
+        .subsampling_y                  = desc->log2_chroma_h,
+    };
+
+    if (avctx->level != FF_LEVEL_UNKNOWN) {
+        sh->seq_level_idx[0] = avctx->level;
+    } else {
+        const AV1LevelDescriptor *level;
+        float framerate;
+
+        if (avctx->framerate.num > 0 && avctx->framerate.den > 0)
+            framerate = (float)avctx->framerate.num / avctx->framerate.den;
+        else
+            framerate = 0;
+
+        level = ff_av1_guess_level(avctx->bit_rate, priv->tier,
+                                   ctx->surface_width, ctx->surface_height,
+                                   priv->tile_rows * priv->tile_cols,
+                                   priv->tile_cols, framerate);
+        if (level) {
+            av_log(avctx, AV_LOG_VERBOSE, "Using level %s.\n", level->name);
+            sh->seq_level_idx[0] = level->level_idx;
+        } else {
+            av_log(avctx, AV_LOG_VERBOSE, "Stream will not conform to "
+                   "any normal level, using level 6.3 by default.\n");
+            sh->seq_level_idx[0] = 19;
+            sh->seq_tier[0] = 1;
+        }
+    }
+    vseq->seq_profile             = sh->seq_profile;
+    vseq->seq_level_idx           = sh->seq_level_idx[0];
+    vseq->seq_tier                = sh->seq_tier[0];
+    vseq->order_hint_bits_minus_1 = sh->order_hint_bits_minus_1;
+    vseq->intra_period            = ctx->gop_size;
+    vseq->ip_period               = ctx->b_per_p + 1;
+
+    vseq->seq_fields.bits.enable_order_hint = sh->enable_order_hint;
+
+    if (!(ctx->va_rc_mode & VA_RC_CQP)) {
+        vseq->bits_per_second = ctx->va_bit_rate;
+        vseq->seq_fields.bits.enable_cdef = sh->enable_cdef = 1;
+    }
+
+    ret = vaapi_encode_av1_add_obu(avctx, obu, AV1_OBU_SEQUENCE_HEADER, &priv->sh);
+    if (ret < 0)
+        goto end;
+
+    ret = vaapi_encode_av1_write_obu(avctx, priv->sh_data, &priv->sh_data_len, obu);
+    if (ret < 0)
+        goto end;
+
+end:
+    ff_cbs_fragment_reset(obu);
+    return ret;
+}
+
+static int vaapi_encode_av1_init_picture_params(AVCodecContext *avctx,
+                                                VAAPIEncodePicture *pic)
+{
+    VAAPIEncodeAV1Context          *priv = avctx->priv_data;
+    VAAPIEncodeAV1Picture          *hpic = pic->priv_data;
+    AV1RawOBU                    *fh_obu = &priv->fh;
+    AV1RawFrameHeader                *fh = &fh_obu->obu.frame.header;
+    VAEncPictureParameterBufferAV1 *vpic = pic->codec_picture_params;
+    CodedBitstreamFragment          *obu = &priv->current_obu;
+    VAAPIEncodePicture    *ref;
+    VAAPIEncodeAV1Picture *href;
+    int slot, i;
+    int ret;
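+    /** Default loop filter reference deltas from setup_past_independence()
+     *  in the AV1 specification. */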
+    static const int8_t default_loop_filter_ref_deltas[AV1_TOTAL_REFS_PER_FRAME] =
+        { 1, 0, 0, 0, -1, 0, -1, -1 };
+
+    memset(fh_obu, 0, sizeof(*fh_obu));
+    pic->nb_slices = priv->tile_groups;
+    fh_obu->header.obu_type = AV1_OBU_FRAME_HEADER;
+    fh_obu->header.obu_has_size_field = 1;
+
+    switch (pic->type) {
+    case PICTURE_TYPE_IDR:
+        av_assert0(pic->nb_refs[0] == 0 && pic->nb_refs[1] == 0);
+        fh->frame_type = AV1_FRAME_KEY;
+        fh->refresh_frame_flags = 0xFF;
+        fh->base_q_idx = priv->q_idx_idr;
+        hpic->slot = 0;
+        hpic->last_idr_frame = pic->display_order;
+        break;
+    case PICTURE_TYPE_P:
+        av_assert0(pic->nb_refs[0]);
+        fh->frame_type = AV1_FRAME_INTER;
+        fh->base_q_idx = priv->q_idx_p;
+        ref = pic->refs[0][pic->nb_refs[0] - 1];
+        href = ref->priv_data;
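+        /** Alternate P frames between the two reference slots in use. */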
+        hpic->slot = !href->slot;
+        hpic->last_idr_frame = href->last_idr_frame;
+        fh->refresh_frame_flags = 1 << hpic->slot;
+
+        /** Use the nearest frame in L0 as every reference frame. */
+        for (i = 0; i < AV1_REFS_PER_FRAME; i++) {
+            fh->ref_frame_idx[i] = href->slot;
+        }
+        fh->primary_ref_frame = href->slot;
+        fh->ref_order_hint[href->slot] = ref->display_order - href->last_idr_frame;
+        vpic->ref_frame_ctrl_l0.fields.search_idx0 = AV1_REF_FRAME_LAST;
+
+        /** Use the 2nd nearest frame in L0 as the GOLDEN frame. */
+        if (pic->nb_refs[0] > 1) {
+            ref = pic->refs[0][pic->nb_refs[0] - 2];
+            href = ref->priv_data;
+            fh->ref_frame_idx[3] = href->slot;
+            fh->ref_order_hint[href->slot] = ref->display_order - href->last_idr_frame;
+            vpic->ref_frame_ctrl_l0.fields.search_idx1 = AV1_REF_FRAME_GOLDEN;
+        }
+        break;
+    case PICTURE_TYPE_B:
+        av_assert0(pic->nb_refs[0] && pic->nb_refs[1]);
+        fh->frame_type = AV1_FRAME_INTER;
+        fh->base_q_idx = priv->q_idx_b;
+        fh->refresh_frame_flags = 0x0;
+        fh->reference_select = 1;
+
+        /** B frames are not used as references, so disable their reconstructed frame. */
+        vpic->picture_flags.bits.disable_frame_recon = 1;
+
+        /** Use LAST_FRAME and BWDREF_FRAME for reference. */
+        vpic->ref_frame_ctrl_l0.fields.search_idx0 = AV1_REF_FRAME_LAST;
+        vpic->ref_frame_ctrl_l1.fields.search_idx0 = AV1_REF_FRAME_BWDREF;
+
+        ref                            = pic->refs[0][pic->nb_refs[0] - 1];
+        href                           = ref->priv_data;
+        hpic->last_idr_frame           = href->last_idr_frame;
+        fh->primary_ref_frame          = href->slot;
+        fh->ref_order_hint[href->slot] = ref->display_order - href->last_idr_frame;
+        for (i = 0; i < AV1_REF_FRAME_GOLDEN; i++) {
+            fh->ref_frame_idx[i] = href->slot;
+        }
+
+        ref                            = pic->refs[1][pic->nb_refs[1] - 1];
+        href                           = ref->priv_data;
+        fh->ref_order_hint[href->slot] = ref->display_order - href->last_idr_frame;
+        for (i = AV1_REF_FRAME_GOLDEN; i < AV1_REFS_PER_FRAME; i++) {
+            fh->ref_frame_idx[i] = href->slot;
+        }
+        break;
+    default:
+        av_assert0(0 && "invalid picture type");
+    }
+
+    fh->show_frame                = pic->display_order <= pic->encode_order;
+    fh->showable_frame            = fh->frame_type != AV1_FRAME_KEY;
+    fh->frame_width_minus_1       = avctx->width - 1;
+    fh->frame_height_minus_1      = avctx->height - 1;
+    fh->render_width_minus_1      = fh->frame_width_minus_1;
+    fh->render_height_minus_1     = fh->frame_height_minus_1;
+    fh->order_hint                = pic->display_order - hpic->last_idr_frame;
+    fh->tile_cols                 = priv->tile_cols;
+    fh->tile_rows                 = priv->tile_rows;
+    fh->tile_cols_log2            = priv->tile_cols_log2;
+    fh->tile_rows_log2            = priv->tile_rows_log2;
+    fh->uniform_tile_spacing_flag = priv->uniform_tile;
+    fh->tile_size_bytes_minus1    = priv->attr_ext2.bits.tile_size_bytes_minus1;
+    fh->reduced_tx_set            = 1;
+
+    /** Ignore ONLY_4x4 mode since coded lossless is not fully implemented. */
+    if (priv->attr_ext2.bits.tx_mode_support & 0x04)
+        fh->tx_mode = AV1_TX_MODE_SELECT;
+    else if (priv->attr_ext2.bits.tx_mode_support & 0x02)
+        fh->tx_mode = AV1_TX_MODE_LARGEST;
+    else
+        return AVERROR(EINVAL);
+
+    for (i = 0; i < fh->tile_cols; i++)
+        fh->width_in_sbs_minus_1[i] = vpic->width_in_sbs_minus_1[i] = priv->width_in_sbs_minus_1[i];
+
+    for (i = 0; i < fh->tile_rows; i++)
+        fh->height_in_sbs_minus_1[i] = vpic->height_in_sbs_minus_1[i] = priv->height_in_sbs_minus_1[i];
+
+    memcpy(fh->loop_filter_ref_deltas, default_loop_filter_ref_deltas,
+           AV1_TOTAL_REFS_PER_FRAME * sizeof(int8_t));
+
+    if (fh->frame_type == AV1_FRAME_KEY && fh->show_frame) {
+        fh->error_resilient_mode = 1;
+    }
+
+    if (fh->frame_type == AV1_FRAME_KEY || fh->error_resilient_mode)
+        fh->primary_ref_frame = AV1_PRIMARY_REF_NONE;
+
+    vpic->base_qindex          = fh->base_q_idx;
+    vpic->frame_width_minus_1  = fh->frame_width_minus_1;
+    vpic->frame_height_minus_1 = fh->frame_height_minus_1;
+    vpic->primary_ref_frame    = fh->primary_ref_frame;
+    vpic->reconstructed_frame  = pic->recon_surface;
+    vpic->coded_buf            = pic->output_buffer;
+    vpic->tile_cols            = fh->tile_cols;
+    vpic->tile_rows            = fh->tile_rows;
+    vpic->order_hint           = fh->order_hint;
+#if VA_CHECK_VERSION(1, 15, 0)
+    vpic->refresh_frame_flags  = fh->refresh_frame_flags;
+#endif
+
+    vpic->picture_flags.bits.enable_frame_obu     = 0;
+    vpic->picture_flags.bits.frame_type           = fh->frame_type;
+    vpic->picture_flags.bits.reduced_tx_set       = fh->reduced_tx_set;
+    vpic->picture_flags.bits.error_resilient_mode = fh->error_resilient_mode;
+
+    /** Let the driver decide between single and compound reference prediction mode. */
+    vpic->mode_control_flags.bits.reference_mode = fh->reference_select ? 2 : 0;
+    vpic->mode_control_flags.bits.tx_mode = fh->tx_mode;
+
+    vpic->tile_group_obu_hdr_info.bits.obu_has_size_field = 1;
+
+    /** set reference. */
+    for (i = 0; i < AV1_REFS_PER_FRAME; i++)
+        vpic->ref_frame_idx[i] = fh->ref_frame_idx[i];
+
+    for (i = 0; i < FF_ARRAY_ELEMS(vpic->reference_frames); i++)
+        vpic->reference_frames[i] = VA_INVALID_SURFACE;
+
+    for (i = 0; i < MAX_REFERENCE_LIST_NUM; i++) {
+        for (int j = 0; j < pic->nb_refs[i]; j++) {
+            VAAPIEncodePicture *ref_pic = pic->refs[i][j];
+
+            slot = ((VAAPIEncodeAV1Picture*)ref_pic->priv_data)->slot;
+            av_assert0(vpic->reference_frames[slot] == VA_INVALID_SURFACE);
+
+            vpic->reference_frames[slot] = ref_pic->recon_surface;
+        }
+    }
+
+    /** Pack the frame header and set VA parameter offsets such as bit_offset_qindex. */
+    ret = vaapi_encode_av1_write_frame_header(avctx, pic, priv->fh_data, &priv->fh_data_len);
+    if (ret < 0)
+        goto end;
+
+end:
+    ff_cbs_fragment_reset(obu);
+    return ret;
+}
+
+static int vaapi_encode_av1_init_slice_params(AVCodecContext *avctx,
+                                              VAAPIEncodePicture *pic,
+                                              VAAPIEncodeSlice *slice)
+{
+    VAAPIEncodeAV1Context      *priv = avctx->priv_data;
+    VAEncTileGroupBufferAV1  *vslice = slice->codec_slice_params;
+    CodedBitstreamAV1Context  *cbctx = priv->cbc->priv_data;
+    int div;
+
+    /** Set tile group info. */
+    div = priv->tile_cols * priv->tile_rows / priv->tile_groups;
+    vslice->tg_start = slice->index * div;
+    if (slice->index == (priv->tile_groups - 1)) {
+        vslice->tg_end = priv->tile_cols * priv->tile_rows - 1;
+        cbctx->seen_frame_header = 0;
+    } else {
+        vslice->tg_end = (slice->index + 1) * div - 1;
+    }
+
+    return 0;
+}
+
+static int vaapi_encode_av1_write_picture_header(AVCodecContext *avctx,
+                                                 VAAPIEncodePicture *pic,
+                                                 char *data, size_t *data_len)
+{
+    VAAPIEncodeAV1Context     *priv = avctx->priv_data;
+    CodedBitstreamFragment     *obu = &priv->current_obu;
+    AV1RawOBU               *fh_obu = &priv->fh;
+    AV1RawFrameHeader       *rep_fh = &fh_obu->obu.frame_header;
+    VAAPIEncodeAV1Picture *href;
+    int ret = 0;
+
+    pic->tail_size = 0;
+    /** Pack repeat frame header. */
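+    /** A picture shown after it is encoded (display_order > encode_order)
+     *  needs a show_existing_frame frame header once its display time
+     *  arrives; pack it here and carry it as tail data. */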
+    if (pic->display_order > pic->encode_order) {
+        memset(fh_obu, 0, sizeof(*fh_obu));
+        href = pic->refs[0][pic->nb_refs[0] - 1]->priv_data;
+        fh_obu->header.obu_type = AV1_OBU_FRAME_HEADER;
+        fh_obu->header.obu_has_size_field = 1;
+
+        rep_fh->show_existing_frame   = 1;
+        rep_fh->frame_to_show_map_idx = href->slot == 0;
+        rep_fh->frame_type            = AV1_FRAME_INTER;
+        rep_fh->frame_width_minus_1   = avctx->width - 1;
+        rep_fh->frame_height_minus_1  = avctx->height - 1;
+        rep_fh->render_width_minus_1  = rep_fh->frame_width_minus_1;
+        rep_fh->render_height_minus_1 = rep_fh->frame_height_minus_1;
+
+        ret = vaapi_encode_av1_write_frame_header(avctx, pic, pic->tail_data, &pic->tail_size);
+        if (ret < 0)
+            goto end;
+
+        pic->tail_size /= 8;
+    }
+
+    memcpy(data, &priv->fh_data, MAX_PARAM_BUFFER_SIZE * sizeof(char));
+    *data_len = priv->fh_data_len;
+
+end:
+    ff_cbs_fragment_reset(obu);
+    return ret;
+}
+
+static const VAAPIEncodeProfile vaapi_encode_av1_profiles[] = {
+    { FF_PROFILE_AV1_MAIN,  8, 3, 1, 1, VAProfileAV1Profile0 },
+    { FF_PROFILE_AV1_MAIN, 10, 3, 1, 1, VAProfileAV1Profile0 },
+    { FF_PROFILE_UNKNOWN }
+};
+
+static const VAAPIEncodeType vaapi_encode_type_av1 = {
+    .profiles        = vaapi_encode_av1_profiles,
+    .flags           = FLAG_B_PICTURES,
+    .default_quality = 25,
+    .configure       = &vaapi_encode_av1_configure,
+
+    .sequence_header_type  = VAEncPackedHeaderSequence,
+    .sequence_params_size  = sizeof(VAEncSequenceParameterBufferAV1),
+    .init_sequence_params  = &vaapi_encode_av1_init_sequence_params,
+    .write_sequence_header = &vaapi_encode_av1_write_sequence_header,
+
+    .picture_priv_data_size = sizeof(VAAPIEncodeAV1Picture),
+    .picture_header_type    = VAEncPackedHeaderPicture,
+    .picture_params_size    = sizeof(VAEncPictureParameterBufferAV1),
+    .init_picture_params    = &vaapi_encode_av1_init_picture_params,
+    .write_picture_header   = &vaapi_encode_av1_write_picture_header,
+
+    .slice_params_size = sizeof(VAEncTileGroupBufferAV1),
+    .init_slice_params = &vaapi_encode_av1_init_slice_params,
+};
+
+static av_cold int vaapi_encode_av1_init(AVCodecContext *avctx)
+{
+    VAAPIEncodeContext      *ctx = avctx->priv_data;
+    VAAPIEncodeAV1Context  *priv = avctx->priv_data;
+    VAConfigAttrib attr;
+    VAStatus vas;
+    int ret;
+
+    ctx->codec = &vaapi_encode_type_av1;
+
+    ctx->desired_packed_headers =
+        VA_ENC_PACKED_HEADER_SEQUENCE |
+        VA_ENC_PACKED_HEADER_PICTURE;
+
+    if (avctx->profile == FF_PROFILE_UNKNOWN)
+        avctx->profile = priv->profile;
+    if (avctx->level == FF_LEVEL_UNKNOWN)
+        avctx->level = priv->level;
+
+    if (avctx->level != FF_LEVEL_UNKNOWN && avctx->level & ~0x1f) {
+        av_log(avctx, AV_LOG_ERROR, "Invalid level %d\n", avctx->level);
+        return AVERROR(EINVAL);
+    }
+
+    ret = ff_vaapi_encode_init(avctx);
+    if (ret < 0)
+        return ret;
+
+    attr.type = VAConfigAttribEncAV1;
+    vas = vaGetConfigAttributes(ctx->hwctx->display,
+                                ctx->va_profile,
+                                ctx->va_entrypoint,
+                                &attr, 1);
+    if (vas != VA_STATUS_SUCCESS) {
+        av_log(avctx, AV_LOG_ERROR, "Failed to query "
+               "config attribute: %d (%s).\n", vas, vaErrorStr(vas));
+        return AVERROR_EXTERNAL;
+    } else if (attr.value == VA_ATTRIB_NOT_SUPPORTED) {
+        priv->attr.value = 0;
+        av_log(avctx, AV_LOG_WARNING, "Attribute type:%d is not "
+               "supported.\n", attr.type);
+    } else {
+        priv->attr.value = attr.value;
+    }
+
+    attr.type = VAConfigAttribEncAV1Ext1;
+    vas = vaGetConfigAttributes(ctx->hwctx->display,
+                                ctx->va_profile,
+                                ctx->va_entrypoint,
+                                &attr, 1);
+    if (vas != VA_STATUS_SUCCESS) {
+        av_log(avctx, AV_LOG_ERROR, "Failed to query "
+               "config attribute: %d (%s).\n", vas, vaErrorStr(vas));
+        return AVERROR_EXTERNAL;
+    } else if (attr.value == VA_ATTRIB_NOT_SUPPORTED) {
+        priv->attr_ext1.value = 0;
+        av_log(avctx, AV_LOG_WARNING, "Attribute type:%d is not "
+               "supported.\n", attr.type);
+    } else {
+        priv->attr_ext1.value = attr.value;
+    }
+
+    /** This attribute provides essential indicators, return an error if it is not supported. */
+    attr.type = VAConfigAttribEncAV1Ext2;
+    vas = vaGetConfigAttributes(ctx->hwctx->display,
+                                ctx->va_profile,
+                                ctx->va_entrypoint,
+                                &attr, 1);
+    if (vas != VA_STATUS_SUCCESS || attr.value == VA_ATTRIB_NOT_SUPPORTED) {
+        av_log(avctx, AV_LOG_ERROR, "Failed to query "
+               "config attribute: %d (%s).\n", vas, vaErrorStr(vas));
+        return AVERROR_EXTERNAL;
+    } else {
+        priv->attr_ext2.value = attr.value;
+    }
+
+    ret = vaapi_encode_av1_set_tile(avctx);
+    if (ret < 0)
+        return ret;
+
+    return 0;
+}
+
+static av_cold int vaapi_encode_av1_close(AVCodecContext *avctx)
+{
+    VAAPIEncodeAV1Context *priv = avctx->priv_data;
+
+    ff_cbs_fragment_free(&priv->current_obu);
+    ff_cbs_close(&priv->cbc);
+
+    return ff_vaapi_encode_close(avctx);
+}
+
+#define OFFSET(x) offsetof(VAAPIEncodeAV1Context, x)
+#define FLAGS (AV_OPT_FLAG_VIDEO_PARAM | AV_OPT_FLAG_ENCODING_PARAM)
+
+static const AVOption vaapi_encode_av1_options[] = {
+    VAAPI_ENCODE_COMMON_OPTIONS,
+    VAAPI_ENCODE_RC_OPTIONS,
+    { "profile", "Set profile (seq_profile)",
+      OFFSET(profile), AV_OPT_TYPE_INT,
+      { .i64 = FF_PROFILE_UNKNOWN }, FF_PROFILE_UNKNOWN, 0xff, FLAGS, "profile" },
+
+#define PROFILE(name, value)  name, NULL, 0, AV_OPT_TYPE_CONST, \
+    { .i64 = value }, 0, 0, FLAGS, "profile"
+    { PROFILE("main",               FF_PROFILE_AV1_MAIN) },
+    { PROFILE("high",               FF_PROFILE_AV1_HIGH) },
+    { PROFILE("professional",       FF_PROFILE_AV1_PROFESSIONAL) },
+#undef PROFILE
+
+    { "tier", "Set tier (seq_tier)",
+      OFFSET(tier), AV_OPT_TYPE_INT, { .i64 = 0 }, 0, 1, FLAGS, "tier" },
+    { "main", NULL, 0, AV_OPT_TYPE_CONST,
+      { .i64 = 0 }, 0, 0, FLAGS, "tier" },
+    { "high", NULL, 0, AV_OPT_TYPE_CONST,
+      { .i64 = 1 }, 0, 0, FLAGS, "tier" },
+    { "level", "Set level (seq_level_idx)",
+      OFFSET(level), AV_OPT_TYPE_INT,
+      { .i64 = FF_LEVEL_UNKNOWN }, FF_LEVEL_UNKNOWN, 0x1f, FLAGS, "level" },
+
+#define LEVEL(name, value) name, NULL, 0, AV_OPT_TYPE_CONST, \
+      { .i64 = value }, 0, 0, FLAGS, "level"
+    { LEVEL("2.0",  0) },
+    { LEVEL("2.1",  1) },
+    { LEVEL("3.0",  4) },
+    { LEVEL("3.1",  5) },
+    { LEVEL("4.0",  8) },
+    { LEVEL("4.1",  9) },
+    { LEVEL("5.0", 12) },
+    { LEVEL("5.1", 13) },
+    { LEVEL("5.2", 14) },
+    { LEVEL("5.3", 15) },
+    { LEVEL("6.0", 16) },
+    { LEVEL("6.1", 17) },
+    { LEVEL("6.2", 18) },
+    { LEVEL("6.3", 19) },
+#undef LEVEL
+
+    { "tiles", "Tile columns x rows",
+      OFFSET(tile_cols), AV_OPT_TYPE_IMAGE_SIZE, { .str = "1x1" }, 1, AV1_MAX_TILE_COLS, FLAGS },
+    { "tile_groups", "Number of tile groups for encoding",
+      OFFSET(tile_groups), AV_OPT_TYPE_INT, { .i64 = 1 }, 1, AV1_MAX_TILE_ROWS * AV1_MAX_TILE_COLS, FLAGS },
+
+    { NULL },
+};
+
+static const FFCodecDefault vaapi_encode_av1_defaults[] = {
+    { "b",              "0"   },
+    { "bf",             "7"   },
+    { "g",              "120" },
+    { "qmin",           "1"   },
+    { "qmax",           "255" },
+    { NULL },
+};
+
+static const AVClass vaapi_encode_av1_class = {
+    .class_name = "av1_vaapi",
+    .item_name  = av_default_item_name,
+    .option     = vaapi_encode_av1_options,
+    .version    = LIBAVUTIL_VERSION_INT,
+};
+
+const FFCodec ff_av1_vaapi_encoder = {
+    .p.name         = "av1_vaapi",
+    CODEC_LONG_NAME("AV1 (VAAPI)"),
+    .p.type         = AVMEDIA_TYPE_VIDEO,
+    .p.id           = AV_CODEC_ID_AV1,
+    .priv_data_size = sizeof(VAAPIEncodeAV1Context),
+    .init           = &vaapi_encode_av1_init,
+    FF_CODEC_RECEIVE_PACKET_CB(&ff_vaapi_encode_receive_packet),
+    .close          = &vaapi_encode_av1_close,
+    .p.priv_class   = &vaapi_encode_av1_class,
+    .p.capabilities = AV_CODEC_CAP_DELAY | AV_CODEC_CAP_HARDWARE |
+                      AV_CODEC_CAP_DR1 | AV_CODEC_CAP_ENCODER_REORDERED_OPAQUE,
+    .caps_internal  = FF_CODEC_CAP_NOT_INIT_THREADSAFE |
+                      FF_CODEC_CAP_INIT_CLEANUP,
+    .defaults       = vaapi_encode_av1_defaults,
+    .p.pix_fmts = (const enum AVPixelFormat[]) {
+        AV_PIX_FMT_VAAPI,
+        AV_PIX_FMT_NONE,
+    },
+    .hw_configs     = ff_vaapi_encode_hw_configs,
+    .p.wrapper_name = "vaapi",
+};
diff --git a/libavcodec/version.h b/libavcodec/version.h
index 9411511e04..728ab8839d 100644
--- a/libavcodec/version.h
+++ b/libavcodec/version.h
@@ -29,7 +29,7 @@ 
 
 #include "version_major.h"
 
-#define LIBAVCODEC_VERSION_MINOR  22
+#define LIBAVCODEC_VERSION_MINOR  23
 #define LIBAVCODEC_VERSION_MICRO 100
 
 #define LIBAVCODEC_VERSION_INT  AV_VERSION_INT(LIBAVCODEC_VERSION_MAJOR, \