diff mbox series

[FFmpeg-devel] dxva2: add AV1 decode support

Message ID 20201105155339.131-1-h.leppkes@gmail.com
State New
Headers show
Series [FFmpeg-devel] dxva2: add AV1 decode support | expand

Checks

Context Check Description
andriy/x86_make success Make finished
andriy/x86_make_fate success Make fate finished
andriy/PPC64_make success Make finished
andriy/PPC64_make_fate success Make fate finished

Commit Message

Hendrik Leppkes Nov. 5, 2020, 3:53 p.m. UTC
---
 Changelog              |   1 +
 configure              |   7 +
 libavcodec/Makefile    |   2 +
 libavcodec/av1dec.c    |  25 +-
 libavcodec/dxva2.c     |  10 +-
 libavcodec/dxva2_av1.c | 504 +++++++++++++++++++++++++++++++++++++++++
 libavcodec/hwaccels.h  |   3 +
 libavcodec/version.h   |   2 +-
 8 files changed, 550 insertions(+), 4 deletions(-)
 create mode 100644 libavcodec/dxva2_av1.c

Comments

Hendrik Leppkes Nov. 5, 2020, 4:09 p.m. UTC | #1
On Thu, Nov 5, 2020 at 4:54 PM Hendrik Leppkes <h.leppkes@gmail.com> wrote:
>
> ---
>  Changelog              |   1 +
>  configure              |   7 +
>  libavcodec/Makefile    |   2 +
>  libavcodec/av1dec.c    |  25 +-
>  libavcodec/dxva2.c     |  10 +-
>  libavcodec/dxva2_av1.c | 504 +++++++++++++++++++++++++++++++++++++++++
>  libavcodec/hwaccels.h  |   3 +
>  libavcodec/version.h   |   2 +-
>  8 files changed, 550 insertions(+), 4 deletions(-)
>  create mode 100644 libavcodec/dxva2_av1.c
>

First of all, I have locally changed the commit message to read
"avcodec/dxva2:" in the topic line; I only noticed it looking a bit
weird when I saw it on the ML. Apologies.

This was tested and developed on an NVIDIA 30-series card, and
cross-tested against decoding results from dav1d with bit for bit
identical results in every sample I've used so far, including film
grain. On that topic, technically film grain is supposed to be
activated separately and extra surfaces provided, but in testing it
was shown that at least the NVIDIA driver manages film grain
transparently, perhaps by allocating an additional surface layer
internally (which isn't that unheard of). It would be interesting to
see behavior on Intel graphics or future AMD graphics regarding film
grain handling, but I do not have access to that hardware.

Reviews would be appreciated, but as Windows-specific hardware code is
hard to review for many, I'll push this after the appropriate grace
period, during which testing with more samples on my end will
continue.

- Hendrik
James Almer Nov. 5, 2020, 4:45 p.m. UTC | #2
On 11/5/2020 12:53 PM, Hendrik Leppkes wrote:
> ---
>   Changelog              |   1 +
>   configure              |   7 +
>   libavcodec/Makefile    |   2 +
>   libavcodec/av1dec.c    |  25 +-
>   libavcodec/dxva2.c     |  10 +-
>   libavcodec/dxva2_av1.c | 504 +++++++++++++++++++++++++++++++++++++++++
>   libavcodec/hwaccels.h  |   3 +
>   libavcodec/version.h   |   2 +-
>   8 files changed, 550 insertions(+), 4 deletions(-)
>   create mode 100644 libavcodec/dxva2_av1.c
> 
> diff --git a/Changelog b/Changelog
> index 3fdcafc355..886e69a1cc 100644
> --- a/Changelog
> +++ b/Changelog
> @@ -40,6 +40,7 @@ version <next>:
>   - High Voltage Software ADPCM encoder
>   - LEGO Racers ALP (.tun & .pcm) muxer
>   - AV1 VAAPI decoder
> +- DXVA2/D3D11VA hardware accelerated AV1 decoding
>   
>   
>   version 4.3:
> diff --git a/configure b/configure
> index 8a9e9b3cd7..e55e910477 100755
> --- a/configure
> +++ b/configure
> @@ -2918,6 +2918,12 @@ videotoolbox_hwaccel_deps="videotoolbox pthreads"
>   videotoolbox_hwaccel_extralibs="-framework QuartzCore"
>   xvmc_deps="X11_extensions_XvMClib_h"
>   
> +av1_d3d11va_hwaccel_deps="d3d11va DXVA_PicParams_AV1"
> +av1_d3d11va_hwaccel_select="av1_decoder"
> +av1_d3d11va2_hwaccel_deps="d3d11va DXVA_PicParams_AV1"
> +av1_d3d11va2_hwaccel_select="av1_decoder"
> +av1_dxva2_hwaccel_deps="dxva2 DXVA_PicParams_AV1"
> +av1_dxva2_hwaccel_select="av1_decoder"
>   av1_vaapi_hwaccel_deps="vaapi VADecPictureParameterBufferAV1_bit_depth_idx"
>   av1_vaapi_hwaccel_select="av1_decoder"
>   h263_vaapi_hwaccel_deps="vaapi"
> @@ -6203,6 +6209,7 @@ enabled videotoolbox && {
>   
>   check_struct "sys/time.h sys/resource.h" "struct rusage" ru_maxrss
>   
> +check_type "windows.h dxva.h" "DXVA_PicParams_AV1" -DWINAPI_FAMILY=WINAPI_FAMILY_DESKTOP_APP -D_CRT_BUILD_DESKTOP_APP=0
>   check_type "windows.h dxva.h" "DXVA_PicParams_HEVC" -DWINAPI_FAMILY=WINAPI_FAMILY_DESKTOP_APP -D_CRT_BUILD_DESKTOP_APP=0
>   check_type "windows.h dxva.h" "DXVA_PicParams_VP9" -DWINAPI_FAMILY=WINAPI_FAMILY_DESKTOP_APP -D_CRT_BUILD_DESKTOP_APP=0
>   check_type "windows.h d3d11.h" "ID3D11VideoDecoder"
> diff --git a/libavcodec/Makefile b/libavcodec/Makefile
> index 9d75dd68af..505960df0a 100644
> --- a/libavcodec/Makefile
> +++ b/libavcodec/Makefile
> @@ -912,6 +912,8 @@ OBJS-$(CONFIG_VAAPI)                      += vaapi_decode.o
>   OBJS-$(CONFIG_VIDEOTOOLBOX)               += videotoolbox.o
>   OBJS-$(CONFIG_VDPAU)                      += vdpau.o
>   
> +OBJS-$(CONFIG_AV1_D3D11VA_HWACCEL)        += dxva2_av1.o
> +OBJS-$(CONFIG_AV1_DXVA2_HWACCEL)          += dxva2_av1.o
>   OBJS-$(CONFIG_AV1_VAAPI_HWACCEL)          += vaapi_av1.o
>   OBJS-$(CONFIG_H263_VAAPI_HWACCEL)         += vaapi_mpeg4.o
>   OBJS-$(CONFIG_H263_VIDEOTOOLBOX_HWACCEL)  += videotoolbox.o
> diff --git a/libavcodec/av1dec.c b/libavcodec/av1dec.c
> index 56712279aa..01cf92fab5 100644
> --- a/libavcodec/av1dec.c
> +++ b/libavcodec/av1dec.c
> @@ -215,7 +215,7 @@ static int get_pixel_format(AVCodecContext *avctx)
>       uint8_t bit_depth;
>       int ret;
>       enum AVPixelFormat pix_fmt = AV_PIX_FMT_NONE;
> -#define HWACCEL_MAX (CONFIG_AV1_VAAPI_HWACCEL)
> +#define HWACCEL_MAX (CONFIG_AV1_DXVA2_HWACCEL + CONFIG_AV1_D3D11VA_HWACCEL * 2 + CONFIG_AV1_VAAPI_HWACCEL)
>       enum AVPixelFormat pix_fmts[HWACCEL_MAX + 2], *fmtp = pix_fmts;
>   
>       if (seq->seq_profile == 2 && seq->color_config.high_bitdepth)
> @@ -278,11 +278,25 @@ static int get_pixel_format(AVCodecContext *avctx)
>   
>       switch (s->pix_fmt) {
>       case AV_PIX_FMT_YUV420P:
> +#if CONFIG_AV1_DXVA2_HWACCEL
> +        *fmtp++ = AV_PIX_FMT_DXVA2_VLD;
> +#endif
> +#if CONFIG_AV1_D3D11VA_HWACCEL
> +        *fmtp++ = AV_PIX_FMT_D3D11VA_VLD;
> +        *fmtp++ = AV_PIX_FMT_D3D11;
> +#endif
>   #if CONFIG_AV1_VAAPI_HWACCEL
>           *fmtp++ = AV_PIX_FMT_VAAPI;
>   #endif
>           break;
>       case AV_PIX_FMT_YUV420P10:
> +#if CONFIG_AV1_DXVA2_HWACCEL
> +        *fmtp++ = AV_PIX_FMT_DXVA2_VLD;
> +#endif
> +#if CONFIG_AV1_D3D11VA_HWACCEL
> +        *fmtp++ = AV_PIX_FMT_D3D11VA_VLD;
> +        *fmtp++ = AV_PIX_FMT_D3D11;
> +#endif
>   #if CONFIG_AV1_VAAPI_HWACCEL
>           *fmtp++ = AV_PIX_FMT_VAAPI;
>   #endif
> @@ -853,6 +867,15 @@ AVCodec ff_av1_decoder = {
>       .flush                 = av1_decode_flush,
>       .profiles              = NULL_IF_CONFIG_SMALL(ff_av1_profiles),
>       .hw_configs            = (const AVCodecHWConfigInternal * []) {
> +#if CONFIG_AV1_DXVA2_HWACCEL
> +        HWACCEL_DXVA2(av1),
> +#endif
> +#if CONFIG_AV1_D3D11VA_HWACCEL
> +        HWACCEL_D3D11VA(av1),
> +#endif
> +#if CONFIG_AV1_D3D11VA2_HWACCEL
> +        HWACCEL_D3D11VA2(av1),
> +#endif
>   #if CONFIG_AV1_VAAPI_HWACCEL
>           HWACCEL_VAAPI(av1),
>   #endif
> diff --git a/libavcodec/dxva2.c b/libavcodec/dxva2.c
> index 32416112bf..b57ea21941 100644
> --- a/libavcodec/dxva2.c
> +++ b/libavcodec/dxva2.c
> @@ -45,6 +45,7 @@ DEFINE_GUID(ff_DXVA2_ModeHEVC_VLD_Main,  0x5b11d51b, 0x2f4c,0x4452,0xbc,0xc3,0x0
>   DEFINE_GUID(ff_DXVA2_ModeHEVC_VLD_Main10,0x107af0e0, 0xef1a,0x4d19,0xab,0xa8,0x67,0xa1,0x63,0x07,0x3d,0x13);
>   DEFINE_GUID(ff_DXVA2_ModeVP9_VLD_Profile0,0x463707f8,0xa1d0,0x4585,0x87,0x6d,0x83,0xaa,0x6d,0x60,0xb8,0x9e);
>   DEFINE_GUID(ff_DXVA2_ModeVP9_VLD_10bit_Profile2,0xa4c749ef,0x6ecf,0x48aa,0x84,0x48,0x50,0xa7,0xa1,0x16,0x5f,0xf7);
> +DEFINE_GUID(ff_DXVA2_ModeAV1_VLD_Profile0,0xb8be4ccb,0xcf53,0x46ba,0x8d,0x59,0xd6,0xb8,0xa6,0xda,0x5d,0x2a);
>   DEFINE_GUID(ff_DXVA2_NoEncrypt,          0x1b81beD0, 0xa0c7,0x11d3,0xb9,0x84,0x00,0xc0,0x4f,0x2e,0x73,0xc5);
>   DEFINE_GUID(ff_GUID_NULL,                0x00000000, 0x0000,0x0000,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00);
>   DEFINE_GUID(ff_IID_IDirectXVideoDecoderService, 0xfc51a551,0xd5e7,0x11d9,0xaf,0x55,0x00,0x05,0x4e,0x43,0xff,0x02);
> @@ -72,6 +73,8 @@ static const int prof_vp9_profile0[] = {FF_PROFILE_VP9_0,
>                                           FF_PROFILE_UNKNOWN};
>   static const int prof_vp9_profile2[] = {FF_PROFILE_VP9_2,
>                                           FF_PROFILE_UNKNOWN};
> +static const int prof_av1_profile0[] = {FF_PROFILE_AV1_MAIN,
> +                                        FF_PROFILE_UNKNOWN};
>   
>   static const dxva_mode dxva_modes[] = {
>       /* MPEG-2 */
> @@ -98,6 +101,9 @@ static const dxva_mode dxva_modes[] = {
>       { &ff_DXVA2_ModeVP9_VLD_Profile0,       AV_CODEC_ID_VP9, prof_vp9_profile0 },
>       { &ff_DXVA2_ModeVP9_VLD_10bit_Profile2, AV_CODEC_ID_VP9, prof_vp9_profile2 },
>   
> +    /* AV1 */
> +    { &ff_DXVA2_ModeAV1_VLD_Profile0,       AV_CODEC_ID_AV1, prof_av1_profile0 },
> +
>       { NULL,                          0 },
>   };
>   
> @@ -604,7 +610,7 @@ int ff_dxva2_common_frame_params(AVCodecContext *avctx,
>           surface_alignment = 32;
>       /* the HEVC DXVA2 spec asks for 128 pixel aligned surfaces to ensure
>       all coding features have enough room to work with */
> -    else if (avctx->codec_id == AV_CODEC_ID_HEVC)
> +    else if (avctx->codec_id == AV_CODEC_ID_HEVC || avctx->codec_id == AV_CODEC_ID_AV1)
>           surface_alignment = 128;
>       else
>           surface_alignment = 16;
> @@ -615,7 +621,7 @@ int ff_dxva2_common_frame_params(AVCodecContext *avctx,
>       /* add surfaces based on number of possible refs */
>       if (avctx->codec_id == AV_CODEC_ID_H264 || avctx->codec_id == AV_CODEC_ID_HEVC)
>           num_surfaces += 16;
> -    else if (avctx->codec_id == AV_CODEC_ID_VP9)
> +    else if (avctx->codec_id == AV_CODEC_ID_VP9 || avctx->codec_id == AV_CODEC_ID_AV1)
>           num_surfaces += 8;
>       else
>           num_surfaces += 2;
> diff --git a/libavcodec/dxva2_av1.c b/libavcodec/dxva2_av1.c
> new file mode 100644
> index 0000000000..d04c96becf
> --- /dev/null
> +++ b/libavcodec/dxva2_av1.c
> @@ -0,0 +1,504 @@
> +/*
> + * DXVA2 AV1 HW acceleration.
> + *
> + * copyright (c) 2020 Hendrik Leppkes
> + *
> + * This file is part of FFmpeg.
> + *
> + * FFmpeg is free software; you can redistribute it and/or
> + * modify it under the terms of the GNU Lesser General Public
> + * License as published by the Free Software Foundation; either
> + * version 2.1 of the License, or (at your option) any later version.
> + *
> + * FFmpeg is distributed in the hope that it will be useful,
> + * but WITHOUT ANY WARRANTY; without even the implied warranty of
> + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> + * Lesser General Public License for more details.
> + *
> + * You should have received a copy of the GNU Lesser General Public
> + * License along with FFmpeg; if not, write to the Free Software
> + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
> + */
> +
> +#include "libavutil/avassert.h"
> +#include "libavutil/pixdesc.h"
> +
> +#include "dxva2_internal.h"
> +#include "av1dec.h"
> +
> +#define MAX_TILES 256
> +
> +struct AV1DXVAContext {
> +    FFDXVASharedContext shared;
> +
> +    unsigned int bitstream_allocated;
> +    uint8_t *bitstream_cache;
> +};
> +
> +struct av1_dxva2_picture_context {
> +    DXVA_PicParams_AV1    pp;
> +    unsigned              tile_count;
> +    DXVA_Tile_AV1         tiles[MAX_TILES];
> +    uint8_t              *bitstream;
> +    unsigned              bitstream_size;
> +};
> +
> +static int get_bit_depth_from_seq(const AV1RawSequenceHeader *seq)
> +{
> +    if (seq->seq_profile == 2 && seq->color_config.high_bitdepth)
> +        return seq->color_config.twelve_bit ? 12 : 10;
> +    else if (seq->seq_profile <= 2 && seq->color_config.high_bitdepth)
> +        return 10;
> +    else
> +        return 8;
> +}
> +
> +static int fill_picture_parameters(const AVCodecContext *avctx, AVDXVAContext *ctx, const AV1DecContext *h,
> +                                    DXVA_PicParams_AV1 *pp)
> +{
> +    int i,j, uses_lr;
> +    const AV1RawSequenceHeader *seq = h->raw_seq;
> +    const AV1RawFrameHeader *frame_header = h->raw_frame_header;
> +
> +    unsigned char remap_lr_type[4] = { AV1_RESTORE_NONE, AV1_RESTORE_SWITCHABLE, AV1_RESTORE_WIENER, AV1_RESTORE_SGRPROJ };
> +
> +    memset(pp, 0, sizeof(*pp));
> +
> +    pp->width  = avctx->width;
> +    pp->height = avctx->height;
> +
> +    pp->max_width  = seq->max_frame_width_minus_1 + 1;
> +    pp->max_height = seq->max_frame_height_minus_1 + 1;
> +
> +    pp->CurrPicTextureIndex = ff_dxva2_get_surface_index(avctx, ctx, h->cur_frame.tf.f);
> +    pp->superres_denom      = frame_header->use_superres ? frame_header->coded_denom : AV1_SUPERRES_NUM;
> +    pp->bitdepth            = get_bit_depth_from_seq(seq);
> +    pp->seq_profile         = seq->seq_profile;
> +
> +    /* Tiling info */
> +    pp->tiles.cols = frame_header->tile_cols;
> +    pp->tiles.rows = frame_header->tile_rows;
> +    pp->tiles.context_update_id = frame_header->context_update_tile_id;
> +
> +    for (i = 0; i < pp->tiles.cols; i++)
> +        pp->tiles.widths[i] = frame_header->width_in_sbs_minus_1[i] + 1;
> +
> +    for (i = 0; i < pp->tiles.rows; i++)
> +        pp->tiles.heights[i] = frame_header->height_in_sbs_minus_1[i] + 1;
> +
> +    /* Coding tools */
> +    pp->coding.use_128x128_superblock       = seq->use_128x128_superblock;
> +    pp->coding.intra_edge_filter            = seq->enable_intra_edge_filter;
> +    pp->coding.interintra_compound          = seq->enable_interintra_compound;
> +    pp->coding.masked_compound              = seq->enable_masked_compound;
> +    pp->coding.warped_motion                = frame_header->allow_warped_motion;
> +    pp->coding.dual_filter                  = seq->enable_dual_filter;
> +    pp->coding.jnt_comp                     = seq->enable_jnt_comp;
> +    pp->coding.screen_content_tools         = frame_header->allow_screen_content_tools;
> +    pp->coding.integer_mv                   = frame_header->force_integer_mv || !(frame_header->frame_type & 1);
> +    pp->coding.cdef                         = seq->enable_cdef;
> +    pp->coding.restoration                  = seq->enable_restoration;
> +    pp->coding.film_grain                   = seq->film_grain_params_present;
> +    pp->coding.intrabc                      = frame_header->allow_intrabc;
> +    pp->coding.high_precision_mv            = frame_header->allow_high_precision_mv;
> +    pp->coding.switchable_motion_mode       = frame_header->is_motion_mode_switchable;
> +    pp->coding.filter_intra                 = seq->enable_filter_intra;
> +    pp->coding.disable_frame_end_update_cdf = frame_header->disable_frame_end_update_cdf;
> +    pp->coding.disable_cdf_update           = frame_header->disable_cdf_update;
> +    pp->coding.reference_mode               = frame_header->reference_select;
> +    pp->coding.skip_mode                    = frame_header->skip_mode_present;
> +    pp->coding.reduced_tx_set               = frame_header->reduced_tx_set;
> +    pp->coding.superres                     = frame_header->use_superres;
> +    pp->coding.tx_mode                      = frame_header->tx_mode;
> +    pp->coding.use_ref_frame_mvs            = frame_header->use_ref_frame_mvs;
> +    pp->coding.enable_ref_frame_mvs         = seq->enable_ref_frame_mvs;
> +    pp->coding.reference_frame_update       = !(frame_header->show_existing_frame == 1 && frame_header->frame_type == AV1_FRAME_KEY);

hwaccel->start_frame() is not called for frames with
frame_header->show_existing_frame == 1 (those are essentially
just a header telling the decoder to output a previously decoded frame,
and maybe update the reference frame state), so that check is
superfluous, and by extension so is the whole expression. Just hardcode it to 1.

Is this field documented anywhere?

> +
> +    /* Format & Picture Info flags */
> +    pp->format.frame_type     = frame_header->frame_type;
> +    pp->format.show_frame     = frame_header->show_frame;
> +    pp->format.showable_frame = frame_header->showable_frame;
> +    pp->format.subsampling_x  = seq->color_config.subsampling_x;
> +    pp->format.subsampling_y  = seq->color_config.subsampling_y;
> +    pp->format.mono_chrome    = seq->color_config.mono_chrome;
> +
> +    /* References */
> +    pp->primary_ref_frame = frame_header->primary_ref_frame;
> +    pp->order_hint        = frame_header->order_hint;
> +    pp->order_hint_bits   = seq->enable_order_hint ? seq->order_hint_bits_minus_1 + 1 : 0;
> +
> +    memset(pp->RefFrameMapTextureIndex, 0xFF, sizeof(pp->RefFrameMapTextureIndex));
> +    for (i = 0; i < AV1_REFS_PER_FRAME; i++) {
> +        int8_t ref_idx = frame_header->ref_frame_idx[i];
> +        AVFrame *ref_frame = h->ref[ref_idx].tf.f;
> +
> +        pp->frame_refs[i].width  = ref_frame->width;
> +        pp->frame_refs[i].height = ref_frame->height;
> +        pp->frame_refs[i].Index  = ref_frame->buf[0] ? ref_idx : 0xFF;
> +
> +        /* Global Motion */
> +        pp->frame_refs[i].wminvalid = (h->cur_frame.gm_type[AV1_REF_FRAME_LAST + i] == AV1_WARP_MODEL_IDENTITY);
> +        pp->frame_refs[i].wmtype    = h->cur_frame.gm_type[AV1_REF_FRAME_LAST + i];
> +        for (j = 0; j < 6; ++j) {
> +             pp->frame_refs[i].wmmat[j] = h->cur_frame.gm_params[AV1_REF_FRAME_LAST + i][j];
> +        }
> +    }
> +    for (i = 0; i < AV1_NUM_REF_FRAMES; i++) {
> +        AVFrame *ref_frame = h->ref[i].tf.f;
> +        if (ref_frame->buf[0])
> +            pp->RefFrameMapTextureIndex[i] = ff_dxva2_get_surface_index(avctx, ctx, ref_frame);
> +    }
> +
> +    /* Loop filter parameters */
> +    pp->loop_filter.filter_level[0]        = frame_header->loop_filter_level[0];
> +    pp->loop_filter.filter_level[1]        = frame_header->loop_filter_level[1];
> +    pp->loop_filter.filter_level_u         = frame_header->loop_filter_level[2];
> +    pp->loop_filter.filter_level_v         = frame_header->loop_filter_level[3];
> +    pp->loop_filter.sharpness_level        = frame_header->loop_filter_sharpness;
> +    pp->loop_filter.mode_ref_delta_enabled = frame_header->loop_filter_delta_enabled;
> +    pp->loop_filter.mode_ref_delta_update  = frame_header->loop_filter_delta_update;
> +    pp->loop_filter.delta_lf_multi         = frame_header->delta_lf_multi;
> +    pp->loop_filter.delta_lf_present       = frame_header->delta_lf_present;
> +    pp->loop_filter.delta_lf_res           = frame_header->delta_lf_res;
> +
> +    for (i = 0; i < AV1_TOTAL_REFS_PER_FRAME; i++) {
> +        pp->loop_filter.ref_deltas[i] = frame_header->loop_filter_ref_deltas[i];
> +    }
> +
> +    pp->loop_filter.mode_deltas[0]                = frame_header->loop_filter_mode_deltas[0];
> +    pp->loop_filter.mode_deltas[1]                = frame_header->loop_filter_mode_deltas[1];
> +    pp->loop_filter.frame_restoration_type[0]     = remap_lr_type[frame_header->lr_type[0]];
> +    pp->loop_filter.frame_restoration_type[1]     = remap_lr_type[frame_header->lr_type[1]];
> +    pp->loop_filter.frame_restoration_type[2]     = remap_lr_type[frame_header->lr_type[2]];
> +    uses_lr = frame_header->lr_type[0] || frame_header->lr_type[1] || frame_header->lr_type[2];
> +    pp->loop_filter.log2_restoration_unit_size[0] = uses_lr ? (6 + frame_header->lr_unit_shift) : 8;
> +    pp->loop_filter.log2_restoration_unit_size[1] = uses_lr ? (6 + frame_header->lr_unit_shift - frame_header->lr_uv_shift) : 8;
> +    pp->loop_filter.log2_restoration_unit_size[2] = uses_lr ? (6 + frame_header->lr_unit_shift - frame_header->lr_uv_shift) : 8;
> +
> +    /* Quantization */
> +    pp->quantization.delta_q_present = frame_header->delta_q_present;
> +    pp->quantization.delta_q_res     = frame_header->delta_q_res;
> +    pp->quantization.base_qindex     = frame_header->base_q_idx;
> +    pp->quantization.y_dc_delta_q    = frame_header->delta_q_y_dc;
> +    pp->quantization.u_dc_delta_q    = frame_header->delta_q_u_dc;
> +    pp->quantization.v_dc_delta_q    = frame_header->delta_q_v_dc;
> +    pp->quantization.u_ac_delta_q    = frame_header->delta_q_u_ac;
> +    pp->quantization.v_ac_delta_q    = frame_header->delta_q_v_ac;
> +    pp->quantization.qm_y            = frame_header->using_qmatrix ? frame_header->qm_y : 0xFF;
> +    pp->quantization.qm_u            = frame_header->using_qmatrix ? frame_header->qm_u : 0xFF;
> +    pp->quantization.qm_v            = frame_header->using_qmatrix ? frame_header->qm_v : 0xFF;
> +
> +    /* Cdef parameters */
> +    pp->cdef.damping = frame_header->cdef_damping_minus_3;
> +    pp->cdef.bits    = frame_header->cdef_bits;
> +    for (i = 0; i < 8; i++) {
> +        pp->cdef.y_strengths[i].primary    = frame_header->cdef_y_pri_strength[i];
> +        pp->cdef.y_strengths[i].secondary  = frame_header->cdef_y_sec_strength[i];
> +        pp->cdef.uv_strengths[i].primary   = frame_header->cdef_uv_pri_strength[i];
> +        pp->cdef.uv_strengths[i].secondary = frame_header->cdef_uv_sec_strength[i];
> +    }
> +
> +    /* Misc flags */
> +    pp->interp_filter = frame_header->interpolation_filter;
> +
> +    /* Segmentation */
> +    pp->segmentation.enabled         = frame_header->segmentation_enabled;
> +    pp->segmentation.update_map      = frame_header->segmentation_update_map;
> +    pp->segmentation.update_data     = frame_header->segmentation_update_data;
> +    pp->segmentation.temporal_update = frame_header->segmentation_temporal_update;
> +    for (i = 0; i < AV1_MAX_SEGMENTS; i++) {
> +        for (j = 0; j < AV1_SEG_LVL_MAX; j++) {
> +            pp->segmentation.feature_mask[i].mask |= frame_header->feature_enabled[i][j] << j;
> +            pp->segmentation.feature_data[i][j]    = frame_header->feature_value[i][j];
> +        }
> +    }
> +
> +    /* Film grain */
> +    if (frame_header->apply_grain) {
> +        pp->film_grain.apply_grain              = 1;
> +        pp->film_grain.scaling_shift_minus8     = frame_header->grain_scaling_minus_8;
> +        pp->film_grain.chroma_scaling_from_luma = frame_header->chroma_scaling_from_luma;
> +        pp->film_grain.ar_coeff_lag             = frame_header->ar_coeff_lag;
> +        pp->film_grain.ar_coeff_shift_minus6    = frame_header->ar_coeff_shift_minus_6;
> +        pp->film_grain.grain_scale_shift        = frame_header->grain_scale_shift;
> +        pp->film_grain.overlap_flag             = frame_header->overlap_flag;
> +        pp->film_grain.clip_to_restricted_range = frame_header->clip_to_restricted_range;
> +        pp->film_grain.matrix_coeff_is_identity = (seq->color_config.matrix_coefficients == AVCOL_SPC_RGB);
> +
> +        pp->film_grain.grain_seed               = frame_header->grain_seed;
> +        pp->film_grain.num_y_points             = frame_header->num_y_points;
> +        for (i = 0; i < frame_header->num_y_points; i++) {
> +            pp->film_grain.scaling_points_y[i][0] = frame_header->point_y_value[i];
> +            pp->film_grain.scaling_points_y[i][1] = frame_header->point_y_scaling[i];
> +        }
> +        pp->film_grain.num_cb_points            = frame_header->num_cb_points;
> +        for (i = 0; i < frame_header->num_cb_points; i++) {
> +            pp->film_grain.scaling_points_cb[i][0] = frame_header->point_cb_value[i];
> +            pp->film_grain.scaling_points_cb[i][1] = frame_header->point_cb_scaling[i];
> +        }
> +        pp->film_grain.num_cr_points            = frame_header->num_cr_points;
> +        for (i = 0; i < frame_header->num_cr_points; i++) {
> +            pp->film_grain.scaling_points_cr[i][0] = frame_header->point_cr_value[i];
> +            pp->film_grain.scaling_points_cr[i][1] = frame_header->point_cr_scaling[i];
> +        }
> +        for (i = 0; i < 24; i++) {
> +            pp->film_grain.ar_coeffs_y[i] = frame_header->ar_coeffs_y_plus_128[i];
> +        }
> +        for (i = 0; i < 25; i++) {
> +            pp->film_grain.ar_coeffs_cb[i] = frame_header->ar_coeffs_cb_plus_128[i];
> +            pp->film_grain.ar_coeffs_cr[i] = frame_header->ar_coeffs_cr_plus_128[i];
> +        }
> +        pp->film_grain.cb_mult      = frame_header->cb_mult;
> +        pp->film_grain.cb_luma_mult = frame_header->cb_luma_mult;
> +        pp->film_grain.cr_mult      = frame_header->cr_mult;
> +        pp->film_grain.cr_luma_mult = frame_header->cr_luma_mult;
> +        pp->film_grain.cb_offset    = frame_header->cb_offset;
> +        pp->film_grain.cr_offset    = frame_header->cr_offset;
> +        pp->film_grain.cr_offset    = frame_header->cr_offset;
> +    }
> +
> +    // XXX: setting the StatusReportFeedbackNumber breaks decoding on some drivers
> +    // we never use the status reporting functionality, so just skip on that
> +    //pp->StatusReportFeedbackNumber = 1 + DXVA_CONTEXT_REPORT_ID(avctx, ctx)++;
> +    return 0;
> +}
> +
> +static int dxva2_av1_start_frame(AVCodecContext *avctx,
> +                                 av_unused const uint8_t *buffer,
> +                                 av_unused uint32_t size)
> +{
> +    const AV1DecContext *h = avctx->priv_data;
> +    AVDXVAContext *ctx = DXVA_CONTEXT(avctx);
> +    struct av1_dxva2_picture_context *ctx_pic = h->cur_frame.hwaccel_picture_private;
> +
> +    if (!DXVA_CONTEXT_VALID(avctx, ctx))
> +        return -1;
> +    av_assert0(ctx_pic);
> +
> +    /* Fill up DXVA_PicParams_AV1 */
> +    if (fill_picture_parameters(avctx, ctx, h, &ctx_pic->pp) < 0)
> +        return -1;
> +
> +    ctx_pic->bitstream_size = 0;
> +    ctx_pic->bitstream      = NULL;
> +    return 0;
> +}
> +
> +static int dxva2_av1_decode_slice(AVCodecContext *avctx,
> +                                  const uint8_t *buffer,
> +                                  uint32_t size)
> +{
> +    const AV1DecContext *h = avctx->priv_data;
> +    const AV1RawFrameHeader *frame_header = h->raw_frame_header;
> +    struct av1_dxva2_picture_context *ctx_pic = h->cur_frame.hwaccel_picture_private;
> +    struct AV1DXVAContext *ctx = avctx->internal->hwaccel_priv_data;
> +    void *tmp;
> +
> +    ctx_pic->tile_count = frame_header->tile_cols * frame_header->tile_rows;
> +
> +    /* too many tiles, exceeding all defined levels in the AV1 spec */
> +    if (ctx_pic->tile_count > MAX_TILES)
> +        return AVERROR(ENOSYS);
> +
> +    /* Shortcut if all tiles are in the same buffer */
> +    if (ctx_pic->tile_count == h->tg_end - h->tg_start + 1) {
> +        ctx_pic->bitstream = (uint8_t *)buffer;
> +        ctx_pic->bitstream_size = size;
> +
> +        for (uint32_t tile_num = 0; tile_num < ctx_pic->tile_count; tile_num++) {
> +            ctx_pic->tiles[tile_num].DataOffset   = h->tile_group_info[tile_num].tile_offset;
> +            ctx_pic->tiles[tile_num].DataSize     = h->tile_group_info[tile_num].tile_size;
> +            ctx_pic->tiles[tile_num].row          = h->tile_group_info[tile_num].tile_row;
> +            ctx_pic->tiles[tile_num].column       = h->tile_group_info[tile_num].tile_column;
> +            ctx_pic->tiles[tile_num].anchor_frame = 0xFF;
> +        }
> +
> +        return 0;
> +    }
> +
> +    /* allocate an internal buffer */
> +    tmp = av_fast_realloc(ctx->bitstream_cache, &ctx->bitstream_allocated,
> +                          ctx_pic->bitstream_size + size);
> +    if (!tmp) {
> +        return AVERROR(ENOMEM);
> +    }
> +    ctx_pic->bitstream = ctx->bitstream_cache = tmp;
> +
> +    memcpy(ctx_pic->bitstream + ctx_pic->bitstream_size, buffer, size);
> +
> +    for (uint32_t tile_num = h->tg_start; tile_num <= h->tg_end; tile_num++) {
> +        ctx_pic->tiles[tile_num].DataOffset   = ctx_pic->bitstream_size + h->tile_group_info[tile_num].tile_offset;
> +        ctx_pic->tiles[tile_num].DataSize     = h->tile_group_info[tile_num].tile_size;
> +        ctx_pic->tiles[tile_num].row          = h->tile_group_info[tile_num].tile_row;
> +        ctx_pic->tiles[tile_num].column       = h->tile_group_info[tile_num].tile_column;
> +        ctx_pic->tiles[tile_num].anchor_frame = 0xFF;
> +    }
> +
> +    ctx_pic->bitstream_size += size;
> +
> +    return 0;
> +}
> +
> +static int commit_bitstream_and_slice_buffer(AVCodecContext *avctx,
> +                                             DECODER_BUFFER_DESC *bs,
> +                                             DECODER_BUFFER_DESC *sc)
> +{
> +    const AV1DecContext *h = avctx->priv_data;
> +    AVDXVAContext *ctx = DXVA_CONTEXT(avctx);
> +    struct av1_dxva2_picture_context *ctx_pic = h->cur_frame.hwaccel_picture_private;
> +    void     *dxva_data_ptr;
> +    uint8_t  *dxva_data;
> +    unsigned dxva_size;
> +    unsigned padding;
> +    unsigned type;
> +
> +#if CONFIG_D3D11VA
> +    if (ff_dxva2_is_d3d11(avctx)) {
> +        type = D3D11_VIDEO_DECODER_BUFFER_BITSTREAM;
> +        if (FAILED(ID3D11VideoContext_GetDecoderBuffer(D3D11VA_CONTEXT(ctx)->video_context,
> +                                                       D3D11VA_CONTEXT(ctx)->decoder,
> +                                                       type,
> +                                                       &dxva_size, &dxva_data_ptr)))
> +            return -1;
> +    }
> +#endif
> +#if CONFIG_DXVA2
> +    if (avctx->pix_fmt == AV_PIX_FMT_DXVA2_VLD) {
> +        type = DXVA2_BitStreamDateBufferType;
> +        if (FAILED(IDirectXVideoDecoder_GetBuffer(DXVA2_CONTEXT(ctx)->decoder,
> +                                                  type,
> +                                                  &dxva_data_ptr, &dxva_size)))
> +            return -1;
> +    }
> +#endif
> +
> +    dxva_data = dxva_data_ptr;
> +
> +    if (ctx_pic->bitstream_size > dxva_size) {
> +        av_log(avctx, AV_LOG_ERROR, "Bitstream size exceeds hardware buffer");
> +        return -1;
> +    }
> +
> +    memcpy(dxva_data, ctx_pic->bitstream, ctx_pic->bitstream_size);

This is a memcpy after a (potential) memcpy in dxva2_av1_decode_slice(). 
Is there no way to avoid it?

> +
> +    padding = FFMIN(128 - ((ctx_pic->bitstream_size) & 127), dxva_size - ctx_pic->bitstream_size);
> +    if (padding > 0) {
> +        memset(dxva_data + ctx_pic->bitstream_size, 0, padding);
> +        ctx_pic->bitstream_size += padding;
> +    }
> +
> +#if CONFIG_D3D11VA
> +    if (ff_dxva2_is_d3d11(avctx))
> +        if (FAILED(ID3D11VideoContext_ReleaseDecoderBuffer(D3D11VA_CONTEXT(ctx)->video_context, D3D11VA_CONTEXT(ctx)->decoder, type)))
> +            return -1;
> +#endif
> +#if CONFIG_DXVA2
> +    if (avctx->pix_fmt == AV_PIX_FMT_DXVA2_VLD)
> +        if (FAILED(IDirectXVideoDecoder_ReleaseBuffer(DXVA2_CONTEXT(ctx)->decoder, type)))
> +            return -1;
> +#endif
> +
> +#if CONFIG_D3D11VA
> +    if (ff_dxva2_is_d3d11(avctx)) {
> +        D3D11_VIDEO_DECODER_BUFFER_DESC *dsc11 = bs;
> +        memset(dsc11, 0, sizeof(*dsc11));
> +        dsc11->BufferType           = type;
> +        dsc11->DataSize             = ctx_pic->bitstream_size;
> +        dsc11->NumMBsInBuffer       = 0;
> +
> +        type = D3D11_VIDEO_DECODER_BUFFER_SLICE_CONTROL;
> +    }
> +#endif
> +#if CONFIG_DXVA2
> +    if (avctx->pix_fmt == AV_PIX_FMT_DXVA2_VLD) {
> +        DXVA2_DecodeBufferDesc *dsc2 = bs;
> +        memset(dsc2, 0, sizeof(*dsc2));
> +        dsc2->CompressedBufferType = type;
> +        dsc2->DataSize             = ctx_pic->bitstream_size;
> +        dsc2->NumMBsInBuffer       = 0;
> +
> +        type = DXVA2_SliceControlBufferType;
> +    }
> +#endif
> +
> +    return ff_dxva2_commit_buffer(avctx, ctx, sc, type,
> +                                  ctx_pic->tiles, sizeof(*ctx_pic->tiles) * ctx_pic->tile_count, 0);
> +}
> +
> +static int dxva2_av1_end_frame(AVCodecContext *avctx)
> +{
> +    const AV1DecContext *h = avctx->priv_data;
> +    struct av1_dxva2_picture_context *ctx_pic = h->cur_frame.hwaccel_picture_private;
> +    int ret;
> +
> +    if (ctx_pic->bitstream_size <= 0)
> +        return -1;
> +
> +    ret = ff_dxva2_common_end_frame(avctx, h->cur_frame.tf.f,
> +                                    &ctx_pic->pp, sizeof(ctx_pic->pp),
> +                                    NULL, 0,
> +                                    commit_bitstream_and_slice_buffer);
> +
> +    return ret;
> +}
> +
> +static int dxva2_av1_uninit(AVCodecContext *avctx)
> +{
> +    struct AV1DXVAContext *ctx = avctx->internal->hwaccel_priv_data;
> +
> +    av_freep(&ctx->bitstream_cache);
> +    ctx->bitstream_allocated = 0;
> +
> +    return ff_dxva2_decode_uninit(avctx);
> +}
> +
> +#if CONFIG_AV1_DXVA2_HWACCEL
> +const AVHWAccel ff_av1_dxva2_hwaccel = {
> +    .name           = "av1_dxva2",
> +    .type           = AVMEDIA_TYPE_VIDEO,
> +    .id             = AV_CODEC_ID_AV1,
> +    .pix_fmt        = AV_PIX_FMT_DXVA2_VLD,
> +    .init           = ff_dxva2_decode_init,
> +    .uninit         = dxva2_av1_uninit,
> +    .start_frame    = dxva2_av1_start_frame,
> +    .decode_slice   = dxva2_av1_decode_slice,
> +    .end_frame      = dxva2_av1_end_frame,
> +    .frame_params   = ff_dxva2_common_frame_params,
> +    .frame_priv_data_size = sizeof(struct av1_dxva2_picture_context),
> +    .priv_data_size = sizeof(struct AV1DXVAContext),
> +};
> +#endif
> +
> +#if CONFIG_AV1_D3D11VA_HWACCEL
> +const AVHWAccel ff_av1_d3d11va_hwaccel = {
> +    .name           = "av1_d3d11va",
> +    .type           = AVMEDIA_TYPE_VIDEO,
> +    .id             = AV_CODEC_ID_AV1,
> +    .pix_fmt        = AV_PIX_FMT_D3D11VA_VLD,
> +    .init           = ff_dxva2_decode_init,
> +    .uninit         = dxva2_av1_uninit,
> +    .start_frame    = dxva2_av1_start_frame,
> +    .decode_slice   = dxva2_av1_decode_slice,
> +    .end_frame      = dxva2_av1_end_frame,
> +    .frame_params   = ff_dxva2_common_frame_params,
> +    .frame_priv_data_size = sizeof(struct av1_dxva2_picture_context),
> +    .priv_data_size = sizeof(struct AV1DXVAContext),
> +};
> +#endif
> +
> +#if CONFIG_AV1_D3D11VA2_HWACCEL
> +const AVHWAccel ff_av1_d3d11va2_hwaccel = {
> +    .name           = "av1_d3d11va2",
> +    .type           = AVMEDIA_TYPE_VIDEO,
> +    .id             = AV_CODEC_ID_AV1,
> +    .pix_fmt        = AV_PIX_FMT_D3D11,
> +    .init           = ff_dxva2_decode_init,
> +    .uninit         = dxva2_av1_uninit,
> +    .start_frame    = dxva2_av1_start_frame,
> +    .decode_slice   = dxva2_av1_decode_slice,
> +    .end_frame      = dxva2_av1_end_frame,
> +    .frame_params   = ff_dxva2_common_frame_params,
> +    .frame_priv_data_size = sizeof(struct av1_dxva2_picture_context),
> +    .priv_data_size = sizeof(struct AV1DXVAContext),
> +};
> +#endif
> diff --git a/libavcodec/hwaccels.h b/libavcodec/hwaccels.h
> index 18e9079c55..9869ce9f72 100644
> --- a/libavcodec/hwaccels.h
> +++ b/libavcodec/hwaccels.h
> @@ -21,6 +21,9 @@
>   
>   #include "avcodec.h"
>   
> +extern const AVHWAccel ff_av1_d3d11va_hwaccel;
> +extern const AVHWAccel ff_av1_d3d11va2_hwaccel;
> +extern const AVHWAccel ff_av1_dxva2_hwaccel;
>   extern const AVHWAccel ff_av1_vaapi_hwaccel;
>   extern const AVHWAccel ff_h263_vaapi_hwaccel;
>   extern const AVHWAccel ff_h263_videotoolbox_hwaccel;
> diff --git a/libavcodec/version.h b/libavcodec/version.h
> index 5173d0f090..a595e32832 100644
> --- a/libavcodec/version.h
> +++ b/libavcodec/version.h
> @@ -29,7 +29,7 @@
>   
>   #define LIBAVCODEC_VERSION_MAJOR  58
>   #define LIBAVCODEC_VERSION_MINOR 112
> -#define LIBAVCODEC_VERSION_MICRO 101
> +#define LIBAVCODEC_VERSION_MICRO 102
>   
>   #define LIBAVCODEC_VERSION_INT  AV_VERSION_INT(LIBAVCODEC_VERSION_MAJOR, \
>                                                  LIBAVCODEC_VERSION_MINOR, \
>
Hendrik Leppkes Nov. 5, 2020, 4:55 p.m. UTC | #3
On Thu, Nov 5, 2020 at 5:45 PM James Almer <jamrial@gmail.com> wrote:
> > +    pp->coding.reference_frame_update       = !(frame_header->show_existing_frame == 1 && frame_header->frame_type == AV1_FRAME_KEY);
>
> hwaccel->start_frame() is not called for
> frame_header->show_existing_frame == 1 frames (Those are essentially
> just a header telling the decoder to output a previously decoded frame,
> and maybe update the reference frame state), so that check is
> superfluous, and by extension the whole thing. Just hardcode it to 1.
>
> Is this field documented anywhere?
>

DXVA Spec:
"Indicates that the reference frame update process as specified by
section 7.20 of the specification should be performed after decoding
this frame.  Otherwise section 7.21 should be performed."

AV1 Spec (7.4):
"Otherwise (show_existing_frame is equal to 1), if frame_type is equal
to KEY_FRAME, the reference frame loading process as specified in
section 7.21 is invoked"

That was my interpretation of that flag, and also matches the
implementation that was provided to dav1d a few months ago by one of
the authors of the DXVA AV1 spec.
I can hardcode it, but I also don't see the harm in keeping it as-is
for documentary purposes.

> > +
> > +    if (ctx_pic->bitstream_size > dxva_size) {
> > +        av_log(avctx, AV_LOG_ERROR, "Bitstream size exceeds hardware buffer");
> > +        return -1;
> > +    }
> > +
> > +    memcpy(dxva_data, ctx_pic->bitstream, ctx_pic->bitstream_size);
>
> This is a memcpy after a (potential) memcpy in dxva2_av1_decode_slice().
> Is there no way to avoid it?
>

That depends on the behavior of av1dec. Are all tile groups guaranteed
to be available at the same time, so that I can simply save pointers
to them, and copy them into the final buffer later?
Or could the first one become invalid? In which case I need to copy
its data. The call order is not strict enough to allow the buffer
management to keep the hardware buffer mapped persistently.

- Hendrik
James Almer Nov. 5, 2020, 6:11 p.m. UTC | #4
On 11/5/2020 1:55 PM, Hendrik Leppkes wrote:
> On Thu, Nov 5, 2020 at 5:45 PM James Almer <jamrial@gmail.com> wrote:
>>> +    pp->coding.reference_frame_update       = !(frame_header->show_existing_frame == 1 && frame_header->frame_type == AV1_FRAME_KEY);
>>
>> hwaccel->start_frame() is not called for
>> frame_header->show_existing_frame == 1 frames (Those are essentially
>> just a header telling the decoder to output a previously decoded frame,
>> and maybe update the reference frame state), so that check is
>> superfluous, and by extension the whole thing. Just hardcode it to 1.
>>
>> Is this field documented anywhere?
>>
> 
> DXVA Spec:
> "Indicates that the reference frame update process as specified by
> section 7.20 of the specification should be performed after decoding
> this frame.  Otherwise section 7.21 should be performed."
> 
> AV1 Spec (7.4):
> "Otherwise (show_existing_frame is equal to 1), if frame_type is equal
> to KEY_FRAME, the reference frame loading process as specified in
> section 7.21 is invoked"
> 
> That was my interpretation of that flag, and also matches the
> implementation that was provided to dav1d a few months ago by one of
> the authors of the DXVA AV1 spec.
> I can hardcode it, but I also don't see the harm in keeping it as-is
> for documentary purposes.
> 
>>> +
>>> +    if (ctx_pic->bitstream_size > dxva_size) {
>>> +        av_log(avctx, AV_LOG_ERROR, "Bitstream size exceeds hardware buffer");
>>> +        return -1;
>>> +    }
>>> +
>>> +    memcpy(dxva_data, ctx_pic->bitstream, ctx_pic->bitstream_size);
>>
>> This is a memcpy after a (potential) memcpy in dxva2_av1_decode_slice().
>> Is there no way to avoid it?
>>
> 
> That depends on the behavior of av1dec. Are all tile groups guaranteed
> to be available at the same time, so that I can simply save pointers
> to them, and copy them into the final buffer later?

Not currently, in theory, but can be easily done by storing a reference 
to the AVBufferRef in each TileGroupInfo.

That being said, I don't know if the decoder can handle split/incomplete
Temporal Units as is. I guess it's a matter of injecting the
av1_frame_split bsf and seeing how badly it explodes.
If we enforce Temporal Units to be fully contained in a packet (As they 
are muxed into ivf, mp4, webm, annexb, etc), or at least individual 
Frames (Header + Tile Group/s combinations), we could also avoid the 
memcpy in dxva2_av1_decode_slice().

> Or could the first one become invalid? In which case I need to copy
> its data. The call order is not strict enough to allow the buffer
> management to keep the hardware buffer mapped persistently.
> 
> - Hendrik
> _______________________________________________
> ffmpeg-devel mailing list
> ffmpeg-devel@ffmpeg.org
> https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
> 
> To unsubscribe, visit link above, or email
> ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".
>
Hendrik Leppkes Nov. 5, 2020, 6:29 p.m. UTC | #5
On Thu, Nov 5, 2020 at 7:12 PM James Almer <jamrial@gmail.com> wrote:
> >
> > That depends on the behavior of av1dec. Are all tile groups guaranteed
> > to be available at the same time, so that I can simply save pointers
> > to them, and copy them into the final buffer later?
>
> Not currently, in theory, but can be easily done by storing a reference
> to the AVBufferRef in each TileGroupInfo.
>
> That being said, I don't know if the decoder can handle split/incomplete
> Temporal Units as is. I guess it's a matter of injecting the
> av1_frame_split bsf and seeing how badly it explodes.
> If we enforce Temporal Units to be fully contained in a packet (As they
> are muxed into ivf, mp4, webm, annexb, etc), or at least individual
> Frames (Header + Tile Group/s combinations), we could also avoid the
> memcpy in dxva2_av1_decode_slice().

One memcpy will always have to remain, from the packet to the hardware
buffer. If we can guarantee that all tile groups are present at the
same time, then I could copy them straight to the hardware buffer
during end_frame, instead of buffering them.
But then tile groups are not that common, so I wouldn't see this as a
big blocker and we can clean that up when av1dec can ensure the
presence of the data.

- Hendrik
Mark Thompson Nov. 6, 2020, 12:27 a.m. UTC | #6
On 05/11/2020 15:53, Hendrik Leppkes wrote:
> ---
>   Changelog              |   1 +
>   configure              |   7 +
>   libavcodec/Makefile    |   2 +
>   libavcodec/av1dec.c    |  25 +-
>   libavcodec/dxva2.c     |  10 +-
>   libavcodec/dxva2_av1.c | 504 +++++++++++++++++++++++++++++++++++++++++
>   libavcodec/hwaccels.h  |   3 +
>   libavcodec/version.h   |   2 +-
>   8 files changed, 550 insertions(+), 4 deletions(-)
>   create mode 100644 libavcodec/dxva2_av1.c
> 
> ...
> diff --git a/libavcodec/dxva2_av1.c b/libavcodec/dxva2_av1.c
> new file mode 100644
> index 0000000000..d04c96becf
> --- /dev/null
> +++ b/libavcodec/dxva2_av1.c
> @@ -0,0 +1,504 @@
> ...
> +    // XXX: setting the StatusReportFeedbackNumber breaks decoding on some drivers
> +    // we never use the status reporting functionality, so just skip on that
> +    //pp->StatusReportFeedbackNumber = 1 + DXVA_CONTEXT_REPORT_ID(avctx, ctx)++;

If it's worth mentioning this at all then it would probably help to note exactly what drivers are breaking.

> +    return 0;
> +}
> +

Is there an Intel implementation of this yet?  Maybe poke one of the Intel people to try it if possible.

I don't have anything to test this, but from just reading through it looks pretty good to me.

Thanks,

- Mark
Hendrik Leppkes Nov. 6, 2020, 8:39 a.m. UTC | #7
On Fri, Nov 6, 2020 at 2:29 AM Mark Thompson <sw@jkqxz.net> wrote:
>
> On 05/11/2020 15:53, Hendrik Leppkes wrote:
> > ---
> >   Changelog              |   1 +
> >   configure              |   7 +
> >   libavcodec/Makefile    |   2 +
> >   libavcodec/av1dec.c    |  25 +-
> >   libavcodec/dxva2.c     |  10 +-
> >   libavcodec/dxva2_av1.c | 504 +++++++++++++++++++++++++++++++++++++++++
> >   libavcodec/hwaccels.h  |   3 +
> >   libavcodec/version.h   |   2 +-
> >   8 files changed, 550 insertions(+), 4 deletions(-)
> >   create mode 100644 libavcodec/dxva2_av1.c
> >
> > ...
> > diff --git a/libavcodec/dxva2_av1.c b/libavcodec/dxva2_av1.c
> > new file mode 100644
> > index 0000000000..d04c96becf
> > --- /dev/null
> > +++ b/libavcodec/dxva2_av1.c
> > @@ -0,0 +1,504 @@
> > ...
> > +    // XXX: setting the StatusReportFeedbackNumber breaks decoding on some drivers
> > +    // we never use the status reporting functionality, so just skip on that
> > +    //pp->StatusReportFeedbackNumber = 1 + DXVA_CONTEXT_REPORT_ID(avctx, ctx)++;
>
> If it's worth mentioning this at all then it would probably help to note exactly what drivers are breaking.

The status report functionality is a bit arcane and I have never seen
a proper use of it yet, but historically we have always set those
numbers just in case someone wants to use it, and I chased weird
breakage for a day until I found the reason, so the comment is more for
the benefit of anyone who ends up in the same situation.
I could document which exact driver this was with, but on the other
hand it is unlikely to ever get enabled because it is just not used.

>
> > +    return 0;
> > +}
> > +
>
> Is there an Intel implementation of this yet?  Maybe poke one of the Intel people to try it if possible.
>

I'm currently in contact with someone from Intel to test the patch on
their Gen12 GPUs.

- Hendrik
diff mbox series

Patch

diff --git a/Changelog b/Changelog
index 3fdcafc355..886e69a1cc 100644
--- a/Changelog
+++ b/Changelog
@@ -40,6 +40,7 @@  version <next>:
 - High Voltage Software ADPCM encoder
 - LEGO Racers ALP (.tun & .pcm) muxer
 - AV1 VAAPI decoder
+- DXVA2/D3D11VA hardware accelerated AV1 decoding
 
 
 version 4.3:
diff --git a/configure b/configure
index 8a9e9b3cd7..e55e910477 100755
--- a/configure
+++ b/configure
@@ -2918,6 +2918,12 @@  videotoolbox_hwaccel_deps="videotoolbox pthreads"
 videotoolbox_hwaccel_extralibs="-framework QuartzCore"
 xvmc_deps="X11_extensions_XvMClib_h"
 
+av1_d3d11va_hwaccel_deps="d3d11va DXVA_PicParams_AV1"
+av1_d3d11va_hwaccel_select="av1_decoder"
+av1_d3d11va2_hwaccel_deps="d3d11va DXVA_PicParams_AV1"
+av1_d3d11va2_hwaccel_select="av1_decoder"
+av1_dxva2_hwaccel_deps="dxva2 DXVA_PicParams_AV1"
+av1_dxva2_hwaccel_select="av1_decoder"
 av1_vaapi_hwaccel_deps="vaapi VADecPictureParameterBufferAV1_bit_depth_idx"
 av1_vaapi_hwaccel_select="av1_decoder"
 h263_vaapi_hwaccel_deps="vaapi"
@@ -6203,6 +6209,7 @@  enabled videotoolbox && {
 
 check_struct "sys/time.h sys/resource.h" "struct rusage" ru_maxrss
 
+check_type "windows.h dxva.h" "DXVA_PicParams_AV1" -DWINAPI_FAMILY=WINAPI_FAMILY_DESKTOP_APP -D_CRT_BUILD_DESKTOP_APP=0
 check_type "windows.h dxva.h" "DXVA_PicParams_HEVC" -DWINAPI_FAMILY=WINAPI_FAMILY_DESKTOP_APP -D_CRT_BUILD_DESKTOP_APP=0
 check_type "windows.h dxva.h" "DXVA_PicParams_VP9" -DWINAPI_FAMILY=WINAPI_FAMILY_DESKTOP_APP -D_CRT_BUILD_DESKTOP_APP=0
 check_type "windows.h d3d11.h" "ID3D11VideoDecoder"
diff --git a/libavcodec/Makefile b/libavcodec/Makefile
index 9d75dd68af..505960df0a 100644
--- a/libavcodec/Makefile
+++ b/libavcodec/Makefile
@@ -912,6 +912,8 @@  OBJS-$(CONFIG_VAAPI)                      += vaapi_decode.o
 OBJS-$(CONFIG_VIDEOTOOLBOX)               += videotoolbox.o
 OBJS-$(CONFIG_VDPAU)                      += vdpau.o
 
+OBJS-$(CONFIG_AV1_D3D11VA_HWACCEL)        += dxva2_av1.o
+OBJS-$(CONFIG_AV1_DXVA2_HWACCEL)          += dxva2_av1.o
 OBJS-$(CONFIG_AV1_VAAPI_HWACCEL)          += vaapi_av1.o
 OBJS-$(CONFIG_H263_VAAPI_HWACCEL)         += vaapi_mpeg4.o
 OBJS-$(CONFIG_H263_VIDEOTOOLBOX_HWACCEL)  += videotoolbox.o
diff --git a/libavcodec/av1dec.c b/libavcodec/av1dec.c
index 56712279aa..01cf92fab5 100644
--- a/libavcodec/av1dec.c
+++ b/libavcodec/av1dec.c
@@ -215,7 +215,7 @@  static int get_pixel_format(AVCodecContext *avctx)
     uint8_t bit_depth;
     int ret;
     enum AVPixelFormat pix_fmt = AV_PIX_FMT_NONE;
-#define HWACCEL_MAX (CONFIG_AV1_VAAPI_HWACCEL)
+#define HWACCEL_MAX (CONFIG_AV1_DXVA2_HWACCEL + CONFIG_AV1_D3D11VA_HWACCEL * 2 + CONFIG_AV1_VAAPI_HWACCEL)
     enum AVPixelFormat pix_fmts[HWACCEL_MAX + 2], *fmtp = pix_fmts;
 
     if (seq->seq_profile == 2 && seq->color_config.high_bitdepth)
@@ -278,11 +278,25 @@  static int get_pixel_format(AVCodecContext *avctx)
 
     switch (s->pix_fmt) {
     case AV_PIX_FMT_YUV420P:
+#if CONFIG_AV1_DXVA2_HWACCEL
+        *fmtp++ = AV_PIX_FMT_DXVA2_VLD;
+#endif
+#if CONFIG_AV1_D3D11VA_HWACCEL
+        *fmtp++ = AV_PIX_FMT_D3D11VA_VLD;
+        *fmtp++ = AV_PIX_FMT_D3D11;
+#endif
 #if CONFIG_AV1_VAAPI_HWACCEL
         *fmtp++ = AV_PIX_FMT_VAAPI;
 #endif
         break;
     case AV_PIX_FMT_YUV420P10:
+#if CONFIG_AV1_DXVA2_HWACCEL
+        *fmtp++ = AV_PIX_FMT_DXVA2_VLD;
+#endif
+#if CONFIG_AV1_D3D11VA_HWACCEL
+        *fmtp++ = AV_PIX_FMT_D3D11VA_VLD;
+        *fmtp++ = AV_PIX_FMT_D3D11;
+#endif
 #if CONFIG_AV1_VAAPI_HWACCEL
         *fmtp++ = AV_PIX_FMT_VAAPI;
 #endif
@@ -853,6 +867,15 @@  AVCodec ff_av1_decoder = {
     .flush                 = av1_decode_flush,
     .profiles              = NULL_IF_CONFIG_SMALL(ff_av1_profiles),
     .hw_configs            = (const AVCodecHWConfigInternal * []) {
+#if CONFIG_AV1_DXVA2_HWACCEL
+        HWACCEL_DXVA2(av1),
+#endif
+#if CONFIG_AV1_D3D11VA_HWACCEL
+        HWACCEL_D3D11VA(av1),
+#endif
+#if CONFIG_AV1_D3D11VA2_HWACCEL
+        HWACCEL_D3D11VA2(av1),
+#endif
 #if CONFIG_AV1_VAAPI_HWACCEL
         HWACCEL_VAAPI(av1),
 #endif
diff --git a/libavcodec/dxva2.c b/libavcodec/dxva2.c
index 32416112bf..b57ea21941 100644
--- a/libavcodec/dxva2.c
+++ b/libavcodec/dxva2.c
@@ -45,6 +45,7 @@  DEFINE_GUID(ff_DXVA2_ModeHEVC_VLD_Main,  0x5b11d51b, 0x2f4c,0x4452,0xbc,0xc3,0x0
 DEFINE_GUID(ff_DXVA2_ModeHEVC_VLD_Main10,0x107af0e0, 0xef1a,0x4d19,0xab,0xa8,0x67,0xa1,0x63,0x07,0x3d,0x13);
 DEFINE_GUID(ff_DXVA2_ModeVP9_VLD_Profile0,0x463707f8,0xa1d0,0x4585,0x87,0x6d,0x83,0xaa,0x6d,0x60,0xb8,0x9e);
 DEFINE_GUID(ff_DXVA2_ModeVP9_VLD_10bit_Profile2,0xa4c749ef,0x6ecf,0x48aa,0x84,0x48,0x50,0xa7,0xa1,0x16,0x5f,0xf7);
+DEFINE_GUID(ff_DXVA2_ModeAV1_VLD_Profile0,0xb8be4ccb,0xcf53,0x46ba,0x8d,0x59,0xd6,0xb8,0xa6,0xda,0x5d,0x2a);
 DEFINE_GUID(ff_DXVA2_NoEncrypt,          0x1b81beD0, 0xa0c7,0x11d3,0xb9,0x84,0x00,0xc0,0x4f,0x2e,0x73,0xc5);
 DEFINE_GUID(ff_GUID_NULL,                0x00000000, 0x0000,0x0000,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00);
 DEFINE_GUID(ff_IID_IDirectXVideoDecoderService, 0xfc51a551,0xd5e7,0x11d9,0xaf,0x55,0x00,0x05,0x4e,0x43,0xff,0x02);
@@ -72,6 +73,8 @@  static const int prof_vp9_profile0[] = {FF_PROFILE_VP9_0,
                                         FF_PROFILE_UNKNOWN};
 static const int prof_vp9_profile2[] = {FF_PROFILE_VP9_2,
                                         FF_PROFILE_UNKNOWN};
+static const int prof_av1_profile0[] = {FF_PROFILE_AV1_MAIN,
+                                        FF_PROFILE_UNKNOWN};
 
 static const dxva_mode dxva_modes[] = {
     /* MPEG-2 */
@@ -98,6 +101,9 @@  static const dxva_mode dxva_modes[] = {
     { &ff_DXVA2_ModeVP9_VLD_Profile0,       AV_CODEC_ID_VP9, prof_vp9_profile0 },
     { &ff_DXVA2_ModeVP9_VLD_10bit_Profile2, AV_CODEC_ID_VP9, prof_vp9_profile2 },
 
+    /* AV1 */
+    { &ff_DXVA2_ModeAV1_VLD_Profile0,       AV_CODEC_ID_AV1, prof_av1_profile0 },
+
     { NULL,                          0 },
 };
 
@@ -604,7 +610,7 @@  int ff_dxva2_common_frame_params(AVCodecContext *avctx,
         surface_alignment = 32;
     /* the HEVC DXVA2 spec asks for 128 pixel aligned surfaces to ensure
     all coding features have enough room to work with */
-    else if (avctx->codec_id == AV_CODEC_ID_HEVC)
+    else if (avctx->codec_id == AV_CODEC_ID_HEVC || avctx->codec_id == AV_CODEC_ID_AV1)
         surface_alignment = 128;
     else
         surface_alignment = 16;
@@ -615,7 +621,7 @@  int ff_dxva2_common_frame_params(AVCodecContext *avctx,
     /* add surfaces based on number of possible refs */
     if (avctx->codec_id == AV_CODEC_ID_H264 || avctx->codec_id == AV_CODEC_ID_HEVC)
         num_surfaces += 16;
-    else if (avctx->codec_id == AV_CODEC_ID_VP9)
+    else if (avctx->codec_id == AV_CODEC_ID_VP9 || avctx->codec_id == AV_CODEC_ID_AV1)
         num_surfaces += 8;
     else
         num_surfaces += 2;
diff --git a/libavcodec/dxva2_av1.c b/libavcodec/dxva2_av1.c
new file mode 100644
index 0000000000..d04c96becf
--- /dev/null
+++ b/libavcodec/dxva2_av1.c
@@ -0,0 +1,504 @@ 
+/*
+ * DXVA2 AV1 HW acceleration.
+ *
+ * copyright (c) 2020 Hendrik Leppkes
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "libavutil/avassert.h"
+#include "libavutil/pixdesc.h"
+
+#include "dxva2_internal.h"
+#include "av1dec.h"
+
+#define MAX_TILES 256
+
+struct AV1DXVAContext { /* hwaccel private data of the AV1 DXVA2/D3D11VA hwaccels (used as their priv_data_size) */
+    FFDXVASharedContext shared; /* NOTE(review): presumably must stay the first member so the generic dxva2 code can use this context -- confirm in dxva2_internal.h */
+
+    unsigned int bitstream_allocated; /* presumably the allocated size of bitstream_cache in bytes -- confirm; reset to 0 at uninit */
+    uint8_t *bitstream_cache; /* heap buffer owned by this context; released with av_freep() at uninit */
+};
+
+struct av1_dxva2_picture_context { /* per-frame hwaccel state (used as frame_priv_data_size) */
+    DXVA_PicParams_AV1    pp; /* picture parameters handed to ff_dxva2_common_end_frame() */
+    unsigned              tile_count; /* number of entries of tiles[] committed as the slice control buffer */
+    DXVA_Tile_AV1         tiles[MAX_TILES]; /* tile entries, at most MAX_TILES */
+    uint8_t              *bitstream; /* tile data that is copied into the hardware bitstream buffer at commit time */
+    unsigned              bitstream_size; /* bytes available at bitstream; increased by the zero padding added at commit time */
+};
+
+static int get_bit_depth_from_seq(const AV1RawSequenceHeader *seq)
+{ /* Derive the bit depth (8, 10 or 12) from the sequence header's profile and color_config flags. */
+    if (seq->seq_profile == 2 && seq->color_config.high_bitdepth)
+        return seq->color_config.twelve_bit ? 12 : 10; /* only profile 2 consults twelve_bit */
+    else if (seq->seq_profile <= 2 && seq->color_config.high_bitdepth)
+        return 10; /* reached for profiles 0 and 1 only; profile 2 with high_bitdepth is handled above */
+    else
+        return 8; /* high_bitdepth not set, or seq_profile greater than 2 */
+}
+
+static int fill_picture_parameters(const AVCodecContext *avctx, AVDXVAContext *ctx, const AV1DecContext *h,
+                                    DXVA_PicParams_AV1 *pp)
+{
+    int i,j, uses_lr;
+    const AV1RawSequenceHeader *seq = h->raw_seq;
+    const AV1RawFrameHeader *frame_header = h->raw_frame_header;
+
+    unsigned char remap_lr_type[4] = { AV1_RESTORE_NONE, AV1_RESTORE_SWITCHABLE, AV1_RESTORE_WIENER, AV1_RESTORE_SGRPROJ };
+
+    memset(pp, 0, sizeof(*pp));
+
+    pp->width  = avctx->width;
+    pp->height = avctx->height;
+
+    pp->max_width  = seq->max_frame_width_minus_1 + 1;
+    pp->max_height = seq->max_frame_height_minus_1 + 1;
+
+    pp->CurrPicTextureIndex = ff_dxva2_get_surface_index(avctx, ctx, h->cur_frame.tf.f);
+    pp->superres_denom      = frame_header->use_superres ? frame_header->coded_denom : AV1_SUPERRES_NUM;
+    pp->bitdepth            = get_bit_depth_from_seq(seq);
+    pp->seq_profile         = seq->seq_profile;
+
+    /* Tiling info */
+    pp->tiles.cols = frame_header->tile_cols;
+    pp->tiles.rows = frame_header->tile_rows;
+    pp->tiles.context_update_id = frame_header->context_update_tile_id;
+
+    for (i = 0; i < pp->tiles.cols; i++)
+        pp->tiles.widths[i] = frame_header->width_in_sbs_minus_1[i] + 1;
+
+    for (i = 0; i < pp->tiles.rows; i++)
+        pp->tiles.heights[i] = frame_header->height_in_sbs_minus_1[i] + 1;
+
+    /* Coding tools */
+    pp->coding.use_128x128_superblock       = seq->use_128x128_superblock;
+    pp->coding.intra_edge_filter            = seq->enable_intra_edge_filter;
+    pp->coding.interintra_compound          = seq->enable_interintra_compound;
+    pp->coding.masked_compound              = seq->enable_masked_compound;
+    pp->coding.warped_motion                = frame_header->allow_warped_motion;
+    pp->coding.dual_filter                  = seq->enable_dual_filter;
+    pp->coding.jnt_comp                     = seq->enable_jnt_comp;
+    pp->coding.screen_content_tools         = frame_header->allow_screen_content_tools;
+    pp->coding.integer_mv                   = frame_header->force_integer_mv || !(frame_header->frame_type & 1);
+    pp->coding.cdef                         = seq->enable_cdef;
+    pp->coding.restoration                  = seq->enable_restoration;
+    pp->coding.film_grain                   = seq->film_grain_params_present;
+    pp->coding.intrabc                      = frame_header->allow_intrabc;
+    pp->coding.high_precision_mv            = frame_header->allow_high_precision_mv;
+    pp->coding.switchable_motion_mode       = frame_header->is_motion_mode_switchable;
+    pp->coding.filter_intra                 = seq->enable_filter_intra;
+    pp->coding.disable_frame_end_update_cdf = frame_header->disable_frame_end_update_cdf;
+    pp->coding.disable_cdf_update           = frame_header->disable_cdf_update;
+    pp->coding.reference_mode               = frame_header->reference_select;
+    pp->coding.skip_mode                    = frame_header->skip_mode_present;
+    pp->coding.reduced_tx_set               = frame_header->reduced_tx_set;
+    pp->coding.superres                     = frame_header->use_superres;
+    pp->coding.tx_mode                      = frame_header->tx_mode;
+    pp->coding.use_ref_frame_mvs            = frame_header->use_ref_frame_mvs;
+    pp->coding.enable_ref_frame_mvs         = seq->enable_ref_frame_mvs;
+    pp->coding.reference_frame_update       = !(frame_header->show_existing_frame == 1 && frame_header->frame_type == AV1_FRAME_KEY);
+
+    /* Format & Picture Info flags */
+    pp->format.frame_type     = frame_header->frame_type;
+    pp->format.show_frame     = frame_header->show_frame;
+    pp->format.showable_frame = frame_header->showable_frame;
+    pp->format.subsampling_x  = seq->color_config.subsampling_x;
+    pp->format.subsampling_y  = seq->color_config.subsampling_y;
+    pp->format.mono_chrome    = seq->color_config.mono_chrome;
+
+    /* References */
+    pp->primary_ref_frame = frame_header->primary_ref_frame;
+    pp->order_hint        = frame_header->order_hint;
+    pp->order_hint_bits   = seq->enable_order_hint ? seq->order_hint_bits_minus_1 + 1 : 0;
+
+    memset(pp->RefFrameMapTextureIndex, 0xFF, sizeof(pp->RefFrameMapTextureIndex));
+    for (i = 0; i < AV1_REFS_PER_FRAME; i++) {
+        int8_t ref_idx = frame_header->ref_frame_idx[i];
+        AVFrame *ref_frame = h->ref[ref_idx].tf.f;
+
+        pp->frame_refs[i].width  = ref_frame->width;
+        pp->frame_refs[i].height = ref_frame->height;
+        pp->frame_refs[i].Index  = ref_frame->buf[0] ? ref_idx : 0xFF;
+
+        /* Global Motion */
+        pp->frame_refs[i].wminvalid = (h->cur_frame.gm_type[AV1_REF_FRAME_LAST + i] == AV1_WARP_MODEL_IDENTITY);
+        pp->frame_refs[i].wmtype    = h->cur_frame.gm_type[AV1_REF_FRAME_LAST + i];
+        for (j = 0; j < 6; ++j) {
+             pp->frame_refs[i].wmmat[j] = h->cur_frame.gm_params[AV1_REF_FRAME_LAST + i][j];
+        }
+    }
+    for (i = 0; i < AV1_NUM_REF_FRAMES; i++) {
+        AVFrame *ref_frame = h->ref[i].tf.f;
+        if (ref_frame->buf[0])
+            pp->RefFrameMapTextureIndex[i] = ff_dxva2_get_surface_index(avctx, ctx, ref_frame);
+    }
+
+    /* Loop filter parameters */
+    pp->loop_filter.filter_level[0]        = frame_header->loop_filter_level[0];
+    pp->loop_filter.filter_level[1]        = frame_header->loop_filter_level[1];
+    pp->loop_filter.filter_level_u         = frame_header->loop_filter_level[2];
+    pp->loop_filter.filter_level_v         = frame_header->loop_filter_level[3];
+    pp->loop_filter.sharpness_level        = frame_header->loop_filter_sharpness;
+    pp->loop_filter.mode_ref_delta_enabled = frame_header->loop_filter_delta_enabled;
+    pp->loop_filter.mode_ref_delta_update  = frame_header->loop_filter_delta_update;
+    pp->loop_filter.delta_lf_multi         = frame_header->delta_lf_multi;
+    pp->loop_filter.delta_lf_present       = frame_header->delta_lf_present;
+    pp->loop_filter.delta_lf_res           = frame_header->delta_lf_res;
+
+    for (i = 0; i < AV1_TOTAL_REFS_PER_FRAME; i++) {
+        pp->loop_filter.ref_deltas[i] = frame_header->loop_filter_ref_deltas[i];
+    }
+
+    pp->loop_filter.mode_deltas[0]                = frame_header->loop_filter_mode_deltas[0];
+    pp->loop_filter.mode_deltas[1]                = frame_header->loop_filter_mode_deltas[1];
+    pp->loop_filter.frame_restoration_type[0]     = remap_lr_type[frame_header->lr_type[0]];
+    pp->loop_filter.frame_restoration_type[1]     = remap_lr_type[frame_header->lr_type[1]];
+    pp->loop_filter.frame_restoration_type[2]     = remap_lr_type[frame_header->lr_type[2]];
+    uses_lr = frame_header->lr_type[0] || frame_header->lr_type[1] || frame_header->lr_type[2];
+    pp->loop_filter.log2_restoration_unit_size[0] = uses_lr ? (6 + frame_header->lr_unit_shift) : 8;
+    pp->loop_filter.log2_restoration_unit_size[1] = uses_lr ? (6 + frame_header->lr_unit_shift - frame_header->lr_uv_shift) : 8;
+    pp->loop_filter.log2_restoration_unit_size[2] = uses_lr ? (6 + frame_header->lr_unit_shift - frame_header->lr_uv_shift) : 8;
+
+    /* Quantization */
+    pp->quantization.delta_q_present = frame_header->delta_q_present;
+    pp->quantization.delta_q_res     = frame_header->delta_q_res;
+    pp->quantization.base_qindex     = frame_header->base_q_idx;
+    pp->quantization.y_dc_delta_q    = frame_header->delta_q_y_dc;
+    pp->quantization.u_dc_delta_q    = frame_header->delta_q_u_dc;
+    pp->quantization.v_dc_delta_q    = frame_header->delta_q_v_dc;
+    pp->quantization.u_ac_delta_q    = frame_header->delta_q_u_ac;
+    pp->quantization.v_ac_delta_q    = frame_header->delta_q_v_ac;
+    pp->quantization.qm_y            = frame_header->using_qmatrix ? frame_header->qm_y : 0xFF;
+    pp->quantization.qm_u            = frame_header->using_qmatrix ? frame_header->qm_u : 0xFF;
+    pp->quantization.qm_v            = frame_header->using_qmatrix ? frame_header->qm_v : 0xFF;
+
+    /* Cdef parameters */
+    pp->cdef.damping = frame_header->cdef_damping_minus_3;
+    pp->cdef.bits    = frame_header->cdef_bits;
+    for (i = 0; i < 8; i++) {
+        pp->cdef.y_strengths[i].primary    = frame_header->cdef_y_pri_strength[i];
+        pp->cdef.y_strengths[i].secondary  = frame_header->cdef_y_sec_strength[i];
+        pp->cdef.uv_strengths[i].primary   = frame_header->cdef_uv_pri_strength[i];
+        pp->cdef.uv_strengths[i].secondary = frame_header->cdef_uv_sec_strength[i];
+    }
+
+    /* Misc flags */
+    pp->interp_filter = frame_header->interpolation_filter;
+
+    /* Segmentation */
+    pp->segmentation.enabled         = frame_header->segmentation_enabled;
+    pp->segmentation.update_map      = frame_header->segmentation_update_map;
+    pp->segmentation.update_data     = frame_header->segmentation_update_data;
+    pp->segmentation.temporal_update = frame_header->segmentation_temporal_update;
+    for (i = 0; i < AV1_MAX_SEGMENTS; i++) {
+        for (j = 0; j < AV1_SEG_LVL_MAX; j++) {
+            pp->segmentation.feature_mask[i].mask |= frame_header->feature_enabled[i][j] << j;
+            pp->segmentation.feature_data[i][j]    = frame_header->feature_value[i][j];
+        }
+    }
+
+    /* Film grain */
+    if (frame_header->apply_grain) {
+        pp->film_grain.apply_grain              = 1;
+        pp->film_grain.scaling_shift_minus8     = frame_header->grain_scaling_minus_8;
+        pp->film_grain.chroma_scaling_from_luma = frame_header->chroma_scaling_from_luma;
+        pp->film_grain.ar_coeff_lag             = frame_header->ar_coeff_lag;
+        pp->film_grain.ar_coeff_shift_minus6    = frame_header->ar_coeff_shift_minus_6;
+        pp->film_grain.grain_scale_shift        = frame_header->grain_scale_shift;
+        pp->film_grain.overlap_flag             = frame_header->overlap_flag;
+        pp->film_grain.clip_to_restricted_range = frame_header->clip_to_restricted_range;
+        pp->film_grain.matrix_coeff_is_identity = (seq->color_config.matrix_coefficients == AVCOL_SPC_RGB);
+
+        pp->film_grain.grain_seed               = frame_header->grain_seed;
+        pp->film_grain.num_y_points             = frame_header->num_y_points;
+        for (i = 0; i < frame_header->num_y_points; i++) {
+            pp->film_grain.scaling_points_y[i][0] = frame_header->point_y_value[i];
+            pp->film_grain.scaling_points_y[i][1] = frame_header->point_y_scaling[i];
+        }
+        pp->film_grain.num_cb_points            = frame_header->num_cb_points;
+        for (i = 0; i < frame_header->num_cb_points; i++) {
+            pp->film_grain.scaling_points_cb[i][0] = frame_header->point_cb_value[i];
+            pp->film_grain.scaling_points_cb[i][1] = frame_header->point_cb_scaling[i];
+        }
+        pp->film_grain.num_cr_points            = frame_header->num_cr_points;
+        for (i = 0; i < frame_header->num_cr_points; i++) {
+            pp->film_grain.scaling_points_cr[i][0] = frame_header->point_cr_value[i];
+            pp->film_grain.scaling_points_cr[i][1] = frame_header->point_cr_scaling[i];
+        }
+        for (i = 0; i < 24; i++) {
+            pp->film_grain.ar_coeffs_y[i] = frame_header->ar_coeffs_y_plus_128[i];
+        }
+        for (i = 0; i < 25; i++) {
+            pp->film_grain.ar_coeffs_cb[i] = frame_header->ar_coeffs_cb_plus_128[i];
+            pp->film_grain.ar_coeffs_cr[i] = frame_header->ar_coeffs_cr_plus_128[i];
+        }
+        pp->film_grain.cb_mult      = frame_header->cb_mult;
+        pp->film_grain.cb_luma_mult = frame_header->cb_luma_mult;
+        pp->film_grain.cr_mult      = frame_header->cr_mult;
+        pp->film_grain.cr_luma_mult = frame_header->cr_luma_mult;
+        pp->film_grain.cb_offset    = frame_header->cb_offset;
+        pp->film_grain.cr_offset    = frame_header->cr_offset;
+        pp->film_grain.cr_offset    = frame_header->cr_offset;
+    }
+
+    // XXX: setting the StatusReportFeedbackNumber breaks decoding on some drivers
+    // we never use the status reporting functionality, so just skip on that
+    //pp->StatusReportFeedbackNumber = 1 + DXVA_CONTEXT_REPORT_ID(avctx, ctx)++;
+    return 0;
+}
+
+/**
+ * hwaccel start_frame callback.
+ *
+ * Checks that the DXVA context is usable, fills the picture parameters of
+ * the current frame and resets the per-picture bitstream bookkeeping.
+ * The buffer/size arguments are not used.
+ *
+ * @return 0 on success, -1 if the context is invalid or the picture
+ *         parameters could not be filled
+ */
+static int dxva2_av1_start_frame(AVCodecContext *avctx,
+                                 av_unused const uint8_t *buffer,
+                                 av_unused uint32_t size)
+{
+    const AV1DecContext *s = avctx->priv_data;
+    AVDXVAContext *ctx = DXVA_CONTEXT(avctx);
+    struct av1_dxva2_picture_context *pic = s->cur_frame.hwaccel_picture_private;
+    int err;
+
+    if (!DXVA_CONTEXT_VALID(avctx, ctx))
+        return -1;
+
+    /* the per-picture private data must exist at this point */
+    av_assert0(pic);
+
+    /* populate the picture parameter structure for this frame */
+    err = fill_picture_parameters(avctx, ctx, s, &pic->pp);
+    if (err < 0)
+        return -1;
+
+    /* no tile data has been collected yet */
+    pic->bitstream      = NULL;
+    pic->bitstream_size = 0;
+
+    return 0;
+}
+
+/**
+ * hwaccel decode_slice callback: record the tiles of one tile group.
+ *
+ * If the tile group covers every tile of the frame, the caller's buffer is
+ * referenced directly. Otherwise the data is appended to a cache owned by
+ * the hwaccel private context and tile offsets are rebased onto that cache.
+ *
+ * @param buffer tile group data
+ * @param size   size of buffer in bytes
+ * @return 0 on success, AVERROR(ENOSYS) if the frame has more than
+ *         MAX_TILES tiles, AVERROR(ENOMEM) if the cache cannot be grown
+ */
+static int dxva2_av1_decode_slice(AVCodecContext *avctx,
+                                  const uint8_t *buffer,
+                                  uint32_t size)
+{
+    const AV1DecContext *s = avctx->priv_data;
+    const AV1RawFrameHeader *hdr = s->raw_frame_header;
+    struct av1_dxva2_picture_context *pic = s->cur_frame.hwaccel_picture_private;
+    struct AV1DXVAContext *priv = avctx->internal->hwaccel_priv_data;
+    void *grown;
+    uint32_t t;
+
+    pic->tile_count = hdr->tile_cols * hdr->tile_rows;
+
+    /* too many tiles, exceeding all defined levels in the AV1 spec */
+    if (pic->tile_count > MAX_TILES)
+        return AVERROR(ENOSYS);
+
+    if (pic->tile_count != s->tg_end - s->tg_start + 1) {
+        /* The frame is split over several tile groups: grow the internal
+         * cache so this group can be appended after the data already held. */
+        grown = av_fast_realloc(priv->bitstream_cache, &priv->bitstream_allocated,
+                                pic->bitstream_size + size);
+        if (!grown)
+            return AVERROR(ENOMEM);
+
+        priv->bitstream_cache = grown;
+        pic->bitstream        = priv->bitstream_cache;
+
+        memcpy(pic->bitstream + pic->bitstream_size, buffer, size);
+
+        /* offsets are relative to the start of the accumulated bitstream */
+        for (t = s->tg_start; t <= s->tg_end; t++) {
+            pic->tiles[t].DataOffset   = pic->bitstream_size + s->tile_group_info[t].tile_offset;
+            pic->tiles[t].DataSize     = s->tile_group_info[t].tile_size;
+            pic->tiles[t].row          = s->tile_group_info[t].tile_row;
+            pic->tiles[t].column       = s->tile_group_info[t].tile_column;
+            /* NOTE(review): 0xFF presumably means "no anchor frame" - confirm */
+            pic->tiles[t].anchor_frame = 0xFF;
+        }
+
+        pic->bitstream_size += size;
+
+        return 0;
+    }
+
+    /* Shortcut: all tiles live in this one buffer, reference it directly */
+    pic->bitstream      = (uint8_t *)buffer;
+    pic->bitstream_size = size;
+
+    for (t = 0; t < pic->tile_count; t++) {
+        pic->tiles[t].DataOffset   = s->tile_group_info[t].tile_offset;
+        pic->tiles[t].DataSize     = s->tile_group_info[t].tile_size;
+        pic->tiles[t].row          = s->tile_group_info[t].tile_row;
+        pic->tiles[t].column       = s->tile_group_info[t].tile_column;
+        /* NOTE(review): 0xFF presumably means "no anchor frame" - confirm */
+        pic->tiles[t].anchor_frame = 0xFF;
+    }
+
+    return 0;
+}
+
+/**
+ * Copy the collected bitstream into the decoder's bitstream buffer and
+ * commit the tile (slice control) array.
+ *
+ * The bitstream buffer is obtained from the decoder, filled with the
+ * picture's bitstream followed by zero padding, and released again. The
+ * buffer is released on the error path as well, so an oversized bitstream
+ * does not leave the decoder buffer held.
+ *
+ * @param bs descriptor filled in for the bitstream buffer
+ * @param sc descriptor handed to ff_dxva2_commit_buffer() for the tile array
+ * @return result of ff_dxva2_commit_buffer() on success, -1 if a buffer
+ *         cannot be obtained or released, or if the bitstream does not fit
+ */
+static int commit_bitstream_and_slice_buffer(AVCodecContext *avctx,
+                                             DECODER_BUFFER_DESC *bs,
+                                             DECODER_BUFFER_DESC *sc)
+{
+    const AV1DecContext *h = avctx->priv_data;
+    AVDXVAContext *ctx = DXVA_CONTEXT(avctx);
+    struct av1_dxva2_picture_context *ctx_pic = h->cur_frame.hwaccel_picture_private;
+    void     *dxva_data_ptr;
+    uint8_t  *dxva_data;
+    unsigned dxva_size;
+    unsigned padding;
+    unsigned type;
+    int      ret = 0;
+
+#if CONFIG_D3D11VA
+    if (ff_dxva2_is_d3d11(avctx)) {
+        type = D3D11_VIDEO_DECODER_BUFFER_BITSTREAM;
+        if (FAILED(ID3D11VideoContext_GetDecoderBuffer(D3D11VA_CONTEXT(ctx)->video_context,
+                                                       D3D11VA_CONTEXT(ctx)->decoder,
+                                                       type,
+                                                       &dxva_size, &dxva_data_ptr)))
+            return -1;
+    }
+#endif
+#if CONFIG_DXVA2
+    if (avctx->pix_fmt == AV_PIX_FMT_DXVA2_VLD) {
+        type = DXVA2_BitStreamDateBufferType;
+        if (FAILED(IDirectXVideoDecoder_GetBuffer(DXVA2_CONTEXT(ctx)->decoder,
+                                                  type,
+                                                  &dxva_data_ptr, &dxva_size)))
+            return -1;
+    }
+#endif
+
+    dxva_data = dxva_data_ptr;
+
+    if (ctx_pic->bitstream_size > dxva_size) {
+        av_log(avctx, AV_LOG_ERROR, "Bitstream size exceeds hardware buffer\n");
+        /* do not return yet: the buffer obtained above must be released */
+        ret = -1;
+    } else {
+        memcpy(dxva_data, ctx_pic->bitstream, ctx_pic->bitstream_size);
+
+        /* zero-pad by 128 - (size & 127) bytes, limited to the room left */
+        padding = FFMIN(128 - ((ctx_pic->bitstream_size) & 127), dxva_size - ctx_pic->bitstream_size);
+        if (padding > 0) {
+            memset(dxva_data + ctx_pic->bitstream_size, 0, padding);
+            ctx_pic->bitstream_size += padding;
+        }
+    }
+
+    /* every successful Get*Buffer call is paired with a release */
+#if CONFIG_D3D11VA
+    if (ff_dxva2_is_d3d11(avctx))
+        if (FAILED(ID3D11VideoContext_ReleaseDecoderBuffer(D3D11VA_CONTEXT(ctx)->video_context, D3D11VA_CONTEXT(ctx)->decoder, type)))
+            return -1;
+#endif
+#if CONFIG_DXVA2
+    if (avctx->pix_fmt == AV_PIX_FMT_DXVA2_VLD)
+        if (FAILED(IDirectXVideoDecoder_ReleaseBuffer(DXVA2_CONTEXT(ctx)->decoder, type)))
+            return -1;
+#endif
+
+    if (ret < 0)
+        return ret;
+
+    /* describe the bitstream buffer, then switch type to slice control */
+#if CONFIG_D3D11VA
+    if (ff_dxva2_is_d3d11(avctx)) {
+        D3D11_VIDEO_DECODER_BUFFER_DESC *dsc11 = bs;
+        memset(dsc11, 0, sizeof(*dsc11));
+        dsc11->BufferType           = type;
+        dsc11->DataSize             = ctx_pic->bitstream_size;
+        dsc11->NumMBsInBuffer       = 0;
+
+        type = D3D11_VIDEO_DECODER_BUFFER_SLICE_CONTROL;
+    }
+#endif
+#if CONFIG_DXVA2
+    if (avctx->pix_fmt == AV_PIX_FMT_DXVA2_VLD) {
+        DXVA2_DecodeBufferDesc *dsc2 = bs;
+        memset(dsc2, 0, sizeof(*dsc2));
+        dsc2->CompressedBufferType = type;
+        dsc2->DataSize             = ctx_pic->bitstream_size;
+        dsc2->NumMBsInBuffer       = 0;
+
+        type = DXVA2_SliceControlBufferType;
+    }
+#endif
+
+    /* the tile array is committed as the slice control buffer */
+    return ff_dxva2_commit_buffer(avctx, ctx, sc, type,
+                                  ctx_pic->tiles, sizeof(*ctx_pic->tiles) * ctx_pic->tile_count, 0);
+}
+
+/**
+ * hwaccel end_frame callback.
+ *
+ * Fails if no bitstream data was collected for the current picture;
+ * otherwise hands the picture parameters to the common DXVA2 end-of-frame
+ * helper, with commit_bitstream_and_slice_buffer() as the commit callback.
+ *
+ * @return -1 if there is no bitstream data, else the helper's return value
+ */
+static int dxva2_av1_end_frame(AVCodecContext *avctx)
+{
+    const AV1DecContext *s = avctx->priv_data;
+    struct av1_dxva2_picture_context *pic = s->cur_frame.hwaccel_picture_private;
+
+    /* nothing to submit for this frame */
+    if (pic->bitstream_size <= 0)
+        return -1;
+
+    return ff_dxva2_common_end_frame(avctx, s->cur_frame.tf.f,
+                                     &pic->pp, sizeof(pic->pp),
+                                     NULL, 0,
+                                     commit_bitstream_and_slice_buffer);
+}
+
+/**
+ * hwaccel uninit callback: free the bitstream cache held in the hwaccel
+ * private context, then run the common DXVA2 decoder teardown.
+ *
+ * @return the return value of ff_dxva2_decode_uninit()
+ */
+static int dxva2_av1_uninit(AVCodecContext *avctx)
+{
+    struct AV1DXVAContext *priv = avctx->internal->hwaccel_priv_data;
+
+    /* drop the cache and forget its recorded size */
+    priv->bitstream_allocated = 0;
+    av_freep(&priv->bitstream_cache);
+
+    return ff_dxva2_decode_uninit(avctx);
+}
+
+#if CONFIG_AV1_DXVA2_HWACCEL
+/* AV1 hwaccel descriptor for the AV_PIX_FMT_DXVA2_VLD pixel format. */
+const AVHWAccel ff_av1_dxva2_hwaccel = {
+    /* identification */
+    .name                 = "av1_dxva2",
+    .type                 = AVMEDIA_TYPE_VIDEO,
+    .id                   = AV_CODEC_ID_AV1,
+    .pix_fmt              = AV_PIX_FMT_DXVA2_VLD,
+    /* private storage sizes */
+    .priv_data_size       = sizeof(struct AV1DXVAContext),
+    .frame_priv_data_size = sizeof(struct av1_dxva2_picture_context),
+    /* setup and teardown */
+    .init                 = ff_dxva2_decode_init,
+    .uninit               = dxva2_av1_uninit,
+    .frame_params         = ff_dxva2_common_frame_params,
+    /* per-frame callbacks */
+    .start_frame          = dxva2_av1_start_frame,
+    .decode_slice         = dxva2_av1_decode_slice,
+    .end_frame            = dxva2_av1_end_frame,
+};
+#endif
+
+#if CONFIG_AV1_D3D11VA_HWACCEL
+/* AV1 hwaccel descriptor for the AV_PIX_FMT_D3D11VA_VLD pixel format. */
+const AVHWAccel ff_av1_d3d11va_hwaccel = {
+    /* identification */
+    .name                 = "av1_d3d11va",
+    .type                 = AVMEDIA_TYPE_VIDEO,
+    .id                   = AV_CODEC_ID_AV1,
+    .pix_fmt              = AV_PIX_FMT_D3D11VA_VLD,
+    /* private storage sizes */
+    .priv_data_size       = sizeof(struct AV1DXVAContext),
+    .frame_priv_data_size = sizeof(struct av1_dxva2_picture_context),
+    /* setup and teardown */
+    .init                 = ff_dxva2_decode_init,
+    .uninit               = dxva2_av1_uninit,
+    .frame_params         = ff_dxva2_common_frame_params,
+    /* per-frame callbacks */
+    .start_frame          = dxva2_av1_start_frame,
+    .decode_slice         = dxva2_av1_decode_slice,
+    .end_frame            = dxva2_av1_end_frame,
+};
+#endif
+
+#if CONFIG_AV1_D3D11VA2_HWACCEL
+/* AV1 hwaccel descriptor for the AV_PIX_FMT_D3D11 pixel format. */
+const AVHWAccel ff_av1_d3d11va2_hwaccel = {
+    /* identification */
+    .name                 = "av1_d3d11va2",
+    .type                 = AVMEDIA_TYPE_VIDEO,
+    .id                   = AV_CODEC_ID_AV1,
+    .pix_fmt              = AV_PIX_FMT_D3D11,
+    /* private storage sizes */
+    .priv_data_size       = sizeof(struct AV1DXVAContext),
+    .frame_priv_data_size = sizeof(struct av1_dxva2_picture_context),
+    /* setup and teardown */
+    .init                 = ff_dxva2_decode_init,
+    .uninit               = dxva2_av1_uninit,
+    .frame_params         = ff_dxva2_common_frame_params,
+    /* per-frame callbacks */
+    .start_frame          = dxva2_av1_start_frame,
+    .decode_slice         = dxva2_av1_decode_slice,
+    .end_frame            = dxva2_av1_end_frame,
+};
+#endif
diff --git a/libavcodec/hwaccels.h b/libavcodec/hwaccels.h
index 18e9079c55..9869ce9f72 100644
--- a/libavcodec/hwaccels.h
+++ b/libavcodec/hwaccels.h
@@ -21,6 +21,9 @@ 
 
 #include "avcodec.h"
 
+extern const AVHWAccel ff_av1_d3d11va_hwaccel;
+extern const AVHWAccel ff_av1_d3d11va2_hwaccel;
+extern const AVHWAccel ff_av1_dxva2_hwaccel;
 extern const AVHWAccel ff_av1_vaapi_hwaccel;
 extern const AVHWAccel ff_h263_vaapi_hwaccel;
 extern const AVHWAccel ff_h263_videotoolbox_hwaccel;
diff --git a/libavcodec/version.h b/libavcodec/version.h
index 5173d0f090..a595e32832 100644
--- a/libavcodec/version.h
+++ b/libavcodec/version.h
@@ -29,7 +29,7 @@ 
 
 #define LIBAVCODEC_VERSION_MAJOR  58
 #define LIBAVCODEC_VERSION_MINOR 112
-#define LIBAVCODEC_VERSION_MICRO 101
+#define LIBAVCODEC_VERSION_MICRO 102
 
 #define LIBAVCODEC_VERSION_INT  AV_VERSION_INT(LIBAVCODEC_VERSION_MAJOR, \
                                                LIBAVCODEC_VERSION_MINOR, \