diff mbox series

[FFmpeg-devel,v6,14/14] vvcdec: add full vvc decoder

Message ID TYSPR06MB6433D78D091970B26ED5AB75AA85A@TYSPR06MB6433.apcprd06.prod.outlook.com
State Superseded
Headers show
Series [FFmpeg-devel,v6,01/14] vvcdec: add vvc decoder stub | expand

Checks

Context Check Description
yinshiyou/make_loongarch64 success Make finished
yinshiyou/make_fate_loongarch64 success Make fate finished
andriy/make_x86 success Make finished
andriy/make_fate_x86 success Make fate finished

Commit Message

Nuo Mi Dec. 5, 2023, 2:45 p.m. UTC
vvc decoder plug-in to avcodec.
split frames into slices/tiles and send them to vvc_thread for further decoding
reorder and wait for the frame decoding to be done and output the frame

Features:
    + Support I, P, B frames
    + Support 8/10/12 bits, chroma 400, 420, 422, and 444 and range extension
    + Support VVC new tools like MIP, CCLM, AFFINE, GPM, DMVR, PROF, BDOF, LMCS, ALF
    + 295 conformance clips passed
    - RPR, IBC, PALETTE, and other minor features are not supported yet

Performance:
    C code FPS on i7-12700 (x86):
        BQTerrace_1920x1080_60_10_420_22_RA.vvc      93.0
        Chimera_8bit_1080P_1000_frames.vvc          184.3
        NovosobornayaSquare_1920x1080.bin           191.3
        RitualDance_1920x1080_60_10_420_32_LD.266   150.7
        RitualDance_1920x1080_60_10_420_37_RA.266   170.0
        Tango2_3840x2160_60_10_420_27_LD.266         33.7

    C code FPS on M1 Mac Pro (ARM):
        BQTerrace_1920x1080_60_10_420_22_RA.vvc     58.7
        Chimera_8bit_1080P_1000_frames.vvc          153.3
        NovosobornayaSquare_1920x1080.bin           150.3
        RitualDance_1920x1080_60_10_420_32_LD.266   105.0
        RitualDance_1920x1080_60_10_420_37_RA.266   133.0
        Tango2_3840x2160_60_10_420_27_LD.266        21.7

    Asm optimizations are still a work in progress. Please check
    https://github.com/ffvvc/FFmpeg/wiki#performance-data for the latest

Contributors(based on code merge order):
    Nuo Mi <nuomi2021@gmail.com>
    Xu Mu <toxumu@outlook.com>
    frankplow <post@frankplowman.com>
    Shaun Loo <shaunloo10@gmail.com>
---
 libavcodec/vvc/vvcdec.c | 1007 +++++++++++++++++++++++++++++++++++++++
 1 file changed, 1007 insertions(+)

Comments

Nuo Mi Dec. 5, 2023, 3:20 p.m. UTC | #1
On Tue, Dec 5, 2023 at 10:46 PM Nuo Mi <nuomi2021@gmail.com> wrote:

> vvc decoder plug-in to avcodec.
> split frames into slices/tiles and send them to vvc_thread for further
> decoding
> reorder and wait for the frame decoding to be done and output the frame
>
> Features:
>     + Support I, P, B frames
>     + Support 8/10/12 bits, chroma 400, 420, 422, and 444 and range
> extension
>     + Support VVC new tools like MIP, CCLM, AFFINE, GPM, DMVR, PROF, BDOF,
> LMCS, ALF
>     + 295 conformace clips passed
>     - Not support RPR, IBC, PALETTE, and other minor features yet
>
> Performance:
>     C code FPS on i7-12700 (x86):
>         BQTerrace_1920x1080_60_10_420_22_RA.vvc      93.0
>         Chimera_8bit_1080P_1000_frames.vvc          184.3
>         NovosobornayaSquare_1920x1080.bin           191.3
>         RitualDance_1920x1080_60_10_420_32_LD.266   150.7
>         RitualDance_1920x1080_60_10_420_37_RA.266   170.0
>         Tango2_3840x2160_60_10_420_27_LD.266         33.7
>
>     C code FPS on M1 Mac Pro (ARM):
>         BQTerrace_1920x1080_60_10_420_22_RA.vvc     58.7
>         Chimera_8bit_1080P_1000_frames.vvc          153.3
>         NovosobornayaSquare_1920x1080.bin           150.3
>         RitualDance_1920x1080_60_10_420_32_LD.266   105.0
>         RitualDance_1920x1080_60_10_420_37_RA.266   133.0
>         Tango2_3840x2160_60_10_420_27_LD.266        21.7
>
>     Asm optimizations still working in progress. please check
>     https://github.com/ffvvc/FFmpeg/wiki#performance-data for the latest
>
> Contributors(based on code merge order):
>     Nuo Mi <nuomi2021@gmail.com>
>     Xu Mu <toxumu@outlook.com>
>     frankplow <post@frankplowman.com>
>     Shaun Loo <shaunloo10@gmail.com>
>
> changes since v5:
Fix c header guard for "make fate-source"
Andreas Rheinhardt Dec. 8, 2023, 12:19 p.m. UTC | #2
Nuo Mi:
> vvc decoder plug-in to avcodec.
> split frames into slices/tiles and send them to vvc_thread for further decoding
> reorder and wait for the frame decoding to be done and output the frame
> 
> Features:
>     + Support I, P, B frames
>     + Support 8/10/12 bits, chroma 400, 420, 422, and 444 and range extension
>     + Support VVC new tools like MIP, CCLM, AFFINE, GPM, DMVR, PROF, BDOF, LMCS, ALF
>     + 295 conformace clips passed
>     - Not support RPR, IBC, PALETTE, and other minor features yet
> 
> Performance:
>     C code FPS on i7-12700 (x86):
>         BQTerrace_1920x1080_60_10_420_22_RA.vvc      93.0
>         Chimera_8bit_1080P_1000_frames.vvc          184.3
>         NovosobornayaSquare_1920x1080.bin           191.3
>         RitualDance_1920x1080_60_10_420_32_LD.266   150.7
>         RitualDance_1920x1080_60_10_420_37_RA.266   170.0
>         Tango2_3840x2160_60_10_420_27_LD.266         33.7
> 
>     C code FPS on M1 Mac Pro (ARM):
>         BQTerrace_1920x1080_60_10_420_22_RA.vvc     58.7
>         Chimera_8bit_1080P_1000_frames.vvc          153.3
>         NovosobornayaSquare_1920x1080.bin           150.3
>         RitualDance_1920x1080_60_10_420_32_LD.266   105.0
>         RitualDance_1920x1080_60_10_420_37_RA.266   133.0
>         Tango2_3840x2160_60_10_420_27_LD.266        21.7
> 
>     Asm optimizations still working in progress. please check
>     https://github.com/ffvvc/FFmpeg/wiki#performance-data for the latest
> 
> Contributors(based on code merge order):
>     Nuo Mi <nuomi2021@gmail.com>
>     Xu Mu <toxumu@outlook.com>
>     frankplow <post@frankplowman.com>
>     Shaun Loo <shaunloo10@gmail.com>
> ---
>  libavcodec/vvc/vvcdec.c | 1007 +++++++++++++++++++++++++++++++++++++++
>  1 file changed, 1007 insertions(+)
> 
> diff --git a/libavcodec/vvc/vvcdec.c b/libavcodec/vvc/vvcdec.c
> index 3c591ce875..e40eb7339f 100644
> --- a/libavcodec/vvc/vvcdec.c
> +++ b/libavcodec/vvc/vvcdec.c
> @@ -21,28 +21,1035 @@
>   * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
>   */
>  #include "libavcodec/codec_internal.h"
> +#include "libavcodec/decode.h"
>  #include "libavcodec/profiles.h"
> +#include "libavcodec/refstruct.h"
> +#include "libavutil/cpu.h"
>  
>  #include "vvcdec.h"
> +#include "vvc_ctu.h"
> +#include "vvc_data.h"
> +#include "vvc_refs.h"
> +#include "vvc_thread.h"
> +
> +static int vvc_frame_start(VVCContext *s, VVCFrameContext *fc, SliceContext *sc)
> +{
> +    const VVCPH *ph                 = &fc->ps.ph;
> +    const H266RawSliceHeader *rsh   = sc->sh.r;
> +    int ret;
> +
> +    // 8.3.1 Decoding process for picture order count
> +    if (!s->temporal_id && !ph->r->ph_non_ref_pic_flag && !(IS_RASL(s) || IS_RADL(s)))
> +        s->poc_tid0 = ph->poc;
> +
> +    if ((ret = ff_vvc_set_new_ref(s, fc, &fc->frame)) < 0)
> +        goto fail;
> +
> +    if (!IS_IDR(s))
> +        ff_vvc_bump_frame(s, fc);
> +
> +    av_frame_unref(fc->output_frame);
> +
> +    if ((ret = ff_vvc_output_frame(s, fc, fc->output_frame,rsh->sh_no_output_of_prior_pics_flag, 0)) < 0)
> +        goto fail;
> +
> +    if ((ret = ff_vvc_frame_rpl(s, fc, sc)) < 0)
> +        goto fail;
> +
> +    if ((ret = ff_vvc_frame_thread_init(fc)) < 0)
> +        goto fail;
> +    return 0;
> +fail:
> +    if (fc->ref)
> +        ff_vvc_unref_frame(fc, fc->ref, ~0);
> +    fc->ref = NULL;
> +    return ret;
> +}
> +
> +static void ctb_arrays_free(VVCFrameContext *fc)
> +{
> +    av_freep(&fc->tab.deblock);
> +    av_freep(&fc->tab.sao);
> +    av_freep(&fc->tab.alf);
> +    av_freep(&fc->tab.slice_idx);
> +    av_freep(&fc->tab.coeffs);
> +    if (fc->tab.ctus) {
> +        for (int i = 0; i < fc->tab.ctu_count; i++)
> +            ff_vvc_ctu_free_cus(fc->tab.ctus + i);
> +        av_freep(&fc->tab.ctus);
> +    }
> +    ff_refstruct_pool_uninit(&fc->rpl_tab_pool);
> +}
> +
> +static int ctb_arrays_init(VVCFrameContext *fc, const int ctu_count, const int ctu_size)
> +{
> +    if (fc->tab.ctu_count != ctu_count || fc->tab.ctu_size != ctu_size) {
> +        ctb_arrays_free(fc);
> +        fc->tab.deblock         = av_calloc(ctu_count, sizeof(*fc->tab.deblock));
> +        fc->tab.sao             = av_calloc(ctu_count, sizeof(*fc->tab.sao));
> +        fc->tab.alf             = av_calloc(ctu_count, sizeof(*fc->tab.alf));
> +        fc->tab.ctus            = av_calloc(ctu_count, sizeof(*fc->tab.ctus));
> +        fc->tab.slice_idx       = av_malloc(ctu_count * sizeof(*fc->tab.slice_idx));
> +        if (!fc->tab.deblock || !fc->tab.sao || !fc->tab.alf || !fc->tab.ctus || !fc->tab.slice_idx )
> +            return AVERROR(ENOMEM);
> +        fc->tab.coeffs = av_malloc(ctu_count * sizeof(*fc->tab.coeffs) * ctu_size * VVC_MAX_SAMPLE_ARRAYS);
> +        if (!fc->tab.coeffs)
> +            return AVERROR(ENOMEM);
> +        fc->rpl_tab_pool = ff_refstruct_pool_alloc(ctu_count * sizeof(RefPicListTab), 0);
> +        if (!fc->rpl_tab_pool)
> +            return AVERROR(ENOMEM);
> +    } else {
> +        memset(fc->tab.deblock, 0, ctu_count * sizeof(*fc->tab.deblock));
> +        memset(fc->tab.sao, 0, ctu_count * sizeof(*fc->tab.sao));
> +        memset(fc->tab.alf, 0, ctu_count * sizeof(*fc->tab.alf));
> +        for (int i = 0; i < fc->tab.ctu_count; i++)
> +            ff_vvc_ctu_free_cus(fc->tab.ctus + i);
> +        memset(fc->tab.ctus, 0, ctu_count * sizeof(*fc->tab.ctus));
> +    }
> +    memset(fc->tab.slice_idx, -1, ctu_count * sizeof(*fc->tab.slice_idx));
> +
> +    return 0;
> +}
> +
> +static void min_cb_arrays_free(VVCFrameContext *fc)
> +{
> +    for (int i = LUMA; i <= CHROMA; i++) {
> +        av_freep(&fc->tab.cb_pos_x[i]);
> +        av_freep(&fc->tab.cb_pos_y[i]);
> +        av_freep(&fc->tab.cb_width[i]);
> +        av_freep(&fc->tab.cb_height[i]);
> +        av_freep(&fc->tab.cqt_depth[i]);
> +        av_freep(&fc->tab.cpm[i]);
> +        av_freep(&fc->tab.cp_mv[i]);
> +    }
> +
> +    av_freep(&fc->tab.ipm);
> +    av_freep(&fc->tab.imf);
> +    av_freep(&fc->tab.imtf);
> +    av_freep(&fc->tab.imm);
> +    av_freep(&fc->tab.skip);
> +}
> +
> +static int min_cb_arrays_init(VVCFrameContext *fc, const int pic_size_in_min_cb)
> +{
> +    if (fc->tab.pic_size_in_min_cb != pic_size_in_min_cb) {
> +        min_cb_arrays_free(fc);
> +        for (int i = LUMA; i <= CHROMA; i++) {
> +            fc->tab.cb_pos_x[i]  = av_mallocz(pic_size_in_min_cb * sizeof(int));
> +            fc->tab.cb_pos_y[i]  = av_mallocz(pic_size_in_min_cb * sizeof(int));
> +            fc->tab.cb_width[i]  = av_mallocz(pic_size_in_min_cb);
> +            fc->tab.cb_height[i] = av_mallocz(pic_size_in_min_cb);
> +            fc->tab.cqt_depth[i] = av_mallocz(pic_size_in_min_cb);
> +            if (!fc->tab.cb_pos_x[i] || !fc->tab.cb_pos_y[i] || !fc->tab.cb_width[i] || !fc->tab.cb_height[i] || !fc->tab.cqt_depth[i])
> +                return AVERROR(ENOMEM);
> +
> +            fc->tab.cpm[i]   = av_mallocz(pic_size_in_min_cb);
> +            fc->tab.cp_mv[i] = av_mallocz(pic_size_in_min_cb * sizeof(Mv) * MAX_CONTROL_POINTS);
> +            if (!fc->tab.cpm[i] || !fc->tab.cp_mv[i])
> +                return AVERROR(ENOMEM);
> +        }
> +
> +        fc->tab.ipm  = av_mallocz(pic_size_in_min_cb);
> +        fc->tab.imf  = av_mallocz(pic_size_in_min_cb);
> +        fc->tab.imtf = av_mallocz(pic_size_in_min_cb);
> +        fc->tab.imm  = av_mallocz(pic_size_in_min_cb);
> +        fc->tab.skip = av_mallocz(pic_size_in_min_cb);
> +        if (!fc->tab.ipm || !fc->tab.imf || !fc->tab.imtf || !fc->tab.imm || !fc->tab.skip)
> +            return AVERROR(ENOMEM);
> +    } else {
> +        for (int i = LUMA; i <= CHROMA; i++) {
> +            memset(fc->tab.cb_pos_x[i], 0, pic_size_in_min_cb * sizeof(int));
> +            memset(fc->tab.cb_pos_y[i], 0, pic_size_in_min_cb * sizeof(int));
> +            memset(fc->tab.cb_width[i], 0, pic_size_in_min_cb);
> +            memset(fc->tab.cb_height[i], 0, pic_size_in_min_cb);
> +            memset(fc->tab.cqt_depth[i], 0, pic_size_in_min_cb);
> +            memset(fc->tab.cpm[i], 0, pic_size_in_min_cb);
> +            memset(fc->tab.cp_mv[i], 0, pic_size_in_min_cb * sizeof(Mv) * MAX_CONTROL_POINTS);
> +        }
> +
> +        memset(fc->tab.ipm, 0, pic_size_in_min_cb);
> +        memset(fc->tab.imf, 0, pic_size_in_min_cb);
> +        memset(fc->tab.imtf, 0, pic_size_in_min_cb);
> +        memset(fc->tab.imm, 0, pic_size_in_min_cb);
> +        memset(fc->tab.skip, 0, pic_size_in_min_cb);
> +    }
> +    return 0;
> +}
> +
> +static void min_tu_arrays_free(VVCFrameContext *fc)
> +{
> +    for (int i = LUMA; i <= CHROMA; i++) {
> +        av_freep(&fc->tab.tb_pos_x0[i]);
> +        av_freep(&fc->tab.tb_pos_y0[i]);
> +        av_freep(&fc->tab.tb_width[i]);
> +        av_freep(&fc->tab.tb_height[i]);
> +        av_freep(&fc->tab.pcmf[i]);
> +    }
> +
> +    for (int i = 0; i < VVC_MAX_SAMPLE_ARRAYS; i++) {
> +        av_freep(&fc->tab.qp[i]);
> +        av_freep(&fc->tab.tu_coded_flag[i]);
> +    }
> +
> +    av_freep(&fc->tab.tu_joint_cbcr_residual_flag);
> +}
> +
> +static int min_tu_arrays_init(VVCFrameContext *fc, const int pic_size_in_min_tu)
> +{
> +    if (fc->tab.pic_size_in_min_tu != pic_size_in_min_tu) {
> +        min_tu_arrays_free(fc);
> +        for (int i = LUMA; i <= CHROMA; i++) {
> +            fc->tab.tb_pos_x0[i] = av_mallocz(pic_size_in_min_tu * sizeof(*fc->tab.tb_pos_x0[0]));
> +            fc->tab.tb_pos_y0[i] = av_mallocz(pic_size_in_min_tu * sizeof(*fc->tab.tb_pos_y0[0])) ;
> +            fc->tab.tb_width[i]  = av_mallocz(pic_size_in_min_tu);
> +            fc->tab.tb_height[i] = av_mallocz(pic_size_in_min_tu);
> +            fc->tab.pcmf[i]      = av_mallocz(pic_size_in_min_tu);
> +            if (!fc->tab.tb_pos_x0[i] || !fc->tab.tb_pos_y0[i] ||
> +                !fc->tab.tb_width[i] || !fc->tab.tb_height[i] || !fc->tab.pcmf[i])
> +                return AVERROR(ENOMEM);
> +        }
> +
> +        for (int i = 0; i < VVC_MAX_SAMPLE_ARRAYS; i++) {
> +            fc->tab.tu_coded_flag[i] = av_mallocz(pic_size_in_min_tu);
> +            if (!fc->tab.tu_coded_flag[i])
> +                return AVERROR(ENOMEM);
> +
> +            fc->tab.qp[i] = av_mallocz(pic_size_in_min_tu);
> +            if (!fc->tab.qp[i])
> +                return AVERROR(ENOMEM);
> +        }
> +
> +        fc->tab.tu_joint_cbcr_residual_flag  = av_mallocz(pic_size_in_min_tu);
> +        if (!fc->tab.tu_joint_cbcr_residual_flag)
> +            return AVERROR(ENOMEM);
> +    } else {
> +        for (int i = LUMA; i <= CHROMA; i++) {
> +            memset(fc->tab.tb_pos_x0[i], 0, pic_size_in_min_tu * sizeof(*fc->tab.tb_pos_x0[0]));
> +            memset(fc->tab.tb_pos_y0[i], 0, pic_size_in_min_tu * sizeof(*fc->tab.tb_pos_y0[0])) ;
> +            memset(fc->tab.tb_width[i], 0, pic_size_in_min_tu);
> +            memset(fc->tab.tb_height[i], 0, pic_size_in_min_tu);
> +            memset(fc->tab.pcmf[i], 0, pic_size_in_min_tu);
> +        }
> +
> +        for (int i = 0; i < VVC_MAX_SAMPLE_ARRAYS; i++) {
> +            memset(fc->tab.tu_coded_flag[i], 0, pic_size_in_min_tu);
> +            memset(fc->tab.qp[i], 0, pic_size_in_min_tu);
> +        }
> +        memset(fc->tab.tu_joint_cbcr_residual_flag, 0, pic_size_in_min_tu);
> +    }
> +    return 0;
> +}
> +
> +static void min_pu_arrays_free(VVCFrameContext *fc)
> +{
> +    av_freep(&fc->tab.mvf);
> +    av_freep(&fc->tab.msf);
> +    av_freep(&fc->tab.iaf);
> +    av_freep(&fc->tab.mmi);
> +    ff_refstruct_pool_uninit(&fc->tab_dmvr_mvf_pool);
> +}
> +
> +static int min_pu_arrays_init(VVCFrameContext *fc, const int pic_size_in_min_pu)
> +{
> +    if (fc->tab.pic_size_in_min_pu != pic_size_in_min_pu) {
> +        min_pu_arrays_free(fc);
> +        fc->tab.msf  = av_mallocz(pic_size_in_min_pu);
> +        fc->tab.iaf  = av_mallocz(pic_size_in_min_pu);
> +        fc->tab.mmi  = av_mallocz(pic_size_in_min_pu);
> +        fc->tab.mvf  = av_mallocz(pic_size_in_min_pu * sizeof(*fc->tab.mvf));

Do these have to be separate allocations? If they were allocated
jointly, one memset below would suffice.

> +        if (!fc->tab.msf || !fc->tab.iaf || !fc->tab.mmi || !fc->tab.mvf)
> +            return AVERROR(ENOMEM);
> +        fc->tab_dmvr_mvf_pool  = ff_refstruct_pool_alloc(pic_size_in_min_pu * sizeof(MvField), FF_REFSTRUCT_POOL_FLAG_ZERO_EVERY_TIME);
> +        if (!fc->tab_dmvr_mvf_pool)
> +            return AVERROR(ENOMEM);
> +    } else {
> +        memset(fc->tab.msf, 0, pic_size_in_min_pu);
> +        memset(fc->tab.iaf, 0, pic_size_in_min_pu);
> +        memset(fc->tab.mmi, 0, pic_size_in_min_pu);
> +        memset(fc->tab.mvf, 0, pic_size_in_min_pu * sizeof(*fc->tab.mvf));
> +    }
> +
> +    return 0;
> +}
> +
> +static void bs_arrays_free(VVCFrameContext *fc)
> +{
> +    for (int i = 0; i < VVC_MAX_SAMPLE_ARRAYS; i++) {
> +        av_freep(&fc->tab.horizontal_bs[i]);
> +        av_freep(&fc->tab.vertical_bs[i]);
> +    }
> +    av_freep(&fc->tab.horizontal_q);
> +    av_freep(&fc->tab.horizontal_p);
> +    av_freep(&fc->tab.vertical_p);
> +    av_freep(&fc->tab.vertical_q);
> +}
> +
> +static int bs_arrays_init(VVCFrameContext *fc, const int bs_width, const int bs_height)
> +{
> +    if (fc->tab.bs_width != bs_width || fc->tab.bs_height != bs_height) {
> +        bs_arrays_free(fc);
> +        for (int i = 0; i < VVC_MAX_SAMPLE_ARRAYS; i++) {
> +            fc->tab.horizontal_bs[i] = av_calloc(bs_width, bs_height);
> +            fc->tab.vertical_bs[i]   = av_calloc(bs_width, bs_height);
> +            if (!fc->tab.horizontal_bs[i] || !fc->tab.vertical_bs[i])
> +                return AVERROR(ENOMEM);
> +        }
> +        fc->tab.horizontal_q = av_calloc(bs_width, bs_height);
> +        fc->tab.horizontal_p = av_calloc(bs_width, bs_height);
> +        fc->tab.vertical_p   = av_calloc(bs_width, bs_height);
> +        fc->tab.vertical_q   = av_calloc(bs_width, bs_height);
> +        if (!fc->tab.horizontal_q || !fc->tab.horizontal_p || !fc->tab.vertical_p || !fc->tab.vertical_q)
> +            return AVERROR(ENOMEM);
> +    } else {
> +        for (int i = 0; i < VVC_MAX_SAMPLE_ARRAYS; i++) {
> +            memset(fc->tab.horizontal_bs[i], 0, bs_width * bs_height);
> +            memset(fc->tab.vertical_bs[i], 0, bs_width * bs_height);
> +        }
> +        memset(fc->tab.horizontal_q, 0, bs_width * bs_height);
> +        memset(fc->tab.horizontal_p, 0, bs_width * bs_height);
> +        memset(fc->tab.vertical_p, 0, bs_width * bs_height);
> +        memset(fc->tab.vertical_q, 0, bs_width * bs_height);
> +    }
> +    return 0;
> +}
> +
> +static void pixel_buffer_free(VVCFrameContext *fc)
> +{
> +    for (int i = 0; i < VVC_MAX_SAMPLE_ARRAYS; i++) {
> +        av_freep(&fc->tab.sao_pixel_buffer_h[i]);
> +        av_freep(&fc->tab.sao_pixel_buffer_v[i]);
> +        for (int j = 0; j < 2; j++) {
> +            av_freep(&fc->tab.alf_pixel_buffer_h[i][j]);
> +            av_freep(&fc->tab.alf_pixel_buffer_v[i][j]);
> +        }
> +    }
> +}
> +
> +static int pixel_buffer_init(VVCFrameContext *fc, const int width, const int height,
> +    const int ctu_width, const int ctu_height, const int chroma_format_idc, const int ps)
> +{
> +    const VVCSPS *sps = fc->ps.sps;
> +    const int c_end   = chroma_format_idc ? VVC_MAX_SAMPLE_ARRAYS : 1;
> +
> +    if (fc->tab.chroma_format_idc != chroma_format_idc ||
> +        fc->tab.width != width || fc->tab.height != height ||
> +        fc->tab.ctu_width != ctu_width || fc->tab.ctu_height != ctu_height) {
> +        pixel_buffer_free(fc);
> +        for (int c_idx = 0; c_idx < c_end; c_idx++) {
> +            const int w = width >> sps->hshift[c_idx];
> +            const int h = height >> sps->vshift[c_idx];
> +            fc->tab.sao_pixel_buffer_h[c_idx] = av_malloc((w * 2 * ctu_height) << ps);
> +            fc->tab.sao_pixel_buffer_v[c_idx] = av_malloc((h * 2 * ctu_width)  << ps);
> +            if (!fc->tab.sao_pixel_buffer_h[c_idx] || !fc->tab.sao_pixel_buffer_v[c_idx])
> +                return AVERROR(ENOMEM);
> +        }
> +
> +        for (int c_idx = 0; c_idx < c_end; c_idx++) {
> +            const int w = width >> sps->hshift[c_idx];
> +            const int h = height >> sps->vshift[c_idx];
> +            const int border_pixels = c_idx ? ALF_BORDER_CHROMA : ALF_BORDER_LUMA;
> +            for (int i = 0; i < 2; i++) {
> +                fc->tab.alf_pixel_buffer_h[c_idx][i] = av_malloc((w * border_pixels * ctu_height) << ps);
> +                fc->tab.alf_pixel_buffer_v[c_idx][i] = av_malloc(h * ALF_PADDING_SIZE * ctu_width);
> +                if (!fc->tab.alf_pixel_buffer_h[c_idx][i] || !fc->tab.alf_pixel_buffer_v[c_idx][i])
> +                    return AVERROR(ENOMEM);
> +            }
> +        }
> +    }
> +    return 0;
> +}
> +
> +static void pic_arrays_free(VVCFrameContext *fc)
> +{
> +    ctb_arrays_free(fc);
> +    min_cb_arrays_free(fc);
> +    min_pu_arrays_free(fc);
> +    min_tu_arrays_free(fc);
> +    bs_arrays_free(fc);
> +    ff_refstruct_pool_uninit(&fc->cu_pool);
> +    ff_refstruct_pool_uninit(&fc->tu_pool);
> +    pixel_buffer_free(fc);
> +
> +    for (int i = 0; i < 2; i++)
> +        av_freep(&fc->tab.msm[i]);
> +    av_freep(&fc->tab.ispmf);
> +
> +    fc->tab.ctu_count = 0;
> +    fc->tab.ctu_size  = 0;
> +    fc->tab.pic_size_in_min_cb = 0;
> +    fc->tab.pic_size_in_min_pu = 0;
> +    fc->tab.pic_size_in_min_tu = 0;
> +    fc->tab.width              = 0;
> +    fc->tab.height             = 0;
> +    fc->tab.ctu_width          = 0;
> +    fc->tab.ctu_height         = 0;
> +    fc->tab.bs_width           = 0;
> +    fc->tab.bs_height          = 0;
> +}
> +
> +static int pic_arrays_init(VVCContext *s, VVCFrameContext *fc)
> +{
> +    const VVCSPS *sps               = fc->ps.sps;
> +    const VVCPPS *pps               = fc->ps.pps;
> +    const int ctu_size              = 1 << sps->ctb_log2_size_y << sps->ctb_log2_size_y;
> +    const int pic_size_in_min_cb    = pps->min_cb_width * pps->min_cb_height;
> +    const int pic_size_in_min_pu    = pps->min_pu_width * pps->min_pu_height;
> +    const int pic_size_in_min_tu    = pps->min_tu_width * pps->min_tu_height;
> +    const int w32                   = AV_CEIL_RSHIFT(pps->width,  5);
> +    const int h32                   = AV_CEIL_RSHIFT(pps->height,  5);
> +    const int w64                   = AV_CEIL_RSHIFT(pps->width,  6);
> +    const int h64                   = AV_CEIL_RSHIFT(pps->height,  6);
> +    const int bs_width              = (fc->ps.pps->width >> 2) + 1;
> +    const int bs_height             = (fc->ps.pps->height >> 2) + 1;
> +    int ret;
> +
> +    if ((ret = ctb_arrays_init(fc, pps->ctb_count, ctu_size)) < 0)
> +        goto fail;
> +
> +    if ((ret = min_cb_arrays_init(fc, pic_size_in_min_cb)) < 0)
> +        goto fail;
> +
> +    if ((ret = min_pu_arrays_init(fc, pic_size_in_min_pu)) < 0)
> +        goto fail;
> +
> +    if ((ret = min_tu_arrays_init(fc, pic_size_in_min_tu)) < 0)
> +        goto fail;
> +
> +    if ((ret = bs_arrays_init(fc, bs_width, bs_height)) < 0)
> +        goto fail;
> +
> +    if ((ret = pixel_buffer_init(fc, pps->width, pps->height, pps->ctb_width, pps->ctb_height,
> +        sps->r->sps_chroma_format_idc, sps->pixel_shift)) < 0)
> +        goto fail;
> +
> +    if (AV_CEIL_RSHIFT(fc->tab.width,  5) != w32 || AV_CEIL_RSHIFT(fc->tab.height,  5) != h32) {
> +        for (int i = LUMA; i <= CHROMA; i++) {
> +            av_freep(&fc->tab.msm[i]);
> +            fc->tab.msm[i] = av_calloc(w32, h32);
> +            if (!fc->tab.msm[i])
> +                goto fail;
> +        }
> +    } else {
> +        for (int i = LUMA; i <= CHROMA; i++)
> +            memset(fc->tab.msm[i], 0, w32 * h32);
> +    }
> +    if (AV_CEIL_RSHIFT(fc->tab.width,  6) != w64 || AV_CEIL_RSHIFT(fc->tab.height,  6) != h64) {
> +        av_freep(&fc->tab.ispmf);
> +        fc->tab.ispmf = av_calloc(w64, h64);
> +        if (!fc->tab.ispmf)
> +            goto fail;
> +    } else {
> +        memset(fc->tab.ispmf, 0, w64 * h64);
> +    }
> +
> +    if (!fc->cu_pool) {
> +        fc->cu_pool = ff_refstruct_pool_alloc(sizeof(CodingUnit), 0);
> +        if (!fc->cu_pool)
> +            goto fail;

The size of the objects contained in this pool doesn't depend on any
bitstream parameters. You can therefore simply use a single pool (in
VVCContext) that is allocated in vvc_decode_init() and freed in
vvc_decode_free().
The same goes for tu_pool below.


> +    }
> +
> +    if (!fc->tu_pool) {
> +        fc->tu_pool = ff_refstruct_pool_alloc(sizeof(TransformUnit), 0);
> +        if (!fc->tu_pool)
> +            goto fail;
> +    }
> +
> +    fc->tab.ctu_count = pps->ctb_count;
> +    fc->tab.ctu_size  = ctu_size;
> +    fc->tab.pic_size_in_min_cb = pic_size_in_min_cb;
> +    fc->tab.pic_size_in_min_pu = pic_size_in_min_pu;
> +    fc->tab.pic_size_in_min_tu = pic_size_in_min_tu;
> +    fc->tab.width              = pps->width;
> +    fc->tab.height             = pps->height;
> +    fc->tab.ctu_width          = pps->ctb_width;
> +    fc->tab.ctu_height         = pps->ctb_height;
> +    fc->tab.chroma_format_idc  = sps->r->sps_chroma_format_idc;
> +    fc->tab.pixel_shift        = sps->pixel_shift;
> +    fc->tab.bs_width           = bs_width;
> +    fc->tab.bs_height          = bs_height;
> +
> +    return 0;
> +fail:
> +    pic_arrays_free(fc);
> +    return ret;
> +}
> +
> +static int min_positive(const int idx, const int diff, const int min_diff)
> +{
> +    return diff > 0 && (idx < 0 || diff < min_diff);
> +}
> +
> +static int max_negtive(const int idx, const int diff, const int max_diff)
> +{
> +    return diff < 0 && (idx < 0 || diff > max_diff);
> +}
> +
> +typedef int (*smvd_find_fxn)(const int idx, const int diff, const int old_diff);
> +
> +static int8_t smvd_find(const VVCFrameContext *fc, const SliceContext *sc, int lx, smvd_find_fxn find)
> +{
> +    const H266RawSliceHeader *rsh   = sc->sh.r;
> +    const RefPicList *rpl           = sc->rpl + lx;
> +    const int poc                   = fc->ref->poc;
> +    int8_t idx                      = -1;
> +    int old_diff                    = -1;
> +    for (int i = 0; i < rsh->num_ref_idx_active[lx]; i++) {
> +        if (!rpl->isLongTerm[i]) {
> +            int diff = poc - rpl->list[i];
> +            if (find(idx, diff, old_diff)) {
> +                idx = i;
> +                old_diff = diff;
> +            }
> +        }
> +    }
> +    return idx;
> +}
> +
> +static void vvc_smvd_ref_idx(const VVCFrameContext *fc, SliceContext *sc)
> +{
> +    VVCSH *sh = &sc->sh;
> +    if (IS_B(sh->r)) {
> +        sh->ref_idx_sym[0] = smvd_find(fc, sc, 0, min_positive);
> +        sh->ref_idx_sym[1] = smvd_find(fc, sc, 1, max_negtive);
> +        if (sh->ref_idx_sym[0] == -1 || sh->ref_idx_sym[1] == -1) {
> +            sh->ref_idx_sym[0] = smvd_find(fc, sc, 0, max_negtive);
> +            sh->ref_idx_sym[1] = smvd_find(fc, sc, 1, min_positive);
> +        }
> +    }
> +}
> +
> +static void eps_free(SliceContext *slice)
> +{
> +    av_freep(&slice->eps);
> +}
> +
> +static void slices_free(VVCFrameContext *fc)
> +{
> +    if (fc->slices) {
> +        for (int i = 0; i < fc->nb_slices_allocated; i++) {
> +            SliceContext *slice = fc->slices[i];
> +            if (slice) {
> +                ff_refstruct_unref(&slice->sh.r);
> +                eps_free(slice);
> +                av_free(slice);
> +            }
> +        }
> +        av_freep(&fc->slices);
> +    }
> +    fc->nb_slices_allocated = 0;
> +    fc->nb_slices = 0;
> +}
> +
> +static int slices_realloc(VVCFrameContext *fc)
> +{
> +    void *p;
> +    const int size = (fc->nb_slices_allocated + 1) * 3 / 2;
> +
> +    if (fc->nb_slices < fc->nb_slices_allocated)
> +        return 0;
> +
> +    p = av_realloc(fc->slices, size * sizeof(*fc->slices));

av_realloc_array()

> +    if (!p)
> +        return AVERROR(ENOMEM);
> +
> +    fc->slices = p;
> +    for (int i = fc->nb_slices_allocated; i < size; i++) {
> +        fc->slices[i] = av_calloc(1, sizeof(*fc->slices[0]));

av_mallocz().

> +        if (!fc->slices[i]) {
> +            for (int j = fc->nb_slices_allocated; j < i; j++)
> +                av_freep(&fc->slices[j]);
> +            return AVERROR(ENOMEM);

Can't you simply set fc->nb_slices_allocated to i in order to avoid this
loop?

> +        }
> +        fc->slices[i]->slice_idx = i;
> +    }
> +    fc->nb_slices_allocated = size;
> +    return 0;
> +}
> +
> +static void ep_init_cabac_decoder(SliceContext *sc, const int index, const H2645NAL *nal, GetBitContext *gb)
> +{
> +    const H266RawSliceHeader *rsh   = sc->sh.r;
> +    EntryPoint *ep                  = sc->eps + index;
> +    int size;
> +
> +    if (index < rsh->num_entry_points) {
> +        int skipped = 0;
> +        int64_t start =  (gb->index >> 3);
> +        int64_t end = start + rsh->sh_entry_point_offset_minus1[index] + 1;
> +        while (skipped < nal->skipped_bytes && nal->skipped_bytes_pos[skipped] <= start) {
> +            skipped++;
> +        }
> +        while (skipped < nal->skipped_bytes && nal->skipped_bytes_pos[skipped] < end) {
> +            end--;
> +            skipped++;
> +        }
> +        size = end - start;
> +    } else {
> +        size = get_bits_left(gb) / 8;
> +    }
> +    ff_init_cabac_decoder (&ep->cc, gb->buffer + get_bits_count(gb) / 8, size);
> +    skip_bits(gb, size * 8);
> +}
> +
> +static int init_slice_context(SliceContext *sc, VVCFrameContext *fc, const H2645NAL *nal, const CodedBitstreamUnit *unit)
> +{
> +    const VVCSH *sh             = &sc->sh;
> +    const H266RawSlice *slice   = (const H266RawSlice *)unit->content;

Please no pointless casts. Also, why is there unnecessary whitespace in
front of '='?

> +    int nb_eps                  = sh->r->num_entry_points + 1;
> +    int ctu_addr                = 0;
> +    GetBitContext gb;
> +
> +    if (sc->nb_eps != nb_eps) {
> +        eps_free(sc);
> +        sc->eps = av_calloc(nb_eps, sizeof(*sc->eps));
> +        if (!sc->eps)
> +            return AVERROR(ENOMEM);

In case of error, sc->eps is NULL, yet sc->nb_eps may be != 0. Stuff
like this can (and does) lead to crashes.

> +        sc->nb_eps = nb_eps;
> +    }
> +
> +    init_get_bits8(&gb, slice->data, slice->data_size);
> +    for (int i = 0; i < sc->nb_eps; i++)
> +    {
> +        EntryPoint *ep = sc->eps + i;
> +
> +        ep->ctu_start = ctu_addr;
> +        ep->ctu_end   = (i + 1 == sc->nb_eps ? sh->num_ctus_in_curr_slice : sh->entry_point_start_ctu[i]);
> +
> +        for (int j = ep->ctu_start; j < ep->ctu_end; j++) {
> +            const int rs = sc->sh.ctb_addr_in_curr_slice[j];
> +            fc->tab.slice_idx[rs] = sc->slice_idx;
> +        }
> +
> +        ep_init_cabac_decoder(sc, i, nal, &gb);
> +
> +        if (i + 1 < sc->nb_eps)
> +            ctu_addr = sh->entry_point_start_ctu[i];
> +    }
> +
> +    return 0;
> +}
> +
> +static VVCFrameContext* get_frame_context(const VVCContext *s, const VVCFrameContext *fc, const int delta)
> +{
> +    const int size = s->nb_fcs;
> +    const int idx = (fc - s->fcs + delta  + size) % size;
> +    return s->fcs + idx;
> +}
> +
> +static int vvc_ref_frame(VVCFrameContext *fc, VVCFrame *dst, VVCFrame *src)

src should be const.

> +{
> +    int ret;
> +
> +    ret = av_frame_ref(dst->frame, src->frame);
> +    if (ret < 0)
> +        return ret;
> +
> +    ff_refstruct_replace(&dst->progress, src->progress);
> +
> +    ff_refstruct_replace(&dst->tab_dmvr_mvf, src->tab_dmvr_mvf);
> +
> +    ff_refstruct_replace(&dst->rpl_tab, src->rpl_tab);
> +    ff_refstruct_replace(&dst->rpl, src->rpl);
> +    dst->nb_rpl_elems = src->nb_rpl_elems;
> +
> +    dst->poc = src->poc;
> +    dst->ctb_count = src->ctb_count;
> +    dst->flags = src->flags;
> +    dst->sequence = src->sequence;
> +
> +    return 0;
> +}
> +
> +static av_cold void frame_context_free(VVCFrameContext *fc)
> +{
> +    slices_free(fc);
> +
> +    for (int i = 0; i < FF_ARRAY_ELEMS(fc->DPB); i++) {
> +        ff_vvc_unref_frame(fc, &fc->DPB[i], ~0);
> +        av_frame_free(&fc->DPB[i].frame);
> +    }
> +
> +    ff_vvc_frame_thread_free(fc);
> +    pic_arrays_free(fc);
> +    av_frame_free(&fc->output_frame);
> +    ff_vvc_frame_ps_free(&fc->ps);
> +    av_freep(&fc->avctx);
> +}
> +
> +static av_cold int frame_context_init(VVCFrameContext *fc, AVCodecContext *avctx)
> +{
> +
> +    fc->avctx = av_memdup(avctx, sizeof(*avctx));

When I read this, I presumed you are using multiple AVCodecContexts to
store the ever changing state of the AVCodecContext fields similarly to
update_context_from_thread() in pthread_frame.c. But it seems you don't.
These contexts are only used as a) logcontexts (where the actual
user-facing AVCodecContext should be used, so that the user can make
sense of the logmessages!), b) in ff_thread_get_buffer() and c) in
export_frame_params() where only some basic fields
(dimension-related+pix_fmt) are set. Presumably c) is done for b).

But the user is allowed to change the provided callbacks in the master
context at any time. E.g. the call to ff_thread_get_buffer() in
vvc_refs.c currently uses the VVCFrameContext's copy and therefore uses
the get_buffer2 callback that was in place at the time of av_memdup(),
not the one the user has set since. This is wrong.

I think you can just remove VVCFrameContext.avctx and use the
user-facing AVCodecContext if you set the AVFrame properties that are
normally derived from the AVCodecContext directly on the AVFrame before
ff_thread_get_buffer().

> +    if (!fc->avctx)
> +        goto fail;
> +
> +    fc->output_frame = av_frame_alloc();
> +    if (!fc->output_frame)
> +        goto fail;
> +
> +    for (int j = 0; j < FF_ARRAY_ELEMS(fc->DPB); j++) {
> +        fc->DPB[j].frame = av_frame_alloc();
> +        if (!fc->DPB[j].frame)
> +            goto fail;
> +    }
> +
> +    return 0;
> +fail:
> +    return AVERROR(ENOMEM);
> +}
> +
> +static int frame_context_setup(VVCFrameContext *fc, VVCContext *s)
> +{
> +    int ret = 0;
> +
> +    // copy refs from the last frame
> +    if (s->nb_frames && s->nb_fcs > 1) {
> +        VVCFrameContext *prev = get_frame_context(s, fc, -1);
> +        for (int i = 0; i < FF_ARRAY_ELEMS(fc->DPB); i++) {
> +            ff_vvc_unref_frame(fc, &fc->DPB[i], ~0);
> +            if (prev->DPB[i].frame->buf[0]) {
> +                ret = vvc_ref_frame(fc, &fc->DPB[i], &prev->DPB[i]);
> +                if (ret < 0)
> +                    goto fail;
> +            }
> +        }
> +    }
> +
> +    if (IS_IDR(s)) {
> +        s->seq_decode = (s->seq_decode + 1) & 0xff;
> +        ff_vvc_clear_refs(fc);
> +    }
> +
> +    ret = pic_arrays_init(s, fc);
> +    if (ret < 0)
> +        goto fail;
> +    ff_vvc_dsp_init(&fc->vvcdsp, fc->ps.sps->bit_depth);
> +    ff_videodsp_init(&fc->vdsp, fc->ps.sps->bit_depth);
> +
> +fail:
> +    return ret;
> +}
> +
> +static void export_frame_params(VVCFrameContext *fc)
> +{
> +    AVCodecContext *c   = fc->avctx;
> +    const VVCSPS *sps   = fc->ps.sps;
> +    const VVCPPS *pps   = fc->ps.pps;
> +
> +    c->pix_fmt          = sps->pix_fmt;
> +    c->coded_width      = pps->width;
> +    c->coded_height     = pps->height;
> +    c->width            = pps->width  - pps->r->pps_conf_win_left_offset - pps->r->pps_conf_win_right_offset;
> +    c->height           = pps->height - pps->r->pps_conf_win_top_offset - pps->r->pps_conf_win_bottom_offset;
> +}
> +
> +static int decode_slice(VVCContext *s, VVCFrameContext *fc, const H2645NAL *nal, const CodedBitstreamUnit *unit)
> +{
> +    int ret = 0;
> +    SliceContext *sc;
> +    VVCSH *sh;
> +    const int is_first_slice = !fc->nb_slices;
> +
> +    ret = slices_realloc(fc);
> +    if (ret < 0)
> +        return ret;
> +    sc = fc->slices[fc->nb_slices];
> +
> +    sh = &sc->sh;
> +
> +    if (ret < 0)
> +        goto fail;
> +
> +    s->vcl_unit_type = nal->type;
> +    if (is_first_slice) {
> +        //first slice
> +        ret = ff_vvc_decode_frame_ps(&fc->ps, s);
> +        if (ret < 0)
> +            return ret;
> +
> +        ret = frame_context_setup(fc, s);
> +        if (ret < 0)
> +            goto fail;
> +
> +        export_frame_params(fc);
> +    }
> +
> +    ret = ff_vvc_decode_sh(&sc->sh, &fc->ps, unit);
> +    if (ret < 0)
> +        return ret;
> +
> +    if (is_first_slice) {
> +        ret = vvc_frame_start(s, fc, sc);
> +        if (ret < 0)
> +            return ret;
> +    } else if (fc->ref) {
> +        if (!IS_I(sh->r)) {
> +            ret = ff_vvc_slice_rpl(s, fc, sc);
> +            if (ret < 0) {
> +                av_log(fc->avctx, AV_LOG_WARNING,
> +                       "Error constructing the reference lists for the current slice.\n");
> +                return ret;
> +            }
> +        }
> +    } else {
> +        av_log(fc->avctx, AV_LOG_ERROR, "First slice in a frame missing.\n");
> +        return ret;
> +    }
> +
> +    if (!IS_I(sh->r))
> +        vvc_smvd_ref_idx(fc, sc);
> +
> +    ret = init_slice_context(sc, fc, nal, unit);
> +    if (ret < 0)
> +        goto fail;
> +    fc->nb_slices++;
> +
> +fail:
> +    return ret;
> +}
> +
> +static int decode_nal_unit(VVCContext *s, VVCFrameContext *fc, const H2645NAL *nal, const CodedBitstreamUnit *unit)
> +{
> +    int  ret;
> +
> +    s->temporal_id   = nal->temporal_id;
> +
> +    switch (unit->type) {
> +    case VVC_VPS_NUT:
> +    case VVC_SPS_NUT:
> +    case VVC_PPS_NUT:
> +        /* vps, sps, sps cached by s->cbc */
> +        break;
> +    case VVC_TRAIL_NUT:
> +    case VVC_STSA_NUT:
> +    case VVC_RADL_NUT:
> +    case VVC_RASL_NUT:
> +    case VVC_IDR_W_RADL:
> +    case VVC_IDR_N_LP:
> +    case VVC_CRA_NUT:
> +    case VVC_GDR_NUT:
> +        ret = decode_slice(s, fc, nal, unit);
> +        if (ret < 0)
> +            goto fail;
> +        break;
> +    case VVC_PREFIX_APS_NUT:
> +    case VVC_SUFFIX_APS_NUT:
> +        ret = ff_vvc_decode_aps(&s->ps, unit);
> +        if (ret < 0)
> +            goto fail;
> +        break;
> +    default:
> +        av_log(s->avctx, AV_LOG_INFO,
> +               "Skipping NAL unit %d\n", unit->type);

This will probably be very noisy (and warn for every SEI). I don't think
it is even needed, as h2645_parse.c already contains debug log messages
to display the unit type.

> +    }
> +
> +    return 0;
> +fail:
> +    return ret;

A fail that is only "return ret" is pointless (not only here).

> +}
> +
> +static int decode_nal_units(VVCContext *s, VVCFrameContext *fc, AVPacket *avpkt)
> +{
> +    const CodedBitstreamH266Context *h266   = (const CodedBitstreamH266Context *)s->cbc->priv_data;
> +    CodedBitstreamFragment *frame           = &s->current_frame;
> +    int i, ret = 0;
> +    int eos_at_start = 1;
> +    s->last_eos = s->eos;
> +    s->eos = 0;
> +
> +    ff_cbs_fragment_reset(frame);
> +    ret = ff_cbs_read_packet(s->cbc, frame, avpkt);
> +    if (ret < 0) {
> +        av_log(s->avctx, AV_LOG_ERROR, "Failed to read packet.\n");
> +        return ret;
> +    }
> +    /* decode the NAL units */
> +    for (i = 0; i < frame->nb_units; i++) {
> +        const H2645NAL *nal             = h266->common.read_packet.nals + i;
> +        const CodedBitstreamUnit *unit  = frame->units + i;
> +
> +        if (unit->type == VVC_EOB_NUT || unit->type == VVC_EOS_NUT) {
> +            if (eos_at_start)
> +                s->last_eos = 1;
> +            else
> +                s->eos = 1;
> +        } else {
> +            ret = decode_nal_unit(s, fc, nal, unit);
> +            if (ret < 0) {
> +                av_log(s->avctx, AV_LOG_WARNING,
> +                        "Error parsing NAL unit #%d.\n", i);
> +                goto fail;
> +            }
> +        }
> +    }
> +    return 0;
> +
> +fail:
> +    if (fc->ref)
> +        ff_vvc_report_frame_finished(fc->ref);
> +    return ret;
> +}
> +
> +static int set_output_format(const VVCContext *s, const AVFrame *output)
> +{
> +    AVCodecContext *c = s->avctx;
> +    int ret;
> +
> +    if (output->width != c->width || output->height != c->height) {
> +        if ((ret = ff_set_dimensions(c, output->width, output->height)) < 0)
> +            return ret;
> +    }
> +    c->pix_fmt = output->format;
> +    return 0;
> +}
> +
> +static int wait_delayed_frame(VVCContext *s, AVFrame *output, int *got_output)
> +{
> +    VVCFrameContext *delayed = get_frame_context(s, s->fcs, s->nb_frames - s->nb_delayed);
> +    int ret = ff_vvc_frame_wait(s, delayed);
> +
> +    if (!ret && delayed->output_frame->buf[0]) {
> +        av_frame_move_ref(output, delayed->output_frame);
> +        ret = set_output_format(s, output);
> +        if (!ret)
> +            *got_output = 1;
> +    }
> +    s->nb_delayed--;
> +
> +    return ret;
> +}
> +
> +static int submit_frame(VVCContext *s, VVCFrameContext *fc, AVFrame *output, int *got_output)
> +{
> +    int ret;
> +    s->nb_frames++;
> +    s->nb_delayed++;
> +    ff_vvc_frame_submit(s, fc);
> +    if (s->nb_delayed >= s->nb_fcs) {
> +        if ((ret = wait_delayed_frame(s, output, got_output)) < 0)
> +            return ret;
> +    }
> +    return 0;
> +}
>  
>  static int vvc_decode_frame(AVCodecContext *avctx, AVFrame *output,
>      int *got_output, AVPacket *avpkt)
>  {
> +    VVCContext *s = avctx->priv_data;
> +    VVCFrameContext *fc;
> +    int ret;
> +
> +    if (!avpkt->size) {
> +        while (s->nb_delayed) {
> +            if ((ret = wait_delayed_frame(s, output, got_output)) < 0)
> +                return ret;
> +            if (*got_output)
> +                return 0;
> +        }
> +        if (s->nb_frames) {
> +            //we still have frames cached in dpb.
> +            VVCFrameContext *last = get_frame_context(s, s->fcs, s->nb_frames - 1);
> +
> +            ret = ff_vvc_output_frame(s, last, output, 0, 1);
> +            if (ret < 0)
> +                return ret;
> +            if (ret) {
> +                *got_output = ret;
> +                if ((ret = set_output_format(s, output)) < 0)
> +                    return ret;
> +            }
> +        }
> +        return 0;
> +    }
> +
> +    fc = get_frame_context(s, s->fcs, s->nb_frames);
> +
> +    fc->nb_slices = 0;
> +    fc->decode_order = s->nb_frames;
> +
> +    ret = decode_nal_units(s, fc, avpkt);
> +    if (ret < 0)
> +        return ret;
> +
> +    ret = submit_frame(s, fc, output, got_output);
> +    if (ret < 0)
> +        return ret;
> +
>      return avpkt->size;
>  }
>  
>  static void vvc_decode_flush(AVCodecContext *avctx)

Should also be av_cold

>  {
> +    VVCContext *s = avctx->priv_data;
> +    int got_output;
> +    AVFrame *output = av_frame_alloc();

Allocating a frame for flushing is bad enough, but you are only flushing
if said allocation succeeds. If it does not, then we never wait for
frames which are currently being decoded by other threads, do we? So there
can be races and even crashes when this function is called from
vvc_decode_free() and the allocation fails.
Instead you could pass NULL to wait_delayed_frame() and make it unref
the frames (instead of moving them) in case the output frame is NULL.

> +
> +    if (output) {
> +        while (s->nb_delayed) {
> +            wait_delayed_frame(s, output, &got_output);
> +            if (got_output) {
> +                av_frame_unref(output);
> +            }
> +        }
> +        av_frame_free(&output);
> +    }
>  }
>  
>  static av_cold int vvc_decode_free(AVCodecContext *avctx)
>  {
> +    VVCContext *s = avctx->priv_data;
> +    int i;
> +
> +    ff_cbs_fragment_free(&s->current_frame);

Is it certain that the fragment is not in use (given that other threads may
still be running at this point, before vvc_decode_flush() is called)?

> +    vvc_decode_flush(avctx);
> +    ff_vvc_executor_free(&s->executor);
> +    if (s->fcs) {
> +        for (i = 0; i < s->nb_fcs; i++)

"for (int i = 0; ..." is better as it has a smaller scope; in this case, it
also allows saving a line of code. Something similar is possible in
decode_nal_units(); please check the other patches, too.

> +            frame_context_free(s->fcs + i);
> +        av_free(s->fcs);
> +    }
> +    ff_vvc_ps_uninit(&s->ps);
> +    ff_cbs_close(&s->cbc);
> +
>      return 0;
>  }
>  
> +#define VVC_MAX_FRMAE_DELAY 16

typo

>  static av_cold int vvc_decode_init(AVCodecContext *avctx)
>  {
> +    VVCContext *s       = avctx->priv_data;
> +    int ret;
> +
> +    s->avctx = avctx;
> +
> +    if (ff_cbs_init(&s->cbc, AV_CODEC_ID_VVC, avctx))
> +        goto fail;

Forward the error code.

> +
> +    s->nb_fcs = (avctx->flags & AV_CODEC_FLAG_LOW_DELAY) ? 1 : FFMIN(av_cpu_count(), VVC_MAX_FRMAE_DELAY);

This may evaluate av_cpu_count() multiple times. Furthermore I don't
know why this define is used here at all: With frame threading, the
number of frame threads is not limited by the delay/number of reordering
frames at all (we even have frame-threading for decoders without
frame-reordering at all).

But the worst of this is that you do not check avctx->thread_count at all.

> +    s->fcs = av_calloc(s->nb_fcs, sizeof(*s->fcs));
> +    if (!s->fcs)
> +        goto fail;
> +
> +    for (int i = 0; i < s->nb_fcs; i++) {
> +        VVCFrameContext *fc = s->fcs + i;
> +        ret = frame_context_init(fc, avctx);
> +        if (ret < 0)
> +            goto fail;
> +    }
> +
> +    s->executor = ff_vvc_executor_alloc(s, s->nb_fcs);
> +    if (!s->executor)
> +        goto fail;
> +
> +    s->eos = 1;
> +    GDR_SET_RECOVERED(s);
> +    memset(&ff_vvc_default_scale_m, 16, sizeof(ff_vvc_default_scale_m));

This needs to be done once (i.e. protected by an AVOnce) and not every
time a decoder is set up. Otherwise there might be data races.

> +
>      return 0;
> +
> +fail:
> +    vvc_decode_free(avctx);

Unnecessary, as this decoder has the FF_CODEC_CAP_INIT_CLEANUP set. In
fact, given that vvc_decode_free() uses av_free() instead of av_freep()
for s->fcs, calling vvc_decode_free() here can lead to a use-after-free
(namely when vvc_decode_free() is called generically later).

> +    return AVERROR(ENOMEM);
>  }
>  
>  const FFCodec ff_vvc_decoder = {
Nuo Mi Dec. 8, 2023, 4:20 p.m. UTC | #3
Hi Andreas,
thank you for the review.
On Fri, Dec 8, 2023 at 8:17 PM Andreas Rheinhardt <
andreas.rheinhardt@outlook.com> wrote:

>
> > +
> > +static int min_pu_arrays_init(VVCFrameContext *fc, const int
> pic_size_in_min_pu)
> > +{
> > +    if (fc->tab.pic_size_in_min_pu != pic_size_in_min_pu) {
> > +        min_pu_arrays_free(fc);
> > +        fc->tab.msf  = av_mallocz(pic_size_in_min_pu);
> > +        fc->tab.iaf  = av_mallocz(pic_size_in_min_pu);
> > +        fc->tab.mmi  = av_mallocz(pic_size_in_min_pu);
> > +        fc->tab.mvf  = av_mallocz(pic_size_in_min_pu *
> sizeof(*fc->tab.mvf));
>
> Do these have to be separate allocations? If there were allocated
> jointly, one memset below would suffice.
>
They are separate flags; if we combine them, we can't use memset to set
the flags for a block.

>
> > +
> > +    if (!fc->cu_pool) {
> > +        fc->cu_pool = ff_refstruct_pool_alloc(sizeof(CodingUnit), 0);
> > +        if (!fc->cu_pool)
> > +            goto fail;
>
> The size of the objects contained in this pool don't depend on any
> bitstream parameters. You can therefore simply use a single pool (in
> VVCContext) that is allocated in vvc_decode_init() and freed in
> vvc_decode_free().
> The same goes for tu_pool below.
>
A global pool may have performance issues with a huge number of threads.
I moved it to frame_context_init instead.

>
>
>
> > +static int slices_realloc(VVCFrameContext *fc)
> > +{
> > +    void *p;
> > +    const int size = (fc->nb_slices_allocated + 1) * 3 / 2;
> > +
> > +    if (fc->nb_slices < fc->nb_slices_allocated)
> > +        return 0;
> > +
> > +    p = av_realloc(fc->slices, size * sizeof(*fc->slices));
>
> av_realloc_array()
>
 done

>
> > +    if (!p)
> > +        return AVERROR(ENOMEM);
> > +
> > +    fc->slices = p;
> > +    for (int i = fc->nb_slices_allocated; i < size; i++) {
> > +        fc->slices[i] = av_calloc(1, sizeof(*fc->slices[0]));
>
> av_mallocz().
>
done

>
> > +        if (!fc->slices[i]) {
> > +            for (int j = fc->nb_slices_allocated; j < i; j++)
> > +                av_freep(&fc->slices[j]);
> > +            return AVERROR(ENOMEM);
>
> Can't you simply set fc->nb_slices_allocated to i in order to avoid this
> loop?
>
done

> > +
> > +static int init_slice_context(SliceContext *sc, VVCFrameContext *fc,
> const H2645NAL *nal, const CodedBitstreamUnit *unit)
> > +{
> > +    const VVCSH *sh             = &sc->sh;
> > +    const H266RawSlice *slice   = (const H266RawSlice *)unit->content;
>
> Please no pointless casts. Also, why is there unnecessary whitespace in
> front of '='?
>
Fixed here and in several other places.
The whitespace puts all the '=' signs in one column.


> > +    int nb_eps                  = sh->r->num_entry_points + 1;
> > +    int ctu_addr                = 0;
> > +    GetBitContext gb;
> > +
> > +    if (sc->nb_eps != nb_eps) {
> > +        eps_free(sc);
> > +        sc->eps = av_calloc(nb_eps, sizeof(*sc->eps));
> > +        if (!sc->eps)
> > +            return AVERROR(ENOMEM);
>
> In case of error, sc->eps is NULL, yet sc->nb_eps may be != 0. Stuff
> like this can (and does) lead to crashes.
>
added "slice->nb_eps = 0;" to eps_free

>
> > +static int vvc_ref_frame(VVCFrameContext *fc, VVCFrame *dst, VVCFrame
> *src)
>
> src should be const.
>
done

>
> > +
> > +static av_cold int frame_context_init(VVCFrameContext *fc,
> AVCodecContext *avctx)
> > +{
> > +
> > +    fc->avctx = av_memdup(avctx, sizeof(*avctx));
>
> When I read this, I presumed you are using multiple AVCodecContexts to
> store the ever changing state of the AVCodecContext fields similarly to
> update_context_from_thread() in pthread_frame.c. But it seems you don't.
> These contexts are only used as a) logcontexts (where the actual
> user-facing AVCodecContext should be used, so that the user can make
> sense of the logmessages!), b) in ff_thread_get_buffer() and c) in
> export_frame_params() where only some basic fields
> (dimension-related+pix_fmt) is set. Presumably c) is done for b).
>
I remember that if I did not use a local AVCodecContext, it would trigger
some assert when the resolution changed.

>
> But the user is allowed to change the provided callbacks in the master
> context at any time. E.g. the call to ff_thread_get_buffer() in
> vvc_refs.c currently uses the VVCFrameContext and therefore uses the
> get_buffer2 callback in place now (during av_memdup()). This is wrong.
>
This will not happen. av_memdup only happens in vvc_decode_init.
Nobody will call ff_thread_get_buffer at this time

>
> I think you can just remove VVCFrameContext.avctx and use the
> user-facing AVCodecContext if you set the AVFrame properties that are
> normally derived from the AVCodecContext directly on the AVFrame before
> ff_thread_get_buffer().

Could you explain more about how to create a user-facing  AVCodecContext?

>
> > +
> > +static int decode_nal_unit(VVCContext *s, VVCFrameContext *fc, const
> H2645NAL *nal, const CodedBitstreamUnit *unit)
> > +{
> > +    int  ret;
> > +
> > +    s->temporal_id   = nal->temporal_id;
> > +
> > +    switch (unit->type) {
> > +    case VVC_VPS_NUT:
> > +    case VVC_SPS_NUT:
> > +    case VVC_PPS_NUT:
> > +        /* vps, sps, sps cached by s->cbc */
> > +        break;
> > +    case VVC_TRAIL_NUT:
> > +    case VVC_STSA_NUT:
> > +    case VVC_RADL_NUT:
> > +    case VVC_RASL_NUT:
> > +    case VVC_IDR_W_RADL:
> > +    case VVC_IDR_N_LP:
> > +    case VVC_CRA_NUT:
> > +    case VVC_GDR_NUT:
> > +        ret = decode_slice(s, fc, nal, unit);
> > +        if (ret < 0)
> > +            goto fail;
> > +        break;
> > +    case VVC_PREFIX_APS_NUT:
> > +    case VVC_SUFFIX_APS_NUT:
> > +        ret = ff_vvc_decode_aps(&s->ps, unit);
> > +        if (ret < 0)
> > +            goto fail;
> > +        break;
> > +    default:
> > +        av_log(s->avctx, AV_LOG_INFO,
> > +               "Skipping NAL unit %d\n", unit->type);
>
> This will probably be very noisy (and warn for every SEI). I don't think
> it is even needed, as h2645_parse.c already contains debug log messages
> to display the unit type.
>
It's copied from hevcdec. It means we did not handle the NAL unit, which is
different from the h2645_parse.c messages.

> A fail that is only "return ret" is pointless (not only here).
>
If someday we need to add some cleanup code, we will not need to change
all the returns to gotos.

>
> >  static void vvc_decode_flush(AVCodecContext *avctx)
>
> Should also be av_cold
>
done

>
> >  {
> > +    VVCContext *s = avctx->priv_data;
> > +    int got_output;
> > +    AVFrame *output = av_frame_alloc();
>
> Allocating a frame for flushing is bad enough, but you are only flushing
> if said allocating succeeds. If it does not, then we never wait for
> frames which are currently decoded by other threads, don't we? So there
> can be races and even crashes when this function is called from
> vvc_decode_free() and allocation.
> Instead you could pass NULL to wait_delayed_frame() and make it unref
> the frames (instead of moving them) in case the output frame is NULL.
>
done

>
> > +
> > +    if (output) {
> > +        while (s->nb_delayed) {
> > +            wait_delayed_frame(s, output, &got_output);
> > +            if (got_output) {
> > +                av_frame_unref(output);
> > +            }
> > +        }
> > +        av_frame_free(&output);
> > +    }
> >  }
> >
> >  static av_cold int vvc_decode_free(AVCodecContext *avctx)
> >  {
> > +    VVCContext *s = avctx->priv_data;
> > +    int i;
> > +
> > +    ff_cbs_fragment_free(&s->current_frame);
>
> Is it sure that the fragment is not in use (given that other threads may
> be running now before vvc_decode_flush())?
>
Do you mean the executor threads? If they want to use some data, they will
take their own hip.
see ff_refstruct_replace(&sps->r, rsps);

>
> > +    vvc_decode_flush(avctx);
> > +    ff_vvc_executor_free(&s->executor);
> > +    if (s->fcs) {
> > +        for (i = 0; i < s->nb_fcs; i++)
>
> for (int i = 0; is better as it has smaller scope; in this case, it also
> allows to save a line of code. Something similar is possible in
> decode_nal_units(), please check the other patches, too.

Yeah, most of the code uses the smallest scope. But some code was copied
from hevc, or I just missed it. :)
I double-checked and fixed all of them (I hope so).

>


> > +            frame_context_free(s->fcs + i);
> > +        av_free(s->fcs);
> > +    }
> > +    ff_vvc_ps_uninit(&s->ps);
> > +    ff_cbs_close(&s->cbc);
> > +
> >      return 0;
> >  }
> >
> > +#define VVC_MAX_FRMAE_DELAY 16
>
> typo
>
fixed

>
> >  static av_cold int vvc_decode_init(AVCodecContext *avctx)
> >  {
> > +    VVCContext *s       = avctx->priv_data;
> > +    int ret;
> > +
> > +    s->avctx = avctx;
> > +
> > +    if (ff_cbs_init(&s->cbc, AV_CODEC_ID_VVC, avctx))
> > +        goto fail;
>
> Forward the error code.
>
done

>
> > +
> > +    s->nb_fcs = (avctx->flags & AV_CODEC_FLAG_LOW_DELAY) ? 1 :
> FFMIN(av_cpu_count(), VVC_MAX_FRMAE_DELAY);
>
> This may evaluate av_cpu_count() multiple times. Furthermore I don't
> know why this define is used here at all: With frame threading, the
> number of frame threads is not limited by the delay/number of reordering
> frames at all (we even have frame-threading for decoders without
> frame-reordering at all).
>
vvc_decode_frame only allows 1 frame in, 1 frame out. We can remove the
delay if we switch to FFCodec->receive_frame.

>
> But worst of this is that you do not check avctx->thread_count at all.
>
We do not use avctx->thread_count. We use the executor to manage threads.

>
> > +    s->fcs = av_calloc(s->nb_fcs, sizeof(*s->fcs));
> > +    if (!s->fcs)
> > +        goto fail;
> > +
> > +    for (int i = 0; i < s->nb_fcs; i++) {
> > +        VVCFrameContext *fc = s->fcs + i;
> > +        ret = frame_context_init(fc, avctx);
> > +        if (ret < 0)
> > +            goto fail;
> > +    }
> > +
> > +    s->executor = ff_vvc_executor_alloc(s, s->nb_fcs);
> > +    if (!s->executor)
> > +        goto fail;
> > +
> > +    s->eos = 1;
> > +    GDR_SET_RECOVERED(s);
> > +    memset(&ff_vvc_default_scale_m, 16, sizeof(ff_vvc_default_scale_m));
>
> This needs to be done once (i.e. protected by an AVOnce) and not every
> time a decoder is set up. Otherwise there might be data races.
>
It's not read and set, so there will be no data races :). I can change it to
AVOnce.

>
> > +
> >      return 0;
> > +
> > +fail:
> > +    vvc_decode_free(avctx);
>
> Unnecessary, as this decoder has the FF_CODEC_CAP_INIT_CLEANUP set. In
> fact, given that vvc_decode_free() uses av_free() instead of av_freep()
> for s->fcs, calling vvc_decode_free() here can lead to a use-after-free
> (namely when vvc_decode_free() is called generically later).
>
done

>
> > +    return AVERROR(ENOMEM);
> >  }
> >
> >  const FFCodec ff_vvc_decoder = {
>
> _______________________________________________
> ffmpeg-devel mailing list
> ffmpeg-devel@ffmpeg.org
> https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
>
> To unsubscribe, visit link above, or email
> ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".
>
Andreas Rheinhardt Dec. 9, 2023, 5:14 a.m. UTC | #4
Nuo Mi:
> Hi Andreas,
> thank you for the review.
> On Fri, Dec 8, 2023 at 8:17 PM Andreas Rheinhardt <
> andreas.rheinhardt@outlook.com> wrote:
> 
>>
>>> +
>>> +static int min_pu_arrays_init(VVCFrameContext *fc, const int
>> pic_size_in_min_pu)
>>> +{
>>> +    if (fc->tab.pic_size_in_min_pu != pic_size_in_min_pu) {
>>> +        min_pu_arrays_free(fc);
>>> +        fc->tab.msf  = av_mallocz(pic_size_in_min_pu);
>>> +        fc->tab.iaf  = av_mallocz(pic_size_in_min_pu);
>>> +        fc->tab.mmi  = av_mallocz(pic_size_in_min_pu);
>>> +        fc->tab.mvf  = av_mallocz(pic_size_in_min_pu *
>> sizeof(*fc->tab.mvf));
>>
>> Do these have to be separate allocations? If there were allocated
>> jointly, one memset below would suffice.
>>
> They are separate flags, if we combine them. We can't use memset to set
> flags for a block.
> 

I disagree: You would still be able to use different pointers for
different parts of the large allocated block, it is just that you also
save some unnecessary allocations (and frees and errors checks for the
allocations) and also gain the ability to memset them via one memset
call in case one wants to set them to the same value.

>>
>>> +
>>> +static int init_slice_context(SliceContext *sc, VVCFrameContext *fc,
>> const H2645NAL *nal, const CodedBitstreamUnit *unit)
>>> +{
>>> +    const VVCSH *sh             = &sc->sh;
>>> +    const H266RawSlice *slice   = (const H266RawSlice *)unit->content;
>>
>> Please no pointless casts. Also, why is there unnecessary whitespace in
>> front of '='?
>>
> Fix here and serval other places
> The whitespace will make all = in a col.
> 

But there is nothing that needs that much whitespace.

>>> +
>>> +static av_cold int frame_context_init(VVCFrameContext *fc,
>> AVCodecContext *avctx)
>>> +{
>>> +
>>> +    fc->avctx = av_memdup(avctx, sizeof(*avctx));
>>
>> When I read this, I presumed you are using multiple AVCodecContexts to
>> store the ever changing state of the AVCodecContext fields similarly to
>> update_context_from_thread() in pthread_frame.c. But it seems you don't.
>> These contexts are only used as a) logcontexts (where the actual
>> user-facing AVCodecContext should be used, so that the user can make
>> sense of the logmessages!), b) in ff_thread_get_buffer() and c) in
>> export_frame_params() where only some basic fields
>> (dimension-related+pix_fmt) is set. Presumably c) is done for b).
>>
> I remember if i did not use a local AVCodecContext  it would trigger some
> assert when resolution changed.
> 

Can you be more specific about what assert has been triggered? And have
you set the AVFrame fields directly before ff_thread_get_buffer()?

>>
>> But the user is allowed to change the provided callbacks in the master
>> context at any time. E.g. the call to ff_thread_get_buffer() in
>> vvc_refs.c currently uses the VVCFrameContext and therefore uses the
>> get_buffer2 callback in place now (during av_memdup()). This is wrong.
>>
> This will not happen. av_memdup only happens in vvc_decode_init.
> Nobody will call ff_thread_get_buffer at this time
> 

You missed the point: If the user changes the get_buffer2 callback after
init, the new callback will not be used at all.

>>
>> I think you can just remove VVCFrameContext.avctx and use the
>> user-facing AVCodecContext if you set the AVFrame properties that are
>> normally derived from the AVCodecContext directly on the AVFrame before
>> ff_thread_get_buffer().
> 
> Could you explain more about how to create a user-facing  AVCodecContext?
> 

You do not create a user-facing AVCodecContext, the user does (and calls
avcodec_send_packet()/avcodec_receive_frame() with it).

>>
>>> +
>>> +static int decode_nal_unit(VVCContext *s, VVCFrameContext *fc, const
>> H2645NAL *nal, const CodedBitstreamUnit *unit)
>>> +{
>>> +    int  ret;
>>> +
>>> +    s->temporal_id   = nal->temporal_id;
>>> +
>>> +    switch (unit->type) {
>>> +    case VVC_VPS_NUT:
>>> +    case VVC_SPS_NUT:
>>> +    case VVC_PPS_NUT:
>>> +        /* vps, sps, sps cached by s->cbc */
>>> +        break;
>>> +    case VVC_TRAIL_NUT:
>>> +    case VVC_STSA_NUT:
>>> +    case VVC_RADL_NUT:
>>> +    case VVC_RASL_NUT:
>>> +    case VVC_IDR_W_RADL:
>>> +    case VVC_IDR_N_LP:
>>> +    case VVC_CRA_NUT:
>>> +    case VVC_GDR_NUT:
>>> +        ret = decode_slice(s, fc, nal, unit);
>>> +        if (ret < 0)
>>> +            goto fail;
>>> +        break;
>>> +    case VVC_PREFIX_APS_NUT:
>>> +    case VVC_SUFFIX_APS_NUT:
>>> +        ret = ff_vvc_decode_aps(&s->ps, unit);
>>> +        if (ret < 0)
>>> +            goto fail;
>>> +        break;
>>> +    default:
>>> +        av_log(s->avctx, AV_LOG_INFO,
>>> +               "Skipping NAL unit %d\n", unit->type);
>>
>> This will probably be very noisy (and warn for every SEI). I don't think
>> it is even needed, as h2645_parse.c already contains debug log messages
>> to display the unit type.
>>
> It's copied from hevcdec. It means we did not handle the nal diffrent than
> h2645_parser.c messages
> 

1. The message is unnecessary, because a user who wants to know which
NAL units have been handled or not can get the info about which units
are present from h2645_parse.c and then look up in this list whether
this type is processed.
2. hevcdec.c does "handle" quite a lot more NAL units; e.g. it actually
handles SEI messages and it ignores e.g. Access unit delimiters as well
as HEVC_NAL_UNSPEC62. Whereas you do not.

>> A fail that is only "return ret" is pointless (not only here).
>>
> At someday if we need to add some cleanup code. we do not need to change
> all returns to goto.
> 

IMO a goto fail should be added if and when it is actually beneficial.

>>> +
>>> +    if (output) {
>>> +        while (s->nb_delayed) {
>>> +            wait_delayed_frame(s, output, &got_output);
>>> +            if (got_output) {
>>> +                av_frame_unref(output);
>>> +            }
>>> +        }
>>> +        av_frame_free(&output);
>>> +    }
>>>  }
>>>
>>>  static av_cold int vvc_decode_free(AVCodecContext *avctx)
>>>  {
>>> +    VVCContext *s = avctx->priv_data;
>>> +    int i;
>>> +
>>> +    ff_cbs_fragment_free(&s->current_frame);
>>
>> Is it sure that the fragment is not in use (given that other threads may
>> be running now before vvc_decode_flush())?
>>
> Do you mean the executor threads? If they want to use some data, they will
> take their own hip.
> see ff_refstruct_replace(&sps->r, rsps);
> 

"hip"?

I have now noticed that the SliceContexts contain a reference to the
packet's data via VVCSH->H266RawSliceHeader, which in reality points to
a H266RawSlice which contains a reference to the actual data, so this
point is moot. But using H266RawSliceHeader* for a H266RawSlice* is not
nice.

>>
>>> +
>>> +    s->nb_fcs = (avctx->flags & AV_CODEC_FLAG_LOW_DELAY) ? 1 :
>> FFMIN(av_cpu_count(), VVC_MAX_FRMAE_DELAY);
>>
>> This may evaluate av_cpu_count() multiple times. Furthermore I don't
>> know why this define is used here at all: With frame threading, the
>> number of frame threads is not limited by the delay/number of reordering
>> frames at all (we even have frame-threading for decoders without
>> frame-reordering at all).
>>
>  vvc_decode_frame only allows 1 frame in 1 frame out. We can remove the
> delay if we switch to FFCodec->receive_frame,
> 

I do not get how this is supposed to address my point.

>>
>> But worst of this is that you do not check avctx->thread_count at all.
>>
> we do not use avctx->thread_count.  we use the executor to manage threads.
> 

This is complete nonsense: It is the user who specifies how many threads
to use, regardless of which mechanism is used to manage threads.

>>
>>> +    s->fcs = av_calloc(s->nb_fcs, sizeof(*s->fcs));
>>> +    if (!s->fcs)
>>> +        goto fail;
>>> +
>>> +    for (int i = 0; i < s->nb_fcs; i++) {
>>> +        VVCFrameContext *fc = s->fcs + i;
>>> +        ret = frame_context_init(fc, avctx);
>>> +        if (ret < 0)
>>> +            goto fail;
>>> +    }
>>> +
>>> +    s->executor = ff_vvc_executor_alloc(s, s->nb_fcs);
>>> +    if (!s->executor)
>>> +        goto fail;
>>> +
>>> +    s->eos = 1;
>>> +    GDR_SET_RECOVERED(s);
>>> +    memset(&ff_vvc_default_scale_m, 16, sizeof(ff_vvc_default_scale_m));
>>
>> This needs to be done once (i.e. protected by an AVOnce) and not every
>> time a decoder is set up. Otherwise there might be data races.
>>
> It's not read and set, so there will be no data races :). I can change it to
> AVOnce.

This is wrong: It is set here and presumably it will be read somewhere,
so there absolutely can be data races.
If you believe that there is no data race because every memset sets it
to the same value, then you should be aware that the C specification
disagrees with you (all references are to the C11 spec):

a) 5.1.2.4 25: "The execution of a program contains a data race if it
contains two conflicting actions in different threads, at least one of
which is not atomic, and neither happens before the other. Any such data
race results in undefined behavior."
b) 5.1.2.4 4: "Two expression evaluations conflict if one of them
modifies a memory location and the other one reads or modifies the same
memory location."
c) Note 2 in 3.1 (to the definition of "access"):
"‘‘Modify’’ includes the case where the new value being stored is the
same as the previous value."

With the current code data races will happen if a) two different decoder
instances are initialized without synchronisation (given that lavc does
not serialize initialization of codecs (except in rare cases based upon
a flag which this decoder does not set), this synchronization would have
to be performed by the user, but we do not require our users to do this)
or b) a decoder is initialized while another decoder runs and reads from
ff_vvc_default_scale_m:

Because the accesses performed by the initing thread are always
modifications according to c), the accesses by the different threads
conflict by definition b). memset() is not required to perform atomic
modifications (and according to the standard atomic modifications can
only happen with atomic objects, which ff_vvc_default_scale_m is not)
and by our assumption there is no synchronisation between these actions,
so it is a data race according to a). And data races are undefined
behaviour.

This clause allows compilers to optimize lots of code as if the program
were single-threaded (because concurrent accesses were UB, so presumably
they don't happen). In particular, speculative writes are legal (and
happen sometimes, but probably not in memset). In fact, it would be
legal for memset to always zero the memory it is supposed to set and
then overwrite it with the final value.

- Andreas
Nuo Mi Dec. 10, 2023, 12:54 p.m. UTC | #5
On Sat, Dec 9, 2023 at 1:13 PM Andreas Rheinhardt <
andreas.rheinhardt@outlook.com> wrote:

> Nuo Mi:
> > Hi Andreas,
> > thank you for the review.
> > On Fri, Dec 8, 2023 at 8:17 PM Andreas Rheinhardt <
> > andreas.rheinhardt@outlook.com> wrote:
> >
> >>
> >>> +
> >>> +static int min_pu_arrays_init(VVCFrameContext *fc, const int
> >> pic_size_in_min_pu)
> >>> +{
> >>> +    if (fc->tab.pic_size_in_min_pu != pic_size_in_min_pu) {
> >>> +        min_pu_arrays_free(fc);
> >>> +        fc->tab.msf  = av_mallocz(pic_size_in_min_pu);
> >>> +        fc->tab.iaf  = av_mallocz(pic_size_in_min_pu);
> >>> +        fc->tab.mmi  = av_mallocz(pic_size_in_min_pu);
> >>> +        fc->tab.mvf  = av_mallocz(pic_size_in_min_pu *
> >> sizeof(*fc->tab.mvf));
> >>
> >> Do these have to be separate allocations? If there were allocated
> >> jointly, one memset below would suffice.
> >>
> > They are separate flags, if we combine them. We can't use memset to set
> > flags for a block.
> >
>
> I disagree: You would still be able to use different pointers for
> different parts of the large allocated block, it is just that you also
> save some unnecessary allocations (and frees and errors checks for the
> allocations) and also gain the ability to memset them via one memset
> call in case one wants to set them to the same value.
>
Good idea. done

>
> >>
> >>> +
> >>> +static int init_slice_context(SliceContext *sc, VVCFrameContext *fc,
> >> const H2645NAL *nal, const CodedBitstreamUnit *unit)
> >>> +{
> >>> +    const VVCSH *sh             = &sc->sh;
> >>> +    const H266RawSlice *slice   = (const H266RawSlice *)unit->content;
> >>
> >> Please no pointless casts. Also, why is there unnecessary whitespace in
> >> front of '='?
> >>
> > Fixed here and in several other places
> > The whitespace will make all = in a col.
> >
>
> But there is nothing that needs that much whitespace.
>
> >>> +
> >>> +static av_cold int frame_context_init(VVCFrameContext *fc,
> >> AVCodecContext *avctx)
> >>> +{
> >>> +
> >>> +    fc->avctx = av_memdup(avctx, sizeof(*avctx));
> >>
> >> When I read this, I presumed you are using multiple AVCodecContexts to
> >> store the ever changing state of the AVCodecContext fields similarly to
> >> update_context_from_thread() in pthread_frame.c. But it seems you don't.
> >> These contexts are only used as a) logcontexts (where the actual
> >> user-facing AVCodecContext should be used, so that the user can make
> >> sense of the logmessages!), b) in ff_thread_get_buffer() and c) in
> >> export_frame_params() where only some basic fields
> >> (dimension-related+pix_fmt) is set. Presumably c) is done for b).
> >>
> > I remember if i did not use a local AVCodecContext  it would trigger some
> > assert when resolution changed.
> >
>
> Can you be more specific about what assert has been triggered? And have
> you set the AVFrame fields directly before ff_thread_get_buffer()?
>
hmm, this has not happened now.
Let us remove the memdup

>
> >>
> >> But the user is allowed to change the provided callbacks in the master
> >> context at any time. E.g. the call to ff_thread_get_buffer() in
> >> vvc_refs.c currently uses the VVCFrameContext and therefore uses the
> >> get_buffer2 callback in place now (during av_memdup()). This is wrong.
> >>
> > This will not happen. av_memdup only happens in vvc_decode_init.
> > Nobody will call ff_thread_get_buffer at this time
> >
>
> You missed the point: If the user changes the get_buffer2 callback after
> init, the new callback will not be used at all.
>
fixed.

>
> >>
> >> I think you can just remove VVCFrameContext.avctx and use the
> >> user-facing AVCodecContext if you set the AVFrame properties that are
> >> normally derived from the AVCodecContext directly on the AVFrame before
> >> ff_thread_get_buffer().
> >
> > Could you explain more about how to create a user-facing  AVCodecContext?
> >
>
> You do not create a user-facing AVCodecContext, the user does (and calls
> avcodec_send_packet()/avcodec_receive_frame() with it).


> >>
> >>> +
> >>> +static int decode_nal_unit(VVCContext *s, VVCFrameContext *fc, const
> >> H2645NAL *nal, const CodedBitstreamUnit *unit)
> >>> +{
> >>> +    int  ret;
> >>> +
> >>> +    s->temporal_id   = nal->temporal_id;
> >>> +
> >>> +    switch (unit->type) {
> >>> +    case VVC_VPS_NUT:
> >>> +    case VVC_SPS_NUT:
> >>> +    case VVC_PPS_NUT:
> >>> +        /* vps, sps, pps cached by s->cbc */
> >>> +        break;
> >>> +    case VVC_TRAIL_NUT:
> >>> +    case VVC_STSA_NUT:
> >>> +    case VVC_RADL_NUT:
> >>> +    case VVC_RASL_NUT:
> >>> +    case VVC_IDR_W_RADL:
> >>> +    case VVC_IDR_N_LP:
> >>> +    case VVC_CRA_NUT:
> >>> +    case VVC_GDR_NUT:
> >>> +        ret = decode_slice(s, fc, nal, unit);
> >>> +        if (ret < 0)
> >>> +            goto fail;
> >>> +        break;
> >>> +    case VVC_PREFIX_APS_NUT:
> >>> +    case VVC_SUFFIX_APS_NUT:
> >>> +        ret = ff_vvc_decode_aps(&s->ps, unit);
> >>> +        if (ret < 0)
> >>> +            goto fail;
> >>> +        break;
> >>> +    default:
> >>> +        av_log(s->avctx, AV_LOG_INFO,
> >>> +               "Skipping NAL unit %d\n", unit->type);
> >>
> >> This will probably be very noisy (and warn for every SEI). I don't think
> >> it is even needed, as h2645_parse.c already contains debug log messages
> >> to display the unit type.
> >>
> > It's copied from hevcdec. It means we did not handle the NAL, which is
> > different from the h2645_parser.c messages
> >
>
> 1. The message is unnecessary, because a user who wants to know which
> NAL units have been handled or not can get the info about which units
> are present from h2645_parse.c and then look up in this list whether
> this type is processed.
> 2. hevcdec.c does "handle" quite a lot more NAL units; e.g. it actually
> handles SEI messages and it ignores e.g. Access unit delimiters as well
> as HEVC_NAL_UNSPEC62. Whereas you do not.
>
removed

>
> > A fail that is only "return ret" is pointless (not only here).
> >>
> > If someday we need to add some cleanup code, we do not need to change
> > all returns to goto.
> >
>
> IMO a goto fail should be added if and when it is actually beneficial.
>
fixed

>
> >>> +
> >>> +    if (output) {
> >>> +        while (s->nb_delayed) {
> >>> +            wait_delayed_frame(s, output, &got_output);
> >>> +            if (got_output) {
> >>> +                av_frame_unref(output);
> >>> +            }
> >>> +        }
> >>> +        av_frame_free(&output);
> >>> +    }
> >>>  }
> >>>
> >>>  static av_cold int vvc_decode_free(AVCodecContext *avctx)
> >>>  {
> >>> +    VVCContext *s = avctx->priv_data;
> >>> +    int i;
> >>> +
> >>> +    ff_cbs_fragment_free(&s->current_frame);
> >>
> >> Is it sure that the fragment is not in use (given that other threads may
> >> be running now before vvc_decode_flush())?
> >>
> > Do you mean the executor threads? If they want to use some data, they
> will
> > take their own hip.
> > see ff_refstruct_replace(&sps->r, rsps);
> >
>
> "hip"?
>
:)

>
> I have now noticed that the SliceContexts contain a reference to the
> packet's data via VVCSH->H266RawSliceHeader, which in reality points to
> a H266RawSlice which contains a reference to the actual data, so this
> point is moot. But using H266RawSliceHeader* for a H266RawSlice* is not
> nice.
>
Ok, let me take a  reference to H266RawSlice.

>
> >>
> >>> +
> >>> +    s->nb_fcs = (avctx->flags & AV_CODEC_FLAG_LOW_DELAY) ? 1 :
> >> FFMIN(av_cpu_count(), VVC_MAX_FRMAE_DELAY);
> >>
> >> This may evaluate av_cpu_count() multiple times. Furthermore I don't
> >> know why this define is used here at all: With frame threading, the
> >> number of frame threads is not limited by the delay/number of reordering
> >> frames at all (we even have frame-threading for decoders without
> >> frame-reordering at all).
> >>
> >  vvc_decode_frame only allows 1 frame in 1 frame out. We can remove the
> > delay if we switch to FFCodec->receive_frame,
> >
>
> I do not get how this is supposed to address my point.
>
> >>
> >> But worst of this is that you do not check avctx->thread_count at all.
> >>
> > we do not use avctx->thread_count.  we use the executor to manage
> threads.
> >
>
> This is complete nonsense: It is the user who specifies how many threads
> to use, regardless of which mechanism is used to manage threads.
>
fixed

>
> >>
> >>> +    s->fcs = av_calloc(s->nb_fcs, sizeof(*s->fcs));
> >>> +    if (!s->fcs)
> >>> +        goto fail;
> >>> +
> >>> +    for (int i = 0; i < s->nb_fcs; i++) {
> >>> +        VVCFrameContext *fc = s->fcs + i;
> >>> +        ret = frame_context_init(fc, avctx);
> >>> +        if (ret < 0)
> >>> +            goto fail;
> >>> +    }
> >>> +
> >>> +    s->executor = ff_vvc_executor_alloc(s, s->nb_fcs);
> >>> +    if (!s->executor)
> >>> +        goto fail;
> >>> +
> >>> +    s->eos = 1;
> >>> +    GDR_SET_RECOVERED(s);
> >>> +    memset(&ff_vvc_default_scale_m, 16,
> sizeof(ff_vvc_default_scale_m));
> >>
> >> This needs to be done once (i.e. protected by an AVOnce) and not every
> >> time a decoder is set up. Otherwise there might be data races.
> >>
> > It's not read and set, so there will be no data races :). I can change it
> > to AVOnce.
>
> This is wrong: It is set here and presumably it will be read somewhere,
> so there absolutely can be data races.
> If you believe that there is no data race because every memset sets it
> to the same value, then you should be aware that the C specification
> disagrees with you (all references are to the C11 spec):
>
> a) 5.1.2.4 25: "The execution of a program contains a data race if it
> contains two conflicting actions in different threads, at least one of
> which is not atomic, and neither happens before the other. Any such data
> race results in undefined behavior."
> b) 5.1.2.4 4: "Two expression evaluations conflict if one of them
> modifies a memory location and the other one reads or modifies the same
> memory location."
> c) Note 2 in 3.1 (to the definition of "access"):
> "‘‘Modify’’ includes the case where the new value being stored is the
> same as the previous value."
>
> With the current code data races will happen if a) two different decoder
> instances are initialized without synchronisation (given that lavc does
> not serialize initialization of codecs (except in rare cases based upon
> a flag which this decoder does not set), this synchronization would have
> to be performed by the user, but we do not require our users to do this)
> or b) a decoder is initialized while another decoder runs and reads from
> ff_vvc_default_scale_m:
>
> Because the accesses performed by the initing thread are always
> modifications according to c), the accesses by the different threads
> conflict by definition b). memset() is not required to perform atomic
> modifications (and according to the standard atomic modifications can
> only happen with atomic objects, which ff_vvc_default_scale_m is not)
> and by our assumption there is no synchronisation between these actions,
> so it is a data race according to a). And data races are undefined
> behaviour.
>
> This clause allows compilers to optimize lots of code as if the program
> were single-threaded (because concurrent accesses were UB, so presumably
> they don't happen). In particular, speculative writes are legal (and
> happen sometimes, but probably not in memset). In fact, it would be
> legal for memset to always zero the memory it is supposed to set and
> then overwrite it with the final value.
>

Thanks for the explanation. fixed

>
> - Andreas
>
> _______________________________________________
> ffmpeg-devel mailing list
> ffmpeg-devel@ffmpeg.org
> https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
>
> To unsubscribe, visit link above, or email
> ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".
>
diff mbox series

Patch

diff --git a/libavcodec/vvc/vvcdec.c b/libavcodec/vvc/vvcdec.c
index 3c591ce875..e40eb7339f 100644
--- a/libavcodec/vvc/vvcdec.c
+++ b/libavcodec/vvc/vvcdec.c
@@ -21,28 +21,1035 @@ 
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 #include "libavcodec/codec_internal.h"
+#include "libavcodec/decode.h"
 #include "libavcodec/profiles.h"
+#include "libavcodec/refstruct.h"
+#include "libavutil/cpu.h"
 
 #include "vvcdec.h"
+#include "vvc_ctu.h"
+#include "vvc_data.h"
+#include "vvc_refs.h"
+#include "vvc_thread.h"
+
+/*
+ * Per-frame setup run before slice decoding: updates poc_tid0, installs the
+ * new reference frame, bumps/outputs pending frames, builds the frame-level
+ * reference picture lists and initialises the frame thread state.
+ *
+ * Returns 0 on success. On any failure the frame's reference (fc->ref) is
+ * released, cleared, and the negative error code of the failing step is
+ * returned.
+ */
+static int vvc_frame_start(VVCContext *s, VVCFrameContext *fc, SliceContext *sc)
+{
+    const H266RawSliceHeader *slice_hdr = sc->sh.r;
+    const VVCPH *pic_hdr                = &fc->ps.ph;
+    int ret;
+
+    // 8.3.1 Decoding process for picture order count
+    if (!s->temporal_id && !pic_hdr->r->ph_non_ref_pic_flag && !(IS_RASL(s) || IS_RADL(s)))
+        s->poc_tid0 = pic_hdr->poc;
+
+    ret = ff_vvc_set_new_ref(s, fc, &fc->frame);
+    if (ret < 0)
+        goto fail;
+
+    if (!IS_IDR(s))
+        ff_vvc_bump_frame(s, fc);
+
+    /* drop whatever the previous call left in the output slot */
+    av_frame_unref(fc->output_frame);
+
+    ret = ff_vvc_output_frame(s, fc, fc->output_frame,
+                              slice_hdr->sh_no_output_of_prior_pics_flag, 0);
+    if (ret < 0)
+        goto fail;
+
+    ret = ff_vvc_frame_rpl(s, fc, sc);
+    if (ret < 0)
+        goto fail;
+
+    ret = ff_vvc_frame_thread_init(fc);
+    if (ret < 0)
+        goto fail;
+
+    return 0;
+
+fail:
+    if (fc->ref) {
+        ff_vvc_unref_frame(fc, fc->ref, ~0);
+        fc->ref = NULL;
+    }
+    return ret;
+}
+
+/*
+ * Release every per-CTB table of the frame context and uninitialise the
+ * reference-picture-list table pool. All freed pointers end up NULL.
+ */
+static void ctb_arrays_free(VVCFrameContext *fc)
+{
+    av_freep(&fc->tab.deblock);
+    av_freep(&fc->tab.sao);
+    av_freep(&fc->tab.alf);
+    av_freep(&fc->tab.slice_idx);
+    av_freep(&fc->tab.coeffs);
+
+    if (fc->tab.ctus) {
+        /* ff_vvc_ctu_free_cus() is run on each of the ctu_count entries
+         * before the array itself is released */
+        for (int ctu = 0; ctu < fc->tab.ctu_count; ctu++)
+            ff_vvc_ctu_free_cus(&fc->tab.ctus[ctu]);
+        av_freep(&fc->tab.ctus);
+    }
+
+    ff_refstruct_pool_uninit(&fc->rpl_tab_pool);
+}
+
+/*
+ * Prepare the per-CTB tables for a picture of ctu_count CTUs with ctu_size
+ * samples per CTU.
+ *
+ * If either value differs from what fc->tab records, all tables are freed and
+ * allocated again; otherwise the existing tables are cleared in place
+ * (coeffs is left untouched in that case). slice_idx is always filled with
+ * 0xff bytes before returning.
+ *
+ * Returns 0 on success, AVERROR(ENOMEM) if an allocation failed. On failure
+ * some tables may be allocated; the caller is expected to release them with
+ * ctb_arrays_free().
+ */
+static int ctb_arrays_init(VVCFrameContext *fc, const int ctu_count, const int ctu_size)
+{
+    if (fc->tab.ctu_count != ctu_count || fc->tab.ctu_size != ctu_size) {
+        ctb_arrays_free(fc);
+        /*
+         * The old tables are gone, so forget the old element count. Otherwise
+         * a failure below would leave a freshly allocated ctus array of
+         * ctu_count entries paired with the previous (possibly larger)
+         * fc->tab.ctu_count, and the cleanup in ctb_arrays_free() would walk
+         * past the end of the new array. The caller stores the new count once
+         * all tables have been set up.
+         */
+        fc->tab.ctu_count       = 0;
+        fc->tab.deblock         = av_calloc(ctu_count, sizeof(*fc->tab.deblock));
+        fc->tab.sao             = av_calloc(ctu_count, sizeof(*fc->tab.sao));
+        fc->tab.alf             = av_calloc(ctu_count, sizeof(*fc->tab.alf));
+        fc->tab.ctus            = av_calloc(ctu_count, sizeof(*fc->tab.ctus));
+        fc->tab.slice_idx       = av_malloc(ctu_count * sizeof(*fc->tab.slice_idx));
+        if (!fc->tab.deblock || !fc->tab.sao || !fc->tab.alf || !fc->tab.ctus || !fc->tab.slice_idx )
+            return AVERROR(ENOMEM);
+        fc->tab.coeffs = av_malloc(ctu_count * sizeof(*fc->tab.coeffs) * ctu_size * VVC_MAX_SAMPLE_ARRAYS);
+        if (!fc->tab.coeffs)
+            return AVERROR(ENOMEM);
+        fc->rpl_tab_pool = ff_refstruct_pool_alloc(ctu_count * sizeof(RefPicListTab), 0);
+        if (!fc->rpl_tab_pool)
+            return AVERROR(ENOMEM);
+    } else {
+        memset(fc->tab.deblock, 0, ctu_count * sizeof(*fc->tab.deblock));
+        memset(fc->tab.sao, 0, ctu_count * sizeof(*fc->tab.sao));
+        memset(fc->tab.alf, 0, ctu_count * sizeof(*fc->tab.alf));
+        /* run ff_vvc_ctu_free_cus() on every CTU before its entry is zeroed */
+        for (int i = 0; i < fc->tab.ctu_count; i++)
+            ff_vvc_ctu_free_cus(fc->tab.ctus + i);
+        memset(fc->tab.ctus, 0, ctu_count * sizeof(*fc->tab.ctus));
+    }
+    /* every byte of slice_idx becomes 0xff */
+    memset(fc->tab.slice_idx, -1, ctu_count * sizeof(*fc->tab.slice_idx));
+
+    return 0;
+}
+
+/*
+ * Release all tables indexed by minimum coding block: the per-channel-type
+ * ones (LUMA..CHROMA) and the shared ones. Freed pointers are set to NULL.
+ */
+static void min_cb_arrays_free(VVCFrameContext *fc)
+{
+    /* tables shared by both channel types */
+    av_freep(&fc->tab.skip);
+    av_freep(&fc->tab.imm);
+    av_freep(&fc->tab.imtf);
+    av_freep(&fc->tab.imf);
+    av_freep(&fc->tab.ipm);
+
+    /* tables kept separately per channel type */
+    for (int ch = LUMA; ch <= CHROMA; ch++) {
+        av_freep(&fc->tab.cp_mv[ch]);
+        av_freep(&fc->tab.cpm[ch]);
+        av_freep(&fc->tab.cqt_depth[ch]);
+        av_freep(&fc->tab.cb_height[ch]);
+        av_freep(&fc->tab.cb_width[ch]);
+        av_freep(&fc->tab.cb_pos_y[ch]);
+        av_freep(&fc->tab.cb_pos_x[ch]);
+    }
+}
+
+/*
+ * Prepare the tables indexed by minimum coding block for a picture holding
+ * pic_size_in_min_cb such blocks.
+ *
+ * When the count matches the one recorded in fc->tab the existing tables are
+ * zeroed in place; otherwise they are freed and zero-allocated again.
+ *
+ * Returns 0 on success or AVERROR(ENOMEM); after a failure some tables may
+ * remain allocated and must be released by the caller.
+ */
+static int min_cb_arrays_init(VVCFrameContext *fc, const int pic_size_in_min_cb)
+{
+    const size_t nb_bytes  = pic_size_in_min_cb;
+    const size_t pos_bytes = nb_bytes * sizeof(int);
+    const size_t mv_bytes  = nb_bytes * sizeof(Mv) * MAX_CONTROL_POINTS;
+
+    if (fc->tab.pic_size_in_min_cb == pic_size_in_min_cb) {
+        /* same geometry as before: just clear what is already there */
+        for (int ch = LUMA; ch <= CHROMA; ch++) {
+            memset(fc->tab.cb_pos_x[ch], 0, pos_bytes);
+            memset(fc->tab.cb_pos_y[ch], 0, pos_bytes);
+            memset(fc->tab.cb_width[ch], 0, nb_bytes);
+            memset(fc->tab.cb_height[ch], 0, nb_bytes);
+            memset(fc->tab.cqt_depth[ch], 0, nb_bytes);
+            memset(fc->tab.cpm[ch], 0, nb_bytes);
+            memset(fc->tab.cp_mv[ch], 0, mv_bytes);
+        }
+
+        memset(fc->tab.ipm, 0, nb_bytes);
+        memset(fc->tab.imf, 0, nb_bytes);
+        memset(fc->tab.imtf, 0, nb_bytes);
+        memset(fc->tab.imm, 0, nb_bytes);
+        memset(fc->tab.skip, 0, nb_bytes);
+        return 0;
+    }
+
+    min_cb_arrays_free(fc);
+
+    for (int ch = LUMA; ch <= CHROMA; ch++) {
+        fc->tab.cb_pos_x[ch]  = av_mallocz(pos_bytes);
+        fc->tab.cb_pos_y[ch]  = av_mallocz(pos_bytes);
+        fc->tab.cb_width[ch]  = av_mallocz(nb_bytes);
+        fc->tab.cb_height[ch] = av_mallocz(nb_bytes);
+        fc->tab.cqt_depth[ch] = av_mallocz(nb_bytes);
+        if (!fc->tab.cb_pos_x[ch] || !fc->tab.cb_pos_y[ch] || !fc->tab.cb_width[ch] ||
+            !fc->tab.cb_height[ch] || !fc->tab.cqt_depth[ch])
+            return AVERROR(ENOMEM);
+
+        fc->tab.cpm[ch]   = av_mallocz(nb_bytes);
+        fc->tab.cp_mv[ch] = av_mallocz(mv_bytes);
+        if (!fc->tab.cpm[ch] || !fc->tab.cp_mv[ch])
+            return AVERROR(ENOMEM);
+    }
+
+    fc->tab.ipm  = av_mallocz(nb_bytes);
+    fc->tab.imf  = av_mallocz(nb_bytes);
+    fc->tab.imtf = av_mallocz(nb_bytes);
+    fc->tab.imm  = av_mallocz(nb_bytes);
+    fc->tab.skip = av_mallocz(nb_bytes);
+    if (!fc->tab.ipm || !fc->tab.imf || !fc->tab.imtf || !fc->tab.imm || !fc->tab.skip)
+        return AVERROR(ENOMEM);
+
+    return 0;
+}
+
+/*
+ * Release all tables indexed by minimum transform unit: the per-channel-type
+ * ones, the per-sample-array ones and the joint Cb/Cr residual flag table.
+ */
+static void min_tu_arrays_free(VVCFrameContext *fc)
+{
+    av_freep(&fc->tab.tu_joint_cbcr_residual_flag);
+
+    /* one table per sample array */
+    for (int c_idx = 0; c_idx < VVC_MAX_SAMPLE_ARRAYS; c_idx++) {
+        av_freep(&fc->tab.tu_coded_flag[c_idx]);
+        av_freep(&fc->tab.qp[c_idx]);
+    }
+
+    /* one table per channel type */
+    for (int ch = LUMA; ch <= CHROMA; ch++) {
+        av_freep(&fc->tab.pcmf[ch]);
+        av_freep(&fc->tab.tb_height[ch]);
+        av_freep(&fc->tab.tb_width[ch]);
+        av_freep(&fc->tab.tb_pos_y0[ch]);
+        av_freep(&fc->tab.tb_pos_x0[ch]);
+    }
+}
+
+/*
+ * Prepare the tables indexed by minimum transform unit for a picture holding
+ * pic_size_in_min_tu such units.
+ *
+ * When the count matches the one recorded in fc->tab the existing tables are
+ * zeroed in place; otherwise they are freed and zero-allocated again.
+ *
+ * Returns 0 on success or AVERROR(ENOMEM); after a failure some tables may
+ * remain allocated and must be released by the caller.
+ */
+static int min_tu_arrays_init(VVCFrameContext *fc, const int pic_size_in_min_tu)
+{
+    const size_t nb_bytes    = pic_size_in_min_tu;
+    const size_t pos_x_bytes = nb_bytes * sizeof(*fc->tab.tb_pos_x0[0]);
+    const size_t pos_y_bytes = nb_bytes * sizeof(*fc->tab.tb_pos_y0[0]);
+
+    if (fc->tab.pic_size_in_min_tu == pic_size_in_min_tu) {
+        /* same geometry as before: just clear what is already there */
+        for (int ch = LUMA; ch <= CHROMA; ch++) {
+            memset(fc->tab.tb_pos_x0[ch], 0, pos_x_bytes);
+            memset(fc->tab.tb_pos_y0[ch], 0, pos_y_bytes);
+            memset(fc->tab.tb_width[ch], 0, nb_bytes);
+            memset(fc->tab.tb_height[ch], 0, nb_bytes);
+            memset(fc->tab.pcmf[ch], 0, nb_bytes);
+        }
+
+        for (int c_idx = 0; c_idx < VVC_MAX_SAMPLE_ARRAYS; c_idx++) {
+            memset(fc->tab.tu_coded_flag[c_idx], 0, nb_bytes);
+            memset(fc->tab.qp[c_idx], 0, nb_bytes);
+        }
+        memset(fc->tab.tu_joint_cbcr_residual_flag, 0, nb_bytes);
+        return 0;
+    }
+
+    min_tu_arrays_free(fc);
+
+    for (int ch = LUMA; ch <= CHROMA; ch++) {
+        fc->tab.tb_pos_x0[ch] = av_mallocz(pos_x_bytes);
+        fc->tab.tb_pos_y0[ch] = av_mallocz(pos_y_bytes);
+        fc->tab.tb_width[ch]  = av_mallocz(nb_bytes);
+        fc->tab.tb_height[ch] = av_mallocz(nb_bytes);
+        fc->tab.pcmf[ch]      = av_mallocz(nb_bytes);
+        if (!fc->tab.tb_pos_x0[ch] || !fc->tab.tb_pos_y0[ch] ||
+            !fc->tab.tb_width[ch] || !fc->tab.tb_height[ch] || !fc->tab.pcmf[ch])
+            return AVERROR(ENOMEM);
+    }
+
+    for (int c_idx = 0; c_idx < VVC_MAX_SAMPLE_ARRAYS; c_idx++) {
+        fc->tab.tu_coded_flag[c_idx] = av_mallocz(nb_bytes);
+        if (!fc->tab.tu_coded_flag[c_idx])
+            return AVERROR(ENOMEM);
+
+        fc->tab.qp[c_idx] = av_mallocz(nb_bytes);
+        if (!fc->tab.qp[c_idx])
+            return AVERROR(ENOMEM);
+    }
+
+    fc->tab.tu_joint_cbcr_residual_flag = av_mallocz(nb_bytes);
+    if (!fc->tab.tu_joint_cbcr_residual_flag)
+        return AVERROR(ENOMEM);
+
+    return 0;
+}
+
+/*
+ * Release the tables indexed by minimum prediction unit and uninitialise the
+ * DMVR motion-field pool.
+ */
+static void min_pu_arrays_free(VVCFrameContext *fc)
+{
+    ff_refstruct_pool_uninit(&fc->tab_dmvr_mvf_pool);
+
+    av_freep(&fc->tab.mmi);
+    av_freep(&fc->tab.iaf);
+    av_freep(&fc->tab.msf);
+    av_freep(&fc->tab.mvf);
+}
+
+/*
+ * Prepare the tables indexed by minimum prediction unit for a picture holding
+ * pic_size_in_min_pu such units.
+ *
+ * When the count matches the one recorded in fc->tab the four tables are
+ * zeroed in place; otherwise they are freed and zero-allocated again, and a
+ * new DMVR motion-field pool (zeroed on every get) is created.
+ *
+ * Returns 0 on success or AVERROR(ENOMEM); after a failure some tables may
+ * remain allocated and must be released by the caller.
+ */
+static int min_pu_arrays_init(VVCFrameContext *fc, const int pic_size_in_min_pu)
+{
+    const size_t nb_bytes  = pic_size_in_min_pu;
+    const size_t mvf_bytes = nb_bytes * sizeof(*fc->tab.mvf);
+
+    if (fc->tab.pic_size_in_min_pu == pic_size_in_min_pu) {
+        memset(fc->tab.msf, 0, nb_bytes);
+        memset(fc->tab.iaf, 0, nb_bytes);
+        memset(fc->tab.mmi, 0, nb_bytes);
+        memset(fc->tab.mvf, 0, mvf_bytes);
+        return 0;
+    }
+
+    min_pu_arrays_free(fc);
+
+    fc->tab.msf = av_mallocz(nb_bytes);
+    fc->tab.iaf = av_mallocz(nb_bytes);
+    fc->tab.mmi = av_mallocz(nb_bytes);
+    fc->tab.mvf = av_mallocz(mvf_bytes);
+    if (!fc->tab.msf || !fc->tab.iaf || !fc->tab.mmi || !fc->tab.mvf)
+        return AVERROR(ENOMEM);
+
+    fc->tab_dmvr_mvf_pool = ff_refstruct_pool_alloc(nb_bytes * sizeof(MvField),
+                                                    FF_REFSTRUCT_POOL_FLAG_ZERO_EVERY_TIME);
+    if (!fc->tab_dmvr_mvf_pool)
+        return AVERROR(ENOMEM);
+
+    return 0;
+}
+
+/*
+ * Release the horizontal/vertical bs tables of every sample array together
+ * with the four shared horizontal/vertical p and q tables.
+ */
+static void bs_arrays_free(VVCFrameContext *fc)
+{
+    av_freep(&fc->tab.vertical_q);
+    av_freep(&fc->tab.vertical_p);
+    av_freep(&fc->tab.horizontal_p);
+    av_freep(&fc->tab.horizontal_q);
+
+    for (int c_idx = 0; c_idx < VVC_MAX_SAMPLE_ARRAYS; c_idx++) {
+        av_freep(&fc->tab.vertical_bs[c_idx]);
+        av_freep(&fc->tab.horizontal_bs[c_idx]);
+    }
+}
+
+/*
+ * Prepare the bs and p/q tables for a grid of bs_width x bs_height entries
+ * (one byte per entry).
+ *
+ * When both dimensions match the ones recorded in fc->tab the tables are
+ * zeroed in place; otherwise they are freed and zero-allocated again.
+ *
+ * Returns 0 on success or AVERROR(ENOMEM); after a failure some tables may
+ * remain allocated and must be released by the caller.
+ */
+static int bs_arrays_init(VVCFrameContext *fc, const int bs_width, const int bs_height)
+{
+    if (fc->tab.bs_width == bs_width && fc->tab.bs_height == bs_height) {
+        const int bs_size = bs_width * bs_height;
+
+        for (int c_idx = 0; c_idx < VVC_MAX_SAMPLE_ARRAYS; c_idx++) {
+            memset(fc->tab.horizontal_bs[c_idx], 0, bs_size);
+            memset(fc->tab.vertical_bs[c_idx], 0, bs_size);
+        }
+        memset(fc->tab.horizontal_q, 0, bs_size);
+        memset(fc->tab.horizontal_p, 0, bs_size);
+        memset(fc->tab.vertical_p, 0, bs_size);
+        memset(fc->tab.vertical_q, 0, bs_size);
+        return 0;
+    }
+
+    bs_arrays_free(fc);
+
+    for (int c_idx = 0; c_idx < VVC_MAX_SAMPLE_ARRAYS; c_idx++) {
+        fc->tab.horizontal_bs[c_idx] = av_calloc(bs_width, bs_height);
+        fc->tab.vertical_bs[c_idx]   = av_calloc(bs_width, bs_height);
+        if (!fc->tab.horizontal_bs[c_idx] || !fc->tab.vertical_bs[c_idx])
+            return AVERROR(ENOMEM);
+    }
+
+    fc->tab.horizontal_q = av_calloc(bs_width, bs_height);
+    fc->tab.horizontal_p = av_calloc(bs_width, bs_height);
+    fc->tab.vertical_p   = av_calloc(bs_width, bs_height);
+    fc->tab.vertical_q   = av_calloc(bs_width, bs_height);
+    if (!fc->tab.horizontal_q || !fc->tab.horizontal_p || !fc->tab.vertical_p || !fc->tab.vertical_q)
+        return AVERROR(ENOMEM);
+
+    return 0;
+}
+
+/*
+ * Release the SAO and ALF pixel line buffers of every sample array
+ * (two ALF buffers per direction and sample array).
+ */
+static void pixel_buffer_free(VVCFrameContext *fc)
+{
+    for (int c_idx = 0; c_idx < VVC_MAX_SAMPLE_ARRAYS; c_idx++) {
+        for (int k = 0; k < 2; k++) {
+            av_freep(&fc->tab.alf_pixel_buffer_h[c_idx][k]);
+            av_freep(&fc->tab.alf_pixel_buffer_v[c_idx][k]);
+        }
+        av_freep(&fc->tab.sao_pixel_buffer_h[c_idx]);
+        av_freep(&fc->tab.sao_pixel_buffer_v[c_idx]);
+    }
+}
+
+/*
+ * (Re)allocate the SAO and ALF pixel line buffers.
+ *
+ * width/height are the picture dimensions, ctu_width/ctu_height the picture
+ * size in CTUs, chroma_format_idc selects whether only the first or all
+ * VVC_MAX_SAMPLE_ARRAYS sample arrays get buffers, and ps is the shift
+ * applied to the buffer sizes (sps->pixel_shift at the call site).
+ *
+ * The buffers are reallocated whenever any of these parameters differs from
+ * the values recorded in fc->tab. That includes the pixel shift: the sizes
+ * below scale with ps, so keeping buffers that were sized for a smaller shift
+ * would leave them too small for the new stream parameters. If nothing
+ * changed, the existing buffers are kept as they are (they are not cleared).
+ *
+ * Returns 0 on success or AVERROR(ENOMEM); after a failure some buffers may
+ * remain allocated and must be released by the caller.
+ */
+static int pixel_buffer_init(VVCFrameContext *fc, const int width, const int height,
+    const int ctu_width, const int ctu_height, const int chroma_format_idc, const int ps)
+{
+    const VVCSPS *sps = fc->ps.sps;
+    const int c_end   = chroma_format_idc ? VVC_MAX_SAMPLE_ARRAYS : 1;
+
+    if (fc->tab.chroma_format_idc != chroma_format_idc ||
+        fc->tab.width != width || fc->tab.height != height ||
+        fc->tab.ctu_width != ctu_width || fc->tab.ctu_height != ctu_height ||
+        fc->tab.pixel_shift != ps) {
+        pixel_buffer_free(fc);
+        for (int c_idx = 0; c_idx < c_end; c_idx++) {
+            const int w = width >> sps->hshift[c_idx];
+            const int h = height >> sps->vshift[c_idx];
+            fc->tab.sao_pixel_buffer_h[c_idx] = av_malloc((w * 2 * ctu_height) << ps);
+            fc->tab.sao_pixel_buffer_v[c_idx] = av_malloc((h * 2 * ctu_width)  << ps);
+            if (!fc->tab.sao_pixel_buffer_h[c_idx] || !fc->tab.sao_pixel_buffer_v[c_idx])
+                return AVERROR(ENOMEM);
+        }
+
+        for (int c_idx = 0; c_idx < c_end; c_idx++) {
+            const int w = width >> sps->hshift[c_idx];
+            const int h = height >> sps->vshift[c_idx];
+            const int border_pixels = c_idx ? ALF_BORDER_CHROMA : ALF_BORDER_LUMA;
+            for (int i = 0; i < 2; i++) {
+                fc->tab.alf_pixel_buffer_h[c_idx][i] = av_malloc((w * border_pixels * ctu_height) << ps);
+                /* NOTE(review): unlike the three sizes above, this one is not
+                 * shifted by ps -- confirm that ALF_PADDING_SIZE already
+                 * accounts for the sample size */
+                fc->tab.alf_pixel_buffer_v[c_idx][i] = av_malloc(h * ALF_PADDING_SIZE * ctu_width);
+                if (!fc->tab.alf_pixel_buffer_h[c_idx][i] || !fc->tab.alf_pixel_buffer_v[c_idx][i])
+                    return AVERROR(ENOMEM);
+            }
+        }
+    }
+    return 0;
+}
+
+/*
+ * Release every picture-sized table, pixel buffer and pool owned by the
+ * frame context, then zero the geometry recorded in fc->tab so that the next
+ * initialisation sees a mismatch and allocates everything again.
+ */
+static void pic_arrays_free(VVCFrameContext *fc)
+{
+    /* tables grouped by their indexing unit */
+    ctb_arrays_free(fc);
+    min_cb_arrays_free(fc);
+    min_pu_arrays_free(fc);
+    min_tu_arrays_free(fc);
+    bs_arrays_free(fc);
+
+    /* CU/TU pools and the SAO/ALF line buffers */
+    ff_refstruct_pool_uninit(&fc->cu_pool);
+    ff_refstruct_pool_uninit(&fc->tu_pool);
+    pixel_buffer_free(fc);
+
+    for (int k = 0; k < 2; k++)
+        av_freep(&fc->tab.msm[k]);
+    av_freep(&fc->tab.ispmf);
+
+    /* forget the geometry the freed tables were sized for */
+    fc->tab.bs_height          = 0;
+    fc->tab.bs_width           = 0;
+    fc->tab.ctu_height         = 0;
+    fc->tab.ctu_width          = 0;
+    fc->tab.height             = 0;
+    fc->tab.width              = 0;
+    fc->tab.pic_size_in_min_tu = 0;
+    fc->tab.pic_size_in_min_pu = 0;
+    fc->tab.pic_size_in_min_cb = 0;
+    fc->tab.ctu_size           = 0;
+    fc->tab.ctu_count          = 0;
+}
+
+/**
+ * Allocate (or reuse and reset) the per-picture tables of a frame context
+ * for the SPS/PPS currently attached to it, then record in fc->tab the
+ * geometry the tables were sized for.
+ *
+ * @param s   decoder context (not used by this function)
+ * @param fc  frame context; fc->ps.sps and fc->ps.pps describe the picture
+ * @return 0 on success, a negative AVERROR code on failure. On failure all
+ *         tables are released through pic_arrays_free().
+ */
+static int pic_arrays_init(VVCContext *s, VVCFrameContext *fc)
+{
+    const VVCSPS *sps               = fc->ps.sps;
+    const VVCPPS *pps               = fc->ps.pps;
+    const int ctu_size              = 1 << sps->ctb_log2_size_y << sps->ctb_log2_size_y;
+    const int pic_size_in_min_cb    = pps->min_cb_width * pps->min_cb_height;
+    const int pic_size_in_min_pu    = pps->min_pu_width * pps->min_pu_height;
+    const int pic_size_in_min_tu    = pps->min_tu_width * pps->min_tu_height;
+    const int w32                   = AV_CEIL_RSHIFT(pps->width,  5);
+    const int h32                   = AV_CEIL_RSHIFT(pps->height,  5);
+    const int w64                   = AV_CEIL_RSHIFT(pps->width,  6);
+    const int h64                   = AV_CEIL_RSHIFT(pps->height,  6);
+    const int bs_width              = (fc->ps.pps->width >> 2) + 1;
+    const int bs_height             = (fc->ps.pps->height >> 2) + 1;
+    int ret;
+
+    if ((ret = ctb_arrays_init(fc, pps->ctb_count, ctu_size)) < 0)
+        goto fail;
+
+    if ((ret = min_cb_arrays_init(fc, pic_size_in_min_cb)) < 0)
+        goto fail;
+
+    if ((ret = min_pu_arrays_init(fc, pic_size_in_min_pu)) < 0)
+        goto fail;
+
+    if ((ret = min_tu_arrays_init(fc, pic_size_in_min_tu)) < 0)
+        goto fail;
+
+    if ((ret = bs_arrays_init(fc, bs_width, bs_height)) < 0)
+        goto fail;
+
+    if ((ret = pixel_buffer_init(fc, pps->width, pps->height, pps->ctb_width, pps->ctb_height,
+        sps->r->sps_chroma_format_idc, sps->pixel_shift)) < 0)
+        goto fail;
+
+    // The allocations below signal failure with a NULL pointer only, so the
+    // error code has to be set here; otherwise "goto fail" would return the
+    // non-negative value left over from the last successful call.
+    ret = AVERROR(ENOMEM);
+
+    // msm: one byte per 32x32 luma area; reallocated only when the 32-sample
+    // grid changed, otherwise just cleared
+    if (AV_CEIL_RSHIFT(fc->tab.width,  5) != w32 || AV_CEIL_RSHIFT(fc->tab.height,  5) != h32) {
+        for (int i = LUMA; i <= CHROMA; i++) {
+            av_freep(&fc->tab.msm[i]);
+            fc->tab.msm[i] = av_calloc(w32, h32);
+            if (!fc->tab.msm[i])
+                goto fail;
+        }
+    } else {
+        for (int i = LUMA; i <= CHROMA; i++)
+            memset(fc->tab.msm[i], 0, w32 * h32);
+    }
+    // ispmf: one byte per 64x64 luma area, handled the same way
+    if (AV_CEIL_RSHIFT(fc->tab.width,  6) != w64 || AV_CEIL_RSHIFT(fc->tab.height,  6) != h64) {
+        av_freep(&fc->tab.ispmf);
+        fc->tab.ispmf = av_calloc(w64, h64);
+        if (!fc->tab.ispmf)
+            goto fail;
+    } else {
+        memset(fc->tab.ispmf, 0, w64 * h64);
+    }
+
+    // the CU/TU pools are created once and then kept across pictures
+    if (!fc->cu_pool) {
+        fc->cu_pool = ff_refstruct_pool_alloc(sizeof(CodingUnit), 0);
+        if (!fc->cu_pool)
+            goto fail;
+    }
+
+    if (!fc->tu_pool) {
+        fc->tu_pool = ff_refstruct_pool_alloc(sizeof(TransformUnit), 0);
+        if (!fc->tu_pool)
+            goto fail;
+    }
+
+    // remember the geometry the tables now match
+    fc->tab.ctu_count = pps->ctb_count;
+    fc->tab.ctu_size  = ctu_size;
+    fc->tab.pic_size_in_min_cb = pic_size_in_min_cb;
+    fc->tab.pic_size_in_min_pu = pic_size_in_min_pu;
+    fc->tab.pic_size_in_min_tu = pic_size_in_min_tu;
+    fc->tab.width              = pps->width;
+    fc->tab.height             = pps->height;
+    fc->tab.ctu_width          = pps->ctb_width;
+    fc->tab.ctu_height         = pps->ctb_height;
+    fc->tab.chroma_format_idc  = sps->r->sps_chroma_format_idc;
+    fc->tab.pixel_shift        = sps->pixel_shift;
+    fc->tab.bs_width           = bs_width;
+    fc->tab.bs_height          = bs_height;
+
+    return 0;
+fail:
+    pic_arrays_free(fc);
+    return ret;
+}
+
+/*
+ * smvd_find() predicate: accept a positive POC difference when no candidate
+ * has been chosen yet (idx < 0) or when it is smaller than the current best.
+ */
+static int min_positive(const int idx, const int diff, const int min_diff)
+{
+    if (diff <= 0)
+        return 0;
+    return idx < 0 || diff < min_diff;
+}
+
+/*
+ * smvd_find() predicate: accept a negative POC difference when no candidate
+ * has been chosen yet (idx < 0) or when it is larger (closer to zero) than
+ * the current best.
+ */
+static int max_negtive(const int idx, const int diff, const int max_diff)
+{
+    if (diff >= 0)
+        return 0;
+    return idx < 0 || diff > max_diff;
+}
+
+/* Predicate used by smvd_find(): non-zero when diff beats old_diff (idx < 0 means "no candidate yet"). */
+typedef int (*smvd_find_fxn)(const int idx, const int diff, const int old_diff);
+
+/*
+ * Scan the active, non-long-term entries of reference list lx and return the
+ * index of the entry whose POC difference to the current picture is the best
+ * one according to find, or -1 when no entry is accepted.
+ */
+static int8_t smvd_find(const VVCFrameContext *fc, const SliceContext *sc, int lx, smvd_find_fxn find)
+{
+    const H266RawSliceHeader *rsh   = sc->sh.r;
+    const RefPicList *rpl           = &sc->rpl[lx];
+    const int cur_poc               = fc->ref->poc;
+    const int nb_active             = rsh->num_ref_idx_active[lx];
+    int8_t best                     = -1;
+    int best_diff                   = -1;
+
+    for (int i = 0; i < nb_active; i++) {
+        int diff;
+
+        if (rpl->isLongTerm[i])
+            continue;
+
+        diff = cur_poc - rpl->list[i];
+        if (!find(best, diff, best_diff))
+            continue;
+
+        best      = i;
+        best_diff = diff;
+    }
+    return best;
+}
+
+/*
+ * For B slices, fill sh->ref_idx_sym[] with the pair of reference indices
+ * found by smvd_find(): first try list 0 with positive and list 1 with
+ * negative POC differences; if either side has no match, retry with the
+ * directions swapped. Other slice types are left untouched.
+ */
+static void vvc_smvd_ref_idx(const VVCFrameContext *fc, SliceContext *sc)
+{
+    VVCSH *sh = &sc->sh;
+
+    if (!IS_B(sh->r))
+        return;
+
+    sh->ref_idx_sym[0] = smvd_find(fc, sc, 0, min_positive);
+    sh->ref_idx_sym[1] = smvd_find(fc, sc, 1, max_negtive);
+    if (sh->ref_idx_sym[0] != -1 && sh->ref_idx_sym[1] != -1)
+        return;
+
+    // no symmetric pair in the first direction, try the opposite one
+    sh->ref_idx_sym[0] = smvd_find(fc, sc, 0, max_negtive);
+    sh->ref_idx_sym[1] = smvd_find(fc, sc, 1, min_positive);
+}
+
+/*
+ * Release the entry point array of a slice.
+ *
+ * nb_eps is reset together with the array: init_slice_context() only
+ * reallocates when the entry point count changes, so a stale count after a
+ * failed allocation would let a later slice use the NULL array.
+ */
+static void eps_free(SliceContext *slice)
+{
+    av_freep(&slice->eps);
+    slice->nb_eps = 0;
+}
+
+/*
+ * Free every allocated slice context of a frame context (slice header
+ * reference, entry points and the context itself), then the pointer array,
+ * and reset both slice counters.
+ */
+static void slices_free(VVCFrameContext *fc)
+{
+    for (int i = 0; fc->slices && i < fc->nb_slices_allocated; i++) {
+        SliceContext *sc = fc->slices[i];
+
+        if (!sc)
+            continue;
+        ff_refstruct_unref(&sc->sh.r);
+        eps_free(sc);
+        av_free(sc);
+    }
+    av_freep(&fc->slices);
+
+    fc->nb_slices_allocated = 0;
+    fc->nb_slices = 0;
+}
+
+/*
+ * Make sure fc->slices has room for one more slice.
+ *
+ * When all allocated slots are in use the pointer array grows to
+ * (allocated + 1) * 3 / 2 entries and every new slot gets a zeroed
+ * SliceContext with its slice_idx set.
+ *
+ * @return 0 on success, AVERROR(ENOMEM) on allocation failure; in that case
+ *         the contexts created by this call are freed again and
+ *         nb_slices_allocated is left unchanged.
+ */
+static int slices_realloc(VVCFrameContext *fc)
+{
+    const int old_nb = fc->nb_slices_allocated;
+    const int new_nb = (old_nb + 1) * 3 / 2;
+    void *grown;
+
+    if (fc->nb_slices < old_nb)
+        return 0;
+
+    grown = av_realloc(fc->slices, new_nb * sizeof(*fc->slices));
+    if (!grown)
+        return AVERROR(ENOMEM);
+    fc->slices = grown;
+
+    for (int i = old_nb; i < new_nb; i++) {
+        fc->slices[i] = av_calloc(1, sizeof(*fc->slices[0]));
+        if (fc->slices[i]) {
+            fc->slices[i]->slice_idx = i;
+            continue;
+        }
+
+        // undo the contexts allocated so far in this call
+        while (--i >= old_nb)
+            av_freep(&fc->slices[i]);
+        return AVERROR(ENOMEM);
+    }
+
+    fc->nb_slices_allocated = new_nb;
+    return 0;
+}
+
+/*
+ * Set up the CABAC decoder of entry point `index` of a slice on the bytes at
+ * the current position of gb, then advance gb past those bytes.
+ *
+ * For every entry point except the last one the byte count is taken from
+ * sh_entry_point_offset_minus1[index] + 1 and reduced by one for each
+ * nal->skipped_bytes_pos[] entry that lies after the start and before the
+ * (shrinking) end of the range. The last entry point takes all whole bytes
+ * left in gb.
+ *
+ * NOTE(review): presumably skipped_bytes_pos[] lists the emulation prevention
+ * bytes removed from the NAL payload, which the signalled offsets still
+ * count - confirm against H2645NAL. The result of ff_init_cabac_decoder() is
+ * not checked and size is not validated against the bytes left in gb.
+ */
+static void ep_init_cabac_decoder(SliceContext *sc, const int index, const H2645NAL *nal, GetBitContext *gb)
+{
+    const H266RawSliceHeader *rsh   = sc->sh.r;
+    EntryPoint *ep                  = sc->eps + index;
+    int size;
+
+    if (index < rsh->num_entry_points) {
+        int skipped = 0;
+        // byte range [start, end) of this entry point as signalled in the slice header
+        int64_t start =  (gb->index >> 3);
+        int64_t end = start + rsh->sh_entry_point_offset_minus1[index] + 1;
+        // ignore skipped bytes located at or before the start of the range
+        while (skipped < nal->skipped_bytes && nal->skipped_bytes_pos[skipped] <= start) {
+            skipped++;
+        }
+        // each skipped byte inside the range shortens it by one byte
+        while (skipped < nal->skipped_bytes && nal->skipped_bytes_pos[skipped] < end) {
+            end--;
+            skipped++;
+        }
+        size = end - start;
+    } else {
+        // last entry point: everything that is left
+        size = get_bits_left(gb) / 8;
+    }
+    ff_init_cabac_decoder (&ep->cc, gb->buffer + get_bits_count(gb) / 8, size);
+    skip_bits(gb, size * 8);
+}
+
+/*
+ * Prepare the entry points of a slice for decoding.
+ *
+ * (Re)allocates sc->eps when the number of entry points changed, assigns
+ * each entry point its CTU range, tags every CTU of the slice in
+ * fc->tab.slice_idx and initialises one CABAC decoder per entry point on
+ * the slice data.
+ *
+ * @return 0 on success, a negative AVERROR code on failure
+ */
+static int init_slice_context(SliceContext *sc, VVCFrameContext *fc, const H2645NAL *nal, const CodedBitstreamUnit *unit)
+{
+    const VVCSH *sh             = &sc->sh;
+    const H266RawSlice *slice   = (const H266RawSlice *)unit->content;
+    int nb_eps                  = sh->r->num_entry_points + 1;
+    int ctu_addr                = 0;
+    GetBitContext gb;
+    int ret;
+
+    if (sc->nb_eps != nb_eps) {
+        eps_free(sc);
+        sc->eps = av_calloc(nb_eps, sizeof(*sc->eps));
+        if (!sc->eps)
+            return AVERROR(ENOMEM);
+        sc->nb_eps = nb_eps;
+    }
+
+    // init_get_bits8() rejects invalid buffers/sizes; do not read through a
+    // bit reader that failed to initialise
+    ret = init_get_bits8(&gb, slice->data, slice->data_size);
+    if (ret < 0)
+        return ret;
+
+    for (int i = 0; i < sc->nb_eps; i++) {
+        EntryPoint *ep = sc->eps + i;
+
+        // the last entry point runs to the end of the slice
+        ep->ctu_start = ctu_addr;
+        ep->ctu_end   = (i + 1 == sc->nb_eps ? sh->num_ctus_in_curr_slice : sh->entry_point_start_ctu[i]);
+
+        for (int j = ep->ctu_start; j < ep->ctu_end; j++) {
+            const int rs = sc->sh.ctb_addr_in_curr_slice[j];
+            fc->tab.slice_idx[rs] = sc->slice_idx;
+        }
+
+        ep_init_cabac_decoder(sc, i, nal, &gb);
+
+        if (i + 1 < sc->nb_eps)
+            ctu_addr = sh->entry_point_start_ctu[i];
+    }
+
+    return 0;
+}
+
+/*
+ * Return the frame context located delta slots after fc in the circular
+ * array s->fcs (delta may be -1 to get the previous one).
+ */
+static VVCFrameContext* get_frame_context(const VVCContext *s, const VVCFrameContext *fc, const int delta)
+{
+    const int nb_fcs = s->nb_fcs;
+    const int target = (fc - s->fcs + delta  + nb_fcs) % nb_fcs;
+
+    return &s->fcs[target];
+}
+
+/*
+ * Make dst a new reference to the picture held by src: the AVFrame, the
+ * refstruct-managed side data (progress, DMVR motion field, RPL tables) and
+ * the plain bookkeeping fields.
+ *
+ * @return 0 on success, the negative error of av_frame_ref() otherwise
+ *         (dst is not modified further in that case)
+ */
+static int vvc_ref_frame(VVCFrameContext *fc, VVCFrame *dst, VVCFrame *src)
+{
+    const int ret = av_frame_ref(dst->frame, src->frame);
+
+    if (ret < 0)
+        return ret;
+
+    // shared, reference-counted data
+    ff_refstruct_replace(&dst->progress, src->progress);
+    ff_refstruct_replace(&dst->tab_dmvr_mvf, src->tab_dmvr_mvf);
+    ff_refstruct_replace(&dst->rpl_tab, src->rpl_tab);
+    ff_refstruct_replace(&dst->rpl, src->rpl);
+
+    // plain values
+    dst->nb_rpl_elems = src->nb_rpl_elems;
+    dst->poc          = src->poc;
+    dst->ctb_count    = src->ctb_count;
+    dst->flags        = src->flags;
+    dst->sequence     = src->sequence;
+
+    return 0;
+}
+
+/*
+ * Release everything owned by a frame context: slices, DPB entries and their
+ * AVFrames, frame-thread state, picture tables, the pending output frame,
+ * the frame parameter sets and the private AVCodecContext copy.
+ */
+static av_cold void frame_context_free(VVCFrameContext *fc)
+{
+    slices_free(fc);
+
+    for (int i = 0; i < FF_ARRAY_ELEMS(fc->DPB); i++) {
+        VVCFrame *frame = &fc->DPB[i];
+
+        ff_vvc_unref_frame(fc, frame, ~0);
+        av_frame_free(&frame->frame);
+    }
+
+    ff_vvc_frame_thread_free(fc);
+    pic_arrays_free(fc);
+    av_frame_free(&fc->output_frame);
+    ff_vvc_frame_ps_free(&fc->ps);
+    av_freep(&fc->avctx);
+}
+
+/*
+ * One-time setup of a frame context: a private copy of the codec context,
+ * the output frame and one AVFrame per DPB slot.
+ *
+ * @return 0 on success, AVERROR(ENOMEM) if any allocation fails. Nothing is
+ *         freed here on failure; the caller releases the partially
+ *         initialised context.
+ */
+static av_cold int frame_context_init(VVCFrameContext *fc, AVCodecContext *avctx)
+{
+    fc->avctx = av_memdup(avctx, sizeof(*avctx));
+    if (!fc->avctx)
+        return AVERROR(ENOMEM);
+
+    fc->output_frame = av_frame_alloc();
+    if (!fc->output_frame)
+        return AVERROR(ENOMEM);
+
+    for (int i = 0; i < FF_ARRAY_ELEMS(fc->DPB); i++) {
+        AVFrame *frame = av_frame_alloc();
+
+        fc->DPB[i].frame = frame;
+        if (!frame)
+            return AVERROR(ENOMEM);
+    }
+
+    return 0;
+}
+
+/*
+ * Per-picture setup of a frame context, run for the first slice of a frame:
+ * inherit the DPB of the previous frame context (when several contexts are
+ * in use and at least one frame was submitted), start a new sequence on IDR,
+ * size the picture tables and initialise the DSP contexts for the bit depth.
+ *
+ * @return a negative AVERROR code on failure, otherwise the non-negative
+ *         result of pic_arrays_init()
+ */
+static int frame_context_setup(VVCFrameContext *fc, VVCContext *s)
+{
+    int ret;
+
+    // copy refs from the last frame
+    if (s->nb_frames && s->nb_fcs > 1) {
+        VVCFrameContext *prev = get_frame_context(s, fc, -1);
+
+        for (int i = 0; i < FF_ARRAY_ELEMS(fc->DPB); i++) {
+            ff_vvc_unref_frame(fc, &fc->DPB[i], ~0);
+            if (!prev->DPB[i].frame->buf[0])
+                continue;
+            ret = vvc_ref_frame(fc, &fc->DPB[i], &prev->DPB[i]);
+            if (ret < 0)
+                return ret;
+        }
+    }
+
+    if (IS_IDR(s)) {
+        // the sequence counter wraps at 8 bits
+        s->seq_decode = (s->seq_decode + 1) & 0xff;
+        ff_vvc_clear_refs(fc);
+    }
+
+    ret = pic_arrays_init(s, fc);
+    if (ret < 0)
+        return ret;
+
+    ff_vvc_dsp_init(&fc->vvcdsp, fc->ps.sps->bit_depth);
+    ff_videodsp_init(&fc->vdsp, fc->ps.sps->bit_depth);
+
+    return ret;
+}
+
+/*
+ * Publish pixel format and picture dimensions of the active SPS/PPS in the
+ * frame context's private AVCodecContext. The displayed size is the coded
+ * size minus the PPS conformance window offsets.
+ *
+ * NOTE(review): the offsets are subtracted as-is; confirm whether they need
+ * scaling by the chroma subsampling factor.
+ */
+static void export_frame_params(VVCFrameContext *fc)
+{
+    AVCodecContext *c   = fc->avctx;
+    const VVCSPS *sps   = fc->ps.sps;
+    const VVCPPS *pps   = fc->ps.pps;
+    const int crop_w    = pps->r->pps_conf_win_left_offset + pps->r->pps_conf_win_right_offset;
+    const int crop_h    = pps->r->pps_conf_win_top_offset  + pps->r->pps_conf_win_bottom_offset;
+
+    c->pix_fmt          = sps->pix_fmt;
+
+    c->coded_width      = pps->width;
+    c->coded_height     = pps->height;
+
+    c->width            = pps->width  - crop_w;
+    c->height           = pps->height - crop_h;
+}
+
+/*
+ * Parse one slice NAL unit into the next free slice context of fc.
+ *
+ * The first slice of a frame also decodes the frame parameter sets, sets up
+ * the frame context, exports the frame parameters and starts the frame;
+ * later non-I slices only build their reference picture lists. On success
+ * fc->nb_slices is incremented.
+ *
+ * @return 0 on success, a negative AVERROR code on failure
+ */
+static int decode_slice(VVCContext *s, VVCFrameContext *fc, const H2645NAL *nal, const CodedBitstreamUnit *unit)
+{
+    int ret;
+    SliceContext *sc;
+    VVCSH *sh;
+    const int is_first_slice = !fc->nb_slices;
+
+    ret = slices_realloc(fc);
+    if (ret < 0)
+        return ret;
+    sc = fc->slices[fc->nb_slices];
+    sh = &sc->sh;
+
+    s->vcl_unit_type = nal->type;
+    if (is_first_slice) {
+        ret = ff_vvc_decode_frame_ps(&fc->ps, s);
+        if (ret < 0)
+            return ret;
+
+        ret = frame_context_setup(fc, s);
+        if (ret < 0)
+            return ret;
+
+        export_frame_params(fc);
+    }
+
+    ret = ff_vvc_decode_sh(sh, &fc->ps, unit);
+    if (ret < 0)
+        return ret;
+
+    if (is_first_slice) {
+        ret = vvc_frame_start(s, fc, sc);
+        if (ret < 0)
+            return ret;
+    } else if (fc->ref) {
+        if (!IS_I(sh->r)) {
+            ret = ff_vvc_slice_rpl(s, fc, sc);
+            if (ret < 0) {
+                av_log(fc->avctx, AV_LOG_WARNING,
+                       "Error constructing the reference lists for the current slice.\n");
+                return ret;
+            }
+        }
+    } else {
+        // ret holds a success value at this point, so an explicit error code
+        // is needed to make the caller stop
+        av_log(fc->avctx, AV_LOG_ERROR, "First slice in a frame missing.\n");
+        return AVERROR_INVALIDDATA;
+    }
+
+    if (!IS_I(sh->r))
+        vvc_smvd_ref_idx(fc, sc);
+
+    ret = init_slice_context(sc, fc, nal, unit);
+    if (ret < 0)
+        return ret;
+    fc->nb_slices++;
+
+    return 0;
+}
+
+/*
+ * Dispatch a single NAL unit: slices go to decode_slice(), APS units to
+ * ff_vvc_decode_aps(); VPS/SPS/PPS need no work here and every other type is
+ * logged and skipped.
+ *
+ * @return 0 on success (including skipped units), a negative AVERROR code
+ *         if slice or APS decoding fails
+ */
+static int decode_nal_unit(VVCContext *s, VVCFrameContext *fc, const H2645NAL *nal, const CodedBitstreamUnit *unit)
+{
+    int ret;
+
+    s->temporal_id   = nal->temporal_id;
+
+    switch (unit->type) {
+    case VVC_VPS_NUT:
+    case VVC_SPS_NUT:
+    case VVC_PPS_NUT:
+        /* VPS, SPS and PPS are cached by s->cbc */
+        break;
+
+    case VVC_TRAIL_NUT:
+    case VVC_STSA_NUT:
+    case VVC_RADL_NUT:
+    case VVC_RASL_NUT:
+    case VVC_IDR_W_RADL:
+    case VVC_IDR_N_LP:
+    case VVC_CRA_NUT:
+    case VVC_GDR_NUT:
+        ret = decode_slice(s, fc, nal, unit);
+        if (ret < 0)
+            return ret;
+        break;
+
+    case VVC_PREFIX_APS_NUT:
+    case VVC_SUFFIX_APS_NUT:
+        ret = ff_vvc_decode_aps(&s->ps, unit);
+        if (ret < 0)
+            return ret;
+        break;
+
+    default:
+        av_log(s->avctx, AV_LOG_INFO,
+               "Skipping NAL unit %d\n", unit->type);
+    }
+
+    return 0;
+}
+
+/*
+ * Split a packet into NAL units with the coded bitstream reader and decode
+ * them in order.
+ *
+ * End-of-sequence / end-of-bitstream units are not decoded: one that comes
+ * before any other unit of the packet marks the previous sequence as ended
+ * (s->last_eos), one that comes later marks the current one (s->eos, which
+ * becomes last_eos for the next packet).
+ *
+ * @return 0 on success, a negative AVERROR code on failure. If a NAL unit
+ *         fails and the frame was already started, the frame is reported as
+ *         finished before returning.
+ */
+static int decode_nal_units(VVCContext *s, VVCFrameContext *fc, AVPacket *avpkt)
+{
+    const CodedBitstreamH266Context *h266   = (const CodedBitstreamH266Context *)s->cbc->priv_data;
+    CodedBitstreamFragment *frame           = &s->current_frame;
+    int i, ret = 0;
+    int eos_at_start = 1;
+    s->last_eos = s->eos;
+    s->eos = 0;
+
+    ff_cbs_fragment_reset(frame);
+    ret = ff_cbs_read_packet(s->cbc, frame, avpkt);
+    if (ret < 0) {
+        av_log(s->avctx, AV_LOG_ERROR, "Failed to read packet.\n");
+        return ret;
+    }
+    /* decode the NAL units */
+    for (i = 0; i < frame->nb_units; i++) {
+        const H2645NAL *nal             = h266->common.read_packet.nals + i;
+        const CodedBitstreamUnit *unit  = frame->units + i;
+
+        if (unit->type == VVC_EOB_NUT || unit->type == VVC_EOS_NUT) {
+            if (eos_at_start)
+                s->last_eos = 1;
+            else
+                s->eos = 1;
+        } else {
+            // any other unit means a following EOS/EOB is no longer leading;
+            // without this s->eos could never be set
+            eos_at_start = 0;
+
+            ret = decode_nal_unit(s, fc, nal, unit);
+            if (ret < 0) {
+                av_log(s->avctx, AV_LOG_WARNING,
+                        "Error parsing NAL unit #%d.\n", i);
+                goto fail;
+            }
+        }
+    }
+    return 0;
+
+fail:
+    if (fc->ref)
+        ff_vvc_report_frame_finished(fc->ref);
+    return ret;
+}
+
+/*
+ * Bring the user-visible codec context in line with a frame about to be
+ * returned: update the dimensions when they differ and copy the pixel format.
+ *
+ * @return 0 on success, the negative error of ff_set_dimensions() otherwise
+ */
+static int set_output_format(const VVCContext *s, const AVFrame *output)
+{
+    AVCodecContext *c   = s->avctx;
+    const int same_size = output->width == c->width && output->height == c->height;
+
+    if (!same_size) {
+        const int ret = ff_set_dimensions(c, output->width, output->height);
+
+        if (ret < 0)
+            return ret;
+    }
+
+    c->pix_fmt = output->format;
+    return 0;
+}
+
+/*
+ * Wait for the oldest frame context still in flight and, if it produced an
+ * output frame, move that frame into output and set *got_output to 1.
+ * s->nb_delayed is decremented in every case; *got_output is left untouched
+ * when no frame is returned.
+ *
+ * @return 0 on success, otherwise the error of ff_vvc_frame_wait() or
+ *         set_output_format()
+ */
+static int wait_delayed_frame(VVCContext *s, AVFrame *output, int *got_output)
+{
+    const int oldest         = s->nb_frames - s->nb_delayed;
+    VVCFrameContext *delayed = get_frame_context(s, s->fcs, oldest);
+    int ret                  = ff_vvc_frame_wait(s, delayed);
+
+    if (ret == 0 && delayed->output_frame->buf[0]) {
+        av_frame_move_ref(output, delayed->output_frame);
+        ret = set_output_format(s, output);
+        if (ret == 0)
+            *got_output = 1;
+    }
+
+    s->nb_delayed--;
+    return ret;
+}
+
+/*
+ * Hand a fully parsed frame to the decoding threads. Once as many frames are
+ * in flight as there are frame contexts, block on the oldest one and return
+ * its output, if any.
+ *
+ * @return 0 on success, a negative AVERROR code if waiting fails
+ */
+static int submit_frame(VVCContext *s, VVCFrameContext *fc, AVFrame *output, int *got_output)
+{
+    int ret;
+
+    s->nb_frames++;
+    s->nb_delayed++;
+    ff_vvc_frame_submit(s, fc);
+
+    if (s->nb_delayed < s->nb_fcs)
+        return 0;
+
+    ret = wait_delayed_frame(s, output, got_output);
+    return ret < 0 ? ret : 0;
+}
 
 static int vvc_decode_frame(AVCodecContext *avctx, AVFrame *output,
     int *got_output, AVPacket *avpkt)
 {
+    /*
+     * Decoder entry point. A non-empty packet is parsed and submitted as one
+     * frame and the packet size is returned; an empty packet drains first
+     * the frames still in flight, then the frames cached in the DPB, and
+     * returns 0. Failures return a negative AVERROR code.
+     */
+    VVCContext *s = avctx->priv_data;
+    VVCFrameContext *fc;
+    int ret;
+
+    if (!avpkt->size) {
+        // drain: frames still being decoded come first
+        while (s->nb_delayed) {
+            ret = wait_delayed_frame(s, output, got_output);
+            if (ret < 0)
+                return ret;
+            if (*got_output)
+                return 0;
+        }
+        if (!s->nb_frames)
+            return 0;
+
+        //we still have frames cached in dpb.
+        fc  = get_frame_context(s, s->fcs, s->nb_frames - 1);
+        ret = ff_vvc_output_frame(s, fc, output, 0, 1);
+        if (ret <= 0)
+            return ret;
+
+        *got_output = ret;
+        ret = set_output_format(s, output);
+        return ret < 0 ? ret : 0;
+    }
+
+    fc = get_frame_context(s, s->fcs, s->nb_frames);
+    fc->nb_slices    = 0;
+    fc->decode_order = s->nb_frames;
+
+    ret = decode_nal_units(s, fc, avpkt);
+    if (ret >= 0)
+        ret = submit_frame(s, fc, output, got_output);
+    if (ret < 0)
+        return ret;
+
     return avpkt->size;
 }
 
 static void vvc_decode_flush(AVCodecContext *avctx)
 {
+    /*
+     * Wait for every frame still in flight and discard its output.
+     * If the scratch frame cannot be allocated nothing is waited for.
+     */
+    VVCContext *s = avctx->priv_data;
+    int got_output = 0;
+    AVFrame *output = av_frame_alloc();
+
+    if (output) {
+        while (s->nb_delayed) {
+            // wait_delayed_frame() only ever sets got_output to 1; the value
+            // is not needed here because the frame is dropped either way
+            wait_delayed_frame(s, output, &got_output);
+            // unref unconditionally: the frame may have been moved into
+            // output even when the call reported an error
+            av_frame_unref(output);
+        }
+        av_frame_free(&output);
+    }
 }
 
 static av_cold int vvc_decode_free(AVCodecContext *avctx)
 {
+    /*
+     * Tear down the decoder: drop the current fragment, drain frames in
+     * flight, stop the executor, free all frame contexts, then the parameter
+     * sets and the bitstream reader. Always returns 0.
+     */
+    VVCContext *s = avctx->priv_data;
+    int i;
+
+    ff_cbs_fragment_free(&s->current_frame);
+    vvc_decode_flush(avctx);
+    ff_vvc_executor_free(&s->executor);
+    if (s->fcs) {
+        for (i = 0; i < s->nb_fcs; i++)
+            frame_context_free(s->fcs + i);
+        // reset the pointer as well: vvc_decode_init() calls this function
+        // on failure, and a second call must not free the array again
+        av_freep(&s->fcs);
+    }
+    ff_vvc_ps_uninit(&s->ps);
+    ff_cbs_close(&s->cbc);
+
     return 0;
 }
 
+/* upper bound for the number of frame contexts created by vvc_decode_init() */
+#define VVC_MAX_FRMAE_DELAY 16
 static av_cold int vvc_decode_init(AVCodecContext *avctx)
 {
+    /*
+     * Create the bitstream reader, the frame contexts (one with
+     * AV_CODEC_FLAG_LOW_DELAY, otherwise one per CPU up to
+     * VVC_MAX_FRMAE_DELAY) and the executor. Returns 0 on success; on
+     * failure everything is released and the error code of the failing step
+     * is returned.
+     */
+    VVCContext *s       = avctx->priv_data;
+    int ret;
+
+    s->avctx = avctx;
+
+    ret = ff_cbs_init(&s->cbc, AV_CODEC_ID_VVC, avctx);
+    if (ret < 0)
+        goto fail;
+
+    s->nb_fcs = (avctx->flags & AV_CODEC_FLAG_LOW_DELAY) ? 1 : FFMIN(av_cpu_count(), VVC_MAX_FRMAE_DELAY);
+    s->fcs = av_calloc(s->nb_fcs, sizeof(*s->fcs));
+    if (!s->fcs) {
+        ret = AVERROR(ENOMEM);
+        goto fail;
+    }
+
+    for (int i = 0; i < s->nb_fcs; i++) {
+        VVCFrameContext *fc = s->fcs + i;
+        ret = frame_context_init(fc, avctx);
+        if (ret < 0)
+            goto fail;
+    }
+
+    s->executor = ff_vvc_executor_alloc(s, s->nb_fcs);
+    if (!s->executor) {
+        ret = AVERROR(ENOMEM);
+        goto fail;
+    }
+
+    s->eos = 1;
+    GDR_SET_RECOVERED(s);
+    // every byte of the default scaling matrix table is set to 16
+    memset(&ff_vvc_default_scale_m, 16, sizeof(ff_vvc_default_scale_m));
+
     return 0;
+
+fail:
+    vvc_decode_free(avctx);
+    return ret;
 }
 
 const FFCodec ff_vvc_decoder = {