[FFmpeg-devel,07/14] vvcdec: add inter prediction

Message ID	20230521130319.13813-8-nuomi2021@gmail.com
State	Superseded
Headers	show Delivered-To: ffmpegpatchwork2@gmail.com Received-SPF: pass (google.com: domain of ffmpeg-devel-bounces@ffmpeg.org designates 79.124.17.100 as permitted sender) client-ip=79.124.17.100; From: Nuo Mi <nuomi2021@gmail.com> To: ffmpeg-devel@ffmpeg.org Date: Sun, 21 May 2023 21:03:12 +0800 Message-Id: <20230521130319.13813-8-nuomi2021@gmail.com> In-Reply-To: <20230521130319.13813-1-nuomi2021@gmail.com> References: <20230521130319.13813-1-nuomi2021@gmail.com> MIME-Version: 1.0 Subject: [FFmpeg-devel] [PATCH 07/14] vvcdec: add inter prediction Precedence: list Reply-To: FFmpeg development discussions and patches <ffmpeg-devel@ffmpeg.org> Cc: Nuo Mi <nuomi2021@gmail.com> Content-Type: text/plain; charset="us-ascii" Content-Transfer-Encoding: 7bit Errors-To: ffmpeg-devel-bounces@ffmpeg.org Sender: "ffmpeg-devel" <ffmpeg-devel-bounces@ffmpeg.org>
Series	add vvc decoder c code \| expand [FFmpeg-devel,00/14] add vvc decoder c code [FFmpeg-devel,01/14] vvcdec: add thread executor [FFmpeg-devel,02/14] vvcdec: add vvc decoder stub [FFmpeg-devel,03/14] vvcdec: add sps, pps, sh parser [FFmpeg-devel,04/14] vvcdec: add cabac decoder [FFmpeg-devel,05/14] vvcdec: add reference management [FFmpeg-devel,06/14] vvcdec: add motion vector decoder [FFmpeg-devel,07/14] vvcdec: add inter prediction [FFmpeg-devel,08/14] vvcdec: add inv transform 1d [FFmpeg-devel,09/14] vvcdec: add intra prediction [FFmpeg-devel,10/14] vvcdec: add LMCS, Deblocking, SAO, and ALF filters [FFmpeg-devel,11/14] vvcdec: add dsp init and inv transform [FFmpeg-devel,12/14] vvcdec: add CTU(Coding Tree Unit) parser [FFmpeg-devel,13/14] vvcdec: add CTU thread logical [FFmpeg-devel,14/14] vvcdec: add full vvc decoder

Context	Check	Description
andriy/make_x86	fail	Make failed

diff --git a/libavcodec/vvc/Makefile b/libavcodec/vvc/Makefile
index 7c59b99f15..f60a85ac52 100644
--- a/libavcodec/vvc/Makefile
+++ b/libavcodec/vvc/Makefile
@@ -8,4 +8,5 @@ OBJS-$(CONFIG_VVC_DECODER) += vvc/vvcdec.o \
                                vvc/vvc_cabac.o \
                                vvc/vvc_refs.o \
                                vvc/vvc_mvs.o \
-                               vvc/vvc_ctu.o
+                               vvc/vvc_ctu.o \
+                               vvc/vvc_inter.o
diff --git a/libavcodec/vvc/vvc_inter.c b/libavcodec/vvc/vvc_inter.c
new file mode 100644
index 0000000000..7fe1fd7ef4
--- /dev/null
+++ b/libavcodec/vvc/vvc_inter.c
@@ -0,0 +1,1051 @@
+/*
+ * VVC inter prediction
+ *
+ * Copyright (C) 2022 Nuo Mi
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "vvc_data.h"
+#include "vvc_inter.h"
+#include "vvc_mvs.h"
+#include "vvc_refs.h"
+
+/* w1 weights selected by bcw_idx in derive_weight(); w0 is derived as 8 - w1. */
+static const int bcw_w_lut[] = {4, 5, 3, 10, -2};
+
+/**
+ * Redirect *src to an edge-emulated copy when the block plus its filter
+ * margins is not fully inside the picture.
+ *
+ * @param fc         frame context (picture size, pixel_shift, videodsp)
+ * @param dst        scratch buffer receiving the emulated block
+ * @param src        in/out: pointer to the block's top-left sample
+ * @param src_stride in/out: stride in bytes belonging to *src
+ * @param x_off      block x position, in samples of the selected plane
+ * @param y_off      block y position, in samples of the selected plane
+ * @param block_w    block width in samples
+ * @param block_h    block height in samples
+ * @param is_luma    non-zero selects the LUMA_EXTRA_* margins and the luma
+ *                   picture size, zero the CHROMA_EXTRA_* margins and the
+ *                   size shifted by hshift[1] / vshift[1]
+ * @return 1 if *src and *src_stride now refer to dst, 0 if left untouched
+ */
+static int emulated_edge(const VVCFrameContext *fc, uint8_t *dst, const uint8_t **src, ptrdiff_t *src_stride,
+    const int x_off, const int y_off, const int block_w, const int block_h, const int is_luma)
+{
+    const int extra_before = is_luma ? LUMA_EXTRA_BEFORE : CHROMA_EXTRA_BEFORE;
+    const int extra_after = is_luma ? LUMA_EXTRA_AFTER : CHROMA_EXTRA_AFTER;
+    const int extra = is_luma ? LUMA_EXTRA : CHROMA_EXTRA;
+    const int pic_width = is_luma ? fc->ps.pps->width : (fc->ps.pps->width >> fc->ps.sps->hshift[1]);
+    const int pic_height = is_luma ? fc->ps.pps->height : (fc->ps.pps->height >> fc->ps.sps->vshift[1]);
+
+    if (x_off < extra_before || y_off < extra_before ||
+        x_off >= pic_width - block_w - extra_after ||
+        y_off >= pic_height - block_h - extra_after) {
+        const ptrdiff_t edge_emu_stride = EDGE_EMU_BUFFER_STRIDE << fc->ps.sps->pixel_shift;
+        /* byte offsets are kept in ptrdiff_t so the stride product is never narrowed to int */
+        const ptrdiff_t offset = extra_before * *src_stride + (extra_before << fc->ps.sps->pixel_shift);
+        const ptrdiff_t buf_offset = extra_before * edge_emu_stride + (extra_before << fc->ps.sps->pixel_shift);
+
+        fc->vdsp.emulated_edge_mc(dst, *src - offset, edge_emu_stride, *src_stride,
+            block_w + extra, block_h + extra, x_off - extra_before, y_off - extra_before,
+            pic_width, pic_height);
+
+        *src = dst + buf_offset;
+        *src_stride = edge_emu_stride;
+        return 1;
+    }
+    return 0;
+}
+
+/**
+ * Edge emulation variant used when DMVR is active.
+ *
+ * In addition to the picture-boundary test of emulated_edge(), the copy is
+ * also made whenever (x_off, y_off) differs from (x_sb, y_sb). The window
+ * handed to emulated_edge_mc() is derived from (x_sb, y_sb) plus the filter
+ * margins and clipped to the picture, so samples outside that window are
+ * replicated from its border.
+ *
+ * @param x_sb   x position computed from the original (unrefined) motion vector
+ * @param y_sb   y position computed from the original (unrefined) motion vector
+ * @param x_off  x position computed from the motion vector actually used
+ * @param y_off  y position computed from the motion vector actually used
+ * Other parameters are as for emulated_edge(); nothing is returned.
+ */
+static void emulated_edge_dmvr(const VVCFrameContext *fc, uint8_t *dst, const uint8_t **src, ptrdiff_t *src_stride,
+    const int x_sb, const int y_sb, const int x_off, const int y_off, const int block_w, const int block_h, const int is_luma)
+{
+    const int extra_before = is_luma ? LUMA_EXTRA_BEFORE : CHROMA_EXTRA_BEFORE;
+    const int extra_after = is_luma ? LUMA_EXTRA_AFTER : CHROMA_EXTRA_AFTER;
+    const int extra = is_luma ? LUMA_EXTRA : CHROMA_EXTRA;
+    const int pic_width = is_luma ? fc->ps.pps->width : (fc->ps.pps->width >> fc->ps.sps->hshift[1]);
+    const int pic_height = is_luma ? fc->ps.pps->height : (fc->ps.pps->height >> fc->ps.sps->vshift[1]);
+
+    if (x_off < extra_before || y_off < extra_before ||
+        x_off >= pic_width - block_w - extra_after ||
+        y_off >= pic_height - block_h - extra_after ||
+        (x_off != x_sb || y_off != y_sb)) {
+        const int ps = fc->ps.sps->pixel_shift;
+        const ptrdiff_t edge_emu_stride = EDGE_EMU_BUFFER_STRIDE << ps;
+        const ptrdiff_t offset = extra_before * *src_stride + (extra_before << ps);
+        const ptrdiff_t buf_offset = extra_before * edge_emu_stride + (extra_before << ps);
+
+        /* readable window around the unrefined position, clipped to the picture */
+        const int start_x = FFMIN(FFMAX(x_sb - extra_before, 0), pic_width - 1);
+        const int start_y = FFMIN(FFMAX(y_sb - extra_before, 0), pic_height - 1);
+        const int width = FFMAX(FFMIN(pic_width, x_sb + block_w + extra_after) - start_x, 1);
+        const int height = FFMAX(FFMIN(pic_height, y_sb + block_h + extra_after) - start_y, 1);
+
+        fc->vdsp.emulated_edge_mc(dst, *src - offset, edge_emu_stride, *src_stride, block_w + extra, block_h + extra,
+            x_off - start_x - extra_before, y_off - start_y - extra_before, width, height);
+
+        *src = dst + buf_offset;
+        *src_stride = edge_emu_stride;
+    }
+}
+
+/**
+ * Edge emulation for the bilinear fetch, using the BILINEAR_EXTRA_* margins
+ * and the luma picture size from the PPS.
+ *
+ * @param src        in/out: pointer to the block's top-left sample
+ * @param src_stride in/out: stride in bytes belonging to *src
+ * Remaining parameters are as for emulated_edge(); nothing is returned.
+ */
+static void emulated_edge_bilinear(const VVCFrameContext *fc, uint8_t *dst, uint8_t **src, ptrdiff_t *src_stride,
+    const int x_off, const int y_off, const int block_w, const int block_h)
+{
+    int pic_width = fc->ps.pps->width;
+    int pic_height = fc->ps.pps->height;
+
+    if (x_off < BILINEAR_EXTRA_BEFORE || y_off < BILINEAR_EXTRA_BEFORE ||
+        x_off >= pic_width - block_w - BILINEAR_EXTRA_AFTER ||
+        y_off >= pic_height - block_h - BILINEAR_EXTRA_AFTER) {
+        const ptrdiff_t edge_emu_stride = EDGE_EMU_BUFFER_STRIDE << fc->ps.sps->pixel_shift;
+        const ptrdiff_t offset = BILINEAR_EXTRA_BEFORE * *src_stride + (BILINEAR_EXTRA_BEFORE << fc->ps.sps->pixel_shift);
+        const ptrdiff_t buf_offset = BILINEAR_EXTRA_BEFORE * edge_emu_stride + (BILINEAR_EXTRA_BEFORE << fc->ps.sps->pixel_shift);
+
+        fc->vdsp.emulated_edge_mc(dst, *src - offset, edge_emu_stride, *src_stride, block_w + BILINEAR_EXTRA, block_h + BILINEAR_EXTRA,
+            x_off - BILINEAR_EXTRA_BEFORE, y_off - BILINEAR_EXTRA_BEFORE, pic_width, pic_height);
+
+        *src = dst + buf_offset;
+        *src_stride = edge_emu_stride;
+    }
+}
+
+
+/*
+ * Convenience wrappers around the edge-emulation helpers. They pick up
+ * fc and block_w / block_h (pred_w / pred_h for the bilinear variant)
+ * from the scope of the calling function.
+ */
+#define EMULATED_EDGE_LUMA(dst, src, src_stride, x_off, y_off) \
+    emulated_edge(fc, dst, src, src_stride, x_off, y_off, block_w, block_h, 1)
+
+#define EMULATED_EDGE_CHROMA(dst, src, src_stride, x_off, y_off) \
+    emulated_edge(fc, dst, src, src_stride, x_off, y_off, block_w, block_h, 0)
+
+#define EMULATED_EDGE_DMVR_LUMA(dst, src, src_stride, x_sb, y_sb, x_off, y_off) \
+    emulated_edge_dmvr(fc, dst, src, src_stride, x_sb, y_sb, x_off, y_off, block_w, block_h, 1)
+
+#define EMULATED_EDGE_DMVR_CHROMA(dst, src, src_stride, x_sb, y_sb, x_off, y_off) \
+    emulated_edge_dmvr(fc, dst, src, src_stride, x_sb, y_sb, x_off, y_off, block_w, block_h, 0)
+
+#define EMULATED_EDGE_BILINEAR(dst, src, src_stride, x_off, y_off) \
+    emulated_edge_bilinear(fc, dst, src, src_stride, x_off, y_off, pred_w, pred_h)
+
+/**
+ * Look up explicit weighted-prediction parameters for a uni-predicted block.
+ * part of 8.5.6.6 Weighted sample prediction process
+ *
+ * @param denom  out: log2 weight denominator (written only when 1 is returned)
+ * @param wx     out: weight for the used reference (written only when 1 is returned)
+ * @param ox     out: offset for the used reference (written only when 1 is returned)
+ * @param mvf    motion field; mvf->pred_flag - PF_L0 selects the list
+ * @param c_idx  colour component index
+ * @return non-zero when weighted prediction applies to the current slice
+ */
+static int derive_weight_uni(int *denom, int *wx, int *ox,
+    const VVCLocalContext *lc, const MvField *mvf, const int c_idx)
+{
+    const VVCFrameContext *fc = lc->fc;
+    const VVCPPS *pps = fc->ps.pps;
+    const VVCSH *sh = &lc->sc->sh;
+    const int weight_flag = (IS_P(sh) && pps->weighted_pred_flag) ||
+                            (IS_B(sh) && pps->weighted_bipred_flag);
+    if (weight_flag) {
+        const int lx = mvf->pred_flag - PF_L0;
+        /* the weight table is read from the picture header or the slice header */
+        const PredWeightTable *w = pps->wp_info_in_ph_flag ? &fc->ps.ph->pwt : &sh->pwt;
+
+        *denom = w->log2_denom[c_idx > 0];
+        *wx = w->weight[lx][c_idx][mvf->ref_idx[lx]];
+        *ox = w->offset[lx][c_idx][mvf->ref_idx[lx]];
+    }
+    return weight_flag;
+}
+
+/**
+ * Derive the weights and offsets for a bi-predicted block.
+ * part of 8.5.6.6 Weighted sample prediction process
+ *
+ * A non-zero mvf->bcw_idx selects the weights from bcw_w_lut with a fixed
+ * denominator of 2 and zero offsets; otherwise the explicit weight table is
+ * used. Outputs are written only when 1 is returned.
+ *
+ * @param dmvr_flag  non-zero disables explicit weighting for B slices
+ * @return 1 if weighted averaging must be used, 0 for the plain average
+ */
+static int derive_weight(int *denom, int *w0, int *w1, int *o0, int *o1,
+    const VVCLocalContext *lc, const MvField *mvf, const int c_idx, const int dmvr_flag)
+{
+    const VVCFrameContext *fc = lc->fc;
+    const VVCPPS *pps = fc->ps.pps;
+    const VVCSH *sh = &lc->sc->sh;
+    const int bcw_idx = mvf->bcw_idx;
+    const int weight_flag = (IS_P(sh) && pps->weighted_pred_flag) ||
+                            (IS_B(sh) && pps->weighted_bipred_flag && !dmvr_flag);
+    if ((!weight_flag && !bcw_idx) || (bcw_idx && lc->cu->ciip_flag))
+        return 0;
+
+    if (bcw_idx) {
+        *denom = 2;
+        *w1 = bcw_w_lut[bcw_idx];
+        *w0 = 8 - *w1;
+        *o0 = *o1 = 0;
+    } else {
+        const PredWeightTable *w = pps->wp_info_in_ph_flag ? &fc->ps.ph->pwt : &sh->pwt;
+
+        *denom = w->log2_denom[c_idx > 0];
+        *w0 = w->weight[L0][c_idx][mvf->ref_idx[L0]];
+        *w1 = w->weight[L1][c_idx][mvf->ref_idx[L1]];
+        *o0 = w->offset[L0][c_idx][mvf->ref_idx[L0]];
+        *o1 = w->offset[L1][c_idx][mvf->ref_idx[L1]];
+    }
+    return 1;
+}
+
+/**
+ * Luma motion compensation into a 16-bit intermediate buffer.
+ *
+ * The low 4 bits of each mv component are passed to the filter as the
+ * fractional phase; the remaining bits move the read position.
+ *
+ * @param dst    intermediate output buffer
+ * @param ref    reference frame; plane 0 is read
+ * @param mv     motion vector
+ * @param x_off  block x position in luma samples, before applying mv
+ * @param y_off  block y position in luma samples, before applying mv
+ */
+static void luma_mc(VVCLocalContext *lc, int16_t *dst, const AVFrame *ref, const Mv *mv,
+    int x_off, int y_off, const int block_w, const int block_h)
+{
+    const VVCFrameContext *fc = lc->fc;
+    const uint8_t *src = ref->data[0];
+    ptrdiff_t src_stride = ref->linesize[0];
+
+    const int mx = mv->x & 0xf;
+    const int my = mv->y & 0xf;
+
+    x_off += mv->x >> 4;
+    y_off += mv->y >> 4;
+    src += y_off * src_stride + (x_off * (1 << fc->ps.sps->pixel_shift));
+
+    EMULATED_EDGE_LUMA(lc->edge_emu_buffer, &src, &src_stride, x_off, y_off);
+
+    fc->vvcdsp.inter.put[LUMA][!!my][!!mx](dst, src, src_stride, block_h, mx, my, block_w, 0, 0);
+}
+
+/**
+ * Chroma motion compensation into a 16-bit intermediate buffer.
+ *
+ * The fractional phase is the low (4 + shift) bits of the mv component,
+ * scaled by << (1 - shift) before being handed to the filter, where shift is
+ * the chroma subsampling shift of plane c_idx.
+ *
+ * @param x_off  block x position in chroma samples, before applying mv
+ * @param y_off  block y position in chroma samples, before applying mv
+ * @param c_idx  chroma plane to read from ref
+ */
+static void chroma_mc(VVCLocalContext *lc, int16_t *dst, const AVFrame *ref, const Mv *mv,
+    int x_off, int y_off, const int block_w, const int block_h, const int c_idx)
+{
+    const VVCFrameContext *fc = lc->fc;
+    const uint8_t *src = ref->data[c_idx];
+    ptrdiff_t src_stride = ref->linesize[c_idx];
+    int hs = fc->ps.sps->hshift[c_idx];
+    int vs = fc->ps.sps->vshift[c_idx];
+
+    const intptr_t mx = av_mod_uintp2(mv->x, 4 + hs);
+    const intptr_t my = av_mod_uintp2(mv->y, 4 + vs);
+    const intptr_t _mx = mx << (1 - hs);
+    const intptr_t _my = my << (1 - vs);
+
+    x_off += mv->x >> (4 + hs);
+    y_off += mv->y >> (4 + vs);
+    src += y_off * src_stride + (x_off * (1 << fc->ps.sps->pixel_shift));
+
+    EMULATED_EDGE_CHROMA(lc->edge_emu_buffer, &src, &src_stride, x_off, y_off);
+    fc->vvcdsp.inter.put[CHROMA][!!my][!!mx](dst, src, src_stride, block_h, _mx, _my, block_w, 0, 0);
+}
+
+/**
+ * Uni-directional luma prediction written directly to dst, using the
+ * weighted DSP variant when derive_weight_uni() reports explicit weights.
+ *
+ * @param mvf     motion field; mvf->pred_flag - PF_L0 selects the mv used
+ * @param hf_idx  horizontal filter index forwarded to the DSP function
+ * @param vf_idx  vertical filter index forwarded to the DSP function
+ */
+static void luma_mc_uni(VVCLocalContext *lc, uint8_t *dst, const ptrdiff_t dst_stride,
+    const AVFrame *ref, const MvField *mvf, int x_off, int y_off, const int block_w, const int block_h,
+    const int hf_idx, const int vf_idx)
+{
+    const VVCFrameContext *fc = lc->fc;
+    const int lx = mvf->pred_flag - PF_L0;
+    const Mv *mv = mvf->mv + lx;
+    const uint8_t *src = ref->data[0];
+    ptrdiff_t src_stride = ref->linesize[0];
+    const int mx = mv->x & 0xf;
+    const int my = mv->y & 0xf;
+    int denom, wx, ox;
+
+    x_off += mv->x >> 4;
+    y_off += mv->y >> 4;
+    src += y_off * src_stride + (x_off * (1 << fc->ps.sps->pixel_shift));
+
+    EMULATED_EDGE_LUMA(lc->edge_emu_buffer, &src, &src_stride, x_off, y_off);
+
+    if (derive_weight_uni(&denom, &wx, &ox, lc, mvf, LUMA)) {
+        fc->vvcdsp.inter.put_uni_w[LUMA][!!my][!!mx](dst, dst_stride, src, src_stride,
+            block_h, denom, wx, ox, mx, my, block_w, hf_idx, vf_idx);
+    } else {
+        fc->vvcdsp.inter.put_uni[LUMA][!!my][!!mx](dst, dst_stride, src, src_stride,
+            block_h, mx, my, block_w, hf_idx, vf_idx);
+    }
+}
+
+/**
+ * Bi-directional luma prediction with BDOF.
+ *
+ * Both references are interpolated into lc->tmp / lc->tmp1 (starting one row
+ * plus one sample into each buffer), bdof_fetch_samples() is run on each,
+ * and apply_bdof() combines them into dst.
+ *
+ * @param _src0, _src1  source pointers, already edge-emulated by the caller
+ * @param mx0..my1      fractional mv phases of the two references
+ */
+static void luma_bdof(VVCLocalContext *lc, uint8_t *dst, const ptrdiff_t dst_stride,
+    const uint8_t *_src0, const ptrdiff_t src0_stride, const int mx0, const int my0,
+    const uint8_t *_src1, const ptrdiff_t src1_stride, const int mx1, const int my1,
+    const int block_w, const int block_h, const int hf_idx, const int vf_idx)
+{
+    const VVCFrameContext *fc = lc->fc;
+    int16_t *tmp0 = lc->tmp + 1 + MAX_PB_SIZE;
+    int16_t *tmp1 = lc->tmp1 + 1 + MAX_PB_SIZE;
+
+    fc->vvcdsp.inter.put[LUMA][!!my0][!!mx0](tmp0, _src0, src0_stride,
+        block_h, mx0, my0, block_w, hf_idx, vf_idx);
+    fc->vvcdsp.inter.bdof_fetch_samples(tmp0, _src0, src0_stride, mx0, my0, block_w, block_h);
+
+    fc->vvcdsp.inter.put[LUMA][!!my1][!!mx1](tmp1, _src1, src1_stride,
+        block_h, mx1, my1, block_w, hf_idx, vf_idx);
+    fc->vvcdsp.inter.bdof_fetch_samples(tmp1, _src1, src1_stride, mx1, my1, block_w, block_h);
+    fc->vvcdsp.inter.apply_bdof(dst, dst_stride, tmp0, tmp1, block_w, block_h);
+}
+
+/**
+ * Bi-directional luma prediction.
+ *
+ * @param mv0, mv1      motion vectors actually used for ref0 / ref1
+ * @param mvf           motion field used for the weight derivation
+ * @param orig_mv       unrefined motion vectors; read only when dmvr_flag is set
+ * @param dmvr_flag     selects the DMVR edge emulation and is forwarded to
+ *                      derive_weight()
+ * @param sb_bdof_flag  non-zero routes the block through luma_bdof()
+ */
+static void luma_mc_bi(VVCLocalContext *lc, uint8_t *dst, const ptrdiff_t dst_stride,
+    const AVFrame *ref0, const Mv *mv0, const int x_off, const int y_off, const int block_w, const int block_h,
+    const AVFrame *ref1, const Mv *mv1, const MvField *mvf, const int hf_idx, const int vf_idx,
+    const MvField *orig_mv, const int dmvr_flag, const int sb_bdof_flag)
+{
+    const VVCFrameContext *fc = lc->fc;
+    ptrdiff_t src0_stride = ref0->linesize[0];
+    ptrdiff_t src1_stride = ref1->linesize[0];
+    const int mx0 = mv0->x & 0xf;
+    const int my0 = mv0->y & 0xf;
+    const int mx1 = mv1->x & 0xf;
+    const int my1 = mv1->y & 0xf;
+
+    const int x_off0 = x_off + (mv0->x >> 4);
+    const int y_off0 = y_off + (mv0->y >> 4);
+    const int x_off1 = x_off + (mv1->x >> 4);
+    const int y_off1 = y_off + (mv1->y >> 4);
+
+    const uint8_t *src0 = ref0->data[0] + y_off0 * src0_stride + (int)((unsigned)x_off0 << fc->ps.sps->pixel_shift);
+    const uint8_t *src1 = ref1->data[0] + y_off1 * src1_stride + (int)((unsigned)x_off1 << fc->ps.sps->pixel_shift);
+
+    if (dmvr_flag) {
+        const int x_sb0 = x_off + (orig_mv->mv[L0].x >> 4);
+        const int y_sb0 = y_off + (orig_mv->mv[L0].y >> 4);
+        const int x_sb1 = x_off + (orig_mv->mv[L1].x >> 4);
+        const int y_sb1 = y_off + (orig_mv->mv[L1].y >> 4);
+        EMULATED_EDGE_DMVR_LUMA(lc->edge_emu_buffer, &src0, &src0_stride, x_sb0, y_sb0, x_off0, y_off0);
+        EMULATED_EDGE_DMVR_LUMA(lc->edge_emu_buffer2, &src1, &src1_stride, x_sb1, y_sb1, x_off1, y_off1);
+    } else {
+        EMULATED_EDGE_LUMA(lc->edge_emu_buffer, &src0, &src0_stride, x_off0, y_off0);
+        EMULATED_EDGE_LUMA(lc->edge_emu_buffer2, &src1, &src1_stride, x_off1, y_off1);
+    }
+    if (sb_bdof_flag) {
+        luma_bdof(lc, dst, dst_stride, src0, src0_stride, mx0, my0, src1, src1_stride, mx1, my1,
+            block_w, block_h, hf_idx, vf_idx);
+    } else {
+        int denom, w0, w1, o0, o1;
+        /* list 0 goes to lc->tmp, list 1 is interpolated and blended in one DSP call */
+        fc->vvcdsp.inter.put[LUMA][!!my0][!!mx0](lc->tmp, src0, src0_stride,
+            block_h, mx0, my0, block_w, hf_idx, vf_idx);
+        if (derive_weight(&denom, &w0, &w1, &o0, &o1, lc, mvf, LUMA, dmvr_flag)) {
+            fc->vvcdsp.inter.put_bi_w[LUMA][!!my1][!!mx1](dst, dst_stride, src1, src1_stride, lc->tmp,
+                block_h, denom, w0, w1, o0, o1, mx1, my1, block_w, hf_idx, vf_idx);
+        } else {
+            fc->vvcdsp.inter.put_bi[LUMA][!!my1][!!mx1](dst, dst_stride, src1, src1_stride, lc->tmp,
+                block_h, mx1, my1, block_w, hf_idx, vf_idx);
+        }
+    }
+}
+
+/**
+ * Uni-directional chroma prediction written directly to dst.
+ *
+ * @param src         start of the reference chroma plane
+ * @param src_stride  stride of that plane in bytes
+ * @param x_off       block x position in chroma samples, before applying mv
+ * @param y_off       block y position in chroma samples, before applying mv
+ * @param c_idx       colour component, used for the weight lookup; the
+ *                    subsampling shifts are always taken from index 1
+ */
+static void chroma_mc_uni(VVCLocalContext *lc, uint8_t *dst, const ptrdiff_t dst_stride,
+    const uint8_t *src, ptrdiff_t src_stride, int x_off, int y_off,
+    const int block_w, const int block_h, const MvField *mvf, const int c_idx,
+    const int hf_idx, const int vf_idx)
+{
+    const VVCFrameContext *fc = lc->fc;
+    const int lx = mvf->pred_flag - PF_L0;
+    const int hs = fc->ps.sps->hshift[1];
+    const int vs = fc->ps.sps->vshift[1];
+    const Mv *mv = &mvf->mv[lx];
+    const intptr_t mx = av_mod_uintp2(mv->x, 4 + hs);
+    const intptr_t my = av_mod_uintp2(mv->y, 4 + vs);
+    const intptr_t _mx = mx << (1 - hs);
+    const intptr_t _my = my << (1 - vs);
+    int denom, wx, ox;
+
+    x_off += mv->x >> (4 + hs);
+    y_off += mv->y >> (4 + vs);
+    src += y_off * src_stride + (x_off * (1 << fc->ps.sps->pixel_shift));
+
+    EMULATED_EDGE_CHROMA(lc->edge_emu_buffer, &src, &src_stride, x_off, y_off);
+    if (derive_weight_uni(&denom, &wx, &ox, lc, mvf, c_idx)) {
+        fc->vvcdsp.inter.put_uni_w[CHROMA][!!my][!!mx](dst, dst_stride, src, src_stride,
+            block_h, denom, wx, ox, _mx, _my, block_w, hf_idx, vf_idx);
+    } else {
+        fc->vvcdsp.inter.put_uni[CHROMA][!!my][!!mx](dst, dst_stride, src, src_stride,
+            block_h, _mx, _my, block_w, hf_idx, vf_idx);
+    }
+}
+
+/**
+ * Bi-directional chroma prediction for plane c_idx.
+ *
+ * @param orig_mv    unrefined motion vectors; read only when dmvr_flag is set
+ * @param dmvr_flag  selects the DMVR edge emulation and is forwarded to
+ *                   derive_weight()
+ * @param ciip_flag  accepted for the caller's convenience; not read here
+ */
+static void chroma_mc_bi(VVCLocalContext *lc, uint8_t *dst, const ptrdiff_t dst_stride,
+    const AVFrame *ref0, const AVFrame *ref1, const int x_off, const int y_off,
+    const int block_w, const int block_h, const MvField *mvf, const int c_idx,
+    const int hf_idx, const int vf_idx, const MvField *orig_mv, const int dmvr_flag, const int ciip_flag)
+{
+    const VVCFrameContext *fc = lc->fc;
+    const uint8_t *src0 = ref0->data[c_idx];
+    const uint8_t *src1 = ref1->data[c_idx];
+    ptrdiff_t src0_stride = ref0->linesize[c_idx];
+    ptrdiff_t src1_stride = ref1->linesize[c_idx];
+    const Mv *mv0 = &mvf->mv[0];
+    const Mv *mv1 = &mvf->mv[1];
+    const int hs = fc->ps.sps->hshift[1];
+    const int vs = fc->ps.sps->vshift[1];
+
+    const intptr_t mx0 = av_mod_uintp2(mv0->x, 4 + hs);
+    const intptr_t my0 = av_mod_uintp2(mv0->y, 4 + vs);
+    const intptr_t mx1 = av_mod_uintp2(mv1->x, 4 + hs);
+    const intptr_t my1 = av_mod_uintp2(mv1->y, 4 + vs);
+    const intptr_t _mx0 = mx0 << (1 - hs);
+    const intptr_t _my0 = my0 << (1 - vs);
+    const intptr_t _mx1 = mx1 << (1 - hs);
+    const intptr_t _my1 = my1 << (1 - vs);
+
+    const int x_off0 = x_off + (mv0->x >> (4 + hs));
+    const int y_off0 = y_off + (mv0->y >> (4 + vs));
+    const int x_off1 = x_off + (mv1->x >> (4 + hs));
+    const int y_off1 = y_off + (mv1->y >> (4 + vs));
+    int denom, w0, w1, o0, o1;
+
+    src0 += y_off0 * src0_stride + (int)((unsigned)x_off0 << fc->ps.sps->pixel_shift);
+    src1 += y_off1 * src1_stride + (int)((unsigned)x_off1 << fc->ps.sps->pixel_shift);
+
+    if (dmvr_flag) {
+        const int x_sb0 = x_off + (orig_mv->mv[L0].x >> (4 + hs));
+        const int y_sb0 = y_off + (orig_mv->mv[L0].y >> (4 + vs));
+        const int x_sb1 = x_off + (orig_mv->mv[L1].x >> (4 + hs));
+        const int y_sb1 = y_off + (orig_mv->mv[L1].y >> (4 + vs));
+        EMULATED_EDGE_DMVR_CHROMA(lc->edge_emu_buffer, &src0, &src0_stride, x_sb0, y_sb0, x_off0, y_off0);
+        EMULATED_EDGE_DMVR_CHROMA(lc->edge_emu_buffer2, &src1, &src1_stride, x_sb1, y_sb1, x_off1, y_off1);
+    } else {
+        EMULATED_EDGE_CHROMA(lc->edge_emu_buffer, &src0, &src0_stride, x_off0, y_off0);
+        EMULATED_EDGE_CHROMA(lc->edge_emu_buffer2, &src1, &src1_stride, x_off1, y_off1);
+    }
+
+    fc->vvcdsp.inter.put[CHROMA][!!my0][!!mx0](lc->tmp, src0, src0_stride,
+        block_h, _mx0, _my0, block_w, hf_idx, vf_idx);
+    if (derive_weight(&denom, &w0, &w1, &o0, &o1, lc, mvf, c_idx, dmvr_flag)) {
+        fc->vvcdsp.inter.put_bi_w[CHROMA][!!my1][!!mx1](dst, dst_stride, src1, src1_stride, lc->tmp,
+            block_h, denom, w0, w1, o0, o1, _mx1, _my1, block_w, hf_idx, vf_idx);
+    } else {
+        fc->vvcdsp.inter.put_bi[CHROMA][!!my1][!!mx1](dst, dst_stride, src1, src1_stride, lc->tmp,
+            block_h, _mx1, _my1, block_w, hf_idx, vf_idx);
+    }
+}
+
+/**
+ * Uni-directional luma prediction for an affine sub-block, optionally with
+ * PROF applied.
+ *
+ * @param cb_prof_flag  non-zero: interpolate an AFFINE_MIN_BLOCK_SIZE square
+ *                      into the intermediate buffer and apply PROF;
+ *                      zero: plain (optionally weighted) uni prediction of
+ *                      block_w x block_h
+ * @param diff_mv_x     per-sample mv differences, forwarded to apply_prof_uni*
+ * @param diff_mv_y     per-sample mv differences, forwarded to apply_prof_uni*
+ */
+static void luma_prof_uni(VVCLocalContext *lc, uint8_t *dst, const ptrdiff_t dst_stride,
+    const AVFrame *ref, const MvField *mvf, int x_off, int y_off, const int block_w, const int block_h,
+    const int cb_prof_flag, const int16_t *diff_mv_x, const int16_t *diff_mv_y)
+{
+    const VVCFrameContext *fc = lc->fc;
+    const uint8_t *src = ref->data[0];
+    ptrdiff_t src_stride = ref->linesize[0];
+    /* lc->tmp is used as int16_t samples everywhere else; keep the same element type */
+    int16_t *prof_tmp = lc->tmp + 1 + MAX_PB_SIZE;
+    const int lx = mvf->pred_flag - PF_L0;
+    const Mv *mv = mvf->mv + lx;
+    const int mx = mv->x & 0xf;
+    const int my = mv->y & 0xf;
+    int denom, wx, ox;
+    const int weight_flag = derive_weight_uni(&denom, &wx, &ox, lc, mvf, LUMA);
+
+    x_off += mv->x >> 4;
+    y_off += mv->y >> 4;
+    src += y_off * src_stride + (x_off * (1 << fc->ps.sps->pixel_shift));
+
+    EMULATED_EDGE_LUMA(lc->edge_emu_buffer, &src, &src_stride, x_off, y_off);
+    if (cb_prof_flag) {
+        fc->vvcdsp.inter.put[LUMA][!!my][!!mx](prof_tmp, src, src_stride, AFFINE_MIN_BLOCK_SIZE, mx, my, AFFINE_MIN_BLOCK_SIZE, 2, 2);
+        fc->vvcdsp.inter.fetch_samples(prof_tmp, src, src_stride, mx, my);
+        if (!weight_flag)
+            fc->vvcdsp.inter.apply_prof_uni(dst, dst_stride, prof_tmp, diff_mv_x, diff_mv_y);
+        else
+            fc->vvcdsp.inter.apply_prof_uni_w(dst, dst_stride, prof_tmp, diff_mv_x, diff_mv_y, denom, wx, ox);
+    } else {
+        if (!weight_flag)
+            fc->vvcdsp.inter.put_uni[LUMA][!!my][!!mx](dst, dst_stride, src, src_stride, block_h, mx, my, block_w, 2, 2);
+        else
+            fc->vvcdsp.inter.put_uni_w[LUMA][!!my][!!mx](dst, dst_stride, src, src_stride, block_h, denom, wx, ox, mx, my, block_w, 2, 2);
+    }
+}
+
+/**
+ * Bi-directional luma prediction for an affine sub-block, with PROF applied
+ * per list according to lc->cu->pu.cb_prof_flag[].
+ *
+ * The list 0 prediction always ends up in lc->tmp (directly, or through
+ * apply_prof()); the list 1 prediction is then blended with it into dst.
+ */
+static void luma_prof_bi(VVCLocalContext *lc, uint8_t *dst, const ptrdiff_t dst_stride,
+    const AVFrame *ref0, const AVFrame *ref1, const MvField *mvf, const int x_off, const int y_off,
+    const int block_w, const int block_h)
+{
+    const VVCFrameContext *fc = lc->fc;
+    const PredictionUnit *pu = &lc->cu->pu;
+    ptrdiff_t src0_stride = ref0->linesize[0];
+    ptrdiff_t src1_stride = ref1->linesize[0];
+    /* lc->tmp1 is used as int16_t samples everywhere else; keep the same element type */
+    int16_t *prof_tmp = lc->tmp1 + 1 + MAX_PB_SIZE;
+    const Mv *mv0 = mvf->mv + L0;
+    const Mv *mv1 = mvf->mv + L1;
+    const int mx0 = mv0->x & 0xf;
+    const int my0 = mv0->y & 0xf;
+    const int mx1 = mv1->x & 0xf;
+    const int my1 = mv1->y & 0xf;
+    const int x_off0 = x_off + (mv0->x >> 4);
+    const int y_off0 = y_off + (mv0->y >> 4);
+    const int x_off1 = x_off + (mv1->x >> 4);
+    const int y_off1 = y_off + (mv1->y >> 4);
+
+    const uint8_t *src0 = ref0->data[0] + y_off0 * src0_stride + (int)((unsigned)x_off0 << fc->ps.sps->pixel_shift);
+    const uint8_t *src1 = ref1->data[0] + y_off1 * src1_stride + (int)((unsigned)x_off1 << fc->ps.sps->pixel_shift);
+
+    int denom, w0, w1, o0, o1;
+    const int weight_flag = derive_weight(&denom, &w0, &w1, &o0, &o1, lc, mvf, LUMA, 0);
+
+    EMULATED_EDGE_LUMA(lc->edge_emu_buffer, &src0, &src0_stride, x_off0, y_off0);
+    EMULATED_EDGE_LUMA(lc->edge_emu_buffer2, &src1, &src1_stride, x_off1, y_off1);
+
+    if (!pu->cb_prof_flag[L0]) {
+        fc->vvcdsp.inter.put[LUMA][!!my0][!!mx0](lc->tmp, src0, src0_stride,
+            block_h, mx0, my0, block_w, 2, 2);
+    } else {
+        fc->vvcdsp.inter.put[LUMA][!!my0][!!mx0](prof_tmp, src0, src0_stride, AFFINE_MIN_BLOCK_SIZE, mx0, my0, AFFINE_MIN_BLOCK_SIZE, 2, 2);
+        fc->vvcdsp.inter.fetch_samples(prof_tmp, src0, src0_stride, mx0, my0);
+        fc->vvcdsp.inter.apply_prof(lc->tmp, prof_tmp, pu->diff_mv_x[L0], pu->diff_mv_y[L0]);
+    }
+    if (!pu->cb_prof_flag[L1]) {
+        if (weight_flag) {
+            fc->vvcdsp.inter.put_bi_w[LUMA][!!my1][!!mx1](dst, dst_stride, src1, src1_stride, lc->tmp,
+                block_h, denom, w0, w1, o0, o1, mx1, my1, block_w, 2, 2);
+        } else {
+            fc->vvcdsp.inter.put_bi[LUMA][!!my1][!!mx1](dst, dst_stride, src1, src1_stride, lc->tmp,
+                block_h, mx1, my1, block_w, 2, 2);
+        }
+    } else {
+        /* prof_tmp is reused here: the list 0 result already lives in lc->tmp */
+        fc->vvcdsp.inter.put[LUMA][!!my1][!!mx1](prof_tmp, src1, src1_stride, AFFINE_MIN_BLOCK_SIZE, mx1, my1, AFFINE_MIN_BLOCK_SIZE, 2, 2);
+        fc->vvcdsp.inter.fetch_samples(prof_tmp, src1, src1_stride, mx1, my1);
+        if (weight_flag) {
+            fc->vvcdsp.inter.apply_prof_bi_w(dst, dst_stride, lc->tmp, prof_tmp, pu->diff_mv_x[L1], pu->diff_mv_y[L1],
+                denom, w0, w1, o0, o1);
+        } else {
+            fc->vvcdsp.inter.apply_prof_bi(dst, dst_stride, lc->tmp, prof_tmp, pu->diff_mv_x[L1], pu->diff_mv_y[L1]);
+        }
+    }
+}
+
+/**
+ * Call ff_vvc_await_progress() on ref for the last row this block may read.
+ *
+ * The row is the integer part of mv->y plus y0 + height + 9, clamped to be
+ * non-negative.
+ *
+ * @param fc      unused
+ * @param ref     reference frame to wait on
+ * @param y0      top row of the block
+ * @param height  block height in rows
+ */
+static void vvc_await_progress(const VVCFrameContext *fc, VVCFrame *ref,
+    const Mv *mv, const int y0, const int height)
+{
+    //todo: check why we need magic number 9
+    const int y = FFMAX(0, (mv->y >> 4) + y0 + height + 9);
+
+    ff_vvc_await_progress(ref, y);
+}
+
+/**
+ * Resolve the reference frame(s) used by mv and wait for their progress.
+ *
+ * @param ref  out: ref[lx] is filled for every list flagged in mv->pred_flag;
+ *             entries of unused lists are left untouched
+ * @return 0 on success, AVERROR_INVALIDDATA if a needed reference is missing
+ */
+static int pred_await_progress(const VVCFrameContext *fc, VVCFrame *ref[2],
+    const MvField *mv, const int y0, const int height)
+{
+    for (int mask = PF_L0; mask <= PF_L1; mask++) {
+        if (mv->pred_flag & mask) {
+            const int lx = mask - PF_L0;
+            ref[lx] = fc->ref->refPicList[lx].ref[mv->ref_idx[lx]];
+            if (!ref[lx])
+                return AVERROR_INVALIDDATA;
+            vvc_await_progress(fc, ref[lx], mv->mv + lx, y0, height);
+        }
+    }
+    return 0;
+}
+ +#define POS(c_idx, x, y) \ + &fc->frame->data[c_idx][((y) >> fc->ps.sps->vshift[c_idx]) * fc->frame->linesize[c_idx] + \ + (((x) >> fc->ps.sps->hshift[c_idx]) << fc->ps.sps->pixel_shift)] + +static void pred_gpm_blk(VVCLocalContext *lc) +{ + const VVCFrameContext *fc = lc->fc; + const CodingUnit *cu = lc->cu; + const PredictionUnit *pu = &cu->pu; + + const uint8_t angle_idx = ff_vvc_gpm_angle_idx[pu->gpm_partition_idx]; + const uint8_t weights_idx = ff_vvc_gpm_angle_to_weights_idx[angle_idx]; + const int w = av_log2(cu->cb_width) - 3; + const int h = av_log2(cu->cb_height) - 3; + const uint8_t off_x = ff_vvc_gpm_weights_offset_x[pu->gpm_partition_idx][h][w]; + const uint8_t off_y = ff_vvc_gpm_weights_offset_y[pu->gpm_partition_idx][h][w]; + const uint8_t mirror_type = ff_vvc_gpm_angle_to_mirror[angle_idx]; + const uint8_t *weights; + + const int c_end = fc->ps.sps->chroma_format_idc ? 3 : 1; + + int16_t *tmp[2] = {lc->tmp, lc->tmp1}; + const ptrdiff_t tmp_stride = MAX_PB_SIZE; + + for (int c_idx = 0; c_idx < c_end; c_idx++) { + const int hs = fc->ps.sps->hshift[c_idx]; + const int vs = fc->ps.sps->vshift[c_idx]; + const int x = lc->cu->x0 >> hs; + const int y = lc->cu->y0 >> vs; + const int width = cu->cb_width >> hs; + const int height = cu->cb_height >> vs; + uint8_t *dst = POS(c_idx, lc->cu->x0, lc->cu->y0); + ptrdiff_t dst_stride = fc->frame->linesize[c_idx]; + + int step_x = 1 << hs; + int step_y = VVC_GPM_WEIGHT_SIZE << vs; + if (!mirror_type) { + weights = &ff_vvc_gpm_weights[weights_idx][off_y * VVC_GPM_WEIGHT_SIZE + off_x]; + } else if (mirror_type == 1) { + step_x = -step_x; + weights = &ff_vvc_gpm_weights[weights_idx][off_y * VVC_GPM_WEIGHT_SIZE + VVC_GPM_WEIGHT_SIZE - 1- off_x]; + } else { + step_y = -step_y; + weights = &ff_vvc_gpm_weights[weights_idx][(VVC_GPM_WEIGHT_SIZE - 1 - off_y) * VVC_GPM_WEIGHT_SIZE + off_x]; + } + + for (int i = 0; i < 2; i++) { + const MvField *mv = pu->gpm_mv + i; + const int lx = mv->pred_flag - PF_L0; + VVCFrame *ref = 
fc->ref->refPicList[lx].ref[mv->ref_idx[lx]]; + if (!ref) + return; + if (c_idx) { + chroma_mc(lc, tmp[i], ref->frame, mv->mv + lx, x, y, width, height, c_idx); + } else { + vvc_await_progress(fc, ref, mv->mv + lx, y, height); + luma_mc(lc, tmp[i], ref->frame, mv->mv + lx, x, y, width, height); + } + } + fc->vvcdsp.inter.put_gpm(dst, dst_stride, width, height, tmp[0], tmp[1], tmp_stride, weights, step_x, step_y); + } + return; +} + +static int ciip_derive_intra_weight(const VVCLocalContext *lc, const int x0, const int y0, + const int width, const int height) +{ + const VVCFrameContext *fc = lc->fc; + const VVCSPS *sps = fc->ps.sps; + const int x0b = av_mod_uintp2(x0, sps->ctb_log2_size_y); + const int y0b = av_mod_uintp2(y0, sps->ctb_log2_size_y); + const int available_l = lc->ctb_left_flag || x0b; + const int available_u = lc->ctb_up_flag || y0b; + const int min_pu_width = fc->ps.pps->min_pu_width; + + int w = 1; + + if (available_u &&fc->ref->tab_mvf[((y0 - 1) >> MIN_PU_LOG2) * min_pu_width + ((x0 - 1 + width)>> MIN_PU_LOG2)].pred_flag == PF_INTRA) + w++; + + if (available_l && fc->ref->tab_mvf[((y0 - 1 + height)>> MIN_PU_LOG2) * min_pu_width + ((x0 - 1) >> MIN_PU_LOG2)].pred_flag == PF_INTRA) + w++; + + return w; +} + +static void pred_regular_luma(VVCLocalContext *lc, const int hf_idx, const int vf_idx, const MvField *mv, + const int x0, const int y0, const int sbw, const int sbh, const MvField *orig_mv, const int dmvr_flag, const int sb_bdof_flag) +{ + const SliceContext *sc = lc->sc; + const VVCFrameContext *fc = lc->fc; + const int ciip_flag = lc->cu->ciip_flag; + uint8_t *dst = POS(0, x0, y0); + const ptrdiff_t dst_stride = fc->frame->linesize[0]; + uint8_t *inter = ciip_flag ? (uint8_t *)lc->ciip_tmp1 : dst; + const ptrdiff_t inter_stride = ciip_flag ? 
(MAX_PB_SIZE * sizeof(uint16_t)) : dst_stride; + VVCFrame *ref[2]; + + if (pred_await_progress(fc, ref, mv, y0, sbh) < 0) + return; + + if (mv->pred_flag != PF_BI) { + const int lx = mv->pred_flag - PF_L0; + luma_mc_uni(lc, inter, inter_stride, ref[lx]->frame, + mv, x0, y0, sbw, sbh, hf_idx, vf_idx); + } else { + luma_mc_bi(lc, inter, inter_stride, ref[0]->frame, + &mv->mv[0], x0, y0, sbw, sbh, ref[1]->frame, &mv->mv[1], mv, + hf_idx, vf_idx, orig_mv, dmvr_flag, sb_bdof_flag); + + } + + if (ciip_flag) { + const int intra_weight = ciip_derive_intra_weight(lc, x0, y0, sbw, sbh); + fc->vvcdsp.intra.intra_pred(lc, x0, y0, sbw, sbh, 0); + if (sc->sh.lmcs_used_flag) + fc->vvcdsp.lmcs.filter(inter, inter_stride, sbw, sbh, fc->ps.ph->lmcs_fwd_lut); + fc->vvcdsp.inter.put_ciip(dst, dst_stride, sbw, sbh, inter, inter_stride, intra_weight); + + } +} + +static void pred_regular_chroma(VVCLocalContext *lc, const MvField *mv, + const int x0, const int y0, const int sbw, const int sbh, const MvField *orig_mv, const int dmvr_flag) +{ + const VVCFrameContext *fc = lc->fc; + const int hs = fc->ps.sps->hshift[1]; + const int vs = fc->ps.sps->vshift[1]; + const int x0_c = x0 >> hs; + const int y0_c = y0 >> vs; + const int w_c = sbw >> hs; + const int h_c = sbh >> vs; + const int do_ciip = lc->cu->ciip_flag && (w_c > 2); + + uint8_t* dst1 = POS(1, x0, y0); + uint8_t* dst2 = POS(2, x0, y0); + const ptrdiff_t dst1_stride = fc->frame->linesize[1]; + const ptrdiff_t dst2_stride = fc->frame->linesize[2]; + + uint8_t *inter1 = do_ciip ? (uint8_t *)lc->ciip_tmp1 : dst1; + const ptrdiff_t inter1_stride = do_ciip ? (MAX_PB_SIZE * sizeof(uint16_t)) : dst1_stride; + + uint8_t *inter2 = do_ciip ? (uint8_t *)lc->ciip_tmp2 : dst2; + const ptrdiff_t inter2_stride = do_ciip ? 
(MAX_PB_SIZE * sizeof(uint16_t)) : dst2_stride; + + //fix me + const int hf_idx = 0; + const int vf_idx = 0; + if (mv->pred_flag != PF_BI) { + const int lx = mv->pred_flag - PF_L0; + VVCFrame* ref = fc->ref->refPicList[lx].ref[mv->ref_idx[lx]]; + if (!ref) + return; + chroma_mc_uni(lc, inter1, inter1_stride, ref->frame->data[1], ref->frame->linesize[1], + x0_c, y0_c, w_c, h_c, mv, CB, hf_idx, vf_idx); + chroma_mc_uni(lc, inter2, inter2_stride, ref->frame->data[2], ref->frame->linesize[2], + x0_c, y0_c, w_c, h_c, mv, CR, hf_idx, vf_idx); + } else { + VVCFrame* ref0 = fc->ref->refPicList[0].ref[mv->ref_idx[0]]; + VVCFrame* ref1 = fc->ref->refPicList[1].ref[mv->ref_idx[1]]; + if (!ref0 || !ref1) + return; + chroma_mc_bi(lc, inter1, inter1_stride, ref0->frame, ref1->frame, + x0_c, y0_c, w_c, h_c, mv, CB, hf_idx, vf_idx, orig_mv, dmvr_flag, lc->cu->ciip_flag); + + chroma_mc_bi(lc, inter2, inter2_stride, ref0->frame, ref1->frame, + x0_c, y0_c, w_c, h_c, mv, CR, hf_idx, vf_idx, orig_mv, dmvr_flag, lc->cu->ciip_flag); + + } + if (do_ciip) { + const int intra_weight = ciip_derive_intra_weight(lc, x0, y0, sbw, sbh); + fc->vvcdsp.intra.intra_pred(lc, x0, y0, sbw, sbh, 1); + fc->vvcdsp.intra.intra_pred(lc, x0, y0, sbw, sbh, 2); + fc->vvcdsp.inter.put_ciip(dst1, dst1_stride, w_c, h_c, inter1, inter1_stride, intra_weight); + fc->vvcdsp.inter.put_ciip(dst2, dst2_stride, w_c, h_c, inter2, inter2_stride, intra_weight); + + } +} + +// derive bdofFlag from 8.5.6 Decoding process for inter blocks +// derive dmvr from 8.5.1 General decoding process for coding units coded in inter prediction mode +static void derive_dmvr_bdof_flag(VVCLocalContext *lc, int *dmvr_flag, int *bdof_flag, const PredictionUnit* pu) +{ + const VVCFrameContext *fc = lc->fc; + const VVCPPS *pps = fc->ps.pps; + const VVCPH *ph = fc->ps.ph; + const VVCSH *sh = &lc->sc->sh; + const int poc = ph->poc; + const RefPicList *rpl0 = fc->ref->refPicList + L0; + const RefPicList *rpl1 = fc->ref->refPicList + L1; + const 
int8_t *ref_idx = pu->mi.ref_idx; + const MotionInfo *mi = &pu->mi; + const CodingUnit *cu = lc->cu; + const PredWeightTable *w = pps->wp_info_in_ph_flag ? &fc->ps.ph->pwt : &sh->pwt; + + *dmvr_flag = 0; + *bdof_flag = 0; + + if (mi->pred_flag == PF_BI && + (poc - rpl0->list[ref_idx[L0]] == rpl1->list[ref_idx[L1]] - poc) && + !rpl0->isLongTerm[ref_idx[L0]] && !rpl1->isLongTerm[ref_idx[L1]] && + !cu->ciip_flag && + !mi->bcw_idx && + !w->weight_flag[L0][LUMA][mi->ref_idx[L0]] && !w->weight_flag[L1][LUMA][mi->ref_idx[L1]] && + !w->weight_flag[L0][CHROMA][mi->ref_idx[L0]] && !w->weight_flag[L1][CHROMA][mi->ref_idx[L1]] && + cu->cb_width >= 8 && cu->cb_height >= 8 && + (cu->cb_width * cu->cb_height >= 128)) { + // fixme: for RprConstraintsActiveFlag + if (!ph->bdof_disabled_flag && + mi->motion_model_idc == MOTION_TRANSLATION && + !pu->merge_subblock_flag && + !pu->sym_mvd_flag) + *bdof_flag = 1; + if (!ph->dmvr_disabled_flag && + pu->general_merge_flag && + !pu->mmvd_merge_flag) + *dmvr_flag = 1; + + } +} + /* Copy the MVs recorded in fc->tab.dmvr into fc->ref->tab_mvf for one CTU: the area starting at (x0, y0), ctb_size_y wide and high and clipped to pps->width / pps->height, is walked in MIN_PU_SIZE steps. Entries with dmvr_enabled == 0 are skipped, and only the lists flagged in pred_flag (PF_L0 / PF_L1) are overwritten. */ +void ff_vvc_ctu_apply_dmvr_info(VVCFrameContext *fc, const int x0, const int y0) +{ + const VVCPPS *pps = fc->ps.pps; + const int ctb_size = fc->ps.sps->ctb_size_y; + const int x_end = FFMIN(x0 + ctb_size, pps->width); + const int y_end = FFMIN(y0 + ctb_size, pps->height); + + for (int y = y0; y < y_end; y += MIN_PU_SIZE) { + for (int x = x0; x < x_end; x += MIN_PU_SIZE) { + const int off = pps->min_pu_width * (y >> MIN_PU_LOG2) + (x >> MIN_PU_LOG2); + const DMVRInfo *di = &fc->tab.dmvr[off]; + if (di->dmvr_enabled) { + MvField *mvf = &fc->ref->tab_mvf[off]; + if (mvf->pred_flag & PF_L0) + mvf->mv[L0] = di->mv[L0]; + if (mvf->pred_flag & PF_L1) + mvf->mv[L1] = di->mv[L1]; + } + } + } +} + +// 8.5.3.5 Parametric motion vector refinement process +static int parametric_mv_refine(const int *sad, const int stride) +{ + const int sad_minus = sad[-stride]; + const int sad_center = sad[0]; + const int sad_plus = sad[stride]; + int dmvc; + int denom = (( sad_minus +
sad_plus) - (sad_center << 1 ) ) << 3; + if (!denom) + dmvc = 0; + else { + if (sad_minus == sad_center) + dmvc = -8; + else if (sad_plus == sad_center) + dmvc = 8; + else { + int num = ( sad_minus - sad_plus ) << 4; + int sign_num = 0; + int quotient = 0; + int counter = 3; + if (num < 0 ) { + num = - num; + sign_num = 1; + } + while (counter > 0) { + counter = counter - 1; + quotient = quotient << 1; + if ( num >= denom ) { + num = num - denom; + quotient = quotient + 1; + } + denom = (denom >> 1); + } + if (sign_num == 1 ) + dmvc = -quotient; + else + dmvc = quotient; + } + } + return dmvc; +} + +#define SAD_ARRAY_SIZE 5 +//8.5.3 Decoder-side motion vector refinement process +static void dmvr_mv_refine(VVCLocalContext *lc, MvField *mv, MvField *orig_mv, int *sb_bdof_flag, + const AVFrame *ref0, const AVFrame *ref1, const int x_off, const int y_off, const int block_w, const int block_h) +{ + const VVCFrameContext *fc = lc->fc; + ptrdiff_t src0_stride = ref0->linesize[0]; + ptrdiff_t src1_stride = ref1->linesize[0]; + Mv *mv0 = mv->mv + L0; + Mv *mv1 = mv->mv + L1; + const int sr_range = 2; + const int mx0 = mv0->x & 0xf; + const int my0 = mv0->y & 0xf; + const int mx1 = mv1->x & 0xf; + const int my1 = mv1->y & 0xf; + const int x_off0 = x_off + (mv0->x >> 4) - sr_range; + const int y_off0 = y_off + (mv0->y >> 4) - sr_range; + const int x_off1 = x_off + (mv1->x >> 4) - sr_range; + const int y_off1 = y_off + (mv1->y >> 4) - sr_range; + const int pred_w = block_w + 2 * sr_range; + const int pred_h = block_h + 2 * sr_range; + + uint8_t *src0 = ref0->data[0] + y_off0 * src0_stride + (int)((unsigned)x_off0 << fc->ps.sps->pixel_shift); + uint8_t *src1 = ref1->data[0] + y_off1 * src1_stride + (int)((unsigned)x_off1 << fc->ps.sps->pixel_shift); + + int sad[SAD_ARRAY_SIZE][SAD_ARRAY_SIZE]; + int min_dx, min_dy, min_sad, dx, dy; + + *orig_mv = *mv; + min_dx = min_dy = dx = dy = 2; + + EMULATED_EDGE_BILINEAR(lc->edge_emu_buffer, &src0, &src0_stride, x_off0, y_off0); + 
EMULATED_EDGE_BILINEAR(lc->edge_emu_buffer2, &src1, &src1_stride, x_off1, y_off1); + fc->vvcdsp.inter.dmvr[!!my0][!!mx0](lc->tmp, src0, src0_stride, pred_h, mx0, my0, pred_w); + fc->vvcdsp.inter.dmvr[!!my1][!!mx1](lc->tmp1, src1, src1_stride, pred_h, mx1, my1, pred_w); + + min_sad = fc->vvcdsp.inter.sad(lc->tmp, lc->tmp1, dx, dy, block_w, block_h); + min_sad -= min_sad >> 2; + sad[dy][dx] = min_sad; + + if (min_sad >= block_w * block_h) { + int dmv[2]; + // 8.5.3.4 Array entry selection process + for (dy = 0; dy < SAD_ARRAY_SIZE; dy++) { + for (dx = 0; dx < SAD_ARRAY_SIZE; dx++) { + if (dx != sr_range || dy != sr_range) { + sad[dy][dx] = fc->vvcdsp.inter.sad(lc->tmp, lc->tmp1, dx, dy, block_w, block_h); + if (sad[dy][dx] < min_sad) { + min_sad = sad[dy][dx]; + min_dx = dx; + min_dy = dy; + } + } + } + } + dmv[0] = (min_dx - sr_range) << 4; + dmv[1] = (min_dy - sr_range) << 4; + if (min_dx != 0 && min_dx != 4 && min_dy != 0 && min_dy != 4) { + dmv[0] += parametric_mv_refine(&sad[min_dy][min_dx], 1); + dmv[1] += parametric_mv_refine(&sad[min_dy][min_dx], SAD_ARRAY_SIZE); + } + mv0->x += dmv[0]; + mv0->y += dmv[1]; + mv1->x += -dmv[0]; + mv1->y += -dmv[1]; + ff_vvc_clip_mv(mv0); + ff_vvc_clip_mv(mv1); + } + if (min_sad < 2 * block_w * block_h) { + *sb_bdof_flag = 0; + } +} + +static void set_dmvr_info(VVCFrameContext *fc, const int x0, const int y0, + const int width, const int height, const MvField *mvf) +{ + const VVCPPS *pps = fc->ps.pps; + + for (int y = y0; y < y0 + height; y += MIN_PU_SIZE) { + for (int x = x0; x < x0 + width; x += MIN_PU_SIZE) { + DMVRInfo *di = &fc->tab.dmvr[pps->min_pu_width * (y >> MIN_PU_LOG2) + (x >> MIN_PU_LOG2)]; + di->dmvr_enabled = 1; + if (mvf->pred_flag & PF_L0) + di->mv[L0] = mvf->mv[L0]; + if (mvf->pred_flag & PF_L1) + di->mv[L1] = mvf->mv[L1]; + } + } +} + +static void derive_sb_mv(VVCLocalContext *lc, MvField *mv, MvField *orig_mv, int *sb_bdof_flag, + const int x0, const int y0, const int sbw, const int sbh, const int dmvr_flag, 
const int bdof_flag) +{ + VVCFrameContext *fc = lc->fc; + + *orig_mv = *mv = *ff_vvc_get_mvf(fc, x0, y0); + if (bdof_flag) + *sb_bdof_flag = 1; + if (dmvr_flag) { + VVCFrame* ref[2]; + if (pred_await_progress(fc, ref, mv, y0, sbh) < 0) + return; + dmvr_mv_refine(lc, mv, orig_mv, sb_bdof_flag, ref[0]->frame, ref[1]->frame, x0, y0, sbw, sbh); + set_dmvr_info(fc, x0, y0, sbw, sbh, mv); + } +} + +static void pred_regular_blk(VVCLocalContext *lc, const int skip_ciip) +{ + const VVCFrameContext *fc = lc->fc; + const CodingUnit *cu = lc->cu; + const PredictionUnit *pu = &cu->pu; + const MotionInfo *mi = &pu->mi; + MvField mv, orig_mv; + int sbw, sbh, num_sb_x, num_sb_y, sb_bdof_flag = 0; + int dmvr_flag, bdof_flag; + + if (cu->ciip_flag && skip_ciip) + return; + + derive_dmvr_bdof_flag(lc, &dmvr_flag, &bdof_flag, pu); + num_sb_x = mi->num_sb_x; + num_sb_y = mi->num_sb_y; + if (dmvr_flag || bdof_flag) { + num_sb_x = (cu->cb_width > 16) ? (cu->cb_width >> 4) : 1; + num_sb_y = (cu->cb_height > 16) ? 
(cu->cb_height >> 4) : 1; + } + sbw = cu->cb_width / num_sb_x; + sbh = cu->cb_height / num_sb_y; + + for (int sby = 0; sby < num_sb_y; sby++) { + for (int sbx = 0; sbx < num_sb_x; sbx++) { + const int x0 = cu->x0 + sbx * sbw; + const int y0 = cu->y0 + sby * sbh; + + if (cu->ciip_flag) + ff_vvc_set_neighbour_available(lc, x0, y0, sbw, sbh); + + derive_sb_mv(lc, &mv, &orig_mv, &sb_bdof_flag, x0, y0, sbw, sbh, dmvr_flag, bdof_flag); + pred_regular_luma(lc, mi->hpel_if_idx, mi->hpel_if_idx, &mv, x0, y0, sbw, sbh, &orig_mv, dmvr_flag, sb_bdof_flag); + if (fc->ps.sps->chroma_format_idc) + pred_regular_chroma(lc, &mv, x0, y0, sbw, sbh, &orig_mv, dmvr_flag); + } + } +} + /* Derive the chroma MV field for an affine sub-block: *mvc starts as a copy of *mv, then the MVs stored at (x0 + hs * sbw, y0 + vs * sbh) are added for both lists. That position is the sub-block itself when the chroma shift is 0 and the diagonal neighbour when it is 1. Each sum is passed through ff_vvc_round_mv(…, 0, 1), presumably a rounded shift by one, i.e. the average; confirm against ff_vvc_round_mv. */ +static void derive_affine_mvc(MvField *mvc, const VVCFrameContext *fc, const MvField *mv, + const int x0, const int y0, const int sbw, const int sbh) +{ + const int hs = fc->ps.sps->hshift[1]; + const int vs = fc->ps.sps->vshift[1]; + const MvField* mv2 = ff_vvc_get_mvf(fc, x0 + hs * sbw, y0 + vs * sbh); + *mvc = *mv; + mvc->mv[0].x += mv2->mv[0].x; + mvc->mv[0].y += mv2->mv[0].y; + mvc->mv[1].x += mv2->mv[1].x; + mvc->mv[1].y += mv2->mv[1].y; + ff_vvc_round_mv(mvc->mv + 0, 0, 1); + ff_vvc_round_mv(mvc->mv + 1, 0, 1); +} + +static void pred_affine_blk(VVCLocalContext *lc) +{ + const VVCFrameContext *fc = lc->fc; + const CodingUnit *cu = lc->cu; + const PredictionUnit *pu = &cu->pu; + const MotionInfo *mi = &pu->mi; + const int x0 = cu->x0; + const int y0 = cu->y0; + const int sbw = cu->cb_width / mi->num_sb_x; + const int sbh = cu->cb_height / mi->num_sb_y; + const int hs = fc->ps.sps->hshift[1]; + const int vs = fc->ps.sps->vshift[1]; + + for (int sby = 0; sby < mi->num_sb_y; sby++) { + for (int sbx = 0; sbx < mi->num_sb_x; sbx++) { + const int x = x0 + sbx * sbw; + const int y = y0 + sby * sbh; + + uint8_t *dst0 = POS(0, x, y); + const MvField *mv = ff_vvc_get_mvf(fc, x, y); + VVCFrame *ref[2]; + + if (pred_await_progress(fc, ref, mv, y, sbh) < 0) + return; + + if (mi->pred_flag != PF_BI) { + const int
lx = mi->pred_flag - PF_L0; + luma_prof_uni(lc, dst0, fc->frame->linesize[0], ref[lx]->frame, + mv, x, y, sbw, sbh, pu->cb_prof_flag[lx], + pu->diff_mv_x[lx], pu->diff_mv_y[lx]); + } else { + luma_prof_bi(lc, dst0, fc->frame->linesize[0], ref[0]->frame, ref[1]->frame, + mv, x, y, sbw, sbh); + } + if (fc->ps.sps->chroma_format_idc) { + if (!av_mod_uintp2(sby, vs) && !av_mod_uintp2(sbx, hs)) { + MvField mvc; + derive_affine_mvc(&mvc, fc, mv, x, y, sbw, sbh); + pred_regular_chroma(lc, &mvc, x, y, sbw<<hs, sbh<<vs, NULL, 0); + + } + } + + } + } +} + /* Inter prediction for lc->cu: dispatches to the GPM, affine or regular path according to the PU flags. The regular path is called with skip_ciip = 1, so CIIP blocks are left for ff_vvc_predict_ciip(). When LMCS is in use and the CU is not CIIP, the predicted luma block is mapped in place through lmcs_fwd_lut. */ +static void predict_inter(VVCLocalContext *lc) +{ + const VVCFrameContext *fc = lc->fc; + const CodingUnit *cu = lc->cu; + const PredictionUnit *pu = &cu->pu; + + if (pu->merge_gpm_flag) + pred_gpm_blk(lc); + else if (pu->inter_affine_flag) + pred_affine_blk(lc); + else + pred_regular_blk(lc, 1); //intra block is not ready yet, skip ciip + if (lc->sc->sh.lmcs_used_flag && !cu->ciip_flag) { + uint8_t* dst0 = POS(0, cu->x0, cu->y0); + fc->vvcdsp.lmcs.filter(dst0, fc->frame->linesize[LUMA], cu->cb_width, cu->cb_height, fc->ps.ph->lmcs_fwd_lut); + } +} + /* Walk the CU list of CTU number rs (fc->tab.ctus[rs].cus, linked through ->next) and call predict_inter() for every CU that is neither MODE_INTRA nor MODE_PLT and whose tree_type is not DUAL_TREE_CHROMA. lc->cu is set to each visited CU. Always returns 0. */ +int ff_vvc_predict_inter(VVCLocalContext *lc, const int rs) +{ + const VVCFrameContext *fc = lc->fc; + const CTU *ctu = fc->tab.ctus + rs; + CodingUnit *cu = ctu->cus; + + while (cu) { + lc->cu = cu; + if (cu->pred_mode != MODE_INTRA && cu->pred_mode != MODE_PLT && cu->tree_type != DUAL_TREE_CHROMA) + predict_inter(lc); + cu = cu->next; + } + + return 0; +} + /* CIIP entry point: asserts that lc->cu has ciip_flag set and runs pred_regular_blk() with skip_ciip = 0. */ +void ff_vvc_predict_ciip(VVCLocalContext *lc) +{ + av_assert0(lc->cu->ciip_flag); + + //todo: refactor ciip out of pred_regular_blk + pred_regular_blk(lc, 0); +} + +#undef POS diff --git a/libavcodec/vvc/vvc_inter.h b/libavcodec/vvc/vvc_inter.h new file mode 100644 index 0000000000..03422fbcfe --- /dev/null +++ b/libavcodec/vvc/vvc_inter.h @@ -0,0 +1,50 @@ +/* + * VVC inter prediction + * + * Copyright (C) 2023 Nuo Mi + * + * This file is part of FFmpeg.
+ * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#ifndef AVCODEC_VVC_INTER_H +#define AVCODEC_VVC_INTER_H + +#include "vvc_ctu.h" + +/** + * Loop over the entire CTU to predict all inter coding blocks + * @param lc local context for CTU + * @param rs raster-scan index of the CTU + * @return 0; the current implementation has no failure path + */ +int ff_vvc_predict_inter(VVCLocalContext *lc, int rs); + +/** + * CIIP(Combined Inter-Intra Prediction) for a coding block + * @param lc local context for CTU + */ +void ff_vvc_predict_ciip(VVCLocalContext *lc); + +/** + * Apply the stored DMVR(Decoder-Side Motion Vector Refinement) motion vectors to the CTU + * @param fc frame context + * @param x0 x position for the CTU + * @param y0 y position for the CTU + */ +void ff_vvc_ctu_apply_dmvr_info(VVCFrameContext *fc, int x0, const int y0); + +#endif // AVCODEC_VVC_INTER_H diff --git a/libavcodec/vvc/vvc_inter_template.c b/libavcodec/vvc/vvc_inter_template.c new file mode 100644 index 0000000000..6c4b353b1c --- /dev/null +++ b/libavcodec/vvc/vvc_inter_template.c @@ -0,0 +1,1510 @@ +/* + * VVC inter prediction DSP + * + * Copyright (C) 2022 Nuo Mi + * + * This file is part of FFmpeg.
+ * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + /* put_vvc_pel_pixels: widen each source sample to the 14-bit intermediate (src << (14 - BIT_DEPTH)) and store it in dst, whose rows are MAX_PB_SIZE apart; mx, my, hf_idx and vf_idx are not used. */ +//////////////////////////////////////////////////////////////////////////////// +// +//////////////////////////////////////////////////////////////////////////////// +static void FUNC(put_vvc_pel_pixels)(int16_t *dst, + const uint8_t *_src, const ptrdiff_t _src_stride, + const int height, const intptr_t mx, const intptr_t my, const int width, + const int hf_idx, const int vf_idx) +{ + const pixel *src = (const pixel *)_src; + const ptrdiff_t src_stride = _src_stride / sizeof(pixel); + + for (int y = 0; y < height; y++) { + for (int x = 0; x < width; x++) + dst[x] = src[x] << (14 - BIT_DEPTH); + src += src_stride; + dst += MAX_PB_SIZE; + } +} + /* put_vvc_pel_uni_pixels: row-by-row memcpy of width pixels from src to dst; the byte strides are converted to pixel units first, and mx, my, hf_idx and vf_idx are not used. */ +static void FUNC(put_vvc_pel_uni_pixels)(uint8_t *_dst, const ptrdiff_t _dst_stride, + const uint8_t *_src, const ptrdiff_t _src_stride, const int height, + const intptr_t mx, const intptr_t my, const int width, const int hf_idx, const int vf_idx) +{ + const pixel *src = (const pixel *)_src; + const ptrdiff_t src_stride = _src_stride / sizeof(pixel); + pixel *dst = (pixel *)_dst; + const ptrdiff_t dst_stride = _dst_stride / sizeof(pixel); + + for (int y = 0; y < height; y++) { + memcpy(dst, src, width * sizeof(pixel)); + src += src_stride; + dst += dst_stride; + } +} + +static void
FUNC(put_vvc_pel_bi_pixels)(uint8_t *_dst, const ptrdiff_t _dst_stride, + const uint8_t *_src, const ptrdiff_t _src_stride, const int16_t *src0, + const int height, const intptr_t mx, const intptr_t my, const int width, + const int hf_idx, const int vf_idx) +{ + const pixel *src = (const pixel *)_src; + const ptrdiff_t src_stride = _src_stride / sizeof(pixel); + pixel *dst = (pixel *)_dst; + const ptrdiff_t dst_stride = _dst_stride / sizeof(pixel); + + const int shift = 14 + 1 - BIT_DEPTH; +#if BIT_DEPTH < 14 + const int offset = 1 << (shift - 1); +#else + const int offset = 0; +#endif + + for (int y = 0; y < height; y++) { + for (int x = 0; x < width; x++) + dst[x] = av_clip_pixel(((src[x] << (14 - BIT_DEPTH)) + src0[x] + offset) >> shift); + src += src_stride; + dst += dst_stride; + src0 += MAX_PB_SIZE; + } +} + /* put_vvc_pel_uni_w_pixels: weighted uni-prediction from full-sample positions, dst = clip((((src << (14 - BIT_DEPTH)) * wx + offset) >> shift) + ox) with shift = denom + 14 - BIT_DEPTH and ox = _ox scaled by 1 << (BIT_DEPTH - 8). The source is only read, so it is accessed through a const pointer like in the sibling functions. */ +static void FUNC(put_vvc_pel_uni_w_pixels)(uint8_t *_dst, const ptrdiff_t _dst_stride, + const uint8_t *_src, const ptrdiff_t _src_stride, const int height, + const int denom, const int wx, const int _ox, const intptr_t mx, const intptr_t my, + const int width, const int hf_idx, const int vf_idx) +{ + int x, y; + const pixel *src = (const pixel *)_src; + const ptrdiff_t src_stride = _src_stride / sizeof(pixel); + pixel *dst = (pixel *)_dst; + const ptrdiff_t dst_stride = _dst_stride / sizeof(pixel); + const int shift = denom + 14 - BIT_DEPTH; +#if BIT_DEPTH < 14 + const int offset = 1 << (shift - 1); +#else + const int offset = 0; +#endif + const int ox = _ox * (1 << (BIT_DEPTH - 8)); + + for (y = 0; y < height; y++) { + for (x = 0; x < width; x++) { + const int v = (src[x] << (14 - BIT_DEPTH)); + dst[x] = av_clip_pixel(((v * wx + offset) >> shift) + ox); + } + src += src_stride; + dst += dst_stride; + } +} + +static void FUNC(put_vvc_pel_bi_w_pixels)(uint8_t *_dst, const ptrdiff_t _dst_stride, + const uint8_t *_src, const ptrdiff_t _src_stride, const int16_t *src0, + const int height, const int denom, const int wx0, const int wx1, + int ox0, int ox1, const intptr_t mx, const intptr_t my, const
int width, + const int hf_idx, const int vf_idx) +{ + const pixel *src = (const pixel *)_src; + const ptrdiff_t src_stride = _src_stride / sizeof(pixel); + pixel *dst = (pixel *)_dst; + const ptrdiff_t dst_stride = _dst_stride / sizeof(pixel); + + const int shift = 14 + 1 - BIT_DEPTH; + const int log2Wd = denom + shift - 1; + + ox0 = ox0 * (1 << (BIT_DEPTH - 8)); + ox1 = ox1 * (1 << (BIT_DEPTH - 8)); + for (int y = 0; y < height; y++) { + for (int x = 0; x < width; x++) + dst[x] = av_clip_pixel(( (src[x] << (14 - BIT_DEPTH)) * wx1 + src0[x] * wx0 + (ox0 + ox1 + 1) * (1 << log2Wd)) >> (log2Wd + 1)); + src += src_stride; + dst += dst_stride; + src0 += MAX_PB_SIZE; + } +} + +//////////////////////////////////////////////////////////////////////////////// +// +//////////////////////////////////////////////////////////////////////////////// +#define LUMA_FILTER(src, stride) \ + (filter[0] * src[x - 3 * stride] + \ + filter[1] * src[x - 2 * stride] + \ + filter[2] * src[x - stride] + \ + filter[3] * src[x ] + \ + filter[4] * src[x + stride] + \ + filter[5] * src[x + 2 * stride] + \ + filter[6] * src[x + 3 * stride] + \ + filter[7] * src[x + 4 * stride]) + +static void FUNC(put_vvc_luma_h)(int16_t *dst, const uint8_t *_src, const ptrdiff_t _src_stride, + const int height, const intptr_t mx, const intptr_t my, const int width, + const int hf_idx, const int vf_idx) +{ + const pixel *src = (const pixel*)_src; + const ptrdiff_t src_stride = _src_stride / sizeof(pixel); + const int8_t *filter = ff_vvc_luma_filters[hf_idx][mx]; + for (int y = 0; y < height; y++) { + for (int x = 0; x < width; x++) + dst[x] = LUMA_FILTER(src, 1) >> (BIT_DEPTH - 8); + src += src_stride; + dst += MAX_PB_SIZE; + } +} + +static void FUNC(put_vvc_luma_v)(int16_t *dst, const uint8_t *_src, const ptrdiff_t _src_stride, + const int height, const intptr_t mx, const intptr_t my, const int width, + const int hf_idx, const int vf_idx) +{ + const pixel *src = (pixel*)_src; + const ptrdiff_t src_stride = 
_src_stride / sizeof(pixel); + const int8_t *filter = ff_vvc_luma_filters[vf_idx][my]; + for (int y = 0; y < height; y++) { + for (int x = 0; x < width; x++) + dst[x] = LUMA_FILTER(src, src_stride) >> (BIT_DEPTH - 8); + src += src_stride; + dst += MAX_PB_SIZE; + } +} + +static void FUNC(put_vvc_luma_hv)(int16_t *dst, const uint8_t *_src, const ptrdiff_t _src_stride, + const int height, const intptr_t mx, const intptr_t my, const int width, + const int hf_idx, const int vf_idx) +{ + int x, y; + const int8_t *filter; + const pixel *src = (const pixel*)_src; + const ptrdiff_t src_stride = _src_stride / sizeof(pixel); + int16_t tmp_array[(MAX_PB_SIZE + LUMA_EXTRA) * MAX_PB_SIZE]; + int16_t *tmp = tmp_array; + + src -= LUMA_EXTRA_BEFORE * src_stride; + filter = ff_vvc_luma_filters[hf_idx][mx]; + for (y = 0; y < height + LUMA_EXTRA; y++) { + for (x = 0; x < width; x++) + tmp[x] = LUMA_FILTER(src, 1) >> (BIT_DEPTH - 8); + src += src_stride; + tmp += MAX_PB_SIZE; + } + + tmp = tmp_array + LUMA_EXTRA_BEFORE * MAX_PB_SIZE; + filter = ff_vvc_luma_filters[vf_idx][my]; + for (y = 0; y < height; y++) { + for (x = 0; x < width; x++) + dst[x] = LUMA_FILTER(tmp, MAX_PB_SIZE) >> 6; + tmp += MAX_PB_SIZE; + dst += MAX_PB_SIZE; + } +} + +static void FUNC(put_vvc_luma_uni_h)(uint8_t *_dst, const ptrdiff_t _dst_stride, + const uint8_t *_src, const ptrdiff_t _src_stride, + const int height, const intptr_t mx, const intptr_t my, const int width, + const int hf_idx, const int vf_idx) +{ + const pixel *src = (const pixel*)_src; + const ptrdiff_t src_stride = _src_stride / sizeof(pixel); + pixel *dst = (pixel *)_dst; + const ptrdiff_t dst_stride = _dst_stride / sizeof(pixel); + const int8_t *filter = ff_vvc_luma_filters[hf_idx][mx]; + const int shift = 14 - BIT_DEPTH; + +#if BIT_DEPTH < 14 + const int offset = 1 << (shift - 1); +#else + const int offset = 0; +#endif + + for (int y = 0; y < height; y++) { + for (int x = 0; x < width; x++) { + const int val = LUMA_FILTER(src, 1) >> (BIT_DEPTH - 
8); + dst[x] = av_clip_pixel((val + offset) >> shift); + } + src += src_stride; + dst += dst_stride; + } +} + +static void FUNC(put_vvc_luma_bi_h)(uint8_t *_dst, const ptrdiff_t _dst_stride, + const uint8_t *_src, const ptrdiff_t _src_stride, const int16_t *src0, + const int height, const intptr_t mx, const intptr_t my, const int width, + const int hf_idx, const int vf_idx) +{ + pixel *src = (pixel*)_src; + ptrdiff_t src_stride = _src_stride / sizeof(pixel); + pixel *dst = (pixel *)_dst; + ptrdiff_t dst_stride = _dst_stride / sizeof(pixel); + + const int8_t *filter = ff_vvc_luma_filters[hf_idx][mx]; + + const int shift = 14 + 1 - BIT_DEPTH; +#if BIT_DEPTH < 14 + int offset = 1 << (shift - 1); +#else + int offset = 0; +#endif + + for (int y = 0; y < height; y++) { + for (int x = 0; x < width; x++) + dst[x] = av_clip_pixel(((LUMA_FILTER(src, 1) >> (BIT_DEPTH - 8)) + src0[x] + offset) >> shift); + src += src_stride; + dst += dst_stride; + src0 += MAX_PB_SIZE; + } +} + +static void FUNC(put_vvc_luma_uni_v)(uint8_t *_dst, const ptrdiff_t _dst_stride, + const uint8_t *_src, const ptrdiff_t _src_stride, + const int height, const intptr_t mx, const intptr_t my, const int width, + const int hf_idx, const int vf_idx) +{ + + const pixel *src = (const pixel*)_src; + const ptrdiff_t src_stride = _src_stride / sizeof(pixel); + pixel *dst = (pixel *)_dst; + const ptrdiff_t dst_stride = _dst_stride / sizeof(pixel); + const int8_t *filter = ff_vvc_luma_filters[vf_idx][my]; + const int shift = 14 - BIT_DEPTH; + +#if BIT_DEPTH < 14 + const int offset = 1 << (shift - 1); +#else + const int offset = 0; +#endif + for (int y = 0; y < height; y++) { + for (int x = 0; x < width; x++) { + const int val = LUMA_FILTER(src, src_stride) >> (BIT_DEPTH - 8); + dst[x] = av_clip_pixel((val + offset) >> shift); + } + src += src_stride; + dst += dst_stride; + } +} + +static void FUNC(put_vvc_luma_bi_v)(uint8_t *_dst, const ptrdiff_t _dst_stride, + const uint8_t *_src, const ptrdiff_t _src_stride, 
const int16_t *src0, + const int height, const intptr_t mx, const intptr_t my, const int width, + const int hf_idx, const int vf_idx) +{ + const pixel *src = (pixel*)_src; + const ptrdiff_t src_stride = _src_stride / sizeof(pixel); + pixel *dst = (pixel *)_dst; + const ptrdiff_t dst_stride = _dst_stride / sizeof(pixel); + + const int8_t *filter = ff_vvc_luma_filters[vf_idx][my]; + + const int shift = 14 + 1 - BIT_DEPTH; +#if BIT_DEPTH < 14 + const int offset = 1 << (shift - 1); +#else + const int offset = 0; +#endif + + for (int y = 0; y < height; y++) { + for (int x = 0; x < width; x++) + dst[x] = av_clip_pixel(((LUMA_FILTER(src, src_stride) >> (BIT_DEPTH - 8)) + src0[x] + offset) >> shift); + src += src_stride; + dst += dst_stride; + src0 += MAX_PB_SIZE; + } +} + +static void FUNC(put_vvc_luma_uni_hv)(uint8_t *_dst, const ptrdiff_t _dst_stride, + const uint8_t *_src, const ptrdiff_t _src_stride, + const int height, const intptr_t mx, const intptr_t my, const int width, + const int hf_idx, const int vf_idx) +{ + int x, y; + const int8_t *filter; + const pixel *src = (const pixel*)_src; + const ptrdiff_t src_stride = _src_stride / sizeof(pixel); + pixel *dst = (pixel *)_dst; + const ptrdiff_t dst_stride = _dst_stride / sizeof(pixel); + int16_t tmp_array[(MAX_PB_SIZE + LUMA_EXTRA) * MAX_PB_SIZE]; + int16_t *tmp = tmp_array; + const int shift = 14 - BIT_DEPTH; + +#if BIT_DEPTH < 14 + const int offset = 1 << (shift - 1); +#else + const int offset = 0; +#endif + src -= LUMA_EXTRA_BEFORE * src_stride; + filter = ff_vvc_luma_filters[hf_idx][mx]; + for (y = 0; y < height + LUMA_EXTRA; y++) { + for (x = 0; x < width; x++) + tmp[x] = LUMA_FILTER(src, 1) >> (BIT_DEPTH - 8); + src += src_stride; + tmp += MAX_PB_SIZE; + } + + tmp = tmp_array + LUMA_EXTRA_BEFORE * MAX_PB_SIZE; + filter = ff_vvc_luma_filters[vf_idx][my]; + + for (y = 0; y < height; y++) { + for (x = 0; x < width; x++) { + int val = LUMA_FILTER(tmp, MAX_PB_SIZE) >> 6; + dst[x] = av_clip_pixel((val + offset) >> 
shift); + + + } + tmp += MAX_PB_SIZE; + dst += dst_stride; + } + +} + +static void FUNC(put_vvc_luma_bi_hv)(uint8_t *_dst, const ptrdiff_t _dst_stride, + const uint8_t *_src, const ptrdiff_t _src_stride, const int16_t *src0, + const int height, const intptr_t mx, const intptr_t my, const int width, + const int hf_idx, const int vf_idx) +{ + int x, y; + const int8_t *filter; + const pixel *src = (const pixel*)_src; + const ptrdiff_t src_stride = _src_stride / sizeof(pixel); + pixel *dst = (pixel *)_dst; + const ptrdiff_t dst_stride = _dst_stride / sizeof(pixel); + int16_t tmp_array[(MAX_PB_SIZE + LUMA_EXTRA) * MAX_PB_SIZE]; + int16_t *tmp = tmp_array; + const int shift = 14 + 1 - BIT_DEPTH; +#if BIT_DEPTH < 14 + const int offset = 1 << (shift - 1); +#else + const int offset = 0; +#endif + + src -= LUMA_EXTRA_BEFORE * src_stride; + filter = ff_vvc_luma_filters[hf_idx][mx]; + for (y = 0; y < height + LUMA_EXTRA; y++) { + for (x = 0; x < width; x++) + tmp[x] = LUMA_FILTER(src, 1) >> (BIT_DEPTH - 8); + src += src_stride; + tmp += MAX_PB_SIZE; + } + + tmp = tmp_array + LUMA_EXTRA_BEFORE * MAX_PB_SIZE; + filter = ff_vvc_luma_filters[vf_idx][my]; + + for (y = 0; y < height; y++) { + for (x = 0; x < width; x++) + dst[x] = av_clip_pixel(((LUMA_FILTER(tmp, MAX_PB_SIZE) >> 6) + src0[x] + offset) >> shift); + tmp += MAX_PB_SIZE; + dst += dst_stride; + src0 += MAX_PB_SIZE; + } +} + +static void FUNC(put_vvc_luma_uni_w_h)(uint8_t *_dst, ptrdiff_t _dst_stride, + const uint8_t *_src, ptrdiff_t _src_stride, int height, int denom, int wx, int ox, + intptr_t mx, intptr_t my, int width, int hf_idx, int vf_idx) +{ + int x, y; + pixel *src = (pixel*)_src; + ptrdiff_t src_stride = _src_stride / sizeof(pixel); + pixel *dst = (pixel *)_dst; + ptrdiff_t dst_stride = _dst_stride / sizeof(pixel); + const int8_t *filter = ff_vvc_luma_filters[hf_idx][mx]; + int shift = denom + 14 - BIT_DEPTH; +#if BIT_DEPTH < 14 + int offset = 1 << (shift - 1); +#else + int offset = 0; +#endif + + ox = ox * (1 
<< (BIT_DEPTH - 8)); + for (y = 0; y < height; y++) { + for (x = 0; x < width; x++) + dst[x] = av_clip_pixel((((LUMA_FILTER(src, 1) >> (BIT_DEPTH - 8)) * wx + offset) >> shift) + ox); + src += src_stride; + dst += dst_stride; + } +} + +static void FUNC(put_vvc_luma_bi_w_h)(uint8_t *_dst, const ptrdiff_t _dst_stride, + const uint8_t *_src, const ptrdiff_t _src_stride, const int16_t *src0, const int height, + const int denom, const int w0, const int w1, const int o0, int o1, + const intptr_t mx, const intptr_t my, const int width, const int hf_idx, const int vf_idx) +{ + const pixel *src = (pixel*)_src; + const ptrdiff_t src_stride = _src_stride / sizeof(pixel); + pixel *dst = (pixel *)_dst; + const ptrdiff_t dst_stride = _dst_stride / sizeof(pixel); + const int8_t *filter = ff_vvc_luma_filters[hf_idx][mx]; + const int shift = denom + FFMAX(2, 14 - BIT_DEPTH) + 1; + const int offset = (((o0 + o1) << (BIT_DEPTH - 8)) + 1) << (shift - 1); + + for (int y = 0; y < height; y++) { + for (int x = 0; x < width; x++) { + const int src1 = LUMA_FILTER(src, 1) >> (BIT_DEPTH - 8); + dst[x] = av_clip_pixel((src1 * w1 + src0[x] * w0 + offset) >> shift); + } + src += src_stride; + dst += dst_stride; + src0 += MAX_PB_SIZE; + } +} + +static void FUNC(put_vvc_luma_uni_w_v)(uint8_t *_dst, ptrdiff_t _dst_stride, + const uint8_t *_src, ptrdiff_t _src_stride, int height, int denom, int wx, int ox, + intptr_t mx, intptr_t my, int width, const int hf_idx, const int vf_idx) +{ + int x, y; + pixel *src = (pixel*)_src; + ptrdiff_t src_stride = _src_stride / sizeof(pixel); + pixel *dst = (pixel *)_dst; + ptrdiff_t dst_stride = _dst_stride / sizeof(pixel); + const int8_t *filter = ff_vvc_luma_filters[vf_idx][my]; + int shift = denom + 14 - BIT_DEPTH; +#if BIT_DEPTH < 14 + int offset = 1 << (shift - 1); +#else + int offset = 0; +#endif + + ox = ox * (1 << (BIT_DEPTH - 8)); + for (y = 0; y < height; y++) { + for (x = 0; x < width; x++) + dst[x] = av_clip_pixel((((LUMA_FILTER(src, src_stride) >> 
(BIT_DEPTH - 8)) * wx + offset) >> shift) + ox); + src += src_stride; + dst += dst_stride; + } +} + +static void FUNC(put_vvc_luma_bi_w_v)(uint8_t *_dst, const ptrdiff_t _dst_stride, + const uint8_t *_src, const ptrdiff_t _src_stride, const int16_t *src0, + const int height, const int denom, const int wx0, const int wx1, + int ox0, int ox1, const intptr_t mx, const intptr_t my, const int width, + const int hf_idx, const int vf_idx) +{ + const pixel *src = (const pixel*)_src; + const ptrdiff_t src_stride = _src_stride / sizeof(pixel); + pixel *dst = (pixel *)_dst; + const ptrdiff_t dst_stride = _dst_stride / sizeof(pixel); + + const int8_t *filter = ff_vvc_luma_filters[vf_idx][my]; + + const int shift = 14 + 1 - BIT_DEPTH; + const int log2Wd = denom + shift - 1; + + ox0 = ox0 * (1 << (BIT_DEPTH - 8)); + ox1 = ox1 * (1 << (BIT_DEPTH - 8)); + for (int y = 0; y < height; y++) { + for (int x = 0; x < width; x++) + dst[x] = av_clip_pixel(((LUMA_FILTER(src, src_stride) >> (BIT_DEPTH - 8)) * wx1 + src0[x] * wx0 + + ((ox0 + ox1 + 1) * (1 << log2Wd))) >> (log2Wd + 1)); + src += src_stride; + dst += dst_stride; + src0 += MAX_PB_SIZE; + } +} + +static void FUNC(put_vvc_luma_uni_w_hv)(uint8_t *_dst, const ptrdiff_t _dst_stride, + const uint8_t *_src, const ptrdiff_t _src_stride, const int height, const int denom, + int wx, int ox, const intptr_t mx, const intptr_t my, const int width, + const int hf_idx, const int vf_idx) +{ + int x, y; + const int8_t *filter; + pixel *src = (pixel*)_src; + ptrdiff_t src_stride = _src_stride / sizeof(pixel); + pixel *dst = (pixel *)_dst; + ptrdiff_t dst_stride = _dst_stride / sizeof(pixel); + int16_t tmp_array[(MAX_PB_SIZE + LUMA_EXTRA) * MAX_PB_SIZE]; + int16_t *tmp = tmp_array; + int shift = denom + 14 - BIT_DEPTH; +#if BIT_DEPTH < 14 + int offset = 1 << (shift - 1); +#else + int offset = 0; +#endif + + src -= LUMA_EXTRA_BEFORE * src_stride; + filter = ff_vvc_luma_filters[hf_idx][mx]; + for (y = 0; y < height + LUMA_EXTRA; y++) { + for (x = 
0; x < width; x++) + tmp[x] = LUMA_FILTER(src, 1) >> (BIT_DEPTH - 8); + src += src_stride; + tmp += MAX_PB_SIZE; + } + + tmp = tmp_array + LUMA_EXTRA_BEFORE * MAX_PB_SIZE; + filter = ff_vvc_luma_filters[vf_idx][my]; + + ox = ox * (1 << (BIT_DEPTH - 8)); + for (y = 0; y < height; y++) { + for (x = 0; x < width; x++) + dst[x] = av_clip_pixel((((LUMA_FILTER(tmp, MAX_PB_SIZE) >> 6) * wx + offset) >> shift) + ox); + tmp += MAX_PB_SIZE; + dst += dst_stride; + } +} + +static void FUNC(put_vvc_luma_bi_w_hv)(uint8_t *_dst, const ptrdiff_t _dst_stride, + const uint8_t *_src, const ptrdiff_t _src_stride, const int16_t *src0, + const int height, const int denom, const int wx0, const int wx1, + int ox0, int ox1, const intptr_t mx, const intptr_t my, const int width, + const int hf_idx, const int vf_idx) +{ + int x, y; + const int8_t *filter; + const pixel *src = (pixel*)_src; + const ptrdiff_t src_stride = _src_stride / sizeof(pixel); + pixel *dst = (pixel *)_dst; + const ptrdiff_t dst_stride = _dst_stride / sizeof(pixel); + int16_t tmp_array[(MAX_PB_SIZE + LUMA_EXTRA) * MAX_PB_SIZE]; + int16_t *tmp = tmp_array; + const int shift = 14 + 1 - BIT_DEPTH; + const int log2Wd = denom + shift - 1; + + src -= LUMA_EXTRA_BEFORE * src_stride; + filter = ff_vvc_luma_filters[hf_idx][mx]; + for (y = 0; y < height + LUMA_EXTRA; y++) { + for (x = 0; x < width; x++) + tmp[x] = LUMA_FILTER(src, 1) >> (BIT_DEPTH - 8); + src += src_stride; + tmp += MAX_PB_SIZE; + } + + tmp = tmp_array + LUMA_EXTRA_BEFORE * MAX_PB_SIZE; + filter = ff_vvc_luma_filters[vf_idx][my]; + + ox0 = ox0 * (1 << (BIT_DEPTH - 8)); + ox1 = ox1 * (1 << (BIT_DEPTH - 8)); + for (y = 0; y < height; y++) { + for (x = 0; x < width; x++) + dst[x] = av_clip_pixel(((LUMA_FILTER(tmp, MAX_PB_SIZE) >> 6) * wx1 + src0[x] * wx0 + + ((ox0 + ox1 + 1) * (1 << log2Wd))) >> (log2Wd + 1)); + tmp += MAX_PB_SIZE; + dst += dst_stride; + src0 += MAX_PB_SIZE; + } +} + +//////////////////////////////////////////////////////////////////////////////// 
+// +//////////////////////////////////////////////////////////////////////////////// +#define CHROMA_FILTER(src, stride) \ + (filter[0] * src[x - stride] + \ + filter[1] * src[x] + \ + filter[2] * src[x + stride] + \ + filter[3] * src[x + 2 * stride]) + +static void FUNC(put_vvc_chroma_h)(int16_t *dst, const uint8_t *_src, const ptrdiff_t _src_stride, + const int height, const intptr_t mx, const intptr_t my, const int width, + const int hf_idx, const int vf_idx) +{ + const pixel *src = (const pixel *)_src; + ptrdiff_t src_stride = _src_stride / sizeof(pixel); + const int8_t *filter = ff_vvc_chroma_filters[hf_idx][mx]; + for (int y = 0; y < height; y++) { + for (int x = 0; x < width; x++) + dst[x] = CHROMA_FILTER(src, 1) >> (BIT_DEPTH - 8); + src += src_stride; + dst += MAX_PB_SIZE; + } +} + +static void FUNC(put_vvc_chroma_v)(int16_t *dst, const uint8_t *_src, const ptrdiff_t _src_stride, + const int height, const intptr_t mx, const intptr_t my, const int width, + const int hf_idx, const int vf_idx) +{ + const pixel *src = (const pixel *)_src; + const ptrdiff_t src_stride = _src_stride / sizeof(pixel); + const int8_t *filter = ff_vvc_chroma_filters[vf_idx][my]; + + for (int y = 0; y < height; y++) { + for (int x = 0; x < width; x++) + dst[x] = CHROMA_FILTER(src, src_stride) >> (BIT_DEPTH - 8); + src += src_stride; + dst += MAX_PB_SIZE; + } +} + +static void FUNC(put_vvc_chroma_hv)(int16_t *dst, const uint8_t *_src, const ptrdiff_t _src_stride, + const int height, const intptr_t mx, const intptr_t my, const int width, + const int hf_idx, const int vf_idx) +{ + int x, y; + const pixel *src = (const pixel *)_src; + const ptrdiff_t src_stride = _src_stride / sizeof(pixel); + const int8_t *filter = ff_vvc_chroma_filters[hf_idx][mx]; + int16_t tmp_array[(MAX_PB_SIZE + CHROMA_EXTRA) * MAX_PB_SIZE]; + int16_t *tmp = tmp_array; + + src -= CHROMA_EXTRA_BEFORE * src_stride; + + for (y = 0; y < height + CHROMA_EXTRA; y++) { + for (x = 0; x < width; x++) + tmp[x] = 
CHROMA_FILTER(src, 1) >> (BIT_DEPTH - 8); + src += src_stride; + tmp += MAX_PB_SIZE; + } + + tmp = tmp_array + CHROMA_EXTRA_BEFORE * MAX_PB_SIZE; + filter = ff_vvc_chroma_filters[vf_idx][my]; + + for (y = 0; y < height; y++) { + for (x = 0; x < width; x++) + dst[x] = CHROMA_FILTER(tmp, MAX_PB_SIZE) >> 6; + tmp += MAX_PB_SIZE; + dst += MAX_PB_SIZE; + } +} + +static void FUNC(put_vvc_chroma_uni_h)(uint8_t *_dst, const ptrdiff_t _dst_stride, + const uint8_t *_src, const ptrdiff_t _src_stride, + const int height, const intptr_t mx, const intptr_t my, const int width, + const int hf_idx, const int vf_idx) +{ + const pixel *src = (const pixel *)_src; + const ptrdiff_t src_stride = _src_stride / sizeof(pixel); + pixel *dst = (pixel *)_dst; + const ptrdiff_t dst_stride = _dst_stride / sizeof(pixel); + const int8_t *filter = ff_vvc_chroma_filters[hf_idx][mx]; + const int shift = 14 - BIT_DEPTH; +#if BIT_DEPTH < 14 + const int offset = 1 << (shift - 1); +#else + const int offset = 0; +#endif + + for (int y = 0; y < height; y++) { + for (int x = 0; x < width; x++) + dst[x] = av_clip_pixel(((CHROMA_FILTER(src, 1) >> (BIT_DEPTH - 8)) + offset) >> shift); + src += src_stride; + dst += dst_stride; + } +} + +static void FUNC(put_vvc_chroma_bi_h)(uint8_t *_dst, const ptrdiff_t _dst_stride, + const uint8_t *_src, const ptrdiff_t _src_stride, const int16_t *src0, + const int height, const intptr_t mx, const intptr_t my, const int width, + const int hf_idx, const int vf_idx) +{ + const pixel *src = (const pixel *)_src; + const ptrdiff_t src_stride = _src_stride / sizeof(pixel); + pixel *dst = (pixel *)_dst; + const ptrdiff_t dst_stride = _dst_stride / sizeof(pixel); + const int8_t *filter = ff_vvc_chroma_filters[hf_idx][mx]; + const int shift = 14 + 1 - BIT_DEPTH; +#if BIT_DEPTH < 14 + const int offset = 1 << (shift - 1); +#else + const int offset = 0; +#endif + + for (int y = 0; y < height; y++) { + for (int x = 0; x < width; x++) + dst[x] = av_clip_pixel(((CHROMA_FILTER(src, 1) >> 
(BIT_DEPTH - 8)) + src0[x] + offset) >> shift); + dst += dst_stride; + src += src_stride; + src0 += MAX_PB_SIZE; + } +} + +static void FUNC(put_vvc_chroma_uni_v)(uint8_t *_dst, const ptrdiff_t _dst_stride, + const uint8_t *_src, const ptrdiff_t _src_stride, + const int height, const intptr_t mx, const intptr_t my, const int width, + const int hf_idx, const int vf_idx) +{ + const pixel *src = (const pixel *)_src; + const ptrdiff_t src_stride = _src_stride / sizeof(pixel); + pixel *dst = (pixel *)_dst; + const ptrdiff_t dst_stride = _dst_stride / sizeof(pixel); + const int8_t *filter = ff_vvc_chroma_filters[vf_idx][my]; + const int shift = 14 - BIT_DEPTH; +#if BIT_DEPTH < 14 + const int offset = 1 << (shift - 1); +#else + const int offset = 0; +#endif + + for (int y = 0; y < height; y++) { + for (int x = 0; x < width; x++) + dst[x] = av_clip_pixel(((CHROMA_FILTER(src, src_stride) >> (BIT_DEPTH - 8)) + offset) >> shift); + src += src_stride; + dst += dst_stride; + } +} + +static void FUNC(put_vvc_chroma_bi_v)(uint8_t *_dst, const ptrdiff_t _dst_stride, + const uint8_t *_src, const ptrdiff_t _src_stride, const int16_t *src0, + const int height, const intptr_t mx, const intptr_t my, const int width, + const int hf_idx, const int vf_idx) +{ + pixel *src = (pixel *)_src; + ptrdiff_t src_stride = _src_stride / sizeof(pixel); + const int8_t *filter = ff_vvc_chroma_filters[vf_idx][my]; + pixel *dst = (pixel *)_dst; + ptrdiff_t dst_stride = _dst_stride / sizeof(pixel); + int shift = 14 + 1 - BIT_DEPTH; +#if BIT_DEPTH < 14 + int offset = 1 << (shift - 1); +#else + int offset = 0; +#endif + + for (int y = 0; y < height; y++) { + for (int x = 0; x < width; x++) + dst[x] = av_clip_pixel(((CHROMA_FILTER(src, src_stride) >> (BIT_DEPTH - 8)) + src0[x] + offset) >> shift); + dst += dst_stride; + src += src_stride; + src0 += MAX_PB_SIZE; + } +} + +static void FUNC(put_vvc_chroma_uni_hv)(uint8_t *_dst, const ptrdiff_t _dst_stride, + const uint8_t *_src, const ptrdiff_t _src_stride, + 
const int height, const intptr_t mx, const intptr_t my, const int width, + const int hf_idx, const int vf_idx) +{ + int x, y; + pixel *src = (pixel *)_src; + ptrdiff_t src_stride = _src_stride / sizeof(pixel); + pixel *dst = (pixel *)_dst; + ptrdiff_t dst_stride = _dst_stride / sizeof(pixel); + const int8_t *filter = ff_vvc_chroma_filters[hf_idx][mx]; + int16_t tmp_array[(MAX_PB_SIZE + CHROMA_EXTRA) * MAX_PB_SIZE]; + int16_t *tmp = tmp_array; + int shift = 14 - BIT_DEPTH; +#if BIT_DEPTH < 14 + int offset = 1 << (shift - 1); +#else + int offset = 0; +#endif + + src -= CHROMA_EXTRA_BEFORE * src_stride; + + for (y = 0; y < height + CHROMA_EXTRA; y++) { + for (x = 0; x < width; x++) + tmp[x] = CHROMA_FILTER(src, 1) >> (BIT_DEPTH - 8); + src += src_stride; + tmp += MAX_PB_SIZE; + } + + tmp = tmp_array + CHROMA_EXTRA_BEFORE * MAX_PB_SIZE; + filter = ff_vvc_chroma_filters[vf_idx][my]; + + for (y = 0; y < height; y++) { + for (x = 0; x < width; x++) + dst[x] = av_clip_pixel(((CHROMA_FILTER(tmp, MAX_PB_SIZE) >> 6) + offset) >> shift); + tmp += MAX_PB_SIZE; + dst += dst_stride; + } +} + +static void FUNC(put_vvc_chroma_bi_hv)(uint8_t *_dst, const ptrdiff_t _dst_stride, + const uint8_t *_src, const ptrdiff_t _src_stride, const int16_t *src0, + const int height, const intptr_t mx, const intptr_t my, const int width, + const int hf_idx, const int vf_idx) +{ + int x, y; + const pixel *src = (pixel *)_src; + const ptrdiff_t src_stride = _src_stride / sizeof(pixel); + pixel *dst = (pixel *)_dst; + const ptrdiff_t dst_stride = _dst_stride / sizeof(pixel); + const int8_t *filter = ff_vvc_chroma_filters[hf_idx][mx]; + int16_t tmp_array[(MAX_PB_SIZE + CHROMA_EXTRA) * MAX_PB_SIZE]; + int16_t *tmp = tmp_array; + const int shift = 14 + 1 - BIT_DEPTH; +#if BIT_DEPTH < 14 + const int offset = 1 << (shift - 1); +#else + const int offset = 0; +#endif + + src -= CHROMA_EXTRA_BEFORE * src_stride; + + for (y = 0; y < height + CHROMA_EXTRA; y++) { + for (x = 0; x < width; x++) + tmp[x] = 
CHROMA_FILTER(src, 1) >> (BIT_DEPTH - 8); + src += src_stride; + tmp += MAX_PB_SIZE; + } + + tmp = tmp_array + CHROMA_EXTRA_BEFORE * MAX_PB_SIZE; + filter = ff_vvc_chroma_filters[vf_idx][my]; + + for (y = 0; y < height; y++) { + for (x = 0; x < width; x++) + dst[x] = av_clip_pixel(((CHROMA_FILTER(tmp, MAX_PB_SIZE) >> 6) + src0[x] + offset) >> shift); + tmp += MAX_PB_SIZE; + dst += dst_stride; + src0 += MAX_PB_SIZE; + } +} + +static void FUNC(put_vvc_chroma_uni_w_h)(uint8_t *_dst, ptrdiff_t _dst_stride, + const uint8_t *_src, ptrdiff_t _src_stride, int height, int denom, int wx, int ox, + intptr_t mx, intptr_t my, int width, const int hf_idx, const int vf_idx) +{ + int x, y; + pixel *src = (pixel *)_src; + ptrdiff_t src_stride = _src_stride / sizeof(pixel); + pixel *dst = (pixel *)_dst; + ptrdiff_t dst_stride = _dst_stride / sizeof(pixel); + const int8_t *filter = ff_vvc_chroma_filters[hf_idx][mx]; + int shift = denom + 14 - BIT_DEPTH; +#if BIT_DEPTH < 14 + int offset = 1 << (shift - 1); +#else + int offset = 0; +#endif + + ox = ox * (1 << (BIT_DEPTH - 8)); + for (y = 0; y < height; y++) { + for (x = 0; x < width; x++) { + dst[x] = av_clip_pixel((((CHROMA_FILTER(src, 1) >> (BIT_DEPTH - 8)) * wx + offset) >> shift) + ox); + } + dst += dst_stride; + src += src_stride; + } +} + +static void FUNC(put_vvc_chroma_bi_w_h)(uint8_t *_dst, const ptrdiff_t _dst_stride, + const uint8_t *_src, const ptrdiff_t _src_stride, const int16_t *src0, + const int height, const int denom, const int w0, const int w1, + int o0, int o1, const intptr_t mx, const intptr_t my, const int width, + const int hf_idx, const int vf_idx) +{ + const pixel *src = (pixel *)_src; + const ptrdiff_t src_stride = _src_stride / sizeof(pixel); + pixel *dst = (pixel *)_dst; + const ptrdiff_t dst_stride = _dst_stride / sizeof(pixel); + const int8_t *filter = ff_vvc_chroma_filters[hf_idx][mx]; + const int shift = denom + FFMAX(2, 14 - BIT_DEPTH) + 1; + const int offset = (((o0 + o1) << (BIT_DEPTH - 8)) + 1) << 
(shift - 1); + + for (int y = 0; y < height; y++) { + for (int x = 0; x < width; x++) { + const int src1 = CHROMA_FILTER(src, 1) >> (BIT_DEPTH - 8); + dst[x] = av_clip_pixel((src1 * w1 + src0[x] * w0 + offset) >> shift); + } + src += src_stride; + dst += dst_stride; + src0 += MAX_PB_SIZE; + } +} + +static void FUNC(put_vvc_chroma_uni_w_v)(uint8_t *_dst, ptrdiff_t _dst_stride, + const uint8_t *_src, ptrdiff_t _src_stride, int height, int denom, int wx, int ox, + intptr_t mx, intptr_t my, int width, const int hf_idx, const int vf_idx) +{ + int x, y; + pixel *src = (pixel *)_src; + ptrdiff_t src_stride = _src_stride / sizeof(pixel); + pixel *dst = (pixel *)_dst; + ptrdiff_t dst_stride = _dst_stride / sizeof(pixel); + const int8_t *filter = ff_vvc_chroma_filters[vf_idx][my]; + int shift = denom + 14 - BIT_DEPTH; +#if BIT_DEPTH < 14 + int offset = 1 << (shift - 1); +#else + int offset = 0; +#endif + + ox = ox * (1 << (BIT_DEPTH - 8)); + for (y = 0; y < height; y++) { + for (x = 0; x < width; x++) { + dst[x] = av_clip_pixel((((CHROMA_FILTER(src, src_stride) >> (BIT_DEPTH - 8)) * wx + offset) >> shift) + ox); + } + dst += dst_stride; + src += src_stride; + } +} + +static void FUNC(put_vvc_chroma_bi_w_v)(uint8_t *_dst, const ptrdiff_t _dst_stride, + const uint8_t *_src, const ptrdiff_t _src_stride, const int16_t *src0, + const int height, const int denom, const int wx0, const int wx1, + int ox0, int ox1, const intptr_t mx, const intptr_t my, const int width, + const int hf_idx, const int vf_idx) +{ + const pixel *src = (pixel *)_src; + const ptrdiff_t src_stride = _src_stride / sizeof(pixel); + const int8_t *filter = ff_vvc_chroma_filters[vf_idx][my]; + pixel *dst = (pixel *)_dst; + const ptrdiff_t dst_stride = _dst_stride / sizeof(pixel); + const int shift = 14 + 1 - BIT_DEPTH; + const int log2Wd = denom + shift - 1; + + ox0 = ox0 * (1 << (BIT_DEPTH - 8)); + ox1 = ox1 * (1 << (BIT_DEPTH - 8)); + for (int y = 0; y < height; y++) { + for (int x = 0; x < width; x++) + dst[x] 
= av_clip_pixel(((CHROMA_FILTER(src, src_stride) >> (BIT_DEPTH - 8)) * wx1 + src0[x] * wx0 + + ((ox0 + ox1 + 1) * (1 << log2Wd))) >> (log2Wd + 1)); + src += src_stride; + dst += dst_stride; + src0 += MAX_PB_SIZE; + } +} + +static void FUNC(put_vvc_chroma_uni_w_hv)(uint8_t *_dst, ptrdiff_t _dst_stride, const uint8_t *_src, ptrdiff_t _src_stride, + int height, int denom, int wx, int ox, intptr_t mx, intptr_t my, int width, const int hf_idx, const int vf_idx) +{ + int x, y; + pixel *src = (pixel *)_src; + ptrdiff_t src_stride = _src_stride / sizeof(pixel); + pixel *dst = (pixel *)_dst; + ptrdiff_t dst_stride = _dst_stride / sizeof(pixel); + const int8_t *filter = ff_vvc_chroma_filters[hf_idx][mx]; + int16_t tmp_array[(MAX_PB_SIZE + CHROMA_EXTRA) * MAX_PB_SIZE]; + int16_t *tmp = tmp_array; + int shift = denom + 14 - BIT_DEPTH; +#if BIT_DEPTH < 14 + int offset = 1 << (shift - 1); +#else + int offset = 0; +#endif + + src -= CHROMA_EXTRA_BEFORE * src_stride; + + for (y = 0; y < height + CHROMA_EXTRA; y++) { + for (x = 0; x < width; x++) + tmp[x] = CHROMA_FILTER(src, 1) >> (BIT_DEPTH - 8); + src += src_stride; + tmp += MAX_PB_SIZE; + } + + tmp = tmp_array + CHROMA_EXTRA_BEFORE * MAX_PB_SIZE; + filter = ff_vvc_chroma_filters[vf_idx][my]; + + ox = ox * (1 << (BIT_DEPTH - 8)); + for (y = 0; y < height; y++) { + for (x = 0; x < width; x++) + dst[x] = av_clip_pixel((((CHROMA_FILTER(tmp, MAX_PB_SIZE) >> 6) * wx + offset) >> shift) + ox); + tmp += MAX_PB_SIZE; + dst += dst_stride; + } +} + +static void FUNC(put_vvc_chroma_bi_w_hv)(uint8_t *_dst, const ptrdiff_t _dst_stride, + const uint8_t *_src, const ptrdiff_t _src_stride, const int16_t *src0, + const int height, const int denom, const int wx0, const int wx1, + int ox0, int ox1, const intptr_t mx, const intptr_t my, const int width, + const int hf_idx, const int vf_idx) +{ + int x, y; + const pixel *src = (pixel *)_src; + const ptrdiff_t src_stride = _src_stride / sizeof(pixel); + pixel *dst = (pixel *)_dst; + const ptrdiff_t 
dst_stride = _dst_stride / sizeof(pixel); + const int8_t *filter = ff_vvc_chroma_filters[hf_idx][mx]; + int16_t tmp_array[(MAX_PB_SIZE + CHROMA_EXTRA) * MAX_PB_SIZE]; + int16_t *tmp = tmp_array; + const int shift = 14 + 1 - BIT_DEPTH; + const int log2Wd = denom + shift - 1; + + src -= CHROMA_EXTRA_BEFORE * src_stride; + + for (y = 0; y < height + CHROMA_EXTRA; y++) { + for (x = 0; x < width; x++) + tmp[x] = CHROMA_FILTER(src, 1) >> (BIT_DEPTH - 8); + src += src_stride; + tmp += MAX_PB_SIZE; + } + + tmp = tmp_array + CHROMA_EXTRA_BEFORE * MAX_PB_SIZE; + filter = ff_vvc_chroma_filters[vf_idx][my]; + + ox0 = ox0 * (1 << (BIT_DEPTH - 8)); + ox1 = ox1 * (1 << (BIT_DEPTH - 8)); + for (y = 0; y < height; y++) { + for (x = 0; x < width; x++) + dst[x] = av_clip_pixel(((CHROMA_FILTER(tmp, MAX_PB_SIZE) >> 6) * wx1 + src0[x] * wx0 + + ((ox0 + ox1 + 1) * (1 << log2Wd))) >> (log2Wd + 1)); + tmp += MAX_PB_SIZE; + dst += dst_stride; + src0 += MAX_PB_SIZE; + } +} + +static void FUNC(put_vvc_ciip)(uint8_t *_dst, const ptrdiff_t _dst_stride, const int width, const int height, + const uint8_t *_inter, const ptrdiff_t _inter_stride, const int intra_weight) +{ + pixel *dst = (pixel *)_dst; + pixel *inter = (pixel *)_inter; + const int dst_stride = _dst_stride / sizeof(pixel); + const int inter_stride = _inter_stride / sizeof(pixel); + + const int inter_weight = 4 - intra_weight; + for (int y = 0; y < height; y++) { + for (int x = 0; x < width; x++) + dst[x] = (dst[x] * intra_weight + inter[x] * inter_weight + 2) >> 2; + dst += dst_stride; + inter += inter_stride; + } +} + +static void FUNC(put_vvc_gpm)(uint8_t *_dst, ptrdiff_t dst_stride, int width, int height, + const int16_t *tmp, const int16_t *tmp1, const ptrdiff_t tmp_stride, + const uint8_t *weights, const int step_x, const int step_y) +{ + const int shift = FFMAX(5, 17 - BIT_DEPTH); + const int offset = 1 << (shift - 1); + pixel *dst = (pixel *)_dst; + + dst_stride /= sizeof(pixel); + for (int y = 0; y < height; y++) { + for (int 
x = 0; x < width; x++) { + const uint8_t w = weights[x * step_x]; + dst[x] = av_clip_pixel((tmp[x] * w + tmp1[x] * (8 - w) + offset) >> shift); + } + dst += dst_stride; + tmp += tmp_stride; + tmp1 += tmp_stride; + weights += step_y; + } +} + +//8.5.6.3.3 Luma integer sample fetching process, add one extra pad line +static void FUNC(bdof_fetch_samples)(int16_t *_dst, const uint8_t *_src, const ptrdiff_t _src_stride, + const int x_frac, const int y_frac, const int width, const int height) +{ + const int x_off = (x_frac >> 3) - 1; + const int y_off = (y_frac >> 3) - 1; + + const ptrdiff_t src_stride = _src_stride / sizeof(pixel); + const pixel *src = (pixel*)_src + (x_off) + y_off * src_stride; + int16_t *dst = _dst - 1 - MAX_PB_SIZE; + + const int shift = 14 - BIT_DEPTH; + const int bdof_width = width + 2 * BDOF_BORDER_EXT; + + // top + for (int i = 0; i < bdof_width; i++) + dst[i] = src[i] << shift; + + dst += MAX_PB_SIZE; + src += src_stride; + + for (int i = 0; i < height; i++) { + dst[0] = src[0] << shift; + dst[1 + width] = src[1 + width] << shift; + dst += MAX_PB_SIZE; + src += src_stride; + } + for (int i = 0; i < bdof_width; i++) + dst[i] = src[i] << shift; +} + +//8.5.6.3.3 Luma integer sample fetching process +static void FUNC(fetch_samples)(int16_t *_dst, const uint8_t *_src, const ptrdiff_t _src_stride, const int x_frac, const int y_frac) +{ + FUNC(bdof_fetch_samples)(_dst, _src, _src_stride, x_frac, y_frac, AFFINE_MIN_BLOCK_SIZE, AFFINE_MIN_BLOCK_SIZE); +} + +static void FUNC(prof_grad_filter)(int16_t *_gradient_h, int16_t *_gradient_v, const ptrdiff_t gradient_stride, + const int16_t *_src, const ptrdiff_t src_stride, const int width, const int height, const int pad) +{ + const int shift = 6; + const int16_t *src = _src; + int16_t *gradient_h = _gradient_h + pad * (1 + gradient_stride); + int16_t *gradient_v = _gradient_v + pad * (1 + gradient_stride); + + for (int y = 0; y < height; y++) { + const int16_t *p = src; + for (int x = 0; x < width; x++) { + 
gradient_h[x] = (p[1] >> shift) - (p[-1] >> shift); + gradient_v[x] = (p[src_stride] >> shift) - (p[-src_stride] >> shift); + p++; + } + gradient_h += gradient_stride; + gradient_v += gradient_stride; + src += src_stride; + } + if (pad) { + pad_int16(_gradient_h + 1 + gradient_stride, gradient_stride, width, height); + pad_int16(_gradient_v + 1 + gradient_stride, gradient_stride, width, height); + } +} + +static void FUNC(apply_prof)(int16_t *dst, const int16_t *src, const int16_t *diff_mv_x, const int16_t *diff_mv_y) +{ + const int limit = (1 << FFMAX(13, BIT_DEPTH + 1)); ///< dILimit + + int16_t gradient_h[AFFINE_MIN_BLOCK_SIZE * AFFINE_MIN_BLOCK_SIZE]; + int16_t gradient_v[AFFINE_MIN_BLOCK_SIZE * AFFINE_MIN_BLOCK_SIZE]; + FUNC(prof_grad_filter)(gradient_h, gradient_v, AFFINE_MIN_BLOCK_SIZE, src, MAX_PB_SIZE, AFFINE_MIN_BLOCK_SIZE, AFFINE_MIN_BLOCK_SIZE, 0); + + for (int y = 0; y < AFFINE_MIN_BLOCK_SIZE; y++) { + for (int x = 0; x < AFFINE_MIN_BLOCK_SIZE; x++) { + const int o = y * AFFINE_MIN_BLOCK_SIZE + x; + const int di = gradient_h[o] * diff_mv_x[o] + gradient_v[o] * diff_mv_y[o]; + const int val = src[x] + av_clip(di, -limit, limit - 1); + dst[x] = val; + + } + src += MAX_PB_SIZE; + dst += MAX_PB_SIZE; + } +} + +static void FUNC(apply_prof_uni)(uint8_t *_dst, const ptrdiff_t _dst_stride, const int16_t *src, const int16_t *diff_mv_x, const int16_t *diff_mv_y) +{ + const int limit = (1 << FFMAX(13, BIT_DEPTH + 1)); ///< dILimit + + const ptrdiff_t dst_stride = _dst_stride / sizeof(pixel); + pixel *dst = (pixel*)_dst; + + const int shift = 14 - BIT_DEPTH; +#if BIT_DEPTH < 14 + const int offset = 1 << (shift - 1); +#else + const int offset = 0; +#endif + + int16_t gradient_h[AFFINE_MIN_BLOCK_SIZE * AFFINE_MIN_BLOCK_SIZE]; + int16_t gradient_v[AFFINE_MIN_BLOCK_SIZE * AFFINE_MIN_BLOCK_SIZE]; + FUNC(prof_grad_filter)(gradient_h, gradient_v, AFFINE_MIN_BLOCK_SIZE, src, MAX_PB_SIZE, AFFINE_MIN_BLOCK_SIZE, AFFINE_MIN_BLOCK_SIZE, 0); + + for (int y = 0; y < 
AFFINE_MIN_BLOCK_SIZE; y++) { + for (int x = 0; x < AFFINE_MIN_BLOCK_SIZE; x++) { + const int o = y * AFFINE_MIN_BLOCK_SIZE + x; + const int di = gradient_h[o] * diff_mv_x[o] + gradient_v[o] * diff_mv_y[o]; + const int val = src[x] + av_clip(di, -limit, limit - 1); + dst[x] = av_clip_pixel((val + offset) >> shift); + + } + src += MAX_PB_SIZE; + dst += dst_stride; + } +} + +static void FUNC(apply_prof_uni_w)(uint8_t *_dst, const ptrdiff_t _dst_stride, + const int16_t *src, const int16_t *diff_mv_x, const int16_t *diff_mv_y, + const int denom, const int wx, int ox) +{ + const int limit = (1 << FFMAX(13, BIT_DEPTH + 1)); ///< dILimit + + const ptrdiff_t dst_stride = _dst_stride / sizeof(pixel); + pixel *dst = (pixel*)_dst; + + const int shift = denom + FFMAX(2, 14 - BIT_DEPTH); + const int offset = 1 << (shift - 1); + + int16_t gradient_h[AFFINE_MIN_BLOCK_SIZE * AFFINE_MIN_BLOCK_SIZE]; + int16_t gradient_v[AFFINE_MIN_BLOCK_SIZE * AFFINE_MIN_BLOCK_SIZE]; + + ox = ox * (1 << (BIT_DEPTH - 8)); + + FUNC(prof_grad_filter)(gradient_h, gradient_v, AFFINE_MIN_BLOCK_SIZE, src, MAX_PB_SIZE, AFFINE_MIN_BLOCK_SIZE, AFFINE_MIN_BLOCK_SIZE, 0); + + for (int y = 0; y < AFFINE_MIN_BLOCK_SIZE; y++) { + for (int x = 0; x < AFFINE_MIN_BLOCK_SIZE; x++) { + const int o = y * AFFINE_MIN_BLOCK_SIZE + x; + const int di = gradient_h[o] * diff_mv_x[o] + gradient_v[o] * diff_mv_y[o]; + const int val = src[x] + av_clip(di, -limit, limit - 1); + dst[x] = av_clip_pixel(((val * wx + offset) >> shift) + ox); + } + src += MAX_PB_SIZE; + dst += dst_stride; + } +} + +static void FUNC(apply_prof_bi)(uint8_t *_dst, const ptrdiff_t _dst_stride, + const int16_t *src0, const int16_t *src1, const int16_t *diff_mv_x, const int16_t *diff_mv_y) +{ + const int limit = (1 << FFMAX(13, BIT_DEPTH + 1)); ///< dILimit + const ptrdiff_t dst_stride = _dst_stride / sizeof(pixel); + pixel *dst = (pixel*)_dst; + + + const int shift = 14 + 1 - BIT_DEPTH; +#if BIT_DEPTH < 14 + const int offset = 1 << (shift - 1); +#else + 
const int offset = 0; +#endif + + int16_t gradient_h[AFFINE_MIN_BLOCK_SIZE * AFFINE_MIN_BLOCK_SIZE]; + int16_t gradient_v[AFFINE_MIN_BLOCK_SIZE * AFFINE_MIN_BLOCK_SIZE]; + FUNC(prof_grad_filter)(gradient_h, gradient_v, AFFINE_MIN_BLOCK_SIZE, src1, MAX_PB_SIZE, AFFINE_MIN_BLOCK_SIZE, AFFINE_MIN_BLOCK_SIZE, 0); + + for (int y = 0; y < AFFINE_MIN_BLOCK_SIZE; y++) { + for (int x = 0; x < AFFINE_MIN_BLOCK_SIZE; x++) { + const int o = y * AFFINE_MIN_BLOCK_SIZE + x; + const int di = gradient_h[o] * diff_mv_x[o] + gradient_v[o] * diff_mv_y[o]; + const int val = src1[x] + av_clip(di, -limit, limit - 1); + dst[x] = av_clip_pixel((val + src0[x] + offset) >> shift); + + } + src0 += MAX_PB_SIZE; + src1 += MAX_PB_SIZE; + dst += dst_stride; + } +} + +static void FUNC(apply_prof_bi_w)(uint8_t *_dst, const ptrdiff_t _dst_stride, const int16_t *src0, const int16_t *src1, + const int16_t *diff_mv_x, const int16_t *diff_mv_y, const int denom, const int w0, const int w1, int o0, int o1) +{ + const int limit = (1 << FFMAX(13, BIT_DEPTH + 1)); ///< dILimit + const ptrdiff_t dst_stride = _dst_stride / sizeof(pixel); + pixel *dst = (pixel*)_dst; + + + const int shift = 14 + 1 - BIT_DEPTH; + const int log2Wd = denom + shift - 1; + + int16_t gradient_h[AFFINE_MIN_BLOCK_SIZE * AFFINE_MIN_BLOCK_SIZE]; + int16_t gradient_v[AFFINE_MIN_BLOCK_SIZE * AFFINE_MIN_BLOCK_SIZE]; + + o0 = o0 * (1 << (BIT_DEPTH - 8)); + o1 = o1 * (1 << (BIT_DEPTH - 8)); + + FUNC(prof_grad_filter)(gradient_h, gradient_v, AFFINE_MIN_BLOCK_SIZE, src1, MAX_PB_SIZE, AFFINE_MIN_BLOCK_SIZE, AFFINE_MIN_BLOCK_SIZE, 0); + + for (int y = 0; y < AFFINE_MIN_BLOCK_SIZE; y++) { + for (int x = 0; x < AFFINE_MIN_BLOCK_SIZE; x++) { + const int o = y * AFFINE_MIN_BLOCK_SIZE + x; + const int di = gradient_h[o] * diff_mv_x[o] + gradient_v[o] * diff_mv_y[o]; + const int val = src1[x] + av_clip(di, -limit, limit - 1); + dst[x] = av_clip_pixel((val * w1 + src0[x] * w0 + ((o0 + o1 + 1) * (1 << log2Wd))) >> (log2Wd + 1)); + } + src0 += 
MAX_PB_SIZE; + src1 += MAX_PB_SIZE; + dst += dst_stride; + } +} + +static void FUNC(derive_bdof_vx_vy)(const int16_t *_src0, const int16_t *_src1, + const int16_t **gradient_h, const int16_t **gradient_v, ptrdiff_t gradient_stride, + int* vx, int* vy) +{ + const int shift2 = 4; + const int shift3 = 1; + const int thres = 1 << 4; + int sgx2 = 0, sgy2 = 0, sgxgy = 0, sgxdi = 0, sgydi = 0; + const int16_t *src0 = _src0 - 1 - MAX_PB_SIZE; + const int16_t *src1 = _src1 - 1 - MAX_PB_SIZE; + + for (int y = 0; y < BDOF_GRADIENT_SIZE; y++) { + for (int x = 0; x < BDOF_GRADIENT_SIZE; x++) { + const int diff = (src0[x] >> shift2) - (src1[x] >> shift2); + const int idx = gradient_stride * y + x; + const int temph = (gradient_h[0][idx] + gradient_h[1][idx]) >> shift3; + const int tempv = (gradient_v[0][idx] + gradient_v[1][idx]) >> shift3; + sgx2 += FFABS(temph); + sgy2 += FFABS(tempv); + sgxgy += VVC_SIGN(tempv) * temph; + sgxdi += -VVC_SIGN(temph) * diff; + sgydi += -VVC_SIGN(tempv) * diff; + } + src0 += MAX_PB_SIZE; + src1 += MAX_PB_SIZE; + } + *vx = sgx2 > 0 ? av_clip((sgxdi << 2) >> av_log2(sgx2) , -thres + 1, thres - 1) : 0; + *vy = sgy2 > 0 ? 
av_clip(((sgydi << 2) - ((*vx * sgxgy) >> 1)) >> av_log2(sgy2), -thres + 1, thres - 1) : 0; +} + +static void FUNC(apply_bdof_min_block)(pixel* dst, const ptrdiff_t dst_stride, const int16_t *src0, const int16_t *src1, + const int16_t **gradient_h, const int16_t **gradient_v, const int vx, const int vy) +{ + const int shift4 = 15 - BIT_DEPTH; + const int offset4 = 1 << (shift4 - 1); + + const int16_t* gh[] = { gradient_h[0] + 1 + BDOF_PADDED_SIZE, gradient_h[1] + 1 + BDOF_PADDED_SIZE }; + const int16_t* gv[] = { gradient_v[0] + 1 + BDOF_PADDED_SIZE, gradient_v[1] + 1 + BDOF_PADDED_SIZE }; + + for (int y = 0; y < BDOF_BLOCK_SIZE; y++) { + for (int x = 0; x < BDOF_BLOCK_SIZE; x++) { + const int idx = y * BDOF_PADDED_SIZE + x; + const int bdof_offset = vx * (gh[0][idx] - gh[1][idx]) + vy * (gv[0][idx] - gv[1][idx]); + dst[x] = av_clip_pixel((src0[x] + offset4 + src1[x] + bdof_offset) >> shift4); + } + dst += dst_stride; + src0 += MAX_PB_SIZE; + src1 += MAX_PB_SIZE; + } +} + +static void FUNC(apply_bdof)(uint8_t *_dst, const ptrdiff_t _dst_stride, int16_t *_src0, int16_t *_src1, + const int block_w, const int block_h) +{ + int16_t gradient_h[2][BDOF_PADDED_SIZE * BDOF_PADDED_SIZE]; + int16_t gradient_v[2][BDOF_PADDED_SIZE * BDOF_PADDED_SIZE]; + int vx, vy; + + const ptrdiff_t dst_stride = _dst_stride / sizeof(pixel); + pixel* dst = (pixel*)_dst; + + FUNC(prof_grad_filter)(gradient_h[0], gradient_v[0], BDOF_PADDED_SIZE, + _src0, MAX_PB_SIZE, block_w, block_h, 1); + pad_int16(_src0, MAX_PB_SIZE, block_w, block_h); + FUNC(prof_grad_filter)(gradient_h[1], gradient_v[1], BDOF_PADDED_SIZE, + _src1, MAX_PB_SIZE, block_w, block_h, 1); + pad_int16(_src1, MAX_PB_SIZE, block_w, block_h); + + for (int y = 0; y < block_h; y += BDOF_BLOCK_SIZE) { + for (int x = 0; x < block_w; x += BDOF_BLOCK_SIZE) { + const int16_t* src0 = _src0 + y * MAX_PB_SIZE + x; + const int16_t* src1 = _src1 + y * MAX_PB_SIZE + x; + pixel *d = dst + x; + const int idx = BDOF_PADDED_SIZE * y + x; + const 
int16_t* gh[] = { gradient_h[0] + idx, gradient_h[1] + idx }; + const int16_t* gv[] = { gradient_v[0] + idx, gradient_v[1] + idx }; + FUNC(derive_bdof_vx_vy)(src0, src1, gh, gv, BDOF_PADDED_SIZE, &vx, &vy); + FUNC(apply_bdof_min_block)(d, dst_stride, src0, src1, gh, gv, vx, vy); + } + dst += BDOF_BLOCK_SIZE * dst_stride; + } +} + +#define DMVR_FILTER(src, stride) \ + (filter[0] * src[x] + \ + filter[1] * src[x + stride]) + +//8.5.3.2.2 Luma sample bilinear interpolation process +static void FUNC(dmvr_vvc_luma)(int16_t *dst, const uint8_t *_src, const ptrdiff_t _src_stride, + const int height, const intptr_t mx, const intptr_t my, const int width) +{ + const pixel *src = (const pixel *)_src; + const ptrdiff_t src_stride = _src_stride / sizeof(pixel); +#if BIT_DEPTH > 10 + const int shift4 = BIT_DEPTH - 10; + const int offset4 = 1 << (shift4 - 1); +#endif + + for (int y = 0; y < height; y++) { + for (int x = 0; x < width; x++) { +#if BIT_DEPTH > 10 + dst[x] = (src[x] + offset4) >> shift4; +#else + dst[x] = src[x] << (10 - BIT_DEPTH); +#endif + } + src += src_stride; + dst += MAX_PB_SIZE; + } + +} + +//8.5.3.2.2 Luma sample bilinear interpolation process +static void FUNC(dmvr_vvc_luma_h)(int16_t *dst, const uint8_t *_src, const ptrdiff_t _src_stride, + const int height, const intptr_t mx, const intptr_t my, const int width) +{ + const pixel *src = (const pixel*)_src; + const ptrdiff_t src_stride = _src_stride / sizeof(pixel); + const int8_t *filter = ff_vvc_dmvr_filters[mx]; + const int shift1 = BIT_DEPTH - 6; + const int offset1 = 1 << (shift1 - 1); + for (int y = 0; y < height; y++) { + for (int x = 0; x < width; x++) + dst[x] = (DMVR_FILTER(src, 1) + offset1) >> shift1; + src += src_stride; + dst += MAX_PB_SIZE; + } +} + +//8.5.3.2.2 Luma sample bilinear interpolation process +static void FUNC(dmvr_vvc_luma_v)(int16_t *dst, const uint8_t *_src, const ptrdiff_t _src_stride, + const int height, const intptr_t mx, const intptr_t my, const int width) +{ + const pixel 
*src = (pixel*)_src; + const ptrdiff_t src_stride = _src_stride / sizeof(pixel); + const int8_t *filter = ff_vvc_dmvr_filters[my]; + const int shift1 = BIT_DEPTH - 6; + const int offset1 = 1 << (shift1 - 1); + for (int y = 0; y < height; y++) { + for (int x = 0; x < width; x++) + dst[x] = (DMVR_FILTER(src, src_stride) + offset1) >> shift1; + src += src_stride; + dst += MAX_PB_SIZE; + } + +} + +//8.5.3.2.2 Luma sample bilinear interpolation process +static void FUNC(dmvr_vvc_luma_hv)(int16_t *dst, const uint8_t *_src, const ptrdiff_t _src_stride, + const int height, const intptr_t mx, const intptr_t my, const int width) +{ + int x, y; + const int8_t *filter; + const pixel *src = (pixel*)_src; + const ptrdiff_t src_stride = _src_stride / sizeof(pixel); + int16_t tmp_array[(MAX_PB_SIZE + BILINEAR_EXTRA) * MAX_PB_SIZE]; + int16_t *tmp = tmp_array; + const int shift1 = BIT_DEPTH - 6; + const int offset1 = 1 << (shift1 - 1); + const int shift2 = 4; + const int offset2 = 1 << (shift2 - 1); + + src -= BILINEAR_EXTRA_BEFORE * src_stride; + filter = ff_vvc_dmvr_filters[mx]; + for (y = 0; y < height + BILINEAR_EXTRA; y++) { + for (x = 0; x < width; x++) + tmp[x] = (DMVR_FILTER(src, 1) + offset1) >> shift1; + src += src_stride; + tmp += MAX_PB_SIZE; + } + + tmp = tmp_array + BILINEAR_EXTRA_BEFORE * MAX_PB_SIZE; + filter = ff_vvc_dmvr_filters[my]; + for (y = 0; y < height; y++) { + for (x = 0; x < width; x++) + dst[x] = (DMVR_FILTER(tmp, MAX_PB_SIZE) + offset2) >> shift2; + tmp += MAX_PB_SIZE; + dst += MAX_PB_SIZE; + } +} + +#define PEL_FUNC(dst, idx1, idx2, idx3, a) \ + inter->dst[idx1][idx2][idx3] = FUNC(a) \ + +#define DIR_FUNCS(d, C, c) \ + PEL_FUNC(put_##d, C, 0, 0, put_vvc_pel_##d##_pixels); \ + PEL_FUNC(put_##d, C, 0, 1, put_vvc_##c##_##d##_h); \ + PEL_FUNC(put_##d, C, 1, 0, put_vvc_##c##_##d##_v); \ + PEL_FUNC(put_##d, C, 1, 1, put_vvc_##c##_##d##_hv); \ + PEL_FUNC(put_##d##_w, C, 0, 0, put_vvc_pel_##d##_w_pixels); \ + PEL_FUNC(put_##d##_w, C, 0, 1, 
put_vvc_##c##_##d##_w_h); \ + PEL_FUNC(put_##d##_w, C, 1, 0, put_vvc_##c##_##d##_w_v); \ + PEL_FUNC(put_##d##_w, C, 1, 1, put_vvc_##c##_##d##_w_hv); + +#define FUNCS(C, c) \ + PEL_FUNC(put, C, 0, 0, put_vvc_pel_pixels); \ + PEL_FUNC(put, C, 0, 1, put_vvc_##c##_h); \ + PEL_FUNC(put, C, 1, 0, put_vvc_##c##_v); \ + PEL_FUNC(put, C, 1, 1, put_vvc_##c##_hv); \ + DIR_FUNCS(uni, C, c); \ + DIR_FUNCS(bi, C, c); \ + +/* Populates the VVCInterDSPContext function pointers with the FUNC()-named implementations from this file; sad is set to the shared vvc_sad. */ +static void FUNC(ff_vvc_inter_dsp_init)(VVCInterDSPContext *const inter) +{ + FUNCS(LUMA, luma); + FUNCS(CHROMA, chroma); + + inter->dmvr[0][0] = FUNC(dmvr_vvc_luma); + inter->dmvr[0][1] = FUNC(dmvr_vvc_luma_h); + inter->dmvr[1][0] = FUNC(dmvr_vvc_luma_v); + inter->dmvr[1][1] = FUNC(dmvr_vvc_luma_hv); + + inter->put_ciip = FUNC(put_vvc_ciip); + inter->put_gpm = FUNC(put_vvc_gpm); + + inter->fetch_samples = FUNC(fetch_samples); + inter->bdof_fetch_samples = FUNC(bdof_fetch_samples); + inter->apply_prof = FUNC(apply_prof); + inter->apply_prof_uni = FUNC(apply_prof_uni); + inter->apply_prof_uni_w = FUNC(apply_prof_uni_w); + inter->apply_prof_bi = FUNC(apply_prof_bi); + inter->apply_prof_bi_w = FUNC(apply_prof_bi_w); + inter->apply_bdof = FUNC(apply_bdof); + inter->prof_grad_filter = FUNC(prof_grad_filter); + inter->sad = vvc_sad; +} + +/* Drop every helper macro defined above so they do not leak past this file. */ +#undef FUNCS +#undef PEL_FUNC +#undef DIR_FUNCS +#undef DMVR_FUNCS diff --git a/libavcodec/vvc/vvcdec.h b/libavcodec/vvc/vvcdec.h index 9dacd81dd3..d004742447 100644 --- a/libavcodec/vvc/vvcdec.h +++ b/libavcodec/vvc/vvcdec.h @@ -31,6 +31,7 @@ #include "vvc_executor.h" #include "vvc_ps.h" +#include "vvcdsp.h" #define LUMA 0 #define CHROMA 1 @@ -188,6 +189,7 @@ struct VVCFrameContext { VVCFrame *ref; + VVCDSPContext vvcdsp; VideoDSPContext vdsp; VVCFrameThread *frame_thread; diff --git a/libavcodec/vvc/vvcdsp.h b/libavcodec/vvc/vvcdsp.h new file mode 100644 index 0000000000..fdc9f35756 --- /dev/null +++ b/libavcodec/vvc/vvcdsp.h @@ -0,0 +1,183 @@ +/* + * VVC DSP + * + * Copyright (C) 2021 Nuo Mi + * + * + * This file is part of FFmpeg. 
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#ifndef AVCODEC_VVCDSP_H
+#define AVCODEC_VVCDSP_H
+
+#include <stdint.h>
+#include <stddef.h>
+/* Transform kinds; N_TX_TYPE is the enumerator count and sizes the first dimension of VVCItxDSPContext.itx. */
+enum TxType {
+    DCT2,
+    DST7,
+    DCT8,
+    N_TX_TYPE,
+};
+/* Transform sizes, named 2 through 64; N_TX_SIZE is the enumerator count and sizes the second dimension of VVCItxDSPContext.itx. */
+enum TxSize {
+    TX_SIZE_2,
+    TX_SIZE_4,
+    TX_SIZE_8,
+    TX_SIZE_16,
+    TX_SIZE_32,
+    TX_SIZE_64,
+    N_TX_SIZE,
+};
+/* Function-pointer table for inter prediction; FUNC(ff_vvc_inter_dsp_init) in the inter template of this patch assigns every member. */
+typedef struct VVCInterDSPContext { /* NOTE(review): the init code pairs indices (0,1) with the _h variants and (1,0) with the _v variants, so in the three-index tables the second index is presumably the vertical and the third the horizontal fractional flag - confirm against PEL_FUNC. */
+    void (*put[2 /* luma, chroma */][2 /* int, frac */][2 /* int, frac */])(
+        int16_t *dst, const uint8_t *src, ptrdiff_t src_stride,
+        int height, intptr_t mx, intptr_t my, int width, int hf_idx, int vf_idx);
+    /* put_uni and put_uni_w take a uint8_t dst with its own stride (put above takes an int16_t dst without one); the _w form adds denom, wx and ox. */
+    void (*put_uni[2 /* luma, chroma */][2 /* int, frac */][2 /* int, frac */])(
+        uint8_t *dst, ptrdiff_t dst_stride,
+        const uint8_t *src, ptrdiff_t src_stride, int height,
+        intptr_t mx, intptr_t my, int width, int hf_idx, int vf_idx);
+    void (*put_uni_w[2 /* luma, chroma */][2 /* int, frac */][2 /* int, frac */])(
+        uint8_t *dst, ptrdiff_t dst_stride,
+        const uint8_t *src, ptrdiff_t src_stride, int height, int denom, int wx, int ox,
+        intptr_t mx, intptr_t my, int width, int hf_idx, int vf_idx);
+    /* put_bi and put_bi_w additionally take a second, int16_t source (src2); the _w form adds denom plus the pairs wx0/wx1 and ox0/ox1. */
+    void (*put_bi[2 /* luma, chroma */][2 /* int, frac */][2 /* int, frac */])(
+        uint8_t *dst, ptrdiff_t dst_stride, const uint8_t *src, ptrdiff_t src_stride, const int16_t *src2,
+        int height, intptr_t mx, intptr_t my, int width, int hf_idx, int vf_idx);
+    void (*put_bi_w[2 /* luma, chroma */][2 /* int, frac */][2 /* int, frac */])(
+        uint8_t *dst, ptrdiff_t dst_stride, const uint8_t *src, ptrdiff_t src_stride, const int16_t *src2,
+        int height, int denom, int wx0, int wx1, int ox0, int ox1,
+        intptr_t mx, intptr_t my, int width, int hf_idx, int vf_idx);
+    /* NOTE(review): put_ciip presumably blends the inter samples into dst using inter_weight - confirm against the template. */
+    void (*put_ciip)(uint8_t *dst, ptrdiff_t dst_stride, int width, int height,
+        const uint8_t *inter, ptrdiff_t inter_stride, int inter_weight);
+    /* put_gpm takes two int16_t buffers sharing tmp_stride and a uint8_t weights table with step_x and step_y; NOTE(review): presumably a weighted blend of tmp and tmp1 - confirm. */
+    void (*put_gpm)(uint8_t *dst, ptrdiff_t dst_stride, int width, int height,
+        const int16_t *tmp, const int16_t *tmp1, const ptrdiff_t tmp_stride,
+        const uint8_t *weights, int step_x, int step_y);
+    /* Sample fetch helpers with an int16_t dst and a uint8_t src; the bdof variant also takes the width and height. */
+    void (*fetch_samples)(int16_t *dst, const uint8_t *src, ptrdiff_t src_stride, int x_frac, int y_frac);
+    void (*bdof_fetch_samples)(int16_t *dst, const uint8_t *src, ptrdiff_t src_stride, int x_frac, int y_frac,
+        int width, int height);
+    /* PROF helpers: a gradient filter producing gradient_h and gradient_v, and apply functions that take diff_mv_x and diff_mv_y pointers. */
+    void (*prof_grad_filter)(int16_t *gradient_h, int16_t *gradient_v, const ptrdiff_t gradient_stride,
+        const int16_t *src, const ptrdiff_t src_stride, int width, int height, const int pad);
+    void (*apply_prof)(int16_t *dst, const int16_t *src, const int16_t *diff_mv_x, const int16_t *diff_mv_y);
+
+    void (*apply_prof_uni)(uint8_t *dst, ptrdiff_t dst_stride, const int16_t *src,
+        const int16_t *diff_mv_x, const int16_t *diff_mv_y);
+    void (*apply_prof_uni_w)(uint8_t *dst, const ptrdiff_t dst_stride, const int16_t *src,
+        const int16_t *diff_mv_x, const int16_t *diff_mv_y, int denom, int wx, int ox);
+
+    void (*apply_prof_bi)(uint8_t *dst, ptrdiff_t dst_stride, const int16_t *src0, const int16_t *src1,
+        const int16_t *diff_mv_x, const int16_t *diff_mv_y);
+    void (*apply_prof_bi_w)(uint8_t *dst, ptrdiff_t dst_stride, const int16_t *src0, const int16_t *src1,
+        const int16_t *diff_mv_x, const int16_t *diff_mv_y, int denom, int w0, int w1, int o0, int o1);
+
+    void (*apply_bdof)(uint8_t *dst, ptrdiff_t dst_stride, int16_t *src0, int16_t *src1, int block_w, int block_h);
+    /* sad is set to vvc_sad by the init code; dmvr[a][b] receives the plain luma function at [0][0], the _h variant at [0][1], the _v variant at [1][0] and the _hv variant at [1][1]. */
+    int (*sad)(const int16_t *src0, const int16_t *src1, int dx, int dy, int block_w, int block_h);
+    void (*dmvr[2][2])(int16_t *dst, const uint8_t *src, ptrdiff_t src_stride, int height,
+        intptr_t mx, intptr_t my, int width);
+} VVCInterDSPContext;
+
+struct VVCLocalContext;
+/* Function-pointer table for intra prediction; the first three entries take the VVCLocalContext that is only forward-declared above. */
+typedef struct VVCIntraDSPContext {
+    void (*intra_cclm_pred)(const struct VVCLocalContext *lc, int x0, int y0, int w, int h);
+    void (*lmcs_scale_chroma)(struct VVCLocalContext *lc, int *dst, const int *coeff, int w, int h, int x0_cu, int y0_cu);
+    void (*intra_pred)(const struct VVCLocalContext *lc, int x0, int y0, int w, int h, int c_idx);
+    void (*pred_planar)(uint8_t *src, const uint8_t *top, const uint8_t *left, int w, int h, ptrdiff_t stride);
+    void (*pred_mip)(uint8_t *src, const uint8_t *top, const uint8_t *left, int w, int h, ptrdiff_t stride,
+        int mode_id, int is_transpose);
+    void (*pred_dc)(uint8_t *src, const uint8_t *top, const uint8_t *left, int w, int h, ptrdiff_t stride);
+    void (*pred_v)(uint8_t *src, const uint8_t *_top, int w, int h, ptrdiff_t stride);
+    void (*pred_h)(uint8_t *src, const uint8_t *_left, int w, int h, ptrdiff_t stride);
+    void (*pred_angular_v)(uint8_t *src, const uint8_t *_top, const uint8_t *_left,
+        int w, int h, ptrdiff_t stride, int c_idx, int mode, int ref_idx, int filter_flag, int need_pdpc);
+    void (*pred_angular_h)(uint8_t *src, const uint8_t *_top, const uint8_t *_left, int w, int h, ptrdiff_t stride,
+        int c_idx, int mode, int ref_idx, int filter_flag, int need_pdpc);
+} VVCIntraDSPContext;
+/* Residual and transform entry points; itx is indexed [enum TxType][enum TxSize]. */
+typedef struct VVCItxDSPContext {
+    void (*add_residual)(uint8_t *dst, const int *res, int width, int height, ptrdiff_t stride);
+    void (*add_residual_joint)(uint8_t *dst, const int *res, int width, int height, ptrdiff_t stride, int c_sign, int shift);
+    void (*pred_residual_joint)(int *buf, int width, int height, int c_sign, int shift);
+
+    void (*itx[N_TX_TYPE][N_TX_SIZE])(int *out, ptrdiff_t out_step, const int *in, ptrdiff_t in_step);
+    void (*transform_bdpcm)(int *coeffs, int width, int height, int vertical, int depth);
+} VVCItxDSPContext;
+/* LMCS table: a single filter that takes a uint8_t lookup table (lut). */
+typedef struct VVCLMCSDSPContext {
+    void (*filter)(uint8_t *dst, ptrdiff_t dst_stride, int width, int height, const uint8_t *lut);
+} VVCLMCSDSPContext;
+/* Edge filters for luma and chroma plus ladf_level; each member has one entry per direction (h, v). */
+typedef struct VVCLFDSPContext {
+    int (*ladf_level[2 /* h, v */])(const uint8_t *pix, ptrdiff_t stride);
+
+    void (*filter_luma[2 /* h, v */])(uint8_t *pix, ptrdiff_t stride, int beta, int32_t tc,
+        uint8_t no_p, uint8_t no_q, uint8_t max_len_p, uint8_t max_len_q, int hor_ctu_edge);
+    void (*filter_chroma[2 /* h, v */])(uint8_t *pix, ptrdiff_t stride, int beta, int32_t tc,
+        uint8_t no_p, uint8_t no_q, int shift, int max_len_p, int max_len_q);
+} VVCLFDSPContext;
+/* SAO entry points; SAOParams is only forward-declared because edge_restore takes a pointer to it. */
+struct SAOParams;
+typedef struct VVCSAODSPContext {
+    void (*band_filter[9])(uint8_t *dst, uint8_t *src, ptrdiff_t dst_stride, ptrdiff_t src_stride,
+        int16_t *sao_offset_val, int sao_left_class, int width, int height);
+    /* implicit src_stride parameter has value of 2 * MAX_PB_SIZE + AV_INPUT_BUFFER_PADDING_SIZE */
+    void (*edge_filter[9])(uint8_t *dst /* align 16 */, uint8_t *src /* align 32 */, ptrdiff_t dst_stride,
+        int16_t *sao_offset_val, int sao_eo_class, int width, int height);
+    void (*edge_restore[2])(uint8_t *dst, uint8_t *src, ptrdiff_t dst_stride, ptrdiff_t src_stride,
+        struct SAOParams *sao, int *borders, int width, int height, int c_idx,
+        uint8_t *vert_edge, uint8_t *horiz_edge, uint8_t *diag_edge);
+} VVCSAODSPContext;
+/* ALF entry points: filter and filter_vb are indexed by component (luma, chroma); filter_vb, filter_cc and classify take an extra vb_pos argument. */
+typedef struct VVCALFDSPContext {
+    void (*filter[2 /* luma, chroma */])(uint8_t *dst, ptrdiff_t dst_stride, const uint8_t *src, ptrdiff_t src_stride,
+        int width, int height, const int8_t *filter, const int16_t *clip);
+    void (*filter_vb[2 /* luma, chroma */])(uint8_t *dst, ptrdiff_t dst_stride, const uint8_t *src, ptrdiff_t src_stride,
+        int width, int height, const int8_t *filter, const int16_t *clip, int vb_pos);
+    void (*filter_cc)(uint8_t *dst, ptrdiff_t dst_stride, const uint8_t *luma, ptrdiff_t luma_stride,
+        int width, int height, int hs, int vs, const int8_t *filter, int vb_pos);
+
+    void (*classify)(int *class_idx, int *transpose_idx, const uint8_t *src, ptrdiff_t src_stride, int width, int height,
+        int vb_pos, int *gradient_tmp);
+    void (*recon_coeff_and_clip)(int8_t *coeff, int16_t *clip, const int *class_idx, const int *transpose_idx, int size,
+        const int8_t *coeff_set, const uint8_t *clip_idx_set, const uint8_t *class_to_filt);
+} VVCALFDSPContext;
+/* Aggregate of the per-stage tables; the vvcdec.h hunk of this patch embeds one in VVCFrameContext as the vvcdsp member. */
+typedef struct VVCDSPContext {
+    VVCInterDSPContext inter;
+    VVCIntraDSPContext intra;
+    VVCItxDSPContext itx;
+    VVCLMCSDSPContext lmcs;
+    VVCLFDSPContext lf;
+    VVCSAODSPContext sao;
+    VVCALFDSPContext alf;
+} VVCDSPContext;
+/* NOTE(review): presumably fills every table of *hpc for the given bit_depth; the definition is not part of this patch - confirm. */
+void ff_vvc_dsp_init(VVCDSPContext *hpc, int bit_depth);
+/* Filter coefficient tables; only declared here, defined outside this header. */
+extern const int8_t ff_vvc_chroma_filters[3][32][4];
+extern const int8_t ff_vvc_luma_filters[3][16][8];
+extern const int8_t ff_vvc_dmvr_filters[16][2];
+
+#endif /* AVCODEC_VVCDSP_H */

[FFmpeg-devel,07/14] vvcdec: add inter prediction

Checks

Commit Message

Patch