Message ID | tencent_4E27A05F4DB04FAD552AA701B8CD16274D07@qq.com
---|---
State | New
Series | [FFmpeg-devel,1/4] aarch64/vvc: Add w_avg
Context | Check | Description |
---|---|---|
yinshiyou/make_loongarch64 | success | Make finished |
yinshiyou/make_fate_loongarch64 | success | Make fate finished |
andriy/make_x86 | success | Make finished |
andriy/make_fate_x86 | success | Make fate finished |
Drop patch 2/4 for now. It needs more polish. See patch v2:
https://ffmpeg.org/pipermail/ffmpeg-devel/2024-September/333800.html

> On Sep 22, 2024, at 01:41, Zhao Zhili <quinkblack@foxmail.com> wrote:
>
> [...]
On Sun, Sep 22, 2024 at 1:42 AM Zhao Zhili <quinkblack@foxmail.com> wrote:
> From: Zhao Zhili <zhilizhao@tencent.com>
>
> apply_bdof_8_8x16_c:      18.7 ( 1.00x)
> apply_bdof_8_8x16_neon:    9.7 ( 1.93x)
> [...]

Hi Zhili,

Thank you for the patch.

AVX2 can achieve a 10-20x performance increase for a width of 16, as
demonstrated in this commit:
https://github.com/FFmpeg/FFmpeg/commit/7175544c0bab30c12c24a2c440bff40a28ea83d3
Considering that NEON operates on 128-bit vectors, a more reasonable
expectation would be a 5-10x speedup.
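For readers cross-checking the scalar sign logic mirrored by the csetm/csinc
and cset/csinv pairs in the diff below: they compute a three-way sign, along
the lines of this C helper (a sketch only; the actual VVC_SIGN macro in
libavcodec may be named and defined differently):

    // Sketch of the three-way sign the assembly comments call VVC_SIGN:
    // returns 1 for positive, -1 for negative, 0 for zero.
    static inline int vvc_sign(int v)
    {
        return (v > 0) - (v < 0);
    }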
diff --git a/libavcodec/aarch64/vvc/dsp_init.c b/libavcodec/aarch64/vvc/dsp_init.c
index b39ebb83fc..03a4c62310 100644
--- a/libavcodec/aarch64/vvc/dsp_init.c
+++ b/libavcodec/aarch64/vvc/dsp_init.c
@@ -27,16 +27,22 @@
 #include "libavcodec/vvc/dec.h"
 #include "libavcodec/vvc/ctu.h"
 
+#define BDOF_BLOCK_SIZE 16
+#define BDOF_MIN_BLOCK_SIZE 4
+
 #define BIT_DEPTH 8
 #include "alf_template.c"
+#include "of_template.c"
 #undef BIT_DEPTH
 
 #define BIT_DEPTH 10
 #include "alf_template.c"
+#include "of_template.c"
 #undef BIT_DEPTH
 
 #define BIT_DEPTH 12
 #include "alf_template.c"
+#include "of_template.c"
 #undef BIT_DEPTH
 
 int ff_vvc_sad_neon(const int16_t *src0, const int16_t *src1, int dx, int dy,
@@ -155,6 +161,7 @@ void ff_vvc_dsp_init_aarch64(VVCDSPContext *const c, const int bd)
 
         c->inter.avg = ff_vvc_avg_8_neon;
         c->inter.w_avg = vvc_w_avg_8;
+        c->inter.apply_bdof = apply_bdof_8;
 
         for (int i = 0; i < FF_ARRAY_ELEMS(c->sao.band_filter); i++)
             c->sao.band_filter[i] = ff_h26x_sao_band_filter_8x8_8_neon;
@@ -196,12 +203,14 @@ void ff_vvc_dsp_init_aarch64(VVCDSPContext *const c, const int bd)
     } else if (bd == 10) {
         c->inter.avg = ff_vvc_avg_10_neon;
         c->inter.w_avg = vvc_w_avg_10;
+        c->inter.apply_bdof = apply_bdof_10;
 
         c->alf.filter[LUMA] = alf_filter_luma_10_neon;
         c->alf.filter[CHROMA] = alf_filter_chroma_10_neon;
     } else if (bd == 12) {
         c->inter.avg = ff_vvc_avg_12_neon;
         c->inter.w_avg = vvc_w_avg_12;
+        c->inter.apply_bdof = apply_bdof_12;
 
         c->alf.filter[LUMA] = alf_filter_luma_12_neon;
         c->alf.filter[CHROMA] = alf_filter_chroma_12_neon;
diff --git a/libavcodec/aarch64/vvc/inter.S b/libavcodec/aarch64/vvc/inter.S
index 49e1050aee..8cfacef44f 100644
--- a/libavcodec/aarch64/vvc/inter.S
+++ b/libavcodec/aarch64/vvc/inter.S
@@ -21,6 +21,8 @@
 #include "libavutil/aarch64/asm.S"
 
 #define VVC_MAX_PB_SIZE 128
+#define BDOF_BLOCK_SIZE 16
+#define BDOF_MIN_BLOCK_SIZE 4
 
 .macro vvc_avg type, bit_depth
 
@@ -211,6 +213,13 @@ function ff_vvc_\type\()_\bit_depth\()_neon, export=1
 32:
         ret
 endfunc
+
+.unreq dst
+.unreq dst_stride
+.unreq src0
+.unreq src1
+.unreq width
+.unreq height
 .endm
 
 vvc_avg avg, 8
@@ -219,3 +228,345 @@ vvc_avg avg, 12
 vvc_avg w_avg, 8
 vvc_avg w_avg, 10
 vvc_avg w_avg, 12
+
+function ff_vvc_prof_grad_filter_8x_neon, export=1
+        gh              .req x0
+        gv              .req x1
+        gstride         .req x2
+        src             .req x3
+        src_stride      .req x4
+        width           .req w5
+        height          .req w6
+
+        lsl             src_stride, src_stride, #1
+        neg             x7, src_stride
+1:
+        mov             x10, src
+        mov             w11, width
+        mov             x12, gh
+        mov             x13, gv
+2:
+        ldur            q0, [x10, #2]
+        ldur            q1, [x10, #-2]
+        subs            w11, w11, #8
+        ldr             q2, [x10, src_stride]
+        ldr             q3, [x10, x7]
+        sshr            v0.8h, v0.8h, #6
+        sshr            v1.8h, v1.8h, #6
+        sshr            v2.8h, v2.8h, #6
+        sshr            v3.8h, v3.8h, #6
+        sub             v0.8h, v0.8h, v1.8h
+        sub             v2.8h, v2.8h, v3.8h
+        st1             {v0.8h}, [x12], #16
+        st1             {v2.8h}, [x13], #16
+        add             x10, x10, #16
+        b.ne            2b
+
+        subs            height, height, #1
+        add             gh, gh, gstride, lsl #1
+        add             gv, gv, gstride, lsl #1
+        add             src, src, src_stride
+        b.ne            1b
+        ret
+
+.unreq gh
+.unreq gv
+.unreq gstride
+.unreq src
+.unreq src_stride
+.unreq width
+.unreq height
+
+endfunc
+
+.macro vvc_apply_bdof_min_block bit_depth
+        dst             .req x0
+        dst_stride      .req x1
+        src0            .req x2
+        src1            .req x3
+        gh              .req x4
+        gv              .req x5
+        vx              .req w6
+        vy              .req w7
+
+        dup             v0.4h, vx
+        dup             v1.4h, vy
+        movi            v7.4s, #(1 << (14 - \bit_depth))
+        ldp             x8, x9, [gh]
+        ldp             x10, x11, [gv]
+        mov             x12, #(BDOF_BLOCK_SIZE * 2)
+        mov             w13, #(BDOF_MIN_BLOCK_SIZE)
+        mov             x14, #(VVC_MAX_PB_SIZE * 2)
+.if \bit_depth >= 10
+        // clip pixel
+        mov             w15, #((1 << \bit_depth) - 1)
+        movi            v18.8h, #0
+        lsl             dst_stride, dst_stride, #1
+        dup             v17.8h, w15
+.endif
+1:
+        ld1             {v2.4h}, [x8], x12
+        ld1             {v3.4h}, [x9], x12
+        ld1             {v4.4h}, [x10], x12
+        ld1             {v5.4h}, [x11], x12
+        sub             v2.4h, v2.4h, v3.4h
+        sub             v4.4h, v4.4h, v5.4h
+        smull           v2.4s, v0.4h, v2.4h
+        smlal           v2.4s, v1.4h, v4.4h
+
+        ld1             {v5.4h}, [src0], x14
+        ld1             {v6.4h}, [src1], x14
+        saddl           v5.4s, v5.4h, v6.4h
+        add             v5.4s, v5.4s, v7.4s
+        add             v5.4s, v5.4s, v2.4s
+        sqshrn          v5.4h, v5.4s, #(15 - \bit_depth)
+        subs            w13, w13, #1
+.if \bit_depth == 8
+        sqxtun          v5.8b, v5.8h
+        str             s5, [dst]
+        add             dst, dst, dst_stride
+.else
+        smin            v5.4h, v5.4h, v17.4h
+        smax            v5.4h, v5.4h, v18.4h
+        st1             {v5.4h}, [dst], dst_stride
+.endif
+        b.ne            1b
+        ret
+
+.unreq dst
+.unreq dst_stride
+.unreq src0
+.unreq src1
+.unreq gh
+.unreq gv
+.unreq vx
+.unreq vy
+.endm
+
+function ff_vvc_apply_bdof_min_block_8_neon, export=1
+        vvc_apply_bdof_min_block 8
+endfunc
+
+function ff_vvc_apply_bdof_min_block_10_neon, export=1
+        vvc_apply_bdof_min_block 10
+endfunc
+
+function ff_vvc_apply_bdof_min_block_12_neon, export=1
+        vvc_apply_bdof_min_block 12
+endfunc
+
+.macro derive_bdof_vx_vy_x_begin_end
+        ldrh            w19, [x14, x16, lsl #1]         // load from src0
+        ldrh            w20, [x15, x16, lsl #1]         // load from src1
+        sxth            w19, w19
+        sxth            w20, w20
+        asr             w19, w19, #4
+        asr             w20, w20, #4
+        sub             w19, w19, w20                   // diff
+        add             x17, x16, x13, lsl #4           // idx
+        ldrh            w3, [gh0, x17, lsl #1]          // load from gh0
+        ldrh            w4, [gh1, x17, lsl #1]          // load from gh1
+        sxth            w3, w3
+        sxth            w4, w4
+        ldrh            w22, [gv0, x17, lsl #1]         // load from gv0
+        ldrh            w23, [gv1, x17, lsl #1]         // load from gv1
+        add             w3, w3, w4
+        asr             w21, w3, #1                     // temph
+        sxth            w3, w22
+        sxth            w4, w23
+        add             w3, w3, w4
+        cmp             w21, #0
+        asr             w22, w3, #1                     // tempv
+        cneg            w20, w21, mi
+        csetm           w23, ne
+        csinc           w23, w23, wzr, ge               // -VVC_SIGN(temph)
+        cmp             w22, #0
+        add             sgx2, sgx2, w20
+        cneg            w20, w22, mi
+        cset            w24, ne
+        csinv           w24, w24, wzr, ge               // VVC_SIGN(tempv)
+        add             sgy2, sgy2, w20
+        madd            sgxgy, w24, w21, sgxgy
+        madd            sgxdi, w23, w19, sgxdi
+        csetm           w24, ne
+        csinc           w24, w24, wzr, ge               // -VVC_SIGN(tempv)
+        madd            sgydi, w24, w19, sgydi
+.endm
+
+function ff_vvc_derive_bdof_vx_vy_neon, export=1
+        src0            .req x0
+        src1            .req x1
+        pad_mask        .req w2
+        gh              .req x3
+        gv              .req x4
+        gh0             .req x27
+        gh1             .req x28
+        gv0             .req x25
+        gv1             .req x26
+        vx              .req x5
+        vy              .req x6
+        sgx2            .req w7
+        sgy2            .req w8
+        sgxgy           .req w9
+        sgxdi           .req w10
+        sgydi           .req w11
+        y               .req x12
+
+        stp             x27, x28, [sp, #-80]!
+        stp             x25, x26, [sp, #16]
+        stp             x23, x24, [sp, #32]
+        stp             x21, x22, [sp, #48]
+        stp             x19, x20, [sp, #64]
+
+        ldp             gh0, gh1, [gh]
+        mov             sgx2, #0
+        mov             sgy2, #0
+        mov             sgxgy, #0
+        mov             sgxdi, #0
+        mov             sgydi, #0
+        ldp             gv0, gv1, [gv]
+
+        mov             y, #-1
+        mov             x13, #-1                        // dy
+        tst             pad_mask, #2
+        b.eq            1f
+        mov             x13, #0                         // dy: pad top
+1:
+        add             x14, src0, x13, lsl #8          // local src0
+        add             x15, src1, x13, lsl #8          // local src1
+
+        // x = -1
+        mov             x16, #-1                        // dx
+        tst             pad_mask, #1
+        b.eq            2f
+        mov             x16, #0
+2:
+        derive_bdof_vx_vy_x_begin_end
+
+        // x = 0 to BDOF_MIN_BLOCK_SIZE - 1
+        ldr             d0, [x14]
+        ldr             d1, [x15]
+        lsl             x19, x13, #5
+        ldr             d2, [gh0, x19]
+        ldr             d3, [gh1, x19]
+        sshr            v0.4h, v0.4h, #4
+        sshr            v1.4h, v1.4h, #4
+        ssubl           v0.4s, v0.4h, v1.4h             // diff
+        ldr             d4, [gv0, x19]
+        ldr             d5, [gv1, x19]
+        saddl           v2.4s, v2.4h, v3.4h
+        saddl           v4.4s, v4.4h, v5.4h
+        sshr            v2.4s, v2.4s, #1                // temph
+        sshr            v4.4s, v4.4s, #1                // tempv
+        abs             v3.4s, v2.4s
+        abs             v5.4s, v4.4s
+        addv            s3, v3.4s
+        addv            s5, v5.4s
+        mov             w19, v3.s[0]
+        mov             w20, v5.s[0]
+        add             sgx2, sgx2, w19
+        add             sgy2, sgy2, w20
+
+        movi            v5.4s, #1
+        cmgt            v17.4s, v4.4s, #0               // mask > 0
+        cmlt            v18.4s, v4.4s, #0               // mask < 0
+        and             v17.16b, v17.16b, v5.16b
+        and             v18.16b, v18.16b, v5.16b
+        neg             v19.4s, v18.4s
+        add             v20.4s, v17.4s, v19.4s          // VVC_SIGN(tempv)
+        smull           v21.2d, v20.2s, v2.2s
+        smlal2          v21.2d, v20.4s, v2.4s
+        addp            d21, v21.2d
+        mov             w19, v21.s[0]
+        add             sgxgy, sgxgy, w19
+
+        smull           v16.2d, v20.2s, v0.2s
+        smlal2          v16.2d, v20.4s, v0.4s
+        addp            d16, v16.2d
+        mov             w19, v16.s[0]
+        sub             sgydi, sgydi, w19
+
+        cmgt            v17.4s, v2.4s, #0
+        cmlt            v18.4s, v2.4s, #0
+        and             v17.16b, v17.16b, v5.16b
+        and             v18.16b, v18.16b, v5.16b
+        neg             v21.4s, v17.4s
+        add             v16.4s, v21.4s, v18.4s          // -VVC_SIGN(temph)
+        smull           v20.2d, v16.2s, v0.2s
+        smlal2          v20.2d, v16.4s, v0.4s
+        addp            d20, v20.2d
+        mov             w19, v20.s[0]
+        add             sgxdi, sgxdi, w19
+
+        // x = BDOF_MIN_BLOCK_SIZE
+        mov             x16, #BDOF_MIN_BLOCK_SIZE       // dx
+        tst             pad_mask, #4
+        b.eq            3f
+        mov             x16, #(BDOF_MIN_BLOCK_SIZE - 1)
+3:
+        derive_bdof_vx_vy_x_begin_end
+
+        add             y, y, #1
+        cmp             y, #(BDOF_MIN_BLOCK_SIZE)
+        mov             x13, y
+        b.gt            4f
+        b.lt            1b
+        tst             pad_mask, #8
+        b.eq            1b
+        sub             x13, x13, #1                    // pad bottom
+        b               1b
+4:
+        mov             w3, #31
+        mov             w14, #0
+        mov             w16, #-15
+        mov             w17, #15
+        cbz             sgx2, 5f
+        clz             w12, sgx2
+        lsl             sgxdi, sgxdi, #2
+        sub             w13, w3, w12                    // log2(sgx2)
+        asr             sgxdi, sgxdi, w13
+        cmp             sgxdi, w16
+        csel            w14, w16, sgxdi, lt             // clip to -15
+        b.le            5f
+        cmp             sgxdi, w17
+        csel            w14, w17, sgxdi, gt             // clip to 15
+5:
+        str             w14, [vx]
+
+        mov             w15, #0
+        cbz             sgy2, 6f
+        lsl             sgydi, sgydi, #2
+        smull           x14, w14, sgxgy
+        asr             w14, w14, #1
+        sub             sgydi, sgydi, w14
+        clz             w12, sgy2
+        sub             w13, w3, w12                    // log2(sgy2)
+        asr             sgydi, sgydi, w13
+        cmp             sgydi, w16
+        csel            w15, w16, sgydi, lt             // clip to -15
+        b.le            6f
+        cmp             sgydi, w17
+        csel            w15, w17, sgydi, gt             // clip to 15
+6:
+        str             w15, [vy]
+        ldp             x25, x26, [sp, #16]
+        ldp             x23, x24, [sp, #32]
+        ldp             x21, x22, [sp, #48]
+        ldp             x19, x20, [sp, #64]
+        ldp             x27, x28, [sp], #80
+        ret
+.unreq src0
+.unreq src1
+.unreq pad_mask
+.unreq gh
+.unreq gv
+.unreq vx
+.unreq vy
+.unreq sgx2
+.unreq sgy2
+.unreq sgxgy
+.unreq sgxdi
+.unreq sgydi
+.unreq y
+endfunc
+
diff --git a/libavcodec/aarch64/vvc/of_template.c b/libavcodec/aarch64/vvc/of_template.c
new file mode 100644
index 0000000000..508ea6d99d
--- /dev/null
+++ b/libavcodec/aarch64/vvc/of_template.c
@@ -0,0 +1,70 @@
+/*
+ * Copyright (c) 2024 Zhao Zhili <quinkblack@foxmail.com>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "libavcodec/bit_depth_template.c"
+
+void ff_vvc_prof_grad_filter_8x_neon(int16_t *gradient_h,
+                                     int16_t *gradient_v,
+                                     const ptrdiff_t gradient_stride,
+                                     const int16_t *_src,
+                                     const ptrdiff_t src_stride,
+                                     const int width, const int height);
+
+void ff_vvc_derive_bdof_vx_vy_neon(
+    const int16_t *_src0, const int16_t *_src1, int pad_mask,
+    const int16_t **gradient_h, const int16_t **gradient_v,
+    int *vx, int *vy);
+
+void FUNC2(ff_vvc_apply_bdof_min_block, BIT_DEPTH, _neon)(pixel* dst,
+    const ptrdiff_t dst_stride, const int16_t *src0, const int16_t *src1,
+    const int16_t **gh, const int16_t **gv, const int vx, const int vy);
+
+static void FUNC(apply_bdof)(uint8_t *_dst, const ptrdiff_t _dst_stride,
+                             const int16_t *_src0, const int16_t *_src1,
+                             const int block_w, const int block_h)
+{
+    int16_t gradient_h[2][BDOF_BLOCK_SIZE * BDOF_BLOCK_SIZE];
+    int16_t gradient_v[2][BDOF_BLOCK_SIZE * BDOF_BLOCK_SIZE];
+    int vx, vy;
+    const ptrdiff_t dst_stride = _dst_stride / sizeof(pixel);
+    pixel* dst = (pixel*)_dst;
+
+    ff_vvc_prof_grad_filter_8x_neon(gradient_h[0], gradient_v[0], BDOF_BLOCK_SIZE,
+                                    _src0, MAX_PB_SIZE, block_w, block_h);
+    ff_vvc_prof_grad_filter_8x_neon(gradient_h[1], gradient_v[1], BDOF_BLOCK_SIZE,
+                                    _src1, MAX_PB_SIZE, block_w, block_h);
+
+    for (int y = 0; y < block_h; y += BDOF_MIN_BLOCK_SIZE) {
+        for (int x = 0; x < block_w; x += BDOF_MIN_BLOCK_SIZE) {
+            const int16_t* src0 = _src0 + y * MAX_PB_SIZE + x;
+            const int16_t* src1 = _src1 + y * MAX_PB_SIZE + x;
+            pixel *d = dst + x;
+            const int idx = BDOF_BLOCK_SIZE * y + x;
+            const int16_t* gh[] = { gradient_h[0] + idx, gradient_h[1] + idx };
+            const int16_t* gv[] = { gradient_v[0] + idx, gradient_v[1] + idx };
+            const int pad_mask = !x | ((!y) << 1) |
+                ((x + BDOF_MIN_BLOCK_SIZE == block_w) << 2) |
+                ((y + BDOF_MIN_BLOCK_SIZE == block_h) << 3);
+            ff_vvc_derive_bdof_vx_vy_neon(src0, src1, pad_mask, gh, gv, &vx, &vy);
+            FUNC2(ff_vvc_apply_bdof_min_block, BIT_DEPTH, _neon)(d, dst_stride, src0, src1, gh, gv, vx, vy);
+        }
+        dst += BDOF_MIN_BLOCK_SIZE * dst_stride;
+    }
+}
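As a reading aid, the per-sample arithmetic that vvc_apply_bdof_min_block
performs corresponds to roughly the following scalar C. This is a sketch
inferred from the NEON sequence above (sqshrn followed by sqxtun or
smin/smax); the function and parameter names are illustrative, not part of
the patch:

    #include "libavutil/common.h"   // for av_clip()

    // One output sample of a 4x4 BDOF min-block at bit depth bd:
    // bi-prediction sum plus a gradient correction, then shift and clip.
    static int bdof_sample(int s0, int s1, int gh0, int gh1,
                           int gv0, int gv1, int vx, int vy, int bd)
    {
        int grad = vx * (gh0 - gh1) + vy * (gv0 - gv1); // gradient correction
        int val  = s0 + s1 + grad + (1 << (14 - bd));   // sum + rounding offset
        return av_clip(val >> (15 - bd), 0, (1 << bd) - 1);
    }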
From: Zhao Zhili <zhilizhao@tencent.com>

apply_bdof_8_8x16_c:      18.7 ( 1.00x)
apply_bdof_8_8x16_neon:    9.7 ( 1.93x)
apply_bdof_8_16x8_c:      20.0 ( 1.00x)
apply_bdof_8_16x8_neon:    9.5 ( 2.11x)
apply_bdof_8_16x16_c:     36.7 ( 1.00x)
apply_bdof_8_16x16_neon:  19.0 ( 1.94x)
apply_bdof_10_8x16_c:     18.0 ( 1.00x)
apply_bdof_10_8x16_neon:  10.0 ( 1.80x)
apply_bdof_10_16x8_c:     18.0 ( 1.00x)
apply_bdof_10_16x8_neon:   9.5 ( 1.90x)
apply_bdof_10_16x16_c:    35.5 ( 1.00x)
apply_bdof_10_16x16_neon: 19.0 ( 1.87x)
apply_bdof_12_8x16_c:     17.5 ( 1.00x)
apply_bdof_12_8x16_neon:   9.7 ( 1.80x)
apply_bdof_12_16x8_c:     18.2 ( 1.00x)
apply_bdof_12_16x8_neon:   9.5 ( 1.92x)
apply_bdof_12_16x16_c:    34.5 ( 1.00x)
apply_bdof_12_16x16_neon: 18.7 ( 1.84x)
---
 libavcodec/aarch64/vvc/dsp_init.c    |   9 +
 libavcodec/aarch64/vvc/inter.S       | 351 +++++++++++++++++++++++++++
 libavcodec/aarch64/vvc/of_template.c |  70 ++++++
 3 files changed, 430 insertions(+)
 create mode 100644 libavcodec/aarch64/vvc/of_template.c