
[FFmpeg-devel,2/4] aarch64/vvc: Add apply_bdof

Message ID tencent_4E27A05F4DB04FAD552AA701B8CD16274D07@qq.com
State New
Series [FFmpeg-devel,1/4] aarch64/vvc: Add w_avg

Checks

Context                          Check    Description
yinshiyou/make_loongarch64       success  Make finished
yinshiyou/make_fate_loongarch64  success  Make fate finished
andriy/make_x86                  success  Make finished
andriy/make_fate_x86             success  Make fate finished

Commit Message

Zhao Zhili Sept. 21, 2024, 5:41 p.m. UTC
From: Zhao Zhili <zhilizhao@tencent.com>

apply_bdof_8_8x16_c:                                    18.7 ( 1.00x)
apply_bdof_8_8x16_neon:                                  9.7 ( 1.93x)
apply_bdof_8_16x8_c:                                    20.0 ( 1.00x)
apply_bdof_8_16x8_neon:                                  9.5 ( 2.11x)
apply_bdof_8_16x16_c:                                   36.7 ( 1.00x)
apply_bdof_8_16x16_neon:                                19.0 ( 1.94x)
apply_bdof_10_8x16_c:                                   18.0 ( 1.00x)
apply_bdof_10_8x16_neon:                                10.0 ( 1.80x)
apply_bdof_10_16x8_c:                                   18.0 ( 1.00x)
apply_bdof_10_16x8_neon:                                 9.5 ( 1.90x)
apply_bdof_10_16x16_c:                                  35.5 ( 1.00x)
apply_bdof_10_16x16_neon:                               19.0 ( 1.87x)
apply_bdof_12_8x16_c:                                   17.5 ( 1.00x)
apply_bdof_12_8x16_neon:                                 9.7 ( 1.80x)
apply_bdof_12_16x8_c:                                   18.2 ( 1.00x)
apply_bdof_12_16x8_neon:                                 9.5 ( 1.92x)
apply_bdof_12_16x16_c:                                  34.5 ( 1.00x)
apply_bdof_12_16x16_neon:                               18.7 ( 1.84x)
---
 libavcodec/aarch64/vvc/dsp_init.c    |   9 +
 libavcodec/aarch64/vvc/inter.S       | 351 +++++++++++++++++++++++++++
 libavcodec/aarch64/vvc/of_template.c |  70 ++++++
 3 files changed, 430 insertions(+)
 create mode 100644 libavcodec/aarch64/vvc/of_template.c

Comments

Zhao Zhili Sept. 23, 2024, 9:09 a.m. UTC | #1
Drop patch 2/4 for now. It needs more polish.

See patch v2:

https://ffmpeg.org/pipermail/ffmpeg-devel/2024-September/333800.html

> On Sep 22, 2024, at 01:41, Zhao Zhili <quinkblack@foxmail.com> wrote:
> 
> From: Zhao Zhili <zhilizhao@tencent.com>
> 
> apply_bdof_8_8x16_c:                                    18.7 ( 1.00x)
> apply_bdof_8_8x16_neon:                                  9.7 ( 1.93x)
> apply_bdof_8_16x8_c:                                    20.0 ( 1.00x)
> apply_bdof_8_16x8_neon:                                  9.5 ( 2.11x)
> apply_bdof_8_16x16_c:                                   36.7 ( 1.00x)
> apply_bdof_8_16x16_neon:                                19.0 ( 1.94x)
> apply_bdof_10_8x16_c:                                   18.0 ( 1.00x)
> apply_bdof_10_8x16_neon:                                10.0 ( 1.80x)
> apply_bdof_10_16x8_c:                                   18.0 ( 1.00x)
> apply_bdof_10_16x8_neon:                                 9.5 ( 1.90x)
> apply_bdof_10_16x16_c:                                  35.5 ( 1.00x)
> apply_bdof_10_16x16_neon:                               19.0 ( 1.87x)
> apply_bdof_12_8x16_c:                                   17.5 ( 1.00x)
> apply_bdof_12_8x16_neon:                                 9.7 ( 1.80x)
> apply_bdof_12_16x8_c:                                   18.2 ( 1.00x)
> apply_bdof_12_16x8_neon:                                 9.5 ( 1.92x)
> apply_bdof_12_16x16_c:                                  34.5 ( 1.00x)
> apply_bdof_12_16x16_neon:                               18.7 ( 1.84x)
> ---
> libavcodec/aarch64/vvc/dsp_init.c    |   9 +
> libavcodec/aarch64/vvc/inter.S       | 351 +++++++++++++++++++++++++++
> libavcodec/aarch64/vvc/of_template.c |  70 ++++++
> 3 files changed, 430 insertions(+)
> create mode 100644 libavcodec/aarch64/vvc/of_template.c
> 
> diff --git a/libavcodec/aarch64/vvc/dsp_init.c b/libavcodec/aarch64/vvc/dsp_init.c
> index b39ebb83fc..03a4c62310 100644
> --- a/libavcodec/aarch64/vvc/dsp_init.c
> +++ b/libavcodec/aarch64/vvc/dsp_init.c
> @@ -27,16 +27,22 @@
> #include "libavcodec/vvc/dec.h"
> #include "libavcodec/vvc/ctu.h"
> 
> +#define BDOF_BLOCK_SIZE         16
> +#define BDOF_MIN_BLOCK_SIZE     4
> +
> #define BIT_DEPTH 8
> #include "alf_template.c"
> +#include "of_template.c"
> #undef BIT_DEPTH
> 
> #define BIT_DEPTH 10
> #include "alf_template.c"
> +#include "of_template.c"
> #undef BIT_DEPTH
> 
> #define BIT_DEPTH 12
> #include "alf_template.c"
> +#include "of_template.c"
> #undef BIT_DEPTH
> 
> int ff_vvc_sad_neon(const int16_t *src0, const int16_t *src1, int dx, int dy,
> @@ -155,6 +161,7 @@ void ff_vvc_dsp_init_aarch64(VVCDSPContext *const c, const int bd)
> 
>         c->inter.avg = ff_vvc_avg_8_neon;
>         c->inter.w_avg = vvc_w_avg_8;
> +        c->inter.apply_bdof = apply_bdof_8;
> 
>         for (int i = 0; i < FF_ARRAY_ELEMS(c->sao.band_filter); i++)
>             c->sao.band_filter[i] = ff_h26x_sao_band_filter_8x8_8_neon;
> @@ -196,12 +203,14 @@ void ff_vvc_dsp_init_aarch64(VVCDSPContext *const c, const int bd)
>     } else if (bd == 10) {
>         c->inter.avg = ff_vvc_avg_10_neon;
>         c->inter.w_avg = vvc_w_avg_10;
> +        c->inter.apply_bdof = apply_bdof_10;
> 
>         c->alf.filter[LUMA] = alf_filter_luma_10_neon;
>         c->alf.filter[CHROMA] = alf_filter_chroma_10_neon;
>     } else if (bd == 12) {
>         c->inter.avg = ff_vvc_avg_12_neon;
>         c->inter.w_avg = vvc_w_avg_12;
> +        c->inter.apply_bdof = apply_bdof_12;
> 
>         c->alf.filter[LUMA] = alf_filter_luma_12_neon;
>         c->alf.filter[CHROMA] = alf_filter_chroma_12_neon;
> diff --git a/libavcodec/aarch64/vvc/inter.S b/libavcodec/aarch64/vvc/inter.S
> index 49e1050aee..8cfacef44f 100644
> --- a/libavcodec/aarch64/vvc/inter.S
> +++ b/libavcodec/aarch64/vvc/inter.S
> @@ -21,6 +21,8 @@
> #include "libavutil/aarch64/asm.S"
> 
> #define VVC_MAX_PB_SIZE 128
> +#define BDOF_BLOCK_SIZE 16
> +#define BDOF_MIN_BLOCK_SIZE 4
> 
> .macro vvc_avg type, bit_depth
> 
> @@ -211,6 +213,13 @@ function ff_vvc_\type\()_\bit_depth\()_neon, export=1
> 32:
>         ret
> endfunc
> +
> +.unreq dst
> +.unreq dst_stride
> +.unreq src0
> +.unreq src1
> +.unreq width
> +.unreq height
> .endm
> 
> vvc_avg avg, 8
> @@ -219,3 +228,345 @@ vvc_avg avg, 12
> vvc_avg w_avg, 8
> vvc_avg w_avg, 10
> vvc_avg w_avg, 12
> +
> +function ff_vvc_prof_grad_filter_8x_neon, export=1
> +        gh              .req x0
> +        gv              .req x1
> +        gstride         .req x2
> +        src             .req x3
> +        src_stride      .req x4
> +        width           .req w5
> +        height          .req w6
> +
> +        lsl             src_stride, src_stride, #1
> +        neg             x7, src_stride
> +1:
> +        mov             x10, src
> +        mov             w11, width
> +        mov             x12, gh
> +        mov             x13, gv
> +2:
> +        ldur            q0, [x10, #2]
> +        ldur            q1, [x10, #-2]
> +        subs            w11, w11, #8
> +        ldr             q2, [x10, src_stride]
> +        ldr             q3, [x10, x7]
> +        sshr            v0.8h, v0.8h, #6
> +        sshr            v1.8h, v1.8h, #6
> +        sshr            v2.8h, v2.8h, #6
> +        sshr            v3.8h, v3.8h, #6
> +        sub             v0.8h, v0.8h, v1.8h
> +        sub             v2.8h, v2.8h, v3.8h
> +        st1             {v0.8h}, [x12], #16
> +        st1             {v2.8h}, [x13], #16
> +        add             x10, x10, #16
> +        b.ne            2b
> +
> +        subs            height, height, #1
> +        add             gh, gh, gstride, lsl #1
> +        add             gv, gv, gstride, lsl #1
> +        add             src, src, src_stride
> +        b.ne            1b
> +        ret
> +
> +.unreq gh
> +.unreq gv
> +.unreq gstride
> +.unreq src
> +.unreq src_stride
> +.unreq width
> +.unreq height
> +
> +endfunc
> +
> +.macro vvc_apply_bdof_min_block bit_depth
> +        dst             .req x0
> +        dst_stride      .req x1
> +        src0            .req x2
> +        src1            .req x3
> +        gh              .req x4
> +        gv              .req x5
> +        vx              .req w6
> +        vy              .req w7
> +
> +        dup             v0.4h, vx
> +        dup             v1.4h, vy
> +        movi            v7.4s, #(1 << (14 - \bit_depth))
> +        ldp             x8, x9, [gh]
> +        ldp             x10, x11, [gv]
> +        mov             x12, #(BDOF_BLOCK_SIZE * 2)
> +        mov             w13, #(BDOF_MIN_BLOCK_SIZE)
> +        mov             x14, #(VVC_MAX_PB_SIZE * 2)
> +.if \bit_depth >= 10
> +        // clip pixel
> +        mov             w15, #((1 << \bit_depth) - 1)
> +        movi            v18.8h, #0
> +        lsl             dst_stride, dst_stride, #1
> +        dup             v17.8h, w15
> +.endif
> +1:
> +        ld1             {v2.4h}, [x8], x12
> +        ld1             {v3.4h}, [x9], x12
> +        ld1             {v4.4h}, [x10], x12
> +        ld1             {v5.4h}, [x11], x12
> +        sub             v2.4h, v2.4h, v3.4h
> +        sub             v4.4h, v4.4h, v5.4h
> +        smull           v2.4s, v0.4h, v2.4h
> +        smlal           v2.4s, v1.4h, v4.4h
> +
> +        ld1             {v5.4h}, [src0], x14
> +        ld1             {v6.4h}, [src1], x14
> +        saddl           v5.4s, v5.4h, v6.4h
> +        add             v5.4s, v5.4s, v7.4s
> +        add             v5.4s, v5.4s, v2.4s
> +        sqshrn          v5.4h, v5.4s, #(15 - \bit_depth)
> +        subs            w13, w13, #1
> +.if \bit_depth == 8
> +        sqxtun          v5.8b, v5.8h
> +        str             s5, [dst]
> +        add             dst, dst, dst_stride
> +.else
> +        smin            v5.4h, v5.4h, v17.4h
> +        smax            v5.4h, v5.4h, v18.4h
> +        st1             {v5.4h}, [dst], dst_stride
> +.endif
> +        b.ne            1b
> +        ret
> +
> +.unreq dst
> +.unreq dst_stride
> +.unreq src0
> +.unreq src1
> +.unreq gh
> +.unreq gv
> +.unreq vx
> +.unreq vy
> +.endm
> +
> +function ff_vvc_apply_bdof_min_block_8_neon, export=1
> +        vvc_apply_bdof_min_block 8
> +endfunc
> +
> +function ff_vvc_apply_bdof_min_block_10_neon, export=1
> +        vvc_apply_bdof_min_block 10
> +endfunc
> +
> +function ff_vvc_apply_bdof_min_block_12_neon, export=1
> +        vvc_apply_bdof_min_block 12
> +endfunc
> +
> +.macro derive_bdof_vx_vy_x_begin_end
> +        ldrh            w19, [x14, x16, lsl #1]     // load from src0
> +        ldrh            w20, [x15, x16, lsl #1]     // load from src1
> +        sxth            w19, w19
> +        sxth            w20, w20
> +        asr             w19, w19, #4
> +        asr             w20, w20, #4
> +        sub             w19, w19, w20               // diff
> +        add             x17, x16, x13, lsl #4       // idx
> +        ldrh            w3, [gh0, x17, lsl #1]      // load from gh0
> +        ldrh            w4, [gh1, x17, lsl #1]      // load from gh1
> +        sxth            w3, w3
> +        sxth            w4, w4
> +        ldrh            w22, [gv0, x17, lsl #1]     // load from gv0
> +        ldrh            w23, [gv1, x17, lsl #1]     // load from gv1
> +        add             w3, w3, w4
> +        asr             w21, w3, #1                 // temph
> +        sxth            w3, w22
> +        sxth            w4, w23
> +        add             w3, w3, w4
> +        cmp             w21, #0
> +        asr             w22, w3, #1                 // tempv
> +        cneg            w20, w21, mi
> +        csetm           w23, ne
> +        csinc           w23, w23, wzr, ge           // -VVC_SIGN(temph)
> +        cmp             w22, #0
> +        add             sgx2, sgx2, w20
> +        cneg            w20, w22, mi
> +        cset            w24, ne
> +        csinv           w24, w24, wzr, ge           // VVC_SIGN(tempv)
> +        add             sgy2, sgy2, w20
> +        madd            sgxgy, w24, w21, sgxgy
> +        madd            sgxdi, w23, w19, sgxdi
> +        csetm           w24, ne
> +        csinc           w24, w24, wzr, ge           // -VVC_SIGN(tempv)
> +        madd            sgydi, w24, w19, sgydi
> +.endm
> +
> +function ff_vvc_derive_bdof_vx_vy_neon, export=1
> +        src0            .req x0
> +        src1            .req x1
> +        pad_mask        .req w2
> +        gh              .req x3
> +        gv              .req x4
> +        gh0             .req x27
> +        gh1             .req x28
> +        gv0             .req x25
> +        gv1             .req x26
> +        vx              .req x5
> +        vy              .req x6
> +        sgx2            .req w7
> +        sgy2            .req w8
> +        sgxgy           .req w9
> +        sgxdi           .req w10
> +        sgydi           .req w11
> +        y               .req x12
> +
> +        stp             x27, x28, [sp, #-80]!
> +        stp             x25, x26, [sp, #16]
> +        stp             x23, x24, [sp, #32]
> +        stp             x21, x22, [sp, #48]
> +        stp             x19, x20, [sp, #64]
> +
> +        ldp             gh0, gh1, [gh]
> +        mov             sgx2, #0
> +        mov             sgy2, #0
> +        mov             sgxgy, #0
> +        mov             sgxdi, #0
> +        mov             sgydi, #0
> +        ldp             gv0, gv1, [gv]
> +
> +        mov             y, #-1
> +        mov             x13, #-1                    // dy
> +        tst             pad_mask, #2
> +        b.eq            1f
> +        mov             x13, #0                     // dy: pad top
> +1:
> +        add             x14, src0, x13, lsl #8      // local src0
> +        add             x15, src1, x13, lsl #8      // local src1
> +
> +        // x = -1
> +        mov             x16, #-1                    // dx
> +        tst             pad_mask, #1
> +        b.eq            2f
> +        mov             x16, #0
> +2:
> +        derive_bdof_vx_vy_x_begin_end
> +
> +        // x = 0 to BDOF_MIN_BLOCK_SIZE - 1
> +        ldr             d0, [x14]
> +        ldr             d1, [x15]
> +        lsl             x19, x13, #5
> +        ldr             d2, [gh0, x19]
> +        ldr             d3, [gh1, x19]
> +        sshr            v0.4h, v0.4h, #4
> +        sshr            v1.4h, v1.4h, #4
> +        ssubl           v0.4s, v0.4h, v1.4h         // diff
> +        ldr             d4, [gv0, x19]
> +        ldr             d5, [gv1, x19]
> +        saddl           v2.4s, v2.4h, v3.4h
> +        saddl           v4.4s, v4.4h, v5.4h
> +        sshr            v2.4s, v2.4s, #1            // temph
> +        sshr            v4.4s, v4.4s, #1            // tempv
> +        abs             v3.4s, v2.4s
> +        abs             v5.4s, v4.4s
> +        addv            s3, v3.4s
> +        addv            s5, v5.4s
> +        mov             w19, v3.s[0]
> +        mov             w20, v5.s[0]
> +        add             sgx2, sgx2, w19
> +        add             sgy2, sgy2, w20
> +
> +        movi            v5.4s, #1
> +        cmgt            v17.4s, v4.4s, #0           // mask > 0
> +        cmlt            v18.4s, v4.4s, #0           // mask < 0
> +        and             v17.16b, v17.16b, v5.16b
> +        and             v18.16b, v18.16b, v5.16b
> +        neg             v19.4s, v18.4s
> +        add             v20.4s, v17.4s, v19.4s      // VVC_SIGN(tempv)
> +        smull           v21.2d, v20.2s, v2.2s
> +        smlal2          v21.2d, v20.4s, v2.4s
> +        addp            d21, v21.2d
> +        mov             w19, v21.s[0]
> +        add             sgxgy, sgxgy, w19
> +
> +        smull           v16.2d, v20.2s, v0.2s
> +        smlal2          v16.2d, v20.4s, v0.4s
> +        addp            d16, v16.2d
> +        mov             w19, v16.s[0]
> +        sub             sgydi, sgydi, w19
> +
> +        cmgt            v17.4s, v2.4s, #0
> +        cmlt            v18.4s, v2.4s, #0
> +        and             v17.16b, v17.16b, v5.16b
> +        and             v18.16b, v18.16b, v5.16b
> +        neg             v21.4s, v17.4s
> +        add             v16.4s, v21.4s, v18.4s      // -VVC_SIGN(temph)
> +        smull           v20.2d, v16.2s, v0.2s
> +        smlal2          v20.2d, v16.4s, v0.4s
> +        addp            d20, v20.2d
> +        mov             w19, v20.s[0]
> +        add             sgxdi, sgxdi, w19
> +
> +        // x = BDOF_MIN_BLOCK_SIZE
> +        mov             x16, #BDOF_MIN_BLOCK_SIZE   // dx
> +        tst             pad_mask, #4
> +        b.eq            3f
> +        mov             x16, #(BDOF_MIN_BLOCK_SIZE - 1)
> +3:
> +        derive_bdof_vx_vy_x_begin_end
> +
> +        add             y, y, #1
> +        cmp             y, #(BDOF_MIN_BLOCK_SIZE)
> +        mov             x13, y
> +        b.gt            4f
> +        b.lt            1b
> +        tst             pad_mask, #8
> +        b.eq            1b
> +        sub             x13, x13, #1                // pad bottom
> +        b               1b
> +4:
> +        mov             w3, #31
> +        mov             w14, #0
> +        mov             w16, #-15
> +        mov             w17, #15
> +        cbz             sgx2, 5f
> +        clz             w12, sgx2
> +        lsl             sgxdi, sgxdi, #2
> +        sub             w13, w3, w12                // log2(sgx2)
> +        asr             sgxdi, sgxdi, w13
> +        cmp             sgxdi, w16
> +        csel            w14, w16, sgxdi, lt         // clip to -15
> +        b.le            5f
> +        cmp             sgxdi, w17
> +        csel            w14, w17, sgxdi, gt         // clip to 15
> +5:
> +        str             w14, [vx]
> +
> +        mov             w15, #0
> +        cbz             sgy2, 6f
> +        lsl             sgydi, sgydi, #2
> +        smull           x14, w14, sgxgy
> +        asr             w14, w14, #1
> +        sub             sgydi, sgydi, w14
> +        clz             w12, sgy2
> +        sub             w13, w3, w12                // log2(sgy2)
> +        asr             sgydi, sgydi, w13
> +        cmp             sgydi, w16
> +        csel            w15, w16, sgydi, lt         // clip to -15
> +        b.le            6f
> +        cmp             sgydi, w17
> +        csel            w15, w17, sgydi, gt         // clip to 15
> +6:
> +        str             w15, [vy]
> +        ldp             x25, x26, [sp, #16]
> +        ldp             x23, x24, [sp, #32]
> +        ldp             x21, x22, [sp, #48]
> +        ldp             x19, x20, [sp, #64]
> +        ldp             x27, x28, [sp], #80
> +        ret
> +.unreq src0
> +.unreq src1
> +.unreq pad_mask
> +.unreq gh
> +.unreq gv
> +.unreq vx
> +.unreq vy
> +.unreq sgx2
> +.unreq sgy2
> +.unreq sgxgy
> +.unreq sgxdi
> +.unreq sgydi
> +.unreq y
> +endfunc
> +
> diff --git a/libavcodec/aarch64/vvc/of_template.c b/libavcodec/aarch64/vvc/of_template.c
> new file mode 100644
> index 0000000000..508ea6d99d
> --- /dev/null
> +++ b/libavcodec/aarch64/vvc/of_template.c
> @@ -0,0 +1,70 @@
> +/*
> + * Copyright (c) 2024 Zhao Zhili <quinkblack@foxmail.com>
> + *
> + * This file is part of FFmpeg.
> + *
> + * FFmpeg is free software; you can redistribute it and/or
> + * modify it under the terms of the GNU Lesser General Public
> + * License as published by the Free Software Foundation; either
> + * version 2.1 of the License, or (at your option) any later version.
> + *
> + * FFmpeg is distributed in the hope that it will be useful,
> + * but WITHOUT ANY WARRANTY; without even the implied warranty of
> + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> + * Lesser General Public License for more details.
> + *
> + * You should have received a copy of the GNU Lesser General Public
> + * License along with FFmpeg; if not, write to the Free Software
> + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
> + */
> +
> +#include "libavcodec/bit_depth_template.c"
> +
> +void ff_vvc_prof_grad_filter_8x_neon(int16_t *gradient_h,
> +                                     int16_t *gradient_v,
> +                                     const ptrdiff_t gradient_stride,
> +                                     const int16_t *_src,
> +                                     const ptrdiff_t src_stride,
> +                                     const int width, const int height);
> +
> +void ff_vvc_derive_bdof_vx_vy_neon(
> +        const int16_t *_src0, const int16_t *_src1, int pad_mask,
> +        const int16_t **gradient_h, const int16_t **gradient_v,
> +        int *vx, int *vy);
> +
> +void FUNC2(ff_vvc_apply_bdof_min_block, BIT_DEPTH, _neon)(pixel* dst,
> +        const ptrdiff_t dst_stride, const int16_t *src0, const int16_t *src1,
> +        const int16_t **gh, const int16_t **gv, const int vx, const int vy);
> +
> +static void FUNC(apply_bdof)(uint8_t *_dst, const ptrdiff_t _dst_stride,
> +        const int16_t *_src0, const int16_t *_src1,
> +        const int block_w, const int block_h)
> +{
> +    int16_t gradient_h[2][BDOF_BLOCK_SIZE * BDOF_BLOCK_SIZE];
> +    int16_t gradient_v[2][BDOF_BLOCK_SIZE * BDOF_BLOCK_SIZE];
> +    int vx, vy;
> +    const ptrdiff_t dst_stride  = _dst_stride / sizeof(pixel);
> +    pixel* dst                  = (pixel*)_dst;
> +
> +    ff_vvc_prof_grad_filter_8x_neon(gradient_h[0], gradient_v[0], BDOF_BLOCK_SIZE,
> +                           _src0, MAX_PB_SIZE, block_w, block_h);
> +    ff_vvc_prof_grad_filter_8x_neon(gradient_h[1], gradient_v[1], BDOF_BLOCK_SIZE,
> +                           _src1, MAX_PB_SIZE, block_w, block_h);
> +
> +    for (int y = 0; y < block_h; y += BDOF_MIN_BLOCK_SIZE) {
> +        for (int x = 0; x < block_w; x += BDOF_MIN_BLOCK_SIZE) {
> +            const int16_t* src0 = _src0 + y * MAX_PB_SIZE + x;
> +            const int16_t* src1 = _src1 + y * MAX_PB_SIZE + x;
> +            pixel *d            = dst + x;
> +            const int idx       = BDOF_BLOCK_SIZE * y  + x;
> +            const int16_t* gh[] = { gradient_h[0] + idx, gradient_h[1] + idx };
> +            const int16_t* gv[] = { gradient_v[0] + idx, gradient_v[1] + idx };
> +            const int pad_mask = !x | ((!y) << 1) |
> +                        ((x + BDOF_MIN_BLOCK_SIZE == block_w) << 2) |
> +                        ((y + BDOF_MIN_BLOCK_SIZE == block_h) << 3);
> +            ff_vvc_derive_bdof_vx_vy_neon(src0, src1, pad_mask, gh, gv, &vx, &vy);
> +            FUNC2(ff_vvc_apply_bdof_min_block, BIT_DEPTH, _neon)(d, dst_stride, src0, src1, gh, gv, vx, vy);
> +        }
> +        dst += BDOF_MIN_BLOCK_SIZE * dst_stride;
> +    }
> +}
> -- 
> 2.42.0
> 
Nuo Mi Sept. 24, 2024, 2:38 p.m. UTC | #2
On Sun, Sep 22, 2024 at 1:42 AM Zhao Zhili <quinkblack@foxmail.com> wrote:

> From: Zhao Zhili <zhilizhao@tencent.com>
>
> apply_bdof_8_8x16_c:                                    18.7 ( 1.00x)
> apply_bdof_8_8x16_neon:                                  9.7 ( 1.93x)
> apply_bdof_8_16x8_c:                                    20.0 ( 1.00x)
> apply_bdof_8_16x8_neon:                                  9.5 ( 2.11x)
> apply_bdof_8_16x16_c:                                   36.7 ( 1.00x)
> apply_bdof_8_16x16_neon:                                19.0 ( 1.94x)
> apply_bdof_10_8x16_c:                                   18.0 ( 1.00x)
> apply_bdof_10_8x16_neon:                                10.0 ( 1.80x)
> apply_bdof_10_16x8_c:                                   18.0 ( 1.00x)
> apply_bdof_10_16x8_neon:                                 9.5 ( 1.90x)
> apply_bdof_10_16x16_c:                                  35.5 ( 1.00x)
> apply_bdof_10_16x16_neon:                               19.0 ( 1.87x)
> apply_bdof_12_8x16_c:                                   17.5 ( 1.00x)
> apply_bdof_12_8x16_neon:                                 9.7 ( 1.80x)
> apply_bdof_12_16x8_c:                                   18.2 ( 1.00x)
> apply_bdof_12_16x8_neon:                                 9.5 ( 1.92x)
> apply_bdof_12_16x16_c:                                  34.5 ( 1.00x)
> apply_bdof_12_16x16_neon:                               18.7 ( 1.84x)
>
Hi Zhili,
Thank you for the patch.
AVX2 can achieve a 10-20x performance increase for a width of 16, as
demonstrated in this commit:
<https://github.com/FFmpeg/FFmpeg/commit/7175544c0bab30c12c24a2c440bff40a28ea83d3>
Considering that NEON operates on 128-bit vectors, a more reasonable
expectation would be a 5-10x speedup.
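
As an illustration of that suggestion, here is a minimal AArch64 NEON intrinsics
sketch, not taken from the patch and with made-up names, of the derive-step sums
accumulated 8 samples per iteration instead of one at a time. It assumes all
intermediates fit in int16_t (true for VVC intermediate bit depths) and leaves
out the window and edge padding that the real code has to handle.

#include <arm_neon.h>
#include <stdint.h>

/* Sketch only: accumulate the BDOF derive-step sums for 8 samples per call.
 * Names are illustrative and do not match FFmpeg's code. */
static inline void bdof_accumulate_row8(const int16_t *src0, const int16_t *src1,
                                        const int16_t *gh0, const int16_t *gh1,
                                        const int16_t *gv0, const int16_t *gv1,
                                        int *sgx2, int *sgy2, int *sgxgy,
                                        int *sgxdi, int *sgydi)
{
    const int16x8_t zero = vdupq_n_s16(0);

    /* diff = (src0 >> 4) - (src1 >> 4) */
    int16x8_t diff  = vsubq_s16(vshrq_n_s16(vld1q_s16(src0), 4),
                                vshrq_n_s16(vld1q_s16(src1), 4));
    /* temph = (gh0 + gh1) >> 1,  tempv = (gv0 + gv1) >> 1 */
    int16x8_t temph = vshrq_n_s16(vaddq_s16(vld1q_s16(gh0), vld1q_s16(gh1)), 1);
    int16x8_t tempv = vshrq_n_s16(vaddq_s16(vld1q_s16(gv0), vld1q_s16(gv1)), 1);

    /* per-lane sign(): -1, 0 or 1 */
    int16x8_t sgn_h = vsubq_s16(vreinterpretq_s16_u16(vcltq_s16(temph, zero)),
                                vreinterpretq_s16_u16(vcgtq_s16(temph, zero)));
    int16x8_t sgn_v = vsubq_s16(vreinterpretq_s16_u16(vcltq_s16(tempv, zero)),
                                vreinterpretq_s16_u16(vcgtq_s16(tempv, zero)));

    *sgx2  += vaddlvq_s16(vabsq_s16(temph));         // sum |temph|
    *sgy2  += vaddlvq_s16(vabsq_s16(tempv));         // sum |tempv|
    *sgxgy += vaddlvq_s16(vmulq_s16(sgn_v, temph));  // sum  sign(tempv) * temph
    *sgxdi -= vaddlvq_s16(vmulq_s16(sgn_h, diff));   // sum -sign(temph) * diff
    *sgydi -= vaddlvq_s16(vmulq_s16(sgn_v, diff));   // sum -sign(tempv) * diff
}

Whether this maps cleanly onto the 6x6 windows used per 4x4 min block is a
separate question; the sketch only shows the shape of a wider per-iteration
accumulation.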

> +.unreq sgydi
> +.unreq y
> +endfunc
> +
> diff --git a/libavcodec/aarch64/vvc/of_template.c
> b/libavcodec/aarch64/vvc/of_template.c
> new file mode 100644
> index 0000000000..508ea6d99d
> --- /dev/null
> +++ b/libavcodec/aarch64/vvc/of_template.c
> @@ -0,0 +1,70 @@
> +/*
> + * Copyright (c) 2024 Zhao Zhili <quinkblack@foxmail.com>
> + *
> + * This file is part of FFmpeg.
> + *
> + * FFmpeg is free software; you can redistribute it and/or
> + * modify it under the terms of the GNU Lesser General Public
> + * License as published by the Free Software Foundation; either
> + * version 2.1 of the License, or (at your option) any later version.
> + *
> + * FFmpeg is distributed in the hope that it will be useful,
> + * but WITHOUT ANY WARRANTY; without even the implied warranty of
> + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> + * Lesser General Public License for more details.
> + *
> + * You should have received a copy of the GNU Lesser General Public
> + * License along with FFmpeg; if not, write to the Free Software
> + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
> 02110-1301 USA
> + */
> +
> +#include "libavcodec/bit_depth_template.c"
> +
> +void ff_vvc_prof_grad_filter_8x_neon(int16_t *gradient_h,
> +                                     int16_t *gradient_v,
> +                                     const ptrdiff_t gradient_stride,
> +                                     const int16_t *_src,
> +                                     const ptrdiff_t src_stride,
> +                                     const int width, const int height);
> +
> +void ff_vvc_derive_bdof_vx_vy_neon(
> +        const int16_t *_src0, const int16_t *_src1, int pad_mask,
> +        const int16_t **gradient_h, const int16_t **gradient_v,
> +        int *vx, int *vy);
> +
> +void FUNC2(ff_vvc_apply_bdof_min_block, BIT_DEPTH, _neon)(pixel* dst,
> +        const ptrdiff_t dst_stride, const int16_t *src0, const int16_t
> *src1,
> +        const int16_t **gh, const int16_t **gv, const int vx, const int
> vy);
> +
> +static void FUNC(apply_bdof)(uint8_t *_dst, const ptrdiff_t _dst_stride,
> +        const int16_t *_src0, const int16_t *_src1,
> +        const int block_w, const int block_h)
> +{
> +    int16_t gradient_h[2][BDOF_BLOCK_SIZE * BDOF_BLOCK_SIZE];
> +    int16_t gradient_v[2][BDOF_BLOCK_SIZE * BDOF_BLOCK_SIZE];
> +    int vx, vy;
> +    const ptrdiff_t dst_stride  = _dst_stride / sizeof(pixel);
> +    pixel* dst                  = (pixel*)_dst;
> +
> +    ff_vvc_prof_grad_filter_8x_neon(gradient_h[0], gradient_v[0],
> BDOF_BLOCK_SIZE,
> +                           _src0, MAX_PB_SIZE, block_w, block_h);
> +    ff_vvc_prof_grad_filter_8x_neon(gradient_h[1], gradient_v[1],
> BDOF_BLOCK_SIZE,
> +                           _src1, MAX_PB_SIZE, block_w, block_h);
> +
> +    for (int y = 0; y < block_h; y += BDOF_MIN_BLOCK_SIZE) {
> +        for (int x = 0; x < block_w; x += BDOF_MIN_BLOCK_SIZE) {
> +            const int16_t* src0 = _src0 + y * MAX_PB_SIZE + x;
> +            const int16_t* src1 = _src1 + y * MAX_PB_SIZE + x;
> +            pixel *d            = dst + x;
> +            const int idx       = BDOF_BLOCK_SIZE * y  + x;
> +            const int16_t* gh[] = { gradient_h[0] + idx, gradient_h[1] +
> idx };
> +            const int16_t* gv[] = { gradient_v[0] + idx, gradient_v[1] +
> idx };
> +            const int pad_mask = !x | ((!y) << 1) |
> +                        ((x + BDOF_MIN_BLOCK_SIZE == block_w) << 2) |
> +                        ((y + BDOF_MIN_BLOCK_SIZE == block_h) << 3);
> +            ff_vvc_derive_bdof_vx_vy_neon(src0, src1, pad_mask, gh, gv,
> &vx, &vy);
> +            FUNC2(ff_vvc_apply_bdof_min_block, BIT_DEPTH, _neon)(d,
> dst_stride, src0, src1, gh, gv, vx, vy);
> +        }
> +        dst += BDOF_MIN_BLOCK_SIZE * dst_stride;
> +    }
> +}
> --
> 2.42.0
>
diff mbox series

Patch

diff --git a/libavcodec/aarch64/vvc/dsp_init.c b/libavcodec/aarch64/vvc/dsp_init.c
index b39ebb83fc..03a4c62310 100644
--- a/libavcodec/aarch64/vvc/dsp_init.c
+++ b/libavcodec/aarch64/vvc/dsp_init.c
@@ -27,16 +27,22 @@ 
 #include "libavcodec/vvc/dec.h"
 #include "libavcodec/vvc/ctu.h"
 
+#define BDOF_BLOCK_SIZE         16
+#define BDOF_MIN_BLOCK_SIZE     4
+
 #define BIT_DEPTH 8
 #include "alf_template.c"
+#include "of_template.c"
 #undef BIT_DEPTH
 
 #define BIT_DEPTH 10
 #include "alf_template.c"
+#include "of_template.c"
 #undef BIT_DEPTH
 
 #define BIT_DEPTH 12
 #include "alf_template.c"
+#include "of_template.c"
 #undef BIT_DEPTH
 
 int ff_vvc_sad_neon(const int16_t *src0, const int16_t *src1, int dx, int dy,
@@ -155,6 +161,7 @@  void ff_vvc_dsp_init_aarch64(VVCDSPContext *const c, const int bd)
 
         c->inter.avg = ff_vvc_avg_8_neon;
         c->inter.w_avg = vvc_w_avg_8;
+        c->inter.apply_bdof = apply_bdof_8;
 
         for (int i = 0; i < FF_ARRAY_ELEMS(c->sao.band_filter); i++)
             c->sao.band_filter[i] = ff_h26x_sao_band_filter_8x8_8_neon;
@@ -196,12 +203,14 @@  void ff_vvc_dsp_init_aarch64(VVCDSPContext *const c, const int bd)
     } else if (bd == 10) {
         c->inter.avg = ff_vvc_avg_10_neon;
         c->inter.w_avg = vvc_w_avg_10;
+        c->inter.apply_bdof = apply_bdof_10;
 
         c->alf.filter[LUMA] = alf_filter_luma_10_neon;
         c->alf.filter[CHROMA] = alf_filter_chroma_10_neon;
     } else if (bd == 12) {
         c->inter.avg = ff_vvc_avg_12_neon;
         c->inter.w_avg = vvc_w_avg_12;
+        c->inter.apply_bdof = apply_bdof_12;
 
         c->alf.filter[LUMA] = alf_filter_luma_12_neon;
         c->alf.filter[CHROMA] = alf_filter_chroma_12_neon;
diff --git a/libavcodec/aarch64/vvc/inter.S b/libavcodec/aarch64/vvc/inter.S
index 49e1050aee..8cfacef44f 100644
--- a/libavcodec/aarch64/vvc/inter.S
+++ b/libavcodec/aarch64/vvc/inter.S
@@ -21,6 +21,8 @@ 
 #include "libavutil/aarch64/asm.S"
 
 #define VVC_MAX_PB_SIZE 128
+#define BDOF_BLOCK_SIZE 16
+#define BDOF_MIN_BLOCK_SIZE 4
 
 .macro vvc_avg type, bit_depth
 
@@ -211,6 +213,13 @@  function ff_vvc_\type\()_\bit_depth\()_neon, export=1
 32:
         ret
 endfunc
+
+.unreq dst
+.unreq dst_stride
+.unreq src0
+.unreq src1
+.unreq width
+.unreq height
 .endm
 
 vvc_avg avg, 8
@@ -219,3 +228,345 @@  vvc_avg avg, 12
 vvc_avg w_avg, 8
 vvc_avg w_avg, 10
 vvc_avg w_avg, 12
+
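+// Gradient filter (PROF/BDOF):
+//   gh[x] = (src[x + 1] >> 6) - (src[x - 1] >> 6)
+//   gv[x] = (src[x + stride] >> 6) - (src[x - stride] >> 6)
+// Eight samples are produced per iteration (width is assumed to be a multiple of 8).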
+function ff_vvc_prof_grad_filter_8x_neon, export=1
+        gh              .req x0
+        gv              .req x1
+        gstride         .req x2
+        src             .req x3
+        src_stride      .req x4
+        width           .req w5
+        height          .req w6
+
+        lsl             src_stride, src_stride, #1
+        neg             x7, src_stride
+1:
+        mov             x10, src
+        mov             w11, width
+        mov             x12, gh
+        mov             x13, gv
+2:
+        ldur            q0, [x10, #2]
+        ldur            q1, [x10, #-2]
+        subs            w11, w11, #8
+        ldr             q2, [x10, src_stride]
+        ldr             q3, [x10, x7]
+        sshr            v0.8h, v0.8h, #6
+        sshr            v1.8h, v1.8h, #6
+        sshr            v2.8h, v2.8h, #6
+        sshr            v3.8h, v3.8h, #6
+        sub             v0.8h, v0.8h, v1.8h
+        sub             v2.8h, v2.8h, v3.8h
+        st1             {v0.8h}, [x12], #16
+        st1             {v2.8h}, [x13], #16
+        add             x10, x10, #16
+        b.ne            2b
+
+        subs            height, height, #1
+        add             gh, gh, gstride, lsl #1
+        add             gv, gv, gstride, lsl #1
+        add             src, src, src_stride
+        b.ne            1b
+        ret
+
+.unreq gh
+.unreq gv
+.unreq gstride
+.unreq src
+.unreq src_stride
+.unreq width
+.unreq height
+
+endfunc
+
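+// Apply the BDOF offset to one 4x4 block; for each pixel:
+//   dst[x] = av_clip_pixel((src0[x] + src1[x] + (1 << (14 - bit_depth)) +
+//            vx * (gh0[x] - gh1[x]) + vy * (gv0[x] - gv1[x])) >> (15 - bit_depth))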
+.macro vvc_apply_bdof_min_block bit_depth
+        dst             .req x0
+        dst_stride      .req x1
+        src0            .req x2
+        src1            .req x3
+        gh              .req x4
+        gv              .req x5
+        vx              .req w6
+        vy              .req w7
+
+        dup             v0.4h, vx
+        dup             v1.4h, vy
+        movi            v7.4s, #(1 << (14 - \bit_depth))
+        ldp             x8, x9, [gh]
+        ldp             x10, x11, [gv]
+        mov             x12, #(BDOF_BLOCK_SIZE * 2)
+        mov             w13, #(BDOF_MIN_BLOCK_SIZE)
+        mov             x14, #(VVC_MAX_PB_SIZE * 2)
+.if \bit_depth >= 10
+        // clip pixel
+        mov             w15, #((1 << \bit_depth) - 1)
+        movi            v18.8h, #0
+        lsl             dst_stride, dst_stride, #1
+        dup             v17.8h, w15
+.endif
+1:
+        ld1             {v2.4h}, [x8], x12
+        ld1             {v3.4h}, [x9], x12
+        ld1             {v4.4h}, [x10], x12
+        ld1             {v5.4h}, [x11], x12
+        sub             v2.4h, v2.4h, v3.4h
+        sub             v4.4h, v4.4h, v5.4h
+        smull           v2.4s, v0.4h, v2.4h
+        smlal           v2.4s, v1.4h, v4.4h
+
+        ld1             {v5.4h}, [src0], x14
+        ld1             {v6.4h}, [src1], x14
+        saddl           v5.4s, v5.4h, v6.4h
+        add             v5.4s, v5.4s, v7.4s
+        add             v5.4s, v5.4s, v2.4s
+        sqshrn          v5.4h, v5.4s, #(15 - \bit_depth)
+        subs            w13, w13, #1
+.if \bit_depth == 8
+        sqxtun          v5.8b, v5.8h
+        str             s5, [dst]
+        add             dst, dst, dst_stride
+.else
+        smin            v5.4h, v5.4h, v17.4h
+        smax            v5.4h, v5.4h, v18.4h
+        st1             {v5.4h}, [dst], dst_stride
+.endif
+        b.ne            1b
+        ret
+
+.unreq dst
+.unreq dst_stride
+.unreq src0
+.unreq src1
+.unreq gh
+.unreq gv
+.unreq vx
+.unreq vy
+.endm
+
+function ff_vvc_apply_bdof_min_block_8_neon, export=1
+        vvc_apply_bdof_min_block 8
+endfunc
+
+function ff_vvc_apply_bdof_min_block_10_neon, export=1
+        vvc_apply_bdof_min_block 10
+endfunc
+
+function ff_vvc_apply_bdof_min_block_12_neon, export=1
+        vvc_apply_bdof_min_block 12
+endfunc
+
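+// Accumulate the BDOF correlation terms for one border sample
+// (column x = -1 or x = BDOF_MIN_BLOCK_SIZE of the 6x6 window):
+//   diff  = (src0 >> 4) - (src1 >> 4)
+//   temph = (gh0 + gh1) >> 1, tempv = (gv0 + gv1) >> 1
+//   sgx2 += |temph|, sgy2 += |tempv|, sgxgy += VVC_SIGN(tempv) * temph,
+//   sgxdi -= VVC_SIGN(temph) * diff, sgydi -= VVC_SIGN(tempv) * diff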
+.macro derive_bdof_vx_vy_x_begin_end
+        ldrh            w19, [x14, x16, lsl #1]     // load from src0
+        ldrh            w20, [x15, x16, lsl #1]     // load from src1
+        sxth            w19, w19
+        sxth            w20, w20
+        asr             w19, w19, #4
+        asr             w20, w20, #4
+        sub             w19, w19, w20               // diff
+        add             x17, x16, x13, lsl #4       // idx
+        ldrh            w3, [gh0, x17, lsl #1]      // load from gh0
+        ldrh            w4, [gh1, x17, lsl #1]      // load from gh1
+        sxth            w3, w3
+        sxth            w4, w4
+        ldrh            w22, [gv0, x17, lsl #1]     // load from gv0
+        ldrh            w23, [gv1, x17, lsl #1]     // load from gv1
+        add             w3, w3, w4
+        asr             w21, w3, #1                 // temph
+        sxth            w3, w22
+        sxth            w4, w23
+        add             w3, w3, w4
+        cmp             w21, #0
+        asr             w22, w3, #1                 // tempv
+        cneg            w20, w21, mi
+        csetm           w23, ne
+        csinc           w23, w23, wzr, ge           // -VVC_SIGN(temph)
+        cmp             w22, #0
+        add             sgx2, sgx2, w20
+        cneg            w20, w22, mi
+        cset            w24, ne
+        csinv           w24, w24, wzr, ge           // VVC_SIGN(tempv)
+        add             sgy2, sgy2, w20
+        madd            sgxgy, w24, w21, sgxgy
+        madd            sgxdi, w23, w19, sgxdi
+        csetm           w24, ne
+        csinc           w24, w24, wzr, ge           // -VVC_SIGN(tempv)
+        madd            sgydi, w24, w19, sgydi
+.endm
+
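+// Derive the motion refinement (vx, vy) for one 4x4 BDOF block from a
+// 6x6 window of samples and gradients. pad_mask selects border replication:
+// bit 0 = left, bit 1 = top, bit 2 = right, bit 3 = bottom.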
+function ff_vvc_derive_bdof_vx_vy_neon, export=1
+        src0            .req x0
+        src1            .req x1
+        pad_mask        .req w2
+        gh              .req x3
+        gv              .req x4
+        gh0             .req x27
+        gh1             .req x28
+        gv0             .req x25
+        gv1             .req x26
+        vx              .req x5
+        vy              .req x6
+        sgx2            .req w7
+        sgy2            .req w8
+        sgxgy           .req w9
+        sgxdi           .req w10
+        sgydi           .req w11
+        y               .req x12
+
+        stp             x27, x28, [sp, #-80]!
+        stp             x25, x26, [sp, #16]
+        stp             x23, x24, [sp, #32]
+        stp             x21, x22, [sp, #48]
+        stp             x19, x20, [sp, #64]
+
+        ldp             gh0, gh1, [gh]
+        mov             sgx2, #0
+        mov             sgy2, #0
+        mov             sgxgy, #0
+        mov             sgxdi, #0
+        mov             sgydi, #0
+        ldp             gv0, gv1, [gv]
+
+        mov             y, #-1
+        mov             x13, #-1                    // dy
+        tst             pad_mask, #2
+        b.eq            1f
+        mov             x13, #0                     // dy: pad top
+1:
+        add             x14, src0, x13, lsl #8      // local src0
+        add             x15, src1, x13, lsl #8      // local src1
+
+        // x = -1
+        mov             x16, #-1                    // dx
+        tst             pad_mask, #1
+        b.eq            2f
+        mov             x16, #0
+2:
+        derive_bdof_vx_vy_x_begin_end
+
+        // x = 0 to BDOF_MIN_BLOCK_SIZE - 1
+        ldr             d0, [x14]
+        ldr             d1, [x15]
+        lsl             x19, x13, #5
+        ldr             d2, [gh0, x19]
+        ldr             d3, [gh1, x19]
+        sshr            v0.4h, v0.4h, #4
+        sshr            v1.4h, v1.4h, #4
+        ssubl           v0.4s, v0.4h, v1.4h         // diff
+        ldr             d4, [gv0, x19]
+        ldr             d5, [gv1, x19]
+        saddl           v2.4s, v2.4h, v3.4h
+        saddl           v4.4s, v4.4h, v5.4h
+        sshr            v2.4s, v2.4s, #1            // temph
+        sshr            v4.4s, v4.4s, #1            // tempv
+        abs             v3.4s, v2.4s
+        abs             v5.4s, v4.4s
+        addv            s3, v3.4s
+        addv            s5, v5.4s
+        mov             w19, v3.s[0]
+        mov             w20, v5.s[0]
+        add             sgx2, sgx2, w19
+        add             sgy2, sgy2, w20
+
+        movi            v5.4s, #1
+        cmgt            v17.4s, v4.4s, #0           // mask > 0
+        cmlt            v18.4s, v4.4s, #0           // mask < 0
+        and             v17.16b, v17.16b, v5.16b
+        and             v18.16b, v18.16b, v5.16b
+        neg             v19.4s, v18.4s
+        add             v20.4s, v17.4s, v19.4s      // VVC_SIGN(tempv)
+        smull           v21.2d, v20.2s, v2.2s
+        smlal2          v21.2d, v20.4s, v2.4s
+        addp            d21, v21.2d
+        mov             w19, v21.s[0]
+        add             sgxgy, sgxgy, w19
+
+        smull           v16.2d, v20.2s, v0.2s
+        smlal2          v16.2d, v20.4s, v0.4s
+        addp            d16, v16.2d
+        mov             w19, v16.s[0]
+        sub             sgydi, sgydi, w19
+
+        cmgt            v17.4s, v2.4s, #0
+        cmlt            v18.4s, v2.4s, #0
+        and             v17.16b, v17.16b, v5.16b
+        and             v18.16b, v18.16b, v5.16b
+        neg             v21.4s, v17.4s
+        add             v16.4s, v21.4s, v18.4s      // -VVC_SIGN(temph)
+        smull           v20.2d, v16.2s, v0.2s
+        smlal2          v20.2d, v16.4s, v0.4s
+        addp            d20, v20.2d
+        mov             w19, v20.s[0]
+        add             sgxdi, sgxdi, w19
+
+        // x = BDOF_MIN_BLOCK_SIZE
+        mov             x16, #BDOF_MIN_BLOCK_SIZE   // dx
+        tst             pad_mask, #4
+        b.eq            3f
+        mov             x16, #(BDOF_MIN_BLOCK_SIZE - 1)
+3:
+        derive_bdof_vx_vy_x_begin_end
+
+        add             y, y, #1
+        cmp             y, #(BDOF_MIN_BLOCK_SIZE)
+        mov             x13, y
+        b.gt            4f
+        b.lt            1b
+        tst             pad_mask, #8
+        b.eq            1b
+        sub             x13, x13, #1                // pad bottom
+        b               1b
+4:
+        mov             w3, #31
+        mov             w14, #0
+        mov             w16, #-15
+        mov             w17, #15
+        cbz             sgx2, 5f
+        clz             w12, sgx2
+        lsl             sgxdi, sgxdi, #2
+        sub             w13, w3, w12                // log2(sgx2)
+        asr             sgxdi, sgxdi, w13
+        cmp             sgxdi, w16
+        csel            w14, w16, sgxdi, lt         // clip to -15
+        b.le            5f
+        cmp             sgxdi, w17
+        csel            w14, w17, sgxdi, gt         // clip to 15
+5:
+        str             w14, [vx]
+
+        mov             w15, #0
+        cbz             sgy2, 6f
+        lsl             sgydi, sgydi, #2
+        smull           x14, w14, sgxgy
+        asr             w14, w14, #1
+        sub             sgydi, sgydi, w14
+        clz             w12, sgy2
+        sub             w13, w3, w12                // log2(sgy2)
+        asr             sgydi, sgydi, w13
+        cmp             sgydi, w16
+        csel            w15, w16, sgydi, lt         // clip to -15
+        b.le            6f
+        cmp             sgydi, w17
+        csel            w15, w17, sgydi, gt         // clip to 15
+6:
+        str             w15, [vy]
+        ldp             x25, x26, [sp, #16]
+        ldp             x23, x24, [sp, #32]
+        ldp             x21, x22, [sp, #48]
+        ldp             x19, x20, [sp, #64]
+        ldp             x27, x28, [sp], #80
+        ret
+.unreq src0
+.unreq src1
+.unreq pad_mask
+.unreq gh
+.unreq gv
+.unreq vx
+.unreq vy
+.unreq sgx2
+.unreq sgy2
+.unreq sgxgy
+.unreq sgxdi
+.unreq sgydi
+.unreq y
+endfunc
+
diff --git a/libavcodec/aarch64/vvc/of_template.c b/libavcodec/aarch64/vvc/of_template.c
new file mode 100644
index 0000000000..508ea6d99d
--- /dev/null
+++ b/libavcodec/aarch64/vvc/of_template.c
@@ -0,0 +1,70 @@ 
+/*
+ * Copyright (c) 2024 Zhao Zhili <quinkblack@foxmail.com>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "libavcodec/bit_depth_template.c"
+
+void ff_vvc_prof_grad_filter_8x_neon(int16_t *gradient_h,
+                                     int16_t *gradient_v,
+                                     const ptrdiff_t gradient_stride,
+                                     const int16_t *_src,
+                                     const ptrdiff_t src_stride,
+                                     const int width, const int height);
+
+void ff_vvc_derive_bdof_vx_vy_neon(
+        const int16_t *_src0, const int16_t *_src1, int pad_mask,
+        const int16_t **gradient_h, const int16_t **gradient_v,
+        int *vx, int *vy);
+
+void FUNC2(ff_vvc_apply_bdof_min_block, BIT_DEPTH, _neon)(pixel* dst,
+        const ptrdiff_t dst_stride, const int16_t *src0, const int16_t *src1,
+        const int16_t **gh, const int16_t **gv, const int vx, const int vy);
+
+static void FUNC(apply_bdof)(uint8_t *_dst, const ptrdiff_t _dst_stride,
+        const int16_t *_src0, const int16_t *_src1,
+        const int block_w, const int block_h)
+{
+    int16_t gradient_h[2][BDOF_BLOCK_SIZE * BDOF_BLOCK_SIZE];
+    int16_t gradient_v[2][BDOF_BLOCK_SIZE * BDOF_BLOCK_SIZE];
+    int vx, vy;
+    const ptrdiff_t dst_stride  = _dst_stride / sizeof(pixel);
+    pixel* dst                  = (pixel*)_dst;
+
+    ff_vvc_prof_grad_filter_8x_neon(gradient_h[0], gradient_v[0], BDOF_BLOCK_SIZE,
+                           _src0, MAX_PB_SIZE, block_w, block_h);
+    ff_vvc_prof_grad_filter_8x_neon(gradient_h[1], gradient_v[1], BDOF_BLOCK_SIZE,
+                           _src1, MAX_PB_SIZE, block_w, block_h);
+
+    for (int y = 0; y < block_h; y += BDOF_MIN_BLOCK_SIZE) {
+        for (int x = 0; x < block_w; x += BDOF_MIN_BLOCK_SIZE) {
+            const int16_t* src0 = _src0 + y * MAX_PB_SIZE + x;
+            const int16_t* src1 = _src1 + y * MAX_PB_SIZE + x;
+            pixel *d            = dst + x;
+            const int idx       = BDOF_BLOCK_SIZE * y  + x;
+            const int16_t* gh[] = { gradient_h[0] + idx, gradient_h[1] + idx };
+            const int16_t* gv[] = { gradient_v[0] + idx, gradient_v[1] + idx };
+            const int pad_mask = !x | ((!y) << 1) |
+                        ((x + BDOF_MIN_BLOCK_SIZE == block_w) << 2) |
+                        ((y + BDOF_MIN_BLOCK_SIZE == block_h) << 3);
+            ff_vvc_derive_bdof_vx_vy_neon(src0, src1, pad_mask, gh, gv, &vx, &vy);
+            FUNC2(ff_vvc_apply_bdof_min_block, BIT_DEPTH, _neon)(d, dst_stride, src0, src1, gh, gv, vx, vy);
+        }
+        dst += BDOF_MIN_BLOCK_SIZE * dst_stride;
+    }
+}
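
For reference, below is an illustrative scalar sketch of what ff_vvc_derive_bdof_vx_vy_neon computes, reconstructed from the assembly above rather than taken from FFmpeg's C reference. It assumes src0/src1 point at the top-left sample of the 4x4 block inside a MAX_PB_SIZE-strided buffer and gh/gv point at the matching position inside the 16x16 gradient arrays; the helpers sign(), clip() and floor_log2() are local to the sketch. The per-pixel application step is the formula noted above vvc_apply_bdof_min_block.

#include <stdint.h>

#define BDOF_BLOCK_SIZE     16
#define BDOF_MIN_BLOCK_SIZE 4
#define MAX_PB_SIZE         128

static int sign(int v)  { return v > 0 ? 1 : (v < 0 ? -1 : 0); }
static int clip(int v, int lo, int hi) { return v < lo ? lo : (v > hi ? hi : v); }
static int floor_log2(unsigned v) { int n = 0; while (v >>= 1) n++; return n; }

/* Sketch of the per-4x4-block (vx, vy) derivation: accumulate the five
 * correlation sums over a 6x6 window, replicating rows/columns at the
 * borders indicated by pad_mask. */
static void derive_bdof_vx_vy(const int16_t *src0, const int16_t *src1,
                              int pad_mask, const int16_t *gh[2],
                              const int16_t *gv[2], int *vx, int *vy)
{
    int sgx2 = 0, sgy2 = 0, sgxgy = 0, sgxdi = 0, sgydi = 0;

    for (int y = -1; y <= BDOF_MIN_BLOCK_SIZE; y++) {
        int dy = y;
        if ((pad_mask & 2) && dy < 0)                    dy = 0;                       /* pad top    */
        if ((pad_mask & 8) && dy >= BDOF_MIN_BLOCK_SIZE) dy = BDOF_MIN_BLOCK_SIZE - 1; /* pad bottom */
        for (int x = -1; x <= BDOF_MIN_BLOCK_SIZE; x++) {
            int dx = x;
            if ((pad_mask & 1) && dx < 0)                    dx = 0;                       /* pad left  */
            if ((pad_mask & 4) && dx >= BDOF_MIN_BLOCK_SIZE) dx = BDOF_MIN_BLOCK_SIZE - 1; /* pad right */

            int diff  = (src0[dy * MAX_PB_SIZE + dx] >> 4) -
                        (src1[dy * MAX_PB_SIZE + dx] >> 4);
            int idx   = dy * BDOF_BLOCK_SIZE + dx;
            int temph = (gh[0][idx] + gh[1][idx]) >> 1;
            int tempv = (gv[0][idx] + gv[1][idx]) >> 1;

            sgx2  += temph < 0 ? -temph : temph;
            sgy2  += tempv < 0 ? -tempv : tempv;
            sgxgy += sign(tempv) * temph;
            sgxdi -= sign(temph) * diff;
            sgydi -= sign(tempv) * diff;
        }
    }

    *vx = sgx2 ? clip((sgxdi * 4) >> floor_log2(sgx2), -15, 15) : 0;
    *vy = sgy2 ? clip(((sgydi * 4) - ((*vx * sgxgy) >> 1)) >> floor_log2(sgy2),
                      -15, 15) : 0;
}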