diff mbox series

[FFmpeg-devel,v2,1/3] aarch64/vvc: Add w_avg

Message ID tencent_3AB79468AA02A96AEA7C5EAD8E39F9C0F90A@qq.com
State New
Headers show
Series [FFmpeg-devel,v2,1/3] aarch64/vvc: Add w_avg | expand

Checks

Context Check Description
yinshiyou/make_loongarch64 success Make finished
yinshiyou/make_fate_loongarch64 success Make fate finished
andriy/make_x86 success Make finished
andriy/make_fate_x86 success Make fate finished

Commit Message

Zhao Zhili Sept. 23, 2024, 9:05 a.m. UTC
From: Zhao Zhili <zhilizhao@tencent.com>

w_avg_8_2x2_c:                                           0.0 ( 0.00x)
w_avg_8_2x2_neon:                                        0.0 ( 0.00x)
w_avg_8_4x4_c:                                           0.2 ( 1.00x)
w_avg_8_4x4_neon:                                        0.0 ( 0.00x)
w_avg_8_8x8_c:                                           1.2 ( 1.00x)
w_avg_8_8x8_neon:                                        0.2 ( 5.00x)
w_avg_8_16x16_c:                                         4.2 ( 1.00x)
w_avg_8_16x16_neon:                                      0.8 ( 5.67x)
w_avg_8_32x32_c:                                        16.2 ( 1.00x)
w_avg_8_32x32_neon:                                      2.5 ( 6.50x)
w_avg_8_64x64_c:                                        64.5 ( 1.00x)
w_avg_8_64x64_neon:                                      9.0 ( 7.17x)
w_avg_8_128x128_c:                                     269.5 ( 1.00x)
w_avg_8_128x128_neon:                                   35.5 ( 7.59x)
w_avg_10_2x2_c:                                          0.2 ( 1.00x)
w_avg_10_2x2_neon:                                       0.2 ( 1.00x)
w_avg_10_4x4_c:                                          0.2 ( 1.00x)
w_avg_10_4x4_neon:                                       0.2 ( 1.00x)
w_avg_10_8x8_c:                                          1.0 ( 1.00x)
w_avg_10_8x8_neon:                                       0.2 ( 4.00x)
w_avg_10_16x16_c:                                        4.2 ( 1.00x)
w_avg_10_16x16_neon:                                     0.8 ( 5.67x)
w_avg_10_32x32_c:                                       16.2 ( 1.00x)
w_avg_10_32x32_neon:                                     2.5 ( 6.50x)
w_avg_10_64x64_c:                                       66.2 ( 1.00x)
w_avg_10_64x64_neon:                                    10.0 ( 6.62x)
w_avg_10_128x128_c:                                    277.8 ( 1.00x)
w_avg_10_128x128_neon:                                  39.8 ( 6.99x)
w_avg_12_2x2_c:                                          0.0 ( 0.00x)
w_avg_12_2x2_neon:                                       0.2 ( 0.00x)
w_avg_12_4x4_c:                                          0.2 ( 1.00x)
w_avg_12_4x4_neon:                                       0.0 ( 0.00x)
w_avg_12_8x8_c:                                          1.2 ( 1.00x)
w_avg_12_8x8_neon:                                       0.5 ( 2.50x)
w_avg_12_16x16_c:                                        4.8 ( 1.00x)
w_avg_12_16x16_neon:                                     0.8 ( 6.33x)
w_avg_12_32x32_c:                                       17.0 ( 1.00x)
w_avg_12_32x32_neon:                                     2.8 ( 6.18x)
w_avg_12_64x64_c:                                       64.0 ( 1.00x)
w_avg_12_64x64_neon:                                    10.0 ( 6.40x)
w_avg_12_128x128_c:                                    269.2 ( 1.00x)
w_avg_12_128x128_neon:                                  42.0 ( 6.41x)
---
 libavcodec/aarch64/vvc/dsp_init.c | 34 +++++++++++
 libavcodec/aarch64/vvc/inter.S    | 99 +++++++++++++++++++++++++------
 2 files changed, 116 insertions(+), 17 deletions(-)

Comments

Martin Storsjö Sept. 26, 2024, 11:25 a.m. UTC | #1
On Mon, 23 Sep 2024, Zhao Zhili wrote:

> From: Zhao Zhili <zhilizhao@tencent.com>
>
> w_avg_8_2x2_c:                                           0.0 ( 0.00x)
> w_avg_8_2x2_neon:                                        0.0 ( 0.00x)
> w_avg_8_4x4_c:                                           0.2 ( 1.00x)
> w_avg_8_4x4_neon:                                        0.0 ( 0.00x)
> w_avg_8_8x8_c:                                           1.2 ( 1.00x)
> w_avg_8_8x8_neon:                                        0.2 ( 5.00x)
> w_avg_8_16x16_c:                                         4.2 ( 1.00x)
> w_avg_8_16x16_neon:                                      0.8 ( 5.67x)
> w_avg_8_32x32_c:                                        16.2 ( 1.00x)
> w_avg_8_32x32_neon:                                      2.5 ( 6.50x)
> w_avg_8_64x64_c:                                        64.5 ( 1.00x)
> w_avg_8_64x64_neon:                                      9.0 ( 7.17x)
> w_avg_8_128x128_c:                                     269.5 ( 1.00x)
> w_avg_8_128x128_neon:                                   35.5 ( 7.59x)
> w_avg_10_2x2_c:                                          0.2 ( 1.00x)
> w_avg_10_2x2_neon:                                       0.2 ( 1.00x)
> w_avg_10_4x4_c:                                          0.2 ( 1.00x)
> w_avg_10_4x4_neon:                                       0.2 ( 1.00x)
> w_avg_10_8x8_c:                                          1.0 ( 1.00x)
> w_avg_10_8x8_neon:                                       0.2 ( 4.00x)
> w_avg_10_16x16_c:                                        4.2 ( 1.00x)
> w_avg_10_16x16_neon:                                     0.8 ( 5.67x)
> w_avg_10_32x32_c:                                       16.2 ( 1.00x)
> w_avg_10_32x32_neon:                                     2.5 ( 6.50x)
> w_avg_10_64x64_c:                                       66.2 ( 1.00x)
> w_avg_10_64x64_neon:                                    10.0 ( 6.62x)
> w_avg_10_128x128_c:                                    277.8 ( 1.00x)
> w_avg_10_128x128_neon:                                  39.8 ( 6.99x)
> w_avg_12_2x2_c:                                          0.0 ( 0.00x)
> w_avg_12_2x2_neon:                                       0.2 ( 0.00x)
> w_avg_12_4x4_c:                                          0.2 ( 1.00x)
> w_avg_12_4x4_neon:                                       0.0 ( 0.00x)
> w_avg_12_8x8_c:                                          1.2 ( 1.00x)
> w_avg_12_8x8_neon:                                       0.5 ( 2.50x)
> w_avg_12_16x16_c:                                        4.8 ( 1.00x)
> w_avg_12_16x16_neon:                                     0.8 ( 6.33x)
> w_avg_12_32x32_c:                                       17.0 ( 1.00x)
> w_avg_12_32x32_neon:                                     2.8 ( 6.18x)
> w_avg_12_64x64_c:                                       64.0 ( 1.00x)
> w_avg_12_64x64_neon:                                    10.0 ( 6.40x)
> w_avg_12_128x128_c:                                    269.2 ( 1.00x)
> w_avg_12_128x128_neon:                                  42.0 ( 6.41x)
> ---
> libavcodec/aarch64/vvc/dsp_init.c | 34 +++++++++++
> libavcodec/aarch64/vvc/inter.S    | 99 +++++++++++++++++++++++++------
> 2 files changed, 116 insertions(+), 17 deletions(-)
>
> diff --git a/libavcodec/aarch64/vvc/dsp_init.c b/libavcodec/aarch64/vvc/dsp_init.c
> index ad767d17e2..b39ebb83fc 100644
> --- a/libavcodec/aarch64/vvc/dsp_init.c
> +++ b/libavcodec/aarch64/vvc/dsp_init.c
> @@ -52,6 +52,37 @@ void ff_vvc_avg_12_neon(uint8_t *dst, ptrdiff_t dst_stride,
>                         const int16_t *src0, const int16_t *src1, int width,
>                         int height);
>
> +void ff_vvc_w_avg_8_neon(uint8_t *_dst, const ptrdiff_t _dst_stride,
> +                         const int16_t *src0, const int16_t *src1,
> +                         const int width, const int height,
> +                         uintptr_t w0_w1, uintptr_t offset_shift);

Including "const" on scalar parameters is entirely redundant; we don't 
prescribe its use elsewhere in ffmpeg, and it just makes the whole 
declaration noisier.

> +void ff_vvc_w_avg_10_neon(uint8_t *_dst, const ptrdiff_t _dst_stride,
> +                         const int16_t *src0, const int16_t *src1,
> +                         const int width, const int height,
> +                         uintptr_t w0_w1, uintptr_t offset_shift);
> +void ff_vvc_w_avg_12_neon(uint8_t *_dst, const ptrdiff_t _dst_stride,
> +                          const int16_t *src0, const int16_t *src1,
> +                          const int width, const int height,
> +                          uintptr_t w0_w1, uintptr_t offset_shift);
> +/* When passing arguments to functions, Apple platforms diverge from the ARM64
> + * standard ABI, that we can't implement the function directly in asm.
> + */

It's fully possible to implement that in assembly, but it usually requires 
ugly ifdefs.

That said, I'm ok with this kind of wrapper, as it avoids the problem 
kinda neatly, but ifdefs in the assembly can also be needed at times.

> +#define W_AVG_FUN(bit_depth) \
> +static void vvc_w_avg_ ## bit_depth(uint8_t *dst, const ptrdiff_t dst_stride, \
> +    const int16_t *src0, const int16_t *src1, const int width, const int height, \
> +    const int denom, const int w0, const int w1, const int o0, const int o1) \
> +{ \
> +    const int shift = denom + FFMAX(3, 15 - bit_depth); \
> +    const int offset = ((o0 + o1) * (1 << (bit_depth - 8)) + 1) * (1 << (shift - 1)); \

Same about the superfluous "const" everywhere. For local variables, I 
guess it can be argued that marking them as const can aid readability in 
some way, but I don't think we generally prescribe doing that.

The rest of the patch seems fine, thanks!

// Martin
Zhao Zhili Sept. 26, 2024, noon UTC | #2
> On Sep 26, 2024, at 19:25, Martin Storsjö <martin@martin.st> wrote:
> 
> On Mon, 23 Sep 2024, Zhao Zhili wrote:
> 
>> From: Zhao Zhili <zhilizhao@tencent.com>
>> 
>> w_avg_8_2x2_c:                                           0.0 ( 0.00x)
>> w_avg_8_2x2_neon:                                        0.0 ( 0.00x)
>> w_avg_8_4x4_c:                                           0.2 ( 1.00x)
>> w_avg_8_4x4_neon:                                        0.0 ( 0.00x)
>> w_avg_8_8x8_c:                                           1.2 ( 1.00x)
>> w_avg_8_8x8_neon:                                        0.2 ( 5.00x)
>> w_avg_8_16x16_c:                                         4.2 ( 1.00x)
>> w_avg_8_16x16_neon:                                      0.8 ( 5.67x)
>> w_avg_8_32x32_c:                                        16.2 ( 1.00x)
>> w_avg_8_32x32_neon:                                      2.5 ( 6.50x)
>> w_avg_8_64x64_c:                                        64.5 ( 1.00x)
>> w_avg_8_64x64_neon:                                      9.0 ( 7.17x)
>> w_avg_8_128x128_c:                                     269.5 ( 1.00x)
>> w_avg_8_128x128_neon:                                   35.5 ( 7.59x)
>> w_avg_10_2x2_c:                                          0.2 ( 1.00x)
>> w_avg_10_2x2_neon:                                       0.2 ( 1.00x)
>> w_avg_10_4x4_c:                                          0.2 ( 1.00x)
>> w_avg_10_4x4_neon:                                       0.2 ( 1.00x)
>> w_avg_10_8x8_c:                                          1.0 ( 1.00x)
>> w_avg_10_8x8_neon:                                       0.2 ( 4.00x)
>> w_avg_10_16x16_c:                                        4.2 ( 1.00x)
>> w_avg_10_16x16_neon:                                     0.8 ( 5.67x)
>> w_avg_10_32x32_c:                                       16.2 ( 1.00x)
>> w_avg_10_32x32_neon:                                     2.5 ( 6.50x)
>> w_avg_10_64x64_c:                                       66.2 ( 1.00x)
>> w_avg_10_64x64_neon:                                    10.0 ( 6.62x)
>> w_avg_10_128x128_c:                                    277.8 ( 1.00x)
>> w_avg_10_128x128_neon:                                  39.8 ( 6.99x)
>> w_avg_12_2x2_c:                                          0.0 ( 0.00x)
>> w_avg_12_2x2_neon:                                       0.2 ( 0.00x)
>> w_avg_12_4x4_c:                                          0.2 ( 1.00x)
>> w_avg_12_4x4_neon:                                       0.0 ( 0.00x)
>> w_avg_12_8x8_c:                                          1.2 ( 1.00x)
>> w_avg_12_8x8_neon:                                       0.5 ( 2.50x)
>> w_avg_12_16x16_c:                                        4.8 ( 1.00x)
>> w_avg_12_16x16_neon:                                     0.8 ( 6.33x)
>> w_avg_12_32x32_c:                                       17.0 ( 1.00x)
>> w_avg_12_32x32_neon:                                     2.8 ( 6.18x)
>> w_avg_12_64x64_c:                                       64.0 ( 1.00x)
>> w_avg_12_64x64_neon:                                    10.0 ( 6.40x)
>> w_avg_12_128x128_c:                                    269.2 ( 1.00x)
>> w_avg_12_128x128_neon:                                  42.0 ( 6.41x)
>> ---
>> libavcodec/aarch64/vvc/dsp_init.c | 34 +++++++++++
>> libavcodec/aarch64/vvc/inter.S    | 99 +++++++++++++++++++++++++------
>> 2 files changed, 116 insertions(+), 17 deletions(-)
>> 
>> diff --git a/libavcodec/aarch64/vvc/dsp_init.c b/libavcodec/aarch64/vvc/dsp_init.c
>> index ad767d17e2..b39ebb83fc 100644
>> --- a/libavcodec/aarch64/vvc/dsp_init.c
>> +++ b/libavcodec/aarch64/vvc/dsp_init.c
>> @@ -52,6 +52,37 @@ void ff_vvc_avg_12_neon(uint8_t *dst, ptrdiff_t dst_stride,
>>                        const int16_t *src0, const int16_t *src1, int width,
>>                        int height);
>> 
>> +void ff_vvc_w_avg_8_neon(uint8_t *_dst, const ptrdiff_t _dst_stride,
>> +                         const int16_t *src0, const int16_t *src1,
>> +                         const int width, const int height,
>> +                         uintptr_t w0_w1, uintptr_t offset_shift);
> 
> Including "const" on scalar parameters is entirely redundant, and we don't prescribe use of that elsewhere in ffmpeg, and just makes the whole declaration more noisy.

I see that these “const” qualifiers make clang-tidy unhappy. They are here to stay consistent with the prototypes
in vvc/dsp.h. There are three options:

1. Keep “const” as current state
2. Drop “const” only for these new functions
3. Remove “const” from vvc/dsp.h and all implementations

I can’t decide which way to go.

> 
>> +void ff_vvc_w_avg_10_neon(uint8_t *_dst, const ptrdiff_t _dst_stride,
>> +                         const int16_t *src0, const int16_t *src1,
>> +                         const int width, const int height,
>> +                         uintptr_t w0_w1, uintptr_t offset_shift);
>> +void ff_vvc_w_avg_12_neon(uint8_t *_dst, const ptrdiff_t _dst_stride,
>> +                          const int16_t *src0, const int16_t *src1,
>> +                          const int width, const int height,
>> +                          uintptr_t w0_w1, uintptr_t offset_shift);
>> +/* When passing arguments to functions, Apple platforms diverge from the ARM64
>> + * standard ABI, that we can't implement the function directly in asm.
>> + */
> 
> It's fully possible to implement that in assembly, but it usually requires ugly ifdefs.
> 
> That said, I'm ok with this kind of wrapper, as it avoids the problem kinda neatly, but ifdefs in the assembly can also be needed at times.
> 
>> +#define W_AVG_FUN(bit_depth) \
>> +static void vvc_w_avg_ ## bit_depth(uint8_t *dst, const ptrdiff_t dst_stride, \
>> +    const int16_t *src0, const int16_t *src1, const int width, const int height, \
>> +    const int denom, const int w0, const int w1, const int o0, const int o1) \
>> +{ \
>> +    const int shift = denom + FFMAX(3, 15 - bit_depth); \
>> +    const int offset = ((o0 + o1) * (1 << (bit_depth - 8)) + 1) * (1 << (shift - 1)); \
> 
> Same about the superfluous "const" everywhere. For local variables, I guess it can be argued that marking them as const can aid readability in some way, but I don't think we generally prescribe doing that.
> 
> The rest of the patch seems fine, thanks!
> 
> // Martin
> 
> _______________________________________________
> ffmpeg-devel mailing list
> ffmpeg-devel@ffmpeg.org <mailto:ffmpeg-devel@ffmpeg.org>
> https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
> 
> To unsubscribe, visit link above, or email
> ffmpeg-devel-request@ffmpeg.org <mailto:ffmpeg-devel-request@ffmpeg.org> with subject "unsubscribe".
Martin Storsjö Sept. 26, 2024, 12:17 p.m. UTC | #3
On Thu, 26 Sep 2024, Zhao Zhili wrote:

>       --- a/libavcodec/aarch64/vvc/dsp_init.c
>       +++ b/libavcodec/aarch64/vvc/dsp_init.c
>       @@ -52,6 +52,37 @@ void ff_vvc_avg_12_neon(uint8_t *dst,
>       ptrdiff_t dst_stride,
>                              const int16_t *src0, const int16_t
>       *src1, int width,
>                              int height);
>
>       +void ff_vvc_w_avg_8_neon(uint8_t *_dst, const ptrdiff_t
>       _dst_stride,
>       +                         const int16_t *src0, const
>       int16_t *src1,
>       +                         const int width, const int
>       height,
>       +                         uintptr_t w0_w1, uintptr_t
>       offset_shift);
> 
> 
> Including "const" on scalar parameters is entirely redundant, and we
> don't prescribe use of that elsewhere in ffmpeg, and just makes the
> whole declaration more noisy.
> 
> 
> I see these “const” make clang-tidy not happy. They are here to keep
> consistent with the prototypes
> in vvc/dsp.h.

Hmm, I don't quite understand this comment - so you say that clang-tidy, 
in addition to me, also complains about them? But they are added manually 
to keep the prototypes exactly in sync? Or does clang-tidy complain about 
differences, if we differ on the constness here?

> There are three options:
> 
> 1. Keep “const” as current state
> 2. Drop “const” only for these new functions
> 3. Remove “const” from vvc/dsp.h and all implementations
> 
> I can’t decide which way to go.

I would go for 3, at least long term.

If you need to keep the const within the function prototypes here for now 
to please some tool (I think most compilers wouldn't complain about 
differences in const on scalar parameters, although I think old MSVC did 
that), that's ok, but I would remove it from the unnecessary places (the 
local variables in the function, the parameter/register mappings in 
assembly).

Then we can try to do 3 as a later step.

// Martin
Zhao Zhili Sept. 26, 2024, 12:34 p.m. UTC | #4
> On Sep 26, 2024, at 20:17, Martin Storsjö <martin@martin.st> wrote:
> 
> On Thu, 26 Sep 2024, Zhao Zhili wrote:
> 
>>      --- a/libavcodec/aarch64/vvc/dsp_init.c
>>      +++ b/libavcodec/aarch64/vvc/dsp_init.c
>>      @@ -52,6 +52,37 @@ void ff_vvc_avg_12_neon(uint8_t *dst,
>>      ptrdiff_t dst_stride,
>>                             const int16_t *src0, const int16_t
>>      *src1, int width,
>>                             int height);
>> 
>>      +void ff_vvc_w_avg_8_neon(uint8_t *_dst, const ptrdiff_t
>>      _dst_stride,
>>      +                         const int16_t *src0, const
>>      int16_t *src1,
>>      +                         const int width, const int
>>      height,
>>      +                         uintptr_t w0_w1, uintptr_t
>>      offset_shift);
>> Including "const" on scalar parameters is entirely redundant, and we
>> don't prescribe use of that elsewhere in ffmpeg, and just makes the
>> whole declaration more noisy.
>> I see these “const” make clang-tidy not happy. They are here to keep
>> consistent with the prototypes
>> in vvc/dsp.h.
> 
> Hmm, I don't quite understand this comment - so you say that clang-tidy, in addition to me, also complain about them? But they are added manually to keep the prototypes exactly in sync? Or does clang-tidy complain about differences here, if we differ on the constness here?

Clang-tidy complains about these “const” qualifiers being added:

Clang-Tidy: Parameter 'block_w' is const-qualified in the function declaration; const-qualification of parameters only has an effect in function definitions

> 
>> There are three options:
>> 1. Keep “const” as current state
>> 2. Drop “const” only for these new functions
>> 3. Remove “const” from vvc/dsp.h and all implementations
>> I can’t decide which way to go.
> 
> I would go for 3, at least long term.
> 
> If you need to keep the const within the function prototypes here for now to please some tool (I think most compilers wouldn't complain about differences in const on scalar parameters, although I think old MSVC did that), that's ok, but I would remove it from the unnecessary places (the local variables in the function, the parameter/register mappings in assembly).
> 
> Then we can try to do 3 as a later step.
> 
> // Martin
> _______________________________________________
> ffmpeg-devel mailing list
> ffmpeg-devel@ffmpeg.org
> https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
> 
> To unsubscribe, visit link above, or email
> ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".
diff mbox series

Patch

diff --git a/libavcodec/aarch64/vvc/dsp_init.c b/libavcodec/aarch64/vvc/dsp_init.c
index ad767d17e2..b39ebb83fc 100644
--- a/libavcodec/aarch64/vvc/dsp_init.c
+++ b/libavcodec/aarch64/vvc/dsp_init.c
@@ -52,6 +52,37 @@  void ff_vvc_avg_12_neon(uint8_t *dst, ptrdiff_t dst_stride,
                         const int16_t *src0, const int16_t *src1, int width,
                         int height);
 
+void ff_vvc_w_avg_8_neon(uint8_t *_dst, const ptrdiff_t _dst_stride,
+                         const int16_t *src0, const int16_t *src1,
+                         const int width, const int height,
+                         uintptr_t w0_w1, uintptr_t offset_shift);
+void ff_vvc_w_avg_10_neon(uint8_t *_dst, const ptrdiff_t _dst_stride,
+                         const int16_t *src0, const int16_t *src1,
+                         const int width, const int height,
+                         uintptr_t w0_w1, uintptr_t offset_shift);
+void ff_vvc_w_avg_12_neon(uint8_t *_dst, const ptrdiff_t _dst_stride,
+                          const int16_t *src0, const int16_t *src1,
+                          const int width, const int height,
+                          uintptr_t w0_w1, uintptr_t offset_shift);
+/* Apple's ABI packs stack arguments differently from standard AAPCS64, so
+ * pack the extra arguments into two registers rather than use ifdefs in asm.
+ */
+#define W_AVG_FUN(bit_depth) \
+static void vvc_w_avg_ ## bit_depth(uint8_t *dst, const ptrdiff_t dst_stride, \
+    const int16_t *src0, const int16_t *src1, const int width, const int height, \
+    const int denom, const int w0, const int w1, const int o0, const int o1) \
+{ \
+    const int shift = denom + FFMAX(3, 15 - bit_depth); \
+    const int offset = ((o0 + o1) * (1 << (bit_depth - 8)) + 1) * (1 << (shift - 1)); \
+    uintptr_t w0_w1 = ((uintptr_t)w0 << 32) | (uint32_t)w1; \
+    uintptr_t offset_shift = ((uintptr_t)offset << 32) | (uint32_t)shift; \
+    ff_vvc_w_avg_ ## bit_depth ## _neon(dst, dst_stride, src0, src1, width, height, w0_w1, offset_shift); \
+}
+
+W_AVG_FUN(8)
+W_AVG_FUN(10)
+W_AVG_FUN(12)
+
 void ff_vvc_dsp_init_aarch64(VVCDSPContext *const c, const int bd)
 {
     int cpu_flags = av_get_cpu_flags();
@@ -123,6 +154,7 @@  void ff_vvc_dsp_init_aarch64(VVCDSPContext *const c, const int bd)
         c->inter.put_uni_w[0][6][0][0] = ff_vvc_put_pel_uni_w_pixels128_8_neon;
 
         c->inter.avg = ff_vvc_avg_8_neon;
+        c->inter.w_avg = vvc_w_avg_8;
 
         for (int i = 0; i < FF_ARRAY_ELEMS(c->sao.band_filter); i++)
             c->sao.band_filter[i] = ff_h26x_sao_band_filter_8x8_8_neon;
@@ -163,11 +195,13 @@  void ff_vvc_dsp_init_aarch64(VVCDSPContext *const c, const int bd)
         }
     } else if (bd == 10) {
         c->inter.avg = ff_vvc_avg_10_neon;
+        c->inter.w_avg = vvc_w_avg_10;
 
         c->alf.filter[LUMA] = alf_filter_luma_10_neon;
         c->alf.filter[CHROMA] = alf_filter_chroma_10_neon;
     } else if (bd == 12) {
         c->inter.avg = ff_vvc_avg_12_neon;
+        c->inter.w_avg = vvc_w_avg_12;
 
         c->alf.filter[LUMA] = alf_filter_luma_12_neon;
         c->alf.filter[CHROMA] = alf_filter_chroma_12_neon;
diff --git a/libavcodec/aarch64/vvc/inter.S b/libavcodec/aarch64/vvc/inter.S
index 2f69274b86..c4c6ab1a72 100644
--- a/libavcodec/aarch64/vvc/inter.S
+++ b/libavcodec/aarch64/vvc/inter.S
@@ -22,9 +22,9 @@ 
 
 #define VVC_MAX_PB_SIZE 128
 
-.macro vvc_avg, bit_depth
+.macro vvc_avg type, bit_depth
 
-.macro vvc_avg_\bit_depth\()_2_4, tap
+.macro vvc_\type\()_\bit_depth\()_2_4 tap
 .if \tap == 2
         ldr             s0, [src0]
         ldr             s2, [src1]
@@ -32,9 +32,19 @@ 
         ldr             d0, [src0]
         ldr             d2, [src1]
 .endif
+
+.ifc \type, avg
         saddl           v4.4s, v0.4h, v2.4h
         add             v4.4s, v4.4s, v16.4s
         sqshrn          v4.4h, v4.4s, #(15 - \bit_depth)
+.else
+        mov             v4.16b, v16.16b
+        smlal           v4.4s, v0.4h, v19.4h
+        smlal           v4.4s, v2.4h, v20.4h
+        sqshl           v4.4s, v4.4s, v22.4s
+        sqxtn           v4.4h, v4.4s
+.endif
+
 .if \bit_depth == 8
         sqxtun          v4.8b, v4.8h
 .if \tap == 2
@@ -57,7 +67,7 @@ 
         add             dst, dst, dst_stride
 .endm
 
-function ff_vvc_avg_\bit_depth\()_neon, export=1
+function ff_vvc_\type\()_\bit_depth\()_neon, export=1
         dst             .req x0
         dst_stride      .req x1
         src0            .req x2
@@ -67,42 +77,64 @@  function ff_vvc_avg_\bit_depth\()_neon, export=1
 
         mov             x10, #(VVC_MAX_PB_SIZE * 2)
         cmp             width, #8
-.if \bit_depth == 8
-        movi            v16.4s, #64
-.else
-.if \bit_depth == 10
-        mov             w6, #1023
-        movi            v16.4s, #16
+.ifc \type, avg
+        movi            v16.4s, #(1 << (14 - \bit_depth))
 .else
-        mov             w6, #4095
-        movi            v16.4s, #4
-.endif
+        lsr             x11, x6, #32        // weight0
+        mov             w12, w6             // weight1
+        lsr             x13, x7, #32        // offset
+        mov             w14, w7             // shift
+
+        dup             v19.8h, w11
+        neg             w14, w14            // so we can use sqshl
+        dup             v20.8h, w12
+        dup             v16.4s, w13
+        dup             v22.4s, w14
+.endif // avg
+
+ .if \bit_depth >= 10
+        // clip pixel
+        mov             w6, #((1 << \bit_depth) - 1)
         movi            v18.8h, #0
         dup             v17.8h, w6
 .endif
+
         b.eq            8f
         b.hi            16f
         cmp             width, #4
         b.eq            4f
 2:      // width == 2
         subs            height, height, #1
-        vvc_avg_\bit_depth\()_2_4 2
+        vvc_\type\()_\bit_depth\()_2_4 2
         b.ne            2b
         b               32f
 4:      // width == 4
         subs            height, height, #1
-        vvc_avg_\bit_depth\()_2_4 4
+        vvc_\type\()_\bit_depth\()_2_4 4
         b.ne            4b
         b               32f
 8:      // width == 8
         ld1             {v0.8h}, [src0], x10
         ld1             {v2.8h}, [src1], x10
+.ifc \type, avg
         saddl           v4.4s, v0.4h, v2.4h
         saddl2          v5.4s, v0.8h, v2.8h
         add             v4.4s, v4.4s, v16.4s
         add             v5.4s, v5.4s, v16.4s
         sqshrn          v4.4h, v4.4s, #(15 - \bit_depth)
         sqshrn2         v4.8h, v5.4s, #(15 - \bit_depth)
+.else
+        mov             v4.16b, v16.16b
+        mov             v5.16b, v16.16b
+        smlal           v4.4s, v0.4h, v19.4h
+        smlal           v4.4s, v2.4h, v20.4h
+        smlal2          v5.4s, v0.8h, v19.8h
+        smlal2          v5.4s, v2.8h, v20.8h
+        sqshl           v4.4s, v4.4s, v22.4s
+        sqshl           v5.4s, v5.4s, v22.4s
+        sqxtn           v4.4h, v4.4s
+        sqxtn2          v4.8h, v5.4s
+.endif
         subs            height, height, #1
 .if \bit_depth == 8
         sqxtun          v4.8b, v4.8h
@@ -122,6 +154,7 @@  function ff_vvc_avg_\bit_depth\()_neon, export=1
 17:
         ldp             q0, q1, [x7], #32
         ldp             q2, q3, [x8], #32
+.ifc \type, avg
         saddl           v4.4s, v0.4h, v2.4h
         saddl2          v5.4s, v0.8h, v2.8h
         saddl           v6.4s, v1.4h, v3.4h
@@ -134,6 +167,28 @@  function ff_vvc_avg_\bit_depth\()_neon, export=1
         sqshrn2         v4.8h, v5.4s, #(15 - \bit_depth)
         sqshrn          v6.4h, v6.4s, #(15 - \bit_depth)
         sqshrn2         v6.8h, v7.4s, #(15 - \bit_depth)
+.else   // avg
+        mov             v4.16b, v16.16b
+        mov             v5.16b, v16.16b
+        mov             v6.16b, v16.16b
+        mov             v7.16b, v16.16b
+        smlal           v4.4s, v0.4h, v19.4h
+        smlal           v4.4s, v2.4h, v20.4h
+        smlal2          v5.4s, v0.8h, v19.8h
+        smlal2          v5.4s, v2.8h, v20.8h
+        smlal           v6.4s, v1.4h, v19.4h
+        smlal           v6.4s, v3.4h, v20.4h
+        smlal2          v7.4s, v1.8h, v19.8h
+        smlal2          v7.4s, v3.8h, v20.8h
+        sqshl           v4.4s, v4.4s, v22.4s
+        sqshl           v5.4s, v5.4s, v22.4s
+        sqshl           v6.4s, v6.4s, v22.4s
+        sqshl           v7.4s, v7.4s, v22.4s
+        sqxtn           v4.4h, v4.4s
+        sqxtn           v6.4h, v6.4s
+        sqxtn2          v4.8h, v5.4s
+        sqxtn2          v6.8h, v7.4s
+.endif  // w_avg
         subs            w6, w6, #16
 .if \bit_depth == 8
         sqxtun          v4.8b, v4.8h
@@ -155,9 +210,19 @@  function ff_vvc_avg_\bit_depth\()_neon, export=1
         b.ne            16b
 32:
         ret
+
+.unreq dst
+.unreq dst_stride
+.unreq src0
+.unreq src1
+.unreq width
+.unreq height
 endfunc
 .endm
 
-vvc_avg 8
-vvc_avg 10
-vvc_avg 12
+vvc_avg avg, 8
+vvc_avg avg, 10
+vvc_avg avg, 12
+vvc_avg w_avg, 8
+vvc_avg w_avg, 10
+vvc_avg w_avg, 12