diff mbox series

[FFmpeg-devel,v4,1/3] aarch64/vvc: Add w_avg

Message ID tencent_A3CF81D54E2709B8AC165DCD702054CD8907@qq.com
State New
Headers show
Series aarch64/vvc add w_avg and dmvr/dmvr_hv | expand

Checks

Context Check Description
andriy/make_x86 success Make finished
andriy/make_fate_x86 success Make fate finished

Commit Message

Zhao Zhili Sept. 29, 2024, 12:02 p.m. UTC
From: Zhao Zhili <zhilizhao@tencent.com>

w_avg_8_2x2_c:                                           0.0 ( 0.00x)
w_avg_8_2x2_neon:                                        0.0 ( 0.00x)
w_avg_8_4x4_c:                                           0.2 ( 1.00x)
w_avg_8_4x4_neon:                                        0.0 ( 0.00x)
w_avg_8_8x8_c:                                           1.2 ( 1.00x)
w_avg_8_8x8_neon:                                        0.2 ( 5.00x)
w_avg_8_16x16_c:                                         4.2 ( 1.00x)
w_avg_8_16x16_neon:                                      0.8 ( 5.67x)
w_avg_8_32x32_c:                                        16.2 ( 1.00x)
w_avg_8_32x32_neon:                                      2.5 ( 6.50x)
w_avg_8_64x64_c:                                        64.5 ( 1.00x)
w_avg_8_64x64_neon:                                      9.0 ( 7.17x)
w_avg_8_128x128_c:                                     269.5 ( 1.00x)
w_avg_8_128x128_neon:                                   35.5 ( 7.59x)
w_avg_10_2x2_c:                                          0.2 ( 1.00x)
w_avg_10_2x2_neon:                                       0.2 ( 1.00x)
w_avg_10_4x4_c:                                          0.2 ( 1.00x)
w_avg_10_4x4_neon:                                       0.2 ( 1.00x)
w_avg_10_8x8_c:                                          1.0 ( 1.00x)
w_avg_10_8x8_neon:                                       0.2 ( 4.00x)
w_avg_10_16x16_c:                                        4.2 ( 1.00x)
w_avg_10_16x16_neon:                                     0.8 ( 5.67x)
w_avg_10_32x32_c:                                       16.2 ( 1.00x)
w_avg_10_32x32_neon:                                     2.5 ( 6.50x)
w_avg_10_64x64_c:                                       66.2 ( 1.00x)
w_avg_10_64x64_neon:                                    10.0 ( 6.62x)
w_avg_10_128x128_c:                                    277.8 ( 1.00x)
w_avg_10_128x128_neon:                                  39.8 ( 6.99x)
w_avg_12_2x2_c:                                          0.0 ( 0.00x)
w_avg_12_2x2_neon:                                       0.2 ( 0.00x)
w_avg_12_4x4_c:                                          0.2 ( 1.00x)
w_avg_12_4x4_neon:                                       0.0 ( 0.00x)
w_avg_12_8x8_c:                                          1.2 ( 1.00x)
w_avg_12_8x8_neon:                                       0.5 ( 2.50x)
w_avg_12_16x16_c:                                        4.8 ( 1.00x)
w_avg_12_16x16_neon:                                     0.8 ( 6.33x)
w_avg_12_32x32_c:                                       17.0 ( 1.00x)
w_avg_12_32x32_neon:                                     2.8 ( 6.18x)
w_avg_12_64x64_c:                                       64.0 ( 1.00x)
w_avg_12_64x64_neon:                                    10.0 ( 6.40x)
w_avg_12_128x128_c:                                    269.2 ( 1.00x)
w_avg_12_128x128_neon:                                  42.0 ( 6.41x)

Signed-off-by: Zhao Zhili <zhilizhao@tencent.com>
---
 libavcodec/aarch64/vvc/dsp_init.c | 36 +++++++++++
 libavcodec/aarch64/vvc/inter.S    | 99 +++++++++++++++++++++++++------
 2 files changed, 118 insertions(+), 17 deletions(-)

Comments

Martin Storsjö Sept. 29, 2024, 2:16 p.m. UTC | #1
On Sun, 29 Sep 2024, Zhao Zhili wrote:

> From: Zhao Zhili <zhilizhao@tencent.com>
>
> w_avg_8_2x2_c:                                           0.0 ( 0.00x)
> w_avg_8_2x2_neon:                                        0.0 ( 0.00x)
> w_avg_8_4x4_c:                                           0.2 ( 1.00x)
> w_avg_8_4x4_neon:                                        0.0 ( 0.00x)
> w_avg_8_8x8_c:                                           1.2 ( 1.00x)
> w_avg_8_8x8_neon:                                        0.2 ( 5.00x)
> w_avg_8_16x16_c:                                         4.2 ( 1.00x)
> w_avg_8_16x16_neon:                                      0.8 ( 5.67x)
> w_avg_8_32x32_c:                                        16.2 ( 1.00x)
> w_avg_8_32x32_neon:                                      2.5 ( 6.50x)
> w_avg_8_64x64_c:                                        64.5 ( 1.00x)
> w_avg_8_64x64_neon:                                      9.0 ( 7.17x)
> w_avg_8_128x128_c:                                     269.5 ( 1.00x)
> w_avg_8_128x128_neon:                                   35.5 ( 7.59x)
> w_avg_10_2x2_c:                                          0.2 ( 1.00x)
> w_avg_10_2x2_neon:                                       0.2 ( 1.00x)
> w_avg_10_4x4_c:                                          0.2 ( 1.00x)
> w_avg_10_4x4_neon:                                       0.2 ( 1.00x)
> w_avg_10_8x8_c:                                          1.0 ( 1.00x)
> w_avg_10_8x8_neon:                                       0.2 ( 4.00x)
> w_avg_10_16x16_c:                                        4.2 ( 1.00x)
> w_avg_10_16x16_neon:                                     0.8 ( 5.67x)
> w_avg_10_32x32_c:                                       16.2 ( 1.00x)
> w_avg_10_32x32_neon:                                     2.5 ( 6.50x)
> w_avg_10_64x64_c:                                       66.2 ( 1.00x)
> w_avg_10_64x64_neon:                                    10.0 ( 6.62x)
> w_avg_10_128x128_c:                                    277.8 ( 1.00x)
> w_avg_10_128x128_neon:                                  39.8 ( 6.99x)
> w_avg_12_2x2_c:                                          0.0 ( 0.00x)
> w_avg_12_2x2_neon:                                       0.2 ( 0.00x)
> w_avg_12_4x4_c:                                          0.2 ( 1.00x)
> w_avg_12_4x4_neon:                                       0.0 ( 0.00x)
> w_avg_12_8x8_c:                                          1.2 ( 1.00x)
> w_avg_12_8x8_neon:                                       0.5 ( 2.50x)
> w_avg_12_16x16_c:                                        4.8 ( 1.00x)
> w_avg_12_16x16_neon:                                     0.8 ( 6.33x)
> w_avg_12_32x32_c:                                       17.0 ( 1.00x)
> w_avg_12_32x32_neon:                                     2.8 ( 6.18x)
> w_avg_12_64x64_c:                                       64.0 ( 1.00x)
> w_avg_12_64x64_neon:                                    10.0 ( 6.40x)
> w_avg_12_128x128_c:                                    269.2 ( 1.00x)
> w_avg_12_128x128_neon:                                  42.0 ( 6.41x)
>
> Signed-off-by: Zhao Zhili <zhilizhao@tencent.com>
> ---
> libavcodec/aarch64/vvc/dsp_init.c | 36 +++++++++++
> libavcodec/aarch64/vvc/inter.S    | 99 +++++++++++++++++++++++++------
> 2 files changed, 118 insertions(+), 17 deletions(-)
>
> diff --git a/libavcodec/aarch64/vvc/dsp_init.c b/libavcodec/aarch64/vvc/dsp_init.c
> index ad767d17e2..ebe58a2ba5 100644
> --- a/libavcodec/aarch64/vvc/dsp_init.c
> +++ b/libavcodec/aarch64/vvc/dsp_init.c
> @@ -52,6 +52,39 @@ void ff_vvc_avg_12_neon(uint8_t *dst, ptrdiff_t dst_stride,
>                         const int16_t *src0, const int16_t *src1, int width,
>                         int height);
>
> +void ff_vvc_w_avg_8_neon(uint8_t *_dst, ptrdiff_t _dst_stride,
> +                         const int16_t *src0, const int16_t *src1,
> +                         int width, int height,
> +                         uintptr_t w0_w1, uintptr_t offset_shift);
> +void ff_vvc_w_avg_10_neon(uint8_t *_dst, ptrdiff_t _dst_stride,
> +                         const int16_t *src0, const int16_t *src1,
> +                         int width, int height,
> +                         uintptr_t w0_w1, uintptr_t offset_shift);
> +void ff_vvc_w_avg_12_neon(uint8_t *_dst, ptrdiff_t _dst_stride,
> +                          const int16_t *src0, const int16_t *src1,
> +                          int width, int height,
> +                          uintptr_t w0_w1, uintptr_t offset_shift);
> +/* When passing arguments to functions, Apple platforms diverge from the ARM64
> + * standard ABI for functions that require passing arguments on the stack. To
> + * simplify portability in the assembly function interface, use a different
> + * function signature that doesn't require passing arguments on the stack.
> + */
> +#define W_AVG_FUN(bit_depth) \
> +static void vvc_w_avg_ ## bit_depth(uint8_t *dst, ptrdiff_t dst_stride, \
> +    const int16_t *src0, const int16_t *src1, int width, int height, \
> +    int denom, int w0, int w1, int o0, int o1) \
> +{ \
> +    int shift = denom + FFMAX(3, 15 - bit_depth); \
> +    int offset = ((o0 + o1) * (1 << (bit_depth - 8)) + 1) * (1 << (shift - 1)); \
> +    uintptr_t w0_w1 = ((uintptr_t)w0 << 32) | (uint32_t)w1; \
> +    uintptr_t offset_shift = ((uintptr_t)offset << 32) | (uint32_t)shift; \
> +    ff_vvc_w_avg_ ## bit_depth ## _neon(dst, dst_stride, src0, src1, width, height, w0_w1, offset_shift); \
> +}

This LGTM now (and the rest was ok already before), thanks!

// Martin
Nuo Mi Oct. 1, 2024, 2:50 a.m. UTC | #2
On Sun, Sep 29, 2024 at 10:16 PM Martin Storsjö <martin@martin.st> wrote:

> On Sun, 29 Sep 2024, Zhao Zhili wrote:
>
> > From: Zhao Zhili <zhilizhao@tencent.com>
> >
> > w_avg_8_2x2_c:                                           0.0 ( 0.00x)
> > w_avg_8_2x2_neon:                                        0.0 ( 0.00x)
> > w_avg_8_4x4_c:                                           0.2 ( 1.00x)
> > w_avg_8_4x4_neon:                                        0.0 ( 0.00x)
> > w_avg_8_8x8_c:                                           1.2 ( 1.00x)
> > w_avg_8_8x8_neon:                                        0.2 ( 5.00x)
> > w_avg_8_16x16_c:                                         4.2 ( 1.00x)
> > w_avg_8_16x16_neon:                                      0.8 ( 5.67x)
> > w_avg_8_32x32_c:                                        16.2 ( 1.00x)
> > w_avg_8_32x32_neon:                                      2.5 ( 6.50x)
> > w_avg_8_64x64_c:                                        64.5 ( 1.00x)
> > w_avg_8_64x64_neon:                                      9.0 ( 7.17x)
> > w_avg_8_128x128_c:                                     269.5 ( 1.00x)
> > w_avg_8_128x128_neon:                                   35.5 ( 7.59x)
> > w_avg_10_2x2_c:                                          0.2 ( 1.00x)
> > w_avg_10_2x2_neon:                                       0.2 ( 1.00x)
> > w_avg_10_4x4_c:                                          0.2 ( 1.00x)
> > w_avg_10_4x4_neon:                                       0.2 ( 1.00x)
> > w_avg_10_8x8_c:                                          1.0 ( 1.00x)
> > w_avg_10_8x8_neon:                                       0.2 ( 4.00x)
> > w_avg_10_16x16_c:                                        4.2 ( 1.00x)
> > w_avg_10_16x16_neon:                                     0.8 ( 5.67x)
> > w_avg_10_32x32_c:                                       16.2 ( 1.00x)
> > w_avg_10_32x32_neon:                                     2.5 ( 6.50x)
> > w_avg_10_64x64_c:                                       66.2 ( 1.00x)
> > w_avg_10_64x64_neon:                                    10.0 ( 6.62x)
> > w_avg_10_128x128_c:                                    277.8 ( 1.00x)
> > w_avg_10_128x128_neon:                                  39.8 ( 6.99x)
> > w_avg_12_2x2_c:                                          0.0 ( 0.00x)
> > w_avg_12_2x2_neon:                                       0.2 ( 0.00x)
> > w_avg_12_4x4_c:                                          0.2 ( 1.00x)
> > w_avg_12_4x4_neon:                                       0.0 ( 0.00x)
> > w_avg_12_8x8_c:                                          1.2 ( 1.00x)
> > w_avg_12_8x8_neon:                                       0.5 ( 2.50x)
> > w_avg_12_16x16_c:                                        4.8 ( 1.00x)
> > w_avg_12_16x16_neon:                                     0.8 ( 6.33x)
> > w_avg_12_32x32_c:                                       17.0 ( 1.00x)
> > w_avg_12_32x32_neon:                                     2.8 ( 6.18x)
> > w_avg_12_64x64_c:                                       64.0 ( 1.00x)
> > w_avg_12_64x64_neon:                                    10.0 ( 6.40x)
> > w_avg_12_128x128_c:                                    269.2 ( 1.00x)
> > w_avg_12_128x128_neon:                                  42.0 ( 6.41x)
> >
> > Signed-off-by: Zhao Zhili <zhilizhao@tencent.com>
> > ---
> > libavcodec/aarch64/vvc/dsp_init.c | 36 +++++++++++
> > libavcodec/aarch64/vvc/inter.S    | 99 +++++++++++++++++++++++++------
> > 2 files changed, 118 insertions(+), 17 deletions(-)
> >
> > diff --git a/libavcodec/aarch64/vvc/dsp_init.c
> b/libavcodec/aarch64/vvc/dsp_init.c
> > index ad767d17e2..ebe58a2ba5 100644
> > --- a/libavcodec/aarch64/vvc/dsp_init.c
> > +++ b/libavcodec/aarch64/vvc/dsp_init.c
> > @@ -52,6 +52,39 @@ void ff_vvc_avg_12_neon(uint8_t *dst, ptrdiff_t
> dst_stride,
> >                         const int16_t *src0, const int16_t *src1, int
> width,
> >                         int height);
> >
> > +void ff_vvc_w_avg_8_neon(uint8_t *_dst, ptrdiff_t _dst_stride,
> > +                         const int16_t *src0, const int16_t *src1,
> > +                         int width, int height,
> > +                         uintptr_t w0_w1, uintptr_t offset_shift);
> > +void ff_vvc_w_avg_10_neon(uint8_t *_dst, ptrdiff_t _dst_stride,
> > +                         const int16_t *src0, const int16_t *src1,
> > +                         int width, int height,
> > +                         uintptr_t w0_w1, uintptr_t offset_shift);
> > +void ff_vvc_w_avg_12_neon(uint8_t *_dst, ptrdiff_t _dst_stride,
> > +                          const int16_t *src0, const int16_t *src1,
> > +                          int width, int height,
> > +                          uintptr_t w0_w1, uintptr_t offset_shift);
> > +/* When passing arguments to functions, Apple platforms diverge from
> the ARM64
> > + * standard ABI for functions that require passing arguments on the
> stack. To
> > + * simplify portability in the assembly function interface, use a
> different
> > + * function signature that doesn't require passing arguments on the
> stack.
> > + */
> > +#define W_AVG_FUN(bit_depth) \
> > +static void vvc_w_avg_ ## bit_depth(uint8_t *dst, ptrdiff_t dst_stride,
> \
> > +    const int16_t *src0, const int16_t *src1, int width, int height, \
> > +    int denom, int w0, int w1, int o0, int o1) \
> > +{ \
> > +    int shift = denom + FFMAX(3, 15 - bit_depth); \
> > +    int offset = ((o0 + o1) * (1 << (bit_depth - 8)) + 1) * (1 <<
> (shift - 1)); \
> > +    uintptr_t w0_w1 = ((uintptr_t)w0 << 32) | (uint32_t)w1; \
> > +    uintptr_t offset_shift = ((uintptr_t)offset << 32) |
> (uint32_t)shift; \
> > +    ff_vvc_w_avg_ ## bit_depth ## _neon(dst, dst_stride, src0, src1,
> width, height, w0_w1, offset_shift); \
> > +}
>
> This LGTM now (and the rest was ok already before), thanks!
>
Applied.
Thank you, Zhili and Martin.

>
> // Martin
>
> _______________________________________________
> ffmpeg-devel mailing list
> ffmpeg-devel@ffmpeg.org
> https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
>
> To unsubscribe, visit link above, or email
> ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".
>
diff mbox series

Patch

diff --git a/libavcodec/aarch64/vvc/dsp_init.c b/libavcodec/aarch64/vvc/dsp_init.c
index ad767d17e2..ebe58a2ba5 100644
--- a/libavcodec/aarch64/vvc/dsp_init.c
+++ b/libavcodec/aarch64/vvc/dsp_init.c
@@ -52,6 +52,39 @@  void ff_vvc_avg_12_neon(uint8_t *dst, ptrdiff_t dst_stride,
                         const int16_t *src0, const int16_t *src1, int width,
                         int height);
 
+void ff_vvc_w_avg_8_neon(uint8_t *_dst, ptrdiff_t _dst_stride,
+                         const int16_t *src0, const int16_t *src1,
+                         int width, int height,
+                         uintptr_t w0_w1, uintptr_t offset_shift);
+void ff_vvc_w_avg_10_neon(uint8_t *_dst, ptrdiff_t _dst_stride,
+                         const int16_t *src0, const int16_t *src1,
+                         int width, int height,
+                         uintptr_t w0_w1, uintptr_t offset_shift);
+void ff_vvc_w_avg_12_neon(uint8_t *_dst, ptrdiff_t _dst_stride,
+                          const int16_t *src0, const int16_t *src1,
+                          int width, int height,
+                          uintptr_t w0_w1, uintptr_t offset_shift);
+/* Apple platforms diverge from the standard ARM64 ABI when arguments have to
+ * be passed on the stack. To keep the assembly interface portable, the NEON
+ * functions take w0/w1 and offset/shift packed pairwise into two uintptr_t
+ * values (first of each pair in the upper 32 bits, second in the lower 32),
+ * so that no argument is passed on the stack. */
+#define W_AVG_FUN(bit_depth) \
+static void vvc_w_avg_ ## bit_depth(uint8_t *dst, ptrdiff_t dst_stride, \
+    const int16_t *src0, const int16_t *src1, int width, int height, \
+    int denom, int w0, int w1, int o0, int o1) \
+{ \
+    int shift = denom + FFMAX(3, 15 - bit_depth); \
+    int offset = ((o0 + o1) * (1 << (bit_depth - 8)) + 1) * (1 << (shift - 1)); \
+    uintptr_t w0_w1 = ((uintptr_t)w0 << 32) | (uint32_t)w1; \
+    uintptr_t offset_shift = ((uintptr_t)offset << 32) | (uint32_t)shift; \
+    ff_vvc_w_avg_ ## bit_depth ## _neon(dst, dst_stride, src0, src1, width, height, w0_w1, offset_shift); \
+}
+
+W_AVG_FUN(8)
+W_AVG_FUN(10)
+W_AVG_FUN(12)
+
 void ff_vvc_dsp_init_aarch64(VVCDSPContext *const c, const int bd)
 {
     int cpu_flags = av_get_cpu_flags();
@@ -123,6 +156,7 @@  void ff_vvc_dsp_init_aarch64(VVCDSPContext *const c, const int bd)
         c->inter.put_uni_w[0][6][0][0] = ff_vvc_put_pel_uni_w_pixels128_8_neon;
 
         c->inter.avg = ff_vvc_avg_8_neon;
+        c->inter.w_avg = vvc_w_avg_8;
 
         for (int i = 0; i < FF_ARRAY_ELEMS(c->sao.band_filter); i++)
             c->sao.band_filter[i] = ff_h26x_sao_band_filter_8x8_8_neon;
@@ -163,11 +197,13 @@  void ff_vvc_dsp_init_aarch64(VVCDSPContext *const c, const int bd)
         }
     } else if (bd == 10) {
         c->inter.avg = ff_vvc_avg_10_neon;
+        c->inter.w_avg = vvc_w_avg_10;
 
         c->alf.filter[LUMA] = alf_filter_luma_10_neon;
         c->alf.filter[CHROMA] = alf_filter_chroma_10_neon;
     } else if (bd == 12) {
         c->inter.avg = ff_vvc_avg_12_neon;
+        c->inter.w_avg = vvc_w_avg_12;
 
         c->alf.filter[LUMA] = alf_filter_luma_12_neon;
         c->alf.filter[CHROMA] = alf_filter_chroma_12_neon;
diff --git a/libavcodec/aarch64/vvc/inter.S b/libavcodec/aarch64/vvc/inter.S
index 2f69274b86..c4c6ab1a72 100644
--- a/libavcodec/aarch64/vvc/inter.S
+++ b/libavcodec/aarch64/vvc/inter.S
@@ -22,9 +22,9 @@ 
 
 #define VVC_MAX_PB_SIZE 128
 
-.macro vvc_avg, bit_depth
+.macro vvc_avg type, bit_depth
 
-.macro vvc_avg_\bit_depth\()_2_4, tap
+.macro vvc_\type\()_\bit_depth\()_2_4 tap
 .if \tap == 2
         ldr             s0, [src0]
         ldr             s2, [src1]
@@ -32,9 +32,19 @@ 
         ldr             d0, [src0]
         ldr             d2, [src1]
 .endif
+
+.ifc \type, avg
         saddl           v4.4s, v0.4h, v2.4h
         add             v4.4s, v4.4s, v16.4s
         sqshrn          v4.4h, v4.4s, #(15 - \bit_depth)
+.else
+        mov             v4.16b, v16.16b
+        smlal           v4.4s, v0.4h, v19.4h
+        smlal           v4.4s, v2.4h, v20.4h
+        sqshl           v4.4s, v4.4s, v22.4s
+        sqxtn           v4.4h, v4.4s
+.endif
+
 .if \bit_depth == 8
         sqxtun          v4.8b, v4.8h
 .if \tap == 2
@@ -57,7 +67,7 @@ 
         add             dst, dst, dst_stride
 .endm
 
-function ff_vvc_avg_\bit_depth\()_neon, export=1
+function ff_vvc_\type\()_\bit_depth\()_neon, export=1
         dst             .req x0
         dst_stride      .req x1
         src0            .req x2
@@ -67,42 +77,64 @@  function ff_vvc_avg_\bit_depth\()_neon, export=1
 
         mov             x10, #(VVC_MAX_PB_SIZE * 2)
         cmp             width, #8
-.if \bit_depth == 8
-        movi            v16.4s, #64
-.else
-.if \bit_depth == 10
-        mov             w6, #1023
-        movi            v16.4s, #16
+.ifc \type, avg
+        movi            v16.4s, #(1 << (14 - \bit_depth))
 .else
-        mov             w6, #4095
-        movi            v16.4s, #4
-.endif
+        lsr             x11, x6, #32        // weight0
+        mov             w12, w6             // weight1
+        lsr             x13, x7, #32        // offset
+        mov             w14, w7             // shift
+
+        dup             v19.8h, w11
+        neg             w14, w14            // so we can use sqshl
+        dup             v20.8h, w12
+        dup             v16.4s, w13
+        dup             v22.4s, w14
+.endif // avg
+
+ .if \bit_depth >= 10
+        // clip pixel
+        mov             w6, #((1 << \bit_depth) - 1)
         movi            v18.8h, #0
         dup             v17.8h, w6
 .endif
+
         b.eq            8f
         b.hi            16f
         cmp             width, #4
         b.eq            4f
 2:      // width == 2
         subs            height, height, #1
-        vvc_avg_\bit_depth\()_2_4 2
+        vvc_\type\()_\bit_depth\()_2_4 2
         b.ne            2b
         b               32f
 4:      // width == 4
         subs            height, height, #1
-        vvc_avg_\bit_depth\()_2_4 4
+        vvc_\type\()_\bit_depth\()_2_4 4
         b.ne            4b
         b               32f
 8:      // width == 8
         ld1             {v0.8h}, [src0], x10
         ld1             {v2.8h}, [src1], x10
+.ifc \type, avg
         saddl           v4.4s, v0.4h, v2.4h
         saddl2          v5.4s, v0.8h, v2.8h
         add             v4.4s, v4.4s, v16.4s
         add             v5.4s, v5.4s, v16.4s
         sqshrn          v4.4h, v4.4s, #(15 - \bit_depth)
         sqshrn2         v4.8h, v5.4s, #(15 - \bit_depth)
+.else
+        mov             v4.16b, v16.16b
+        mov             v5.16b, v16.16b
+        smlal           v4.4s, v0.4h, v19.4h
+        smlal           v4.4s, v2.4h, v20.4h
+        smlal2          v5.4s, v0.8h, v19.8h
+        smlal2          v5.4s, v2.8h, v20.8h
+        sqshl           v4.4s, v4.4s, v22.4s
+        sqshl           v5.4s, v5.4s, v22.4s
+        sqxtn           v4.4h, v4.4s
+        sqxtn2          v4.8h, v5.4s
+.endif
         subs            height, height, #1
 .if \bit_depth == 8
         sqxtun          v4.8b, v4.8h
@@ -122,6 +154,7 @@  function ff_vvc_avg_\bit_depth\()_neon, export=1
 17:
         ldp             q0, q1, [x7], #32
         ldp             q2, q3, [x8], #32
+.ifc \type, avg
         saddl           v4.4s, v0.4h, v2.4h
         saddl2          v5.4s, v0.8h, v2.8h
         saddl           v6.4s, v1.4h, v3.4h
@@ -134,6 +167,28 @@  function ff_vvc_avg_\bit_depth\()_neon, export=1
         sqshrn2         v4.8h, v5.4s, #(15 - \bit_depth)
         sqshrn          v6.4h, v6.4s, #(15 - \bit_depth)
         sqshrn2         v6.8h, v7.4s, #(15 - \bit_depth)
+.else   // avg
+        mov             v4.16b, v16.16b
+        mov             v5.16b, v16.16b
+        mov             v6.16b, v16.16b
+        mov             v7.16b, v16.16b
+        smlal           v4.4s, v0.4h, v19.4h
+        smlal           v4.4s, v2.4h, v20.4h
+        smlal2          v5.4s, v0.8h, v19.8h
+        smlal2          v5.4s, v2.8h, v20.8h
+        smlal           v6.4s, v1.4h, v19.4h
+        smlal           v6.4s, v3.4h, v20.4h
+        smlal2          v7.4s, v1.8h, v19.8h
+        smlal2          v7.4s, v3.8h, v20.8h
+        sqshl           v4.4s, v4.4s, v22.4s
+        sqshl           v5.4s, v5.4s, v22.4s
+        sqshl           v6.4s, v6.4s, v22.4s
+        sqshl           v7.4s, v7.4s, v22.4s
+        sqxtn           v4.4h, v4.4s
+        sqxtn           v6.4h, v6.4s
+        sqxtn2          v4.8h, v5.4s
+        sqxtn2          v6.8h, v7.4s
+.endif  // avg
         subs            w6, w6, #16
 .if \bit_depth == 8
         sqxtun          v4.8b, v4.8h
@@ -155,9 +210,19 @@  function ff_vvc_avg_\bit_depth\()_neon, export=1
         b.ne            16b
 32:
         ret
+
+.unreq dst
+.unreq dst_stride
+.unreq src0
+.unreq src1
+.unreq width
+.unreq height
 endfunc
 .endm
 
-vvc_avg 8
-vvc_avg 10
-vvc_avg 12
+vvc_avg avg, 8
+vvc_avg avg, 10
+vvc_avg avg, 12
+vvc_avg w_avg, 8
+vvc_avg w_avg, 10
+vvc_avg w_avg, 12