diff mbox series

[FFmpeg-devel,4/6] aarch64/vvc: Add put_pel/put_pel_uni/put_pel_uni_w

Message ID tencent_39A20543331D085733E9D36B75C4E9168E05@qq.com
State New
Headers show
Series [FFmpeg-devel,1/6] aarch64/hevc: Simplify function prototypes by macro | expand

Checks

Context Check Description
yinshiyou/make_loongarch64 success Make finished
yinshiyou/make_fate_loongarch64 success Make fate finished
andriy/make_x86 success Make finished
andriy/make_fate_x86 success Make fate finished

Commit Message

Zhao Zhili Sept. 7, 2024, 5:13 p.m. UTC
From: Zhao Zhili <zhilizhao@tencent.com>

put_luma_pixels_8_4x4_c:                                 0.2 ( 1.00x)
put_luma_pixels_8_4x4_neon:                              0.2 ( 1.00x)
put_luma_pixels_8_8x8_c:                                 0.7 ( 1.00x)
put_luma_pixels_8_8x8_neon:                              0.2 ( 3.22x)
put_luma_pixels_8_16x16_c:                               2.2 ( 1.00x)
put_luma_pixels_8_16x16_neon:                            0.2 ( 9.89x)
put_luma_pixels_8_32x32_c:                               8.2 ( 1.00x)
put_luma_pixels_8_32x32_neon:                            1.2 ( 6.71x)
put_luma_pixels_8_64x64_c:                              33.7 ( 1.00x)
put_luma_pixels_8_64x64_neon:                            2.5 (13.63x)
put_luma_pixels_8_128x128_c:                           145.5 ( 1.00x)
put_luma_pixels_8_128x128_neon:                         10.2 (14.23x)
put_uni_pixels_luma_8_4x4_c:                             0.5 ( 1.00x)
put_uni_pixels_luma_8_4x4_neon:                          0.0 ( 0.00x)
put_uni_pixels_luma_8_8x8_c:                             0.5 ( 1.00x)
put_uni_pixels_luma_8_8x8_neon:                          0.2 ( 2.11x)
put_uni_pixels_luma_8_16x16_c:                           1.2 ( 1.00x)
put_uni_pixels_luma_8_16x16_neon:                        0.2 ( 5.44x)
put_uni_pixels_luma_8_32x32_c:                           3.0 ( 1.00x)
put_uni_pixels_luma_8_32x32_neon:                        0.5 ( 6.26x)
put_uni_pixels_luma_8_64x64_c:                           3.0 ( 1.00x)
put_uni_pixels_luma_8_64x64_neon:                        1.7 ( 1.72x)
put_uni_pixels_luma_8_128x128_c:                         6.5 ( 1.00x)
put_uni_pixels_luma_8_128x128_neon:                      6.5 ( 1.00x)
---
 libavcodec/aarch64/h26x/dsp.h       |  22 ++++
 libavcodec/aarch64/h26x/epel_neon.S | 193 +++++++++++++++++-----------
 libavcodec/aarch64/h26x/qpel_neon.S |  83 +++++++++++-
 libavcodec/aarch64/vvc/Makefile     |   1 +
 libavcodec/aarch64/vvc/dsp_init.c   |  21 +++
 5 files changed, 245 insertions(+), 75 deletions(-)

Comments

Martin Storsjö Sept. 11, 2024, 12:19 p.m. UTC | #1
On Sun, 8 Sep 2024, Zhao Zhili wrote:

> diff --git a/libavcodec/aarch64/h26x/dsp.h b/libavcodec/aarch64/h26x/dsp.h
> index f72746ce03..076d01b477 100644
> --- a/libavcodec/aarch64/h26x/dsp.h
> +++ b/libavcodec/aarch64/h26x/dsp.h
> @@ -248,4 +248,26 @@ NEON8_FNPROTO_PARTIAL_4(qpel, (int16_t *dst, const uint8_t *_src, ptrdiff_t _src
> NEON8_FNPROTO_PARTIAL_4(qpel_uni, (uint8_t *_dst, ptrdiff_t _dststride, const uint8_t *_src,
>         ptrdiff_t _srcstride, int height, const int8_t *hf, const int8_t *vf, int width),)
>
> +#undef NEON8_FNPROTO_PARTIAL_6
> +#define NEON8_FNPROTO_PARTIAL_6(fn, args, ext) \
> +    void ff_vvc_put_##fn##4_8_neon##ext args; \
> +    void ff_vvc_put_##fn##8_8_neon##ext args; \
> +    void ff_vvc_put_##fn##16_8_neon##ext args; \
> +    void ff_vvc_put_##fn##32_8_neon##ext args; \
> +    void ff_vvc_put_##fn##64_8_neon##ext args; \
> +    void ff_vvc_put_##fn##128_8_neon##ext args
> +
> +NEON8_FNPROTO_PARTIAL_6(pel_pixels, (int16_t *dst,
> +        const uint8_t *src, ptrdiff_t srcstride, int height,
> +        const int8_t *hf, const int8_t *vf, int width),);
> +
> +NEON8_FNPROTO_PARTIAL_6(pel_uni_pixels, (uint8_t *_dst, ptrdiff_t _dststride,
> +        const uint8_t *_src, ptrdiff_t _srcstride, int height,
> +        const int8_t *hf, const int8_t *vf, int width),);
> +
> +NEON8_FNPROTO_PARTIAL_6(pel_uni_w_pixels, (uint8_t *_dst, ptrdiff_t _dststride,
> +        const uint8_t *_src, ptrdiff_t _srcstride,
> +        int height, int denom, int wx, int ox,
> +        const int8_t *hf, const int8_t *vf, int width),);
> +
> #endif
> diff --git a/libavcodec/aarch64/h26x/epel_neon.S b/libavcodec/aarch64/h26x/epel_neon.S
> index 378b0f7fb2..729395f2f0 100644
> --- a/libavcodec/aarch64/h26x/epel_neon.S
> +++ b/libavcodec/aarch64/h26x/epel_neon.S
> @@ -19,7 +19,8 @@
>  */
>
> #include "libavutil/aarch64/asm.S"
> -#define MAX_PB_SIZE 64
> +#define HEVC_MAX_PB_SIZE 64
> +#define VVC_MAX_PB_SIZE 128
>
> const epel_filters, align=4
>         .byte  0,  0,  0,  0
> @@ -131,8 +132,13 @@ endconst
>         b.ne            1b
> .endm
>
> +function ff_vvc_put_pel_pixels4_8_neon, export=1
> +        mov             x7, #(VVC_MAX_PB_SIZE * 2)
> +        b               1f
> +endfunc
> +
> function ff_hevc_put_hevc_pel_pixels4_8_neon, export=1
> -        mov             x7, #(MAX_PB_SIZE * 2)
> +        mov             x7, #(HEVC_MAX_PB_SIZE * 2)
> 1:      ld1             {v0.s}[0], [x1], x2
>         ushll           v4.8h, v0.8b, #6
>         subs            w3, w3, #1
> @@ -142,7 +148,7 @@ function ff_hevc_put_hevc_pel_pixels4_8_neon, export=1
> endfunc
>
> function ff_hevc_put_hevc_pel_pixels6_8_neon, export=1
> -        mov             x7, #(MAX_PB_SIZE * 2 - 8)
> +        mov             x7, #(HEVC_MAX_PB_SIZE * 2 - 8)
> 1:      ld1             {v0.8b}, [x1], x2
>         ushll           v4.8h, v0.8b, #6
>         st1             {v4.d}[0], [x0], #8
> @@ -152,8 +158,13 @@ function ff_hevc_put_hevc_pel_pixels6_8_neon, export=1
>         ret
> endfunc
>
> +function ff_vvc_put_pel_pixels8_8_neon, export=1
> +        mov             x7, #(VVC_MAX_PB_SIZE * 2)
> +        b               1f
> +endfunc
> +
> function ff_hevc_put_hevc_pel_pixels8_8_neon, export=1
> -        mov             x7, #(MAX_PB_SIZE * 2)
> +        mov             x7, #(HEVC_MAX_PB_SIZE * 2)
> 1:      ld1             {v0.8b}, [x1], x2
>         ushll           v4.8h, v0.8b, #6
>         subs            w3, w3, #1
> @@ -163,7 +174,7 @@ function ff_hevc_put_hevc_pel_pixels8_8_neon, export=1
> endfunc
>
> function ff_hevc_put_hevc_pel_pixels12_8_neon, export=1
> -        mov             x7, #(MAX_PB_SIZE * 2 - 16)
> +        mov             x7, #(HEVC_MAX_PB_SIZE * 2 - 16)
> 1:      ld1             {v0.8b, v1.8b}, [x1], x2
>         ushll           v4.8h, v0.8b, #6
>         st1             {v4.8h}, [x0], #16
> @@ -174,8 +185,13 @@ function ff_hevc_put_hevc_pel_pixels12_8_neon, export=1
>         ret
> endfunc
>
> +function ff_vvc_put_pel_pixels16_8_neon, export=1
> +        mov             x7, #(VVC_MAX_PB_SIZE * 2)
> +        b               1f
> +endfunc
> +
> function ff_hevc_put_hevc_pel_pixels16_8_neon, export=1
> -        mov             x7, #(MAX_PB_SIZE * 2)
> +        mov             x7, #(HEVC_MAX_PB_SIZE * 2)
> 1:      ld1             {v0.8b, v1.8b}, [x1], x2
>         ushll           v4.8h, v0.8b, #6
>         ushll           v5.8h, v1.8b, #6
> @@ -186,7 +202,7 @@ function ff_hevc_put_hevc_pel_pixels16_8_neon, export=1
> endfunc
>
> function ff_hevc_put_hevc_pel_pixels24_8_neon, export=1
> -        mov             x7, #(MAX_PB_SIZE * 2)
> +        mov             x7, #(HEVC_MAX_PB_SIZE * 2)
> 1:      ld1             {v0.8b-v2.8b}, [x1], x2
>         ushll           v4.8h, v0.8b, #6
>         ushll           v5.8h, v1.8b, #6
> @@ -197,8 +213,13 @@ function ff_hevc_put_hevc_pel_pixels24_8_neon, export=1
>         ret
> endfunc
>
> +function ff_vvc_put_pel_pixels32_8_neon, export=1
> +        mov             x7, #(VVC_MAX_PB_SIZE * 2)
> +        b               1f
> +endfunc
> +
> function ff_hevc_put_hevc_pel_pixels32_8_neon, export=1
> -        mov             x7, #(MAX_PB_SIZE * 2)
> +        mov             x7, #(HEVC_MAX_PB_SIZE * 2)
> 1:      ld1             {v0.8b-v3.8b}, [x1], x2
>         ushll           v4.8h, v0.8b, #6
>         ushll           v5.8h, v1.8b, #6
> @@ -211,7 +232,7 @@ function ff_hevc_put_hevc_pel_pixels32_8_neon, export=1
> endfunc
>
> function ff_hevc_put_hevc_pel_pixels48_8_neon, export=1
> -        mov             x7, #(MAX_PB_SIZE)
> +        mov             x7, #(HEVC_MAX_PB_SIZE)
> 1:      ld1             {v0.16b-v2.16b}, [x1], x2
>         ushll           v4.8h, v0.8b, #6
>         ushll2          v5.8h, v0.16b, #6
> @@ -226,26 +247,50 @@ function ff_hevc_put_hevc_pel_pixels48_8_neon, export=1
>         ret
> endfunc
>
> -function ff_hevc_put_hevc_pel_pixels64_8_neon, export=1
> -1:      ld1             {v0.16b-v3.16b}, [x1], x2
> +.macro put_pel_pixels64_8_neon
>         ushll           v4.8h, v0.8b, #6
>         ushll2          v5.8h, v0.16b, #6
>         ushll           v6.8h, v1.8b, #6
>         ushll2          v7.8h, v1.16b, #6
> -        st1             {v4.8h-v7.8h}, [x0], #(MAX_PB_SIZE)
> +        st1             {v4.8h-v7.8h}, [x0], #64
>         ushll           v16.8h, v2.8b, #6
>         ushll2          v17.8h, v2.16b, #6
>         ushll           v18.8h, v3.8b, #6
>         ushll2          v19.8h, v3.16b, #6
> -        subs            w3, w3, #1
> -        st1             {v16.8h-v19.8h}, [x0], #(MAX_PB_SIZE)
> -        b.ne            1b
> +        st1             {v16.8h-v19.8h}, [x0], x7
> +.endm
> +
> +function ff_vvc_put_pel_pixels64_8_neon, export=1
> +        mov             x7, #(2 * VVC_MAX_PB_SIZE - 64)
> +        b               1f
> +endfunc
> +
> +function ff_hevc_put_hevc_pel_pixels64_8_neon, export=1
> +        mov             x7, #(HEVC_MAX_PB_SIZE)
> +1:
> +        ld1             {v0.16b-v3.16b}, [x1], x2
> +        sub             w3, w3, #1
> +        put_pel_pixels64_8_neon
> +        cbnz            w3, 1b

We'd typically use subs + b.ne, rather than sub + cbnz, for loops like these.
Or is there anything inside the macros that clobbers the condition flags?

The same applies to most of the functions you're touching in this patch.

> +function ff_vvc_put_pel_uni_pixels128_8_neon, export=1
> +1:
> +        mov             x5, x2
> +        mov             x6, x0
> +        ld1             {v0.16b, v1.16b, v2.16b, v3.16b}, [x5], #64
> +        ld1             {v4.16b, v5.16b, v6.16b, v7.16b}, [x5]
> +        sub             w4, w4, #1
> +        add             x2, x2, x3
> +        add             x0, x0, x1
> +        st1             {v0.16b, v1.16b, v2.16b, v3.16b}, [x6], #64
> +        st1             {v4.16b, v5.16b, v6.16b, v7.16b}, [x6]
> +        cbnz            w4, 1b
> +        ret
> +endfunc

subs+b.ne rather than sub+cbnz, for consistency if nothing else.

The copying of values back and forth between x2/x5 and x0/x6 seems 
wasteful here. I'd suggest this instead:

   sub x1, x1, #64
   sub x3, x3, #64
1:
   ld1 [x2], #64
   subs w4, w4, #1
   ld1 [x2], x3
   ...
   st1 [x0], #64
   st1 [x0], x1
   b.ne 1b

The same goes for ff_vvc_put_pel_uni_w_pixels128_8_neon below as well.

// Martin
diff mbox series

Patch

diff --git a/libavcodec/aarch64/h26x/dsp.h b/libavcodec/aarch64/h26x/dsp.h
index f72746ce03..076d01b477 100644
--- a/libavcodec/aarch64/h26x/dsp.h
+++ b/libavcodec/aarch64/h26x/dsp.h
@@ -248,4 +248,26 @@  NEON8_FNPROTO_PARTIAL_4(qpel, (int16_t *dst, const uint8_t *_src, ptrdiff_t _src
 NEON8_FNPROTO_PARTIAL_4(qpel_uni, (uint8_t *_dst, ptrdiff_t _dststride, const uint8_t *_src,
         ptrdiff_t _srcstride, int height, const int8_t *hf, const int8_t *vf, int width),)
 
+#undef NEON8_FNPROTO_PARTIAL_6
+#define NEON8_FNPROTO_PARTIAL_6(fn, args, ext) \
+    void ff_vvc_put_##fn##4_8_neon##ext args; \
+    void ff_vvc_put_##fn##8_8_neon##ext args; \
+    void ff_vvc_put_##fn##16_8_neon##ext args; \
+    void ff_vvc_put_##fn##32_8_neon##ext args; \
+    void ff_vvc_put_##fn##64_8_neon##ext args; \
+    void ff_vvc_put_##fn##128_8_neon##ext args
+
+NEON8_FNPROTO_PARTIAL_6(pel_pixels, (int16_t *dst,
+        const uint8_t *src, ptrdiff_t srcstride, int height,
+        const int8_t *hf, const int8_t *vf, int width),);
+
+NEON8_FNPROTO_PARTIAL_6(pel_uni_pixels, (uint8_t *_dst, ptrdiff_t _dststride,
+        const uint8_t *_src, ptrdiff_t _srcstride, int height,
+        const int8_t *hf, const int8_t *vf, int width),);
+
+NEON8_FNPROTO_PARTIAL_6(pel_uni_w_pixels, (uint8_t *_dst, ptrdiff_t _dststride,
+        const uint8_t *_src, ptrdiff_t _srcstride,
+        int height, int denom, int wx, int ox,
+        const int8_t *hf, const int8_t *vf, int width),);
+
 #endif
diff --git a/libavcodec/aarch64/h26x/epel_neon.S b/libavcodec/aarch64/h26x/epel_neon.S
index 378b0f7fb2..729395f2f0 100644
--- a/libavcodec/aarch64/h26x/epel_neon.S
+++ b/libavcodec/aarch64/h26x/epel_neon.S
@@ -19,7 +19,8 @@ 
  */
 
 #include "libavutil/aarch64/asm.S"
-#define MAX_PB_SIZE 64
+#define HEVC_MAX_PB_SIZE 64
+#define VVC_MAX_PB_SIZE 128
 
 const epel_filters, align=4
         .byte  0,  0,  0,  0
@@ -131,8 +132,13 @@  endconst
         b.ne            1b
 .endm
 
+function ff_vvc_put_pel_pixels4_8_neon, export=1
+        mov             x7, #(VVC_MAX_PB_SIZE * 2)
+        b               1f
+endfunc
+
 function ff_hevc_put_hevc_pel_pixels4_8_neon, export=1
-        mov             x7, #(MAX_PB_SIZE * 2)
+        mov             x7, #(HEVC_MAX_PB_SIZE * 2)
 1:      ld1             {v0.s}[0], [x1], x2
         ushll           v4.8h, v0.8b, #6
         subs            w3, w3, #1
@@ -142,7 +148,7 @@  function ff_hevc_put_hevc_pel_pixels4_8_neon, export=1
 endfunc
 
 function ff_hevc_put_hevc_pel_pixels6_8_neon, export=1
-        mov             x7, #(MAX_PB_SIZE * 2 - 8)
+        mov             x7, #(HEVC_MAX_PB_SIZE * 2 - 8)
 1:      ld1             {v0.8b}, [x1], x2
         ushll           v4.8h, v0.8b, #6
         st1             {v4.d}[0], [x0], #8
@@ -152,8 +158,13 @@  function ff_hevc_put_hevc_pel_pixels6_8_neon, export=1
         ret
 endfunc
 
+function ff_vvc_put_pel_pixels8_8_neon, export=1
+        mov             x7, #(VVC_MAX_PB_SIZE * 2)
+        b               1f
+endfunc
+
 function ff_hevc_put_hevc_pel_pixels8_8_neon, export=1
-        mov             x7, #(MAX_PB_SIZE * 2)
+        mov             x7, #(HEVC_MAX_PB_SIZE * 2)
 1:      ld1             {v0.8b}, [x1], x2
         ushll           v4.8h, v0.8b, #6
         subs            w3, w3, #1
@@ -163,7 +174,7 @@  function ff_hevc_put_hevc_pel_pixels8_8_neon, export=1
 endfunc
 
 function ff_hevc_put_hevc_pel_pixels12_8_neon, export=1
-        mov             x7, #(MAX_PB_SIZE * 2 - 16)
+        mov             x7, #(HEVC_MAX_PB_SIZE * 2 - 16)
 1:      ld1             {v0.8b, v1.8b}, [x1], x2
         ushll           v4.8h, v0.8b, #6
         st1             {v4.8h}, [x0], #16
@@ -174,8 +185,13 @@  function ff_hevc_put_hevc_pel_pixels12_8_neon, export=1
         ret
 endfunc
 
+function ff_vvc_put_pel_pixels16_8_neon, export=1
+        mov             x7, #(VVC_MAX_PB_SIZE * 2)
+        b               1f
+endfunc
+
 function ff_hevc_put_hevc_pel_pixels16_8_neon, export=1
-        mov             x7, #(MAX_PB_SIZE * 2)
+        mov             x7, #(HEVC_MAX_PB_SIZE * 2)
 1:      ld1             {v0.8b, v1.8b}, [x1], x2
         ushll           v4.8h, v0.8b, #6
         ushll           v5.8h, v1.8b, #6
@@ -186,7 +202,7 @@  function ff_hevc_put_hevc_pel_pixels16_8_neon, export=1
 endfunc
 
 function ff_hevc_put_hevc_pel_pixels24_8_neon, export=1
-        mov             x7, #(MAX_PB_SIZE * 2)
+        mov             x7, #(HEVC_MAX_PB_SIZE * 2)
 1:      ld1             {v0.8b-v2.8b}, [x1], x2
         ushll           v4.8h, v0.8b, #6
         ushll           v5.8h, v1.8b, #6
@@ -197,8 +213,13 @@  function ff_hevc_put_hevc_pel_pixels24_8_neon, export=1
         ret
 endfunc
 
+function ff_vvc_put_pel_pixels32_8_neon, export=1
+        mov             x7, #(VVC_MAX_PB_SIZE * 2)
+        b               1f
+endfunc
+
 function ff_hevc_put_hevc_pel_pixels32_8_neon, export=1
-        mov             x7, #(MAX_PB_SIZE * 2)
+        mov             x7, #(HEVC_MAX_PB_SIZE * 2)
 1:      ld1             {v0.8b-v3.8b}, [x1], x2
         ushll           v4.8h, v0.8b, #6
         ushll           v5.8h, v1.8b, #6
@@ -211,7 +232,7 @@  function ff_hevc_put_hevc_pel_pixels32_8_neon, export=1
 endfunc
 
 function ff_hevc_put_hevc_pel_pixels48_8_neon, export=1
-        mov             x7, #(MAX_PB_SIZE)
+        mov             x7, #(HEVC_MAX_PB_SIZE)
 1:      ld1             {v0.16b-v2.16b}, [x1], x2
         ushll           v4.8h, v0.8b, #6
         ushll2          v5.8h, v0.16b, #6
@@ -226,26 +247,50 @@  function ff_hevc_put_hevc_pel_pixels48_8_neon, export=1
         ret
 endfunc
 
-function ff_hevc_put_hevc_pel_pixels64_8_neon, export=1
-1:      ld1             {v0.16b-v3.16b}, [x1], x2
+.macro put_pel_pixels64_8_neon
         ushll           v4.8h, v0.8b, #6
         ushll2          v5.8h, v0.16b, #6
         ushll           v6.8h, v1.8b, #6
         ushll2          v7.8h, v1.16b, #6
-        st1             {v4.8h-v7.8h}, [x0], #(MAX_PB_SIZE)
+        st1             {v4.8h-v7.8h}, [x0], #64
         ushll           v16.8h, v2.8b, #6
         ushll2          v17.8h, v2.16b, #6
         ushll           v18.8h, v3.8b, #6
         ushll2          v19.8h, v3.16b, #6
-        subs            w3, w3, #1
-        st1             {v16.8h-v19.8h}, [x0], #(MAX_PB_SIZE)
-        b.ne            1b
+        st1             {v16.8h-v19.8h}, [x0], x7
+.endm
+
+function ff_vvc_put_pel_pixels64_8_neon, export=1
+        mov             x7, #(2 * VVC_MAX_PB_SIZE - 64)
+        b               1f
+endfunc
+
+function ff_hevc_put_hevc_pel_pixels64_8_neon, export=1
+        mov             x7, #(HEVC_MAX_PB_SIZE)
+1:
+        ld1             {v0.16b-v3.16b}, [x1], x2
+        sub             w3, w3, #1
+        put_pel_pixels64_8_neon
+        cbnz            w3, 1b
         ret
 endfunc
 
+function ff_vvc_put_pel_pixels128_8_neon, export=1
+        mov             x7, #64
+1:
+        mov             x6, x1
+        ld1             {v0.16b-v3.16b}, [x6], #64
+        add             x1, x1, x2
+        sub             w3, w3, #1
+        put_pel_pixels64_8_neon
+        ld1             {v0.16b-v3.16b}, [x6], #64
+        put_pel_pixels64_8_neon
+        cbnz            w3, 1b
+        ret
+endfunc
 
 function ff_hevc_put_hevc_pel_bi_pixels4_8_neon, export=1
-        mov             x10, #(MAX_PB_SIZE * 2)
+        mov             x10, #(HEVC_MAX_PB_SIZE * 2)
 1:      ld1             {v0.s}[0], [x2], x3 // src
         ushll           v16.8h, v0.8b, #6
         ld1             {v20.4h}, [x4], x10 // src2
@@ -258,7 +303,7 @@  function ff_hevc_put_hevc_pel_bi_pixels4_8_neon, export=1
 endfunc
 
 function ff_hevc_put_hevc_pel_bi_pixels6_8_neon, export=1
-        mov             x10, #(MAX_PB_SIZE * 2)
+        mov             x10, #(HEVC_MAX_PB_SIZE * 2)
         sub             x1, x1, #4
 1:      ld1             {v0.8b}, [x2], x3
         ushll           v16.8h, v0.8b, #6
@@ -273,7 +318,7 @@  function ff_hevc_put_hevc_pel_bi_pixels6_8_neon, export=1
 endfunc
 
 function ff_hevc_put_hevc_pel_bi_pixels8_8_neon, export=1
-        mov             x10, #(MAX_PB_SIZE * 2)
+        mov             x10, #(HEVC_MAX_PB_SIZE * 2)
 1:      ld1             {v0.8b}, [x2], x3    // src
         ushll           v16.8h, v0.8b, #6
         ld1             {v20.8h}, [x4], x10  // src2
@@ -286,7 +331,7 @@  function ff_hevc_put_hevc_pel_bi_pixels8_8_neon, export=1
 endfunc
 
 function ff_hevc_put_hevc_pel_bi_pixels12_8_neon, export=1
-        mov             x10, #(MAX_PB_SIZE * 2)
+        mov             x10, #(HEVC_MAX_PB_SIZE * 2)
         sub             x1, x1, #8
 1:      ld1             {v0.16b}, [x2], x3
         ushll           v16.8h, v0.8b, #6
@@ -304,7 +349,7 @@  function ff_hevc_put_hevc_pel_bi_pixels12_8_neon, export=1
 endfunc
 
 function ff_hevc_put_hevc_pel_bi_pixels16_8_neon, export=1
-        mov             x10, #(MAX_PB_SIZE * 2)
+        mov             x10, #(HEVC_MAX_PB_SIZE * 2)
 1:      ld1             {v0.16b}, [x2], x3 // src
         ushll           v16.8h, v0.8b, #6
         ushll2          v17.8h, v0.16b, #6
@@ -320,7 +365,7 @@  function ff_hevc_put_hevc_pel_bi_pixels16_8_neon, export=1
 endfunc
 
 function ff_hevc_put_hevc_pel_bi_pixels24_8_neon, export=1
-        mov             x10, #(MAX_PB_SIZE * 2)
+        mov             x10, #(HEVC_MAX_PB_SIZE * 2)
 1:      ld1             {v0.8b-v2.8b}, [x2], x3 // src
         ushll           v16.8h, v0.8b, #6
         ushll           v17.8h, v1.8b, #6
@@ -339,7 +384,7 @@  function ff_hevc_put_hevc_pel_bi_pixels24_8_neon, export=1
 endfunc
 
 function ff_hevc_put_hevc_pel_bi_pixels32_8_neon, export=1
-        mov             x10, #(MAX_PB_SIZE * 2)
+        mov             x10, #(HEVC_MAX_PB_SIZE * 2)
 1:      ld1             {v0.16b-v1.16b}, [x2], x3 // src
         ushll           v16.8h, v0.8b, #6
         ushll2          v17.8h, v0.16b, #6
@@ -361,7 +406,7 @@  function ff_hevc_put_hevc_pel_bi_pixels32_8_neon, export=1
 endfunc
 
 function ff_hevc_put_hevc_pel_bi_pixels48_8_neon, export=1
-        mov             x10, #(MAX_PB_SIZE)
+        mov             x10, #(HEVC_MAX_PB_SIZE)
 1:      ld1             {v0.16b-v2.16b}, [x2], x3 // src
         ushll           v16.8h, v0.8b, #6
         ushll2          v17.8h, v0.16b, #6
@@ -369,7 +414,7 @@  function ff_hevc_put_hevc_pel_bi_pixels48_8_neon, export=1
         ushll2          v19.8h, v1.16b, #6
         ushll           v20.8h, v2.8b, #6
         ushll2          v21.8h, v2.16b, #6
-        ld1             {v24.8h-v27.8h}, [x4], #(MAX_PB_SIZE) // src2
+        ld1             {v24.8h-v27.8h}, [x4], #(HEVC_MAX_PB_SIZE) // src2
         sqadd           v16.8h, v16.8h, v24.8h
         sqadd           v17.8h, v17.8h, v25.8h
         sqadd           v18.8h, v18.8h, v26.8h
@@ -399,12 +444,12 @@  function ff_hevc_put_hevc_pel_bi_pixels64_8_neon, export=1
         ushll2          v21.8h, v2.16b, #6
         ushll           v22.8h, v3.8b, #6
         ushll2          v23.8h, v3.16b, #6
-        ld1             {v24.8h, v25.8h, v26.8h, v27.8h}, [x4], #(MAX_PB_SIZE) // src2
+        ld1             {v24.8h, v25.8h, v26.8h, v27.8h}, [x4], #(HEVC_MAX_PB_SIZE) // src2
         sqadd           v16.8h, v16.8h, v24.8h
         sqadd           v17.8h, v17.8h, v25.8h
         sqadd           v18.8h, v18.8h, v26.8h
         sqadd           v19.8h, v19.8h, v27.8h
-        ld1             {v24.8h, v25.8h, v26.8h, v27.8h}, [x4], #(MAX_PB_SIZE)
+        ld1             {v24.8h, v25.8h, v26.8h, v27.8h}, [x4], #(HEVC_MAX_PB_SIZE)
         sqadd           v20.8h, v20.8h, v24.8h
         sqadd           v21.8h, v21.8h, v25.8h
         sqadd           v22.8h, v22.8h, v26.8h
@@ -427,7 +472,7 @@  endfunc
 function ff_hevc_put_hevc_epel_bi_h4_8_neon, export=1
         load_epel_filterb x6, x7
         sub             x2, x2, #1
-        mov             x10, #(MAX_PB_SIZE * 2)
+        mov             x10, #(HEVC_MAX_PB_SIZE * 2)
 1:      ld1             {v4.8b}, [x2], x3
         ext             v5.8b, v4.8b, v4.8b, #1
         ext             v6.8b, v4.8b, v4.8b, #2
@@ -446,7 +491,7 @@  function ff_hevc_put_hevc_epel_bi_h6_8_neon, export=1
         load_epel_filterb x6, x7
         sub             w1, w1, #4
         sub             x2, x2, #1
-        mov             x10, #(MAX_PB_SIZE * 2)
+        mov             x10, #(HEVC_MAX_PB_SIZE * 2)
 1:      ld1             {v24.16b}, [x2], x3
         ext             v26.16b, v24.16b, v24.16b, #1
         ext             v27.16b, v24.16b, v24.16b, #2
@@ -465,7 +510,7 @@  endfunc
 function ff_hevc_put_hevc_epel_bi_h8_8_neon, export=1
         load_epel_filterb x6, x7
         sub             x2, x2, #1
-        mov             x10, #(MAX_PB_SIZE * 2)
+        mov             x10, #(HEVC_MAX_PB_SIZE * 2)
 1:      ld1             {v24.16b}, [x2], x3
         ext             v26.16b, v24.16b, v24.16b, #1
         ext             v27.16b, v24.16b, v24.16b, #2
@@ -484,7 +529,7 @@  function ff_hevc_put_hevc_epel_bi_h12_8_neon, export=1
         load_epel_filterb x6, x7
         sub             x1, x1, #8
         sub             x2, x2, #1
-        mov             x10, #(MAX_PB_SIZE * 2)
+        mov             x10, #(HEVC_MAX_PB_SIZE * 2)
 1:      ld1             {v24.16b}, [x2], x3
         ext             v26.16b, v24.16b, v24.16b, #1
         ext             v27.16b, v24.16b, v24.16b, #2
@@ -506,7 +551,7 @@  endfunc
 function ff_hevc_put_hevc_epel_bi_h16_8_neon, export=1
         load_epel_filterb x6, x7
         sub             x2, x2, #1
-        mov             x10, #(MAX_PB_SIZE * 2)
+        mov             x10, #(HEVC_MAX_PB_SIZE * 2)
 1:      ldr             q24, [x2]
         ldr             s25, [x2, #16]
         add             x2, x2, x3
@@ -529,7 +574,7 @@  endfunc
 function ff_hevc_put_hevc_epel_bi_h24_8_neon, export=1
         load_epel_filterb x6, x7
         sub             x2, x2, #1
-        mov             x10, #(MAX_PB_SIZE * 2)
+        mov             x10, #(HEVC_MAX_PB_SIZE * 2)
 1:      ld1             {v24.16b, v25.16b}, [x2], x3
         ext             v26.16b, v24.16b, v25.16b, #1
         ext             v27.16b, v24.16b, v25.16b, #2
@@ -556,7 +601,7 @@  endfunc
 function ff_hevc_put_hevc_epel_bi_h32_8_neon, export=1
         load_epel_filterb x6, x7
         sub             x2, x2, #1
-        mov             x10, #(MAX_PB_SIZE * 2)
+        mov             x10, #(HEVC_MAX_PB_SIZE * 2)
 1:      ldp             q24, q25, [x2]
         ldr             s26, [x2, #32]
         add             x2, x2, x3
@@ -589,7 +634,7 @@  function ff_hevc_put_hevc_epel_bi_h48_8_neon, export=1
         load_epel_filterb x6, x7
         sub             x2, x2, #1
         mov             x7, #24
-        mov             x10, #(MAX_PB_SIZE * 2 - 48)
+        mov             x10, #(HEVC_MAX_PB_SIZE * 2 - 48)
 1:      ld1             {v24.16b, v25.16b, v26.16b}, [x2]
         ldr             s27, [x2, #48]
         add             x2, x2, x3
@@ -683,7 +728,7 @@  endfunc
 function ff_hevc_put_hevc_epel_bi_v4_8_neon, export=1
         load_epel_filterb x7, x6
         sub             x2, x2, x3
-        mov             x10, #(MAX_PB_SIZE * 2)
+        mov             x10, #(HEVC_MAX_PB_SIZE * 2)
         ld1             {v16.s}[0], [x2], x3
         ld1             {v17.s}[0], [x2], x3
         ld1             {v18.s}[0], [x2], x3
@@ -705,7 +750,7 @@  function ff_hevc_put_hevc_epel_bi_v6_8_neon, export=1
         load_epel_filterb x7, x6
         sub             x2, x2, x3
         sub             x1, x1, #4
-        mov             x10, #(MAX_PB_SIZE * 2)
+        mov             x10, #(HEVC_MAX_PB_SIZE * 2)
         ld1             {v16.8b}, [x2], x3
         ld1             {v17.8b}, [x2], x3
         ld1             {v18.8b}, [x2], x3
@@ -727,7 +772,7 @@  endfunc
 function ff_hevc_put_hevc_epel_bi_v8_8_neon, export=1
         load_epel_filterb x7, x6
         sub             x2, x2, x3
-        mov             x10, #(MAX_PB_SIZE * 2)
+        mov             x10, #(HEVC_MAX_PB_SIZE * 2)
         ld1             {v16.8b}, [x2], x3
         ld1             {v17.8b}, [x2], x3
         ld1             {v18.8b}, [x2], x3
@@ -749,7 +794,7 @@  function ff_hevc_put_hevc_epel_bi_v12_8_neon, export=1
         load_epel_filterb x7, x6
         sub             x1, x1, #8
         sub             x2, x2, x3
-        mov             x10, #(MAX_PB_SIZE * 2)
+        mov             x10, #(HEVC_MAX_PB_SIZE * 2)
         ld1             {v16.16b}, [x2], x3
         ld1             {v17.16b}, [x2], x3
         ld1             {v18.16b}, [x2], x3
@@ -774,7 +819,7 @@  endfunc
 function ff_hevc_put_hevc_epel_bi_v16_8_neon, export=1
         load_epel_filterb x7, x6
         sub             x2, x2, x3
-        mov             x10, #(MAX_PB_SIZE * 2)
+        mov             x10, #(HEVC_MAX_PB_SIZE * 2)
         ld1             {v16.16b}, [x2], x3
         ld1             {v17.16b}, [x2], x3
         ld1             {v18.16b}, [x2], x3
@@ -798,7 +843,7 @@  endfunc
 function ff_hevc_put_hevc_epel_bi_v24_8_neon, export=1
         load_epel_filterb x7, x6
         sub             x2, x2, x3
-        mov             x10, #(MAX_PB_SIZE * 2)
+        mov             x10, #(HEVC_MAX_PB_SIZE * 2)
         ld1             {v16.8b, v17.8b, v18.8b}, [x2], x3
         ld1             {v19.8b, v20.8b, v21.8b}, [x2], x3
         ld1             {v22.8b, v23.8b, v24.8b}, [x2], x3
@@ -825,7 +870,7 @@  endfunc
 function ff_hevc_put_hevc_epel_bi_v32_8_neon, export=1
         load_epel_filterb x7, x6
         sub             x2, x2, x3
-        mov             x10, #(MAX_PB_SIZE * 2)
+        mov             x10, #(HEVC_MAX_PB_SIZE * 2)
         ld1             {v16.16b, v17.16b}, [x2], x3
         ld1             {v18.16b, v19.16b}, [x2], x3
         ld1             {v20.16b, v21.16b}, [x2], x3
@@ -895,7 +940,7 @@  endfunc
 function ff_hevc_put_hevc_epel_v4_8_neon, export=1
         load_epel_filterb x5, x4
         sub             x1, x1, x2
-        mov             x10, #(MAX_PB_SIZE * 2)
+        mov             x10, #(HEVC_MAX_PB_SIZE * 2)
         ldr             s16, [x1]
         ldr             s17, [x1, x2]
         add             x1, x1, x2, lsl #1
@@ -915,7 +960,7 @@  endfunc
 function ff_hevc_put_hevc_epel_v6_8_neon, export=1
         load_epel_filterb x5, x4
         sub             x1, x1, x2
-        mov             x10, #(MAX_PB_SIZE * 2 - 8)
+        mov             x10, #(HEVC_MAX_PB_SIZE * 2 - 8)
         ldr             d16, [x1]
         ldr             d17, [x1, x2]
         add             x1, x1, x2, lsl #1
@@ -936,7 +981,7 @@  endfunc
 function ff_hevc_put_hevc_epel_v8_8_neon, export=1
         load_epel_filterb x5, x4
         sub             x1, x1, x2
-        mov             x10, #(MAX_PB_SIZE * 2)
+        mov             x10, #(HEVC_MAX_PB_SIZE * 2)
         ldr             d16, [x1]
         ldr             d17, [x1, x2]
         add             x1, x1, x2, lsl #1
@@ -956,7 +1001,7 @@  endfunc
 function ff_hevc_put_hevc_epel_v12_8_neon, export=1
         load_epel_filterb x5, x4
         sub             x1, x1, x2
-        mov             x10, #(MAX_PB_SIZE * 2)
+        mov             x10, #(HEVC_MAX_PB_SIZE * 2)
         ldr             q16, [x1]
         ldr             q17, [x1, x2]
         add             x1, x1, x2, lsl #1
@@ -980,7 +1025,7 @@  endfunc
 function ff_hevc_put_hevc_epel_v16_8_neon, export=1
         load_epel_filterb x5, x4
         sub             x1, x1, x2
-        mov             x10, #(MAX_PB_SIZE * 2)
+        mov             x10, #(HEVC_MAX_PB_SIZE * 2)
         ldr             q16, [x1]
         ldr             q17, [x1, x2]
         add             x1, x1, x2, lsl #1
@@ -1002,7 +1047,7 @@  endfunc
 function ff_hevc_put_hevc_epel_v24_8_neon, export=1
         load_epel_filterb x5, x4
         sub             x1, x1, x2
-        mov             x10, #(MAX_PB_SIZE * 2)
+        mov             x10, #(HEVC_MAX_PB_SIZE * 2)
         ld1             {v16.8b, v17.8b, v18.8b}, [x1], x2
         ld1             {v19.8b, v20.8b, v21.8b}, [x1], x2
         ld1             {v22.8b, v23.8b, v24.8b}, [x1], x2
@@ -1025,7 +1070,7 @@  endfunc
 function ff_hevc_put_hevc_epel_v32_8_neon, export=1
         load_epel_filterb x5, x4
         sub             x1, x1, x2
-        mov             x10, #(MAX_PB_SIZE * 2)
+        mov             x10, #(HEVC_MAX_PB_SIZE * 2)
         ld1             {v16.16b, v17.16b}, [x1], x2
         ld1             {v18.16b, v19.16b}, [x1], x2
         ld1             {v20.16b, v21.16b}, [x1], x2
@@ -1327,7 +1372,7 @@  endfunc
         add             x5, x5, x4, lsl #2
         ld1r            {v30.4s}, [x5]
         sub             x1, x1, #1
-        mov             x10, #(MAX_PB_SIZE * 2)
+        mov             x10, #(HEVC_MAX_PB_SIZE * 2)
 .endm
 
 function ff_hevc_put_hevc_epel_h4_8_neon, export=1
@@ -2179,7 +2224,7 @@  DISABLE_I8MM
 
 function hevc_put_hevc_epel_hv4_8_end_neon
         load_epel_filterh x5, x4
-        mov             x10, #(MAX_PB_SIZE * 2)
+        mov             x10, #(HEVC_MAX_PB_SIZE * 2)
         ldr             d16, [sp]
         ldr             d17, [sp, x10]
         add             sp, sp, x10, lsl #1
@@ -2198,7 +2243,7 @@  endfunc
 function hevc_put_hevc_epel_hv6_8_end_neon
         load_epel_filterh x5, x4
         mov             x5, #120
-        mov             x10, #(MAX_PB_SIZE * 2)
+        mov             x10, #(HEVC_MAX_PB_SIZE * 2)
         ldr             q16, [sp]
         ldr             q17, [sp, x10]
         add             sp, sp, x10, lsl #1
@@ -2218,7 +2263,7 @@  endfunc
 
 function hevc_put_hevc_epel_hv8_8_end_neon
         load_epel_filterh x5, x4
-        mov             x10, #(MAX_PB_SIZE * 2)
+        mov             x10, #(HEVC_MAX_PB_SIZE * 2)
         ldr             q16, [sp]
         ldr             q17, [sp, x10]
         add             sp, sp, x10, lsl #1
@@ -2238,7 +2283,7 @@  endfunc
 function hevc_put_hevc_epel_hv12_8_end_neon
         load_epel_filterh x5, x4
         mov             x5, #112
-        mov             x10, #(MAX_PB_SIZE * 2)
+        mov             x10, #(HEVC_MAX_PB_SIZE * 2)
         ld1             {v16.8h, v17.8h}, [sp], x10
         ld1             {v18.8h, v19.8h}, [sp], x10
         ld1             {v20.8h, v21.8h}, [sp], x10
@@ -2258,7 +2303,7 @@  endfunc
 
 function hevc_put_hevc_epel_hv16_8_end_neon
         load_epel_filterh x5, x4
-        mov             x10, #(MAX_PB_SIZE * 2)
+        mov             x10, #(HEVC_MAX_PB_SIZE * 2)
         ld1             {v16.8h, v17.8h}, [sp], x10
         ld1             {v18.8h, v19.8h}, [sp], x10
         ld1             {v20.8h, v21.8h}, [sp], x10
@@ -2278,7 +2323,7 @@  endfunc
 
 function hevc_put_hevc_epel_hv24_8_end_neon
         load_epel_filterh x5, x4
-        mov             x10, #(MAX_PB_SIZE * 2)
+        mov             x10, #(HEVC_MAX_PB_SIZE * 2)
         ld1             {v16.8h, v17.8h, v18.8h}, [sp], x10
         ld1             {v19.8h, v20.8h, v21.8h}, [sp], x10
         ld1             {v22.8h, v23.8h, v24.8h}, [sp], x10
@@ -2462,7 +2507,7 @@  epel_hv neon
 
 function hevc_put_hevc_epel_uni_hv4_8_end_neon
         load_epel_filterh x6, x5
-        mov             x10, #(MAX_PB_SIZE * 2)
+        mov             x10, #(HEVC_MAX_PB_SIZE * 2)
         ld1             {v16.4h}, [sp], x10
         ld1             {v17.4h}, [sp], x10
         ld1             {v18.4h}, [sp], x10
@@ -2481,7 +2526,7 @@  endfunc
 function hevc_put_hevc_epel_uni_hv6_8_end_neon
         load_epel_filterh x6, x5
         sub             x1, x1, #4
-        mov             x10, #(MAX_PB_SIZE * 2)
+        mov             x10, #(HEVC_MAX_PB_SIZE * 2)
         ld1             {v16.8h}, [sp], x10
         ld1             {v17.8h}, [sp], x10
         ld1             {v18.8h}, [sp], x10
@@ -2501,7 +2546,7 @@  endfunc
 
 function hevc_put_hevc_epel_uni_hv8_8_end_neon
         load_epel_filterh x6, x5
-        mov             x10, #(MAX_PB_SIZE * 2)
+        mov             x10, #(HEVC_MAX_PB_SIZE * 2)
         ld1             {v16.8h}, [sp], x10
         ld1             {v17.8h}, [sp], x10
         ld1             {v18.8h}, [sp], x10
@@ -2521,7 +2566,7 @@  endfunc
 function hevc_put_hevc_epel_uni_hv12_8_end_neon
         load_epel_filterh x6, x5
         sub             x1, x1, #8
-        mov             x10, #(MAX_PB_SIZE * 2)
+        mov             x10, #(HEVC_MAX_PB_SIZE * 2)
         ld1             {v16.8h, v17.8h}, [sp], x10
         ld1             {v18.8h, v19.8h}, [sp], x10
         ld1             {v20.8h, v21.8h}, [sp], x10
@@ -2543,7 +2588,7 @@  endfunc
 
 function hevc_put_hevc_epel_uni_hv16_8_end_neon
         load_epel_filterh x6, x5
-        mov             x10, #(MAX_PB_SIZE * 2)
+        mov             x10, #(HEVC_MAX_PB_SIZE * 2)
         ld1             {v16.8h, v17.8h}, [sp], x10
         ld1             {v18.8h, v19.8h}, [sp], x10
         ld1             {v20.8h, v21.8h}, [sp], x10
@@ -2565,7 +2610,7 @@  endfunc
 
 function hevc_put_hevc_epel_uni_hv24_8_end_neon
         load_epel_filterh x6, x5
-        mov             x10, #(MAX_PB_SIZE * 2)
+        mov             x10, #(HEVC_MAX_PB_SIZE * 2)
         ld1             {v16.8h, v17.8h, v18.8h}, [sp], x10
         ld1             {v19.8h, v20.8h, v21.8h}, [sp], x10
         ld1             {v22.8h, v23.8h, v24.8h}, [sp], x10
@@ -3223,7 +3268,7 @@  DISABLE_I8MM
 
 function hevc_put_hevc_epel_uni_w_hv4_8_end_neon
         load_epel_filterh x6, x5
-        mov             x10, #(MAX_PB_SIZE * 2)
+        mov             x10, #(HEVC_MAX_PB_SIZE * 2)
         ld1             {v16.4h}, [sp], x10
         ld1             {v17.4h}, [sp], x10
         ld1             {v18.4h}, [sp], x10
@@ -3273,7 +3318,7 @@  endfunc
 function hevc_put_hevc_epel_uni_w_hv6_8_end_neon
         load_epel_filterh x6, x5
         sub             x1, x1, #4
-        mov             x10, #(MAX_PB_SIZE * 2)
+        mov             x10, #(HEVC_MAX_PB_SIZE * 2)
         ld1             {v16.8h}, [sp], x10
         ld1             {v17.8h}, [sp], x10
         ld1             {v18.8h}, [sp], x10
@@ -3326,7 +3371,7 @@  endfunc
 
 function hevc_put_hevc_epel_uni_w_hv8_8_end_neon
         load_epel_filterh x6, x5
-        mov             x10, #(MAX_PB_SIZE * 2)
+        mov             x10, #(HEVC_MAX_PB_SIZE * 2)
         ld1             {v16.8h}, [sp], x10
         ld1             {v17.8h}, [sp], x10
         ld1             {v18.8h}, [sp], x10
@@ -3376,7 +3421,7 @@  endfunc
 function hevc_put_hevc_epel_uni_w_hv12_8_end_neon
         load_epel_filterh x6, x5
         sub             x1, x1, #8
-        mov             x10, #(MAX_PB_SIZE * 2)
+        mov             x10, #(HEVC_MAX_PB_SIZE * 2)
         ld1             {v16.8h, v17.8h}, [sp], x10
         ld1             {v18.8h, v19.8h}, [sp], x10
         ld1             {v20.8h, v21.8h}, [sp], x10
@@ -3437,7 +3482,7 @@  endfunc
 
 function hevc_put_hevc_epel_uni_w_hv16_8_end_neon
         load_epel_filterh x6, x5
-        mov             x10, #(MAX_PB_SIZE * 2)
+        mov             x10, #(HEVC_MAX_PB_SIZE * 2)
         ld1             {v16.8h, v17.8h}, [sp], x10
         ld1             {v18.8h, v19.8h}, [sp], x10
         ld1             {v20.8h, v21.8h}, [sp], x10
@@ -3498,7 +3543,7 @@  endfunc
 
 function hevc_put_hevc_epel_uni_w_hv24_8_end_neon
         load_epel_filterh x6, x5
-        mov             x10, #(MAX_PB_SIZE * 2)
+        mov             x10, #(HEVC_MAX_PB_SIZE * 2)
         ld1             {v16.8h, v17.8h, v18.8h}, [sp], x10
         ld1             {v19.8h, v20.8h, v21.8h}, [sp], x10
         ld1             {v22.8h, v23.8h, v24.8h}, [sp], x10
@@ -3795,7 +3840,7 @@  epel_uni_w_hv neon
 
 function hevc_put_hevc_epel_bi_hv4_8_end_neon
         load_epel_filterh x7, x6
-        mov             x10, #(MAX_PB_SIZE * 2)
+        mov             x10, #(HEVC_MAX_PB_SIZE * 2)
         ld1             {v16.4h}, [sp], x10
         ld1             {v17.4h}, [sp], x10
         ld1             {v18.4h}, [sp], x10
@@ -3816,7 +3861,7 @@  endfunc
 function hevc_put_hevc_epel_bi_hv6_8_end_neon
         load_epel_filterh x7, x6
         sub             x1, x1, #4
-        mov             x10, #(MAX_PB_SIZE * 2)
+        mov             x10, #(HEVC_MAX_PB_SIZE * 2)
         ld1             {v16.8h}, [sp], x10
         ld1             {v17.8h}, [sp], x10
         ld1             {v18.8h}, [sp], x10
@@ -3838,7 +3883,7 @@  endfunc
 
 function hevc_put_hevc_epel_bi_hv8_8_end_neon
         load_epel_filterh x7, x6
-        mov             x10, #(MAX_PB_SIZE * 2)
+        mov             x10, #(HEVC_MAX_PB_SIZE * 2)
         ld1             {v16.8h}, [sp], x10
         ld1             {v17.8h}, [sp], x10
         ld1             {v18.8h}, [sp], x10
@@ -3860,7 +3905,7 @@  endfunc
 function hevc_put_hevc_epel_bi_hv12_8_end_neon
         load_epel_filterh x7, x6
         sub             x1, x1, #8
-        mov             x10, #(MAX_PB_SIZE * 2)
+        mov             x10, #(HEVC_MAX_PB_SIZE * 2)
         ld1             {v16.8h, v17.8h}, [sp], x10
         ld1             {v18.8h, v19.8h}, [sp], x10
         ld1             {v20.8h, v21.8h}, [sp], x10
@@ -3885,7 +3930,7 @@  endfunc
 
 function hevc_put_hevc_epel_bi_hv16_8_end_neon
         load_epel_filterh x7, x6
-        mov             x10, #(MAX_PB_SIZE * 2)
+        mov             x10, #(HEVC_MAX_PB_SIZE * 2)
         ld1             {v16.8h, v17.8h}, [sp], x10
         ld1             {v18.8h, v19.8h}, [sp], x10
         ld1             {v20.8h, v21.8h}, [sp], x10
@@ -3910,7 +3955,7 @@  endfunc
 
 function hevc_put_hevc_epel_bi_hv24_8_end_neon
         load_epel_filterh x7, x6
-        mov             x10, #(MAX_PB_SIZE * 2)
+        mov             x10, #(HEVC_MAX_PB_SIZE * 2)
         ld1             {v16.8h, v17.8h, v18.8h}, [sp], x10
         ld1             {v19.8h, v20.8h, v21.8h}, [sp], x10
         ld1             {v22.8h, v23.8h, v24.8h}, [sp], x10
@@ -3939,7 +3984,7 @@  endfunc
 
 function hevc_put_hevc_epel_bi_hv32_8_end_neon
         load_epel_filterh x7, x6
-        mov             x10, #(MAX_PB_SIZE * 2)
+        mov             x10, #(HEVC_MAX_PB_SIZE * 2)
         ld1             {v16.8h, v17.8h, v18.8h, v19.8h}, [sp], x10
         ld1             {v20.8h, v21.8h, v22.8h, v23.8h}, [sp], x10
         ld1             {v24.8h, v25.8h, v26.8h, v27.8h}, [sp], x10
diff --git a/libavcodec/aarch64/h26x/qpel_neon.S b/libavcodec/aarch64/h26x/qpel_neon.S
index a05009c9d6..0585f03de9 100644
--- a/libavcodec/aarch64/h26x/qpel_neon.S
+++ b/libavcodec/aarch64/h26x/qpel_neon.S
@@ -1250,6 +1250,10 @@  function ff_hevc_put_hevc_qpel_bi_v64_8_neon, export=1
         b               X(ff_hevc_put_hevc_qpel_bi_v32_8_neon)
 endfunc
 
+function ff_vvc_put_pel_uni_pixels4_8_neon, export=1
+        b               X(ff_hevc_put_hevc_pel_uni_pixels4_8_neon)    // VVC alias: plain branch (no link) into the HEVC 4-wide routine, all argument registers passed through untouched
+endfunc
+
 function ff_hevc_put_hevc_pel_uni_pixels4_8_neon, export=1
 1:
         ldr             s0, [x2]
@@ -1278,6 +1282,10 @@  function ff_hevc_put_hevc_pel_uni_pixels6_8_neon, export=1
         ret
 endfunc
 
+function ff_vvc_put_pel_uni_pixels8_8_neon, export=1
+        b               X(ff_hevc_put_hevc_pel_uni_pixels8_8_neon)    // VVC alias: plain branch (no link) into the HEVC 8-wide routine, all argument registers passed through untouched
+endfunc
+
 function ff_hevc_put_hevc_pel_uni_pixels8_8_neon, export=1
 1:
         ldr             d0, [x2]
@@ -1306,6 +1314,10 @@  function ff_hevc_put_hevc_pel_uni_pixels12_8_neon, export=1
         ret
 endfunc
 
+function ff_vvc_put_pel_uni_pixels16_8_neon, export=1
+        b               X(ff_hevc_put_hevc_pel_uni_pixels16_8_neon)   // VVC alias: plain branch (no link) into the HEVC 16-wide routine, all argument registers passed through untouched
+endfunc
+
 function ff_hevc_put_hevc_pel_uni_pixels16_8_neon, export=1
 1:
         ldr             q0, [x2]
@@ -1328,6 +1340,10 @@  function ff_hevc_put_hevc_pel_uni_pixels24_8_neon, export=1
         ret
 endfunc
 
+function ff_vvc_put_pel_uni_pixels32_8_neon, export=1
+        b               X(ff_hevc_put_hevc_pel_uni_pixels32_8_neon)   // VVC alias: plain branch (no link) into the HEVC 32-wide routine, all argument registers passed through untouched
+endfunc
+
 function ff_hevc_put_hevc_pel_uni_pixels32_8_neon, export=1
 1:
         ld1             {v0.16b, v1.16b}, [x2], x3
@@ -1346,6 +1362,10 @@  function ff_hevc_put_hevc_pel_uni_pixels48_8_neon, export=1
         ret
 endfunc
 
+function ff_vvc_put_pel_uni_pixels64_8_neon, export=1
+        b               X(ff_hevc_put_hevc_pel_uni_pixels64_8_neon)   // VVC alias: plain branch (no link) into the HEVC 64-wide routine, all argument registers passed through untouched
+endfunc
+
 function ff_hevc_put_hevc_pel_uni_pixels64_8_neon, export=1
 1:
         ld1             {v0.16b, v1.16b, v2.16b, v3.16b}, [x2], x3
@@ -1355,6 +1375,21 @@  function ff_hevc_put_hevc_pel_uni_pixels64_8_neon, export=1
         ret
 endfunc
 
+function ff_vvc_put_pel_uni_pixels128_8_neon, export=1
+1:                                                      // row loop: copies 128 bytes from [x2] to [x0] per iteration, w4 iterations in total
+        mov             x5, x2                          // x5 = read cursor within this row (x2 keeps the row start)
+        mov             x6, x0                          // x6 = write cursor within this row (x0 keeps the row start)
+        ld1             {v0.16b, v1.16b, v2.16b, v3.16b}, [x5], #64    // load bytes 0..63 of the row, then x5 += 64
+        ld1             {v4.16b, v5.16b, v6.16b, v7.16b}, [x5]         // load bytes 64..127 of the row
+        sub             w4, w4, #1                      // one row consumed; w4 is the remaining-row counter
+        add             x2, x2, x3                      // step source row start by x3 bytes (source stride)
+        add             x0, x0, x1                      // step destination row start by x1 bytes (destination stride)
+        st1             {v0.16b, v1.16b, v2.16b, v3.16b}, [x6], #64    // store bytes 0..63 of the row, then x6 += 64
+        st1             {v4.16b, v5.16b, v6.16b, v7.16b}, [x6]         // store bytes 64..127 of the row
+        cbnz            w4, 1b                          // counter is decremented before this test, so w4 must be >= 1 on entry
+        ret
+endfunc
+
 function ff_hevc_put_hevc_qpel_uni_v4_8_neon, export=1
         load_qpel_filterb x6, x5
         sub             x2, x2, x3, lsl #1
@@ -1528,6 +1563,10 @@  function ff_hevc_put_hevc_qpel_uni_v64_8_neon, export=1
         b               X(ff_hevc_put_hevc_qpel_uni_v16_8_neon)
 endfunc
 
+function ff_vvc_put_pel_uni_w_pixels4_8_neon, export=1
+        b               X(ff_hevc_put_hevc_pel_uni_w_pixels4_8_neon)  // VVC alias: plain branch (no link) into the HEVC weighted 4-wide routine, all argument registers passed through untouched
+endfunc
+
 function ff_hevc_put_hevc_pel_uni_w_pixels4_8_neon, export=1
         mov             w10, #-6
         sub             w10, w10, w5
@@ -1598,6 +1637,10 @@  function ff_hevc_put_hevc_pel_uni_w_pixels6_8_neon, export=1
         ret
 endfunc
 
+function ff_vvc_put_pel_uni_w_pixels8_8_neon, export=1
+        b               X(ff_hevc_put_hevc_pel_uni_w_pixels8_8_neon)  // VVC alias: plain branch (no link) into the HEVC weighted 8-wide routine, all argument registers passed through untouched
+endfunc
+
 function ff_hevc_put_hevc_pel_uni_w_pixels8_8_neon, export=1
         mov             w10, #-6
         sub             w10, w10, w5
@@ -1741,7 +1784,9 @@  function ff_hevc_put_hevc_pel_uni_w_pixels16_8_neon, export=1
         ret
 endfunc
 
-
+function ff_vvc_put_pel_uni_w_pixels16_8_neon, export=1
+        b               X(ff_hevc_put_hevc_pel_uni_w_pixels16_8_neon) // VVC alias: plain branch (no link) into the HEVC weighted 16-wide routine, all argument registers passed through untouched
+endfunc
 
 function ff_hevc_put_hevc_pel_uni_w_pixels24_8_neon, export=1
         mov             w10, #-6
@@ -1803,6 +1848,9 @@  function ff_hevc_put_hevc_pel_uni_w_pixels32_8_neon, export=1
         ret
 endfunc
 
+function ff_vvc_put_pel_uni_w_pixels32_8_neon, export=1
+        b               X(ff_hevc_put_hevc_pel_uni_w_pixels32_8_neon) // VVC alias: plain branch (no link) into the HEVC weighted 32-wide routine, all argument registers passed through untouched
+endfunc
 
 function ff_hevc_put_hevc_pel_uni_w_pixels48_8_neon, export=1
         mov             w10, #-6
@@ -1839,6 +1887,39 @@  function ff_hevc_put_hevc_pel_uni_w_pixels64_8_neon, export=1
         ret
 endfunc
 
+function ff_vvc_put_pel_uni_w_pixels64_8_neon, export=1
+        b               X(ff_hevc_put_hevc_pel_uni_w_pixels64_8_neon) // VVC alias: plain branch (no link) into the HEVC weighted 64-wide routine, all argument registers passed through untouched
+endfunc
+
+function ff_vvc_put_pel_uni_w_pixels128_8_neon, export=1
+        mov             w10, #-6                        // 128-byte-wide rows: each row is run through PEL_UNI_W_PIXEL_CALC in two 64-byte halves
+        sub             w10, w10, w5                    // w10 = -6 - w5
+        dup             v30.8h, w6                      // broadcast w6 to 8 halfword lanes; NOTE(review): presumably the weight - confirm against the C prototype
+        dup             v31.4s, w10                     // broadcast (-6 - w5) to 4 word lanes; NOTE(review): presumably a negative (i.e. right) shift count used by the macro - confirm
+        dup             v29.4s, w7                      // broadcast w7 to 4 word lanes; NOTE(review): presumably the offset - confirm
+1:
+        mov             x11, x2                         // x11 = read cursor within this row (x2 keeps the row start)
+        mov             x12, x0                         // x12 = write cursor within this row (x0 keeps the row start)
+        ld1             {v0.16b, v1.16b, v2.16b, v3.16b}, [x11], #64   // load bytes 0..63 of the row, then x11 += 64
+        add             x2, x2, x3                      // step source row start by x3 bytes (source stride)
+        add             x0, x0, x1                      // step destination row start by x1 bytes (destination stride)
+        PEL_UNI_W_PIXEL_CALC v0, v4, v5, v16, v17, v18, v19
+        PEL_UNI_W_PIXEL_CALC v1, v6, v7, v20, v21, v22, v23
+        PEL_UNI_W_PIXEL_CALC v2, v4, v5, v16, v17, v18, v19
+        PEL_UNI_W_PIXEL_CALC v3, v6, v7, v20, v21, v22, v23
+        st1             {v0.16b, v1.16b, v2.16b, v3.16b}, [x12], #64   // store v0-v3 after the macro calls as bytes 0..63 of the output row, then x12 += 64
+
+        ld1             {v0.16b, v1.16b, v2.16b, v3.16b}, [x11], #64   // load bytes 64..127; the post-increment is unused since x11 is reloaded at 1:
+        sub             w4, w4, #1                      // one row consumed; w4 is the remaining-row counter
+        PEL_UNI_W_PIXEL_CALC v0, v4, v5, v16, v17, v18, v19
+        PEL_UNI_W_PIXEL_CALC v1, v6, v7, v20, v21, v22, v23
+        PEL_UNI_W_PIXEL_CALC v2, v4, v5, v16, v17, v18, v19
+        PEL_UNI_W_PIXEL_CALC v3, v6, v7, v20, v21, v22, v23
+        st1             {v0.16b, v1.16b, v2.16b, v3.16b}, [x12], #64   // store bytes 64..127; the post-increment is unused since x12 is reloaded at 1:
+        cbnz            w4, 1b                          // counter is decremented before this test, so w4 must be >= 1 on entry
+        ret
+endfunc
+
 .macro QPEL_UNI_W_V_HEADER
         ldur            x12, [sp, #8]          // my
         sub             x2, x2, x3, lsl #1
diff --git a/libavcodec/aarch64/vvc/Makefile b/libavcodec/aarch64/vvc/Makefile
index a5ad24dfc5..a1c1f03e27 100644
--- a/libavcodec/aarch64/vvc/Makefile
+++ b/libavcodec/aarch64/vvc/Makefile
@@ -3,5 +3,6 @@  clean::
 
 OBJS-$(CONFIG_VVC_DECODER)              += aarch64/vvc/dsp_init.o
 NEON-OBJS-$(CONFIG_VVC_DECODER)         += aarch64/vvc/alf.o \
+                                           aarch64/h26x/epel_neon.o \
                                            aarch64/h26x/qpel_neon.o \
                                            aarch64/h26x/sao_neon.o
diff --git a/libavcodec/aarch64/vvc/dsp_init.c b/libavcodec/aarch64/vvc/dsp_init.c
index ea6245d9a3..457be8c725 100644
--- a/libavcodec/aarch64/vvc/dsp_init.c
+++ b/libavcodec/aarch64/vvc/dsp_init.c
@@ -46,6 +46,13 @@  void ff_vvc_dsp_init_aarch64(VVCDSPContext *const c, const int bd)
         return;
 
     if (bd == 8) {
+        c->inter.put[0][1][0][0] = ff_vvc_put_pel_pixels4_8_neon;
+        c->inter.put[0][2][0][0] = ff_vvc_put_pel_pixels8_8_neon;
+        c->inter.put[0][3][0][0] = ff_vvc_put_pel_pixels16_8_neon;
+        c->inter.put[0][4][0][0] = ff_vvc_put_pel_pixels32_8_neon;
+        c->inter.put[0][5][0][0] = ff_vvc_put_pel_pixels64_8_neon;
+        c->inter.put[0][6][0][0] = ff_vvc_put_pel_pixels128_8_neon;
+
         c->inter.put[0][1][0][1] = ff_vvc_put_qpel_h4_8_neon;
         c->inter.put[0][2][0][1] = ff_vvc_put_qpel_h8_8_neon;
         c->inter.put[0][3][0][1] = ff_vvc_put_qpel_h16_8_neon;
@@ -53,6 +60,13 @@  void ff_vvc_dsp_init_aarch64(VVCDSPContext *const c, const int bd)
         c->inter.put[0][5][0][1] =
         c->inter.put[0][6][0][1] = ff_vvc_put_qpel_h32_8_neon;
 
+        c->inter.put_uni[0][1][0][0] = ff_vvc_put_pel_uni_pixels4_8_neon;
+        c->inter.put_uni[0][2][0][0] = ff_vvc_put_pel_uni_pixels8_8_neon;
+        c->inter.put_uni[0][3][0][0] = ff_vvc_put_pel_uni_pixels16_8_neon;
+        c->inter.put_uni[0][4][0][0] = ff_vvc_put_pel_uni_pixels32_8_neon;
+        c->inter.put_uni[0][5][0][0] = ff_vvc_put_pel_uni_pixels64_8_neon;
+        c->inter.put_uni[0][6][0][0] = ff_vvc_put_pel_uni_pixels128_8_neon;
+
         c->inter.put_uni[0][1][0][1] = ff_vvc_put_qpel_uni_h4_8_neon;
         c->inter.put_uni[0][2][0][1] = ff_vvc_put_qpel_uni_h8_8_neon;
         c->inter.put_uni[0][3][0][1] = ff_vvc_put_qpel_uni_h16_8_neon;
@@ -60,6 +74,13 @@  void ff_vvc_dsp_init_aarch64(VVCDSPContext *const c, const int bd)
         c->inter.put_uni[0][5][0][1] =
         c->inter.put_uni[0][6][0][1] = ff_vvc_put_qpel_uni_h32_8_neon;
 
+        c->inter.put_uni_w[0][1][0][0] = ff_vvc_put_pel_uni_w_pixels4_8_neon;
+        c->inter.put_uni_w[0][2][0][0] = ff_vvc_put_pel_uni_w_pixels8_8_neon;
+        c->inter.put_uni_w[0][3][0][0] = ff_vvc_put_pel_uni_w_pixels16_8_neon;
+        c->inter.put_uni_w[0][4][0][0] = ff_vvc_put_pel_uni_w_pixels32_8_neon;
+        c->inter.put_uni_w[0][5][0][0] = ff_vvc_put_pel_uni_w_pixels64_8_neon;
+        c->inter.put_uni_w[0][6][0][0] = ff_vvc_put_pel_uni_w_pixels128_8_neon;
+
         for (int i = 0; i < FF_ARRAY_ELEMS(c->sao.band_filter); i++)
             c->sao.band_filter[i] = ff_h26x_sao_band_filter_8x8_8_neon;
         c->sao.edge_filter[0] = ff_vvc_sao_edge_filter_8x8_8_neon;