
[FFmpeg-devel] lavc/aarch64: add pred16x16 10-bit functions

Message ID 20210408190044.70816-1-mnitenko@gmail.com
State Accepted
Series [FFmpeg-devel] lavc/aarch64: add pred16x16 10-bit functions

Checks

Context Check Description
andriy/x86_make success Make finished
andriy/x86_make_fate success Make fate finished
andriy/PPC64_make success Make finished
andriy/PPC64_make_fate success Make fate finished

Commit Message

Mikhail Nitenko April 8, 2021, 7 p.m. UTC
here are the benchmarks https://0x1.st/kX.txt
---
 libavcodec/aarch64/h264pred_init.c |  75 +++++++++++-------
 libavcodec/aarch64/h264pred_neon.S | 123 +++++++++++++++++++++++++++++
 2 files changed, 168 insertions(+), 30 deletions(-)

Comments

Carl Eugen Hoyos April 8, 2021, 7:25 p.m. UTC | #1
On Thu, Apr 8, 2021 at 9:10 PM Mikhail Nitenko <mnitenko@gmail.com> wrote:
>
> here are the benchmarks https://0x1.st/kX.txt

Instead please add the relevant lines to the commit message.

> ---
>  libavcodec/aarch64/h264pred_init.c |  75 +++++++++++-------
>  libavcodec/aarch64/h264pred_neon.S | 123 +++++++++++++++++++++++++++++
>  2 files changed, 168 insertions(+), 30 deletions(-)
>
> diff --git a/libavcodec/aarch64/h264pred_init.c b/libavcodec/aarch64/h264pred_init.c
> index b144376f90..d74205c2de 100644
> --- a/libavcodec/aarch64/h264pred_init.c
> +++ b/libavcodec/aarch64/h264pred_init.c
> @@ -45,42 +45,57 @@ void ff_pred8x8_0lt_dc_neon(uint8_t *src, ptrdiff_t stride);
>  void ff_pred8x8_l00_dc_neon(uint8_t *src, ptrdiff_t stride);
>  void ff_pred8x8_0l0_dc_neon(uint8_t *src, ptrdiff_t stride);
>
> +void ff_pred16x16_128_dc_neon_10(uint8_t *src, ptrdiff_t stride);
> +void ff_pred16x16_top_dc_neon_10(uint8_t *src, ptrdiff_t stride);
> +void ff_pred16x16_left_dc_neon_10(uint8_t *src, ptrdiff_t stride);
> +void ff_pred16x16_dc_neon_10(uint8_t *src, ptrdiff_t stride);
> +void ff_pred16x16_hor_neon_10(uint8_t *src, ptrdiff_t stride);
> +void ff_pred16x16_vert_neon_10(uint8_t *src, ptrdiff_t stride);
> +
>  static av_cold void h264_pred_init_neon(H264PredContext *h, int codec_id,
>                                          const int bit_depth,
>                                          const int chroma_format_idc)
>  {
> -    const int high_depth = bit_depth > 8;
> -
> -    if (high_depth)
> -        return;
> -

> -    if (chroma_format_idc <= 1) {
> -        h->pred8x8[VERT_PRED8x8     ] = ff_pred8x8_vert_neon;
> -        h->pred8x8[HOR_PRED8x8      ] = ff_pred8x8_hor_neon;
> -        if (codec_id != AV_CODEC_ID_VP7 && codec_id != AV_CODEC_ID_VP8)
> -            h->pred8x8[PLANE_PRED8x8] = ff_pred8x8_plane_neon;
> -        h->pred8x8[DC_128_PRED8x8   ] = ff_pred8x8_128_dc_neon;
> -        if (codec_id != AV_CODEC_ID_RV40 && codec_id != AV_CODEC_ID_VP7 &&
> -            codec_id != AV_CODEC_ID_VP8) {
> -            h->pred8x8[DC_PRED8x8     ] = ff_pred8x8_dc_neon;
> -            h->pred8x8[LEFT_DC_PRED8x8] = ff_pred8x8_left_dc_neon;
> -            h->pred8x8[TOP_DC_PRED8x8 ] = ff_pred8x8_top_dc_neon;
> -            h->pred8x8[ALZHEIMER_DC_L0T_PRED8x8] = ff_pred8x8_l0t_dc_neon;
> -            h->pred8x8[ALZHEIMER_DC_0LT_PRED8x8] = ff_pred8x8_0lt_dc_neon;
> -            h->pred8x8[ALZHEIMER_DC_L00_PRED8x8] = ff_pred8x8_l00_dc_neon;
> -            h->pred8x8[ALZHEIMER_DC_0L0_PRED8x8] = ff_pred8x8_0l0_dc_neon;

> +    if (bit_depth == 8) {

> +        if (chroma_format_idc <= 1) {
> +            h->pred8x8[VERT_PRED8x8     ] = ff_pred8x8_vert_neon;
> +            h->pred8x8[HOR_PRED8x8      ] = ff_pred8x8_hor_neon;
> +            if (codec_id != AV_CODEC_ID_VP7 && codec_id != AV_CODEC_ID_VP8)
> +                h->pred8x8[PLANE_PRED8x8] = ff_pred8x8_plane_neon;
> +            h->pred8x8[DC_128_PRED8x8   ] = ff_pred8x8_128_dc_neon;
> +            if (codec_id != AV_CODEC_ID_RV40 && codec_id != AV_CODEC_ID_VP7 &&
> +                codec_id != AV_CODEC_ID_VP8) {
> +                h->pred8x8[DC_PRED8x8     ] = ff_pred8x8_dc_neon;
> +                h->pred8x8[LEFT_DC_PRED8x8] = ff_pred8x8_left_dc_neon;
> +                h->pred8x8[TOP_DC_PRED8x8 ] = ff_pred8x8_top_dc_neon;
> +                h->pred8x8[ALZHEIMER_DC_L0T_PRED8x8] = ff_pred8x8_l0t_dc_neon;
> +                h->pred8x8[ALZHEIMER_DC_0LT_PRED8x8] = ff_pred8x8_0lt_dc_neon;
> +                h->pred8x8[ALZHEIMER_DC_L00_PRED8x8] = ff_pred8x8_l00_dc_neon;
> +                h->pred8x8[ALZHEIMER_DC_0L0_PRED8x8] = ff_pred8x8_0l0_dc_neon;

Please do not re-indent these lines in the patch that adds the new functions;
feel free to send a separate patch for the re-indentation.

Carl Eugen
Martin Storsjö April 8, 2021, 8:55 p.m. UTC | #2
On Thu, 8 Apr 2021, Mikhail Nitenko wrote:

> here are the benchmarks https://0x1.st/kX.txt
> ---
> libavcodec/aarch64/h264pred_init.c |  75 +++++++++++-------
> libavcodec/aarch64/h264pred_neon.S | 123 +++++++++++++++++++++++++++++
> 2 files changed, 168 insertions(+), 30 deletions(-)
>
> av_cold void ff_h264_pred_init_aarch64(H264PredContext *h, int codec_id,
> diff --git a/libavcodec/aarch64/h264pred_neon.S b/libavcodec/aarch64/h264pred_neon.S
> index 213b40b3e7..633b401d59 100644
> --- a/libavcodec/aarch64/h264pred_neon.S
> +++ b/libavcodec/aarch64/h264pred_neon.S
> @@ -359,3 +359,126 @@ function ff_pred8x8_0l0_dc_neon, export=1
>         dup             v1.8b,  v1.b[0]
>         b               .L_pred8x8_dc_end
> endfunc
> +
> +.macro ldcol.16  rd,  rs,  rt,  n=4,  hi=0
> +.if \n >= 4 || \hi == 0
> +        ld1             {\rd\().h}[0],  [\rs], \rt
> +        ld1             {\rd\().h}[1],  [\rs], \rt
> +.endif
> +.if \n >= 4 || \hi == 1
> +        ld1             {\rd\().h}[2],  [\rs], \rt
> +        ld1             {\rd\().h}[3],  [\rs], \rt
> +.endif
> +.if \n == 8
> +        ld1             {\rd\().h}[4],  [\rs], \rt
> +        ld1             {\rd\().h}[5],  [\rs], \rt
> +        ld1             {\rd\().h}[6],  [\rs], \rt
> +        ld1             {\rd\().h}[7],  [\rs], \rt
> +.endif
> +.endm

I believe this could be a bit faster by using two alternating registers 
that are incremented - but as the existing code doesn't do that, it's not 
necessary.
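
For illustration, an untested sketch of that alternating-pointer idea: two
address registers one row apart, each stepped by twice the stride, so
consecutive ld1s don't serialize on a single pointer (the macro name and
operand layout here are made up for the example, not part of the patch):

.macro ldcol2.16 rd, rs, rs2, rt2, n=4
        // \rs2 is assumed to start at \rs plus one row; \rt2 is twice the stride
.if \n >= 4
        ld1             {\rd\().h}[0],  [\rs],   \rt2
        ld1             {\rd\().h}[1],  [\rs2],  \rt2
        ld1             {\rd\().h}[2],  [\rs],   \rt2
        ld1             {\rd\().h}[3],  [\rs2],  \rt2
.endif
.if \n == 8
        ld1             {\rd\().h}[4],  [\rs],   \rt2
        ld1             {\rd\().h}[5],  [\rs2],  \rt2
        ld1             {\rd\().h}[6],  [\rs],   \rt2
        ld1             {\rd\().h}[7],  [\rs2],  \rt2
.endif
.endm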

> +
> +function ff_pred16x16_128_dc_neon_10, export=1
> +        movi            v0.8h, #2, lsl #8 // 512, 1 << (bit_depth - 1)
> +
> +        b               .L_pred16x16_dc_10_end
> +endfunc
> +
> +function ff_pred16x16_top_dc_neon_10, export=1
> +        sub             x2,  x0,  x1
> +
> +        ld1             {v0.8h},  [x2], #16
> +        ld1             {v1.8h},  [x2]

This can be one single instruction, ld1 {v0.8h, v1.8h}, [x2]

> +        uaddlv          s0,  v0.8h
> +        uaddlv          s1,  v1.8h

When adding up 8 elements that are 10 bits each, the sums still fit in 16 bits
(they only require 13 bits), so you don't need uaddlv here; addv would be
better. And when adding the two results, the total still fits in 16 bits (it
would then use 14 bits).
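
Putting this together with the single-ld1 suggestion above, the summation
could look roughly like this (untested sketch, assuming x2 already points at
the top row as in the patch; each per-lane 10-bit + 10-bit sum uses 11 bits,
and the across-vector add of eight such lanes peaks at 14 bits, so nothing
overflows 16 bits):

        ld1             {v0.8h, v1.8h},  [x2]   // whole top row in one load
        add             v0.8h,  v0.8h,  v1.8h   // per-lane sums, max 11 bits
        addv            h0,  v0.8h              // sum of all 16 pixels, max 14 bits
        urshr           v0.4h,  v0.4h,  #4      // (sum + 8) >> 4
        dup             v0.8h,  v0.h[0]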

> +
> +        add             v0.2s, v0.2s, v1.2s
> +
> +        rshrn           v0.4h,  v0.4s,  #4
> +        dup             v0.8h, v0.h[0]
> +        b               .L_pred16x16_dc_10_end
> +endfunc
> +
> +function ff_pred16x16_left_dc_neon_10, export=1
> +        sub             x2,  x0,  #2 // access to the "left" column
> +        ldcol.16        v0,  x2,  x1,  8
> +        ldcol.16        v1,  x2,  x1,  8 // load "left" column
> +
> +        uaddlv          s0,  v0.8h
> +        uaddlv          s1,  v1.8h

Same thing here, addv+addv should be enough
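
i.e. something like this (untested sketch, keeping the two separate sums):

        addv            h0,  v0.8h              // each sum is at most 13 bits
        addv            h1,  v1.8h
        add             v0.4h,  v0.4h,  v1.4h   // combined sum is at most 14 bits
        urshr           v0.4h,  v0.4h,  #4
        dup             v0.8h,  v0.h[0]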

> +
> +        add             v0.2s,  v0.2s,  v1.2s
> +
> +        rshrn           v0.4h,  v0.4s,  #4
> +        dup             v0.8h, v0.h[0]
> +        b               .L_pred16x16_dc_10_end
> +endfunc
> +
> +function ff_pred16x16_dc_neon_10, export=1
> +        sub             x2,  x0,  x1 // access to the "top" row
> +        sub             x3,  x0,  #2 // access to the "left" column
> +
> +        ld1             {v0.8h}, [x2], #16
> +        ld1             {v1.8h}, [x2]

One single ld1 {v0.8h, v1.8h}

> +        ldcol.16        v2,  x3,  x1,  8
> +        ldcol.16        v3,  x3,  x1,  8 // load pixels in "top" col and "left" row
> +
> +        uaddlv          s0,  v0.8h
> +        uaddlv          s1,  v1.8h
> +        uaddlv          s2,  v2.8h // sum all pixels in the "top" row and "left" col
> +        uaddlv          s3,  v3.8h // (sum stays in v0-v3 registers)

addv

> +
> +        add             v0.2s,  v0.2s,  v1.2s
> +        add             v0.2s,  v0.2s,  v2.2s
> +        add             v0.2s,  v0.2s,  v3.2s // sum registers v0-v3
> +
> +        rshrn           v0.4h,  v0.4s,  #5 // right shift vector
> +        dup             v0.8h,  v0.h[0] // fill vector with 0th value (dcsplat)

These comments are kinda pointless here

> +.L_pred16x16_dc_10_end:
> +        sub             x1,  x1,  #16
> +        mov             w3,  #8
> +6:      st1             {v0.8h}, [x0], #16
> +        st1             {v0.8h}, [x0], x1

This can be one single "st1 {v0.8h, v1.8h}, [x0], x1" if you make sure 
that v1 contains the same value.
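
A sketch of what that store loop might become (untested; v1 is assumed to
have been set to match v0, the "sub x1, x1, #16" stride adjustment is no
longer needed, and the loop now stores one row per iteration instead of two):

        mov             v1.16b,  v0.16b             // second half of each row
        mov             w3,  #16
6:      st1             {v0.8h, v1.8h},  [x0],  x1  // one 32-byte row per store
        subs            w3,  w3,  #1
        b.ne            6b
        ret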

> +        st1             {v0.8h}, [x0], #16
> +        st1             {v0.8h}, [x0], x1
> +
> +        subs            w3,  w3,  #1
> +        b.ne            6b
> +        ret
> +endfunc
> +
> +function ff_pred16x16_hor_neon_10, export=1
> +        sub             x2,  x0,  #2
> +        sub             x3,  x1,  #16
> +
> +        mov             w4,  #16
> +1:      ld1r            {v0.8h},  [x2],  x1
> +        st1             {v0.8h},  [x0],  #16
> +        st1             {v0.8h},  [x0],  x3

This might be ok here, but also do check if copying the value to v1 and 
doing one single "st1 {v0.8h, v1.8h}" is faster.
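
i.e. roughly this (untested; whether the extra mov pays for the saved store
is exactly what would need benchmarking, and the "sub x3, x1, #16"
adjustment goes away):

        mov             w4,  #16
1:      ld1r            {v0.8h},  [x2],  x1     // broadcast the left-edge pixel
        mov             v1.16b,  v0.16b
        st1             {v0.8h, v1.8h},  [x0],  x1
        subs            w4,  w4,  #1
        b.ne            1b
        ret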

> +
> +        subs            w4,  w4,  #1
> +        b.ne            1b
> +        ret
> +endfunc
> +
> +function ff_pred16x16_vert_neon_10, export=1
> +        sub             x2,  x0,  x1
> +        add             x1,  x1,  x1
> +        sub             x1,  x1,  #16
> +
> +        ld1             {v0.8h},  [x2], #16
> +        ld1             {v1.8h},  [x2], x1

One single ld1

> +
> +        mov             w3,  #8
> +1:      st1             {v0.8h},  [x0],  #16
> +        st1             {v1.8h},  [x0],  x1

One single st1
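
With both changes the whole body might collapse to something like this
(untested sketch; the stride doubling and the second write pointer are no
longer needed):

        sub             x2,  x0,  x1
        ld1             {v0.8h, v1.8h},  [x2]   // whole top row in one load
        mov             w3,  #16
1:      st1             {v0.8h, v1.8h},  [x0],  x1
        subs            w3,  w3,  #1
        b.ne            1b
        ret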

// Martin

Patch

diff --git a/libavcodec/aarch64/h264pred_init.c b/libavcodec/aarch64/h264pred_init.c
index b144376f90..d74205c2de 100644
--- a/libavcodec/aarch64/h264pred_init.c
+++ b/libavcodec/aarch64/h264pred_init.c
@@ -45,42 +45,57 @@  void ff_pred8x8_0lt_dc_neon(uint8_t *src, ptrdiff_t stride);
 void ff_pred8x8_l00_dc_neon(uint8_t *src, ptrdiff_t stride);
 void ff_pred8x8_0l0_dc_neon(uint8_t *src, ptrdiff_t stride);
 
+void ff_pred16x16_128_dc_neon_10(uint8_t *src, ptrdiff_t stride);
+void ff_pred16x16_top_dc_neon_10(uint8_t *src, ptrdiff_t stride);
+void ff_pred16x16_left_dc_neon_10(uint8_t *src, ptrdiff_t stride);
+void ff_pred16x16_dc_neon_10(uint8_t *src, ptrdiff_t stride);
+void ff_pred16x16_hor_neon_10(uint8_t *src, ptrdiff_t stride);
+void ff_pred16x16_vert_neon_10(uint8_t *src, ptrdiff_t stride);
+
 static av_cold void h264_pred_init_neon(H264PredContext *h, int codec_id,
                                         const int bit_depth,
                                         const int chroma_format_idc)
 {
-    const int high_depth = bit_depth > 8;
-
-    if (high_depth)
-        return;
-
-    if (chroma_format_idc <= 1) {
-        h->pred8x8[VERT_PRED8x8     ] = ff_pred8x8_vert_neon;
-        h->pred8x8[HOR_PRED8x8      ] = ff_pred8x8_hor_neon;
-        if (codec_id != AV_CODEC_ID_VP7 && codec_id != AV_CODEC_ID_VP8)
-            h->pred8x8[PLANE_PRED8x8] = ff_pred8x8_plane_neon;
-        h->pred8x8[DC_128_PRED8x8   ] = ff_pred8x8_128_dc_neon;
-        if (codec_id != AV_CODEC_ID_RV40 && codec_id != AV_CODEC_ID_VP7 &&
-            codec_id != AV_CODEC_ID_VP8) {
-            h->pred8x8[DC_PRED8x8     ] = ff_pred8x8_dc_neon;
-            h->pred8x8[LEFT_DC_PRED8x8] = ff_pred8x8_left_dc_neon;
-            h->pred8x8[TOP_DC_PRED8x8 ] = ff_pred8x8_top_dc_neon;
-            h->pred8x8[ALZHEIMER_DC_L0T_PRED8x8] = ff_pred8x8_l0t_dc_neon;
-            h->pred8x8[ALZHEIMER_DC_0LT_PRED8x8] = ff_pred8x8_0lt_dc_neon;
-            h->pred8x8[ALZHEIMER_DC_L00_PRED8x8] = ff_pred8x8_l00_dc_neon;
-            h->pred8x8[ALZHEIMER_DC_0L0_PRED8x8] = ff_pred8x8_0l0_dc_neon;
+    if (bit_depth == 8) {
+        if (chroma_format_idc <= 1) {
+            h->pred8x8[VERT_PRED8x8     ] = ff_pred8x8_vert_neon;
+            h->pred8x8[HOR_PRED8x8      ] = ff_pred8x8_hor_neon;
+            if (codec_id != AV_CODEC_ID_VP7 && codec_id != AV_CODEC_ID_VP8)
+                h->pred8x8[PLANE_PRED8x8] = ff_pred8x8_plane_neon;
+            h->pred8x8[DC_128_PRED8x8   ] = ff_pred8x8_128_dc_neon;
+            if (codec_id != AV_CODEC_ID_RV40 && codec_id != AV_CODEC_ID_VP7 &&
+                codec_id != AV_CODEC_ID_VP8) {
+                h->pred8x8[DC_PRED8x8     ] = ff_pred8x8_dc_neon;
+                h->pred8x8[LEFT_DC_PRED8x8] = ff_pred8x8_left_dc_neon;
+                h->pred8x8[TOP_DC_PRED8x8 ] = ff_pred8x8_top_dc_neon;
+                h->pred8x8[ALZHEIMER_DC_L0T_PRED8x8] = ff_pred8x8_l0t_dc_neon;
+                h->pred8x8[ALZHEIMER_DC_0LT_PRED8x8] = ff_pred8x8_0lt_dc_neon;
+                h->pred8x8[ALZHEIMER_DC_L00_PRED8x8] = ff_pred8x8_l00_dc_neon;
+                h->pred8x8[ALZHEIMER_DC_0L0_PRED8x8] = ff_pred8x8_0l0_dc_neon;
+            }
         }
-    }
 
-    h->pred16x16[DC_PRED8x8     ] = ff_pred16x16_dc_neon;
-    h->pred16x16[VERT_PRED8x8   ] = ff_pred16x16_vert_neon;
-    h->pred16x16[HOR_PRED8x8    ] = ff_pred16x16_hor_neon;
-    h->pred16x16[LEFT_DC_PRED8x8] = ff_pred16x16_left_dc_neon;
-    h->pred16x16[TOP_DC_PRED8x8 ] = ff_pred16x16_top_dc_neon;
-    h->pred16x16[DC_128_PRED8x8 ] = ff_pred16x16_128_dc_neon;
-    if (codec_id != AV_CODEC_ID_SVQ3 && codec_id != AV_CODEC_ID_RV40 &&
-        codec_id != AV_CODEC_ID_VP7 && codec_id != AV_CODEC_ID_VP8)
-        h->pred16x16[PLANE_PRED8x8  ] = ff_pred16x16_plane_neon;
+        h->pred16x16[DC_PRED8x8     ] = ff_pred16x16_dc_neon;
+        h->pred16x16[VERT_PRED8x8   ] = ff_pred16x16_vert_neon;
+        h->pred16x16[HOR_PRED8x8    ] = ff_pred16x16_hor_neon;
+        h->pred16x16[LEFT_DC_PRED8x8] = ff_pred16x16_left_dc_neon;
+        h->pred16x16[TOP_DC_PRED8x8 ] = ff_pred16x16_top_dc_neon;
+        h->pred16x16[DC_128_PRED8x8 ] = ff_pred16x16_128_dc_neon;
+        if (codec_id != AV_CODEC_ID_SVQ3 && codec_id != AV_CODEC_ID_RV40 &&
+            codec_id != AV_CODEC_ID_VP7 && codec_id != AV_CODEC_ID_VP8)
+            h->pred16x16[PLANE_PRED8x8  ] = ff_pred16x16_plane_neon;
+    }
+    if (bit_depth == 10) {
+        h->pred16x16[DC_PRED8x8     ] = ff_pred16x16_dc_neon_10;
+        h->pred16x16[VERT_PRED8x8   ] = ff_pred16x16_vert_neon_10;
+        h->pred16x16[HOR_PRED8x8    ] = ff_pred16x16_hor_neon_10;
+        h->pred16x16[TOP_DC_PRED8x8 ] = ff_pred16x16_top_dc_neon_10;
+        h->pred16x16[LEFT_DC_PRED8x8] = ff_pred16x16_left_dc_neon_10;
+        h->pred16x16[DC_128_PRED8x8 ] = ff_pred16x16_128_dc_neon_10;
+        if (codec_id != AV_CODEC_ID_SVQ3 && codec_id != AV_CODEC_ID_RV40 &&
+            codec_id != AV_CODEC_ID_VP7 && codec_id != AV_CODEC_ID_VP8) {}
+            //h->pred16x16[PLANE_PRED8x8  ] = ff_pred16x16_plane_neon;
+    }
 }
 
 av_cold void ff_h264_pred_init_aarch64(H264PredContext *h, int codec_id,
diff --git a/libavcodec/aarch64/h264pred_neon.S b/libavcodec/aarch64/h264pred_neon.S
index 213b40b3e7..633b401d59 100644
--- a/libavcodec/aarch64/h264pred_neon.S
+++ b/libavcodec/aarch64/h264pred_neon.S
@@ -359,3 +359,126 @@  function ff_pred8x8_0l0_dc_neon, export=1
         dup             v1.8b,  v1.b[0]
         b               .L_pred8x8_dc_end
 endfunc
+
+.macro ldcol.16  rd,  rs,  rt,  n=4,  hi=0
+.if \n >= 4 || \hi == 0
+        ld1             {\rd\().h}[0],  [\rs], \rt
+        ld1             {\rd\().h}[1],  [\rs], \rt
+.endif
+.if \n >= 4 || \hi == 1
+        ld1             {\rd\().h}[2],  [\rs], \rt
+        ld1             {\rd\().h}[3],  [\rs], \rt
+.endif
+.if \n == 8
+        ld1             {\rd\().h}[4],  [\rs], \rt
+        ld1             {\rd\().h}[5],  [\rs], \rt
+        ld1             {\rd\().h}[6],  [\rs], \rt
+        ld1             {\rd\().h}[7],  [\rs], \rt
+.endif
+.endm
+
+function ff_pred16x16_128_dc_neon_10, export=1
+        movi            v0.8h, #2, lsl #8 // 512, 1 << (bit_depth - 1)
+
+        b               .L_pred16x16_dc_10_end
+endfunc
+
+function ff_pred16x16_top_dc_neon_10, export=1
+        sub             x2,  x0,  x1
+
+        ld1             {v0.8h},  [x2], #16
+        ld1             {v1.8h},  [x2]
+
+        uaddlv          s0,  v0.8h
+        uaddlv          s1,  v1.8h
+
+        add             v0.2s, v0.2s, v1.2s
+
+        rshrn           v0.4h,  v0.4s,  #4
+        dup             v0.8h, v0.h[0]
+        b               .L_pred16x16_dc_10_end
+endfunc
+
+function ff_pred16x16_left_dc_neon_10, export=1
+        sub             x2,  x0,  #2 // access to the "left" column
+        ldcol.16        v0,  x2,  x1,  8
+        ldcol.16        v1,  x2,  x1,  8 // load "left" column
+
+        uaddlv          s0,  v0.8h
+        uaddlv          s1,  v1.8h
+
+        add             v0.2s,  v0.2s,  v1.2s
+
+        rshrn           v0.4h,  v0.4s,  #4
+        dup             v0.8h, v0.h[0]
+        b               .L_pred16x16_dc_10_end
+endfunc
+
+function ff_pred16x16_dc_neon_10, export=1
+        sub             x2,  x0,  x1 // access to the "top" row
+        sub             x3,  x0,  #2 // access to the "left" column
+
+        ld1             {v0.8h}, [x2], #16
+        ld1             {v1.8h}, [x2]
+        ldcol.16        v2,  x3,  x1,  8
+        ldcol.16        v3,  x3,  x1,  8 // load pixels in "top" col and "left" row
+
+        uaddlv          s0,  v0.8h
+        uaddlv          s1,  v1.8h
+        uaddlv          s2,  v2.8h // sum all pixels in the "top" row and "left" col
+        uaddlv          s3,  v3.8h // (sum stays in v0-v3 registers)
+
+        add             v0.2s,  v0.2s,  v1.2s
+        add             v0.2s,  v0.2s,  v2.2s
+        add             v0.2s,  v0.2s,  v3.2s // sum registers v0-v3
+
+        rshrn           v0.4h,  v0.4s,  #5 // right shift vector
+        dup             v0.8h,  v0.h[0] // fill vector with 0th value (dcsplat)
+.L_pred16x16_dc_10_end:
+        sub             x1,  x1,  #16
+        mov             w3,  #8
+6:      st1             {v0.8h}, [x0], #16
+        st1             {v0.8h}, [x0], x1
+
+        st1             {v0.8h}, [x0], #16
+        st1             {v0.8h}, [x0], x1
+
+        subs            w3,  w3,  #1
+        b.ne            6b
+        ret
+endfunc
+
+function ff_pred16x16_hor_neon_10, export=1
+        sub             x2,  x0,  #2
+        sub             x3,  x1,  #16
+
+        mov             w4,  #16
+1:      ld1r            {v0.8h},  [x2],  x1
+        st1             {v0.8h},  [x0],  #16
+        st1             {v0.8h},  [x0],  x3
+
+        subs            w4,  w4,  #1
+        b.ne            1b
+        ret
+endfunc
+
+function ff_pred16x16_vert_neon_10, export=1
+        sub             x2,  x0,  x1
+        add             x1,  x1,  x1
+        sub             x1,  x1,  #16
+
+        ld1             {v0.8h},  [x2], #16
+        ld1             {v1.8h},  [x2], x1
+
+        mov             w3,  #8
+1:      st1             {v0.8h},  [x0],  #16
+        st1             {v1.8h},  [x0],  x1
+
+        st1             {v0.8h},  [x2],  #16
+        st1             {v1.8h},  [x2],  x1
+
+        subs            w3,  w3,  #1
+        b.ne            1b
+        ret
+endfunc
+