
[FFmpeg-devel] lavc/aarch64: add pred functions for 10-bit

Message ID 20210716123031.116678-1-mnitenko@gmail.com
State New
Series [FFmpeg-devel] lavc/aarch64: add pred functions for 10-bit

Checks

Context Check Description
andriy/x86_make success Make finished
andriy/x86_make_fate success Make fate finished
andriy/PPC64_make success Make finished
andriy/PPC64_make_fate success Make fate finished

Commit Message

Mikhail Nitenko July 16, 2021, 12:30 p.m. UTC
Benchmarks:                        A53     A72
pred8x8_dc_10_c:                   64.2    49.5
pred8x8_dc_10_neon:                62.7    54.5
pred8x8_dc_128_10_c:               26.0    15.5
pred8x8_dc_128_10_neon:            28.2    16.0
pred8x8_horizontal_10_c:           60.0    27.7
pred8x8_horizontal_10_neon:        34.2    27.7
pred8x8_left_dc_10_c:              42.5    27.5
pred8x8_left_dc_10_neon:           50.7    41.2
pred8x8_mad_cow_dc_0l0_10_c:       55.7    37.2
pred8x8_mad_cow_dc_0l0_10_neon:    46.0    36.5
pred8x8_mad_cow_dc_0lt_10_c:       89.2    67.0
pred8x8_mad_cow_dc_0lt_10_neon:    50.2    46.7
pred8x8_mad_cow_dc_l0t_10_c:       75.5    51.0
pred8x8_mad_cow_dc_l0t_10_neon:    49.7    44.7
pred8x8_mad_cow_dc_l00_10_c:       58.0    38.0
pred8x8_mad_cow_dc_l00_10_neon:    41.0    37.5
pred8x8_plane_10_c:               347.5   288.7
pred8x8_plane_10_neon:            150.2   108.5
pred8x8_top_dc_10_c:               44.5    30.5
pred8x8_top_dc_10_neon:            39.7    31.5
pred8x8_vertical_10_c:             27.5    16.0
pred8x8_vertical_10_neon:          27.7    15.0
pred16x16_plane_10_c:            1245.5  1069.7
pred16x16_plane_10_neon:          349.0   208.7

Signed-off-by: Mikhail Nitenko <mnitenko@gmail.com>
---
 libavcodec/aarch64/h264pred_init.c |  40 +++-
 libavcodec/aarch64/h264pred_neon.S | 369 ++++++++++++++++++++++++++++-
 2 files changed, 402 insertions(+), 7 deletions(-)

Comments

Martin Storsjö Aug. 4, 2021, 1:08 p.m. UTC | #1
On Fri, 16 Jul 2021, Mikhail Nitenko wrote:

> Benchmarks:                        A53     A72
> pred8x8_dc_10_c:                   64.2    49.5
> pred8x8_dc_10_neon:                62.7    54.5
> pred8x8_dc_128_10_c:               26.0    15.5
> pred8x8_dc_128_10_neon:            28.2    16.0
> pred8x8_horizontal_10_c:           60.0    27.7
> pred8x8_horizontal_10_neon:        34.2    27.7
> pred8x8_left_dc_10_c:              42.5    27.5
> pred8x8_left_dc_10_neon:           50.7    41.2
> pred8x8_mad_cow_dc_0l0_10_c:       55.7    37.2
> pred8x8_mad_cow_dc_0l0_10_neon:    46.0    36.5
> pred8x8_mad_cow_dc_0lt_10_c:       89.2    67.0
> pred8x8_mad_cow_dc_0lt_10_neon:    50.2    46.7
> pred8x8_mad_cow_dc_l0t_10_c:       75.5    51.0
> pred8x8_mad_cow_dc_l0t_10_neon:    49.7    44.7
> pred8x8_mad_cow_dc_l00_10_c:       58.0    38.0
> pred8x8_mad_cow_dc_l00_10_neon:    41.0    37.5
> pred8x8_plane_10_c:               347.5   288.7
> pred8x8_plane_10_neon:            150.2   108.5
> pred8x8_top_dc_10_c:               44.5    30.5
> pred8x8_top_dc_10_neon:            39.7    31.5
> pred8x8_vertical_10_c:             27.5    16.0
> pred8x8_vertical_10_neon:          27.7    15.0
> pred16x16_plane_10_c:            1245.5  1069.7
> pred16x16_plane_10_neon:          349.0   208.7
>
> Signed-off-by: Mikhail Nitenko <mnitenko@gmail.com>
> ---
> libavcodec/aarch64/h264pred_init.c |  40 +++-
> libavcodec/aarch64/h264pred_neon.S | 369 ++++++++++++++++++++++++++++-
> 2 files changed, 402 insertions(+), 7 deletions(-)

> diff --git a/libavcodec/aarch64/h264pred_neon.S b/libavcodec/aarch64/h264pred_neon.S
> index e40bdc8d53..735d20b49c 100644
> --- a/libavcodec/aarch64/h264pred_neon.S
> +++ b/libavcodec/aarch64/h264pred_neon.S
> @@ -467,3 +475,356 @@ function ff_pred16x16_vert_neon_10, export=1
>         b.ne            1b
>         ret
> endfunc
> +
> +function ff_pred16x16_plane_neon_10, export=1
> +        sub             x3,  x0,  x1
> +        movrel          x4,  p16weight_10
> +        add             x2,  x3,  #16
> +        sub             x3,  x3,  #2
> +
> +        ld1             {v0.8h},  [x3]
> +        ld1             {v2.8h},  [x2]
> +        ldcol.16        v1,  x3,  x1, 8
> +        add             x3,  x3,  x1
> +        ldcol.16        v3,  x3,  x1, 8
> +
> +        rev64           v16.8h,  v0.8h
> +        trn1            v0.2d,   v16.2d,  v16.2d
> +        trn2            v0.2d,   v16.2d,  v0.2d
> +
> +        rev64           v16.8h,  v1.8h
> +        trn1            v1.2d,   v16.2d,  v16.2d
> +        trn2            v1.2d,   v16.2d,  v1.2d
> +

Umm, these trn1+trn2 are really confusing to try to figure out here. Do 
you want to swap the two halves of the register, to compensate for not 
having a rev128? You can do that with "ext v0.16b, v0.16b, v0.16b, #8" 
instead of these two instructions. (And it's better for pipelining to do 
two rev64 followed by two ext, instead of interleaving them tightly.)
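
If the intent really is a full reversal of the eight elements (which is 
what the trn pair above ends up computing), something like this should 
do it, untested:

        rev64           v0.8h,  v0.8h
        rev64           v1.8h,  v1.8h
        ext             v0.16b, v0.16b, v0.16b, #8      // swap the two halves
        ext             v1.16b, v1.16b, v1.16b, #8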

> +        uaddl           v7.4s,  v2.4h,  v3.4h

I don't think you need to go to 32 bit here? If you add two 10-bit pixels 
together, the sum (11 bits) still fits in 16-bit elements just fine. (I 
haven't checked how large the intermediates become further on in this 
calculation, whether you need to go to 32 bit somewhere close to the 
end of the calculation or if you can do it all in 16 bit.)

The same applies to the 8x8 version below too.
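
As a rough, untested sketch, the initial sums/differences could stay in 
16-bit lanes along these lines:

        add             v7.8h,  v2.8h,  v3.8h   // 10 bit + 10 bit, fits in 11 bits
        sub             v4.8h,  v2.8h,  v0.8h   // differences fit as signed 16 bit
        sub             v2.8h,  v3.8h,  v1.8h

(The multiplications by the 1..8 weights and the sums of those products 
are the part that might still overflow 16 bits and would need checking.)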

> +        uaddl2          v16.4s, v2.8h,  v3.8h
> +        usubl           v4.4s,  v2.4h,  v0.4h
> +        usubl2          v5.4s,  v2.8h,  v0.8h
> +        usubl           v2.4s,  v3.4h,  v1.4h
> +        usubl2          v3.4s,  v3.8h,  v1.8h
> +
> +        ld1             {v0.4s, v1.4s},  [x4]
> +
> +        mul             v4.4s,  v4.4s,  v0.4s
> +        mul             v5.4s,  v5.4s,  v1.4s
> +        mul             v2.4s,  v2.4s,  v0.4s
> +        mul             v3.4s,  v3.4s,  v1.4s
> +
> +        addp            v4.4s,  v4.4s,  v5.4s
> +        addp            v2.4s,  v2.4s,  v3.4s
> +
> +        addp            v4.4s,  v4.4s,  v4.4s
> +        addp            v2.4s,  v2.4s,  v2.4s
> +
> +        addp            v4.2s,  v4.2s,  v4.2s
> +        addp            v2.2s,  v2.2s,  v2.2s
> +        mov             v2.s[0],  v4.s[0]       // H and V

I haven't really studied this in detail, but why do you need to do 
elementwise fiddling here, when it isn't needed in the 8-bit version of 
the function?

> +
> +        sshll           v3.2d,  v2.2s,  #2
> +        saddw           v2.2d,  v3.2d,  v2.2s
> +        rshrn           v4.2s,  v2.2d,  #6
> +        dup             v5.4s,  v4.s[1]
> +
> +        add             v2.2s,  v4.2s,  v5.2s
> +        shl             v3.4s,  v2.4s,  #3
> +
> +        mov             w2,  v7.s[0]
> +        mov             v7.s[0],  v16.s[3]
> +        mov             v16.s[3],  w2

Same here, there's no corresponding elementwise fiddling in the 8-bit 
version, so I don't think it should be needed here either?

> +
> +        sub             v3.4s,  v3.4s,  v2.4s   // 7 * (b + c)
> +        add             v7.4s,  v7.4s,  v0.4s
> +
> +        shl             v2.4s,  v7.4s,  #4
> +        sub             v2.4s,  v2.4s,  v3.4s
> +        shl             v3.4s,  v4.4s,  #4
> +
> +        movrel          x5,  p16weight_10_new
> +        ld1             {v0.4s, v1.4s},  [x5]

The 8-bit version uses an "ext; mov v0.h[0], wzr" instead of loading a 
whole new set of constants here. Would that work here too, or have you 
lost the original constant?
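
If v0/v1 do still hold {1,2,3,4}/{5,6,7,8} at this point, something like 
this could replace the extra table (untested, adapting that trick to the 
.4s elements used here):

        ext             v1.16b, v0.16b, v1.16b, #12     // {4, 5, 6, 7}
        ext             v0.16b, v0.16b, v0.16b, #12     // {4, 1, 2, 3}
        mov             v0.s[0],  wzr                   // {0, 1, 2, 3}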

> +
> +        sub             v6.4s,  v5.4s,  v3.4s
> +        mul             v0.4s,  v0.4s,  v4.s[0]
> +        mul             v1.4s,  v1.4s,  v4.s[0]
> +        dup             v16.4s,  v2.s[0]
> +        dup             v17.4s,  v2.s[0]
> +        dup             v18.4s,  v4.s[0]
> +        dup             v19.4s,  v4.s[0]
> +        dup             v20.4s,  v6.s[0]
> +        dup             v21.4s,  v6.s[0]
> +        shl             v18.4s,  v18.4s,  #3
> +        shl             v19.4s,  v19.4s,  #3
> +        add             v16.4s,  v16.4s,  v0.4s
> +        add             v17.4s,  v17.4s,  v1.4s
> +        add             v20.4s,  v20.4s,  v18.4s
> +        add             v21.4s,  v21.4s,  v19.4s
> +        mov             w3,  #16
> +        mov             w2,  #1023              // for clipping
> +        dup             v3.8h,  w2

Instead of mov+dup, you can load this constant with "mvni #0xFC, lsl #8" 
which is equivalent to loading 0x3ff.
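
That is, roughly (with the v3 register used here):

        mvni            v3.8h,  #0xFC,  lsl #8          // each lane = ~0xFC00 = 0x03FF (1023)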


> +1:
> +        sqshrun         v0.4h,  v16.4s,  #5
> +        sqshrun2        v0.8h,  v17.4s,  #5
> +
> +        add             v16.4s,  v16.4s,  v18.4s
> +        add             v17.4s,  v17.4s,  v19.4s
> +
> +        sqshrun         v1.4h,  v16.4s,  #5
> +        sqshrun2        v1.8h,  v17.4s,  #5
> +
> +        add             v16.4s,  v16.4s,  v20.4s
> +        add             v17.4s,  v17.4s,  v21.4s
> +
> +        subs            w3,  w3,  #1
> +
> +        smin            v0.8h,  v0.8h,  v3.8h
> +        smin            v1.8h,  v1.8h,  v3.8h
> +        st1             {v0.8h, v1.8h}, [x0], x1
> +        b.ne            1b
> +        ret
> +endfunc
> +
> +
> +function ff_pred8x8_hor_neon_10, export=1
> +        sub             x2,  x0,  #2
> +        mov             w3,  #8
> +
> +1:      ld1r            {v0.8h},  [x2], x1
> +        subs            w3,  w3,  #1
> +        st1             {v0.8h},  [x0], x1
> +        b.ne            1b
> +        ret
> +endfunc
> +
> +function ff_pred8x8_vert_neon_10, export=1
> +        sub             x2,  x0,  x1
> +        lsl             x1,  x1,  #1
> +
> +        ld1             {v0.8h},  [x2], x1
> +        mov             w3,  #4
> +1:      subs            w3,  w3,  #1
> +        st1             {v0.8h},  [x0], x1
> +        st1             {v0.8h},  [x2], x1
> +        b.ne            1b
> +        ret
> +endfunc
> +
> +function ff_pred8x8_plane_neon_10, export=1
> +        sub             x3,  x0,  x1
> +        movrel          x4,  p8weight_10
> +        movrel          x5,  p16weight_10
> +        add             x2,  x3,  #8
> +        sub             x3,  x3,  #2
> +
> +        ld1             {v0.d}[0],  [x3]
> +        ld1             {v2.d}[0],  [x2],  x1
> +        ldcol.16        v0,  x3,  x1,  hi=1
> +        add             x3,  x3,  x1
> +        ldcol.16        v3,  x3,  x1,  4
> +
> +        uaddl           v7.4s,  v2.4h,  v3.4h
> +        rev64           v0.8h,  v0.8h
> +        trn1            v2.2d,  v2.2d,  v3.2d
> +
> +        usubl2          v3.4s,  v2.8h,  v0.8h
> +        usubl           v2.4s,  v2.4h,  v0.4h
> +
> +        ld1             {v6.4s},  [x4]
> +        mul             v2.4s,  v2.4s,  v6.4s
> +        mul             v3.4s,  v3.4s,  v6.4s
> +        ld1             {v0.4s}, [x5]
> +
> +        saddlp          v2.2d,  v2.4s
> +        saddlp          v3.2d,  v3.4s
> +        addp            v2.2d,  v2.2d,  v2.2d
> +        addp            v3.2d,  v3.2d,  v3.2d
> +        mov             v2.d[1], v3.d[0]
> +        shl             v3.2d,  v2.2d,  #4
> +        add             v2.2d,  v3.2d,  v2.2d
> +        rshrn           v5.2s,  v2.2d,  #5
> +        addp            v2.4s,  v5.4s,  v5.4s
> +        shl             v3.4s,  v2.4s,  #1
> +        add             v3.4s,  v3.4s,  v2.4s
> +
> +        rev64           v1.4s,  v7.4s
> +        trn1            v7.2d,  v1.2d,  v1.2d
> +        trn2            v7.2d,  v1.2d,  v7.2d
> +
> +
> +        add             v7.4s,  v7.4s,  v0.4s
> +        shl             v2.4s,  v7.4s,  #4
> +        sub             v2.4s,  v2.4s,  v3.4s
> +
> +        movrel          x5,  p16weight_10_new
> +        ld1             {v6.4s, v7.4s},  [x5]
> +
> +        mul             v6.4s,  v6.4s,  v5.s[0]
> +        mul             v7.4s,  v7.4s,  v5.s[0]
> +
> +        dup             v1.4s,  v2.s[0]
> +        dup             v2.4s,  v2.s[0]
> +        dup             v3.4s,  v5.s[1]
> +
> +        add             v1.4s,  v1.4s,  v6.4s
> +        add             v2.4s,  v2.4s,  v7.4s
> +
> +        mov             w3,  #8
> +        mov             w2,  #1023              // for clipping
> +        dup             v4.8h,  w2
> +1:
> +        sqshrun         v0.4h,  v1.4s,  #5
> +        sqshrun2        v0.8h,  v2.4s,  #5
> +
> +        subs            w3,  w3,  #1
> +
> +        add             v1.4s,  v1.4s,  v3.4s
> +        add             v2.4s,  v2.4s,  v3.4s
> +
> +        smin            v0.8h,  v0.8h,  v4.8h
> +        st1             {v0.8h},  [x0],  x1
> +        b.ne            1b
> +        ret
> +endfunc

Partially the same comments as for the 16x16 version above also apply to 
this function.


> +
> +function ff_pred8x8_128_dc_neon_10, export=1
> +        movi            v0.8h,  #2, lsl #8      // 512, 1 << (bit_depth - 1)
> +        movi            v1.8h,  #2, lsl #8
> +        b               .L_pred8x8_dc_10_end
> +endfunc
> +
> +function ff_pred8x8_top_dc_neon_10, export=1
> +        sub             x2,  x0,  x1
> +        ld1             {v0.8h},  [x2]
> +
> +        uaddlp          v0.4s,  v0.8h

No need to go to 32 bit here; the same applies to most of the other 
functions below too.
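
For instance, for the top_dc case the sums fit in 16-bit lanes throughout, 
roughly like this (untested; the broadcast/zip that follows would need 
adjusting to match):

        addp            v0.8h,  v0.8h,  v0.8h   // pair sums, at most 11 bits
        addp            v0.8h,  v0.8h,  v0.8h   // sums of four, at most 12 bits
        urshr           v2.8h,  v0.8h,  #2      // rounded DC values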

// Martin

Patch

diff --git a/libavcodec/aarch64/h264pred_init.c b/libavcodec/aarch64/h264pred_init.c
index 325a86bfcd..0ae8f70d23 100644
--- a/libavcodec/aarch64/h264pred_init.c
+++ b/libavcodec/aarch64/h264pred_init.c
@@ -45,10 +45,23 @@  void ff_pred8x8_0lt_dc_neon(uint8_t *src, ptrdiff_t stride);
 void ff_pred8x8_l00_dc_neon(uint8_t *src, ptrdiff_t stride);
 void ff_pred8x8_0l0_dc_neon(uint8_t *src, ptrdiff_t stride);
 
-void ff_pred16x16_top_dc_neon_10(uint8_t *src, ptrdiff_t stride);
-void ff_pred16x16_dc_neon_10(uint8_t *src, ptrdiff_t stride);
-void ff_pred16x16_hor_neon_10(uint8_t *src, ptrdiff_t stride);
 void ff_pred16x16_vert_neon_10(uint8_t *src, ptrdiff_t stride);
+void ff_pred16x16_hor_neon_10(uint8_t *src, ptrdiff_t stride);
+void ff_pred16x16_plane_neon_10(uint8_t *src, ptrdiff_t stride);
+void ff_pred16x16_dc_neon_10(uint8_t *src, ptrdiff_t stride);
+void ff_pred16x16_top_dc_neon_10(uint8_t *src, ptrdiff_t stride);
+
+void ff_pred8x8_vert_neon_10(uint8_t *src, ptrdiff_t stride);
+void ff_pred8x8_hor_neon_10(uint8_t *src, ptrdiff_t stride);
+void ff_pred8x8_plane_neon_10(uint8_t *src, ptrdiff_t stride);
+void ff_pred8x8_dc_neon_10(uint8_t *src, ptrdiff_t stride);
+void ff_pred8x8_128_dc_neon_10(uint8_t *src, ptrdiff_t stride);
+void ff_pred8x8_left_dc_neon_10(uint8_t *src, ptrdiff_t stride);
+void ff_pred8x8_top_dc_neon_10(uint8_t *src, ptrdiff_t stride);
+void ff_pred8x8_l0t_dc_neon_10(uint8_t *src, ptrdiff_t stride);
+void ff_pred8x8_0lt_dc_neon_10(uint8_t *src, ptrdiff_t stride);
+void ff_pred8x8_l00_dc_neon_10(uint8_t *src, ptrdiff_t stride);
+void ff_pred8x8_0l0_dc_neon_10(uint8_t *src, ptrdiff_t stride);
 
 static av_cold void h264_pred_init_neon(H264PredContext *h, int codec_id,
                                         const int bit_depth,
@@ -84,10 +97,31 @@  static av_cold void h264_pred_init_neon(H264PredContext *h, int codec_id,
             h->pred16x16[PLANE_PRED8x8  ] = ff_pred16x16_plane_neon;
     }
     if (bit_depth == 10) {
+        if (chroma_format_idc <= 1) {
+            h->pred8x8[VERT_PRED8x8     ] = ff_pred8x8_vert_neon_10;
+            h->pred8x8[HOR_PRED8x8      ] = ff_pred8x8_hor_neon_10;
+            if (codec_id != AV_CODEC_ID_VP7 && codec_id != AV_CODEC_ID_VP8)
+                h->pred8x8[PLANE_PRED8x8] = ff_pred8x8_plane_neon_10;
+            h->pred8x8[DC_128_PRED8x8   ] = ff_pred8x8_128_dc_neon_10;
+            if (codec_id != AV_CODEC_ID_RV40 && codec_id != AV_CODEC_ID_VP7 &&
+                codec_id != AV_CODEC_ID_VP8) {
+                h->pred8x8[DC_PRED8x8     ] = ff_pred8x8_dc_neon_10;
+                h->pred8x8[LEFT_DC_PRED8x8] = ff_pred8x8_left_dc_neon_10;
+                h->pred8x8[TOP_DC_PRED8x8 ] = ff_pred8x8_top_dc_neon_10;
+                h->pred8x8[ALZHEIMER_DC_L0T_PRED8x8] = ff_pred8x8_l0t_dc_neon_10;
+                h->pred8x8[ALZHEIMER_DC_0LT_PRED8x8] = ff_pred8x8_0lt_dc_neon_10;
+                h->pred8x8[ALZHEIMER_DC_L00_PRED8x8] = ff_pred8x8_l00_dc_neon_10;
+                h->pred8x8[ALZHEIMER_DC_0L0_PRED8x8] = ff_pred8x8_0l0_dc_neon_10;
+            }
+        }
+
         h->pred16x16[DC_PRED8x8     ] = ff_pred16x16_dc_neon_10;
         h->pred16x16[VERT_PRED8x8   ] = ff_pred16x16_vert_neon_10;
         h->pred16x16[HOR_PRED8x8    ] = ff_pred16x16_hor_neon_10;
         h->pred16x16[TOP_DC_PRED8x8 ] = ff_pred16x16_top_dc_neon_10;
+        if (codec_id != AV_CODEC_ID_SVQ3 && codec_id != AV_CODEC_ID_RV40 &&
+            codec_id != AV_CODEC_ID_VP7 && codec_id != AV_CODEC_ID_VP8)
+            h->pred16x16[PLANE_PRED8x8  ] = ff_pred16x16_plane_neon_10;
     }
 }
 
diff --git a/libavcodec/aarch64/h264pred_neon.S b/libavcodec/aarch64/h264pred_neon.S
index e40bdc8d53..735d20b49c 100644
--- a/libavcodec/aarch64/h264pred_neon.S
+++ b/libavcodec/aarch64/h264pred_neon.S
@@ -360,16 +360,24 @@  function ff_pred8x8_0l0_dc_neon, export=1
         b               .L_pred8x8_dc_end
 endfunc
 
+const   p16weight_10, align=4
+        .word           1,2,3,4,5,6,7,8
+endconst
+const   p16weight_10_new, align=4
+        .word           0,1,2,3,4,5,6,7
+endconst
+const   p8weight_10, align=4
+        .word           1,2,3,4,1,2,3,4
+endconst
+
 .macro ldcol.16  rd,  rs,  rt,  n=4,  hi=0
-.if \n >= 4 || \hi == 0
+.if \n >= 4 && \hi == 0
         ld1             {\rd\().h}[0],  [\rs], \rt
         ld1             {\rd\().h}[1],  [\rs], \rt
-.endif
-.if \n >= 4 || \hi == 1
         ld1             {\rd\().h}[2],  [\rs], \rt
         ld1             {\rd\().h}[3],  [\rs], \rt
 .endif
-.if \n == 8
+.if \n == 8 || \hi == 1
         ld1             {\rd\().h}[4],  [\rs], \rt
         ld1             {\rd\().h}[5],  [\rs], \rt
         ld1             {\rd\().h}[6],  [\rs], \rt
@@ -467,3 +475,356 @@  function ff_pred16x16_vert_neon_10, export=1
         b.ne            1b
         ret
 endfunc
+
+function ff_pred16x16_plane_neon_10, export=1
+        sub             x3,  x0,  x1
+        movrel          x4,  p16weight_10
+        add             x2,  x3,  #16
+        sub             x3,  x3,  #2
+
+        ld1             {v0.8h},  [x3]
+        ld1             {v2.8h},  [x2]
+        ldcol.16        v1,  x3,  x1, 8
+        add             x3,  x3,  x1
+        ldcol.16        v3,  x3,  x1, 8
+
+        rev64           v16.8h,  v0.8h
+        trn1            v0.2d,   v16.2d,  v16.2d
+        trn2            v0.2d,   v16.2d,  v0.2d
+
+        rev64           v16.8h,  v1.8h
+        trn1            v1.2d,   v16.2d,  v16.2d
+        trn2            v1.2d,   v16.2d,  v1.2d
+
+        uaddl           v7.4s,  v2.4h,  v3.4h
+        uaddl2          v16.4s, v2.8h,  v3.8h
+        usubl           v4.4s,  v2.4h,  v0.4h
+        usubl2          v5.4s,  v2.8h,  v0.8h
+        usubl           v2.4s,  v3.4h,  v1.4h
+        usubl2          v3.4s,  v3.8h,  v1.8h
+
+        ld1             {v0.4s, v1.4s},  [x4]
+
+        mul             v4.4s,  v4.4s,  v0.4s
+        mul             v5.4s,  v5.4s,  v1.4s
+        mul             v2.4s,  v2.4s,  v0.4s
+        mul             v3.4s,  v3.4s,  v1.4s
+
+        addp            v4.4s,  v4.4s,  v5.4s
+        addp            v2.4s,  v2.4s,  v3.4s
+
+        addp            v4.4s,  v4.4s,  v4.4s
+        addp            v2.4s,  v2.4s,  v2.4s
+
+        addp            v4.2s,  v4.2s,  v4.2s
+        addp            v2.2s,  v2.2s,  v2.2s
+        mov             v2.s[0],  v4.s[0]       // H and V
+
+        sshll           v3.2d,  v2.2s,  #2
+        saddw           v2.2d,  v3.2d,  v2.2s
+        rshrn           v4.2s,  v2.2d,  #6
+        dup             v5.4s,  v4.s[1]
+
+        add             v2.2s,  v4.2s,  v5.2s
+        shl             v3.4s,  v2.4s,  #3
+
+        mov             w2,  v7.s[0]
+        mov             v7.s[0],  v16.s[3]
+        mov             v16.s[3],  w2
+
+        sub             v3.4s,  v3.4s,  v2.4s   // 7 * (b + c)
+        add             v7.4s,  v7.4s,  v0.4s
+
+        shl             v2.4s,  v7.4s,  #4
+        sub             v2.4s,  v2.4s,  v3.4s
+        shl             v3.4s,  v4.4s,  #4
+
+        movrel          x5,  p16weight_10_new
+        ld1             {v0.4s, v1.4s},  [x5]
+
+        sub             v6.4s,  v5.4s,  v3.4s
+        mul             v0.4s,  v0.4s,  v4.s[0]
+        mul             v1.4s,  v1.4s,  v4.s[0]
+        dup             v16.4s,  v2.s[0]
+        dup             v17.4s,  v2.s[0]
+        dup             v18.4s,  v4.s[0]
+        dup             v19.4s,  v4.s[0]
+        dup             v20.4s,  v6.s[0]
+        dup             v21.4s,  v6.s[0]
+        shl             v18.4s,  v18.4s,  #3
+        shl             v19.4s,  v19.4s,  #3
+        add             v16.4s,  v16.4s,  v0.4s
+        add             v17.4s,  v17.4s,  v1.4s
+        add             v20.4s,  v20.4s,  v18.4s
+        add             v21.4s,  v21.4s,  v19.4s
+        mov             w3,  #16
+        mov             w2,  #1023              // for clipping
+        dup             v3.8h,  w2
+1:
+        sqshrun         v0.4h,  v16.4s,  #5
+        sqshrun2        v0.8h,  v17.4s,  #5
+
+        add             v16.4s,  v16.4s,  v18.4s
+        add             v17.4s,  v17.4s,  v19.4s
+
+        sqshrun         v1.4h,  v16.4s,  #5
+        sqshrun2        v1.8h,  v17.4s,  #5
+
+        add             v16.4s,  v16.4s,  v20.4s
+        add             v17.4s,  v17.4s,  v21.4s
+
+        subs            w3,  w3,  #1
+
+        smin            v0.8h,  v0.8h,  v3.8h
+        smin            v1.8h,  v1.8h,  v3.8h
+        st1             {v0.8h, v1.8h}, [x0], x1
+        b.ne            1b
+        ret
+endfunc
+
+
+function ff_pred8x8_hor_neon_10, export=1
+        sub             x2,  x0,  #2
+        mov             w3,  #8
+
+1:      ld1r            {v0.8h},  [x2], x1
+        subs            w3,  w3,  #1
+        st1             {v0.8h},  [x0], x1
+        b.ne            1b
+        ret
+endfunc
+
+function ff_pred8x8_vert_neon_10, export=1
+        sub             x2,  x0,  x1
+        lsl             x1,  x1,  #1
+
+        ld1             {v0.8h},  [x2], x1
+        mov             w3,  #4
+1:      subs            w3,  w3,  #1
+        st1             {v0.8h},  [x0], x1
+        st1             {v0.8h},  [x2], x1
+        b.ne            1b
+        ret
+endfunc
+
+function ff_pred8x8_plane_neon_10, export=1
+        sub             x3,  x0,  x1
+        movrel          x4,  p8weight_10
+        movrel          x5,  p16weight_10
+        add             x2,  x3,  #8
+        sub             x3,  x3,  #2
+
+        ld1             {v0.d}[0],  [x3]
+        ld1             {v2.d}[0],  [x2],  x1
+        ldcol.16        v0,  x3,  x1,  hi=1
+        add             x3,  x3,  x1
+        ldcol.16        v3,  x3,  x1,  4
+
+        uaddl           v7.4s,  v2.4h,  v3.4h
+        rev64           v0.8h,  v0.8h
+        trn1            v2.2d,  v2.2d,  v3.2d
+
+        usubl2          v3.4s,  v2.8h,  v0.8h
+        usubl           v2.4s,  v2.4h,  v0.4h
+
+        ld1             {v6.4s},  [x4]
+        mul             v2.4s,  v2.4s,  v6.4s
+        mul             v3.4s,  v3.4s,  v6.4s
+        ld1             {v0.4s}, [x5]
+
+        saddlp          v2.2d,  v2.4s
+        saddlp          v3.2d,  v3.4s
+        addp            v2.2d,  v2.2d,  v2.2d
+        addp            v3.2d,  v3.2d,  v3.2d
+        mov             v2.d[1], v3.d[0]
+        shl             v3.2d,  v2.2d,  #4
+        add             v2.2d,  v3.2d,  v2.2d
+        rshrn           v5.2s,  v2.2d,  #5
+        addp            v2.4s,  v5.4s,  v5.4s
+        shl             v3.4s,  v2.4s,  #1
+        add             v3.4s,  v3.4s,  v2.4s
+
+        rev64           v1.4s,  v7.4s
+        trn1            v7.2d,  v1.2d,  v1.2d
+        trn2            v7.2d,  v1.2d,  v7.2d
+
+
+        add             v7.4s,  v7.4s,  v0.4s
+        shl             v2.4s,  v7.4s,  #4
+        sub             v2.4s,  v2.4s,  v3.4s
+
+        movrel          x5,  p16weight_10_new
+        ld1             {v6.4s, v7.4s},  [x5]
+
+        mul             v6.4s,  v6.4s,  v5.s[0]
+        mul             v7.4s,  v7.4s,  v5.s[0]
+
+        dup             v1.4s,  v2.s[0]
+        dup             v2.4s,  v2.s[0]
+        dup             v3.4s,  v5.s[1]
+
+        add             v1.4s,  v1.4s,  v6.4s
+        add             v2.4s,  v2.4s,  v7.4s
+
+        mov             w3,  #8
+        mov             w2,  #1023              // for clipping
+        dup             v4.8h,  w2
+1:
+        sqshrun         v0.4h,  v1.4s,  #5
+        sqshrun2        v0.8h,  v2.4s,  #5
+
+        subs            w3,  w3,  #1
+
+        add             v1.4s,  v1.4s,  v3.4s
+        add             v2.4s,  v2.4s,  v3.4s
+
+        smin            v0.8h,  v0.8h,  v4.8h
+        st1             {v0.8h},  [x0],  x1
+        b.ne            1b
+        ret
+endfunc
+
+function ff_pred8x8_128_dc_neon_10, export=1
+        movi            v0.8h,  #2, lsl #8      // 512, 1 << (bit_depth - 1)
+        movi            v1.8h,  #2, lsl #8
+        b               .L_pred8x8_dc_10_end
+endfunc
+
+function ff_pred8x8_top_dc_neon_10, export=1
+        sub             x2,  x0,  x1
+        ld1             {v0.8h},  [x2]
+
+        uaddlp          v0.4s,  v0.8h
+        addp            v0.4s,  v0.4s,  v0.4s
+        zip1            v0.4s,  v0.4s,  v0.4s
+        rshrn           v2.4h,  v0.4s,  #2
+        zip1            v0.8h,  v2.8h,  v2.8h
+        zip1            v1.8h,  v2.8h,  v2.8h
+        b               .L_pred8x8_dc_10_end
+endfunc
+
+function ff_pred8x8_left_dc_neon_10, export=1
+        sub             x2,  x0,  #2
+        ldcol.16        v0,  x2,  x1,  8
+
+        uaddlp          v0.4s,  v0.8h
+        addp            v0.4s,  v0.4s,  v0.4s
+        rshrn           v2.4h,  v0.4s,  #2
+        dup             v1.8h,  v2.h[1]
+        dup             v0.8h,  v2.h[0]
+        b               .L_pred8x8_dc_10_end
+endfunc
+
+function ff_pred8x8_dc_neon_10, export=1
+        sub             x2,  x0,  x1
+        sub             x3,  x0,  #2
+
+        ld1             {v0.8h}, [x2]
+        ldcol.16        v1,  x3,  x1, 8
+
+        uaddlp          v0.4s,  v0.8h
+        uaddlp          v1.4s,  v1.8h
+        trn1            v2.2d,  v0.2d,  v1.2d
+        trn2            v3.2d,  v0.2d,  v1.2d
+        addp            v4.4s,  v2.4s,  v3.4s
+        addp            v5.4s,  v4.4s,  v4.4s
+        rshrn           v6.4h,  v5.4s,  #3
+        rshrn           v7.4h,  v4.4s,  #2
+        dup             v0.8h,  v6.h[0]
+        dup             v2.8h,  v7.h[2]
+        dup             v1.8h,  v7.h[3]
+        dup             v3.8h,  v6.h[1]
+        zip1            v0.2d,  v0.2d,  v2.2d
+        zip1            v1.2d,  v1.2d,  v3.2d
+.L_pred8x8_dc_10_end:
+        mov             w3,  #4
+        add             x2,  x0,  x1,  lsl #2
+
+6:      st1             {v0.8h},  [x0], x1
+        subs            w3,  w3,  #1
+        st1             {v1.8h},  [x2], x1
+        b.ne            6b
+        ret
+endfunc
+
+function ff_pred8x8_l0t_dc_neon_10, export=1
+        sub             x2,  x0,  x1
+        sub             x3,  x0,  #2
+
+        ld1             {v0.8h},  [x2]
+        ldcol.16        v1,  x3,  x1, 4
+
+        uaddlp          v0.4s,  v0.8h
+        uaddlp          v1.2s,  v1.4h
+        addp            v0.4s,  v0.4s,  v0.4s
+        addp            v1.2s,  v1.2s,  v1.2s
+        add             v1.2s,  v1.2s,  v0.2s
+
+        rshrn           v2.4h,  v0.4s,  #2
+        rshrn           v3.4h,  v1.4s,  #3      // the pred4x4 part
+
+        dup             v4.4h,  v3.h[0]
+        dup             v5.4h,  v2.h[0]
+        dup             v6.4h,  v2.h[1]
+
+        zip1            v0.2d,  v4.2d,  v6.2d
+        zip1            v1.2d,  v5.2d,  v6.2d
+        b               .L_pred8x8_dc_10_end
+endfunc
+
+function ff_pred8x8_l00_dc_neon_10, export=1
+        sub             x2,  x0,  #2
+
+        ldcol.16        v0,  x2,  x1,  4
+
+        uaddlp          v0.2s,  v0.4h
+        addp            v0.2s,  v0.2s,  v0.2s
+        rshrn           v0.4h,  v0.4s,  #2
+
+        movi            v1.8h,  #2, lsl #8      // 512
+        dup             v0.8h,  v0.h[0]
+        b               .L_pred8x8_dc_10_end
+endfunc
+
+function ff_pred8x8_0lt_dc_neon_10, export=1
+        add             x3,  x0,  x1,  lsl #2
+        sub             x2,  x0,  x1
+        sub             x3,  x3,  #2
+
+        ld1             {v0.8h},  [x2]
+        ldcol.16        v1,  x3,  x1,  hi=1
+
+        uaddlp          v0.4s,  v0.8h
+        uaddlp          v1.4s,  v1.8h
+        addp            v0.4s,  v0.4s,  v0.4s
+        addp            v1.4s,  v1.4s,  v1.4s
+        zip1            v0.2d,  v0.2d,  v1.2d
+        add             v1.2s,  v0.2s,  v1.2s
+
+        rshrn           v2.4h,  v0.4s,  #2
+        rshrn           v3.4h,  v1.4s,  #3
+
+        dup             v4.4h,  v2.h[0]
+        dup             v5.4h,  v2.h[3]
+        dup             v6.4h,  v2.h[1]
+        dup             v7.4h,  v3.h[1]
+
+        zip1            v0.2d,  v4.2d,  v6.2d
+        zip1            v1.2d,  v5.2d,  v7.2d
+        b               .L_pred8x8_dc_10_end
+endfunc
+
+function ff_pred8x8_0l0_dc_neon_10, export=1
+        add             x2,  x0,  x1,  lsl #2
+        sub             x2,  x2,  #2
+
+        ldcol.16        v1,  x2,  x1,  4
+
+        uaddlp          v2.4s,  v1.8h
+        addp            v2.4s,  v2.4s,  v2.4s
+        rshrn           v1.4h,  v2.4s,  #2
+
+        movi            v0.8h,  #2,  lsl #8     // 512
+        dup             v1.8h,  v1.h[0]
+        b               .L_pred8x8_dc_10_end
+endfunc