
[FFmpeg-devel] lavc/aarch64: add pred16x16 10-bit functions

Message ID 20210408190044.70816-1-mnitenko@gmail.com
State Accepted
Series [FFmpeg-devel] lavc/aarch64: add pred16x16 10-bit functions

Checks

Context Check Description
andriy/x86_make success Make finished
andriy/x86_make_fate success Make fate finished
andriy/PPC64_make success Make finished
andriy/PPC64_make_fate success Make fate finished

Commit Message

Mikhail Nitenko April 8, 2021, 7 p.m. UTC
here are the benchmarks https://0x1.st/kX.txt
---
 libavcodec/aarch64/h264pred_init.c |  75 +++++++++++-------
 libavcodec/aarch64/h264pred_neon.S | 123 +++++++++++++++++++++++++++++
 2 files changed, 168 insertions(+), 30 deletions(-)

Comments

Carl Eugen Hoyos April 8, 2021, 7:25 p.m. UTC | #1
On Thu, Apr 8, 2021 at 9:10 PM Mikhail Nitenko <mnitenko@gmail.com> wrote:
>
> here are the benchmarks https://0x1.st/kX.txt

Instead please add the relevant lines to the commit message.

> ---
>  libavcodec/aarch64/h264pred_init.c |  75 +++++++++++-------
>  libavcodec/aarch64/h264pred_neon.S | 123 +++++++++++++++++++++++++++++
>  2 files changed, 168 insertions(+), 30 deletions(-)
>
> diff --git a/libavcodec/aarch64/h264pred_init.c b/libavcodec/aarch64/h264pred_init.c
> index b144376f90..d74205c2de 100644
> --- a/libavcodec/aarch64/h264pred_init.c
> +++ b/libavcodec/aarch64/h264pred_init.c
> @@ -45,42 +45,57 @@ void ff_pred8x8_0lt_dc_neon(uint8_t *src, ptrdiff_t stride);
>  void ff_pred8x8_l00_dc_neon(uint8_t *src, ptrdiff_t stride);
>  void ff_pred8x8_0l0_dc_neon(uint8_t *src, ptrdiff_t stride);
>
> +void ff_pred16x16_128_dc_neon_10(uint8_t *src, ptrdiff_t stride);
> +void ff_pred16x16_top_dc_neon_10(uint8_t *src, ptrdiff_t stride);
> +void ff_pred16x16_left_dc_neon_10(uint8_t *src, ptrdiff_t stride);
> +void ff_pred16x16_dc_neon_10(uint8_t *src, ptrdiff_t stride);
> +void ff_pred16x16_hor_neon_10(uint8_t *src, ptrdiff_t stride);
> +void ff_pred16x16_vert_neon_10(uint8_t *src, ptrdiff_t stride);
> +
>  static av_cold void h264_pred_init_neon(H264PredContext *h, int codec_id,
>                                          const int bit_depth,
>                                          const int chroma_format_idc)
>  {
> -    const int high_depth = bit_depth > 8;
> -
> -    if (high_depth)
> -        return;
> -

> -    if (chroma_format_idc <= 1) {
> -        h->pred8x8[VERT_PRED8x8     ] = ff_pred8x8_vert_neon;
> -        h->pred8x8[HOR_PRED8x8      ] = ff_pred8x8_hor_neon;
> -        if (codec_id != AV_CODEC_ID_VP7 && codec_id != AV_CODEC_ID_VP8)
> -            h->pred8x8[PLANE_PRED8x8] = ff_pred8x8_plane_neon;
> -        h->pred8x8[DC_128_PRED8x8   ] = ff_pred8x8_128_dc_neon;
> -        if (codec_id != AV_CODEC_ID_RV40 && codec_id != AV_CODEC_ID_VP7 &&
> -            codec_id != AV_CODEC_ID_VP8) {
> -            h->pred8x8[DC_PRED8x8     ] = ff_pred8x8_dc_neon;
> -            h->pred8x8[LEFT_DC_PRED8x8] = ff_pred8x8_left_dc_neon;
> -            h->pred8x8[TOP_DC_PRED8x8 ] = ff_pred8x8_top_dc_neon;
> -            h->pred8x8[ALZHEIMER_DC_L0T_PRED8x8] = ff_pred8x8_l0t_dc_neon;
> -            h->pred8x8[ALZHEIMER_DC_0LT_PRED8x8] = ff_pred8x8_0lt_dc_neon;
> -            h->pred8x8[ALZHEIMER_DC_L00_PRED8x8] = ff_pred8x8_l00_dc_neon;
> -            h->pred8x8[ALZHEIMER_DC_0L0_PRED8x8] = ff_pred8x8_0l0_dc_neon;

> +    if (bit_depth == 8) {

> +        if (chroma_format_idc <= 1) {
> +            h->pred8x8[VERT_PRED8x8     ] = ff_pred8x8_vert_neon;
> +            h->pred8x8[HOR_PRED8x8      ] = ff_pred8x8_hor_neon;
> +            if (codec_id != AV_CODEC_ID_VP7 && codec_id != AV_CODEC_ID_VP8)
> +                h->pred8x8[PLANE_PRED8x8] = ff_pred8x8_plane_neon;
> +            h->pred8x8[DC_128_PRED8x8   ] = ff_pred8x8_128_dc_neon;
> +            if (codec_id != AV_CODEC_ID_RV40 && codec_id != AV_CODEC_ID_VP7 &&
> +                codec_id != AV_CODEC_ID_VP8) {
> +                h->pred8x8[DC_PRED8x8     ] = ff_pred8x8_dc_neon;
> +                h->pred8x8[LEFT_DC_PRED8x8] = ff_pred8x8_left_dc_neon;
> +                h->pred8x8[TOP_DC_PRED8x8 ] = ff_pred8x8_top_dc_neon;
> +                h->pred8x8[ALZHEIMER_DC_L0T_PRED8x8] = ff_pred8x8_l0t_dc_neon;
> +                h->pred8x8[ALZHEIMER_DC_0LT_PRED8x8] = ff_pred8x8_0lt_dc_neon;
> +                h->pred8x8[ALZHEIMER_DC_L00_PRED8x8] = ff_pred8x8_l00_dc_neon;
> +                h->pred8x8[ALZHEIMER_DC_0L0_PRED8x8] = ff_pred8x8_0l0_dc_neon;

Please do not re-indent these lines in the patch that adds the new functions;
feel free to send a separate patch for the re-indentation.

Carl Eugen
Martin Storsjö April 8, 2021, 8:55 p.m. UTC | #2
On Thu, 8 Apr 2021, Mikhail Nitenko wrote:

> here are the benchmarks https://0x1.st/kX.txt
> ---
> libavcodec/aarch64/h264pred_init.c |  75 +++++++++++-------
> libavcodec/aarch64/h264pred_neon.S | 123 +++++++++++++++++++++++++++++
> 2 files changed, 168 insertions(+), 30 deletions(-)
>
> av_cold void ff_h264_pred_init_aarch64(H264PredContext *h, int codec_id,
> diff --git a/libavcodec/aarch64/h264pred_neon.S b/libavcodec/aarch64/h264pred_neon.S
> index 213b40b3e7..633b401d59 100644
> --- a/libavcodec/aarch64/h264pred_neon.S
> +++ b/libavcodec/aarch64/h264pred_neon.S
> @@ -359,3 +359,126 @@ function ff_pred8x8_0l0_dc_neon, export=1
>         dup             v1.8b,  v1.b[0]
>         b               .L_pred8x8_dc_end
> endfunc
> +
> +.macro ldcol.16  rd,  rs,  rt,  n=4,  hi=0
> +.if \n >= 4 || \hi == 0
> +        ld1             {\rd\().h}[0],  [\rs], \rt
> +        ld1             {\rd\().h}[1],  [\rs], \rt
> +.endif
> +.if \n >= 4 || \hi == 1
> +        ld1             {\rd\().h}[2],  [\rs], \rt
> +        ld1             {\rd\().h}[3],  [\rs], \rt
> +.endif
> +.if \n == 8
> +        ld1             {\rd\().h}[4],  [\rs], \rt
> +        ld1             {\rd\().h}[5],  [\rs], \rt
> +        ld1             {\rd\().h}[6],  [\rs], \rt
> +        ld1             {\rd\().h}[7],  [\rs], \rt
> +.endif
> +.endm

I believe this could be a bit faster by using two alternating registers 
that are incremented - but as the existing code doesn't do that, it's not 
necessary.
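
For illustration, an untested sketch of that alternating-pointer idea: two
address registers one row apart, each stepped by twice the stride, so
consecutive ld1s don't serialize on a single pointer (the macro name and
operand layout here are made up for the example, not part of the patch):

.macro ldcol2.16 rd, rs, rs2, rt2, n=4
        // \rs2 is assumed to start at \rs plus one row; \rt2 is twice the stride
.if \n >= 4
        ld1             {\rd\().h}[0],  [\rs],   \rt2
        ld1             {\rd\().h}[1],  [\rs2],  \rt2
        ld1             {\rd\().h}[2],  [\rs],   \rt2
        ld1             {\rd\().h}[3],  [\rs2],  \rt2
.endif
.if \n == 8
        ld1             {\rd\().h}[4],  [\rs],   \rt2
        ld1             {\rd\().h}[5],  [\rs2],  \rt2
        ld1             {\rd\().h}[6],  [\rs],   \rt2
        ld1             {\rd\().h}[7],  [\rs2],  \rt2
.endif
.endm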

> +
> +function ff_pred16x16_128_dc_neon_10, export=1
> +        movi            v0.8h, #2, lsl #8 // 512, 1 << (bit_depth - 1)
> +
> +        b               .L_pred16x16_dc_10_end
> +endfunc
> +
> +function ff_pred16x16_top_dc_neon_10, export=1
> +        sub             x2,  x0,  x1
> +
> +        ld1             {v0.8h},  [x2], #16
> +        ld1             {v1.8h},  [x2]

This can be one single instruction, ld1 {v0.8h, v1.8h}, [x2]

> +        uaddlv          s0,  v0.8h
> +        uaddlv          s1,  v1.8h

When adding up 8 elements that are 10 bits each, the sums still fit in 16 bits
(they only require 13 bits), so you don't need uaddlv here; addv would be
better. And when adding the two results, the total still fits in 16 bits (it
would then use 14 bits).
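
Putting this together with the single-ld1 suggestion above, the summation
could look roughly like this (untested sketch, assuming x2 already points at
the top row as in the patch; each per-lane 10-bit + 10-bit sum uses 11 bits,
and the across-vector add of eight such lanes peaks at 14 bits, so nothing
overflows 16 bits):

        ld1             {v0.8h, v1.8h},  [x2]   // whole top row in one load
        add             v0.8h,  v0.8h,  v1.8h   // per-lane sums, max 11 bits
        addv            h0,  v0.8h              // sum of all 16 pixels, max 14 bits
        urshr           v0.4h,  v0.4h,  #4      // (sum + 8) >> 4
        dup             v0.8h,  v0.h[0]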

> +
> +        add             v0.2s, v0.2s, v1.2s
> +
> +        rshrn           v0.4h,  v0.4s,  #4
> +        dup             v0.8h, v0.h[0]
> +        b               .L_pred16x16_dc_10_end
> +endfunc
> +
> +function ff_pred16x16_left_dc_neon_10, export=1
> +        sub             x2,  x0,  #2 // access to the "left" column
> +        ldcol.16        v0,  x2,  x1,  8
> +        ldcol.16        v1,  x2,  x1,  8 // load "left" column
> +
> +        uaddlv          s0,  v0.8h
> +        uaddlv          s1,  v1.8h

Same thing here, addv+addv should be enough
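
i.e. something like this (untested sketch, keeping the two separate sums):

        addv            h0,  v0.8h              // each sum is at most 13 bits
        addv            h1,  v1.8h
        add             v0.4h,  v0.4h,  v1.4h   // combined sum is at most 14 bits
        urshr           v0.4h,  v0.4h,  #4
        dup             v0.8h,  v0.h[0]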

> +
> +        add             v0.2s,  v0.2s,  v1.2s
> +
> +        rshrn           v0.4h,  v0.4s,  #4
> +        dup             v0.8h, v0.h[0]
> +        b               .L_pred16x16_dc_10_end
> +endfunc
> +
> +function ff_pred16x16_dc_neon_10, export=1
> +        sub             x2,  x0,  x1 // access to the "top" row
> +        sub             x3,  x0,  #2 // access to the "left" column
> +
> +        ld1             {v0.8h}, [x2], #16
> +        ld1             {v1.8h}, [x2]

One single ld1 {v0.8h, v1.8h}

> +        ldcol.16        v2,  x3,  x1,  8
> +        ldcol.16        v3,  x3,  x1,  8 // load pixels in "top" col and "left" row
> +
> +        uaddlv          s0,  v0.8h
> +        uaddlv          s1,  v1.8h
> +        uaddlv          s2,  v2.8h // sum all pixels in the "top" row and "left" col
> +        uaddlv          s3,  v3.8h // (sum stays in v0-v3 registers)

addv

> +
> +        add             v0.2s,  v0.2s,  v1.2s
> +        add             v0.2s,  v0.2s,  v2.2s
> +        add             v0.2s,  v0.2s,  v3.2s // sum registers v0-v3
> +
> +        rshrn           v0.4h,  v0.4s,  #5 // right shift vector
> +        dup             v0.8h,  v0.h[0] // fill vector with 0th value (dcsplat)

These comments are kinda pointless here

> +.L_pred16x16_dc_10_end:
> +        sub             x1,  x1,  #16
> +        mov             w3,  #8
> +6:      st1             {v0.8h}, [x0], #16
> +        st1             {v0.8h}, [x0], x1

This can be one single "st1 {v0.8h, v1.8h}, [x0], x1" if you make sure 
that v1 contains the same value.
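
A sketch of what that store loop might become (untested; v1 is assumed to
have been set to match v0, the "sub x1, x1, #16" stride adjustment is no
longer needed, and the loop now stores one row per iteration instead of two):

        mov             v1.16b,  v0.16b             // second half of each row
        mov             w3,  #16
6:      st1             {v0.8h, v1.8h},  [x0],  x1  // one 32-byte row per store
        subs            w3,  w3,  #1
        b.ne            6b
        ret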

> +        st1             {v0.8h}, [x0], #16
> +        st1             {v0.8h}, [x0], x1
> +
> +        subs            w3,  w3,  #1
> +        b.ne            6b
> +        ret
> +endfunc
> +
> +function ff_pred16x16_hor_neon_10, export=1
> +        sub             x2,  x0,  #2
> +        sub             x3,  x1,  #16
> +
> +        mov             w4,  #16
> +1:      ld1r            {v0.8h},  [x2],  x1
> +        st1             {v0.8h},  [x0],  #16
> +        st1             {v0.8h},  [x0],  x3

This might be ok here, but also do check if copying the value to v1 and 
doing one single "st1 {v0.8h, v1.8h}" is faster.
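
i.e. roughly this (untested; whether the extra mov pays for the saved store
is exactly what would need benchmarking, and the "sub x3, x1, #16"
adjustment goes away):

        mov             w4,  #16
1:      ld1r            {v0.8h},  [x2],  x1     // broadcast the left-edge pixel
        mov             v1.16b,  v0.16b
        st1             {v0.8h, v1.8h},  [x0],  x1
        subs            w4,  w4,  #1
        b.ne            1b
        ret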

> +
> +        subs            w4,  w4,  #1
> +        b.ne            1b
> +        ret
> +endfunc
> +
> +function ff_pred16x16_vert_neon_10, export=1
> +        sub             x2,  x0,  x1
> +        add             x1,  x1,  x1
> +        sub             x1,  x1,  #16
> +
> +        ld1             {v0.8h},  [x2], #16
> +        ld1             {v1.8h},  [x2], x1

One single ld1

> +
> +        mov             w3,  #8
> +1:      st1             {v0.8h},  [x0],  #16
> +        st1             {v1.8h},  [x0],  x1

One single st1
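
With both changes the whole body might collapse to something like this
(untested sketch; the stride doubling and the second write pointer are no
longer needed):

        sub             x2,  x0,  x1
        ld1             {v0.8h, v1.8h},  [x2]   // whole top row in one load
        mov             w3,  #16
1:      st1             {v0.8h, v1.8h},  [x0],  x1
        subs            w3,  w3,  #1
        b.ne            1b
        ret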

// Martin

Patch

diff --git a/libavcodec/aarch64/h264pred_init.c b/libavcodec/aarch64/h264pred_init.c
index b144376f90..d74205c2de 100644
--- a/libavcodec/aarch64/h264pred_init.c
+++ b/libavcodec/aarch64/h264pred_init.c
@@ -45,42 +45,57 @@  void ff_pred8x8_0lt_dc_neon(uint8_t *src, ptrdiff_t stride);
 void ff_pred8x8_l00_dc_neon(uint8_t *src, ptrdiff_t stride);
 void ff_pred8x8_0l0_dc_neon(uint8_t *src, ptrdiff_t stride);
 
+void ff_pred16x16_128_dc_neon_10(uint8_t *src, ptrdiff_t stride);
+void ff_pred16x16_top_dc_neon_10(uint8_t *src, ptrdiff_t stride);
+void ff_pred16x16_left_dc_neon_10(uint8_t *src, ptrdiff_t stride);
+void ff_pred16x16_dc_neon_10(uint8_t *src, ptrdiff_t stride);
+void ff_pred16x16_hor_neon_10(uint8_t *src, ptrdiff_t stride);
+void ff_pred16x16_vert_neon_10(uint8_t *src, ptrdiff_t stride);
+
 static av_cold void h264_pred_init_neon(H264PredContext *h, int codec_id,
                                         const int bit_depth,
                                         const int chroma_format_idc)
 {
-    const int high_depth = bit_depth > 8;
-
-    if (high_depth)
-        return;
-
-    if (chroma_format_idc <= 1) {
-        h->pred8x8[VERT_PRED8x8     ] = ff_pred8x8_vert_neon;
-        h->pred8x8[HOR_PRED8x8      ] = ff_pred8x8_hor_neon;
-        if (codec_id != AV_CODEC_ID_VP7 && codec_id != AV_CODEC_ID_VP8)
-            h->pred8x8[PLANE_PRED8x8] = ff_pred8x8_plane_neon;
-        h->pred8x8[DC_128_PRED8x8   ] = ff_pred8x8_128_dc_neon;
-        if (codec_id != AV_CODEC_ID_RV40 && codec_id != AV_CODEC_ID_VP7 &&
-            codec_id != AV_CODEC_ID_VP8) {
-            h->pred8x8[DC_PRED8x8     ] = ff_pred8x8_dc_neon;
-            h->pred8x8[LEFT_DC_PRED8x8] = ff_pred8x8_left_dc_neon;
-            h->pred8x8[TOP_DC_PRED8x8 ] = ff_pred8x8_top_dc_neon;
-            h->pred8x8[ALZHEIMER_DC_L0T_PRED8x8] = ff_pred8x8_l0t_dc_neon;
-            h->pred8x8[ALZHEIMER_DC_0LT_PRED8x8] = ff_pred8x8_0lt_dc_neon;
-            h->pred8x8[ALZHEIMER_DC_L00_PRED8x8] = ff_pred8x8_l00_dc_neon;
-            h->pred8x8[ALZHEIMER_DC_0L0_PRED8x8] = ff_pred8x8_0l0_dc_neon;
+    if (bit_depth == 8) {
+        if (chroma_format_idc <= 1) {
+            h->pred8x8[VERT_PRED8x8     ] = ff_pred8x8_vert_neon;
+            h->pred8x8[HOR_PRED8x8      ] = ff_pred8x8_hor_neon;
+            if (codec_id != AV_CODEC_ID_VP7 && codec_id != AV_CODEC_ID_VP8)
+                h->pred8x8[PLANE_PRED8x8] = ff_pred8x8_plane_neon;
+            h->pred8x8[DC_128_PRED8x8   ] = ff_pred8x8_128_dc_neon;
+            if (codec_id != AV_CODEC_ID_RV40 && codec_id != AV_CODEC_ID_VP7 &&
+                codec_id != AV_CODEC_ID_VP8) {
+                h->pred8x8[DC_PRED8x8     ] = ff_pred8x8_dc_neon;
+                h->pred8x8[LEFT_DC_PRED8x8] = ff_pred8x8_left_dc_neon;
+                h->pred8x8[TOP_DC_PRED8x8 ] = ff_pred8x8_top_dc_neon;
+                h->pred8x8[ALZHEIMER_DC_L0T_PRED8x8] = ff_pred8x8_l0t_dc_neon;
+                h->pred8x8[ALZHEIMER_DC_0LT_PRED8x8] = ff_pred8x8_0lt_dc_neon;
+                h->pred8x8[ALZHEIMER_DC_L00_PRED8x8] = ff_pred8x8_l00_dc_neon;
+                h->pred8x8[ALZHEIMER_DC_0L0_PRED8x8] = ff_pred8x8_0l0_dc_neon;
+            }
         }
-    }
 
-    h->pred16x16[DC_PRED8x8     ] = ff_pred16x16_dc_neon;
-    h->pred16x16[VERT_PRED8x8   ] = ff_pred16x16_vert_neon;
-    h->pred16x16[HOR_PRED8x8    ] = ff_pred16x16_hor_neon;
-    h->pred16x16[LEFT_DC_PRED8x8] = ff_pred16x16_left_dc_neon;
-    h->pred16x16[TOP_DC_PRED8x8 ] = ff_pred16x16_top_dc_neon;
-    h->pred16x16[DC_128_PRED8x8 ] = ff_pred16x16_128_dc_neon;
-    if (codec_id != AV_CODEC_ID_SVQ3 && codec_id != AV_CODEC_ID_RV40 &&
-        codec_id != AV_CODEC_ID_VP7 && codec_id != AV_CODEC_ID_VP8)
-        h->pred16x16[PLANE_PRED8x8  ] = ff_pred16x16_plane_neon;
+        h->pred16x16[DC_PRED8x8     ] = ff_pred16x16_dc_neon;
+        h->pred16x16[VERT_PRED8x8   ] = ff_pred16x16_vert_neon;
+        h->pred16x16[HOR_PRED8x8    ] = ff_pred16x16_hor_neon;
+        h->pred16x16[LEFT_DC_PRED8x8] = ff_pred16x16_left_dc_neon;
+        h->pred16x16[TOP_DC_PRED8x8 ] = ff_pred16x16_top_dc_neon;
+        h->pred16x16[DC_128_PRED8x8 ] = ff_pred16x16_128_dc_neon;
+        if (codec_id != AV_CODEC_ID_SVQ3 && codec_id != AV_CODEC_ID_RV40 &&
+            codec_id != AV_CODEC_ID_VP7 && codec_id != AV_CODEC_ID_VP8)
+            h->pred16x16[PLANE_PRED8x8  ] = ff_pred16x16_plane_neon;
+    }
+    if (bit_depth == 10) {
+        h->pred16x16[DC_PRED8x8     ] = ff_pred16x16_dc_neon_10;
+        h->pred16x16[VERT_PRED8x8   ] = ff_pred16x16_vert_neon_10;
+        h->pred16x16[HOR_PRED8x8    ] = ff_pred16x16_hor_neon_10;
+        h->pred16x16[TOP_DC_PRED8x8 ] = ff_pred16x16_top_dc_neon_10;
+        h->pred16x16[LEFT_DC_PRED8x8] = ff_pred16x16_left_dc_neon_10;
+        h->pred16x16[DC_128_PRED8x8 ] = ff_pred16x16_128_dc_neon_10;
+        if (codec_id != AV_CODEC_ID_SVQ3 && codec_id != AV_CODEC_ID_RV40 &&
+            codec_id != AV_CODEC_ID_VP7 && codec_id != AV_CODEC_ID_VP8) {}
+            //h->pred16x16[PLANE_PRED8x8  ] = ff_pred16x16_plane_neon;
+    }
 }
 
 av_cold void ff_h264_pred_init_aarch64(H264PredContext *h, int codec_id,
diff --git a/libavcodec/aarch64/h264pred_neon.S b/libavcodec/aarch64/h264pred_neon.S
index 213b40b3e7..633b401d59 100644
--- a/libavcodec/aarch64/h264pred_neon.S
+++ b/libavcodec/aarch64/h264pred_neon.S
@@ -359,3 +359,126 @@  function ff_pred8x8_0l0_dc_neon, export=1
         dup             v1.8b,  v1.b[0]
         b               .L_pred8x8_dc_end
 endfunc
+
+.macro ldcol.16  rd,  rs,  rt,  n=4,  hi=0
+.if \n >= 4 || \hi == 0
+        ld1             {\rd\().h}[0],  [\rs], \rt
+        ld1             {\rd\().h}[1],  [\rs], \rt
+.endif
+.if \n >= 4 || \hi == 1
+        ld1             {\rd\().h}[2],  [\rs], \rt
+        ld1             {\rd\().h}[3],  [\rs], \rt
+.endif
+.if \n == 8
+        ld1             {\rd\().h}[4],  [\rs], \rt
+        ld1             {\rd\().h}[5],  [\rs], \rt
+        ld1             {\rd\().h}[6],  [\rs], \rt
+        ld1             {\rd\().h}[7],  [\rs], \rt
+.endif
+.endm
+
+function ff_pred16x16_128_dc_neon_10, export=1
+        movi            v0.8h, #2, lsl #8 // 512, 1 << (bit_depth - 1)
+
+        b               .L_pred16x16_dc_10_end
+endfunc
+
+function ff_pred16x16_top_dc_neon_10, export=1
+        sub             x2,  x0,  x1
+
+        ld1             {v0.8h},  [x2], #16
+        ld1             {v1.8h},  [x2]
+
+        uaddlv          s0,  v0.8h
+        uaddlv          s1,  v1.8h
+
+        add             v0.2s, v0.2s, v1.2s
+
+        rshrn           v0.4h,  v0.4s,  #4
+        dup             v0.8h, v0.h[0]
+        b               .L_pred16x16_dc_10_end
+endfunc
+
+function ff_pred16x16_left_dc_neon_10, export=1
+        sub             x2,  x0,  #2 // access to the "left" column
+        ldcol.16        v0,  x2,  x1,  8
+        ldcol.16        v1,  x2,  x1,  8 // load "left" column
+
+        uaddlv          s0,  v0.8h
+        uaddlv          s1,  v1.8h
+
+        add             v0.2s,  v0.2s,  v1.2s
+
+        rshrn           v0.4h,  v0.4s,  #4
+        dup             v0.8h, v0.h[0]
+        b               .L_pred16x16_dc_10_end
+endfunc
+
+function ff_pred16x16_dc_neon_10, export=1
+        sub             x2,  x0,  x1 // access to the "top" row
+        sub             x3,  x0,  #2 // access to the "left" column
+
+        ld1             {v0.8h}, [x2], #16
+        ld1             {v1.8h}, [x2]
+        ldcol.16        v2,  x3,  x1,  8
+        ldcol.16        v3,  x3,  x1,  8 // load pixels in "top" col and "left" row
+
+        uaddlv          s0,  v0.8h
+        uaddlv          s1,  v1.8h
+        uaddlv          s2,  v2.8h // sum all pixels in the "top" row and "left" col
+        uaddlv          s3,  v3.8h // (sum stays in v0-v3 registers)
+
+        add             v0.2s,  v0.2s,  v1.2s
+        add             v0.2s,  v0.2s,  v2.2s
+        add             v0.2s,  v0.2s,  v3.2s // sum registers v0-v3
+
+        rshrn           v0.4h,  v0.4s,  #5 // right shift vector
+        dup             v0.8h,  v0.h[0] // fill vector with 0th value (dcsplat)
+.L_pred16x16_dc_10_end:
+        sub             x1,  x1,  #16
+        mov             w3,  #8
+6:      st1             {v0.8h}, [x0], #16
+        st1             {v0.8h}, [x0], x1
+
+        st1             {v0.8h}, [x0], #16
+        st1             {v0.8h}, [x0], x1
+
+        subs            w3,  w3,  #1
+        b.ne            6b
+        ret
+endfunc
+
+function ff_pred16x16_hor_neon_10, export=1
+        sub             x2,  x0,  #2
+        sub             x3,  x1,  #16
+
+        mov             w4,  #16
+1:      ld1r            {v0.8h},  [x2],  x1
+        st1             {v0.8h},  [x0],  #16
+        st1             {v0.8h},  [x0],  x3
+
+        subs            w4,  w4,  #1
+        b.ne            1b
+        ret
+endfunc
+
+function ff_pred16x16_vert_neon_10, export=1
+        sub             x2,  x0,  x1
+        add             x1,  x1,  x1
+        sub             x1,  x1,  #16
+
+        ld1             {v0.8h},  [x2], #16
+        ld1             {v1.8h},  [x2], x1
+
+        mov             w3,  #8
+1:      st1             {v0.8h},  [x0],  #16
+        st1             {v1.8h},  [x0],  x1
+
+        st1             {v0.8h},  [x2],  #16
+        st1             {v1.8h},  [x2],  x1
+
+        subs            w3,  w3,  #1
+        b.ne            1b
+        ret
+endfunc
+