diff mbox series

[FFmpeg-devel] codec/aarch64/hevc: add transform_luma_neon

Message ID tencent_20795ACBA24269D930317A8C70E0A9D91008@qq.com
State Accepted
Commit bd2f00f665cc964fc1942518cdf27bd6e8b6d388
Headers show
Series [FFmpeg-devel] codec/aarch64/hevc: add transform_luma_neon | expand

Checks

Context Check Description
yinshiyou/make_loongarch64 success Make finished
yinshiyou/make_fate_loongarch64 success Make fate finished
andriy/make_x86 success Make finished
andriy/make_fate_x86 success Make fate finished

Commit Message

徐福隆 April 13, 2023, 1:34 p.m. UTC
got 56% speed up (run_count=1000, CPU=Cortex A53)
transform_4x4_luma_neon: 45 transform_4x4_luma_c: 103

Signed-off-by: xufuji456 <839789740@qq.com>
---
 libavcodec/aarch64/hevcdsp_idct_neon.S    | 48 +++++++++++++++++++++++
 libavcodec/aarch64/hevcdsp_init_aarch64.c |  2 +
 2 files changed, 50 insertions(+)

Comments

Martin Storsjö April 14, 2023, 12:20 p.m. UTC | #1
On Thu, 13 Apr 2023, xufuji456 wrote:

> got 56% speed up (run_count=1000, CPU=Cortex A53)
> transform_4x4_luma_neon: 45 transform_4x4_luma_c: 103
>
> Signed-off-by: xufuji456 <839789740@qq.com>
> ---
> libavcodec/aarch64/hevcdsp_idct_neon.S    | 48 +++++++++++++++++++++++
> libavcodec/aarch64/hevcdsp_init_aarch64.c |  2 +
> 2 files changed, 50 insertions(+)

Thanks, this version can be applied - and still looks good, so I pushed 
it.

I see that you fixed the issue by just applying the new code in the middle 
of the file instead of at the end of the file though. You really should 
try to look into what it is that is causing the previous version of the 
file to be lacking the trailing newline, since that's not what is in the 
actual upstream git. So it looks like there's something off with your git 
workflow, and it would be very good to get that sorted out before going 
forward anyway.

// Martin
徐福隆 April 30, 2023, 7:28 a.m. UTC | #2
Thank you, Martin.
It was my mistake: I deleted the empty line at the end of the file.
Should I submit a patch that adds a newline at the end of the file, or do something else?
Thanks for your review and for pointing out the details of the error.




------------------ Original ------------------
From: "Martin Storsjö" <martin@martin.st>;
Date: Fri, Apr 14, 2023 08:20 PM
To: "FFmpeg development discussions and patches" <ffmpeg-devel@ffmpeg.org>;
Cc: "徐福隆" <839789740@qq.com>;
Subject: Re: [FFmpeg-devel] [PATCH] codec/aarch64/hevc: add transform_luma_neon



On Thu, 13 Apr 2023, xufuji456 wrote:

> got 56% speed up (run_count=1000, CPU=Cortex A53)
> transform_4x4_luma_neon: 45 transform_4x4_luma_c: 103
>
> Signed-off-by: xufuji456 <839789740@qq.com>
> ---
> libavcodec/aarch64/hevcdsp_idct_neon.S    | 48 +++++++++++++++++++++++
> libavcodec/aarch64/hevcdsp_init_aarch64.c |  2 +
> 2 files changed, 50 insertions(+)

Thanks, this version can be applied - and still looks good, so I pushed 
it.

I see that you fixed the issue by just applying the new code in the middle 
of the file instead of at the end of the file though. You really should 
try to look into what it is that is causing the previous version of the 
file to be lacking the trailing newline, since that's not what is in the 
actual upstream git. So it looks like there's something off with your git 
workflow, and it would be very good to get that sorted out before going 
forward anyway.

// Martin
diff mbox series

Patch

diff --git a/libavcodec/aarch64/hevcdsp_idct_neon.S b/libavcodec/aarch64/hevcdsp_idct_neon.S
index 994f0a47b6..4a25787070 100644
--- a/libavcodec/aarch64/hevcdsp_idct_neon.S
+++ b/libavcodec/aarch64/hevcdsp_idct_neon.S
@@ -842,6 +842,54 @@  tr_32x4 secondpass_10, 20 - 10
 idct_32x32 8
 idct_32x32 10
 
+.macro tr4_luma_shift r0, r1, r2, r3, shift
+        saddl       v0.4s, \r0, \r2         // c0 = src0 + src2
+        saddl       v1.4s, \r2, \r3         // c1 = src2 + src3
+        ssubl       v2.4s, \r0, \r3         // c2 = src0 - src3
+        smull       v3.4s, \r1, v21.4h      // c3 = 74 * src1
+        // needs v18.4s = 74, v19.4s = 29, v20.4s = 55, v21.4h = 74; scratch: v0-v3, v5-v7
+        saddl       v7.4s, \r0, \r3         // src0 + src3
+        ssubw       v7.4s, v7.4s, \r2       // src0 - src2 + src3
+        mul         v7.4s, v7.4s, v18.4s    // dst2 = 74 * (src0 - src2 + src3)
+
+        mul         v5.4s, v0.4s, v19.4s    // 29 * c0
+        mul         v6.4s, v1.4s, v20.4s    // 55 * c1
+        add         v5.4s, v5.4s, v6.4s     // 29 * c0 + 55 * c1
+        add         v5.4s, v5.4s, v3.4s     // dst0 = 29 * c0 + 55 * c1 + c3
+
+        mul         v1.4s, v1.4s, v19.4s    // 29 * c1
+        mul         v6.4s, v2.4s, v20.4s    // 55 * c2
+        sub         v6.4s, v6.4s, v1.4s     // 55 * c2 - 29 * c1
+        add         v6.4s, v6.4s, v3.4s     // dst1 = 55 * c2 - 29 * c1 + c3
+
+        mul         v0.4s, v0.4s, v20.4s    // 55 * c0
+        mul         v2.4s, v2.4s, v19.4s    // 29 * c2
+        add         v0.4s, v0.4s, v2.4s     // 55 * c0 + 29 * c2
+        sub         v0.4s, v0.4s, v3.4s     // dst3 = 55 * c0 + 29 * c2 - c3
+        // rounding, saturating narrow back to 16 bits; inputs are overwritten only here, after their last read
+        sqrshrn     \r0, v5.4s, \shift      // r0 <- dst0
+        sqrshrn     \r1, v6.4s, \shift      // r1 <- dst1
+        sqrshrn     \r2, v7.4s, \shift      // r2 <- dst2
+        sqrshrn     \r3, v0.4s, \shift      // r3 <- dst3
+.endm
+
+function ff_hevc_transform_luma_4x4_neon_8, export=1
+        ld1            {v28.4h-v31.4h}, [x0]    // x0 = coeffs (int16_t *): load all 16 values, 4 per register
+        movi           v18.4s, #74              // 32-bit multiplier constants read by tr4_luma_shift
+        movi           v19.4s, #29
+        movi           v20.4s, #55
+        movi           v21.4h, #74              // 16-bit copy of 74 for the widening smull in the macro
+        // pass 1: transform with shift 7, then transpose_4x4H (defined elsewhere; presumably a 4x4 transpose, v22-v25 as temporaries)
+        tr4_luma_shift v28.4h, v29.4h, v30.4h, v31.4h, #7
+        transpose_4x4H v28, v29, v30, v31, v22, v23, v24, v25
+        // pass 2: same transform on the output of pass 1, this time with shift 12
+        tr4_luma_shift v28.4h, v29.4h, v30.4h, v31.4h, #12
+        transpose_4x4H v28, v29, v30, v31, v22, v23, v24, v25
+
+        st1            {v28.4h-v31.4h}, [x0]    // write the result back over the input buffer
+        ret
+endfunc
+
 // void ff_hevc_idct_NxN_dc_DEPTH_neon(int16_t *coeffs)
 .macro idct_dc size, bitdepth
 function ff_hevc_idct_\size\()x\size\()_dc_\bitdepth\()_neon, export=1
diff --git a/libavcodec/aarch64/hevcdsp_init_aarch64.c b/libavcodec/aarch64/hevcdsp_init_aarch64.c
index 4cc8732ad3..be1049a2ec 100644
--- a/libavcodec/aarch64/hevcdsp_init_aarch64.c
+++ b/libavcodec/aarch64/hevcdsp_init_aarch64.c
@@ -78,6 +78,7 @@  void ff_hevc_idct_4x4_dc_10_neon(int16_t *coeffs);
 void ff_hevc_idct_8x8_dc_10_neon(int16_t *coeffs);
 void ff_hevc_idct_16x16_dc_10_neon(int16_t *coeffs);
 void ff_hevc_idct_32x32_dc_10_neon(int16_t *coeffs);
+void ff_hevc_transform_luma_4x4_neon_8(int16_t *coeffs);
 void ff_hevc_sao_band_filter_8x8_8_neon(uint8_t *_dst, const uint8_t *_src,
                                   ptrdiff_t stride_dst, ptrdiff_t stride_src,
                                   const int16_t *sao_offset_val, int sao_left_class,
@@ -146,6 +147,7 @@  av_cold void ff_hevc_dsp_init_aarch64(HEVCDSPContext *c, const int bit_depth)
         c->idct_dc[1]                  = ff_hevc_idct_8x8_dc_8_neon;
         c->idct_dc[2]                  = ff_hevc_idct_16x16_dc_8_neon;
         c->idct_dc[3]                  = ff_hevc_idct_32x32_dc_8_neon;
+        c->transform_4x4_luma          = ff_hevc_transform_luma_4x4_neon_8;
         c->sao_band_filter[0]          =
         c->sao_band_filter[1]          =
         c->sao_band_filter[2]          =