Message ID | tencent_20795ACBA24269D930317A8C70E0A9D91008@qq.com |
---|---|
State | Accepted |
Commit | bd2f00f665cc964fc1942518cdf27bd6e8b6d388 |
Headers | show |
Series | [FFmpeg-devel] codec/aarch64/hevc: add transform_luma_neon | expand |
Context | Check | Description |
---|---|---|
yinshiyou/make_loongarch64 | success | Make finished |
yinshiyou/make_fate_loongarch64 | success | Make fate finished |
andriy/make_x86 | success | Make finished |
andriy/make_fate_x86 | success | Make fate finished |
On Thu, 13 Apr 2023, xufuji456 wrote: > got 56% speed up (run_count=1000, CPU=Cortex A53) > transform_4x4_luma_neon: 45 transform_4x4_luma_c: 103 > > Signed-off-by: xufuji456 <839789740@qq.com> > --- > libavcodec/aarch64/hevcdsp_idct_neon.S | 48 +++++++++++++++++++++++ > libavcodec/aarch64/hevcdsp_init_aarch64.c | 2 + > 2 files changed, 50 insertions(+) Thanks, this version can be applied - and still looks good, so I pushed it. I see that you fixed the issue by just applying the new code in the middle of the file instead of at the end of the file though. You really should try to look into what it is that is causing the previous version of the file to be lacking the trailing newline, since that's not what is in the actual upstream git. So it looks like there's something off with your git workflow, and it would be very good to get that sorted out before going forward anyway. // Martin
Thank you, Martin.
It was my mistake: I deleted the empty line at the end of the file.
Should I submit a patch that adds a newline at the end of the file, or do something else?
Thanks for your review and for pointing out the details of the error.
------------------ Original ------------------
From: "Martin Storsjö" <martin@martin.st>;
Date: Fri, Apr 14, 2023 08:20 PM
To: "FFmpeg development discussions and patches"<ffmpeg-devel@ffmpeg.org>;
Cc: "徐福隆"<839789740@qq.com>;
Subject: Re: [FFmpeg-devel] [PATCH] codec/aarch64/hevc: add transform_luma_neon
On Thu, 13 Apr 2023, xufuji456 wrote:
> got 56% speed up (run_count=1000, CPU=Cortex A53)
> transform_4x4_luma_neon: 45 transform_4x4_luma_c: 103
>
> Signed-off-by: xufuji456 <839789740@qq.com>
> ---
> libavcodec/aarch64/hevcdsp_idct_neon.S | 48 +++++++++++++++++++++++
> libavcodec/aarch64/hevcdsp_init_aarch64.c | 2 +
> 2 files changed, 50 insertions(+)
Thanks, this version can be applied - and still looks good, so I pushed
it.
I see that you fixed the issue by just applying the new code in the middle
of the file instead of at the end of the file though. You really should
try to look into what it is that is causing the previous version of the
file to be lacking the trailing newline, since that's not what is in the
actual upstream git. So it looks like there's something off with your git
workflow, and it would be very good to get that sorted out before going
forward anyway.
// Martin
diff --git a/libavcodec/aarch64/hevcdsp_idct_neon.S b/libavcodec/aarch64/hevcdsp_idct_neon.S index 994f0a47b6..4a25787070 100644 --- a/libavcodec/aarch64/hevcdsp_idct_neon.S +++ b/libavcodec/aarch64/hevcdsp_idct_neon.S @@ -842,6 +842,54 @@ tr_32x4 secondpass_10, 20 - 10 idct_32x32 8 idct_32x32 10 +.macro tr4_luma_shift r0, r1, r2, r3, shift + saddl v0.4s, \r0, \r2 // c0 = src0 + src2 + saddl v1.4s, \r2, \r3 // c1 = src2 + src3 + ssubl v2.4s, \r0, \r3 // c2 = src0 - src3 + smull v3.4s, \r1, v21.4h // c3 = 74 * src1 + + saddl v7.4s, \r0, \r3 // src0 + src3 + ssubw v7.4s, v7.4s, \r2 // src0 - src2 + src3 + mul v7.4s, v7.4s, v18.4s // dst2 = 74 * (src0 - src2 + src3) + + mul v5.4s, v0.4s, v19.4s // 29 * c0 + mul v6.4s, v1.4s, v20.4s // 55 * c1 + add v5.4s, v5.4s, v6.4s // 29 * c0 + 55 * c1 + add v5.4s, v5.4s, v3.4s // dst0 = 29 * c0 + 55 * c1 + c3 + + mul v1.4s, v1.4s, v19.4s // 29 * c1 + mul v6.4s, v2.4s, v20.4s // 55 * c2 + sub v6.4s, v6.4s, v1.4s // 55 * c2 - 29 * c1 + add v6.4s, v6.4s, v3.4s // dst1 = 55 * c2 - 29 * c1 + c3 + + mul v0.4s, v0.4s, v20.4s // 55 * c0 + mul v2.4s, v2.4s, v19.4s // 29 * c2 + add v0.4s, v0.4s, v2.4s // 55 * c0 + 29 * c2 + sub v0.4s, v0.4s, v3.4s // dst3 = 55 * c0 + 29 * c2 - c3 + + sqrshrn \r0, v5.4s, \shift + sqrshrn \r1, v6.4s, \shift + sqrshrn \r2, v7.4s, \shift + sqrshrn \r3, v0.4s, \shift +.endm + +function ff_hevc_transform_luma_4x4_neon_8, export=1 + ld1 {v28.4h-v31.4h}, [x0] + movi v18.4s, #74 + movi v19.4s, #29 + movi v20.4s, #55 + movi v21.4h, #74 + + tr4_luma_shift v28.4h, v29.4h, v30.4h, v31.4h, #7 + transpose_4x4H v28, v29, v30, v31, v22, v23, v24, v25 + + tr4_luma_shift v28.4h, v29.4h, v30.4h, v31.4h, #12 + transpose_4x4H v28, v29, v30, v31, v22, v23, v24, v25 + + st1 {v28.4h-v31.4h}, [x0] + ret +endfunc + // void ff_hevc_idct_NxN_dc_DEPTH_neon(int16_t *coeffs) .macro idct_dc size, bitdepth function ff_hevc_idct_\size\()x\size\()_dc_\bitdepth\()_neon, export=1 diff --git a/libavcodec/aarch64/hevcdsp_init_aarch64.c 
b/libavcodec/aarch64/hevcdsp_init_aarch64.c index 4cc8732ad3..be1049a2ec 100644 --- a/libavcodec/aarch64/hevcdsp_init_aarch64.c +++ b/libavcodec/aarch64/hevcdsp_init_aarch64.c @@ -78,6 +78,7 @@ void ff_hevc_idct_4x4_dc_10_neon(int16_t *coeffs); void ff_hevc_idct_8x8_dc_10_neon(int16_t *coeffs); void ff_hevc_idct_16x16_dc_10_neon(int16_t *coeffs); void ff_hevc_idct_32x32_dc_10_neon(int16_t *coeffs); +void ff_hevc_transform_luma_4x4_neon_8(int16_t *coeffs); void ff_hevc_sao_band_filter_8x8_8_neon(uint8_t *_dst, const uint8_t *_src, ptrdiff_t stride_dst, ptrdiff_t stride_src, const int16_t *sao_offset_val, int sao_left_class, @@ -146,6 +147,7 @@ av_cold void ff_hevc_dsp_init_aarch64(HEVCDSPContext *c, const int bit_depth) c->idct_dc[1] = ff_hevc_idct_8x8_dc_8_neon; c->idct_dc[2] = ff_hevc_idct_16x16_dc_8_neon; c->idct_dc[3] = ff_hevc_idct_32x32_dc_8_neon; + c->transform_4x4_luma = ff_hevc_transform_luma_4x4_neon_8; c->sao_band_filter[0] = c->sao_band_filter[1] = c->sao_band_filter[2] =
got 56% speed up (run_count=1000, CPU=Cortex A53) transform_4x4_luma_neon: 45 transform_4x4_luma_c: 103 Signed-off-by: xufuji456 <839789740@qq.com> --- libavcodec/aarch64/hevcdsp_idct_neon.S | 48 +++++++++++++++++++++++ libavcodec/aarch64/hevcdsp_init_aarch64.c | 2 + 2 files changed, 50 insertions(+)