diff mbox series

[FFmpeg-devel] libavcodec/hevc: add hevc idct4x4 neon of aarch64 after fixed

Message ID tencent_EFB23DF23E97FB8578B2BF63CC1993C16B09@qq.com
State Accepted
Commit 4b4de07721ed734fea7ec1058ccdc4cc11f3d2da
Headers show
Series [FFmpeg-devel] libavcodec/hevc: add hevc idct4x4 neon of aarch64 after fixed | expand

Checks

Context Check Description
yinshiyou/make_loongarch64 success Make finished
yinshiyou/make_fate_loongarch64 success Make fate finished
andriy/make_x86 success Make finished
andriy/make_fate_x86 success Make fate finished

Commit Message

徐福隆 Feb. 24, 2023, 7:43 a.m. UTC
---
 libavcodec/aarch64/hevcdsp_idct_neon.S    | 40 +++++++++++++++++++++++
 libavcodec/aarch64/hevcdsp_init_aarch64.c |  4 +++
 2 files changed, 44 insertions(+)

Comments

Martin Storsjö Feb. 28, 2023, 11:15 a.m. UTC | #1
On Fri, 24 Feb 2023, xufuji456 wrote:

> ---
> libavcodec/aarch64/hevcdsp_idct_neon.S    | 40 +++++++++++++++++++++++
> libavcodec/aarch64/hevcdsp_init_aarch64.c |  4 +++
> 2 files changed, 44 insertions(+)

Thanks, this looks good to me now, will push in a moment.

The subject of the commit doesn't need to include "after fixed" or such 
comments - those comments are kinda irrelevant once the commit gets 
pushed.

It's usually good practice to include checkasm benchmark numbers (and 
mention what system the numbers are from) when posting assembly 
optimizations.

// Martin
diff mbox series

Patch

diff --git a/libavcodec/aarch64/hevcdsp_idct_neon.S b/libavcodec/aarch64/hevcdsp_idct_neon.S
index 124c50998a..f5135160b6 100644
--- a/libavcodec/aarch64/hevcdsp_idct_neon.S
+++ b/libavcodec/aarch64/hevcdsp_idct_neon.S
@@ -245,6 +245,43 @@  function hevc_add_residual_32x32_16_neon, export=0
         ret
 endfunc
 
+.macro tr_4x4 in0, in1, in2, in3, out0, out1, out2, out3, shift
+         sshll          v20.4s, \in0, #6
+         sshll          v21.4s, \in0, #6
+         smull          v22.4s, \in1, v4.h[1]
+         smull          v23.4s, \in1, v4.h[3]
+         smlal          v20.4s, \in2, v4.h[0] //e0
+         smlsl          v21.4s, \in2, v4.h[0] //e1
+         smlal          v22.4s, \in3, v4.h[3] //o0
+         smlsl          v23.4s, \in3, v4.h[1] //o1
+
+         add            v24.4s, v20.4s, v22.4s
+         sub            v20.4s, v20.4s, v22.4s
+         add            v22.4s, v21.4s, v23.4s
+         sub            v21.4s, v21.4s, v23.4s
+         sqrshrn        \out0, v24.4s, #\shift
+         sqrshrn        \out3, v20.4s, #\shift
+         sqrshrn        \out1, v22.4s, #\shift
+         sqrshrn        \out2, v21.4s, #\shift
+.endm
+
+.macro idct_4x4 bitdepth
+function ff_hevc_idct_4x4_\bitdepth\()_neon, export=1
+        ld1             {v0.4h-v3.4h}, [x0]
+
+        movrel          x1, trans
+        ld1             {v4.4h}, [x1]
+
+        tr_4x4          v0.4h, v1.4h, v2.4h, v3.4h, v16.4h, v17.4h, v18.4h, v19.4h, 7
+        transpose_4x8H  v16, v17, v18, v19, v26, v27, v28, v29
+
+        tr_4x4          v16.4h, v17.4h, v18.4h, v19.4h, v0.4h, v1.4h, v2.4h, v3.4h, 20 - \bitdepth
+        transpose_4x8H  v0, v1, v2, v3, v26, v27, v28, v29
+        st1             {v0.4h-v3.4h}, [x0]
+        ret
+endfunc
+.endm
+
 .macro sum_sub out, in, c, op, p
   .ifc \op, +
         smlal\p         \out, \in, \c
@@ -578,6 +615,9 @@  function ff_hevc_idct_16x16_\bitdepth\()_neon, export=1
 endfunc
 .endm
 
+idct_4x4 8
+idct_4x4 10
+
 idct_8x8 8
 idct_8x8 10
 
diff --git a/libavcodec/aarch64/hevcdsp_init_aarch64.c b/libavcodec/aarch64/hevcdsp_init_aarch64.c
index 88a797f393..1deefca0a2 100644
--- a/libavcodec/aarch64/hevcdsp_init_aarch64.c
+++ b/libavcodec/aarch64/hevcdsp_init_aarch64.c
@@ -49,6 +49,8 @@  void ff_hevc_add_residual_32x32_10_neon(uint8_t *_dst, const int16_t *coeffs,
                                         ptrdiff_t stride);
 void ff_hevc_add_residual_32x32_12_neon(uint8_t *_dst, const int16_t *coeffs,
                                         ptrdiff_t stride);
+void ff_hevc_idct_4x4_8_neon(int16_t *coeffs, int col_limit);
+void ff_hevc_idct_4x4_10_neon(int16_t *coeffs, int col_limit);
 void ff_hevc_idct_8x8_8_neon(int16_t *coeffs, int col_limit);
 void ff_hevc_idct_8x8_10_neon(int16_t *coeffs, int col_limit);
 void ff_hevc_idct_16x16_8_neon(int16_t *coeffs, int col_limit);
@@ -119,6 +121,7 @@  av_cold void ff_hevc_dsp_init_aarch64(HEVCDSPContext *c, const int bit_depth)
         c->add_residual[1]             = ff_hevc_add_residual_8x8_8_neon;
         c->add_residual[2]             = ff_hevc_add_residual_16x16_8_neon;
         c->add_residual[3]             = ff_hevc_add_residual_32x32_8_neon;
+        c->idct[0]                     = ff_hevc_idct_4x4_8_neon;
         c->idct[1]                     = ff_hevc_idct_8x8_8_neon;
         c->idct[2]                     = ff_hevc_idct_16x16_8_neon;
         c->idct_dc[0]                  = ff_hevc_idct_4x4_dc_8_neon;
@@ -168,6 +171,7 @@  av_cold void ff_hevc_dsp_init_aarch64(HEVCDSPContext *c, const int bit_depth)
         c->add_residual[1]             = ff_hevc_add_residual_8x8_10_neon;
         c->add_residual[2]             = ff_hevc_add_residual_16x16_10_neon;
         c->add_residual[3]             = ff_hevc_add_residual_32x32_10_neon;
+        c->idct[0]                     = ff_hevc_idct_4x4_10_neon;
         c->idct[1]                     = ff_hevc_idct_8x8_10_neon;
         c->idct[2]                     = ff_hevc_idct_16x16_10_neon;
         c->idct_dc[0]                  = ff_hevc_idct_4x4_dc_10_neon;