[FFmpeg-devel,08/10] h264_idct_add16

Submitted by James Darnley on March 17, 2017, 1:18 p.m.

Details

Message ID 20170317131845.7760-9-jdarnley@obe.tv
State New
Headers show

Commit Message

James Darnley March 17, 2017, 1:18 p.m.
1.01x faster than sse2 (2150±46.1 decicycles for sse2 vs. 2118±29.0 for avx)
---
 libavcodec/x86/h264_idct.asm  | 40 +++++++++++++++++++++++++++++++++++++++-
 libavcodec/x86/h264dsp_init.c |  2 ++
 2 files changed, 41 insertions(+), 1 deletion(-)

Patch hide | download patch | download mbox

diff --git a/libavcodec/x86/h264_idct.asm b/libavcodec/x86/h264_idct.asm
index a74e095..f1f2ce7 100644
--- a/libavcodec/x86/h264_idct.asm
+++ b/libavcodec/x86/h264_idct.asm
@@ -858,7 +858,7 @@  h264_add8x4_idct_sse2:
 %else
     add         r0, r0m
 %endif
-    call        h264_add8x4_idct_sse2
+    call        h264_add8x4_idct_ %+ cpuname
 %%skip:
 %if %1 < 7
     add         r2, 64
@@ -1142,6 +1142,29 @@  IDCT_DC_DEQUANT 7
 
 INIT_XMM avx
 
+ALIGN 16
+; Internal helper reached via "call" (plain label, plain ret): transforms the 64 bytes of coefficients at block, hands the result to STORE_DIFFx2 for four rows of dst, and zeroes block. r0 = uint8_t *dst (clobbered), r2 = int16_t *block, r3 = int stride. Names m0-m5 and m7 explicitly.
+h264_add8x4_idct_avx:
+    movu m0, [r2 + 0] ; unaligned 16-byte load of block bytes 0..15
+    movu m1, [r2 + 32] ; block bytes 32..47 (note the 0/32/16/48 load order)
+    movu m2, [r2 + 16] ; block bytes 16..31
+    movu m3, [r2 + 48] ; block bytes 48..63
+    SBUTTERFLY qdq, 0, 1, 4 ; macro not shown here; presumably interleaves the 64-bit halves of m0/m1 with m4 as temporary - TODO confirm in x86util.asm
+    SBUTTERFLY qdq, 2, 3, 5 ; same for m2/m3, with m5 as temporary
+    IDCT4_1D w,0,1,2,3,4,5 ; first 1-D pass on word lanes of m0-m3; m4/m5 passed as scratch (macro defined elsewhere)
+    TRANSPOSE2x4x4W 0,1,2,3,4 ; macro defined elsewhere; by its name, transposes two 4x4 word blocks so the second pass runs along the other axis
+    paddw m0, [pw_32] ; add the pw_32 constant to every 16-bit lane of m0; presumably the rounding bias for the shift of 6 used below
+    IDCT4_1D w,0,1,2,3,4,5 ; second 1-D pass
+    pxor  m7, m7 ; m7 = 0; used for clearing block and as the "zero" argument of STORE_DIFFx2
+    mova [r2+ 0], m7 ; clear block bytes 0..15 (aligned store, unlike the movu loads above)
+    mova [r2+16], m7 ; clear block bytes 16..31
+    mova [r2+32], m7 ; clear block bytes 32..47
+    mova [r2+48], m7 ; clear block bytes 48..63
+    STORE_DIFFx2 m0, m1, m4, m5, m7, 6, r0, r3 ; add1=m0, add2=m1, regs m4/m5, zero=m7, shift=6, source=r0, stride=r3; expands to whichever STORE_DIFFx2 is defined before this point, not the redefinition that follows
+    lea   r0, [r0+r3*2] ; dst += 2 * stride (this is why r0 is documented as clobbered)
+    STORE_DIFFx2 m2, m3, m4, m5, m7, 6, r0, r3 ; same again for m2/m3 at the advanced dst
+ret ; plain ret, matching the plain "call" used to reach this label
+
 ; %unmacro STORE_DIFFx2 8 ; remove macro from x86util.asm but yasm doesn't have this yet
 %macro STORE_DIFFx2 8 ; add1, add2, reg1, reg2, zero, shift, source, stride
     movd       %3, [%7]
@@ -1199,3 +1222,18 @@  cglobal h264_idct8_dc_add_8, 3, 4, 0
     lea          dst_q, [dst_q + stride_q*4]
     DC_ADD_MMXEXT_OP movq, dst_q, stride_q, r3
 RET
+
+cglobal h264_idct_add16_8, 5, 5 + ARCH_X86_64, 8, dst_, block_offset_, block_, stride_, nnzc_ ; x86inc entry point: 5 arguments, 5 GPRs (6 on x86-64), 8 vector registers; named args give dst_q, stride_q/stride_d, etc.
+    movsxdifnidn stride_q, stride_d ; x86inc helper: sign-extend the 32-bit stride to full register width where the two names differ
+    %if ARCH_X86_64 ; the sixth GPR (r5) is only requested on x86-64, see the cglobal line
+        mov r5, r0 ; keep a copy of the incoming dst pointer in r5; presumably read by add16_sse2_cycle - TODO confirm against the macro body
+    %endif
+    add16_sse2_cycle 0, 0xc ; macro defined earlier in the file (body not visible here); first argument is the step index 0..7, second presumably an offset into nnzc - TODO confirm
+    add16_sse2_cycle 1, 0x14
+    add16_sse2_cycle 2, 0xe
+    add16_sse2_cycle 3, 0x16
+    add16_sse2_cycle 4, 0x1c
+    add16_sse2_cycle 5, 0x24
+    add16_sse2_cycle 6, 0x1e
+    add16_sse2_cycle 7, 0x26 ; eight steps in total; the macro apparently calls h264_add8x4_idct_ %+ cpuname, so the helper chosen follows the active INIT_* cpu
+RET ; x86inc epilogue macro (upper-case), as opposed to a bare ret
diff --git a/libavcodec/x86/h264dsp_init.c b/libavcodec/x86/h264dsp_init.c
index de7becf..3396fd8 100644
--- a/libavcodec/x86/h264dsp_init.c
+++ b/libavcodec/x86/h264dsp_init.c
@@ -62,6 +62,7 @@  IDCT_ADD_REP_FUNC(8, 4, 10, avx)
 IDCT_ADD_REP_FUNC(, 16, 8, mmx)
 IDCT_ADD_REP_FUNC(, 16, 8, mmxext)
 IDCT_ADD_REP_FUNC(, 16, 8, sse2)
+IDCT_ADD_REP_FUNC(, 16, 8, avx)
 IDCT_ADD_REP_FUNC(, 16, 10, sse2)
 IDCT_ADD_REP_FUNC(, 16intra, 8, mmx)
 IDCT_ADD_REP_FUNC(, 16intra, 8, mmxext)
@@ -346,6 +347,7 @@  av_cold void ff_h264dsp_init_x86(H264DSPContext *c, const int bit_depth,
             c->h264_idct8_add       = ff_h264_idct8_add_8_avx;
             c->h264_idct_dc_add     = ff_h264_idct_dc_add_8_avx;
             c->h264_idct8_dc_add    = ff_h264_idct8_dc_add_8_avx;
+            c->h264_idct_add16      = ff_h264_idct_add16_8_avx;
         }
     } else if (bit_depth == 10) {
         if (EXTERNAL_MMXEXT(cpu_flags)) {