[FFmpeg-devel,06/10] h264_idct_dc_add

Submitted by James Darnley on March 17, 2017, 1:18 p.m.

Details

Message ID 20170317131845.7760-7-jdarnley@obe.tv
State New
Headers show

Commit Message

James Darnley March 17, 2017, 1:18 p.m.
1.04x faster (521±1.7 vs. 501±1.1 decicycles) compared with mmxext
---
 libavcodec/x86/h264_idct.asm  | 21 +++++++++++++++++++++
 libavcodec/x86/h264dsp_init.c |  2 ++
 2 files changed, 23 insertions(+)

Patch hide | download patch | download mbox

diff --git a/libavcodec/x86/h264_idct.asm b/libavcodec/x86/h264_idct.asm
index ca8ffdb..c4b6e55 100644
--- a/libavcodec/x86/h264_idct.asm
+++ b/libavcodec/x86/h264_idct.asm
@@ -1158,6 +1158,18 @@  INIT_XMM avx
     movd  [%7+%8], %4
 %endmacro
 
+%macro DC_ADD_INIT 1 ; %1 = GPR holding the DC coefficient on entry; overwritten with 3*stride on exit
+    add      %1d, 32 ; rounding bias for the shift by 6 below
+    sar      %1d, 6 ; arithmetic shift keeps the sign: dc = (dc + 32) >> 6
+    movd     m0, %1d
+    SPLATW   m0, m0, 0 ; presumably replicates word 0 (dc) into every word lane; helper is defined elsewhere
+    lea      %1, [3*stride_q] ; %1 is dead as dc now, reuse it as 3*stride; caller must name its stride argument stride_
+    pxor     m1, m1
+    psubw    m1, m0 ; m1 = 0 - dc in each word lane
+    packuswb m0, m0 ; unsigned saturation: bytes of m0 = dc clamped to 0..255 (0 when dc is negative)
+    packuswb m1, m1 ; bytes of m1 = -dc clamped to 0..255 (0 when dc is positive)
+%endmacro
+
 cglobal h264_idct_add_8, 3, 3, 8, dst_, block_, stride_
     movsxdifnidn stride_q, stride_d
     IDCT4_ADD    dst_q, block_q, stride_q
@@ -1167,3 +1179,12 @@  cglobal h264_idct8_add_8, 3, 4, 10, dst_, block_, stride_
     movsxdifnidn stride_q, stride_d
     IDCT8_ADD_SSE dst_q, block_q, stride_q, r3
 RET
+
+; Adds the rounded DC term of block to dst. NOTE(review): "Not any faster" conflicts with the 1.04x in the commit message; confirm
+cglobal h264_idct_dc_add_8, 3, 4, 6, dst_, block_, stride_
+    movsxdifnidn stride_q, stride_d
+    movsx             r3d, word [block_q] ; sign-extend the first 16-bit coefficient (the DC term)
+    mov   dword [block_q], 0 ; dword store clears the first two 16-bit coefficients, not just the DC
+    DC_ADD_INIT r3 ; m0/m1 = +dc/-dc as saturated bytes, r3 = 3*stride
+    DC_ADD_MMXEXT_OP movd, dst_q, stride_q, r3 ; defined elsewhere; presumably uses m2-m5, hence 6 XMM regs declared above
+RET
diff --git a/libavcodec/x86/h264dsp_init.c b/libavcodec/x86/h264dsp_init.c
index 2172a71..1aa66a8 100644
--- a/libavcodec/x86/h264dsp_init.c
+++ b/libavcodec/x86/h264dsp_init.c
@@ -35,6 +35,7 @@  IDCT_ADD_FUNC(, 8, mmx)
 IDCT_ADD_FUNC(, 8, avx)
 IDCT_ADD_FUNC(, 10, sse2)
 IDCT_ADD_FUNC(_dc, 8, mmxext)
+IDCT_ADD_FUNC(_dc, 8, avx)
 IDCT_ADD_FUNC(_dc, 10, mmxext)
 IDCT_ADD_FUNC(8_dc, 8, mmxext)
 IDCT_ADD_FUNC(8_dc, 10, sse2)
@@ -342,6 +343,7 @@  av_cold void ff_h264dsp_init_x86(H264DSPContext *c, const int bit_depth,
 
             c->h264_idct_add        = ff_h264_idct_add_8_avx;
             c->h264_idct8_add       = ff_h264_idct8_add_8_avx;
+            c->h264_idct_dc_add     = ff_h264_idct_dc_add_8_avx;
         }
     } else if (bit_depth == 10) {
         if (EXTERNAL_MMXEXT(cpu_flags)) {