[FFmpeg-devel,10/10] h264_idct_add16intra

Submitted by James Darnley on March 17, 2017, 1:18 p.m.

Details

Message ID 20170317131845.7760-11-jdarnley@obe.tv
State New
Headers show

Commit Message

James Darnley March 17, 2017, 1:18 p.m.
This patch currently breaks FATE.

1.02x faster (1580±4.8 vs. 1555±3.9 decicycles) compared with sse2
---
 libavcodec/x86/h264_idct.asm  | 43 +++++++++++++++++++++++++++++++++++++++++--
 libavcodec/x86/h264dsp_init.c |  2 ++
 2 files changed, 43 insertions(+), 2 deletions(-)

Patch hide | download patch | download mbox

diff --git a/libavcodec/x86/h264_idct.asm b/libavcodec/x86/h264_idct.asm
index 1515ea5..16998dc 100644
--- a/libavcodec/x86/h264_idct.asm
+++ b/libavcodec/x86/h264_idct.asm
@@ -895,7 +895,7 @@  REP_RET
 %else
     add         r0, r0m
 %endif
-    call        h264_add8x4_idct_sse2
+    call        h264_add8x4_idct_ %+ cpuname
     jmp %%skip
 %%trydc:
     movsx       r0, word [r2   ]
@@ -907,13 +907,15 @@  REP_RET
 %else
     add         r0, r0m
 %endif
-    call        h264_idct_dc_add8_mmxext
+    call        h264_idct_dc_add8_ %+ cpuname
 %%skip:
 %if %1 < 7
     add         r2, 64
 %endif
 %endmacro
 
+%define h264_idct_dc_add8_sse2 h264_idct_dc_add8_mmxext
+
 ; void ff_h264_idct_add16intra_8_sse2(uint8_t *dst, const int *block_offset,
 ;                                     int16_t *block, int stride,
 ;                                     const uint8_t nnzc[6 * 8])
@@ -1193,6 +1195,27 @@  ret
     packuswb m1, m1
 %endmacro
 
+ALIGN 16
+; r0 = uint8_t *dst, r2 = int16_t *block, r3 = int stride, r6=clobbered
+; Adds the DCs of the blocks at [r2] and [r2+32] to dst; assumes 16-byte (XMM) m regs
+h264_idct_dc_add8_avx:
+    movsxdifnidn r3, r3d
+    movd         m0, [r2   ]          ;  0 0 X D (words; D = first DC, X = next coeff)
+    mov word [r2+ 0], 0               ; clear the consumed DC coefficient
+    punpcklwd    m0, [r2+32]          ;  x X d D (low 4 words; d = second DC)
+    mov word [r2+32], 0
+    paddsw       m0, [pw_32]          ; add [pw_32], then >> 6 below
+    psraw        m0, 6
+    punpcklwd    m0, m0               ;  x x X X d d D D
+    pxor         m1, m1               ;  0 0 0 0 0 0 0 0
+    psubw        m1, m0               ; -x-x-X-X-d-d-D-D
+    packuswb     m0, m1               ; bytes: high qword = negated, low qword = x x X X d d D D
+    punpckhwd    m1, m0, m0           ; low qword: -d-d-d-d-D-D-D-D (pshuflw only sees the low qword)
+    punpcklwd    m0, m0               ; low qword:  d d d d D D D D
+    lea          r6, [r3*3]
+    DC_ADD_MMXEXT_OP movq, r0, r3, r6 ; NOTE(review): presumably uses only the low 8 bytes - confirm
+ret
+
 cglobal h264_idct_add_8, 3, 3, 8, dst_, block_, stride_
     movsxdifnidn stride_q, stride_d
     IDCT4_ADD    dst_q, block_q, stride_q
@@ -1238,6 +1261,22 @@  cglobal h264_idct_add16_8, 5, 5 + ARCH_X86_64, 8, dst_, block_offset_, block_, s
     add16_sse2_cycle 7, 0x26
 RET
 
+; 8-bit add16intra built from eight add16intra_sse2_cycle steps. FIXME: produces incorrect output
+cglobal h264_idct_add16intra_8, 5, 7 + ARCH_X86_64, 8, dst_, block_offset_, block_, stride_, nnzc_
+    movsxdifnidn stride_q, stride_d ; sign-extend the int stride to pointer width where needed
+    %if ARCH_X86_64
+        mov r7, r0 ; copy of dst; NOTE(review): presumably read back by add16intra_sse2_cycle - confirm
+    %endif
+    add16intra_sse2_cycle 0, 0xc ; args: step index 0..7, then a constant (presumably an nnzc offset - confirm)
+    add16intra_sse2_cycle 1, 0x14
+    add16intra_sse2_cycle 2, 0xe
+    add16intra_sse2_cycle 3, 0x16
+    add16intra_sse2_cycle 4, 0x1c
+    add16intra_sse2_cycle 5, 0x24
+    add16intra_sse2_cycle 6, 0x1e
+    add16intra_sse2_cycle 7, 0x26
+RET
+
 ; dst, block_offset, block, stride, nnzc, counter, coeff, dst2, picreg
 ; 0    1             2      3       4     5        6      7     8
 cglobal h264_idct8_add4_8, 5, 8 + npicregs, 10, dst_, block_offset_, block_, stride_, nnzc_, counter_, coeff_, dst2_, picreg
diff --git a/libavcodec/x86/h264dsp_init.c b/libavcodec/x86/h264dsp_init.c
index 4050276..e09566d 100644
--- a/libavcodec/x86/h264dsp_init.c
+++ b/libavcodec/x86/h264dsp_init.c
@@ -68,6 +68,7 @@  IDCT_ADD_REP_FUNC(, 16, 10, sse2)
 IDCT_ADD_REP_FUNC(, 16intra, 8, mmx)
 IDCT_ADD_REP_FUNC(, 16intra, 8, mmxext)
 IDCT_ADD_REP_FUNC(, 16intra, 8, sse2)
+IDCT_ADD_REP_FUNC(, 16intra, 8, avx)
 IDCT_ADD_REP_FUNC(, 16intra, 10, sse2)
 IDCT_ADD_REP_FUNC(, 16, 10, avx)
 IDCT_ADD_REP_FUNC(, 16intra, 10, avx)
@@ -350,6 +351,7 @@  av_cold void ff_h264dsp_init_x86(H264DSPContext *c, const int bit_depth,
             c->h264_idct8_dc_add    = ff_h264_idct8_dc_add_8_avx;
             c->h264_idct_add16      = ff_h264_idct_add16_8_avx;
             c->h264_idct8_add4      = ff_h264_idct8_add4_8_avx;
+            c->h264_idct_add16intra = ff_h264_idct_add16intra_8_avx;
         }
     } else if (bit_depth == 10) {
         if (EXTERNAL_MMXEXT(cpu_flags)) {