[FFmpeg-devel,09/10] h264_idct8_add4

Submitted by James Darnley on March 17, 2017, 1:18 p.m.

Details

Message ID 20170317131845.7760-10-jdarnley@obe.tv
State New
Headers show

Commit Message

James Darnley March 17, 2017, 1:18 p.m.
1.00x faster (2884±63.9 vs. 2880±21.1 decicycles) compared with sse2
---
 libavcodec/x86/h264_idct.asm  | 60 +++++++++++++++++++++++++++++++++++++++++++
 libavcodec/x86/h264dsp_init.c |  2 ++
 2 files changed, 62 insertions(+)

Patch hide | download patch | download mbox

diff --git a/libavcodec/x86/h264_idct.asm b/libavcodec/x86/h264_idct.asm
index f1f2ce7..1515ea5 100644
--- a/libavcodec/x86/h264_idct.asm
+++ b/libavcodec/x86/h264_idct.asm
@@ -1237,3 +1237,63 @@  cglobal h264_idct_add16_8, 5, 5 + ARCH_X86_64, 8, dst_, block_offset_, block_, s
     add16_sse2_cycle 6, 0x1e
     add16_sse2_cycle 7, 0x26
 RET
+
+; dst, block_offset, block, stride, nnzc, counter, coeff, dst2, picreg
+; 0    1             2      3       4     5        6      7     8
+cglobal h264_idct8_add4_8, 5, 8 + npicregs, 10, dst_, block_offset_, block_, stride_, nnzc_, counter_, coeff_, dst2_, picreg ; for counter = 0,4,8,12: skip, DC-only add, or full IDCT8 add, chosen by nnzc[scan8[counter]]
+    movsxdifnidn stride_q, stride_d
+    xor counter_q, counter_q ; counter = 0; stepped by 4 up to 16 below
+    %ifdef PIC
+        lea picregq, [scan8_mem] ; PIC builds: keep the address of scan8_mem in picreg
+    %endif
+
+    .next_block: ; loop head, entered once per value of counter
+        movzx coeff_d, byte [scan8 + counter_q] ; coeff = scan8[counter] (byte table lookup)
+        movzx coeff_d, byte [nnzc_q + coeff_q] ; coeff = nnzc[scan8[counter]]
+        test coeff_d, coeff_d
+        jz .skip_block ; looked-up value is 0: do nothing for this block
+
+        cmp coeff_d, 1
+        jnz .no_dc ; looked-up value is not exactly 1: take the full IDCT path
+
+        movsx coeff_d, word [block_q] ; coeff = first 16-bit coefficient of the block, sign-extended
+        test coeff_d, coeff_d
+        jz .no_dc ; first coefficient is 0: take the full IDCT path
+
+        mov word [block_q], 0 ; clear the first coefficient in memory; its value stays in coeff (r6) for DC_ADD_INIT
+        DC_ADD_INIT r6
+        %define stride3 r6
+        %if ARCH_X86_64 == 0
+            %define dst2_q r1
+            %define dst2_d r1d
+        %endif
+
+        mov dst2_d, dword [block_offset_q + 4*counter_q] ; dst2 = block_offset[counter] (32-bit entries)
+        add dst2_q, dst_q ; dst2 = dst + block_offset[counter]
+        DC_ADD_MMXEXT_OP movq, dst2_q, stride_q, stride3
+        lea dst2_q, [dst2_q + 4*stride_q] ; advance dst2 by 4 rows before the second DC_ADD_MMXEXT_OP
+        DC_ADD_MMXEXT_OP movq, dst2_q, stride_q, stride3
+        %if ARCH_X86_64 == 0
+            mov block_offset_q, block_offset_m ; x86-32: dst2 aliases r1 (block_offset), so reload it; presumably block_offset_m is the stack copy of the argument - confirm in x86inc
+        %endif
+
+        add counter_q, 4 ; same loop tail as .skip_block, duplicated on the DC path
+        add block_q, 128 ; next block: 128 bytes = 64 16-bit coefficients further on
+        cmp counter_q, 16
+    jl .next_block ; loop while counter < 16 (signed compare)
+    RET
+
+    .no_dc: ; full IDCT path; stride3 is still the r6 alias %define'd above (same register as coeff) - presumably scratch for IDCT8_ADD_SSE, confirm in the macro
+        mov dst2_d, dword [block_offset_q + 4*counter_q] ; dst2 = block_offset[counter] (32-bit entries)
+        add dst2_q, dst_q ; dst2 = dst + block_offset[counter]
+        IDCT8_ADD_SSE dst2_q, block_q, stride_q, stride3
+        %if ARCH_X86_64 == 0
+            mov block_offset_q, block_offset_m ; x86-32: reload block_offset, overwritten through the dst2 alias (r1)
+        %endif
+
+    .skip_block: ; shared loop tail for the skip and full-IDCT paths
+        add counter_q, 4
+        add block_q, 128 ; next block: 128 bytes = 64 16-bit coefficients further on
+        cmp counter_q, 16
+    jl .next_block ; loop while counter < 16 (signed compare)
+RET
diff --git a/libavcodec/x86/h264dsp_init.c b/libavcodec/x86/h264dsp_init.c
index 3396fd8..4050276 100644
--- a/libavcodec/x86/h264dsp_init.c
+++ b/libavcodec/x86/h264dsp_init.c
@@ -57,6 +57,7 @@  void ff_h264_idct ## NUM ## _add ## REP ## _ ## DEPTH ## _ ## OPT       \
 IDCT_ADD_REP_FUNC(8, 4, 8, mmx)
 IDCT_ADD_REP_FUNC(8, 4, 8, mmxext)
 IDCT_ADD_REP_FUNC(8, 4, 8, sse2)
+IDCT_ADD_REP_FUNC(8, 4, 8, avx)
 IDCT_ADD_REP_FUNC(8, 4, 10, sse2)
 IDCT_ADD_REP_FUNC(8, 4, 10, avx)
 IDCT_ADD_REP_FUNC(, 16, 8, mmx)
@@ -348,6 +349,7 @@  av_cold void ff_h264dsp_init_x86(H264DSPContext *c, const int bit_depth,
             c->h264_idct_dc_add     = ff_h264_idct_dc_add_8_avx;
             c->h264_idct8_dc_add    = ff_h264_idct8_dc_add_8_avx;
             c->h264_idct_add16      = ff_h264_idct_add16_8_avx;
+            c->h264_idct8_add4      = ff_h264_idct8_add4_8_avx;
         }
     } else if (bit_depth == 10) {
         if (EXTERNAL_MMXEXT(cpu_flags)) {