diff mbox

[FFmpeg-devel,3/6] add and fix xmm version of simple_idct

Message ID 20170603001809.13960-4-jdarnley@obe.tv
State New
Headers show

Commit Message

James Darnley June 3, 2017, 12:18 a.m. UTC
---
 libavcodec/tests/x86/dct.c     |  3 +++
 libavcodec/x86/idctdsp_init.c  |  1 +
 libavcodec/x86/simple_idct.asm | 45 ++++++++++++++++++++++++++++++++++++++++++
 libavcodec/x86/simple_idct.h   |  1 +
 4 files changed, 50 insertions(+)
diff mbox

Patch

diff --git a/libavcodec/tests/x86/dct.c b/libavcodec/tests/x86/dct.c
index 34f5b8767b..97116570f4 100644
--- a/libavcodec/tests/x86/dct.c
+++ b/libavcodec/tests/x86/dct.c
@@ -97,6 +97,9 @@  static const struct algo idct_tab_arch[] = {
 #endif
 #endif
 #endif
+#if HAVE_SSE2_EXTERNAL
+    { "SIMPLE-SSE2",  ff_simple_idct_sse2,  FF_IDCT_PERM_SIMPLE, AV_CPU_FLAG_SSE2 },
+#endif
     { 0 }
 };
 
diff --git a/libavcodec/x86/idctdsp_init.c b/libavcodec/x86/idctdsp_init.c
index f1c915aa00..82530a5cc4 100644
--- a/libavcodec/x86/idctdsp_init.c
+++ b/libavcodec/x86/idctdsp_init.c
@@ -92,6 +92,7 @@  av_cold void ff_idctdsp_init_x86(IDCTDSPContext *c, AVCodecContext *avctx,
                 avctx->idct_algo == FF_IDCT_SIMPLEMMX)) {
                 c->idct_put  = ff_simple_idct_put_sse2;
                 c->idct_add  = ff_simple_idct_add_sse2;
+                c->idct      = ff_simple_idct_sse2;
                 c->perm_type = FF_IDCT_PERM_SIMPLE;
         }
     }
diff --git a/libavcodec/x86/simple_idct.asm b/libavcodec/x86/simple_idct.asm
index 3b62a4f9d3..a6eb42464b 100644
--- a/libavcodec/x86/simple_idct.asm
+++ b/libavcodec/x86/simple_idct.asm
@@ -151,6 +151,10 @@  SECTION .text
     psrad            m2, %7
     packssdw         m7, m1             ; A1+B1  a1+b1   A0+B0   a0+b0
     packssdw         m2, m4             ; A0-B0  a0-b0   A1-B1   a1-b1
+%if mmsize == 16
+    pshufd           m7, m7, q0020      ; XMM only: dwords 0,2 -> low qword (read by movq)
+    pshufd           m2, m2, q0020      ; q0020 == 8, same shuffle as before
+%endif
     movq           [%5], m7
     movq             m1, [blockq + %3]  ; R3     R1      r3      r1
     movq             m4, [coeffs + 80]  ; -C1    C5      -C1     C5
@@ -172,9 +176,15 @@  SECTION .text
     psubd            m4, m3             ; a3-B3          a3-b3
     psrad            m6, %7
     packssdw         m2, m6             ; A3+B3  a3+b3   A2+B2   a2+b2
+%if mmsize == 16
+    pshufd           m2, m2, q0020      ; XMM only: dwords 0,2 -> low qword (read by movq)
+%endif
     movq       [8 + %5], m2
     psrad            m4, %7
     packssdw         m4, m0             ; A2-B2  a2-b2   A3-B3   a3-b3
+%if mmsize == 16
+    pshufd           m4, m4, q0020      ; XMM only: dwords 0,2 -> low qword (read by movq)
+%endif
     movq      [16 + %5], m4
     jmp             %%2
 %%1:
@@ -182,6 +192,9 @@  SECTION .text
     paddd            m0, [d40000]
     psrad            m0, 13
     packssdw         m0, m0
+%if mmsize == 16
+    pshufd           m0, m0, q0020      ; XMM only: dwords 0,2 -> low qword (read by movq)
+%endif
     movq           [%5], m0
     movq       [8 + %5], m0
     movq      [16 + %5], m0
@@ -239,6 +252,10 @@  SECTION .text
     psrad            m2, %7
     packssdw         m7, m1             ; A1+B1  a1+b1   A0+B0   a0+b0
     packssdw         m2, m4             ; A0-B0  a0-b0   A1-B1   a1-b1
+%if mmsize == 16
+    pshufd           m7, m7, q0020      ; XMM only: dwords 0,2 -> low qword (read by movq)
+    pshufd           m2, m2, q0020      ; q0020 == 8, same shuffle as before
+%endif
     movq           [%5], m7
     movq             m1, [blockq + %3]  ; R3     R1      r3      r1
     movq             m4, [coeffs + 80]  ; -C1    C5      -C1     C5
@@ -260,9 +277,15 @@  SECTION .text
     psubd            m4, m3             ; a3-B3          a3-b3
     psrad            m6, %7
     packssdw         m2, m6             ; A3+B3  a3+b3   A2+B2   a2+b2
+%if mmsize == 16
+    pshufd           m2, m2, q0020      ; XMM only: dwords 0,2 -> low qword (read by movq)
+%endif
     movq       [8 + %5], m2
     psrad            m4, %7
     packssdw         m4, m0             ; A2-B2  a2-b2   A3-B3   a3-b3
+%if mmsize == 16
+    pshufd           m4, m4, q0020      ; XMM only: dwords 0,2 -> low qword (read by movq)
+%endif
     movq      [16 + %5], m4
 %endmacro
 
@@ -614,9 +637,15 @@  SECTION .text
     psrad            m7, %6
     psrad            m3, %6
     packssdw         m4, m7             ; A0     a0
+%if mmsize == 16
+    pshufd           m4, m4, q0020      ; XMM only: dwords 0,2 -> low qword (read by movq)
+%endif
     movq           [%5], m4
     psrad            m0, %6
     packssdw         m0, m3             ; A1     a1
+%if mmsize == 16
+    pshufd           m0, m0, q0020      ; XMM only: dwords 0,2 -> low qword (read by movq)
+%endif
     movq      [16 + %5], m0
     movq      [96 + %5], m0
     movq     [112 + %5], m4
@@ -624,9 +653,15 @@  SECTION .text
     psrad            m6, %6
     psrad            m2, %6
     packssdw         m5, m2             ; A2-B2  a2-b2
+%if mmsize == 16
+    pshufd           m5, m5, q0020      ; XMM only: dwords 0,2 -> low qword (read by movq)
+%endif
     movq      [32 + %5], m5
     psrad            m1, %6
     packssdw         m6, m1             ; A3+B3  a3+b3
+%if mmsize == 16
+    pshufd           m6, m6, q0020      ; XMM only: dwords 0,2 -> low qword (read by movq)
+%endif
     movq      [48 + %5], m6
     movq      [64 + %5], m6
     movq      [80 + %5], m5
@@ -711,9 +746,15 @@  SECTION .text
     movq             m7, [coeffs + 32]  ; C6     C2      C6      C2
     psrad            m1, %6
     packssdw         m4, m1             ; A0     a0
+%if mmsize == 16
+    pshufd           m4, m4, q0020      ; XMM only: dwords 0,2 -> low qword (read by movq)
+%endif
     movq           [%5], m4
     psrad            m2, %6
     packssdw         m0, m2             ; A1     a1
+%if mmsize == 16
+    pshufd           m0, m0, q0020      ; XMM only: dwords 0,2 -> low qword (read by movq)
+%endif
     movq      [16 + %5], m0
     movq      [96 + %5], m0
     movq     [112 + %5], m4
@@ -889,6 +930,10 @@  RET
 
 INIT_XMM sse2
 
+cglobal simple_idct, 1, 2, 8, 128, block, t0 ; SSE2 build (under INIT_XMM sse2); C prototype: void ff_simple_idct_sse2(int16_t *block)
+    IDCT                                      ; same IDCT body that simple_idct_put invokes; no pixel store follows here
+RET
+
 cglobal simple_idct_put, 3, 5, 8, 128, pixels, lsize, block, lsize3, t0
     IDCT
     lea lsize3q, [lsizeq*3]
diff --git a/libavcodec/x86/simple_idct.h b/libavcodec/x86/simple_idct.h
index d17ef6a462..b19e910372 100644
--- a/libavcodec/x86/simple_idct.h
+++ b/libavcodec/x86/simple_idct.h
@@ -26,6 +26,7 @@  void ff_simple_idct_mmx(int16_t *block);
 void ff_simple_idct_add_mmx(uint8_t *dest, ptrdiff_t line_size, int16_t *block);
 void ff_simple_idct_put_mmx(uint8_t *dest, ptrdiff_t line_size, int16_t *block);
 
+void ff_simple_idct_sse2(int16_t *block);
 void ff_simple_idct_add_sse2(uint8_t *dest, ptrdiff_t line_size, int16_t *block);
 void ff_simple_idct_put_sse2(uint8_t *dest, ptrdiff_t line_size, int16_t *block);