@@ -97,6 +97,9 @@ static const struct algo idct_tab_arch[] = {
#endif
#endif
#endif
+#if HAVE_SSE2_EXTERNAL
+ { "SIMPLE-SSE2", ff_simple_idct_sse2, FF_IDCT_PERM_SIMPLE, AV_CPU_FLAG_SSE2 },
+#endif
{ 0 }
};
@@ -92,6 +92,7 @@ av_cold void ff_idctdsp_init_x86(IDCTDSPContext *c, AVCodecContext *avctx,
avctx->idct_algo == FF_IDCT_SIMPLEMMX)) {
c->idct_put = ff_simple_idct_put_sse2;
c->idct_add = ff_simple_idct_add_sse2;
+ c->idct = ff_simple_idct_sse2;
c->perm_type = FF_IDCT_PERM_SIMPLE;
}
}
@@ -151,6 +151,10 @@ SECTION .text
psrad m2, %7
packssdw m7, m1 ; A1+B1 a1+b1 A0+B0 a0+b0
packssdw m2, m4 ; A0-B0 a0-b0 A1-B1 a1-b1
+%if mmsize == 16
+pshufd m7, m7, q0020 ; q0020 (== 8): source dwords 0 and 2 -> low qword, the half movq stores
+pshufd m2, m2, q0020 ; same repack for the A-B result
+%endif
movq [%5], m7
movq m1, [blockq + %3] ; R3 R1 r3 r1
movq m4, [coeffs + 80] ; -C1 C5 -C1 C5
@@ -172,9 +176,15 @@ SECTION .text
psubd m4, m3 ; a3-B3 a3-b3
psrad m6, %7
packssdw m2, m6 ; A3+B3 a3+b3 A2+B2 a2+b2
+%if mmsize == 16
+pshufd m2, m2, q0020 ; q0020 (== 8): source dwords 0 and 2 -> low qword, the half movq stores
+%endif
movq [8 + %5], m2
psrad m4, %7
packssdw m4, m0 ; A2-B2 a2-b2 A3-B3 a3-b3
+%if mmsize == 16
+pshufd m4, m4, q0020 ; q0020 (== 8): source dwords 0 and 2 -> low qword, the half movq stores
+%endif
movq [16 + %5], m4
jmp %%2
%%1:
@@ -182,6 +192,9 @@ SECTION .text
paddd m0, [d40000]
psrad m0, 13
packssdw m0, m0
+%if mmsize == 16
+pshufd m0, m0, q0020 ; q0020 (== 8): source dwords 0 and 2 -> low qword, reused by the three movq stores
+%endif
movq [%5], m0
movq [8 + %5], m0
movq [16 + %5], m0
@@ -239,6 +252,10 @@ SECTION .text
psrad m2, %7
packssdw m7, m1 ; A1+B1 a1+b1 A0+B0 a0+b0
packssdw m2, m4 ; A0-B0 a0-b0 A1-B1 a1-b1
+%if mmsize == 16
+pshufd m7, m7, q0020 ; q0020 (== 8): source dwords 0 and 2 -> low qword, the half movq stores
+pshufd m2, m2, q0020 ; same repack for the A-B result
+%endif
movq [%5], m7
movq m1, [blockq + %3] ; R3 R1 r3 r1
movq m4, [coeffs + 80] ; -C1 C5 -C1 C5
@@ -260,9 +277,15 @@ SECTION .text
psubd m4, m3 ; a3-B3 a3-b3
psrad m6, %7
packssdw m2, m6 ; A3+B3 a3+b3 A2+B2 a2+b2
+%if mmsize == 16
+pshufd m2, m2, q0020 ; q0020 (== 8): source dwords 0 and 2 -> low qword, the half movq stores
+%endif
movq [8 + %5], m2
psrad m4, %7
packssdw m4, m0 ; A2-B2 a2-b2 A3-B3 a3-b3
+%if mmsize == 16
+pshufd m4, m4, q0020 ; q0020 (== 8): source dwords 0 and 2 -> low qword, the half movq stores
+%endif
movq [16 + %5], m4
%endmacro
@@ -614,9 +637,15 @@ SECTION .text
psrad m7, %6
psrad m3, %6
packssdw m4, m7 ; A0 a0
+%if mmsize == 16
+pshufd m4, m4, q0020 ; source dwords 0 and 2 -> low qword, the half the movq stores write out
+%endif
movq [%5], m4
psrad m0, %6
packssdw m0, m3 ; A1 a1
+%if mmsize == 16
+pshufd m0, m0, q0020 ; source dwords 0 and 2 -> low qword, the half the movq stores write out
+%endif
movq [16 + %5], m0
movq [96 + %5], m0
movq [112 + %5], m4
@@ -624,9 +653,15 @@ SECTION .text
psrad m6, %6
psrad m2, %6
packssdw m5, m2 ; A2-B2 a2-b2
+%if mmsize == 16
+pshufd m5, m5, q0020 ; source dwords 0 and 2 -> low qword, the half the movq stores write out
+%endif
movq [32 + %5], m5
psrad m1, %6
packssdw m6, m1 ; A3+B3 a3+b3
+%if mmsize == 16
+pshufd m6, m6, q0020 ; source dwords 0 and 2 -> low qword, the half the movq stores write out
+%endif
movq [48 + %5], m6
movq [64 + %5], m6
movq [80 + %5], m5
@@ -711,9 +746,15 @@ SECTION .text
movq m7, [coeffs + 32] ; C6 C2 C6 C2
psrad m1, %6
packssdw m4, m1 ; A0 a0
+%if mmsize == 16
+pshufd m4, m4, q0020 ; q0020 (== 8): source dwords 0 and 2 -> low qword, the half movq stores
+%endif
movq [%5], m4
psrad m2, %6
packssdw m0, m2 ; A1 a1
+%if mmsize == 16
+pshufd m0, m0, q0020 ; q0020 (== 8): source dwords 0 and 2 -> low qword, the half movq stores
+%endif
movq [16 + %5], m0
movq [96 + %5], m0
movq [112 + %5], m4
@@ -889,6 +930,10 @@ RET
INIT_XMM sse2
+cglobal simple_idct, 1, 2, 8, 128, block, t0 ; x86inc: 1 arg (block), 2 GPRs, 8 vector regs, 128 bytes of stack
+ IDCT ; expands the IDCT macro; no pixel put/add code follows it in this entry point
+RET
+
cglobal simple_idct_put, 3, 5, 8, 128, pixels, lsize, block, lsize3, t0
IDCT
lea lsize3q, [lsizeq*3]
@@ -26,6 +26,7 @@ void ff_simple_idct_mmx(int16_t *block);
void ff_simple_idct_add_mmx(uint8_t *dest, ptrdiff_t line_size, int16_t *block);
void ff_simple_idct_put_mmx(uint8_t *dest, ptrdiff_t line_size, int16_t *block);
+void ff_simple_idct_sse2(int16_t *block);
void ff_simple_idct_add_sse2(uint8_t *dest, ptrdiff_t line_size, int16_t *block);
void ff_simple_idct_put_sse2(uint8_t *dest, ptrdiff_t line_size, int16_t *block);