[FFmpeg-devel,V2,2/2] libswscale/x86/yuv2rgb: add ssse3 version

Submitted by Ting Fu on Dec. 2, 2019, 3:12 a.m.

Details

Message ID 20191202031242.13641-2-ting.fu@intel.com
State New
Headers show

Commit Message

Ting Fu Dec. 2, 2019, 3:12 a.m.
Tested using this command:
/ffmpeg -pix_fmt yuv420p -s 1920*1080 -i ArashRawYuv420.yuv \
-vcodec rawvideo -s 1920*1080 -pix_fmt rgb24 -f null /dev/null

The fps increases from 389 to 640 on my local machine.

Signed-off-by: Ting Fu <ting.fu@intel.com>
---
 libswscale/x86/yuv2rgb.c          |   8 +-
 libswscale/x86/yuv2rgb_template.c |  58 ++++++++++-
 libswscale/x86/yuv_2_rgb.asm      | 162 +++++++++++++++++++++++++++---
 3 files changed, 209 insertions(+), 19 deletions(-)

Comments

Michael Niedermayer Dec. 3, 2019, 8:10 a.m.
On Mon, Dec 02, 2019 at 11:12:42AM +0800, Ting Fu wrote:
> Tested using this command:
> /ffmpeg -pix_fmt yuv420p -s 1920*1080 -i ArashRawYuv420.yuv \
> -vcodec rawvideo -s 1920*1080 -pix_fmt rgb24 -f null /dev/null
> 
> The fps increase from 389 to 640 on my local machine.
> 
> Signed-off-by: Ting Fu <ting.fu@intel.com>
> ---
>  libswscale/x86/yuv2rgb.c          |   8 +-
>  libswscale/x86/yuv2rgb_template.c |  58 ++++++++++-
>  libswscale/x86/yuv_2_rgb.asm      | 162 +++++++++++++++++++++++++++---
>  3 files changed, 209 insertions(+), 19 deletions(-)

one of these patches seems to produce new warnings like:
libswscale/x86/yuv2rgb_template.c: In function ‘yuv420_rgb15’:
libswscale/x86/yuv2rgb_template.c:113:5: warning: passing argument 5 of ‘ff_yuv_420_rgb15_ssse3’ from 



[...]
Ting Fu Dec. 4, 2019, 3:03 a.m.
> -----Original Message-----

> From: ffmpeg-devel <ffmpeg-devel-bounces@ffmpeg.org> On Behalf Of

> Michael Niedermayer

> Sent: Tuesday, December 3, 2019 04:11 PM

> To: FFmpeg development discussions and patches <ffmpeg-devel@ffmpeg.org>

> Subject: Re: [FFmpeg-devel] [PATCH V2 2/2] libswscale/x86/yuv2rgb: add ssse3

> version

> 

> On Mon, Dec 02, 2019 at 11:12:42AM +0800, Ting Fu wrote:

> > Tested using this command:

> > /ffmpeg -pix_fmt yuv420p -s 1920*1080 -i ArashRawYuv420.yuv \ -vcodec

> > rawvideo -s 1920*1080 -pix_fmt rgb24 -f null /dev/null

> >

> > The fps increase from 389 to 640 on my local machine.

> >

> > Signed-off-by: Ting Fu <ting.fu@intel.com>

> > ---

> >  libswscale/x86/yuv2rgb.c          |   8 +-

> >  libswscale/x86/yuv2rgb_template.c |  58 ++++++++++-

> >  libswscale/x86/yuv_2_rgb.asm      | 162 +++++++++++++++++++++++++++---

> >  3 files changed, 209 insertions(+), 19 deletions(-)

> 

> one of these patches seems to produce new warnings like:

> libswscale/x86/yuv2rgb_template.c: In function ‘yuv420_rgb15’:

> libswscale/x86/yuv2rgb_template.c:113:5: warning: passing argument 5 of

> ‘ff_yuv_420_rgb15_ssse3’ from

> 


Hi Michael,

This warning occurs because the type of one formal parameter in ff_yuv_420_rgbXX_XXXX() was declared as uint8_t,
but it is actually uint64_t. I have corrected it in PATCH V3.
Thank you for your review; I will pay more attention to warnings in the future.

Ting Fu

> 

> 

> [...]

> --

> Michael     GnuPG fingerprint: 9FF2128B147EF6730BADF133611EC787040B0FAB

> 

> The real ebay dictionary, page 2

> "100% positive feedback" - "All either got their money back or didnt complain"

> "Best seller ever, very honest" - "Seller refunded buyer after failed scam"

Patch hide | download patch | download mbox

diff --git a/libswscale/x86/yuv2rgb.c b/libswscale/x86/yuv2rgb.c
index ed9b613cab..b83dd7089a 100644
--- a/libswscale/x86/yuv2rgb.c
+++ b/libswscale/x86/yuv2rgb.c
@@ -61,13 +61,19 @@  DECLARE_ASM_CONST(8, uint64_t, pb_07) = 0x0707070707070707ULL;
 #define COMPILE_TEMPLATE_MMXEXT 1
 #endif /* HAVE_MMXEXT */
 
+//SSSE3 versions
+#if HAVE_SSSE3
+#define COMPILE_TEMPLATE_SSSE3 1
+#endif
+
 #include "yuv2rgb_template.c"
 
 av_cold SwsFunc ff_yuv2rgb_init_x86(SwsContext *c)
 {
     int cpu_flags = av_get_cpu_flags();
 
-    if (EXTERNAL_MMX(cpu_flags) || EXTERNAL_MMXEXT(cpu_flags)) {
+    if (EXTERNAL_MMX(cpu_flags) || EXTERNAL_MMXEXT(cpu_flags) ||
+            EXTERNAL_SSSE3(cpu_flags)) {
         switch (c->dstFormat) {
         case AV_PIX_FMT_RGB32:
             if (c->srcFormat == AV_PIX_FMT_YUVA420P) {
diff --git a/libswscale/x86/yuv2rgb_template.c b/libswscale/x86/yuv2rgb_template.c
index efe6356f30..fe586047f0 100644
--- a/libswscale/x86/yuv2rgb_template.c
+++ b/libswscale/x86/yuv2rgb_template.c
@@ -40,6 +40,30 @@ 
         const uint8_t *pv = src[2] +   (y >> vshift) * srcStride[2]; \
         x86_reg index = -h_size / 2;                                 \
 
+extern void ff_yuv_420_rgb24_ssse3(x86_reg index, uint8_t *image, const uint8_t *pu_index,
+                                   const uint8_t *pv_index, const uint8_t *pointer_c_dither,
+                                   const uint8_t *py_2index);
+extern void ff_yuv_420_bgr24_ssse3(x86_reg index, uint8_t *image, const uint8_t *pu_index,
+                                   const uint8_t *pv_index, const uint8_t *pointer_c_dither,
+                                   const uint8_t *py_2index);
+extern void ff_yuv_420_rgb15_ssse3(x86_reg index, uint8_t *image, const uint8_t *pu_index,
+                                   const uint8_t *pv_index, const uint8_t *pointer_c_dither,
+                                   const uint8_t *py_2index);
+extern void ff_yuv_420_rgb16_ssse3(x86_reg index, uint8_t *image, const uint8_t *pu_index,
+                                   const uint8_t *pv_index, const uint8_t *pointer_c_dither,
+                                   const uint8_t *py_2index);
+extern void ff_yuv_420_rgb32_ssse3(x86_reg index, uint8_t *image, const uint8_t *pu_index,
+                                   const uint8_t *pv_index, const uint8_t *pointer_c_dither,
+                                   const uint8_t *py_2index);
+extern void ff_yuv_420_bgr32_ssse3(x86_reg index, uint8_t *image, const uint8_t *pu_index,
+                                   const uint8_t *pv_index, const uint8_t *pointer_c_dither,
+                                   const uint8_t *py_2index);
+extern void ff_yuva_420_rgb32_ssse3(x86_reg index, uint8_t *image, const uint8_t *pu_index,
+                                    const uint8_t *pv_index, const uint8_t *pointer_c_dither,
+                                    const uint8_t *py_2index, const uint8_t *pa_2index);
+extern void ff_yuva_420_bgr32_ssse3(x86_reg index, uint8_t *image, const uint8_t *pu_index,
+                                    const uint8_t *pv_index, const uint8_t *pointer_c_dither,
+                                    const uint8_t *py_2index, const uint8_t *pa_2index);
 extern void ff_yuv_420_rgb24_mmxext(x86_reg index, uint8_t *image, const uint8_t *pu_index,
                                     const uint8_t *pv_index, const uint8_t *pointer_c_dither,
                                     const uint8_t *py_2index);
@@ -84,7 +108,12 @@  static inline int yuv420_rgb15(SwsContext *c, const uint8_t *src[],
     c->greenDither = ff_dither8[y       & 1];
     c->redDither   = ff_dither8[(y + 1) & 1];
 #endif
+
+#if COMPILE_TEMPLATE_SSSE3
+    ff_yuv_420_rgb15_ssse3(index, image, pu - index, pv - index, &(c->redDither), py - 2 * index);
+#else
     ff_yuv_420_rgb15_mmx(index, image, pu - index, pv - index, &(c->redDither), py - 2 * index);
+#endif
     }
     return srcSliceH;
 }
@@ -102,7 +131,12 @@  static inline int yuv420_rgb16(SwsContext *c, const uint8_t *src[],
     c->greenDither = ff_dither4[y       & 1];
     c->redDither   = ff_dither8[(y + 1) & 1];
 #endif
+
+#if COMPILE_TEMPLATE_SSSE3
+    ff_yuv_420_rgb16_ssse3(index, image, pu - index, pv - index, &(c->redDither), py - 2 * index);
+#else
     ff_yuv_420_rgb16_mmx(index, image, pu - index, pv - index, &(c->redDither), py - 2 * index);
+#endif
     }
     return srcSliceH;
 }
@@ -115,7 +149,9 @@  static inline int yuv420_rgb24(SwsContext *c, const uint8_t *src[],
     int y, h_size, vshift;
     YUV2RGB_LOOP(3)
 
-#if COMPILE_TEMPLATE_MMXEXT
+#if COMPILE_TEMPLATE_SSSE3
+    ff_yuv_420_rgb24_ssse3(index, image, pu - index, pv - index, &(c->redDither), py - 2 * index);
+#elif COMPILE_TEMPLATE_MMXEXT
     ff_yuv_420_rgb24_mmxext(index, image, pu - index, pv - index, &(c->redDither), py - 2 * index);
 #else
     ff_yuv_420_rgb24_mmx(index, image, pu - index, pv - index, &(c->redDither), py - 2 * index);
@@ -132,7 +168,9 @@  static inline int yuv420_bgr24(SwsContext *c, const uint8_t *src[],
     int y, h_size, vshift;
     YUV2RGB_LOOP(3)
 
-#if COMPILE_TEMPLATE_MMXEXT
+#if COMPILE_TEMPLATE_SSSE3
+    ff_yuv_420_bgr24_ssse3(index, image, pu - index, pv - index, &(c->redDither), py - 2 * index);
+#elif COMPILE_TEMPLATE_MMXEXT
     ff_yuv_420_bgr24_mmxext(index, image, pu - index, pv - index, &(c->redDither), py - 2 * index);
 #else
     ff_yuv_420_bgr24_mmx(index, image, pu - index, pv - index, &(c->redDither), py - 2 * index);
@@ -149,7 +187,11 @@  static inline int yuv420_rgb32(SwsContext *c, const uint8_t *src[],
     int y, h_size, vshift;
     YUV2RGB_LOOP(4)
 
+#if COMPILE_TEMPLATE_SSSE3
+    ff_yuv_420_rgb32_ssse3(index, image, pu - index, pv - index, &(c->redDither), py - 2 * index);
+#else
     ff_yuv_420_rgb32_mmx(index, image, pu - index, pv - index, &(c->redDither), py - 2 * index);
+#endif
     }
     return srcSliceH;
 }
@@ -162,7 +204,11 @@  static inline int yuv420_bgr32(SwsContext *c, const uint8_t *src[],
     int y, h_size, vshift;
     YUV2RGB_LOOP(4)
 
+#if COMPILE_TEMPLATE_SSSE3
+    ff_yuv_420_bgr32_ssse3(index, image, pu - index, pv - index, &(c->redDither), py - 2 * index);
+#else
     ff_yuv_420_bgr32_mmx(index, image, pu - index, pv - index, &(c->redDither), py - 2 * index);
+#endif
     }
     return srcSliceH;
 }
@@ -176,7 +222,11 @@  static inline int yuva420_rgb32(SwsContext *c, const uint8_t *src[],
     YUV2RGB_LOOP(4)
 
     const uint8_t *pa = src[3] + y * srcStride[3];
+#if COMPILE_TEMPLATE_SSSE3
+    ff_yuva_420_rgb32_ssse3(index, image, pu - index, pv - index, &(c->redDither), py - 2 * index, pa - 2 * index);
+#else
     ff_yuva_420_rgb32_mmx(index, image, pu - index, pv - index, &(c->redDither), py - 2 * index, pa - 2 * index);
+#endif
     }
     return srcSliceH;
 }
@@ -190,7 +240,11 @@  static inline int yuva420_bgr32(SwsContext *c, const uint8_t *src[],
     YUV2RGB_LOOP(4)
 
     const uint8_t *pa = src[3] + y * srcStride[3];
+#if COMPILE_TEMPLATE_SSSE3
+    ff_yuva_420_bgr32_ssse3(index, image, pu - index, pv - index, &(c->redDither), py - 2 * index, pa - 2 * index);
+#else
     ff_yuva_420_bgr32_mmx(index, image, pu - index, pv - index, &(c->redDither), py - 2 * index, pa - 2 * index);
+#endif
     }
     return srcSliceH;
 }
diff --git a/libswscale/x86/yuv_2_rgb.asm b/libswscale/x86/yuv_2_rgb.asm
index a44ab1607b..723d6e573e 100644
--- a/libswscale/x86/yuv_2_rgb.asm
+++ b/libswscale/x86/yuv_2_rgb.asm
@@ -25,11 +25,21 @@ 
 
 SECTION_RODATA
 
-pw_00ff: times 4 dw 255
-pb_f8:   times 8 db 248
-pb_e0:   times 8 db 224
-pb_03:   times 8 db 3
-pb_07:   times 8 db 7
+; The masks below are named mask_dwXYtoZW: each one shuffles source dwords No.X & No.Y into destination dwords No.Z & No.W
+mask_evenword:   db  2,  3,  6,  7, 10, 11, 14, 15, -1, -1, -1, -1, -1, -1, -1, -1
+mask_oddword:    db  0,  1,  4,  5,  8,  9, 12, 13, -1, -1, -1, -1, -1, -1, -1, -1
+mask_dw01to03:   db  0,  1,  2,  3, -1, -1, -1, -1, -1, -1, -1, -1,  4,  5,  6,  7
+mask_dw01to12:   db -1, -1, -1, -1,  0,  1,  2,  3,  4,  5,  6,  7, -1, -1, -1, -1
+mask_dw2to2:     db -1, -1, -1, -1, -1, -1, -1, -1,  8,  9, 10, 11, -1, -1, -1, -1
+mask_dw23to01:   db  8,  9, 10, 11, 12, 13, 14, 15, -1, -1, -1, -1, -1, -1, -1, -1
+mask_dw0to3:     db -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,  0,  1,  2,  3
+mask_dw3to1:     db -1, -1, -1, -1, 12, 13, 14, 15, -1, -1, -1, -1, -1, -1, -1, -1
+mask_dw123to023: db  4,  5,  6,  7, -1, -1, -1, -1,  8,  9, 10, 11, 12, 13, 14, 15
+pw_00ff: times 8 dw 255
+pb_f8:   times 16 db 248
+pb_e0:   times 16 db 224
+pb_03:   times 16 db 3
+pb_07:   times 16 db 7
 
 mask_1101: dw -1, -1,  0, -1
 mask_0010: dw  0,  0, -1,  0
@@ -49,7 +59,11 @@  SECTION .text
 ;-----------------------------------------------------------------------------
 
 %macro MOV_H2L 1
-psrlq %1, 32
+%if mmsize == 8
+    psrlq %1, 32
+%else ; mmsize == 16
+    psrldq %1, 8
+%endif
 %endmacro
 
 %macro yuv2rgb_fn 3
@@ -77,6 +91,7 @@  psrlq %1, 32
 %define m_blue m1
 %endif
 
+%if mmsize == 8
 %define time_num 1
 %define reg_num 8
 %define y_offset [pointer_c_ditherq + 8  * 8]
@@ -87,11 +102,45 @@  psrlq %1, 32
 %define y_coff   [pointer_c_ditherq + 3  * 8]
 %define ub_coff  [pointer_c_ditherq + 5  * 8]
 %define vr_coff  [pointer_c_ditherq + 4  * 8]
+%elif mmsize == 16
+%define time_num 2
+%if ARCH_X86_32
+%define reg_num 8
+%define my_offset [pointer_c_ditherq + 8  * 8]
+%define mu_offset [pointer_c_ditherq + 9  * 8]
+%define mv_offset [pointer_c_ditherq + 10 * 8]
+%define mug_coff  [pointer_c_ditherq + 7  * 8]
+%define mvg_coff  [pointer_c_ditherq + 6  * 8]
+%define my_coff   [pointer_c_ditherq + 3  * 8]
+%define mub_coff  [pointer_c_ditherq + 5  * 8]
+%define mvr_coff  [pointer_c_ditherq + 4  * 8]
+%else ; ARCH_X86_64
+%define reg_num 16
+%define y_offset m8
+%define u_offset m9
+%define v_offset m10
+%define ug_coff  m11
+%define vg_coff  m12
+%define y_coff   m13
+%define ub_coff  m14
+%define vr_coff  m15
+%endif ; ARCH_X86_32/64
+%endif ; coeff define mmsize == 8/16
 
 cglobal %1_420_%2%3, GPR_num, GPR_num, reg_num, parameters
 
 %if ARCH_X86_64
     movsxd indexq, indexd
+%if mmsize == 16
+    VBROADCASTSD y_offset, [pointer_c_ditherq + 8  * 8]
+    VBROADCASTSD u_offset, [pointer_c_ditherq + 9  * 8]
+    VBROADCASTSD v_offset, [pointer_c_ditherq + 10 * 8]
+    VBROADCASTSD ug_coff,  [pointer_c_ditherq + 7  * 8]
+    VBROADCASTSD vg_coff,  [pointer_c_ditherq + 6  * 8]
+    VBROADCASTSD y_coff,   [pointer_c_ditherq + 3  * 8]
+    VBROADCASTSD ub_coff,  [pointer_c_ditherq + 5  * 8]
+    VBROADCASTSD vr_coff,  [pointer_c_ditherq + 4  * 8]
+%endif
 %endif
     mova m_y, [py_2indexq + 2 * indexq]
     movh m_u, [pu_indexq  +     indexq]
@@ -108,10 +157,32 @@  cglobal %1_420_%2%3, GPR_num, GPR_num, reg_num, parameters
     psllw m1, 3
     psllw m6, 3
     psllw m7, 3
+%if (ARCH_X86_32 && mmsize == 16)
+    vpbroadcastq m2, mu_offset
+    vpbroadcastq m3, mv_offset
+    vpbroadcastq m4, my_offset
+    psubsw m0, m2 ; U = U - 128
+    psubsw m1, m3 ; V = V - 128
+    psubw  m6, m4
+    psubw  m7, m4
+    mova m2, m0
+    mova m3, m1
+    vpbroadcastq m4, mug_coff
+    vpbroadcastq m5, mvg_coff
+    pmulhw m2, m4
+    pmulhw m3, m5
+    vpbroadcastq m4, my_coff
+    vpbroadcastq m5, mub_coff
+    pmulhw m6, m4
+    pmulhw m7, m4
+    pmulhw m0, m5
+    vpbroadcastq m4, mvr_coff
+    pmulhw m1, m4
+%else ; ARCH_X86_64 || mmsize == 8
     psubsw m0, u_offset ; U = U - 128
     psubsw m1, v_offset ; V = V - 128
-    psubw m6, y_offset
-    psubw m7, y_offset
+    psubw  m6, y_offset
+    psubw  m7, y_offset
     mova m2, m0
     mova m3, m1
     pmulhw m2, ug_coff
@@ -120,6 +191,7 @@  cglobal %1_420_%2%3, GPR_num, GPR_num, reg_num, parameters
     pmulhw m7, y_coff
     pmulhw m0, ub_coff
     pmulhw m1, vr_coff
+%endif
     paddsw m2, m3
     mova m3, m7
     mova m5, m7
@@ -142,6 +214,7 @@  cglobal %1_420_%2%3, GPR_num, GPR_num, reg_num, parameters
     punpcklbw m6, m_red  ; B0 R1 B2 R3 B4 R5 B6 R7 B8 R9 ...
     mova m5, m3
     punpckhbw m2, m_blue ; G1 B1 G3 B3 G5 B5 G7 B7 G9 B9 ...
+%if  mmsize == 8
     punpcklwd m3 ,m6     ; R0 G0 B0 R1 R2 G2 B2 R3
     punpckhwd m5, m6     ; R4 G4 B4 R5 R6 G6 B6 R7
 %if cpuflag(mmxext)
@@ -177,7 +250,45 @@  cglobal %1_420_%2%3, GPR_num, GPR_num, reg_num, parameters
     psrlq m5, 32
     movd [imageq + 20], m2 ; -- -- G7 B7
     movd [imageq + 18], m5 ; R6 G6 B6 R7
-%endif
+%endif ; mmsize = 8
+%else ; mmsize == 16
+    mova m0, m3
+    pshufb m0, [mask_evenword] ; R2 G2 R6 G6 R10 G10 R14 G14 -- -- -- -- -- -- -- --
+    mova m1, m2
+    pshufb m1, [mask_oddword]  ; G1 B1 G5 B5 G9 B9 G13 B13 -- -- -- -- -- -- -- --
+    punpcklwd m1, m0           ; G1 B1 R2 G2 G5 B5 R6 G6 G9 B9 R10 G10 G13 B13 R14 G14
+    mova m0,m6
+    pshufb m0, [mask_evenword] ; B2 R3 B6 R7 B10 R11 B14 R15 -- -- -- -- -- -- -- --
+    mova m4, m2
+    pshufb m4, [mask_evenword] ; G3 B3 G7 B7 G11 B11 G15 B15 -- -- -- -- -- -- -- --
+    punpcklwd m0, m4
+    pshufb m3, [mask_oddword]  ; R0 G0 R4 G4 R8 G8 R12 G12 -- -- -- -- -- -- -- --
+    pshufb m6, [mask_oddword]  ; B0 R1 B4 R5 B8 R9 B12 R13 -- -- -- -- -- -- -- --
+    mova m5, m0
+    mova m7, m1
+    punpcklwd m3, m6 ; R0  G0  B0  R1  R4  G4  B4  R5  R8  G8  B8  R9  R12 G12 B12 R13
+    punpckldq m7, m5 ; G1  B1  R2  G2  B2  R3  G3  B3  G5  B5  R6  G6  B6  R7  G7  B7
+    punpckhdq m1, m0 ; G9  B9  R10 G10 B10 R11 G11 B11 G13 B13 R14 G14 B14 R15 G15 B15
+    mova m0, m3
+    mova m2, m7
+    pshufb m0, [mask_dw01to03]
+    pshufb m2, [mask_dw01to12]
+    por m0, m2 ; R0 G0 B0 R1 G1 B1 R2 G2 B2 R3 G3 B3 R4 G4 B4 R5
+    mova m2, m3
+    mova m4, m7
+    pshufb m2, [mask_dw2to2]
+    pshufb m4, [mask_dw23to01]
+    por m2, m4
+    mova m4, m1
+    pshufb m4, [mask_dw0to3]
+    por m2, m4 ; G5 B5 R6 G6 B6 R7 G7 B7 R8 G8 B8 R9 G9 B9 R10 G10
+    pshufb m3, [mask_dw3to1]
+    pshufb m1, [mask_dw123to023]
+    por m1, m3 ; B10 R11 G11 B11 R12 G12 B12 R13 G13 B13 R14 G14 B14 R15 G15 B15
+    mova [imageq], m0
+    mova [imageq + 16], m2
+    mova [imageq + 32], m1
+%endif ; mmsize = 16
 %else ; PACK RGB15/16/32
     packuswb m0, m1
     packuswb m3, m5
@@ -196,10 +307,10 @@  cglobal %1_420_%2%3, GPR_num, GPR_num, reg_num, parameters
 %endif
     mova m5, m_blue
     mova m6, m_red
-    punpckhbw m5, m_green
+    punpckhbw m5,     m_green
     punpcklbw m_blue, m_green
-    punpckhbw m6, m_alpha
-    punpcklbw m_red, m_alpha
+    punpckhbw m6,     m_alpha
+    punpcklbw m_red,  m_alpha
     mova m_green, m_blue
     mova m_alpha, m5
     punpcklwd m_blue, m_red
@@ -207,14 +318,23 @@  cglobal %1_420_%2%3, GPR_num, GPR_num, reg_num, parameters
     punpcklwd m5, m6
     punpckhwd m_alpha, m6
     mova [imageq + 0], m_blue
-    mova [imageq + 8 * time_num], m_green
+    mova [imageq + 8  * time_num], m_green
     mova [imageq + 16 * time_num], m5
     mova [imageq + 24 * time_num], m_alpha
 %else ; PACK RGB15/16
 %define depth 2
-%define blue_dither  [pointer_c_ditherq + 2 * 8]
-%define green_dither [pointer_c_ditherq + 1 * 8]
-%define red_dither   [pointer_c_ditherq + 0 * 8]
+%if cpuflag(ssse3)
+    %define red_dither m3
+    %define green_dither m4
+    %define blue_dither m5
+    VBROADCASTSD red_dither,   [pointer_c_ditherq + 0 * 8]
+    VBROADCASTSD green_dither, [pointer_c_ditherq + 1 * 8]
+    VBROADCASTSD blue_dither,  [pointer_c_ditherq + 2 * 8]
+%else ; cpuflag(mmx/mmxext)
+%define blue_dither  [pointer_c_ditherq + 2  * 8]
+%define green_dither [pointer_c_ditherq + 1  * 8]
+%define red_dither   [pointer_c_ditherq + 0  * 8]
+%endif
 %if %3 == 15
 %define gmask pb_03
 %define isRGB15 1
@@ -268,3 +388,13 @@  yuv2rgb_fn yuv,  rgb, 16
 INIT_MMX mmxext
 yuv2rgb_fn yuv, rgb, 24
 yuv2rgb_fn yuv, bgr, 24
+
+INIT_XMM ssse3
+yuv2rgb_fn yuv,  rgb, 24
+yuv2rgb_fn yuv,  bgr, 24
+yuv2rgb_fn yuv,  rgb, 32
+yuv2rgb_fn yuv,  bgr, 32
+yuv2rgb_fn yuva, rgb, 32
+yuv2rgb_fn yuva, bgr, 32
+yuv2rgb_fn yuv,  rgb, 15
+yuv2rgb_fn yuv,  rgb, 16