diff mbox

[FFmpeg-devel,V3,2/2] libswscale/x86/yuv2rgb: add ssse3 version

Message ID 20191204025952.14851-2-ting.fu@intel.com
State Superseded
Headers show

Commit Message

Ting Fu Dec. 4, 2019, 2:59 a.m. UTC
Tested using this command:
/ffmpeg -pix_fmt yuv420p -s 1920*1080 -i ArashRawYuv420.yuv \
-vcodec rawvideo -s 1920*1080 -pix_fmt rgb24 -f null /dev/null

The fps increase from 389 to 640 on my local machine.

Signed-off-by: Ting Fu <ting.fu@intel.com>
---
 libswscale/x86/yuv2rgb.c          |   8 +-
 libswscale/x86/yuv2rgb_template.c |  58 ++++++++++-
 libswscale/x86/yuv_2_rgb.asm      | 162 +++++++++++++++++++++++++++---
 3 files changed, 209 insertions(+), 19 deletions(-)

Comments

Ting Fu Dec. 9, 2019, 1:49 a.m. UTC | #1
> -----Original Message-----

> From: ffmpeg-devel <ffmpeg-devel-bounces@ffmpeg.org> On Behalf Of Ting Fu

> Sent: Wednesday, December 4, 2019 11:00 AM

> To: ffmpeg-devel@ffmpeg.org

> Subject: [FFmpeg-devel] [PATCH V3 2/2] libswscale/x86/yuv2rgb: add ssse3

> version

> 

> Tested using this command:

> /ffmpeg -pix_fmt yuv420p -s 1920*1080 -i ArashRawYuv420.yuv \ -vcodec

> rawvideo -s 1920*1080 -pix_fmt rgb24 -f null /dev/null

> 

> The fps increase from 389 to 640 on my local machine.

> 

> Signed-off-by: Ting Fu <ting.fu@intel.com>

> ---

>  libswscale/x86/yuv2rgb.c          |   8 +-

>  libswscale/x86/yuv2rgb_template.c |  58 ++++++++++-

>  libswscale/x86/yuv_2_rgb.asm      | 162 +++++++++++++++++++++++++++---

>  3 files changed, 209 insertions(+), 19 deletions(-)

> 

[...]

Ping.

> _______________________________________________

> ffmpeg-devel mailing list

> ffmpeg-devel@ffmpeg.org

> https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

> 

> To unsubscribe, visit link above, or email ffmpeg-devel-request@ffmpeg.org

> with subject "unsubscribe".
Ting Fu Dec. 16, 2019, 3:30 p.m. UTC | #2
> -----Original Message-----

> From: ffmpeg-devel <ffmpeg-devel-bounces@ffmpeg.org> On Behalf Of Fu,

> Ting

> Sent: Monday, December 9, 2019 09:49 AM

> To: FFmpeg development discussions and patches <ffmpeg-

> devel@ffmpeg.org>

> Subject: Re: [FFmpeg-devel] [PATCH V3 2/2] libswscale/x86/yuv2rgb: add

> ssse3 version

> 

> 

> 

> > -----Original Message-----

> > From: ffmpeg-devel <ffmpeg-devel-bounces@ffmpeg.org> On Behalf Of

> Ting

> > Fu

> > Sent: Wednesday, December 4, 2019 11:00 AM

> > To: ffmpeg-devel@ffmpeg.org

> > Subject: [FFmpeg-devel] [PATCH V3 2/2] libswscale/x86/yuv2rgb: add

> > ssse3 version

> >

> > Tested using this command:

> > /ffmpeg -pix_fmt yuv420p -s 1920*1080 -i ArashRawYuv420.yuv \ -vcodec

> > rawvideo -s 1920*1080 -pix_fmt rgb24 -f null /dev/null

> >

> > The fps increase from 389 to 640 on my local machine.

> >

> > Signed-off-by: Ting Fu <ting.fu@intel.com>

> > ---

> >  libswscale/x86/yuv2rgb.c          |   8 +-

> >  libswscale/x86/yuv2rgb_template.c |  58 ++++++++++-

> >  libswscale/x86/yuv_2_rgb.asm      | 162 +++++++++++++++++++++++++++---

> >  3 files changed, 209 insertions(+), 19 deletions(-)

> >

> [...]

> 

> Ping.


Ping?

> 

> > _______________________________________________

> > ffmpeg-devel mailing list

> > ffmpeg-devel@ffmpeg.org

> > https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

> >

> > To unsubscribe, visit link above, or email

> > ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".

> _______________________________________________

> ffmpeg-devel mailing list

> ffmpeg-devel@ffmpeg.org

> https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

> 

> To unsubscribe, visit link above, or email ffmpeg-devel-request@ffmpeg.org

> with subject "unsubscribe".
Henrik Gramner Dec. 17, 2019, 12:29 a.m. UTC | #3
On Wed, Dec 4, 2019 at 4:03 AM Ting Fu <ting.fu@intel.com> wrote:
> +    VBROADCASTSD y_offset, [pointer_c_ditherq + 8  * 8]
> +    VBROADCASTSD u_offset, [pointer_c_ditherq + 9  * 8]
> +    VBROADCASTSD v_offset, [pointer_c_ditherq + 10 * 8]
> +    VBROADCASTSD ug_coff,  [pointer_c_ditherq + 7  * 8]
> +    VBROADCASTSD vg_coff,  [pointer_c_ditherq + 6  * 8]
> +    VBROADCASTSD y_coff,   [pointer_c_ditherq + 3  * 8]
> +    VBROADCASTSD ub_coff,  [pointer_c_ditherq + 5  * 8]
> +    VBROADCASTSD vr_coff,  [pointer_c_ditherq + 4  * 8]
[...]
> +    vpbroadcastq m2, mu_offset
> +    vpbroadcastq m3, mv_offset
> +    vpbroadcastq m4, my_offset

VBROADCASTSD/vpbroadcastq -> movddup

> +    mova m2, m0
> +    mova m3, m1
> +    vpbroadcastq m4, mug_coff
> +    vpbroadcastq m5, mvg_coff
> +    pmulhw m2, m4
> +    pmulhw m3, m5

The register-register moves can be eliminated:
    movddup m2, mug_coff
    movddup m3, mvg_coff
    pmulhw m2, m0
    pmulhw m3, m1

> +    mova m0, m3
> +    pshufb m0, [mask_evenword] ; R2 G2 R6 G6 R10 G10 R14 G14 -- -- -- -- -- -- -- --
> +    mova m1, m2
> +    pshufb m1, [mask_oddword]  ; G1 B1 G5 B5 G9 B9 G13 B13 -- -- -- -- -- -- -- --
> +    punpcklwd m1, m0           ; G1 B1 R2 G2 G5 B5 R6 G6 G9 B9 R10 G10 G13 B13 R14 G14
> +    mova m0,m6
> +    pshufb m0, [mask_evenword] ; B2 R3 B6 R7 B10 R11 B14 R15 -- -- -- -- -- -- -- --
> +    mova m4, m2
> +    pshufb m4, [mask_evenword] ; G3 B3 G7 B7 G11 B11 G15 G15 -- -- -- -- -- -- -- --
> +    punpcklwd m0, m4
> +    pshufb m3, [mask_oddword]  ; R0 G0 R4 G4 R8 G8 R12 G12 -- -- -- -- -- -- -- --
> +    pshufb m6, [mask_oddword]  ; B0 R1 B4 R5 B8 R9 B12 R13 -- -- -- -- -- -- -- --
> +    mova m5, m0
> +    mova m7, m1
> +    punpcklwd m3, m6 ; R0  G0  B0  R1  R4  G4  B4  R5  R8  G9  B8  R9  R12 G12 B12 R13
> +    punpckldq m7, m5 ; G1  B1  R2  G2  B2  R3  G3  B3  G5  B5  R5  G5  B6  R7  G7  B7
> +    punpckhdq m1, m0 ; G9  B9  R10 G10 B10 R11 G11 B11 G13 B13 R14 G14 B14 R15 G15 B15
> +    mova m0, m3
> +    mova m2, m7
> +    pshufb m0, [mask_dw01to03] ; R0 G0 B0 R1 -- -- -- -- -- -- -- -- R4 G4 B4 R5
> +    pshufb m2, [mask_dw01to12] ; -- -- -- -- G1 B1 R2 G2 B2 R3 G3 B3 -- -- -- --
> +    por m0, m2                 ; R0 G0 B0 R1 G1 B1 R2 G2 B2 R3 G3 B3 R4 G4 B4 R5
> +    mova m2, m3
> +    mova m4, m7
> +    pshufb m2, [mask_dw2to2]   ; -- -- -- -- -- -- -- -- R8 G8 B8 R9 -- -- -- --
> +    pshufb m4, [mask_dw23to01] ; G5 B5 R6 G6 B6 R7 G7 B7 -- -- -- -- -- -- -- --
> +    por m2, m4
> +    mova m4, m1
> +    pshufb m4, [mask_dw0to3]   ; -- -- -- -- -- -- -- -- -- -- -- -- G9 B9 R10 G10
> +    por m2, m4                 ; G5 B5 R6 G6 B6 R7 G7 B7 R8 G8 B8 R9 G9 B9 R10 G10
> +    pshufb m3, [mask_dw3to1]     ; --- --- --- --- R12 G12 B12 R13 --- --- --- --- --- --- --- ---
> +    pshufb m1, [mask_dw123to023] ; B10 R11 G11 B11 --- --- --- --- G13 B13 R14 G14 B14 R15 G15 B15
> +    por m1, m3                   ; B10 R11 G11 B11 R12 G12 B12 R13 G13 B13 R14 G14 B14 R15 G15 B15

Probably faster to do fewer shuffles in favor of masking instead, e.g.
something along the lines of

rgb_shuf1: db  0,  1,  6,  7, 12, 13,  2,  3,  8,  9, 14, 15,  4,  5, 10, 11
rgb_shuf2: db 10, 11,  0,  1,  6,  7, 12, 13,  2,  3,  8,  9, 14, 15,  4,  5
rgb_shuf3: db  4,  5, 10, 11,  0,  1,  6,  7, 12, 13,  2,  3,  8,  9, 14, 15
rgb_mask1: db -1, -1,  0,  0,  0,  0, -1, -1,  0,  0,  0,  0, -1, -1,  0,  0
rgb_mask2: db  0,  0, -1, -1,  0,  0,  0,  0, -1, -1,  0,  0,  0,  0, -1, -1
rgb_mask3: db  0,  0,  0,  0, -1, -1,  0,  0,  0,  0, -1, -1,  0,  0,  0,  0
[...]
pshufb m3, [rgb_shuf1] ; r0  g0  r6  g6  r12 g12 r2  g2  r8  g8  r14 g14 r4  g4  r10 g10
pshufb m6, [rgb_shuf2] ; b10 r11 b0  r1  b6  r7  b12 r13 b2  r3  b8  r9  b14 r15 b4  r5
pshufb m2, [rgb_shuf3] ; g5  b5  g11 b11 g1  b1  g7  b7  g13 b13 g3  b3  g9  b9  g15 b15
mova   m7, [rgb_mask1]
mova   m4, [rgb_mask2]
mova   m5, [rgb_mask3]
pand   m0, m7, m3      ; r0  g0  ___ ___ ___ ___ r2  g2  ___ ___ ___ ___ r4  g4  ___ ___
pand   m1, m4, m6      ; ___ ___ b0  r1  ___ ___ ___ ___ b2  r3  ___ ___ ___ ___ b4  r5
por    m0, m1
pand   m1, m5, m2      ; ___ ___ ___ ___ g1  b1  ___ ___ ___ ___ g3  b3  ___ ___ ___ ___
por    m0, m1          ; r0  g0  b0  r1  g1  b1  r2  g2  b2  r3  g3  b3  r4  g4  b4  r5
pand   m1, m7, m2      ; g5  b5  ___ ___ ___ ___ g7  b7  ___ ___ ___ ___ g9  b9  ___ ___
pand   m7, m6          ; b10 r11 ___ ___ ___ ___ b12 r13 ___ ___ ___ ___ b14 r15 ___ ___
pand   m6, m5          ; ___ ___ ___ ___ b6  r7  ___ ___ ___ ___ b8  r9  ___ ___ ___ ___
por    m1, m6
pand   m6, m4, m3      ; ___ ___ r6  g6  ___ ___ ___ ___ r8  g8  ___ ___ ___ ___ r10 g10
pand   m2, m4          ; ___ ___ g11 b11 ___ ___ ___ ___ g13 b13 ___ ___ ___ ___ g15 b15
pand   m3, m5          ; ___ ___ ___ ___ r12 g12 ___ ___ ___ ___ r14 g14 ___ ___ ___ ___
por    m2, m7
por    m1, m6          ; g5  b5  r6  g6  b6  r7  g7  b7  r8  g8  b8  r9  g9  b9  r10 g10
por    m2, m3          ; b10 r11 g11 b11 r12 g12 b12 r13 g13 b13 r14 g14 b14 r15 g15 b15
diff mbox

Patch

diff --git a/libswscale/x86/yuv2rgb.c b/libswscale/x86/yuv2rgb.c
index ed9b613cab..b83dd7089a 100644
--- a/libswscale/x86/yuv2rgb.c
+++ b/libswscale/x86/yuv2rgb.c
@@ -61,13 +61,19 @@  DECLARE_ASM_CONST(8, uint64_t, pb_07) = 0x0707070707070707ULL;
 #define COMPILE_TEMPLATE_MMXEXT 1
 #endif /* HAVE_MMXEXT */
 
+//SSSE3 versions
+#if HAVE_SSSE3
+#define COMPILE_TEMPLATE_SSSE3 1
+#endif
+
 #include "yuv2rgb_template.c"
 
 av_cold SwsFunc ff_yuv2rgb_init_x86(SwsContext *c)
 {
     int cpu_flags = av_get_cpu_flags();
 
-    if (EXTERNAL_MMX(cpu_flags) || EXTERNAL_MMXEXT(cpu_flags)) {
+    if (EXTERNAL_MMX(cpu_flags) || EXTERNAL_MMXEXT(cpu_flags) ||
+            EXTERNAL_SSSE3(cpu_flags)) {
         switch (c->dstFormat) {
         case AV_PIX_FMT_RGB32:
             if (c->srcFormat == AV_PIX_FMT_YUVA420P) {
diff --git a/libswscale/x86/yuv2rgb_template.c b/libswscale/x86/yuv2rgb_template.c
index bcc8eb7602..97a3645b90 100644
--- a/libswscale/x86/yuv2rgb_template.c
+++ b/libswscale/x86/yuv2rgb_template.c
@@ -40,6 +40,30 @@ 
         const uint8_t *pv = src[2] +   (y >> vshift) * srcStride[2]; \
         x86_reg index = -h_size / 2;                                 \
 
+extern void ff_yuv_420_rgb24_ssse3(x86_reg index, uint8_t *image, const uint8_t *pu_index,
+                                   const uint8_t *pv_index, const uint64_t *pointer_c_dither,
+                                   const uint8_t *py_2index);
+extern void ff_yuv_420_bgr24_ssse3(x86_reg index, uint8_t *image, const uint8_t *pu_index,
+                                   const uint8_t *pv_index, const uint64_t *pointer_c_dither,
+                                   const uint8_t *py_2index);
+extern void ff_yuv_420_rgb15_ssse3(x86_reg index, uint8_t *image, const uint8_t *pu_index,
+                                   const uint8_t *pv_index, const uint64_t *pointer_c_dither,
+                                   const uint8_t *py_2index);
+extern void ff_yuv_420_rgb16_ssse3(x86_reg index, uint8_t *image, const uint8_t *pu_index,
+                                   const uint8_t *pv_index, const uint64_t *pointer_c_dither,
+                                   const uint8_t *py_2index);
+extern void ff_yuv_420_rgb32_ssse3(x86_reg index, uint8_t *image, const uint8_t *pu_index,
+                                   const uint8_t *pv_index, const uint64_t *pointer_c_dither,
+                                   const uint8_t *py_2index);
+extern void ff_yuv_420_bgr32_ssse3(x86_reg index, uint8_t *image, const uint8_t *pu_index,
+                                   const uint8_t *pv_index, const uint64_t *pointer_c_dither,
+                                   const uint8_t *py_2index);
+extern void ff_yuva_420_rgb32_ssse3(x86_reg index, uint8_t *image, const uint8_t *pu_index,
+                                    const uint8_t *pv_index, const uint64_t *pointer_c_dither,
+                                    const uint8_t *py_2index, const uint8_t *pa_2index);
+extern void ff_yuva_420_bgr32_ssse3(x86_reg index, uint8_t *image, const uint8_t *pu_index,
+                                    const uint8_t *pv_index, const uint64_t *pointer_c_dither,
+                                    const uint8_t *py_2index, const uint8_t *pa_2index);
 extern void ff_yuv_420_rgb24_mmxext(x86_reg index, uint8_t *image, const uint8_t *pu_index,
                                     const uint8_t *pv_index, const uint64_t *pointer_c_dither,
                                     const uint8_t *py_2index);
@@ -84,7 +108,12 @@  static inline int yuv420_rgb15(SwsContext *c, const uint8_t *src[],
     c->greenDither = ff_dither8[y       & 1];
     c->redDither   = ff_dither8[(y + 1) & 1];
 #endif
+
+#if COMPILE_TEMPLATE_SSSE3
+    ff_yuv_420_rgb15_ssse3(index, image, pu - index, pv - index, &(c->redDither), py - 2 * index);
+#else
     ff_yuv_420_rgb15_mmx(index, image, pu - index, pv - index, &(c->redDither), py - 2 * index);
+#endif
     }
     return srcSliceH;
 }
@@ -102,7 +131,12 @@  static inline int yuv420_rgb16(SwsContext *c, const uint8_t *src[],
     c->greenDither = ff_dither4[y       & 1];
     c->redDither   = ff_dither8[(y + 1) & 1];
 #endif
+
+#if COMPILE_TEMPLATE_SSSE3
+    ff_yuv_420_rgb16_ssse3(index, image, pu - index, pv - index, &(c->redDither), py - 2 * index);
+#else
     ff_yuv_420_rgb16_mmx(index, image, pu - index, pv - index, &(c->redDither), py - 2 * index);
+#endif
     }
     return srcSliceH;
 }
@@ -115,7 +149,9 @@  static inline int yuv420_rgb24(SwsContext *c, const uint8_t *src[],
     int y, h_size, vshift;
     YUV2RGB_LOOP(3)
 
-#if COMPILE_TEMPLATE_MMXEXT
+#if COMPILE_TEMPLATE_SSSE3
+    ff_yuv_420_rgb24_ssse3(index, image, pu - index, pv - index, &(c->redDither), py - 2 * index);
+#elif COMPILE_TEMPLATE_MMXEXT
     ff_yuv_420_rgb24_mmxext(index, image, pu - index, pv - index, &(c->redDither), py - 2 * index);
 #else
     ff_yuv_420_rgb24_mmx(index, image, pu - index, pv - index, &(c->redDither), py - 2 * index);
@@ -132,7 +168,9 @@  static inline int yuv420_bgr24(SwsContext *c, const uint8_t *src[],
     int y, h_size, vshift;
     YUV2RGB_LOOP(3)
 
-#if COMPILE_TEMPLATE_MMXEXT
+#if COMPILE_TEMPLATE_SSSE3
+    ff_yuv_420_bgr24_ssse3(index, image, pu - index, pv - index, &(c->redDither), py - 2 * index);
+#elif COMPILE_TEMPLATE_MMXEXT
     ff_yuv_420_bgr24_mmxext(index, image, pu - index, pv - index, &(c->redDither), py - 2 * index);
 #else
     ff_yuv_420_bgr24_mmx(index, image, pu - index, pv - index, &(c->redDither), py - 2 * index);
@@ -149,7 +187,11 @@  static inline int yuv420_rgb32(SwsContext *c, const uint8_t *src[],
     int y, h_size, vshift;
     YUV2RGB_LOOP(4)
 
+#if COMPILE_TEMPLATE_SSSE3
+    ff_yuv_420_rgb32_ssse3(index, image, pu - index, pv - index, &(c->redDither), py - 2 * index);
+#else
     ff_yuv_420_rgb32_mmx(index, image, pu - index, pv - index, &(c->redDither), py - 2 * index);
+#endif
     }
     return srcSliceH;
 }
@@ -162,7 +204,11 @@  static inline int yuv420_bgr32(SwsContext *c, const uint8_t *src[],
     int y, h_size, vshift;
     YUV2RGB_LOOP(4)
 
+#if COMPILE_TEMPLATE_SSSE3
+    ff_yuv_420_bgr32_ssse3(index, image, pu - index, pv - index, &(c->redDither), py - 2 * index);
+#else
     ff_yuv_420_bgr32_mmx(index, image, pu - index, pv - index, &(c->redDither), py - 2 * index);
+#endif
     }
     return srcSliceH;
 }
@@ -176,7 +222,11 @@  static inline int yuva420_rgb32(SwsContext *c, const uint8_t *src[],
     YUV2RGB_LOOP(4)
 
     const uint8_t *pa = src[3] + y * srcStride[3];
+#if COMPILE_TEMPLATE_SSSE3
+    ff_yuva_420_rgb32_ssse3(index, image, pu - index, pv - index, &(c->redDither), py - 2 * index, pa - 2 * index);
+#else
     ff_yuva_420_rgb32_mmx(index, image, pu - index, pv - index, &(c->redDither), py - 2 * index, pa - 2 * index);
+#endif
     }
     return srcSliceH;
 }
@@ -190,7 +240,11 @@  static inline int yuva420_bgr32(SwsContext *c, const uint8_t *src[],
     YUV2RGB_LOOP(4)
 
     const uint8_t *pa = src[3] + y * srcStride[3];
+#if COMPILE_TEMPLATE_SSSE3
+    ff_yuva_420_bgr32_ssse3(index, image, pu - index, pv - index, &(c->redDither), py - 2 * index, pa - 2 * index);
+#else
     ff_yuva_420_bgr32_mmx(index, image, pu - index, pv - index, &(c->redDither), py - 2 * index, pa - 2 * index);
+#endif
     }
     return srcSliceH;
 }
diff --git a/libswscale/x86/yuv_2_rgb.asm b/libswscale/x86/yuv_2_rgb.asm
index a44ab1607b..03eb5a62ea 100644
--- a/libswscale/x86/yuv_2_rgb.asm
+++ b/libswscale/x86/yuv_2_rgb.asm
@@ -25,11 +25,21 @@ 
 
 SECTION_RODATA
 
-pw_00ff: times 4 dw 255
-pb_f8:   times 8 db 248
-pb_e0:   times 8 db 224
-pb_03:   times 8 db 3
-pb_07:   times 8 db 7
+; The mask_dwXYtoZW constants below are pshufb masks that move source dwords X and Y to destination dwords Z and W.
+mask_evenword:   db  2,  3,  6,  7, 10, 11, 14, 15, -1, -1, -1, -1, -1, -1, -1, -1
+mask_oddword:    db  0,  1,  4,  5,  8,  9, 12, 13, -1, -1, -1, -1, -1, -1, -1, -1
+mask_dw01to03:   db  0,  1,  2,  3, -1, -1, -1, -1, -1, -1, -1, -1,  4,  5,  6,  7
+mask_dw01to12:   db -1, -1, -1, -1,  0,  1,  2,  3,  4,  5,  6,  7, -1, -1, -1, -1
+mask_dw2to2:     db -1, -1, -1, -1, -1, -1, -1, -1,  8,  9, 10, 11, -1, -1, -1, -1
+mask_dw23to01:   db  8,  9, 10, 11, 12, 13, 14, 15, -1, -1, -1, -1, -1, -1, -1, -1
+mask_dw0to3:     db -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,  0,  1,  2,  3
+mask_dw3to1:     db -1, -1, -1, -1, 12, 13, 14, 15, -1, -1, -1, -1, -1, -1, -1, -1
+mask_dw123to023: db  4,  5,  6,  7, -1, -1, -1, -1,  8,  9, 10, 11, 12, 13, 14, 15
+pw_00ff: times 8 dw 255
+pb_f8:   times 16 db 248
+pb_e0:   times 16 db 224
+pb_03:   times 16 db 3
+pb_07:   times 16 db 7
 
 mask_1101: dw -1, -1,  0, -1
 mask_0010: dw  0,  0, -1,  0
@@ -49,7 +59,11 @@  SECTION .text
 ;-----------------------------------------------------------------------------
 
 %macro MOV_H2L 1
-psrlq %1, 32
+%if mmsize == 8
+    psrlq %1, 32
+%else ; mmsize == 16
+    psrldq %1, 8
+%endif
 %endmacro
 
 %macro yuv2rgb_fn 3
@@ -77,6 +91,7 @@  psrlq %1, 32
 %define m_blue m1
 %endif
 
+%if mmsize == 8
 %define time_num 1
 %define reg_num 8
 %define y_offset [pointer_c_ditherq + 8  * 8]
@@ -87,11 +102,45 @@  psrlq %1, 32
 %define y_coff   [pointer_c_ditherq + 3  * 8]
 %define ub_coff  [pointer_c_ditherq + 5  * 8]
 %define vr_coff  [pointer_c_ditherq + 4  * 8]
+%elif mmsize == 16
+%define time_num 2
+%if ARCH_X86_32
+%define reg_num 8
+%define my_offset [pointer_c_ditherq + 8  * 8]
+%define mu_offset [pointer_c_ditherq + 9  * 8]
+%define mv_offset [pointer_c_ditherq + 10 * 8]
+%define mug_coff  [pointer_c_ditherq + 7  * 8]
+%define mvg_coff  [pointer_c_ditherq + 6  * 8]
+%define my_coff   [pointer_c_ditherq + 3  * 8]
+%define mub_coff  [pointer_c_ditherq + 5  * 8]
+%define mvr_coff  [pointer_c_ditherq + 4  * 8]
+%else ; ARCH_X86_64
+%define reg_num 16
+%define y_offset m8
+%define u_offset m9
+%define v_offset m10
+%define ug_coff  m11
+%define vg_coff  m12
+%define y_coff   m13
+%define ub_coff  m14
+%define vr_coff  m15
+%endif ; ARCH_X86_32/64
+%endif ; coeff define mmsize == 8/16
 
 cglobal %1_420_%2%3, GPR_num, GPR_num, reg_num, parameters
 
 %if ARCH_X86_64
     movsxd indexq, indexd
+%if mmsize == 16
+    VBROADCASTSD y_offset, [pointer_c_ditherq + 8  * 8]
+    VBROADCASTSD u_offset, [pointer_c_ditherq + 9  * 8]
+    VBROADCASTSD v_offset, [pointer_c_ditherq + 10 * 8]
+    VBROADCASTSD ug_coff,  [pointer_c_ditherq + 7  * 8]
+    VBROADCASTSD vg_coff,  [pointer_c_ditherq + 6  * 8]
+    VBROADCASTSD y_coff,   [pointer_c_ditherq + 3  * 8]
+    VBROADCASTSD ub_coff,  [pointer_c_ditherq + 5  * 8]
+    VBROADCASTSD vr_coff,  [pointer_c_ditherq + 4  * 8]
+%endif
 %endif
     mova m_y, [py_2indexq + 2 * indexq]
     movh m_u, [pu_indexq  +     indexq]
@@ -108,10 +157,32 @@  cglobal %1_420_%2%3, GPR_num, GPR_num, reg_num, parameters
     psllw m1, 3
     psllw m6, 3
     psllw m7, 3
+%if (ARCH_X86_32 && mmsize == 16)
+    movddup m2, mu_offset ; SSE3 64-bit broadcast; vpbroadcastq is AVX2-only and must not appear in an ssse3 function
+    movddup m3, mv_offset ; same broadcast for the V offset
+    movddup m4, my_offset ; same broadcast for the Y offset (shared by m6 and m7)
+    psubsw m0, m2 ; U = U - 128
+    psubsw m1, m3 ; V = V - 128
+    psubw  m6, m4 ; first Y vector minus the Y offset
+    psubw  m7, m4 ; second Y vector minus the Y offset
+    mova m2, m0   ; keep a copy of U for the green contribution
+    mova m3, m1   ; keep a copy of V for the green contribution
+    movddup m4, mug_coff ; m2/m3 offsets are consumed, so m4/m5 are free for coefficients
+    movddup m5, mvg_coff
+    pmulhw m2, m4 ; U * ug_coff (high 16 bits)
+    pmulhw m3, m5 ; V * vg_coff (high 16 bits)
+    movddup m4, my_coff
+    movddup m5, mub_coff
+    pmulhw m6, m4 ; Y * y_coff, first vector
+    pmulhw m7, m4 ; Y * y_coff, second vector
+    pmulhw m0, m5 ; U * ub_coff
+    movddup m4, mvr_coff
+    pmulhw m1, m4 ; V * vr_coff
+%else ; ARCH_X86_64 || mmsize == 8
     psubsw m0, u_offset ; U = U - 128
     psubsw m1, v_offset ; V = V - 128
-    psubw m6, y_offset
-    psubw m7, y_offset
+    psubw  m6, y_offset
+    psubw  m7, y_offset
     mova m2, m0
     mova m3, m1
     pmulhw m2, ug_coff
@@ -120,6 +191,7 @@  cglobal %1_420_%2%3, GPR_num, GPR_num, reg_num, parameters
     pmulhw m7, y_coff
     pmulhw m0, ub_coff
     pmulhw m1, vr_coff
+%endif
     paddsw m2, m3
     mova m3, m7
     mova m5, m7
@@ -142,6 +214,7 @@  cglobal %1_420_%2%3, GPR_num, GPR_num, reg_num, parameters
     punpcklbw m6, m_red  ; B0 R1 B2 R3 B4 R5 B6 R7 B8 R9 ...
     mova m5, m3
     punpckhbw m2, m_blue ; G1 B1 G3 B3 G5 B5 G7 B7 G9 B9 ...
+%if  mmsize == 8
     punpcklwd m3 ,m6     ; R0 G0 B0 R1 R2 G2 B2 R3
     punpckhwd m5, m6     ; R4 G4 B4 R5 R6 G6 B6 R7
 %if cpuflag(mmxext)
@@ -177,7 +250,45 @@  cglobal %1_420_%2%3, GPR_num, GPR_num, reg_num, parameters
     psrlq m5, 32
     movd [imageq + 20], m2 ; -- -- G7 B7
     movd [imageq + 18], m5 ; R6 G6 B6 R7
-%endif
+%endif ; mmsize = 8
+%else ; mmsize == 16
+    mova m0, m3                ; work on a copy: m3 is shuffled again further down
+    pshufb m0, [mask_evenword] ; R2 G2 R6 G6 R10 G10 R14 G14 -- -- -- -- -- -- -- --
+    mova m1, m2                ; work on a copy: m2 is read again further down
+    pshufb m1, [mask_oddword]  ; G1 B1 G5 B5 G9 B9 G13 B13 -- -- -- -- -- -- -- --
+    punpcklwd m1, m0           ; G1 B1 R2 G2 G5 B5 R6 G6 G9 B9 R10 G10 G13 B13 R14 G14
+    mova m0,m6                 ; work on a copy: m6 is shuffled again further down
+    pshufb m0, [mask_evenword] ; B2 R3 B6 R7 B10 R11 B14 R15 -- -- -- -- -- -- -- --
+    mova m4, m2
+    pshufb m4, [mask_evenword] ; G3 B3 G7 B7 G11 B11 G15 B15 -- -- -- -- -- -- -- --
+    punpcklwd m0, m4           ; B2 R3 G3 B3 B6 R7 G7 B7 B10 R11 G11 B11 B14 R15 G15 B15
+    pshufb m3, [mask_oddword]  ; R0 G0 R4 G4 R8 G8 R12 G12 -- -- -- -- -- -- -- --
+    pshufb m6, [mask_oddword]  ; B0 R1 B4 R5 B8 R9 B12 R13 -- -- -- -- -- -- -- --
+    mova m5, m0                ; m5 = copy of the B/R/G/B dwords, used as punpckldq source
+    mova m7, m1                ; m7 = copy of the G/B/R/G dwords, receives the low-dword interleave
+    punpcklwd m3, m6 ; R0  G0  B0  R1  R4  G4  B4  R5  R8  G8  B8  R9  R12 G12 B12 R13
+    punpckldq m7, m5 ; G1  B1  R2  G2  B2  R3  G3  B3  G5  B5  R6  G6  B6  R7  G7  B7
+    punpckhdq m1, m0 ; G9  B9  R10 G10 B10 R11 G11 B11 G13 B13 R14 G14 B14 R15 G15 B15
+    mova m0, m3                ; m3 and m7 are each needed for more than one output vector
+    mova m2, m7
+    pshufb m0, [mask_dw01to03] ; R0 G0 B0 R1 -- -- -- -- -- -- -- -- R4 G4 B4 R5
+    pshufb m2, [mask_dw01to12] ; -- -- -- -- G1 B1 R2 G2 B2 R3 G3 B3 -- -- -- --
+    por m0, m2                 ; R0 G0 B0 R1 G1 B1 R2 G2 B2 R3 G3 B3 R4 G4 B4 R5
+    mova m2, m3
+    mova m4, m7
+    pshufb m2, [mask_dw2to2]   ; -- -- -- -- -- -- -- -- R8 G8 B8 R9 -- -- -- --
+    pshufb m4, [mask_dw23to01] ; G5 B5 R6 G6 B6 R7 G7 B7 -- -- -- -- -- -- -- --
+    por m2, m4                 ; G5 B5 R6 G6 B6 R7 G7 B7 R8 G8 B8 R9 -- -- -- --
+    mova m4, m1
+    pshufb m4, [mask_dw0to3]   ; -- -- -- -- -- -- -- -- -- -- -- -- G9 B9 R10 G10
+    por m2, m4                 ; G5 B5 R6 G6 B6 R7 G7 B7 R8 G8 B8 R9 G9 B9 R10 G10
+    pshufb m3, [mask_dw3to1]     ; --- --- --- --- R12 G12 B12 R13 --- --- --- --- --- --- --- ---
+    pshufb m1, [mask_dw123to023] ; B10 R11 G11 B11 --- --- --- --- G13 B13 R14 G14 B14 R15 G15 B15
+    por m1, m3                   ; B10 R11 G11 B11 R12 G12 B12 R13 G13 B13 R14 G14 B14 R15 G15 B15
+    mova [imageq], m0      ; output bytes  0..15 (R0 .. R5)
+    mova [imageq + 16], m2 ; output bytes 16..31 (G5 .. G10)
+    mova [imageq + 32], m1 ; output bytes 32..47 (B10 .. B15)
+%endif ; mmsize = 16
 %else ; PACK RGB15/16/32
     packuswb m0, m1
     packuswb m3, m5
@@ -196,10 +307,10 @@  cglobal %1_420_%2%3, GPR_num, GPR_num, reg_num, parameters
 %endif
     mova m5, m_blue
     mova m6, m_red
-    punpckhbw m5, m_green
+    punpckhbw m5,     m_green
     punpcklbw m_blue, m_green
-    punpckhbw m6, m_alpha
-    punpcklbw m_red, m_alpha
+    punpckhbw m6,     m_alpha
+    punpcklbw m_red,  m_alpha
     mova m_green, m_blue
     mova m_alpha, m5
     punpcklwd m_blue, m_red
@@ -207,14 +318,23 @@  cglobal %1_420_%2%3, GPR_num, GPR_num, reg_num, parameters
     punpcklwd m5, m6
     punpckhwd m_alpha, m6
     mova [imageq + 0], m_blue
-    mova [imageq + 8 * time_num], m_green
+    mova [imageq + 8  * time_num], m_green
     mova [imageq + 16 * time_num], m5
     mova [imageq + 24 * time_num], m_alpha
 %else ; PACK RGB15/16
 %define depth 2
-%define blue_dither  [pointer_c_ditherq + 2 * 8]
-%define green_dither [pointer_c_ditherq + 1 * 8]
-%define red_dither   [pointer_c_ditherq + 0 * 8]
+%if cpuflag(ssse3)
+    %define red_dither m3
+    %define green_dither m4
+    %define blue_dither m5
+    VBROADCASTSD red_dither,   [pointer_c_ditherq + 0 * 8]
+    VBROADCASTSD green_dither, [pointer_c_ditherq + 1 * 8]
+    VBROADCASTSD blue_dither,  [pointer_c_ditherq + 2 * 8]
+%else ; cpuflag(mmx/mmxext)
+%define blue_dither  [pointer_c_ditherq + 2  * 8]
+%define green_dither [pointer_c_ditherq + 1  * 8]
+%define red_dither   [pointer_c_ditherq + 0  * 8]
+%endif
 %if %3 == 15
 %define gmask pb_03
 %define isRGB15 1
@@ -268,3 +388,13 @@  yuv2rgb_fn yuv,  rgb, 16
 INIT_MMX mmxext
 yuv2rgb_fn yuv, rgb, 24
 yuv2rgb_fn yuv, bgr, 24
+
+INIT_XMM ssse3
+yuv2rgb_fn yuv,  rgb, 24
+yuv2rgb_fn yuv,  bgr, 24
+yuv2rgb_fn yuv,  rgb, 32
+yuv2rgb_fn yuv,  bgr, 32
+yuv2rgb_fn yuva, rgb, 32
+yuv2rgb_fn yuva, bgr, 32
+yuv2rgb_fn yuv,  rgb, 15
+yuv2rgb_fn yuv,  rgb, 16