diff mbox series

[FFmpeg-devel] swscale/x86/rgb2rgb: add SSE2 shuffle_bytes functions

Message ID 20240606141505.132-1-jamrial@gmail.com
State New
Headers show
Series [FFmpeg-devel] swscale/x86/rgb2rgb: add SSE2 shuffle_bytes functions | expand

Checks

Context Check Description
andriy/make_x86 success Make finished
andriy/make_fate_x86 success Make fate finished

Commit Message

James Almer June 6, 2024, 2:15 p.m. UTC
And remove shuffle_bytes_2103_mmxext.

shuffle_bytes_0321_c: 28.1
shuffle_bytes_0321_sse2: 13.6
shuffle_bytes_0321_ssse3: 9.6
shuffle_bytes_0321_avx2: 7.1
shuffle_bytes_1230_c: 52.6
shuffle_bytes_1230_sse2: 12.1
shuffle_bytes_1230_ssse3: 8.6
shuffle_bytes_1230_avx2: 6.6
shuffle_bytes_2103_c: 29.1
shuffle_bytes_2103_mmxext: 29.3 // removed
shuffle_bytes_2103_sse2: 12.5
shuffle_bytes_2103_ssse3: 8.6
shuffle_bytes_2103_avx2: 7.1
shuffle_bytes_3012_c: 52.1
shuffle_bytes_3012_sse2: 12.1
shuffle_bytes_3012_ssse3: 8.6
shuffle_bytes_3012_avx2: 7.1
shuffle_bytes_3210_c: 50.6
shuffle_bytes_3210_sse2: 14.6
shuffle_bytes_3210_ssse3: 8.6
shuffle_bytes_3210_avx2: 7.1

Signed-off-by: James Almer <jamrial@gmail.com>
---
 libswscale/x86/rgb2rgb.c     | 14 ++++--
 libswscale/x86/rgb_2_rgb.asm | 83 +++++++++++++++++++++++++-----------
 2 files changed, 69 insertions(+), 28 deletions(-)

Comments

Andreas Rheinhardt June 6, 2024, 2:48 p.m. UTC | #1
James Almer:
> And remove shuffle_bytes_2103_mmxext.
> 
> shuffle_bytes_0321_c: 28.1
> shuffle_bytes_0321_sse2: 13.6
> shuffle_bytes_0321_ssse3: 9.6
> shuffle_bytes_0321_avx2: 7.1
> shuffle_bytes_1230_c: 52.6
> shuffle_bytes_1230_sse2: 12.1
> shuffle_bytes_1230_ssse3: 8.6
> shuffle_bytes_1230_avx2: 6.6
> shuffle_bytes_2103_c: 29.1
> shuffle_bytes_2103_mmxext: 29.3 // removed
> shuffle_bytes_2103_sse2: 12.5
> shuffle_bytes_2103_ssse3: 8.6
> shuffle_bytes_2103_avx2: 7.1
> shuffle_bytes_3012_c: 52.1
> shuffle_bytes_3012_sse2: 12.1
> shuffle_bytes_3012_ssse3: 8.6
> shuffle_bytes_3012_avx2: 7.1
> shuffle_bytes_3210_c: 50.6
> shuffle_bytes_3210_sse2: 14.6
> shuffle_bytes_3210_ssse3: 8.6
> shuffle_bytes_3210_avx2: 7.1
> 
> Signed-off-by: James Almer <jamrial@gmail.com>
> ---
>  libswscale/x86/rgb2rgb.c     | 14 ++++--
>  libswscale/x86/rgb_2_rgb.asm | 83 +++++++++++++++++++++++++-----------
>  2 files changed, 69 insertions(+), 28 deletions(-)
> 
> diff --git a/libswscale/x86/rgb2rgb.c b/libswscale/x86/rgb2rgb.c
> index 21ccfafe51..9f6c8efc72 100644
> --- a/libswscale/x86/rgb2rgb.c
> +++ b/libswscale/x86/rgb2rgb.c
> @@ -116,7 +116,11 @@ DECLARE_ALIGNED(8, extern const uint64_t, ff_bgr2UVOffset);
>  
>  #endif /* HAVE_INLINE_ASM */
>  
> -void ff_shuffle_bytes_2103_mmxext(const uint8_t *src, uint8_t *dst, int src_size);
> +void ff_shuffle_bytes_2103_sse2(const uint8_t *src, uint8_t *dst, int src_size);
> +void ff_shuffle_bytes_0321_sse2(const uint8_t *src, uint8_t *dst, int src_size);
> +void ff_shuffle_bytes_1230_sse2(const uint8_t *src, uint8_t *dst, int src_size);
> +void ff_shuffle_bytes_3012_sse2(const uint8_t *src, uint8_t *dst, int src_size);
> +void ff_shuffle_bytes_3210_sse2(const uint8_t *src, uint8_t *dst, int src_size);
>  void ff_shuffle_bytes_2103_ssse3(const uint8_t *src, uint8_t *dst, int src_size);
>  void ff_shuffle_bytes_0321_ssse3(const uint8_t *src, uint8_t *dst, int src_size);
>  void ff_shuffle_bytes_1230_ssse3(const uint8_t *src, uint8_t *dst, int src_size);
> @@ -154,10 +158,12 @@ av_cold void rgb2rgb_init_x86(void)
>          rgb2rgb_init_avx();
>  #endif /* HAVE_INLINE_ASM */
>  
> -    if (EXTERNAL_MMXEXT(cpu_flags)) {
> -        shuffle_bytes_2103 = ff_shuffle_bytes_2103_mmxext;
> -    }
>      if (EXTERNAL_SSE2(cpu_flags)) {
> +        shuffle_bytes_2103 = ff_shuffle_bytes_2103_sse2;
> +        shuffle_bytes_0321 = ff_shuffle_bytes_0321_sse2;
> +        shuffle_bytes_1230 = ff_shuffle_bytes_1230_sse2;
> +        shuffle_bytes_3012 = ff_shuffle_bytes_3012_sse2;
> +        shuffle_bytes_3210 = ff_shuffle_bytes_3210_sse2;
>  #if ARCH_X86_64
>          uyvytoyuv422 = ff_uyvytoyuv422_sse2;
>  #endif
> diff --git a/libswscale/x86/rgb_2_rgb.asm b/libswscale/x86/rgb_2_rgb.asm
> index 0bf1278718..9fc1974389 100644
> --- a/libswscale/x86/rgb_2_rgb.asm
> +++ b/libswscale/x86/rgb_2_rgb.asm
> @@ -25,7 +25,6 @@
>  
>  SECTION_RODATA
>  
> -pb_mask_shuffle2103_mmx times 8 dw 255
>  pb_shuffle2103: db 2, 1, 0, 3, 6, 5, 4, 7, 10, 9, 8, 11, 14, 13, 12, 15
>  pb_shuffle0321: db 0, 3, 2, 1, 4, 7, 6, 5, 8, 11, 10, 9, 12, 15, 14, 13
>  pb_shuffle1230: db 1, 2, 3, 0, 5, 6, 7, 4, 9, 10, 11, 8, 13, 14, 15, 12
> @@ -50,11 +49,50 @@ SECTION .text
>  ;------------------------------------------------------------------------------
>  ; shuffle_bytes_2103_mmext (const uint8_t *src, uint8_t *dst, int src_size)
>  ;------------------------------------------------------------------------------
> -INIT_MMX mmxext
> -cglobal shuffle_bytes_2103, 3, 5, 8, src, dst, w, tmp, x
> -    mova   m6, [pb_mask_shuffle2103_mmx]
> -    mova   m7, m6
> -    psllq  m7, 8
> +
> +%macro SHUFFLE2103_SSE2 0
> +    pshuflw   m1, m0, 0xb1
> +    pshufhw   m1, m1, 0xb1
> +
> +    pand     m0, m3
> +    pand     m1, m2
> +%endmacro
> +
> +%macro SHUFFLE0321_SSE2 0
> +    pshuflw   m1, m0, 0xb1
> +    pshufhw   m1, m1, 0xb1
> +
> +    pand     m0, m2
> +    pand     m1, m3
> +%endmacro
> +
> +%macro SHUFFLE1230_SSE2 0
> +    pslld    m1, m0, 24
> +    psrld    m0, 8
> +%endmacro
> +
> +%macro SHUFFLE3012_SSE2 0
> +    pslld    m1, m0, 8
> +    psrld    m0, 24
> +%endmacro
> +
> +%macro SHUFFLE3210_SSE2 0
> +    pshuflw   m1, m0, 0xb1
> +    pshufhw   m1, m1, 0xb1
> +
> +    psrlw     m0, m1, 8
> +    psllw     m1, 8
> +%endmacro
> +
> +; %1-4 index shuffle
> +; %5 load mask
> +%macro SHUFFLE_BYTES_SSE2 5
> +cglobal shuffle_bytes_%1%2%3%4, 3, 5, 4, src, dst, w, tmp, x
> +%if %5
> +    pcmpeqw        m2, m2
> +    psllw          m3, m2, 8 ; (word) { 0xff00 } x4
> +    psrlw          m2, 8     ; (word) { 0x00ff } x4
> +%endif
>  
>      movsxdifnidn wq, wd
>      mov xq, wq
> @@ -68,13 +106,13 @@ cglobal shuffle_bytes_2103, 3, 5, 8, src, dst, w, tmp, x
>      je .loop_simd
>  
>  .loop_scalar:
> -   mov          tmpb, [srcq + wq + 2]
> +   mov          tmpb, [srcq + wq + %1]
>     mov [dstq+wq + 0], tmpb
> -   mov          tmpb, [srcq + wq + 1]
> +   mov          tmpb, [srcq + wq + %2]
>     mov [dstq+wq + 1], tmpb
> -   mov          tmpb, [srcq + wq + 0]
> +   mov          tmpb, [srcq + wq + %3]
>     mov [dstq+wq + 2], tmpb
> -   mov          tmpb, [srcq + wq + 3]
> +   mov          tmpb, [srcq + wq + %4]
>     mov [dstq+wq + 3], tmpb
>     add            wq, 4
>     sub            xq, 4
> @@ -86,29 +124,26 @@ jge .end
>  
>  .loop_simd:
>      movu     m0, [srcq+wq]
> -    movu     m1, [srcq+wq+8]
> -
> -    pshufw   m3, m0, 177
> -    pshufw   m5, m1, 177
> -
> -    pand     m0, m7
> -    pand     m3, m6
>  
> -    pand     m1, m7
> -    pand     m5, m6
> +    SHUFFLE%1%2%3%4_SSE2
>  
> -    por      m0, m3
> -    por      m1, m5
> +    por      m0, m1
>  
>      movu      [dstq+wq], m0
> -    movu  [dstq+wq + 8], m1
>  
> -    add              wq, mmsize*2
> +    add              wq, mmsize
>      jl .loop_simd
>  
>  .end:
> -    emms
>      RET
> +%endmacro
> +
> +INIT_XMM sse2
> +SHUFFLE_BYTES_SSE2 2, 1, 0, 3, 1
> +SHUFFLE_BYTES_SSE2 0, 3, 2, 1, 1
> +SHUFFLE_BYTES_SSE2 1, 2, 3, 0, 0
> +SHUFFLE_BYTES_SSE2 3, 0, 1, 2, 0
> +SHUFFLE_BYTES_SSE2 3, 2, 1, 0, 0
>  
>  ;------------------------------------------------------------------------------
>  ; shuffle_bytes_## (const uint8_t *src, uint8_t *dst, int src_size)

How old are the youngest processors with SSE2, but without SSSE3?
According to Wikipedia, nearly 15 years. Which makes me believe that the
SSE2 versions are not worth it (how many of these CPUs will use a new
FFmpeg anyway?).

- Andreas
James Almer June 6, 2024, 3:45 p.m. UTC | #2
On 6/6/2024 11:48 AM, Andreas Rheinhardt wrote:
> James Almer:
>> And remove shuffle_bytes_2103_mmxext.
>>
>> shuffle_bytes_0321_c: 28.1
>> shuffle_bytes_0321_sse2: 13.6
>> shuffle_bytes_0321_ssse3: 9.6
>> shuffle_bytes_0321_avx2: 7.1
>> shuffle_bytes_1230_c: 52.6
>> shuffle_bytes_1230_sse2: 12.1
>> shuffle_bytes_1230_ssse3: 8.6
>> shuffle_bytes_1230_avx2: 6.6
>> shuffle_bytes_2103_c: 29.1
>> shuffle_bytes_2103_mmxext: 29.3 // removed
>> shuffle_bytes_2103_sse2: 12.5
>> shuffle_bytes_2103_ssse3: 8.6
>> shuffle_bytes_2103_avx2: 7.1
>> shuffle_bytes_3012_c: 52.1
>> shuffle_bytes_3012_sse2: 12.1
>> shuffle_bytes_3012_ssse3: 8.6
>> shuffle_bytes_3012_avx2: 7.1
>> shuffle_bytes_3210_c: 50.6
>> shuffle_bytes_3210_sse2: 14.6
>> shuffle_bytes_3210_ssse3: 8.6
>> shuffle_bytes_3210_avx2: 7.1
>>
>> Signed-off-by: James Almer <jamrial@gmail.com>
>> ---
>>   libswscale/x86/rgb2rgb.c     | 14 ++++--
>>   libswscale/x86/rgb_2_rgb.asm | 83 +++++++++++++++++++++++++-----------
>>   2 files changed, 69 insertions(+), 28 deletions(-)
>>
>> diff --git a/libswscale/x86/rgb2rgb.c b/libswscale/x86/rgb2rgb.c
>> index 21ccfafe51..9f6c8efc72 100644
>> --- a/libswscale/x86/rgb2rgb.c
>> +++ b/libswscale/x86/rgb2rgb.c
>> @@ -116,7 +116,11 @@ DECLARE_ALIGNED(8, extern const uint64_t, ff_bgr2UVOffset);
>>   
>>   #endif /* HAVE_INLINE_ASM */
>>   
>> -void ff_shuffle_bytes_2103_mmxext(const uint8_t *src, uint8_t *dst, int src_size);
>> +void ff_shuffle_bytes_2103_sse2(const uint8_t *src, uint8_t *dst, int src_size);
>> +void ff_shuffle_bytes_0321_sse2(const uint8_t *src, uint8_t *dst, int src_size);
>> +void ff_shuffle_bytes_1230_sse2(const uint8_t *src, uint8_t *dst, int src_size);
>> +void ff_shuffle_bytes_3012_sse2(const uint8_t *src, uint8_t *dst, int src_size);
>> +void ff_shuffle_bytes_3210_sse2(const uint8_t *src, uint8_t *dst, int src_size);
>>   void ff_shuffle_bytes_2103_ssse3(const uint8_t *src, uint8_t *dst, int src_size);
>>   void ff_shuffle_bytes_0321_ssse3(const uint8_t *src, uint8_t *dst, int src_size);
>>   void ff_shuffle_bytes_1230_ssse3(const uint8_t *src, uint8_t *dst, int src_size);
>> @@ -154,10 +158,12 @@ av_cold void rgb2rgb_init_x86(void)
>>           rgb2rgb_init_avx();
>>   #endif /* HAVE_INLINE_ASM */
>>   
>> -    if (EXTERNAL_MMXEXT(cpu_flags)) {
>> -        shuffle_bytes_2103 = ff_shuffle_bytes_2103_mmxext;
>> -    }
>>       if (EXTERNAL_SSE2(cpu_flags)) {
>> +        shuffle_bytes_2103 = ff_shuffle_bytes_2103_sse2;
>> +        shuffle_bytes_0321 = ff_shuffle_bytes_0321_sse2;
>> +        shuffle_bytes_1230 = ff_shuffle_bytes_1230_sse2;
>> +        shuffle_bytes_3012 = ff_shuffle_bytes_3012_sse2;
>> +        shuffle_bytes_3210 = ff_shuffle_bytes_3210_sse2;
>>   #if ARCH_X86_64
>>           uyvytoyuv422 = ff_uyvytoyuv422_sse2;
>>   #endif
>> diff --git a/libswscale/x86/rgb_2_rgb.asm b/libswscale/x86/rgb_2_rgb.asm
>> index 0bf1278718..9fc1974389 100644
>> --- a/libswscale/x86/rgb_2_rgb.asm
>> +++ b/libswscale/x86/rgb_2_rgb.asm
>> @@ -25,7 +25,6 @@
>>   
>>   SECTION_RODATA
>>   
>> -pb_mask_shuffle2103_mmx times 8 dw 255
>>   pb_shuffle2103: db 2, 1, 0, 3, 6, 5, 4, 7, 10, 9, 8, 11, 14, 13, 12, 15
>>   pb_shuffle0321: db 0, 3, 2, 1, 4, 7, 6, 5, 8, 11, 10, 9, 12, 15, 14, 13
>>   pb_shuffle1230: db 1, 2, 3, 0, 5, 6, 7, 4, 9, 10, 11, 8, 13, 14, 15, 12
>> @@ -50,11 +49,50 @@ SECTION .text
>>   ;------------------------------------------------------------------------------
>>   ; shuffle_bytes_2103_mmext (const uint8_t *src, uint8_t *dst, int src_size)
>>   ;------------------------------------------------------------------------------
>> -INIT_MMX mmxext
>> -cglobal shuffle_bytes_2103, 3, 5, 8, src, dst, w, tmp, x
>> -    mova   m6, [pb_mask_shuffle2103_mmx]
>> -    mova   m7, m6
>> -    psllq  m7, 8
>> +
>> +%macro SHUFFLE2103_SSE2 0
>> +    pshuflw   m1, m0, 0xb1
>> +    pshufhw   m1, m1, 0xb1
>> +
>> +    pand     m0, m3
>> +    pand     m1, m2
>> +%endmacro
>> +
>> +%macro SHUFFLE0321_SSE2 0
>> +    pshuflw   m1, m0, 0xb1
>> +    pshufhw   m1, m1, 0xb1
>> +
>> +    pand     m0, m2
>> +    pand     m1, m3
>> +%endmacro
>> +
>> +%macro SHUFFLE1230_SSE2 0
>> +    pslld    m1, m0, 24
>> +    psrld    m0, 8
>> +%endmacro
>> +
>> +%macro SHUFFLE3012_SSE2 0
>> +    pslld    m1, m0, 8
>> +    psrld    m0, 24
>> +%endmacro
>> +
>> +%macro SHUFFLE3210_SSE2 0
>> +    pshuflw   m1, m0, 0xb1
>> +    pshufhw   m1, m1, 0xb1
>> +
>> +    psrlw     m0, m1, 8
>> +    psllw     m1, 8
>> +%endmacro
>> +
>> +; %1-4 index shuffle
>> +; %5 load mask
>> +%macro SHUFFLE_BYTES_SSE2 5
>> +cglobal shuffle_bytes_%1%2%3%4, 3, 5, 4, src, dst, w, tmp, x
>> +%if %5
>> +    pcmpeqw        m2, m2
>> +    psllw          m3, m2, 8 ; (word) { 0xff00 } x4
>> +    psrlw          m2, 8     ; (word) { 0x00ff } x4
>> +%endif
>>   
>>       movsxdifnidn wq, wd
>>       mov xq, wq
>> @@ -68,13 +106,13 @@ cglobal shuffle_bytes_2103, 3, 5, 8, src, dst, w, tmp, x
>>       je .loop_simd
>>   
>>   .loop_scalar:
>> -   mov          tmpb, [srcq + wq + 2]
>> +   mov          tmpb, [srcq + wq + %1]
>>      mov [dstq+wq + 0], tmpb
>> -   mov          tmpb, [srcq + wq + 1]
>> +   mov          tmpb, [srcq + wq + %2]
>>      mov [dstq+wq + 1], tmpb
>> -   mov          tmpb, [srcq + wq + 0]
>> +   mov          tmpb, [srcq + wq + %3]
>>      mov [dstq+wq + 2], tmpb
>> -   mov          tmpb, [srcq + wq + 3]
>> +   mov          tmpb, [srcq + wq + %4]
>>      mov [dstq+wq + 3], tmpb
>>      add            wq, 4
>>      sub            xq, 4
>> @@ -86,29 +124,26 @@ jge .end
>>   
>>   .loop_simd:
>>       movu     m0, [srcq+wq]
>> -    movu     m1, [srcq+wq+8]
>> -
>> -    pshufw   m3, m0, 177
>> -    pshufw   m5, m1, 177
>> -
>> -    pand     m0, m7
>> -    pand     m3, m6
>>   
>> -    pand     m1, m7
>> -    pand     m5, m6
>> +    SHUFFLE%1%2%3%4_SSE2
>>   
>> -    por      m0, m3
>> -    por      m1, m5
>> +    por      m0, m1
>>   
>>       movu      [dstq+wq], m0
>> -    movu  [dstq+wq + 8], m1
>>   
>> -    add              wq, mmsize*2
>> +    add              wq, mmsize
>>       jl .loop_simd
>>   
>>   .end:
>> -    emms
>>       RET
>> +%endmacro
>> +
>> +INIT_XMM sse2
>> +SHUFFLE_BYTES_SSE2 2, 1, 0, 3, 1
>> +SHUFFLE_BYTES_SSE2 0, 3, 2, 1, 1
>> +SHUFFLE_BYTES_SSE2 1, 2, 3, 0, 0
>> +SHUFFLE_BYTES_SSE2 3, 0, 1, 2, 0
>> +SHUFFLE_BYTES_SSE2 3, 2, 1, 0, 0
>>   
>>   ;------------------------------------------------------------------------------
>>   ; shuffle_bytes_## (const uint8_t *src, uint8_t *dst, int src_size)
> 
> How old are the youngest processors with SSE2, but without SSSE3?

AMD Phenom/K10.

> According to Wikipedia, nearly 15 years. Which makes me believe that the
> SSE2 versions are not worth it (how many of these CPUs will use a new
> FFmpeg anyway?).

Simply using the latest version of a video player that uses ffmpeg is 
enough to be able to run the newest code.
It was easy to write and I don't care enough about it to 
argue, so if you think it's not worth adding, I can just remove the 
mmxext version and skip adding anything.
Andreas Rheinhardt June 8, 2024, 3:55 p.m. UTC | #3
James Almer:
> On 6/6/2024 11:48 AM, Andreas Rheinhardt wrote:
>> James Almer:
>>> And remove shuffle_bytes_2103_mmxext.
>>>
>>> shuffle_bytes_0321_c: 28.1
>>> shuffle_bytes_0321_sse2: 13.6
>>> shuffle_bytes_0321_ssse3: 9.6
>>> shuffle_bytes_0321_avx2: 7.1
>>> shuffle_bytes_1230_c: 52.6
>>> shuffle_bytes_1230_sse2: 12.1
>>> shuffle_bytes_1230_ssse3: 8.6
>>> shuffle_bytes_1230_avx2: 6.6
>>> shuffle_bytes_2103_c: 29.1
>>> shuffle_bytes_2103_mmxext: 29.3 // removed
>>> shuffle_bytes_2103_sse2: 12.5
>>> shuffle_bytes_2103_ssse3: 8.6
>>> shuffle_bytes_2103_avx2: 7.1
>>> shuffle_bytes_3012_c: 52.1
>>> shuffle_bytes_3012_sse2: 12.1
>>> shuffle_bytes_3012_ssse3: 8.6
>>> shuffle_bytes_3012_avx2: 7.1
>>> shuffle_bytes_3210_c: 50.6
>>> shuffle_bytes_3210_sse2: 14.6
>>> shuffle_bytes_3210_ssse3: 8.6
>>> shuffle_bytes_3210_avx2: 7.1
>>>
>>> Signed-off-by: James Almer <jamrial@gmail.com>
>>> ---
>>>   libswscale/x86/rgb2rgb.c     | 14 ++++--
>>>   libswscale/x86/rgb_2_rgb.asm | 83 +++++++++++++++++++++++++-----------
>>>   2 files changed, 69 insertions(+), 28 deletions(-)
>>>
>>> diff --git a/libswscale/x86/rgb2rgb.c b/libswscale/x86/rgb2rgb.c
>>> index 21ccfafe51..9f6c8efc72 100644
>>> --- a/libswscale/x86/rgb2rgb.c
>>> +++ b/libswscale/x86/rgb2rgb.c
>>> @@ -116,7 +116,11 @@ DECLARE_ALIGNED(8, extern const uint64_t,
>>> ff_bgr2UVOffset);
>>>     #endif /* HAVE_INLINE_ASM */
>>>   -void ff_shuffle_bytes_2103_mmxext(const uint8_t *src, uint8_t
>>> *dst, int src_size);
>>> +void ff_shuffle_bytes_2103_sse2(const uint8_t *src, uint8_t *dst,
>>> int src_size);
>>> +void ff_shuffle_bytes_0321_sse2(const uint8_t *src, uint8_t *dst,
>>> int src_size);
>>> +void ff_shuffle_bytes_1230_sse2(const uint8_t *src, uint8_t *dst,
>>> int src_size);
>>> +void ff_shuffle_bytes_3012_sse2(const uint8_t *src, uint8_t *dst,
>>> int src_size);
>>> +void ff_shuffle_bytes_3210_sse2(const uint8_t *src, uint8_t *dst,
>>> int src_size);
>>>   void ff_shuffle_bytes_2103_ssse3(const uint8_t *src, uint8_t *dst,
>>> int src_size);
>>>   void ff_shuffle_bytes_0321_ssse3(const uint8_t *src, uint8_t *dst,
>>> int src_size);
>>>   void ff_shuffle_bytes_1230_ssse3(const uint8_t *src, uint8_t *dst,
>>> int src_size);
>>> @@ -154,10 +158,12 @@ av_cold void rgb2rgb_init_x86(void)
>>>           rgb2rgb_init_avx();
>>>   #endif /* HAVE_INLINE_ASM */
>>>   -    if (EXTERNAL_MMXEXT(cpu_flags)) {
>>> -        shuffle_bytes_2103 = ff_shuffle_bytes_2103_mmxext;
>>> -    }
>>>       if (EXTERNAL_SSE2(cpu_flags)) {
>>> +        shuffle_bytes_2103 = ff_shuffle_bytes_2103_sse2;
>>> +        shuffle_bytes_0321 = ff_shuffle_bytes_0321_sse2;
>>> +        shuffle_bytes_1230 = ff_shuffle_bytes_1230_sse2;
>>> +        shuffle_bytes_3012 = ff_shuffle_bytes_3012_sse2;
>>> +        shuffle_bytes_3210 = ff_shuffle_bytes_3210_sse2;
>>>   #if ARCH_X86_64
>>>           uyvytoyuv422 = ff_uyvytoyuv422_sse2;
>>>   #endif
>>> diff --git a/libswscale/x86/rgb_2_rgb.asm b/libswscale/x86/rgb_2_rgb.asm
>>> index 0bf1278718..9fc1974389 100644
>>> --- a/libswscale/x86/rgb_2_rgb.asm
>>> +++ b/libswscale/x86/rgb_2_rgb.asm
>>> @@ -25,7 +25,6 @@
>>>     SECTION_RODATA
>>>   -pb_mask_shuffle2103_mmx times 8 dw 255
>>>   pb_shuffle2103: db 2, 1, 0, 3, 6, 5, 4, 7, 10, 9, 8, 11, 14, 13,
>>> 12, 15
>>>   pb_shuffle0321: db 0, 3, 2, 1, 4, 7, 6, 5, 8, 11, 10, 9, 12, 15,
>>> 14, 13
>>>   pb_shuffle1230: db 1, 2, 3, 0, 5, 6, 7, 4, 9, 10, 11, 8, 13, 14,
>>> 15, 12
>>> @@ -50,11 +49,50 @@ SECTION .text
>>>  
>>> ;------------------------------------------------------------------------------
>>>   ; shuffle_bytes_2103_mmext (const uint8_t *src, uint8_t *dst, int
>>> src_size)
>>>  
>>> ;------------------------------------------------------------------------------
>>> -INIT_MMX mmxext
>>> -cglobal shuffle_bytes_2103, 3, 5, 8, src, dst, w, tmp, x
>>> -    mova   m6, [pb_mask_shuffle2103_mmx]
>>> -    mova   m7, m6
>>> -    psllq  m7, 8
>>> +
>>> +%macro SHUFFLE2103_SSE2 0
>>> +    pshuflw   m1, m0, 0xb1
>>> +    pshufhw   m1, m1, 0xb1
>>> +
>>> +    pand     m0, m3
>>> +    pand     m1, m2
>>> +%endmacro
>>> +
>>> +%macro SHUFFLE0321_SSE2 0
>>> +    pshuflw   m1, m0, 0xb1
>>> +    pshufhw   m1, m1, 0xb1
>>> +
>>> +    pand     m0, m2
>>> +    pand     m1, m3
>>> +%endmacro
>>> +
>>> +%macro SHUFFLE1230_SSE2 0
>>> +    pslld    m1, m0, 24
>>> +    psrld    m0, 8
>>> +%endmacro
>>> +
>>> +%macro SHUFFLE3012_SSE2 0
>>> +    pslld    m1, m0, 8
>>> +    psrld    m0, 24
>>> +%endmacro
>>> +
>>> +%macro SHUFFLE3210_SSE2 0
>>> +    pshuflw   m1, m0, 0xb1
>>> +    pshufhw   m1, m1, 0xb1
>>> +
>>> +    psrlw     m0, m1, 8
>>> +    psllw     m1, 8
>>> +%endmacro
>>> +
>>> +; %1-4 index shuffle
>>> +; %5 load mask
>>> +%macro SHUFFLE_BYTES_SSE2 5
>>> +cglobal shuffle_bytes_%1%2%3%4, 3, 5, 4, src, dst, w, tmp, x
>>> +%if %5
>>> +    pcmpeqw        m2, m2
>>> +    psllw          m3, m2, 8 ; (word) { 0xff00 } x4
>>> +    psrlw          m2, 8     ; (word) { 0x00ff } x4
>>> +%endif
>>>         movsxdifnidn wq, wd
>>>       mov xq, wq
>>> @@ -68,13 +106,13 @@ cglobal shuffle_bytes_2103, 3, 5, 8, src, dst,
>>> w, tmp, x
>>>       je .loop_simd
>>>     .loop_scalar:
>>> -   mov          tmpb, [srcq + wq + 2]
>>> +   mov          tmpb, [srcq + wq + %1]
>>>      mov [dstq+wq + 0], tmpb
>>> -   mov          tmpb, [srcq + wq + 1]
>>> +   mov          tmpb, [srcq + wq + %2]
>>>      mov [dstq+wq + 1], tmpb
>>> -   mov          tmpb, [srcq + wq + 0]
>>> +   mov          tmpb, [srcq + wq + %3]
>>>      mov [dstq+wq + 2], tmpb
>>> -   mov          tmpb, [srcq + wq + 3]
>>> +   mov          tmpb, [srcq + wq + %4]
>>>      mov [dstq+wq + 3], tmpb
>>>      add            wq, 4
>>>      sub            xq, 4
>>> @@ -86,29 +124,26 @@ jge .end
>>>     .loop_simd:
>>>       movu     m0, [srcq+wq]
>>> -    movu     m1, [srcq+wq+8]
>>> -
>>> -    pshufw   m3, m0, 177
>>> -    pshufw   m5, m1, 177
>>> -
>>> -    pand     m0, m7
>>> -    pand     m3, m6
>>>   -    pand     m1, m7
>>> -    pand     m5, m6
>>> +    SHUFFLE%1%2%3%4_SSE2
>>>   -    por      m0, m3
>>> -    por      m1, m5
>>> +    por      m0, m1
>>>         movu      [dstq+wq], m0
>>> -    movu  [dstq+wq + 8], m1
>>>   -    add              wq, mmsize*2
>>> +    add              wq, mmsize
>>>       jl .loop_simd
>>>     .end:
>>> -    emms
>>>       RET
>>> +%endmacro
>>> +
>>> +INIT_XMM sse2
>>> +SHUFFLE_BYTES_SSE2 2, 1, 0, 3, 1
>>> +SHUFFLE_BYTES_SSE2 0, 3, 2, 1, 1
>>> +SHUFFLE_BYTES_SSE2 1, 2, 3, 0, 0
>>> +SHUFFLE_BYTES_SSE2 3, 0, 1, 2, 0
>>> +SHUFFLE_BYTES_SSE2 3, 2, 1, 0, 0
>>>    
>>> ;------------------------------------------------------------------------------
>>>   ; shuffle_bytes_## (const uint8_t *src, uint8_t *dst, int src_size)
>>
>> How old are the youngest processors with SSE2, but without SSSE3?
> 
> AMD Phenom/K10.
> 
>> According to Wikipedia, nearly 15 years. Which makes me believe that the
>> SSE2 versions are not worth it (how many of these CPUs will use a new
>> FFmpeg anyway?).
> 
> Simply by using the latest version of a video player that uses ffmpeg is
> enough to be able to run the newest code.

I asked "how many", not "how".

> It was easy to write and i don't feel particularly interested enough to
> argue, so if you think it's not worth adding, i can just remove the
> mmxext version and skip adding anything.

I think we should not optimize for CPUs that do not even have x86-64 v2.
So I would not add these SSE2 versions. But the one missing SSSE3
version (shuffle_bytes_2103_ssse3) is of course worth it.

- Andreas
Rémi Denis-Courmont June 8, 2024, 4:21 p.m. UTC | #4
Le lauantaina 8. kesäkuuta 2024, 18.55.53 EEST Andreas Rheinhardt a écrit :
> I think we should not optimize for CPUs that do not even have x86-64 v2.
> So I would not add these SSE2 versions.

We certainly should consider ditching SSE2 where SSSE3 is available now or in 
the near future. But in this particular case, James seems to be converting 
MMX(EXT) code into SSE2 code, more so than introducing purely new SSE2 code.

It took almost forever to agree to get rid of MMX. I would like to go ahead 
with that, and I like to think that many other people would too. So can we at least 
tolerate porting MMX to SSE2 until we have gotten rid of MMX for good?
James Almer June 9, 2024, 3:36 p.m. UTC | #5
On 6/8/2024 12:55 PM, Andreas Rheinhardt wrote:
> James Almer:
>> On 6/6/2024 11:48 AM, Andreas Rheinhardt wrote:
>>> James Almer:
>>>> And remove shuffle_bytes_2103_mmxext.
>>>>
>>>> shuffle_bytes_0321_c: 28.1
>>>> shuffle_bytes_0321_sse2: 13.6
>>>> shuffle_bytes_0321_ssse3: 9.6
>>>> shuffle_bytes_0321_avx2: 7.1
>>>> shuffle_bytes_1230_c: 52.6
>>>> shuffle_bytes_1230_sse2: 12.1
>>>> shuffle_bytes_1230_ssse3: 8.6
>>>> shuffle_bytes_1230_avx2: 6.6
>>>> shuffle_bytes_2103_c: 29.1
>>>> shuffle_bytes_2103_mmxext: 29.3 // removed
>>>> shuffle_bytes_2103_sse2: 12.5
>>>> shuffle_bytes_2103_ssse3: 8.6
>>>> shuffle_bytes_2103_avx2: 7.1
>>>> shuffle_bytes_3012_c: 52.1
>>>> shuffle_bytes_3012_sse2: 12.1
>>>> shuffle_bytes_3012_ssse3: 8.6
>>>> shuffle_bytes_3012_avx2: 7.1
>>>> shuffle_bytes_3210_c: 50.6
>>>> shuffle_bytes_3210_sse2: 14.6
>>>> shuffle_bytes_3210_ssse3: 8.6
>>>> shuffle_bytes_3210_avx2: 7.1
>>>>
>>>> Signed-off-by: James Almer <jamrial@gmail.com>
>>>> ---
>>>>    libswscale/x86/rgb2rgb.c     | 14 ++++--
>>>>    libswscale/x86/rgb_2_rgb.asm | 83 +++++++++++++++++++++++++-----------
>>>>    2 files changed, 69 insertions(+), 28 deletions(-)
>>>>
>>>> diff --git a/libswscale/x86/rgb2rgb.c b/libswscale/x86/rgb2rgb.c
>>>> index 21ccfafe51..9f6c8efc72 100644
>>>> --- a/libswscale/x86/rgb2rgb.c
>>>> +++ b/libswscale/x86/rgb2rgb.c
>>>> @@ -116,7 +116,11 @@ DECLARE_ALIGNED(8, extern const uint64_t,
>>>> ff_bgr2UVOffset);
>>>>      #endif /* HAVE_INLINE_ASM */
>>>>    -void ff_shuffle_bytes_2103_mmxext(const uint8_t *src, uint8_t
>>>> *dst, int src_size);
>>>> +void ff_shuffle_bytes_2103_sse2(const uint8_t *src, uint8_t *dst,
>>>> int src_size);
>>>> +void ff_shuffle_bytes_0321_sse2(const uint8_t *src, uint8_t *dst,
>>>> int src_size);
>>>> +void ff_shuffle_bytes_1230_sse2(const uint8_t *src, uint8_t *dst,
>>>> int src_size);
>>>> +void ff_shuffle_bytes_3012_sse2(const uint8_t *src, uint8_t *dst,
>>>> int src_size);
>>>> +void ff_shuffle_bytes_3210_sse2(const uint8_t *src, uint8_t *dst,
>>>> int src_size);
>>>>    void ff_shuffle_bytes_2103_ssse3(const uint8_t *src, uint8_t *dst,
>>>> int src_size);
>>>>    void ff_shuffle_bytes_0321_ssse3(const uint8_t *src, uint8_t *dst,
>>>> int src_size);
>>>>    void ff_shuffle_bytes_1230_ssse3(const uint8_t *src, uint8_t *dst,
>>>> int src_size);
>>>> @@ -154,10 +158,12 @@ av_cold void rgb2rgb_init_x86(void)
>>>>            rgb2rgb_init_avx();
>>>>    #endif /* HAVE_INLINE_ASM */
>>>>    -    if (EXTERNAL_MMXEXT(cpu_flags)) {
>>>> -        shuffle_bytes_2103 = ff_shuffle_bytes_2103_mmxext;
>>>> -    }
>>>>        if (EXTERNAL_SSE2(cpu_flags)) {
>>>> +        shuffle_bytes_2103 = ff_shuffle_bytes_2103_sse2;
>>>> +        shuffle_bytes_0321 = ff_shuffle_bytes_0321_sse2;
>>>> +        shuffle_bytes_1230 = ff_shuffle_bytes_1230_sse2;
>>>> +        shuffle_bytes_3012 = ff_shuffle_bytes_3012_sse2;
>>>> +        shuffle_bytes_3210 = ff_shuffle_bytes_3210_sse2;
>>>>    #if ARCH_X86_64
>>>>            uyvytoyuv422 = ff_uyvytoyuv422_sse2;
>>>>    #endif
>>>> diff --git a/libswscale/x86/rgb_2_rgb.asm b/libswscale/x86/rgb_2_rgb.asm
>>>> index 0bf1278718..9fc1974389 100644
>>>> --- a/libswscale/x86/rgb_2_rgb.asm
>>>> +++ b/libswscale/x86/rgb_2_rgb.asm
>>>> @@ -25,7 +25,6 @@
>>>>      SECTION_RODATA
>>>>    -pb_mask_shuffle2103_mmx times 8 dw 255
>>>>    pb_shuffle2103: db 2, 1, 0, 3, 6, 5, 4, 7, 10, 9, 8, 11, 14, 13,
>>>> 12, 15
>>>>    pb_shuffle0321: db 0, 3, 2, 1, 4, 7, 6, 5, 8, 11, 10, 9, 12, 15,
>>>> 14, 13
>>>>    pb_shuffle1230: db 1, 2, 3, 0, 5, 6, 7, 4, 9, 10, 11, 8, 13, 14,
>>>> 15, 12
>>>> @@ -50,11 +49,50 @@ SECTION .text
>>>>   
>>>> ;------------------------------------------------------------------------------
>>>>    ; shuffle_bytes_2103_mmext (const uint8_t *src, uint8_t *dst, int
>>>> src_size)
>>>>   
>>>> ;------------------------------------------------------------------------------
>>>> -INIT_MMX mmxext
>>>> -cglobal shuffle_bytes_2103, 3, 5, 8, src, dst, w, tmp, x
>>>> -    mova   m6, [pb_mask_shuffle2103_mmx]
>>>> -    mova   m7, m6
>>>> -    psllq  m7, 8
>>>> +
>>>> +%macro SHUFFLE2103_SSE2 0
>>>> +    pshuflw   m1, m0, 0xb1
>>>> +    pshufhw   m1, m1, 0xb1
>>>> +
>>>> +    pand     m0, m3
>>>> +    pand     m1, m2
>>>> +%endmacro
>>>> +
>>>> +%macro SHUFFLE0321_SSE2 0
>>>> +    pshuflw   m1, m0, 0xb1
>>>> +    pshufhw   m1, m1, 0xb1
>>>> +
>>>> +    pand     m0, m2
>>>> +    pand     m1, m3
>>>> +%endmacro
>>>> +
>>>> +%macro SHUFFLE1230_SSE2 0
>>>> +    pslld    m1, m0, 24
>>>> +    psrld    m0, 8
>>>> +%endmacro
>>>> +
>>>> +%macro SHUFFLE3012_SSE2 0
>>>> +    pslld    m1, m0, 8
>>>> +    psrld    m0, 24
>>>> +%endmacro
>>>> +
>>>> +%macro SHUFFLE3210_SSE2 0
>>>> +    pshuflw   m1, m0, 0xb1
>>>> +    pshufhw   m1, m1, 0xb1
>>>> +
>>>> +    psrlw     m0, m1, 8
>>>> +    psllw     m1, 8
>>>> +%endmacro
>>>> +
>>>> +; %1-4 index shuffle
>>>> +; %5 load mask
>>>> +%macro SHUFFLE_BYTES_SSE2 5
>>>> +cglobal shuffle_bytes_%1%2%3%4, 3, 5, 4, src, dst, w, tmp, x
>>>> +%if %5
>>>> +    pcmpeqw        m2, m2
>>>> +    psllw          m3, m2, 8 ; (word) { 0xff00 } x4
>>>> +    psrlw          m2, 8     ; (word) { 0x00ff } x4
>>>> +%endif
>>>>          movsxdifnidn wq, wd
>>>>        mov xq, wq
>>>> @@ -68,13 +106,13 @@ cglobal shuffle_bytes_2103, 3, 5, 8, src, dst,
>>>> w, tmp, x
>>>>        je .loop_simd
>>>>      .loop_scalar:
>>>> -   mov          tmpb, [srcq + wq + 2]
>>>> +   mov          tmpb, [srcq + wq + %1]
>>>>       mov [dstq+wq + 0], tmpb
>>>> -   mov          tmpb, [srcq + wq + 1]
>>>> +   mov          tmpb, [srcq + wq + %2]
>>>>       mov [dstq+wq + 1], tmpb
>>>> -   mov          tmpb, [srcq + wq + 0]
>>>> +   mov          tmpb, [srcq + wq + %3]
>>>>       mov [dstq+wq + 2], tmpb
>>>> -   mov          tmpb, [srcq + wq + 3]
>>>> +   mov          tmpb, [srcq + wq + %4]
>>>>       mov [dstq+wq + 3], tmpb
>>>>       add            wq, 4
>>>>       sub            xq, 4
>>>> @@ -86,29 +124,26 @@ jge .end
>>>>      .loop_simd:
>>>>        movu     m0, [srcq+wq]
>>>> -    movu     m1, [srcq+wq+8]
>>>> -
>>>> -    pshufw   m3, m0, 177
>>>> -    pshufw   m5, m1, 177
>>>> -
>>>> -    pand     m0, m7
>>>> -    pand     m3, m6
>>>>    -    pand     m1, m7
>>>> -    pand     m5, m6
>>>> +    SHUFFLE%1%2%3%4_SSE2
>>>>    -    por      m0, m3
>>>> -    por      m1, m5
>>>> +    por      m0, m1
>>>>          movu      [dstq+wq], m0
>>>> -    movu  [dstq+wq + 8], m1
>>>>    -    add              wq, mmsize*2
>>>> +    add              wq, mmsize
>>>>        jl .loop_simd
>>>>      .end:
>>>> -    emms
>>>>        RET
>>>> +%endmacro
>>>> +
>>>> +INIT_XMM sse2
>>>> +SHUFFLE_BYTES_SSE2 2, 1, 0, 3, 1
>>>> +SHUFFLE_BYTES_SSE2 0, 3, 2, 1, 1
>>>> +SHUFFLE_BYTES_SSE2 1, 2, 3, 0, 0
>>>> +SHUFFLE_BYTES_SSE2 3, 0, 1, 2, 0
>>>> +SHUFFLE_BYTES_SSE2 3, 2, 1, 0, 0
>>>>     
>>>> ;------------------------------------------------------------------------------
>>>>    ; shuffle_bytes_## (const uint8_t *src, uint8_t *dst, int src_size)
>>>
>>> How old are the youngest processors with SSE2, but without SSSE3?
>>
>> AMD Phenom/K10.
>>
>>> According to Wikipedia, nearly 15 years. Which makes me believe that the
>>> SSE2 versions are not worth it (how many of these CPUs will use a new
>>> FFmpeg anyway?).
>>
>> Simply by using the latest version of a video player that uses ffmpeg is
>> enough to be able to run the newest code.
> 
> I asked "how many", not "how".

I obviously don't have that kind of information. You'd need to look at 
things like Steam's, Firefox's or Chrome's hardware surveys.

> 
>> It was easy to write and i don't feel particularly interested enough to
>> argue, so if you think it's not worth adding, i can just remove the
>> mmxext version and skip adding anything.
> 
> I think we should not optimize for CPUs that do not even have x86-64 v2.

What is x86-64 v2?

> So I would not add these SSE2 versions. But the one missing SSSE3

Ok, I'll just remove the mmxext one, then.

> version (shuffle_bytes_2103_ssse3) is of course worth it.

I will look into that.

> 
> - Andreas
> 
> _______________________________________________
> ffmpeg-devel mailing list
> ffmpeg-devel@ffmpeg.org
> https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
> 
> To unsubscribe, visit link above, or email
> ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".
Rémi Denis-Courmont June 9, 2024, 4:05 p.m. UTC | #6
Le sunnuntaina 9. kesäkuuta 2024, 18.36.35 EEST James Almer a écrit :
> I obviously don't have that kind of information. You'd need to look at
> things like Steam's, Firefox's or Chrome's hardware surveys.

As discussed on IRC yesterday, Steam claims that 106.85% of processors support
SSE2 (and as many support SSE3) but "only" 106.63% support SSSE3 (seriously). What
100% corresponds to, I don't know. AVX2 is close with 99.83%, but no cigar.

In any case, there is a tiny but observable gap between SSE2 and SSSE3 there.

> >> It was easy to write and i don't feel particularly interested enough to
> >> argue, so if you think it's not worth adding, i can just remove the
> >> mmxext version and skip adding anything.
> > 
> > I think we should not optimize for CPUs that do not even have x86-64 v2.
> 
> What is x86-64 v2?

See
https://developers.redhat.com/blog/2021/01/05/building-red-hat-enterprise-linux-9-for-the-x86-64-v2-microarchitecture-level
James Almer June 10, 2024, 5:06 p.m. UTC | #7
On 6/9/2024 12:36 PM, James Almer wrote:
>> So I would not add these SSE2 versions. But the one missing SSSE3
>> version (shuffle_bytes_2103_ssse3) is of course worth it.
> 
> I will look into that.

I'm not sure why you said it's missing, because it's there.
Andreas Rheinhardt June 11, 2024, 5:18 a.m. UTC | #8
James Almer:
> On 6/9/2024 12:36 PM, James Almer wrote:
>>> So I would not add these SSE2 versions. But the one missing SSSE3
>>> version (shuffle_bytes_2103_ssse3) is of course worth it.
>>
>> I will look into that.
> 
> I'm not sure why you said it's missing, because it's there.

Sorry for having said garbage.

- Andreas
diff mbox series

Patch

diff --git a/libswscale/x86/rgb2rgb.c b/libswscale/x86/rgb2rgb.c
index 21ccfafe51..9f6c8efc72 100644
--- a/libswscale/x86/rgb2rgb.c
+++ b/libswscale/x86/rgb2rgb.c
@@ -116,7 +116,11 @@  DECLARE_ALIGNED(8, extern const uint64_t, ff_bgr2UVOffset);
 
 #endif /* HAVE_INLINE_ASM */
 
-void ff_shuffle_bytes_2103_mmxext(const uint8_t *src, uint8_t *dst, int src_size);
+void ff_shuffle_bytes_2103_sse2(const uint8_t *src, uint8_t *dst, int src_size);
+void ff_shuffle_bytes_0321_sse2(const uint8_t *src, uint8_t *dst, int src_size);
+void ff_shuffle_bytes_1230_sse2(const uint8_t *src, uint8_t *dst, int src_size);
+void ff_shuffle_bytes_3012_sse2(const uint8_t *src, uint8_t *dst, int src_size);
+void ff_shuffle_bytes_3210_sse2(const uint8_t *src, uint8_t *dst, int src_size);
 void ff_shuffle_bytes_2103_ssse3(const uint8_t *src, uint8_t *dst, int src_size);
 void ff_shuffle_bytes_0321_ssse3(const uint8_t *src, uint8_t *dst, int src_size);
 void ff_shuffle_bytes_1230_ssse3(const uint8_t *src, uint8_t *dst, int src_size);
@@ -154,10 +158,12 @@  av_cold void rgb2rgb_init_x86(void)
         rgb2rgb_init_avx();
 #endif /* HAVE_INLINE_ASM */
 
-    if (EXTERNAL_MMXEXT(cpu_flags)) {
-        shuffle_bytes_2103 = ff_shuffle_bytes_2103_mmxext;
-    }
     if (EXTERNAL_SSE2(cpu_flags)) {
+        shuffle_bytes_2103 = ff_shuffle_bytes_2103_sse2;
+        shuffle_bytes_0321 = ff_shuffle_bytes_0321_sse2;
+        shuffle_bytes_1230 = ff_shuffle_bytes_1230_sse2;
+        shuffle_bytes_3012 = ff_shuffle_bytes_3012_sse2;
+        shuffle_bytes_3210 = ff_shuffle_bytes_3210_sse2;
 #if ARCH_X86_64
         uyvytoyuv422 = ff_uyvytoyuv422_sse2;
 #endif
diff --git a/libswscale/x86/rgb_2_rgb.asm b/libswscale/x86/rgb_2_rgb.asm
index 0bf1278718..9fc1974389 100644
--- a/libswscale/x86/rgb_2_rgb.asm
+++ b/libswscale/x86/rgb_2_rgb.asm
@@ -25,7 +25,6 @@ 
 
 SECTION_RODATA
 
-pb_mask_shuffle2103_mmx times 8 dw 255
 pb_shuffle2103: db 2, 1, 0, 3, 6, 5, 4, 7, 10, 9, 8, 11, 14, 13, 12, 15
 pb_shuffle0321: db 0, 3, 2, 1, 4, 7, 6, 5, 8, 11, 10, 9, 12, 15, 14, 13
 pb_shuffle1230: db 1, 2, 3, 0, 5, 6, 7, 4, 9, 10, 11, 8, 13, 14, 15, 12
@@ -50,11 +49,50 @@  SECTION .text
 ;------------------------------------------------------------------------------
 ; shuffle_bytes_2103_mmext (const uint8_t *src, uint8_t *dst, int src_size)
 ;------------------------------------------------------------------------------
-INIT_MMX mmxext
-cglobal shuffle_bytes_2103, 3, 5, 8, src, dst, w, tmp, x
-    mova   m6, [pb_mask_shuffle2103_mmx]
-    mova   m7, m6
-    psllq  m7, 8
+
+%macro SHUFFLE2103_SSE2 0
+    pshuflw   m1, m0, 0xb1
+    pshufhw   m1, m1, 0xb1
+
+    pand     m0, m3
+    pand     m1, m2
+%endmacro
+
+%macro SHUFFLE0321_SSE2 0
+    pshuflw   m1, m0, 0xb1
+    pshufhw   m1, m1, 0xb1
+
+    pand     m0, m2
+    pand     m1, m3
+%endmacro
+
+%macro SHUFFLE1230_SSE2 0
+    pslld    m1, m0, 24
+    psrld    m0, 8
+%endmacro
+
+%macro SHUFFLE3012_SSE2 0
+    pslld    m1, m0, 8
+    psrld    m0, 24
+%endmacro
+
+%macro SHUFFLE3210_SSE2 0
+    pshuflw   m1, m0, 0xb1
+    pshufhw   m1, m1, 0xb1
+
+    psrlw     m0, m1, 8
+    psllw     m1, 8
+%endmacro
+
+; %1-4 index shuffle
+; %5 load mask
+%macro SHUFFLE_BYTES_SSE2 5
+cglobal shuffle_bytes_%1%2%3%4, 3, 5, 4, src, dst, w, tmp, x
+%if %5
+    pcmpeqw        m2, m2
+    psllw          m3, m2, 8 ; (word) { 0xff00 } x8
+    psrlw          m2, 8     ; (word) { 0x00ff } x8
+%endif
 
     movsxdifnidn wq, wd
     mov xq, wq
@@ -68,13 +106,13 @@  cglobal shuffle_bytes_2103, 3, 5, 8, src, dst, w, tmp, x
     je .loop_simd
 
 .loop_scalar:
-   mov          tmpb, [srcq + wq + 2]
+   mov          tmpb, [srcq + wq + %1]
    mov [dstq+wq + 0], tmpb
-   mov          tmpb, [srcq + wq + 1]
+   mov          tmpb, [srcq + wq + %2]
    mov [dstq+wq + 1], tmpb
-   mov          tmpb, [srcq + wq + 0]
+   mov          tmpb, [srcq + wq + %3]
    mov [dstq+wq + 2], tmpb
-   mov          tmpb, [srcq + wq + 3]
+   mov          tmpb, [srcq + wq + %4]
    mov [dstq+wq + 3], tmpb
    add            wq, 4
    sub            xq, 4
@@ -86,29 +124,26 @@  jge .end
 
 .loop_simd:
     movu     m0, [srcq+wq]
-    movu     m1, [srcq+wq+8]
-
-    pshufw   m3, m0, 177
-    pshufw   m5, m1, 177
-
-    pand     m0, m7
-    pand     m3, m6
 
-    pand     m1, m7
-    pand     m5, m6
+    SHUFFLE%1%2%3%4_SSE2
 
-    por      m0, m3
-    por      m1, m5
+    por      m0, m1
 
     movu      [dstq+wq], m0
-    movu  [dstq+wq + 8], m1
 
-    add              wq, mmsize*2
+    add              wq, mmsize
     jl .loop_simd
 
 .end:
-    emms
     RET
+%endmacro
+
+INIT_XMM sse2
+SHUFFLE_BYTES_SSE2 2, 1, 0, 3, 1
+SHUFFLE_BYTES_SSE2 0, 3, 2, 1, 1
+SHUFFLE_BYTES_SSE2 1, 2, 3, 0, 0
+SHUFFLE_BYTES_SSE2 3, 0, 1, 2, 0
+SHUFFLE_BYTES_SSE2 3, 2, 1, 0, 0
 
 ;------------------------------------------------------------------------------
 ; shuffle_bytes_## (const uint8_t *src, uint8_t *dst, int src_size)