diff mbox series

[FFmpeg-devel,V7,1/2] libswscale/x86/yuv2rgb: Change inline assembly into nasm code

Message ID 20200109173816.30897-1-ting.fu@intel.com
State Superseded
Headers show
Series [FFmpeg-devel,V7,1/2] libswscale/x86/yuv2rgb: Change inline assembly into nasm code
Related show

Checks

Context Check Description
andriy/ffmpeg-patchwork pending
andriy/ffmpeg-patchwork success Applied patch
andriy/ffmpeg-patchwork success Configure finished
andriy/ffmpeg-patchwork success Make finished
andriy/ffmpeg-patchwork success Make fate finished

Commit Message

Ting Fu Jan. 9, 2020, 5:38 p.m. UTC
Signed-off-by: Ting Fu <ting.fu@intel.com>
---
V7:
    Fix compile issue when the user configures with --disable-mmx.
    Fix issue when running ./ffmpeg with --cpuflags mmx/ssse3.
    Adjust the SIMD verification logic in libswscale/x86/yuv2rgb.c

 libswscale/x86/Makefile           |   1 +
 libswscale/x86/swscale.c          |  16 +-
 libswscale/x86/yuv2rgb.c          |  66 ++---
 libswscale/x86/yuv2rgb_template.c | 467 ++++++------------------------
 libswscale/x86/yuv_2_rgb.asm      | 270 +++++++++++++++++
 5 files changed, 405 insertions(+), 415 deletions(-)
 create mode 100644 libswscale/x86/yuv_2_rgb.asm

Comments

Ting Fu Jan. 9, 2020, 5:57 p.m. UTC | #1
> -----Original Message-----
> From: ffmpeg-devel <ffmpeg-devel-bounces@ffmpeg.org> On Behalf Of Ting
> Fu
> Sent: Friday, January 10, 2020 01:38 AM
> To: ffmpeg-devel@ffmpeg.org
> Subject: [FFmpeg-devel] [PATCH V7 1/2] libswscale/x86/yuv2rgb: Change
> inline assembly into nasm code
> 
> Signed-off-by: Ting Fu <ting.fu@intel.com>
> ---
> V7:
>     Fix compile issue when user configure with --disable-mmx.
>     Fix issue when running ./ffmpeg with --cpuflags mmx/ssse3.
>     Adjust the SIMD verify logic in libswscale/x86/yuv2rgb.c

To give more detail: previously I used an 'if' clause to select the color format in libswscale/x86/yuv2rgb.c and then an '#if' macro to select the SIMD variant in libswscale/x86/yuv2rgb_template.c. That approach could not respond correctly when ./ffmpeg is run with --cpuflags, because the SIMD selection no longer used the value of av_get_cpu_flags(). So I abandoned the macro and now select both the color format and the SIMD variant in libswscale/x86/yuv2rgb.c.

Thank you,
Ting Fu
> 
>  libswscale/x86/Makefile           |   1 +
>  libswscale/x86/swscale.c          |  16 +-
>  libswscale/x86/yuv2rgb.c          |  66 ++---
>  libswscale/x86/yuv2rgb_template.c | 467 ++++++------------------------
>  libswscale/x86/yuv_2_rgb.asm      | 270 +++++++++++++++++
>  5 files changed, 405 insertions(+), 415 deletions(-)  create mode 100644
> libswscale/x86/yuv_2_rgb.asm
> 
> diff --git a/libswscale/x86/Makefile b/libswscale/x86/Makefile index
> f317d5dd9b..831d5359aa 100644
> --- a/libswscale/x86/Makefile
> +++ b/libswscale/x86/Makefile
> @@ -12,3 +12,4 @@ X86ASM-OBJS                     += x86/input.o                          \
>                                     x86/output.o                         \
>                                     x86/scale.o                          \
>                                     x86/rgb_2_rgb.o                      \
> +                                   x86/yuv_2_rgb.o                      \
> diff --git a/libswscale/x86/swscale.c b/libswscale/x86/swscale.c index
> 0eed4f18d5..e9d474a1e8 100644
> --- a/libswscale/x86/swscale.c
> +++ b/libswscale/x86/swscale.c
> @@ -29,6 +29,14 @@
>  #include "libavutil/cpu.h"
>  #include "libavutil/pixdesc.h"
> 
> +const DECLARE_ALIGNED(8, uint64_t, ff_dither4)[2] = {
> +    0x0103010301030103LL,
> +    0x0200020002000200LL,};
> +
> +const DECLARE_ALIGNED(8, uint64_t, ff_dither8)[2] = {
> +    0x0602060206020602LL,
> +    0x0004000400040004LL,};
> +
>  #if HAVE_INLINE_ASM
> 
>  #define DITHER1XBPP
> @@ -38,14 +46,6 @@ DECLARE_ASM_CONST(8, uint64_t, bFC)=
> 0xFCFCFCFCFCFCFCFCLL;
>  DECLARE_ASM_CONST(8, uint64_t, w10)=       0x0010001000100010LL;
>  DECLARE_ASM_CONST(8, uint64_t, w02)=       0x0002000200020002LL;
> 
> -const DECLARE_ALIGNED(8, uint64_t, ff_dither4)[2] = {
> -    0x0103010301030103LL,
> -    0x0200020002000200LL,};
> -
> -const DECLARE_ALIGNED(8, uint64_t, ff_dither8)[2] = {
> -    0x0602060206020602LL,
> -    0x0004000400040004LL,};
> -
>  DECLARE_ASM_CONST(8, uint64_t, b16Mask)=   0x001F001F001F001FLL;
>  DECLARE_ASM_CONST(8, uint64_t, g16Mask)=   0x07E007E007E007E0LL;
>  DECLARE_ASM_CONST(8, uint64_t, r16Mask)=   0xF800F800F800F800LL;
> diff --git a/libswscale/x86/yuv2rgb.c b/libswscale/x86/yuv2rgb.c index
> 5e2f77c20f..dd813d4deb 100644
> --- a/libswscale/x86/yuv2rgb.c
> +++ b/libswscale/x86/yuv2rgb.c
> @@ -37,7 +37,7 @@
>  #include "libavutil/x86/cpu.h"
>  #include "libavutil/cpu.h"
> 
> -#if HAVE_INLINE_ASM
> +#if HAVE_X86ASM
> 
>  #define DITHER1XBPP // only for MMX
> 
> @@ -50,32 +50,31 @@ DECLARE_ASM_CONST(8, uint64_t, pb_03) =
> 0x0303030303030303ULL;  DECLARE_ASM_CONST(8, uint64_t, pb_07) =
> 0x0707070707070707ULL;
> 
>  //MMX versions
> -#if HAVE_MMX_INLINE && HAVE_6REGS
> +#if HAVE_MMX
>  #undef RENAME
>  #undef COMPILE_TEMPLATE_MMXEXT
>  #define COMPILE_TEMPLATE_MMXEXT 0
>  #define RENAME(a) a ## _mmx
>  #include "yuv2rgb_template.c"
> -#endif /* HAVE_MMX_INLINE && HAVE_6REGS */
> +#endif /* HAVE_MMX */
> 
>  // MMXEXT versions
> -#if HAVE_MMXEXT_INLINE && HAVE_6REGS
> +#if HAVE_MMXEXT
>  #undef RENAME
>  #undef COMPILE_TEMPLATE_MMXEXT
>  #define COMPILE_TEMPLATE_MMXEXT 1
>  #define RENAME(a) a ## _mmxext
>  #include "yuv2rgb_template.c"
> -#endif /* HAVE_MMXEXT_INLINE && HAVE_6REGS */
> +#endif /* HAVE_MMXEXT */
> 
> -#endif /* HAVE_INLINE_ASM */
> +#endif /* HAVE_X86ASM */
> 
>  av_cold SwsFunc ff_yuv2rgb_init_x86(SwsContext *c)  { -#if
> HAVE_MMX_INLINE && HAVE_6REGS
> +#if HAVE_X86ASM
>      int cpu_flags = av_get_cpu_flags();
> 
> -#if HAVE_MMXEXT_INLINE
> -    if (INLINE_MMXEXT(cpu_flags)) {
> +    if (EXTERNAL_MMXEXT(cpu_flags)) {
>          switch (c->dstFormat) {
>          case AV_PIX_FMT_RGB24:
>              return yuv420_rgb24_mmxext; @@ -83,37 +82,36 @@ av_cold
> SwsFunc ff_yuv2rgb_init_x86(SwsContext *c)
>              return yuv420_bgr24_mmxext;
>          }
>      }
> -#endif
> 
> -    if (INLINE_MMX(cpu_flags)) {
> +    if (EXTERNAL_MMX(cpu_flags)) {
>          switch (c->dstFormat) {
> -            case AV_PIX_FMT_RGB32:
> -                if (c->srcFormat == AV_PIX_FMT_YUVA420P) {
> -#if HAVE_7REGS && CONFIG_SWSCALE_ALPHA
> -                    return yuva420_rgb32_mmx;
> +        case AV_PIX_FMT_RGB32:
> +            if (c->srcFormat == AV_PIX_FMT_YUVA420P) { #if
> +CONFIG_SWSCALE_ALPHA
> +                return yuva420_rgb32_mmx;
>  #endif
> -                    break;
> -                } else
> -                    return yuv420_rgb32_mmx;
> -            case AV_PIX_FMT_BGR32:
> -                if (c->srcFormat == AV_PIX_FMT_YUVA420P) {
> -#if HAVE_7REGS && CONFIG_SWSCALE_ALPHA
> -                    return yuva420_bgr32_mmx;
> +                break;
> +            } else
> +                return yuv420_rgb32_mmx;
> +        case AV_PIX_FMT_BGR32:
> +            if (c->srcFormat == AV_PIX_FMT_YUVA420P) { #if
> +CONFIG_SWSCALE_ALPHA
> +                return yuva420_bgr32_mmx;
>  #endif
> -                    break;
> -                } else
> -                    return yuv420_bgr32_mmx;
> -            case AV_PIX_FMT_RGB24:
> -                return yuv420_rgb24_mmx;
> -            case AV_PIX_FMT_BGR24:
> -                return yuv420_bgr24_mmx;
> -            case AV_PIX_FMT_RGB565:
> -                return yuv420_rgb16_mmx;
> -            case AV_PIX_FMT_RGB555:
> -                return yuv420_rgb15_mmx;
> +                break;
> +            } else
> +                return yuv420_bgr32_mmx;
> +        case AV_PIX_FMT_RGB24:
> +            return yuv420_rgb24_mmx;
> +        case AV_PIX_FMT_BGR24:
> +            return yuv420_bgr24_mmx;
> +        case AV_PIX_FMT_RGB565:
> +            return yuv420_rgb16_mmx;
> +        case AV_PIX_FMT_RGB555:
> +            return yuv420_rgb15_mmx;
>          }
>      }
> -#endif /* HAVE_MMX_INLINE  && HAVE_6REGS */
> 
> +#endif /* HAVE_X86ASM */
>      return NULL;
>  }
> diff --git a/libswscale/x86/yuv2rgb_template.c
> b/libswscale/x86/yuv2rgb_template.c
> index acb78f520e..554750f3e1 100644
> --- a/libswscale/x86/yuv2rgb_template.c
> +++ b/libswscale/x86/yuv2rgb_template.c
> @@ -26,31 +26,13 @@
>  #include "libavutil/x86/asm.h"
>  #include "libswscale/swscale_internal.h"
> 
> -#undef MOVNTQ
> -#undef EMMS
> -#undef SFENCE
> -
> -#if COMPILE_TEMPLATE_MMXEXT
> -#define MOVNTQ "movntq"
> -#define SFENCE "sfence"
> -#else
> -#define MOVNTQ "movq"
> -#define SFENCE " # nop"
> -#endif
> -
> -#define REG_BLUE  "0"
> -#define REG_RED   "1"
> -#define REG_GREEN "2"
> -#define REG_ALPHA "3"
> -
>  #define YUV2RGB_LOOP(depth)                                          \
>      h_size = (c->dstW + 7) & ~7;                                     \
>      if (h_size * depth > FFABS(dstStride[0]))                        \
>          h_size -= 8;                                                 \
>                                                                       \
> -    vshift = c->srcFormat != AV_PIX_FMT_YUV422P;                        \
> +    vshift = c->srcFormat != AV_PIX_FMT_YUV422P;                     \
>                                                                       \
> -    __asm__ volatile ("pxor %mm4, %mm4\n\t");                        \
>      for (y = 0; y < srcSliceH; y++) {                                \
>          uint8_t *image    = dst[0] + (y + srcSliceY) * dstStride[0]; \
>          const uint8_t *py = src[0] +               y * srcStride[0]; \
> @@ -58,410 +40,149 @@
>          const uint8_t *pv = src[2] +   (y >> vshift) * srcStride[2]; \
>          x86_reg index = -h_size / 2;                                 \
> 
> -#define YUV2RGB_INITIAL_LOAD          \
> -    __asm__ volatile (                \
> -        "movq (%5, %0, 2), %%mm6\n\t" \
> -        "movd    (%2, %0), %%mm0\n\t" \
> -        "movd    (%3, %0), %%mm1\n\t" \
> -        "1: \n\t"                     \
> -
> -/* YUV2RGB core
> - * Conversion is performed in usual way:
> - * R = Y' * Ycoef + Vred * V'
> - * G = Y' * Ycoef + Vgreen * V' + Ugreen * U'
> - * B = Y' * Ycoef               + Ublue * U'
> - *
> - * where X' = X * 8 - Xoffset (multiplication is performed to increase
> - * precision a bit).
> - * Since it operates in YUV420 colorspace, Y component is additionally
> - * split into Y1 and Y2 for even and odd pixels.
> - *
> - * Input:
> - * mm0 - U (4 elems), mm1 - V (4 elems), mm6 - Y (8 elems), mm4 - zero
> register
> - * Output:
> - * mm1 - R, mm2 - G, mm0 - B
> - */
> -#define YUV2RGB                                  \
> -    /* convert Y, U, V into Y1', Y2', U', V' */  \
> -    "movq      %%mm6, %%mm7\n\t"                 \
> -    "punpcklbw %%mm4, %%mm0\n\t"                 \
> -    "punpcklbw %%mm4, %%mm1\n\t"                 \
> -    "pand     "MANGLE(mmx_00ffw)", %%mm6\n\t"    \
> -    "psrlw     $8,    %%mm7\n\t"                 \
> -    "psllw     $3,    %%mm0\n\t"                 \
> -    "psllw     $3,    %%mm1\n\t"                 \
> -    "psllw     $3,    %%mm6\n\t"                 \
> -    "psllw     $3,    %%mm7\n\t"                 \
> -    "psubsw   "U_OFFSET"(%4), %%mm0\n\t"         \
> -    "psubsw   "V_OFFSET"(%4), %%mm1\n\t"         \
> -    "psubw    "Y_OFFSET"(%4), %%mm6\n\t"         \
> -    "psubw    "Y_OFFSET"(%4), %%mm7\n\t"         \
> -\
> -     /* multiply by coefficients */              \
> -    "movq      %%mm0, %%mm2\n\t"                 \
> -    "movq      %%mm1, %%mm3\n\t"                 \
> -    "pmulhw   "UG_COEFF"(%4), %%mm2\n\t"         \
> -    "pmulhw   "VG_COEFF"(%4), %%mm3\n\t"         \
> -    "pmulhw   "Y_COEFF" (%4), %%mm6\n\t"         \
> -    "pmulhw   "Y_COEFF" (%4), %%mm7\n\t"         \
> -    "pmulhw   "UB_COEFF"(%4), %%mm0\n\t"         \
> -    "pmulhw   "VR_COEFF"(%4), %%mm1\n\t"         \
> -    "paddsw    %%mm3, %%mm2\n\t"                 \
> -    /* now: mm0 = UB, mm1 = VR, mm2 = CG */      \
> -    /*      mm6 = Y1, mm7 = Y2 */                \
> -\
> -    /* produce RGB */                            \
> -    "movq      %%mm7, %%mm3\n\t"                 \
> -    "movq      %%mm7, %%mm5\n\t"                 \
> -    "paddsw    %%mm0, %%mm3\n\t"                 \
> -    "paddsw    %%mm1, %%mm5\n\t"                 \
> -    "paddsw    %%mm2, %%mm7\n\t"                 \
> -    "paddsw    %%mm6, %%mm0\n\t"                 \
> -    "paddsw    %%mm6, %%mm1\n\t"                 \
> -    "paddsw    %%mm6, %%mm2\n\t"                 \
> -
> -#define RGB_PACK_INTERLEAVE                  \
> -    /* pack and interleave even/odd pixels */    \
> -    "packuswb  %%mm1, %%mm0\n\t"                 \
> -    "packuswb  %%mm5, %%mm3\n\t"                 \
> -    "packuswb  %%mm2, %%mm2\n\t"                 \
> -    "movq      %%mm0, %%mm1\n\n"                 \
> -    "packuswb  %%mm7, %%mm7\n\t"                 \
> -    "punpcklbw %%mm3, %%mm0\n\t"                 \
> -    "punpckhbw %%mm3, %%mm1\n\t"                 \
> -    "punpcklbw %%mm7, %%mm2\n\t"                 \
> -
> -#define YUV2RGB_ENDLOOP(depth)                   \
> -    "movq 8 (%5, %0, 2), %%mm6\n\t"              \
> -    "movd 4 (%3, %0),    %%mm1\n\t"              \
> -    "movd 4 (%2, %0),    %%mm0\n\t"              \
> -    "add $"AV_STRINGIFY(depth * 8)", %1\n\t"     \
> -    "add  $4, %0\n\t"                            \
> -    "js   1b\n\t"                                \
> -
> -#if COMPILE_TEMPLATE_MMXEXT
> -#undef RGB_PACK24_B_OPERANDS
> -#define RGB_PACK24_B_OPERANDS
> NAMED_CONSTRAINTS_ARRAY_ADD(mask1101,mask0110,mask0100,mask00
> 10,mask1001)
> -#else
> -#undef RGB_PACK24_B_OPERANDS
> -#define RGB_PACK24_B_OPERANDS
> -#endif
> -
> -#define YUV2RGB_OPERANDS                                          \
> -        : "+r" (index), "+r" (image)                              \
> -        : "r" (pu - index), "r" (pv - index), "r"(&c->redDither), \
> -          "r" (py - 2*index)                                      \
> -
> NAMED_CONSTRAINTS_ADD(mmx_00ffw,pb_03,pb_07,mmx_redmask,pb_e
> 0) \
> -          RGB_PACK24_B_OPERANDS                                   \
> -        : "memory"                                                \
> -        );                                                        \
> -    }                                                             \
> -
> -#define YUV2RGB_OPERANDS_ALPHA                                    \
> -        : "+r" (index), "+r" (image)                              \
> -        : "r" (pu - index), "r" (pv - index), "r"(&c->redDither), \
> -          "r" (py - 2*index), "r" (pa - 2*index)                  \
> -          NAMED_CONSTRAINTS_ADD(mmx_00ffw)                        \
> -        : "memory"                                                \
> -        );                                                        \
> -    }                                                             \
> -
> -#define YUV2RGB_ENDFUNC                          \
> -    __asm__ volatile (SFENCE"\n\t"               \
> -                    "emms    \n\t");             \
> -    return srcSliceH;                            \
> -
> -#define IF0(x)
> -#define IF1(x) x
> -
> -#define RGB_PACK16(gmask, is15)                  \
> -    "pand      "MANGLE(mmx_redmask)", %%mm0\n\t" \
> -    "pand      "MANGLE(mmx_redmask)", %%mm1\n\t" \
> -    "movq      %%mm2,     %%mm3\n\t"             \
> -    "psllw   $"AV_STRINGIFY(3-is15)", %%mm2\n\t" \
> -    "psrlw   $"AV_STRINGIFY(5+is15)", %%mm3\n\t" \
> -    "psrlw     $3,        %%mm0\n\t"             \
> -    IF##is15("psrlw  $1,  %%mm1\n\t")            \
> -    "pand "MANGLE(pb_e0)", %%mm2\n\t"            \
> -    "pand "MANGLE(gmask)", %%mm3\n\t"            \
> -    "por       %%mm2,     %%mm0\n\t"             \
> -    "por       %%mm3,     %%mm1\n\t"             \
> -    "movq      %%mm0,     %%mm2\n\t"             \
> -    "punpcklbw %%mm1,     %%mm0\n\t"             \
> -    "punpckhbw %%mm1,     %%mm2\n\t"             \
> -    MOVNTQ "   %%mm0,      (%1)\n\t"             \
> -    MOVNTQ "   %%mm2,     8(%1)\n\t"             \
> -
> -#define DITHER_RGB                               \
> -    "paddusb "BLUE_DITHER"(%4),  %%mm0\n\t"      \
> -    "paddusb "GREEN_DITHER"(%4), %%mm2\n\t"      \
> -    "paddusb "RED_DITHER"(%4),   %%mm1\n\t"      \
> +extern void RENAME(ff_yuv_420_rgb24)(x86_reg index, uint8_t *image,
> const uint8_t *pu_index,
> +                                     const uint8_t *pv_index, const uint64_t
> *pointer_c_dither,
> +                                     const uint8_t *py_2index); extern
> +void RENAME(ff_yuv_420_bgr24)(x86_reg index, uint8_t *image, const
> uint8_t *pu_index,
> +                                     const uint8_t *pv_index, const uint64_t
> *pointer_c_dither,
> +                                     const uint8_t *py_2index);
> 
>  #if !COMPILE_TEMPLATE_MMXEXT
> -static inline int RENAME(yuv420_rgb15)(SwsContext *c, const uint8_t *src[],
> -                                       int srcStride[],
> -                                       int srcSliceY, int srcSliceH,
> -                                       uint8_t *dst[], int dstStride[])
> +extern void RENAME(ff_yuv_420_rgb15)(x86_reg index, uint8_t *image,
> const uint8_t *pu_index,
> +                                     const uint8_t *pv_index, const uint64_t
> *pointer_c_dither,
> +                                     const uint8_t *py_2index); extern
> +void RENAME(ff_yuv_420_rgb16)(x86_reg index, uint8_t *image, const
> uint8_t *pu_index,
> +                                     const uint8_t *pv_index, const uint64_t
> *pointer_c_dither,
> +                                     const uint8_t *py_2index); extern
> +void RENAME(ff_yuv_420_rgb32)(x86_reg index, uint8_t *image, const
> uint8_t *pu_index,
> +                                     const uint8_t *pv_index, const uint64_t
> *pointer_c_dither,
> +                                     const uint8_t *py_2index); extern
> +void RENAME(ff_yuv_420_bgr32)(x86_reg index, uint8_t *image, const
> uint8_t *pu_index,
> +                                     const uint8_t *pv_index, const uint64_t
> *pointer_c_dither,
> +                                     const uint8_t *py_2index); extern
> +void RENAME(ff_yuva_420_rgb32)(x86_reg index, uint8_t *image, const
> uint8_t *pu_index,
> +                                      const uint8_t *pv_index, const uint64_t
> *pointer_c_dither,
> +                                      const uint8_t *py_2index, const
> +uint8_t *pa_2index); extern void RENAME(ff_yuva_420_bgr32)(x86_reg
> index, uint8_t *image, const uint8_t *pu_index,
> +                                      const uint8_t *pv_index, const uint64_t
> *pointer_c_dither,
> +                                      const uint8_t *py_2index, const
> +uint8_t *pa_2index);
> +
> +static int RENAME(yuv420_rgb15)(SwsContext *c, const uint8_t *src[],
> +                                               int srcStride[],
> +                                               int srcSliceY, int srcSliceH,
> +                                               uint8_t *dst[], int
> +dstStride[])
>  {
>      int y, h_size, vshift;
> -
>      YUV2RGB_LOOP(2)
> 
>  #ifdef DITHER1XBPP
> -        c->blueDither  = ff_dither8[y       & 1];
> -        c->greenDither = ff_dither8[y       & 1];
> -        c->redDither   = ff_dither8[(y + 1) & 1];
> -#endif
> -
> -        YUV2RGB_INITIAL_LOAD
> -        YUV2RGB
> -        RGB_PACK_INTERLEAVE
> -#ifdef DITHER1XBPP
> -        DITHER_RGB
> +    c->blueDither  = ff_dither8[y       & 1];
> +    c->greenDither = ff_dither8[y       & 1];
> +    c->redDither   = ff_dither8[(y + 1) & 1];
>  #endif
> -        RGB_PACK16(pb_03, 1)
> 
> -    YUV2RGB_ENDLOOP(2)
> -    YUV2RGB_OPERANDS
> -    YUV2RGB_ENDFUNC
> +    RENAME(ff_yuv_420_rgb15)(index, image, pu - index, pv - index, &(c-
> >redDither), py - 2 * index);
> +    }
> +    return srcSliceH;
>  }
> 
> -static inline int RENAME(yuv420_rgb16)(SwsContext *c, const uint8_t *src[],
> -                                       int srcStride[],
> -                                       int srcSliceY, int srcSliceH,
> -                                       uint8_t *dst[], int dstStride[])
> +static int RENAME(yuv420_rgb16)(SwsContext *c, const uint8_t *src[],
> +                                               int srcStride[],
> +                                               int srcSliceY, int srcSliceH,
> +                                               uint8_t *dst[], int
> +dstStride[])
>  {
>      int y, h_size, vshift;
> -
>      YUV2RGB_LOOP(2)
> 
>  #ifdef DITHER1XBPP
> -        c->blueDither  = ff_dither8[y       & 1];
> -        c->greenDither = ff_dither4[y       & 1];
> -        c->redDither   = ff_dither8[(y + 1) & 1];
> -#endif
> -
> -        YUV2RGB_INITIAL_LOAD
> -        YUV2RGB
> -        RGB_PACK_INTERLEAVE
> -#ifdef DITHER1XBPP
> -        DITHER_RGB
> +    c->blueDither  = ff_dither8[y       & 1];
> +    c->greenDither = ff_dither4[y       & 1];
> +    c->redDither   = ff_dither8[(y + 1) & 1];
>  #endif
> -        RGB_PACK16(pb_07, 0)
> 
> -    YUV2RGB_ENDLOOP(2)
> -    YUV2RGB_OPERANDS
> -    YUV2RGB_ENDFUNC
> +    RENAME(ff_yuv_420_rgb16)(index, image, pu - index, pv - index, &(c-
> >redDither), py - 2 * index);
> +    }
> +    return srcSliceH;
>  }
> -#endif /* !COMPILE_TEMPLATE_MMXEXT */
> -
> -#define RGB_PACK24(blue, red)\
> -    "packuswb  %%mm3,      %%mm0 \n" /* R0 R2 R4 R6 R1 R3 R5 R7 */\
> -    "packuswb  %%mm5,      %%mm1 \n" /* B0 B2 B4 B6 B1 B3 B5 B7 */\
> -    "packuswb  %%mm7,      %%mm2 \n" /* G0 G2 G4 G6 G1 G3 G5 G7 */\
> -    "movq      %%mm"red",  %%mm3 \n"\
> -    "movq      %%mm"blue", %%mm6 \n"\
> -    "psrlq     $32,        %%mm"red" \n" /* R1 R3 R5 R7 */\
> -    "punpcklbw %%mm2,      %%mm3 \n" /* R0 G0 R2 G2 R4 G4 R6 G6 */\
> -    "punpcklbw %%mm"red",  %%mm6 \n" /* B0 R1 B2 R3 B4 R5 B6 R7 */\
> -    "movq      %%mm3,      %%mm5 \n"\
> -    "punpckhbw %%mm"blue", %%mm2 \n" /* G1 B1 G3 B3 G5 B5 G7 B7 */\
> -    "punpcklwd %%mm6,      %%mm3 \n" /* R0 G0 B0 R1 R2 G2 B2 R3 */\
> -    "punpckhwd %%mm6,      %%mm5 \n" /* R4 G4 B4 R5 R6 G6 B6 R7 */\
> -    RGB_PACK24_B
> -
> -#if COMPILE_TEMPLATE_MMXEXT
> -DECLARE_ASM_CONST(8, int16_t, mask1101[4]) = {-1,-1, 0,-1}; -
> DECLARE_ASM_CONST(8, int16_t, mask0010[4]) = { 0, 0,-1, 0}; -
> DECLARE_ASM_CONST(8, int16_t, mask0110[4]) = { 0,-1,-1, 0}; -
> DECLARE_ASM_CONST(8, int16_t, mask1001[4]) = {-1, 0, 0,-1}; -
> DECLARE_ASM_CONST(8, int16_t, mask0100[4]) = { 0,-1, 0, 0}; -#undef
> RGB_PACK24_B -#define RGB_PACK24_B\
> -    "pshufw    $0xc6,  %%mm2, %%mm1 \n"\
> -    "pshufw    $0x84,  %%mm3, %%mm6 \n"\
> -    "pshufw    $0x38,  %%mm5, %%mm7 \n"\
> -    "pand "MANGLE(mask1101)", %%mm6 \n" /* R0 G0 B0 R1 -- -- R2 G2 */\
> -    "movq      %%mm1,         %%mm0 \n"\
> -    "pand "MANGLE(mask0110)", %%mm7 \n" /* -- -- R6 G6 B6 R7 -- -- */\
> -    "movq      %%mm1,         %%mm2 \n"\
> -    "pand "MANGLE(mask0100)", %%mm1 \n" /* -- -- G3 B3 -- -- -- -- */\
> -    "psrlq       $48,         %%mm3 \n" /* B2 R3 -- -- -- -- -- -- */\
> -    "pand "MANGLE(mask0010)", %%mm0 \n" /* -- -- -- -- G1 B1 -- -- */\
> -    "psllq       $32,         %%mm5 \n" /* -- -- -- -- R4 G4 B4 R5 */\
> -    "pand "MANGLE(mask1001)", %%mm2 \n" /* G5 B5 -- -- -- -- G7 B7 */\
> -    "por       %%mm3,         %%mm1 \n"\
> -    "por       %%mm6,         %%mm0 \n"\
> -    "por       %%mm5,         %%mm1 \n"\
> -    "por       %%mm7,         %%mm2 \n"\
> -    MOVNTQ"    %%mm0,          (%1) \n"\
> -    MOVNTQ"    %%mm1,         8(%1) \n"\
> -    MOVNTQ"    %%mm2,        16(%1) \n"\
> -
> -#else
> -#undef RGB_PACK24_B
> -#define RGB_PACK24_B\
> -    "movd      %%mm3,       (%1) \n" /* R0 G0 B0 R1 */\
> -    "movd      %%mm2,      4(%1) \n" /* G1 B1 */\
> -    "psrlq     $32,        %%mm3 \n"\
> -    "psrlq     $16,        %%mm2 \n"\
> -    "movd      %%mm3,      6(%1) \n" /* R2 G2 B2 R3 */\
> -    "movd      %%mm2,     10(%1) \n" /* G3 B3 */\
> -    "psrlq     $16,        %%mm2 \n"\
> -    "movd      %%mm5,     12(%1) \n" /* R4 G4 B4 R5 */\
> -    "movd      %%mm2,     16(%1) \n" /* G5 B5 */\
> -    "psrlq     $32,        %%mm5 \n"\
> -    "movd      %%mm2,     20(%1) \n" /* -- -- G7 B7 */\
> -    "movd      %%mm5,     18(%1) \n" /* R6 G6 B6 R7 */\
> 
> -#endif
> -
> -static inline int RENAME(yuv420_rgb24)(SwsContext *c, const uint8_t *src[],
> -                                       int srcStride[],
> -                                       int srcSliceY, int srcSliceH,
> -                                       uint8_t *dst[], int dstStride[])
> +static int RENAME(yuv420_rgb32)(SwsContext *c, const uint8_t *src[],
> +                                               int srcStride[],
> +                                               int srcSliceY, int srcSliceH,
> +                                               uint8_t *dst[], int
> +dstStride[])
>  {
>      int y, h_size, vshift;
> +    YUV2RGB_LOOP(4)
> 
> -    YUV2RGB_LOOP(3)
> -
> -        YUV2RGB_INITIAL_LOAD
> -        YUV2RGB
> -        RGB_PACK24(REG_BLUE, REG_RED)
> -
> -    YUV2RGB_ENDLOOP(3)
> -    YUV2RGB_OPERANDS
> -    YUV2RGB_ENDFUNC
> +    RENAME(ff_yuv_420_rgb32)(index, image, pu - index, pv - index, &(c-
> >redDither), py - 2 * index);
> +    }
> +    return srcSliceH;
>  }
> 
> -static inline int RENAME(yuv420_bgr24)(SwsContext *c, const uint8_t *src[],
> -                                       int srcStride[],
> -                                       int srcSliceY, int srcSliceH,
> -                                       uint8_t *dst[], int dstStride[])
> +static int RENAME(yuv420_bgr32)(SwsContext *c, const uint8_t *src[],
> +                                               int srcStride[],
> +                                               int srcSliceY, int srcSliceH,
> +                                               uint8_t *dst[], int
> +dstStride[])
>  {
>      int y, h_size, vshift;
> +    YUV2RGB_LOOP(4)
> 
> -    YUV2RGB_LOOP(3)
> -
> -        YUV2RGB_INITIAL_LOAD
> -        YUV2RGB
> -        RGB_PACK24(REG_RED, REG_BLUE)
> -
> -    YUV2RGB_ENDLOOP(3)
> -    YUV2RGB_OPERANDS
> -    YUV2RGB_ENDFUNC
> +    RENAME(ff_yuv_420_bgr32)(index, image, pu - index, pv - index, &(c-
> >redDither), py - 2 * index);
> +    }
> +    return srcSliceH;
>  }
> 
> -
> -#define SET_EMPTY_ALPHA                                                      \
> -    "pcmpeqd   %%mm"REG_ALPHA", %%mm"REG_ALPHA"\n\t" /* set alpha
> to 0xFF */ \
> -
> -#define LOAD_ALPHA                                   \
> -    "movq      (%6, %0, 2),     %%mm"REG_ALPHA"\n\t" \
> -
> -#define RGB_PACK32(red, green, blue, alpha)  \
> -    "movq      %%mm"blue",  %%mm5\n\t"       \
> -    "movq      %%mm"red",   %%mm6\n\t"       \
> -    "punpckhbw %%mm"green", %%mm5\n\t"       \
> -    "punpcklbw %%mm"green", %%mm"blue"\n\t"  \
> -    "punpckhbw %%mm"alpha", %%mm6\n\t"       \
> -    "punpcklbw %%mm"alpha", %%mm"red"\n\t"   \
> -    "movq      %%mm"blue",  %%mm"green"\n\t" \
> -    "movq      %%mm5,       %%mm"alpha"\n\t" \
> -    "punpcklwd %%mm"red",   %%mm"blue"\n\t"  \
> -    "punpckhwd %%mm"red",   %%mm"green"\n\t" \
> -    "punpcklwd %%mm6,       %%mm5\n\t"       \
> -    "punpckhwd %%mm6,       %%mm"alpha"\n\t" \
> -    MOVNTQ "   %%mm"blue",   0(%1)\n\t"      \
> -    MOVNTQ "   %%mm"green",  8(%1)\n\t"      \
> -    MOVNTQ "   %%mm5,       16(%1)\n\t"      \
> -    MOVNTQ "   %%mm"alpha", 24(%1)\n\t"      \
> -
> -#if !COMPILE_TEMPLATE_MMXEXT
> -static inline int RENAME(yuv420_rgb32)(SwsContext *c, const uint8_t *src[],
> -                                       int srcStride[],
> -                                       int srcSliceY, int srcSliceH,
> -                                       uint8_t *dst[], int dstStride[])
> +static int RENAME(yuva420_rgb32)(SwsContext *c, const uint8_t *src[],
> +                                                int srcStride[],
> +                                                int srcSliceY, int srcSliceH,
> +                                                uint8_t *dst[], int
> +dstStride[])
>  {
>      int y, h_size, vshift;
> -
>      YUV2RGB_LOOP(4)
> 
> -        YUV2RGB_INITIAL_LOAD
> -        YUV2RGB
> -        RGB_PACK_INTERLEAVE
> -        SET_EMPTY_ALPHA
> -        RGB_PACK32(REG_RED, REG_GREEN, REG_BLUE, REG_ALPHA)
> -
> -    YUV2RGB_ENDLOOP(4)
> -    YUV2RGB_OPERANDS
> -    YUV2RGB_ENDFUNC
> +    const uint8_t *pa = src[3] + y * srcStride[3];
> +    RENAME(ff_yuva_420_rgb32)(index, image, pu - index, pv - index, &(c-
> >redDither), py - 2 * index, pa - 2 * index);
> +    }
> +    return srcSliceH;
>  }
> 
> -#if HAVE_7REGS && CONFIG_SWSCALE_ALPHA
> -static inline int RENAME(yuva420_rgb32)(SwsContext *c, const uint8_t *src[],
> -                                        int srcStride[],
> -                                        int srcSliceY, int srcSliceH,
> -                                        uint8_t *dst[], int dstStride[])
> +static int RENAME(yuva420_bgr32)(SwsContext *c, const uint8_t *src[],
> +                                                int srcStride[],
> +                                                int srcSliceY, int srcSliceH,
> +                                                uint8_t *dst[], int
> +dstStride[])
>  {
>      int y, h_size, vshift;
> -
>      YUV2RGB_LOOP(4)
> 
> -        const uint8_t *pa = src[3] + y * srcStride[3];
> -        YUV2RGB_INITIAL_LOAD
> -        YUV2RGB
> -        RGB_PACK_INTERLEAVE
> -        LOAD_ALPHA
> -        RGB_PACK32(REG_RED, REG_GREEN, REG_BLUE, REG_ALPHA)
> -
> -    YUV2RGB_ENDLOOP(4)
> -    YUV2RGB_OPERANDS_ALPHA
> -    YUV2RGB_ENDFUNC
> +    const uint8_t *pa = src[3] + y * srcStride[3];
> +    RENAME(ff_yuva_420_bgr32)(index, image, pu - index, pv - index, &(c-
> >redDither), py - 2 * index, pa - 2 * index);
> +    }
> +    return srcSliceH;
>  }
>  #endif
> 
> -static inline int RENAME(yuv420_bgr32)(SwsContext *c, const uint8_t *src[],
> -                                       int srcStride[],
> -                                       int srcSliceY, int srcSliceH,
> -                                       uint8_t *dst[], int dstStride[])
> +static int RENAME(yuv420_rgb24)(SwsContext *c, const uint8_t *src[],
> +                                               int srcStride[],
> +                                               int srcSliceY, int srcSliceH,
> +                                               uint8_t *dst[], int
> +dstStride[])
>  {
>      int y, h_size, vshift;
> +    YUV2RGB_LOOP(3)
> 
> -    YUV2RGB_LOOP(4)
> -
> -        YUV2RGB_INITIAL_LOAD
> -        YUV2RGB
> -        RGB_PACK_INTERLEAVE
> -        SET_EMPTY_ALPHA
> -        RGB_PACK32(REG_BLUE, REG_GREEN, REG_RED, REG_ALPHA)
> -
> -    YUV2RGB_ENDLOOP(4)
> -    YUV2RGB_OPERANDS
> -    YUV2RGB_ENDFUNC
> +    RENAME(ff_yuv_420_rgb24)(index, image, pu - index, pv - index, &(c-
> >redDither), py - 2 * index);
> +    }
> +    return srcSliceH;
>  }
> 
> -#if HAVE_7REGS && CONFIG_SWSCALE_ALPHA
> -static inline int RENAME(yuva420_bgr32)(SwsContext *c, const uint8_t *src[],
> -                                        int srcStride[],
> -                                        int srcSliceY, int srcSliceH,
> -                                        uint8_t *dst[], int dstStride[])
> +static int RENAME(yuv420_bgr24)(SwsContext *c, const uint8_t *src[],
> +                                               int srcStride[],
> +                                               int srcSliceY, int srcSliceH,
> +                                               uint8_t *dst[], int
> +dstStride[])
>  {
>      int y, h_size, vshift;
> +    YUV2RGB_LOOP(3)
> 
> -    YUV2RGB_LOOP(4)
> -
> -        const uint8_t *pa = src[3] + y * srcStride[3];
> -        YUV2RGB_INITIAL_LOAD
> -        YUV2RGB
> -        RGB_PACK_INTERLEAVE
> -        LOAD_ALPHA
> -        RGB_PACK32(REG_BLUE, REG_GREEN, REG_RED, REG_ALPHA)
> -
> -    YUV2RGB_ENDLOOP(4)
> -    YUV2RGB_OPERANDS_ALPHA
> -    YUV2RGB_ENDFUNC
> +    RENAME(ff_yuv_420_bgr24)(index, image, pu - index, pv - index, &(c-
> >redDither), py - 2 * index);
> +    }
> +    return srcSliceH;
>  }
> -#endif
> 
> -#endif /* !COMPILE_TEMPLATE_MMXEXT */
> diff --git a/libswscale/x86/yuv_2_rgb.asm b/libswscale/x86/yuv_2_rgb.asm
> new file mode 100644 index 0000000000..a44ab1607b
> --- /dev/null
> +++ b/libswscale/x86/yuv_2_rgb.asm
> @@ -0,0 +1,270 @@
> +;**************************************************************
> ********
> +********
> +;* software YUV to RGB converter
> +;*
> +;* Copyright (C) 2001-2007 Michael Niedermayer
> +;*           (c) 2010 Konstantin Shishkov
> +;*
> +;* This file is part of FFmpeg.
> +;*
> +;* FFmpeg is free software; you can redistribute it and/or
> +;* modify it under the terms of the GNU Lesser General Public
> +;* License as published by the Free Software Foundation; either
> +;* version 2.1 of the License, or (at your option) any later version.
> +;*
> +;* FFmpeg is distributed in the hope that it will be useful,
> +;* but WITHOUT ANY WARRANTY; without even the implied warranty of
> +;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
> GNU
> +;* Lesser General Public License for more details.
> +;*
> +;* You should have received a copy of the GNU Lesser General Public
> +;* License along with FFmpeg; if not, write to the Free Software
> +;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
> +02110-1301 USA
> +;**************************************************************
> ********
> +********
> +
> +%include "libavutil/x86/x86util.asm"
> +
> +SECTION_RODATA
> +
> +pw_00ff: times 4 dw 255
> +pb_f8:   times 8 db 248
> +pb_e0:   times 8 db 224
> +pb_03:   times 8 db 3
> +pb_07:   times 8 db 7
> +
> +mask_1101: dw -1, -1,  0, -1
> +mask_0010: dw  0,  0, -1,  0
> +mask_0110: dw  0, -1, -1,  0
> +mask_1001: dw -1,  0,  0, -1
> +mask_0100: dw  0, -1,  0,  0
> +
> +SECTION .text
> +
> +;----------------------------------------------------------------------
> +-------
> +;
> +; YUV420/YUVA420 to RGB/BGR 15/16/24/32 ; R = Y + ((vrCoff * (v - 128))
> +>> 8) ; G = Y - ((ugCoff * (u - 128) + vgCoff * (v - 128)) >> 8) ; B =
> +Y + ((ubCoff * (u - 128)) >> 8) ;
> +;----------------------------------------------------------------------
> +-------
> +
> +%macro MOV_H2L 1
> +psrlq %1, 32
> +%endmacro
> +
> +%macro yuv2rgb_fn 3
> +
> +%if %3 == 32
> +    %ifidn %1, yuva
> +    %define parameters index, image, pu_index, pv_index, pointer_c_dither,
> py_2index, pa_2index
> +    %define GPR_num 7
> +    %endif
> +%else
> +    %define parameters index, image, pu_index, pv_index, pointer_c_dither,
> py_2index
> +    %define GPR_num 6
> +%endif
> +
> +%define m_green m2
> +%define m_alpha m3
> +%define m_y m6
> +%define m_u m0
> +%define m_v m1
> +%ifidn %2, rgb
> +%define m_red m1
> +%define m_blue m0
> +%else
> +%define m_red m0
> +%define m_blue m1
> +%endif
> +
> +%define time_num 1
> +%define reg_num 8
> +%define y_offset [pointer_c_ditherq + 8  * 8] %define u_offset
> +[pointer_c_ditherq + 9  * 8] %define v_offset [pointer_c_ditherq + 10 *
> +8] %define ug_coff  [pointer_c_ditherq + 7  * 8] %define vg_coff
> +[pointer_c_ditherq + 6  * 8]
> +%define y_coff   [pointer_c_ditherq + 3  * 8]
> +%define ub_coff  [pointer_c_ditherq + 5  * 8] %define vr_coff
> +[pointer_c_ditherq + 4  * 8]
> +
> +cglobal %1_420_%2%3, GPR_num, GPR_num, reg_num, parameters
> +
> +%if ARCH_X86_64
> +    movsxd indexq, indexd
> +%endif
> +    mova m_y, [py_2indexq + 2 * indexq]
> +    movh m_u, [pu_indexq  +     indexq]
> +    movh m_v, [pv_indexq  +     indexq]
> +.loop0:
> +    pxor m4, m4
> +    mova m7, m6
> +    punpcklbw m0, m4
> +    punpcklbw m1, m4
> +    mova m2, [pw_00ff]
> +    pand m6, m2
> +    psrlw m7, 8
> +    psllw m0, 3
> +    psllw m1, 3
> +    psllw m6, 3
> +    psllw m7, 3
> +    psubsw m0, u_offset ; U = U - 128
> +    psubsw m1, v_offset ; V = V - 128
> +    psubw m6, y_offset
> +    psubw m7, y_offset
> +    mova m2, m0
> +    mova m3, m1
> +    pmulhw m2, ug_coff
> +    pmulhw m3, vg_coff
> +    pmulhw m6, y_coff
> +    pmulhw m7, y_coff
> +    pmulhw m0, ub_coff
> +    pmulhw m1, vr_coff
> +    paddsw m2, m3
> +    mova m3, m7
> +    mova m5, m7
> +    paddsw m3, m0 ; B1 B3 B5 B7 ...
> +    paddsw m5, m1 ; R1 R3 R5 R7 ...
> +    paddsw m7, m2 ; G1 G3 G4 G7 ...
> +    paddsw m0, m6 ; B0 B2 B4 B6 ...
> +    paddsw m1, m6 ; R0 R2 R4 R6 ...
> +    paddsw m2, m6 ; G0 G2 G4 G6 ...
> +
> +%if %3 == 24 ; PACK RGB24
> +%define depth 3
> +    packuswb m0, m3 ; R0 R2 R4 R6 ... R1 R3 R5 R7 ...
> +    packuswb m1, m5 ; B0 B2 B4 B6 ... B1 B3 B5 B7 ...
> +    packuswb m2, m7 ; G0 G2 G4 G6 ... G1 G3 G5 G7 ...
> +    mova m3, m_red
> +    mova m6, m_blue
> +    MOV_H2L m_red
> +    punpcklbw m3, m2     ; R0 G0 R2 G2 R4 G4 R6 G6 R8 G8 ...
> +    punpcklbw m6, m_red  ; B0 R1 B2 R3 B4 R5 B6 R7 B8 R9 ...
> +    mova m5, m3
> +    punpckhbw m2, m_blue ; G1 B1 G3 B3 G5 B5 G7 B7 G9 B9 ...
> +    punpcklwd m3 ,m6     ; R0 G0 B0 R1 R2 G2 B2 R3
> +    punpckhwd m5, m6     ; R4 G4 B4 R5 R6 G6 B6 R7
> +%if cpuflag(mmxext)
> +    pshufw m1, m2, 0xc6
> +    pshufw m6, m3, 0x84
> +    pshufw m7, m5, 0x38
> +    pand m6, [mask_1101] ; R0 G0 B0 R1 -- -- R2 G2
> +    movq m0, m1
> +    pand m7, [mask_0110] ; -- -- R6 G6 B6 R7 -- --
> +    movq m2, m1
> +    pand m1, [mask_0100] ; -- -- G3 B3 -- -- -- --
> +    psrlq m3, 48         ; B2 R3 -- -- -- -- -- --
> +    pand m0, [mask_0010] ; -- -- -- -- G1 B1 -- --
> +    psllq m5, 32         ; -- -- -- -- R4 G4 B4 R5
> +    pand m2, [mask_1001] ; G5 B5 -- -- -- -- G7 B7
> +    por m1, m3
> +    por m0, m6
> +    por m1, m5
> +    por m2, m7
> +    movntq [imageq], m0
> +    movntq [imageq + 8], m1
> +    movntq [imageq + 16], m2
> +%else ; cpuflag(mmx)
> +    movd [imageq], m3      ; R0 G0 R2 G2
> +    movd [imageq + 4], m2  ; G1 B1
> +    psrlq m3, 32
> +    psrlq m2, 16
> +    movd [imageq + 6], m3  ; R2 G2 B2 R3
> +    movd [imageq + 10], m2 ; G3 B3
> +    psrlq m2, 16
> +    movd [imageq + 12], m5 ; R4 G4 B4 R5
> +    movd [imageq + 16], m2 ; G5 B5
> +    psrlq m5, 32
> +    movd [imageq + 20], m2 ; -- -- G7 B7
> +    movd [imageq + 18], m5 ; R6 G6 B6 R7 %endif %else ; PACK
> +RGB15/16/32
> +    packuswb m0, m1
> +    packuswb m3, m5
> +    packuswb m2, m2
> +    mova m1, m0
> +    packuswb m7, m7
> +    punpcklbw m0, m3 ; B0 B1 B2 B3 ... B7
> +    punpckhbw m1, m3 ; R0 R1 R2 R3 ... R7
> +    punpcklbw m2, m7 ; G0 G1 G2 G3 ... G7 %if %3 == 32 ; PACK RGB32
> +%define depth 4 %ifidn %1, yuv
> +    pcmpeqd m3, m3 ; Set alpha empty
> +%else
> +    mova m3, [pa_2indexq + 2 * indexq] ; Load alpha %endif
> +    mova m5, m_blue
> +    mova m6, m_red
> +    punpckhbw m5, m_green
> +    punpcklbw m_blue, m_green
> +    punpckhbw m6, m_alpha
> +    punpcklbw m_red, m_alpha
> +    mova m_green, m_blue
> +    mova m_alpha, m5
> +    punpcklwd m_blue, m_red
> +    punpckhwd m_green, m_red
> +    punpcklwd m5, m6
> +    punpckhwd m_alpha, m6
> +    mova [imageq + 0], m_blue
> +    mova [imageq + 8 * time_num], m_green
> +    mova [imageq + 16 * time_num], m5
> +    mova [imageq + 24 * time_num], m_alpha %else ; PACK RGB15/16
> +%define depth 2 %define blue_dither  [pointer_c_ditherq + 2 * 8]
> +%define green_dither [pointer_c_ditherq + 1 * 8]
> +%define red_dither   [pointer_c_ditherq + 0 * 8]
> +%if %3 == 15
> +%define gmask pb_03
> +%define isRGB15 1
> +%else
> +%define gmask pb_07
> +%define isRGB15 0
> +%endif
> +    paddusb m0, blue_dither
> +    paddusb m2, green_dither
> +    paddusb m1, red_dither
> +    pand m0, [pb_f8]
> +    pand m1, [pb_f8]
> +    mova m3, m2
> +    psllw m2, 3 - isRGB15
> +    psrlw m3, 5 + isRGB15
> +    psrlw m0, 3
> +    psrlw m1, isRGB15
> +    pand m2, [pb_e0]
> +    pand m3, [gmask]
> +    por m0, m2
> +    por m1, m3
> +    mova m2, m0
> +    punpcklbw m0, m1
> +    punpckhbw m2, m1
> +    mova [imageq], m0
> +    mova [imageq + 8 * time_num], m2
> +%endif ; PACK RGB15/16
> +%endif ; PACK RGB15/16/32
> +
> +mova m_y, [py_2indexq + 2 * indexq + 8 * time_num]
> +movh m_v, [pv_indexq  +     indexq + 4 * time_num]
> +movh m_u, [pu_indexq  +     indexq + 4 * time_num]
> +add imageq, 8 * depth * time_num
> +add indexq, 4 * time_num
> +js .loop0
> +
> +REP_RET
> +
> +%endmacro
> +
> +INIT_MMX mmx
> +yuv2rgb_fn yuv,  rgb, 24
> +yuv2rgb_fn yuv,  bgr, 24
> +yuv2rgb_fn yuv,  rgb, 32
> +yuv2rgb_fn yuv,  bgr, 32
> +yuv2rgb_fn yuva, rgb, 32
> +yuv2rgb_fn yuva, bgr, 32
> +yuv2rgb_fn yuv,  rgb, 15
> +yuv2rgb_fn yuv,  rgb, 16
> +
> +INIT_MMX mmxext
> +yuv2rgb_fn yuv, rgb, 24
> +yuv2rgb_fn yuv, bgr, 24
> --
> 2.17.1
> 
> _______________________________________________
> ffmpeg-devel mailing list
> ffmpeg-devel@ffmpeg.org
> https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
> 
> To unsubscribe, visit link above, or email ffmpeg-devel-request@ffmpeg.org
> with subject "unsubscribe".
Ting Fu Jan. 14, 2020, 6:14 a.m. UTC | #2
> -----Original Message-----
> From: ffmpeg-devel <ffmpeg-devel-bounces@ffmpeg.org> On Behalf Of Fu,
> Ting
> Sent: Friday, January 10, 2020 01:58 AM
> To: FFmpeg development discussions and patches <ffmpeg-devel@ffmpeg.org>
> Subject: Re: [FFmpeg-devel] [PATCH V7 1/2] libswscale/x86/yuv2rgb: Change
> inline assembly into nasm code
> 
> 
> 
> > -----Original Message-----
> > From: ffmpeg-devel <ffmpeg-devel-bounces@ffmpeg.org> On Behalf Of Ting
> > Fu
> > Sent: Friday, January 10, 2020 01:38 AM
> > To: ffmpeg-devel@ffmpeg.org
> > Subject: [FFmpeg-devel] [PATCH V7 1/2] libswscale/x86/yuv2rgb: Change
> > inline assembly into nasm code
> >
> > Signed-off-by: Ting Fu <ting.fu@intel.com>
> > ---
> > V7:
> >     Fix compile issue when user configure with --disable-mmx.
> >     Fix issue when running ./ffmpeg with --cpuflags mmx/ssse3.
> >     Adjust the SIMD verify logic in libswscale/x86/yuv2rgb.c
> 
> To be more detail. I was use 'if clause' to judge the color format in
> libswscale/x86/yuv2rgb.c and then the '#if macro' to judge SIMD in
> libswscale/x86/yuv2rgb_template.c. Which cannot correctly respond to the
> command when use ./ffmpeg with --cpuflags, cause it does not get value of
> av_get_cpu_flags() any more. So, I abandoned the macro and judge both color
> format and SIMD in libswscale/x86/yuv2rgb.c.
> 
> Thank you,
> Ting Fu
> >
> >  libswscale/x86/Makefile           |   1 +
> >  libswscale/x86/swscale.c          |  16 +-
> >  libswscale/x86/yuv2rgb.c          |  66 ++---
> >  libswscale/x86/yuv2rgb_template.c | 467 ++++++------------------------
> >  libswscale/x86/yuv_2_rgb.asm      | 270 +++++++++++++++++
> >  5 files changed, 405 insertions(+), 415 deletions(-)  create mode
> > 100644 libswscale/x86/yuv_2_rgb.asm
> >
A kindle ping.
[...]
> > --
> > 2.17.1
> >
> > _______________________________________________
> > ffmpeg-devel mailing list
> > ffmpeg-devel@ffmpeg.org
> > https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
> >
> > To unsubscribe, visit link above, or email
> > ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".
> _______________________________________________
> ffmpeg-devel mailing list
> ffmpeg-devel@ffmpeg.org
> https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
> 
> To unsubscribe, visit link above, or email ffmpeg-devel-request@ffmpeg.org
> with subject "unsubscribe".
Ting Fu Jan. 14, 2020, 6:23 a.m. UTC | #3
> -----Original Message-----
> From: ffmpeg-devel <ffmpeg-devel-bounces@ffmpeg.org> On Behalf Of Fu,
> Ting
> Sent: Tuesday, January 14, 2020 02:15 PM
> To: FFmpeg development discussions and patches <ffmpeg-devel@ffmpeg.org>
> Subject: Re: [FFmpeg-devel] [PATCH V7 1/2] libswscale/x86/yuv2rgb: Change
> inline assembly into nasm code
> 
> 
> 
> > -----Original Message-----
> > From: ffmpeg-devel <ffmpeg-devel-bounces@ffmpeg.org> On Behalf Of Fu,
> > Ting
> > Sent: Friday, January 10, 2020 01:58 AM
> > To: FFmpeg development discussions and patches
> > <ffmpeg-devel@ffmpeg.org>
> > Subject: Re: [FFmpeg-devel] [PATCH V7 1/2] libswscale/x86/yuv2rgb:
> > Change inline assembly into nasm code
> >
> >
> >
> > > -----Original Message-----
> > > From: ffmpeg-devel <ffmpeg-devel-bounces@ffmpeg.org> On Behalf Of
> > > Ting Fu
> > > Sent: Friday, January 10, 2020 01:38 AM
> > > To: ffmpeg-devel@ffmpeg.org
> > > Subject: [FFmpeg-devel] [PATCH V7 1/2] libswscale/x86/yuv2rgb:
> > > Change inline assembly into nasm code
> > >
> > > Signed-off-by: Ting Fu <ting.fu@intel.com>
> > > ---
> > > V7:
> > >     Fix compile issue when user configure with --disable-mmx.
> > >     Fix issue when running ./ffmpeg with --cpuflags mmx/ssse3.
> > >     Adjust the SIMD verify logic in libswscale/x86/yuv2rgb.c
> >
> > To be more detail. I was use 'if clause' to judge the color format in
> > libswscale/x86/yuv2rgb.c and then the '#if macro' to judge SIMD in
> > libswscale/x86/yuv2rgb_template.c. Which cannot correctly respond to
> > the command when use ./ffmpeg with --cpuflags, cause it does not get
> > value of
> > av_get_cpu_flags() any more. So, I abandoned the macro and judge both
> > color format and SIMD in libswscale/x86/yuv2rgb.c.
> >
> > Thank you,
> > Ting Fu
> > >
> > >  libswscale/x86/Makefile           |   1 +
> > >  libswscale/x86/swscale.c          |  16 +-
> > >  libswscale/x86/yuv2rgb.c          |  66 ++---
> > >  libswscale/x86/yuv2rgb_template.c | 467 ++++++------------------------
> > >  libswscale/x86/yuv_2_rgb.asm      | 270 +++++++++++++++++
> > >  5 files changed, 405 insertions(+), 415 deletions(-)  create mode
> > > 100644 libswscale/x86/yuv_2_rgb.asm
> > >
> A kindle ping.

Sorry, I meant 'a kindly ping'.

Ting Fu
> [...]
> > > --
> > > 2.17.1
> > >
> > > _______________________________________________
> > > ffmpeg-devel mailing list
> > > ffmpeg-devel@ffmpeg.org
> > > https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
> > >
> > > To unsubscribe, visit link above, or email
> > > ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".
> > _______________________________________________
> > ffmpeg-devel mailing list
> > ffmpeg-devel@ffmpeg.org
> > https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
> >
> > To unsubscribe, visit link above, or email
> > ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".
> _______________________________________________
> ffmpeg-devel mailing list
> ffmpeg-devel@ffmpeg.org
> https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
> 
> To unsubscribe, visit link above, or email ffmpeg-devel-request@ffmpeg.org
> with subject "unsubscribe".
Michael Niedermayer Jan. 14, 2020, 9:55 p.m. UTC | #4
On Fri, Jan 10, 2020 at 01:38:15AM +0800, Ting Fu wrote:
> Signed-off-by: Ting Fu <ting.fu@intel.com>
> ---
> V7:
>     Fix compile issue when user configure with --disable-mmx.
>     Fix issue when running ./ffmpeg with --cpuflags mmx/ssse3.
>     Adjust the SIMD verify logic in libswscale/x86/yuv2rgb.c
> 
>  libswscale/x86/Makefile           |   1 +
>  libswscale/x86/swscale.c          |  16 +-
>  libswscale/x86/yuv2rgb.c          |  66 ++---
>  libswscale/x86/yuv2rgb_template.c | 467 ++++++------------------------
>  libswscale/x86/yuv_2_rgb.asm      | 270 +++++++++++++++++
>  5 files changed, 405 insertions(+), 415 deletions(-)
>  create mode 100644 libswscale/x86/yuv_2_rgb.asm

The commit message seems a bit terse.
I think it should say whether the sequence of instructions is unchanged
and whether it was benchmarked. If the speed is the same when the code is run,
the commit message should say that too.

the principle of this (inline -> nasm) is fine of course.


> 
> diff --git a/libswscale/x86/Makefile b/libswscale/x86/Makefile
> index f317d5dd9b..831d5359aa 100644
> --- a/libswscale/x86/Makefile
> +++ b/libswscale/x86/Makefile
> @@ -12,3 +12,4 @@ X86ASM-OBJS                     += x86/input.o                          \
>                                     x86/output.o                         \
>                                     x86/scale.o                          \
>                                     x86/rgb_2_rgb.o                      \
> +                                   x86/yuv_2_rgb.o                      \
> diff --git a/libswscale/x86/swscale.c b/libswscale/x86/swscale.c
> index 0eed4f18d5..e9d474a1e8 100644
> --- a/libswscale/x86/swscale.c
> +++ b/libswscale/x86/swscale.c
> @@ -29,6 +29,14 @@
>  #include "libavutil/cpu.h"
>  #include "libavutil/pixdesc.h"
>  
> +const DECLARE_ALIGNED(8, uint64_t, ff_dither4)[2] = {
> +    0x0103010301030103LL,
> +    0x0200020002000200LL,};
> +
> +const DECLARE_ALIGNED(8, uint64_t, ff_dither8)[2] = {
> +    0x0602060206020602LL,
> +    0x0004000400040004LL,};
> +
>  #if HAVE_INLINE_ASM
>  
>  #define DITHER1XBPP
> @@ -38,14 +46,6 @@ DECLARE_ASM_CONST(8, uint64_t, bFC)=       0xFCFCFCFCFCFCFCFCLL;
>  DECLARE_ASM_CONST(8, uint64_t, w10)=       0x0010001000100010LL;
>  DECLARE_ASM_CONST(8, uint64_t, w02)=       0x0002000200020002LL;
>  
> -const DECLARE_ALIGNED(8, uint64_t, ff_dither4)[2] = {
> -    0x0103010301030103LL,
> -    0x0200020002000200LL,};
> -
> -const DECLARE_ALIGNED(8, uint64_t, ff_dither8)[2] = {
> -    0x0602060206020602LL,
> -    0x0004000400040004LL,};
> -
>  DECLARE_ASM_CONST(8, uint64_t, b16Mask)=   0x001F001F001F001FLL;
>  DECLARE_ASM_CONST(8, uint64_t, g16Mask)=   0x07E007E007E007E0LL;
>  DECLARE_ASM_CONST(8, uint64_t, r16Mask)=   0xF800F800F800F800LL;
> diff --git a/libswscale/x86/yuv2rgb.c b/libswscale/x86/yuv2rgb.c
> index 5e2f77c20f..dd813d4deb 100644
> --- a/libswscale/x86/yuv2rgb.c
> +++ b/libswscale/x86/yuv2rgb.c
> @@ -37,7 +37,7 @@
>  #include "libavutil/x86/cpu.h"
>  #include "libavutil/cpu.h"
>  
> -#if HAVE_INLINE_ASM
> +#if HAVE_X86ASM
>  
>  #define DITHER1XBPP // only for MMX
>  
> @@ -50,32 +50,31 @@ DECLARE_ASM_CONST(8, uint64_t, pb_03) = 0x0303030303030303ULL;
>  DECLARE_ASM_CONST(8, uint64_t, pb_07) = 0x0707070707070707ULL;
>  
>  //MMX versions
> -#if HAVE_MMX_INLINE && HAVE_6REGS
> +#if HAVE_MMX
>  #undef RENAME
>  #undef COMPILE_TEMPLATE_MMXEXT
>  #define COMPILE_TEMPLATE_MMXEXT 0
>  #define RENAME(a) a ## _mmx
>  #include "yuv2rgb_template.c"
> -#endif /* HAVE_MMX_INLINE && HAVE_6REGS */
> +#endif /* HAVE_MMX */
>  
>  // MMXEXT versions
> -#if HAVE_MMXEXT_INLINE && HAVE_6REGS
> +#if HAVE_MMXEXT
>  #undef RENAME
>  #undef COMPILE_TEMPLATE_MMXEXT
>  #define COMPILE_TEMPLATE_MMXEXT 1
>  #define RENAME(a) a ## _mmxext
>  #include "yuv2rgb_template.c"
> -#endif /* HAVE_MMXEXT_INLINE && HAVE_6REGS */
> +#endif /* HAVE_MMXEXT */
>  
> -#endif /* HAVE_INLINE_ASM */
> +#endif /* HAVE_X86ASM */
>  
>  av_cold SwsFunc ff_yuv2rgb_init_x86(SwsContext *c)
>  {
> -#if HAVE_MMX_INLINE && HAVE_6REGS
> +#if HAVE_X86ASM
>      int cpu_flags = av_get_cpu_flags();
>  
> -#if HAVE_MMXEXT_INLINE
> -    if (INLINE_MMXEXT(cpu_flags)) {
> +    if (EXTERNAL_MMXEXT(cpu_flags)) {
>          switch (c->dstFormat) {
>          case AV_PIX_FMT_RGB24:
>              return yuv420_rgb24_mmxext;
> @@ -83,37 +82,36 @@ av_cold SwsFunc ff_yuv2rgb_init_x86(SwsContext *c)
>              return yuv420_bgr24_mmxext;
>          }
>      }
> -#endif
>  
> -    if (INLINE_MMX(cpu_flags)) {
> +    if (EXTERNAL_MMX(cpu_flags)) {
>          switch (c->dstFormat) {
> -            case AV_PIX_FMT_RGB32:
> -                if (c->srcFormat == AV_PIX_FMT_YUVA420P) {
> -#if HAVE_7REGS && CONFIG_SWSCALE_ALPHA
> -                    return yuva420_rgb32_mmx;
> +        case AV_PIX_FMT_RGB32:
> +            if (c->srcFormat == AV_PIX_FMT_YUVA420P) {
> +#if CONFIG_SWSCALE_ALPHA
> +                return yuva420_rgb32_mmx;
>  #endif
> -                    break;
> -                } else
> -                    return yuv420_rgb32_mmx;
> -            case AV_PIX_FMT_BGR32:
> -                if (c->srcFormat == AV_PIX_FMT_YUVA420P) {
> -#if HAVE_7REGS && CONFIG_SWSCALE_ALPHA
> -                    return yuva420_bgr32_mmx;
> +                break;
> +            } else
> +                return yuv420_rgb32_mmx;
> +        case AV_PIX_FMT_BGR32:
> +            if (c->srcFormat == AV_PIX_FMT_YUVA420P) {
> +#if CONFIG_SWSCALE_ALPHA
> +                return yuva420_bgr32_mmx;
>  #endif
> -                    break;
> -                } else
> -                    return yuv420_bgr32_mmx;
> -            case AV_PIX_FMT_RGB24:
> -                return yuv420_rgb24_mmx;
> -            case AV_PIX_FMT_BGR24:
> -                return yuv420_bgr24_mmx;
> -            case AV_PIX_FMT_RGB565:
> -                return yuv420_rgb16_mmx;
> -            case AV_PIX_FMT_RGB555:
> -                return yuv420_rgb15_mmx;
> +                break;
> +            } else
> +                return yuv420_bgr32_mmx;
> +        case AV_PIX_FMT_RGB24:
> +            return yuv420_rgb24_mmx;
> +        case AV_PIX_FMT_BGR24:
> +            return yuv420_bgr24_mmx;
> +        case AV_PIX_FMT_RGB565:
> +            return yuv420_rgb16_mmx;
> +        case AV_PIX_FMT_RGB555:
> +            return yuv420_rgb15_mmx;
>          }
>      }

This is a little messy to review;
it is mostly reindentation:
yuv2rgb.c          |   66 +++----
and with -w
yuv2rgb.c          |   26 +--



[...]
> -static inline int RENAME(yuv420_rgb16)(SwsContext *c, const uint8_t *src[],
> -                                       int srcStride[],
> -                                       int srcSliceY, int srcSliceH,
> -                                       uint8_t *dst[], int dstStride[])
> +static int RENAME(yuv420_rgb16)(SwsContext *c, const uint8_t *src[],
> +                                               int srcStride[],
> +                                               int srcSliceY, int srcSliceH,
> +                                               uint8_t *dst[], int dstStride[])

Maybe the removal of inline should be a separate patch.
Also, there is the question of why these wrapper functions exist.
They change from a "free thing in inline asm" to a
call overhead with C->NASM.


>  {
>      int y, h_size, vshift;
> -
>      YUV2RGB_LOOP(2)
>  
>  #ifdef DITHER1XBPP
> -        c->blueDither  = ff_dither8[y       & 1];
> -        c->greenDither = ff_dither4[y       & 1];
> -        c->redDither   = ff_dither8[(y + 1) & 1];
> +    c->blueDither  = ff_dither8[y       & 1];
> +    c->greenDither = ff_dither4[y       & 1];
> +    c->redDither   = ff_dither8[(y + 1) & 1];

These changes make the patch harder to review and the resulting
commit harder to read too (and I manually matched these up above;
it looks worse in the actual diff).


> -#endif
> -
> -        YUV2RGB_INITIAL_LOAD
> -        YUV2RGB
> -        RGB_PACK_INTERLEAVE
> -#ifdef DITHER1XBPP
> -        DITHER_RGB
>  #endif
> -        RGB_PACK16(pb_07, 0)
>  
> -    YUV2RGB_ENDLOOP(2)
> -    YUV2RGB_OPERANDS
> -    YUV2RGB_ENDFUNC

> +    RENAME(ff_yuv_420_rgb16)(index, image, pu - index, pv - index, &(c->redDither), py - 2 * index);
> +    }
> +    return srcSliceH;

This doesn't look correctly indented.


thanks



[...]
Ting Fu Jan. 15, 2020, 3:27 a.m. UTC | #5
> -----Original Message-----
> From: ffmpeg-devel <ffmpeg-devel-bounces@ffmpeg.org> On Behalf Of
> Michael Niedermayer
> Sent: Wednesday, January 15, 2020 05:55 AM
> To: FFmpeg development discussions and patches <ffmpeg-devel@ffmpeg.org>
> Subject: Re: [FFmpeg-devel] [PATCH V7 1/2] libswscale/x86/yuv2rgb: Change
> inline assembly into nasm code
> 
> On Fri, Jan 10, 2020 at 01:38:15AM +0800, Ting Fu wrote:
> > Signed-off-by: Ting Fu <ting.fu@intel.com>
> > ---
> > V7:
> >     Fix compile issue when user configure with --disable-mmx.
> >     Fix issue when running ./ffmpeg with --cpuflags mmx/ssse3.
> >     Adjust the SIMD verify logic in libswscale/x86/yuv2rgb.c
> >
> >  libswscale/x86/Makefile           |   1 +
> >  libswscale/x86/swscale.c          |  16 +-
> >  libswscale/x86/yuv2rgb.c          |  66 ++---
> >  libswscale/x86/yuv2rgb_template.c | 467 ++++++------------------------
> >  libswscale/x86/yuv_2_rgb.asm      | 270 +++++++++++++++++
> >  5 files changed, 405 insertions(+), 415 deletions(-)  create mode
> > 100644 libswscale/x86/yuv_2_rgb.asm
> 
> The commit message seems a bit terse
> I think it should say if the sequence of instructions is unchanged and if it was
> benchmaked. If its the same speed, when the code is run the commit message
> should say that too
> 
> the principle of this (inline -> nasm) is fine of course.
> 
Hi Michael,

Got it, I will add more information in the next patch version.

> 
> >
[...]
> > -                    break;
> > -                } else
> > -                    return yuv420_bgr32_mmx;
> > -            case AV_PIX_FMT_RGB24:
> > -                return yuv420_rgb24_mmx;
> > -            case AV_PIX_FMT_BGR24:
> > -                return yuv420_bgr24_mmx;
> > -            case AV_PIX_FMT_RGB565:
> > -                return yuv420_rgb16_mmx;
> > -            case AV_PIX_FMT_RGB555:
> > -                return yuv420_rgb15_mmx;
> > +                break;
> > +            } else
> > +                return yuv420_bgr32_mmx;
> > +        case AV_PIX_FMT_RGB24:
> > +            return yuv420_rgb24_mmx;
> > +        case AV_PIX_FMT_BGR24:
> > +            return yuv420_bgr24_mmx;
> > +        case AV_PIX_FMT_RGB565:
> > +            return yuv420_rgb16_mmx;
> > +        case AV_PIX_FMT_RGB555:
> > +            return yuv420_rgb15_mmx;
> >          }
> >      }
> 
> this is a little messy to review
> it is mostly reindention
> yuv2rgb.c          |   66 +++----
> and with -w
> yuv2rgb.c          |   26 +--
> 

All reindentation will be removed.

> 
> 
> [...]
> > -static inline int RENAME(yuv420_rgb16)(SwsContext *c, const uint8_t *src[],
> > -                                       int srcStride[],
> > -                                       int srcSliceY, int srcSliceH,
> > -                                       uint8_t *dst[], int dstStride[])
> > +static int RENAME(yuv420_rgb16)(SwsContext *c, const uint8_t *src[],
> > +                                               int srcStride[],
> > +                                               int srcSliceY, int srcSliceH,
> > +                                               uint8_t *dst[], int
> > +dstStride[])
> 
> maybe the removial of inline should be a seperate patch also there is the
> question why these wraper functions exist These do change from a a "free thing
> in inline asm" to a call overhead with C->NASM

I will try to call nasm directly by removing the wrapper function.

> 
> 
> >  {
> >      int y, h_size, vshift;
> > -
> >      YUV2RGB_LOOP(2)
> >
> >  #ifdef DITHER1XBPP
> > -        c->blueDither  = ff_dither8[y       & 1];
> > -        c->greenDither = ff_dither4[y       & 1];
> > -        c->redDither   = ff_dither8[(y + 1) & 1];
> > +    c->blueDither  = ff_dither8[y       & 1];
> > +    c->greenDither = ff_dither4[y       & 1];
> > +    c->redDither   = ff_dither8[(y + 1) & 1];
> 
> these changes make the patch harder to review and the resulting commit harder
> to read too (and i manually matched these up above it lookes worse in the
> actual diff

Reindentation will be removed.

> 
> 
> > -#endif
> > -
> > -        YUV2RGB_INITIAL_LOAD
> > -        YUV2RGB
> > -        RGB_PACK_INTERLEAVE
> > -#ifdef DITHER1XBPP
> > -        DITHER_RGB
> >  #endif
> > -        RGB_PACK16(pb_07, 0)
> >
> > -    YUV2RGB_ENDLOOP(2)
> > -    YUV2RGB_OPERANDS
> > -    YUV2RGB_ENDFUNC
> 
> > +    RENAME(ff_yuv_420_rgb16)(index, image, pu - index, pv - index, &(c-
> >redDither), py - 2 * index);
> > +    }
> > +    return srcSliceH;
> 
> This doesnt look correctly indented

Will be changed.

Thank you,
Ting Fu
> 
> 
> thanks
> 
> 
> 
> [...]
> --
> Michael     GnuPG fingerprint: 9FF2128B147EF6730BADF133611EC787040B0FAB
> 
> The smallest minority on earth is the individual. Those who deny individual rights
> cannot claim to be defenders of minorities. - Ayn Rand
Ting Fu Jan. 16, 2020, 7:27 a.m. UTC | #6
> -----Original Message-----
> From: ffmpeg-devel <ffmpeg-devel-bounces@ffmpeg.org> On Behalf Of
> Michael Niedermayer
> Sent: Wednesday, January 15, 2020 05:55 AM
> To: FFmpeg development discussions and patches <ffmpeg-devel@ffmpeg.org>
> Subject: Re: [FFmpeg-devel] [PATCH V7 1/2] libswscale/x86/yuv2rgb: Change
> inline assembly into nasm code
> 
> On Fri, Jan 10, 2020 at 01:38:15AM +0800, Ting Fu wrote:
> > Signed-off-by: Ting Fu <ting.fu@intel.com>
> > ---
> > V7:
> >     Fix compile issue when user configure with --disable-mmx.
> >     Fix issue when running ./ffmpeg with --cpuflags mmx/ssse3.
> >     Adjust the SIMD verify logic in libswscale/x86/yuv2rgb.c
> >
> >  libswscale/x86/Makefile           |   1 +
> >  libswscale/x86/swscale.c          |  16 +-
> >  libswscale/x86/yuv2rgb.c          |  66 ++---
> >  libswscale/x86/yuv2rgb_template.c | 467 ++++++------------------------
> >  libswscale/x86/yuv_2_rgb.asm      | 270 +++++++++++++++++
> >  5 files changed, 405 insertions(+), 415 deletions(-)  create mode
> > 100644 libswscale/x86/yuv_2_rgb.asm
> 
> The commit message seems a bit terse
> I think it should say if the sequence of instructions is unchanged and if it was
> benchmaked. If its the same speed, when the code is run the commit message
> should say that too
> 
> the principle of this (inline -> nasm) is fine of course.
> 
> 
[...]
> > -static inline int RENAME(yuv420_rgb16)(SwsContext *c, const uint8_t *src[],
> > -                                       int srcStride[],
> > -                                       int srcSliceY, int srcSliceH,
> > -                                       uint8_t *dst[], int dstStride[])
> > +static int RENAME(yuv420_rgb16)(SwsContext *c, const uint8_t *src[],
> > +                                               int srcStride[],
> > +                                               int srcSliceY, int srcSliceH,
> > +                                               uint8_t *dst[], int
> > +dstStride[])
> 
> maybe the removial of inline should be a seperate patch also there is the
> question why these wraper functions exist These do change from a a "free thing
> in inline asm" to a call overhead with C->NASM
> 
Hi Michael,

The wrapper functions initialize some variables and contain one 'for' loop. The variable initialization needs access to 'c->dstW'; furthermore, the macro SWS_MAX_FILTER_SIZE is needed, which means extra work and much more NASM code.
If you still prefer to do everything in assembly, I can change from 'C->NASM' to 'call the NASM function directly' in a further patch (to keep the current patch easier to review).
Or, in my opinion, the cost of the C->NASM call can be ignored, and the initialization work looks clearer in C, so we could just leave it as it is now.
What do you think?

Thank you,
Ting Fu

[...]
Michael Niedermayer Jan. 16, 2020, 9:35 p.m. UTC | #7
On Thu, Jan 16, 2020 at 07:27:05AM +0000, Fu, Ting wrote:
> 
> 
> > -----Original Message-----
> > From: ffmpeg-devel <ffmpeg-devel-bounces@ffmpeg.org> On Behalf Of
> > Michael Niedermayer
> > Sent: Wednesday, January 15, 2020 05:55 AM
> > To: FFmpeg development discussions and patches <ffmpeg-devel@ffmpeg.org>
> > Subject: Re: [FFmpeg-devel] [PATCH V7 1/2] libswscale/x86/yuv2rgb: Change
> > inline assembly into nasm code
> > 
> > On Fri, Jan 10, 2020 at 01:38:15AM +0800, Ting Fu wrote:
> > > Signed-off-by: Ting Fu <ting.fu@intel.com>
> > > ---
> > > V7:
> > >     Fix compile issue when user configure with --disable-mmx.
> > >     Fix issue when running ./ffmpeg with --cpuflags mmx/ssse3.
> > >     Adjust the SIMD verify logic in libswscale/x86/yuv2rgb.c
> > >
> > >  libswscale/x86/Makefile           |   1 +
> > >  libswscale/x86/swscale.c          |  16 +-
> > >  libswscale/x86/yuv2rgb.c          |  66 ++---
> > >  libswscale/x86/yuv2rgb_template.c | 467 ++++++------------------------
> > >  libswscale/x86/yuv_2_rgb.asm      | 270 +++++++++++++++++
> > >  5 files changed, 405 insertions(+), 415 deletions(-)  create mode
> > > 100644 libswscale/x86/yuv_2_rgb.asm
> > 
> > The commit message seems a bit terse
> > I think it should say if the sequence of instructions is unchanged and if it was
> > benchmaked. If its the same speed, when the code is run the commit message
> > should say that too
> > 
> > the principle of this (inline -> nasm) is fine of course.
> > 
> > 
> [...]
> > > -static inline int RENAME(yuv420_rgb16)(SwsContext *c, const uint8_t *src[],
> > > -                                       int srcStride[],
> > > -                                       int srcSliceY, int srcSliceH,
> > > -                                       uint8_t *dst[], int dstStride[])
> > > +static int RENAME(yuv420_rgb16)(SwsContext *c, const uint8_t *src[],
> > > +                                               int srcStride[],
> > > +                                               int srcSliceY, int srcSliceH,
> > > +                                               uint8_t *dst[], int
> > > +dstStride[])
> > 
> > maybe the removial of inline should be a seperate patch also there is the
> > question why these wraper functions exist These do change from a a "free thing
> > in inline asm" to a call overhead with C->NASM
> > 
> Hi Michael,
> 
> The wrapper functions initiate some variables and contain one 'for cycle'. The variable initiation needs to access to the 'c->dstW', furthermore macro SWS_MAX_ FILTER_SIZE is needed. Which means extra work and much more NASM code.
> If you still prefer to do all the things in assembly, I can change from 'C->NASM' to 'call NASM function directly' in another further patch( for current patch easier to review).
> Or in my opinion, the cost in C->NASM can be ignored, and the initiation work looks clearer in C, just let it be what it is now.
> What do you think?

It probably makes no sense if it's hard to convert that code.

thx

[...]
Ting Fu Jan. 19, 2020, 2:49 a.m. UTC | #8
> -----Original Message-----
> From: ffmpeg-devel <ffmpeg-devel-bounces@ffmpeg.org> On Behalf Of
> Michael Niedermayer
> Sent: Friday, January 17, 2020 05:36 AM
> To: FFmpeg development discussions and patches <ffmpeg-devel@ffmpeg.org>
> Subject: Re: [FFmpeg-devel] [PATCH V7 1/2] libswscale/x86/yuv2rgb: Change
> inline assembly into nasm code
> 
> On Thu, Jan 16, 2020 at 07:27:05AM +0000, Fu, Ting wrote:
> >
> >
> > > -----Original Message-----
> > > From: ffmpeg-devel <ffmpeg-devel-bounces@ffmpeg.org> On Behalf Of
> > > Michael Niedermayer
> > > Sent: Wednesday, January 15, 2020 05:55 AM
> > > To: FFmpeg development discussions and patches
> > > <ffmpeg-devel@ffmpeg.org>
> > > Subject: Re: [FFmpeg-devel] [PATCH V7 1/2] libswscale/x86/yuv2rgb:
> > > Change inline assembly into nasm code
> > >
> > > On Fri, Jan 10, 2020 at 01:38:15AM +0800, Ting Fu wrote:
> > > > Signed-off-by: Ting Fu <ting.fu@intel.com>
> > > > ---
> > > > V7:
> > > >     Fix compile issue when user configure with --disable-mmx.
> > > >     Fix issue when running ./ffmpeg with --cpuflags mmx/ssse3.
> > > >     Adjust the SIMD verify logic in libswscale/x86/yuv2rgb.c
> > > >
> > > >  libswscale/x86/Makefile           |   1 +
> > > >  libswscale/x86/swscale.c          |  16 +-
> > > >  libswscale/x86/yuv2rgb.c          |  66 ++---
> > > >  libswscale/x86/yuv2rgb_template.c | 467 ++++++------------------------
> > > >  libswscale/x86/yuv_2_rgb.asm      | 270 +++++++++++++++++
> > > >  5 files changed, 405 insertions(+), 415 deletions(-)  create mode
> > > > 100644 libswscale/x86/yuv_2_rgb.asm
> > >
> > > The commit message seems a bit terse I think it should say if the
> > > sequence of instructions is unchanged and if it was benchmaked. If
> > > its the same speed, when the code is run the commit message should
> > > say that too
> > >
> > > the principle of this (inline -> nasm) is fine of course.
> > >
> > >
> > [...]
> > > > -static inline int RENAME(yuv420_rgb16)(SwsContext *c, const uint8_t
> *src[],
> > > > -                                       int srcStride[],
> > > > -                                       int srcSliceY, int srcSliceH,
> > > > -                                       uint8_t *dst[], int dstStride[])
> > > > +static int RENAME(yuv420_rgb16)(SwsContext *c, const uint8_t *src[],
> > > > +                                               int srcStride[],
> > > > +                                               int srcSliceY, int srcSliceH,
> > > > +                                               uint8_t *dst[],
> > > > +int
> > > > +dstStride[])
> > >
> > > maybe the removial of inline should be a seperate patch also there
> > > is the question why these wraper functions exist These do change
> > > from a a "free thing in inline asm" to a call overhead with C->NASM
> > >
> > Hi Michael,
> >
> > The wrapper functions initiate some variables and contain one 'for cycle'. The
> variable initiation needs to access to the 'c->dstW', furthermore macro
> SWS_MAX_ FILTER_SIZE is needed. Which means extra work and much more
> NASM code.
> > If you still prefer to do all the things in assembly, I can change from 'C->NASM'
> to 'call NASM function directly' in another further patch( for current patch easier
> to review).
> > Or in my opinion, the cost in C->NASM can be ignored, and the initiation work
> looks clearer in C, just let it be what it is now.
> > What do you think?
> 
> it probably makes no sense if its hard to convert that code

Hi Michael,

Do you mean I still need to convert that code? Did I understand you correctly?
The NASM function will receive only the address of SwsContext c (in order to stay parameter-compatible with the yuv2rgb_c function), not the address of c->redDither or c->dstW, so I have no way to get the value of c->dstW by using an address offset.
Do you have any suggestion for solving that problem?

Thank you,
Ting Fu
> 
> thx
> 
> [...]
> --
> Michael     GnuPG fingerprint: 9FF2128B147EF6730BADF133611EC787040B0FAB
> 
> If you fake or manipulate statistics in a paper in physics you will never get a job
> again.
> If you fake or manipulate statistics in a paper in medicin you will get a job for life
> at the pharma industry.
Michael Niedermayer Jan. 19, 2020, 1:10 p.m. UTC | #9
On Sun, Jan 19, 2020 at 02:49:21AM +0000, Fu, Ting wrote:
> 
> 
> > -----Original Message-----
> > From: ffmpeg-devel <ffmpeg-devel-bounces@ffmpeg.org> On Behalf Of
> > Michael Niedermayer
> > Sent: Friday, January 17, 2020 05:36 AM
> > To: FFmpeg development discussions and patches <ffmpeg-devel@ffmpeg.org>
> > Subject: Re: [FFmpeg-devel] [PATCH V7 1/2] libswscale/x86/yuv2rgb: Change
> > inline assembly into nasm code
> > 
> > On Thu, Jan 16, 2020 at 07:27:05AM +0000, Fu, Ting wrote:
> > >
> > >
> > > > -----Original Message-----
> > > > From: ffmpeg-devel <ffmpeg-devel-bounces@ffmpeg.org> On Behalf Of
> > > > Michael Niedermayer
> > > > Sent: Wednesday, January 15, 2020 05:55 AM
> > > > To: FFmpeg development discussions and patches
> > > > <ffmpeg-devel@ffmpeg.org>
> > > > Subject: Re: [FFmpeg-devel] [PATCH V7 1/2] libswscale/x86/yuv2rgb:
> > > > Change inline assembly into nasm code
> > > >
> > > > On Fri, Jan 10, 2020 at 01:38:15AM +0800, Ting Fu wrote:
> > > > > Signed-off-by: Ting Fu <ting.fu@intel.com>
> > > > > ---
> > > > > V7:
> > > > >     Fix compile issue when user configure with --disable-mmx.
> > > > >     Fix issue when running ./ffmpeg with --cpuflags mmx/ssse3.
> > > > >     Adjust the SIMD verify logic in libswscale/x86/yuv2rgb.c
> > > > >
> > > > >  libswscale/x86/Makefile           |   1 +
> > > > >  libswscale/x86/swscale.c          |  16 +-
> > > > >  libswscale/x86/yuv2rgb.c          |  66 ++---
> > > > >  libswscale/x86/yuv2rgb_template.c | 467 ++++++------------------------
> > > > >  libswscale/x86/yuv_2_rgb.asm      | 270 +++++++++++++++++
> > > > >  5 files changed, 405 insertions(+), 415 deletions(-)  create mode
> > > > > 100644 libswscale/x86/yuv_2_rgb.asm
> > > >
> > > > The commit message seems a bit terse I think it should say if the
> > > > sequence of instructions is unchanged and if it was benchmaked. If
> > > > its the same speed, when the code is run the commit message should
> > > > say that too
> > > >
> > > > the principle of this (inline -> nasm) is fine of course.
> > > >
> > > >
> > > [...]
> > > > > -static inline int RENAME(yuv420_rgb16)(SwsContext *c, const uint8_t
> > *src[],
> > > > > -                                       int srcStride[],
> > > > > -                                       int srcSliceY, int srcSliceH,
> > > > > -                                       uint8_t *dst[], int dstStride[])
> > > > > +static int RENAME(yuv420_rgb16)(SwsContext *c, const uint8_t *src[],
> > > > > +                                               int srcStride[],
> > > > > +                                               int srcSliceY, int srcSliceH,
> > > > > +                                               uint8_t *dst[],
> > > > > +int
> > > > > +dstStride[])
> > > >
> > > > maybe the removial of inline should be a seperate patch also there
> > > > is the question why these wraper functions exist These do change
> > > > from a a "free thing in inline asm" to a call overhead with C->NASM
> > > >
> > > Hi Michael,
> > >
> > > The wrapper functions initiate some variables and contain one 'for cycle'. The
> > variable initiation needs to access to the 'c->dstW', furthermore macro
> > SWS_MAX_ FILTER_SIZE is needed. Which means extra work and much more
> > NASM code.
> > > If you still prefer to do all the things in assembly, I can change from 'C->NASM'
> > to 'call NASM function directly' in another further patch( for current patch easier
> > to review).
> > > Or in my opinion, the cost in C->NASM can be ignored, and the initiation work
> > looks clearer in C, just let it be what it is now.
> > > What do you think?
> > 
> > it probably makes no sense if its hard to convert that code
> 
> Hi Michael,
> 
> You mean I still need to convert that code, did I get you right?

I don't think it's needed if it's complex.


> Since NASM function will get only the address of SwsConext c ( in order to be compatible with yuv2rgb_c function in parameters), not the address of c->redDither nor the c->dstW. I have no way to get the value of c->dstW by using address offset. 
> Do you have any suggestion for solving that problem? 

Maybe the offset to redDither could be the 2nd field of the struct, or the
whole redDither block could be moved up.

But only do this if the resulting asm code is reasonable.
The goal is to eliminate some ugly wrapper functions which call the asm;
if the code needed to avoid them is uglier than the wrappers, then it is not a good idea.

thx

[...]
Ronald S. Bultje Jan. 19, 2020, 2:55 p.m. UTC | #10
Hi,

On Sat, Jan 18, 2020 at 9:49 PM Fu, Ting <ting.fu@intel.com> wrote:

> Since NASM function will get only the address of SwsConext c ( in order to
> be compatible with yuv2rgb_c function in parameters), not the address of
> c->redDither nor the c->dstW. I have no way to get the value of c->dstW by
> using address offset.


Nasm and related variants have "struc" (like "struct" in C) for this.

See for example this code:
https://code.videolan.org/videolan/dav1d/blob/master/src/x86/film_grain.asm#L64

Hope this helps,
Ronald
Ting Fu Jan. 20, 2020, 2:57 a.m. UTC | #11
> -----Original Message-----
> From: ffmpeg-devel <ffmpeg-devel-bounces@ffmpeg.org> On Behalf Of
> Michael Niedermayer
> Sent: Sunday, January 19, 2020 09:11 PM
> To: FFmpeg development discussions and patches <ffmpeg-devel@ffmpeg.org>
> Subject: Re: [FFmpeg-devel] [PATCH V7 1/2] libswscale/x86/yuv2rgb: Change
> inline assembly into nasm code
> 
> On Sun, Jan 19, 2020 at 02:49:21AM +0000, Fu, Ting wrote:
> >
> >
> > > -----Original Message-----
> > > From: ffmpeg-devel <ffmpeg-devel-bounces@ffmpeg.org> On Behalf Of
> > > Michael Niedermayer
> > > Sent: Friday, January 17, 2020 05:36 AM
> > > To: FFmpeg development discussions and patches
> > > <ffmpeg-devel@ffmpeg.org>
> > > Subject: Re: [FFmpeg-devel] [PATCH V7 1/2] libswscale/x86/yuv2rgb:
> > > Change inline assembly into nasm code
> > >
> > > On Thu, Jan 16, 2020 at 07:27:05AM +0000, Fu, Ting wrote:
> > > >
> > > >
> > > > > -----Original Message-----
> > > > > From: ffmpeg-devel <ffmpeg-devel-bounces@ffmpeg.org> On Behalf
> > > > > Of Michael Niedermayer
> > > > > Sent: Wednesday, January 15, 2020 05:55 AM
> > > > > To: FFmpeg development discussions and patches
> > > > > <ffmpeg-devel@ffmpeg.org>
> > > > > Subject: Re: [FFmpeg-devel] [PATCH V7 1/2] libswscale/x86/yuv2rgb:
> > > > > Change inline assembly into nasm code
> > > > >
> > > > > On Fri, Jan 10, 2020 at 01:38:15AM +0800, Ting Fu wrote:
> > > > > > Signed-off-by: Ting Fu <ting.fu@intel.com>
> > > > > > ---
> > > > > > V7:
> > > > > >     Fix compile issue when user configure with --disable-mmx.
> > > > > >     Fix issue when running ./ffmpeg with --cpuflags mmx/ssse3.
> > > > > >     Adjust the SIMD verify logic in libswscale/x86/yuv2rgb.c
> > > > > >
> > > > > >  libswscale/x86/Makefile           |   1 +
> > > > > >  libswscale/x86/swscale.c          |  16 +-
> > > > > >  libswscale/x86/yuv2rgb.c          |  66 ++---
> > > > > >  libswscale/x86/yuv2rgb_template.c | 467 ++++++------------------------
> > > > > >  libswscale/x86/yuv_2_rgb.asm      | 270 +++++++++++++++++
> > > > > >  5 files changed, 405 insertions(+), 415 deletions(-)  create
> > > > > > mode
> > > > > > 100644 libswscale/x86/yuv_2_rgb.asm
> > > > >
> > > > > The commit message seems a bit terse I think it should say if
> > > > > the sequence of instructions is unchanged and if it was
> > > > > benchmaked. If its the same speed, when the code is run the
> > > > > commit message should say that too
> > > > >
> > > > > the principle of this (inline -> nasm) is fine of course.
> > > > >
> > > > >
> > > > [...]
> > > > > > -static inline int RENAME(yuv420_rgb16)(SwsContext *c, const
> > > > > > uint8_t
> > > *src[],
> > > > > > -                                       int srcStride[],
> > > > > > -                                       int srcSliceY, int srcSliceH,
> > > > > > -                                       uint8_t *dst[], int dstStride[])
> > > > > > +static int RENAME(yuv420_rgb16)(SwsContext *c, const uint8_t *src[],
> > > > > > +                                               int srcStride[],
> > > > > > +                                               int srcSliceY, int srcSliceH,
> > > > > > +                                               uint8_t
> > > > > > +*dst[], int
> > > > > > +dstStride[])
> > > > >
> > > > > maybe the removial of inline should be a seperate patch also
> > > > > there is the question why these wraper functions exist These do
> > > > > change from a a "free thing in inline asm" to a call overhead
> > > > > with C->NASM
> > > > >
> > > > Hi Michael,
> > > >
> > > > The wrapper functions initiate some variables and contain one 'for
> > > > cycle'. The
> > > variable initiation needs to access to the 'c->dstW', furthermore
> > > macro SWS_MAX_ FILTER_SIZE is needed. Which means extra work and
> > > much more NASM code.
> > > > If you still prefer to do all the things in assembly, I can change from 'C-
> >NASM'
> > > to 'call NASM function directly' in another further patch( for
> > > current patch easier to review).
> > > > Or in my opinion, the cost in C->NASM can be ignored, and the
> > > > initiation work
> > > looks clearer in C, just let it be what it is now.
> > > > What do you think?
> > >
> > > it probably makes no sense if its hard to convert that code
> >
> > Hi Michael,
> >
> > You mean I still need to convert that code, did I get you right?
> 
> i dont think its needed if its complex
Hi Michael,

Oh, I get you now.

> 
> 
> > Since NASM function will get only the address of SwsConext c ( in order to be
> compatible with yuv2rgb_c function in parameters), not the address of c-
> >redDither nor the c->dstW. I have no way to get the value of c->dstW by using
> address offset.
> > Do you have any suggestion for solving that problem?
> 
> maybe the offset to redDither could be the 2nd field of the struct or the whole
> redDither block could be moved up
> 
> but only do this if the resulting asm code is reasonable.
> The goal is to eliminate some ugly wraper funcions which call the asm if the
> code to avoid that is more ugly then it is not a good idea

For now, I prefer to keep the current approach.
Moving the redDither block is a good idea. I will try it after the Chinese New Year holiday; if the result is not too ugly, I will post it in a follow-up patch.
I have also sent V8, which fixes all the issues mentioned before (except the wrapper functions).
Please review it.

Thank you,
Ting Fu

> 
> thx
> 
> [...]
> --
> Michael     GnuPG fingerprint: 9FF2128B147EF6730BADF133611EC787040B0FAB
> 
> The day soldiers stop bringing you their problems is the day you have stopped
> leading them. They have either lost confidence that you can help or concluded
> you do not care. Either case is a failure of leadership. - Colin Powell
Ting Fu Jan. 20, 2020, 3:01 a.m. UTC | #12
> -----Original Message-----
> From: ffmpeg-devel <ffmpeg-devel-bounces@ffmpeg.org> On Behalf Of Ronald
> S. Bultje
> Sent: Sunday, January 19, 2020 10:55 PM
> To: FFmpeg development discussions and patches <ffmpeg-devel@ffmpeg.org>
> Subject: Re: [FFmpeg-devel] [PATCH V7 1/2] libswscale/x86/yuv2rgb: Change
> inline assembly into nasm code
> 
> Hi,
> 
> On Sat, Jan 18, 2020 at 9:49 PM Fu, Ting <ting.fu@intel.com> wrote:
> 
> > Since NASM function will get only the address of SwsConext c ( in
> > order to be compatible with yuv2rgb_c function in parameters), not the
> > address of
> > c->redDither nor the c->dstW. I have no way to get the value of
> > c->c->dstW by
> > using address offset.
> 
> 
> Nasm and related variants have "struc" (like "struct" in C) for this.
Hi Ronald,

Thank you so much for the information. I believe it will be helpful in future patches.

Thank you,
Ting Fu
> 
> See for example this code:
> https://code.videolan.org/videolan/dav1d/blob/master/src/x86/film_grain.asm
> #L64
> 
> Hope this helps,
> Ronald
> _______________________________________________
> ffmpeg-devel mailing list
> ffmpeg-devel@ffmpeg.org
> https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
> 
> To unsubscribe, visit link above, or email ffmpeg-devel-request@ffmpeg.org
> with subject "unsubscribe".
diff mbox series

Patch

diff --git a/libswscale/x86/Makefile b/libswscale/x86/Makefile
index f317d5dd9b..831d5359aa 100644
--- a/libswscale/x86/Makefile
+++ b/libswscale/x86/Makefile
@@ -12,3 +12,4 @@  X86ASM-OBJS                     += x86/input.o                          \
                                    x86/output.o                         \
                                    x86/scale.o                          \
                                    x86/rgb_2_rgb.o                      \
+                                   x86/yuv_2_rgb.o                      \
diff --git a/libswscale/x86/swscale.c b/libswscale/x86/swscale.c
index 0eed4f18d5..e9d474a1e8 100644
--- a/libswscale/x86/swscale.c
+++ b/libswscale/x86/swscale.c
@@ -29,6 +29,14 @@ 
 #include "libavutil/cpu.h"
 #include "libavutil/pixdesc.h"
 
+const DECLARE_ALIGNED(8, uint64_t, ff_dither4)[2] = {
+    0x0103010301030103LL,
+    0x0200020002000200LL,};
+
+const DECLARE_ALIGNED(8, uint64_t, ff_dither8)[2] = {
+    0x0602060206020602LL,
+    0x0004000400040004LL,};
+
 #if HAVE_INLINE_ASM
 
 #define DITHER1XBPP
@@ -38,14 +46,6 @@  DECLARE_ASM_CONST(8, uint64_t, bFC)=       0xFCFCFCFCFCFCFCFCLL;
 DECLARE_ASM_CONST(8, uint64_t, w10)=       0x0010001000100010LL;
 DECLARE_ASM_CONST(8, uint64_t, w02)=       0x0002000200020002LL;
 
-const DECLARE_ALIGNED(8, uint64_t, ff_dither4)[2] = {
-    0x0103010301030103LL,
-    0x0200020002000200LL,};
-
-const DECLARE_ALIGNED(8, uint64_t, ff_dither8)[2] = {
-    0x0602060206020602LL,
-    0x0004000400040004LL,};
-
 DECLARE_ASM_CONST(8, uint64_t, b16Mask)=   0x001F001F001F001FLL;
 DECLARE_ASM_CONST(8, uint64_t, g16Mask)=   0x07E007E007E007E0LL;
 DECLARE_ASM_CONST(8, uint64_t, r16Mask)=   0xF800F800F800F800LL;
diff --git a/libswscale/x86/yuv2rgb.c b/libswscale/x86/yuv2rgb.c
index 5e2f77c20f..dd813d4deb 100644
--- a/libswscale/x86/yuv2rgb.c
+++ b/libswscale/x86/yuv2rgb.c
@@ -37,7 +37,7 @@ 
 #include "libavutil/x86/cpu.h"
 #include "libavutil/cpu.h"
 
-#if HAVE_INLINE_ASM
+#if HAVE_X86ASM
 
 #define DITHER1XBPP // only for MMX
 
@@ -50,32 +50,31 @@  DECLARE_ASM_CONST(8, uint64_t, pb_03) = 0x0303030303030303ULL;
 DECLARE_ASM_CONST(8, uint64_t, pb_07) = 0x0707070707070707ULL;
 
 //MMX versions
-#if HAVE_MMX_INLINE && HAVE_6REGS
+#if HAVE_MMX
 #undef RENAME
 #undef COMPILE_TEMPLATE_MMXEXT
 #define COMPILE_TEMPLATE_MMXEXT 0
 #define RENAME(a) a ## _mmx
 #include "yuv2rgb_template.c"
-#endif /* HAVE_MMX_INLINE && HAVE_6REGS */
+#endif /* HAVE_MMX */
 
 // MMXEXT versions
-#if HAVE_MMXEXT_INLINE && HAVE_6REGS
+#if HAVE_MMXEXT
 #undef RENAME
 #undef COMPILE_TEMPLATE_MMXEXT
 #define COMPILE_TEMPLATE_MMXEXT 1
 #define RENAME(a) a ## _mmxext
 #include "yuv2rgb_template.c"
-#endif /* HAVE_MMXEXT_INLINE && HAVE_6REGS */
+#endif /* HAVE_MMXEXT */
 
-#endif /* HAVE_INLINE_ASM */
+#endif /* HAVE_X86ASM */
 
 av_cold SwsFunc ff_yuv2rgb_init_x86(SwsContext *c)
 {
-#if HAVE_MMX_INLINE && HAVE_6REGS
+#if HAVE_X86ASM
     int cpu_flags = av_get_cpu_flags();
 
-#if HAVE_MMXEXT_INLINE
-    if (INLINE_MMXEXT(cpu_flags)) {
+    if (EXTERNAL_MMXEXT(cpu_flags)) {
         switch (c->dstFormat) {
         case AV_PIX_FMT_RGB24:
             return yuv420_rgb24_mmxext;
@@ -83,37 +82,36 @@  av_cold SwsFunc ff_yuv2rgb_init_x86(SwsContext *c)
             return yuv420_bgr24_mmxext;
         }
     }
-#endif
 
-    if (INLINE_MMX(cpu_flags)) {
+    if (EXTERNAL_MMX(cpu_flags)) {
         switch (c->dstFormat) {
-            case AV_PIX_FMT_RGB32:
-                if (c->srcFormat == AV_PIX_FMT_YUVA420P) {
-#if HAVE_7REGS && CONFIG_SWSCALE_ALPHA
-                    return yuva420_rgb32_mmx;
+        case AV_PIX_FMT_RGB32:
+            if (c->srcFormat == AV_PIX_FMT_YUVA420P) {
+#if CONFIG_SWSCALE_ALPHA
+                return yuva420_rgb32_mmx;
 #endif
-                    break;
-                } else
-                    return yuv420_rgb32_mmx;
-            case AV_PIX_FMT_BGR32:
-                if (c->srcFormat == AV_PIX_FMT_YUVA420P) {
-#if HAVE_7REGS && CONFIG_SWSCALE_ALPHA
-                    return yuva420_bgr32_mmx;
+                break;
+            } else
+                return yuv420_rgb32_mmx;
+        case AV_PIX_FMT_BGR32:
+            if (c->srcFormat == AV_PIX_FMT_YUVA420P) {
+#if CONFIG_SWSCALE_ALPHA
+                return yuva420_bgr32_mmx;
 #endif
-                    break;
-                } else
-                    return yuv420_bgr32_mmx;
-            case AV_PIX_FMT_RGB24:
-                return yuv420_rgb24_mmx;
-            case AV_PIX_FMT_BGR24:
-                return yuv420_bgr24_mmx;
-            case AV_PIX_FMT_RGB565:
-                return yuv420_rgb16_mmx;
-            case AV_PIX_FMT_RGB555:
-                return yuv420_rgb15_mmx;
+                break;
+            } else
+                return yuv420_bgr32_mmx;
+        case AV_PIX_FMT_RGB24:
+            return yuv420_rgb24_mmx;
+        case AV_PIX_FMT_BGR24:
+            return yuv420_bgr24_mmx;
+        case AV_PIX_FMT_RGB565:
+            return yuv420_rgb16_mmx;
+        case AV_PIX_FMT_RGB555:
+            return yuv420_rgb15_mmx;
         }
     }
-#endif /* HAVE_MMX_INLINE  && HAVE_6REGS */
 
+#endif /* HAVE_X86ASM */
     return NULL;
 }
diff --git a/libswscale/x86/yuv2rgb_template.c b/libswscale/x86/yuv2rgb_template.c
index acb78f520e..554750f3e1 100644
--- a/libswscale/x86/yuv2rgb_template.c
+++ b/libswscale/x86/yuv2rgb_template.c
@@ -26,31 +26,13 @@ 
 #include "libavutil/x86/asm.h"
 #include "libswscale/swscale_internal.h"
 
-#undef MOVNTQ
-#undef EMMS
-#undef SFENCE
-
-#if COMPILE_TEMPLATE_MMXEXT
-#define MOVNTQ "movntq"
-#define SFENCE "sfence"
-#else
-#define MOVNTQ "movq"
-#define SFENCE " # nop"
-#endif
-
-#define REG_BLUE  "0"
-#define REG_RED   "1"
-#define REG_GREEN "2"
-#define REG_ALPHA "3"
-
 #define YUV2RGB_LOOP(depth)                                          \
     h_size = (c->dstW + 7) & ~7;                                     \
     if (h_size * depth > FFABS(dstStride[0]))                        \
         h_size -= 8;                                                 \
                                                                      \
-    vshift = c->srcFormat != AV_PIX_FMT_YUV422P;                        \
+    vshift = c->srcFormat != AV_PIX_FMT_YUV422P;                     \
                                                                      \
-    __asm__ volatile ("pxor %mm4, %mm4\n\t");                        \
     for (y = 0; y < srcSliceH; y++) {                                \
         uint8_t *image    = dst[0] + (y + srcSliceY) * dstStride[0]; \
         const uint8_t *py = src[0] +               y * srcStride[0]; \
@@ -58,410 +40,149 @@ 
         const uint8_t *pv = src[2] +   (y >> vshift) * srcStride[2]; \
         x86_reg index = -h_size / 2;                                 \
 
-#define YUV2RGB_INITIAL_LOAD          \
-    __asm__ volatile (                \
-        "movq (%5, %0, 2), %%mm6\n\t" \
-        "movd    (%2, %0), %%mm0\n\t" \
-        "movd    (%3, %0), %%mm1\n\t" \
-        "1: \n\t"                     \
-
-/* YUV2RGB core
- * Conversion is performed in usual way:
- * R = Y' * Ycoef + Vred * V'
- * G = Y' * Ycoef + Vgreen * V' + Ugreen * U'
- * B = Y' * Ycoef               + Ublue * U'
- *
- * where X' = X * 8 - Xoffset (multiplication is performed to increase
- * precision a bit).
- * Since it operates in YUV420 colorspace, Y component is additionally
- * split into Y1 and Y2 for even and odd pixels.
- *
- * Input:
- * mm0 - U (4 elems), mm1 - V (4 elems), mm6 - Y (8 elems), mm4 - zero register
- * Output:
- * mm1 - R, mm2 - G, mm0 - B
- */
-#define YUV2RGB                                  \
-    /* convert Y, U, V into Y1', Y2', U', V' */  \
-    "movq      %%mm6, %%mm7\n\t"                 \
-    "punpcklbw %%mm4, %%mm0\n\t"                 \
-    "punpcklbw %%mm4, %%mm1\n\t"                 \
-    "pand     "MANGLE(mmx_00ffw)", %%mm6\n\t"    \
-    "psrlw     $8,    %%mm7\n\t"                 \
-    "psllw     $3,    %%mm0\n\t"                 \
-    "psllw     $3,    %%mm1\n\t"                 \
-    "psllw     $3,    %%mm6\n\t"                 \
-    "psllw     $3,    %%mm7\n\t"                 \
-    "psubsw   "U_OFFSET"(%4), %%mm0\n\t"         \
-    "psubsw   "V_OFFSET"(%4), %%mm1\n\t"         \
-    "psubw    "Y_OFFSET"(%4), %%mm6\n\t"         \
-    "psubw    "Y_OFFSET"(%4), %%mm7\n\t"         \
-\
-     /* multiply by coefficients */              \
-    "movq      %%mm0, %%mm2\n\t"                 \
-    "movq      %%mm1, %%mm3\n\t"                 \
-    "pmulhw   "UG_COEFF"(%4), %%mm2\n\t"         \
-    "pmulhw   "VG_COEFF"(%4), %%mm3\n\t"         \
-    "pmulhw   "Y_COEFF" (%4), %%mm6\n\t"         \
-    "pmulhw   "Y_COEFF" (%4), %%mm7\n\t"         \
-    "pmulhw   "UB_COEFF"(%4), %%mm0\n\t"         \
-    "pmulhw   "VR_COEFF"(%4), %%mm1\n\t"         \
-    "paddsw    %%mm3, %%mm2\n\t"                 \
-    /* now: mm0 = UB, mm1 = VR, mm2 = CG */      \
-    /*      mm6 = Y1, mm7 = Y2 */                \
-\
-    /* produce RGB */                            \
-    "movq      %%mm7, %%mm3\n\t"                 \
-    "movq      %%mm7, %%mm5\n\t"                 \
-    "paddsw    %%mm0, %%mm3\n\t"                 \
-    "paddsw    %%mm1, %%mm5\n\t"                 \
-    "paddsw    %%mm2, %%mm7\n\t"                 \
-    "paddsw    %%mm6, %%mm0\n\t"                 \
-    "paddsw    %%mm6, %%mm1\n\t"                 \
-    "paddsw    %%mm6, %%mm2\n\t"                 \
-
-#define RGB_PACK_INTERLEAVE                  \
-    /* pack and interleave even/odd pixels */    \
-    "packuswb  %%mm1, %%mm0\n\t"                 \
-    "packuswb  %%mm5, %%mm3\n\t"                 \
-    "packuswb  %%mm2, %%mm2\n\t"                 \
-    "movq      %%mm0, %%mm1\n\n"                 \
-    "packuswb  %%mm7, %%mm7\n\t"                 \
-    "punpcklbw %%mm3, %%mm0\n\t"                 \
-    "punpckhbw %%mm3, %%mm1\n\t"                 \
-    "punpcklbw %%mm7, %%mm2\n\t"                 \
-
-#define YUV2RGB_ENDLOOP(depth)                   \
-    "movq 8 (%5, %0, 2), %%mm6\n\t"              \
-    "movd 4 (%3, %0),    %%mm1\n\t"              \
-    "movd 4 (%2, %0),    %%mm0\n\t"              \
-    "add $"AV_STRINGIFY(depth * 8)", %1\n\t"     \
-    "add  $4, %0\n\t"                            \
-    "js   1b\n\t"                                \
-
-#if COMPILE_TEMPLATE_MMXEXT
-#undef RGB_PACK24_B_OPERANDS
-#define RGB_PACK24_B_OPERANDS NAMED_CONSTRAINTS_ARRAY_ADD(mask1101,mask0110,mask0100,mask0010,mask1001)
-#else
-#undef RGB_PACK24_B_OPERANDS
-#define RGB_PACK24_B_OPERANDS
-#endif
-
-#define YUV2RGB_OPERANDS                                          \
-        : "+r" (index), "+r" (image)                              \
-        : "r" (pu - index), "r" (pv - index), "r"(&c->redDither), \
-          "r" (py - 2*index)                                      \
-          NAMED_CONSTRAINTS_ADD(mmx_00ffw,pb_03,pb_07,mmx_redmask,pb_e0) \
-          RGB_PACK24_B_OPERANDS                                   \
-        : "memory"                                                \
-        );                                                        \
-    }                                                             \
-
-#define YUV2RGB_OPERANDS_ALPHA                                    \
-        : "+r" (index), "+r" (image)                              \
-        : "r" (pu - index), "r" (pv - index), "r"(&c->redDither), \
-          "r" (py - 2*index), "r" (pa - 2*index)                  \
-          NAMED_CONSTRAINTS_ADD(mmx_00ffw)                        \
-        : "memory"                                                \
-        );                                                        \
-    }                                                             \
-
-#define YUV2RGB_ENDFUNC                          \
-    __asm__ volatile (SFENCE"\n\t"               \
-                    "emms    \n\t");             \
-    return srcSliceH;                            \
-
-#define IF0(x)
-#define IF1(x) x
-
-#define RGB_PACK16(gmask, is15)                  \
-    "pand      "MANGLE(mmx_redmask)", %%mm0\n\t" \
-    "pand      "MANGLE(mmx_redmask)", %%mm1\n\t" \
-    "movq      %%mm2,     %%mm3\n\t"             \
-    "psllw   $"AV_STRINGIFY(3-is15)", %%mm2\n\t" \
-    "psrlw   $"AV_STRINGIFY(5+is15)", %%mm3\n\t" \
-    "psrlw     $3,        %%mm0\n\t"             \
-    IF##is15("psrlw  $1,  %%mm1\n\t")            \
-    "pand "MANGLE(pb_e0)", %%mm2\n\t"            \
-    "pand "MANGLE(gmask)", %%mm3\n\t"            \
-    "por       %%mm2,     %%mm0\n\t"             \
-    "por       %%mm3,     %%mm1\n\t"             \
-    "movq      %%mm0,     %%mm2\n\t"             \
-    "punpcklbw %%mm1,     %%mm0\n\t"             \
-    "punpckhbw %%mm1,     %%mm2\n\t"             \
-    MOVNTQ "   %%mm0,      (%1)\n\t"             \
-    MOVNTQ "   %%mm2,     8(%1)\n\t"             \
-
-#define DITHER_RGB                               \
-    "paddusb "BLUE_DITHER"(%4),  %%mm0\n\t"      \
-    "paddusb "GREEN_DITHER"(%4), %%mm2\n\t"      \
-    "paddusb "RED_DITHER"(%4),   %%mm1\n\t"      \
+extern void RENAME(ff_yuv_420_rgb24)(x86_reg index, uint8_t *image, const uint8_t *pu_index,
+                                     const uint8_t *pv_index, const uint64_t *pointer_c_dither,
+                                     const uint8_t *py_2index);
+extern void RENAME(ff_yuv_420_bgr24)(x86_reg index, uint8_t *image, const uint8_t *pu_index,
+                                     const uint8_t *pv_index, const uint64_t *pointer_c_dither,
+                                     const uint8_t *py_2index);
 
 #if !COMPILE_TEMPLATE_MMXEXT
-static inline int RENAME(yuv420_rgb15)(SwsContext *c, const uint8_t *src[],
-                                       int srcStride[],
-                                       int srcSliceY, int srcSliceH,
-                                       uint8_t *dst[], int dstStride[])
+extern void RENAME(ff_yuv_420_rgb15)(x86_reg index, uint8_t *image, const uint8_t *pu_index,
+                                     const uint8_t *pv_index, const uint64_t *pointer_c_dither,
+                                     const uint8_t *py_2index);
+extern void RENAME(ff_yuv_420_rgb16)(x86_reg index, uint8_t *image, const uint8_t *pu_index,
+                                     const uint8_t *pv_index, const uint64_t *pointer_c_dither,
+                                     const uint8_t *py_2index);
+extern void RENAME(ff_yuv_420_rgb32)(x86_reg index, uint8_t *image, const uint8_t *pu_index,
+                                     const uint8_t *pv_index, const uint64_t *pointer_c_dither,
+                                     const uint8_t *py_2index);
+extern void RENAME(ff_yuv_420_bgr32)(x86_reg index, uint8_t *image, const uint8_t *pu_index,
+                                     const uint8_t *pv_index, const uint64_t *pointer_c_dither,
+                                     const uint8_t *py_2index);
+extern void RENAME(ff_yuva_420_rgb32)(x86_reg index, uint8_t *image, const uint8_t *pu_index,
+                                      const uint8_t *pv_index, const uint64_t *pointer_c_dither,
+                                      const uint8_t *py_2index, const uint8_t *pa_2index);
+extern void RENAME(ff_yuva_420_bgr32)(x86_reg index, uint8_t *image, const uint8_t *pu_index,
+                                      const uint8_t *pv_index, const uint64_t *pointer_c_dither,
+                                      const uint8_t *py_2index, const uint8_t *pa_2index);
+
+static int RENAME(yuv420_rgb15)(SwsContext *c, const uint8_t *src[],
+                                               int srcStride[],
+                                               int srcSliceY, int srcSliceH,
+                                               uint8_t *dst[], int dstStride[]) /* calls the 15-bit asm routine once per source row */
 {
     int y, h_size, vshift;
-
     YUV2RGB_LOOP(2)
 
 #ifdef DITHER1XBPP
-        c->blueDither  = ff_dither8[y       & 1];
-        c->greenDither = ff_dither8[y       & 1];
-        c->redDither   = ff_dither8[(y + 1) & 1];
-#endif
-
-        YUV2RGB_INITIAL_LOAD
-        YUV2RGB
-        RGB_PACK_INTERLEAVE
-#ifdef DITHER1XBPP
-        DITHER_RGB
+    c->blueDither  = ff_dither8[y       & 1]; /* dither entry chosen by row parity */
+    c->greenDither = ff_dither8[y       & 1]; /* same parity as blue */
+    c->redDither   = ff_dither8[(y + 1) & 1]; /* opposite parity; NOTE(review): the asm adds all three unconditionally - confirm they are initialised when DITHER1XBPP is off */
 #endif
-        RGB_PACK16(pb_03, 1)
 
-    YUV2RGB_ENDLOOP(2)
-    YUV2RGB_OPERANDS
-    YUV2RGB_ENDFUNC
+    RENAME(ff_yuv_420_rgb15)(index, image, pu - index, pv - index, &(c->redDither), py - 2 * index); /* pointers are pre-biased by -index; the asm reads its constants at 8-byte slots from &c->redDither */
+    } /* closes the per-row for loop opened inside YUV2RGB_LOOP */
+    return srcSliceH; /* one asm call was made for each of the srcSliceH rows */
 }
 
-static inline int RENAME(yuv420_rgb16)(SwsContext *c, const uint8_t *src[],
-                                       int srcStride[],
-                                       int srcSliceY, int srcSliceH,
-                                       uint8_t *dst[], int dstStride[])
+static int RENAME(yuv420_rgb16)(SwsContext *c, const uint8_t *src[],
+                                               int srcStride[],
+                                               int srcSliceY, int srcSliceH,
+                                               uint8_t *dst[], int dstStride[]) /* calls the 16-bit asm routine once per source row */
 {
     int y, h_size, vshift;
-
     YUV2RGB_LOOP(2)
 
 #ifdef DITHER1XBPP
-        c->blueDither  = ff_dither8[y       & 1];
-        c->greenDither = ff_dither4[y       & 1];
-        c->redDither   = ff_dither8[(y + 1) & 1];
-#endif
-
-        YUV2RGB_INITIAL_LOAD
-        YUV2RGB
-        RGB_PACK_INTERLEAVE
-#ifdef DITHER1XBPP
-        DITHER_RGB
+    c->blueDither  = ff_dither8[y       & 1]; /* dither entry chosen by row parity */
+    c->greenDither = ff_dither4[y       & 1]; /* green reads ff_dither4 here, unlike the 15-bit variant */
+    c->redDither   = ff_dither8[(y + 1) & 1]; /* opposite parity; NOTE(review): the asm adds all three unconditionally - confirm they are initialised when DITHER1XBPP is off */
 #endif
-        RGB_PACK16(pb_07, 0)
 
-    YUV2RGB_ENDLOOP(2)
-    YUV2RGB_OPERANDS
-    YUV2RGB_ENDFUNC
+    RENAME(ff_yuv_420_rgb16)(index, image, pu - index, pv - index, &(c->redDither), py - 2 * index); /* pointers are pre-biased by -index; the asm reads its constants at 8-byte slots from &c->redDither */
+    } /* closes the per-row for loop opened inside YUV2RGB_LOOP */
+    return srcSliceH; /* one asm call was made for each of the srcSliceH rows */
 }
-#endif /* !COMPILE_TEMPLATE_MMXEXT */
-
-#define RGB_PACK24(blue, red)\
-    "packuswb  %%mm3,      %%mm0 \n" /* R0 R2 R4 R6 R1 R3 R5 R7 */\
-    "packuswb  %%mm5,      %%mm1 \n" /* B0 B2 B4 B6 B1 B3 B5 B7 */\
-    "packuswb  %%mm7,      %%mm2 \n" /* G0 G2 G4 G6 G1 G3 G5 G7 */\
-    "movq      %%mm"red",  %%mm3 \n"\
-    "movq      %%mm"blue", %%mm6 \n"\
-    "psrlq     $32,        %%mm"red" \n" /* R1 R3 R5 R7 */\
-    "punpcklbw %%mm2,      %%mm3 \n" /* R0 G0 R2 G2 R4 G4 R6 G6 */\
-    "punpcklbw %%mm"red",  %%mm6 \n" /* B0 R1 B2 R3 B4 R5 B6 R7 */\
-    "movq      %%mm3,      %%mm5 \n"\
-    "punpckhbw %%mm"blue", %%mm2 \n" /* G1 B1 G3 B3 G5 B5 G7 B7 */\
-    "punpcklwd %%mm6,      %%mm3 \n" /* R0 G0 B0 R1 R2 G2 B2 R3 */\
-    "punpckhwd %%mm6,      %%mm5 \n" /* R4 G4 B4 R5 R6 G6 B6 R7 */\
-    RGB_PACK24_B
-
-#if COMPILE_TEMPLATE_MMXEXT
-DECLARE_ASM_CONST(8, int16_t, mask1101[4]) = {-1,-1, 0,-1};
-DECLARE_ASM_CONST(8, int16_t, mask0010[4]) = { 0, 0,-1, 0};
-DECLARE_ASM_CONST(8, int16_t, mask0110[4]) = { 0,-1,-1, 0};
-DECLARE_ASM_CONST(8, int16_t, mask1001[4]) = {-1, 0, 0,-1};
-DECLARE_ASM_CONST(8, int16_t, mask0100[4]) = { 0,-1, 0, 0};
-#undef RGB_PACK24_B
-#define RGB_PACK24_B\
-    "pshufw    $0xc6,  %%mm2, %%mm1 \n"\
-    "pshufw    $0x84,  %%mm3, %%mm6 \n"\
-    "pshufw    $0x38,  %%mm5, %%mm7 \n"\
-    "pand "MANGLE(mask1101)", %%mm6 \n" /* R0 G0 B0 R1 -- -- R2 G2 */\
-    "movq      %%mm1,         %%mm0 \n"\
-    "pand "MANGLE(mask0110)", %%mm7 \n" /* -- -- R6 G6 B6 R7 -- -- */\
-    "movq      %%mm1,         %%mm2 \n"\
-    "pand "MANGLE(mask0100)", %%mm1 \n" /* -- -- G3 B3 -- -- -- -- */\
-    "psrlq       $48,         %%mm3 \n" /* B2 R3 -- -- -- -- -- -- */\
-    "pand "MANGLE(mask0010)", %%mm0 \n" /* -- -- -- -- G1 B1 -- -- */\
-    "psllq       $32,         %%mm5 \n" /* -- -- -- -- R4 G4 B4 R5 */\
-    "pand "MANGLE(mask1001)", %%mm2 \n" /* G5 B5 -- -- -- -- G7 B7 */\
-    "por       %%mm3,         %%mm1 \n"\
-    "por       %%mm6,         %%mm0 \n"\
-    "por       %%mm5,         %%mm1 \n"\
-    "por       %%mm7,         %%mm2 \n"\
-    MOVNTQ"    %%mm0,          (%1) \n"\
-    MOVNTQ"    %%mm1,         8(%1) \n"\
-    MOVNTQ"    %%mm2,        16(%1) \n"\
-
-#else
-#undef RGB_PACK24_B
-#define RGB_PACK24_B\
-    "movd      %%mm3,       (%1) \n" /* R0 G0 B0 R1 */\
-    "movd      %%mm2,      4(%1) \n" /* G1 B1 */\
-    "psrlq     $32,        %%mm3 \n"\
-    "psrlq     $16,        %%mm2 \n"\
-    "movd      %%mm3,      6(%1) \n" /* R2 G2 B2 R3 */\
-    "movd      %%mm2,     10(%1) \n" /* G3 B3 */\
-    "psrlq     $16,        %%mm2 \n"\
-    "movd      %%mm5,     12(%1) \n" /* R4 G4 B4 R5 */\
-    "movd      %%mm2,     16(%1) \n" /* G5 B5 */\
-    "psrlq     $32,        %%mm5 \n"\
-    "movd      %%mm2,     20(%1) \n" /* -- -- G7 B7 */\
-    "movd      %%mm5,     18(%1) \n" /* R6 G6 B6 R7 */\
 
-#endif
-
-static inline int RENAME(yuv420_rgb24)(SwsContext *c, const uint8_t *src[],
-                                       int srcStride[],
-                                       int srcSliceY, int srcSliceH,
-                                       uint8_t *dst[], int dstStride[])
+static int RENAME(yuv420_rgb32)(SwsContext *c, const uint8_t *src[],
+                                               int srcStride[],
+                                               int srcSliceY, int srcSliceH,
+                                               uint8_t *dst[], int dstStride[]) /* calls the 32-bit rgb asm routine once per source row */
 {
     int y, h_size, vshift;
+    YUV2RGB_LOOP(4) /* depth 4: the asm advances the output 8 * 4 bytes per step; opens the per-row for loop */
 
-    YUV2RGB_LOOP(3)
-
-        YUV2RGB_INITIAL_LOAD
-        YUV2RGB
-        RGB_PACK24(REG_BLUE, REG_RED)
-
-    YUV2RGB_ENDLOOP(3)
-    YUV2RGB_OPERANDS
-    YUV2RGB_ENDFUNC
+    RENAME(ff_yuv_420_rgb32)(index, image, pu - index, pv - index, &(c->redDither), py - 2 * index); /* pointers are pre-biased by -index; the asm reads its constants at 8-byte slots from &c->redDither */
+    } /* closes the per-row for loop opened inside YUV2RGB_LOOP */
+    return srcSliceH; /* one asm call was made for each of the srcSliceH rows */
 }
 
-static inline int RENAME(yuv420_bgr24)(SwsContext *c, const uint8_t *src[],
-                                       int srcStride[],
-                                       int srcSliceY, int srcSliceH,
-                                       uint8_t *dst[], int dstStride[])
+static int RENAME(yuv420_bgr32)(SwsContext *c, const uint8_t *src[],
+                                               int srcStride[],
+                                               int srcSliceY, int srcSliceH,
+                                               uint8_t *dst[], int dstStride[]) /* calls the 32-bit bgr asm routine once per source row */
 {
     int y, h_size, vshift;
+    YUV2RGB_LOOP(4) /* depth 4: the asm advances the output 8 * 4 bytes per step; opens the per-row for loop */
 
-    YUV2RGB_LOOP(3)
-
-        YUV2RGB_INITIAL_LOAD
-        YUV2RGB
-        RGB_PACK24(REG_RED, REG_BLUE)
-
-    YUV2RGB_ENDLOOP(3)
-    YUV2RGB_OPERANDS
-    YUV2RGB_ENDFUNC
+    RENAME(ff_yuv_420_bgr32)(index, image, pu - index, pv - index, &(c->redDither), py - 2 * index); /* pointers are pre-biased by -index; the asm reads its constants at 8-byte slots from &c->redDither */
+    } /* closes the per-row for loop opened inside YUV2RGB_LOOP */
+    return srcSliceH; /* one asm call was made for each of the srcSliceH rows */
 }
 
-
-#define SET_EMPTY_ALPHA                                                      \
-    "pcmpeqd   %%mm"REG_ALPHA", %%mm"REG_ALPHA"\n\t" /* set alpha to 0xFF */ \
-
-#define LOAD_ALPHA                                   \
-    "movq      (%6, %0, 2),     %%mm"REG_ALPHA"\n\t" \
-
-#define RGB_PACK32(red, green, blue, alpha)  \
-    "movq      %%mm"blue",  %%mm5\n\t"       \
-    "movq      %%mm"red",   %%mm6\n\t"       \
-    "punpckhbw %%mm"green", %%mm5\n\t"       \
-    "punpcklbw %%mm"green", %%mm"blue"\n\t"  \
-    "punpckhbw %%mm"alpha", %%mm6\n\t"       \
-    "punpcklbw %%mm"alpha", %%mm"red"\n\t"   \
-    "movq      %%mm"blue",  %%mm"green"\n\t" \
-    "movq      %%mm5,       %%mm"alpha"\n\t" \
-    "punpcklwd %%mm"red",   %%mm"blue"\n\t"  \
-    "punpckhwd %%mm"red",   %%mm"green"\n\t" \
-    "punpcklwd %%mm6,       %%mm5\n\t"       \
-    "punpckhwd %%mm6,       %%mm"alpha"\n\t" \
-    MOVNTQ "   %%mm"blue",   0(%1)\n\t"      \
-    MOVNTQ "   %%mm"green",  8(%1)\n\t"      \
-    MOVNTQ "   %%mm5,       16(%1)\n\t"      \
-    MOVNTQ "   %%mm"alpha", 24(%1)\n\t"      \
-
-#if !COMPILE_TEMPLATE_MMXEXT
-static inline int RENAME(yuv420_rgb32)(SwsContext *c, const uint8_t *src[],
-                                       int srcStride[],
-                                       int srcSliceY, int srcSliceH,
-                                       uint8_t *dst[], int dstStride[])
+static int RENAME(yuva420_rgb32)(SwsContext *c, const uint8_t *src[],
+                                                int srcStride[],
+                                                int srcSliceY, int srcSliceH,
+                                                uint8_t *dst[], int dstStride[]) /* calls the 32-bit rgb-with-alpha asm routine once per source row */
 {
     int y, h_size, vshift;
-
     YUV2RGB_LOOP(4)
 
-        YUV2RGB_INITIAL_LOAD
-        YUV2RGB
-        RGB_PACK_INTERLEAVE
-        SET_EMPTY_ALPHA
-        RGB_PACK32(REG_RED, REG_GREEN, REG_BLUE, REG_ALPHA)
-
-    YUV2RGB_ENDLOOP(4)
-    YUV2RGB_OPERANDS
-    YUV2RGB_ENDFUNC
+    const uint8_t *pa = src[3] + y * srcStride[3]; /* alpha row from plane 3, indexed by y directly (no vshift) */
+    RENAME(ff_yuva_420_rgb32)(index, image, pu - index, pv - index, &(c->redDither), py - 2 * index, pa - 2 * index); /* pointers are pre-biased by -index (luma and alpha by -2 * index) */
+    } /* closes the per-row for loop opened inside YUV2RGB_LOOP */
+    return srcSliceH; /* one asm call was made for each of the srcSliceH rows */
 }
 
-#if HAVE_7REGS && CONFIG_SWSCALE_ALPHA
-static inline int RENAME(yuva420_rgb32)(SwsContext *c, const uint8_t *src[],
-                                        int srcStride[],
-                                        int srcSliceY, int srcSliceH,
-                                        uint8_t *dst[], int dstStride[])
+static int RENAME(yuva420_bgr32)(SwsContext *c, const uint8_t *src[],
+                                                int srcStride[],
+                                                int srcSliceY, int srcSliceH,
+                                                uint8_t *dst[], int dstStride[]) /* calls the 32-bit bgr-with-alpha asm routine once per source row */
 {
     int y, h_size, vshift;
-
     YUV2RGB_LOOP(4)
 
-        const uint8_t *pa = src[3] + y * srcStride[3];
-        YUV2RGB_INITIAL_LOAD
-        YUV2RGB
-        RGB_PACK_INTERLEAVE
-        LOAD_ALPHA
-        RGB_PACK32(REG_RED, REG_GREEN, REG_BLUE, REG_ALPHA)
-
-    YUV2RGB_ENDLOOP(4)
-    YUV2RGB_OPERANDS_ALPHA
-    YUV2RGB_ENDFUNC
+    const uint8_t *pa = src[3] + y * srcStride[3]; /* alpha row from plane 3, indexed by y directly (no vshift) */
+    RENAME(ff_yuva_420_bgr32)(index, image, pu - index, pv - index, &(c->redDither), py - 2 * index, pa - 2 * index); /* pointers are pre-biased by -index (luma and alpha by -2 * index) */
+    } /* closes the per-row for loop opened inside YUV2RGB_LOOP */
+    return srcSliceH; /* one asm call was made for each of the srcSliceH rows */
 }
 #endif
 
-static inline int RENAME(yuv420_bgr32)(SwsContext *c, const uint8_t *src[],
-                                       int srcStride[],
-                                       int srcSliceY, int srcSliceH,
-                                       uint8_t *dst[], int dstStride[])
+static int RENAME(yuv420_rgb24)(SwsContext *c, const uint8_t *src[],
+                                               int srcStride[],
+                                               int srcSliceY, int srcSliceH,
+                                               uint8_t *dst[], int dstStride[]) /* calls the 24-bit rgb asm routine once per source row */
 {
     int y, h_size, vshift;
+    YUV2RGB_LOOP(3) /* depth 3: the asm advances the output 8 * 3 bytes per step; opens the per-row for loop */
 
-    YUV2RGB_LOOP(4)
-
-        YUV2RGB_INITIAL_LOAD
-        YUV2RGB
-        RGB_PACK_INTERLEAVE
-        SET_EMPTY_ALPHA
-        RGB_PACK32(REG_BLUE, REG_GREEN, REG_RED, REG_ALPHA)
-
-    YUV2RGB_ENDLOOP(4)
-    YUV2RGB_OPERANDS
-    YUV2RGB_ENDFUNC
+    RENAME(ff_yuv_420_rgb24)(index, image, pu - index, pv - index, &(c->redDither), py - 2 * index); /* pointers are pre-biased by -index; the asm reads its constants at 8-byte slots from &c->redDither */
+    } /* closes the per-row for loop opened inside YUV2RGB_LOOP */
+    return srcSliceH; /* one asm call was made for each of the srcSliceH rows */
 }
 
-#if HAVE_7REGS && CONFIG_SWSCALE_ALPHA
-static inline int RENAME(yuva420_bgr32)(SwsContext *c, const uint8_t *src[],
-                                        int srcStride[],
-                                        int srcSliceY, int srcSliceH,
-                                        uint8_t *dst[], int dstStride[])
+static int RENAME(yuv420_bgr24)(SwsContext *c, const uint8_t *src[],
+                                               int srcStride[],
+                                               int srcSliceY, int srcSliceH,
+                                               uint8_t *dst[], int dstStride[]) /* calls the 24-bit bgr asm routine once per source row */
 {
     int y, h_size, vshift;
+    YUV2RGB_LOOP(3) /* depth 3: the asm advances the output 8 * 3 bytes per step; opens the per-row for loop */
 
-    YUV2RGB_LOOP(4)
-
-        const uint8_t *pa = src[3] + y * srcStride[3];
-        YUV2RGB_INITIAL_LOAD
-        YUV2RGB
-        RGB_PACK_INTERLEAVE
-        LOAD_ALPHA
-        RGB_PACK32(REG_BLUE, REG_GREEN, REG_RED, REG_ALPHA)
-
-    YUV2RGB_ENDLOOP(4)
-    YUV2RGB_OPERANDS_ALPHA
-    YUV2RGB_ENDFUNC
+    RENAME(ff_yuv_420_bgr24)(index, image, pu - index, pv - index, &(c->redDither), py - 2 * index); /* pointers are pre-biased by -index; the asm reads its constants at 8-byte slots from &c->redDither */
+    } /* closes the per-row for loop opened inside YUV2RGB_LOOP */
+    return srcSliceH; /* one asm call was made for each of the srcSliceH rows */
 }
-#endif
 
-#endif /* !COMPILE_TEMPLATE_MMXEXT */
diff --git a/libswscale/x86/yuv_2_rgb.asm b/libswscale/x86/yuv_2_rgb.asm
new file mode 100644
index 0000000000..a44ab1607b
--- /dev/null
+++ b/libswscale/x86/yuv_2_rgb.asm
@@ -0,0 +1,270 @@ 
+;******************************************************************************
+;* software YUV to RGB converter
+;*
+;* Copyright (C) 2001-2007 Michael Niedermayer
+;*           (c) 2010 Konstantin Shishkov
+;*
+;* This file is part of FFmpeg.
+;*
+;* FFmpeg is free software; you can redistribute it and/or
+;* modify it under the terms of the GNU Lesser General Public
+;* License as published by the Free Software Foundation; either
+;* version 2.1 of the License, or (at your option) any later version.
+;*
+;* FFmpeg is distributed in the hope that it will be useful,
+;* but WITHOUT ANY WARRANTY; without even the implied warranty of
+;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+;* Lesser General Public License for more details.
+;*
+;* You should have received a copy of the GNU Lesser General Public
+;* License along with FFmpeg; if not, write to the Free Software
+;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+;******************************************************************************
+
+%include "libavutil/x86/x86util.asm"
+
+SECTION_RODATA
+
+pw_00ff: times 4 dw 255 ; 0x00ff per word: keeps the even-indexed luma bytes as words
+pb_f8:   times 8 db 248 ; 0xf8 per byte: keeps the top 5 bits of blue/red in the 15/16-bit pack
+pb_e0:   times 8 db 224 ; 0xe0 per byte: keeps the top 3 bits of the shifted green in the 15/16-bit pack
+pb_03:   times 8 db 3   ; green mask selected (as gmask) for the 15-bit variant
+pb_07:   times 8 db 7   ; green mask selected (as gmask) for the 16-bit variant
+
+mask_1101: dw -1, -1,  0, -1 ; word-select masks used only by the mmxext 24-bit store path
+mask_0010: dw  0,  0, -1,  0 ; (each name lists which of the four words is kept, lowest word first)
+mask_0110: dw  0, -1, -1,  0
+mask_1001: dw -1,  0,  0, -1
+mask_0100: dw  0, -1,  0,  0
+
+SECTION .text
+
+;-----------------------------------------------------------------------------
+;
+; YUV420/YUVA420 to RGB/BGR 15/16/24/32
+; R = Y + ((vrCoff * (v - 128)) >> 8)
+; G = Y - ((ugCoff * (u - 128) + vgCoff * (v - 128)) >> 8)
+; B = Y + ((ubCoff * (u - 128)) >> 8)
+;
+;-----------------------------------------------------------------------------
+
+%macro MOV_H2L 1
+psrlq %1, 32 ; logical right shift by 32: the operand's high dword lands in its low dword, high dword becomes zero
+%endmacro
+
+%macro yuv2rgb_fn 3
+; Arguments: 1 = source layout (yuv | yuva), 2 = output order (rgb | bgr), 3 = output bits per pixel.
+; Reset to the six common arguments on every expansion (no reliance on earlier expansions).
+%define parameters index, image, pu_index, pv_index, pointer_c_dither, py_2index
+%define GPR_num 6
+%if %3 == 32
+    %ifidn %1, yuva
+    %define parameters index, image, pu_index, pv_index, pointer_c_dither, py_2index, pa_2index
+    %define GPR_num 7
+    %endif
+%endif
+
+%define m_green m2
+%define m_alpha m3
+%define m_y m6
+%define m_u m0
+%define m_v m1
+%ifidn %2, rgb
+%define m_red m1
+%define m_blue m0
+%else
+%define m_red m0
+%define m_blue m1
+%endif
+; The *_offset / *_coff names address 8-byte slots relative to pointer_c_dither (the C callers pass &c->redDither).
+%define time_num 1
+%define reg_num 8
+%define y_offset [pointer_c_ditherq + 8  * 8]
+%define u_offset [pointer_c_ditherq + 9  * 8]
+%define v_offset [pointer_c_ditherq + 10 * 8]
+%define ug_coff  [pointer_c_ditherq + 7  * 8]
+%define vg_coff  [pointer_c_ditherq + 6  * 8]
+%define y_coff   [pointer_c_ditherq + 3  * 8]
+%define ub_coff  [pointer_c_ditherq + 5  * 8]
+%define vr_coff  [pointer_c_ditherq + 4  * 8]
+
+cglobal %1_420_%2%3, GPR_num, GPR_num, reg_num, parameters
+
+%if ARCH_X86_64
+    movsxd indexq, indexd ; sign-extend the low 32 bits of index to 64 bits
+%endif
+    mova m_y, [py_2indexq + 2 * indexq] ; first group of luma bytes (two luma per chroma sample, hence 2 * index)
+    movh m_u, [pu_indexq  +     indexq] ; first group of U bytes
+    movh m_v, [pv_indexq  +     indexq] ; first group of V bytes
+    pxor m4, m4 ; zero for the byte->word unpacks; set once because nothing in the loop writes m4
+.loop0:
+    mova m7, m6
+    punpcklbw m0, m4 ; U bytes -> words
+    punpcklbw m1, m4 ; V bytes -> words
+    mova m2, [pw_00ff]
+    pand m6, m2 ; m6 = even-indexed luma as words
+    psrlw m7, 8 ; m7 = odd-indexed luma as words
+    psllw m0, 3 ; scale all four inputs by 8 before the offset subtraction
+    psllw m1, 3
+    psllw m6, 3
+    psllw m7, 3
+    psubsw m0, u_offset ; U = U - 128
+    psubsw m1, v_offset ; V = V - 128
+    psubw m6, y_offset
+    psubw m7, y_offset
+    mova m2, m0
+    mova m3, m1
+    pmulhw m2, ug_coff
+    pmulhw m3, vg_coff
+    pmulhw m6, y_coff
+    pmulhw m7, y_coff
+    pmulhw m0, ub_coff
+    pmulhw m1, vr_coff
+    paddsw m2, m3 ; m2 = combined U and V contribution to green
+    mova m3, m7
+    mova m5, m7
+    paddsw m3, m0 ; B1 B3 B5 B7 ...
+    paddsw m5, m1 ; R1 R3 R5 R7 ...
+    paddsw m7, m2 ; G1 G3 G5 G7 ...
+    paddsw m0, m6 ; B0 B2 B4 B6 ...
+    paddsw m1, m6 ; R0 R2 R4 R6 ...
+    paddsw m2, m6 ; G0 G2 G4 G6 ...
+
+%if %3 == 24 ; PACK RGB24
+%define depth 3
+    packuswb m0, m3 ; B0 B2 B4 B6 ... B1 B3 B5 B7 ...
+    packuswb m1, m5 ; R0 R2 R4 R6 ... R1 R3 R5 R7 ...
+    packuswb m2, m7 ; G0 G2 G4 G6 ... G1 G3 G5 G7 ...
+    mova m3, m_red ; lane comments from here on are written for the rgb variant; for bgr swap R and B
+    mova m6, m_blue
+    MOV_H2L m_red
+    punpcklbw m3, m2     ; R0 G0 R2 G2 R4 G4 R6 G6 R8 G8 ...
+    punpcklbw m6, m_red  ; B0 R1 B2 R3 B4 R5 B6 R7 B8 R9 ...
+    mova m5, m3
+    punpckhbw m2, m_blue ; G1 B1 G3 B3 G5 B5 G7 B7 G9 B9 ...
+    punpcklwd m3, m6     ; R0 G0 B0 R1 R2 G2 B2 R3
+    punpckhwd m5, m6     ; R4 G4 B4 R5 R6 G6 B6 R7
+%if cpuflag(mmxext)
+    pshufw m1, m2, 0xc6
+    pshufw m6, m3, 0x84
+    pshufw m7, m5, 0x38
+    pand m6, [mask_1101] ; R0 G0 B0 R1 -- -- R2 G2
+    movq m0, m1
+    pand m7, [mask_0110] ; -- -- R6 G6 B6 R7 -- --
+    movq m2, m1
+    pand m1, [mask_0100] ; -- -- G3 B3 -- -- -- --
+    psrlq m3, 48         ; B2 R3 -- -- -- -- -- --
+    pand m0, [mask_0010] ; -- -- -- -- G1 B1 -- --
+    psllq m5, 32         ; -- -- -- -- R4 G4 B4 R5
+    pand m2, [mask_1001] ; G5 B5 -- -- -- -- G7 B7
+    por m1, m3
+    por m0, m6
+    por m1, m5
+    por m2, m7
+    movntq [imageq], m0
+    movntq [imageq + 8], m1
+    movntq [imageq + 16], m2
+%else ; cpuflag(mmx)
+    movd [imageq], m3      ; R0 G0 B0 R1
+    movd [imageq + 4], m2  ; G1 B1
+    psrlq m3, 32
+    psrlq m2, 16
+    movd [imageq + 6], m3  ; R2 G2 B2 R3
+    movd [imageq + 10], m2 ; G3 B3
+    psrlq m2, 16
+    movd [imageq + 12], m5 ; R4 G4 B4 R5
+    movd [imageq + 16], m2 ; G5 B5
+    psrlq m5, 32
+    movd [imageq + 20], m2 ; -- -- G7 B7
+    movd [imageq + 18], m5 ; R6 G6 B6 R7
+%endif
+%else ; PACK RGB15/16/32
+    packuswb m0, m1
+    packuswb m3, m5
+    packuswb m2, m2
+    mova m1, m0
+    packuswb m7, m7
+    punpcklbw m0, m3 ; B0 B1 B2 B3 ... B7
+    punpckhbw m1, m3 ; R0 R1 R2 R3 ... R7
+    punpcklbw m2, m7 ; G0 G1 G2 G3 ... G7
+%if %3 == 32 ; PACK RGB32
+%define depth 4
+%ifidn %1, yuv
+    pcmpeqd m3, m3 ; no alpha plane: set every alpha byte to 0xFF
+%else
+    mova m3, [pa_2indexq + 2 * indexq] ; Load alpha
+%endif
+    mova m5, m_blue
+    mova m6, m_red
+    punpckhbw m5, m_green
+    punpcklbw m_blue, m_green
+    punpckhbw m6, m_alpha
+    punpcklbw m_red, m_alpha
+    mova m_green, m_blue
+    mova m_alpha, m5
+    punpcklwd m_blue, m_red
+    punpckhwd m_green, m_red
+    punpcklwd m5, m6
+    punpckhwd m_alpha, m6
+    mova [imageq + 0], m_blue
+    mova [imageq + 8 * time_num], m_green
+    mova [imageq + 16 * time_num], m5
+    mova [imageq + 24 * time_num], m_alpha
+%else ; PACK RGB15/16
+%define depth 2
+%define blue_dither  [pointer_c_ditherq + 2 * 8]
+%define green_dither [pointer_c_ditherq + 1 * 8]
+%define red_dither   [pointer_c_ditherq + 0 * 8]
+%if %3 == 15
+%define gmask pb_03
+%define isRGB15 1
+%else
+%define gmask pb_07
+%define isRGB15 0
+%endif
+    paddusb m0, blue_dither ; dither is added unconditionally on this path
+    paddusb m2, green_dither
+    paddusb m1, red_dither
+    pand m0, [pb_f8]
+    pand m1, [pb_f8]
+    mova m3, m2
+    psllw m2, 3 - isRGB15
+    psrlw m3, 5 + isRGB15
+    psrlw m0, 3
+    psrlw m1, isRGB15 ; shift count is 0 for the 16-bit variant
+    pand m2, [pb_e0]
+    pand m3, [gmask]
+    por m0, m2
+    por m1, m3
+    mova m2, m0
+    punpcklbw m0, m1
+    punpckhbw m2, m1
+    mova [imageq], m0
+    mova [imageq + 8 * time_num], m2
+%endif ; PACK RGB15/16
+%endif ; PACK RGB15/16/32
+
+    mova m_y, [py_2indexq + 2 * indexq + 8 * time_num] ; preload next group; NOTE(review): also runs on the last pass, reading just past the processed range
+    movh m_v, [pv_indexq  +     indexq + 4 * time_num]
+    movh m_u, [pu_indexq  +     indexq + 4 * time_num]
+    add imageq, 8 * depth * time_num ; advance the output pointer by 8 * depth bytes
+    add indexq, 4 * time_num ; index is negative and counts up; flags from this add drive the branch
+    js .loop0 ; keep looping while index is still negative
+
+REP_RET
+
+%endmacro
+
+INIT_MMX mmx ; plain-MMX builds of all eight variants
+yuv2rgb_fn yuv,  rgb, 24
+yuv2rgb_fn yuv,  bgr, 24
+yuv2rgb_fn yuv,  rgb, 32 ; NOTE(review): keep a 6-argument expansion ahead of the yuv 32-bit ones unless yuv2rgb_fn sets its argument list unconditionally
+yuv2rgb_fn yuv,  bgr, 32
+yuv2rgb_fn yuva, rgb, 32 ; yuva variants take the extra pa_2index argument
+yuv2rgb_fn yuva, bgr, 32
+yuv2rgb_fn yuv,  rgb, 15
+yuv2rgb_fn yuv,  rgb, 16
+
+INIT_MMX mmxext ; only the 24-bit pack has a cpuflag(mmxext) branch, so only those two are re-expanded
+yuv2rgb_fn yuv, rgb, 24
+yuv2rgb_fn yuv, bgr, 24