diff mbox

[FFmpeg-devel,V1,1/3] lavu: Add alpha blending API based on row.

Message ID 1537889235-17619-1-git-send-email-mypopydev@gmail.com
State New
Headers show

Commit Message

Jun Zhao Sept. 25, 2018, 3:27 p.m. UTC
Add alpha blending API based on row, support global alpha blending/
per-pixel blending, and add SSSE3/AVX2 optimizations of the functions.

Signed-off-by: Jun Zhao <mypopydev@gmail.com>
---
 libavutil/Makefile         |    2 +
 libavutil/blend.c          |  101 ++++++++++++
 libavutil/blend.h          |   47 ++++++
 libavutil/x86/Makefile     |    3 +-
 libavutil/x86/blend.h      |   32 ++++
 libavutil/x86/blend_init.c |  369 ++++++++++++++++++++++++++++++++++++++++++++
 6 files changed, 553 insertions(+), 1 deletions(-)
 create mode 100644 libavutil/blend.c
 create mode 100644 libavutil/blend.h
 create mode 100644 libavutil/x86/blend.h
 create mode 100644 libavutil/x86/blend_init.c

Comments

Rostislav Pehlivanov Sept. 25, 2018, 7:49 p.m. UTC | #1
On 25 September 2018 at 16:27, Jun Zhao <mypopydev@gmail.com> wrote:

> Add alpha blending API based on row, support global alpha blending/
> per-pixel blending, and add SSSE3/AVX2 optimizations of the functions.
>
> Signed-off-by: Jun Zhao <mypopydev@gmail.com>
> ---
>  libavutil/Makefile         |    2 +
>  libavutil/blend.c          |  101 ++++++++++++
>  libavutil/blend.h          |   47 ++++++
>  libavutil/x86/Makefile     |    3 +-
>  libavutil/x86/blend.h      |   32 ++++
>  libavutil/x86/blend_init.c |  369 ++++++++++++++++++++++++++++++++++++++++++++
>  6 files changed, 553 insertions(+), 1 deletions(-)
>  create mode 100644 libavutil/blend.c
>  create mode 100644 libavutil/blend.h
>  create mode 100644 libavutil/x86/blend.h
>  create mode 100644 libavutil/x86/blend_init.c
>
> diff --git a/libavutil/Makefile b/libavutil/Makefile
> index 9ed24cf..f1c06e4 100644
> --- a/libavutil/Makefile
> +++ b/libavutil/Makefile
> @@ -10,6 +10,7 @@ HEADERS = adler32.h
>                \
>            avstring.h                                                    \
>            avutil.h                                                      \
>            base64.h                                                      \
> +          blend.h                                                       \
>            blowfish.h                                                    \
>            bprint.h                                                      \
>            bswap.h                                                       \
> @@ -95,6 +96,7 @@ OBJS = adler32.o
>                 \
>         audio_fifo.o                                                     \
>         avstring.o                                                       \
>         base64.o                                                         \
> +       blend.o                                                          \
>         blowfish.o                                                       \
>         bprint.o                                                         \
>         buffer.o                                                         \
> diff --git a/libavutil/blend.c b/libavutil/blend.c
> new file mode 100644
> index 0000000..e28efa0
> --- /dev/null
> +++ b/libavutil/blend.c
> @@ -0,0 +1,101 @@
> +/*
> + * This file is part of FFmpeg.
> + *
> + * FFmpeg is free software; you can redistribute it and/or modify
> + * it under the terms of the GNU General Public License as published by
> + * the Free Software Foundation; either version 2 of the License, or
> + * (at your option) any later version.
> + *
> + * FFmpeg is distributed in the hope that it will be useful,
> + * but WITHOUT ANY WARRANTY; without even the implied warranty of
> + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
> + * GNU General Public License for more details.
> + *
> + * You should have received a copy of the GNU General Public License along
> + * with FFmpeg; if not, write to the Free Software Foundation, Inc.,
> + * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
> + */
> +
> +#include "libavutil/attributes.h"
> +#include "libavutil/cpu.h"
> +#include "libavutil/mem.h"
> +#include "libavutil/x86/asm.h"
> +#include "libavutil/blend.h"
> +
> +#include "libavutil/x86/blend.h"
> +
> +static void ff_global_blend_row_c(const uint8_t *src0,
> +                                  const uint8_t *src1,
> +                                  const uint8_t *alpha, /* XXX: only use
> alpha[0] */
> +                                  uint8_t *dst,
> +                                  int width)
> +{
> +    int x;
> +    for (x = 0; x < width - 1; x += 2) {
> +        dst[0] = (src0[0] * alpha[0] + src1[0] * (255 - alpha[0]) + 255)
> >> 8;
> +        dst[1] = (src0[1] * alpha[0] + src1[1] * (255 - alpha[0]) + 255)
> >> 8;
> +        src0 += 2;
> +        src1 += 2;
> +        dst  += 2;
> +    }
> +    if (width & 1) {
> +        dst[0] = (src0[0] * alpha[0] + src1[0] * (255 - alpha[0]) + 255)
> >> 8;
> +    }
> +}
> +
> +void av_global_blend_row(const uint8_t *src0,
> +                         const uint8_t *src1,
> +                         const uint8_t *alpha,
> +                         uint8_t *dst,
> +                         int width)
> +{
> +    blend_row blend_row_fn = NULL;
> +
> +#if ARCH_X86
> +    blend_row_fn = ff_blend_row_init_x86(1);
> +#endif
> +
> +    if (!blend_row_fn)
> +        blend_row_fn = ff_global_blend_row_c;
> +
> +    blend_row_fn(src0, src1, alpha, dst, width);
> +}
> +
> +static void ff_per_pixel_blend_row_c(const uint8_t *src0,
> +                                     const uint8_t *src1,
> +                                     const uint8_t *alpha,
> +                                     uint8_t *dst,
> +                                     int width)
> +{
> +    int x;
> +    for (x = 0; x < width - 1; x += 2) {
> +        dst[0] = (src0[0] * alpha[0] + src1[0] * (255 - alpha[0]) + 255)
> >> 8;
> +        dst[1] = (src0[1] * alpha[0] + src1[1] * (255 - alpha[0]) + 255)
> >> 8;
> +        src0 += 2;
> +        src1 += 2;
> +        dst  += 2;
> +        alpha+= 2;
> +    }
> +    if (width & 1) {
> +        dst[0] = (src0[0] * alpha[0] + src1[0] * (255 - alpha[0]) + 255)
> >> 8;
> +    }
> +}
> +
> +void av_per_pixel_blend_row(const uint8_t *src0,
> +                            const uint8_t *src1,
> +                            const uint8_t *alpha,
> +                            uint8_t *dst,
> +                            int width)
> +{
> +    blend_row blend_row_fn = NULL;
> +
> +#if ARCH_X86
> +    blend_row_fn = ff_blend_row_init_x86(0);
> +#endif
> +
> +    if (!blend_row_fn)
> +        blend_row_fn = ff_per_pixel_blend_row_c;
> +
> +    blend_row_fn(src0, src1, alpha, dst, width);
> +}
> +
> diff --git a/libavutil/blend.h b/libavutil/blend.h
> new file mode 100644
> index 0000000..8a42109
> --- /dev/null
> +++ b/libavutil/blend.h
> @@ -0,0 +1,47 @@
> +/*
> + * This file is part of FFmpeg.
> + *
> + * FFmpeg is free software; you can redistribute it and/or modify
> + * it under the terms of the GNU General Public License as published by
> + * the Free Software Foundation; either version 2 of the License, or
> + * (at your option) any later version.
> + *
> + * FFmpeg is distributed in the hope that it will be useful,
> + * but WITHOUT ANY WARRANTY; without even the implied warranty of
> + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
> + * GNU General Public License for more details.
> + *
> + * You should have received a copy of the GNU General Public License along
> + * with FFmpeg; if not, write to the Free Software Foundation, Inc.,
> + * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
> + */
> +#ifndef AVUTIL_BLEND_H
> +#define AVUTIL_BLEND_H
> +
> +#include "libavutil/attributes.h"
> +#include "libavutil/cpu.h"
> +#include "libavutil/mem.h"
> +#include "libavutil/x86/asm.h"
> +
> +/**
> + * Global alpha blending by row
> + *
> + * dst[i] = (src0[i]*alpha[0]+(255-alpha[0])*src1[i]+255)>>8
> + */
> +void av_global_blend_row(const uint8_t *src0,
> +                         const uint8_t *src1,
> +                         const uint8_t *alpha, /* XXX: only use alpha[0]
> */
> +                         uint8_t *dst,
> +                         int width);
> +
> +/**
> + * Per-pixel alpha blending by row
> + *
> + * dst[i] = (src0[i]*alpha[i]+(255-alpha[i])*src1[i]+255)>>8
> + */
> +void av_per_pixel_blend_row(const uint8_t *src0,
> +                            const uint8_t *src1,
> +                            const uint8_t *alpha,
> +                            uint8_t *dst,
> +                            int width);
> +#endif
> diff --git a/libavutil/x86/Makefile b/libavutil/x86/Makefile
> index 5f5242b..1e5e3e4 100644
> --- a/libavutil/x86/Makefile
> +++ b/libavutil/x86/Makefile
> @@ -1,4 +1,5 @@
> -OBJS += x86/cpu.o                                                       \
> +OBJS += x86/blend_init.o                                                \
> +        x86/cpu.o                                                       \
>          x86/fixed_dsp_init.o                                            \
>          x86/float_dsp_init.o                                            \
>          x86/imgutils_init.o                                             \
> diff --git a/libavutil/x86/blend.h b/libavutil/x86/blend.h
> new file mode 100644
> index 0000000..9fa0f36
> --- /dev/null
> +++ b/libavutil/x86/blend.h
> @@ -0,0 +1,32 @@
> +/*
> + * This file is part of FFmpeg.
> + *
> + * FFmpeg is free software; you can redistribute it and/or
> + * modify it under the terms of the GNU Lesser General Public
> + * License as published by the Free Software Foundation; either
> + * version 2.1 of the License, or (at your option) any later version.
> + *
> + * FFmpeg is distributed in the hope that it will be useful,
> + * but WITHOUT ANY WARRANTY; without even the implied warranty of
> + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> + * Lesser General Public License for more details.
> + *
> + * You should have received a copy of the GNU Lesser General Public
> + * License along with FFmpeg; if not, write to the Free Software
> + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
> 02110-1301 USA
> + */
> +
> +#ifndef AVUTIL_X86_BLEND_H
> +#define AVUTIL_X86_BLEND_H
> +
> +#include "libavutil/blend.h"
> +
> +typedef void (*blend_row)(const uint8_t *src0,
> +                          const uint8_t *src1,
> +                          const uint8_t *alpha,
> +                          uint8_t *dst,
> +                          int width);
> +
> +blend_row ff_blend_row_init_x86(int global);
> +
> +#endif /* AVUTIL_X86_BLEND_H */
> diff --git a/libavutil/x86/blend_init.c b/libavutil/x86/blend_init.c
> new file mode 100644
> index 0000000..f555dfa
> --- /dev/null
> +++ b/libavutil/x86/blend_init.c
> @@ -0,0 +1,369 @@
> +/*
> + * This file is part of FFmpeg.
> + *
> + * FFmpeg is free software; you can redistribute it and/or modify
> + * it under the terms of the GNU General Public License as published by
> + * the Free Software Foundation; either version 2 of the License, or
> + * (at your option) any later version.
> + *
> + * FFmpeg is distributed in the hope that it will be useful,
> + * but WITHOUT ANY WARRANTY; without even the implied warranty of
> + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
> + * GNU General Public License for more details.
> + *
> + * You should have received a copy of the GNU General Public License along
> + * with FFmpeg; if not, write to the Free Software Foundation, Inc.,
> + * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
> + */
> +
> +#include "libavutil/cpu.h"
> +#include "libavutil/mem.h"
> +#include "libavutil/x86/cpu.h"
> +#include "libavutil/x86/asm.h"
> +#include "libavutil/x86/blend.h"
> +
> +#if HAVE_SSSE3_INLINE && HAVE_6REGS
> +// per-pixel blend (8 pixels at a time).
> +// dst[i] = ((src0[i]*alpha[i])+(src1[i]*(255-alpha[i]))+255)/256
> +static void ff_per_pixel_blend_row_ssse3(const uint8_t *src0,
> +                                         const uint8_t *src1,
> +                                         const uint8_t *alpha,
> +                                         uint8_t *dst,
> +                                         int width)
> +{
> +    int aligned_w = width/8 * 8;
> +    int width_u = width - aligned_w;
> +    uint8_t *src0_u  = (uint8_t *)src0 + aligned_w;
> +    uint8_t *src1_u  = (uint8_t *)src1 + aligned_w;
> +    uint8_t *alpha_u = (uint8_t *)alpha + aligned_w;
> +    uint8_t *dst_u  = dst + aligned_w;
> +    int i;
> +
> +    if (aligned_w > 0) {
> +        __asm__ volatile(
> +            "pcmpeqb    %%xmm3,%%xmm3                  \n\t"
> +            "psllw      $0x8,%%xmm3                    \n\t"
> +            "mov        $0x80808080,%%eax              \n\t"
> +            "movd       %%eax,%%xmm3                   \n\t"
> +            "pshufd     $0x0,%%xmm4,%%xmm4             \n\t"
> +            "mov        $0x807f807f,%%eax              \n\t"
> +            "movd       %%eax,%%xmm5                   \n\t"
> +            "pshufd     $0x0,%%xmm5,%%xmm5             \n\t"
> +            "sub        %2,%0                          \n\t"
> +            "sub        %2,%1                          \n\t"
> +            "sub        %2,%3                          \n\t"
> +
> +            // 8 pixel per loop.
> +            "1:                                        \n\t"
> +            "movq       (%2),%%xmm0                    \n\t"
> +            "punpcklbw  %%xmm0,%%xmm0                  \n\t"
> +            "pxor       %%xmm3,%%xmm0                  \n\t"
> +            "movq       (%0,%2,1),%%xmm1               \n\t"
> +            "movq       (%1,%2,1),%%xmm2               \n\t"
> +            "punpcklbw  %%xmm2,%%xmm1                  \n\t"
> +            "psubb      %%xmm4,%%xmm1                  \n\t"
> +            "pmaddubsw  %%xmm1,%%xmm0                  \n\t"
> +            "paddw      %%xmm5,%%xmm0                  \n\t"
> +            "psrlw      $0x8,%%xmm0                    \n\t"
> +            "packuswb   %%xmm0,%%xmm0                  \n\t"
> +            "movq       %%xmm0,(%3,%2,1)               \n\t"
> +            "lea        0x8(%2),%2                     \n\t"
> +            "sub        $0x8,%4                        \n\t"
> +            "jg        1b                              \n\t"
> +            : "+r"(src0),       // %0
> +              "+r"(src1),       // %1
> +              "+r"(alpha),      // %2
> +              "+r"(dst),        // %3
> +              "+rm"(aligned_w)  // %4
> +            ::"memory",
> +             "cc", "eax", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5");
> +    }
> +
> +    for (i = 0; i < width_u - 1; i += 2) {
> +        dst_u[0] = (src0_u[0] * alpha_u[0] + src1_u[0] * (255 -
> alpha_u[0]) + 255) >> 8;
> +        dst_u[1] = (src0_u[1] * alpha_u[0] + src1_u[1] * (255 -
> alpha_u[0]) + 255) >> 8;
> +        src0_u += 2;
> +        src1_u += 2;
> +        dst_u  += 2;
> +        alpha_u+= 2;
> +    }
> +    if (width_u & 1) {
> +        dst_u[0] = (src0_u[0] * alpha_u[0] + src1_u[0] * (255 -
> alpha_u[0]) + 255) >> 8;
> +    }
> +}
> +
> +// global blend (8 pixels at a time).
> +// dst[i] = ((src0[i]*alpha[0])+(src1[i]*(255-alpha[0]))+255)/256
> +static void ff_global_blend_row_ssse3(const uint8_t *src0,
> +                                      const uint8_t *src1,
> +                                      const uint8_t *alpha,
> +                                      uint8_t *dst,
> +                                      int width)
> +{
> +    int aligned_w = width/8 * 8;
> +    int width_u = width - aligned_w;
> +    uint8_t *src0_u = (uint8_t *)src0 + aligned_w;
> +    uint8_t *src1_u = (uint8_t *)src1 + aligned_w;
> +    uint8_t *dst_u  = dst + aligned_w;
> +    int i;
> +
> +    if (aligned_w > 0) {
> +        __asm__ volatile(
> +            "pcmpeqb    %%xmm3,%%xmm3                  \n\t"
> +            "psllw      $0x8,%%xmm3                    \n\t"
> +            "mov        $0x80808080,%%eax              \n\t"
> +            "movd       %%eax,%%xmm4                   \n\t"
> +            "pshufd     $0x0,%%xmm4,%%xmm4             \n\t"
> +            "mov        $0x807f807f,%%eax              \n\t"
> +            "movd       %%eax,%%xmm5                   \n\t"
> +            "pshufd     $0x0,%%xmm5,%%xmm5             \n\t"
> +            // a => xmm6 [a a a a a a a a a a a a a a a a ]
> +            "movb       (%2),%%al                      \n\t"
> +            "movd       %%eax,%%xmm6                   \n\t" // xmm6 = x
> x x x x x x x x x x x x x x a
> +            "punpcklbw  %%xmm6,%%xmm6                  \n\t" // xmm6 = x
> x x x x x x x x x x x x x a a
> +            "punpcklbw  %%xmm6,%%xmm6                  \n\t" // xmm6 = x
> x x x x x x x x x x x a a a a
> +            "punpcklbw  %%xmm6,%%xmm6                  \n\t" // xmm6 = x
> x x x x x x x a a a a a a a a
> +            "punpcklbw  %%xmm6,%%xmm6                  \n\t" // xmm6 = a
> a a a a a a a a a a a a a a a
> +
> +            // 8 pixel per loop.
> +            "1:                                        \n\t"
> +            "movdqu     %%xmm6,%%xmm0                  \n\t" // xmm0 =
> xmm6
> +            "pxor       %%xmm3,%%xmm0                  \n\t"
> +
> +            "movq       (%0),%%xmm1                    \n\t"
> +            "movq       (%1),%%xmm2                    \n\t"
> +            "punpcklbw  %%xmm2,%%xmm1                  \n\t"
> +            "psubb      %%xmm4,%%xmm1                  \n\t"
> +
> +            "pmaddubsw  %%xmm1,%%xmm0                  \n\t"
> +            "paddw      %%xmm5,%%xmm0                  \n\t"
> +            "psrlw      $0x8,%%xmm0                    \n\t"
> +            "packuswb   %%xmm0,%%xmm0                  \n\t"
> +            "movq       %%xmm0,(%3)                    \n\t"
> +
> +            "lea        0x8(%0),%0                     \n\t" // src0+8
> +            "lea        0x8(%1),%1                     \n\t" // src1+8
> +            "lea        0x8(%3),%3                     \n\t" // dst+8
> +            "sub        $0x8,%4                        \n\t"
> +            "jg        1b                              \n\t"
> +            : "+r"(src0),       // %0
> +              "+r"(src1),       // %1
> +              "+r"(alpha),      // %2
> +              "+r"(dst),        // %3
> +              "+rm"(aligned_w)  // %4
> +            ::"memory",
> +             "cc", "eax", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5",
> "xmm6");
> +    }
> +
> +    for (i = 0; i < width_u - 1; i += 2) {
> +        dst_u[0] = (src0_u[0] * alpha[0] + src1_u[0] * (255 - alpha[0]) +
> 255) >> 8;
> +        dst_u[1] = (src0_u[1] * alpha[0] + src1_u[1] * (255 - alpha[0]) +
> 255) >> 8;
> +        src0_u += 2;
> +        src1_u += 2;
> +        dst_u  += 2;
> +    }
> +    if (width_u & 1) {
> +        dst_u[0] = (src0_u[0] * alpha[0] + src1_u[0] * (255 - alpha[0]) +
> 255) >> 8;
> +    }
> +}
> +#endif
> +
> +#if HAVE_AVX2_INLINE && HAVE_6REGS
> +// per-pixel blend (32 pixels at a time).
> +// dst[i] = ((src0[i]*alpha[i])+(src1[i]*(255-alpha[i]))+255)/256
> +static void ff_per_pixel_blend_row_avx2(const uint8_t *src0,
> +                                        const uint8_t *src1,
> +                                        const uint8_t *alpha,
> +                                        uint8_t *dst,
> +                                        int width)
> +{
> +    int aligned_w = width/32 * 32;
> +    int width_u = width - aligned_w;
> +    uint8_t *src0_u  = (uint8_t *)src0 + aligned_w;
> +    uint8_t *src1_u  = (uint8_t *)src1 + aligned_w;
> +    uint8_t *alpha_u = (uint8_t *)alpha + aligned_w;
> +    uint8_t *dst_u  = dst + aligned_w;
> +    int i;
> +
> +    if (aligned_w > 0) {
> +        __asm__ volatile(
> +            "vpcmpeqb   %%ymm5,%%ymm5,%%ymm5           \n\t"
> +            "vpsllw     $0x8,%%ymm5,%%ymm5             \n\t"
> +            "mov        $0x80808080,%%eax              \n\t"
> +            "vmovd      %%eax,%%xmm6                   \n\t"
> +            "vbroadcastss %%xmm6,%%ymm6                \n\t"
> +            "mov        $0x807f807f,%%eax              \n\t"
> +            "vmovd      %%eax,%%xmm7                   \n\t"
> +            "vbroadcastss %%xmm7,%%ymm7                \n\t"
> +            "sub        %2,%0                          \n\t"
> +            "sub        %2,%1                          \n\t"
> +            "sub        %2,%3                          \n\t"
> +
> +            // 32 pixel per loop.
> +            "1:                                        \n\t"
> +            "vmovdqu    (%2),%%ymm0                    \n\t"
> +            "vpunpckhbw %%ymm0,%%ymm0,%%ymm3           \n\t"
> +            "vpunpcklbw %%ymm0,%%ymm0,%%ymm0           \n\t"
> +            "vpxor      %%ymm5,%%ymm3,%%ymm3           \n\t"
> +            "vpxor      %%ymm5,%%ymm0,%%ymm0           \n\t"
> +            "vmovdqu    (%0,%2,1),%%ymm1               \n\t"
> +            "vmovdqu    (%1,%2,1),%%ymm2               \n\t"
> +            "vpunpckhbw %%ymm2,%%ymm1,%%ymm4           \n\t"
> +            "vpunpcklbw %%ymm2,%%ymm1,%%ymm1           \n\t"
> +            "vpsubb     %%ymm6,%%ymm4,%%ymm4           \n\t"
> +            "vpsubb     %%ymm6,%%ymm1,%%ymm1           \n\t"
> +            "vpmaddubsw %%ymm4,%%ymm3,%%ymm3           \n\t"
> +            "vpmaddubsw %%ymm1,%%ymm0,%%ymm0           \n\t"
> +            "vpaddw     %%ymm7,%%ymm3,%%ymm3           \n\t"
> +            "vpaddw     %%ymm7,%%ymm0,%%ymm0           \n\t"
> +            "vpsrlw     $0x8,%%ymm3,%%ymm3             \n\t"
> +            "vpsrlw     $0x8,%%ymm0,%%ymm0             \n\t"
> +            "vpackuswb  %%ymm3,%%ymm0,%%ymm0           \n\t"
> +            "vmovdqu    %%ymm0,(%3,%2,1)               \n\t"
> +            "lea        0x20(%2),%2                    \n\t"
> +            "sub        $0x20,%4                       \n\t"
> +            "jg        1b                              \n\t"
> +            "vzeroupper                                \n\t"
> +            : "+r"(src0),      // %0
> +              "+r"(src1),      // %1
> +              "+r"(alpha),     // %2
> +              "+r"(dst),       // %3
> +              "+rm"(aligned_w) // %4
> +            ::"memory",
> +             "cc", "eax", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5",
> "xmm6",
> +             "xmm7");
> +    }
> +
> +    for (i = 0; i < width_u - 1; i += 2) {
> +        dst_u[0] = (src0_u[0] * alpha_u[0] + src1_u[0] * (255 -
> alpha_u[0]) + 255) >> 8;
> +        dst_u[1] = (src0_u[1] * alpha_u[0] + src1_u[1] * (255 -
> alpha_u[0]) + 255) >> 8;
> +        src0_u += 2;
> +        src1_u += 2;
> +        dst_u  += 2;
> +        alpha_u+= 2;
> +    }
> +    if (width_u & 1) {
> +        dst_u[0] = (src0_u[0] * alpha_u[0] + src1_u[0] * (255 -
> alpha_u[0]) + 255) >> 8;
> +    }
> +}
> +
> +// global blend (32 pixels at a time)
> +// dst[i] = ((src0[i]*alpha[0])+(src1[i]*(255-alpha[0]))+255)/256
> +static void ff_global_blend_row_avx2(const uint8_t *src0,
> +                                     const uint8_t *src1,
> +                                     const uint8_t *alpha,
> +                                     uint8_t *dst,
> +                                     int width)
> +{
> +    int aligned_w = width/32 * 32;
> +    int width_u = width - aligned_w;
> +    uint8_t *src0_u = (uint8_t *)src0 + aligned_w;
> +    uint8_t *src1_u = (uint8_t *)src1 + aligned_w;
> +    uint8_t *dst_u  = dst + aligned_w;
> +    int i;
> +
> +    if (aligned_w > 0) {
> +        __asm__ volatile(
> +            "vpcmpeqb   %%ymm5,%%ymm5,%%ymm5           \n\t"
> +            "vpsllw     $0x8,%%ymm5,%%ymm5             \n\t"
> +            "mov        $0x80808080,%%eax              \n\t"
> +            "vmovd      %%eax,%%xmm6                   \n\t"
> +            "vbroadcastss %%xmm6,%%ymm6                \n\t"
> +            "mov        $0x807f807f,%%eax              \n\t"
> +            "vmovd      %%eax,%%xmm7                   \n\t"
> +            "vbroadcastss %%xmm7,%%ymm7                \n\t"
> +            // a => ymm8 [a a a a a a a a a a a a a a a a
> +            //            a a a a a a a a a a a a a a a a
> +            //            a a a a a a a a a a a a a a a a
> +            //            a a a a a a a a a a a a a a a a]
> +            "movb       (%2),%%al                      \n\t"
> +            "movd       %%eax,%%xmm8                   \n\t" // xmm8 = x
> x x x x x x x x x x x x x x a
> +            "punpcklbw  %%xmm8,%%xmm8                  \n\t" // xmm8 = x
> x x x x x x x x x x x x x a a
> +            "punpcklbw  %%xmm8,%%xmm8                  \n\t" // xmm8 = x
> x x x x x x x x x x x a a a a
> +            "vbroadcastss %%xmm8,%%ymm8                \n\t"
> +
> +            // 32 pixel per loop.
> +            "1:                                        \n\t"
> +            "vmovdqu    %%ymm8,%%ymm0                  \n\t"
> +            "vpunpckhbw %%ymm0,%%ymm0,%%ymm3           \n\t"
> +            "vpunpcklbw %%ymm0,%%ymm0,%%ymm0           \n\t"
> +            "vpxor      %%ymm5,%%ymm3,%%ymm3           \n\t"
> +            "vpxor      %%ymm5,%%ymm0,%%ymm0           \n\t"
> +
> +            "vmovdqu    (%0),%%ymm1                    \n\t"
> +            "vmovdqu    (%1),%%ymm2                    \n\t"
> +            "vpunpckhbw %%ymm2,%%ymm1,%%ymm4           \n\t"
> +            "vpunpcklbw %%ymm2,%%ymm1,%%ymm1           \n\t"
> +            "vpsubb     %%ymm6,%%ymm4,%%ymm4           \n\t"
> +            "vpsubb     %%ymm6,%%ymm1,%%ymm1           \n\t"
> +            "vpmaddubsw %%ymm4,%%ymm3,%%ymm3           \n\t"
> +            "vpmaddubsw %%ymm1,%%ymm0,%%ymm0           \n\t"
> +            "vpaddw     %%ymm7,%%ymm3,%%ymm3           \n\t"
> +            "vpaddw     %%ymm7,%%ymm0,%%ymm0           \n\t"
> +            "vpsrlw     $0x8,%%ymm3,%%ymm3             \n\t"
> +            "vpsrlw     $0x8,%%ymm0,%%ymm0             \n\t"
> +            "vpackuswb  %%ymm3,%%ymm0,%%ymm0           \n\t"
> +
> +            "vmovdqu    %%ymm0,(%3)                    \n\t"
> +            "lea        0x20(%0),%0                    \n\t"
> +            "lea        0x20(%1),%1                    \n\t"
> +            "lea        0x20(%3),%3                    \n\t"
> +            "sub        $0x20,%4                       \n\t"
> +            "jg        1b                              \n\t"
> +            "vzeroupper                                \n\t"
> +            : "+r"(src0),       // %0
> +              "+r"(src1),       // %1
> +              "+r"(alpha),      // %2
> +              "+r"(dst),        // %3
> +              "+rm"(aligned_w)  // %4
> +            ::"memory",
> +             "cc", "eax", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5",
> "xmm6",
> +             "xmm7", "xmm8");
> +    }
> +
> +    for (i = 0; i < width_u - 1; i += 2) {
> +        dst_u[0] = (src0_u[0] * alpha[0] + src1_u[0] * (255 - alpha[0]) +
> 255) >> 8;
> +        dst_u[1] = (src0_u[1] * alpha[0] + src1_u[1] * (255 - alpha[0]) +
> 255) >> 8;
> +        src0_u += 2;
> +        src1_u += 2;
> +        dst_u  += 2;
> +    }
> +    if (width_u & 1) {
> +        dst_u[0] = (src0_u[0] * alpha[0] + src1_u[0] * (255 - alpha[0]) +
> 255) >> 8;
> +    }
> +}
> +#endif
> +
> +av_cold blend_row ff_blend_row_init_x86(int global)
> +{
> +    blend_row blend_row_fn = NULL;
> +    int cpu_flags = av_get_cpu_flags();
> +
> +    if (global) {
> +#if HAVE_SSSE3_INLINE && HAVE_6REGS
> +        if (EXTERNAL_SSSE3(cpu_flags)) {
> +            blend_row_fn = ff_global_blend_row_ssse3;
> +        }
> +#endif
> +
> +#if HAVE_AVX2_INLINE && HAVE_6REGS
> +        if (EXTERNAL_AVX2_FAST(cpu_flags)) {
> +            blend_row_fn = ff_global_blend_row_avx2;
> +        }
> +#endif
> +    } else {
> +#if HAVE_SSSE3_INLINE && HAVE_6REGS
> +        if (EXTERNAL_SSSE3(cpu_flags)) {
> +            blend_row_fn = ff_per_pixel_blend_row_ssse3;
> +        }
> +#endif
> +
> +#if HAVE_AVX2_INLINE && HAVE_6REGS
> +        if (EXTERNAL_AVX2_FAST(cpu_flags)) {
> +            blend_row_fn = ff_per_pixel_blend_row_avx2;
> +        }
> +#endif
> +    }
> +
> +    return blend_row_fn;
> +}
> --
> 1.7.1
>
> _______________________________________________
> ffmpeg-devel mailing list
> ffmpeg-devel@ffmpeg.org
> http://ffmpeg.org/mailman/listinfo/ffmpeg-devel
>

We don't use inline asm on x86 and we don't use global contexts. Look at
how float_dsp is done.
Marton Balint Sept. 25, 2018, 10:58 p.m. UTC | #2
On Tue, 25 Sep 2018, Jun Zhao wrote:

> Add alpha blending API based on row, support global alpha blending/
> per-pixel blending, and add SSSE3/AVX2 optimizations of the functions.

You might want to take a look at 
libavfilter/vf_framerate.c and libavfilter/x86/vf_framerate.asm as well, 
they do something similar. Maybe you should factorize that instead.

Regards,
Marton
mypopy@gmail.com Sept. 26, 2018, 1:45 a.m. UTC | #3
On Wed, Sep 26, 2018 at 3:55 AM Rostislav Pehlivanov <atomnuker@gmail.com>
wrote:
>
> On 25 September 2018 at 16:27, Jun Zhao <mypopydev@gmail.com> wrote:
>
> > Add alpha blending API based on row, support global alpha blending/
> > per-pixel blending, and add SSSE3/AVX2 optimizations of the functions.
> >


> We don't use inline asm on x86 and we don't use global contexts. Look at
> how float_dsp is done.

I guess you more precisely mean "prefer NASM assembler over inline asm on x86". :)
In fact,
I know of some x86 inline asm in FFmpeg, e.g. libavcodec/x86/h264_cabac.
(Grepping for "__asm__ volatile" finds more x86 inline asm.) And do we need to
update
the rule about inline asm on x86 in
https://github.com/FFmpeg/FFmpeg/blob/master/doc/optimization.txt?

Thanks.
James Almer Sept. 26, 2018, 1:56 a.m. UTC | #4
On 9/25/2018 10:45 PM, mypopy@gmail.com wrote:
> On Wed, Sep 26, 2018 at 3:55 AM Rostislav Pehlivanov <atomnuker@gmail.com>
> wrote:
>>
>> On 25 September 2018 at 16:27, Jun Zhao <mypopydev@gmail.com> wrote:
>>
>>> Add alpha blending API based on row, support global alpha blending/
>>> per-pixel blending, and add SSSE3/AVX2 optimizations of the functions.
>>>
> 
> 
>> We don't use inline asm on x86 and we don't use global contexts. Look at
>> how float_dsp is done.
> 
> I guess you more precisely mean "prefer NASM assembler over inline asm on x86". :)
> In fact,
> I know of some x86 inline asm in FFmpeg, e.g. libavcodec/x86/h264_cabac.
> (Grepping for "__asm__ volatile" finds more x86 inline asm.) And do we need to
> update
> the rule about inline asm on x86 in
> https://github.com/FFmpeg/FFmpeg/blob/master/doc/optimization.txt?

Yes, we still have some inline asm, either because nobody has gotten
around to porting it to NASM syntax after the project moved to it, or
because, as with CABAC and some single-instruction functions in
libavutil, it makes sense for it to be inline since the call overhead would
kill performance.

That document could use some polishing, but in any case, as stated in
the "Inline asm vs. external asm" section, we have for several years
required new code that calls external functions to be written in NASM
syntax, as is the case with this patchset.

> 
> Thanks.
> _______________________________________________
> ffmpeg-devel mailing list
> ffmpeg-devel@ffmpeg.org
> http://ffmpeg.org/mailman/listinfo/ffmpeg-devel
>
mypopy@gmail.com Sept. 26, 2018, 2:02 a.m. UTC | #5
On Wed, Sep 26, 2018 at 6:58 AM Marton Balint <cus@passwd.hu> wrote:
>
>
>
> On Tue, 25 Sep 2018, Jun Zhao wrote:
>
> > Add alpha blending API based on row, support global alpha blending/
> > per-pixel blending, and add SSSE3/AVX2 optimizations of the functions.
>
> You might want to take a look at
> libavfilter/vf_framerate.c and libavfilter/x86/vf_framerate.asm as well,
> they do something similar. Maybe you should factorize that instead.
>
>
Yep, this is a good suggestion. I think we can factor this part out and
supply a public 8-bit/16-bit blend API with SSSE3/AVX2 optimization;
then we can use the API in
vf_framerate/vf_blend (blend_normal_8bit/16bit)/vf_minterpolate (blend mode).
diff mbox

Patch

diff --git a/libavutil/Makefile b/libavutil/Makefile
index 9ed24cf..f1c06e4 100644
--- a/libavutil/Makefile
+++ b/libavutil/Makefile
@@ -10,6 +10,7 @@  HEADERS = adler32.h                                                     \
           avstring.h                                                    \
           avutil.h                                                      \
           base64.h                                                      \
+          blend.h                                                       \
           blowfish.h                                                    \
           bprint.h                                                      \
           bswap.h                                                       \
@@ -95,6 +96,7 @@  OBJS = adler32.o                                                        \
        audio_fifo.o                                                     \
        avstring.o                                                       \
        base64.o                                                         \
+       blend.o                                                          \
        blowfish.o                                                       \
        bprint.o                                                         \
        buffer.o                                                         \
diff --git a/libavutil/blend.c b/libavutil/blend.c
new file mode 100644
index 0000000..e28efa0
--- /dev/null
+++ b/libavutil/blend.c
@@ -0,0 +1,101 @@ 
+/*
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with FFmpeg; if not, write to the Free Software Foundation, Inc.,
+ * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+ */
+
+#include "libavutil/attributes.h"
+#include "libavutil/cpu.h"
+#include "libavutil/mem.h"
+#include "libavutil/x86/asm.h"
+#include "libavutil/blend.h"
+
+#include "libavutil/x86/blend.h"
+
+/**
+ * C implementation of global alpha blending for one row.
+ *
+ * For every i in [0, width):
+ *   dst[i] = (src0[i] * alpha[0] + src1[i] * (255 - alpha[0]) + 255) >> 8
+ *
+ * @param src0  first source row, weighted by alpha[0]
+ * @param src1  second source row, weighted by 255 - alpha[0]
+ * @param alpha pointer to the blend factor; only alpha[0] is read
+ * @param dst   destination row
+ * @param width number of bytes to blend; nothing is written when <= 0
+ */
+static void ff_global_blend_row_c(const uint8_t *src0,
+                                  const uint8_t *src1,
+                                  const uint8_t *alpha, /* XXX: only use alpha[0] */
+                                  uint8_t *dst,
+                                  int width)
+{
+    int x;
+
+    /* One plain loop instead of an unrolled pair loop plus a "width & 1"
+     * tail: the tail test was also true for negative odd widths and then
+     * stored one byte although no pixel was requested. */
+    for (x = 0; x < width; x++)
+        dst[x] = (src0[x] * alpha[0] + src1[x] * (255 - alpha[0]) + 255) >> 8;
+}
+
+/*
+ * Blend one row with a single global alpha value (alpha[0]).
+ *
+ * When built with ARCH_X86 the implementation returned by
+ * ff_blend_row_init_x86(1) is used if it is non-NULL; otherwise the row
+ * is processed by ff_global_blend_row_c().
+ */
+void av_global_blend_row(const uint8_t *src0,
+                         const uint8_t *src1,
+                         const uint8_t *alpha,
+                         uint8_t *dst,
+                         int width)
+{
+    blend_row simd_fn = NULL;
+
+#if ARCH_X86
+    simd_fn = ff_blend_row_init_x86(1);
+#endif
+
+    if (simd_fn)
+        simd_fn(src0, src1, alpha, dst, width);
+    else
+        ff_global_blend_row_c(src0, src1, alpha, dst, width);
+}
+
+/**
+ * C implementation of per-pixel alpha blending for one row.
+ *
+ * For every i in [0, width):
+ *   dst[i] = (src0[i] * alpha[i] + src1[i] * (255 - alpha[i]) + 255) >> 8
+ *
+ * @param src0  first source row, weighted by alpha[i]
+ * @param src1  second source row, weighted by 255 - alpha[i]
+ * @param alpha row of blend factors, one per pixel
+ * @param dst   destination row
+ * @param width number of bytes to blend; nothing is written when <= 0
+ */
+static void ff_per_pixel_blend_row_c(const uint8_t *src0,
+                                     const uint8_t *src1,
+                                     const uint8_t *alpha,
+                                     uint8_t *dst,
+                                     int width)
+{
+    int x;
+
+    /* Each pixel uses its own alpha[x]. The former two-pixel unrolled loop
+     * applied alpha[0] to the second pixel of every pair, and its
+     * "width & 1" tail also fired for negative odd widths. */
+    for (x = 0; x < width; x++)
+        dst[x] = (src0[x] * alpha[x] + src1[x] * (255 - alpha[x]) + 255) >> 8;
+}
+
+/*
+ * Blend one row using one alpha value per pixel.
+ *
+ * When built with ARCH_X86 the implementation returned by
+ * ff_blend_row_init_x86(0) is used if it is non-NULL; otherwise the row
+ * is processed by ff_per_pixel_blend_row_c().
+ */
+void av_per_pixel_blend_row(const uint8_t *src0,
+                            const uint8_t *src1,
+                            const uint8_t *alpha,
+                            uint8_t *dst,
+                            int width)
+{
+    blend_row simd_fn = NULL;
+
+#if ARCH_X86
+    simd_fn = ff_blend_row_init_x86(0);
+#endif
+
+    if (simd_fn)
+        simd_fn(src0, src1, alpha, dst, width);
+    else
+        ff_per_pixel_blend_row_c(src0, src1, alpha, dst, width);
+}
+
diff --git a/libavutil/blend.h b/libavutil/blend.h
new file mode 100644
index 0000000..8a42109
--- /dev/null
+++ b/libavutil/blend.h
@@ -0,0 +1,47 @@ 
+/*
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with FFmpeg; if not, write to the Free Software Foundation, Inc.,
+ * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+ */
+#ifndef AVUTIL_BLEND_H
+#define AVUTIL_BLEND_H
+
+#include "libavutil/attributes.h"
+#include "libavutil/cpu.h"
+#include "libavutil/mem.h"
+#include "libavutil/x86/asm.h"
+
+/**
+ * Global alpha blending by row
+ *
+ * dst[i] = (src0[i]*alpha[0]+(255-alpha[0])*src1[i]+255)>>8
+ *
+ * @param src0  first source row, weighted by alpha[0]
+ * @param src1  second source row, weighted by 255-alpha[0]
+ * @param alpha pointer to the blend factor; only alpha[0] is used
+ * @param dst   destination row
+ * @param width number of bytes in the row
+ */
+void av_global_blend_row(const uint8_t *src0,
+                         const uint8_t *src1,
+                         const uint8_t *alpha, /* XXX: only use alpha[0] */
+                         uint8_t *dst,
+                         int width);
+
+/**
+ * Per-pixel alpha blending by row
+ *
+ * dst[i] = (src0[i]*alpha[i]+(255-alpha[i])*src1[i]+255)>>8
+ *
+ * @param src0  first source row, weighted by alpha[i]
+ * @param src1  second source row, weighted by 255-alpha[i]
+ * @param alpha row of blend factors, one per pixel
+ * @param dst   destination row
+ * @param width number of bytes in the row
+ */
+void av_per_pixel_blend_row(const uint8_t *src0,
+                            const uint8_t *src1,
+                            const uint8_t *alpha,
+                            uint8_t *dst,
+                            int width);
+#endif
diff --git a/libavutil/x86/Makefile b/libavutil/x86/Makefile
index 5f5242b..1e5e3e4 100644
--- a/libavutil/x86/Makefile
+++ b/libavutil/x86/Makefile
@@ -1,4 +1,5 @@ 
-OBJS += x86/cpu.o                                                       \
+OBJS += x86/blend_init.o                                                \
+        x86/cpu.o                                                       \
         x86/fixed_dsp_init.o                                            \
         x86/float_dsp_init.o                                            \
         x86/imgutils_init.o                                             \
diff --git a/libavutil/x86/blend.h b/libavutil/x86/blend.h
new file mode 100644
index 0000000..9fa0f36
--- /dev/null
+++ b/libavutil/x86/blend.h
@@ -0,0 +1,32 @@ 
+/*
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#ifndef AVUTIL_X86_BLEND_H
+#define AVUTIL_X86_BLEND_H
+
+#include "libavutil/blend.h"
+
+/**
+ * Pointer to a function that blends one row of width bytes from src0 and
+ * src1 into dst, using the blend factor(s) found at alpha.
+ */
+typedef void (*blend_row)(const uint8_t *src0,
+                          const uint8_t *src1,
+                          const uint8_t *alpha,
+                          uint8_t *dst,
+                          int width);
+
+/**
+ * Select an x86 row blending function for the running CPU.
+ *
+ * @param global non-zero to get a global-alpha blender, zero to get a
+ *               per-pixel-alpha blender
+ * @return the selected function, or NULL if none is available
+ */
+blend_row ff_blend_row_init_x86(int global);
+
+#endif /* AVUTIL_X86_BLEND_H */
diff --git a/libavutil/x86/blend_init.c b/libavutil/x86/blend_init.c
new file mode 100644
index 0000000..f555dfa
--- /dev/null
+++ b/libavutil/x86/blend_init.c
@@ -0,0 +1,369 @@ 
+/*
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with FFmpeg; if not, write to the Free Software Foundation, Inc.,
+ * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+ */
+
+#include "libavutil/cpu.h"
+#include "libavutil/mem.h"
+#include "libavutil/x86/cpu.h"
+#include "libavutil/x86/asm.h"
+#include "libavutil/x86/blend.h"
+
+#if HAVE_SSSE3_INLINE && HAVE_6REGS
+// per-pixel blend (8 pixels at a time.)
+// dst[i] = ((src0[i]*alpha[i])+(src1[i]*(255-alpha[i]))+255)/256
+//
+// The first width/8*8 pixels are handled by the SSSE3 loop, the remaining
+// width%8 pixels by scalar C code using the same formula. Nothing is
+// written when width <= 0.
+static void ff_per_pixel_blend_row_ssse3(const uint8_t *src0,
+                                         const uint8_t *src1,
+                                         const uint8_t *alpha,
+                                         uint8_t *dst,
+                                         int width)
+{
+    int aligned_w, width_u, i;
+    const uint8_t *src0_u, *src1_u, *alpha_u;
+    uint8_t *dst_u;
+
+    if (width <= 0)
+        return;
+
+    // Tail pointers are taken before the asm, which modifies its operands.
+    aligned_w = width/8 * 8;
+    width_u   = width - aligned_w;
+    src0_u    = src0  + aligned_w;
+    src1_u    = src1  + aligned_w;
+    alpha_u   = alpha + aligned_w;
+    dst_u     = dst   + aligned_w;
+
+    if (aligned_w > 0) {
+        __asm__ volatile(
+            // xmm3 = 0xff00 in every word: xor turns [a, a] into [a, 255-a]
+            "pcmpeqb    %%xmm3,%%xmm3                  \n\t"
+            "psllw      $0x8,%%xmm3                    \n\t"
+            // xmm4 = 0x80 in every byte: bias making the sources signed.
+            // This must be loaded into xmm4, not xmm3, otherwise the xor
+            // mask above is destroyed and xmm4 is used uninitialised.
+            "mov        $0x80808080,%%eax              \n\t"
+            "movd       %%eax,%%xmm4                   \n\t"
+            "pshufd     $0x0,%%xmm4,%%xmm4             \n\t"
+            // xmm5 = 0x807f in every word: undoes the bias (128*255) and
+            // adds the +255 rounding term
+            "mov        $0x807f807f,%%eax              \n\t"
+            "movd       %%eax,%%xmm5                   \n\t"
+            "pshufd     $0x0,%%xmm5,%%xmm5             \n\t"
+            // address src0/src1/dst relative to the alpha pointer so that
+            // only %2 has to be advanced inside the loop
+            "sub        %2,%0                          \n\t"
+            "sub        %2,%1                          \n\t"
+            "sub        %2,%3                          \n\t"
+
+            // 8 pixel per loop.
+            "1:                                        \n\t"
+            "movq       (%2),%%xmm0                    \n\t"
+            "punpcklbw  %%xmm0,%%xmm0                  \n\t"
+            "pxor       %%xmm3,%%xmm0                  \n\t"
+            "movq       (%0,%2,1),%%xmm1               \n\t"
+            "movq       (%1,%2,1),%%xmm2               \n\t"
+            "punpcklbw  %%xmm2,%%xmm1                  \n\t"
+            "psubb      %%xmm4,%%xmm1                  \n\t"
+            "pmaddubsw  %%xmm1,%%xmm0                  \n\t"
+            "paddw      %%xmm5,%%xmm0                  \n\t"
+            "psrlw      $0x8,%%xmm0                    \n\t"
+            "packuswb   %%xmm0,%%xmm0                  \n\t"
+            "movq       %%xmm0,(%3,%2,1)               \n\t"
+            "lea        0x8(%2),%2                     \n\t"
+            "sub        $0x8,%4                        \n\t"
+            "jg        1b                              \n\t"
+            : "+r"(src0),       // %0
+              "+r"(src1),       // %1
+              "+r"(alpha),      // %2
+              "+r"(dst),        // %3
+              "+rm"(aligned_w)  // %4
+            ::"memory",
+             "cc", "eax", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5");
+    }
+
+    // Scalar tail: every pixel uses its own alpha value.
+    for (i = 0; i < width_u; i++)
+        dst_u[i] = (src0_u[i] * alpha_u[i] + src1_u[i] * (255 - alpha_u[i]) + 255) >> 8;
+}
+
+// global blend (8 pixels at a time).
+// dst[i] = ((src0[i]*alpha[0])+(src1[i]*(255-alpha[0]))+255)/256
+//
+// The first width/8*8 pixels are handled by the SSSE3 loop, the remaining
+// width%8 pixels by scalar C code using the same formula. Only alpha[0] is
+// read. Nothing is written when width <= 0.
+static void ff_global_blend_row_ssse3(const uint8_t *src0,
+                                      const uint8_t *src1,
+                                      const uint8_t *alpha,
+                                      uint8_t *dst,
+                                      int width)
+{
+    int aligned_w, width_u, i;
+    const uint8_t *src0_u, *src1_u;
+    uint8_t *dst_u;
+
+    // Previously a negative odd width still stored one byte via the
+    // "width_u & 1" tail, at an address computed from a negative offset.
+    if (width <= 0)
+        return;
+
+    // Tail pointers are taken before the asm, which advances its operands.
+    aligned_w = width/8 * 8;
+    width_u   = width - aligned_w;
+    src0_u    = src0 + aligned_w;
+    src1_u    = src1 + aligned_w;
+    dst_u     = dst  + aligned_w;
+
+    if (aligned_w > 0) {
+        __asm__ volatile(
+            // xmm3 = 0xff00 in every word: xor turns [a, a] into [a, 255-a]
+            "pcmpeqb    %%xmm3,%%xmm3                  \n\t"
+            "psllw      $0x8,%%xmm3                    \n\t"
+            // xmm4 = 0x80 in every byte: bias making the sources signed
+            "mov        $0x80808080,%%eax              \n\t"
+            "movd       %%eax,%%xmm4                   \n\t"
+            "pshufd     $0x0,%%xmm4,%%xmm4             \n\t"
+            // xmm5 = 0x807f in every word: undoes the bias (128*255) and
+            // adds the +255 rounding term
+            "mov        $0x807f807f,%%eax              \n\t"
+            "movd       %%eax,%%xmm5                   \n\t"
+            "pshufd     $0x0,%%xmm5,%%xmm5             \n\t"
+            // a => xmm6 [a a a a a a a a a a a a a a a a ]
+            "movb       (%2),%%al                      \n\t"
+            "movd       %%eax,%%xmm6                   \n\t" // xmm6 = x x x x x x x x x x x x x x x a
+            "punpcklbw  %%xmm6,%%xmm6                  \n\t" // xmm6 = x x x x x x x x x x x x x x a a
+            "punpcklbw  %%xmm6,%%xmm6                  \n\t" // xmm6 = x x x x x x x x x x x x a a a a
+            "punpcklbw  %%xmm6,%%xmm6                  \n\t" // xmm6 = x x x x x x x x a a a a a a a a
+            "punpcklbw  %%xmm6,%%xmm6                  \n\t" // xmm6 = a a a a a a a a a a a a a a a a
+
+            // 8 pixel per loop.
+            "1:                                        \n\t"
+            "movdqu     %%xmm6,%%xmm0                  \n\t" // xmm0 = xmm6
+            "pxor       %%xmm3,%%xmm0                  \n\t"
+
+            "movq       (%0),%%xmm1                    \n\t"
+            "movq       (%1),%%xmm2                    \n\t"
+            "punpcklbw  %%xmm2,%%xmm1                  \n\t"
+            "psubb      %%xmm4,%%xmm1                  \n\t"
+
+            "pmaddubsw  %%xmm1,%%xmm0                  \n\t"
+            "paddw      %%xmm5,%%xmm0                  \n\t"
+            "psrlw      $0x8,%%xmm0                    \n\t"
+            "packuswb   %%xmm0,%%xmm0                  \n\t"
+            "movq       %%xmm0,(%3)                    \n\t"
+
+            "lea        0x8(%0),%0                     \n\t" // src0+8
+            "lea        0x8(%1),%1                     \n\t" // src1+8
+            "lea        0x8(%3),%3                     \n\t" // dst+8
+            "sub        $0x8,%4                        \n\t"
+            "jg        1b                              \n\t"
+            : "+r"(src0),       // %0
+              "+r"(src1),       // %1
+              "+r"(alpha),      // %2
+              "+r"(dst),        // %3
+              "+rm"(aligned_w)  // %4
+            ::"memory",
+             "cc", "eax", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6");
+    }
+
+    // Scalar tail for the pixels the 8-wide loop did not cover.
+    for (i = 0; i < width_u; i++)
+        dst_u[i] = (src0_u[i] * alpha[0] + src1_u[i] * (255 - alpha[0]) + 255) >> 8;
+}
+#endif
+
+#if HAVE_AVX2_INLINE && HAVE_6REGS
+// per-pixel blend (32 pixels at a time).
+// dst[i] = ((src0[i]*alpha[i])+(src1[i]*(255-alpha[i]))+255)/256
+//
+// The first width/32*32 pixels are handled by the AVX2 loop, the remaining
+// width%32 pixels by scalar C code using the same formula. Nothing is
+// written when width <= 0.
+static void ff_per_pixel_blend_row_avx2(const uint8_t *src0,
+                                        const uint8_t *src1,
+                                        const uint8_t *alpha,
+                                        uint8_t *dst,
+                                        int width)
+{
+    int aligned_w, width_u, i;
+    const uint8_t *src0_u, *src1_u, *alpha_u;
+    uint8_t *dst_u;
+
+    if (width <= 0)
+        return;
+
+    // Tail pointers are taken before the asm, which modifies its operands.
+    aligned_w = width/32 * 32;
+    width_u   = width - aligned_w;
+    src0_u    = src0  + aligned_w;
+    src1_u    = src1  + aligned_w;
+    alpha_u   = alpha + aligned_w;
+    dst_u     = dst   + aligned_w;
+
+    if (aligned_w > 0) {
+        __asm__ volatile(
+            // ymm5 = 0xff00 in every word: xor turns [a, a] into [a, 255-a]
+            "vpcmpeqb   %%ymm5,%%ymm5,%%ymm5           \n\t"
+            "vpsllw     $0x8,%%ymm5,%%ymm5             \n\t"
+            // ymm6 = 0x80 in every byte: bias making the sources signed
+            "mov        $0x80808080,%%eax              \n\t"
+            "vmovd      %%eax,%%xmm6                   \n\t"
+            "vbroadcastss %%xmm6,%%ymm6                \n\t"
+            // ymm7 = 0x807f in every word: undoes the bias (128*255) and
+            // adds the +255 rounding term
+            "mov        $0x807f807f,%%eax              \n\t"
+            "vmovd      %%eax,%%xmm7                   \n\t"
+            "vbroadcastss %%xmm7,%%ymm7                \n\t"
+            // address src0/src1/dst relative to the alpha pointer so that
+            // only %2 has to be advanced inside the loop
+            "sub        %2,%0                          \n\t"
+            "sub        %2,%1                          \n\t"
+            "sub        %2,%3                          \n\t"
+
+            // 32 pixel per loop.
+            "1:                                        \n\t"
+            "vmovdqu    (%2),%%ymm0                    \n\t"
+            "vpunpckhbw %%ymm0,%%ymm0,%%ymm3           \n\t"
+            "vpunpcklbw %%ymm0,%%ymm0,%%ymm0           \n\t"
+            "vpxor      %%ymm5,%%ymm3,%%ymm3           \n\t"
+            "vpxor      %%ymm5,%%ymm0,%%ymm0           \n\t"
+            "vmovdqu    (%0,%2,1),%%ymm1               \n\t"
+            "vmovdqu    (%1,%2,1),%%ymm2               \n\t"
+            "vpunpckhbw %%ymm2,%%ymm1,%%ymm4           \n\t"
+            "vpunpcklbw %%ymm2,%%ymm1,%%ymm1           \n\t"
+            "vpsubb     %%ymm6,%%ymm4,%%ymm4           \n\t"
+            "vpsubb     %%ymm6,%%ymm1,%%ymm1           \n\t"
+            "vpmaddubsw %%ymm4,%%ymm3,%%ymm3           \n\t"
+            "vpmaddubsw %%ymm1,%%ymm0,%%ymm0           \n\t"
+            "vpaddw     %%ymm7,%%ymm3,%%ymm3           \n\t"
+            "vpaddw     %%ymm7,%%ymm0,%%ymm0           \n\t"
+            "vpsrlw     $0x8,%%ymm3,%%ymm3             \n\t"
+            "vpsrlw     $0x8,%%ymm0,%%ymm0             \n\t"
+            "vpackuswb  %%ymm3,%%ymm0,%%ymm0           \n\t"
+            "vmovdqu    %%ymm0,(%3,%2,1)               \n\t"
+            "lea        0x20(%2),%2                    \n\t"
+            "sub        $0x20,%4                       \n\t"
+            "jg        1b                              \n\t"
+            "vzeroupper                                \n\t"
+            : "+r"(src0),      // %0
+              "+r"(src1),      // %1
+              "+r"(alpha),     // %2
+              "+r"(dst),       // %3
+              "+rm"(aligned_w) // %4
+            ::"memory",
+             "cc", "eax", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6",
+             "xmm7");
+    }
+
+    // Scalar tail: every pixel uses its own alpha value (the unrolled
+    // version applied alpha_u[0] to the second pixel of each pair).
+    for (i = 0; i < width_u; i++)
+        dst_u[i] = (src0_u[i] * alpha_u[i] + src1_u[i] * (255 - alpha_u[i]) + 255) >> 8;
+}
+
+// global blend (32 pixels at a time)
+// dst[i] = ((src0[i]*alpha[0])+(src1[i]*(255-alpha[0]))+255)/256
+//
+// The first width/32*32 pixels are handled by the AVX2 loop, the remaining
+// width%32 pixels by scalar C code using the same formula. Only alpha[0]
+// is read. Nothing is written when width <= 0.
+//
+// NOTE(review): the asm uses xmm8/ymm8, which exist only in 64-bit mode,
+// while the enclosing guard tests HAVE_6REGS only - confirm 32-bit builds.
+// NOTE(review): the alpha broadcast uses non-VEX movd/punpcklbw after ymm
+// registers were written; presumably this costs an SSE/AVX transition -
+// consider vmovd/vpbroadcastb.
+static void ff_global_blend_row_avx2(const uint8_t *src0,
+                                     const uint8_t *src1,
+                                     const uint8_t *alpha,
+                                     uint8_t *dst,
+                                     int width)
+{
+    int aligned_w, width_u, i;
+    const uint8_t *src0_u, *src1_u;
+    uint8_t *dst_u;
+
+    // Previously a negative odd width still stored one byte via the
+    // "width_u & 1" tail, at an address computed from a negative offset.
+    if (width <= 0)
+        return;
+
+    // Tail pointers are taken before the asm, which advances its operands.
+    aligned_w = width/32 * 32;
+    width_u   = width - aligned_w;
+    src0_u    = src0 + aligned_w;
+    src1_u    = src1 + aligned_w;
+    dst_u     = dst  + aligned_w;
+
+    if (aligned_w > 0) {
+        __asm__ volatile(
+            // ymm5 = 0xff00 in every word: xor turns [a, a] into [a, 255-a]
+            "vpcmpeqb   %%ymm5,%%ymm5,%%ymm5           \n\t"
+            "vpsllw     $0x8,%%ymm5,%%ymm5             \n\t"
+            // ymm6 = 0x80 in every byte: bias making the sources signed
+            "mov        $0x80808080,%%eax              \n\t"
+            "vmovd      %%eax,%%xmm6                   \n\t"
+            "vbroadcastss %%xmm6,%%ymm6                \n\t"
+            // ymm7 = 0x807f in every word: undoes the bias (128*255) and
+            // adds the +255 rounding term
+            "mov        $0x807f807f,%%eax              \n\t"
+            "vmovd      %%eax,%%xmm7                   \n\t"
+            "vbroadcastss %%xmm7,%%ymm7                \n\t"
+            // a => ymm8 [a a a a a a a a a a a a a a a a
+            //            a a a a a a a a a a a a a a a a
+            //            a a a a a a a a a a a a a a a a
+            //            a a a a a a a a a a a a a a a a]
+            "movb       (%2),%%al                      \n\t"
+            "movd       %%eax,%%xmm8                   \n\t" // xmm8 = x x x x x x x x x x x x x x x a
+            "punpcklbw  %%xmm8,%%xmm8                  \n\t" // xmm8 = x x x x x x x x x x x x x x a a
+            "punpcklbw  %%xmm8,%%xmm8                  \n\t" // xmm8 = x x x x x x x x x x x x a a a a
+            "vbroadcastss %%xmm8,%%ymm8                \n\t"
+
+            // 32 pixel per loop.
+            "1:                                        \n\t"
+            "vmovdqu    %%ymm8,%%ymm0                  \n\t"
+            "vpunpckhbw %%ymm0,%%ymm0,%%ymm3           \n\t"
+            "vpunpcklbw %%ymm0,%%ymm0,%%ymm0           \n\t"
+            "vpxor      %%ymm5,%%ymm3,%%ymm3           \n\t"
+            "vpxor      %%ymm5,%%ymm0,%%ymm0           \n\t"
+
+            "vmovdqu    (%0),%%ymm1                    \n\t"
+            "vmovdqu    (%1),%%ymm2                    \n\t"
+            "vpunpckhbw %%ymm2,%%ymm1,%%ymm4           \n\t"
+            "vpunpcklbw %%ymm2,%%ymm1,%%ymm1           \n\t"
+            "vpsubb     %%ymm6,%%ymm4,%%ymm4           \n\t"
+            "vpsubb     %%ymm6,%%ymm1,%%ymm1           \n\t"
+            "vpmaddubsw %%ymm4,%%ymm3,%%ymm3           \n\t"
+            "vpmaddubsw %%ymm1,%%ymm0,%%ymm0           \n\t"
+            "vpaddw     %%ymm7,%%ymm3,%%ymm3           \n\t"
+            "vpaddw     %%ymm7,%%ymm0,%%ymm0           \n\t"
+            "vpsrlw     $0x8,%%ymm3,%%ymm3             \n\t"
+            "vpsrlw     $0x8,%%ymm0,%%ymm0             \n\t"
+            "vpackuswb  %%ymm3,%%ymm0,%%ymm0           \n\t"
+
+            "vmovdqu    %%ymm0,(%3)                    \n\t"
+            "lea        0x20(%0),%0                    \n\t"
+            "lea        0x20(%1),%1                    \n\t"
+            "lea        0x20(%3),%3                    \n\t"
+            "sub        $0x20,%4                       \n\t"
+            "jg        1b                              \n\t"
+            "vzeroupper                                \n\t"
+            : "+r"(src0),       // %0
+              "+r"(src1),       // %1
+              "+r"(alpha),      // %2
+              "+r"(dst),        // %3
+              "+rm"(aligned_w)  // %4
+            ::"memory",
+             "cc", "eax", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6",
+             "xmm7", "xmm8");
+    }
+
+    // Scalar tail for the pixels the 32-wide loop did not cover.
+    for (i = 0; i < width_u; i++)
+        dst_u[i] = (src0_u[i] * alpha[0] + src1_u[i] * (255 - alpha[0]) + 255) >> 8;
+}
+#endif
+
+/*
+ * Pick the x86 row blender matching the CPU flags reported by
+ * av_get_cpu_flags().
+ *
+ * global != 0 selects among the global-alpha functions, global == 0 among
+ * the per-pixel ones. The AVX2 check comes last, so it overrides the SSSE3
+ * choice when both match. Returns NULL when no variant is compiled in or
+ * none matches.
+ *
+ * NOTE(review): the variants are built under HAVE_*_INLINE but selected
+ * with the EXTERNAL_* runtime macros - confirm this pairing is intended.
+ */
+av_cold blend_row ff_blend_row_init_x86(int global)
+{
+    blend_row selected = NULL;
+    int flags = av_get_cpu_flags();
+
+#if HAVE_SSSE3_INLINE && HAVE_6REGS
+    if (EXTERNAL_SSSE3(flags))
+        selected = global ? ff_global_blend_row_ssse3
+                          : ff_per_pixel_blend_row_ssse3;
+#endif
+
+#if HAVE_AVX2_INLINE && HAVE_6REGS
+    if (EXTERNAL_AVX2_FAST(flags))
+        selected = global ? ff_global_blend_row_avx2
+                          : ff_per_pixel_blend_row_avx2;
+#endif
+
+    return selected;
+}