diff mbox

[FFmpeg-devel,1/3] swscale/ppc: VSX-optimize yuv2422_1

Message ID 20190324151035.fe2360d570202b4c34ded513@gmx.com
State New
Headers show

Commit Message

Lauri Kasanen March 24, 2019, 1:10 p.m. UTC
./ffmpeg -f lavfi -i yuvtestsrc=duration=1:size=1200x1440 \
            -s 1200x1440 -f null -vframes 100 -pix_fmt $i -nostats \
            -cpuflags 0 -v error -

15.3x speedup:

yuyv422
  14513 UNITS in yuv2packed1,   32768 runs,      0 skips
    949 UNITS in yuv2packed1,   32767 runs,      1 skips
yvyu422
  14516 UNITS in yuv2packed1,   32767 runs,      1 skips
    943 UNITS in yuv2packed1,   32767 runs,      1 skips
uyvy422
  14530 UNITS in yuv2packed1,   32767 runs,      1 skips
    941 UNITS in yuv2packed1,   32766 runs,      2 skips

Signed-off-by: Lauri Kasanen <cand@gmx.com>
---
 libswscale/ppc/swscale_vsx.c | 149 +++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 149 insertions(+)

Series on top of "swscale/ppc: VSX-optimize yuv2rgb_full".

--
2.6.2

Comments

Lauri Kasanen March 31, 2019, 9:40 a.m. UTC | #1
On Sun, 24 Mar 2019 15:10:35 +0200
Lauri Kasanen <cand@gmx.com> wrote:

> ./ffmpeg -f lavfi -i yuvtestsrc=duration=1:size=1200x1440 \
>             -s 1200x1440 -f null -vframes 100 -pix_fmt $i -nostats \
>             -cpuflags 0 -v error -
>
> 15.3x speedup:
>
> yuyv422
>   14513 UNITS in yuv2packed1,   32768 runs,      0 skips
>     949 UNITS in yuv2packed1,   32767 runs,      1 skips
> yvyu422
>   14516 UNITS in yuv2packed1,   32767 runs,      1 skips
>     943 UNITS in yuv2packed1,   32767 runs,      1 skips
> uyvy422
>   14530 UNITS in yuv2packed1,   32767 runs,      1 skips
>     941 UNITS in yuv2packed1,   32766 runs,      2 skips
>
> Signed-off-by: Lauri Kasanen <cand@gmx.com>
> ---
>  libswscale/ppc/swscale_vsx.c | 149 +++++++++++++++++++++++++++++++++++++++++++
>  1 file changed, 149 insertions(+)

Applying these.

- Lauri
diff mbox

Patch

diff --git a/libswscale/ppc/swscale_vsx.c b/libswscale/ppc/swscale_vsx.c
index 062ab0d..0bb82ac 100644
--- a/libswscale/ppc/swscale_vsx.c
+++ b/libswscale/ppc/swscale_vsx.c
@@ -664,6 +664,143 @@  YUV2RGBWRAPPER(yuv2, rgb_full, xbgr32_full, AV_PIX_FMT_ABGR,  0)
 YUV2RGBWRAPPER(yuv2, rgb_full, rgb24_full,  AV_PIX_FMT_RGB24, 0)
 YUV2RGBWRAPPER(yuv2, rgb_full, bgr24_full,  AV_PIX_FMT_BGR24, 0)

+static av_always_inline void /* Interleave 16 luma, 8 U and 8 V samples into 32 packed 4:2:2 bytes at dest. */
+write422(const vector int16_t vy1, const vector int16_t vy2,
+         const vector int16_t vu, const vector int16_t vv,
+         uint8_t *dest, const enum AVPixelFormat target) /* target selects the output byte order */
+{
+    vector uint8_t vd1, vd2, tmp; /* vd1: packed luma, vd2: packed chroma, tmp: permuted output */
+    const vector uint8_t yuyv1 = (vector uint8_t) { /* vec_perm masks: 0x00-0x0f pick vd1 (Y), 0x10-0x17 pick U, 0x18-0x1f pick V */
+                                 0x0, 0x10, 0x1, 0x18,
+                                 0x2, 0x11, 0x3, 0x19,
+                                 0x4, 0x12, 0x5, 0x1a,
+                                 0x6, 0x13, 0x7, 0x1b };
+    const vector uint8_t yuyv2 = (vector uint8_t) { /* second half: Y8..Y15 with U4..U7 and V4..V7 */
+                                 0x8, 0x14, 0x9, 0x1c,
+                                 0xa, 0x15, 0xb, 0x1d,
+                                 0xc, 0x16, 0xd, 0x1e,
+                                 0xe, 0x17, 0xf, 0x1f };
+    const vector uint8_t yvyu1 = (vector uint8_t) { /* as yuyv1, with the U and V picks swapped */
+                                 0x0, 0x18, 0x1, 0x10,
+                                 0x2, 0x19, 0x3, 0x11,
+                                 0x4, 0x1a, 0x5, 0x12,
+                                 0x6, 0x1b, 0x7, 0x13 };
+    const vector uint8_t yvyu2 = (vector uint8_t) { /* as yuyv2, with the U and V picks swapped */
+                                 0x8, 0x1c, 0x9, 0x14,
+                                 0xa, 0x1d, 0xb, 0x15,
+                                 0xc, 0x1e, 0xd, 0x16,
+                                 0xe, 0x1f, 0xf, 0x17 };
+    const vector uint8_t uyvy1 = (vector uint8_t) { /* chroma byte first: U Y V Y */
+                                 0x10, 0x0, 0x18, 0x1,
+                                 0x11, 0x2, 0x19, 0x3,
+                                 0x12, 0x4, 0x1a, 0x5,
+                                 0x13, 0x6, 0x1b, 0x7 };
+    const vector uint8_t uyvy2 = (vector uint8_t) { /* second half of the U Y V Y order */
+                                 0x14, 0x8, 0x1c, 0x9,
+                                 0x15, 0xa, 0x1d, 0xb,
+                                 0x16, 0xc, 0x1e, 0xd,
+                                 0x17, 0xe, 0x1f, 0xf };
+
+    vd1 = vec_packsu(vy1, vy2); /* 16 luma bytes, narrowed with unsigned saturation */
+    vd2 = vec_packsu(vu, vv); /* bytes 0-7 = U, bytes 8-15 = V, narrowed with unsigned saturation */
+
+    switch (target) { /* any format other than the three cases below stores nothing */
+    case AV_PIX_FMT_YUYV422:
+        tmp = vec_perm(vd1, vd2, yuyv1);
+        vec_st(tmp, 0, dest); /* NOTE(review): vec_st ignores the low 4 address bits; dest is presumably 16-byte aligned - confirm */
+        tmp = vec_perm(vd1, vd2, yuyv2);
+        vec_st(tmp, 16, dest); /* second 16 output bytes */
+    break;
+    case AV_PIX_FMT_YVYU422:
+        tmp = vec_perm(vd1, vd2, yvyu1);
+        vec_st(tmp, 0, dest);
+        tmp = vec_perm(vd1, vd2, yvyu2);
+        vec_st(tmp, 16, dest);
+    break;
+    case AV_PIX_FMT_UYVY422:
+        tmp = vec_perm(vd1, vd2, uyvy1);
+        vec_st(tmp, 0, dest);
+        tmp = vec_perm(vd1, vd2, uyvy2);
+        vec_st(tmp, 16, dest);
+    break;
+    }
+}
+
+static av_always_inline void /* Round 16-bit luma/chroma samples down to bytes and emit them as packed 4:2:2 via write422. */
+yuv2422_1_vsx_template(SwsContext *c, const int16_t *buf0, /* c is not used here; buf0 supplies the luma samples */
+                     const int16_t *ubuf[2], const int16_t *vbuf[2], /* [1] entries are read only when uvalpha >= 2048 */
+                     const int16_t *abuf0, uint8_t *dest, int dstW, /* abuf0 is not used here */
+                     int uvalpha, int y, enum AVPixelFormat target) /* y is not used here; target is forwarded to write422 */
+{
+    const int16_t *ubuf0 = ubuf[0], *vbuf0 = vbuf[0];
+    vector int16_t vy1, vy2, vu, vv, tmp;
+    const vector int16_t add64 = vec_splats((int16_t) 64); /* rounding term for the shift by 7 */
+    const vector int16_t add128 = vec_splats((int16_t) 128); /* rounding term for the shift by 8 */
+    const vector uint16_t shift7 = vec_splat_u16(7);
+    const vector uint16_t shift8 = vec_splat_u16(8);
+    int i;
+
+    if (uvalpha < 2048) { /* chroma comes from ubuf[0]/vbuf[0] only */
+        for (i = 0; i < ((dstW + 1) >> 1); i += 8) { /* i counts chroma samples; NOTE(review): whole vectors are always processed, so buffers are presumably padded - confirm */
+            vy1 = vec_ld(0, &buf0[i * 2]); /* luma samples 2*i .. 2*i+7 */
+            vy2 = vec_ld(0, &buf0[(i + 4) * 2]); /* luma samples 2*i+8 .. 2*i+15 */
+            vu = vec_ld(0, &ubuf0[i]);
+            vv = vec_ld(0, &vbuf0[i]);
+
+            vy1 = vec_add(vy1, add64); /* (x + 64) >> 7 rounds while shifting right by 7 */
+            vy2 = vec_add(vy2, add64);
+            vu = vec_add(vu, add64);
+            vv = vec_add(vv, add64);
+
+            vy1 = vec_sra(vy1, shift7); /* arithmetic shift keeps the sign */
+            vy2 = vec_sra(vy2, shift7);
+            vu = vec_sra(vu, shift7);
+            vv = vec_sra(vv, shift7);
+
+            write422(vy1, vy2, vu, vv, &dest[i * 4], target); /* 8 chroma samples yield 32 output bytes, hence i * 4 */
+        }
+    } else { /* chroma is the rounded mean of both buffers: (a + b + 128) >> 8 */
+        const int16_t *ubuf1 = ubuf[1], *vbuf1 = vbuf[1];
+        for (i = 0; i < ((dstW + 1) >> 1); i += 8) { /* same iteration scheme as the branch above */
+            vy1 = vec_ld(0, &buf0[i * 2]);
+            vy2 = vec_ld(0, &buf0[(i + 4) * 2]);
+            vu = vec_ld(0, &ubuf0[i]);
+            tmp = vec_ld(0, &ubuf1[i]);
+            vu = vec_adds(vu, tmp); /* saturating add, so the 16-bit sum cannot wrap */
+            vv = vec_ld(0, &vbuf0[i]);
+            tmp = vec_ld(0, &vbuf1[i]);
+            vv = vec_adds(vv, tmp);
+
+            vy1 = vec_add(vy1, add64);
+            vy2 = vec_add(vy2, add64);
+            vu = vec_adds(vu, add128); /* rounding term added with saturation as well */
+            vv = vec_adds(vv, add128);
+
+            vy1 = vec_sra(vy1, shift7);
+            vy2 = vec_sra(vy2, shift7);
+            vu = vec_sra(vu, shift8); /* one extra bit dropped because two samples were summed */
+            vv = vec_sra(vv, shift8);
+
+            write422(vy1, vy2, vu, vv, &dest[i * 4], target);
+        }
+    }
+}
+
+#define YUV2PACKEDWRAPPER(name, base, ext, fmt) /* defines name##ext##_1_vsx, which forwards to name##base##_1_vsx_template with fmt as target */ \
+static void name ## ext ## _1_vsx(SwsContext *c, const int16_t *buf0, \
+                                const int16_t *ubuf[2], const int16_t *vbuf[2], \
+                                const int16_t *abuf0, uint8_t *dest, int dstW, \
+                                int uvalpha, int y) \
+{ \
+    name ## base ## _1_vsx_template(c, buf0, ubuf, vbuf, \
+                                  abuf0, dest, dstW, uvalpha, \
+                                  y, fmt); \
+}
+
+YUV2PACKEDWRAPPER(yuv2, 422, yuyv422, AV_PIX_FMT_YUYV422) /* defines yuv2yuyv422_1_vsx */
+YUV2PACKEDWRAPPER(yuv2, 422, yvyu422, AV_PIX_FMT_YVYU422) /* defines yuv2yvyu422_1_vsx */
+YUV2PACKEDWRAPPER(yuv2, 422, uyvy422, AV_PIX_FMT_UYVY422) /* defines yuv2uyvy422_1_vsx */
+
 #endif /* !HAVE_BIGENDIAN */

 #endif /* HAVE_VSX */
@@ -768,6 +905,18 @@  av_cold void ff_sws_init_swscale_vsx(SwsContext *c)
                 }
             break;
         }
+    } else { /* !SWS_FULL_CHR_H_INT */
+        switch (dstFormat) {
+            case AV_PIX_FMT_YUYV422:
+                c->yuv2packed1 = yuv2yuyv422_1_vsx;
+            break;
+            case AV_PIX_FMT_YVYU422:
+                c->yuv2packed1 = yuv2yvyu422_1_vsx;
+            break;
+            case AV_PIX_FMT_UYVY422:
+                c->yuv2packed1 = yuv2uyvy422_1_vsx;
+            break;
+        }
     }
 #endif /* !HAVE_BIGENDIAN */