diff mbox

[FFmpeg-devel] swscale/output: Altivec-optimize yuv2plane1_8

Message ID 20181116145938.07a893157e57837fc3e2314c@gmx.com
State Accepted
Headers show

Commit Message

Lauri Kasanen Nov. 16, 2018, 12:59 p.m. UTC
./ffmpeg_g -f rawvideo -pix_fmt rgb24 -s hd1080 -i /dev/zero -pix_fmt yuv420p \
-f null -vframes 100 -v error -nostats -

1158 UNITS in planar1,   65528 runs,      8 skips

-cpuflags 0

19082 UNITS in planar1,   65533 runs,      3 skips

16.48 speedup ratio. On x86, SSE2 is ~7. Curiously, the Power C version
takes as many cycles as the x86 SSE2 version, yikes it's fast.

Note that this function uses VSX instructions, but is not marked so.
This is because several existing functions also make that mistake.
I'll submit a patch moving them all once this is reviewed.

No BE support since I can only test LE. LE is however the common case
for POWER8 and POWER9.

Signed-off-by: Lauri Kasanen <cand@gmx.com>
---
 libswscale/ppc/swscale_altivec.c | 55 ++++++++++++++++++++++++++++++++++++++++
 1 file changed, 55 insertions(+)

Comments

Carl Eugen Hoyos Nov. 16, 2018, 9:09 p.m. UTC | #1
2018-11-16 13:59 GMT+01:00, Lauri Kasanen <cand@gmx.com>:
> ./ffmpeg_g -f rawvideo -pix_fmt rgb24 -s hd1080 -i /dev/zero -pix_fmt
> yuv420p \
> -f null -vframes 100 -v error -nostats -
>
> 1158 UNITS in planar1,   65528 runs,      8 skips
>
> -cpuflags 0
>
> 19082 UNITS in planar1,   65533 runs,      3 skips
>
> 16.48 speedup ratio. On x86, SSE2 is ~7. Curiously, the Power C version
> takes as many cycles as the x86 SSE2 version, yikes it's fast.

> Note that this function uses VSX instructions, but is not marked so.
> This is because several existing functions also make that mistake.
> I'll submit a patch moving them all once this is reviewed.

(This is less important atm, but I believe all functions currently
in libswscale/ppc compile and run fine on - old - 32bit be hardware
as your new function does.
My completely inexperienced suspicion is that the instruction that
you call "VSX" also exists on Altivec.)

> No BE support since I can only test LE. LE is however the common case
> for POWER8 and POWER9.
>
> Signed-off-by: Lauri Kasanen <cand@gmx.com>
> ---
>  libswscale/ppc/swscale_altivec.c | 55
> ++++++++++++++++++++++++++++++++++++++++
>  1 file changed, 55 insertions(+)
>
> diff --git a/libswscale/ppc/swscale_altivec.c
> b/libswscale/ppc/swscale_altivec.c
> index 2fb2337..a064016 100644
> --- a/libswscale/ppc/swscale_altivec.c
> +++ b/libswscale/ppc/swscale_altivec.c
> @@ -324,6 +324,53 @@ static void hScale_altivec_real(SwsContext *c, int16_t
> *dst, int dstW,
>              }
>          }
>  }
> +
> +static void yuv2plane1_8_u(const int16_t *src, uint8_t *dest, int dstW,
> +                           const uint8_t *dither, int offset, int start)
> +{
> +    int i;
> +    for (i = start; i < dstW; i++) {
> +        int val = (src[i] + dither[(i + offset) & 7]) >> 7;
> +        dest[i] = av_clip_uint8(val);
> +    }
> +}
> +
> +static void yuv2plane1_8_altivec(const int16_t *src, uint8_t *dest, int
> dstW,
> +                           const uint8_t *dither, int offset)
> +{
> +    const int dst_u = -(uintptr_t)dest & 15;
> +    int i, j;
> +    LOCAL_ALIGNED(16, int16_t, val, [16]);
> +    const vector uint16_t shifts = (vector uint16_t) {7, 7, 7, 7, 7, 7, 7,
> 7};
> +    vector int16_t vi, vileft, ditherleft, ditherright;
> +    vector uint8_t vd;
> +
> +    for (j = 0; j < 16; j++) {
> +        val[j] = dither[(dst_u + offset + j) & 7];
> +    }
> +
> +    ditherleft = vec_ld(0, val);
> +    ditherright = vec_ld(0, &val[8]);
> +
> +    yuv2plane1_8_u(src, dest, dst_u, dither, offset, 0);
> +
> +    for (i = dst_u; i < dstW - 15; i += 16) {
> +
> +        vi = vec_vsx_ld(0, &src[i]);
> +        vi = vec_adds(ditherleft, vi);
> +        vileft = vec_sra(vi, shifts);
> +
> +        vi = vec_vsx_ld(0, &src[i + 8]);
> +        vi = vec_adds(ditherright, vi);
> +        vi = vec_sra(vi, shifts);
> +
> +        vd = vec_packsu(vileft, vi);
> +        vec_st(vd, 0, &dest[i]);
> +    }
> +
> +    yuv2plane1_8_u(src, dest, dstW, dither, offset, i);
> +}
> +
>  #endif /* HAVE_ALTIVEC */
>
>  av_cold void ff_sws_init_swscale_ppc(SwsContext *c)
> @@ -367,6 +414,14 @@ av_cold void ff_sws_init_swscale_ppc(SwsContext *c)
>              c->yuv2packedX = ff_yuv2rgb24_X_altivec;
>              break;
>          }

> +
> +        switch (c->dstBpc) {
> +        case 8:
> +#if !HAVE_BIGENDIAN
> +            c->yuv2plane1 = yuv2plane1_8_altivec;
> +            break;
> +#endif /* !HAVE_BIGENDIAN */
> +        }

I wanted to write that this hunk breaks compilation on big-endian
(you should be able to test with "#if 0" instead of "#if !HAVE_BIGENDIAN")
but the good news is that your patch works fine on big-endian,
just remove the if-endif block. (Tested visually with lena on 32 and 64bit be.)

Are you aware of the bounty that is offered for this task?
https://trac.ffmpeg.org/ticket/5568
(and #5569, #5570)

There is a bug report about one altivec routine that works on
big-endian but breaks the output visually on little-endian while
many other functions work on both, could you have a look?
https://trac.ffmpeg.org/ticket/7124

Carl Eugen
Lauri Kasanen Nov. 17, 2018, 8:09 a.m. UTC | #2
On Fri, 16 Nov 2018 22:09:25 +0100
Carl Eugen Hoyos <ceffmpeg@gmail.com> wrote:

> (This is less important atm, but I believe all functions currently
> in libswscale/ppc compile and run fine on - old - 32bit be hardware
> as your new function does.
> My completely inexperienced suspicion is that the instruction that
> you call "VSX" also exists on Altivec.)

Ref
http://gcc.gnu.org/onlinedocs/gcc/PowerPC-AltiVec-Built-in-Functions-Available-on-ISA-2_002e06.html#PowerPC-AltiVec-Built-in-Functions-Available-on-ISA-2_002e06

VSX functions such as vec_vsx_ld were added in ISA 2.06, aka POWER7.
They shouldn't compile on earlier PPC like Apple G4/G5. Is your machine
at least POWER7?

> I wanted to write that this hunk breaks compilation on big-endian
> (you should be able to test with "#if 0" instead of "#if !HAVE_BIGENDIAN")
> but the good news is that your patch works fine on big-endian,
> just remove the if-endif block. (Tested visually with lena on 32 and 64bit be.)

Thanks, will do.

> Are you aware of the bounty that is offered for this task?
> https://trac.ffmpeg.org/ticket/5568
> (and #5569, #5570)

Yes, I admit that's why I started. Looking to make some extra, and
helping IBM is not a bad way to do so. I'm considering getting a Raptor
Blackbird when it comes out next year.

> There is a bug report about one altivec routine that works on
> big-endian but breaks the output visually on little-endian while
> many other functions work on both, could you have a look?
> https://trac.ffmpeg.org/ticket/7124

I'll try. This patch was my first time playing with Power vectors.

- Lauri
Lauri Kasanen Nov. 17, 2018, 5:49 p.m. UTC | #3
On Sat, 17 Nov 2018 15:20:08 +0100
Carl Eugen Hoyos <ceffmpeg@gmail.com> wrote:

> 2018-11-17 9:09 GMT+01:00, Lauri Kasanen <cand@gmx.com>:
> > Carl Eugen Hoyos <ceffmpeg@gmail.com> wrote:
> >> (This is less important atm, but I believe all functions currently
> >> in libswscale/ppc compile and run fine on - old - 32bit be hardware
> >> as your new function does.
> >> My completely inexperienced suspicion is that the instruction that
> >> you call "VSX" also exists on Altivec.)
> >
> > Ref
> > http://gcc.gnu.org/onlinedocs/gcc/PowerPC-AltiVec-Built-in-Functions-Available-on-ISA-2_002e06.html#PowerPC-AltiVec-Built-in-Functions-Available-on-ISA-2_002e06
> >
> > VSX functions such as vec_vsx_ld were added in ISA 2.06, aka POWER7.
> 
> The instruction vec_vsx_ld is currently only used for little-endian ppc
> which I thought did not exist before power7, am I wrong?

Looks like there were LE Powers like the 440 already in 1999:
https://lwn.net/Articles/408051/
datasheets.chipdb.org/IBM/PowerPC/440/PowerPC-440-Core.pdf

- Lauri
diff mbox

Patch

diff --git a/libswscale/ppc/swscale_altivec.c b/libswscale/ppc/swscale_altivec.c
index 2fb2337..a064016 100644
--- a/libswscale/ppc/swscale_altivec.c
+++ b/libswscale/ppc/swscale_altivec.c
@@ -324,6 +324,53 @@  static void hScale_altivec_real(SwsContext *c, int16_t *dst, int dstW,
             }
         }
 }
+
+static void yuv2plane1_8_u(const int16_t *src, uint8_t *dest, int dstW,
+                           const uint8_t *dither, int offset, int start)
+{
+    int i;
+    for (i = start; i < dstW; i++) {
+        int val = (src[i] + dither[(i + offset) & 7]) >> 7;
+        dest[i] = av_clip_uint8(val);
+    }
+}
+
+static void yuv2plane1_8_altivec(const int16_t *src, uint8_t *dest, int dstW,
+                           const uint8_t *dither, int offset)
+{
+    const int dst_u = -(uintptr_t)dest & 15;
+    int i, j;
+    LOCAL_ALIGNED(16, int16_t, val, [16]);
+    const vector uint16_t shifts = (vector uint16_t) {7, 7, 7, 7, 7, 7, 7, 7};
+    vector int16_t vi, vileft, ditherleft, ditherright;
+    vector uint8_t vd;
+
+    for (j = 0; j < 16; j++) {
+        val[j] = dither[(dst_u + offset + j) & 7];
+    }
+
+    ditherleft = vec_ld(0, val);
+    ditherright = vec_ld(0, &val[8]);
+
+    yuv2plane1_8_u(src, dest, dst_u, dither, offset, 0);
+
+    for (i = dst_u; i < dstW - 15; i += 16) {
+
+        vi = vec_vsx_ld(0, &src[i]);
+        vi = vec_adds(ditherleft, vi);
+        vileft = vec_sra(vi, shifts);
+
+        vi = vec_vsx_ld(0, &src[i + 8]);
+        vi = vec_adds(ditherright, vi);
+        vi = vec_sra(vi, shifts);
+
+        vd = vec_packsu(vileft, vi);
+        vec_st(vd, 0, &dest[i]);
+    }
+
+    yuv2plane1_8_u(src, dest, dstW, dither, offset, i);
+}
+
 #endif /* HAVE_ALTIVEC */
 
 av_cold void ff_sws_init_swscale_ppc(SwsContext *c)
@@ -367,6 +414,14 @@  av_cold void ff_sws_init_swscale_ppc(SwsContext *c)
             c->yuv2packedX = ff_yuv2rgb24_X_altivec;
             break;
         }
+
+        switch (c->dstBpc) {
+        case 8:
+#if !HAVE_BIGENDIAN
+            c->yuv2plane1 = yuv2plane1_8_altivec;
+            break;
+#endif /* !HAVE_BIGENDIAN */
+        }
     }
 #endif /* HAVE_ALTIVEC */
 }