diff mbox

[FFmpeg-devel,v3] libswscale/ppc: VSX-optimize 9-16 bit yuv2planeX

Message ID 20190108111156.0503b5c1485589c8407f7b73@gmx.com
State Superseded
Headers show

Commit Message

Lauri Kasanen Jan. 8, 2019, 9:11 a.m. UTC
./ffmpeg_g -f rawvideo -pix_fmt rgb24 -s hd1080 -i /dev/zero -pix_fmt yuv420p16be \
-s 1920x1728 -f null -vframes 100 -v error -nostats -

The 9-14 bit functions get about a 6x speedup, and the 16-bit function gets about 15x.
FATE passes, and each format was tested with an image-to-video conversion.

Only POWER8 includes 32-bit vector multiplies, so POWER7 is locked out
of the 16-bit function. This applies to the 32-bit forms of
vec_mulo/vec_mule as well, not just vmuluwm.

yuv420p9le
  12341 UNITS in planarX,  130976 runs,     96 skips
  73752 UNITS in planarX,  131066 runs,      6 skips
yuv420p9be
  12364 UNITS in planarX,  131025 runs,     47 skips
  73001 UNITS in planarX,  131055 runs,     17 skips
yuv420p10le
  12386 UNITS in planarX,  131042 runs,     30 skips
  72735 UNITS in planarX,  131062 runs,     10 skips
yuv420p10be
  12337 UNITS in planarX,  131045 runs,     27 skips
  72734 UNITS in planarX,  131057 runs,     15 skips
yuv420p12le
  12236 UNITS in planarX,  131058 runs,     14 skips
  73029 UNITS in planarX,  131062 runs,     10 skips
yuv420p12be
  12218 UNITS in planarX,  130973 runs,     99 skips
  72402 UNITS in planarX,  131069 runs,      3 skips
yuv420p14le
  12168 UNITS in planarX,  131067 runs,      5 skips
  72480 UNITS in planarX,  131069 runs,      3 skips
yuv420p14be
  12358 UNITS in planarX,  130948 runs,    124 skips
  73772 UNITS in planarX,  131063 runs,      9 skips
yuv420p16le
  10439 UNITS in planarX,  130911 runs,    161 skips
 157923 UNITS in planarX,  131068 runs,      4 skips
yuv420p16be
  10463 UNITS in planarX,  130874 runs,    198 skips
 154405 UNITS in planarX,  131061 runs,     11 skips

Signed-off-by: Lauri Kasanen <cand@gmx.com>
---

v2: Separate macros so that yuv2plane1_16_vsx remains available for power7
v3: Remove accidental tabs, switch to HAVE_POWER8 from configure + runtime check

As far as I can tell, for HAVE_POWER8 to be defined, -march has to be at least
power8, meaning that with the current setup such a binary wouldn't run on POWER7.
However, using the configure define lets it be disabled in configure, as Michael
pointed out, and having the runtime check doesn't hurt (it allows for future
splits like on x86, where one binary can run on an older CPU but use a newer ISA
if available).

 libswscale/ppc/swscale_ppc_template.c |   4 +-
 libswscale/ppc/swscale_vsx.c          | 195 +++++++++++++++++++++++++++++++++-
 2 files changed, 193 insertions(+), 6 deletions(-)

Comments

Carl Eugen Hoyos Jan. 9, 2019, 9:26 p.m. UTC | #1
2019-01-08 10:11 GMT+01:00, Lauri Kasanen <cand@gmx.com>:
> ./ffmpeg_g -f rawvideo -pix_fmt rgb24 -s hd1080 -i /dev/zero -pix_fmt
> yuv420p16be \
> -s 1920x1728 -f null -vframes 100 -v error -nostats -
>
> 9-14 bit funcs get about 6x speedup, 16-bit gets about 15x.
> Fate passes, each format tested with an image to video conversion.
>
> Only POWER8 includes 32-bit vector multiplies, so POWER7 is locked out
> of the 16-bit function. This includes the vec_mulo/mule functions too,
> not just vmuluwm.
>
> yuv420p9le
>   12341 UNITS in planarX,  130976 runs,     96 skips
>   73752 UNITS in planarX,  131066 runs,      6 skips
> yuv420p9be
>   12364 UNITS in planarX,  131025 runs,     47 skips
>   73001 UNITS in planarX,  131055 runs,     17 skips
> yuv420p10le
>   12386 UNITS in planarX,  131042 runs,     30 skips
>   72735 UNITS in planarX,  131062 runs,     10 skips
> yuv420p10be
>   12337 UNITS in planarX,  131045 runs,     27 skips
>   72734 UNITS in planarX,  131057 runs,     15 skips
> yuv420p12le
>   12236 UNITS in planarX,  131058 runs,     14 skips
>   73029 UNITS in planarX,  131062 runs,     10 skips
> yuv420p12be
>   12218 UNITS in planarX,  130973 runs,     99 skips
>   72402 UNITS in planarX,  131069 runs,      3 skips
> yuv420p14le
>   12168 UNITS in planarX,  131067 runs,      5 skips
>   72480 UNITS in planarX,  131069 runs,      3 skips
> yuv420p14be
>   12358 UNITS in planarX,  130948 runs,    124 skips
>   73772 UNITS in planarX,  131063 runs,      9 skips
> yuv420p16le
>   10439 UNITS in planarX,  130911 runs,    161 skips
>  157923 UNITS in planarX,  131068 runs,      4 skips
> yuv420p16be
>   10463 UNITS in planarX,  130874 runs,    198 skips
>  154405 UNITS in planarX,  131061 runs,     11 skips
>
> Signed-off-by: Lauri Kasanen <cand@gmx.com>
> ---
>
> v2: Separate macros so that yuv2plane1_16_vsx remains available for power7
> v3: Remove accidental tabs, switch to HAVE_POWER8 from configure + runtime
> check
>
> As far as I can tell, for HAVE_POWER8 to be defined, -march has to be at
> least
> power8, meaning with the current setup such a binary wouldn't run on POWER7.
> However using the configure define lets it be disabled in configure like
> Michael
> pointed out, and having the runtime check doesn't hurt any (it allows for
> future
> splits like on x86, where one binary can run on low cpu but use higher ISA
> if
> available).
>
>  libswscale/ppc/swscale_ppc_template.c |   4 +-
>  libswscale/ppc/swscale_vsx.c          | 195
> +++++++++++++++++++++++++++++++++-
>  2 files changed, 193 insertions(+), 6 deletions(-)
>
> diff --git a/libswscale/ppc/swscale_ppc_template.c
> b/libswscale/ppc/swscale_ppc_template.c
> index 00e4b99..11decab 100644
> --- a/libswscale/ppc/swscale_ppc_template.c
> +++ b/libswscale/ppc/swscale_ppc_template.c
> @@ -21,7 +21,7 @@
>   * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301
> USA
>   */
>
> -static void FUNC(yuv2planeX_16)(const int16_t *filter, int filterSize,
> +static void FUNC(yuv2planeX_8_16)(const int16_t *filter, int filterSize,
>                                    const int16_t **src, uint8_t *dest,
>                                    const uint8_t *dither, int offset, int x)
>  {
> @@ -88,7 +88,7 @@ static void FUNC(yuv2planeX)(const int16_t *filter, int
> filterSize,
>      yuv2planeX_u(filter, filterSize, src, dest, dst_u, dither, offset, 0);
>
>      for (i = dst_u; i < dstW - 15; i += 16)
> -        FUNC(yuv2planeX_16)(filter, filterSize, src, dest + i, dither,
> +        FUNC(yuv2planeX_8_16)(filter, filterSize, src, dest + i, dither,
>                                offset, i);
>
>      yuv2planeX_u(filter, filterSize, src, dest, dstW, dither, offset, i);
> diff --git a/libswscale/ppc/swscale_vsx.c b/libswscale/ppc/swscale_vsx.c
> index 70da6ae..77680f8 100644
> --- a/libswscale/ppc/swscale_vsx.c
> +++ b/libswscale/ppc/swscale_vsx.c
> @@ -83,6 +83,8 @@
>  #include "swscale_ppc_template.c"
>  #undef FUNC
>
> +#undef vzero
> +
>  #endif /* !HAVE_BIGENDIAN */
>
>  static void yuv2plane1_8_u(const int16_t *src, uint8_t *dest, int dstW,
> @@ -180,6 +182,76 @@ static void yuv2plane1_nbps_vsx(const int16_t *src,
> uint16_t *dest, int dstW,
>      yuv2plane1_nbps_u(src, dest, dstW, big_endian, output_bits, i);
>  }
>
> +static void yuv2planeX_nbps_u(const int16_t *filter, int filterSize,
> +                              const int16_t **src, uint16_t *dest, int
> dstW,
> +                              int big_endian, int output_bits, int start)
> +{
> +    int i;
> +    int shift = 11 + 16 - output_bits;
> +
> +    for (i = start; i < dstW; i++) {
> +        int val = 1 << (shift - 1);
> +        int j;
> +
> +        for (j = 0; j < filterSize; j++)
> +            val += src[j][i] * filter[j];
> +
> +        output_pixel(&dest[i], val);
> +    }
> +}
> +
> +static void yuv2planeX_nbps_vsx(const int16_t *filter, int filterSize,
> +                                const int16_t **src, uint16_t *dest, int
> dstW,
> +                                int big_endian, int output_bits)
> +{
> +    const int dst_u = -(uintptr_t)dest & 7;
> +    const int shift = 11 + 16 - output_bits;
> +    const int add = (1 << (shift - 1));
> +    const int clip = (1 << output_bits) - 1;
> +    const uint16_t swap = big_endian ? 8 : 0;
> +    const vector uint32_t vadd = (vector uint32_t) {add, add, add, add};
> +    const vector uint32_t vshift = (vector uint32_t) {shift, shift, shift,
> shift};
> +    const vector uint16_t vswap = (vector uint16_t) {swap, swap, swap,
> swap, swap, swap, swap, swap};
> +    const vector uint16_t vlargest = (vector uint16_t) {clip, clip, clip,
> clip, clip, clip, clip, clip};
> +    const vector int16_t vzero = vec_splat_s16(0);
> +    const vector uint8_t vperm = (vector uint8_t) {0, 1, 8, 9, 2, 3, 10,
> 11, 4, 5, 12, 13, 6, 7, 14, 15};
> +    vector int16_t vfilter[MAX_FILTER_SIZE], vin;
> +    vector uint16_t v;
> +    vector uint32_t vleft, vright, vtmp;
> +    int i, j;
> +
> +    for (i = 0; i < filterSize; i++) {
> +        vfilter[i] = (vector int16_t) {filter[i], filter[i], filter[i],
> filter[i],
> +                                       filter[i], filter[i], filter[i],
> filter[i]};
> +    }
> +
> +    yuv2planeX_nbps_u(filter, filterSize, src, dest, dst_u, big_endian,
> output_bits, 0);
> +
> +    for (i = dst_u; i < dstW - 7; i += 8) {
> +        vleft = vright = vadd;
> +
> +        for (j = 0; j < filterSize; j++) {
> +            vin = vec_vsx_ld(0, &src[j][i]);
> +            vtmp = (vector uint32_t) vec_mule(vin, vfilter[j]);
> +            vleft = vec_add(vleft, vtmp);
> +            vtmp = (vector uint32_t) vec_mulo(vin, vfilter[j]);
> +            vright = vec_add(vright, vtmp);
> +        }
> +
> +        vleft = vec_sra(vleft, vshift);
> +        vright = vec_sra(vright, vshift);
> +        v = vec_packsu(vleft, vright);
> +        v = (vector uint16_t) vec_max((vector int16_t) v, vzero);
> +        v = vec_min(v, vlargest);
> +        v = vec_rl(v, vswap);
> +        v = vec_perm(v, v, vperm);
> +        vec_st(v, 0, &dest[i]);
> +    }
> +
> +    yuv2planeX_nbps_u(filter, filterSize, src, dest, dstW, big_endian,
> output_bits, i);
> +}
> +
> +
>  #undef output_pixel
>
>  #define output_pixel(pos, val, bias, signedness) \
> @@ -234,7 +306,97 @@ static void yuv2plane1_16_vsx(const int32_t *src,
> uint16_t *dest, int dstW,
>      yuv2plane1_16_u(src, dest, dstW, big_endian, output_bits, i);
>  }
>
> +#ifdef HAVE_POWER8
> +
> +static void yuv2planeX_16_u(const int16_t *filter, int filterSize,
> +                            const int32_t **src, uint16_t *dest, int dstW,
> +                            int big_endian, int output_bits, int start)
> +{
> +    int i;
> +    int shift = 15;
> +
> +    for (i = start; i < dstW; i++) {
> +        int val = 1 << (shift - 1);
> +        int j;
> +
> +        /* range of val is [0,0x7FFFFFFF], so 31 bits, but with
> lanczos/spline
> +         * filters (or anything with negative coeffs, the range can be
> slightly
> +         * wider in both directions. To account for this overflow, we
> subtract
> +         * a constant so it always fits in the signed range (assuming a
> +         * reasonable filterSize), and re-add that at the end. */
> +        val -= 0x40000000;
> +        for (j = 0; j < filterSize; j++)
> +            val += src[j][i] * (unsigned)filter[j];
> +
> +        output_pixel(&dest[i], val, 0x8000, int);
> +    }
> +}
> +
> +static void yuv2planeX_16_vsx(const int16_t *filter, int filterSize,
> +                              const int32_t **src, uint16_t *dest, int
> dstW,
> +                              int big_endian, int output_bits)
> +{
> +    const int dst_u = -(uintptr_t)dest & 7;
> +    const int shift = 15;
> +    const int bias = 0x8000;
> +    const int add = (1 << (shift - 1)) - 0x40000000;
> +    const uint16_t swap = big_endian ? 8 : 0;
> +    const vector uint32_t vadd = (vector uint32_t) {add, add, add, add};
> +    const vector uint32_t vshift = (vector uint32_t) {shift, shift, shift,
> shift};
> +    const vector uint16_t vswap = (vector uint16_t) {swap, swap, swap,
> swap, swap, swap, swap, swap};
> +    const vector uint16_t vbias = (vector uint16_t) {bias, bias, bias,
> bias, bias, bias, bias, bias};
> +    vector int32_t vfilter[MAX_FILTER_SIZE];
> +    vector uint16_t v;
> +    vector uint32_t vleft, vright, vtmp;
> +    vector int32_t vin32l, vin32r;
> +    int i, j;
> +
> +    for (i = 0; i < filterSize; i++) {
> +        vfilter[i] = (vector int32_t) {filter[i], filter[i], filter[i],
> filter[i]};
> +    }
> +
> +    yuv2planeX_16_u(filter, filterSize, src, dest, dst_u, big_endian,
> output_bits, 0);
> +
> +    for (i = dst_u; i < dstW - 7; i += 8) {
> +        vleft = vright = vadd;
> +
> +        for (j = 0; j < filterSize; j++) {
> +            vin32l = vec_vsx_ld(0, &src[j][i]);
> +            vin32r = vec_vsx_ld(0, &src[j][i + 4]);
> +

> +#ifdef __GNUC__
> +            // GCC does not support vmuluwm yet. Bug open.
> +            __asm__("vmuluwm %0, %1, %2" : "=v"(vtmp) : "v"(vin32l),
> "v"(vfilter[j]));
> +            vleft = vec_add(vleft, vtmp);
> +            __asm__("vmuluwm %0, %1, %2" : "=v"(vtmp) : "v"(vin32r),
> "v"(vfilter[j]));
> +            vright = vec_add(vright, vtmp);
> +#else
> +            // No idea which compilers this works in, untested. Copied from
> libsimdpp
> +            vtmp = vec_vmuluwm(vin32l, vfilter[j]);
> +            vleft = vec_add(vleft, vtmp);
> +            vtmp = vec_vmuluwm(vin32r, vfilter[j]);
> +            vright = vec_add(vright, vtmp);
> +#endif

Is there no xlc installed on your test system?
I suspect an earlier patch from you already
broke xlc compilation...

Carl Eugen
Carl Eugen Hoyos Jan. 9, 2019, 10:06 p.m. UTC | #2
2019-01-09 22:26 GMT+01:00, Carl Eugen Hoyos <ceffmpeg@gmail.com>:
> 2019-01-08 10:11 GMT+01:00, Lauri Kasanen <cand@gmx.com>:
>> ./ffmpeg_g -f rawvideo -pix_fmt rgb24 -s hd1080 -i /dev/zero -pix_fmt
>> yuv420p16be \
>> -s 1920x1728 -f null -vframes 100 -v error -nostats -
>>
>> 9-14 bit funcs get about 6x speedup, 16-bit gets about 15x.
>> Fate passes, each format tested with an image to video conversion.
>>
>> Only POWER8 includes 32-bit vector multiplies, so POWER7 is locked out
>> of the 16-bit function. This includes the vec_mulo/mule functions too,
>> not just vmuluwm.
>>
>> yuv420p9le
>>   12341 UNITS in planarX,  130976 runs,     96 skips
>>   73752 UNITS in planarX,  131066 runs,      6 skips
>> yuv420p9be
>>   12364 UNITS in planarX,  131025 runs,     47 skips
>>   73001 UNITS in planarX,  131055 runs,     17 skips
>> yuv420p10le
>>   12386 UNITS in planarX,  131042 runs,     30 skips
>>   72735 UNITS in planarX,  131062 runs,     10 skips
>> yuv420p10be
>>   12337 UNITS in planarX,  131045 runs,     27 skips
>>   72734 UNITS in planarX,  131057 runs,     15 skips
>> yuv420p12le
>>   12236 UNITS in planarX,  131058 runs,     14 skips
>>   73029 UNITS in planarX,  131062 runs,     10 skips
>> yuv420p12be
>>   12218 UNITS in planarX,  130973 runs,     99 skips
>>   72402 UNITS in planarX,  131069 runs,      3 skips
>> yuv420p14le
>>   12168 UNITS in planarX,  131067 runs,      5 skips
>>   72480 UNITS in planarX,  131069 runs,      3 skips
>> yuv420p14be
>>   12358 UNITS in planarX,  130948 runs,    124 skips
>>   73772 UNITS in planarX,  131063 runs,      9 skips
>> yuv420p16le
>>   10439 UNITS in planarX,  130911 runs,    161 skips
>>  157923 UNITS in planarX,  131068 runs,      4 skips
>> yuv420p16be
>>   10463 UNITS in planarX,  130874 runs,    198 skips
>>  154405 UNITS in planarX,  131061 runs,     11 skips
>>
>> Signed-off-by: Lauri Kasanen <cand@gmx.com>
>> ---
>>
>> v2: Separate macros so that yuv2plane1_16_vsx remains available for
>> power7
>> v3: Remove accidental tabs, switch to HAVE_POWER8 from configure +
>> runtime
>> check
>>
>> As far as I can tell, for HAVE_POWER8 to be defined, -march has to be at
>> least
>> power8, meaning with the current setup such a binary wouldn't run on
>> POWER7.
>> However using the configure define lets it be disabled in configure like
>> Michael
>> pointed out, and having the runtime check doesn't hurt any (it allows for
>> future
>> splits like on x86, where one binary can run on low cpu but use higher
>> ISA
>> if
>> available).
>>
>>  libswscale/ppc/swscale_ppc_template.c |   4 +-
>>  libswscale/ppc/swscale_vsx.c          | 195
>> +++++++++++++++++++++++++++++++++-
>>  2 files changed, 193 insertions(+), 6 deletions(-)
>>
>> diff --git a/libswscale/ppc/swscale_ppc_template.c
>> b/libswscale/ppc/swscale_ppc_template.c
>> index 00e4b99..11decab 100644
>> --- a/libswscale/ppc/swscale_ppc_template.c
>> +++ b/libswscale/ppc/swscale_ppc_template.c
>> @@ -21,7 +21,7 @@
>>   * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
>> 02110-1301
>> USA
>>   */
>>
>> -static void FUNC(yuv2planeX_16)(const int16_t *filter, int filterSize,
>> +static void FUNC(yuv2planeX_8_16)(const int16_t *filter, int filterSize,
>>                                    const int16_t **src, uint8_t *dest,
>>                                    const uint8_t *dither, int offset, int
>> x)
>>  {
>> @@ -88,7 +88,7 @@ static void FUNC(yuv2planeX)(const int16_t *filter, int
>> filterSize,
>>      yuv2planeX_u(filter, filterSize, src, dest, dst_u, dither, offset,
>> 0);
>>
>>      for (i = dst_u; i < dstW - 15; i += 16)
>> -        FUNC(yuv2planeX_16)(filter, filterSize, src, dest + i, dither,
>> +        FUNC(yuv2planeX_8_16)(filter, filterSize, src, dest + i, dither,
>>                                offset, i);
>>
>>      yuv2planeX_u(filter, filterSize, src, dest, dstW, dither, offset,
>> i);
>> diff --git a/libswscale/ppc/swscale_vsx.c b/libswscale/ppc/swscale_vsx.c
>> index 70da6ae..77680f8 100644
>> --- a/libswscale/ppc/swscale_vsx.c
>> +++ b/libswscale/ppc/swscale_vsx.c
>> @@ -83,6 +83,8 @@
>>  #include "swscale_ppc_template.c"
>>  #undef FUNC
>>
>> +#undef vzero
>> +
>>  #endif /* !HAVE_BIGENDIAN */
>>
>>  static void yuv2plane1_8_u(const int16_t *src, uint8_t *dest, int dstW,
>> @@ -180,6 +182,76 @@ static void yuv2plane1_nbps_vsx(const int16_t *src,
>> uint16_t *dest, int dstW,
>>      yuv2plane1_nbps_u(src, dest, dstW, big_endian, output_bits, i);
>>  }
>>
>> +static void yuv2planeX_nbps_u(const int16_t *filter, int filterSize,
>> +                              const int16_t **src, uint16_t *dest, int
>> dstW,
>> +                              int big_endian, int output_bits, int
>> start)
>> +{
>> +    int i;
>> +    int shift = 11 + 16 - output_bits;
>> +
>> +    for (i = start; i < dstW; i++) {
>> +        int val = 1 << (shift - 1);
>> +        int j;
>> +
>> +        for (j = 0; j < filterSize; j++)
>> +            val += src[j][i] * filter[j];
>> +
>> +        output_pixel(&dest[i], val);
>> +    }
>> +}
>> +
>> +static void yuv2planeX_nbps_vsx(const int16_t *filter, int filterSize,
>> +                                const int16_t **src, uint16_t *dest, int
>> dstW,
>> +                                int big_endian, int output_bits)
>> +{
>> +    const int dst_u = -(uintptr_t)dest & 7;
>> +    const int shift = 11 + 16 - output_bits;
>> +    const int add = (1 << (shift - 1));
>> +    const int clip = (1 << output_bits) - 1;
>> +    const uint16_t swap = big_endian ? 8 : 0;
>> +    const vector uint32_t vadd = (vector uint32_t) {add, add, add, add};
>> +    const vector uint32_t vshift = (vector uint32_t) {shift, shift,
>> shift,
>> shift};
>> +    const vector uint16_t vswap = (vector uint16_t) {swap, swap, swap,
>> swap, swap, swap, swap, swap};
>> +    const vector uint16_t vlargest = (vector uint16_t) {clip, clip,
>> clip,
>> clip, clip, clip, clip, clip};
>> +    const vector int16_t vzero = vec_splat_s16(0);
>> +    const vector uint8_t vperm = (vector uint8_t) {0, 1, 8, 9, 2, 3, 10,
>> 11, 4, 5, 12, 13, 6, 7, 14, 15};
>> +    vector int16_t vfilter[MAX_FILTER_SIZE], vin;
>> +    vector uint16_t v;
>> +    vector uint32_t vleft, vright, vtmp;
>> +    int i, j;
>> +
>> +    for (i = 0; i < filterSize; i++) {
>> +        vfilter[i] = (vector int16_t) {filter[i], filter[i], filter[i],
>> filter[i],
>> +                                       filter[i], filter[i], filter[i],
>> filter[i]};
>> +    }
>> +
>> +    yuv2planeX_nbps_u(filter, filterSize, src, dest, dst_u, big_endian,
>> output_bits, 0);
>> +
>> +    for (i = dst_u; i < dstW - 7; i += 8) {
>> +        vleft = vright = vadd;
>> +
>> +        for (j = 0; j < filterSize; j++) {
>> +            vin = vec_vsx_ld(0, &src[j][i]);
>> +            vtmp = (vector uint32_t) vec_mule(vin, vfilter[j]);
>> +            vleft = vec_add(vleft, vtmp);
>> +            vtmp = (vector uint32_t) vec_mulo(vin, vfilter[j]);
>> +            vright = vec_add(vright, vtmp);
>> +        }
>> +
>> +        vleft = vec_sra(vleft, vshift);
>> +        vright = vec_sra(vright, vshift);
>> +        v = vec_packsu(vleft, vright);
>> +        v = (vector uint16_t) vec_max((vector int16_t) v, vzero);
>> +        v = vec_min(v, vlargest);
>> +        v = vec_rl(v, vswap);
>> +        v = vec_perm(v, v, vperm);
>> +        vec_st(v, 0, &dest[i]);
>> +    }
>> +
>> +    yuv2planeX_nbps_u(filter, filterSize, src, dest, dstW, big_endian,
>> output_bits, i);
>> +}
>> +
>> +
>>  #undef output_pixel
>>
>>  #define output_pixel(pos, val, bias, signedness) \
>> @@ -234,7 +306,97 @@ static void yuv2plane1_16_vsx(const int32_t *src,
>> uint16_t *dest, int dstW,
>>      yuv2plane1_16_u(src, dest, dstW, big_endian, output_bits, i);
>>  }
>>
>> +#ifdef HAVE_POWER8
>> +
>> +static void yuv2planeX_16_u(const int16_t *filter, int filterSize,
>> +                            const int32_t **src, uint16_t *dest, int
>> dstW,
>> +                            int big_endian, int output_bits, int start)
>> +{
>> +    int i;
>> +    int shift = 15;
>> +
>> +    for (i = start; i < dstW; i++) {
>> +        int val = 1 << (shift - 1);
>> +        int j;
>> +
>> +        /* range of val is [0,0x7FFFFFFF], so 31 bits, but with
>> lanczos/spline
>> +         * filters (or anything with negative coeffs, the range can be
>> slightly
>> +         * wider in both directions. To account for this overflow, we
>> subtract
>> +         * a constant so it always fits in the signed range (assuming a
>> +         * reasonable filterSize), and re-add that at the end. */
>> +        val -= 0x40000000;
>> +        for (j = 0; j < filterSize; j++)
>> +            val += src[j][i] * (unsigned)filter[j];
>> +
>> +        output_pixel(&dest[i], val, 0x8000, int);
>> +    }
>> +}
>> +
>> +static void yuv2planeX_16_vsx(const int16_t *filter, int filterSize,
>> +                              const int32_t **src, uint16_t *dest, int
>> dstW,
>> +                              int big_endian, int output_bits)
>> +{
>> +    const int dst_u = -(uintptr_t)dest & 7;
>> +    const int shift = 15;
>> +    const int bias = 0x8000;
>> +    const int add = (1 << (shift - 1)) - 0x40000000;
>> +    const uint16_t swap = big_endian ? 8 : 0;
>> +    const vector uint32_t vadd = (vector uint32_t) {add, add, add, add};
>> +    const vector uint32_t vshift = (vector uint32_t) {shift, shift,
>> shift,
>> shift};
>> +    const vector uint16_t vswap = (vector uint16_t) {swap, swap, swap,
>> swap, swap, swap, swap, swap};
>> +    const vector uint16_t vbias = (vector uint16_t) {bias, bias, bias,
>> bias, bias, bias, bias, bias};
>> +    vector int32_t vfilter[MAX_FILTER_SIZE];
>> +    vector uint16_t v;
>> +    vector uint32_t vleft, vright, vtmp;
>> +    vector int32_t vin32l, vin32r;
>> +    int i, j;
>> +
>> +    for (i = 0; i < filterSize; i++) {
>> +        vfilter[i] = (vector int32_t) {filter[i], filter[i], filter[i],
>> filter[i]};
>> +    }
>> +
>> +    yuv2planeX_16_u(filter, filterSize, src, dest, dst_u, big_endian,
>> output_bits, 0);
>> +
>> +    for (i = dst_u; i < dstW - 7; i += 8) {
>> +        vleft = vright = vadd;
>> +
>> +        for (j = 0; j < filterSize; j++) {
>> +            vin32l = vec_vsx_ld(0, &src[j][i]);
>> +            vin32r = vec_vsx_ld(0, &src[j][i + 4]);
>> +
>
>> +#ifdef __GNUC__
>> +            // GCC does not support vmuluwm yet. Bug open.
>> +            __asm__("vmuluwm %0, %1, %2" : "=v"(vtmp) : "v"(vin32l),
>> "v"(vfilter[j]));
>> +            vleft = vec_add(vleft, vtmp);
>> +            __asm__("vmuluwm %0, %1, %2" : "=v"(vtmp) : "v"(vin32r),
>> "v"(vfilter[j]));
>> +            vright = vec_add(vright, vtmp);
>> +#else
>> +            // No idea which compilers this works in, untested. Copied
>> from
>> libsimdpp
>> +            vtmp = vec_vmuluwm(vin32l, vfilter[j]);
>> +            vleft = vec_add(vleft, vtmp);
>> +            vtmp = vec_vmuluwm(vin32r, vfilter[j]);
>> +            vright = vec_add(vright, vtmp);
>> +#endif
>
> Is there no xlc installed on your test system?
> I suspect an earlier patch from you already
> broke xlc compilation...

46c5693ea3a9364e24e2f5336bcdb5b191a2329f is the first bad commit

Testing this version:
$ make libswscale/ppc/swscale_altivec.o
CC      libswscale/ppc/swscale_altivec.o
warning: 1540-5200 The option "-fomit-frame-pointer" is not supported.
warning: 1540-5200 The option "-mabi=altivec" is not supported.
warning: 1540-5200 The option "-mvsx" is not supported.
warning: 1540-5200 The option "-fno-math-errno" is not supported.
warning: 1540-5200 The option "-fno-signed-zeros" is not supported.
In file included from src/libswscale/ppc/swscale_altivec.c:28:
src/libswscale/swscale_internal.h:641:1: warning: unknown attribute
'cold' ignored [-Wunknown-attributes]
av_cold void ff_sws_init_range_convert(SwsContext *c);
^
src/libavutil/attributes.h:82:36: note: expanded from macro 'av_cold'
#    define av_cold __attribute__((cold))
                                   ^
src/libswscale/ppc/swscale_altivec.c:104:30: warning: unused variable
'perm' [-Wunused-variable]
        vector unsigned char perm;
                             ^
src/libswscale/ppc/swscale_altivec.c:245:46: warning: unused variable
'src_v0' [-Wunused-variable]
                vector unsigned char src_vF, src_v0, src_v1;
                                             ^
src/libswscale/ppc/swscale_altivec.c:245:54: warning: unused variable
'src_v1' [-Wunused-variable]
                vector unsigned char src_vF, src_v0, src_v1;
                                                     ^
src/libswscale/ppc/swscale_altivec.c:246:38: warning: unused variable
'permS' [-Wunused-variable]
                vector unsigned char permS;
                                     ^
src/libswscale/ppc/swscale_altivec.c:295:42: warning: unused variable
'src_v1' [-Wunused-variable]
                    vector unsigned char src_v1, src_vF;
                                         ^
src/libswscale/ppc/swscale_altivec.c:296:41: warning: unused variable
'filter_v1R' [-Wunused-variable]
                    vector signed short filter_v1R, filter_v2R,
filter_v0, filter_v1;
                                        ^
src/libswscale/ppc/swscale_altivec.c:296:53: warning: unused variable
'filter_v2R' [-Wunused-variable]
                    vector signed short filter_v1R, filter_v2R,
filter_v0, filter_v1;
                                                    ^
src/libswscale/ppc/swscale_altivec.c:312:42: warning: unused variable
'src_v1' [-Wunused-variable]
                    vector unsigned char src_v1, src_vF;
                                         ^
src/libswscale/ppc/swscale_altivec.c:313:48: warning: unused variable
'filter_v1R' [-Wunused-variable]
                    vector signed short src_v, filter_v1R, filter_v;
                                               ^
src/libswscale/ppc/swscale_altivec.c:285:33: warning: unused variable
'offset' [-Wunused-variable]
                register int j, offset = i * 2 * filterSize;
                                ^
src/libswscale/ppc/swscale_altivec.c:289:37: warning: unused variable
'filter_v0R' [-Wunused-variable]
                vector signed short filter_v0R;
                                    ^
src/libswscale/ppc/swscale_altivec.c:290:38: warning: unused variable
'permF' [-Wunused-variable]
                vector unsigned char permF, src_v0, permS;
                                     ^
src/libswscale/ppc/swscale_altivec.c:290:45: warning: unused variable
'src_v0' [-Wunused-variable]
                vector unsigned char permF, src_v0, permS;
                                            ^
src/libswscale/ppc/swscale_altivec.c:290:53: warning: unused variable
'permS' [-Wunused-variable]
                vector unsigned char permF, src_v0, permS;
                                                    ^
src/libswscale/ppc/swscale_altivec.c:344:11: error: unknown type name 'vector'
    const vector uint16_t shifts = (vector uint16_t) {7, 7, 7, 7, 7, 7, 7, 7};

Carl Eugen
Michael Niedermayer Jan. 10, 2019, 12:31 a.m. UTC | #3
On Tue, Jan 08, 2019 at 11:11:56AM +0200, Lauri Kasanen wrote:
> ./ffmpeg_g -f rawvideo -pix_fmt rgb24 -s hd1080 -i /dev/zero -pix_fmt yuv420p16be \
> -s 1920x1728 -f null -vframes 100 -v error -nostats -
> 
> 9-14 bit funcs get about 6x speedup, 16-bit gets about 15x.
> Fate passes, each format tested with an image to video conversion.
> 
> Only POWER8 includes 32-bit vector multiplies, so POWER7 is locked out
> of the 16-bit function. This includes the vec_mulo/mule functions too,
> not just vmuluwm.
> 
> yuv420p9le
>   12341 UNITS in planarX,  130976 runs,     96 skips
>   73752 UNITS in planarX,  131066 runs,      6 skips
> yuv420p9be
>   12364 UNITS in planarX,  131025 runs,     47 skips
>   73001 UNITS in planarX,  131055 runs,     17 skips
> yuv420p10le
>   12386 UNITS in planarX,  131042 runs,     30 skips
>   72735 UNITS in planarX,  131062 runs,     10 skips
> yuv420p10be
>   12337 UNITS in planarX,  131045 runs,     27 skips
>   72734 UNITS in planarX,  131057 runs,     15 skips
> yuv420p12le
>   12236 UNITS in planarX,  131058 runs,     14 skips
>   73029 UNITS in planarX,  131062 runs,     10 skips
> yuv420p12be
>   12218 UNITS in planarX,  130973 runs,     99 skips
>   72402 UNITS in planarX,  131069 runs,      3 skips
> yuv420p14le
>   12168 UNITS in planarX,  131067 runs,      5 skips
>   72480 UNITS in planarX,  131069 runs,      3 skips
> yuv420p14be
>   12358 UNITS in planarX,  130948 runs,    124 skips
>   73772 UNITS in planarX,  131063 runs,      9 skips
> yuv420p16le
>   10439 UNITS in planarX,  130911 runs,    161 skips
>  157923 UNITS in planarX,  131068 runs,      4 skips
> yuv420p16be
>   10463 UNITS in planarX,  130874 runs,    198 skips
>  154405 UNITS in planarX,  131061 runs,     11 skips
> 
> Signed-off-by: Lauri Kasanen <cand@gmx.com>
> ---
> 
> v2: Separate macros so that yuv2plane1_16_vsx remains available for power7
> v3: Remove accidental tabs, switch to HAVE_POWER8 from configure + runtime check
> 
> As far as I can tell, for HAVE_POWER8 to be defined, -march has to be at least
> power8, meaning with the current setup such a binary wouldn't run on POWER7.
> However using the configure define lets it be disabled in configure like Michael
> pointed out, and having the runtime check doesn't hurt any (it allows for future
> splits like on x86, where one binary can run on low cpu but use higher ISA if
> available).
> 
>  libswscale/ppc/swscale_ppc_template.c |   4 +-
>  libswscale/ppc/swscale_vsx.c          | 195 +++++++++++++++++++++++++++++++++-
>  2 files changed, 193 insertions(+), 6 deletions(-)
> 
> diff --git a/libswscale/ppc/swscale_ppc_template.c b/libswscale/ppc/swscale_ppc_template.c
> index 00e4b99..11decab 100644
> --- a/libswscale/ppc/swscale_ppc_template.c
> +++ b/libswscale/ppc/swscale_ppc_template.c
> @@ -21,7 +21,7 @@
>   * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
>   */
>  
> -static void FUNC(yuv2planeX_16)(const int16_t *filter, int filterSize,
> +static void FUNC(yuv2planeX_8_16)(const int16_t *filter, int filterSize,
>                                    const int16_t **src, uint8_t *dest,
>                                    const uint8_t *dither, int offset, int x)
>  {
> @@ -88,7 +88,7 @@ static void FUNC(yuv2planeX)(const int16_t *filter, int filterSize,
>      yuv2planeX_u(filter, filterSize, src, dest, dst_u, dither, offset, 0);
>  
>      for (i = dst_u; i < dstW - 15; i += 16)
> -        FUNC(yuv2planeX_16)(filter, filterSize, src, dest + i, dither,
> +        FUNC(yuv2planeX_8_16)(filter, filterSize, src, dest + i, dither,
>                                offset, i);
>  
>      yuv2planeX_u(filter, filterSize, src, dest, dstW, dither, offset, i);
> diff --git a/libswscale/ppc/swscale_vsx.c b/libswscale/ppc/swscale_vsx.c
> index 70da6ae..77680f8 100644
> --- a/libswscale/ppc/swscale_vsx.c
> +++ b/libswscale/ppc/swscale_vsx.c
> @@ -83,6 +83,8 @@
>  #include "swscale_ppc_template.c"
>  #undef FUNC
>  
> +#undef vzero
> +
>  #endif /* !HAVE_BIGENDIAN */
>  
>  static void yuv2plane1_8_u(const int16_t *src, uint8_t *dest, int dstW,
> @@ -180,6 +182,76 @@ static void yuv2plane1_nbps_vsx(const int16_t *src, uint16_t *dest, int dstW,
>      yuv2plane1_nbps_u(src, dest, dstW, big_endian, output_bits, i);
>  }
>  
> +static void yuv2planeX_nbps_u(const int16_t *filter, int filterSize,
> +                              const int16_t **src, uint16_t *dest, int dstW,
> +                              int big_endian, int output_bits, int start)
> +{
> +    int i;
> +    int shift = 11 + 16 - output_bits;
> +
> +    for (i = start; i < dstW; i++) {
> +        int val = 1 << (shift - 1);
> +        int j;
> +
> +        for (j = 0; j < filterSize; j++)
> +            val += src[j][i] * filter[j];
> +
> +        output_pixel(&dest[i], val);
> +    }
> +}
> +
> +static void yuv2planeX_nbps_vsx(const int16_t *filter, int filterSize,
> +                                const int16_t **src, uint16_t *dest, int dstW,
> +                                int big_endian, int output_bits)
> +{
> +    const int dst_u = -(uintptr_t)dest & 7;
> +    const int shift = 11 + 16 - output_bits;
> +    const int add = (1 << (shift - 1));
> +    const int clip = (1 << output_bits) - 1;
> +    const uint16_t swap = big_endian ? 8 : 0;
> +    const vector uint32_t vadd = (vector uint32_t) {add, add, add, add};
> +    const vector uint32_t vshift = (vector uint32_t) {shift, shift, shift, shift};
> +    const vector uint16_t vswap = (vector uint16_t) {swap, swap, swap, swap, swap, swap, swap, swap};
> +    const vector uint16_t vlargest = (vector uint16_t) {clip, clip, clip, clip, clip, clip, clip, clip};
> +    const vector int16_t vzero = vec_splat_s16(0);
> +    const vector uint8_t vperm = (vector uint8_t) {0, 1, 8, 9, 2, 3, 10, 11, 4, 5, 12, 13, 6, 7, 14, 15};
> +    vector int16_t vfilter[MAX_FILTER_SIZE], vin;
> +    vector uint16_t v;
> +    vector uint32_t vleft, vright, vtmp;
> +    int i, j;
> +
> +    for (i = 0; i < filterSize; i++) {
> +        vfilter[i] = (vector int16_t) {filter[i], filter[i], filter[i], filter[i],
> +                                       filter[i], filter[i], filter[i], filter[i]};
> +    }
> +
> +    yuv2planeX_nbps_u(filter, filterSize, src, dest, dst_u, big_endian, output_bits, 0);
> +
> +    for (i = dst_u; i < dstW - 7; i += 8) {
> +        vleft = vright = vadd;
> +
> +        for (j = 0; j < filterSize; j++) {
> +            vin = vec_vsx_ld(0, &src[j][i]);
> +            vtmp = (vector uint32_t) vec_mule(vin, vfilter[j]);
> +            vleft = vec_add(vleft, vtmp);
> +            vtmp = (vector uint32_t) vec_mulo(vin, vfilter[j]);
> +            vright = vec_add(vright, vtmp);
> +        }
> +
> +        vleft = vec_sra(vleft, vshift);
> +        vright = vec_sra(vright, vshift);
> +        v = vec_packsu(vleft, vright);
> +        v = (vector uint16_t) vec_max((vector int16_t) v, vzero);
> +        v = vec_min(v, vlargest);
> +        v = vec_rl(v, vswap);
> +        v = vec_perm(v, v, vperm);
> +        vec_st(v, 0, &dest[i]);
> +    }
> +
> +    yuv2planeX_nbps_u(filter, filterSize, src, dest, dstW, big_endian, output_bits, i);
> +}
> +
> +
>  #undef output_pixel
>  
>  #define output_pixel(pos, val, bias, signedness) \
> @@ -234,7 +306,97 @@ static void yuv2plane1_16_vsx(const int32_t *src, uint16_t *dest, int dstW,
>      yuv2plane1_16_u(src, dest, dstW, big_endian, output_bits, i);
>  }
>  
> +#ifdef HAVE_POWER8

This should probably be #if rather than #ifdef; the same applies to the other HAVE_POWER8 checks.

[...]
Lauri Kasanen Jan. 10, 2019, 9:48 a.m. UTC | #4
On Wed, 9 Jan 2019 22:26:25 +0100
Carl Eugen Hoyos <ceffmpeg@gmail.com> wrote:

> > +#ifdef __GNUC__
> > +            // GCC does not support vmuluwm yet. Bug open.
> > +            __asm__("vmuluwm %0, %1, %2" : "=v"(vtmp) : "v"(vin32l),
> > "v"(vfilter[j]));
> > +            vleft = vec_add(vleft, vtmp);
> > +            __asm__("vmuluwm %0, %1, %2" : "=v"(vtmp) : "v"(vin32r),
> > "v"(vfilter[j]));
> > +            vright = vec_add(vright, vtmp);
> > +#else
> > +            // No idea which compilers this works in, untested. Copied from
> > libsimdpp
> > +            vtmp = vec_vmuluwm(vin32l, vfilter[j]);
> > +            vleft = vec_add(vleft, vtmp);
> > +            vtmp = vec_vmuluwm(vin32r, vfilter[j]);
> > +            vright = vec_add(vright, vtmp);
> > +#endif
> 
> Is there no xlc installed on your test system?
> I suspect an earlier patch from you already
> broke xlc compilation...

No, I don't really care about proprietary compilers. You reported
previously that xlc created invalid code anyway?

- Lauri
Carl Eugen Hoyos Jan. 10, 2019, 5:08 p.m. UTC | #5
2019-01-10 10:48 GMT+01:00, Lauri Kasanen <cand@gmx.com>:
> On Wed, 9 Jan 2019 22:26:25 +0100
> Carl Eugen Hoyos <ceffmpeg@gmail.com> wrote:
>
>> > +#ifdef __GNUC__
>> > +            // GCC does not support vmuluwm yet. Bug open.
>> > +            __asm__("vmuluwm %0, %1, %2" : "=v"(vtmp) : "v"(vin32l),
>> > "v"(vfilter[j]));
>> > +            vleft = vec_add(vleft, vtmp);
>> > +            __asm__("vmuluwm %0, %1, %2" : "=v"(vtmp) : "v"(vin32r),
>> > "v"(vfilter[j]));
>> > +            vright = vec_add(vright, vtmp);
>> > +#else
>> > +            // No idea which compilers this works in, untested. Copied
>> > from
>> > libsimdpp
>> > +            vtmp = vec_vmuluwm(vin32l, vfilter[j]);
>> > +            vleft = vec_add(vleft, vtmp);
>> > +            vtmp = vec_vmuluwm(vin32r, vfilter[j]);
>> > +            vright = vec_add(vright, vtmp);
>> > +#endif
>>
>> Is there no xlc installed on your test system?
>> I suspect an earlier patch from you already
>> broke xlc compilation...
>
> No, I don't really care about proprietary compilers.

Ok, just wondering which other compilers your comment
above meant...

> You reported previously that xlc created invalid code anyway?

True, I forgot.

Sorry, Carl Eugen
diff mbox

Patch

diff --git a/libswscale/ppc/swscale_ppc_template.c b/libswscale/ppc/swscale_ppc_template.c
index 00e4b99..11decab 100644
--- a/libswscale/ppc/swscale_ppc_template.c
+++ b/libswscale/ppc/swscale_ppc_template.c
@@ -21,7 +21,7 @@ 
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
-static void FUNC(yuv2planeX_16)(const int16_t *filter, int filterSize,
+static void FUNC(yuv2planeX_8_16)(const int16_t *filter, int filterSize,
                                   const int16_t **src, uint8_t *dest,
                                   const uint8_t *dither, int offset, int x)
 {
@@ -88,7 +88,7 @@  static void FUNC(yuv2planeX)(const int16_t *filter, int filterSize,
     yuv2planeX_u(filter, filterSize, src, dest, dst_u, dither, offset, 0);
 
     for (i = dst_u; i < dstW - 15; i += 16)
-        FUNC(yuv2planeX_16)(filter, filterSize, src, dest + i, dither,
+        FUNC(yuv2planeX_8_16)(filter, filterSize, src, dest + i, dither,
                               offset, i);
 
     yuv2planeX_u(filter, filterSize, src, dest, dstW, dither, offset, i);
diff --git a/libswscale/ppc/swscale_vsx.c b/libswscale/ppc/swscale_vsx.c
index 70da6ae..77680f8 100644
--- a/libswscale/ppc/swscale_vsx.c
+++ b/libswscale/ppc/swscale_vsx.c
@@ -83,6 +83,8 @@ 
 #include "swscale_ppc_template.c"
 #undef FUNC
 
+#undef vzero
+
 #endif /* !HAVE_BIGENDIAN */
 
 static void yuv2plane1_8_u(const int16_t *src, uint8_t *dest, int dstW,
@@ -180,6 +182,76 @@  static void yuv2plane1_nbps_vsx(const int16_t *src, uint16_t *dest, int dstW,
     yuv2plane1_nbps_u(src, dest, dstW, big_endian, output_bits, i);
 }
 
+static void yuv2planeX_nbps_u(const int16_t *filter, int filterSize, /* scalar multi-tap filter for pixels [start, dstW) */
+                              const int16_t **src, uint16_t *dest, int dstW,
+                              int big_endian, int output_bits, int start)
+{
+    int i;
+    int shift = 11 + 16 - output_bits; /* NOTE(review): presumably also read by name inside output_pixel() -- confirm */
+
+    for (i = start; i < dstW; i++) {
+        int val = 1 << (shift - 1); /* seed the sum with half of 1 << shift: rounding term if val is later shifted by shift */
+        int j;
+
+        for (j = 0; j < filterSize; j++)
+            val += src[j][i] * filter[j]; /* add tap j: source row j, column i, times its coefficient */
+
+        output_pixel(&dest[i], val); /* macro defined outside this hunk; big_endian/output_bits are not passed explicitly */
+    }
+}
+
+static void yuv2planeX_nbps_vsx(const int16_t *filter, int filterSize, /* VSX multi-tap filter, 8 output pixels per iteration */
+                                const int16_t **src, uint16_t *dest, int dstW,
+                                int big_endian, int output_bits)
+{
+    const int dst_u = (-(uintptr_t)dest & 15) >> 1; /* uint16_t pixels until dest is 16-byte aligned, as vec_st requires */
+    const int shift = 11 + 16 - output_bits;
+    const int add = (1 << (shift - 1)); /* rounding term added before the shift */
+    const int clip = (1 << output_bits) - 1; /* largest value representable in output_bits */
+    const uint16_t swap = big_endian ? 8 : 0; /* rotate count: 8 swaps the two bytes of each pixel */
+    const vector uint32_t vadd = (vector uint32_t) {add, add, add, add};
+    const vector uint32_t vshift = (vector uint32_t) {shift, shift, shift, shift};
+    const vector uint16_t vswap = (vector uint16_t) {swap, swap, swap, swap, swap, swap, swap, swap};
+    const vector uint16_t vlargest = (vector uint16_t) {clip, clip, clip, clip, clip, clip, clip, clip};
+    const vector int16_t vzero = vec_splat_s16(0);
+    const vector uint8_t vperm = (vector uint8_t) {0, 1, 8, 9, 2, 3, 10, 11, 4, 5, 12, 13, 6, 7, 14, 15}; /* interleaves 16-bit lanes 0-3 with lanes 4-7 */
+    vector int16_t vfilter[MAX_FILTER_SIZE], vin; /* NOTE(review): assumes filterSize <= MAX_FILTER_SIZE -- confirm */
+    vector uint16_t v;
+    vector uint32_t vleft, vright, vtmp;
+    int i, j;
+
+    for (i = 0; i < filterSize; i++) { /* splat every coefficient across a full vector once, up front */
+        vfilter[i] = (vector int16_t) {filter[i], filter[i], filter[i], filter[i],
+                                       filter[i], filter[i], filter[i], filter[i]};
+    }
+
+    yuv2planeX_nbps_u(filter, filterSize, src, dest, dst_u, big_endian, output_bits, 0); /* scalar head: pixels [0, dst_u) */
+
+    for (i = dst_u; i < dstW - 7; i += 8) {
+        vleft = vright = vadd; /* both accumulators start at the rounding term */
+
+        for (j = 0; j < filterSize; j++) {
+            vin = vec_vsx_ld(0, &src[j][i]); /* unaligned load of 8 source samples from row j */
+            vtmp = (vector uint32_t) vec_mule(vin, vfilter[j]); /* 32-bit products of the even lanes */
+            vleft = vec_add(vleft, vtmp);
+            vtmp = (vector uint32_t) vec_mulo(vin, vfilter[j]); /* 32-bit products of the odd lanes */
+            vright = vec_add(vright, vtmp);
+        }
+
+        vleft = vec_sra(vleft, vshift);
+        vright = vec_sra(vright, vshift);
+        v = vec_packsu(vleft, vright); /* narrow to 16 bit with unsigned saturation: left lanes first, then right */
+        v = (vector uint16_t) vec_max((vector int16_t) v, vzero); /* viewed as signed, lanes >= 0x8000 (e.g. saturated negative sums) become 0 */
+        v = vec_min(v, vlargest); /* clip to the output_bits maximum */
+        v = vec_rl(v, vswap); /* byte-swap each pixel when big-endian output is requested */
+        v = vec_perm(v, v, vperm); /* put the even/odd results back into pixel order */
+        vec_st(v, 0, &dest[i]); /* 16-byte store; relies on the alignment established by dst_u */
+    }
+
+    yuv2planeX_nbps_u(filter, filterSize, src, dest, dstW, big_endian, output_bits, i); /* scalar tail: pixels [i, dstW) */
+}
+
+
 #undef output_pixel
 
 #define output_pixel(pos, val, bias, signedness) \
@@ -234,7 +306,97 @@  static void yuv2plane1_16_vsx(const int32_t *src, uint16_t *dest, int dstW,
     yuv2plane1_16_u(src, dest, dstW, big_endian, output_bits, i);
 }
 
+#ifdef HAVE_POWER8
+
+static void yuv2planeX_16_u(const int16_t *filter, int filterSize, /* scalar 16-bit multi-tap filter for pixels [start, dstW) */
+                            const int32_t **src, uint16_t *dest, int dstW,
+                            int big_endian, int output_bits, int start)
+{
+    int i;
+    int shift = 15; /* NOTE(review): presumably also read by name inside output_pixel() -- confirm */
+
+    for (i = start; i < dstW; i++) {
+        int val = 1 << (shift - 1); /* seed the sum with half of 1 << shift: rounding term if val is later shifted by shift */
+        int j;
+
+        /* range of val is [0,0x7FFFFFFF], so 31 bits, but with lanczos/spline
+         * filters (or anything with negative coeffs), the range can be slightly
+         * wider in both directions. To account for this overflow, we subtract
+         * a constant so it always fits in the signed range (assuming a
+         * reasonable filterSize), and re-add that at the end. */
+        val -= 0x40000000;
+        for (j = 0; j < filterSize; j++)
+            val += src[j][i] * (unsigned)filter[j]; /* the (unsigned) cast makes the multiply and add unsigned, so they wrap */
+
+        output_pixel(&dest[i], val, 0x8000, int); /* macro body is outside this hunk; 0x8000 is 0x40000000 >> 15, the re-added bias */
+    }
+}
+
+static void yuv2planeX_16_vsx(const int16_t *filter, int filterSize, /* VSX multi-tap filter for 16-bit output, 8 pixels per iteration */
+                              const int32_t **src, uint16_t *dest, int dstW,
+                              int big_endian, int output_bits)
+{
+    const int dst_u = (-(uintptr_t)dest & 15) >> 1; /* uint16_t pixels until dest is 16-byte aligned, as vec_st requires */
+    const int shift = 15;
+    const int bias = 0x8000; /* equals 0x40000000 >> shift; re-added after the signed pack */
+    const int add = (1 << (shift - 1)) - 0x40000000; /* rounding term minus the offset that keeps the sum in signed range */
+    const uint16_t swap = big_endian ? 8 : 0; /* rotate count: 8 swaps the two bytes of each pixel */
+    const vector uint32_t vadd = (vector uint32_t) {add, add, add, add};
+    const vector uint32_t vshift = (vector uint32_t) {shift, shift, shift, shift};
+    const vector uint16_t vswap = (vector uint16_t) {swap, swap, swap, swap, swap, swap, swap, swap};
+    const vector uint16_t vbias = (vector uint16_t) {bias, bias, bias, bias, bias, bias, bias, bias};
+    vector int32_t vfilter[MAX_FILTER_SIZE]; /* NOTE(review): assumes filterSize <= MAX_FILTER_SIZE -- confirm */
+    vector uint16_t v;
+    vector uint32_t vleft, vright, vtmp;
+    vector int32_t vin32l, vin32r;
+    int i, j;
+
+    for (i = 0; i < filterSize; i++) { /* widen every coefficient to 32 bit and splat it once, up front */
+        vfilter[i] = (vector int32_t) {filter[i], filter[i], filter[i], filter[i]};
+    }
+
+    yuv2planeX_16_u(filter, filterSize, src, dest, dst_u, big_endian, output_bits, 0); /* scalar head: pixels [0, dst_u) */
+
+    for (i = dst_u; i < dstW - 7; i += 8) {
+        vleft = vright = vadd; /* both accumulators start at rounding term minus offset */
+
+        for (j = 0; j < filterSize; j++) {
+            vin32l = vec_vsx_ld(0, &src[j][i]); /* pixels i .. i+3 of row j */
+            vin32r = vec_vsx_ld(0, &src[j][i + 4]); /* pixels i+4 .. i+7 of row j */
+
+#ifdef __GNUC__
+            // GCC does not support vmuluwm yet. Bug open.
+            __asm__("vmuluwm %0, %1, %2" : "=v"(vtmp) : "v"(vin32l), "v"(vfilter[j]));
+            vleft = vec_add(vleft, vtmp);
+            __asm__("vmuluwm %0, %1, %2" : "=v"(vtmp) : "v"(vin32r), "v"(vfilter[j]));
+            vright = vec_add(vright, vtmp);
+#else
+            // No idea which compilers this works in, untested. Copied from libsimdpp
+            vtmp = vec_vmuluwm(vin32l, vfilter[j]);
+            vleft = vec_add(vleft, vtmp);
+            vtmp = vec_vmuluwm(vin32r, vfilter[j]);
+            vright = vec_add(vright, vtmp);
+#endif
+        }
+
+        vleft = vec_sra(vleft, vshift);
+        vright = vec_sra(vright, vshift);
+        v = (vector uint16_t) vec_packs((vector int32_t) vleft, (vector int32_t) vright); /* narrow to 16 bit with signed saturation */
+        v = vec_add(v, vbias); /* undo the 0x40000000 offset, moving the signed result into unsigned range */
+        v = vec_rl(v, vswap); /* byte-swap each pixel when big-endian output is requested */
+        vec_st(v, 0, &dest[i]); /* 16-byte store; relies on the alignment established by dst_u */
+    }
+
+    yuv2planeX_16_u(filter, filterSize, src, dest, dstW, big_endian, output_bits, i); /* scalar tail: pixels [i, dstW) */
+}
+
+#endif /* HAVE_POWER8 */
+
 #define yuv2NBPS(bits, BE_LE, is_be, template_size, typeX_t) \
+    yuv2NBPS1(bits, BE_LE, is_be, template_size, typeX_t) \
+    yuv2NBPSX(bits, BE_LE, is_be, template_size, typeX_t) /* expands to both the plane1 and the planeX wrapper for one format */
+
+#define yuv2NBPS1(bits, BE_LE, is_be, template_size, typeX_t) \
 static void yuv2plane1_ ## bits ## BE_LE ## _vsx(const int16_t *src, \
                              uint8_t *dest, int dstW, \
                              const uint8_t *dither, int offset) \
@@ -243,6 +405,16 @@  static void yuv2plane1_ ## bits ## BE_LE ## _vsx(const int16_t *src, \
                          (uint16_t *) dest, dstW, is_be, bits); \
 }
 
+#define yuv2NBPSX(bits, BE_LE, is_be, template_size, typeX_t) /* defines yuv2planeX_<bits><BE_LE>_vsx() */ \
+static void yuv2planeX_ ## bits ## BE_LE ## _vsx(const int16_t *filter, int filterSize, \
+                              const int16_t **src, uint8_t *dest, int dstW, \
+                              const uint8_t *dither, int offset) /* dither and offset are accepted but not forwarded */\
+{ \
+    yuv2planeX_## template_size ## _vsx(filter, /* template_size picks the nbps or 16 implementation */ \
+                         filterSize, (const typeX_t **) src, /* src is reinterpreted as rows of typeX_t */ \
+                         (uint16_t *) dest, dstW, is_be, bits); /* is_be and bits become big_endian and output_bits */ \
+}
+
 yuv2NBPS( 9, BE, 1, nbps, int16_t)
 yuv2NBPS( 9, LE, 0, nbps, int16_t)
 yuv2NBPS(10, BE, 1, nbps, int16_t)
@@ -251,8 +423,13 @@  yuv2NBPS(12, BE, 1, nbps, int16_t)
 yuv2NBPS(12, LE, 0, nbps, int16_t)
 yuv2NBPS(14, BE, 1, nbps, int16_t)
 yuv2NBPS(14, LE, 0, nbps, int16_t)
-yuv2NBPS(16, BE, 1, 16, int32_t)
-yuv2NBPS(16, LE, 0, 16, int32_t)
+
+yuv2NBPS1(16, BE, 1, 16, int32_t) /* plane1 wrappers for 16 bit are always instantiated */
+yuv2NBPS1(16, LE, 0, 16, int32_t)
+#ifdef HAVE_POWER8 /* NOTE(review): true even if HAVE_POWER8 is defined as 0; this file tests HAVE_VSX/HAVE_BIGENDIAN with #if -- confirm */
+yuv2NBPSX(16, BE, 1, 16, int32_t) /* planeX wrappers call yuv2planeX_16_vsx, which sits inside the HAVE_POWER8 guard */
+yuv2NBPSX(16, LE, 0, 16, int32_t)
+#endif
 
 #endif /* !HAVE_BIGENDIAN */
 
@@ -262,8 +439,9 @@  av_cold void ff_sws_init_swscale_vsx(SwsContext *c)
 {
 #if HAVE_VSX
     enum AVPixelFormat dstFormat = c->dstFormat;
+    const int cpu_flags = av_get_cpu_flags();
 
-    if (!(av_get_cpu_flags() & AV_CPU_FLAG_VSX))
+    if (!(cpu_flags & AV_CPU_FLAG_VSX))
         return;
 
 #if !HAVE_BIGENDIAN
@@ -286,20 +464,29 @@  av_cold void ff_sws_init_swscale_vsx(SwsContext *c)
 #if !HAVE_BIGENDIAN
         case 9:
             c->yuv2plane1 = isBE(dstFormat) ? yuv2plane1_9BE_vsx  : yuv2plane1_9LE_vsx;
+            c->yuv2planeX = isBE(dstFormat) ? yuv2planeX_9BE_vsx  : yuv2planeX_9LE_vsx;
             break;
         case 10:
             c->yuv2plane1 = isBE(dstFormat) ? yuv2plane1_10BE_vsx  : yuv2plane1_10LE_vsx;
+            c->yuv2planeX = isBE(dstFormat) ? yuv2planeX_10BE_vsx  : yuv2planeX_10LE_vsx;
             break;
         case 12:
             c->yuv2plane1 = isBE(dstFormat) ? yuv2plane1_12BE_vsx  : yuv2plane1_12LE_vsx;
+            c->yuv2planeX = isBE(dstFormat) ? yuv2planeX_12BE_vsx  : yuv2planeX_12LE_vsx;
             break;
         case 14:
             c->yuv2plane1 = isBE(dstFormat) ? yuv2plane1_14BE_vsx  : yuv2plane1_14LE_vsx;
+            c->yuv2planeX = isBE(dstFormat) ? yuv2planeX_14BE_vsx  : yuv2planeX_14LE_vsx;
             break;
         case 16:
             c->yuv2plane1 = isBE(dstFormat) ? yuv2plane1_16BE_vsx  : yuv2plane1_16LE_vsx;
+#if HAVE_POWER8 /* test the value, like the HAVE_VSX / HAVE_BIGENDIAN checks in this function */
+            if (cpu_flags & AV_CPU_FLAG_POWER8) { /* runtime check on top of the build-time one */
+                c->yuv2planeX = isBE(dstFormat) ? yuv2planeX_16BE_vsx  : yuv2planeX_16LE_vsx;
+            }
+#endif /* HAVE_POWER8 */
             break;
-#endif
+#endif /* !HAVE_BIGENDIAN */
         }
     }
 #endif /* HAVE_VSX */