diff mbox series

[FFmpeg-devel] POWER8 VSX vectorization libswscale/input.c Track ticket 5570

Message ID 1585056463-7934-1-git-send-email-pestov.vyach@yandex.ru
State New
Headers show
Series [FFmpeg-devel] POWER8 VSX vectorization libswscale/input.c Track ticket 5570
Related show

Checks

Context Check Description
andriy/ffmpeg-patchwork pending
andriy/ffmpeg-patchwork success Applied patch
andriy/ffmpeg-patchwork success Configure finished
andriy/ffmpeg-patchwork success Make finished
andriy/ffmpeg-patchwork success Make fate finished

Commit Message

Vyacheslav Pestov March 24, 2020, 1:27 p.m. UTC
yuy2ToY_c: 10157
yuy2ToY_c_vsx: 2353

yuy2ToUV_c: 4907
yuy2ToUV_c_vsx: 1357

rgb24ToY_c: 21172
rgb24ToY_c_vsx: 9191

rgb24ToUV_c: 33568
rgb24ToUV_c_vsx: 12746

bgr24ToY_c: 20983
bgr24ToY_c_vsx: 9381

bgr24ToUV_c: 34513
bgr24ToUV_c_vsx: 12708

monowhite2Y_c: 5247
monowhite2Y_c_vsx: 2099

monoblack2Y_c: 5584
monoblack2Y_c_vsx: 1993

uyvyToY_c: 10111
uyvyToY_c_vsx: 1780

uyvyToUV_c: 4872
uyvyToUV_c_vsx: 1284

nvXXtoUV_c: 5128
nvXXtoUV_c_vsx: 1456

rgbaToA_c: 9930
rgbaToA_c_vsx: 2599

bswap16Y_c: 10399
bswap16Y_c_vsx: 2451

rgb16_32ToUV_half_c_template: 42350
rgb16_32ToUV_half_c_template_vsx: 18583

bswap16UV_c: 11784
bswap16UV_c_vsx: 2873

planar_rgb_to_y: 24602
planar_rgb_to_y_vsx: 10792

planar_rgb_to_uv: 35601
planar_rgb_to_uv_vsx: 14112

planar_rgb16_to_y: 25686
planar_rgb16_to_y_vsx: 10293

planar_rgb16_to_uv: 36367
planar_rgb16_to_uv_vsx: 13575

yvy2ToUV_c: 4879
yvy2ToUV_c_vsx: 1239

read_ya16be_gray_c: 9591
read_ya16be_gray_c_vsx: 4164

read_ya16be_alpha_c: 9390
read_ya16be_alpha_c_vsx: 1874

read_ya16le_gray_c: 9884
read_ya16le_gray_c_vsx: 4224

read_ya16le_alpha_c: 9403
read_ya16le_alpha_c_vsx: 2026

planar_rgb_to_a: 10262
planar_rgb_to_a_vsx: 9361

planar_rgb16_to_a: 9554
planar_rgb16_to_a_vsx: 9393

read_ayuv64le_Y_c: 10457
read_ayuv64le_Y_c_vsx: 7703

read_ayuv64le_A_c: 9404
read_ayuv64le_A_c_vsx: 2797

read_ayuv64le_UV_c: 9464
read_ayuv64le_UV_c_vsx: 3781

p010LEToY_c: 9546
p010LEToY_c_vsx: 2422

p010LEToUV_c: 6390
p010LEToUV_c_vsx: 2681

p010BEToY_c: 9836
p010BEToY_c_vsx: 2572

p010BEToUV_c: 7022
p010BEToUV_c_vsx: 2660

p016LEToUV_c: 5022
p016LEToUV_c_vsx: 2447

p016BEToUV_c: 5293
p016BEToUV_c_vsx: 2307
---
 libswscale/ppc/Makefile       |    3 +-
 libswscale/ppc/input_vsx.c    | 4562 +++++++++++++++++++++++++++++++++++++++++
 libswscale/swscale.c          |    2 +
 libswscale/swscale_internal.h |    1 +
 4 files changed, 4567 insertions(+), 1 deletion(-)
 create mode 100644 libswscale/ppc/input_vsx.c

Comments

Carl Eugen Hoyos March 24, 2020, 10:12 p.m. UTC | #1
Am Di., 24. März 2020 um 14:28 Uhr schrieb Pestov Vyacheslav
<pestov.vyach@yandex.ru>:
>
> yuy2ToY_c: 10157
> yuy2ToY_c_vsx: 2353
>
> yuy2ToUV_c: 4907
> yuy2ToUV_c_vsx: 1357
>
> rgb24ToY_c: 21172
> rgb24ToY_c_vsx: 9191
>
> rgb24ToUV_c: 33568
> rgb24ToUV_c_vsx: 12746
>
> bgr24ToY_c: 20983
> bgr24ToY_c_vsx: 9381
>
> bgr24ToUV_c: 34513
> bgr24ToUV_c_vsx: 12708
>
> monowhite2Y_c: 5247
> monowhite2Y_c_vsx: 2099
>
> monoblack2Y_c: 5584
> monoblack2Y_c_vsx: 1993
>
> uyvyToY_c: 10111
> uyvyToY_c_vsx: 1780
>
> uyvyToUV_c: 4872
> uyvyToUV_c_vsx: 1284
>
> nvXXtoUV_c: 5128
> nvXXtoUV_c_vsx: 1456
>
> rgbaToA_c: 9930
> rgbaToA_c_vsx: 2599
>
> bswap16Y_c: 10399
> bswap16Y_c_vsx: 2451
>
> rgb16_32ToUV_half_c_template: 42350
> rgb16_32ToUV_half_c_template_vsx: 18583
>
> bswap16UV_c: 11784
> bswap16UV_c_vsx: 2873
>
> planar_rgb_to_y: 24602
> planar_rgb_to_y_vsx: 10792
>
> planar_rgb_to_uv: 35601
> planar_rgb_to_uv_vsx: 14112
>
> planar_rgb16_to_y: 25686
> planar_rgb16_to_y_vsx: 10293
>
> planar_rgb16_to_uv: 36367
> planar_rgb16_to_uv_vsx: 13575
>
> yvy2ToUV_c: 4879
> yvy2ToUV_c_vsx: 1239
>
> read_ya16be_gray_c: 9591
> read_ya16be_gray_c_vsx: 4164
>
> read_ya16be_alpha_c: 9390
> read_ya16be_alpha_c_vsx: 1874
>
> read_ya16le_gray_c: 9884
> read_ya16le_gray_c_vsx: 4224
>
> read_ya16le_alpha_c: 9403
> read_ya16le_alpha_c_vsx: 2026
>
> planar_rgb_to_a: 10262
> planar_rgb_to_a_vsx: 9361
>
> planar_rgb16_to_a: 9554
> planar_rgb16_to_a_vsx: 9393
>
> read_ayuv64le_Y_c: 10457
> read_ayuv64le_Y_c_vsx: 7703
>
> read_ayuv64le_A_c: 9404
> read_ayuv64le_A_c_vsx: 2797
>
> read_ayuv64le_UV_c: 9464
> read_ayuv64le_UV_c_vsx: 3781
>
> p010LEToY_c: 9546
> p010LEToY_c_vsx: 2422
>
> p010LEToUV_c: 6390
> p010LEToUV_c_vsx: 2681
>
> p010BEToY_c: 9836
> p010BEToY_c_vsx: 2572
>
> p010BEToUV_c: 7022
> p010BEToUV_c_vsx: 2660
>
> p016LEToUV_c: 5022
> p016LEToUV_c_vsx: 2447
>
> p016BEToUV_c: 5293
> p016BEToUV_c_vsx: 2307

To make our lives a little easier, could you tell us what you tested
and how we can reproduce your results?

Also: Is your patch expected to be bit-exact? If yes, do you
have a script that allows to compare C and vsx code?
If not, how did you test your code?
(Or does fate cover these conversions? I wouldn't expect so.)

Thank you, Carl Eugen
Vyacheslav Pestov March 25, 2020, 7:55 a.m. UTC | #2
On 25.03.2020 1:12, Carl Eugen Hoyos wrote:

> Am Di., 24. März 2020 um 14:28 Uhr schrieb Pestov Vyacheslav
> <pestov.vyach@yandex.ru>:
>> yuy2ToY_c: 10157
>> yuy2ToY_c_vsx: 2353
>>
>> yuy2ToUV_c: 4907
>> yuy2ToUV_c_vsx: 1357
>>
>> rgb24ToY_c: 21172
>> rgb24ToY_c_vsx: 9191
>>
>> rgb24ToUV_c: 33568
>> rgb24ToUV_c_vsx: 12746
>>
>> bgr24ToY_c: 20983
>> bgr24ToY_c_vsx: 9381
>>
>> bgr24ToUV_c: 34513
>> bgr24ToUV_c_vsx: 12708
>>
>> monowhite2Y_c: 5247
>> monowhite2Y_c_vsx: 2099
>>
>> monoblack2Y_c: 5584
>> monoblack2Y_c_vsx: 1993
>>
>> uyvyToY_c: 10111
>> uyvyToY_c_vsx: 1780
>>
>> uyvyToUV_c: 4872
>> uyvyToUV_c_vsx: 1284
>>
>> nvXXtoUV_c: 5128
>> nvXXtoUV_c_vsx: 1456
>>
>> rgbaToA_c: 9930
>> rgbaToA_c_vsx: 2599
>>
>> bswap16Y_c: 10399
>> bswap16Y_c_vsx: 2451
>>
>> rgb16_32ToUV_half_c_template: 42350
>> rgb16_32ToUV_half_c_template_vsx: 18583
>>
>> bswap16UV_c: 11784
>> bswap16UV_c_vsx: 2873
>>
>> planar_rgb_to_y: 24602
>> planar_rgb_to_y_vsx: 10792
>>
>> planar_rgb_to_uv: 35601
>> planar_rgb_to_uv_vsx: 14112
>>
>> planar_rgb16_to_y: 25686
>> planar_rgb16_to_y_vsx: 10293
>>
>> planar_rgb16_to_uv: 36367
>> planar_rgb16_to_uv_vsx: 13575
>>
>> yvy2ToUV_c: 4879
>> yvy2ToUV_c_vsx: 1239
>>
>> read_ya16be_gray_c: 9591
>> read_ya16be_gray_c_vsx: 4164
>>
>> read_ya16be_alpha_c: 9390
>> read_ya16be_alpha_c_vsx: 1874
>>
>> read_ya16le_gray_c: 9884
>> read_ya16le_gray_c_vsx: 4224
>>
>> read_ya16le_alpha_c: 9403
>> read_ya16le_alpha_c_vsx: 2026
>>
>> planar_rgb_to_a: 10262
>> planar_rgb_to_a_vsx: 9361
>>
>> planar_rgb16_to_a: 9554
>> planar_rgb16_to_a_vsx: 9393
>>
>> read_ayuv64le_Y_c: 10457
>> read_ayuv64le_Y_c_vsx: 7703
>>
>> read_ayuv64le_A_c: 9404
>> read_ayuv64le_A_c_vsx: 2797
>>
>> read_ayuv64le_UV_c: 9464
>> read_ayuv64le_UV_c_vsx: 3781
>>
>> p010LEToY_c: 9546
>> p010LEToY_c_vsx: 2422
>>
>> p010LEToUV_c: 6390
>> p010LEToUV_c_vsx: 2681
>>
>> p010BEToY_c: 9836
>> p010BEToY_c_vsx: 2572
>>
>> p010BEToUV_c: 7022
>> p010BEToUV_c_vsx: 2660
>>
>> p016LEToUV_c: 5022
>> p016LEToUV_c_vsx: 2447
>>
>> p016BEToUV_c: 5293
>> p016BEToUV_c_vsx: 2307
> To make our lives a little easier, could you tell us what you tested
> and how we can reproduce your results?
>
> Also: Is your patch expected to be bit-exact? If yes, do you
> have a script that allows to compare C and vsx code?
> If not, how did you test your code?
> (Or does fate cover these conversions? I wouldn't expect so.)
>
> Thank you, Carl Eugen
> _______________________________________________
> ffmpeg-devel mailing list
> ffmpeg-devel@ffmpeg.org
> https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
>
> To unsubscribe, visit link above, or email
> ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".

Hi, yes I am using some scripts(see attached archive):

1) Put the timing macros into the tested functions in input_vsx.c and input.c (START_TIMER and STOP_TIMER).
You can take files with macros from my dropbox 
https://www.dropbox.com/sh/eoed5pjp9a9psx0/AACpsa6PKGAIl5pYF58sHeRda?dl=0

2) I am generating some random input file with dd utility. For example:

dd if=/dev/urandom of=/tmp/test.raw bs=1024 count=40900

3) Run the script ffmpeg_bench.sh from the directory where test.raw was created

cd /tmp

bash ./ffmpeg_bench.sh

4) Run python3 script print_result.py.  This script takes the ffmpeg_bench results from files bench_cpu.txt and bench_vsx.txt, calculates the average values and displays in a convenient form 

With best regards, Pestov Vyacheslav
Vyacheslav Pestov March 30, 2020, 8:12 a.m. UTC | #3
On 25.03.2020 1:12, Carl Eugen Hoyos wrote:

> Am Di., 24. März 2020 um 14:28 Uhr schrieb Pestov Vyacheslav
> <pestov.vyach@yandex.ru>:
>> yuy2ToY_c: 10157
>> yuy2ToY_c_vsx: 2353
>>
>> yuy2ToUV_c: 4907
>> yuy2ToUV_c_vsx: 1357
>>
>> rgb24ToY_c: 21172
>> rgb24ToY_c_vsx: 9191
>>
>> rgb24ToUV_c: 33568
>> rgb24ToUV_c_vsx: 12746
>>
>> bgr24ToY_c: 20983
>> bgr24ToY_c_vsx: 9381
>>
>> bgr24ToUV_c: 34513
>> bgr24ToUV_c_vsx: 12708
>>
>> monowhite2Y_c: 5247
>> monowhite2Y_c_vsx: 2099
>>
>> monoblack2Y_c: 5584
>> monoblack2Y_c_vsx: 1993
>>
>> uyvyToY_c: 10111
>> uyvyToY_c_vsx: 1780
>>
>> uyvyToUV_c: 4872
>> uyvyToUV_c_vsx: 1284
>>
>> nvXXtoUV_c: 5128
>> nvXXtoUV_c_vsx: 1456
>>
>> rgbaToA_c: 9930
>> rgbaToA_c_vsx: 2599
>>
>> bswap16Y_c: 10399
>> bswap16Y_c_vsx: 2451
>>
>> rgb16_32ToUV_half_c_template: 42350
>> rgb16_32ToUV_half_c_template_vsx: 18583
>>
>> bswap16UV_c: 11784
>> bswap16UV_c_vsx: 2873
>>
>> planar_rgb_to_y: 24602
>> planar_rgb_to_y_vsx: 10792
>>
>> planar_rgb_to_uv: 35601
>> planar_rgb_to_uv_vsx: 14112
>>
>> planar_rgb16_to_y: 25686
>> planar_rgb16_to_y_vsx: 10293
>>
>> planar_rgb16_to_uv: 36367
>> planar_rgb16_to_uv_vsx: 13575
>>
>> yvy2ToUV_c: 4879
>> yvy2ToUV_c_vsx: 1239
>>
>> read_ya16be_gray_c: 9591
>> read_ya16be_gray_c_vsx: 4164
>>
>> read_ya16be_alpha_c: 9390
>> read_ya16be_alpha_c_vsx: 1874
>>
>> read_ya16le_gray_c: 9884
>> read_ya16le_gray_c_vsx: 4224
>>
>> read_ya16le_alpha_c: 9403
>> read_ya16le_alpha_c_vsx: 2026
>>
>> planar_rgb_to_a: 10262
>> planar_rgb_to_a_vsx: 9361
>>
>> planar_rgb16_to_a: 9554
>> planar_rgb16_to_a_vsx: 9393
>>
>> read_ayuv64le_Y_c: 10457
>> read_ayuv64le_Y_c_vsx: 7703
>>
>> read_ayuv64le_A_c: 9404
>> read_ayuv64le_A_c_vsx: 2797
>>
>> read_ayuv64le_UV_c: 9464
>> read_ayuv64le_UV_c_vsx: 3781
>>
>> p010LEToY_c: 9546
>> p010LEToY_c_vsx: 2422
>>
>> p010LEToUV_c: 6390
>> p010LEToUV_c_vsx: 2681
>>
>> p010BEToY_c: 9836
>> p010BEToY_c_vsx: 2572
>>
>> p010BEToUV_c: 7022
>> p010BEToUV_c_vsx: 2660
>>
>> p016LEToUV_c: 5022
>> p016LEToUV_c_vsx: 2447
>>
>> p016BEToUV_c: 5293
>> p016BEToUV_c_vsx: 2307
> To make our lives a little easier, could you tell us what you tested
> and how we can reproduce your results?
>
> Also: Is your patch expected to be bit-exact? If yes, do you
> have a script that allows to compare C and vsx code?
> If not, how did you test your code?
> (Or does fate cover these conversions? I wouldn't expect so.)
>
> Thank you, Carl Eugen
> _______________________________________________
> ffmpeg-devel mailing list
> ffmpeg-devel@ffmpeg.org
> https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
>
> To unsubscribe, visit link above, or email
> ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".

Hi, yes I am using some scripts(see attached archive):

1) Put macros into tested functions in input_vsx.c and input.c   (START_TIMER and STOP_TIMET). 
You can take files with macros from my dropbox 
https://www.dropbox.com/sh/eoed5pjp9a9psx0/AACpsa6PKGAIl5pYF58sHeRda?dl=0

2) I am generating some random input file with dd utility. For example:

dd if=/dev/urandom of=/tmp/test.raw bs=1024 count=40900

3) Run script script ffmpeg_bench.sh from directory where created test.raw

cd /tmp

bash ./ffmpeg_bench.sh

4) Run python3 script print_result.py.  This script takes the ffmpeg_bench results from files bench_cpu.txt and bench_vsx.txt, calculates the average values and displays in a convenient form
Vyacheslav Pestov May 9, 2020, 4:10 p.m. UTC | #4
On 25.03.2020 10:55, Pestov Vyacheslav wrote:

> On 25.03.2020 1:12, Carl Eugen Hoyos wrote:
>
>> Am Di., 24. März 2020 um 14:28 Uhr schrieb Pestov Vyacheslav
>> <pestov.vyach@yandex.ru>:
>>> yuy2ToY_c: 10157
>>> yuy2ToY_c_vsx: 2353
>>>
>>> yuy2ToUV_c: 4907
>>> yuy2ToUV_c_vsx: 1357
>>>
>>> rgb24ToY_c: 21172
>>> rgb24ToY_c_vsx: 9191
>>>
>>> rgb24ToUV_c: 33568
>>> rgb24ToUV_c_vsx: 12746
>>>
>>> bgr24ToY_c: 20983
>>> bgr24ToY_c_vsx: 9381
>>>
>>> bgr24ToUV_c: 34513
>>> bgr24ToUV_c_vsx: 12708
>>>
>>> monowhite2Y_c: 5247
>>> monowhite2Y_c_vsx: 2099
>>>
>>> monoblack2Y_c: 5584
>>> monoblack2Y_c_vsx: 1993
>>>
>>> uyvyToY_c: 10111
>>> uyvyToY_c_vsx: 1780
>>>
>>> uyvyToUV_c: 4872
>>> uyvyToUV_c_vsx: 1284
>>>
>>> nvXXtoUV_c: 5128
>>> nvXXtoUV_c_vsx: 1456
>>>
>>> rgbaToA_c: 9930
>>> rgbaToA_c_vsx: 2599
>>>
>>> bswap16Y_c: 10399
>>> bswap16Y_c_vsx: 2451
>>>
>>> rgb16_32ToUV_half_c_template: 42350
>>> rgb16_32ToUV_half_c_template_vsx: 18583
>>>
>>> bswap16UV_c: 11784
>>> bswap16UV_c_vsx: 2873
>>>
>>> planar_rgb_to_y: 24602
>>> planar_rgb_to_y_vsx: 10792
>>>
>>> planar_rgb_to_uv: 35601
>>> planar_rgb_to_uv_vsx: 14112
>>>
>>> planar_rgb16_to_y: 25686
>>> planar_rgb16_to_y_vsx: 10293
>>>
>>> planar_rgb16_to_uv: 36367
>>> planar_rgb16_to_uv_vsx: 13575
>>>
>>> yvy2ToUV_c: 4879
>>> yvy2ToUV_c_vsx: 1239
>>>
>>> read_ya16be_gray_c: 9591
>>> read_ya16be_gray_c_vsx: 4164
>>>
>>> read_ya16be_alpha_c: 9390
>>> read_ya16be_alpha_c_vsx: 1874
>>>
>>> read_ya16le_gray_c: 9884
>>> read_ya16le_gray_c_vsx: 4224
>>>
>>> read_ya16le_alpha_c: 9403
>>> read_ya16le_alpha_c_vsx: 2026
>>>
>>> planar_rgb_to_a: 10262
>>> planar_rgb_to_a_vsx: 9361
>>>
>>> planar_rgb16_to_a: 9554
>>> planar_rgb16_to_a_vsx: 9393
>>>
>>> read_ayuv64le_Y_c: 10457
>>> read_ayuv64le_Y_c_vsx: 7703
>>>
>>> read_ayuv64le_A_c: 9404
>>> read_ayuv64le_A_c_vsx: 2797
>>>
>>> read_ayuv64le_UV_c: 9464
>>> read_ayuv64le_UV_c_vsx: 3781
>>>
>>> p010LEToY_c: 9546
>>> p010LEToY_c_vsx: 2422
>>>
>>> p010LEToUV_c: 6390
>>> p010LEToUV_c_vsx: 2681
>>>
>>> p010BEToY_c: 9836
>>> p010BEToY_c_vsx: 2572
>>>
>>> p010BEToUV_c: 7022
>>> p010BEToUV_c_vsx: 2660
>>>
>>> p016LEToUV_c: 5022
>>> p016LEToUV_c_vsx: 2447
>>>
>>> p016BEToUV_c: 5293
>>> p016BEToUV_c_vsx: 2307
>> To make our lives a little easier, could you tell us what you tested
>> and how we can reproduce your results?
>>
>> Also: Is your patch expected to be bit-exact? If yes, do you
>> have a script that allows to compare C and vsx code?
>> If not, how did you test your code?
>> (Or does fate cover these conversions? I wouldn't expect so.)
>>
>> Thank you, Carl Eugen
>> _______________________________________________
>> ffmpeg-devel mailing list
>> ffmpeg-devel@ffmpeg.org
>> https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
>>
>> To unsubscribe, visit link above, or email
>> ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".
> Hi, yes I am using some scripts(see attached archive):
>
> 1) Put macros into tested functions in input_vsx.c and input.c   (START_TIMER and STOP_TIMET). 
> You can take files with macros from my dropbox 
> https://www.dropbox.com/sh/eoed5pjp9a9psx0/AACpsa6PKGAIl5pYF58sHeRda?dl=0
>
> 2) I am generating some random input file with dd utility. For example:
>
> dd if=/dev/urandom of=/tmp/test.raw bs=1024 count=40900
>
> 3) Run script script ffmpeg_bench.sh from directory where created test.raw
>
> cd /tmp
>
> bash ./ffmpeg_bench.sh
>
> 4) Run python3 script print_result.py.  This script takes the ffmpeg_bench results from files bench_cpu.txt and bench_vsx.txt, calculates the average values and displays in a convenient form 
>
> With best regards, Pestov Vyacheslav
>
> _______________________________________________
> ffmpeg-devel mailing list
> ffmpeg-devel@ffmpeg.org
> https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
>
> To unsubscribe, visit link above, or email
> ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".

Please, check my patch. I have been waiting for it to be committed for more than a
 year (just over a year ago, I also sent out a patch to the devel list). I want to get my bounty from the Bountysource platform.
Vyacheslav Pestov June 1, 2020, 5:49 p.m. UTC | #5
Hi!

There is a patch that I sent you a couple of months ago to check. I did it for the platform bountysource.com. There is a reward for this task, but I can’t get it for quite some time, since the patch is in pending status. All tests were passed, and the correct operation of the functions was checked. I also sent my scripts for these tests. Please tell me how to close this task.

Issue from bountysource:

https://www.bountysource.com/issues/34315232-power8-vsx-vectorization-libswscale-input-c

Patch pending status:

https://patchwork.ffmpeg.org/project/ffmpeg/patch/1585056463-7934-1-git-send-email-pestov.vyach@yandex.ru/

Trac ticket:

https://trac.ffmpeg.org/ticket/5570



With best regards, Vyacheslav Pestov
Vyacheslav Pestov June 17, 2020, 8:23 a.m. UTC | #6
Hi, Carl Eugen

Please, check my patch. It has been in pending status for a long time. I can’t get a bounty. If something needs to be finalized, just tell me.

https://patchwork.ffmpeg.org/project/ffmpeg/patch/1585056463-7934-1-git-send-email-pestov.vyach@yandex.ru/
Vyacheslav Pestov July 13, 2020, 2:38 p.m. UTC | #7
Please, check my patch. It has been in pending status for a long time. I can’t get a bounty. If something needs to be finalized, just tell me.

https://patchwork.ffmpeg.org/project/ffmpeg/patch/1585056463-7934-1-git-send-email-pestov.vyach@yandex.ru/

https://trac.ffmpeg.org/ticket/5570

On 25.03.2020 10:55, Pestov Vyacheslav wrote:

> On 25.03.2020 1:12, Carl Eugen Hoyos wrote:
>
>> Am Di., 24. März 2020 um 14:28 Uhr schrieb Pestov Vyacheslav
>> <pestov.vyach@yandex.ru>:
>>> yuy2ToY_c: 10157
>>> yuy2ToY_c_vsx: 2353
>>>
>>> yuy2ToUV_c: 4907
>>> yuy2ToUV_c_vsx: 1357
>>>
>>> rgb24ToY_c: 21172
>>> rgb24ToY_c_vsx: 9191
>>>
>>> rgb24ToUV_c: 33568
>>> rgb24ToUV_c_vsx: 12746
>>>
>>> bgr24ToY_c: 20983
>>> bgr24ToY_c_vsx: 9381
>>>
>>> bgr24ToUV_c: 34513
>>> bgr24ToUV_c_vsx: 12708
>>>
>>> monowhite2Y_c: 5247
>>> monowhite2Y_c_vsx: 2099
>>>
>>> monoblack2Y_c: 5584
>>> monoblack2Y_c_vsx: 1993
>>>
>>> uyvyToY_c: 10111
>>> uyvyToY_c_vsx: 1780
>>>
>>> uyvyToUV_c: 4872
>>> uyvyToUV_c_vsx: 1284
>>>
>>> nvXXtoUV_c: 5128
>>> nvXXtoUV_c_vsx: 1456
>>>
>>> rgbaToA_c: 9930
>>> rgbaToA_c_vsx: 2599
>>>
>>> bswap16Y_c: 10399
>>> bswap16Y_c_vsx: 2451
>>>
>>> rgb16_32ToUV_half_c_template: 42350
>>> rgb16_32ToUV_half_c_template_vsx: 18583
>>>
>>> bswap16UV_c: 11784
>>> bswap16UV_c_vsx: 2873
>>>
>>> planar_rgb_to_y: 24602
>>> planar_rgb_to_y_vsx: 10792
>>>
>>> planar_rgb_to_uv: 35601
>>> planar_rgb_to_uv_vsx: 14112
>>>
>>> planar_rgb16_to_y: 25686
>>> planar_rgb16_to_y_vsx: 10293
>>>
>>> planar_rgb16_to_uv: 36367
>>> planar_rgb16_to_uv_vsx: 13575
>>>
>>> yvy2ToUV_c: 4879
>>> yvy2ToUV_c_vsx: 1239
>>>
>>> read_ya16be_gray_c: 9591
>>> read_ya16be_gray_c_vsx: 4164
>>>
>>> read_ya16be_alpha_c: 9390
>>> read_ya16be_alpha_c_vsx: 1874
>>>
>>> read_ya16le_gray_c: 9884
>>> read_ya16le_gray_c_vsx: 4224
>>>
>>> read_ya16le_alpha_c: 9403
>>> read_ya16le_alpha_c_vsx: 2026
>>>
>>> planar_rgb_to_a: 10262
>>> planar_rgb_to_a_vsx: 9361
>>>
>>> planar_rgb16_to_a: 9554
>>> planar_rgb16_to_a_vsx: 9393
>>>
>>> read_ayuv64le_Y_c: 10457
>>> read_ayuv64le_Y_c_vsx: 7703
>>>
>>> read_ayuv64le_A_c: 9404
>>> read_ayuv64le_A_c_vsx: 2797
>>>
>>> read_ayuv64le_UV_c: 9464
>>> read_ayuv64le_UV_c_vsx: 3781
>>>
>>> p010LEToY_c: 9546
>>> p010LEToY_c_vsx: 2422
>>>
>>> p010LEToUV_c: 6390
>>> p010LEToUV_c_vsx: 2681
>>>
>>> p010BEToY_c: 9836
>>> p010BEToY_c_vsx: 2572
>>>
>>> p010BEToUV_c: 7022
>>> p010BEToUV_c_vsx: 2660
>>>
>>> p016LEToUV_c: 5022
>>> p016LEToUV_c_vsx: 2447
>>>
>>> p016BEToUV_c: 5293
>>> p016BEToUV_c_vsx: 2307
>> To make our lives a little easier, could you tell us what you tested
>> and how we can reproduce your results?
>>
>> Also: Is your patch expected to be bit-exact? If yes, do you
>> have a script that allows to compare C and vsx code?
>> If not, how did you test your code?
>> (Or does fate cover these conversions? I wouldn't expect so.)
>>
>> Thank you, Carl Eugen
>> _______________________________________________
>> ffmpeg-devel mailing list
>> ffmpeg-devel@ffmpeg.org
>> https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
>>
>> To unsubscribe, visit link above, or email
>> ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".
> Hi, yes I am using some scripts(see attached archive):
>
> 1) Put macros into tested functions in input_vsx.c and input.c   (START_TIMER and STOP_TIMET). 
> You can take files with macros from my dropbox 
> https://www.dropbox.com/sh/eoed5pjp9a9psx0/AACpsa6PKGAIl5pYF58sHeRda?dl=0
>
> 2) I am generating some random input file with dd utility. For example:
>
> dd if=/dev/urandom of=/tmp/test.raw bs=1024 count=40900
>
> 3) Run script script ffmpeg_bench.sh from directory where created test.raw
>
> cd /tmp
>
> bash ./ffmpeg_bench.sh
>
> 4) Run python3 script print_result.py.  This script takes the ffmpeg_bench results from files bench_cpu.txt and bench_vsx.txt, calculates the average values and displays in a convenient form 
>
> With best regards, Pestov Vyacheslav
>
> _______________________________________________
> ffmpeg-devel mailing list
> ffmpeg-devel@ffmpeg.org
> https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
>
> To unsubscribe, visit link above, or email
> ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".

Please, check my patch. It has been in pending status for a long time. I can’t get a bounty. If something needs to be finalized, just tell me.
Michael Niedermayer July 13, 2020, 8:23 p.m. UTC | #8
Hi

On Mon, Jul 13, 2020 at 05:38:40PM +0300, Pestov Vyacheslav wrote:
> Please, check my patch. It has been in pending status for a long time. I can’t get a bounty. If something needs to be finalized, just tell me.
> 
> https://patchwork.ffmpeg.org/project/ffmpeg/patch/1585056463-7934-1-git-send-email-pestov.vyach@yandex.ru/
> 
> https://trac.ffmpeg.org/ticket/5570

There was a previous patch IIRC by Dan Parrot to which again IIRC
ronald objected due to use of intrinsics instead of hand written asm.

I did not entirely agree with Ronald, though I always saw his point, and I'd
like to hear his opinion on this patchset. Also I don't have a working PPC
machine with which I could test this ATM.

So to move forward what we need is
1. ronalds oppinion
2. someone with the right hardware needs to test and confirm both
that this is faster and that it works correctly
3. simple code review

If these 3 pass, my opinion is that it's an improvement and it could be
applied.

thx

[...]
Michael Niedermayer July 13, 2020, 8:48 p.m. UTC | #9
On Mon, Jul 13, 2020 at 10:23:28PM +0200, Michael Niedermayer wrote:
> Hi
> 
> On Mon, Jul 13, 2020 at 05:38:40PM +0300, Pestov Vyacheslav wrote:
> > Please, check my patch. It has been in pending status for a long time. I can’t get a bounty. If something needs to be finalized, just tell me.
> > 
> > https://patchwork.ffmpeg.org/project/ffmpeg/patch/1585056463-7934-1-git-send-email-pestov.vyach@yandex.ru/
> > 
> > https://trac.ffmpeg.org/ticket/5570
> 

> There was a previous patch IIRC by Dan Parrot to which again IIRC
> ronald objected due to use of intrinsics instead of hand written asm.

To clarify this (and this is all IIRC): Ronald did a great and harsh review
of the previous patchset, pointing at many loose ends and issues.
That's why I do want to hear his opinion on this patchset ...


[...]
diff mbox series

Patch

diff --git a/libswscale/ppc/Makefile b/libswscale/ppc/Makefile
index 0a31a30..90c16f5 100644
--- a/libswscale/ppc/Makefile
+++ b/libswscale/ppc/Makefile
@@ -1,4 +1,5 @@ 
 OBJS += ppc/swscale_altivec.o                                           \
         ppc/yuv2rgb_altivec.o                                           \
         ppc/yuv2yuv_altivec.o                                           \
-        ppc/swscale_vsx.o
+        ppc/swscale_vsx.o                                                \
+        ppc/input_vsx.o
diff --git a/libswscale/ppc/input_vsx.c b/libswscale/ppc/input_vsx.c
new file mode 100644
index 0000000..2bacda6
--- /dev/null
+++ b/libswscale/ppc/input_vsx.c
@@ -0,0 +1,4562 @@ 
+/*
+ * POWER8 VSX vectorization libswscale/input.c
+ * Written by Vyacheslav Pestov.
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+/**
+ * @file
+ * POWER8 VSX vectorization libswscale/input.c
+ * @author Vyacheslav Pestov
+ */
+#include <math.h>
+#include <stdint.h>
+#include <stdio.h>
+#include <string.h>
+
+
+#include "libavutil/avutil.h"
+#include "libavutil/bswap.h"
+#include "libavutil/cpu.h"
+#include "libavutil/intreadwrite.h"
+#include "libavutil/mathematics.h"
+#include "libavutil/pixdesc.h"
+#include "libavutil/avassert.h"
+#include "libavutil/ppc/util_altivec.h"
+#include "libavutil/timer.h"
+#include "config.h"
+#include "../rgb2rgb.h"
+#include "../swscale.h"
+#include "../swscale_internal.h"
+
+
+
+
+#if HAVE_VSX 
+#if !HAVE_BIGENDIAN
+
+
+//vector unsigned short v_FF = vec_splats((unsigned short)0x00FF);
+//vector signed short v_8000 = vec_splats((signed short)0x8000);
+//vector signed short v_7FFF = vec_splats((signed short)0x7FFF);
+//vector signed short v_FFFF = vec_splats((signed short)0xFFFF);
+vector unsigned int v_000000FF = ((vector unsigned int){0xFF, 0xFF, 0xFF, 0xFF});
+
+#define input_pixel(pos) (isBE(origin) ? AV_RB16(pos) : AV_RL16(pos))
+
+#define r ((origin == AV_PIX_FMT_BGR48BE || origin == AV_PIX_FMT_BGR48LE || \
+            origin == AV_PIX_FMT_BGRA64BE || origin == AV_PIX_FMT_BGRA64LE) \
+            ? b_r : r_b)
+#define b ((origin == AV_PIX_FMT_BGR48BE || origin == AV_PIX_FMT_BGR48LE || \
+            origin == AV_PIX_FMT_BGRA64BE || origin == AV_PIX_FMT_BGRA64LE) \
+            ? r_b : b_r)
+#define v_r1 ((origin == AV_PIX_FMT_BGR48BE || origin == AV_PIX_FMT_BGR48LE || \
+               origin == AV_PIX_FMT_BGRA64BE || origin == AV_PIX_FMT_BGRA64LE) \
+               ? v_b_r1 : v_r_b1)
+#define v_b1 ((origin == AV_PIX_FMT_BGR48BE || origin == AV_PIX_FMT_BGR48LE || \
+               origin == AV_PIX_FMT_BGRA64BE || origin == AV_PIX_FMT_BGRA64LE) \
+               ? v_r_b1 : v_b_r1)
+#define v_r2 ((origin == AV_PIX_FMT_BGR48BE || origin == AV_PIX_FMT_BGR48LE || \
+               origin == AV_PIX_FMT_BGRA64BE || origin == AV_PIX_FMT_BGRA64LE) \
+               ? v_b_r2 : v_r_b2)
+#define v_b2 ((origin == AV_PIX_FMT_BGR48BE || origin == AV_PIX_FMT_BGR48LE || \
+               origin == AV_PIX_FMT_BGRA64BE || origin == AV_PIX_FMT_BGRA64LE) \
+               ? v_r_b2 : v_b_r2)
+
+/**
+ * Convert packed 16-bit-per-component RGB(A)/BGR(A) pixels (4 uint16_t,
+ * i.e. 8 bytes, per pixel) to a 16-bit luma plane.
+ *
+ * The R/B channel order is selected by the r/b and v_r1/v_b1/v_r2/v_b2
+ * macros defined above, based on 'origin'. The vector path processes
+ * 8 pixels (64 input bytes) per iteration; the remaining width % 8
+ * pixels are handled by the scalar tail loop below.
+ *
+ * @param dst     output luma, one uint16_t per pixel
+ * @param src     packed source pixels, 4 uint16_t components per pixel
+ * @param width   number of pixels to convert
+ * @param origin  source pixel format; selects byte order (via isBE())
+ *                and R/B ordering (via the macros above)
+ * @param rgb2yuv RGB->YUV coefficient table; RY_IDX/GY_IDX/BY_IDX used
+ */
+static av_always_inline void
+rgb64ToY_c_template_vsx(uint16_t *dst, const uint16_t *src, int width,
+                    enum AVPixelFormat origin, int32_t *rgb2yuv)
+{
+    
+    int i, width_adj, is_BE;
+    vector unsigned short v_rd0, v_rd1, v_rd2, v_rd3;
+    vector unsigned short v_b_r1, v_b_r2, v_r_b1, v_r_b2, v_g1, v_g2;  
+    vector unsigned int v_dst1, v_dst2;
+    vector unsigned int shift1, shift2;
+    vector signed int v_ry, v_gy, v_by;
+    int32_t ry = rgb2yuv[RY_IDX], gy = rgb2yuv[GY_IDX], by = rgb2yuv[BY_IDX];
+    vector unsigned short v_null = vec_splats((unsigned short)0x0000);
+
+    uintptr_t src_addr = (uintptr_t)src;
+    uintptr_t dst_addr = (uintptr_t)dst;
+
+
+    /* round width down to a multiple of 8, the vector batch size */
+    width_adj = width&(~(int)0x07);
+
+    /* Splat the coefficients and the rounding constant only when the
+     * vector loop will actually run.
+     * NOTE(review): is_BE (and the splats) stay uninitialized when
+     * width_adj == 0; harmless since they are only read inside the loop,
+     * but this may trigger maybe-uninitialized compiler warnings. */
+    if(width_adj){
+        shift1 = vec_splats((unsigned int)(0x2001<<(RGB2YUV_SHIFT-1)));
+        shift2 = vec_splats((unsigned int)RGB2YUV_SHIFT);
+        v_ry = vec_splats((signed int)ry);
+        v_gy = vec_splats((signed int)gy);
+        v_by = vec_splats((signed int)by);
+        is_BE = isBE(origin);
+    }
+
+    for (i = 0; i < width_adj; i+=8) {
+        /* load 8 pixels = 32 uint16_t components = 64 bytes */
+        v_rd0 = vec_vsx_ld(0, (unsigned short *)src_addr);
+        v_rd1 = vec_vsx_ld(0, (unsigned short *)(src_addr+16));
+        v_rd2 = vec_vsx_ld(0, (unsigned short *)(src_addr+32));
+        v_rd3 = vec_vsx_ld(0, (unsigned short *)(src_addr+48));
+
+        /* Deinterleave the first/second/third component of each pixel
+         * into v_r_b*, v_g*, v_b_r*.  The BE permute tables pick the odd
+         * byte first, byte-swapping each 16-bit component on the fly.
+         * Only 8 permute indices are written per table; the remaining
+         * ones default to 0 (C initializer rule), so only the low half
+         * of each permuted vector is meaningful -- the vec_mergeh calls
+         * below consume exactly that half. */
+        if(is_BE){
+            v_r_b1 = vec_perm(v_rd0, v_rd1, 
+                     ((vector unsigned char){1, 0, 9, 8, 17, 16, 25, 24}));
+            v_r_b2 = vec_perm(v_rd2, v_rd3, 
+                     ((vector unsigned char){1, 0, 9, 8, 17, 16, 25, 24}));
+            v_g1 = vec_perm(v_rd0, v_rd1, 
+                   ((vector unsigned char){3, 2, 11, 10, 19, 18, 27, 26}));
+            v_g2 = vec_perm(v_rd2, v_rd3, 
+                   ((vector unsigned char){3, 2, 11, 10, 19, 18, 27, 26}));
+            v_b_r1 = vec_perm(v_rd0, v_rd1, 
+                     ((vector unsigned char){5, 4, 13, 12, 21, 20, 29, 28}));
+            v_b_r2 = vec_perm(v_rd2, v_rd3, 
+                     ((vector unsigned char){5, 4, 13, 12, 21, 20, 29, 28}));
+        }else{
+            v_r_b1 = vec_perm(v_rd0, v_rd1, 
+                     ((vector unsigned char){0, 1, 8, 9, 16, 17, 24, 25}));
+            v_r_b2 = vec_perm(v_rd2, v_rd3, 
+                     ((vector unsigned char){0, 1, 8, 9, 16, 17, 24, 25}));
+            v_g1 = vec_perm(v_rd0, v_rd1, 
+                   ((vector unsigned char){2, 3, 10, 11, 18, 19, 26, 27}));
+            v_g2 = vec_perm(v_rd2, v_rd3, 
+                   ((vector unsigned char){2, 3, 10, 11, 18, 19, 26, 27}));
+            v_b_r1 = vec_perm(v_rd0, v_rd1, 
+                     ((vector unsigned char){4, 5, 12, 13, 20, 21, 28, 29}));
+            v_b_r2 = vec_perm(v_rd2, v_rd3, 
+                     ((vector unsigned char){4, 5, 12, 13, 20, 21, 28, 29}));
+        }
+        
+
+        /* widen the 16-bit components to 32-bit lanes by merging the
+         * data with the zero vector */
+        v_r_b1 = vec_mergeh(v_r_b1, v_null);
+        v_g1 = vec_mergeh(v_g1, v_null);
+        v_b_r1 = vec_mergeh(v_b_r1, v_null);
+
+        v_r_b2 = vec_mergeh(v_r_b2, v_null);
+        v_g2 = vec_mergeh(v_g2, v_null);
+        v_b_r2 = vec_mergeh(v_b_r2, v_null);
+
+        /* dst = (ry*R + gy*G + by*B + rounding) >> RGB2YUV_SHIFT,
+         * same formula as the scalar tail below; v_r1/v_b1 (and
+         * v_r2/v_b2) expand to the correctly ordered channel via the
+         * macros above */
+        v_dst1 = (vector unsigned int)vec_mul((vector signed int)v_r1, v_ry);
+        v_dst1 = (vector unsigned int)vec_add((vector signed int)v_dst1, 
+                                              vec_mul((vector signed int)v_g1, v_gy ));
+        v_dst1 = (vector unsigned int)vec_add((vector signed int)v_dst1, 
+                                              vec_mul((vector signed int)v_b1, v_by ));
+        v_dst1 = vec_add(v_dst1, shift1);
+        v_dst1 = vec_sr(v_dst1, shift2);
+        v_dst2 = (vector unsigned int)vec_mul((vector signed int)v_r2, v_ry);
+        v_dst2 = (vector unsigned int)vec_add((vector signed int)v_dst2, 
+                                              vec_mul((vector signed int)v_g2, v_gy ));
+        v_dst2 = (vector unsigned int)vec_add((vector signed int)v_dst2, 
+                                              vec_mul((vector signed int)v_b2,  v_by ));
+        v_dst2 = vec_add(v_dst2, shift1);
+        v_dst2 = vec_sr(v_dst2, shift2);
+        /* pack the low 16 bits of each 32-bit result into eight
+         * uint16_t values and store them */
+        v_dst1 = vec_perm(v_dst1, v_dst2, ((vector unsigned char)
+                          {0, 1, 4, 5, 8, 9, 12, 13, 16, 17, 20, 21, 24, 25, 28, 29}));
+        vec_vsx_st((vector unsigned char)v_dst1, 0, (unsigned char *)dst_addr); 
+
+        src_addr += 64;
+        dst_addr += 16;
+    }
+    
+    /* scalar tail for the last width % 8 pixels; input_pixel() and the
+     * r/b macros come from the definitions above */
+    for (i = width_adj; i < width; i++) {
+        unsigned int r_b = input_pixel(&src[i*4+0]);
+        unsigned int   g = input_pixel(&src[i*4+1]);
+        unsigned int b_r = input_pixel(&src[i*4+2]);
+
+        dst[i] = (ry*r + gy*g + by*b + (0x2001<<(RGB2YUV_SHIFT-1))) >> RGB2YUV_SHIFT;
+    }
+    
+}
+
+/**
+ * VSX-vectorized U/V chroma extraction from packed 64-bit RGBA/BGRA
+ * input (16 bits per component; component order and endianness are
+ * selected by origin).  Eight pixels are processed per vector
+ * iteration; the remainder is handled by the scalar tail loop.
+ *
+ * NOTE: v_r1/v_b1/v_r2/v_b2 in the vector loop and r/b in the scalar
+ * tail are macros defined earlier in this file that map onto the
+ * r_b/b_r lanes according to the pixel component order.
+ */
+static av_always_inline void
+rgb64ToUV_c_template_vsx(uint16_t *dstU, uint16_t *dstV,
+                    const uint16_t *src1, const uint16_t *src2,
+                    int width, enum AVPixelFormat origin, int32_t *rgb2yuv)
+{
+    /* Declarations first: FFmpeg builds with -Wdeclaration-after-statement.
+     * is_BE is zero-initialized so it is defined even when width_adj == 0. */
+    int i, width_adj, is_BE = 0;
+    vector unsigned short v_rd0, v_rd1, v_rd2, v_rd3;
+    vector unsigned short v_b_r1, v_b_r2, v_r_b1, v_r_b2, v_g1, v_g2;
+    vector unsigned int v_dst1, v_dst2;
+    vector unsigned int shift1, shift2;
+    vector signed int v_ru, v_gu, v_bu, v_rv, v_gv, v_bv;
+    int32_t ru = rgb2yuv[RU_IDX], gu = rgb2yuv[GU_IDX], bu = rgb2yuv[BU_IDX];
+    int32_t rv = rgb2yuv[RV_IDX], gv = rgb2yuv[GV_IDX], bv = rgb2yuv[BV_IDX];
+    vector unsigned short v_null = vec_splats((unsigned short)0x0000);
+
+    uintptr_t src_addr = (uintptr_t)src1;
+    uintptr_t dstU_addr = (uintptr_t)dstU;
+    uintptr_t dstV_addr = (uintptr_t)dstV;
+
+    av_assert1(src1 == src2);
+
+    /* Round width down to a multiple of 8 pixels for the vector loop. */
+    width_adj = width&(~(int)0x07);
+
+    if(width_adj){
+        shift1 = vec_splats((unsigned int)(0x10001<<(RGB2YUV_SHIFT-1)));
+        shift2 = vec_splats((unsigned int)RGB2YUV_SHIFT);
+        v_ru = vec_splats((signed int)ru);
+        v_gu = vec_splats((signed int)gu);
+        v_bu = vec_splats((signed int)bu);
+        v_rv = vec_splats((signed int)rv);
+        v_gv = vec_splats((signed int)gv);
+        v_bv = vec_splats((signed int)bv);
+        is_BE = isBE(origin);
+    }
+
+    for (i = 0; i < width_adj; i+=8) {
+        v_rd0 = vec_vsx_ld(0, (unsigned short *)src_addr);
+        v_rd1 = vec_vsx_ld(0, (unsigned short *)(src_addr+16));
+        v_rd2 = vec_vsx_ld(0, (unsigned short *)(src_addr+32));
+        v_rd3 = vec_vsx_ld(0, (unsigned short *)(src_addr+48));
+
+        /* Gather the first (r_b), second (g) and third (b_r) 16-bit
+         * component of each pixel; byte-swap when input is big-endian. */
+        if(is_BE){
+            v_r_b1 = vec_perm(v_rd0, v_rd1,
+                     ((vector unsigned char){1, 0, 9, 8, 17, 16, 25, 24}));
+            v_r_b2 = vec_perm(v_rd2, v_rd3,
+                     ((vector unsigned char){1, 0, 9, 8, 17, 16, 25, 24}));
+            v_g1 = vec_perm(v_rd0, v_rd1,
+                   ((vector unsigned char){3, 2, 11, 10, 19, 18, 27, 26}));
+            v_g2 = vec_perm(v_rd2, v_rd3,
+                   ((vector unsigned char){3, 2, 11, 10, 19, 18, 27, 26}));
+            v_b_r1 = vec_perm(v_rd0, v_rd1,
+                     ((vector unsigned char){5, 4, 13, 12, 21, 20, 29, 28}));
+            v_b_r2 = vec_perm(v_rd2, v_rd3,
+                     ((vector unsigned char){5, 4, 13, 12, 21, 20, 29, 28}));
+        }else{
+            v_r_b1 = vec_perm(v_rd0, v_rd1,
+                     ((vector unsigned char){0, 1, 8, 9, 16, 17, 24, 25}));
+            v_r_b2 = vec_perm(v_rd2, v_rd3,
+                     ((vector unsigned char){0, 1, 8, 9, 16, 17, 24, 25}));
+            v_g1 = vec_perm(v_rd0, v_rd1,
+                   ((vector unsigned char){2, 3, 10, 11, 18, 19, 26, 27}));
+            v_g2 = vec_perm(v_rd2, v_rd3,
+                   ((vector unsigned char){2, 3, 10, 11, 18, 19, 26, 27}));
+            v_b_r1 = vec_perm(v_rd0, v_rd1,
+                     ((vector unsigned char){4, 5, 12, 13, 20, 21, 28, 29}));
+            v_b_r2 = vec_perm(v_rd2, v_rd3,
+                     ((vector unsigned char){4, 5, 12, 13, 20, 21, 28, 29}));
+        }
+
+        /* Widen the 16-bit components to 32 bits (data in the low half). */
+        v_r_b1 = vec_mergeh(v_r_b1, v_null);
+        v_g1 = vec_mergeh(v_g1, v_null);
+        v_b_r1 = vec_mergeh(v_b_r1, v_null);
+
+        v_r_b2 = vec_mergeh(v_r_b2, v_null);
+        v_g2 = vec_mergeh(v_g2, v_null);
+        v_b_r2 = vec_mergeh(v_b_r2, v_null);
+
+        /* U = (ru*R + gu*G + bu*B + rounding) >> RGB2YUV_SHIFT */
+        v_dst1 = (vector unsigned int)vec_mul((vector signed int)v_r1, v_ru);
+        v_dst1 = (vector unsigned int)vec_add((vector signed int)v_dst1,
+                                              vec_mul((vector signed int)v_g1, v_gu));
+        v_dst1 = (vector unsigned int)vec_add((vector signed int)v_dst1,
+                                              vec_mul((vector signed int)v_b1, v_bu));
+        v_dst1 = vec_add(v_dst1, shift1);
+        v_dst1 = vec_sr(v_dst1, shift2);
+        v_dst2 = (vector unsigned int)vec_mul((vector signed int)v_r2, v_ru);
+        v_dst2 = (vector unsigned int)vec_add((vector signed int)v_dst2,
+                                              vec_mul((vector signed int)v_g2, v_gu));
+        v_dst2 = (vector unsigned int)vec_add((vector signed int)v_dst2,
+                                              vec_mul((vector signed int)v_b2, v_bu));
+        v_dst2 = vec_add(v_dst2, shift1);
+        v_dst2 = vec_sr(v_dst2, shift2);
+        /* Narrow the eight 32-bit results back to 16 bits and store. */
+        v_dst1 = vec_perm(v_dst1, v_dst2,
+                ((vector unsigned char){0, 1, 4, 5, 8, 9, 12, 13, 16, 17, 20, 21, 24, 25, 28, 29}));
+        vec_vsx_st((vector unsigned char)v_dst1, 0, (unsigned char *)dstU_addr);
+
+        /* V = (rv*R + gv*G + bv*B + rounding) >> RGB2YUV_SHIFT */
+        v_dst1 = (vector unsigned int)vec_mul((vector signed int)v_r1, v_rv);
+        v_dst1 = (vector unsigned int)vec_add((vector signed int)v_dst1,
+                                               vec_mul((vector signed int)v_g1, v_gv));
+        v_dst1 = (vector unsigned int)vec_add((vector signed int)v_dst1,
+                                               vec_mul((vector signed int)v_b1, v_bv));
+        v_dst1 = vec_add(v_dst1, shift1);
+        v_dst1 = vec_sr(v_dst1, shift2);
+        v_dst2 = (vector unsigned int)vec_mul((vector signed int)v_r2, v_rv);
+        v_dst2 = (vector unsigned int)vec_add((vector signed int)v_dst2,
+                                               vec_mul((vector signed int)v_g2, v_gv));
+        v_dst2 = (vector unsigned int)vec_add((vector signed int)v_dst2,
+                                               vec_mul((vector signed int)v_b2, v_bv));
+        v_dst2 = vec_add(v_dst2, shift1);
+        v_dst2 = vec_sr(v_dst2, shift2);
+        v_dst1 = vec_perm(v_dst1, v_dst2,
+                ((vector unsigned char){0, 1, 4, 5, 8, 9, 12, 13, 16, 17, 20, 21, 24, 25, 28, 29}));
+        vec_vsx_st((vector unsigned char)v_dst1, 0, (unsigned char *)dstV_addr);
+
+        src_addr += 64;   /* 8 pixels x 8 bytes */
+        dstU_addr += 16;  /* 8 samples x 2 bytes */
+        dstV_addr += 16;
+    }
+
+    /* Scalar tail for the last (width % 8) pixels. */
+    for (i = width_adj; i < width; i++) {
+        int r_b = input_pixel(&src1[i*4+0]);
+        int   g = input_pixel(&src1[i*4+1]);
+        int b_r = input_pixel(&src1[i*4+2]);
+
+        dstU[i] = (ru*r + gu*g + bu*b + (0x10001<<(RGB2YUV_SHIFT-1))) >> RGB2YUV_SHIFT;
+        dstV[i] = (rv*r + gv*g + bv*b + (0x10001<<(RGB2YUV_SHIFT-1))) >> RGB2YUV_SHIFT;
+    }
+}
+
+/**
+ * Scalar U/V extraction from packed 64-bit RGBA/BGRA with 2:1 horizontal
+ * chroma subsampling: each output sample averages two adjacent pixels
+ * (rounded).  src1 must equal src2 (packed input, single plane).
+ *
+ * NOTE(review): r and b in the dstU/dstV expressions are macros defined
+ * earlier in this file that expand to r_b or b_r depending on the
+ * component order of origin — the local names r_b/b_r are load-bearing.
+ */
+static av_always_inline void
+rgb64ToUV_half_c_template(uint16_t *dstU, uint16_t *dstV,
+                          const uint16_t *src1, const uint16_t *src2,
+                          int width, enum AVPixelFormat origin, int32_t *rgb2yuv)
+{
+    int i;
+    int32_t ru = rgb2yuv[RU_IDX], gu = rgb2yuv[GU_IDX], bu = rgb2yuv[BU_IDX];
+    int32_t rv = rgb2yuv[RV_IDX], gv = rgb2yuv[GV_IDX], bv = rgb2yuv[BV_IDX];
+    av_assert1(src1==src2);
+    for (i = 0; i < width; i++) {
+        /* Average components of pixel pairs; 8 uint16 per two pixels. */
+        int r_b = (input_pixel(&src1[8 * i + 0]) + input_pixel(&src1[8 * i + 4]) + 1) >> 1;
+        int   g = (input_pixel(&src1[8 * i + 1]) + input_pixel(&src1[8 * i + 5]) + 1) >> 1;
+        int b_r = (input_pixel(&src1[8 * i + 2]) + input_pixel(&src1[8 * i + 6]) + 1) >> 1;
+
+        dstU[i]= (ru*r + gu*g + bu*b + (0x10001<<(RGB2YUV_SHIFT-1))) >> RGB2YUV_SHIFT;
+        dstV[i]= (rv*r + gv*g + bv*b + (0x10001<<(RGB2YUV_SHIFT-1))) >> RGB2YUV_SHIFT;
+    }
+}
+
+/*
+ * Generate the public VSX entry points (ToY / ToUV / ToUV_half) for one
+ * 64-bit RGBA-family pixel format.  The generated wrappers only cast the
+ * byte pointers to uint16_t and forward to the templates above.
+ */
+#define rgb64funcs(pattern, BE_LE, origin) \
+static void pattern ## 64 ## BE_LE ## ToY_c_vsx(uint8_t *_dst, const uint8_t *_src, \
+                                    const uint8_t *unused0, const uint8_t *unused1, \
+                                    int width, uint32_t *rgb2yuv) \
+{ \
+    const uint16_t *src = (const uint16_t *) _src; \
+    uint16_t *dst = (uint16_t *) _dst; \
+    rgb64ToY_c_template_vsx(dst, src, width, origin, rgb2yuv); \
+} \
+ \
+static void pattern ## 64 ## BE_LE ## ToUV_c_vsx(uint8_t *_dstU, uint8_t *_dstV, \
+                                    const uint8_t *unused0, const uint8_t *_src1, const uint8_t *_src2, \
+                                    int width, uint32_t *rgb2yuv) \
+{ \
+    const uint16_t *src1 = (const uint16_t *) _src1, \
+                   *src2 = (const uint16_t *) _src2; \
+    uint16_t *dstU = (uint16_t *) _dstU, *dstV = (uint16_t *) _dstV; \
+    rgb64ToUV_c_template_vsx(dstU, dstV, src1, src2, width, origin, rgb2yuv); \
+}\
+ \
+static void pattern ## 64 ## BE_LE ## ToUV_half_c_vsx(uint8_t *_dstU, uint8_t *_dstV, \
+                                    const uint8_t *unused0, const uint8_t *_src1, const uint8_t *_src2, \
+                                    int width, uint32_t *rgb2yuv) \
+{ \
+    const uint16_t *src1 = (const uint16_t *) _src1, \
+                   *src2 = (const uint16_t *) _src2; \
+    uint16_t *dstU = (uint16_t *) _dstU, *dstV = (uint16_t *) _dstV; \
+    rgb64ToUV_half_c_template(dstU, dstV, src1, src2, width, origin, rgb2yuv); \
+}
+
+/* Instantiate the 64-bit VSX input functions for each RGBA64/BGRA64 variant. */
+rgb64funcs(rgb, LE, AV_PIX_FMT_RGBA64LE)
+rgb64funcs(rgb, BE, AV_PIX_FMT_RGBA64BE)
+rgb64funcs(bgr, LE, AV_PIX_FMT_BGRA64LE)
+rgb64funcs(bgr, BE, AV_PIX_FMT_BGRA64BE)
+
+/**
+ * VSX-vectorized luma (Y) extraction from packed 48-bit RGB/BGR input
+ * (16 bits per component; component order and endianness selected by
+ * origin).  Eight pixels (48 bytes) per vector iteration; scalar tail
+ * for the remainder.
+ *
+ * NOTE(review): v_r1/v_b1/v_r2/v_b2 in the vector loop and r/b in the
+ * scalar tail are macros defined earlier in this file that map onto the
+ * r_b/b_r lanes according to the component order.
+ */
+static av_always_inline void rgb48ToY_c_template_vsx(uint16_t *dst,
+                                                 const uint16_t *src, int width,
+                                                 enum AVPixelFormat origin,
+                                                 int32_t *rgb2yuv)
+{
+    
+    int i, width_adj, is_BE;
+    vector unsigned short v_rd0, v_rd1, v_rd2;
+    vector unsigned short v_b_r1, v_b_r2, v_r_b1, v_r_b2, v_g1, v_g2;  
+    vector unsigned int v_dst1, v_dst2;
+    vector unsigned int shift1, shift2;
+    vector signed int v_ry, v_gy, v_by;
+    int32_t ry = rgb2yuv[RY_IDX], gy = rgb2yuv[GY_IDX], by = rgb2yuv[BY_IDX];
+    vector unsigned short v_null = vec_splats((unsigned short)0x0000);
+
+    uintptr_t src_addr = (uintptr_t)src;
+    uintptr_t dst_addr = (uintptr_t)dst;
+
+
+    /* Round width down to a multiple of 8 pixels for the vector loop. */
+    width_adj = width&(~(int)0x07);
+
+    /* Splat the coefficients/rounding constants only when the vector
+     * loop will actually run.  is_BE is only read inside that loop. */
+    if(width_adj){
+        shift1 = vec_splats((unsigned int)(0x2001<<(RGB2YUV_SHIFT-1)));
+        shift2 = vec_splats((unsigned int)RGB2YUV_SHIFT);
+        v_ry = vec_splats((signed int)ry);
+        v_gy = vec_splats((signed int)gy);
+        v_by = vec_splats((signed int)by);
+        is_BE = isBE(origin);
+    }
+
+    for (i = 0; i < width_adj; i+=8) {
+        v_rd0 = vec_vsx_ld(0, (unsigned short *)src_addr);
+        v_rd1 = vec_vsx_ld(0, (unsigned short *)(src_addr+16));
+        v_rd2 = vec_vsx_ld(0, (unsigned short *)(src_addr+32));
+
+        /* De-interleave the 3x16-bit components of 8 pixels spread over
+         * three vector registers; byte-swap when input is big-endian. */
+        if(is_BE){
+            v_r_b1 = vec_perm(v_rd0, v_rd1, 
+                     ((vector unsigned char){1, 0, 7, 6, 13, 12, 19, 18}));
+            v_r_b2 = vec_perm(v_rd1, v_rd2, 
+                     ((vector unsigned char){9, 8, 15, 14, 21, 20, 27, 26}));
+            v_g1 = vec_perm(v_rd0, v_rd1, 
+                   ((vector unsigned char){3, 2, 9, 8, 15, 14, 21, 20}));
+            v_g2 = vec_perm(v_rd1, v_rd2, 
+                   ((vector unsigned char){11, 10, 17, 16, 23, 22, 29, 28}));
+            v_b_r1 = vec_perm(v_rd0, v_rd1, 
+                     ((vector unsigned char){5, 4, 11, 10, 17, 16, 23, 22}));
+            v_b_r2 = vec_perm(v_rd1, v_rd2, 
+                     ((vector unsigned char){13, 12, 19, 18, 25, 24, 31, 30}));
+        }else{
+            v_r_b1 = vec_perm(v_rd0, v_rd1, 
+                     ((vector unsigned char){0, 1, 6, 7, 12, 13, 18, 19}));
+            v_r_b2 = vec_perm(v_rd1, v_rd2, 
+                     ((vector unsigned char){8, 9, 14, 15, 20, 21, 26, 27}));
+            v_g1 = vec_perm(v_rd0, v_rd1, 
+                   ((vector unsigned char){2, 3, 8, 9, 14, 15, 20, 21}));
+            v_g2 = vec_perm(v_rd1, v_rd2, 
+                   ((vector unsigned char){10, 11, 16, 17, 22, 23, 28, 29}));
+            v_b_r1 = vec_perm(v_rd0, v_rd1, 
+                     ((vector unsigned char){4, 5, 10, 11, 16, 17, 22, 23}));
+            v_b_r2 = vec_perm(v_rd1, v_rd2, 
+                     ((vector unsigned char){12, 13, 18, 19, 24, 25, 30, 31}));
+        }
+        
+
+        /* Widen the 16-bit components to 32 bits (data in the low half). */
+        v_r_b1 = vec_mergeh(v_r_b1, v_null);
+        v_g1 = vec_mergeh(v_g1, v_null);
+        v_b_r1 = vec_mergeh(v_b_r1, v_null);
+
+        v_r_b2 = vec_mergeh(v_r_b2, v_null);
+        v_g2 = vec_mergeh(v_g2, v_null);
+        v_b_r2 = vec_mergeh(v_b_r2, v_null);
+
+        /* Y = (ry*R + gy*G + by*B + rounding) >> RGB2YUV_SHIFT */
+        v_dst1 = (vector unsigned int)vec_mul((vector signed int)v_r1, v_ry);
+        v_dst1 = (vector unsigned int)vec_add((vector signed int)v_dst1, 
+                                              vec_mul((vector signed int)v_g1, v_gy ));
+        v_dst1 = (vector unsigned int)vec_add((vector signed int)v_dst1, 
+                                              vec_mul((vector signed int)v_b1, v_by ));
+        v_dst1 = vec_add(v_dst1, shift1);
+        v_dst1 = vec_sr(v_dst1, shift2);
+        v_dst2 = (vector unsigned int)vec_mul((vector signed int)v_r2, v_ry);
+        v_dst2 = (vector unsigned int)vec_add((vector signed int)v_dst2, 
+                                              vec_mul((vector signed int)v_g2, v_gy ));
+        v_dst2 = (vector unsigned int)vec_add((vector signed int)v_dst2, 
+                                              vec_mul((vector signed int)v_b2,  v_by ));
+        v_dst2 = vec_add(v_dst2, shift1);
+        v_dst2 = vec_sr(v_dst2, shift2);
+        /* Narrow the eight 32-bit results back to 16 bits and store. */
+        v_dst1 = vec_perm(v_dst1, v_dst2, 
+                 ((vector unsigned char){0, 1, 4, 5, 8, 9, 12, 13, 16, 17, 20, 21, 24, 25, 28, 29}));
+        vec_vsx_st((vector unsigned char)v_dst1, 0, (unsigned char *)dst_addr); 
+
+        src_addr += 48;
+        dst_addr += 16;
+    }
+    
+    /* Scalar tail for the last (width % 8) pixels. */
+    for (i = width_adj; i < width; i++) {
+        unsigned int r_b = input_pixel(&src[i * 3 + 0]);
+        unsigned int g   = input_pixel(&src[i * 3 + 1]);
+        unsigned int b_r = input_pixel(&src[i * 3 + 2]);
+
+        dst[i] = (ry*r + gy*g + by*b + (0x2001 << (RGB2YUV_SHIFT - 1))) >> RGB2YUV_SHIFT;
+    }
+    
+}
+
+/**
+ * VSX-vectorized U/V chroma extraction from packed 48-bit RGB/BGR input
+ * (16 bits per component; component order and endianness selected by
+ * origin).  Eight pixels (48 bytes) per vector iteration; scalar tail
+ * for the remainder.  src1 must equal src2 (packed input, single plane).
+ *
+ * NOTE: v_r1/v_b1/v_r2/v_b2 in the vector loop and r/b in the scalar
+ * tail are macros defined earlier in this file that map onto the
+ * r_b/b_r lanes according to the component order.
+ */
+static av_always_inline void rgb48ToUV_c_template_vsx(uint16_t *dstU,
+                                                  uint16_t *dstV,
+                                                  const uint16_t *src1,
+                                                  const uint16_t *src2,
+                                                  int width,
+                                                  enum AVPixelFormat origin,
+                                                  int32_t *rgb2yuv)
+{
+    /* Declarations first: FFmpeg builds with -Wdeclaration-after-statement.
+     * is_BE is zero-initialized so it is defined even when width_adj == 0. */
+    int i, width_adj, is_BE = 0;
+    vector unsigned short v_rd0, v_rd1, v_rd2;
+    vector unsigned short v_b_r1, v_b_r2, v_r_b1, v_r_b2, v_g1, v_g2;
+    vector unsigned int v_dst1, v_dst2;
+    vector unsigned int shift1, shift2;
+    vector signed int v_ru, v_gu, v_bu, v_rv, v_gv, v_bv;
+    int32_t ru = rgb2yuv[RU_IDX], gu = rgb2yuv[GU_IDX], bu = rgb2yuv[BU_IDX],
+           rv = rgb2yuv[RV_IDX], gv = rgb2yuv[GV_IDX], bv = rgb2yuv[BV_IDX];
+    vector unsigned short v_null = vec_splats((unsigned short)0x0000);
+
+    uintptr_t src_addr = (uintptr_t)src1;
+    uintptr_t dstU_addr = (uintptr_t)dstU;
+    uintptr_t dstV_addr = (uintptr_t)dstV;
+
+    av_assert1(src1 == src2);
+
+    /* Round width down to a multiple of 8 pixels for the vector loop. */
+    width_adj = width&(~(int)0x07);
+
+    if(width_adj){
+        shift1 = vec_splats((unsigned int)(0x10001<<(RGB2YUV_SHIFT-1)));
+        shift2 = vec_splats((unsigned int)RGB2YUV_SHIFT);
+        v_ru = vec_splats((signed int)ru);
+        v_gu = vec_splats((signed int)gu);
+        v_bu = vec_splats((signed int)bu);
+        v_rv = vec_splats((signed int)rv);
+        v_gv = vec_splats((signed int)gv);
+        v_bv = vec_splats((signed int)bv);
+        is_BE = isBE(origin);
+    }
+
+    for (i = 0; i < width_adj; i+=8) {
+        v_rd0 = vec_vsx_ld(0, (unsigned short *)src_addr);
+        v_rd1 = vec_vsx_ld(0, (unsigned short *)(src_addr+16));
+        v_rd2 = vec_vsx_ld(0, (unsigned short *)(src_addr+32));
+
+        /* De-interleave the 3x16-bit components of 8 pixels spread over
+         * three vector registers; byte-swap when input is big-endian. */
+        if(is_BE){
+            v_r_b1 = vec_perm(v_rd0, v_rd1,
+                     ((vector unsigned char){1, 0, 7, 6, 13, 12, 19, 18}));
+            v_r_b2 = vec_perm(v_rd1, v_rd2,
+                     ((vector unsigned char){9, 8, 15, 14, 21, 20, 27, 26}));
+            v_g1 = vec_perm(v_rd0, v_rd1,
+                   ((vector unsigned char){3, 2, 9, 8, 15, 14, 21, 20}));
+            v_g2 = vec_perm(v_rd1, v_rd2,
+                   ((vector unsigned char){11, 10, 17, 16, 23, 22, 29, 28}));
+            v_b_r1 = vec_perm(v_rd0, v_rd1,
+                     ((vector unsigned char){5, 4, 11, 10, 17, 16, 23, 22}));
+            v_b_r2 = vec_perm(v_rd1, v_rd2,
+                     ((vector unsigned char){13, 12, 19, 18, 25, 24, 31, 30}));
+        }else{
+            v_r_b1 = vec_perm(v_rd0, v_rd1,
+                     ((vector unsigned char){0, 1, 6, 7, 12, 13, 18, 19}));
+            v_r_b2 = vec_perm(v_rd1, v_rd2,
+                     ((vector unsigned char){8, 9, 14, 15, 20, 21, 26, 27}));
+            v_g1 = vec_perm(v_rd0, v_rd1,
+                   ((vector unsigned char){2, 3, 8, 9, 14, 15, 20, 21}));
+            v_g2 = vec_perm(v_rd1, v_rd2,
+                   ((vector unsigned char){10, 11, 16, 17, 22, 23, 28, 29}));
+            v_b_r1 = vec_perm(v_rd0, v_rd1,
+                     ((vector unsigned char){4, 5, 10, 11, 16, 17, 22, 23}));
+            v_b_r2 = vec_perm(v_rd1, v_rd2,
+                     ((vector unsigned char){12, 13, 18, 19, 24, 25, 30, 31}));
+        }
+
+        /* Widen the 16-bit components to 32 bits (data in the low half). */
+        v_r_b1 = vec_mergeh(v_r_b1, v_null);
+        v_g1 = vec_mergeh(v_g1, v_null);
+        v_b_r1 = vec_mergeh(v_b_r1, v_null);
+
+        v_r_b2 = vec_mergeh(v_r_b2, v_null);
+        v_g2 = vec_mergeh(v_g2, v_null);
+        v_b_r2 = vec_mergeh(v_b_r2, v_null);
+
+        /* U = (ru*R + gu*G + bu*B + rounding) >> RGB2YUV_SHIFT */
+        v_dst1 = (vector unsigned int)vec_mul((vector signed int)v_r1, v_ru);
+        v_dst1 = (vector unsigned int)vec_add((vector signed int)v_dst1,
+                                              vec_mul((vector signed int)v_g1, v_gu));
+        v_dst1 = (vector unsigned int)vec_add((vector signed int)v_dst1,
+                                              vec_mul((vector signed int)v_b1, v_bu));
+        v_dst1 = vec_add(v_dst1, shift1);
+        v_dst1 = vec_sr(v_dst1, shift2);
+        v_dst2 = (vector unsigned int)vec_mul((vector signed int)v_r2, v_ru);
+        v_dst2 = (vector unsigned int)vec_add((vector signed int)v_dst2,
+                                      vec_mul((vector signed int)v_g2, v_gu));
+        v_dst2 = (vector unsigned int)vec_add((vector signed int)v_dst2,
+                                      vec_mul((vector signed int)v_b2, v_bu));
+        v_dst2 = vec_add(v_dst2, shift1);
+        v_dst2 = vec_sr(v_dst2, shift2);
+        /* Narrow the eight 32-bit results back to 16 bits and store. */
+        v_dst1 = vec_perm(v_dst1, v_dst2,
+                 ((vector unsigned char){0, 1, 4, 5, 8, 9, 12, 13, 16, 17, 20, 21, 24, 25, 28, 29}));
+        vec_vsx_st((vector unsigned char)v_dst1, 0, (unsigned char *)dstU_addr);
+
+        /* V = (rv*R + gv*G + bv*B + rounding) >> RGB2YUV_SHIFT */
+        v_dst1 = (vector unsigned int)vec_mul((vector signed int)v_r1, v_rv);
+        v_dst1 = (vector unsigned int)vec_add((vector signed int)v_dst1,
+                                      vec_mul((vector signed int)v_g1, v_gv));
+        v_dst1 = (vector unsigned int)vec_add((vector signed int)v_dst1,
+                                      vec_mul((vector signed int)v_b1, v_bv));
+        v_dst1 = vec_add(v_dst1, shift1);
+        v_dst1 = vec_sr(v_dst1, shift2);
+        v_dst2 = (vector unsigned int)vec_mul((vector signed int)v_r2, v_rv);
+        v_dst2 = (vector unsigned int)vec_add((vector signed int)v_dst2,
+                                      vec_mul((vector signed int)v_g2, v_gv));
+        v_dst2 = (vector unsigned int)vec_add((vector signed int)v_dst2,
+                                      vec_mul((vector signed int)v_b2, v_bv));
+        v_dst2 = vec_add(v_dst2, shift1);
+        v_dst2 = vec_sr(v_dst2, shift2);
+        v_dst1 = vec_perm(v_dst1, v_dst2,
+                 ((vector unsigned char){0, 1, 4, 5, 8, 9, 12, 13, 16, 17, 20, 21, 24, 25, 28, 29}));
+        vec_vsx_st((vector unsigned char)v_dst1, 0, (unsigned char *)dstV_addr);
+
+        src_addr += 48;   /* 8 pixels x 6 bytes */
+        dstU_addr += 16;  /* 8 samples x 2 bytes */
+        dstV_addr += 16;
+    }
+
+    /* Scalar tail for the last (width % 8) pixels. */
+    for (i = width_adj; i < width; i++) {
+        int r_b = input_pixel(&src1[i * 3 + 0]);
+        int g   = input_pixel(&src1[i * 3 + 1]);
+        int b_r = input_pixel(&src1[i * 3 + 2]);
+
+        dstU[i] = (ru*r + gu*g + bu*b + (0x10001 << (RGB2YUV_SHIFT - 1))) >> RGB2YUV_SHIFT;
+        dstV[i] = (rv*r + gv*g + bv*b + (0x10001 << (RGB2YUV_SHIFT - 1))) >> RGB2YUV_SHIFT;
+    }
+}
+
+/**
+ * Scalar U/V extraction from packed 48-bit RGB/BGR with 2:1 horizontal
+ * chroma subsampling: each output sample averages two adjacent pixels
+ * (rounded).  src1 must equal src2 (packed input, single plane).
+ *
+ * NOTE: r and b in the dstU/dstV expressions are macros defined earlier
+ * in this file that expand to r_b or b_r depending on component order.
+ */
+static av_always_inline void rgb48ToUV_half_c_template(uint16_t *dstU,
+                                                       uint16_t *dstV,
+                                                       const uint16_t *src1,
+                                                       const uint16_t *src2,
+                                                       int width,
+                                                       enum AVPixelFormat origin,
+                                                       int32_t *rgb2yuv)
+{
+    int i;
+    int32_t ru = rgb2yuv[RU_IDX], gu = rgb2yuv[GU_IDX], bu = rgb2yuv[BU_IDX];
+    int32_t rv = rgb2yuv[RV_IDX], gv = rgb2yuv[GV_IDX], bv = rgb2yuv[BV_IDX];
+
+    av_assert1(src1 == src2);
+    for (i = 0; i < width; i++) {
+        /* Average the components of two adjacent pixels (6 uint16 per pair). */
+        int r_b = (input_pixel(&src1[6 * i + 0]) +
+                   input_pixel(&src1[6 * i + 3]) + 1) >> 1;
+        int g   = (input_pixel(&src1[6 * i + 1]) +
+                   input_pixel(&src1[6 * i + 4]) + 1) >> 1;
+        int b_r = (input_pixel(&src1[6 * i + 2]) +
+                   input_pixel(&src1[6 * i + 5]) + 1) >> 1;
+
+        dstU[i] = (ru*r + gu*g + bu*b + (0x10001 << (RGB2YUV_SHIFT - 1))) >> RGB2YUV_SHIFT;
+        dstV[i] = (rv*r + gv*g + bv*b + (0x10001 << (RGB2YUV_SHIFT - 1))) >> RGB2YUV_SHIFT;
+    }
+}
+
+/* Tear down the component-order helper macros used by the 16bpc
+ * packed-RGB templates above (r/b and v_r1/v_b1/v_r2/v_b2 select the
+ * r_b/b_r lane) together with their input_pixel reader. */
+#undef r
+#undef b
+#undef v_r1
+#undef v_b1
+#undef v_r2
+#undef v_b2
+#undef input_pixel
+
+/*
+ * Generate the public VSX entry points (ToY / ToUV / ToUV_half) for one
+ * 48-bit RGB-family pixel format.  The generated wrappers only cast the
+ * byte pointers to uint16_t and forward to the templates above.
+ */
+#define rgb48funcs(pattern, BE_LE, origin)                              \
+static void pattern ## 48 ## BE_LE ## ToY_c_vsx(uint8_t *_dst,              \
+                                            const uint8_t *_src,        \
+                                            const uint8_t *unused0, const uint8_t *unused1,\
+                                            int width,                  \
+                                            uint32_t *rgb2yuv)          \
+{                                                                       \
+    const uint16_t *src = (const uint16_t *)_src;                       \
+    uint16_t *dst       = (uint16_t *)_dst;                             \
+    rgb48ToY_c_template_vsx(dst, src, width, origin, rgb2yuv);              \
+}                                                                       \
+                                                                        \
+static void pattern ## 48 ## BE_LE ## ToUV_c_vsx(uint8_t *_dstU,            \
+                                             uint8_t *_dstV,            \
+                                             const uint8_t *unused0,    \
+                                             const uint8_t *_src1,      \
+                                             const uint8_t *_src2,      \
+                                             int width,                 \
+                                             uint32_t *rgb2yuv)         \
+{                                                                       \
+    const uint16_t *src1 = (const uint16_t *)_src1,                     \
+                   *src2 = (const uint16_t *)_src2;                     \
+    uint16_t *dstU = (uint16_t *)_dstU,                                 \
+             *dstV = (uint16_t *)_dstV;                                 \
+    rgb48ToUV_c_template_vsx(dstU, dstV, src1, src2, width, origin, rgb2yuv);        \
+}                                                                       \
+                                                                        \
+static void pattern ## 48 ## BE_LE ## ToUV_half_c_vsx(uint8_t *_dstU,       \
+                                                  uint8_t *_dstV,       \
+                                                  const uint8_t *unused0,    \
+                                                  const uint8_t *_src1, \
+                                                  const uint8_t *_src2, \
+                                                  int width,            \
+                                                  uint32_t *rgb2yuv)    \
+{                                                                       \
+    const uint16_t *src1 = (const uint16_t *)_src1,                     \
+                   *src2 = (const uint16_t *)_src2;                     \
+    uint16_t *dstU = (uint16_t *)_dstU,                                 \
+             *dstV = (uint16_t *)_dstV;                                 \
+    rgb48ToUV_half_c_template(dstU, dstV, src1, src2, width, origin, rgb2yuv);   \
+}
+
+/* Instantiate the 48-bit VSX input functions for each RGB48/BGR48 variant. */
+rgb48funcs(rgb, LE, AV_PIX_FMT_RGB48LE)
+rgb48funcs(rgb, BE, AV_PIX_FMT_RGB48BE)
+rgb48funcs(bgr, LE, AV_PIX_FMT_BGR48LE)
+rgb48funcs(bgr, BE, AV_PIX_FMT_BGR48BE)
+
+/*
+ * input_pixel(i): for the 32-bit packed formats (RGBA/BGRA/ARGB/ABGR)
+ * read one aligned native-order 32-bit pixel; for the 16-bit-per-pixel
+ * formats read one 16-bit value honouring the format's endianness.
+ */
+#define input_pixel(i) ((origin == AV_PIX_FMT_RGBA ||                      \
+                         origin == AV_PIX_FMT_BGRA ||                      \
+                         origin == AV_PIX_FMT_ARGB ||                      \
+                         origin == AV_PIX_FMT_ABGR)                        \
+                        ? AV_RN32A(&src[(i) * 4])                       \
+                        : (isBE(origin) ? AV_RB16(&src[(i) * 2])        \
+                                        : AV_RL16(&src[(i) * 2])))
+
+
+/**
+ * VSX-vectorized luma (Y) extraction from packed 16/32-bit RGB formats.
+ * Each packed pixel is first shifted right by shp, then the components
+ * are extracted with maskr/shr, maskg/shg and maskb/shb and weighted by
+ * the rgb2yuv coefficients pre-scaled by rsh/gsh/bsh; the result is
+ * rounded with rnd and shifted down by (S - 6).  Eight pixels per
+ * vector iteration; scalar tail for the remainder.
+ */
+static av_always_inline void rgb16_32ToY_c_template_vsx(int16_t *dst,
+                                                        const uint8_t *src,
+                                                        int width,
+                                                        enum AVPixelFormat origin,
+                                                        int shr, int shg,
+                                                        int shb, int shp,
+                                                        int maskr, int maskg,
+                                                        int maskb, int rsh,
+                                                        int gsh, int bsh, int S,
+                                                        int32_t *rgb2yuv)
+{
+    /* is_DW/is_BE are zero-initialized so they are defined even when the
+     * vector loop does not run.  Unused v_sign/v_val locals removed. */
+    int i, width_adj, is_DW = 0, is_BE = 0;
+    vector signed short v_rd0, v_rd1, v_px;
+    vector signed short v_r1, v_r2, v_b1, v_b2, v_g1, v_g2;
+    vector signed int v_dst1, v_dst2;
+    vector signed int shift1;
+    vector signed int shift2;
+    const int ry = rgb2yuv[RY_IDX]<<rsh, gy = rgb2yuv[GY_IDX]<<gsh,
+              by = rgb2yuv[BY_IDX]<<bsh;
+    const unsigned rnd = (32<<((S)-1)) + (1<<(S-7));
+    vector unsigned short v_null = vec_splats((unsigned short)0x0000);
+
+    uintptr_t src_addr = (uintptr_t)src;
+    uintptr_t dst_addr = (uintptr_t)dst;
+
+    /* Round width down to a multiple of 8 pixels for the vector loop. */
+    width_adj = width&(~(int)0x07);
+
+    if(width_adj){
+        shift1 = vec_splats((signed int)rnd);
+        shift2 = vec_splats((signed int)((S)-6));
+        /* is_DW: 32-bit "double word" pixel layouts vs. 16-bit layouts. */
+        is_DW = (origin == AV_PIX_FMT_RGBA || origin == AV_PIX_FMT_BGRA ||
+                 origin == AV_PIX_FMT_ARGB || origin == AV_PIX_FMT_ABGR);
+        if(!is_DW)
+            is_BE = isBE(origin);
+    }
+
+    for (i = 0; i < width_adj; i+=8) {
+        v_rd0 = vec_vsx_ld(0, (signed short *)src_addr);
+
+        if(is_DW){
+            /* 32-bit pixels: a second vector load completes the 8 pixels;
+             * components are extracted with 32-bit masks/shifts. */
+            src_addr += 16;
+            v_rd1 = vec_vsx_ld(0, (signed short *)src_addr);
+
+            v_rd0 = (vector signed short)vec_sr((vector unsigned int)v_rd0,
+                                        (vector unsigned int)vec_splats((signed int)shp));
+            v_b1 = (vector signed short)vec_and((vector signed int)v_rd0,
+                                       vec_splats((signed int)maskb));
+            v_b1 = (vector signed short)vec_sr((vector unsigned int)v_b1,
+                                       (vector unsigned int)vec_splats((signed int)shb));
+            v_g1 = (vector signed short)vec_and((vector signed int)v_rd0,
+                                        vec_splats((signed int)maskg));
+            v_g1 = (vector signed short)vec_sr((vector unsigned int)v_g1,
+                                        (vector unsigned int)vec_splats((signed int)shg));
+            v_r1 = (vector signed short)vec_and((vector signed int)v_rd0,
+                                        vec_splats((signed int)maskr));
+            v_r1 = (vector signed short)vec_sr((vector unsigned int)v_r1,
+                                        (vector unsigned int)vec_splats((signed int)shr));
+
+            v_rd1 = (vector signed short)vec_sr((vector unsigned int)v_rd1,
+                                         (vector unsigned int)vec_splats((signed int)shp));
+            v_b2 = (vector signed short)vec_and((vector signed int)v_rd1,
+                                        vec_splats((signed int)maskb));
+            v_b2 = (vector signed short)vec_sr((vector unsigned int)v_b2,
+                                        (vector unsigned int)vec_splats((signed int)shb));
+            v_g2 = (vector signed short)vec_and((vector signed int)v_rd1,
+                                        vec_splats((signed int)maskg));
+            v_g2 = (vector signed short)vec_sr((vector unsigned int)v_g2,
+                                        (vector unsigned int)vec_splats((signed int)shg));
+            v_r2 = (vector signed short)vec_and((vector signed int)v_rd1,
+                                        vec_splats((signed int)maskr));
+            v_r2 = (vector signed short)vec_sr((vector unsigned int)v_r2,
+                                        (vector unsigned int)vec_splats((signed int)shr));
+        }else{
+            /* 16-bit pixels: one vector load holds all 8 pixels.  Byte-swap
+             * on big-endian input, extract with 16-bit masks/shifts, then
+             * widen to 32 bits with merge high/low. */
+            if(is_BE){
+                v_rd0 = vec_perm(v_rd0, v_rd0,
+                       ((vector unsigned char){1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14}));
+            }
+            v_px = (vector signed short)vec_sr((vector unsigned short)v_rd0,
+                                            (vector unsigned short)vec_splats((signed short)shp));
+            v_b1 = (vector signed short)vec_and(v_px,
+                                            vec_splats((signed short)maskb));
+            v_b1 = (vector signed short)vec_sr((vector unsigned short)v_b1,
+                                            (vector unsigned short)vec_splats((signed short)shb));
+            v_g1 = (vector signed short)vec_and(v_px,
+                                            vec_splats((signed short)maskg));
+            v_g1 = (vector signed short)vec_sr((vector unsigned short)v_g1,
+                                            (vector unsigned short)vec_splats((signed short)shg));
+            v_r1 = (vector signed short)vec_and(v_px,
+                                            vec_splats((signed short)maskr));
+            v_r1 = (vector signed short)vec_sr((vector unsigned short)v_r1,
+                                            (vector unsigned short)vec_splats((signed short)shr));
+
+            v_b2 = vec_mergel(v_b1, (vector signed short)v_null);
+            v_g2 = vec_mergel(v_g1, (vector signed short)v_null);
+            v_r2 = vec_mergel(v_r1, (vector signed short)v_null);
+            v_b1 = vec_mergeh(v_b1, (vector signed short)v_null);
+            v_g1 = vec_mergeh(v_g1, (vector signed short)v_null);
+            v_r1 = vec_mergeh(v_r1, (vector signed short)v_null);
+        }
+
+        /* Y = (ry*R + gy*G + by*B + rnd) >> (S - 6)
+         * (the stray debug store of v_r1 to dst was removed; it was
+         * immediately overwritten by the real result below). */
+        v_dst1 = vec_mul((vector signed int)v_r1,
+                                              vec_splats((signed int)ry));
+        v_dst1 = vec_add((vector signed int)v_dst1,
+                                              vec_mul((vector signed int)v_g1,
+                                              vec_splats((signed int)gy)));
+        v_dst1 = vec_add((vector signed int)v_dst1,
+                                              vec_mul((vector signed int)v_b1,
+                                              vec_splats((signed int)by)));
+        v_dst1 = vec_add(v_dst1, (vector signed int)shift1);
+        v_dst1 = (vector signed int)vec_sr((vector unsigned int)v_dst1, (vector unsigned int)shift2);
+
+        v_dst2 = vec_mul((vector signed int)v_r2,
+                                              vec_splats((signed int)ry));
+        v_dst2 = vec_add((vector signed int)v_dst2,
+                                              vec_mul((vector signed int)v_g2,
+                                              vec_splats((signed int)gy)));
+        v_dst2 = vec_add((vector signed int)v_dst2,
+                                              vec_mul((vector signed int)v_b2,
+                                              vec_splats((signed int)by)));
+        v_dst2 = vec_add(v_dst2, (vector signed int)shift1);
+        v_dst2 = (vector signed int)vec_sr((vector unsigned int)v_dst2, (vector unsigned int)shift2);
+
+        /* Narrow the eight 32-bit results back to 16 bits and store. */
+        v_dst1 = vec_perm(v_dst1, v_dst2,
+                 ((vector unsigned char){0, 1, 4, 5, 8, 9, 12, 13, 16, 17, 20, 21, 24, 25, 28, 29}));
+
+        vec_vsx_st((vector unsigned char)v_dst1, 0, (unsigned char *)dst_addr);
+
+        dst_addr += 16;
+        src_addr += 16;
+    }
+
+    /* Scalar tail for the last (width % 8) pixels. */
+    for (i = width_adj; i < width; i++) {
+        int px = input_pixel(i) >> shp;
+        int b  = (px & maskb) >> shb;
+        int g  = (px & maskg) >> shg;
+        int r  = (px & maskr) >> shr;
+        dst[i] = (ry * r + gy * g + by * b + rnd) >> ((S)-6);
+    }
+}
+
+
+/*
+ * VSX-accelerated equivalent of rgb16_32ToUV_c_template(): converts
+ * packed 16- or 32-bit RGB pixels into planar U and V chroma samples.
+ * The pixel layout is described by the shr/shg/shb/shp shifts and the
+ * maskr/maskg/maskb masks; rgb2yuv holds the RGB->YUV coefficients and
+ * S the fixed-point precision.  The vector loop handles 8 pixels per
+ * iteration; the scalar tail processes the remaining width % 8 pixels
+ * exactly like the C reference (via the input_pixel macro).
+ */
+static av_always_inline void rgb16_32ToUV_c_template_vsx(int16_t *dstU,
+                                                     int16_t *dstV,
+                                                     const uint8_t *src,
+                                                     int width,
+                                                     enum AVPixelFormat origin,
+                                                     int shr, int shg,
+                                                     int shb, int shp,
+                                                     int maskr, int maskg,
+                                                     int maskb, int rsh,
+                                                     int gsh, int bsh, int S,
+                                                     int32_t *rgb2yuv)
+{
+    /* is_DW/is_BE are zero-initialized: when width_adj == 0 the vector
+     * loop never runs and they would otherwise be read uninitialized
+     * (triggers -Wmaybe-uninitialized). */
+    int i, width_adj, is_DW = 0, is_BE = 0;
+    vector signed short v_rd0, v_rd1, v_px;
+    vector signed short v_r1, v_r2, v_b1, v_b2, v_g1, v_g2;
+    vector signed int v_dst1, v_dst2;
+    vector unsigned int shift1;   /* splatted rounding constant */
+    vector signed int shift2;     /* splatted final right-shift amount */
+    const int ru = rgb2yuv[RU_IDX] << rsh, gu = rgb2yuv[GU_IDX] << gsh,
+              bu = rgb2yuv[BU_IDX] << bsh, rv = rgb2yuv[RV_IDX] << rsh,
+              gv = rgb2yuv[GV_IDX] << gsh, bv = rgb2yuv[BV_IDX] << bsh;
+    const unsigned rnd = (256u<<((S)-1)) + (1<<(S-7));
+    vector unsigned short v_null = vec_splats((unsigned short)0x0000);
+
+    uintptr_t src_addr = (uintptr_t)src;
+    uintptr_t dstU_addr = (uintptr_t)dstU;
+    uintptr_t dstV_addr = (uintptr_t)dstV;
+
+    /* Round width down to a multiple of 8 for the vector loop. */
+    width_adj = width&(~(int)0x07);
+
+    if(width_adj){
+        shift1 = vec_splats((unsigned int)rnd);
+        shift2 = vec_splats((signed int)((S)-6));
+        /* 32-bit-per-pixel formats are processed on int lanes ("DW"),
+         * 16-bit formats on short lanes with optional byte swapping. */
+        is_DW = (origin == AV_PIX_FMT_RGBA || origin == AV_PIX_FMT_BGRA ||
+                 origin == AV_PIX_FMT_ARGB || origin == AV_PIX_FMT_ABGR);
+        if(!is_DW)
+            is_BE = isBE(origin);
+    }
+
+    for (i = 0; i < width_adj; i+=8) {
+        v_rd0 = vec_vsx_ld(0, (signed short *)src_addr);
+
+        if(is_DW){
+            /* 32bpp: 8 pixels span two vectors; extract R/G/B on int lanes. */
+            src_addr += 16;
+            v_rd1 = vec_vsx_ld(0, (signed short *)src_addr);
+
+            v_rd0 = (vector signed short)vec_sr((vector unsigned int)v_rd0,
+                                        (vector unsigned int)vec_splats((signed int)shp));
+            v_b1 = (vector signed short)vec_and((vector signed int)v_rd0,
+                                       vec_splats((signed int)maskb));
+            v_b1 = (vector signed short)vec_sr((vector unsigned int)v_b1,
+                                       (vector unsigned int)vec_splats((signed int)shb));
+            v_g1 = (vector signed short)vec_and((vector signed int)v_rd0,
+                                        vec_splats((signed int)maskg));
+            v_g1 = (vector signed short)vec_sr((vector unsigned int)v_g1,
+                                        (vector unsigned int)vec_splats((signed int)shg));
+            v_r1 = (vector signed short)vec_and((vector signed int)v_rd0,
+                                        vec_splats((signed int)maskr));
+            v_r1 = (vector signed short)vec_sr((vector unsigned int)v_r1,
+                                        (vector unsigned int)vec_splats((signed int)shr));
+
+            v_rd1 = (vector signed short)vec_sr((vector unsigned int)v_rd1,
+                                         (vector unsigned int)vec_splats((signed int)shp));
+            v_b2 = (vector signed short)vec_and((vector signed int)v_rd1,
+                                        vec_splats((signed int)maskb));
+            v_b2 = (vector signed short)vec_sr((vector unsigned int)v_b2,
+                                        (vector unsigned int)vec_splats((signed int)shb));
+            v_g2 = (vector signed short)vec_and((vector signed int)v_rd1,
+                                        vec_splats((signed int)maskg));
+            v_g2 = (vector signed short)vec_sr((vector unsigned int)v_g2,
+                                        (vector unsigned int)vec_splats((signed int)shg));
+            v_r2 = (vector signed short)vec_and((vector signed int)v_rd1,
+                                        vec_splats((signed int)maskr));
+            v_r2 = (vector signed short)vec_sr((vector unsigned int)v_r2,
+                                        (vector unsigned int)vec_splats((signed int)shr));
+        }else{
+             if(is_BE){
+                /* Byte-swap each 16-bit pixel to little-endian lane order. */
+                v_rd0 = vec_perm(v_rd0, v_rd0,
+                       ((vector unsigned char){1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14}));
+            }
+            v_px = (vector signed short)vec_sr((vector unsigned short)v_rd0,
+                                            (vector unsigned short)vec_splats((signed short)shp));
+            v_b1 = (vector signed short)vec_and(v_px,
+                                            vec_splats((signed short)maskb));
+            v_b1 = (vector signed short)vec_sr((vector unsigned short)v_b1,
+                                            (vector unsigned short)vec_splats((signed short)shb));
+            v_g1 = (vector signed short)vec_and(v_px,
+                                            vec_splats((signed short)maskg));
+            v_g1 = (vector signed short)vec_sr((vector unsigned short)v_g1,
+                                            (vector unsigned short)vec_splats((signed short)shg));
+            v_r1 = (vector signed short)vec_and(v_px,
+                                            vec_splats((signed short)maskr));
+            v_r1 = (vector signed short)vec_sr((vector unsigned short)v_r1,
+                                            (vector unsigned short)vec_splats((signed short)shr));
+
+            /* Widen the 8 short components to two int vectors of 4 each
+             * (big-endian merge with zero gives zero-extension). */
+            v_b2 = vec_mergel(v_b1, (vector signed short)v_null);
+            v_g2 = vec_mergel(v_g1, (vector signed short)v_null);
+            v_r2 = vec_mergel(v_r1, (vector signed short)v_null);
+            v_b1 = vec_mergeh(v_b1, (vector signed short)v_null);
+            v_g1 = vec_mergeh(v_g1, (vector signed short)v_null);
+            v_r1 = vec_mergeh(v_r1, (vector signed short)v_null);
+
+        }
+
+        /* U = (ru*r + gu*g + bu*b + rnd) >> (S-6), 4 pixels per vector. */
+        v_dst1 = vec_mul((vector signed int)v_r1,
+                                              vec_splats((signed int)ru));
+        v_dst1 = vec_add((vector signed int)v_dst1,
+                                              vec_mul((vector signed int)v_g1,
+                                              vec_splats((signed int)gu) ));
+        v_dst1 = vec_add((vector signed int)v_dst1,
+                                              vec_mul((vector signed int)v_b1,
+                                              vec_splats((signed int)bu) ));
+        v_dst1 = vec_add(v_dst1, (vector signed int)shift1);
+        v_dst1 = (vector signed int)vec_sr((vector unsigned int)v_dst1, (vector unsigned int)shift2);
+
+        v_dst2 = vec_mul((vector signed int)v_r2,
+                                              vec_splats((signed int)ru));
+        v_dst2 = vec_add((vector signed int)v_dst2,
+                                              vec_mul((vector signed int)v_g2,
+                                              vec_splats((signed int)gu) ));
+        v_dst2 = vec_add((vector signed int)v_dst2,
+                                              vec_mul((vector signed int)v_b2,
+                                              vec_splats((signed int)bu) ));
+        v_dst2 = vec_add(v_dst2, (vector signed int)shift1);
+        v_dst2 = (vector signed int)vec_sr((vector unsigned int)v_dst2, (vector unsigned int)shift2);
+
+        /* Narrow the two int vectors to 8 int16 results and store. */
+        v_dst1 = vec_perm(v_dst1, v_dst2,
+                 ((vector unsigned char){0, 1, 4, 5, 8, 9, 12, 13, 16, 17, 20, 21, 24, 25, 28, 29}));
+        vec_vsx_st((vector unsigned char)v_dst1, 0, (unsigned char *)dstU_addr);
+
+        /* Same computation with the V coefficients. */
+        v_dst1 = vec_mul((vector signed int)v_r1,
+                                              vec_splats((signed int)rv));
+        v_dst1 = vec_add((vector signed int)v_dst1,
+                                              vec_mul((vector signed int)v_g1,
+                                              vec_splats((signed int)gv) ));
+        v_dst1 = vec_add((vector signed int)v_dst1,
+                                              vec_mul((vector signed int)v_b1,
+                                              vec_splats((signed int)bv) ));
+        v_dst1 = vec_add(v_dst1, (vector signed int)shift1);
+        v_dst1 = (vector signed int)vec_sr((vector unsigned int)v_dst1, (vector unsigned int)shift2);
+
+        v_dst2 = vec_mul((vector signed int)v_r2,
+                                              vec_splats((signed int)rv));
+        v_dst2 = vec_add((vector signed int)v_dst2,
+                                                vec_mul((vector signed int)v_g2,
+                                                vec_splats((signed int)gv) ));
+        v_dst2 = vec_add((vector signed int)v_dst2,
+                                                vec_mul((vector signed int)v_b2,
+                                                vec_splats((signed int)bv) ));
+        v_dst2 = vec_add(v_dst2, (vector signed int)shift1);
+        v_dst2 = (vector signed int)vec_sr((vector unsigned int)v_dst2, (vector unsigned int)shift2);
+
+        v_dst1 = vec_perm(v_dst1, v_dst2,
+                 ((vector unsigned char){0, 1, 4, 5, 8, 9, 12, 13, 16, 17, 20, 21, 24, 25, 28, 29}));
+        vec_vsx_st((vector unsigned char)v_dst1, 0, (unsigned char *)dstV_addr);
+
+        dstU_addr += 16;
+        dstV_addr += 16;
+        src_addr += 16;
+
+    }
+
+    /* Scalar tail: identical to the C reference implementation. */
+    for (i = width_adj; i < width; i++) {
+        int px = input_pixel(i) >> shp;
+        int b  = (px & maskb)   >> shb;
+        int g  = (px & maskg)   >> shg;
+        int r  = (px & maskr)   >> shr;
+
+        dstU[i] = (ru * r + gu * g + bu * b + rnd) >> ((S)-6);
+        dstV[i] = (rv * r + gv * g + bv * b + rnd) >> ((S)-6);
+    }
+
+}
+
+/*
+ * VSX-accelerated equivalent of rgb16_32ToUV_half_c_template(): each
+ * output chroma sample averages two horizontally adjacent input pixels
+ * (horizontal 2x chroma subsampling).  The vector loop produces 8 U/V
+ * samples (16 input pixels) per iteration; the scalar tail matches the
+ * C reference.
+ *
+ * Fixes vs the previous revision:
+ *  - loop stepped i by 4 while writing 8 outputs per iteration, which
+ *    overran dstU/dstV and src; it now steps by 8,
+ *  - the second half subtracted v_g1 instead of v_g2 from r+b,
+ *  - two shift-count splats used (signed short) in unsigned-int lanes.
+ */
+static av_always_inline void rgb16_32ToUV_half_c_template_vsx(int16_t *dstU,
+                                                          int16_t *dstV,
+                                                          const uint8_t *src,
+                                                          int width,
+                                                          enum AVPixelFormat origin,
+                                                          int shr, int shg,
+                                                          int shb, int shp,
+                                                          int maskr, int maskg,
+                                                          int maskb, int rsh,
+                                                          int gsh, int bsh, int S,
+                                                          int32_t *rgb2yuv)
+{
+    /* is_DW/is_BE zero-initialized: not set when width_adj == 0. */
+    int i, width_adj, is_DW = 0, is_BE = 0;
+    vector signed short v_rd0, v_rd1;
+    vector unsigned int v_px0, v_px1;
+    vector signed int v_r2, v_g2, v_b2, v_r1, v_g1, v_b1,v_rb;
+    vector signed int v_dst1, v_dst2;
+    vector unsigned int shift1;   /* splatted rounding constant */
+    vector signed int shift2;     /* splatted final right-shift amount */
+    const int ru = rgb2yuv[RU_IDX] << rsh, gu = rgb2yuv[GU_IDX] << gsh,
+              bu = rgb2yuv[BU_IDX] << bsh, rv = rgb2yuv[RV_IDX] << rsh,
+              gv = rgb2yuv[GV_IDX] << gsh, bv = rgb2yuv[BV_IDX] << bsh;
+    const int maskgx   = ~(maskr | maskb);
+    const unsigned rnd = (256u<<(S)) + (1<<(S-6));
+    uintptr_t src_addr = (uintptr_t)src;
+    uintptr_t dstU_addr = (uintptr_t)dstU;
+    uintptr_t dstV_addr = (uintptr_t)dstV;
+
+    /* Widen the masks so the summed pair (value up to 2*max) still
+     * selects the right bits, as in the scalar half template. */
+    maskr |= maskr << 1;
+    maskb |= maskb << 1;
+    maskg |= maskg << 1;
+
+
+    width_adj = width&(~(int)0x07);
+
+    if(width_adj){
+        shift1 = vec_splats((unsigned int)rnd);
+        shift2 = vec_splats((signed int)((S)-6+1));
+        is_DW = (origin == AV_PIX_FMT_RGBA || origin == AV_PIX_FMT_BGRA ||
+                 origin == AV_PIX_FMT_ARGB || origin == AV_PIX_FMT_ABGR);
+        if(!is_DW)
+            is_BE = isBE(origin);
+    }
+
+    /* Each iteration consumes 16 input pixels and writes 8 chroma
+     * samples; i counts output samples, so it advances by 8 (not 4,
+     * which overran the buffers). */
+    for (i = 0; i < width_adj; i+=8) {
+        v_rd0 = vec_vsx_ld(0, (signed short *)src_addr);
+        src_addr += 16;
+        v_rd1 = vec_vsx_ld(0, (signed short *)src_addr);
+
+        if(is_DW){
+            /* 32bpp: split even/odd pixels, sum pairs on int lanes. */
+            v_rd0 = (vector signed short)vec_sr((vector unsigned int)v_rd0,
+                                        (vector unsigned int)vec_splats((signed int)shp));
+            v_rd1 = (vector signed short)vec_sr((vector unsigned int)v_rd1,
+                                         (vector unsigned int)vec_splats((signed int)shp));
+            v_px0 = (vector unsigned int)vec_perm(v_rd0, v_rd1,
+                       ((vector unsigned char){0, 1, 2, 3, 8, 9, 10, 11, 16, 17, 18, 19, 24, 25, 26, 27}));
+            v_px1 = (vector unsigned int)vec_perm(v_rd0, v_rd1,
+                       ((vector unsigned char){4, 5, 6, 7, 12, 13, 14, 15, 20, 21, 22, 23, 28, 29, 30, 31}));
+            v_g1 = (vector signed int)vec_and((vector unsigned int)v_px0, (vector unsigned int)vec_splats(maskgx));
+            v_g1 = (vector signed int)vec_add((vector signed int)v_g1, (vector signed int)vec_and((vector unsigned int)v_px1, (vector unsigned int)vec_splats(maskgx)));
+            v_rb = (vector signed int)vec_add(v_px0, v_px1);
+            v_rb = (vector signed int)vec_sub((vector signed int)v_rb, (vector signed int)v_g1);
+
+            v_b1 = vec_and((vector signed int)v_rb,
+                                        vec_splats((signed int)maskb));
+            v_b1 = (vector signed int)vec_sr((vector unsigned int)v_b1,
+                                        (vector unsigned int)vec_splats((signed int)shb));
+
+            if(shp ||
+                origin == AV_PIX_FMT_BGR565LE || origin == AV_PIX_FMT_BGR565BE ||
+                origin == AV_PIX_FMT_RGB565LE || origin == AV_PIX_FMT_RGB565BE){
+                v_g1 = (vector signed int)vec_sr((vector unsigned int)v_g1,
+                                        (vector unsigned int)vec_splats((signed int)shg));
+            }else{
+                v_g1 = vec_and((vector signed int)v_g1,
+                                        vec_splats((signed int)maskg));
+                v_g1 = (vector signed int)vec_sr((vector unsigned int)v_g1,
+                                        (vector unsigned int)vec_splats((signed int)shg));
+            }
+            v_r1 = vec_and((vector signed int)v_rb,
+                                        vec_splats((signed int)maskr));
+            v_r1 = (vector signed int)vec_sr((vector unsigned int)v_r1,
+                                        (vector unsigned int)vec_splats((signed int)shr));
+
+            /* Second group of 8 input pixels (outputs 4..7). */
+            src_addr += 16;
+            v_rd0 = vec_vsx_ld(0, (signed short *)src_addr);
+            src_addr += 16;
+            v_rd1 = vec_vsx_ld(0, (signed short *)src_addr);
+
+            v_rd0 = (vector signed short)vec_sr((vector unsigned int)v_rd0,
+                                        (vector unsigned int)vec_splats((signed int)shp));
+            v_rd1 = (vector signed short)vec_sr((vector unsigned int)v_rd1,
+                                         (vector unsigned int)vec_splats((signed int)shp));
+            v_px0 = (vector unsigned int)vec_perm(v_rd0, v_rd1,
+                       ((vector unsigned char){0, 1, 2, 3, 8, 9, 10, 11, 16, 17, 18, 19, 24, 25, 26, 27}));
+            v_px1 = (vector unsigned int)vec_perm(v_rd0, v_rd1,
+                       ((vector unsigned char){4, 5, 6, 7, 12, 13, 14, 15, 20, 21, 22, 23, 28, 29, 30, 31}));
+            v_g2 = (vector signed int)vec_and(v_px0, (vector unsigned int)vec_splats(maskgx));
+            v_g2 = (vector signed int)vec_add((vector signed int)v_g2, (vector signed int)vec_and(v_px1, (vector unsigned int)vec_splats(maskgx)));
+            v_rb = (vector signed int)vec_add(v_px0, v_px1);
+            /* Subtract this half's green sum (was v_g1 by mistake). */
+            v_rb = (vector signed int)vec_sub((vector signed int)v_rb, (vector signed int)v_g2);
+
+            v_b2 = vec_and((vector signed int)v_rb,
+                                        vec_splats((signed int)maskb));
+            v_b2 = (vector signed int)vec_sr((vector unsigned int)v_b2,
+                                        (vector unsigned int)vec_splats((signed int)shb));
+            if(shp ||
+                origin == AV_PIX_FMT_BGR565LE || origin == AV_PIX_FMT_BGR565BE ||
+                origin == AV_PIX_FMT_RGB565LE || origin == AV_PIX_FMT_RGB565BE){
+                v_g2 = (vector signed int)vec_sr((vector unsigned int)v_g2,
+                                        (vector unsigned int)vec_splats((signed int)shg));
+            }else{
+                v_g2 = vec_and((vector signed int)v_g2,
+                                        vec_splats((signed int)maskg));
+                v_g2 = (vector signed int)vec_sr((vector unsigned int)v_g2,
+                                        (vector unsigned int)vec_splats((signed int)shg));
+            }
+            v_r2 = vec_and((vector signed int)v_rb,
+                                        vec_splats((signed int)maskr));
+            v_r2 = (vector signed int)vec_sr((vector unsigned int)v_r2,
+                                        (vector unsigned int)vec_splats((signed int)shr));
+        }else{
+            /* 16bpp: zero-extend even/odd pixels to int lanes, byte
+             * swapping first for big-endian formats. */
+            if(is_BE){
+                v_px0 = (vector unsigned int)vec_perm(v_rd0, v_rd0,
+                       ((vector unsigned char){1, 0, 0, 0, 5, 4, 0, 0, 9, 8, 0, 0, 13, 12, 0, 0}));
+                v_px1 = (vector unsigned int)vec_perm(v_rd0, v_rd0,
+                       ((vector unsigned char){3, 2, 0, 0, 7, 6, 0, 0, 11, 10, 0, 0, 15, 14, 0, 0}));
+            }else{
+                v_px0 = (vector unsigned int)vec_perm(v_rd0, v_rd0,
+                       ((vector unsigned char){0, 1, 0, 0, 4, 5, 0, 0, 8, 9, 0, 0, 12, 13, 0, 0}));
+                v_px1 = (vector unsigned int)vec_perm(v_rd0, v_rd0,
+                       ((vector unsigned char){2, 3, 0, 0, 6, 7, 0, 0, 10, 11, 0, 0, 14, 15, 0, 0}));
+            }
+
+            v_px0 = vec_and(v_px0, vec_splats((unsigned int)0x0000FFFF));
+            v_px1 = vec_and(v_px1, vec_splats((unsigned int)0x0000FFFF));
+
+            v_px0 = (vector unsigned int)vec_sr(v_px0,
+                           (vector unsigned int)vec_splats((signed int)shp));
+            /* shift count splatted as int (was (signed short), which
+             * produced wrong unsigned-int lane values) */
+            v_px1 = (vector unsigned int)vec_sr(v_px1,
+                           (vector unsigned int)vec_splats((signed int)shp));
+
+
+            v_g1 = (vector signed int)vec_and(v_px0, (vector unsigned int)vec_splats((unsigned int)maskgx));
+            v_g1 = (vector signed int)vec_add(v_g1, (vector signed int)vec_and(v_px1, (vector unsigned int)vec_splats((signed int)maskgx)));
+            v_rb = (vector signed int)vec_add(v_px0, v_px1);
+            v_rb = (vector signed int)vec_sub(v_rb, v_g1);
+
+
+
+            v_b1 = (vector signed int)vec_and(v_rb, vec_splats((signed int)maskb));
+            v_b1 = (vector signed int)vec_sr((vector unsigned int)v_b1,
+                                            (vector unsigned int)vec_splats((signed int)shb));
+
+            if(shp ||
+                origin == AV_PIX_FMT_BGR565LE || origin == AV_PIX_FMT_BGR565BE ||
+                origin == AV_PIX_FMT_RGB565LE || origin == AV_PIX_FMT_RGB565BE){
+                v_g1 = (vector signed int)vec_sr((vector unsigned int)v_g1, (vector unsigned int)vec_splats((signed int)shg));
+            }else{
+                v_g1 = vec_and(v_g1, vec_splats((signed int)maskg));
+                v_g1 = (vector signed int)vec_sr((vector unsigned int)v_g1,
+                                            (vector unsigned int)vec_splats((signed int)shg));
+            }
+
+            v_r1 = (vector signed int)vec_and((vector signed int)v_rb,
+                                       vec_splats((signed int)maskr));
+            v_r1 = (vector signed int)vec_sr((vector unsigned int)v_r1,
+                                       (vector unsigned int)vec_splats((signed int)shr));
+            if(is_BE){
+                v_px0 = (vector unsigned int)vec_perm(v_rd1, v_rd1,
+                       ((vector unsigned char){1, 0, 0, 0, 5, 4, 0, 0, 9, 8, 0, 0, 13, 12, 0, 0}));
+                v_px1 = (vector unsigned int)vec_perm(v_rd1, v_rd1,
+                       ((vector unsigned char){3, 2, 0, 0, 7, 6, 0, 0, 11, 10, 0, 0, 15, 14, 0, 0}));
+            }else{
+                v_px0 = (vector unsigned int)vec_perm(v_rd1, v_rd1,
+                       ((vector unsigned char){0, 1, 0, 0, 4, 5, 0, 0, 8, 9, 0, 0, 12, 13, 0, 0}));
+                v_px1 = (vector unsigned int)vec_perm(v_rd1, v_rd1,
+                       ((vector unsigned char){2, 3, 0, 0, 6, 7, 0, 0, 10, 11, 0, 0, 14, 15, 0, 0}));
+            }
+
+            v_px0 = vec_and(v_px0, vec_splats((unsigned int)0x0000FFFF));
+            v_px1 = vec_and(v_px1, vec_splats((unsigned int)0x0000FFFF));
+
+            v_px0 = vec_sr((vector unsigned int)v_px0,
+                           (vector unsigned int)vec_splats((signed int)shp));
+            v_px1 = vec_sr((vector unsigned int)v_px1,
+                           (vector unsigned int)vec_splats((signed int)shp));
+
+
+            v_g2 = (vector signed int)vec_and(v_px0, (vector unsigned int)vec_splats((unsigned int)maskgx));
+            v_g2 = (vector signed int)vec_add(v_g2,
+                   (vector signed int)vec_and(v_px1, (vector unsigned int)vec_splats((signed int)maskgx)));
+            v_rb = (vector signed int)vec_add(v_px0, v_px1);
+            v_rb = (vector signed int)vec_sub(v_rb, v_g2);
+
+            v_b2 = (vector signed int)vec_and(v_rb, vec_splats((signed int)maskb));
+            v_b2 = (vector signed int)vec_sr((vector unsigned int)v_b2,
+                                            (vector unsigned int)vec_splats((signed int)shb));
+            if(shp ||
+                origin == AV_PIX_FMT_BGR565LE || origin == AV_PIX_FMT_BGR565BE ||
+                origin == AV_PIX_FMT_RGB565LE || origin == AV_PIX_FMT_RGB565BE){
+                v_g2 = (vector signed int)vec_sr((vector unsigned int)v_g2, (vector unsigned int)vec_splats((signed int)shg));
+            }else{
+                v_g2 = vec_and(v_g2,
+                                            vec_splats((signed int)maskg));
+                v_g2 = (vector signed int)vec_sr((vector unsigned int)v_g2, (vector unsigned int)vec_splats((signed int)shg));
+            }
+            v_r2 = vec_and(v_rb, vec_splats((signed int)maskr));
+            /* shift count splatted as int (was (signed short)) */
+            v_r2 = (vector signed int)vec_sr((vector unsigned int)v_r2, (vector unsigned int)vec_splats((signed int)shr));
+        }
+
+        /* U = (ru*r + gu*g + bu*b + rnd) >> (S-6+1), then narrow the
+         * two int vectors to 8 int16 results and store. */
+        v_dst1 = vec_mul((vector signed int)v_r1,
+                                              vec_splats((signed int)ru));
+        v_dst1 = vec_add((vector signed int)v_dst1,
+                                              vec_mul((vector signed int)v_g1,
+                                              vec_splats((signed int)gu) ));
+        v_dst1 = vec_add((vector signed int)v_dst1,
+                                              vec_mul((vector signed int)v_b1,
+                                              vec_splats((signed int)bu) ));
+        v_dst1 = vec_add(v_dst1, (vector signed int)shift1);
+        v_dst1 = (vector signed int)vec_sr((vector unsigned int)v_dst1, (vector unsigned int)shift2);
+
+        v_dst2 = (vector signed int)vec_mul((vector unsigned int)v_r2, vec_splats((unsigned int)ru));
+        v_dst2 = vec_add((vector signed int)v_dst2, vec_mul((vector signed int)v_g2, vec_splats((signed int)gu) ));
+        v_dst2 = vec_add((vector signed int)v_dst2, vec_mul((vector signed int)v_b2, vec_splats((signed int)bu) ));
+
+        v_dst2 = (vector signed int)vec_add((vector unsigned int)v_dst2, (vector unsigned int)shift1);
+
+        v_dst2 = (vector signed int)vec_sr((vector unsigned int)v_dst2, (vector unsigned int)shift2);
+
+        v_dst1 = vec_perm(v_dst1, v_dst2,
+                 ((vector unsigned char){0, 1, 4, 5, 8, 9, 12, 13, 16, 17, 20, 21, 24, 25, 28, 29}));
+        vec_vsx_st((vector unsigned char)v_dst1, 0, (unsigned char *)dstU_addr);
+
+        /* Same computation with the V coefficients. */
+        v_dst1 = vec_mul((vector signed int)v_r1,
+                                              vec_splats((signed int)rv));
+        v_dst1 = vec_add((vector signed int)v_dst1,
+                                              vec_mul((vector signed int)v_g1,
+                                              vec_splats((signed int)gv) ));
+        v_dst1 = vec_add((vector signed int)v_dst1,
+                                              vec_mul((vector signed int)v_b1,
+                                              vec_splats((signed int)bv) ));
+        v_dst1 = vec_add(v_dst1, (vector signed int)shift1);
+        v_dst1 = (vector signed int)vec_sr((vector unsigned int)v_dst1, (vector unsigned int)shift2);
+
+        v_dst2 = (vector signed int)vec_mul((vector unsigned int)v_r2, vec_splats((unsigned int)rv));
+        v_dst2 = vec_add((vector signed int)v_dst2,
+                                                vec_mul((vector signed int)v_g2,
+                                                vec_splats((signed int)gv) ));
+        v_dst2 = vec_add((vector signed int)v_dst2,
+                                                vec_mul((vector signed int)v_b2,
+                                                vec_splats((signed int)bv) ));
+
+        v_dst2 = (vector signed int)vec_add((vector unsigned int)v_dst2, (vector unsigned int)shift1);
+
+        v_dst2 = (vector signed int)vec_sr((vector unsigned int)v_dst2, (vector unsigned int)shift2);
+
+        v_dst1 = vec_perm(v_dst1, v_dst2,
+                 ((vector unsigned char){0, 1, 4, 5, 8, 9, 12, 13, 16, 17, 20, 21, 24, 25, 28, 29}));
+        vec_vsx_st((vector unsigned char)v_dst1, 0, (unsigned char *)dstV_addr);
+
+        dstU_addr += 16;
+        dstV_addr += 16;
+        src_addr += 16;
+
+    }
+
+    /* Scalar tail: identical to the C reference implementation. */
+    for (i = width_adj; i < width; i++) {
+
+        unsigned px0 = input_pixel(2 * i + 0) >> shp;
+        unsigned px1 = input_pixel(2 * i + 1) >> shp;
+        int b, r, g = (px0 & maskgx) + (px1 & maskgx);
+        int rb = px0 + px1 - g;
+
+        b = (rb & maskb) >> shb;
+        if (shp ||
+            origin == AV_PIX_FMT_BGR565LE || origin == AV_PIX_FMT_BGR565BE ||
+            origin == AV_PIX_FMT_RGB565LE || origin == AV_PIX_FMT_RGB565BE) {
+            g >>= shg;
+        } else {
+            g = (g & maskg) >> shg;
+        }
+        r = (rb & maskr) >> shr;
+
+        dstU[i] = (ru * r + gu * g + bu * b + (unsigned)rnd) >> ((S)-6+1);
+        dstV[i] = (rv * r + gv * g + bv * b + (unsigned)rnd) >> ((S)-6+1);
+    }
+
+}
+
+#undef input_pixel
+
+/*
+ * Instantiate the three VSX input functions (<name>ToY_c_vsx,
+ * <name>ToUV_c_vsx, <name>ToUV_half_c_vsx) for one packed RGB pixel
+ * format, forwarding the per-format shift/mask/precision constants to
+ * the shared rgb16_32To*_c_template_vsx() implementations above.
+ */
+#define rgb16_32_wrapper(fmt, name, shr, shg, shb, shp, maskr,          \
+                         maskg, maskb, rsh, gsh, bsh, S)                \
+static void name ## ToY_c_vsx(uint8_t *dst, const uint8_t *src, const uint8_t *unused1, const uint8_t *unused2,            \
+                          int width, uint32_t *tab)                     \
+{                                                                       \
+    rgb16_32ToY_c_template_vsx((int16_t*)dst, src, width, fmt, shr, shg, shb, shp,    \
+                           maskr, maskg, maskb, rsh, gsh, bsh, S, tab); \
+}                                                                       \
+                                                                        \
+static void name ## ToUV_c_vsx(uint8_t *dstU, uint8_t *dstV,                \
+                           const uint8_t *unused0, const uint8_t *src, const uint8_t *dummy,    \
+                           int width, uint32_t *tab)                    \
+{                                                                       \
+    rgb16_32ToUV_c_template_vsx((int16_t*)dstU, (int16_t*)dstV, src, width, fmt,                \
+                            shr, shg, shb, shp,                         \
+                            maskr, maskg, maskb, rsh, gsh, bsh, S, tab);\
+}                                                                       \
+                                                                        \
+static void name ## ToUV_half_c_vsx(uint8_t *dstU, uint8_t *dstV,           \
+                                const uint8_t *unused0, const uint8_t *src,                     \
+                                const uint8_t *dummy,                   \
+                                int width, uint32_t *tab)               \
+{                                                                       \
+    rgb16_32ToUV_half_c_template_vsx((int16_t*)dstU, (int16_t*)dstV, src, width, fmt,           \
+                                 shr, shg, shb, shp,                    \
+                                 maskr, maskg, maskb,                   \
+                                 rsh, gsh, bsh, S, tab);                \
+}
+
+/* Instantiations for every supported packed RGB format.  Arguments:
+ * pixfmt, name, shr, shg, shb, shp, maskr, maskg, maskb, rsh, gsh,
+ * bsh, S (fixed-point precision used for rounding and final shift). */
+rgb16_32_wrapper(AV_PIX_FMT_BGR32,    bgr32,  16, 0,  0, 0, 0xFF0000, 0xFF00,   0x00FF,  8, 0,  8, RGB2YUV_SHIFT + 8)
+rgb16_32_wrapper(AV_PIX_FMT_BGR32_1,  bgr321, 16, 0,  0, 8, 0xFF0000, 0xFF00,   0x00FF,  8, 0,  8, RGB2YUV_SHIFT + 8)
+rgb16_32_wrapper(AV_PIX_FMT_RGB32,    rgb32,   0, 0, 16, 0,   0x00FF, 0xFF00, 0xFF0000,  8, 0,  8, RGB2YUV_SHIFT + 8)
+rgb16_32_wrapper(AV_PIX_FMT_RGB32_1,  rgb321,  0, 0, 16, 8,   0x00FF, 0xFF00, 0xFF0000,  8, 0,  8, RGB2YUV_SHIFT + 8)
+rgb16_32_wrapper(AV_PIX_FMT_BGR565LE, bgr16le, 0, 0,  0, 0,   0x001F, 0x07E0,   0xF800, 11, 5,  0, RGB2YUV_SHIFT + 8)
+rgb16_32_wrapper(AV_PIX_FMT_BGR555LE, bgr15le, 0, 0,  0, 0,   0x001F, 0x03E0,   0x7C00, 10, 5,  0, RGB2YUV_SHIFT + 7)
+rgb16_32_wrapper(AV_PIX_FMT_BGR444LE, bgr12le, 0, 0,  0, 0,   0x000F, 0x00F0,   0x0F00,  8, 4,  0, RGB2YUV_SHIFT + 4)
+rgb16_32_wrapper(AV_PIX_FMT_RGB565LE, rgb16le, 0, 0,  0, 0,   0xF800, 0x07E0,   0x001F,  0, 5, 11, RGB2YUV_SHIFT + 8)
+rgb16_32_wrapper(AV_PIX_FMT_RGB555LE, rgb15le, 0, 0,  0, 0,   0x7C00, 0x03E0,   0x001F,  0, 5, 10, RGB2YUV_SHIFT + 7)
+rgb16_32_wrapper(AV_PIX_FMT_RGB444LE, rgb12le, 0, 0,  0, 0,   0x0F00, 0x00F0,   0x000F,  0, 4,  8, RGB2YUV_SHIFT + 4)
+rgb16_32_wrapper(AV_PIX_FMT_BGR565BE, bgr16be, 0, 0,  0, 0,   0x001F, 0x07E0,   0xF800, 11, 5,  0, RGB2YUV_SHIFT + 8)
+rgb16_32_wrapper(AV_PIX_FMT_BGR555BE, bgr15be, 0, 0,  0, 0,   0x001F, 0x03E0,   0x7C00, 10, 5,  0, RGB2YUV_SHIFT + 7)
+rgb16_32_wrapper(AV_PIX_FMT_BGR444BE, bgr12be, 0, 0,  0, 0,   0x000F, 0x00F0,   0x0F00,  8, 4,  0, RGB2YUV_SHIFT + 4)
+rgb16_32_wrapper(AV_PIX_FMT_RGB565BE, rgb16be, 0, 0,  0, 0,   0xF800, 0x07E0,   0x001F,  0, 5, 11, RGB2YUV_SHIFT + 8)
+rgb16_32_wrapper(AV_PIX_FMT_RGB555BE, rgb15be, 0, 0,  0, 0,   0x7C00, 0x03E0,   0x001F,  0, 5, 10, RGB2YUV_SHIFT + 7)
+rgb16_32_wrapper(AV_PIX_FMT_RGB444BE, rgb12be, 0, 0,  0, 0,   0x0F00, 0x00F0,   0x000F,  0, 4,  8, RGB2YUV_SHIFT + 4)
+
+/* Planar GBR (separate 8-bit g/b/r planes) -> U/V at half horizontal
+ * resolution.  Each output chroma sample is computed from the SUM of two
+ * horizontally adjacent input samples per plane (see the scalar tail loop),
+ * scaled with the usual RGB2YUV rounding into 15-bit unsigned intermediates.
+ * The vector loop emits 8 U and 8 V samples per pass from 16 bytes of each
+ * plane; the scalar loop handles the remainder. */
+static void gbr24pToUV_half_c_vsx(uint8_t *_dstU, uint8_t *_dstV,
+                         const uint8_t *gsrc, const uint8_t *bsrc, const uint8_t *rsrc,
+                         int width, uint32_t *rgb2yuv)
+{
+    
+    vector unsigned short v_rd0, v_rd1, v_rd2, v_rd00, v_rd01, v_rd02;
+    int i, width_adj;
+    vector unsigned int v_dst1, v_dst2;
+    vector unsigned int shift1, shift2;
+    uint16_t *dstU = (uint16_t *)_dstU;
+    uint16_t *dstV = (uint16_t *)_dstV;
+    /* chroma coefficients from the caller-supplied conversion table */
+    const int ru = rgb2yuv[RU_IDX], gu = rgb2yuv[GU_IDX], bu = rgb2yuv[BU_IDX];
+    const int rv = rgb2yuv[RV_IDX], gv = rgb2yuv[GV_IDX], bv = rgb2yuv[BV_IDX]; 
+    vector unsigned short v_null = vec_splats((unsigned short)0x0000);
+
+    uintptr_t gsrc_addr = (uintptr_t)gsrc;
+    uintptr_t bsrc_addr = (uintptr_t)bsrc;
+    uintptr_t rsrc_addr = (uintptr_t)rsrc;
+    uintptr_t dstU_addr = (uintptr_t)_dstU;
+    uintptr_t dstV_addr = (uintptr_t)_dstV;
+
+    /* number of output samples covered by the 8-wide vector loop */
+    width_adj = width&(~(int)0x07);
+
+    if(width_adj){
+        /* rounding bias and final shift, identical to the scalar expression
+         * in the tail loop below */
+        shift1 = ((vector unsigned int){(0x4001<<(RGB2YUV_SHIFT-6)),(0x4001<<(RGB2YUV_SHIFT-6)),
+                                        (0x4001<<(RGB2YUV_SHIFT-6)),(0x4001<<(RGB2YUV_SHIFT-6))} );
+        shift2 = vec_splats((unsigned int)(RGB2YUV_SHIFT-6+1));
+    }
+
+    for (i = 0; i < width_adj; i+=8) {
+        /* Each u16 lane holds one byte pair of a plane: low byte = sample 2k,
+         * high byte = sample 2k+1.  NOTE(review): this lane/byte layout
+         * assumes a little-endian host (POWER8 LE); confirm for BE builds. */
+        v_rd0 = vec_vsx_ld(0, (unsigned short *)gsrc_addr);
+        v_rd1 = vec_vsx_ld(0, (unsigned short *)bsrc_addr);
+        v_rd2 = vec_vsx_ld(0, (unsigned short *)rsrc_addr);
+
+        /* split each lane into its two bytes ... */
+        v_rd00 = vec_sr(v_rd0, vec_splats((unsigned short)8));
+        v_rd01 = vec_sr(v_rd1, vec_splats((unsigned short)8));
+        v_rd02 = vec_sr(v_rd2, vec_splats((unsigned short)8));
+        v_rd0  = vec_and(v_rd0, vec_splats((unsigned short)0xFF));
+        v_rd1  = vec_and(v_rd1, vec_splats((unsigned short)0xFF));
+        v_rd2  = vec_and(v_rd2, vec_splats((unsigned short)0xFF));
+
+        /* ... and form the pairwise sums g, b, r used by the formula */
+        v_rd0 = vec_add(v_rd0, v_rd00);
+        v_rd1 = vec_add(v_rd1, v_rd01);
+        v_rd2 = vec_add(v_rd2, v_rd02);
+
+        /* widen the eight 16-bit sums to 32 bits by interleaving with zero;
+         * mergeh -> one half of the lanes, mergel -> the other half */
+        v_rd00 = vec_mergeh(v_rd0, v_null);
+        v_rd01 = vec_mergeh(v_rd1, v_null);
+        v_rd02 = vec_mergeh(v_rd2, v_null);
+        v_rd0 = vec_mergel(v_rd0, v_null);
+        v_rd1 = vec_mergel(v_rd1, v_null);
+        v_rd2 = vec_mergel(v_rd2, v_null);
+
+        /* U = (ru*r + gu*g + bu*b + bias) >> shift, in two 4-lane halves */
+        v_dst1 = (vector unsigned int)vec_mul((vector signed int)v_rd02, 
+                                              ((vector signed int){ru,ru,ru,ru}));
+        v_dst1 = (vector unsigned int)vec_add((vector signed int)v_dst1, 
+                                              vec_mul((vector signed int)v_rd00, 
+                                              ((vector signed int){gu,gu,gu,gu}) ));
+        v_dst1 = (vector unsigned int)vec_add((vector signed int)v_dst1, 
+                                              vec_mul((vector signed int)v_rd01, 
+                                              ((vector signed int){bu,bu,bu,bu})));
+        v_dst1 = vec_add(v_dst1, shift1);
+        v_dst1 = vec_sr(v_dst1, shift2);
+        v_dst2 = (vector unsigned int)vec_mul((vector signed int)v_rd2, 
+                                              ((vector signed int){ru,ru,ru,ru}));
+        v_dst2 = (vector unsigned int)vec_add((vector signed int)v_dst2, 
+                                              vec_mul((vector signed int)v_rd0, 
+                                              ((vector signed int){gu,gu,gu,gu})));
+        v_dst2 = (vector unsigned int)vec_add((vector signed int)v_dst2, 
+                                              vec_mul((vector signed int)v_rd1,  
+                                              ((vector signed int){bu,bu,bu,bu})));
+        v_dst2 = vec_add(v_dst2, shift1);
+        v_dst2 = vec_sr(v_dst2, shift2);
+        /* narrow the eight 32-bit results back to u16 and store 8 U samples */
+        v_dst1 = vec_perm(v_dst1, v_dst2, 
+                 ((vector unsigned char){0, 1, 4, 5, 8, 9, 12, 13, 16, 17, 20, 21, 24, 25, 28, 29}));
+        vec_vsx_st((vector unsigned char)v_dst1, 0, (unsigned char *)dstU_addr); dstU_addr+=16;
+
+        /* same computation with the V coefficients */
+        v_dst1 = (vector unsigned int)vec_mul((vector signed int)v_rd02, 
+                                              ((vector signed int){rv,rv,rv,rv}));
+        v_dst1 = (vector unsigned int)vec_add((vector signed int)v_dst1, 
+                                              vec_mul((vector signed int)v_rd00, 
+                                              ((vector signed int){gv,gv,gv,gv})));
+        v_dst1 = (vector unsigned int)vec_add((vector signed int)v_dst1, 
+                                              vec_mul((vector signed int)v_rd01, 
+                                              ((vector signed int){bv,bv,bv,bv})));
+        v_dst1 = vec_add(v_dst1, shift1);
+        v_dst1 = vec_sr(v_dst1, shift2);
+        v_dst2 = (vector unsigned int)vec_mul((vector signed int)v_rd2, 
+                                              ((vector signed int){rv,rv,rv,rv}));
+        v_dst2 = (vector unsigned int)vec_add((vector signed int)v_dst2, 
+                                              vec_mul((vector signed int)v_rd0, 
+                                              ((vector signed int){gv,gv,gv,gv})));
+        v_dst2 = (vector unsigned int)vec_add((vector signed int)v_dst2, 
+                                              vec_mul((vector signed int)v_rd1, 
+                                              ((vector signed int){bv,bv,bv,bv})));
+        v_dst2 = vec_add(v_dst2, shift1);
+        v_dst2 = vec_sr(v_dst2, shift2);
+        v_dst1 = vec_perm(v_dst1, v_dst2, 
+                 ((vector unsigned char){0, 1, 4, 5, 8, 9, 12, 13, 16, 17, 20, 21, 24, 25, 28, 29}));
+        vec_vsx_st((vector unsigned char)v_dst1, 0, (unsigned char *)dstV_addr); dstV_addr+=16;
+
+        gsrc_addr += 16;
+        bsrc_addr += 16;
+        rsrc_addr += 16;
+    }
+    /* scalar reference path for the leftover samples */
+    for (i = width_adj; i < width; i++) {
+        unsigned int g   = gsrc[2*i] + gsrc[2*i+1];
+        unsigned int b   = bsrc[2*i] + bsrc[2*i+1];
+        unsigned int r   = rsrc[2*i] + rsrc[2*i+1];
+
+        dstU[i] = (ru*r + gu*g + bu*b + (0x4001<<(RGB2YUV_SHIFT-6))) >> (RGB2YUV_SHIFT-6+1);
+        dstV[i] = (rv*r + gv*g + bv*b + (0x4001<<(RGB2YUV_SHIFT-6))) >> (RGB2YUV_SHIFT-6+1);
+    }
+    
+}
+
+/* Extract the 16-bit alpha of packed RGBA64LE pixels (4th u16 of each 8-byte
+ * pixel, cf. the scalar AV_RL16(src + 4*i + 3)) into a packed int16 plane.
+ * 8 pixels (64 bytes) per vector pass; remainder handled scalar. */
+static void rgba64leToA_c_vsx(uint8_t *_dst, const uint8_t *_src, const uint8_t *unused1,
+                          const uint8_t *unused2, int width, uint32_t *unused)
+{
+    
+    int16_t *dst = (int16_t *)_dst;
+    const uint16_t *src = (const uint16_t *)_src;
+    int i, width_adj;
+    vector unsigned short v_rd0, v_rd1, v_rd2, v_rd3, v_dst;
+
+    uintptr_t src_addr = (uintptr_t)_src;
+    uintptr_t dst_addr = (uintptr_t)_dst;
+
+    // compute integral number of vector-length items and length of final fragment
+    width_adj = width & (~(int)0x07);
+
+    for ( i = 0; i < width_adj; i += 8) {
+        v_rd0 = vec_vsx_ld(0, (unsigned short *)src_addr);
+        v_rd1 = vec_vsx_ld(0, (unsigned short *)(src_addr + 16));
+        v_rd2 = vec_vsx_ld(0, (unsigned short *)(src_addr + 32));
+        v_rd3 = vec_vsx_ld(0, (unsigned short *)(src_addr + 48));
+
+        /* Gather the alpha byte pair (bytes 6..7) of each pixel.
+         * NOTE(review): these permute literals specify only 8 of the 16
+         * selector bytes; the compiler zero-fills the rest, and the two
+         * vec_sld steps below splice the two half-results into one vector --
+         * confirm this produces the intended lane order on LE hosts. */
+        v_dst = vec_perm(v_rd0, v_rd1, ((vector unsigned char){6, 7, 14, 15, 22, 23, 30, 31}));
+        v_rd0 = vec_perm(v_rd2, v_rd3, ((vector unsigned char){6, 7, 14, 15, 22, 23, 30, 31}));
+        v_dst = vec_sld(v_dst, v_dst, 8);
+        v_dst = vec_sld(v_rd0, v_dst, 8);
+        vec_vsx_st((vector unsigned char)v_dst, 0, (unsigned char *)dst_addr);
+
+        src_addr += 64;
+        dst_addr += 16;
+    }
+
+    /* scalar reference path for the leftover pixels */
+    for (i = width_adj; i < width; i++) {
+        dst[i]= AV_RL16(src + 4*i + 3);
+    }
+    
+}
+
+/* Extract the 16-bit alpha of packed RGBA64BE pixels (4th u16 of each 8-byte
+ * pixel, byte-swapped to native order, cf. the scalar AV_RB16(src + 4*i + 3)).
+ * Same structure as rgba64leToA_c_vsx, but the permute selectors pick each
+ * alpha byte pair in swapped order ({7,6}, {15,14}, ...) to convert from
+ * big-endian -- NOTE(review): assumes a little-endian host. */
+static void rgba64beToA_c_vsx(uint8_t *_dst, const uint8_t *_src, 
+                              const uint8_t *unused1, const uint8_t *unused2, 
+                              int width, uint32_t *unused)
+{
+    
+    int16_t *dst = (int16_t *)_dst;
+    const uint16_t *src = (const uint16_t *)_src;
+    int i, width_adj;
+    vector unsigned short v_rd0, v_rd1, v_rd2, v_rd3, v_dst;
+
+    uintptr_t src_addr = (uintptr_t)_src;
+    uintptr_t dst_addr = (uintptr_t)_dst;
+
+    // compute integral number of vector-length items and length of final fragment
+    width_adj = width & (~(int)0x07);
+
+    for ( i = 0; i < width_adj; i += 8) {
+        v_rd0 = vec_vsx_ld(0, (unsigned short *)src_addr);
+        v_rd1 = vec_vsx_ld(0, (unsigned short *)(src_addr + 16));
+        v_rd2 = vec_vsx_ld(0, (unsigned short *)(src_addr + 32));
+        v_rd3 = vec_vsx_ld(0, (unsigned short *)(src_addr + 48));
+
+        /* NOTE(review): only 8 of 16 selector bytes are given (rest are
+         * zero-filled by the compiler); the vec_sld pair below splices the
+         * two half-results -- confirm intended lane order. */
+        v_dst = vec_perm(v_rd0, v_rd1, ((vector unsigned char){7, 6, 15, 14, 23, 22, 31, 30}));
+        v_rd0 = vec_perm(v_rd2, v_rd3, ((vector unsigned char){7, 6, 15, 14, 23, 22, 31, 30}));
+        v_dst = vec_sld(v_dst, v_dst, 8);
+        v_dst = vec_sld(v_rd0, v_dst, 8);
+        vec_vsx_st((vector unsigned char)v_dst, 0, (unsigned char *)dst_addr);
+
+        src_addr += 64;
+        dst_addr += 16;
+    }
+
+    /* scalar reference path for the leftover pixels */
+    for (i = width_adj; i < width; i++) {
+        dst[i]= AV_RB16(src + 4*i + 3);
+    }
+    
+}
+
+/* ABGR (alpha in byte 0 of each 4-byte pixel) -> 14-bit alpha plane:
+ * dst[i] = src[4*i] << 6.  8 pixels (32 bytes) per vector pass. */
+static void abgrToA_c_vsx(uint8_t *_dst, const uint8_t *src, 
+                          const uint8_t *unused1, const uint8_t *unused2, 
+                          int width, uint32_t *unused)
+{
+    
+    int16_t *dst = (int16_t *)_dst;
+    int i, width_adj;
+    vector unsigned short v_rd0, v_rd1, v_dst;
+    vector unsigned short v_FF = vec_splats((unsigned short)0x00FF);
+
+    uintptr_t src_addr = (uintptr_t)src;
+    uintptr_t dst_addr = (uintptr_t)dst;
+
+    // compute integral number of vector-length items and length of final fragment
+    width_adj = width & (~(int)0x07);
+
+    for ( i = 0; i < width_adj; i += 8) {
+        v_rd0 = vec_vsx_ld(0, (unsigned short *)(src_addr));
+        v_rd1 = vec_vsx_ld(0, (unsigned short *)(src_addr + 16));
+
+        /* keep the low byte of every u16 lane; on an LE host the even lanes
+         * (0,2,4,6) then hold the alpha bytes 0,4,8,12 -- NOTE(review):
+         * layout assumes little endian */
+        v_rd0 = vec_and(v_rd0, v_FF);
+        v_rd1 = vec_and(v_rd1, v_FF);
+
+        /* scale 8-bit alpha into the 14-bit intermediate range */
+        v_rd0 = vec_sl(v_rd0, vec_splats((unsigned short)6));
+        v_rd1 = vec_sl(v_rd1, vec_splats((unsigned short)6));
+
+        /* pack the even u16 lanes of both vectors into 8 contiguous outputs */
+        v_dst = vec_perm(v_rd0, v_rd1, 
+                ((vector unsigned char){0, 1, 4, 5, 8, 9, 12, 13, 16, 17, 20, 21, 24, 25, 28, 29}));
+        vec_vsx_st((vector unsigned char)v_dst, 0, (unsigned char *)dst_addr);
+
+        src_addr += 32;
+        dst_addr += 16;
+    }
+
+    /* scalar reference path for the leftover pixels */
+    for (i = width_adj; i < width; i++) {
+        dst[i]= src[4*i]<<6;
+    }
+    
+}
+
+/* RGBA (alpha in byte 3 of each 4-byte pixel) -> 14-bit alpha plane:
+ * dst[i] = src[4*i+3] << 6.  8 pixels (32 bytes) per vector pass. */
+static void rgbaToA_c_vsx(uint8_t *_dst, const uint8_t *src, 
+                          const uint8_t *unused1, const uint8_t *unused2,
+                          int width, uint32_t *unused)
+{
+    
+    int16_t *dst = (int16_t *)_dst;
+    int i, width_adj;
+    vector unsigned short v_rd0, v_rd1, v_dst;
+    vector unsigned short v_FF = vec_splats((unsigned short)0x00FF);
+
+    uintptr_t src_addr = (uintptr_t)src;
+    uintptr_t dst_addr = (uintptr_t)dst;
+
+    // compute integral number of vector-length items and length of final fragment
+    width_adj = width & (~(int)0x07);
+
+    for ( i = 0; i < width_adj; i += 8) {
+        v_rd0 = vec_vsx_ld(0, (unsigned short *)(src_addr));
+        v_rd1 = vec_vsx_ld(0, (unsigned short *)(src_addr + 16));
+
+        /* rotate by 13 bytes so each pixel's alpha byte (offset 3) lands in
+         * the low byte of a lane that the mask/perm below extracts --
+         * NOTE(review): vec_sld byte numbering is endian-sensitive; this is
+         * written for a POWER8 LE host, confirm for BE */
+        v_rd0 = vec_sld(v_rd0, v_rd0, 13);
+        v_rd1 = vec_sld(v_rd1, v_rd1, 13);
+
+        v_rd0 = vec_and(v_rd0, v_FF);
+        v_rd1 = vec_and(v_rd1, v_FF);
+
+        /* scale 8-bit alpha into the 14-bit intermediate range */
+        v_rd0 = vec_sl(v_rd0, vec_splats((unsigned short)6));
+        v_rd1 = vec_sl(v_rd1, vec_splats((unsigned short)6));
+
+        /* pack the even u16 lanes of both vectors into 8 contiguous outputs */
+        v_dst = vec_perm(v_rd0, v_rd1, 
+                ((vector unsigned char){0, 1, 4, 5, 8, 9, 12, 13, 16, 17, 20, 21, 24, 25, 28, 29}));
+        vec_vsx_st((vector unsigned char)v_dst, 0, (unsigned char *)dst_addr);
+
+        src_addr += 32;
+        dst_addr += 16;
+    }
+
+    /* scalar reference path for the leftover pixels */
+    for (i = width_adj; i < width; i++) {
+        dst[i]= src[4*i+3]<<6;
+    }
+    
+}
+
+/* PAL8 -> 14-bit alpha plane: each source byte indexes the 32-bit palette;
+ * the alpha lives in the top byte of the entry (dst[i] = (pal[d] >> 24) << 6).
+ * The palette lookup itself is a scalar 8-entry gather into _pal[]; only the
+ * extract/shift/pack is vectorized. */
+static void palToA_c_vsx(uint8_t *_dst, const uint8_t *src, 
+                         const uint8_t *unused1, const uint8_t *unused2, 
+                         int width, uint32_t *pal)
+{
+    
+    int16_t *dst = (int16_t *)_dst;
+    int i, j, d, width_adj;
+    uint32_t _pal[8];                 /* staging buffer for 8 gathered entries */
+
+    vector unsigned short v_dst;
+    vector unsigned int v_rd0, v_rd1, v_rd3, v_rd4;
+    vector unsigned char sample;
+    vector unsigned int shift1;
+    vector unsigned short shift2;
+
+    uintptr_t dst_addr = (uintptr_t)_dst;
+
+    width_adj = width & (~(int)0x07);
+
+    if(width_adj){
+        /* sample packs u32 -> u16 lanes; shift1 isolates the alpha byte;
+         * shift2 scales it to the 14-bit intermediate range */
+        sample = ((vector unsigned char){0, 1, 4, 5, 8, 9, 12, 13, 16, 17, 20, 21, 24, 25, 28, 29});
+        shift1 = ((vector unsigned int){24, 24, 24, 24});
+        shift2 = vec_splats((unsigned short)6);
+    }
+    for ( i = 0; i < width_adj; i += 8) {
+        /* scalar gather: no vector indexed load is available here */
+        for( j=0; j<8; ++j)
+            _pal[j] = pal[src[j]];
+        
+        v_rd0 = vec_vsx_ld(0, (unsigned int *)_pal);
+        v_rd1 = vec_vsx_ld(0, (unsigned int *)(&_pal[4]));
+        v_rd3 = vec_sr(v_rd0, shift1);
+        v_rd4 = vec_sr(v_rd1, shift1);
+        v_rd0 = vec_perm(v_rd3, v_rd4, sample);
+        v_dst = vec_sl((vector unsigned short)v_rd0, shift2);
+        vec_vsx_st((vector unsigned char)v_dst, 0, (unsigned char *)dst_addr);
+
+        src += 8;
+        dst_addr += 16;
+
+    }
+
+    /* scalar reference path for the leftover pixels */
+    for (i = width_adj; i < width; i++) {
+        d = *src;
+        dst[i]= (pal[d] >> 24)<<6;
+        ++src;
+    }
+    
+}
+
+/*static void palToY_c_vsx(uint8_t *_dst, const uint8_t *src, 
+                         const uint8_t *unused1, const uint8_t *unused2, 
+                         int width, uint32_t *pal)
+{
+    
+    int16_t *dst = (int16_t *)_dst;
+    int i, j, d, width_adj;
+    uint32_t _pal[8];
+
+    vector unsigned short v_dst;
+    vector unsigned int v_rd0, v_rd1, v_rd3, v_rd4;
+    vector unsigned char sample;
+    vector unsigned short shift;
+
+    uintptr_t dst_addr = (uintptr_t)_dst;
+
+    width_adj = width & (~(int)0x07);
+
+    if(width_adj){
+        sample = ((vector unsigned char){0, 1, 4, 5, 8, 9, 12, 13, 16, 17, 20, 21, 24, 25, 28, 29});
+        shift = vec_splats((unsigned short)6);
+    }
+    for ( i = 0; i < width_adj; i += 8) {
+        for( j=0; j<8; ++j)
+            _pal[j] = pal[src[j]];
+
+        v_rd0 = vec_vsx_ld(0, (unsigned int *)_pal);
+        v_rd1 = vec_vsx_ld(0, (unsigned int *)(&_pal[4]));
+        v_rd3 = vec_and(v_rd0, v_000000FF);
+        v_rd4 = vec_and(v_rd1, v_000000FF);
+        v_rd0 = vec_perm(v_rd3, v_rd4, sample);
+        v_dst = vec_sl((vector unsigned short)v_rd0, shift);
+        vec_vsx_st((vector unsigned char)v_dst, 0, (unsigned char *)dst_addr);
+
+        src += 8;
+        dst_addr += 16;
+
+    }
+
+    for (i = width_adj; i < width; i++) {
+        d= *src;
+        dst[i] = (pal[d] & 0xFF)<<6;
+        src++;
+    }
+    
+}
+
+static void palToUV_c_vsx(uint8_t *_dstU, uint8_t *_dstV,
+                          const uint8_t *unused0, const uint8_t *src1, 
+                          const uint8_t *src2, int width, uint32_t *pal)
+{
+    av_assert1(src1 == src2);
+    
+    uint16_t *dstU = (uint16_t *)_dstU;
+    int16_t *dstV = (int16_t *)_dstV;
+    int i, j, d, width_adj;
+    uint32_t _pal[8];
+    vector unsigned short v_FF = vec_splats((unsigned short)0x00FF);
+
+    vector unsigned short v_dst, v_tmp0, v_tmp1;
+    vector unsigned int v_rd0, v_rd1, shift1, shift2;
+    vector unsigned char sample;
+    vector unsigned short shift3;
+
+    uintptr_t dstU_addr = (uintptr_t)_dstU;
+    uintptr_t dstV_addr = (uintptr_t)_dstV;
+
+    
+    width_adj = width & (~(int)0x07);
+
+    if(width_adj){
+        sample = ((vector unsigned char){0, 1, 4, 5, 8, 9, 12, 13, 16, 17, 20, 21, 24, 25, 28, 29});
+        shift1 = vec_splats((unsigned int)8);
+        shift2 = vec_splats((unsigned int)16);
+        shift3 = vec_splats((unsigned short)6);
+    }
+    for ( i = 0; i < width_adj; i += 8) {
+        for( j = 0; j < 8; ++j)
+            _pal[j] = pal[src1[j]];
+
+        v_rd0 = vec_vsx_ld(0, (unsigned int *)_pal);
+        v_rd1 = vec_vsx_ld(0, (unsigned int *)(&_pal[4]));
+
+        v_tmp0 = (vector unsigned short)vec_sr(v_rd0, shift1);
+        v_tmp1 = (vector unsigned short)vec_sr(v_rd1, shift1);
+        v_dst = (vector unsigned short)vec_perm(v_tmp0, v_tmp1, sample);
+        v_tmp0 = vec_and(v_dst, v_FF);
+        v_dst = vec_sl((vector unsigned short)v_tmp0, shift3);
+        vec_vsx_st((vector unsigned char)v_dst, 0, (unsigned char *)dstU_addr);
+
+        v_tmp0 = (vector unsigned short)vec_sr(v_rd0, shift2);
+        v_tmp1 = (vector unsigned short)vec_sr(v_rd1, shift2);
+        v_dst = (vector unsigned short)vec_perm(v_tmp0, v_tmp1, sample);
+        v_tmp0 = vec_and(v_dst, v_FF);
+        v_dst = vec_sl((vector unsigned short)v_tmp0, shift3);
+        vec_vsx_st((vector unsigned char)v_dst, 0, (unsigned char *)dstV_addr);
+
+        src1 += 8;
+        dstU_addr += 16;
+        dstV_addr += 16;
+
+    }
+
+    for (i = width_adj; i < width; i++) {
+        d = pal[*src1];;
+        dstU[i] = (uint8_t)(d>> 8)<<6;
+        dstV[i] = (uint8_t)(d>>16)<<6;
+        src1++;
+    }
+    
+}*/
+
+/* Expand 1 bpp monochrome (white = bit 0, hence the inversion) to 14-bit
+ * luma: each input byte yields 8 output samples of 0 or 16383, MSB first.
+ * One 16-byte vector store per input byte.
+ * NOTE(review): after `width = (width + 7) >> 3` the tail test `width & 7`
+ * operates on the rounded BYTE count, not on the leftover pixel count, the
+ * read `src[i]` uses i == byte count (one past the loop), and the vector
+ * loop has already stored a full 8 samples for the final partial byte --
+ * confirm this matches the scalar monowhite2Y_c and cannot write past dst. */
+static void monowhite2Y_c_vsx(uint8_t *_dst, const uint8_t *src, 
+                              const uint8_t *unused1, const uint8_t *unused2, 
+                              int width, uint32_t *unused)
+{
+    
+
+    int16_t *dst = (int16_t *)_dst;
+    int i, j;
+    vector unsigned short v_rd0, v_dst;
+
+    uintptr_t dst_addr = (uintptr_t)dst;
+
+    width = (width + 7) >> 3;    /* width is now a count of input bytes */
+    for (i = 0; i < width; i++) {
+        v_rd0 = vec_splats((unsigned short)~src[i]);  /* invert: white is 0 */
+
+        /* broadcast byte shifted by 7..0 -> bit j in lane j, mask, scale */
+        v_dst = vec_sr(v_rd0, ((vector unsigned short){7, 6, 5, 4, 3, 2, 1, 0}));
+        v_dst = vec_and(v_dst, vec_splats((unsigned short)0x01));
+        v_dst = vec_mul(v_dst, vec_splats((unsigned short)16383));
+        vec_vsx_st((vector unsigned char)v_dst, 0, (unsigned char *)dst_addr);
+        dst_addr += 16;
+    }
+    if(width&7){
+        int d= ~src[i];
+        for (j = 0; j < (width&7); j++)
+            dst[8*i+j]= ((d>>(7-j))&1) * 16383;
+    }
+    
+}
+
+/* Expand 1 bpp monochrome (black = bit 0, no inversion) to 14-bit luma:
+ * each input byte yields 8 output samples of 0 or 16383, MSB first.
+ * NOTE(review): same tail-handling concern as monowhite2Y_c_vsx above --
+ * `width & 7` is tested after width was reassigned to the byte count and
+ * the vector loop already covered the final partial byte; confirm against
+ * the scalar monoblack2Y_c. */
+static void monoblack2Y_c_vsx(uint8_t *_dst, const uint8_t *src, 
+                              const uint8_t *unused1, const uint8_t *unused2, 
+                              int width, uint32_t *unused)
+{
+    
+        int16_t *dst = (int16_t *)_dst;
+    int i, j;
+    vector unsigned short v_rd0, v_dst;
+
+    uintptr_t dst_addr = (uintptr_t)dst;
+
+    width = (width + 7) >> 3;    /* width is now a count of input bytes */
+    for (i = 0; i < width; i++) {
+        v_rd0 = vec_splats((unsigned short)src[i]);
+
+        /* broadcast byte shifted by 7..0 -> bit j in lane j, mask, scale */
+        v_dst = vec_sr(v_rd0, ((vector unsigned short){7, 6, 5, 4, 3, 2, 1, 0}));
+        v_dst = vec_and(v_dst, vec_splats((unsigned short)0x01));
+        v_dst = vec_mul(v_dst, vec_splats((unsigned short)16383));
+        vec_vsx_st((vector unsigned char)v_dst, 0, (unsigned char *)dst_addr);
+        dst_addr += 16;
+    }
+    if(width&7){
+        int d= src[i];
+        for (j = 0; j < (width&7); j++)
+            dst[8*i+j]= ((d>>(7-j))&1) * 16383;
+    }
+    
+}
+
+/* Packed YUYV -> luma plane: Y is every even byte (dst[i] = src[2*i]).
+ * The permute gathers the 16 even bytes of two 16-byte loads, producing
+ * 16 output samples per iteration. */
+static void yuy2ToY_c_vsx(uint8_t *dst, const uint8_t *src, 
+                         const uint8_t *unused1, const uint8_t *unused2, 
+                         int width, uint32_t *unused)
+{
+    
+    int i, width_adj;
+
+    uintptr_t src_addr = (uintptr_t)src;
+    uintptr_t dst_addr = (uintptr_t)dst;
+
+    /* number of samples handled by the 16-wide vector loop */
+    width_adj = width & (~(int)0x0F);
+
+    for ( i = 0; i < width_adj; i += 16) {
+        vector int v_rd0 = vec_vsx_ld(0, (int *)src_addr);
+        vector int v_rd1 = vec_vsx_ld(0, (int *)(src_addr + 16));
+
+        vector int v_dst = vec_perm(v_rd0, v_rd1, 
+                           ((vector unsigned char){0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30}));
+        vec_vsx_st((vector unsigned char)v_dst, 0, (unsigned char *)dst_addr);
+
+        src_addr += 32;
+        dst_addr += 16;
+    }
+
+    /* scalar reference path for the leftover samples */
+    for (i = width_adj; i < width; i++) {
+        dst[i] = src[2 * i];
+    }
+    
+}
+
+/* Deinterleave chroma from packed YUYV: U = src1[4*i+1], V = src1[4*i+3].
+ * The vector path emits 16 U and 16 V samples per iteration (64 input
+ * bytes); the scalar loop finishes the remainder.  src2 must equal src1
+ * (asserted at the end, matching the scalar yuy2ToUV_c). */
+static void yuy2ToUV_c_vsx(uint8_t *dstU, uint8_t *dstV, 
+                           const uint8_t *unused0, const uint8_t *src1,
+                           const uint8_t *src2, int width, uint32_t *unused)
+{
+    
+    int i, width_adj;
+    vector int v_rd0, v_rd1, v_rd2, v_rd3, v_dst;
+    vector unsigned char sample1, sample2;
+
+    uintptr_t src_addr = (uintptr_t)src1;
+    uintptr_t dstU_addr = (uintptr_t)dstU;
+    uintptr_t dstV_addr = (uintptr_t)dstV;
+
+    /* number of samples handled by the 16-wide vector loop */
+    width_adj = width & (~(int)0x0F);
+
+    if(width_adj){
+        /* sample1 gathers the U bytes (offset 1 mod 4) of one 32-byte run in
+         * its first half and the V bytes (offset 3 mod 4) in its second;
+         * sample2 is the mirrored layout for the following 32-byte run, so
+         * the vec_sld splices below yield one full U and one full V vector */
+        sample1 = ((vector unsigned char){1, 5, 9, 13, 17, 21, 25, 29, 3, 7, 11, 15, 19, 23, 27, 31});
+        sample2 = ((vector unsigned char){3, 7, 11, 15, 19, 23, 27, 31, 1, 5, 9, 13, 17, 21, 25, 29});
+    }
+    for ( i = 0; i < width_adj; i += 16) {
+        v_rd0 = vec_vsx_ld(0, (int *)src_addr);
+        v_rd1 = vec_vsx_ld(0, (int *)(src_addr + 16));
+        v_rd2 = vec_perm(v_rd0, v_rd1, sample1);
+        src_addr += 32;
+
+        v_rd0 = vec_vsx_ld(0, (int *)src_addr);
+        v_rd1 = vec_vsx_ld(0, (int *)(src_addr + 16));
+        v_rd3 = vec_perm(v_rd0, v_rd1, sample2);
+        v_dst = vec_sld(v_rd2, v_rd3, 8);
+        v_dst = vec_sld(v_dst, v_dst, 8);
+        vec_vsx_st((vector unsigned char)v_dst, 0, (unsigned char *)dstU_addr);
+        v_dst = vec_sld(v_rd3, v_rd2, 8);
+        vec_vsx_st((vector unsigned char)v_dst, 0, (unsigned char *)dstV_addr);
+
+        src_addr += 32;
+        dstU_addr += 16;
+        dstV_addr += 16;
+    }
+
+    /* scalar reference path for the leftover samples */
+    for (i = width_adj; i < width; i++) {
+        dstU[i] = src1[4 * i + 1];
+        dstV[i] = src1[4 * i + 3];
+    }
+    
+    av_assert1(src1 == src2);
+}
+
+/* Deinterleave chroma from packed YVYU: V = src1[4*i+1], U = src1[4*i+3]
+ * (same structure as yuy2ToUV_c_vsx with the two chroma stores swapped).
+ * 16 U and 16 V samples per vector iteration; scalar loop for the rest.
+ * src2 must equal src1 (asserted at the end, as in the scalar version). */
+static void yvy2ToUV_c_vsx(uint8_t *dstU, uint8_t *dstV, 
+                           const uint8_t *unused0, const uint8_t *src1,
+                           const uint8_t *src2, int width, uint32_t *unused)
+{
+    
+    int i, width_adj;
+    vector int v_rd0, v_rd1, v_rd2, v_rd3, v_dst;
+    vector unsigned char sample1, sample2;
+
+    uintptr_t src_addr = (uintptr_t)src1;
+    uintptr_t dstU_addr = (uintptr_t)dstU;
+    uintptr_t dstV_addr = (uintptr_t)dstV;
+
+    /* number of samples handled by the 16-wide vector loop */
+    width_adj = width & (~(int)0x0F);
+
+    if(width_adj){
+        /* gather chroma bytes at offsets 1 and 3 (mod 4) of each 32-byte run;
+         * the mirrored second selector lets the vec_sld splices below build
+         * one full V and one full U vector */
+        sample1 = ((vector unsigned char){1, 5, 9, 13, 17, 21, 25, 29, 3, 7, 11, 15, 19, 23, 27, 31});
+        sample2 = ((vector unsigned char){3, 7, 11, 15, 19, 23, 27, 31, 1, 5, 9, 13, 17, 21, 25, 29});
+    }
+    for ( i = 0; i < width_adj; i += 16) {
+        v_rd0 = vec_vsx_ld(0, (int *)src_addr);
+        v_rd1 = vec_vsx_ld(0, (int *)(src_addr + 16));
+        v_rd2 = vec_perm(v_rd0, v_rd1, sample1);
+        src_addr += 32;
+
+        v_rd0 = vec_vsx_ld(0, (int *)src_addr);
+        v_rd1 = vec_vsx_ld(0, (int *)(src_addr + 16));
+        v_rd3 = vec_perm(v_rd0, v_rd1, sample2);
+        v_dst = vec_sld(v_rd2, v_rd3, 8);
+        v_dst = vec_sld(v_dst, v_dst, 8);
+        vec_vsx_st((vector unsigned char)v_dst, 0, (unsigned char *)dstV_addr);
+        v_dst = vec_sld(v_rd3, v_rd2, 8);
+        vec_vsx_st((vector unsigned char)v_dst, 0, (unsigned char *)dstU_addr);
+
+        src_addr += 32;
+        dstU_addr += 16;
+        dstV_addr += 16;
+    }
+
+    /* scalar reference path for the leftover samples */
+    for (i = width_adj; i < width; i++) {
+        dstV[i] = src1[4 * i + 1];
+        dstU[i] = src1[4 * i + 3];
+    }
+    
+    av_assert1(src1 == src2);
+}
+/* Byte-swap 16-bit luma samples: dst[i] = bswap16(src[i]), computed as
+ * (x << 8) | (x >> 8) on 8 lanes per iteration. */
+static void bswap16Y_c_vsx(uint8_t *_dst, const uint8_t *_src, 
+                           const uint8_t *unused1, const uint8_t *unused2, 
+                           int width, uint32_t *unused)
+{
+    
+
+    int i, width_adj;
+    vector unsigned short v_rd0, v_rd1, v_dst, v_shift;
+
+    const uint16_t *src = (const uint16_t *)_src;
+    uint16_t *dst       = (uint16_t *)_dst;
+    uintptr_t src_addr = (uintptr_t)src;
+    uintptr_t dst_addr = (uintptr_t)dst;
+
+    /* number of samples handled by the 8-wide vector loop */
+    width_adj = width & (~(int)0x07);
+
+    if(width_adj)
+        v_shift = (vector unsigned short)vec_splats((unsigned short)8);
+    for ( i = 0; i < width_adj; i += 8) {
+        v_dst = vec_vsx_ld(0, (unsigned short *)src_addr);
+
+        /* swap the two bytes of every lane */
+        v_rd0 = vec_sl((vector unsigned short)v_dst, v_shift);
+        v_rd1 = vec_sr((vector unsigned short)v_dst, v_shift);
+        v_dst = vec_or(v_rd0, v_rd1);
+
+        vec_vsx_st((vector unsigned char)v_dst, 0, (unsigned char *)dst_addr);
+        
+        src_addr += 16;
+        dst_addr += 16;
+    }
+
+    /* scalar reference path for the leftover samples */
+    for (i = width_adj; i < width; i++) {
+        dst[i] = (src[i]>>8) | (src[i]<<8);
+    }
+    
+}
+
+/* Byte-swap 16-bit chroma samples from two independent planes:
+ * dstU[i] = bswap16(src1[i]), dstV[i] = bswap16(src2[i]).
+ * Same (x << 8) | (x >> 8) trick as bswap16Y_c_vsx, 8 lanes per plane
+ * per iteration. */
+static void bswap16UV_c_vsx(uint8_t *_dstU, uint8_t *_dstV, 
+                            const uint8_t *unused0, const uint8_t *_src1, 
+                            const uint8_t *_src2, int width, uint32_t *unused)
+{
+    
+    int i, width_adj;
+    vector unsigned short v_rd0, v_rd1, v_dst, v_shift;
+
+    const uint16_t *src1 = (const uint16_t *)_src1,
+                   *src2 = (const uint16_t *)_src2;
+    uint16_t *dstU       = (uint16_t *)_dstU,
+             *dstV       = (uint16_t *)_dstV;
+    uintptr_t src1_addr = (uintptr_t)_src1,
+              src2_addr = (uintptr_t)_src2;
+    uintptr_t dstU_addr = (uintptr_t)dstU,
+              dstV_addr = (uintptr_t)dstV;
+
+    /* number of samples handled by the 8-wide vector loop */
+    width_adj = width & (~(int)0x07);
+
+    if(width_adj)
+        v_shift = (vector unsigned short)vec_splats((unsigned short)8);
+    for ( i = 0; i < width_adj; i += 8) {
+        // load to dstU
+        v_dst = vec_vsx_ld(0, (unsigned short *)src1_addr);
+        v_rd0 = vec_sl((vector unsigned short)v_dst, v_shift);
+        v_rd1 = vec_sr((vector unsigned short)v_dst, v_shift);
+        v_dst = vec_or(v_rd0, v_rd1);
+        vec_vsx_st((vector unsigned char)v_dst, 0, (unsigned char *)dstU_addr);
+        // load to dstV
+        v_dst = vec_vsx_ld(0, (unsigned short *)src2_addr);
+        v_rd0 = vec_sl((vector unsigned short)v_dst, v_shift);
+        v_rd1 = vec_sr((vector unsigned short)v_dst, v_shift);
+        v_dst = vec_or(v_rd0, v_rd1);
+        vec_vsx_st((vector unsigned char)v_dst, 0, (unsigned char *)dstV_addr);
+        //
+        src1_addr += 16;
+        src2_addr += 16;
+        dstU_addr += 16;
+        dstV_addr += 16;
+    }
+    /* scalar reference path for the leftover samples */
+    for (i = width_adj; i < width; i++) {
+        dstU[i] = (src1[i]>>8) | (src1[i]<<8);
+        dstV[i] = (src2[i]>>8) | (src2[i]<<8);
+    }
+    
+}
+
+/* 16-bit gray+alpha, little endian: copy the gray component (first u16 of
+ * each 4-byte pair) unchanged.  The permute picks u16 lanes 0,2,4,6 of two
+ * loads, giving 8 output samples per iteration. */
+static void read_ya16le_gray_c_vsx(uint8_t *dst, const uint8_t *src, 
+                                   const uint8_t *unused1, const uint8_t *unused2, 
+                                   int width, uint32_t *unused)
+{
+    
+    int i, width_adj;
+    vector int v_rd0, v_rd1, v_dst;
+    vector unsigned char sample;
+
+    uintptr_t src_addr = (uintptr_t)src;
+    uintptr_t dst_addr = (uintptr_t)dst;
+
+    /* number of samples handled by the 8-wide vector loop */
+    width_adj = width & (~(int)0x07);
+
+    if(width_adj){
+        /* select byte pairs 0-1 of every second u16 (the gray components) */
+        sample = ((vector unsigned char){0, 1, 4, 5, 8, 9, 12, 13, 16, 17, 20, 21, 24, 25, 28, 29});
+    }
+    for ( i = 0; i < width_adj; i += 8) {
+        v_rd0 = vec_vsx_ld(0, (int *)src_addr);
+        v_rd1 = vec_vsx_ld(0, (int *)(src_addr + 16));
+        v_dst = vec_perm(v_rd0, v_rd1, sample);
+        
+        vec_vsx_st((vector unsigned char)v_dst, 0, (unsigned char *)dst_addr);
+
+        src_addr += 32;
+        dst_addr += 16;
+    }
+
+    /* scalar reference path for the leftover samples */
+    for (i = width_adj; i < width; i++) {
+        AV_WN16(dst + i * 2, AV_RL16(src + i * 4));
+    }
+    
+}
+
+/* 16-bit gray+alpha, little endian: copy the alpha component (second u16 of
+ * each 4-byte pair) unchanged.  Same structure as read_ya16le_gray_c_vsx but
+ * the permute picks u16 lanes 1,3,5,7 (selector bytes 2,3 / 6,7 / ...). */
+static void read_ya16le_alpha_c_vsx(uint8_t *dst, const uint8_t *src, 
+                                    const uint8_t *unused1, const uint8_t *unused2, 
+                                    int width, uint32_t *unused)
+{
+    
+    int i, width_adj;
+    vector int v_rd0, v_rd1, v_dst;
+    vector unsigned char sample;
+
+    uintptr_t src_addr = (uintptr_t)src;
+    uintptr_t dst_addr = (uintptr_t)dst;
+
+    /* number of samples handled by the 8-wide vector loop */
+    width_adj = width & (~(int)0x07);
+
+    if(width_adj){
+        sample = ((vector unsigned char){2, 3, 6, 7, 10, 11, 14, 15, 18, 19, 22, 23, 26, 27, 30, 31});
+    }
+    for ( i = 0; i < width_adj; i += 8) {
+        v_rd0 = vec_vsx_ld(0, (int *)src_addr);
+        v_rd1 = vec_vsx_ld(0, (int *)(src_addr + 16));
+        v_dst = vec_perm(v_rd0, v_rd1, sample);
+        
+        vec_vsx_st((vector unsigned char)v_dst, 0, (unsigned char *)dst_addr);
+
+        src_addr += 32;
+        dst_addr += 16;
+    }
+
+    /* scalar reference path for the leftover samples */
+    for (i = width_adj; i < width; i++) {
+        AV_WN16(dst + i * 2, AV_RL16(src + i * 4 + 2));
+    }
+    
+}
+
+/* 16-bit gray+alpha, big endian: copy the gray component (first u16 of each
+ * 4-byte pair), byte-swapping to native order (scalar: AV_RB16 -> AV_WN16).
+ * The selector picks each gray byte pair swapped ({1,0}, {5,4}, ...) --
+ * NOTE(review): assumes a little-endian host. */
+static void read_ya16be_gray_c_vsx(uint8_t *dst, const uint8_t *src, 
+                                   const uint8_t *unused1, const uint8_t *unused2, 
+                                   int width, uint32_t *unused)
+{
+    
+
+    int i, width_adj;
+
+    vector int v_rd0, v_rd1, v_dst;
+    vector unsigned char sample;
+
+    uintptr_t src_addr = (uintptr_t)src;
+    uintptr_t dst_addr = (uintptr_t)dst;
+
+    /* number of samples handled by the 8-wide vector loop */
+    width_adj = width & (~(int)0x07);
+
+    if(width_adj){
+        sample = ((vector unsigned char){1, 0, 5, 4, 9, 8, 13, 12, 17, 16, 21, 20, 25, 24, 29, 28});
+    }
+    for ( i = 0; i < width_adj; i += 8) {
+        v_rd0 = vec_vsx_ld(0, (int *)src_addr);
+        v_rd1 = vec_vsx_ld(0, (int *)(src_addr + 16));
+        v_dst = vec_perm(v_rd0, v_rd1, sample);
+        
+        vec_vsx_st((vector unsigned char)v_dst, 0, (unsigned char *)dst_addr);
+
+        src_addr += 32;
+        dst_addr += 16;
+    }
+
+    /* scalar reference path for the leftover samples */
+    for (i = width_adj; i < width; i++) {
+        AV_WN16(dst + i * 2, AV_RB16(src + i * 4));
+    }
+    
+}
+
+/* 16-bit gray+alpha, big endian: copy the alpha component (second u16 of
+ * each 4-byte pair), byte-swapping to native order (scalar: AV_RB16).
+ * Selector pairs {3,2}, {7,6}, ... pick each alpha swapped --
+ * NOTE(review): assumes a little-endian host. */
+static void read_ya16be_alpha_c_vsx(uint8_t *dst, const uint8_t *src, 
+                                    const uint8_t *unused1, const uint8_t *unused2, 
+                                    int width, uint32_t *unused)
+{
+    
+    int i, width_adj;
+
+    vector int v_rd0, v_rd1, v_dst;
+    vector unsigned char sample;
+
+    uintptr_t src_addr = (uintptr_t)src;
+    uintptr_t dst_addr = (uintptr_t)dst;
+
+    /* number of samples handled by the 8-wide vector loop */
+    width_adj = width & (~(int)0x07);
+
+    if(width_adj){
+        sample = ((vector unsigned char){3, 2, 7, 6, 11, 10, 15, 14, 19, 18, 23, 22, 27, 26, 31, 30});
+    }
+    for ( i = 0; i < width_adj; i += 8) {
+        v_rd0 = vec_vsx_ld(0, (int *)src_addr);
+        v_rd1 = vec_vsx_ld(0, (int *)(src_addr + 16));
+        v_dst = vec_perm(v_rd0, v_rd1, sample);
+        
+        vec_vsx_st((vector unsigned char)v_dst, 0, (unsigned char *)dst_addr);
+
+        src_addr += 32;
+        dst_addr += 16;
+    }
+
+    /* scalar reference path for the leftover samples */
+    for (i = width_adj; i < width; i++) {
+        AV_WN16(dst + i * 2, AV_RB16(src + i * 4 + 2));
+    }
+    
+
+}
+
+/* AYUV64LE (each pixel = four little-endian u16: A,Y,U,V): extract the Y
+ * component (byte offset 2 of each 8-byte pixel, cf. the scalar
+ * AV_RL16(src + i*8 + 2)).  Four 16-byte loads cover 8 pixels; each permute
+ * gathers four Y values into one half of a vector (the other half is
+ * zero-filled via selector 0) and the vec_sld splices the halves. */
+static void read_ayuv64le_Y_c_vsx(uint8_t *dst, const uint8_t *src, 
+                                  const uint8_t *unused0, const uint8_t *unused1, 
+                                  int width, uint32_t *unused2)
+{
+    
+    int i, width_adj;
+
+    vector int v_rd0, v_rd1, v_rd2, v_rd3, v_dst;
+    vector unsigned char sample1, sample2;
+
+    uintptr_t src_addr = (uintptr_t)src;
+    uintptr_t dst_addr = (uintptr_t)dst;
+
+    /* number of samples handled by the 8-wide vector loop */
+    width_adj = width & (~(int)0x07);
+
+    if(width_adj){
+        sample1 = ((vector unsigned char){0, 0, 0, 0, 0, 0, 0, 0, 2, 3, 10, 11, 18, 19, 26, 27});
+        sample2 = ((vector unsigned char){2, 3, 10, 11, 18, 19, 26, 27, 0, 0, 0, 0, 0, 0, 0, 0});
+    }
+    for ( i = 0; i < width_adj; i += 8) {
+        v_rd0 = vec_vsx_ld(0, (int *)src_addr);
+        v_rd1 = vec_vsx_ld(0, (int *)(src_addr + 16));
+        v_rd2 = vec_vsx_ld(0, (int *)(src_addr+32));
+        v_rd3 = vec_vsx_ld(0, (int *)(src_addr + 48));
+
+        v_rd0 = vec_perm(v_rd0, v_rd1, sample1);
+        v_rd2 = vec_perm(v_rd2, v_rd3, sample2);
+        v_dst = vec_sld(v_rd2, v_rd0, 8);
+        vec_vsx_st((vector unsigned char)v_dst, 0, (unsigned char *)dst_addr);
+
+        src_addr += 64;
+        dst_addr += 16;
+    }
+
+    /* scalar reference path for the leftover pixels */
+    for (i = width_adj; i < width; i++) {
+        AV_WN16(dst + i * 2, AV_RL16(src + i * 8 + 2));
+    }
+    
+}
+
+
+/* Extract the 16-bit little-endian U and V components from packed
+ * AYUV64LE (8 bytes per pixel; per the scalar tail, U is at byte
+ * offset 4 and V at offset 6 of each pixel).  8 pixels per VSX
+ * iteration; scalar tail handles width % 8 leftovers. */
+static void read_ayuv64le_UV_c_vsx(uint8_t *dstU, uint8_t *dstV, 
+                                   const uint8_t *unused0, const uint8_t *src, 
+                                   const uint8_t *unused1, int width, uint32_t *unused2)
+{
+    
+    int i, width_adj;
+
+    vector int v_rd0, v_rd1, v_rd2, v_rd3, v_rd4, v_rd5, v_dst;
+    vector unsigned char sample1, sample2;
+
+    uintptr_t src_addr = (uintptr_t)src;
+    uintptr_t dstU_addr = (uintptr_t)dstU;
+    uintptr_t dstV_addr = (uintptr_t)dstV;
+
+    /* Round width down to a multiple of 8 for the vector loop. */
+    width_adj = width & (~(int)0x07);
+
+    if(width_adj){
+        /* Masks gather the V words (offset 6) and U words (offset 4)
+         * of 4 pixels each, in opposite halves so the vec_sld below can
+         * assemble full U and V vectors. */
+        sample1 = ((vector unsigned char){6, 7, 14, 15, 22, 23, 30, 31, 4, 5, 12, 13, 20, 21, 28, 29});
+        sample2 = ((vector unsigned char){4, 5, 12, 13, 20, 21, 28, 29, 6, 7, 14, 15, 22, 23, 30, 31});
+    }
+    for ( i = 0; i < width_adj; i += 8) {
+        /* 8 pixels = 64 input bytes, loaded as four 16-byte vectors. */
+        v_rd0 = vec_vsx_ld(0, (int *)src_addr);
+        v_rd1 = vec_vsx_ld(0, (int *)(src_addr + 16));
+        v_rd2 = vec_vsx_ld(0, (int *)(src_addr+32));
+        v_rd3 = vec_vsx_ld(0, (int *)(src_addr + 48));
+
+        v_rd4 = vec_perm(v_rd0, v_rd1, sample1);
+        v_rd5 = vec_perm(v_rd2, v_rd3, sample2);
+        /* Assemble the 8 U words and store them. */
+        v_dst = vec_sld(v_rd5, v_rd4, 8);
+        vec_vsx_st((vector unsigned char)v_dst, 0, (unsigned char *)dstU_addr);
+        /* Assemble the 8 V words (two shifts = rotate into place). */
+        v_dst = vec_sld(v_rd4, v_rd5, 8);
+        v_dst = vec_sld(v_dst, v_dst, 8);
+        vec_vsx_st((vector unsigned char)v_dst, 0, (unsigned char *)dstV_addr);
+
+        src_addr += 64;
+        dstU_addr += 16;
+        dstV_addr += 16;
+    }
+
+
+    /* Scalar tail for the leftover pixels. */
+    for (i = width_adj; i < width; i++) {
+        AV_WN16(dstU + i * 2, AV_RL16(src + i * 8 + 4));
+        AV_WN16(dstV + i * 2, AV_RL16(src + i * 8 + 6));
+    }
+    
+}
+
+/* Extract the 16-bit little-endian alpha component from packed
+ * AYUV64LE (8 bytes per pixel; per the scalar tail, A is the word at
+ * byte offset 0).  8 pixels per VSX iteration; scalar tail handles
+ * width % 8 leftovers. */
+static void read_ayuv64le_A_c_vsx(uint8_t *dst, const uint8_t *src, 
+                                  const uint8_t *unused0, const uint8_t *unused1, 
+                                  int width, uint32_t *unused2)
+{
+    
+    int i, width_adj;
+
+    vector int v_rd0, v_rd1, v_rd2, v_rd3, v_dst;
+    vector unsigned char sample1, sample2;
+
+    uintptr_t src_addr = (uintptr_t)src;
+    uintptr_t dst_addr = (uintptr_t)dst;
+
+    /* Round width down to a multiple of 8 for the vector loop. */
+    width_adj = width & (~(int)0x07);
+
+    if(width_adj){
+        /* Each mask gathers the A word (bytes 0-1) of 4 pixels into one
+         * half of a vector; the other half is left as don't-care. */
+        sample1 = ((vector unsigned char){0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 8, 9, 16, 17, 24, 25});
+        sample2 = ((vector unsigned char){0, 1, 8, 9, 16, 17, 24, 25, 0, 0, 0, 0, 0, 0, 0, 0});
+    }
+    for ( i = 0; i < width_adj; i += 8) {
+        /* 8 pixels = 64 input bytes, loaded as four 16-byte vectors. */
+        v_rd0 = vec_vsx_ld(0, (int *)src_addr);
+        v_rd1 = vec_vsx_ld(0, (int *)(src_addr + 16));
+        v_rd2 = vec_vsx_ld(0, (int *)(src_addr+32));
+        v_rd3 = vec_vsx_ld(0, (int *)(src_addr + 48));
+
+        v_rd0 = vec_perm(v_rd0, v_rd1, sample1);
+        v_rd2 = vec_perm(v_rd2, v_rd3, sample2);
+        /* Combine the two half-filled vectors into 8 contiguous A words. */
+        v_dst = vec_sld(v_rd2, v_rd0, 8);
+        vec_vsx_st((vector unsigned char)v_dst, 0, (unsigned char *)dst_addr);
+
+        src_addr += 64;
+        dst_addr += 16;
+    }
+
+
+    /* Scalar tail for the leftover pixels. */
+    for (i = width_adj; i < width; i++) {
+        AV_WN16(dst + i * 2, AV_RL16(src + i * 8));
+    }
+    
+}
+
+/* This is almost identical to the previous, and exists only because
+ * yuy2ToY/UV(dst, src + 1, ...) would have 100% unaligned accesses. */
+/* Extract luma from packed UYVY: per the scalar tail, the Y samples
+ * are the odd bytes of each 2-byte pair.  Processes 16 pixels per VSX
+ * iteration; the scalar loop handles the remaining width % 16 pixels. */
+static void uyvyToY_c_vsx(uint8_t *dst, const uint8_t *src, 
+                          const uint8_t *unused1, const uint8_t *unused2, 
+                          int width, uint32_t *unused)
+{
+    
+    int i, width_adj;
+
+    vector int v_rd0, v_rd1, v_dst;
+    vector unsigned char sample1;
+
+    uintptr_t src_addr = (uintptr_t)src;
+    uintptr_t dst_addr = (uintptr_t)dst;
+
+    /* Round width down to a multiple of 16 for the vector loop. */
+    width_adj = width & (~(int)0x0F);
+
+    if(width_adj){
+        /* Gather the odd (luma) bytes out of 32 input bytes. */
+        sample1 = ((vector unsigned char){1, 3, 5, 7, 9, 11, 13, 15, 17, 19, 21, 23, 25, 27, 29, 31});
+    }
+    for ( i = 0; i < width_adj; i += 16) {
+        v_rd0 = vec_vsx_ld(0, (int *)src_addr);
+        v_rd1 = vec_vsx_ld(0, (int *)(src_addr + 16));
+
+        v_dst = vec_perm(v_rd0, v_rd1, sample1);
+        vec_vsx_st((vector unsigned char)v_dst, 0, (unsigned char *)dst_addr);
+
+        src_addr += 32;
+        dst_addr += 16;
+    }
+
+
+    /* Scalar tail for the leftover pixels. */
+    for (i = width_adj; i < width; i++) {
+        dst[i] = src[2 * i + 1];
+    }
+    
+}
+
+/* Extract chroma from packed UYVY: per the scalar tail, U is byte 0
+ * and V is byte 2 of each 4-byte U Y V Y group.  16 output chroma
+ * samples per VSX iteration; scalar tail handles width % 16 leftovers. */
+static void uyvyToUV_c_vsx(uint8_t *dstU, uint8_t *dstV, 
+                           const uint8_t *unused0, const uint8_t *src1, 
+                           const uint8_t *src2, int width, uint32_t *unused)
+{
+    
+    int i, width_adj;
+
+    vector int v_rd0, v_rd1, v_rd2, v_rd3, v_rd4, v_rd5, v_dst;
+    vector unsigned char sample1, sample2;
+
+    uintptr_t src_addr = (uintptr_t)src1;
+    uintptr_t dstU_addr = (uintptr_t)dstU;
+    uintptr_t dstV_addr = (uintptr_t)dstV;
+
+    /* Round width down to a multiple of 16 for the vector loop. */
+    width_adj = width & (~(int)0x0F);
+
+    if(width_adj){
+        /* Masks gather the V bytes (offset 2) and U bytes (offset 0)
+         * of 8 groups each, in opposite halves so the vec_sld below can
+         * assemble full U and V vectors. */
+        sample1 = ((vector unsigned char){2, 6, 10, 14, 18, 22, 26, 30, 0, 4, 8, 12, 16, 20, 24, 28});
+        sample2 = ((vector unsigned char){0, 4, 8, 12, 16, 20, 24, 28, 2, 6, 10, 14, 18, 22, 26, 30});
+    }
+    for ( i = 0; i < width_adj; i += 16) {
+        /* 16 chroma pairs = 64 input bytes, loaded as four vectors. */
+        v_rd0 = vec_vsx_ld(0, (int *)src_addr);
+        v_rd1 = vec_vsx_ld(0, (int *)(src_addr + 16));
+        v_rd2 = vec_vsx_ld(0, (int *)(src_addr+32));
+        v_rd3 = vec_vsx_ld(0, (int *)(src_addr + 48));
+
+        v_rd4 = vec_perm(v_rd0, v_rd1, sample1);
+        v_rd5 = vec_perm(v_rd2, v_rd3, sample2);
+        /* Assemble the 16 U bytes and store them. */
+        v_dst = vec_sld(v_rd5, v_rd4, 8);
+        vec_vsx_st((vector unsigned char)v_dst, 0, (unsigned char *)dstU_addr);
+        /* Assemble the 16 V bytes (two shifts = rotate into place). */
+        v_dst = vec_sld(v_rd4, v_rd5, 8);
+        v_dst = vec_sld(v_dst, v_dst, 8);
+        vec_vsx_st((vector unsigned char)v_dst, 0, (unsigned char *)dstV_addr);
+
+        src_addr += 64;
+        dstU_addr += 16;
+        dstV_addr += 16;
+    }
+
+
+    /* Scalar tail for the leftover pixels. */
+    for (i = width_adj; i < width; i++) {
+        dstU[i] = src1[4 * i + 0];
+        dstV[i] = src1[4 * i + 2];
+    }
+    
+    /* Both source pointers must reference the same packed buffer. */
+    av_assert1(src1 == src2);
+}
+
+/* De-interleave one row of semi-planar chroma: even source bytes go to
+ * dst1, odd bytes to dst2.  Shared by the NV12/NV21 wrappers below,
+ * which choose the dst1/dst2 order.  16 byte pairs per VSX iteration;
+ * scalar tail handles width % 16 leftovers. */
+static av_always_inline void nvXXtoUV_c_vsx(uint8_t *dst1, uint8_t *dst2,
+                                        const uint8_t *src, int width)
+{
+    
+    int i, width_adj;
+
+    vector int v_rd0, v_rd1, v_dst;
+    vector unsigned char sample1, sample2;
+
+    uintptr_t src_addr = (uintptr_t)src;
+    uintptr_t dst1_addr = (uintptr_t)dst1;
+    uintptr_t dst2_addr = (uintptr_t)dst2;
+
+    /* Round width down to a multiple of 16 for the vector loop. */
+    width_adj = width & (~(int)0x0F);
+
+    if(width_adj){
+        /* sample1 gathers the even bytes, sample2 the odd bytes. */
+        sample1 = ((vector unsigned char){0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30});
+        sample2 = ((vector unsigned char){1, 3, 5, 7, 9, 11, 13, 15, 17, 19, 21, 23, 25, 27, 29, 31});
+    }
+    for ( i = 0; i < width_adj; i += 16) {
+        v_rd0 = vec_vsx_ld(0, (int *)src_addr);
+        v_rd1 = vec_vsx_ld(0, (int *)(src_addr + 16));
+
+        v_dst = vec_perm(v_rd0, v_rd1, sample1);
+        vec_vsx_st((vector unsigned char)v_dst, 0, (unsigned char *)dst1_addr);
+        v_dst = vec_perm(v_rd0, v_rd1, sample2);
+        vec_vsx_st((vector unsigned char)v_dst, 0, (unsigned char *)dst2_addr);
+
+        src_addr += 32;
+        dst1_addr += 16;
+        dst2_addr += 16;
+    }
+
+    /* Scalar tail for the leftover pixels. */
+    for (i = width_adj; i < width; i++) {
+        dst1[i] = src[2 * i + 0];
+        dst2[i] = src[2 * i + 1];
+    }
+    
+}
+
+/* NV12 chroma reader: interleaved chroma bytes are stored U, V, so the
+ * even bytes land in dstU and the odd bytes in dstV. */
+static void nv12ToUV_c_vsx(uint8_t *dstU, uint8_t *dstV,
+                       const uint8_t *unused0, const uint8_t *src1, const uint8_t *src2,
+                       int width, uint32_t *unused)
+{
+    nvXXtoUV_c_vsx(dstU, dstV, src1, width);
+}
+
+/* NV21 chroma reader: same as NV12 but chroma is stored V, U, so the
+ * destination planes are swapped in the call below. */
+static void nv21ToUV_c_vsx(uint8_t *dstU, uint8_t *dstV,
+                       const uint8_t *unused0, const uint8_t *src1, const uint8_t *src2,
+                       int width, uint32_t *unused)
+{
+    nvXXtoUV_c_vsx(dstV, dstU, src1, width);
+}
+
+/* Read the luma plane of little-endian P010: each 16-bit sample keeps
+ * its 10 significant bits in the MSBs, so normalizing is a plain >> 6.
+ * 8 samples per VSX iteration; scalar tail handles width % 8 leftovers. */
+static void p010LEToY_c_vsx(uint8_t *dst, const uint8_t *src, 
+                           const uint8_t *unused1, const uint8_t *unused2, 
+                           int width, uint32_t *unused)
+{
+    
+    int i, width_adj;
+    vector unsigned short v_rd0, v_dst, shift;
+
+    uintptr_t src_addr = (uintptr_t)src;
+    uintptr_t dst_addr = (uintptr_t)dst;
+
+    /* Round width down to a multiple of 8 for the vector loop. */
+    width_adj = width & (~(int)0x07);
+
+    if(width_adj)
+        shift = vec_splats((unsigned short)6);
+    for ( i = 0; i < width_adj; i += 8) {
+        v_rd0 = vec_vsx_ld(0, (unsigned short *)src_addr);
+        
+        v_dst = vec_sr(v_rd0, shift);
+        vec_vsx_st((vector unsigned char)v_dst, 0, (unsigned char *)dst_addr);
+
+        src_addr += 16;
+        dst_addr += 16;
+    }
+
+
+    /* Scalar tail for the leftover samples. */
+    for (i = width_adj; i < width; i++) {
+        AV_WN16(dst + i * 2, AV_RL16(src + i * 2) >> 6);
+    }
+    
+}
+
+/* Read the luma plane of big-endian P010: byte-swap each 16-bit sample
+ * to native order, then >> 6 to drop the padding bits (10 significant
+ * bits live in the MSBs).  8 samples per VSX iteration; scalar tail
+ * handles width % 8 leftovers. */
+static void p010BEToY_c_vsx(uint8_t *dst, const uint8_t *src, 
+                            const uint8_t *unused1, const uint8_t *unused2, 
+                            int width, uint32_t *unused)
+{
+    
+
+    int i, width_adj;
+    vector unsigned short v_rd0, v_rd1, v_dst, shift;
+    vector unsigned char sample;
+
+    uintptr_t src_addr = (uintptr_t)src;
+    uintptr_t dst_addr = (uintptr_t)dst;
+
+    /* Round width down to a multiple of 8 for the vector loop. */
+    width_adj = width & (~(int)0x07);
+
+    if(width_adj){
+        /* Byte-swap mask: exchanges the two bytes of every 16-bit lane. */
+        sample = ((vector unsigned char){1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14});
+        shift = vec_splats((unsigned short)6);
+    }
+    for ( i = 0; i < width_adj; i += 8) {
+        v_rd0 = vec_vsx_ld(0, (unsigned short *)src_addr);
+        
+        v_rd1 = vec_perm(v_rd0, v_rd0, sample);
+        v_dst = vec_sr(v_rd1, shift);
+        vec_vsx_st((vector unsigned char)v_dst, 0, (unsigned char *)dst_addr);
+
+        src_addr += 16;
+        dst_addr += 16;
+    }
+
+    /* Scalar tail for the leftover samples. */
+    for (i = width_adj; i < width; i++) {
+        AV_WN16(dst + i * 2, AV_RB16(src + i * 2) >> 6);
+    }
+    
+}
+
+/* Read the interleaved chroma plane of little-endian P010 (U and V as
+ * alternating 16-bit words): split into planar U/V and >> 6 to drop
+ * the padding bits.  8 U/V pairs per VSX iteration; scalar tail
+ * handles width % 8 leftovers. */
+static void p010LEToUV_c_vsx(uint8_t *dstU, uint8_t *dstV,
+                       const uint8_t *unused0, const uint8_t *src1, 
+                       const uint8_t *src2, int width, uint32_t *unused)
+{
+    
+    int i, width_adj;
+
+    vector unsigned short v_rd0, v_rd1, v_dst;
+    vector unsigned char sample1, sample2;
+    vector unsigned short shift;
+
+    uintptr_t src_addr = (uintptr_t)src1;
+    uintptr_t dstU_addr = (uintptr_t)dstU;
+    uintptr_t dstV_addr = (uintptr_t)dstV;
+
+    /* Round width down to a multiple of 8 for the vector loop. */
+    width_adj = width & (~(int)0x07);
+
+    if(width_adj){
+        /* sample1 gathers the even (U) words, sample2 the odd (V) words. */
+        sample1 = ((vector unsigned char){0, 1, 4, 5, 8, 9, 12, 13, 16, 17, 20, 21, 24, 25, 28, 29});
+        sample2 = ((vector unsigned char){2, 3, 6, 7, 10, 11, 14, 15, 18, 19, 22, 23, 26, 27, 30, 31});
+        shift = vec_splats((unsigned short)6);
+    }
+    for ( i = 0; i < width_adj; i += 8) {
+        v_rd0 = vec_vsx_ld(0, (unsigned short *)src_addr);
+        v_rd1 = vec_vsx_ld(0, (unsigned short *)(src_addr + 16));
+
+        v_dst = vec_perm(v_rd0, v_rd1, sample1);
+        v_dst = vec_sr(v_dst, shift);
+        vec_vsx_st((vector unsigned char)v_dst, 0, (unsigned char *)dstU_addr);
+        v_dst = vec_perm(v_rd0, v_rd1, sample2);
+        v_dst = vec_sr(v_dst, shift);
+        vec_vsx_st((vector unsigned char)v_dst, 0, (unsigned char *)dstV_addr);
+
+        src_addr += 32;
+        dstU_addr += 16;
+        dstV_addr += 16;
+    }
+
+
+    /* Scalar tail for the leftover pixels. */
+    for (i = width_adj; i < width; i++) {
+        AV_WN16(dstU + i * 2, AV_RL16(src1 + i * 4 + 0) >> 6);
+        AV_WN16(dstV + i * 2, AV_RL16(src1 + i * 4 + 2) >> 6);
+    }
+    
+}
+
+/* Read the interleaved chroma plane of big-endian P010: split into
+ * planar U/V, byte-swapping each 16-bit word while permuting, then
+ * >> 6 to drop the padding bits.  8 U/V pairs per VSX iteration;
+ * scalar tail handles width % 8 leftovers. */
+static void p010BEToUV_c_vsx(uint8_t *dstU, uint8_t *dstV,
+                       const uint8_t *unused0, const uint8_t *src1, 
+                       const uint8_t *src2, int width, uint32_t *unused)
+{
+    
+    int i, width_adj;
+
+    vector unsigned short v_rd0, v_rd1, v_dst;
+    vector unsigned char sample1, sample2;
+    vector unsigned short shift;
+
+    uintptr_t src_addr = (uintptr_t)src1;
+    uintptr_t dstU_addr = (uintptr_t)dstU;
+    uintptr_t dstV_addr = (uintptr_t)dstV;
+
+    /* Round width down to a multiple of 8 for the vector loop. */
+    width_adj = width & (~(int)0x07);
+
+    if(width_adj){
+        /* Masks gather U (even) / V (odd) words with their two bytes
+         * exchanged, folding the endianness swap into the permute. */
+        sample1 = ((vector unsigned char){1, 0, 5, 4, 9, 8, 13, 12, 17, 16, 21, 20, 25, 24, 29, 28});
+        sample2 = ((vector unsigned char){3, 2, 7, 6, 11, 10, 15, 14, 19, 18, 23, 22, 27, 26, 31, 30});
+        shift = vec_splats((unsigned short)6);
+    }
+    for ( i = 0; i < width_adj; i += 8) {
+        v_rd0 = vec_vsx_ld(0, (unsigned short *)src_addr);
+        v_rd1 = vec_vsx_ld(0, (unsigned short *)(src_addr + 16));
+
+        v_dst = vec_perm(v_rd0, v_rd1, sample1);
+        v_dst = vec_sr(v_dst, shift);
+        vec_vsx_st((vector unsigned char)v_dst, 0, (unsigned char *)dstU_addr);
+        v_dst = vec_perm(v_rd0, v_rd1, sample2);
+        v_dst = vec_sr(v_dst, shift);
+        vec_vsx_st((vector unsigned char)v_dst, 0, (unsigned char *)dstV_addr);
+
+        src_addr += 32;
+        dstU_addr += 16;
+        dstV_addr += 16;
+    }
+
+
+    /* Scalar tail for the leftover pixels. */
+    for (i = width_adj; i < width; i++) {
+        AV_WN16(dstU + i * 2, AV_RB16(src1 + i * 4 + 0) >> 6);
+        AV_WN16(dstV + i * 2, AV_RB16(src1 + i * 4 + 2) >> 6);
+
+    }
+    
+}
+
+/* Read the interleaved chroma plane of little-endian P016 (full 16-bit
+ * samples, no padding): split into planar U/V with no shift.  8 U/V
+ * pairs per VSX iteration; scalar tail handles width % 8 leftovers. */
+static void p016LEToUV_c_vsx(uint8_t *dstU, uint8_t *dstV,
+                       const uint8_t *unused0, const uint8_t *src1, 
+                       const uint8_t *src2, int width, uint32_t *unused)
+{
+    
+    int i, width_adj;
+
+    vector unsigned short v_rd0, v_rd1, v_dst;
+    vector unsigned char sample1, sample2;
+
+    uintptr_t src_addr = (uintptr_t)src1;
+    uintptr_t dstU_addr = (uintptr_t)dstU;
+    uintptr_t dstV_addr = (uintptr_t)dstV;
+
+    /* Round width down to a multiple of 8 for the vector loop. */
+    width_adj = width & (~(int)0x07);
+
+    if(width_adj){
+        /* sample1 gathers the even (U) words, sample2 the odd (V) words. */
+        sample1 = ((vector unsigned char){0, 1, 4, 5, 8, 9, 12, 13, 16, 17, 20, 21, 24, 25, 28, 29});
+        sample2 = ((vector unsigned char){2, 3, 6, 7, 10, 11, 14, 15, 18, 19, 22, 23, 26, 27, 30, 31});
+    }
+    for ( i = 0; i < width_adj; i += 8) {
+        v_rd0 = vec_vsx_ld(0, (unsigned short *)src_addr);
+        v_rd1 = vec_vsx_ld(0, (unsigned short *)(src_addr + 16));
+
+        v_dst = vec_perm(v_rd0, v_rd1, sample1);
+        vec_vsx_st((vector unsigned char)v_dst, 0, (unsigned char *)dstU_addr);
+        v_dst = vec_perm(v_rd0, v_rd1, sample2);
+        vec_vsx_st((vector unsigned char)v_dst, 0, (unsigned char *)dstV_addr);
+
+        src_addr += 32;
+        dstU_addr += 16;
+        dstV_addr += 16;
+    }
+
+
+    /* Scalar tail for the leftover pixels. */
+    for (i = width_adj; i < width; i++) {
+        AV_WN16(dstU + i * 2, AV_RL16(src1 + i * 4 + 0));
+        AV_WN16(dstV + i * 2, AV_RL16(src1 + i * 4 + 2));
+    }
+    
+}
+
+/* Read the interleaved chroma plane of big-endian P016 (full 16-bit
+ * samples): split into planar U/V, byte-swapping each word as part of
+ * the permute.  8 U/V pairs per VSX iteration; scalar tail handles
+ * width % 8 leftovers. */
+static void p016BEToUV_c_vsx(uint8_t *dstU, uint8_t *dstV, 
+                       const uint8_t *unused0, const uint8_t *src1, 
+                       const uint8_t *src2, int width, uint32_t *unused)
+{
+    
+    int i, width_adj;
+
+    vector unsigned short v_rd0, v_rd1, v_dst;
+    vector unsigned char sample1, sample2;
+
+    uintptr_t src_addr = (uintptr_t)src1;
+    uintptr_t dstU_addr = (uintptr_t)dstU;
+    uintptr_t dstV_addr = (uintptr_t)dstV;
+
+    /* Round width down to a multiple of 8 for the vector loop. */
+    width_adj = width & (~(int)0x07);
+
+    if(width_adj){
+        /* Masks gather U (even) / V (odd) words with their two bytes
+         * exchanged, folding the endianness swap into the permute. */
+        sample1 = ((vector unsigned char){1, 0, 5, 4, 9, 8, 13, 12, 17, 16, 21, 20, 25, 24, 29, 28});
+        sample2 = ((vector unsigned char){3, 2, 7, 6, 11, 10, 15, 14, 19, 18, 23, 22, 27, 26, 31, 30});
+    }
+    for ( i = 0; i < width_adj; i += 8) {
+        v_rd0 = vec_vsx_ld(0, (unsigned short *)src_addr);
+        v_rd1 = vec_vsx_ld(0, (unsigned short *)(src_addr + 16));
+
+        v_dst = vec_perm(v_rd0, v_rd1, sample1);
+        vec_vsx_st((vector unsigned char)v_dst, 0, (unsigned char *)dstU_addr);
+        v_dst = vec_perm(v_rd0, v_rd1, sample2);
+        vec_vsx_st((vector unsigned char)v_dst, 0, (unsigned char *)dstV_addr);
+
+        src_addr += 32;
+        dstU_addr += 16;
+        dstV_addr += 16;
+    }
+
+
+    /* Scalar tail for the leftover pixels. */
+    for (i = width_adj; i < width; i++) {
+        AV_WN16(dstU + i * 2, AV_RB16(src1 + i * 4 + 0));
+        AV_WN16(dstV + i * 2, AV_RB16(src1 + i * 4 + 2));
+    }
+    
+}
+
+#define input_pixel(pos) (isBE(origin) ? AV_RB16(pos) : AV_RL16(pos))
+
+/* Convert packed BGR24 (B, G, R bytes per pixel) to 16-bit luma using
+ * the rgb2yuv coefficient table, 8 pixels per VSX iteration.  The
+ * scalar tail shows the exact formula:
+ *   Y = (ry*r + gy*g + by*b + (0x801<<(RGB2YUV_SHIFT-7))) >> (RGB2YUV_SHIFT-6) */
+static void bgr24ToY_c_vsx(uint8_t *_dst, const uint8_t *src, 
+                           const uint8_t *unused1, const uint8_t *unused2, 
+                           int width, uint32_t *rgb2yuv)
+{
+    
+    int i, width_adj;
+    vector unsigned short v_rd0, v_rd1, v_rd2, v_g, v_b, v_r;  
+    vector unsigned int v_dst1, v_dst2;
+    /* shift1 is the rounding bias, shift2 the final right-shift count. */
+    vector unsigned int shift1, shift2;
+    int16_t *dst = (int16_t *)_dst;
+    vector signed int v_ry, v_gy, v_by;
+    int32_t ry = rgb2yuv[RY_IDX], gy = rgb2yuv[GY_IDX], by = rgb2yuv[BY_IDX];
+    vector unsigned short v_null = vec_splats((unsigned short)0x0000);
+    vector unsigned short v_FF = vec_splats((unsigned short)0x00FF);
+
+    uintptr_t src_addr = (uintptr_t)src;
+    uintptr_t dst_addr = (uintptr_t)_dst;
+
+
+    /* Round width down to a multiple of 8 for the vector loop. */
+    width_adj = width&(~(int)0x07);
+
+    if(width_adj){
+        shift1 = vec_splats((unsigned int)(0x801<<(RGB2YUV_SHIFT-7)));
+        shift2 = vec_splats((unsigned int)(RGB2YUV_SHIFT-6));
+        v_ry = vec_splats((signed int)ry);
+        v_gy = vec_splats((signed int)gy);
+        v_by = vec_splats((signed int)by);
+    }
+
+    for (i = 0; i < width_adj; i+=8) {
+        /* 8 pixels = 24 input bytes; the second load overlaps past them. */
+        v_rd0 = vec_vsx_ld(0, (unsigned short *)src_addr);
+        v_rd1 = vec_vsx_ld(0, (unsigned short *)(src_addr+16));
+
+        /* De-interleave B, G and R into the low byte of 16-bit lanes. */
+        v_b = vec_perm(v_rd0, v_rd1, 
+              ((vector unsigned char){0, 0, 3, 0, 6, 0, 9, 0, 12, 0, 15, 0, 18, 0, 21, 0}));
+        v_g = vec_perm(v_rd0, v_rd1, 
+              ((vector unsigned char){1, 0, 4, 0, 7, 0, 10, 0, 13, 0, 16, 0, 19, 0, 22, 0}));
+        v_r = vec_perm(v_rd0, v_rd1, 
+              ((vector unsigned char){2, 0, 5, 0, 8, 0, 11, 0, 14, 0, 17, 0, 20, 0, 23, 0}));
+
+        /* Mask off the junk byte the permute placed in the high half. */
+        v_b = vec_and(v_b, v_FF);
+        v_g = vec_and(v_g, v_FF);
+        v_r = vec_and(v_r, v_FF);
+
+        /* Widen to 32 bits: low four pixels ... */
+        v_rd0 = vec_mergeh(v_b, v_null);
+        v_rd1 = vec_mergeh(v_g, v_null);
+        v_rd2 = vec_mergeh(v_r, v_null);
+
+        /* ... and high four pixels. */
+        v_g = vec_mergel(v_g, v_null);
+        v_b = vec_mergel(v_b, v_null);
+        v_r = vec_mergel(v_r, v_null);
+
+        /* Weighted sum, bias and shift for the first four pixels. */
+        v_dst1 = (vector unsigned int)vec_mul((vector signed int)v_rd2, v_ry);
+        v_dst1 = (vector unsigned int)vec_add((vector signed int)v_dst1, 
+                                              vec_mul((vector signed int)v_rd1, v_gy ));
+        v_dst1 = (vector unsigned int)vec_add((vector signed int)v_dst1, 
+                                              vec_mul((vector signed int)v_rd0, v_by ));
+        v_dst1 = vec_add(v_dst1, shift1);
+        v_dst1 = vec_sr(v_dst1, shift2);
+        /* Same for the remaining four pixels. */
+        v_dst2 = (vector unsigned int)vec_mul((vector signed int)v_r, v_ry);
+        v_dst2 = (vector unsigned int)vec_add((vector signed int)v_dst2, 
+                                              vec_mul((vector signed int)v_g, v_gy ));
+        v_dst2 = (vector unsigned int)vec_add((vector signed int)v_dst2, 
+                                              vec_mul((vector signed int)v_b,  v_by ));
+        v_dst2 = vec_add(v_dst2, shift1);
+        v_dst2 = vec_sr(v_dst2, shift2);
+        /* Narrow the eight 32-bit results to 16-bit and store. */
+        v_dst1 = vec_perm(v_dst1, v_dst2, 
+                 ((vector unsigned char){0, 1, 4, 5, 8, 9, 12, 13, 16, 17, 20, 21, 24, 25, 28, 29}));
+        vec_vsx_st((vector unsigned char)v_dst1, 0, (unsigned char *)dst_addr); 
+
+        src_addr += 24;
+        dst_addr += 16;
+    }
+    /* Scalar tail for the leftover pixels. */
+    for (i = width_adj; i < width; i++) {
+        unsigned int b   = src[3*i];
+        unsigned int g   = src[3*i + 1];
+        unsigned int r   = src[3*i + 2];
+
+        dst[i] = ((ry*r + gy*g + by*b + (0x801<<(RGB2YUV_SHIFT-7)))>>(RGB2YUV_SHIFT-6));
+    }
+    
+}
+
+
+/* Convert packed BGR24 to 16-bit U and V planes using the rgb2yuv
+ * coefficient table, 8 pixels per VSX iteration.  The scalar tail
+ * shows the exact formula:
+ *   U = (ru*r + gu*g + bu*b + (0x4001<<(RGB2YUV_SHIFT-7))) >> (RGB2YUV_SHIFT-6)
+ * and likewise for V with the rv/gv/bv coefficients. */
+static void bgr24ToUV_c_vsx(uint8_t *_dstU, uint8_t *_dstV, 
+                            const uint8_t *unused0, const uint8_t *src1, 
+                            const uint8_t *src2, int width, uint32_t *rgb2yuv)
+{
+    
+    int i, width_adj;
+    vector unsigned short v_rd0, v_rd1, v_rd2, v_g, v_b, v_r;  
+    vector unsigned int v_dst1, v_dst2;
+    /* shift1 is the rounding bias, shift2 the final right-shift count. */
+    vector unsigned int shift1, shift2;
+    int16_t *dstU = (int16_t *)_dstU;
+    int16_t *dstV = (int16_t *)_dstV;
+    vector signed int v_ru, v_gu, v_bu, v_rv, v_gv, v_bv;
+    int32_t ru = rgb2yuv[RU_IDX], gu = rgb2yuv[GU_IDX], bu = rgb2yuv[BU_IDX];
+    int32_t rv = rgb2yuv[RV_IDX], gv = rgb2yuv[GV_IDX], bv = rgb2yuv[BV_IDX];
+    vector unsigned short v_null = vec_splats((unsigned short)0x0000);
+    vector unsigned short v_FF = vec_splats((unsigned short)0x00FF);
+
+    uintptr_t src_addr = (uintptr_t)src1;
+    uintptr_t dstU_addr = (uintptr_t)_dstU;
+    uintptr_t dstV_addr = (uintptr_t)_dstV;
+
+
+    /* Round width down to a multiple of 8 for the vector loop. */
+    width_adj = width&(~(int)0x07);
+
+    if(width_adj){
+        shift1 = vec_splats((unsigned int)(0x4001<<(RGB2YUV_SHIFT-7)));
+        shift2 = vec_splats((unsigned int)(RGB2YUV_SHIFT-6));
+        v_ru = vec_splats((signed int)ru);
+        v_gu = vec_splats((signed int)gu);
+        v_bu = vec_splats((signed int)bu);
+        v_rv = vec_splats((signed int)rv);
+        v_gv = vec_splats((signed int)gv);
+        v_bv = vec_splats((signed int)bv);
+    }
+
+    for (i = 0; i < width_adj; i+=8) {
+        /* 8 pixels = 24 input bytes; the second load overlaps past them. */
+        v_rd0 = vec_vsx_ld(0, (unsigned short *)src_addr);
+        v_rd1 = vec_vsx_ld(0, (unsigned short *)(src_addr+16));
+
+        /* De-interleave B, G and R into the low byte of 16-bit lanes. */
+        v_b = vec_perm(v_rd0, v_rd1, 
+              ((vector unsigned char){0, 0, 3, 0, 6, 0, 9, 0, 12, 0, 15, 0, 18, 0, 21, 0}));
+        v_g = vec_perm(v_rd0, v_rd1, 
+              ((vector unsigned char){1, 0, 4, 0, 7, 0, 10, 0, 13, 0, 16, 0, 19, 0, 22, 0}));
+        v_r = vec_perm(v_rd0, v_rd1, 
+              ((vector unsigned char){2, 0, 5, 0, 8, 0, 11, 0, 14, 0, 17, 0, 20, 0, 23, 0}));
+
+        /* Mask off the junk byte the permute placed in the high half. */
+        v_b = vec_and(v_b, v_FF);
+        v_g = vec_and(v_g, v_FF);
+        v_r = vec_and(v_r, v_FF);
+
+        /* Widen to 32 bits: low four pixels (rd0=B, rd1=G, rd2=R) ... */
+        v_rd0 = vec_mergeh(v_b, v_null);
+        v_rd1 = vec_mergeh(v_g, v_null);
+        v_rd2 = vec_mergeh(v_r, v_null);
+
+        /* ... and high four pixels. */
+        v_g = vec_mergel(v_g, v_null);
+        v_b = vec_mergel(v_b, v_null);
+        v_r = vec_mergel(v_r, v_null);
+
+        /* U: weighted sum, bias and shift for both pixel groups. */
+        v_dst1 = (vector unsigned int)vec_mul((vector signed int)v_rd2, v_ru);
+        v_dst1 = (vector unsigned int)vec_add((vector signed int)v_dst1, 
+                                      vec_mul((vector signed int)v_rd1, v_gu ));
+        v_dst1 = (vector unsigned int)vec_add((vector signed int)v_dst1, 
+                                      vec_mul((vector signed int)v_rd0, v_bu ));
+        v_dst1 = vec_add(v_dst1, shift1);
+        v_dst1 = vec_sr(v_dst1, shift2);
+        v_dst2 = (vector unsigned int)vec_mul((vector signed int)v_r, v_ru);
+        v_dst2 = (vector unsigned int)vec_add((vector signed int)v_dst2, 
+                                      vec_mul((vector signed int)v_g, v_gu ));
+        v_dst2 = (vector unsigned int)vec_add((vector signed int)v_dst2, 
+                                      vec_mul((vector signed int)v_b,  v_bu ));
+        v_dst2 = vec_add(v_dst2, shift1);
+        v_dst2 = vec_sr(v_dst2, shift2);
+        /* Narrow the eight 32-bit U results to 16-bit and store. */
+        v_dst1 = vec_perm(v_dst1, v_dst2, 
+                 ((vector unsigned char){0, 1, 4, 5, 8, 9, 12, 13, 16, 17, 20, 21, 24, 25, 28, 29}));
+        vec_vsx_st((vector unsigned char)v_dst1, 0, (unsigned char *)dstU_addr); 
+
+        /* V: same computation with the rv/gv/bv coefficients. */
+        v_dst1 = (vector unsigned int)vec_mul((vector signed int)v_rd2, v_rv);
+        v_dst1 = (vector unsigned int)vec_add((vector signed int)v_dst1, 
+                                      vec_mul((vector signed int)v_rd1, v_gv ));
+        v_dst1 = (vector unsigned int)vec_add((vector signed int)v_dst1, 
+                                      vec_mul((vector signed int)v_rd0, v_bv ));
+        v_dst1 = vec_add(v_dst1, shift1);
+        v_dst1 = vec_sr(v_dst1, shift2);
+        v_dst2 = (vector unsigned int)vec_mul((vector signed int)v_r, v_rv);
+        v_dst2 = (vector unsigned int)vec_add((vector signed int)v_dst2, 
+                                      vec_mul((vector signed int)v_g, v_gv ));
+        v_dst2 = (vector unsigned int)vec_add((vector signed int)v_dst2, 
+                                      vec_mul((vector signed int)v_b,  v_bv ));
+        v_dst2 = vec_add(v_dst2, shift1);
+        v_dst2 = vec_sr(v_dst2, shift2);
+        v_dst1 = vec_perm(v_dst1, v_dst2, 
+                 ((vector unsigned char){0, 1, 4, 5, 8, 9, 12, 13, 16, 17, 20, 21, 24, 25, 28, 29}));
+        vec_vsx_st((vector unsigned char)v_dst1, 0, (unsigned char *)dstV_addr); 
+
+        src_addr += 24;
+        dstU_addr += 16;
+        dstV_addr += 16;
+    }
+    /* Scalar tail for the leftover pixels. */
+    for (i = width_adj; i < width; i++) {
+        int b = src1[3 * i + 0];
+        int g = src1[3 * i + 1];
+        int r = src1[3 * i + 2];
+
+        dstU[i] = (ru*r + gu*g + bu*b + (0x4001<<(RGB2YUV_SHIFT-7)))>>(RGB2YUV_SHIFT-6);
+        dstV[i] = (rv*r + gv*g + bv*b + (0x4001<<(RGB2YUV_SHIFT-7)))>>(RGB2YUV_SHIFT-6);
+    }
+      
+    /* Both source pointers must reference the same packed buffer. */
+    av_assert1(src1 == src2);
+}
+
+/* Horizontally-downsampled chroma from packed BGR24: each output U/V
+ * uses the SUM of two adjacent pixels, per the scalar tail:
+ *   U = (ru*(r0+r1) + gu*(g0+g1) + bu*(b0+b1)
+ *        + (0x8002<<(RGB2YUV_SHIFT-7))) >> (RGB2YUV_SHIFT-5)
+ * 8 output samples (16 input pixels, 48 bytes) per VSX iteration. */
+static void bgr24ToUV_half_c_vsx(uint8_t *_dstU, uint8_t *_dstV, const uint8_t *unused0, const uint8_t *src1,
+                             const uint8_t *src2, int width, uint32_t *rgb2yuv)
+{
+    
+    
+    int i, width_adj;
+    vector unsigned short v_rd0, v_rd1, v_rd2, v_g, v_b, v_r;  
+    vector unsigned int v_dst1, v_dst2;
+    /* shift1 is the rounding bias, shift2 the final right-shift count. */
+    vector unsigned int shift1, shift2;
+    int16_t *dstU = (int16_t *)_dstU;
+    int16_t *dstV = (int16_t *)_dstV;
+    vector signed int v_ru, v_gu, v_bu, v_rv, v_gv, v_bv;
+    int32_t ru = rgb2yuv[RU_IDX], gu = rgb2yuv[GU_IDX], bu = rgb2yuv[BU_IDX];
+    int32_t rv = rgb2yuv[RV_IDX], gv = rgb2yuv[GV_IDX], bv = rgb2yuv[BV_IDX];
+    vector unsigned short v_null = vec_splats((unsigned short)0x0000);
+    vector unsigned short v_FF = vec_splats((unsigned short)0x00FF);
+
+    uintptr_t src_addr = (uintptr_t)src1;
+    uintptr_t dstU_addr = (uintptr_t)_dstU;
+    uintptr_t dstV_addr = (uintptr_t)_dstV;
+
+
+    /* Round width down to a multiple of 8 for the vector loop. */
+    width_adj = width&(~(int)0x07);
+
+    if(width_adj){
+        shift1 = vec_splats((unsigned int)(0x8002<<(RGB2YUV_SHIFT-7)));
+        shift2 = vec_splats((unsigned int)(RGB2YUV_SHIFT-5));
+        v_ru = vec_splats((signed int)ru);
+        v_gu = vec_splats((signed int)gu);
+        v_bu = vec_splats((signed int)bu);
+        v_rv = vec_splats((signed int)rv);
+        v_gv = vec_splats((signed int)gv);
+        v_bv = vec_splats((signed int)bv);
+        
+    }
+
+    for (i = 0; i < width_adj; i+=8) {
+        /* 16 input pixels = 48 bytes, loaded as three 16-byte vectors. */
+        v_rd0 = vec_vsx_ld(0, (unsigned short *)src_addr);
+        v_rd1 = vec_vsx_ld(0, (unsigned short *)(src_addr+16));
+        v_rd2 = vec_vsx_ld(0, (unsigned short *)(src_addr+32));
+
+        /* First permute pass: gather the B/G/R bytes of the first 32
+         * input bytes (masks shorter than 16 leave trailing lanes as
+         * don't-care, filled by the second pass). */
+        v_b = vec_perm(v_rd0, v_rd1, 
+              ((vector unsigned char){0, 3, 6, 9,  12, 15, 18, 21, 24, 27, 30}));
+        v_g = vec_perm(v_rd0, v_rd1, 
+              ((vector unsigned char){1, 4, 7, 10, 13, 16, 19, 22, 25, 28, 31}));
+        v_r = vec_perm(v_rd0, v_rd1, 
+              ((vector unsigned char){2, 5, 8, 11, 14, 17, 20, 23, 26, 29}));
+
+        /* Second pass: append the remaining samples from the third
+         * load, giving 16 consecutive B, G and R bytes per vector. */
+        v_b = vec_perm(v_b, v_rd2, 
+              ((vector unsigned char){0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 17, 20, 23, 26, 29}));
+        v_g = vec_perm(v_g, v_rd2, 
+              ((vector unsigned char){0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 18, 21, 24, 27, 30}));
+        v_r = vec_perm(v_r, v_rd2, 
+              ((vector unsigned char){0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 16,  19, 22, 25, 28, 31}));
+
+        /* Each 16-bit lane now holds two adjacent samples; add the low
+         * and high byte to form the pair sums (e.g. b0+b1). */
+        v_r = vec_add(vec_and(v_r, v_FF), vec_sr(v_r, vec_splats((unsigned short)8)));
+        v_g = vec_add(vec_and(v_g, v_FF), vec_sr(v_g, vec_splats((unsigned short)8)));
+        v_b = vec_add(vec_and(v_b, v_FF), vec_sr(v_b, vec_splats((unsigned short)8)));
+
+        /* Widen to 32 bits: low four sums (rd0=R, rd1=G, rd2=B) ... */
+        v_rd0 = vec_mergeh(v_r, v_null);
+        v_rd1 = vec_mergeh(v_g, v_null);
+        v_rd2 = vec_mergeh(v_b, v_null);
+
+        /* ... and high four sums. */
+        v_r = vec_mergel(v_r, v_null);
+        v_g = vec_mergel(v_g, v_null);
+        v_b = vec_mergel(v_b, v_null);
+
+        /* U: weighted sum, bias and shift for both groups. */
+        v_dst1 = (vector unsigned int)vec_mul((vector signed int)v_rd0, v_ru);
+        v_dst1 = (vector unsigned int)vec_add((vector signed int)v_dst1, 
+                                      vec_mul((vector signed int)v_rd1, v_gu ));
+        v_dst1 = (vector unsigned int)vec_add((vector signed int)v_dst1, 
+                                      vec_mul((vector signed int)v_rd2, v_bu ));
+        v_dst1 = vec_add(v_dst1, shift1);
+        v_dst1 = vec_sr(v_dst1, shift2);
+        v_dst2 = (vector unsigned int)vec_mul((vector signed int)v_r, v_ru);
+        v_dst2 = (vector unsigned int)vec_add((vector signed int)v_dst2, 
+                                      vec_mul((vector signed int)v_g, v_gu ));
+        v_dst2 = (vector unsigned int)vec_add((vector signed int)v_dst2, 
+                                      vec_mul((vector signed int)v_b,  v_bu ));
+        v_dst2 = vec_add(v_dst2, shift1);
+        v_dst2 = vec_sr(v_dst2, shift2);
+        /* Narrow the eight 32-bit U results to 16-bit and store. */
+        v_dst1 = vec_perm(v_dst1, v_dst2, 
+                 ((vector unsigned char){0, 1, 4, 5, 8, 9, 12, 13, 16, 17, 20, 21, 24, 25, 28, 29}));
+        vec_vsx_st((vector unsigned char)v_dst1, 0, (unsigned char *)dstU_addr); 
+
+        /* V: same computation with the rv/gv/bv coefficients. */
+        v_dst1 = (vector unsigned int)vec_mul((vector signed int)v_rd0, v_rv);
+        v_dst1 = (vector unsigned int)vec_add((vector signed int)v_dst1, 
+                                      vec_mul((vector signed int)v_rd1, v_gv ));
+        v_dst1 = (vector unsigned int)vec_add((vector signed int)v_dst1, 
+                                      vec_mul((vector signed int)v_rd2, v_bv ));
+        v_dst1 = vec_add(v_dst1, shift1);
+        v_dst1 = vec_sr(v_dst1, shift2);
+        v_dst2 = (vector unsigned int)vec_mul((vector signed int)v_r, v_rv);
+        v_dst2 = (vector unsigned int)vec_add((vector signed int)v_dst2, 
+                                      vec_mul((vector signed int)v_g, v_gv ));
+        v_dst2 = (vector unsigned int)vec_add((vector signed int)v_dst2, 
+                                      vec_mul((vector signed int)v_b,  v_bv ));
+        v_dst2 = vec_add(v_dst2, shift1);
+        v_dst2 = vec_sr(v_dst2, shift2);
+        v_dst1 = vec_perm(v_dst1, v_dst2, 
+                 ((vector unsigned char){0, 1, 4, 5, 8, 9, 12, 13, 16, 17, 20, 21, 24, 25, 28, 29}));
+        vec_vsx_st((vector unsigned char)v_dst1, 0, (unsigned char *)dstV_addr); 
+
+        src_addr += 48;
+        dstU_addr += 16;
+        dstV_addr += 16;
+    }
+
+    /* Scalar tail for the leftover output samples. */
+    for (i = width_adj; i < width; i++) {
+        int b = src1[6 * i + 0] + src1[6 * i + 3];
+        int g = src1[6 * i + 1] + src1[6 * i + 4];
+        int r = src1[6 * i + 2] + src1[6 * i + 5];
+
+        dstU[i] = (ru*r + gu*g + bu*b + (0x8002<<(RGB2YUV_SHIFT-7)))>>(RGB2YUV_SHIFT-5);
+        dstV[i] = (rv*r + gv*g + bv*b + (0x8002<<(RGB2YUV_SHIFT-7)))>>(RGB2YUV_SHIFT-5);
+    }
+    
+    /* Both source pointers must reference the same packed buffer. */
+    av_assert1(src1 == src2);
+}
+
+/* VSX-accelerated rgb24ToY_c: packed RGB24 -> 15-bit luma samples.
+ * Processes 8 pixels per vector iteration, scalar tail for the rest.
+ * _dst actually holds int16_t samples; rgb2yuv supplies the RY/GY/BY
+ * coefficients.  unused1/unused2 keep the common input-function signature. */
+static void rgb24ToY_c_vsx(uint8_t *_dst, const uint8_t *src, 
+                           const uint8_t *unused1, const uint8_t *unused2, 
+                           int width, uint32_t *rgb2yuv)
+{
+    
+    int i, width_adj;
+    vector unsigned short v_rd0, v_rd1, v_rd2, v_g, v_b, v_r;  
+    vector unsigned int v_dst1, v_dst2;
+    /* NOTE(review): despite the names, shift1 is the rounding bias and
+     * shift2 the right-shift count */
+    vector unsigned int shift1, shift2;
+    int16_t *dst = (int16_t *)_dst;
+    vector signed int v_ry, v_gy, v_by;
+    int32_t ry = rgb2yuv[RY_IDX], gy = rgb2yuv[GY_IDX], by = rgb2yuv[BY_IDX];
+    vector unsigned short v_null = vec_splats((unsigned short)0x0000);
+    vector unsigned short v_FF = vec_splats((unsigned short)0x00FF);
+
+    uintptr_t src_addr = (uintptr_t)src;
+    uintptr_t dst_addr = (uintptr_t)_dst;
+
+
+    /* number of pixels handled by the vector loop (multiple of 8) */
+    width_adj = width&(~(int)0x07);
+
+    if(width_adj){
+        shift1 = vec_splats((unsigned int)(0x801<<(RGB2YUV_SHIFT-7)));
+        shift2 = vec_splats((unsigned int)(RGB2YUV_SHIFT-6));
+        v_ry = vec_splats((signed int)ry);
+        v_gy = vec_splats((signed int)gy);
+        v_by = vec_splats((signed int)by);
+    }
+
+    for (i = 0; i < width_adj; i+=8) {
+        /* load 32 bytes; only the first 24 (8 RGB triplets) are used.
+         * NOTE(review): reads up to 8 bytes past the last pixel of the
+         * line -- confirm callers provide padded input buffers. */
+        v_rd0 = vec_vsx_ld(0, (unsigned short *)src_addr);
+        v_rd1 = vec_vsx_ld(0, (unsigned short *)(src_addr+16));
+
+        /* de-interleave: gather the R, G and B bytes of 8 pixels into
+         * the even byte of each 16-bit lane */
+        v_r = vec_perm(v_rd0, v_rd1, 
+              ((vector unsigned char){0, 0, 3, 0, 6, 0, 9, 0, 12, 0, 15, 0, 18, 0, 21, 0}));
+        v_g = vec_perm(v_rd0, v_rd1, 
+              ((vector unsigned char){1, 0, 4, 0, 7, 0, 10, 0, 13, 0, 16, 0, 19, 0, 22, 0}));
+        v_b = vec_perm(v_rd0, v_rd1, 
+              ((vector unsigned char){2, 0, 5, 0, 8, 0, 11, 0, 14, 0, 17, 0, 20, 0, 23, 0}));
+
+        /* keep only the low byte of each lane */
+        v_b = vec_and(v_b, v_FF);
+        v_g = vec_and(v_g, v_FF);
+        v_r = vec_and(v_r, v_FF);
+
+        /* widen to 32 bits: low half in v_rd*, high half in v_b/v_g/v_r */
+        v_rd0 = vec_mergeh(v_b, v_null);
+        v_rd1 = vec_mergeh(v_g, v_null);
+        v_rd2 = vec_mergeh(v_r, v_null);
+
+        v_g = vec_mergel(v_g, v_null);
+        v_b = vec_mergel(v_b, v_null);
+        v_r = vec_mergel(v_r, v_null);
+
+        /* Y = (ry*r + gy*g + by*b + bias) >> (RGB2YUV_SHIFT-6) */
+        v_dst1 = (vector unsigned int)vec_mul((vector signed int)v_rd2, v_ry);
+        v_dst1 = (vector unsigned int)vec_add((vector signed int)v_dst1, 
+                                      vec_mul((vector signed int)v_rd1, v_gy ));
+        v_dst1 = (vector unsigned int)vec_add((vector signed int)v_dst1, 
+                                      vec_mul((vector signed int)v_rd0, v_by ));
+        v_dst1 = vec_add(v_dst1, shift1);
+        v_dst1 = vec_sr(v_dst1, shift2);
+        v_dst2 = (vector unsigned int)vec_mul((vector signed int)v_r, v_ry);
+        v_dst2 = (vector unsigned int)vec_add((vector signed int)v_dst2, 
+                                      vec_mul((vector signed int)v_g, v_gy ));
+        v_dst2 = (vector unsigned int)vec_add((vector signed int)v_dst2, 
+                                      vec_mul((vector signed int)v_b,  v_by ));
+        v_dst2 = vec_add(v_dst2, shift1);
+        v_dst2 = vec_sr(v_dst2, shift2);
+        /* narrow the two 32-bit halves back to 8 x int16 and store */
+        v_dst1 = vec_perm(v_dst1, v_dst2, 
+                 ((vector unsigned char){0, 1, 4, 5, 8, 9, 12, 13, 16, 17, 20, 21, 24, 25, 28, 29}));
+        vec_vsx_st((vector unsigned char)v_dst1, 0, (unsigned char *)dst_addr); 
+
+        src_addr += 24;
+        dst_addr += 16;
+    }
+    /* scalar tail: must match the vector rounding exactly */
+    for (i = width_adj; i < width; i++) {
+        unsigned int r   = src[3*i];
+        unsigned int g   = src[3*i + 1];
+        unsigned int b   = src[3*i + 2];
+
+        dst[i] = ((ry*r + gy*g + by*b + (0x801<<(RGB2YUV_SHIFT-7)))>>(RGB2YUV_SHIFT-6));
+    }
+    
+}
+
+/* VSX-accelerated rgb24ToUV_c: packed RGB24 -> 15-bit chroma (U and V).
+ * Processes 8 pixels per vector iteration, scalar tail for the rest.
+ * _dstU/_dstV actually hold int16_t samples; rgb2yuv supplies the
+ * RU..BV coefficients.  Requires src1 == src2 (asserted below). */
+static void rgb24ToUV_c_vsx(uint8_t *_dstU, uint8_t *_dstV, 
+                            const uint8_t *unused0, const uint8_t *src1, 
+                            const uint8_t *src2, int width, uint32_t *rgb2yuv)
+{
+    int i, width_adj;
+    vector unsigned short v_rd0, v_rd1, v_rd2, v_g, v_b, v_r;
+    vector unsigned int v_dst1, v_dst2;
+    /* shift1 is the rounding bias, shift2 the right-shift count */
+    vector unsigned int shift1, shift2;
+    int16_t *dstU = (int16_t *)_dstU;
+    int16_t *dstV = (int16_t *)_dstV;
+    vector signed int v_ru, v_gu, v_bu, v_rv, v_gv, v_bv;
+    int32_t ru = rgb2yuv[RU_IDX], gu = rgb2yuv[GU_IDX], bu = rgb2yuv[BU_IDX];
+    int32_t rv = rgb2yuv[RV_IDX], gv = rgb2yuv[GV_IDX], bv = rgb2yuv[BV_IDX];
+    vector unsigned short v_null = vec_splats((unsigned short)0x0000);
+    vector unsigned short v_FF = vec_splats((unsigned short)0x00FF);
+
+    uintptr_t src_addr = (uintptr_t)src1;
+    uintptr_t dstU_addr = (uintptr_t)_dstU;
+    uintptr_t dstV_addr = (uintptr_t)_dstV;
+
+    /* placed after the declarations: a statement before declarations
+     * breaks C90 and -Werror=declaration-after-statement builds */
+    av_assert1(src1 == src2);
+
+    /* number of pixels handled by the vector loop (multiple of 8) */
+    width_adj = width&(~(int)0x07);
+
+    if(width_adj){
+        shift1 = vec_splats((unsigned int)(0x4001<<(RGB2YUV_SHIFT-7)));
+        shift2 = vec_splats((unsigned int)(RGB2YUV_SHIFT-6));
+        v_ru = vec_splats((signed int)ru);
+        v_gu = vec_splats((signed int)gu);
+        v_bu = vec_splats((signed int)bu);
+        v_rv = vec_splats((signed int)rv);
+        v_gv = vec_splats((signed int)gv);
+        v_bv = vec_splats((signed int)bv);
+    }
+
+    for (i = 0; i < width_adj; i+=8) {
+        /* load 32 bytes; only the first 24 (8 RGB triplets) are used.
+         * NOTE(review): reads up to 8 bytes past the last pixel -- confirm
+         * callers provide padded input buffers. */
+        v_rd0 = vec_vsx_ld(0, (unsigned short *)src_addr);
+        v_rd1 = vec_vsx_ld(0, (unsigned short *)(src_addr+16));
+
+        /* de-interleave R/G/B bytes of 8 pixels into 16-bit lanes */
+        v_r = vec_perm(v_rd0, v_rd1, 
+              ((vector unsigned char){0, 0, 3, 0, 6, 0, 9, 0, 12, 0, 15, 0, 18, 0, 21, 0}));
+        v_g = vec_perm(v_rd0, v_rd1, 
+              ((vector unsigned char){1, 0, 4, 0, 7, 0, 10, 0, 13, 0, 16, 0, 19, 0, 22, 0}));
+        v_b = vec_perm(v_rd0, v_rd1, 
+              ((vector unsigned char){2, 0, 5, 0, 8, 0, 11, 0, 14, 0, 17, 0, 20, 0, 23, 0}));
+
+        v_r = vec_and(v_r, v_FF);
+        v_g = vec_and(v_g, v_FF);
+        v_b = vec_and(v_b, v_FF);
+
+        /* widen to 32 bits: low half in v_rd*, high half in v_r/v_g/v_b */
+        v_rd0 = vec_mergeh(v_r, v_null);
+        v_rd1 = vec_mergeh(v_g, v_null);
+        v_rd2 = vec_mergeh(v_b, v_null);
+
+        v_r = vec_mergel(v_r, v_null);
+        v_g = vec_mergel(v_g, v_null);
+        v_b = vec_mergel(v_b, v_null);
+
+        /* U = (ru*r + gu*g + bu*b + bias) >> (RGB2YUV_SHIFT-6) */
+        v_dst1 = (vector unsigned int)vec_mul((vector signed int)v_rd0, v_ru);
+        v_dst1 = (vector unsigned int)vec_add((vector signed int)v_dst1, 
+                                      vec_mul((vector signed int)v_rd1, v_gu ));
+        v_dst1 = (vector unsigned int)vec_add((vector signed int)v_dst1, 
+                                      vec_mul((vector signed int)v_rd2, v_bu ));
+        v_dst1 = vec_add(v_dst1, shift1);
+        v_dst1 = vec_sr(v_dst1, shift2);
+        v_dst2 = (vector unsigned int)vec_mul((vector signed int)v_r, v_ru);
+        v_dst2 = (vector unsigned int)vec_add((vector signed int)v_dst2, 
+                                      vec_mul((vector signed int)v_g, v_gu ));
+        v_dst2 = (vector unsigned int)vec_add((vector signed int)v_dst2, 
+                                      vec_mul((vector signed int)v_b,  v_bu ));
+        v_dst2 = vec_add(v_dst2, shift1);
+        v_dst2 = vec_sr(v_dst2, shift2);
+        v_dst1 = vec_perm(v_dst1, v_dst2, 
+                 ((vector unsigned char){0, 1, 4, 5, 8, 9, 12, 13, 16, 17, 20, 21, 24, 25, 28, 29}));
+        vec_vsx_st((vector unsigned char)v_dst1, 0, (unsigned char *)dstU_addr); 
+
+        /* V = (rv*r + gv*g + bv*b + bias) >> (RGB2YUV_SHIFT-6) */
+        v_dst1 = (vector unsigned int)vec_mul((vector signed int)v_rd0, v_rv);
+        v_dst1 = (vector unsigned int)vec_add((vector signed int)v_dst1, 
+                                      vec_mul((vector signed int)v_rd1, v_gv ));
+        v_dst1 = (vector unsigned int)vec_add((vector signed int)v_dst1, 
+                                      vec_mul((vector signed int)v_rd2, v_bv ));
+        v_dst1 = vec_add(v_dst1, shift1);
+        v_dst1 = vec_sr(v_dst1, shift2);
+        v_dst2 = (vector unsigned int)vec_mul((vector signed int)v_r, v_rv);
+        v_dst2 = (vector unsigned int)vec_add((vector signed int)v_dst2, 
+                                      vec_mul((vector signed int)v_g, v_gv ));
+        v_dst2 = (vector unsigned int)vec_add((vector signed int)v_dst2, 
+                                      vec_mul((vector signed int)v_b,  v_bv ));
+        v_dst2 = vec_add(v_dst2, shift1);
+        v_dst2 = vec_sr(v_dst2, shift2);
+        v_dst1 = vec_perm(v_dst1, v_dst2, 
+                 ((vector unsigned char){0, 1, 4, 5, 8, 9, 12, 13, 16, 17, 20, 21, 24, 25, 28, 29}));
+        vec_vsx_st((vector unsigned char)v_dst1, 0, (unsigned char *)dstV_addr); 
+
+        src_addr += 24;
+        dstU_addr += 16;
+        dstV_addr += 16;
+    }
+    /* scalar tail: must match the vector rounding exactly */
+    for (i = width_adj; i < width; i++) {
+        int r = src1[3 * i + 0];
+        int g = src1[3 * i + 1];
+        int b = src1[3 * i + 2];
+
+        dstU[i] = (ru*r + gu*g + bu*b + (0x4001<<(RGB2YUV_SHIFT-7)))>>(RGB2YUV_SHIFT-6);
+        dstV[i] = (rv*r + gv*g + bv*b + (0x4001<<(RGB2YUV_SHIFT-7)))>>(RGB2YUV_SHIFT-6);
+    }
+}
+
+/* VSX-accelerated rgb24ToUV_half_c: packed RGB24 -> 15-bit chroma with
+ * 2:1 horizontal subsampling (each output averages two adjacent pixels).
+ * Processes 16 input pixels (8 chroma samples) per vector iteration.
+ * Requires src1 == src2 (asserted below). */
+static void rgb24ToUV_half_c_vsx(uint8_t *_dstU, uint8_t *_dstV, const uint8_t *unused0, const uint8_t *src1,
+                             const uint8_t *src2, int width, uint32_t *rgb2yuv)
+{
+    int i, width_adj;
+    vector unsigned short v_rd0, v_rd1, v_rd2, v_g, v_b, v_r;
+    vector unsigned int v_dst1, v_dst2;
+    /* shift1 is the rounding bias, shift2 the right-shift count */
+    vector unsigned int shift1, shift2;
+    int16_t *dstU = (int16_t *)_dstU;
+    int16_t *dstV = (int16_t *)_dstV;
+    vector signed int v_ru, v_gu, v_bu, v_rv, v_gv, v_bv;
+    int32_t ru = rgb2yuv[RU_IDX], gu = rgb2yuv[GU_IDX], bu = rgb2yuv[BU_IDX];
+    int32_t rv = rgb2yuv[RV_IDX], gv = rgb2yuv[GV_IDX], bv = rgb2yuv[BV_IDX];
+    vector unsigned short v_null = vec_splats((unsigned short)0x0000);
+    vector unsigned short v_FF = vec_splats((unsigned short)0x00FF);
+
+    uintptr_t src_addr = (uintptr_t)src1;
+    uintptr_t dstU_addr = (uintptr_t)_dstU;
+    uintptr_t dstV_addr = (uintptr_t)_dstV;
+
+    /* placed after the declarations: a statement before declarations
+     * breaks C90 and -Werror=declaration-after-statement builds */
+    av_assert1(src1 == src2);
+
+    /* number of chroma samples handled by the vector loop (multiple of 8) */
+    width_adj = width&(~(int)0x07);
+
+    if(width_adj){
+        shift1 = vec_splats((unsigned int)(0x8002<<(RGB2YUV_SHIFT-7)));
+        shift2 = vec_splats((unsigned int)(RGB2YUV_SHIFT-5));
+        v_ru = vec_splats((signed int)ru);
+        v_gu = vec_splats((signed int)gu);
+        v_bu = vec_splats((signed int)bu);
+        v_rv = vec_splats((signed int)rv);
+        v_gv = vec_splats((signed int)gv);
+        v_bv = vec_splats((signed int)bv);
+    }
+
+    for (i = 0; i < width_adj; i+=8) {
+        /* load 48 bytes = 16 RGB triplets */
+        v_rd0 = vec_vsx_ld(0, (unsigned short *)src_addr);
+        v_rd1 = vec_vsx_ld(0, (unsigned short *)(src_addr+16));
+        v_rd2 = vec_vsx_ld(0, (unsigned short *)(src_addr+32));
+
+        /* two-stage de-interleave: first gather the R/G/B bytes present in
+         * v_rd0/v_rd1 (short initializer lists zero-fill the rest), ... */
+        v_r = vec_perm(v_rd0, v_rd1, 
+              ((vector unsigned char){0, 3, 6, 9,  12, 15, 18, 21, 24, 27, 30}));
+        v_g = vec_perm(v_rd0, v_rd1, 
+              ((vector unsigned char){1, 4, 7, 10, 13, 16, 19, 22, 25, 28, 31}));
+        v_b = vec_perm(v_rd0, v_rd1, 
+              ((vector unsigned char){2, 5, 8, 11, 14, 17, 20, 23, 26, 29}));
+
+        /* ... then append the remaining bytes from v_rd2 so each vector
+         * holds the 16 consecutive samples of one channel */
+        v_r = vec_perm(v_r, v_rd2, 
+              ((vector unsigned char){0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 17, 20, 23, 26, 29}));
+        v_g = vec_perm(v_g, v_rd2, 
+              ((vector unsigned char){0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 18, 21, 24, 27, 30}));
+        v_b = vec_perm(v_b, v_rd2, 
+              ((vector unsigned char){0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 16,  19, 22, 25, 28, 31}));
+
+        /* per 16-bit lane: low byte + high byte = sum of two adjacent pixels */
+        v_r = vec_add(vec_and(v_r, v_FF), vec_sr(v_r, vec_splats((unsigned short)8)));
+        v_g = vec_add(vec_and(v_g, v_FF), vec_sr(v_g, vec_splats((unsigned short)8)));
+        v_b = vec_add(vec_and(v_b, v_FF), vec_sr(v_b, vec_splats((unsigned short)8)));
+
+        /* widen to 32 bits: low half in v_rd*, high half in v_r/v_g/v_b */
+        v_rd0 = vec_mergeh(v_r, v_null);
+        v_rd1 = vec_mergeh(v_g, v_null);
+        v_rd2 = vec_mergeh(v_b, v_null);
+
+        v_r = vec_mergel(v_r, v_null);
+        v_g = vec_mergel(v_g, v_null);
+        v_b = vec_mergel(v_b, v_null);
+
+        /* U = (ru*r + gu*g + bu*b + bias) >> (RGB2YUV_SHIFT-5) */
+        v_dst1 = (vector unsigned int)vec_mul((vector signed int)v_rd0, v_ru);
+        v_dst1 = (vector unsigned int)vec_add((vector signed int)v_dst1, 
+                                      vec_mul((vector signed int)v_rd1, v_gu ));
+        v_dst1 = (vector unsigned int)vec_add((vector signed int)v_dst1, 
+                                      vec_mul((vector signed int)v_rd2, v_bu ));
+        v_dst1 = vec_add(v_dst1, shift1);
+        v_dst1 = vec_sr(v_dst1, shift2);
+        v_dst2 = (vector unsigned int)vec_mul((vector signed int)v_r, v_ru);
+        v_dst2 = (vector unsigned int)vec_add((vector signed int)v_dst2, 
+                                      vec_mul((vector signed int)v_g, v_gu ));
+        v_dst2 = (vector unsigned int)vec_add((vector signed int)v_dst2, 
+                                      vec_mul((vector signed int)v_b,  v_bu ));
+        v_dst2 = vec_add(v_dst2, shift1);
+        v_dst2 = vec_sr(v_dst2, shift2);
+        v_dst1 = vec_perm(v_dst1, v_dst2, 
+                 ((vector unsigned char){0, 1, 4, 5, 8, 9, 12, 13, 16, 17, 20, 21, 24, 25, 28, 29}));
+        vec_vsx_st((vector unsigned char)v_dst1, 0, (unsigned char *)dstU_addr); 
+
+        /* V = (rv*r + gv*g + bv*b + bias) >> (RGB2YUV_SHIFT-5) */
+        v_dst1 = (vector unsigned int)vec_mul((vector signed int)v_rd0, v_rv);
+        v_dst1 = (vector unsigned int)vec_add((vector signed int)v_dst1, 
+                                      vec_mul((vector signed int)v_rd1, v_gv ));
+        v_dst1 = (vector unsigned int)vec_add((vector signed int)v_dst1, 
+                                      vec_mul((vector signed int)v_rd2, v_bv ));
+        v_dst1 = vec_add(v_dst1, shift1);
+        v_dst1 = vec_sr(v_dst1, shift2);
+        v_dst2 = (vector unsigned int)vec_mul((vector signed int)v_r, v_rv);
+        v_dst2 = (vector unsigned int)vec_add((vector signed int)v_dst2, 
+                                      vec_mul((vector signed int)v_g, v_gv ));
+        v_dst2 = (vector unsigned int)vec_add((vector signed int)v_dst2, 
+                                      vec_mul((vector signed int)v_b,  v_bv ));
+        v_dst2 = vec_add(v_dst2, shift1);
+        v_dst2 = vec_sr(v_dst2, shift2);
+        v_dst1 = vec_perm(v_dst1, v_dst2, 
+                 ((vector unsigned char){0, 1, 4, 5, 8, 9, 12, 13, 16, 17, 20, 21, 24, 25, 28, 29}));
+        vec_vsx_st((vector unsigned char)v_dst1, 0, (unsigned char *)dstV_addr); 
+
+        src_addr += 48;
+        dstU_addr += 16;
+        dstV_addr += 16;
+    }
+
+    /* scalar tail: must match the vector rounding exactly */
+    for (i = width_adj; i < width; i++) {
+        int r = src1[6 * i + 0] + src1[6 * i + 3];
+        int g = src1[6 * i + 1] + src1[6 * i + 4];
+        int b = src1[6 * i + 2] + src1[6 * i + 5];
+
+        dstU[i] = (ru*r + gu*g + bu*b + (0x8002<<(RGB2YUV_SHIFT-7)))>>(RGB2YUV_SHIFT-5);
+        dstV[i] = (rv*r + gv*g + bv*b + (0x8002<<(RGB2YUV_SHIFT-7)))>>(RGB2YUV_SHIFT-5);
+    }
+}
+
+/* VSX planar 8-bit GBR -> 15-bit luma.  Plane order follows the scalar
+ * tail: src[0]=G, src[1]=B, src[2]=R.  Processes 8 pixels per iteration.
+ * NOTE(review): advances the caller-visible src[] pointers (as the scalar
+ * tail also does) -- confirm callers reset src per line. */
+static void planar_rgb_to_y_vsx(uint8_t *_dst, const uint8_t *src[4], 
+                                int width, int32_t *rgb2yuv)
+{
+    int i, width_adj;
+    vector unsigned short v_rd0, v_rd1, v_rd2, v_g, v_b, v_r, v_g1, v_b1, v_r1;
+    vector unsigned int v_dst1, v_dst2;
+    /* shift1 is the rounding bias, shift2 the right-shift count */
+    vector unsigned int shift1, shift2;
+    int16_t *dst = (int16_t *)_dst;
+    vector signed int v_ry, v_gy, v_by;
+    int32_t ry = rgb2yuv[RY_IDX], gy = rgb2yuv[GY_IDX], by = rgb2yuv[BY_IDX];
+    vector unsigned short v_null = vec_splats((unsigned short)0x0000);
+    vector unsigned short v_FF = vec_splats((unsigned short)0x00FF);
+
+    uintptr_t dst_addr = (uintptr_t)_dst;
+
+    width_adj = width&(~(int)0x07);
+
+    if(width_adj){
+        shift1 = vec_splats((unsigned int)(0x801<<(RGB2YUV_SHIFT-7)));
+        shift2 = vec_splats((unsigned int)(RGB2YUV_SHIFT-6));
+        v_ry = vec_splats((signed int)ry);
+        v_gy = vec_splats((signed int)gy);
+        v_by = vec_splats((signed int)by);
+    }
+
+    for (i = 0; i < width_adj; i+=8) {
+        /* The original "if (i&1) vec_sld(...)" branch was dead code: i is
+         * always a multiple of 8 here, so we always reload.  Loads 16
+         * bytes but consumes only 8 -- NOTE(review): reads up to 8 bytes
+         * past the plane end; confirm input lines are padded. */
+        v_rd0 = vec_vsx_ld(0, (unsigned short *)src[0]);
+        v_rd1 = vec_vsx_ld(0, (unsigned short *)(src[1]));
+        v_rd2 = vec_vsx_ld(0, (unsigned short *)(src[2]));
+
+        /* widen the first 8 bytes of each plane to 16-bit lanes */
+        v_g = vec_perm(v_rd0, v_rd0, 
+              ((vector unsigned char){0, 0, 1, 0, 2, 0, 3, 0, 4, 0, 5, 0, 6, 0, 7, 0}));
+        v_b = vec_perm(v_rd1, v_rd1, 
+              ((vector unsigned char){0, 0, 1, 0, 2, 0, 3, 0, 4, 0, 5, 0, 6, 0, 7, 0}));
+        v_r = vec_perm(v_rd2, v_rd2, 
+              ((vector unsigned char){0, 0, 1, 0, 2, 0, 3, 0, 4, 0, 5, 0, 6, 0, 7, 0}));
+
+        v_b = vec_and(v_b, v_FF);
+        v_g = vec_and(v_g, v_FF);
+        v_r = vec_and(v_r, v_FF);
+
+        /* widen to 32 bits: low half in *_1, high half in v_b/v_g/v_r */
+        v_b1 = vec_mergeh(v_b, v_null);
+        v_g1 = vec_mergeh(v_g, v_null);
+        v_r1 = vec_mergeh(v_r, v_null);
+
+        v_g = vec_mergel(v_g, v_null);
+        v_b = vec_mergel(v_b, v_null);
+        v_r = vec_mergel(v_r, v_null);
+
+        /* Y = (ry*r + gy*g + by*b + bias) >> (RGB2YUV_SHIFT-6) */
+        v_dst1 = (vector unsigned int)vec_mul((vector signed int)v_r1, v_ry);
+        v_dst1 = (vector unsigned int)vec_add((vector signed int)v_dst1, 
+                                      vec_mul((vector signed int)v_g1, v_gy ));
+        v_dst1 = (vector unsigned int)vec_add((vector signed int)v_dst1, 
+                                      vec_mul((vector signed int)v_b1, v_by ));
+        v_dst1 = vec_add(v_dst1, shift1);
+        v_dst1 = vec_sr(v_dst1, shift2);
+        v_dst2 = (vector unsigned int)vec_mul((vector signed int)v_r, v_ry);
+        v_dst2 = (vector unsigned int)vec_add((vector signed int)v_dst2, 
+                                      vec_mul((vector signed int)v_g, v_gy ));
+        v_dst2 = (vector unsigned int)vec_add((vector signed int)v_dst2, 
+                                      vec_mul((vector signed int)v_b,  v_by ));
+        v_dst2 = vec_add(v_dst2, shift1);
+        v_dst2 = vec_sr(v_dst2, shift2);
+        v_dst1 = vec_perm(v_dst1, v_dst2, 
+                 ((vector unsigned char){0, 1, 4, 5, 8, 9, 12, 13, 16, 17, 20, 21, 24, 25, 28, 29}));
+        vec_vsx_st((vector unsigned char)v_dst1, 0, (unsigned char *)dst_addr); 
+
+        src[0] += 8;
+        src[1] += 8;
+        src[2] += 8;
+        dst_addr += 16;
+    }
+
+    /* scalar tail: must match the vector rounding exactly */
+    for (i = width_adj; i < width; i++) {
+        int g = src[0][0];
+        int b = src[1][0];
+        int r = src[2][0];
+        dst[i] = (ry*r + gy*g + by*b + (0x801<<(RGB2YUV_SHIFT-7))) >> (RGB2YUV_SHIFT-6);
+        ++src[0];
+        ++src[1];
+        ++src[2];
+    }
+}
+
+
+/* VSX planar 8-bit alpha -> 14-bit alpha (sample << 6).  src[3] is the
+ * alpha plane; 8 pixels per vector iteration, scalar tail for the rest.
+ * NOTE(review): advances the caller-visible src[3] pointer. */
+static void planar_rgb_to_a_vsx(uint8_t *_dst, const uint8_t *src[4], 
+                                int width, int32_t *unused)
+{
+    int i, width_adj;
+    vector unsigned short v_rd0, v_a, v_dst;
+    int16_t *dst = (int16_t *)_dst;
+    vector unsigned short v_FF = vec_splats((unsigned short)0x00FF);
+
+    uintptr_t dst_addr = (uintptr_t)_dst;
+
+    width_adj = width&(~(int)0x07);
+
+    for (i = 0; i < width_adj; i+=8) {
+        /* The original "if (i&1) vec_sld(...)" branch was dead code: i is
+         * always a multiple of 8 here, so we always reload.  Loads 16
+         * bytes but consumes only 8 -- NOTE(review): reads up to 8 bytes
+         * past the plane end; confirm input lines are padded. */
+        v_rd0 = vec_vsx_ld(0, (unsigned short *)src[3]);
+
+        /* widen the first 8 alpha bytes to 16-bit lanes and scale to 14 bits */
+        v_a = vec_perm(v_rd0, v_rd0, 
+              ((vector unsigned char){0, 0, 1, 0, 2, 0, 3, 0, 4, 0, 5, 0, 6, 0, 7, 0}));
+        v_a = vec_and(v_a, v_FF);
+        v_dst = vec_sl(v_a, vec_splats((unsigned short)6));
+        vec_vsx_st((vector unsigned char)v_dst, 0, (unsigned char *)dst_addr); 
+
+        src[3] += 8;
+        dst_addr += 16;
+    }
+    for (i = width_adj; i < width; i++){
+        dst[i] = src[3][0] << 6;
+        ++src[3];
+    }
+}
+
+
+/* VSX planar 8-bit GBR -> 15-bit chroma (U and V).  Plane order follows
+ * the scalar tail: src[0]=G, src[1]=B, src[2]=R.  8 pixels per iteration.
+ * NOTE(review): advances the caller-visible src[] pointers. */
+static void planar_rgb_to_uv_vsx(uint8_t *_dstU, uint8_t *_dstV, 
+                                 const uint8_t *src[4], int width, int32_t *rgb2yuv)
+{
+    int i, width_adj;
+    vector unsigned short v_rd0, v_rd1, v_rd2, v_g, v_b, v_r, v_g1, v_b1, v_r1;
+    vector unsigned int v_dst1, v_dst2;
+    /* shift1 is the rounding bias, shift2 the right-shift count */
+    vector unsigned int shift1, shift2;
+    uint16_t *dstU = (uint16_t *)_dstU;
+    uint16_t *dstV = (uint16_t *)_dstV;
+    vector signed int v_ru, v_gu, v_bu, v_rv, v_gv, v_bv;
+    int32_t ru = rgb2yuv[RU_IDX], gu = rgb2yuv[GU_IDX], bu = rgb2yuv[BU_IDX];
+    int32_t rv = rgb2yuv[RV_IDX], gv = rgb2yuv[GV_IDX], bv = rgb2yuv[BV_IDX];
+    vector unsigned short v_null = vec_splats((unsigned short)0x0000);
+    vector unsigned short v_FF = vec_splats((unsigned short)0x00FF);
+
+    uintptr_t dstU_addr = (uintptr_t)_dstU;
+    uintptr_t dstV_addr = (uintptr_t)_dstV;
+
+    width_adj = width&(~(int)0x07);
+
+    if(width_adj){
+        shift1 = vec_splats((unsigned int)(0x4001<<(RGB2YUV_SHIFT-7)));
+        shift2 = vec_splats((unsigned int)(RGB2YUV_SHIFT-6));
+        v_ru = vec_splats((signed int)ru);
+        v_gu = vec_splats((signed int)gu);
+        v_bu = vec_splats((signed int)bu);
+        v_rv = vec_splats((signed int)rv);
+        v_gv = vec_splats((signed int)gv);
+        v_bv = vec_splats((signed int)bv);
+    }
+
+    for (i = 0; i < width_adj; i+=8) {
+        /* The original "if (i&1) vec_sld(...)" branch was dead code: i is
+         * always a multiple of 8 here, so we always reload.  Loads 16
+         * bytes but consumes only 8 -- NOTE(review): reads up to 8 bytes
+         * past the plane end; confirm input lines are padded. */
+        v_rd0 = vec_vsx_ld(0, (unsigned short *)src[0]);
+        v_rd1 = vec_vsx_ld(0, (unsigned short *)(src[1]));
+        v_rd2 = vec_vsx_ld(0, (unsigned short *)(src[2]));
+
+        /* widen the first 8 bytes of each plane to 16-bit lanes */
+        v_g = vec_perm(v_rd0, v_rd0, 
+              ((vector unsigned char){0, 0, 1, 0, 2, 0, 3, 0, 4, 0, 5, 0, 6, 0, 7, 0}));
+        v_b = vec_perm(v_rd1, v_rd1, 
+              ((vector unsigned char){0, 0, 1, 0, 2, 0, 3, 0, 4, 0, 5, 0, 6, 0, 7, 0}));
+        v_r = vec_perm(v_rd2, v_rd2, 
+              ((vector unsigned char){0, 0, 1, 0, 2, 0, 3, 0, 4, 0, 5, 0, 6, 0, 7, 0}));
+
+        v_b = vec_and(v_b, v_FF);
+        v_g = vec_and(v_g, v_FF);
+        v_r = vec_and(v_r, v_FF);
+
+        /* widen to 32 bits: low half in *_1, high half in v_b/v_g/v_r */
+        v_b1 = vec_mergeh(v_b, v_null);
+        v_g1 = vec_mergeh(v_g, v_null);
+        v_r1 = vec_mergeh(v_r, v_null);
+
+        v_g = vec_mergel(v_g, v_null);
+        v_b = vec_mergel(v_b, v_null);
+        v_r = vec_mergel(v_r, v_null);
+
+        /* U = (ru*r + gu*g + bu*b + bias) >> (RGB2YUV_SHIFT-6) */
+        v_dst1 = (vector unsigned int)vec_mul((vector signed int)v_r1, v_ru);
+        v_dst1 = (vector unsigned int)vec_add((vector signed int)v_dst1, 
+                                      vec_mul((vector signed int)v_g1, v_gu ));
+        v_dst1 = (vector unsigned int)vec_add((vector signed int)v_dst1, 
+                                      vec_mul((vector signed int)v_b1, v_bu ));
+        v_dst1 = vec_add(v_dst1, shift1);
+        v_dst1 = vec_sr(v_dst1, shift2);
+        v_dst2 = (vector unsigned int)vec_mul((vector signed int)v_r, v_ru);
+        v_dst2 = (vector unsigned int)vec_add((vector signed int)v_dst2, 
+                                      vec_mul((vector signed int)v_g, v_gu ));
+        v_dst2 = (vector unsigned int)vec_add((vector signed int)v_dst2, 
+                                      vec_mul((vector signed int)v_b,  v_bu ));
+        v_dst2 = vec_add(v_dst2, shift1);
+        v_dst2 = vec_sr(v_dst2, shift2);
+        v_dst1 = vec_perm(v_dst1, v_dst2, 
+                 ((vector unsigned char){0, 1, 4, 5, 8, 9, 12, 13, 16, 17, 20, 21, 24, 25, 28, 29}));
+        vec_vsx_st((vector unsigned char)v_dst1, 0, (unsigned char *)dstU_addr);
+
+        /* V = (rv*r + gv*g + bv*b + bias) >> (RGB2YUV_SHIFT-6) */
+        v_dst1 = (vector unsigned int)vec_mul((vector signed int)v_r1, v_rv);
+        v_dst1 = (vector unsigned int)vec_add((vector signed int)v_dst1, 
+                                      vec_mul((vector signed int)v_g1, v_gv ));
+        v_dst1 = (vector unsigned int)vec_add((vector signed int)v_dst1, 
+                                      vec_mul((vector signed int)v_b1, v_bv ));
+        v_dst1 = vec_add(v_dst1, shift1);
+        v_dst1 = vec_sr(v_dst1, shift2);
+        v_dst2 = (vector unsigned int)vec_mul((vector signed int)v_r, v_rv);
+        v_dst2 = (vector unsigned int)vec_add((vector signed int)v_dst2, 
+                                      vec_mul((vector signed int)v_g, v_gv ));
+        v_dst2 = (vector unsigned int)vec_add((vector signed int)v_dst2, 
+                                      vec_mul((vector signed int)v_b,  v_bv ));
+        v_dst2 = vec_add(v_dst2, shift1);
+        v_dst2 = vec_sr(v_dst2, shift2);
+        v_dst1 = vec_perm(v_dst1, v_dst2, 
+                 ((vector unsigned char){0, 1, 4, 5, 8, 9, 12, 13, 16, 17, 20, 21, 24, 25, 28, 29}));
+        vec_vsx_st((vector unsigned char)v_dst1, 0, (unsigned char *)dstV_addr);
+
+        src[0] += 8;
+        src[1] += 8;
+        src[2] += 8;
+        dstU_addr += 16;
+        dstV_addr += 16;
+    }
+    /* scalar tail: must match the vector rounding exactly */
+    for (i = width_adj; i < width; i++) {
+        int g = src[0][0];
+        int b = src[1][0];
+        int r = src[2][0];
+
+        dstU[i] = (ru*r + gu*g + bu*b + (0x4001<<(RGB2YUV_SHIFT-7))) >> (RGB2YUV_SHIFT-6);
+        dstV[i] = (rv*r + gv*g + bv*b + (0x4001<<(RGB2YUV_SHIFT-7))) >> (RGB2YUV_SHIFT-6);
+        ++src[0];
+        ++src[1];
+        ++src[2];
+    }
+}
+
+/* Read one 16-bit sample honouring the source endianness.  The expansion
+ * is parenthesized so that uses like "rdpx(p) << n" parse as intended --
+ * "<<" binds tighter than "?:", so the unparenthesized form silently
+ * dropped the shift on the big-endian branch. */
+#define rdpx(src) \
+    (is_be ? AV_RB16(src) : AV_RL16(src))
+/* VSX planar 16-bit GBR -> luma.  Plane order follows the scalar tail:
+ * src[0]=G, src[1]=B, src[2]=R.  bpc is the component depth; is_be selects
+ * big-endian input (vectors are byte-swapped before the math).  8 pixels
+ * per vector iteration, scalar tail for the rest.
+ * NOTE(review): advances the caller-visible src[] pointers. */
+static av_always_inline 
+void planar_rgb16_to_y_vsx(uint8_t *_dst, const uint8_t *_src[4],
+                           int width, int bpc, int is_be, int32_t *rgb2yuv)
+{
+    int i, width_adj;
+    vector unsigned short v_g, v_b, v_r, v_g1, v_b1, v_r1;
+    vector unsigned int v_dst1, v_dst2;
+    /* shift1 is the rounding bias, shift2 the right-shift count */
+    vector unsigned int shift1, shift2;
+    int16_t *dst = (int16_t *)_dst;
+    const uint16_t **src = (const uint16_t **)_src;
+    vector signed int v_ry, v_gy, v_by;
+    int32_t ry = rgb2yuv[RY_IDX], gy = rgb2yuv[GY_IDX], by = rgb2yuv[BY_IDX];
+    int sh = bpc < 16 ? bpc : 14;  /* 16-bit input is treated as 14-bit */
+    vector unsigned short v_null = vec_splats((unsigned short)0x0000);
+
+    uintptr_t dst_addr = (uintptr_t)_dst;
+
+    width_adj = width&(~(int)0x07);
+
+    if(width_adj){
+        shift1 = vec_splats((unsigned int)(33 << (RGB2YUV_SHIFT + bpc - 9)));
+        shift2 = vec_splats((unsigned int)(RGB2YUV_SHIFT + sh - 14));
+        v_ry = vec_splats((signed int)ry);
+        v_gy = vec_splats((signed int)gy);
+        v_by = vec_splats((signed int)by);
+    }
+
+    for (i = 0; i < width_adj; i+=8) {
+        /* load 8 x uint16 from each plane */
+        v_g = vec_vsx_ld(0, (unsigned short *)src[0]);
+        v_b = vec_vsx_ld(0, (unsigned short *)(src[1]));
+        v_r = vec_vsx_ld(0, (unsigned short *)(src[2]));
+        if(is_be){
+            /* byte-swap each 16-bit lane to host (little-endian) order */
+            v_g = vec_perm(v_g, v_g, 
+                  ((vector unsigned char){1,0,3,2,5,4,7,6,9,8,11,10,13,12,15,14}));
+            v_b = vec_perm(v_b, v_b, 
+                  ((vector unsigned char){1,0,3,2,5,4,7,6,9,8,11,10,13,12,15,14}));
+            v_r = vec_perm(v_r, v_r, 
+                  ((vector unsigned char){1,0,3,2,5,4,7,6,9,8,11,10,13,12,15,14}));
+        }
+
+        /* widen to 32 bits: low half in *_1, high half in v_b/v_g/v_r */
+        v_b1 = vec_mergeh(v_b, v_null);
+        v_g1 = vec_mergeh(v_g, v_null);
+        v_r1 = vec_mergeh(v_r, v_null);
+
+        v_g = vec_mergel(v_g, v_null);
+        v_b = vec_mergel(v_b, v_null);
+        v_r = vec_mergel(v_r, v_null);
+
+        /* Y = (ry*r + gy*g + by*b + bias) >> (RGB2YUV_SHIFT + sh - 14) */
+        v_dst1 = (vector unsigned int)vec_mul((vector signed int)v_r1, v_ry);
+        v_dst1 = (vector unsigned int)vec_add((vector signed int)v_dst1, 
+                                      vec_mul((vector signed int)v_g1, v_gy ));
+        v_dst1 = (vector unsigned int)vec_add((vector signed int)v_dst1, 
+                                      vec_mul((vector signed int)v_b1, v_by ));
+        v_dst1 = vec_add(v_dst1, shift1);
+        v_dst1 = vec_sr(v_dst1, shift2);
+        v_dst2 = (vector unsigned int)vec_mul((vector signed int)v_r, v_ry);
+        v_dst2 = (vector unsigned int)vec_add((vector signed int)v_dst2, 
+                                      vec_mul((vector signed int)v_g, v_gy ));
+        v_dst2 = (vector unsigned int)vec_add((vector signed int)v_dst2, 
+                                      vec_mul((vector signed int)v_b,  v_by ));
+        v_dst2 = vec_add(v_dst2, shift1);
+        v_dst2 = vec_sr(v_dst2, shift2);
+        v_dst1 = vec_perm(v_dst1, v_dst2, 
+                 ((vector unsigned char){0, 1, 4, 5, 8, 9, 12, 13, 16, 17, 20, 21, 24, 25, 28, 29}));
+        vec_vsx_st((vector unsigned char)v_dst1, 0, (unsigned char *)dst_addr); 
+
+        src[0] += 8;
+        src[1] += 8;
+        src[2] += 8;
+        dst_addr += 16;
+    }
+    /* scalar tail: must match the vector rounding exactly */
+    for (i = width_adj; i < width; i++) {
+        int g = rdpx(src[0]);
+        int b = rdpx(src[1]);
+        int r = rdpx(src[2]);
+
+        dst[i] = ((ry*r + gy*g + by*b + 
+                  (33 << (RGB2YUV_SHIFT + bpc - 9))) >> (RGB2YUV_SHIFT + sh - 14));
+        ++src[0];
+        ++src[1];
+        ++src[2];
+    }
+}
+//ToDO
+static av_always_inline 
+void planar_rgb16_to_a_vsx(uint8_t *_dst, const uint8_t *_src[4],
+                           int width, int bpc, int is_be, int32_t *rgb2yuv)
+{
+    
+    int i, width_adj;
+    vector unsigned short v_rd0, v_a, v_dst, shift;  
+    const uint16_t **src = (const uint16_t **)_src;
+    uint16_t *dst        = (uint16_t *)_dst;
+    int sh = bpc < 16 ? bpc : 14;
+    uintptr_t dst_addr = (uintptr_t)_dst;
+
+
+    width_adj = width&(~(int)0x07);
+    if(width_adj){
+        shift = vec_splats((unsigned short)(14 - sh));
+    }
+    for (i = 0; i < width_adj; i+=8) {
+        v_rd0 = vec_vsx_ld(0, (unsigned short *)src[3]);
+        if(is_be)
+            v_dst = vec_perm(v_rd0, v_rd0, 
+                    ((vector unsigned char){1,0,3,2,5,4,7,6,9,8,11,10,13,12,15,14}));
+        else
+            v_dst = vec_sl(v_rd0, shift);
+        vec_vsx_st((vector unsigned char)v_dst, 0, (unsigned char *)dst_addr); 
+
+        src[3] += 8;
+        dst_addr += 16;
+    }
+    for (i=width_adj; i< width; i++){
+        dst[i] = rdpx(src[3]) << (14 - sh);
+        ++src[3];
+    }
+    
+}
+
+/* VSX planar 16-bit GBR -> chroma (U and V).  Plane order follows the
+ * scalar tail: src[0]=G, src[1]=B, src[2]=R.  bpc is the component depth;
+ * is_be selects big-endian input.  8 pixels per vector iteration.
+ * NOTE(review): advances the caller-visible src[] pointers. */
+static av_always_inline 
+void planar_rgb16_to_uv_vsx(uint8_t *_dstU, uint8_t *_dstV,
+                            const uint8_t *_src[4], int width,
+                            int bpc, int is_be, int32_t *rgb2yuv)
+{
+    int i, width_adj;
+    vector unsigned short v_g, v_b, v_r, v_g1, v_b1, v_r1;
+    vector unsigned int v_dst1, v_dst2;
+    /* shift1 is the rounding bias, shift2 the right-shift count */
+    vector unsigned int shift1, shift2;
+    const uint16_t **src = (const uint16_t **)_src;
+    uint16_t *dstU       = (uint16_t *)_dstU;
+    uint16_t *dstV       = (uint16_t *)_dstV;
+    vector signed int v_ru, v_gu, v_bu, v_rv, v_gv, v_bv;
+    int32_t ru = rgb2yuv[RU_IDX], gu = rgb2yuv[GU_IDX], bu = rgb2yuv[BU_IDX];
+    int32_t rv = rgb2yuv[RV_IDX], gv = rgb2yuv[GV_IDX], bv = rgb2yuv[BV_IDX];
+    int sh = bpc < 16 ? bpc : 14;  /* 16-bit input is treated as 14-bit */
+    vector unsigned short v_null = vec_splats((unsigned short)0x0000);
+
+    uintptr_t dstU_addr = (uintptr_t)_dstU;
+    uintptr_t dstV_addr = (uintptr_t)_dstV;
+
+    width_adj = width&(~(int)0x07);
+
+    if(width_adj){
+        shift1 = vec_splats((unsigned int)(257 << (RGB2YUV_SHIFT + bpc - 9)));
+        shift2 = vec_splats((unsigned int)(RGB2YUV_SHIFT + sh - 14));
+        v_ru = vec_splats((signed int)ru);
+        v_gu = vec_splats((signed int)gu);
+        v_bu = vec_splats((signed int)bu);
+        v_rv = vec_splats((signed int)rv);
+        v_gv = vec_splats((signed int)gv);
+        v_bv = vec_splats((signed int)bv);
+    }
+
+    for (i = 0; i < width_adj; i+=8) {
+        /* load 8 x uint16 from each plane */
+        v_g = vec_vsx_ld(0, (unsigned short *)src[0]);
+        v_b = vec_vsx_ld(0, (unsigned short *)(src[1]));
+        v_r = vec_vsx_ld(0, (unsigned short *)(src[2]));
+        if(is_be){
+            /* byte-swap each 16-bit lane to host (little-endian) order */
+            v_g = vec_perm(v_g, v_g, 
+                  ((vector unsigned char){1,0,3,2,5,4,7,6,9,8,11,10,13,12,15,14}));
+            v_b = vec_perm(v_b, v_b, 
+                  ((vector unsigned char){1,0,3,2,5,4,7,6,9,8,11,10,13,12,15,14}));
+            v_r = vec_perm(v_r, v_r, 
+                  ((vector unsigned char){1,0,3,2,5,4,7,6,9,8,11,10,13,12,15,14}));
+        }
+
+        /* widen to 32 bits: low half in *_1, high half in v_b/v_g/v_r */
+        v_b1 = vec_mergeh(v_b, v_null);
+        v_g1 = vec_mergeh(v_g, v_null);
+        v_r1 = vec_mergeh(v_r, v_null);
+
+        v_g = vec_mergel(v_g, v_null);
+        v_b = vec_mergel(v_b, v_null);
+        v_r = vec_mergel(v_r, v_null);
+
+        /* U = (ru*r + gu*g + bu*b + bias) >> (RGB2YUV_SHIFT + sh - 14) */
+        v_dst1 = (vector unsigned int)vec_mul((vector signed int)v_r1, v_ru);
+        v_dst1 = (vector unsigned int)vec_add((vector signed int)v_dst1, 
+                                      vec_mul((vector signed int)v_g1, v_gu ));
+        v_dst1 = (vector unsigned int)vec_add((vector signed int)v_dst1, 
+                                      vec_mul((vector signed int)v_b1, v_bu ));
+        v_dst1 = vec_add(v_dst1, shift1);
+        v_dst1 = vec_sr(v_dst1, shift2);
+        v_dst2 = (vector unsigned int)vec_mul((vector signed int)v_r, v_ru);
+        v_dst2 = (vector unsigned int)vec_add((vector signed int)v_dst2, 
+                                      vec_mul((vector signed int)v_g, v_gu ));
+        v_dst2 = (vector unsigned int)vec_add((vector signed int)v_dst2, 
+                                      vec_mul((vector signed int)v_b,  v_bu ));
+        v_dst2 = vec_add(v_dst2, shift1);
+        v_dst2 = vec_sr(v_dst2, shift2);
+        v_dst1 = vec_perm(v_dst1, v_dst2, 
+                 ((vector unsigned char){0, 1, 4, 5, 8, 9, 12, 13, 16, 17, 20, 21, 24, 25, 28, 29}));
+        vec_vsx_st((vector unsigned char)v_dst1, 0, (unsigned char *)dstU_addr);
+
+        /* V = (rv*r + gv*g + bv*b + bias) >> (RGB2YUV_SHIFT + sh - 14) */
+        v_dst1 = (vector unsigned int)vec_mul((vector signed int)v_r1, v_rv);
+        v_dst1 = (vector unsigned int)vec_add((vector signed int)v_dst1, 
+                                      vec_mul((vector signed int)v_g1, v_gv ));
+        v_dst1 = (vector unsigned int)vec_add((vector signed int)v_dst1, 
+                                      vec_mul((vector signed int)v_b1, v_bv ));
+        v_dst1 = vec_add(v_dst1, shift1);
+        v_dst1 = vec_sr(v_dst1, shift2);
+        v_dst2 = (vector unsigned int)vec_mul((vector signed int)v_r, v_rv);
+        v_dst2 = (vector unsigned int)vec_add((vector signed int)v_dst2, 
+                                      vec_mul((vector signed int)v_g, v_gv ));
+        v_dst2 = (vector unsigned int)vec_add((vector signed int)v_dst2, 
+                                      vec_mul((vector signed int)v_b,  v_bv ));
+        v_dst2 = vec_add(v_dst2, shift1);
+        v_dst2 = vec_sr(v_dst2, shift2);
+        v_dst1 = vec_perm(v_dst1, v_dst2, 
+                ((vector unsigned char){0, 1, 4, 5, 8, 9, 12, 13, 16, 17, 20, 21, 24, 25, 28, 29}));
+        vec_vsx_st((vector unsigned char)v_dst1, 0, (unsigned char *)dstV_addr);  
+
+        src[0] += 8;
+        src[1] += 8;
+        src[2] += 8;
+        dstU_addr += 16;
+        dstV_addr += 16;
+    }
+    /* scalar tail: must match the vector rounding exactly */
+    for (i = width_adj; i < width; i++) {
+        int g = rdpx(src[0]);
+        int b = rdpx(src[1]);
+        int r = rdpx(src[2]);
+
+        dstU[i] = (ru*r + gu*g + bu*b + 
+                  (257 << (RGB2YUV_SHIFT + bpc - 9))) >> (RGB2YUV_SHIFT + sh - 14);
+        dstV[i] = (rv*r + gv*g + bv*b + 
+                  (257 << (RGB2YUV_SHIFT + bpc - 9))) >> (RGB2YUV_SHIFT + sh - 14);
+        ++src[0];
+        ++src[1];
+        ++src[2];
+    }
+}
+#undef rdpx
+
+/* Convert a row of 32-bit float grayscale samples to 16-bit luma
+ * (scalar fallback).  Each sample is scaled to [0, 65535], rounded to
+ * nearest with lrintf() and clipped with av_clip_uint16().
+ * unused1/unused2/unused only exist to match the lumToYV12 signature.
+ * Fix: dropped the leftover STOP_TIMER("47") benchmarking macro — it
+ * has no matching START_TIMER (so the timer locals it reads are never
+ * declared) and profiling scaffolding must not ship in a patch. */
+static av_always_inline void grayf32ToY16_c_vsx(uint8_t *_dst, const uint8_t *_src, const uint8_t *unused1,
+                                            const uint8_t *unused2, int width, uint32_t *unused)
+{
+    int i;
+    const float *src = (const float *)_src;
+    uint16_t *dst    = (uint16_t *)_dst;
+
+    for (i = 0; i < width; ++i){
+        dst[i] = av_clip_uint16(lrintf(65535.0f * src[i]));
+    }
+}
+
+/* Convert a row of byte-swapped (opposite-endian) 32-bit float
+ * grayscale samples to 16-bit luma (scalar fallback).  Each 32-bit
+ * word is byte-swapped, reinterpreted as float via av_int2float(),
+ * scaled to [0, 65535], rounded and clipped.
+ * Fix: dropped the leftover STOP_TIMER("48") benchmarking macro — it
+ * has no matching START_TIMER and profiling scaffolding must not ship
+ * in a patch. */
+static av_always_inline void grayf32ToY16_bswap_c_vsx(uint8_t *_dst, const uint8_t *_src, const uint8_t *unused1,
+                                                  const uint8_t *unused2, int width, uint32_t *unused)
+{
+    int i;
+    const uint32_t *src = (const uint32_t *)_src;
+    uint16_t *dst    = (uint16_t *)_dst;
+
+    for (i = 0; i < width; ++i){
+        dst[i] = av_clip_uint16(lrintf(65535.0f * av_int2float(av_bswap32(src[i]))));
+    }
+}
+
+/* Disabled draft VSX implementations of grayf32ToY16 — the scalar
+ * versions above are registered instead; kept here for reference only.
+ * NOTE(review): before re-enabling, fix the bswap variant's cast of
+ * _src (it assigns a (const float *) to a const uint32_t * pointer)
+ * and verify vec_cts saturation against av_clip_uint16 semantics.
+static av_always_inline 
+void grayf32ToY16_c_vsx(uint8_t *_dst, const uint8_t *_src, 
+                        const uint8_t *unused1, const uint8_t *unused2, 
+                        int width, uint32_t *unused)
+{
+    
+    int i, width_adj;
+    vector float v_rd0, v_rd1;
+    vector signed int v_rd00, v_rd01, v_rd02, v_rd03;
+    vector unsigned short v_dst;  
+    const float *src = (const float *)_src;
+    uint16_t *dst        = (uint16_t *)_dst;
+
+    uintptr_t dst_addr = (uintptr_t)_dst;
+    uintptr_t src_addr = (uintptr_t)_src;
+
+
+    width_adj = width&(~(int)0x07);
+
+    for (i = 0; i < width_adj; i+=8) {
+        v_rd0 = vec_vsx_ld(0, (float *)src_addr);
+        v_rd1 = vec_vsx_ld(0, (float *)(src_addr+16));
+
+        v_rd0 = vec_rint(vec_mul(v_rd0, vec_splats((float)65535.0f)));
+        v_rd1 = vec_rint(vec_mul(v_rd1, vec_splats((float)65535.0f)));
+        v_rd00 = (vector signed int)vec_cts(v_rd0, 0);
+        v_rd01 = (vector signed int)vec_cts(v_rd1, 0);
+        v_rd02 = (vector signed int)vec_cmpgt((vector unsigned int)v_rd00, 
+                                    vec_splats((unsigned int)0xFFFF));
+        v_rd03 = (vector signed int)vec_cmpgt((vector unsigned int)v_rd01, 
+                                    vec_splats((unsigned int)0xFFFF));
+        v_rd00 = vec_or(v_rd00, v_rd02);
+        v_rd01 = vec_or(v_rd01, v_rd03);
+
+        v_dst = (vector unsigned short)vec_perm(v_rd00, v_rd01, 
+                ((vector unsigned char){0, 1, 4, 5, 8, 9, 12, 13, 16, 17, 20, 21, 24, 25, 28, 29}));
+        vec_vsx_st((vector unsigned char)v_dst, 0, (unsigned char *)dst_addr); 
+
+        src_addr += 32;
+        dst_addr += 16;
+    }
+    for (i = width_adj; i < width; i++){
+        dst[i] = av_clip_uint16(lrintf(65535.0f * src[i]));
+    }
+    
+}
+static av_always_inline 
+void grayf32ToY16_bswap_c_vsx(uint8_t *_dst, const uint8_t *_src, 
+                              const uint8_t *unused1, const uint8_t *unused2, 
+                              int width, uint32_t *unused)
+{
+    
+    int i, width_adj;
+    vector signed int v_rd0, v_rd1, v_rd2, v_rd3;
+    vector float v_rd00, v_rd01;
+    vector unsigned short v_dst;  
+    const uint32_t *src = (const float *)_src;
+    uint16_t *dst        = (uint16_t *)_dst;
+
+    uintptr_t dst_addr = (uintptr_t)_dst;
+    uintptr_t src_addr = (uintptr_t)_src;
+
+
+    width_adj = width&(~(int)0x07);
+
+    for (i = 0; i < width_adj; i+=8) {
+        v_rd0 = vec_vsx_ld(0, (int *)src_addr);
+        v_rd1 = vec_vsx_ld(0, (int *)(src_addr+16));
+
+        v_rd0 = vec_perm(v_rd0, v_rd0, 
+                ((vector unsigned char){3, 2, 1, 0, 7, 6, 5, 4, 11, 10, 9, 8, 15, 14, 13, 12}));
+        v_rd1 = vec_perm(v_rd1, v_rd1, 
+                ((vector unsigned char){3, 2, 1, 0, 7, 6, 5, 4, 11, 10, 9, 8, 15, 14, 13, 12}));
+        v_rd00 = vec_round(vec_mul((vector float)v_rd0, vec_splats((float)65535.0f)));
+        v_rd01 = vec_round(vec_mul((vector float)v_rd1, vec_splats((float)65535.0f)));
+        
+
+        v_rd0 = vec_cts(v_rd00, 0);
+        v_rd1 = vec_cts(v_rd01, 0);
+        v_rd2 = (vector signed int)vec_cmpgt((vector unsigned int)v_rd0, 
+                                   vec_splats((unsigned int)0xFFFF));
+        v_rd3 = (vector signed int)vec_cmpgt((vector unsigned int)v_rd1, 
+                                   vec_splats((unsigned int)0xFFFF));
+        v_rd0 = vec_or(v_rd0, v_rd2);
+        v_rd1 = vec_or(v_rd1, v_rd3);
+
+        
+        
+        v_dst = (vector unsigned short)vec_perm(v_rd0, v_rd1, 
+                ((vector unsigned char){0, 1, 4, 5, 8, 9, 12, 13, 16, 17, 20, 21, 24, 25, 28, 29}));
+        vec_vsx_st((vector unsigned char)v_dst, 0, (unsigned char *)dst_addr); 
+
+        src_addr += 32;
+        dst_addr += 16;
+    }
+    for (i = width_adj; i < width; i++){
+    
+        dst[i] = av_clip_uint16(lrintf(65535.0f * av_int2float(av_bswap32(src[i]))));
+    }
+    
+}*/
+
+/* Instantiate per-depth, per-endianness wrappers around the generic
+ * planar_rgb16_to_y_vsx()/planar_rgb16_to_uv_vsx() helpers so that each
+ * GBRP pixel depth gets functions matching the readLumPlanar /
+ * readChrPlanar callback signatures (nbits and endian become compile-time
+ * constants inside the generic helpers). */
+#define rgb9plus_planar_funcs_endian(nbits, endian_name, endian)                                    \
+static void planar_rgb##nbits##endian_name##_to_y_vsx(uint8_t *dst, const uint8_t *src[4],              \
+                                                  int w, int32_t *rgb2yuv)                          \
+{                                                                                                   \
+    planar_rgb16_to_y_vsx(dst, src, w, nbits, endian, rgb2yuv);                                         \
+}                                                                                                   \
+static void planar_rgb##nbits##endian_name##_to_uv_vsx(uint8_t *dstU, uint8_t *dstV,                    \
+                                                   const uint8_t *src[4], int w, int32_t *rgb2yuv)  \
+{                                                                                                   \
+    planar_rgb16_to_uv_vsx(dstU, dstV, src, w, nbits, endian, rgb2yuv);                                 \
+}                                                                                                   \
+
+
+/* Instantiate little- and big-endian alpha-plane readers for GBRAP
+ * formats at a given depth, wrapping planar_rgb16_to_a_vsx() with the
+ * endian flag fixed (0 = LE, 1 = BE) to match readAlpPlanar's signature. */
+#define rgb9plus_planar_transparency_funcs(nbits)                           \
+static void planar_rgb##nbits##le_to_a_vsx(uint8_t *dst, const uint8_t *src[4], \
+                                       int w, int32_t *rgb2yuv)             \
+{                                                                           \
+    planar_rgb16_to_a_vsx(dst, src, w, nbits, 0, rgb2yuv);                      \
+}                                                                           \
+static void planar_rgb##nbits##be_to_a_vsx(uint8_t *dst, const uint8_t *src[4], \
+                                       int w, int32_t *rgb2yuv)             \
+{                                                                           \
+    planar_rgb16_to_a_vsx(dst, src, w, nbits, 1, rgb2yuv);                      \
+}
+
+/* Convenience macro: emit both endiannesses of the Y/UV readers for one
+ * depth. */
+#define rgb9plus_planar_funcs(nbits)            \
+    rgb9plus_planar_funcs_endian(nbits, le, 0)  \
+    rgb9plus_planar_funcs_endian(nbits, be, 1)
+
+/* Generate the concrete reader functions for every GBRP depth handled
+ * by ff_sws_init_input_funcs_vsx() below; alpha readers only exist for
+ * the depths that have GBRAP pixel formats (10/12/16). */
+rgb9plus_planar_funcs(9)
+rgb9plus_planar_funcs(10)
+rgb9plus_planar_funcs(12)
+rgb9plus_planar_funcs(14)
+rgb9plus_planar_funcs(16)
+
+rgb9plus_planar_transparency_funcs(10)
+rgb9plus_planar_transparency_funcs(12)
+rgb9plus_planar_transparency_funcs(16)
+#endif //!HAVE_BIGENDIAN
+#endif //HAVE_VSX 
+/* Install VSX-accelerated input readers (pixel format -> planar YUV)
+ * into the SwsContext, mirroring the dispatch in
+ * ff_sws_init_input_funcs() from libswscale/input.c.  Runs after the C
+ * init so any pointer set here overrides the scalar implementation.
+ * Only active on VSX-capable little-endian POWER builds; on other
+ * builds the body compiles to the CPU-flag check alone. */
+av_cold void ff_sws_init_input_funcs_vsx(SwsContext *c)
+{
+    /* Runtime check: bail out unless the CPU actually supports VSX. */
+    if (!(av_get_cpu_flags() & AV_CPU_FLAG_VSX))
+        return;
+#if HAVE_VSX 
+#if !HAVE_BIGENDIAN
+    enum AVPixelFormat srcFormat = c->srcFormat;
+
+    /* --- Chroma (U/V) readers ------------------------------------- */
+    c->chrToYV12 = NULL;
+    switch (srcFormat) {
+    case AV_PIX_FMT_YUYV422:
+        c->chrToYV12 = yuy2ToUV_c_vsx;
+        break;
+    case AV_PIX_FMT_YVYU422:
+        c->chrToYV12 = yvy2ToUV_c_vsx;
+        break;
+    case AV_PIX_FMT_UYVY422:
+        c->chrToYV12 = uyvyToUV_c_vsx;
+        break;
+    case AV_PIX_FMT_NV12:
+    case AV_PIX_FMT_NV24:
+        c->chrToYV12 = nv12ToUV_c_vsx;
+        break;
+    case AV_PIX_FMT_NV21:
+    case AV_PIX_FMT_NV42:
+        c->chrToYV12 = nv21ToUV_c_vsx;
+        break;
+    case AV_PIX_FMT_RGB8:
+    case AV_PIX_FMT_BGR8:
+    case AV_PIX_FMT_PAL8:
+    case AV_PIX_FMT_BGR4_BYTE:
+    case AV_PIX_FMT_RGB4_BYTE:
+        c->chrToYV12 = palToUV_c_vsx;
+        break;
+    case AV_PIX_FMT_GBRP9LE:
+        c->readChrPlanar = planar_rgb9le_to_uv_vsx;
+        break;
+    case AV_PIX_FMT_GBRAP10LE:
+    case AV_PIX_FMT_GBRP10LE:
+        c->readChrPlanar = planar_rgb10le_to_uv_vsx;
+        break;
+    case AV_PIX_FMT_GBRAP12LE:
+    case AV_PIX_FMT_GBRP12LE:
+        c->readChrPlanar = planar_rgb12le_to_uv_vsx;
+        break;
+    case AV_PIX_FMT_GBRP14LE:
+        c->readChrPlanar = planar_rgb14le_to_uv_vsx;
+        break;
+    case AV_PIX_FMT_GBRAP16LE:
+    case AV_PIX_FMT_GBRP16LE:
+        c->readChrPlanar = planar_rgb16le_to_uv_vsx;
+        break;
+    case AV_PIX_FMT_GBRP9BE:
+        c->readChrPlanar = planar_rgb9be_to_uv_vsx;
+        break;
+    case AV_PIX_FMT_GBRAP10BE:
+    case AV_PIX_FMT_GBRP10BE:
+        c->readChrPlanar = planar_rgb10be_to_uv_vsx;
+        break;
+    case AV_PIX_FMT_GBRAP12BE:
+    case AV_PIX_FMT_GBRP12BE:
+        c->readChrPlanar = planar_rgb12be_to_uv_vsx;
+        break;
+    case AV_PIX_FMT_GBRP14BE:
+        c->readChrPlanar = planar_rgb14be_to_uv_vsx;
+        break;
+    case AV_PIX_FMT_GBRAP16BE:
+    case AV_PIX_FMT_GBRP16BE:
+        c->readChrPlanar = planar_rgb16be_to_uv_vsx;
+        break;
+    case AV_PIX_FMT_GBRAP:
+    case AV_PIX_FMT_GBRP:
+        c->readChrPlanar = planar_rgb_to_uv_vsx;
+        break;
+    /* All opposite-endian (BE) planar YUV formats just need a 16-bit
+     * byte swap on the chroma planes. */
+    case AV_PIX_FMT_YUV420P9BE:
+    case AV_PIX_FMT_YUV422P9BE:
+    case AV_PIX_FMT_YUV444P9BE:
+    case AV_PIX_FMT_YUV420P10BE:
+    case AV_PIX_FMT_YUV422P10BE:
+    case AV_PIX_FMT_YUV440P10BE:
+    case AV_PIX_FMT_YUV444P10BE:
+    case AV_PIX_FMT_YUV420P12BE:
+    case AV_PIX_FMT_YUV422P12BE:
+    case AV_PIX_FMT_YUV440P12BE:
+    case AV_PIX_FMT_YUV444P12BE:
+    case AV_PIX_FMT_YUV420P14BE:
+    case AV_PIX_FMT_YUV422P14BE:
+    case AV_PIX_FMT_YUV444P14BE:
+    case AV_PIX_FMT_YUV420P16BE:
+    case AV_PIX_FMT_YUV422P16BE:
+    case AV_PIX_FMT_YUV444P16BE:
+
+    case AV_PIX_FMT_YUVA420P9BE:
+    case AV_PIX_FMT_YUVA422P9BE:
+    case AV_PIX_FMT_YUVA444P9BE:
+    case AV_PIX_FMT_YUVA420P10BE:
+    case AV_PIX_FMT_YUVA422P10BE:
+    case AV_PIX_FMT_YUVA444P10BE:
+    case AV_PIX_FMT_YUVA422P12BE:
+    case AV_PIX_FMT_YUVA444P12BE:
+    case AV_PIX_FMT_YUVA420P16BE:
+    case AV_PIX_FMT_YUVA422P16BE:
+    case AV_PIX_FMT_YUVA444P16BE:
+        c->chrToYV12 = bswap16UV_c_vsx;
+        break;
+    case AV_PIX_FMT_AYUV64LE:
+        c->chrToYV12 = read_ayuv64le_UV_c_vsx;
+        break;
+    case AV_PIX_FMT_P010LE:
+        c->chrToYV12 = p010LEToUV_c_vsx;
+        break;
+    case AV_PIX_FMT_P010BE:
+        c->chrToYV12 = p010BEToUV_c_vsx;
+        break;
+    case AV_PIX_FMT_P016LE:
+        c->chrToYV12 = p016LEToUV_c_vsx;
+        break;
+    case AV_PIX_FMT_P016BE:
+        c->chrToYV12 = p016BEToUV_c_vsx;
+        break;
+    }
+    /* Packed RGB chroma readers come in two flavours: the _half_
+     * variants average two horizontally adjacent pixels when chroma is
+     * horizontally subsampled. */
+    if (c->chrSrcHSubSample) {
+        switch (srcFormat) {
+        case AV_PIX_FMT_RGBA64BE:
+            c->chrToYV12 = rgb64BEToUV_half_c_vsx;
+            break;
+        case AV_PIX_FMT_RGBA64LE:
+            c->chrToYV12 = rgb64LEToUV_half_c_vsx;
+            break;
+        case AV_PIX_FMT_BGRA64BE:
+            c->chrToYV12 = bgr64BEToUV_half_c_vsx;
+            break;
+        case AV_PIX_FMT_BGRA64LE:
+            c->chrToYV12 = bgr64LEToUV_half_c_vsx;
+            break;
+        case AV_PIX_FMT_RGB48BE:
+            c->chrToYV12 = rgb48BEToUV_half_c_vsx;
+            break;
+        case AV_PIX_FMT_RGB48LE:
+            c->chrToYV12 = rgb48LEToUV_half_c_vsx;
+            break;
+        case AV_PIX_FMT_BGR48BE:
+            c->chrToYV12 = bgr48BEToUV_half_c_vsx;
+            break;
+        case AV_PIX_FMT_BGR48LE:
+            c->chrToYV12 = bgr48LEToUV_half_c_vsx;
+            break;
+        case AV_PIX_FMT_RGB32:
+            c->chrToYV12 = bgr32ToUV_half_c_vsx;
+            break;
+        case AV_PIX_FMT_RGB32_1:
+            c->chrToYV12 = bgr321ToUV_half_c_vsx;
+            break;
+        case AV_PIX_FMT_BGR24:
+            c->chrToYV12 = bgr24ToUV_half_c_vsx;
+            break;
+        case AV_PIX_FMT_BGR565LE:
+            c->chrToYV12 = bgr16leToUV_half_c_vsx;
+            break;
+        case AV_PIX_FMT_BGR565BE:
+            c->chrToYV12 = bgr16beToUV_half_c_vsx;
+            break;
+        case AV_PIX_FMT_BGR555LE:
+            c->chrToYV12 = bgr15leToUV_half_c_vsx;
+            break;
+        case AV_PIX_FMT_BGR555BE:
+            c->chrToYV12 = bgr15beToUV_half_c_vsx;
+            break;
+        case AV_PIX_FMT_GBRAP:
+        case AV_PIX_FMT_GBRP:
+            c->chrToYV12 = gbr24pToUV_half_c_vsx;
+            break;
+        case AV_PIX_FMT_BGR444LE:
+            c->chrToYV12 = bgr12leToUV_half_c_vsx;
+            break;
+        case AV_PIX_FMT_BGR444BE:
+            c->chrToYV12 = bgr12beToUV_half_c_vsx;
+            break;
+        case AV_PIX_FMT_BGR32:
+            c->chrToYV12 = rgb32ToUV_half_c_vsx;
+            break;
+        case AV_PIX_FMT_BGR32_1:
+            c->chrToYV12 = rgb321ToUV_half_c_vsx;
+            break;
+        case AV_PIX_FMT_RGB24:
+            c->chrToYV12 = rgb24ToUV_half_c_vsx;
+            break;
+        case AV_PIX_FMT_RGB565LE:
+            c->chrToYV12 = rgb16leToUV_half_c_vsx;
+            break;
+        case AV_PIX_FMT_RGB565BE:
+            c->chrToYV12 = rgb16beToUV_half_c_vsx;
+            break;
+        case AV_PIX_FMT_RGB555LE:
+            c->chrToYV12 = rgb15leToUV_half_c_vsx;
+            break;
+        case AV_PIX_FMT_RGB555BE:
+            c->chrToYV12 = rgb15beToUV_half_c_vsx;
+            break;
+        case AV_PIX_FMT_RGB444LE:
+            c->chrToYV12 = rgb12leToUV_half_c_vsx;
+            break;
+        case AV_PIX_FMT_RGB444BE:
+            c->chrToYV12 = rgb12beToUV_half_c_vsx;
+            break;
+        }
+    } else {
+        switch (srcFormat) {
+        case AV_PIX_FMT_RGBA64BE:
+            c->chrToYV12 = rgb64BEToUV_c_vsx;
+            break;
+        case AV_PIX_FMT_RGBA64LE:
+            c->chrToYV12 = rgb64LEToUV_c_vsx;
+            break;
+        case AV_PIX_FMT_BGRA64BE:
+            c->chrToYV12 = bgr64BEToUV_c_vsx;
+            break;
+        case AV_PIX_FMT_BGRA64LE:
+            c->chrToYV12 = bgr64LEToUV_c_vsx;
+            break;
+        case AV_PIX_FMT_RGB48BE:
+            c->chrToYV12 = rgb48BEToUV_c_vsx;
+            break;
+        case AV_PIX_FMT_RGB48LE:
+            c->chrToYV12 = rgb48LEToUV_c_vsx;
+            break;
+        case AV_PIX_FMT_BGR48BE:
+            c->chrToYV12 = bgr48BEToUV_c_vsx;
+            break;
+        case AV_PIX_FMT_BGR48LE:
+            c->chrToYV12 = bgr48LEToUV_c_vsx;
+            break;
+        case AV_PIX_FMT_RGB32:
+            c->chrToYV12 = bgr32ToUV_c_vsx;
+            break;
+        case AV_PIX_FMT_RGB32_1:
+            c->chrToYV12 = bgr321ToUV_c_vsx;
+            break;
+        case AV_PIX_FMT_BGR24:
+            c->chrToYV12 = bgr24ToUV_c_vsx;
+            break;
+        case AV_PIX_FMT_BGR565LE:
+            c->chrToYV12 = bgr16leToUV_c_vsx;
+            break;
+        case AV_PIX_FMT_BGR565BE:
+            c->chrToYV12 = bgr16beToUV_c_vsx;
+            break;
+        case AV_PIX_FMT_BGR555LE:
+            c->chrToYV12 = bgr15leToUV_c_vsx;
+            break;
+        case AV_PIX_FMT_BGR555BE:
+            c->chrToYV12 = bgr15beToUV_c_vsx;
+            break;
+        case AV_PIX_FMT_BGR444LE:
+            c->chrToYV12 = bgr12leToUV_c_vsx;
+            break;
+        case AV_PIX_FMT_BGR444BE:
+            c->chrToYV12 = bgr12beToUV_c_vsx;
+            break;
+        case AV_PIX_FMT_BGR32:
+            c->chrToYV12 = rgb32ToUV_c_vsx;
+            break;
+        case AV_PIX_FMT_BGR32_1:
+            c->chrToYV12 = rgb321ToUV_c_vsx;
+            break;
+        case AV_PIX_FMT_RGB24:
+            c->chrToYV12 = rgb24ToUV_c_vsx;
+            break;
+        case AV_PIX_FMT_RGB565LE:
+            c->chrToYV12 = rgb16leToUV_c_vsx;
+            break;
+        case AV_PIX_FMT_RGB565BE:
+            c->chrToYV12 = rgb16beToUV_c_vsx;
+            break;
+        case AV_PIX_FMT_RGB555LE:
+            c->chrToYV12 = rgb15leToUV_c_vsx;
+            break;
+        case AV_PIX_FMT_RGB555BE:
+            c->chrToYV12 = rgb15beToUV_c_vsx;
+            break;
+        case AV_PIX_FMT_RGB444LE:
+            c->chrToYV12 = rgb12leToUV_c_vsx;
+            break;
+        case AV_PIX_FMT_RGB444BE:
+            c->chrToYV12 = rgb12beToUV_c_vsx;
+            break;
+        }
+    }
+
+    /* --- Luma (Y) and alpha readers ------------------------------- */
+    c->lumToYV12 = NULL;
+    c->alpToYV12 = NULL;
+    switch (srcFormat) {
+    case AV_PIX_FMT_GBRP9LE:
+        c->readLumPlanar = planar_rgb9le_to_y_vsx;
+        break;
+    case AV_PIX_FMT_GBRAP10LE:
+        c->readAlpPlanar = planar_rgb10le_to_a_vsx;
+        /* fallthrough: GBRAP also needs the GBRP luma reader */
+    case AV_PIX_FMT_GBRP10LE:
+        c->readLumPlanar = planar_rgb10le_to_y_vsx;
+        break;
+    case AV_PIX_FMT_GBRAP12LE:
+        c->readAlpPlanar = planar_rgb12le_to_a_vsx;
+        /* fallthrough */
+    case AV_PIX_FMT_GBRP12LE:
+        c->readLumPlanar = planar_rgb12le_to_y_vsx;
+        break;
+    case AV_PIX_FMT_GBRP14LE:
+        c->readLumPlanar = planar_rgb14le_to_y_vsx;
+        break;
+    case AV_PIX_FMT_GBRAP16LE:
+        c->readAlpPlanar = planar_rgb16le_to_a_vsx;
+        /* fallthrough */
+    case AV_PIX_FMT_GBRP16LE:
+        c->readLumPlanar = planar_rgb16le_to_y_vsx;
+        break;
+    case AV_PIX_FMT_GBRP9BE:
+        c->readLumPlanar = planar_rgb9be_to_y_vsx;
+        break;
+    case AV_PIX_FMT_GBRAP10BE:
+        c->readAlpPlanar = planar_rgb10be_to_a_vsx;
+        /* fallthrough */
+    case AV_PIX_FMT_GBRP10BE:
+        c->readLumPlanar = planar_rgb10be_to_y_vsx;
+        break;
+    case AV_PIX_FMT_GBRAP12BE:
+        c->readAlpPlanar = planar_rgb12be_to_a_vsx;
+        /* fallthrough */
+    case AV_PIX_FMT_GBRP12BE:
+        c->readLumPlanar = planar_rgb12be_to_y_vsx;
+        break;
+    case AV_PIX_FMT_GBRP14BE:
+        c->readLumPlanar = planar_rgb14be_to_y_vsx;
+        break;
+    case AV_PIX_FMT_GBRAP16BE:
+        c->readAlpPlanar = planar_rgb16be_to_a_vsx;
+        /* fallthrough */
+    case AV_PIX_FMT_GBRP16BE:
+        c->readLumPlanar = planar_rgb16be_to_y_vsx;
+        break;
+    case AV_PIX_FMT_GBRAP:
+        c->readAlpPlanar = planar_rgb_to_a_vsx;
+        /* fallthrough */
+    case AV_PIX_FMT_GBRP:
+        c->readLumPlanar = planar_rgb_to_y_vsx;
+        break;
+
+    case AV_PIX_FMT_YUV420P9BE:
+    case AV_PIX_FMT_YUV422P9BE:
+    case AV_PIX_FMT_YUV444P9BE:
+    case AV_PIX_FMT_YUV420P10BE:
+    case AV_PIX_FMT_YUV422P10BE:
+    case AV_PIX_FMT_YUV440P10BE:
+    case AV_PIX_FMT_YUV444P10BE:
+    case AV_PIX_FMT_YUV420P12BE:
+    case AV_PIX_FMT_YUV422P12BE:
+    case AV_PIX_FMT_YUV440P12BE:
+    case AV_PIX_FMT_YUV444P12BE:
+    case AV_PIX_FMT_YUV420P14BE:
+    case AV_PIX_FMT_YUV422P14BE:
+    case AV_PIX_FMT_YUV444P14BE:
+    case AV_PIX_FMT_YUV420P16BE:
+    case AV_PIX_FMT_YUV422P16BE:
+    case AV_PIX_FMT_YUV444P16BE:
+
+    case AV_PIX_FMT_GRAY9BE:
+    case AV_PIX_FMT_GRAY10BE:
+    case AV_PIX_FMT_GRAY12BE:
+    case AV_PIX_FMT_GRAY14BE:
+    case AV_PIX_FMT_GRAY16BE:
+
+    case AV_PIX_FMT_P016BE:
+        c->lumToYV12 = bswap16Y_c_vsx;
+        break;
+    case AV_PIX_FMT_YUVA420P9BE:
+    case AV_PIX_FMT_YUVA422P9BE:
+    case AV_PIX_FMT_YUVA444P9BE:
+    case AV_PIX_FMT_YUVA420P10BE:
+    case AV_PIX_FMT_YUVA422P10BE:
+    case AV_PIX_FMT_YUVA444P10BE:
+    case AV_PIX_FMT_YUVA422P12BE:
+    case AV_PIX_FMT_YUVA444P12BE:
+    case AV_PIX_FMT_YUVA420P16BE:
+    case AV_PIX_FMT_YUVA422P16BE:
+    case AV_PIX_FMT_YUVA444P16BE:
+        c->lumToYV12 = bswap16Y_c_vsx;
+        c->alpToYV12 = bswap16Y_c_vsx;
+        break;
+    case AV_PIX_FMT_YA16LE:
+        c->lumToYV12 = read_ya16le_gray_c_vsx;
+        break;
+    case AV_PIX_FMT_YA16BE:
+        c->lumToYV12 = read_ya16be_gray_c_vsx;
+        break;
+    case AV_PIX_FMT_AYUV64LE:
+        c->lumToYV12 = read_ayuv64le_Y_c_vsx;
+        break;
+    case AV_PIX_FMT_YUYV422:
+    case AV_PIX_FMT_YVYU422:
+    case AV_PIX_FMT_YA8:
+        c->lumToYV12 = yuy2ToY_c_vsx;
+        break;
+    case AV_PIX_FMT_UYVY422:
+        c->lumToYV12 = uyvyToY_c_vsx;
+        break;
+    case AV_PIX_FMT_BGR24:
+        c->lumToYV12 = bgr24ToY_c_vsx;
+        break;
+    case AV_PIX_FMT_BGR565LE:
+        c->lumToYV12 = bgr16leToY_c_vsx;
+        break;
+    case AV_PIX_FMT_BGR565BE:
+        c->lumToYV12 = bgr16beToY_c_vsx;
+        break;
+    case AV_PIX_FMT_BGR555LE:
+        c->lumToYV12 = bgr15leToY_c_vsx;
+        break;
+    case AV_PIX_FMT_BGR555BE:
+        c->lumToYV12 = bgr15beToY_c_vsx;
+        break;
+    case AV_PIX_FMT_BGR444LE:
+        c->lumToYV12 = bgr12leToY_c_vsx;
+        break;
+    case AV_PIX_FMT_BGR444BE:
+        c->lumToYV12 = bgr12beToY_c_vsx;
+        break;
+    case AV_PIX_FMT_RGB24:
+        c->lumToYV12 = rgb24ToY_c_vsx;
+        break;
+    case AV_PIX_FMT_RGB565LE:
+        c->lumToYV12 = rgb16leToY_c_vsx;
+        break;
+    case AV_PIX_FMT_RGB565BE:
+        c->lumToYV12 = rgb16beToY_c_vsx;
+        break;
+    case AV_PIX_FMT_RGB555LE:
+        c->lumToYV12 = rgb15leToY_c_vsx;
+        break;
+    case AV_PIX_FMT_RGB555BE:
+        c->lumToYV12 = rgb15beToY_c_vsx;
+        break;
+    case AV_PIX_FMT_RGB444LE:
+        c->lumToYV12 = rgb12leToY_c_vsx;
+        break;
+    case AV_PIX_FMT_RGB444BE:
+        c->lumToYV12 = rgb12beToY_c_vsx;
+        break;
+    case AV_PIX_FMT_RGB8:
+    case AV_PIX_FMT_BGR8:
+    case AV_PIX_FMT_PAL8:
+    case AV_PIX_FMT_BGR4_BYTE:
+    case AV_PIX_FMT_RGB4_BYTE:
+        c->lumToYV12 = palToY_c_vsx;
+        break;
+    case AV_PIX_FMT_MONOBLACK:
+        c->lumToYV12 = monoblack2Y_c_vsx;
+        break;
+    case AV_PIX_FMT_MONOWHITE:
+        c->lumToYV12 = monowhite2Y_c_vsx;
+        break;
+    case AV_PIX_FMT_RGB32:
+        c->lumToYV12 = bgr32ToY_c_vsx;
+        break;
+    case AV_PIX_FMT_RGB32_1:
+        c->lumToYV12 = bgr321ToY_c_vsx;
+        break;
+    case AV_PIX_FMT_BGR32:
+        c->lumToYV12 = rgb32ToY_c_vsx;
+        break;
+    case AV_PIX_FMT_BGR32_1:
+        c->lumToYV12 = rgb321ToY_c_vsx;
+        break;
+    case AV_PIX_FMT_RGB48BE:
+        c->lumToYV12 = rgb48BEToY_c_vsx;
+        break;
+    case AV_PIX_FMT_RGB48LE:
+        c->lumToYV12 = rgb48LEToY_c_vsx;
+        break;
+    case AV_PIX_FMT_BGR48BE:
+        c->lumToYV12 = bgr48BEToY_c_vsx;
+        break;
+    case AV_PIX_FMT_BGR48LE:
+        c->lumToYV12 = bgr48LEToY_c_vsx;
+        break;
+    case AV_PIX_FMT_RGBA64BE:
+        c->lumToYV12 = rgb64BEToY_c_vsx;
+        break;
+    case AV_PIX_FMT_RGBA64LE:
+        c->lumToYV12 = rgb64LEToY_c_vsx;
+        break;
+    case AV_PIX_FMT_BGRA64BE:
+        c->lumToYV12 = bgr64BEToY_c_vsx;
+        break;
+    case AV_PIX_FMT_BGRA64LE:
+        c->lumToYV12 = bgr64LEToY_c_vsx;
+        break;
+    case AV_PIX_FMT_P010LE:
+        c->lumToYV12 = p010LEToY_c_vsx;
+        break;
+    case AV_PIX_FMT_P010BE:
+        c->lumToYV12 = p010BEToY_c_vsx;
+        break;
+    case AV_PIX_FMT_GRAYF32LE:
+        c->lumToYV12 = grayf32ToY16_c_vsx;
+        break;
+    case AV_PIX_FMT_GRAYF32BE:
+        c->lumToYV12 = grayf32ToY16_bswap_c_vsx;
+        break;
+    }
+    if (c->needAlpha) {
+        if (is16BPS(srcFormat) || isNBPS(srcFormat)) {
+            if (HAVE_BIGENDIAN == !isBE(srcFormat) && !c->readAlpPlanar)
+                c->alpToYV12 = bswap16Y_c_vsx;
+        }
+        switch (srcFormat) {
+        case AV_PIX_FMT_BGRA64LE:
+        case AV_PIX_FMT_RGBA64LE:  c->alpToYV12 = rgba64leToA_c_vsx; break;
+        case AV_PIX_FMT_BGRA64BE:
+        case AV_PIX_FMT_RGBA64BE:  c->alpToYV12 = rgba64beToA_c_vsx; break;
+        case AV_PIX_FMT_BGRA:
+        case AV_PIX_FMT_RGBA:
+            c->alpToYV12 = rgbaToA_c_vsx;
+            break;
+        case AV_PIX_FMT_ABGR:
+        case AV_PIX_FMT_ARGB:
+            c->alpToYV12 = abgrToA_c_vsx;
+            break;
+        case AV_PIX_FMT_YA8:
+            /* NOTE(review): reuses uyvyToY (reads every 2nd byte) for the
+             * interleaved alpha; mirrors ff_sws_init_input_funcs() — verify
+             * against the C init in input.c. */
+            c->alpToYV12 = uyvyToY_c_vsx;
+            break;
+        case AV_PIX_FMT_YA16LE:
+            c->alpToYV12 = read_ya16le_alpha_c_vsx;
+            break;
+        case AV_PIX_FMT_YA16BE:
+            c->alpToYV12 = read_ya16be_alpha_c_vsx;
+            break;
+        case AV_PIX_FMT_AYUV64LE:
+            c->alpToYV12 = read_ayuv64le_A_c_vsx;
+            break;
+        case AV_PIX_FMT_PAL8 :
+            c->alpToYV12 = palToA_c_vsx;
+            break;
+        }
+    }
+#endif //!HAVE_BIGENDIAN
+#endif //HAVE_VSX 
+}
diff --git a/libswscale/swscale.c b/libswscale/swscale.c
index 8436f05..fca1999 100644
--- a/libswscale/swscale.c
+++ b/libswscale/swscale.c
@@ -566,6 +566,8 @@  static av_cold void sws_init_swscale(SwsContext *c)
                              &c->yuv2packed2, &c->yuv2packedX, &c->yuv2anyX);
 
     ff_sws_init_input_funcs(c);
+    if (ARCH_PPC)
+        ff_sws_init_input_funcs_vsx(c);
 
     if (c->srcBpc == 8) {
         if (c->dstBpc <= 14) {
diff --git a/libswscale/swscale_internal.h b/libswscale/swscale_internal.h
index a59d127..e5f0e9d 100644
--- a/libswscale/swscale_internal.h
+++ b/libswscale/swscale_internal.h
@@ -859,6 +859,7 @@  void ff_get_unscaled_swscale_aarch64(SwsContext *c);
 SwsFunc ff_getSwsFunc(SwsContext *c);
 
 void ff_sws_init_input_funcs(SwsContext *c);
+void ff_sws_init_input_funcs_vsx(SwsContext *c);
 void ff_sws_init_output_funcs(SwsContext *c,
                               yuv2planar1_fn *yuv2plane1,
                               yuv2planarX_fn *yuv2planeX,