[FFmpeg-devel] POWER8 VSX vectorization of libswscale/input.c, Trac ticket 5570

Message ID 1584963102-7551-1-git-send-email-pestov.vyach@yandex.ru
State Superseded
Series [FFmpeg-devel] POWER8 VSX vectorization of libswscale/input.c, Trac ticket 5570

Checks

Context Check Description
andriy/ffmpeg-patchwork fail Make failed

Commit Message

Vyacheslav Pestov March 23, 2020, 11:31 a.m. UTC
---
 libswscale/ppc/Makefile       |    3 +-
 libswscale/ppc/input_vsx.c    | 4562 +++++++++++++++++++++++++++++++++++++++++
 libswscale/swscale.c          |    2 +
 libswscale/swscale_internal.h |    1 +
 4 files changed, 4567 insertions(+), 1 deletion(-)
 create mode 100644 libswscale/ppc/input_vsx.c

Comments

Carl Eugen Hoyos March 23, 2020, 11:56 a.m. UTC | #1
> On 23.03.2020 at 12:31, Pestov Vyacheslav <pestov.vyach@yandex.ru> wrote:
> 
> libswscale/ppc/Makefile       |    3 +-
> libswscale/ppc/input_vsx.c    | 4562 +++++++++++++++++++++++++++++++++++++++++
> libswscale/swscale.c          |    2 +
> libswscale/swscale_internal.h |    1 +
> 4 files changed, 4567 insertions(+), 1 deletion(-)
> create mode 100644 libswscale/ppc/input_vsx.c

Please add some numbers to the commit message that show the typical speedup.

Carl Eugen

Patch

diff --git a/libswscale/ppc/Makefile b/libswscale/ppc/Makefile
index 0a31a30..90c16f5 100644
--- a/libswscale/ppc/Makefile
+++ b/libswscale/ppc/Makefile
@@ -1,4 +1,5 @@ 
 OBJS += ppc/swscale_altivec.o                                           \
         ppc/yuv2rgb_altivec.o                                           \
         ppc/yuv2yuv_altivec.o                                           \
-        ppc/swscale_vsx.o
+        ppc/swscale_vsx.o                                                \
+        ppc/input_vsx.o
diff --git a/libswscale/ppc/input_vsx.c b/libswscale/ppc/input_vsx.c
new file mode 100644
index 0000000..a37b801
--- /dev/null
+++ b/libswscale/ppc/input_vsx.c
@@ -0,0 +1,4562 @@ 
+/*
+ * POWER8 VSX vectorization of libswscale/input.c
+ * Written by Vyacheslav Pestov.
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+/**
+ * @file
+ * POWER8 VSX vectorization of libswscale/input.c
+ * @author Vyacheslav Pestov
+ */
+#include <math.h>
+#include <stdint.h>
+#include <stdio.h>
+#include <string.h>
+
+#include "libavutil/avutil.h"
+#include "libavutil/bswap.h"
+#include "libavutil/cpu.h"
+#include "libavutil/intreadwrite.h"
+#include "libavutil/mathematics.h"
+#include "libavutil/pixdesc.h"
+#include "libavutil/avassert.h"
+#include "libavutil/ppc/util_altivec.h"
+#include "libavutil/timer.h"
+#include "config.h"
+#include "../rgb2rgb.h"
+#include "../swscale.h"
+#include "../swscale_internal.h"
+
+
+#if HAVE_VSX
+#if !HAVE_BIGENDIAN
+
+static vector unsigned int v_000000FF = {0xFF, 0xFF, 0xFF, 0xFF};
+
+#define input_pixel(pos) (isBE(origin) ? AV_RB16(pos) : AV_RL16(pos))
+
+#define r ((origin == AV_PIX_FMT_BGR48BE || origin == AV_PIX_FMT_BGR48LE || \
+            origin == AV_PIX_FMT_BGRA64BE || origin == AV_PIX_FMT_BGRA64LE) \
+            ? b_r : r_b)
+#define b ((origin == AV_PIX_FMT_BGR48BE || origin == AV_PIX_FMT_BGR48LE || \
+            origin == AV_PIX_FMT_BGRA64BE || origin == AV_PIX_FMT_BGRA64LE) \
+            ? r_b : b_r)
+#define v_r1 ((origin == AV_PIX_FMT_BGR48BE || origin == AV_PIX_FMT_BGR48LE || \
+               origin == AV_PIX_FMT_BGRA64BE || origin == AV_PIX_FMT_BGRA64LE) \
+               ? v_b_r1 : v_r_b1)
+#define v_b1 ((origin == AV_PIX_FMT_BGR48BE || origin == AV_PIX_FMT_BGR48LE || \
+               origin == AV_PIX_FMT_BGRA64BE || origin == AV_PIX_FMT_BGRA64LE) \
+               ? v_r_b1 : v_b_r1)
+#define v_r2 ((origin == AV_PIX_FMT_BGR48BE || origin == AV_PIX_FMT_BGR48LE || \
+               origin == AV_PIX_FMT_BGRA64BE || origin == AV_PIX_FMT_BGRA64LE) \
+               ? v_b_r2 : v_r_b2)
+#define v_b2 ((origin == AV_PIX_FMT_BGR48BE || origin == AV_PIX_FMT_BGR48LE || \
+               origin == AV_PIX_FMT_BGRA64BE || origin == AV_PIX_FMT_BGRA64LE) \
+               ? v_r_b2 : v_b_r2)
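+/* The r/b and v_r*/v_b* macros above swap the red and blue channels for
+ * the BGR-ordered formats (BGR48, BGRA64), so the same template bodies
+ * below convert both RGB- and BGR-ordered 48/64-bit input. */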
+
+static av_always_inline void
+rgb64ToY_c_template_vsx(uint16_t *dst, const uint16_t *src, int width,
+                    enum AVPixelFormat origin, int32_t *rgb2yuv)
+{
+    START_TIMER
+    int i, width_adj, is_BE;
+    vector unsigned short v_rd0, v_rd1, v_rd2, v_rd3;
+    vector unsigned short v_b_r1, v_b_r2, v_r_b1, v_r_b2, v_g1, v_g2;  
+    vector unsigned int v_dst1, v_dst2;
+    vector unsigned int shift1, shift2;
+    vector signed int v_ry, v_gy, v_by;
+    int32_t ry = rgb2yuv[RY_IDX], gy = rgb2yuv[GY_IDX], by = rgb2yuv[BY_IDX];
+    vector unsigned short v_null = vec_splats((unsigned short)0x0000);
+
+    uintptr_t src_addr = (uintptr_t)src;
+    uintptr_t dst_addr = (uintptr_t)dst;
+
+
+    width_adj = width&(~(int)0x07);
+
+    if(width_adj){
+        shift1 = vec_splats((unsigned int)(0x2001<<(RGB2YUV_SHIFT-1)));
+        shift2 = vec_splats((unsigned int)RGB2YUV_SHIFT);
+        v_ry = vec_splats((signed int)ry);
+        v_gy = vec_splats((signed int)gy);
+        v_by = vec_splats((signed int)by);
+        is_BE = isBE(origin);
+    }
+
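+    /* Main loop: each iteration loads 8 RGBA64 pixels (64 bytes) into
+     * four vectors, gathers the 16-bit R, G and B components with
+     * vec_perm (byte-swapping them for big-endian input), zero-extends
+     * them to 32 bits and computes
+     * (ry*r + gy*g + by*b + rnd) >> RGB2YUV_SHIFT per lane before
+     * packing the eight results back to 16 bits. */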
+    for (i = 0; i < width_adj; i+=8) {
+        v_rd0 = vec_vsx_ld(0, (unsigned short *)src_addr);
+        v_rd1 = vec_vsx_ld(0, (unsigned short *)(src_addr+16));
+        v_rd2 = vec_vsx_ld(0, (unsigned short *)(src_addr+32));
+        v_rd3 = vec_vsx_ld(0, (unsigned short *)(src_addr+48));
+
+        if(is_BE){
+            v_r_b1 = vec_perm(v_rd0, v_rd1, 
+                     ((vector unsigned char){1, 0, 9, 8, 17, 16, 25, 24}));
+            v_r_b2 = vec_perm(v_rd2, v_rd3, 
+                     ((vector unsigned char){1, 0, 9, 8, 17, 16, 25, 24}));
+            v_g1 = vec_perm(v_rd0, v_rd1, 
+                   ((vector unsigned char){3, 2, 11, 10, 19, 18, 27, 26}));
+            v_g2 = vec_perm(v_rd2, v_rd3, 
+                   ((vector unsigned char){3, 2, 11, 10, 19, 18, 27, 26}));
+            v_b_r1 = vec_perm(v_rd0, v_rd1, 
+                     ((vector unsigned char){5, 4, 13, 12, 21, 20, 29, 28}));
+            v_b_r2 = vec_perm(v_rd2, v_rd3, 
+                     ((vector unsigned char){5, 4, 13, 12, 21, 20, 29, 28}));
+        }else{
+            v_r_b1 = vec_perm(v_rd0, v_rd1, 
+                     ((vector unsigned char){0, 1, 8, 9, 16, 17, 24, 25}));
+            v_r_b2 = vec_perm(v_rd2, v_rd3, 
+                     ((vector unsigned char){0, 1, 8, 9, 16, 17, 24, 25}));
+            v_g1 = vec_perm(v_rd0, v_rd1, 
+                   ((vector unsigned char){2, 3, 10, 11, 18, 19, 26, 27}));
+            v_g2 = vec_perm(v_rd2, v_rd3, 
+                   ((vector unsigned char){2, 3, 10, 11, 18, 19, 26, 27}));
+            v_b_r1 = vec_perm(v_rd0, v_rd1, 
+                     ((vector unsigned char){4, 5, 12, 13, 20, 21, 28, 29}));
+            v_b_r2 = vec_perm(v_rd2, v_rd3, 
+                     ((vector unsigned char){4, 5, 12, 13, 20, 21, 28, 29}));
+        }
+
+        v_r_b1 = vec_mergeh(v_r_b1, v_null);
+        v_g1 = vec_mergeh(v_g1, v_null);
+        v_b_r1 = vec_mergeh(v_b_r1, v_null);
+
+        v_r_b2 = vec_mergeh(v_r_b2, v_null);
+        v_g2 = vec_mergeh(v_g2, v_null);
+        v_b_r2 = vec_mergeh(v_b_r2, v_null);
+
+        v_dst1 = (vector unsigned int)vec_mul((vector signed int)v_r1, v_ry);
+        v_dst1 = (vector unsigned int)vec_add((vector signed int)v_dst1, 
+                                              vec_mul((vector signed int)v_g1, v_gy ));
+        v_dst1 = (vector unsigned int)vec_add((vector signed int)v_dst1, 
+                                              vec_mul((vector signed int)v_b1, v_by ));
+        v_dst1 = vec_add(v_dst1, shift1);
+        v_dst1 = vec_sr(v_dst1, shift2);
+        v_dst2 = (vector unsigned int)vec_mul((vector signed int)v_r2, v_ry);
+        v_dst2 = (vector unsigned int)vec_add((vector signed int)v_dst2, 
+                                              vec_mul((vector signed int)v_g2, v_gy ));
+        v_dst2 = (vector unsigned int)vec_add((vector signed int)v_dst2, 
+                                              vec_mul((vector signed int)v_b2,  v_by ));
+        v_dst2 = vec_add(v_dst2, shift1);
+        v_dst2 = vec_sr(v_dst2, shift2);
+        v_dst1 = vec_perm(v_dst1, v_dst2, ((vector unsigned char)
+                          {0, 1, 4, 5, 8, 9, 12, 13, 16, 17, 20, 21, 24, 25, 28, 29}));
+        vec_vsx_st((vector unsigned char)v_dst1, 0, (unsigned char *)dst_addr); 
+
+        src_addr += 64;
+        dst_addr += 16;
+    }
+    
+    for (i = width_adj; i < width; i++) {
+        unsigned int r_b = input_pixel(&src[i*4+0]);
+        unsigned int   g = input_pixel(&src[i*4+1]);
+        unsigned int b_r = input_pixel(&src[i*4+2]);
+
+        dst[i] = (ry*r + gy*g + by*b + (0x2001<<(RGB2YUV_SHIFT-1))) >> RGB2YUV_SHIFT;
+    }
+    STOP_TIMER("0_vsx")
+}
+
+static av_always_inline void
+rgb64ToUV_c_template_vsx(uint16_t *dstU, uint16_t *dstV,
+                    const uint16_t *src1, const uint16_t *src2,
+                    int width, enum AVPixelFormat origin, int32_t *rgb2yuv)
+{
+    int i, width_adj, is_BE;
+    vector unsigned short v_rd0, v_rd1, v_rd2, v_rd3;
+    vector unsigned short v_b_r1, v_b_r2, v_r_b1, v_r_b2, v_g1, v_g2;  
+    vector unsigned int v_dst1, v_dst2;
+    vector unsigned int shift1, shift2;
+    vector signed int v_ru, v_gu, v_bu, v_rv, v_gv, v_bv;
+    int32_t ru = rgb2yuv[RU_IDX], gu = rgb2yuv[GU_IDX], bu = rgb2yuv[BU_IDX];
+    int32_t rv = rgb2yuv[RV_IDX], gv = rgb2yuv[GV_IDX], bv = rgb2yuv[BV_IDX];
+    vector unsigned short v_null = vec_splats((unsigned short)0x0000);
+    
+    uintptr_t src_addr = (uintptr_t)src1;
+    uintptr_t dstU_addr = (uintptr_t)dstU;
+    uintptr_t dstV_addr = (uintptr_t)dstV;
+
+    START_TIMER
+    av_assert1(src1 == src2);
+
+    width_adj = width&(~(int)0x07);
+
+    if(width_adj){
+        shift1 = vec_splats((unsigned int)(0x10001<<(RGB2YUV_SHIFT-1)));
+        shift2 = vec_splats((unsigned int)RGB2YUV_SHIFT);
+        v_ru = vec_splats((signed int)ru);
+        v_gu = vec_splats((signed int)gu);
+        v_bu = vec_splats((signed int)bu);
+        v_rv = vec_splats((signed int)rv);
+        v_gv = vec_splats((signed int)gv);
+        v_bv = vec_splats((signed int)bv);
+        is_BE = isBE(origin);
+    }
+
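+    /* Same component gather as the Y path; the multiply-add then runs
+     * twice per iteration, once with the U and once with the V
+     * coefficients, storing 8 samples to each chroma plane. */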
+    for (i = 0; i < width_adj; i+=8) {
+        v_rd0 = vec_vsx_ld(0, (unsigned short *)src_addr);
+        v_rd1 = vec_vsx_ld(0, (unsigned short *)(src_addr+16));
+        v_rd2 = vec_vsx_ld(0, (unsigned short *)(src_addr+32));
+        v_rd3 = vec_vsx_ld(0, (unsigned short *)(src_addr+48));
+
+        if(is_BE){
+            v_r_b1 = vec_perm(v_rd0, v_rd1, 
+                     ((vector unsigned char){1, 0, 9, 8, 17, 16, 25, 24}));
+            v_r_b2 = vec_perm(v_rd2, v_rd3, 
+                     ((vector unsigned char){1, 0, 9, 8, 17, 16, 25, 24}));
+            v_g1 = vec_perm(v_rd0, v_rd1, 
+                   ((vector unsigned char){3, 2, 11, 10, 19, 18, 27, 26}));
+            v_g2 = vec_perm(v_rd2, v_rd3, 
+                   ((vector unsigned char){3, 2, 11, 10, 19, 18, 27, 26}));
+            v_b_r1 = vec_perm(v_rd0, v_rd1, 
+                     ((vector unsigned char){5, 4, 13, 12, 21, 20, 29, 28}));
+            v_b_r2 = vec_perm(v_rd2, v_rd3, 
+                     ((vector unsigned char){5, 4, 13, 12, 21, 20, 29, 28}));
+        }else{
+            v_r_b1 = vec_perm(v_rd0, v_rd1, 
+                     ((vector unsigned char){0, 1, 8, 9, 16, 17, 24, 25}));
+            v_r_b2 = vec_perm(v_rd2, v_rd3, 
+                     ((vector unsigned char){0, 1, 8, 9, 16, 17, 24, 25}));
+            v_g1 = vec_perm(v_rd0, v_rd1, 
+                   ((vector unsigned char){2, 3, 10, 11, 18, 19, 26, 27}));
+            v_g2 = vec_perm(v_rd2, v_rd3, 
+                   ((vector unsigned char){2, 3, 10, 11, 18, 19, 26, 27}));
+            v_b_r1 = vec_perm(v_rd0, v_rd1, 
+                     ((vector unsigned char){4, 5, 12, 13, 20, 21, 28, 29}));
+            v_b_r2 = vec_perm(v_rd2, v_rd3, 
+                     ((vector unsigned char){4, 5, 12, 13, 20, 21, 28, 29}));
+        }
+
+        v_r_b1 = vec_mergeh(v_r_b1, v_null);
+        v_g1 = vec_mergeh(v_g1, v_null);
+        v_b_r1 = vec_mergeh(v_b_r1, v_null);
+
+        v_r_b2 = vec_mergeh(v_r_b2, v_null);
+        v_g2 = vec_mergeh(v_g2, v_null);
+        v_b_r2 = vec_mergeh(v_b_r2, v_null);
+
+        v_dst1 = (vector unsigned int)vec_mul((vector signed int)v_r1, v_ru);
+        v_dst1 = (vector unsigned int)vec_add((vector signed int)v_dst1, 
+                                              vec_mul((vector signed int)v_g1, v_gu ));
+        v_dst1 = (vector unsigned int)vec_add((vector signed int)v_dst1, 
+                                              vec_mul((vector signed int)v_b1, v_bu ));
+        v_dst1 = vec_add(v_dst1, shift1);
+        v_dst1 = vec_sr(v_dst1, shift2);
+        v_dst2 = (vector unsigned int)vec_mul((vector signed int)v_r2, v_ru);
+        v_dst2 = (vector unsigned int)vec_add((vector signed int)v_dst2, 
+                                              vec_mul((vector signed int)v_g2, v_gu ));
+        v_dst2 = (vector unsigned int)vec_add((vector signed int)v_dst2, 
+                                              vec_mul((vector signed int)v_b2,  v_bu ));
+        v_dst2 = vec_add(v_dst2, shift1);
+        v_dst2 = vec_sr(v_dst2, shift2);
+        v_dst1 = vec_perm(v_dst1, v_dst2, 
+                ((vector unsigned char){0, 1, 4, 5, 8, 9, 12, 13, 16, 17, 20, 21, 24, 25, 28, 29}));
+        vec_vsx_st((vector unsigned char)v_dst1, 0, (unsigned char *)dstU_addr);
+
+        v_dst1 = (vector unsigned int)vec_mul((vector signed int)v_r1, v_rv);
+        v_dst1 = (vector unsigned int)vec_add((vector signed int)v_dst1, 
+                                               vec_mul((vector signed int)v_g1, v_gv ));
+        v_dst1 = (vector unsigned int)vec_add((vector signed int)v_dst1, 
+                                               vec_mul((vector signed int)v_b1, v_bv ));
+        v_dst1 = vec_add(v_dst1, shift1);
+        v_dst1 = vec_sr(v_dst1, shift2);
+        v_dst2 = (vector unsigned int)vec_mul((vector signed int)v_r2, v_rv);
+        v_dst2 = (vector unsigned int)vec_add((vector signed int)v_dst2, 
+                                               vec_mul((vector signed int)v_g2, v_gv ));
+        v_dst2 = (vector unsigned int)vec_add((vector signed int)v_dst2, 
+                                               vec_mul((vector signed int)v_b2,  v_bv ));
+        v_dst2 = vec_add(v_dst2, shift1);
+        v_dst2 = vec_sr(v_dst2, shift2);
+        v_dst1 = vec_perm(v_dst1, v_dst2, 
+                ((vector unsigned char){0, 1, 4, 5, 8, 9, 12, 13, 16, 17, 20, 21, 24, 25, 28, 29}));
+        vec_vsx_st((vector unsigned char)v_dst1, 0, (unsigned char *)dstV_addr);
+
+        src_addr += 64;
+        dstU_addr += 16;
+        dstV_addr += 16;
+    }
+
+    for (i = width_adj; i < width; i++) {
+        int r_b = input_pixel(&src1[i*4+0]);
+        int   g = input_pixel(&src1[i*4+1]);
+        int b_r = input_pixel(&src1[i*4+2]);
+
+        dstU[i] = (ru*r + gu*g + bu*b + (0x10001<<(RGB2YUV_SHIFT-1))) >> RGB2YUV_SHIFT;
+        dstV[i] = (rv*r + gv*g + bv*b + (0x10001<<(RGB2YUV_SHIFT-1))) >> RGB2YUV_SHIFT;
+    }
+    STOP_TIMER("1_vsx")
+}
+
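+/* The half-resolution chroma path is kept as the scalar template from
+ * input.c: each output averages a horizontally adjacent pixel pair. */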
+static av_always_inline void
+rgb64ToUV_half_c_template(uint16_t *dstU, uint16_t *dstV,
+                          const uint16_t *src1, const uint16_t *src2,
+                          int width, enum AVPixelFormat origin, int32_t *rgb2yuv)
+{
+    int i;
+    int32_t ru = rgb2yuv[RU_IDX], gu = rgb2yuv[GU_IDX], bu = rgb2yuv[BU_IDX];
+    int32_t rv = rgb2yuv[RV_IDX], gv = rgb2yuv[GV_IDX], bv = rgb2yuv[BV_IDX];
+    av_assert1(src1==src2);
+    for (i = 0; i < width; i++) {
+        int r_b = (input_pixel(&src1[8 * i + 0]) + input_pixel(&src1[8 * i + 4]) + 1) >> 1;
+        int   g = (input_pixel(&src1[8 * i + 1]) + input_pixel(&src1[8 * i + 5]) + 1) >> 1;
+        int b_r = (input_pixel(&src1[8 * i + 2]) + input_pixel(&src1[8 * i + 6]) + 1) >> 1;
+
+        dstU[i]= (ru*r + gu*g + bu*b + (0x10001<<(RGB2YUV_SHIFT-1))) >> RGB2YUV_SHIFT;
+        dstV[i]= (rv*r + gv*g + bv*b + (0x10001<<(RGB2YUV_SHIFT-1))) >> RGB2YUV_SHIFT;
+    }
+}
+
+#define rgb64funcs(pattern, BE_LE, origin) \
+static void pattern ## 64 ## BE_LE ## ToY_c_vsx(uint8_t *_dst, const uint8_t *_src, \
+                                    const uint8_t *unused0, const uint8_t *unused1, \
+                                    int width, uint32_t *rgb2yuv) \
+{ \
+    const uint16_t *src = (const uint16_t *) _src; \
+    uint16_t *dst = (uint16_t *) _dst; \
+    rgb64ToY_c_template_vsx(dst, src, width, origin, rgb2yuv); \
+} \
+ \
+static void pattern ## 64 ## BE_LE ## ToUV_c_vsx(uint8_t *_dstU, uint8_t *_dstV, \
+                                    const uint8_t *unused0, const uint8_t *_src1, const uint8_t *_src2, \
+                                    int width, uint32_t *rgb2yuv) \
+{ \
+    const uint16_t *src1 = (const uint16_t *) _src1, \
+                   *src2 = (const uint16_t *) _src2; \
+    uint16_t *dstU = (uint16_t *) _dstU, *dstV = (uint16_t *) _dstV; \
+    rgb64ToUV_c_template_vsx(dstU, dstV, src1, src2, width, origin, rgb2yuv); \
+}\
+ \
+static void pattern ## 64 ## BE_LE ## ToUV_half_c_vsx(uint8_t *_dstU, uint8_t *_dstV, \
+                                    const uint8_t *unused0, const uint8_t *_src1, const uint8_t *_src2, \
+                                    int width, uint32_t *rgb2yuv) \
+{ \
+    const uint16_t *src1 = (const uint16_t *) _src1, \
+                   *src2 = (const uint16_t *) _src2; \
+    uint16_t *dstU = (uint16_t *) _dstU, *dstV = (uint16_t *) _dstV; \
+    rgb64ToUV_half_c_template(dstU, dstV, src1, src2, width, origin, rgb2yuv); \
+}
+
+rgb64funcs(rgb, LE, AV_PIX_FMT_RGBA64LE)
+rgb64funcs(rgb, BE, AV_PIX_FMT_RGBA64BE)
+rgb64funcs(bgr, LE, AV_PIX_FMT_BGRA64LE)
+rgb64funcs(bgr, BE, AV_PIX_FMT_BGRA64BE)
+
+static av_always_inline void rgb48ToY_c_template_vsx(uint16_t *dst,
+                                                 const uint16_t *src, int width,
+                                                 enum AVPixelFormat origin,
+                                                 int32_t *rgb2yuv)
+{
+    START_TIMER
+    int i, width_adj, is_BE;
+    vector unsigned short v_rd0, v_rd1, v_rd2;
+    vector unsigned short v_b_r1, v_b_r2, v_r_b1, v_r_b2, v_g1, v_g2;  
+    vector unsigned int v_dst1, v_dst2;
+    vector unsigned int shift1, shift2;
+    vector signed int v_ry, v_gy, v_by;
+    int32_t ry = rgb2yuv[RY_IDX], gy = rgb2yuv[GY_IDX], by = rgb2yuv[BY_IDX];
+    vector unsigned short v_null = vec_splats((unsigned short)0x0000);
+
+    uintptr_t src_addr = (uintptr_t)src;
+    uintptr_t dst_addr = (uintptr_t)dst;
+
+
+    width_adj = width&(~(int)0x07);
+
+    if(width_adj){
+        shift1 = vec_splats((unsigned int)(0x2001<<(RGB2YUV_SHIFT-1)));
+        shift2 = vec_splats((unsigned int)RGB2YUV_SHIFT);
+        v_ry = vec_splats((signed int)ry);
+        v_gy = vec_splats((signed int)gy);
+        v_by = vec_splats((signed int)by);
+        is_BE = isBE(origin);
+    }
+
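+    /* RGB48 packs 8 pixels into 48 bytes, i.e. three 16-byte vectors
+     * per iteration, so the vec_perm masks straddle vector boundaries:
+     * v_rd0/v_rd1 for the first four pixels, v_rd1/v_rd2 for the last
+     * four. */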
+    for (i = 0; i < width_adj; i+=8) {
+        v_rd0 = vec_vsx_ld(0, (unsigned short *)src_addr);
+        v_rd1 = vec_vsx_ld(0, (unsigned short *)(src_addr+16));
+        v_rd2 = vec_vsx_ld(0, (unsigned short *)(src_addr+32));
+
+        if(is_BE){
+            v_r_b1 = vec_perm(v_rd0, v_rd1, 
+                     ((vector unsigned char){1, 0, 7, 6, 13, 12, 19, 18}));
+            v_r_b2 = vec_perm(v_rd1, v_rd2, 
+                     ((vector unsigned char){9, 8, 15, 14, 21, 20, 27, 26}));
+            v_g1 = vec_perm(v_rd0, v_rd1, 
+                   ((vector unsigned char){3, 2, 9, 8, 15, 14, 21, 20}));
+            v_g2 = vec_perm(v_rd1, v_rd2, 
+                   ((vector unsigned char){11, 10, 17, 16, 23, 22, 29, 28}));
+            v_b_r1 = vec_perm(v_rd0, v_rd1, 
+                     ((vector unsigned char){5, 4, 11, 10, 17, 16, 23, 22}));
+            v_b_r2 = vec_perm(v_rd1, v_rd2, 
+                     ((vector unsigned char){13, 12, 19, 18, 25, 24, 31, 30}));
+        }else{
+            v_r_b1 = vec_perm(v_rd0, v_rd1, 
+                     ((vector unsigned char){0, 1, 6, 7, 12, 13, 18, 19}));
+            v_r_b2 = vec_perm(v_rd1, v_rd2, 
+                     ((vector unsigned char){8, 9, 14, 15, 20, 21, 26, 27}));
+            v_g1 = vec_perm(v_rd0, v_rd1, 
+                   ((vector unsigned char){2, 3, 8, 9, 14, 15, 20, 21}));
+            v_g2 = vec_perm(v_rd1, v_rd2, 
+                   ((vector unsigned char){10, 11, 16, 17, 22, 23, 28, 29}));
+            v_b_r1 = vec_perm(v_rd0, v_rd1, 
+                     ((vector unsigned char){4, 5, 10, 11, 16, 17, 22, 23}));
+            v_b_r2 = vec_perm(v_rd1, v_rd2, 
+                     ((vector unsigned char){12, 13, 18, 19, 24, 25, 30, 31}));
+        }
+
+        v_r_b1 = vec_mergeh(v_r_b1, v_null);
+        v_g1 = vec_mergeh(v_g1, v_null);
+        v_b_r1 = vec_mergeh(v_b_r1, v_null);
+
+        v_r_b2 = vec_mergeh(v_r_b2, v_null);
+        v_g2 = vec_mergeh(v_g2, v_null);
+        v_b_r2 = vec_mergeh(v_b_r2, v_null);
+
+        v_dst1 = (vector unsigned int)vec_mul((vector signed int)v_r1, v_ry);
+        v_dst1 = (vector unsigned int)vec_add((vector signed int)v_dst1, 
+                                              vec_mul((vector signed int)v_g1, v_gy ));
+        v_dst1 = (vector unsigned int)vec_add((vector signed int)v_dst1, 
+                                              vec_mul((vector signed int)v_b1, v_by ));
+        v_dst1 = vec_add(v_dst1, shift1);
+        v_dst1 = vec_sr(v_dst1, shift2);
+        v_dst2 = (vector unsigned int)vec_mul((vector signed int)v_r2, v_ry);
+        v_dst2 = (vector unsigned int)vec_add((vector signed int)v_dst2, 
+                                              vec_mul((vector signed int)v_g2, v_gy ));
+        v_dst2 = (vector unsigned int)vec_add((vector signed int)v_dst2, 
+                                              vec_mul((vector signed int)v_b2,  v_by ));
+        v_dst2 = vec_add(v_dst2, shift1);
+        v_dst2 = vec_sr(v_dst2, shift2);
+        v_dst1 = vec_perm(v_dst1, v_dst2, 
+                 ((vector unsigned char){0, 1, 4, 5, 8, 9, 12, 13, 16, 17, 20, 21, 24, 25, 28, 29}));
+        vec_vsx_st((vector unsigned char)v_dst1, 0, (unsigned char *)dst_addr); 
+
+        src_addr += 48;
+        dst_addr += 16;
+    }
+    
+    for (i = width_adj; i < width; i++) {
+        unsigned int r_b = input_pixel(&src[i * 3 + 0]);
+        unsigned int g   = input_pixel(&src[i * 3 + 1]);
+        unsigned int b_r = input_pixel(&src[i * 3 + 2]);
+
+        dst[i] = (ry*r + gy*g + by*b + (0x2001 << (RGB2YUV_SHIFT - 1))) >> RGB2YUV_SHIFT;
+    }
+    STOP_TIMER("2_vsx")
+}
+
+static av_always_inline void rgb48ToUV_c_template_vsx(uint16_t *dstU,
+                                                  uint16_t *dstV,
+                                                  const uint16_t *src1,
+                                                  const uint16_t *src2,
+                                                  int width,
+                                                  enum AVPixelFormat origin,
+                                                  int32_t *rgb2yuv)
+{
+    int i, width_adj, is_BE;
+    vector unsigned short v_rd0, v_rd1, v_rd2;
+    vector unsigned short v_b_r1, v_b_r2, v_r_b1, v_r_b2, v_g1, v_g2;  
+    vector unsigned int v_dst1, v_dst2;
+    vector unsigned int shift1, shift2;
+    vector signed int v_ru, v_gu, v_bu, v_rv, v_gv, v_bv;
+    int32_t ru = rgb2yuv[RU_IDX], gu = rgb2yuv[GU_IDX], bu = rgb2yuv[BU_IDX],
+            rv = rgb2yuv[RV_IDX], gv = rgb2yuv[GV_IDX], bv = rgb2yuv[BV_IDX];
+    vector unsigned short v_null = vec_splats((unsigned short)0x0000);
+    
+    uintptr_t src_addr = (uintptr_t)src1;
+    uintptr_t dstU_addr = (uintptr_t)dstU;
+    uintptr_t dstV_addr = (uintptr_t)dstV;
+
+    START_TIMER
+    av_assert1(src1 == src2);
+
+    width_adj = width&(~(int)0x07);
+
+    if(width_adj){
+        shift1 = vec_splats((unsigned int)(0x10001<<(RGB2YUV_SHIFT-1)));
+        shift2 = vec_splats((unsigned int)RGB2YUV_SHIFT);
+        v_ru = vec_splats((signed int)ru);
+        v_gu = vec_splats((signed int)gu);
+        v_bu = vec_splats((signed int)bu);
+        v_rv = vec_splats((signed int)rv);
+        v_gv = vec_splats((signed int)gv);
+        v_bv = vec_splats((signed int)bv);
+        is_BE = isBE(origin);
+    }
+
+    for (i = 0; i < width_adj; i+=8) {
+        v_rd0 = vec_vsx_ld(0, (unsigned short *)src_addr);
+        v_rd1 = vec_vsx_ld(0, (unsigned short *)(src_addr+16));
+        v_rd2 = vec_vsx_ld(0, (unsigned short *)(src_addr+32));
+
+        if(is_BE){
+            v_r_b1 = vec_perm(v_rd0, v_rd1, 
+                     ((vector unsigned char){1, 0, 7, 6, 13, 12, 19, 18}));
+            v_r_b2 = vec_perm(v_rd1, v_rd2, 
+                     ((vector unsigned char){9, 8, 15, 14, 21, 20, 27, 26}));
+            v_g1 = vec_perm(v_rd0, v_rd1, 
+                   ((vector unsigned char){3, 2, 9, 8, 15, 14, 21, 20}));
+            v_g2 = vec_perm(v_rd1, v_rd2, 
+                   ((vector unsigned char){11, 10, 17, 16, 23, 22, 29, 28}));
+            v_b_r1 = vec_perm(v_rd0, v_rd1, 
+                     ((vector unsigned char){5, 4, 11, 10, 17, 16, 23, 22}));
+            v_b_r2 = vec_perm(v_rd1, v_rd2, 
+                     ((vector unsigned char){13, 12, 19, 18, 25, 24, 31, 30}));
+        }else{
+            v_r_b1 = vec_perm(v_rd0, v_rd1, 
+                     ((vector unsigned char){0, 1, 6, 7, 12, 13, 18, 19}));
+            v_r_b2 = vec_perm(v_rd1, v_rd2, 
+                     ((vector unsigned char){8, 9, 14, 15, 20, 21, 26, 27}));
+            v_g1 = vec_perm(v_rd0, v_rd1, 
+                   ((vector unsigned char){2, 3, 8, 9, 14, 15, 20, 21}));
+            v_g2 = vec_perm(v_rd1, v_rd2, 
+                   ((vector unsigned char){10, 11, 16, 17, 22, 23, 28, 29}));
+            v_b_r1 = vec_perm(v_rd0, v_rd1, 
+                     ((vector unsigned char){4, 5, 10, 11, 16, 17, 22, 23}));
+            v_b_r2 = vec_perm(v_rd1, v_rd2, 
+                     ((vector unsigned char){12, 13, 18, 19, 24, 25, 30, 31}));
+        }
+
+        v_r_b1 = vec_mergeh(v_r_b1, v_null);
+        v_g1 = vec_mergeh(v_g1, v_null);
+        v_b_r1 = vec_mergeh(v_b_r1, v_null);
+
+        v_r_b2 = vec_mergeh(v_r_b2, v_null);
+        v_g2 = vec_mergeh(v_g2, v_null);
+        v_b_r2 = vec_mergeh(v_b_r2, v_null);
+
+        v_dst1 = (vector unsigned int)vec_mul((vector signed int)v_r1, v_ru);
+        v_dst1 = (vector unsigned int)vec_add((vector signed int)v_dst1, 
+                                              vec_mul((vector signed int)v_g1, v_gu ));
+        v_dst1 = (vector unsigned int)vec_add((vector signed int)v_dst1, 
+                                              vec_mul((vector signed int)v_b1, v_bu ));
+        v_dst1 = vec_add(v_dst1, shift1);
+        v_dst1 = vec_sr(v_dst1, shift2);
+        v_dst2 = (vector unsigned int)vec_mul((vector signed int)v_r2, v_ru);
+        v_dst2 = (vector unsigned int)vec_add((vector signed int)v_dst2, 
+                                      vec_mul((vector signed int)v_g2, v_gu ));
+        v_dst2 = (vector unsigned int)vec_add((vector signed int)v_dst2, 
+                                      vec_mul((vector signed int)v_b2,  v_bu ));
+        v_dst2 = vec_add(v_dst2, shift1);
+        v_dst2 = vec_sr(v_dst2, shift2);
+        v_dst1 = vec_perm(v_dst1, v_dst2, 
+                 ((vector unsigned char){0, 1, 4, 5, 8, 9, 12, 13, 16, 17, 20, 21, 24, 25, 28, 29}));
+        vec_vsx_st((vector unsigned char)v_dst1, 0, (unsigned char *)dstU_addr);
+
+        v_dst1 = (vector unsigned int)vec_mul((vector signed int)v_r1, v_rv);
+        v_dst1 = (vector unsigned int)vec_add((vector signed int)v_dst1, 
+                                      vec_mul((vector signed int)v_g1, v_gv ));
+        v_dst1 = (vector unsigned int)vec_add((vector signed int)v_dst1, 
+                                      vec_mul((vector signed int)v_b1, v_bv ));
+        v_dst1 = vec_add(v_dst1, shift1);
+        v_dst1 = vec_sr(v_dst1, shift2);
+        v_dst2 = (vector unsigned int)vec_mul((vector signed int)v_r2, v_rv);
+        v_dst2 = (vector unsigned int)vec_add((vector signed int)v_dst2, 
+                                      vec_mul((vector signed int)v_g2, v_gv ));
+        v_dst2 = (vector unsigned int)vec_add((vector signed int)v_dst2, 
+                                      vec_mul((vector signed int)v_b2,  v_bv ));
+        v_dst2 = vec_add(v_dst2, shift1);
+        v_dst2 = vec_sr(v_dst2, shift2);
+        v_dst1 = vec_perm(v_dst1, v_dst2, 
+                 ((vector unsigned char){0, 1, 4, 5, 8, 9, 12, 13, 16, 17, 20, 21, 24, 25, 28, 29}));
+        vec_vsx_st((vector unsigned char)v_dst1, 0, (unsigned char *)dstV_addr);
+
+        src_addr += 48;
+        dstU_addr += 16;
+        dstV_addr += 16;
+    }
+
+    for (i = width_adj; i < width; i++) {
+        int r_b = input_pixel(&src1[i * 3 + 0]);
+        int g   = input_pixel(&src1[i * 3 + 1]);
+        int b_r = input_pixel(&src1[i * 3 + 2]);
+
+        dstU[i] = (ru*r + gu*g + bu*b + (0x10001 << (RGB2YUV_SHIFT - 1))) >> RGB2YUV_SHIFT;
+        dstV[i] = (rv*r + gv*g + bv*b + (0x10001 << (RGB2YUV_SHIFT - 1))) >> RGB2YUV_SHIFT;
+    }
+    STOP_TIMER("3_vsx")
+}
+
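+/* As in the rgb64 case, the half-resolution chroma template stays
+ * scalar and averages the two pixels of each horizontal pair before
+ * conversion. */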
+static av_always_inline void rgb48ToUV_half_c_template(uint16_t *dstU,
+                                                       uint16_t *dstV,
+                                                       const uint16_t *src1,
+                                                       const uint16_t *src2,
+                                                       int width,
+                                                       enum AVPixelFormat origin,
+                                                       int32_t *rgb2yuv)
+{
+    START_TIMER
+    int i;
+    int32_t ru = rgb2yuv[RU_IDX], gu = rgb2yuv[GU_IDX], bu = rgb2yuv[BU_IDX];
+    int32_t rv = rgb2yuv[RV_IDX], gv = rgb2yuv[GV_IDX], bv = rgb2yuv[BV_IDX];
+    
+    av_assert1(src1 == src2);
+    for (i = 0; i < width; i++) {
+        int r_b = (input_pixel(&src1[6 * i + 0]) +
+                   input_pixel(&src1[6 * i + 3]) + 1) >> 1;
+        int g   = (input_pixel(&src1[6 * i + 1]) +
+                   input_pixel(&src1[6 * i + 4]) + 1) >> 1;
+        int b_r = (input_pixel(&src1[6 * i + 2]) +
+                   input_pixel(&src1[6 * i + 5]) + 1) >> 1;
+
+        dstU[i] = (ru*r + gu*g + bu*b + (0x10001 << (RGB2YUV_SHIFT - 1))) >> RGB2YUV_SHIFT;
+        dstV[i] = (rv*r + gv*g + bv*b + (0x10001 << (RGB2YUV_SHIFT - 1))) >> RGB2YUV_SHIFT;
+    }
+    STOP_TIMER("3.1")
+}
+
+#undef r
+#undef b
+#undef v_r1
+#undef v_b1
+#undef v_r2
+#undef v_b2
+#undef input_pixel
+
+#define rgb48funcs(pattern, BE_LE, origin)                              \
+static void pattern ## 48 ## BE_LE ## ToY_c_vsx(uint8_t *_dst,              \
+                                            const uint8_t *_src,        \
+                                            const uint8_t *unused0, const uint8_t *unused1,\
+                                            int width,                  \
+                                            uint32_t *rgb2yuv)          \
+{                                                                       \
+    const uint16_t *src = (const uint16_t *)_src;                       \
+    uint16_t *dst       = (uint16_t *)_dst;                             \
+    rgb48ToY_c_template_vsx(dst, src, width, origin, rgb2yuv);              \
+}                                                                       \
+                                                                        \
+static void pattern ## 48 ## BE_LE ## ToUV_c_vsx(uint8_t *_dstU,            \
+                                             uint8_t *_dstV,            \
+                                             const uint8_t *unused0,    \
+                                             const uint8_t *_src1,      \
+                                             const uint8_t *_src2,      \
+                                             int width,                 \
+                                             uint32_t *rgb2yuv)         \
+{                                                                       \
+    const uint16_t *src1 = (const uint16_t *)_src1,                     \
+                   *src2 = (const uint16_t *)_src2;                     \
+    uint16_t *dstU = (uint16_t *)_dstU,                                 \
+             *dstV = (uint16_t *)_dstV;                                 \
+    rgb48ToUV_c_template_vsx(dstU, dstV, src1, src2, width, origin, rgb2yuv);        \
+}                                                                       \
+                                                                        \
+static void pattern ## 48 ## BE_LE ## ToUV_half_c_vsx(uint8_t *_dstU,       \
+                                                  uint8_t *_dstV,       \
+                                                  const uint8_t *unused0,    \
+                                                  const uint8_t *_src1, \
+                                                  const uint8_t *_src2, \
+                                                  int width,            \
+                                                  uint32_t *rgb2yuv)    \
+{                                                                       \
+    const uint16_t *src1 = (const uint16_t *)_src1,                     \
+                   *src2 = (const uint16_t *)_src2;                     \
+    uint16_t *dstU = (uint16_t *)_dstU,                                 \
+             *dstV = (uint16_t *)_dstV;                                 \
+    rgb48ToUV_half_c_template(dstU, dstV, src1, src2, width, origin, rgb2yuv);   \
+}
+
+rgb48funcs(rgb, LE, AV_PIX_FMT_RGB48LE)
+rgb48funcs(rgb, BE, AV_PIX_FMT_RGB48BE)
+rgb48funcs(bgr, LE, AV_PIX_FMT_BGR48LE)
+rgb48funcs(bgr, BE, AV_PIX_FMT_BGR48BE)
+
+#define input_pixel(i) ((origin == AV_PIX_FMT_RGBA ||                      \
+                         origin == AV_PIX_FMT_BGRA ||                      \
+                         origin == AV_PIX_FMT_ARGB ||                      \
+                         origin == AV_PIX_FMT_ABGR)                        \
+                        ? AV_RN32A(&src[(i) * 4])                       \
+                        : (isBE(origin) ? AV_RB16(&src[(i) * 2])        \
+                                        : AV_RL16(&src[(i) * 2])))
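+/* For the 4-byte formats the macro reads a whole 32-bit pixel; for the
+ * packed 15/16-bit formats it reads one 16-bit word, byte-swapped as
+ * needed for the source endianness. */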
+
+
+static av_always_inline void rgb16_32ToY_c_template_vsx(int16_t *dst,
+                                                        const uint8_t *src,
+                                                        int width,
+                                                        enum AVPixelFormat origin,
+                                                        int shr, int shg,
+                                                        int shb, int shp,
+                                                        int maskr, int maskg,
+                                                        int maskb, int rsh,
+                                                        int gsh, int bsh, int S,
+                                                        int32_t *rgb2yuv)
+{
+    START_TIMER
+    int i, width_adj, is_DW, is_BE;
+    vector signed short v_rd0, v_rd1, v_px;
+    vector signed short v_r1, v_r2, v_b1, v_b2, v_g1, v_g2;  
+    vector signed int v_dst1, v_dst2;
+    vector signed int shift1;
+    vector signed int shift2;
+    const int ry = rgb2yuv[RY_IDX]<<rsh, gy = rgb2yuv[GY_IDX]<<gsh, 
+              by = rgb2yuv[BY_IDX]<<bsh;
+    const unsigned rnd = (32<<((S)-1)) + (1<<(S-7));
+    vector unsigned short v_null = vec_splats((unsigned short)0x0000);
+
+    uintptr_t src_addr = (uintptr_t)src;
+    uintptr_t dst_addr = (uintptr_t)dst;
+
+
+    width_adj = width&(~(int)0x07);
+
+    if(width_adj){
+        shift1 = vec_splats((signed int)rnd);
+        shift2 = vec_splats((signed int)((S)-6));
+        is_DW = (origin == AV_PIX_FMT_RGBA || origin == AV_PIX_FMT_BGRA || 
+                 origin == AV_PIX_FMT_ARGB || origin == AV_PIX_FMT_ABGR);
+        if(!is_DW)
+            is_BE = isBE(origin);
+    }
+
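+    /* is_DW selects the 32-bit-per-pixel path (RGBA/BGRA/ARGB/ABGR),
+     * processed as two vectors of 4 pixels; the other branch handles the
+     * packed 15/16-bit formats, where a single 16-byte vector holds all
+     * 8 pixels and the mask/shift arguments isolate each channel. */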
+    for (i = 0; i < width_adj; i+=8) {
+        v_rd0 = vec_vsx_ld(0, (signed short *)src_addr); 
+
+        if(is_DW){
+            src_addr += 16;
+            v_rd1 = vec_vsx_ld(0, (signed short *)src_addr);
+
+            v_rd0 = (vector signed short)vec_sr((vector unsigned int)v_rd0, 
+                                        (vector unsigned int)vec_splats((signed int)shp));
+            v_b1 = (vector signed short)vec_and((vector signed int)v_rd0, 
+                                       vec_splats((signed int)maskb));
+            v_b1 = (vector signed short)vec_sr((vector unsigned int)v_b1, 
+                                       (vector unsigned int)vec_splats((signed int)shb));
+            v_g1 = (vector signed short)vec_and((vector signed int)v_rd0, 
+                                        vec_splats((signed int)maskg));
+            v_g1 = (vector signed short)vec_sr((vector unsigned int)v_g1, 
+                                        (vector unsigned int)vec_splats((signed int)shg));
+            v_r1 = (vector signed short)vec_and((vector signed int)v_rd0, 
+                                        vec_splats((signed int)maskr));
+            v_r1 = (vector signed short)vec_sr((vector unsigned int)v_r1, 
+                                        (vector unsigned int)vec_splats((signed int)shr));
+
+            v_rd1 = (vector signed short)vec_sr((vector unsigned int)v_rd1, 
+                                         (vector unsigned int)vec_splats((signed int)shp));
+            v_b2 = (vector signed short)vec_and((vector signed int)v_rd1, 
+                                        vec_splats((signed int)maskb));
+            v_b2 = (vector signed short)vec_sr((vector unsigned int)v_b2, 
+                                        (vector unsigned int)vec_splats((signed int)shb));
+            v_g2 = (vector signed short)vec_and((vector signed int)v_rd1, 
+                                        vec_splats((signed int)maskg));
+            v_g2 = (vector signed short)vec_sr((vector unsigned int)v_g2, 
+                                        (vector unsigned int)vec_splats((signed int)shg));
+            v_r2 = (vector signed short)vec_and((vector signed int)v_rd1, 
+                                        vec_splats((signed int)maskr));
+            v_r2 = (vector signed short)vec_sr((vector unsigned int)v_r2, 
+                                        (vector unsigned int)vec_splats((signed int)shr));
+        }else{
+             if(is_BE){
+                v_rd0 = vec_perm(v_rd0, v_rd0, 
+                       ((vector unsigned char){1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14}));
+            }
+            v_px = (vector signed short)vec_sr((vector unsigned short)v_rd0, 
+                                            (vector unsigned short)vec_splats((signed short)shp));
+            v_b1 = (vector signed short)vec_and(v_px, 
+                                            vec_splats((signed short)maskb));
+            v_b1 = (vector signed short)vec_sr((vector unsigned short)v_b1, 
+                                            (vector unsigned short)vec_splats((signed short)shb));
+            v_g1 = (vector signed short)vec_and(v_px, 
+                                            vec_splats((signed short)maskg));
+            v_g1 = (vector signed short)vec_sr((vector unsigned short)v_g1, 
+                                            (vector unsigned short)vec_splats((signed short)shg));
+            v_r1 = (vector signed short)vec_and(v_px, 
+                                            vec_splats((signed short)maskr));
+            v_r1 = (vector signed short)vec_sr((vector unsigned short)v_r1, 
+                                            (vector unsigned short)vec_splats((signed short)shr));
+
+
+            v_b2 = vec_mergel(v_b1, (vector signed short)v_null);
+            v_g2 = vec_mergel(v_g1, (vector signed short)v_null);
+            v_r2 = vec_mergel(v_r1, (vector signed short)v_null);
+            v_b1 = vec_mergeh(v_b1, (vector signed short)v_null);
+            v_g1 = vec_mergeh(v_g1, (vector signed short)v_null);
+            v_r1 = vec_mergeh(v_r1, (vector signed short)v_null);
+            
+        }
+
+        v_dst1 = vec_mul((vector signed int)v_r1, 
+                                              vec_splats((signed int)ry));
+        v_dst1 = vec_add((vector signed int)v_dst1, 
+                                              vec_mul((vector signed int)v_g1, 
+                                              vec_splats((signed int)gy) ));
+        v_dst1 = vec_add((vector signed int)v_dst1, 
+                                              vec_mul((vector signed int)v_b1, 
+                                              vec_splats((signed int)by) ));
+        v_dst1 = vec_add(v_dst1, (vector signed int)shift1);
+        v_dst1 = (vector signed int)vec_sr((vector unsigned int)v_dst1, (vector unsigned int)shift2);
+
+        v_dst2 = vec_mul((vector signed int)v_r2, 
+                                              vec_splats((signed int)ry));
+        v_dst2 = vec_add((vector signed int)v_dst2, 
+                                              vec_mul((vector signed int)v_g2, 
+                                              vec_splats((signed int)gy) ));
+        v_dst2 = vec_add((vector signed int)v_dst2, 
+                                              vec_mul((vector signed int)v_b2, 
+                                              vec_splats((signed int)by) ));
+        v_dst2 = vec_add(v_dst2, (vector signed int)shift1);
+        v_dst2 = (vector signed int)vec_sr((vector unsigned int)v_dst2, (vector unsigned int)shift2);
+        v_dst1 = vec_perm(v_dst1, v_dst2, 
+                 ((vector unsigned char){0, 1, 4, 5, 8, 9, 12, 13, 16, 17, 20, 21, 24, 25, 28, 29}));
+
+        vec_vsx_st((vector unsigned char)v_dst1, 0, (unsigned char *)dst_addr);
+
+        dst_addr += 16;      
+        src_addr += 16;
+            
+    }
+    
+    for (i = width_adj; i < width; i++) {
+        int px = input_pixel(i) >> shp;
+        int b  = (px & maskb) >> shb;
+        int g  = (px & maskg) >> shg;
+        int r  = (px & maskr) >> shr;
+        dst[i] = (ry * r + gy * g + by * b + rnd) >> ((S)-6);
+    }
+    STOP_TIMER("4_vsx")
+}
+
+
+static av_always_inline void rgb16_32ToUV_c_template_vsx(int16_t *dstU,
+                                                     int16_t *dstV,
+                                                     const uint8_t *src,
+                                                     int width,
+                                                     enum AVPixelFormat origin,
+                                                     int shr, int shg,
+                                                     int shb, int shp,
+                                                     int maskr, int maskg,
+                                                     int maskb, int rsh,
+                                                     int gsh, int bsh, int S,
+                                                     int32_t *rgb2yuv)
+{
+    START_TIMER
+    int i, width_adj, is_DW, is_BE;
+    vector signed short v_rd0, v_rd1, v_px;
+    vector signed short v_r1, v_r2, v_b1, v_b2, v_g1, v_g2;  
+    vector signed int v_dst1, v_dst2;
+    vector unsigned int shift1;
+    vector signed int shift2;
+    const int ru = rgb2yuv[RU_IDX] << rsh, gu = rgb2yuv[GU_IDX] << gsh, 
+              bu = rgb2yuv[BU_IDX] << bsh, rv = rgb2yuv[RV_IDX] << rsh, 
+              gv = rgb2yuv[GV_IDX] << gsh, bv = rgb2yuv[BV_IDX] << bsh;
+    const unsigned rnd = (256u<<((S)-1)) + (1<<(S-7));
+    vector unsigned short v_null = vec_splats((unsigned short)0x0000);
+
+    uintptr_t src_addr = (uintptr_t)src;
+    uintptr_t dstU_addr = (uintptr_t)dstU;
+    uintptr_t dstV_addr = (uintptr_t)dstV;
+
+
+    width_adj = width&(~(int)0x07);
+
+    if(width_adj){
+        shift1 = vec_splats((unsigned int)rnd);
+        shift2 = vec_splats((signed int)((S)-6));
+        is_DW = (origin == AV_PIX_FMT_RGBA || origin == AV_PIX_FMT_BGRA || 
+                 origin == AV_PIX_FMT_ARGB || origin == AV_PIX_FMT_ABGR);
+        if(!is_DW)
+            is_BE = isBE(origin);
+    }
+
+    for (i = 0; i < width_adj; i+=8) {
+        v_rd0 = vec_vsx_ld(0, (signed short *)src_addr); 
+
+        if(is_DW){
+            src_addr += 16;
+            v_rd1 = vec_vsx_ld(0, (signed short *)src_addr);
+
+            v_rd0 = (vector signed short)vec_sr((vector unsigned int)v_rd0, 
+                                        (vector unsigned int)vec_splats((signed int)shp));
+            v_b1 = (vector signed short)vec_and((vector signed int)v_rd0, 
+                                       vec_splats((signed int)maskb));
+            v_b1 = (vector signed short)vec_sr((vector unsigned int)v_b1, 
+                                       (vector unsigned int)vec_splats((signed int)shb));
+            v_g1 = (vector signed short)vec_and((vector signed int)v_rd0, 
+                                        vec_splats((signed int)maskg));
+            v_g1 = (vector signed short)vec_sr((vector unsigned int)v_g1, 
+                                        (vector unsigned int)vec_splats((signed int)shg));
+            v_r1 = (vector signed short)vec_and((vector signed int)v_rd0, 
+                                        vec_splats((signed int)maskr));
+            v_r1 = (vector signed short)vec_sr((vector unsigned int)v_r1, 
+                                        (vector unsigned int)vec_splats((signed int)shr));
+
+            v_rd1 = (vector signed short)vec_sr((vector unsigned int)v_rd1, 
+                                         (vector unsigned int)vec_splats((signed int)shp));
+            v_b2 = (vector signed short)vec_and((vector signed int)v_rd1, 
+                                        vec_splats((signed int)maskb));
+            v_b2 = (vector signed short)vec_sr((vector unsigned int)v_b2, 
+                                        (vector unsigned int)vec_splats((signed int)shb));
+            v_g2 = (vector signed short)vec_and((vector signed int)v_rd1, 
+                                        vec_splats((signed int)maskg));
+            v_g2 = (vector signed short)vec_sr((vector unsigned int)v_g2, 
+                                        (vector unsigned int)vec_splats((signed int)shg));
+            v_r2 = (vector signed short)vec_and((vector signed int)v_rd1, 
+                                        vec_splats((signed int)maskr));
+            v_r2 = (vector signed short)vec_sr((vector unsigned int)v_r2, 
+                                        (vector unsigned int)vec_splats((signed int)shr));
+        }else{
+             if(is_BE){
+                v_rd0 = vec_perm(v_rd0, v_rd0, 
+                       ((vector unsigned char){1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14}));
+            }
+            v_px = (vector signed short)vec_sr((vector unsigned short)v_rd0, 
+                                            (vector unsigned short)vec_splats((signed short)shp));
+            v_b1 = (vector signed short)vec_and(v_px, 
+                                            vec_splats((signed short)maskb));
+            v_b1 = (vector signed short)vec_sr((vector unsigned short)v_b1, 
+                                            (vector unsigned short)vec_splats((signed short)shb));
+            v_g1 = (vector signed short)vec_and(v_px, 
+                                            vec_splats((signed short)maskg));
+            v_g1 = (vector signed short)vec_sr((vector unsigned short)v_g1, 
+                                            (vector unsigned short)vec_splats((signed short)shg));
+            v_r1 = (vector signed short)vec_and(v_px, 
+                                            vec_splats((signed short)maskr));
+            v_r1 = (vector signed short)vec_sr((vector unsigned short)v_r1, 
+                                            (vector unsigned short)vec_splats((signed short)shr));
+
+
+            v_b2 = vec_mergel(v_b1, (vector signed short)v_null);
+            v_g2 = vec_mergel(v_g1, (vector signed short)v_null);
+            v_r2 = vec_mergel(v_r1, (vector signed short)v_null);
+            v_b1 = vec_mergeh(v_b1, (vector signed short)v_null);
+            v_g1 = vec_mergeh(v_g1, (vector signed short)v_null);
+            v_r1 = vec_mergeh(v_r1, (vector signed short)v_null);
+            
+        }
+
+        v_dst1 = vec_mul((vector signed int)v_r1, 
+                                              vec_splats((signed int)ru));
+        v_dst1 = vec_add((vector signed int)v_dst1, 
+                                              vec_mul((vector signed int)v_g1, 
+                                              vec_splats((signed int)gu) ));
+        v_dst1 = vec_add((vector signed int)v_dst1, 
+                                              vec_mul((vector signed int)v_b1, 
+                                              vec_splats((signed int)bu) ));
+        v_dst1 = vec_add(v_dst1, (vector signed int)shift1);
+        v_dst1 = (vector signed int)vec_sr((vector unsigned int)v_dst1, (vector unsigned int)shift2);
+
+        v_dst2 = vec_mul((vector signed int)v_r2, vec_splats((signed int)ru));
+        v_dst2 = vec_add(v_dst2, vec_mul((vector signed int)v_g2, vec_splats((signed int)gu)));
+        v_dst2 = vec_add(v_dst2, vec_mul((vector signed int)v_b2, vec_splats((signed int)bu)));
+        v_dst2 = vec_add(v_dst2, (vector signed int)shift1);
+        v_dst2 = (vector signed int)vec_sr((vector unsigned int)v_dst2, (vector unsigned int)shift2);
+        v_dst1 = vec_perm(v_dst1, v_dst2, 
+                 ((vector unsigned char){0, 1, 4, 5, 8, 9, 12, 13, 16, 17, 20, 21, 24, 25, 28, 29}));
+        vec_vsx_st((vector unsigned char)v_dst1, 0, (unsigned char *)dstU_addr); 
+
+        v_dst1 = vec_mul((vector signed int)v_r1, 
+                                              vec_splats((signed int)rv));
+        v_dst1 = vec_add((vector signed int)v_dst1, 
+                                              vec_mul((vector signed int)v_g1, 
+                                              vec_splats((signed int)gv) ));
+        v_dst1 = vec_add((vector signed int)v_dst1, 
+                                              vec_mul((vector signed int)v_b1, 
+                                              vec_splats((signed int)bv) ));
+        v_dst1 = vec_add(v_dst1, (vector signed int)shift1);
+        v_dst1 = (vector signed int)vec_sr((vector unsigned int)v_dst1, (vector unsigned int)shift2);
+
+        v_dst2 = vec_mul((vector signed int)v_r2, vec_splats((signed int)rv));
+        v_dst2 = vec_add(v_dst2, vec_mul((vector signed int)v_g2, vec_splats((signed int)gv)));
+        v_dst2 = vec_add(v_dst2, vec_mul((vector signed int)v_b2, vec_splats((signed int)bv)));
+        v_dst2 = vec_add(v_dst2, (vector signed int)shift1);
+        v_dst2 = (vector signed int)vec_sr((vector unsigned int)v_dst2, (vector unsigned int)shift2);
+        v_dst1 = vec_perm(v_dst1, v_dst2, 
+                 ((vector unsigned char){0, 1, 4, 5, 8, 9, 12, 13, 16, 17, 20, 21, 24, 25, 28, 29}));
+        vec_vsx_st((vector unsigned char)v_dst1, 0, (unsigned char *)dstV_addr); 
+
+        dstU_addr += 16; 
+        dstV_addr += 16;     
+        src_addr += 16;
+            
+    }
+    
+    for (i = width_adj; i < width; i++) {
+        int px = input_pixel(i) >> shp;
+        int b  = (px & maskb)   >> shb;
+        int g  = (px & maskg)   >> shg;
+        int r  = (px & maskr)   >> shr;
+        
+        dstU[i] = (ru * r + gu * g + bu * b + rnd) >> ((S)-6);
+        dstV[i] = (rv * r + gv * g + bv * b + rnd) >> ((S)-6);
+    }
+    STOP_TIMER("5_vsx")
+}
+
+static av_always_inline void rgb16_32ToUV_half_c_template_vsx(int16_t *dstU,
+                                                          int16_t *dstV,
+                                                          const uint8_t *src,
+                                                          int width,
+                                                          enum AVPixelFormat origin,
+                                                          int shr, int shg,
+                                                          int shb, int shp,
+                                                          int maskr, int maskg,
+                                                          int maskb, int rsh,
+                                                          int gsh, int bsh, int S,
+                                                          int32_t *rgb2yuv)
+{
+    START_TIMER
+    int i, width_adj, is_DW, is_BE;
+    vector signed short v_rd0, v_rd1, v_sign, v_val;
+    vector unsigned int v_px0, v_px1;
+    vector signed int v_r2, v_g2, v_b2, v_r1, v_g1, v_b1,v_rb;  
+    vector signed int v_dst1, v_dst2;
+    vector unsigned int shift1;
+    vector signed int shift2;
+    const int ru = rgb2yuv[RU_IDX] << rsh, gu = rgb2yuv[GU_IDX] << gsh, 
+              bu = rgb2yuv[BU_IDX] << bsh, rv = rgb2yuv[RV_IDX] << rsh, 
+              gv = rgb2yuv[GV_IDX] << gsh, bv = rgb2yuv[BV_IDX] << bsh;
+    const int maskgx   = ~(maskr | maskb);
+    const unsigned rnd = (256u<<(S)) + (1<<(S-6));
+    uintptr_t src_addr = (uintptr_t)src;
+    uintptr_t dstU_addr = (uintptr_t)dstU;
+    uintptr_t dstV_addr = (uintptr_t)dstV;
+
+    maskr |= maskr << 1;
+    maskb |= maskb << 1;
+    maskg |= maskg << 1;
+
+
+    width_adj = width&(~(int)0x07);
+
+    if(width_adj){
+        shift1 = vec_splats((unsigned int)rnd);
+        shift2 = vec_splats((signed int)((S)-6+1));
+        is_DW = (origin == AV_PIX_FMT_RGBA || origin == AV_PIX_FMT_BGRA || 
+                 origin == AV_PIX_FMT_ARGB || origin == AV_PIX_FMT_ABGR);
+        if(!is_DW)
+            is_BE = isBE(origin);
+    }
+
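+    /* Each iteration covers 4 output samples: adjacent pixel pairs are
+     * summed before masking (the masks were widened by one bit above to
+     * make room for the carry), matching the scalar half-scale
+     * template. */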
+    for (i = 0; i < width_adj; i+=4) {
+        v_rd0 = vec_vsx_ld(0, (signed short *)src_addr); 
+        src_addr += 16;
+        v_rd1 = vec_vsx_ld(0, (signed short *)src_addr);
+
+        if(is_DW){
+            v_rd0 = (vector signed short)vec_sr((vector unsigned int)v_rd0, 
+                                        (vector unsigned int)vec_splats((signed int)shp));
+            v_rd1 = (vector signed short)vec_sr((vector unsigned int)v_rd1, 
+                                         (vector unsigned int)vec_splats((signed int)shp));
+            v_px0 = (vector unsigned int)vec_perm(v_rd0, v_rd1, 
+                       ((vector unsigned char){0, 1, 2, 3, 8, 9, 10, 11, 16, 17, 18, 19, 24, 25, 26, 27}));
+            v_px1 = (vector unsigned int)vec_perm(v_rd0, v_rd1, 
+                       ((vector unsigned char){4, 5, 6, 7, 12, 13, 14, 15, 20, 21, 22, 23, 28, 29, 30, 31}));
+            v_g1 = (vector signed int)vec_and((vector unsigned int)v_px0, (vector unsigned int)vec_splats(maskgx));
+            v_g1 = (vector signed int)vec_add((vector signed int)v_g1, (vector signed int)vec_and((vector unsigned int)v_px1, (vector unsigned int)vec_splats(maskgx)));
+            v_rb = (vector signed int)vec_add(v_px0, v_px1);
+            v_rb = (vector signed int)vec_sub((vector signed int)v_rb, (vector signed int)v_g1);
+
+            v_b1 = vec_and((vector signed int)v_rb, 
+                                        vec_splats((signed int)maskb));
+            v_b1 = (vector signed int)vec_sr((vector unsigned int)v_b1, 
+                                        (vector unsigned int)vec_splats((signed int)shb));
+
+            if(shp ||
+                origin == AV_PIX_FMT_BGR565LE || origin == AV_PIX_FMT_BGR565BE ||
+                origin == AV_PIX_FMT_RGB565LE || origin == AV_PIX_FMT_RGB565BE){
+                v_g1 = (vector signed int)vec_sr((vector unsigned int)v_g1, 
+                                        (vector unsigned int)vec_splats((signed int)shg));
+            }else{
+                v_g1 = vec_and((vector signed int)v_g1, 
+                                        vec_splats((signed int)maskg));
+                v_g1 = (vector signed int)vec_sr((vector unsigned int)v_g1, 
+                                        (vector unsigned int)vec_splats((signed int)shg));
+            }
+            v_r1 = vec_and((vector signed int)v_rb, 
+                                        vec_splats((signed int)maskr));
+            v_r1 = (vector signed int)vec_sr((vector unsigned int)v_r1, 
+                                        (vector unsigned int)vec_splats((signed int)shr));
+
+            src_addr += 16;
+            v_rd0 = vec_vsx_ld(0, (signed short *)src_addr); 
+            src_addr += 16;
+            v_rd1 = vec_vsx_ld(0, (signed short *)src_addr);
+
+            v_rd0 = (vector signed short)vec_sr((vector unsigned int)v_rd0, 
+                                        (vector unsigned int)vec_splats((signed int)shp));
+            v_rd1 = (vector signed short)vec_sr((vector unsigned int)v_rd1, 
+                                         (vector unsigned int)vec_splats((signed int)shp));
+            v_px0 = (vector unsigned int)vec_perm(v_rd0, v_rd1, 
+                       ((vector unsigned char){0, 1, 2, 3, 8, 9, 10, 11, 16, 17, 18, 19, 24, 25, 26, 27}));
+            v_px1 = (vector unsigned int)vec_perm(v_rd0, v_rd1, 
+                       ((vector unsigned char){4, 5, 6, 7, 12, 13, 14, 15, 20, 21, 22, 23, 28, 29, 30, 31}));
+            v_g2 = (vector signed int)vec_and(v_px0, (vector unsigned int)vec_splats(maskgx));
+            v_g2 = (vector signed int)vec_add((vector signed int)v_g2, (vector signed int)vec_and(v_px1, (vector unsigned int)vec_splats(maskgx)));
+            v_rb = (vector signed int)vec_add(v_px0, v_px1);
+            v_rb = (vector signed int)vec_sub((vector signed int)v_rb, (vector signed int)v_g2);
+
+            v_b2 = vec_and((vector signed int)v_rb, 
+                                        vec_splats((signed int)maskb));
+            v_b2 = (vector signed int)vec_sr((vector unsigned int)v_b2, 
+                                        (vector unsigned int)vec_splats((signed int)shb));
+            if(shp ||
+                origin == AV_PIX_FMT_BGR565LE || origin == AV_PIX_FMT_BGR565BE ||
+                origin == AV_PIX_FMT_RGB565LE || origin == AV_PIX_FMT_RGB565BE){
+                v_g2 = (vector signed int)vec_sr((vector unsigned int)v_g2, 
+                                        (vector unsigned int)vec_splats((signed int)shg));
+            }else{
+                v_g2 = vec_and((vector signed int)v_g2, 
+                                        vec_splats((signed int)maskg));
+                v_g2 = (vector signed int)vec_sr((vector unsigned int)v_g2, 
+                                        (vector unsigned int)vec_splats((signed int)shg));
+            }
+            v_r2 = vec_and((vector signed int)v_rb, 
+                                        vec_splats((signed int)maskr));
+            v_r2 = (vector signed int)vec_sr((vector unsigned int)v_r2, 
+                                        (vector unsigned int)vec_splats((signed int)shr));
+        }else{
+            if(is_BE){
+                v_px0 = (vector unsigned int)vec_perm(v_rd0, v_rd0, 
+                       ((vector unsigned char){1, 0, 0, 0, 5, 4, 0, 0, 9, 8, 0, 0, 13, 12, 0, 0}));
+                v_px1 = (vector unsigned int)vec_perm(v_rd0, v_rd0, 
+                       ((vector unsigned char){3, 2, 0, 0, 7, 6, 0, 0, 11, 10, 0, 0, 15, 14, 0, 0}));
+            }else{
+                v_px0 = (vector unsigned int)vec_perm(v_rd0, v_rd0, 
+                       ((vector unsigned char){0, 1, 0, 0, 4, 5, 0, 0, 8, 9, 0, 0, 12, 13, 0, 0}));
+                v_px1 = (vector unsigned int)vec_perm(v_rd0, v_rd0, 
+                       ((vector unsigned char){2, 3, 0, 0, 6, 7, 0, 0, 10, 11, 0, 0, 14, 15, 0, 0}));
+            }
+
+            v_px0 = vec_and(v_px0, vec_splats((unsigned int)0x0000FFFF));
+            v_px1 = vec_and(v_px1, vec_splats((unsigned int)0x0000FFFF));
+
+            v_px0 = (vector unsigned int)vec_sr(v_px0, 
+                           (vector unsigned int)vec_splats((signed int)shp));
+            v_px1 = (vector unsigned int)vec_sr(v_px1, 
+                           (vector unsigned int)vec_splats((signed int)shp));
+            
+           
+            v_g1 = (vector signed int)vec_and(v_px0, (vector unsigned int)vec_splats((unsigned int)maskgx));
+            v_g1 = (vector signed int)vec_add(v_g1, (vector signed int)vec_and(v_px1, (vector unsigned int)vec_splats((signed int)maskgx)));
+            v_rb = (vector signed int)vec_add(v_px0, v_px1);
+            v_rb = (vector signed int)vec_sub(v_rb, v_g1);
+
+            
+
+            v_b1 = (vector signed int)vec_and(v_rb, vec_splats((signed int)maskb));
+            v_b1 = (vector signed int)vec_sr((vector unsigned int)v_b1, 
+                                            (vector unsigned int)vec_splats((signed int)shb));
+            
+            if(shp ||
+                origin == AV_PIX_FMT_BGR565LE || origin == AV_PIX_FMT_BGR565BE ||
+                origin == AV_PIX_FMT_RGB565LE || origin == AV_PIX_FMT_RGB565BE){
+                v_g1 = (vector signed int)vec_sr((vector unsigned int)v_g1, (vector unsigned int)vec_splats((signed int)shg));
+            }else{
+                v_g1 = vec_and(v_g1, vec_splats((signed int)maskg));
+                v_g1 = (vector signed int)vec_sr((vector unsigned int)v_g1, 
+                                            (vector unsigned int)vec_splats((signed int)shg));
+            }
+            
+            v_r1 = (vector signed int)vec_and((vector signed int)v_rb, 
+                                       vec_splats((signed int)maskr));
+            v_r1 = (vector signed int)vec_sr((vector unsigned int)v_r1, 
+                                       (vector unsigned int)vec_splats((signed int)shr));
+            if(is_BE){
+                v_px0 = (vector unsigned int)vec_perm(v_rd1, v_rd1, 
+                       ((vector unsigned char){1, 0, 0, 0, 5, 4, 0, 0, 9, 8, 0, 0, 13, 12, 0, 0}));
+                v_px1 = (vector unsigned int)vec_perm(v_rd1, v_rd1, 
+                       ((vector unsigned char){3, 2, 0, 0, 7, 6, 0, 0, 11, 10, 0, 0, 15, 14, 0, 0}));
+            }else{
+                v_px0 = (vector unsigned int)vec_perm(v_rd1, v_rd1, 
+                       ((vector unsigned char){0, 1, 0, 0, 4, 5, 0, 0, 8, 9, 0, 0, 12, 13, 0, 0}));
+                v_px1 = (vector unsigned int)vec_perm(v_rd1, v_rd1, 
+                       ((vector unsigned char){2, 3, 0, 0, 6, 7, 0, 0, 10, 11, 0, 0, 14, 15, 0, 0}));
+            }
+
+            v_px0 = vec_and(v_px0, vec_splats((unsigned int)0x0000FFFF));
+            v_px1 = vec_and(v_px1, vec_splats((unsigned int)0x0000FFFF));
+
+            v_px0 = vec_sr((vector unsigned int)v_px0, 
+                           (vector unsigned int)vec_splats((signed int)shp));
+            v_px1 = vec_sr((vector unsigned int)v_px1, 
+                           (vector unsigned int)vec_splats((signed int)shp));
+            
+           
+            v_g2 = (vector signed int)vec_and(v_px0, (vector unsigned int)vec_splats((unsigned int)maskgx));
+            v_g2 = (vector signed int)vec_add(v_g2, 
+                   (vector signed int)vec_and(v_px1, (vector unsigned int)vec_splats((signed int)maskgx)));
+            v_rb = (vector signed int)vec_add(v_px0, v_px1);
+            v_rb = (vector signed int)vec_sub(v_rb, v_g2); 
+
+            v_b2 = (vector signed int)vec_and(v_rb, vec_splats((signed int)maskb));
+            v_b2 = (vector signed int)vec_sr((vector unsigned int)v_b2, 
+                                            (vector unsigned int)vec_splats((signed int)shb));
+            if(shp ||
+                origin == AV_PIX_FMT_BGR565LE || origin == AV_PIX_FMT_BGR565BE ||
+                origin == AV_PIX_FMT_RGB565LE || origin == AV_PIX_FMT_RGB565BE){
+                v_g2 = (vector signed int)vec_sr((vector unsigned int)v_g2, (vector unsigned int)vec_splats((signed int)shg));
+            }else{
+                v_g2 = vec_and(v_g2, 
+                                            vec_splats((signed int)maskg));
+                v_g2 = (vector signed int)vec_sr((vector unsigned int)v_g2, (vector unsigned int)vec_splats((signed int)shg));
+            }
+            v_r2 = vec_and(v_rb, vec_splats((signed int)maskr));
+            v_r2 = (vector signed int)vec_sr((vector unsigned int)v_r2, (vector unsigned int)vec_splats((signed int)shr));
+        }
+
+        v_dst1 = vec_mul((vector signed int)v_r1, 
+                                              vec_splats((signed int)ru));
+        v_dst1 = vec_add((vector signed int)v_dst1, 
+                                              vec_mul((vector signed int)v_g1, 
+                                              vec_splats((signed int)gu) ));
+        v_dst1 = vec_add((vector signed int)v_dst1, 
+                                              vec_mul((vector signed int)v_b1, 
+                                              vec_splats((signed int)bu) ));
+        v_dst1 = vec_add(v_dst1, (vector signed int)shift1);
+        v_dst1 = (vector signed int)vec_sr((vector unsigned int)v_dst1, (vector unsigned int)shift2);
+
+        v_dst2 = vec_mul((vector signed int)v_r2, vec_splats((signed int)ru));
+        v_dst2 = vec_add((vector signed int)v_dst2, vec_mul((vector signed int)v_g2, vec_splats((signed int)gu) ));
+        v_dst2 = vec_add((vector signed int)v_dst2, vec_mul((vector signed int)v_b2, vec_splats((signed int)bu) ));
+
+        v_dst2 = (vector signed int)vec_add((vector unsigned int)v_dst2, (vector unsigned int)shift1);
+        
+        v_dst2 = (vector signed int)vec_sr((vector unsigned int)v_dst2, (vector unsigned int)shift2);
+                
+        v_dst1 = vec_perm(v_dst1, v_dst2, 
+                 ((vector unsigned char){0, 1, 4, 5, 8, 9, 12, 13, 16, 17, 20, 21, 24, 25, 28, 29}));
+        vec_vsx_st((vector unsigned char)v_dst1, 0, (unsigned char *)dstU_addr); 
+
+        v_dst1 = vec_mul((vector signed int)v_r1, 
+                                              vec_splats((signed int)rv));
+        v_dst1 = vec_add((vector signed int)v_dst1, 
+                                              vec_mul((vector signed int)v_g1, 
+                                              vec_splats((signed int)gv) ));
+        v_dst1 = vec_add((vector signed int)v_dst1, 
+                                              vec_mul((vector signed int)v_b1, 
+                                              vec_splats((signed int)bv) ));
+        v_dst1 = vec_add(v_dst1, (vector signed int)shift1);
+        v_dst1 = (vector signed int)vec_sr((vector unsigned int)v_dst1, (vector unsigned int)shift2);
+
+        v_dst2 = vec_mul((vector signed int)v_r2, vec_splats((signed int)rv));
+        v_dst2 = vec_add((vector signed int)v_dst2, 
+                                                vec_mul((vector signed int)v_g2, 
+                                                vec_splats((signed int)gv) ));
+        v_dst2 = vec_add((vector signed int)v_dst2, 
+                                                vec_mul((vector signed int)v_b2, 
+                                                vec_splats((signed int)bv) ));
+
+        v_dst2 = (vector signed int)vec_add((vector unsigned int)v_dst2, (vector unsigned int)shift1);
+        
+        v_dst2 = (vector signed int)vec_sr((vector unsigned int)v_dst2, (vector unsigned int)shift2);
+                
+        v_dst1 = vec_perm(v_dst1, v_dst2, 
+                 ((vector unsigned char){0, 1, 4, 5, 8, 9, 12, 13, 16, 17, 20, 21, 24, 25, 28, 29}));
+        vec_vsx_st((vector unsigned char)v_dst1, 0, (unsigned char *)dstV_addr); 
+
+        dstU_addr += 16; 
+        dstV_addr += 16;     
+        src_addr += 16;
+            
+    }
+    
+    for (i = width_adj; i < width; i++) {
+
+        unsigned px0 = input_pixel(2 * i + 0) >> shp;
+        unsigned px1 = input_pixel(2 * i + 1) >> shp;
+        int b, r, g = (px0 & maskgx) + (px1 & maskgx);
+        int rb = px0 + px1 - g;
+
+        b = (rb & maskb) >> shb;
+        if (shp ||
+            origin == AV_PIX_FMT_BGR565LE || origin == AV_PIX_FMT_BGR565BE ||
+            origin == AV_PIX_FMT_RGB565LE || origin == AV_PIX_FMT_RGB565BE) {
+            g >>= shg;
+        } else {
+            g = (g & maskg) >> shg;
+        }
+        r = (rb & maskr) >> shr;
+
+        dstU[i] = (ru * r + gu * g + bu * b + (unsigned)rnd) >> ((S)-6+1);
+        dstV[i] = (rv * r + gv * g + bv * b + (unsigned)rnd) >> ((S)-6+1);
+    }
+    STOP_TIMER("5_half_vsx")
+}
+
+#undef input_pixel
+
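+/* Instantiate name##ToY_c_vsx, name##ToUV_c_vsx and name##ToUV_half_c_vsx
+ * wrappers around the templates above for each packed RGB/BGR layout; the
+ * constant mask/shift arguments let the compiler specialize each body. */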
+#define rgb16_32_wrapper(fmt, name, shr, shg, shb, shp, maskr,          \
+                         maskg, maskb, rsh, gsh, bsh, S)                \
+static void name ## ToY_c_vsx(uint8_t *dst, const uint8_t *src, const uint8_t *unused1, const uint8_t *unused2,            \
+                          int width, uint32_t *tab)                     \
+{                                                                       \
+    rgb16_32ToY_c_template_vsx((int16_t*)dst, src, width, fmt, shr, shg, shb, shp,    \
+                           maskr, maskg, maskb, rsh, gsh, bsh, S, tab); \
+}                                                                       \
+                                                                        \
+static void name ## ToUV_c_vsx(uint8_t *dstU, uint8_t *dstV,                \
+                           const uint8_t *unused0, const uint8_t *src, const uint8_t *dummy,    \
+                           int width, uint32_t *tab)                    \
+{                                                                       \
+    rgb16_32ToUV_c_template_vsx((int16_t*)dstU, (int16_t*)dstV, src, width, fmt,                \
+                            shr, shg, shb, shp,                         \
+                            maskr, maskg, maskb, rsh, gsh, bsh, S, tab);\
+}                                                                       \
+                                                                        \
+static void name ## ToUV_half_c_vsx(uint8_t *dstU, uint8_t *dstV,           \
+                                const uint8_t *unused0, const uint8_t *src,                     \
+                                const uint8_t *dummy,                   \
+                                int width, uint32_t *tab)               \
+{                                                                       \
+    rgb16_32ToUV_half_c_template_vsx((int16_t*)dstU, (int16_t*)dstV, src, width, fmt,           \
+                                 shr, shg, shb, shp,                    \
+                                 maskr, maskg, maskb,                   \
+                                 rsh, gsh, bsh, S, tab);                \
+}
+
+rgb16_32_wrapper(AV_PIX_FMT_BGR32,    bgr32,  16, 0,  0, 0, 0xFF0000, 0xFF00,   0x00FF,  8, 0,  8, RGB2YUV_SHIFT + 8)
+rgb16_32_wrapper(AV_PIX_FMT_BGR32_1,  bgr321, 16, 0,  0, 8, 0xFF0000, 0xFF00,   0x00FF,  8, 0,  8, RGB2YUV_SHIFT + 8)
+rgb16_32_wrapper(AV_PIX_FMT_RGB32,    rgb32,   0, 0, 16, 0,   0x00FF, 0xFF00, 0xFF0000,  8, 0,  8, RGB2YUV_SHIFT + 8)
+rgb16_32_wrapper(AV_PIX_FMT_RGB32_1,  rgb321,  0, 0, 16, 8,   0x00FF, 0xFF00, 0xFF0000,  8, 0,  8, RGB2YUV_SHIFT + 8)
+rgb16_32_wrapper(AV_PIX_FMT_BGR565LE, bgr16le, 0, 0,  0, 0,   0x001F, 0x07E0,   0xF800, 11, 5,  0, RGB2YUV_SHIFT + 8)
+rgb16_32_wrapper(AV_PIX_FMT_BGR555LE, bgr15le, 0, 0,  0, 0,   0x001F, 0x03E0,   0x7C00, 10, 5,  0, RGB2YUV_SHIFT + 7)
+rgb16_32_wrapper(AV_PIX_FMT_BGR444LE, bgr12le, 0, 0,  0, 0,   0x000F, 0x00F0,   0x0F00,  8, 4,  0, RGB2YUV_SHIFT + 4)
+rgb16_32_wrapper(AV_PIX_FMT_RGB565LE, rgb16le, 0, 0,  0, 0,   0xF800, 0x07E0,   0x001F,  0, 5, 11, RGB2YUV_SHIFT + 8)
+rgb16_32_wrapper(AV_PIX_FMT_RGB555LE, rgb15le, 0, 0,  0, 0,   0x7C00, 0x03E0,   0x001F,  0, 5, 10, RGB2YUV_SHIFT + 7)
+rgb16_32_wrapper(AV_PIX_FMT_RGB444LE, rgb12le, 0, 0,  0, 0,   0x0F00, 0x00F0,   0x000F,  0, 4,  8, RGB2YUV_SHIFT + 4)
+rgb16_32_wrapper(AV_PIX_FMT_BGR565BE, bgr16be, 0, 0,  0, 0,   0x001F, 0x07E0,   0xF800, 11, 5,  0, RGB2YUV_SHIFT + 8)
+rgb16_32_wrapper(AV_PIX_FMT_BGR555BE, bgr15be, 0, 0,  0, 0,   0x001F, 0x03E0,   0x7C00, 10, 5,  0, RGB2YUV_SHIFT + 7)
+rgb16_32_wrapper(AV_PIX_FMT_BGR444BE, bgr12be, 0, 0,  0, 0,   0x000F, 0x00F0,   0x0F00,  8, 4,  0, RGB2YUV_SHIFT + 4)
+rgb16_32_wrapper(AV_PIX_FMT_RGB565BE, rgb16be, 0, 0,  0, 0,   0xF800, 0x07E0,   0x001F,  0, 5, 11, RGB2YUV_SHIFT + 8)
+rgb16_32_wrapper(AV_PIX_FMT_RGB555BE, rgb15be, 0, 0,  0, 0,   0x7C00, 0x03E0,   0x001F,  0, 5, 10, RGB2YUV_SHIFT + 7)
+rgb16_32_wrapper(AV_PIX_FMT_RGB444BE, rgb12be, 0, 0,  0, 0,   0x0F00, 0x00F0,   0x000F,  0, 4,  8, RGB2YUV_SHIFT + 4)
+
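+/* Planar GBR with horizontal chroma averaging: adjacent byte pairs are summed
+ * inside 16-bit lanes (shift plus mask), widened to 32 bits against v_null,
+ * then fed through the RGB->UV dot product with rounding. */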
+static void gbr24pToUV_half_c_vsx(uint8_t *_dstU, uint8_t *_dstV,
+                         const uint8_t *gsrc, const uint8_t *bsrc, const uint8_t *rsrc,
+                         int width, uint32_t *rgb2yuv)
+{
+    START_TIMER
+    vector unsigned short v_rd0, v_rd1, v_rd2, v_rd00, v_rd01, v_rd02;
+    int i, width_adj;
+    vector unsigned int v_dst1, v_dst2;
+    vector unsigned int shift1, shift2;
+    uint16_t *dstU = (uint16_t *)_dstU;
+    uint16_t *dstV = (uint16_t *)_dstV;
+    const int ru = rgb2yuv[RU_IDX], gu = rgb2yuv[GU_IDX], bu = rgb2yuv[BU_IDX];
+    const int rv = rgb2yuv[RV_IDX], gv = rgb2yuv[GV_IDX], bv = rgb2yuv[BV_IDX]; 
+    vector unsigned short v_null = vec_splats((unsigned short)0x0000);
+
+    uintptr_t gsrc_addr = (uintptr_t)gsrc;
+    uintptr_t bsrc_addr = (uintptr_t)bsrc;
+    uintptr_t rsrc_addr = (uintptr_t)rsrc;
+    uintptr_t dstU_addr = (uintptr_t)_dstU;
+    uintptr_t dstV_addr = (uintptr_t)_dstV;
+
+    width_adj = width&(~(int)0x07);
+
+    if(width_adj){
+        shift1 = ((vector unsigned int){(0x4001<<(RGB2YUV_SHIFT-6)),(0x4001<<(RGB2YUV_SHIFT-6)),
+                                        (0x4001<<(RGB2YUV_SHIFT-6)),(0x4001<<(RGB2YUV_SHIFT-6))} );
+        shift2 = vec_splats((unsigned int)(RGB2YUV_SHIFT-6+1));
+    }
+
+    for (i = 0; i < width_adj; i+=8) {
+        v_rd0 = vec_vsx_ld(0, (unsigned short *)gsrc_addr);
+        v_rd1 = vec_vsx_ld(0, (unsigned short *)bsrc_addr);
+        v_rd2 = vec_vsx_ld(0, (unsigned short *)rsrc_addr);
+
+        v_rd00 = vec_sr(v_rd0, vec_splats((unsigned short)8));
+        v_rd01 = vec_sr(v_rd1, vec_splats((unsigned short)8));
+        v_rd02 = vec_sr(v_rd2, vec_splats((unsigned short)8));
+        v_rd0  = vec_and(v_rd0, vec_splats((unsigned short)0xFF));
+        v_rd1  = vec_and(v_rd1, vec_splats((unsigned short)0xFF));
+        v_rd2  = vec_and(v_rd2, vec_splats((unsigned short)0xFF));
+
+        v_rd0 = vec_add(v_rd0, v_rd00);
+        v_rd1 = vec_add(v_rd1, v_rd01);
+        v_rd2 = vec_add(v_rd2, v_rd02);
+
+        v_rd00 = vec_mergeh(v_rd0, v_null);
+        v_rd01 = vec_mergeh(v_rd1, v_null);
+        v_rd02 = vec_mergeh(v_rd2, v_null);
+        v_rd0 = vec_mergel(v_rd0, v_null);
+        v_rd1 = vec_mergel(v_rd1, v_null);
+        v_rd2 = vec_mergel(v_rd2, v_null);
+
+        v_dst1 = (vector unsigned int)vec_mul((vector signed int)v_rd02, 
+                                              ((vector signed int){ru,ru,ru,ru}));
+        v_dst1 = (vector unsigned int)vec_add((vector signed int)v_dst1, 
+                                              vec_mul((vector signed int)v_rd00, 
+                                              ((vector signed int){gu,gu,gu,gu}) ));
+        v_dst1 = (vector unsigned int)vec_add((vector signed int)v_dst1, 
+                                              vec_mul((vector signed int)v_rd01, 
+                                              ((vector signed int){bu,bu,bu,bu})));
+        v_dst1 = vec_add(v_dst1, shift1);
+        v_dst1 = vec_sr(v_dst1, shift2);
+        v_dst2 = (vector unsigned int)vec_mul((vector signed int)v_rd2, 
+                                              ((vector signed int){ru,ru,ru,ru}));
+        v_dst2 = (vector unsigned int)vec_add((vector signed int)v_dst2, 
+                                              vec_mul((vector signed int)v_rd0, 
+                                              ((vector signed int){gu,gu,gu,gu})));
+        v_dst2 = (vector unsigned int)vec_add((vector signed int)v_dst2, 
+                                              vec_mul((vector signed int)v_rd1,  
+                                              ((vector signed int){bu,bu,bu,bu})));
+        v_dst2 = vec_add(v_dst2, shift1);
+        v_dst2 = vec_sr(v_dst2, shift2);
+        v_dst1 = vec_perm(v_dst1, v_dst2, 
+                 ((vector unsigned char){0, 1, 4, 5, 8, 9, 12, 13, 16, 17, 20, 21, 24, 25, 28, 29}));
+        vec_vsx_st((vector unsigned char)v_dst1, 0, (unsigned char *)dstU_addr); dstU_addr+=16;
+
+        v_dst1 = (vector unsigned int)vec_mul((vector signed int)v_rd02, 
+                                              ((vector signed int){rv,rv,rv,rv}));
+        v_dst1 = (vector unsigned int)vec_add((vector signed int)v_dst1, 
+                                              vec_mul((vector signed int)v_rd00, 
+                                              ((vector signed int){gv,gv,gv,gv})));
+        v_dst1 = (vector unsigned int)vec_add((vector signed int)v_dst1, 
+                                              vec_mul((vector signed int)v_rd01, 
+                                              ((vector signed int){bv,bv,bv,bv})));
+        v_dst1 = vec_add(v_dst1, shift1);
+        v_dst1 = vec_sr(v_dst1, shift2);
+        v_dst2 = (vector unsigned int)vec_mul((vector signed int)v_rd2, 
+                                              ((vector signed int){rv,rv,rv,rv}));
+        v_dst2 = (vector unsigned int)vec_add((vector signed int)v_dst2, 
+                                              vec_mul((vector signed int)v_rd0, 
+                                              ((vector signed int){gv,gv,gv,gv})));
+        v_dst2 = (vector unsigned int)vec_add((vector signed int)v_dst2, 
+                                              vec_mul((vector signed int)v_rd1, 
+                                              ((vector signed int){bv,bv,bv,bv})));
+        v_dst2 = vec_add(v_dst2, shift1);
+        v_dst2 = vec_sr(v_dst2, shift2);
+        v_dst1 = vec_perm(v_dst1, v_dst2, 
+                 ((vector unsigned char){0, 1, 4, 5, 8, 9, 12, 13, 16, 17, 20, 21, 24, 25, 28, 29}));
+        vec_vsx_st((vector unsigned char)v_dst1, 0, (unsigned char *)dstV_addr); dstV_addr+=16;
+
+        gsrc_addr += 16;
+        bsrc_addr += 16;
+        rsrc_addr += 16;
+    }
+    for (i = width_adj; i < width; i++) {
+        unsigned int g   = gsrc[2*i] + gsrc[2*i+1];
+        unsigned int b   = bsrc[2*i] + bsrc[2*i+1];
+        unsigned int r   = rsrc[2*i] + rsrc[2*i+1];
+
+        dstU[i] = (ru*r + gu*g + bu*b + (0x4001<<(RGB2YUV_SHIFT-6))) >> (RGB2YUV_SHIFT-6+1);
+        dstV[i] = (rv*r + gv*g + bv*b + (0x4001<<(RGB2YUV_SHIFT-6))) >> (RGB2YUV_SHIFT-6+1);
+    }
+    STOP_TIMER("6_vsx")
+}
+
+static void rgba64leToA_c_vsx(uint8_t *_dst, const uint8_t *_src, const uint8_t *unused1,
+                          const uint8_t *unused2, int width, uint32_t *unused)
+{
+    START_TIMER
+    int16_t *dst = (int16_t *)_dst;
+    const uint16_t *src = (const uint16_t *)_src;
+    int i, width_adj;
+    vector unsigned short v_rd0, v_rd1, v_rd2, v_rd3, v_dst;
+
+    uintptr_t src_addr = (uintptr_t)_src;
+    uintptr_t dst_addr = (uintptr_t)_dst;
+
+    // compute integral number of vector-length items and length of final fragment
+    width_adj = width & (~(int)0x07);
+
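+    /* Gather the 16-bit alpha word (the 4th component) of each 64-bit RGBA
+     * pixel from four loads, then splice the two half-results with vec_sld. */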
+    for ( i = 0; i < width_adj; i += 8) {
+        v_rd0 = vec_vsx_ld(0, (unsigned short *)src_addr);
+        v_rd1 = vec_vsx_ld(0, (unsigned short *)(src_addr + 16));
+        v_rd2 = vec_vsx_ld(0, (unsigned short *)(src_addr + 32));
+        v_rd3 = vec_vsx_ld(0, (unsigned short *)(src_addr + 48));
+
+        v_dst = vec_perm(v_rd0, v_rd1, ((vector unsigned char){6, 7, 14, 15, 22, 23, 30, 31}));
+        v_rd0 = vec_perm(v_rd2, v_rd3, ((vector unsigned char){6, 7, 14, 15, 22, 23, 30, 31}));
+        v_dst = vec_sld(v_dst, v_dst, 8);
+        v_dst = vec_sld(v_rd0, v_dst, 8);
+        vec_vsx_st((vector unsigned char)v_dst, 0, (unsigned char *)dst_addr);
+
+        src_addr += 64;
+        dst_addr += 16;
+    }
+
+    for (i = width_adj; i < width; i++) {
+        dst[i]= AV_RL16(src + 4*i + 3);
+    }
+    STOP_TIMER("7_vsx")
+}
+
+static void rgba64beToA_c_vsx(uint8_t *_dst, const uint8_t *_src, 
+                              const uint8_t *unused1, const uint8_t *unused2, 
+                              int width, uint32_t *unused)
+{
+    START_TIMER
+    int16_t *dst = (int16_t *)_dst;
+    const uint16_t *src = (const uint16_t *)_src;
+    int i, width_adj;
+    vector unsigned short v_rd0, v_rd1, v_rd2, v_rd3, v_dst;
+
+    uintptr_t src_addr = (uintptr_t)_src;
+    uintptr_t dst_addr = (uintptr_t)_dst;
+
+    // compute integral number of vector-length items and length of final fragment
+    width_adj = width & (~(int)0x07);
+
+    for ( i = 0; i < width_adj; i += 8) {
+        v_rd0 = vec_vsx_ld(0, (unsigned short *)src_addr);
+        v_rd1 = vec_vsx_ld(0, (unsigned short *)(src_addr + 16));
+        v_rd2 = vec_vsx_ld(0, (unsigned short *)(src_addr + 32));
+        v_rd3 = vec_vsx_ld(0, (unsigned short *)(src_addr + 48));
+
+        v_dst = vec_perm(v_rd0, v_rd1, ((vector unsigned char){7, 6, 15, 14, 23, 22, 31, 30}));
+        v_rd0 = vec_perm(v_rd2, v_rd3, ((vector unsigned char){7, 6, 15, 14, 23, 22, 31, 30}));
+        v_dst = vec_sld(v_dst, v_dst, 8);
+        v_dst = vec_sld(v_rd0, v_dst, 8);
+        vec_vsx_st((vector unsigned char)v_dst, 0, (unsigned char *)dst_addr);
+
+        src_addr += 64;
+        dst_addr += 16;
+    }
+
+    for (i = width_adj; i < width; i++) {
+        dst[i]= AV_RB16(src + 4*i + 3);
+    }
+    STOP_TIMER("8_vsx")
+}
+
+static void abgrToA_c_vsx(uint8_t *_dst, const uint8_t *src, 
+                          const uint8_t *unused1, const uint8_t *unused2, 
+                          int width, uint32_t *unused)
+{
+    START_TIMER
+    int16_t *dst = (int16_t *)_dst;
+    int i, width_adj;
+    vector unsigned short v_rd0, v_rd1, v_dst;
+    vector unsigned short v_FF = vec_splats((unsigned short)0x00FF);
+
+    uintptr_t src_addr = (uintptr_t)src;
+    uintptr_t dst_addr = (uintptr_t)dst;
+
+    // compute integral number of vector-length items and length of final fragment
+    width_adj = width & (~(int)0x07);
+
+    for ( i = 0; i < width_adj; i += 8) {
+        v_rd0 = vec_vsx_ld(0, (unsigned short *)(src_addr));
+        v_rd1 = vec_vsx_ld(0, (unsigned short *)(src_addr + 16));
+
+        v_rd0 = vec_and(v_rd0, v_FF);
+        v_rd1 = vec_and(v_rd1, v_FF);
+
+        v_rd0 = vec_sl(v_rd0, vec_splats((unsigned short)6));
+        v_rd1 = vec_sl(v_rd1, vec_splats((unsigned short)6));
+
+        v_dst = vec_perm(v_rd0, v_rd1, 
+                ((vector unsigned char){0, 1, 4, 5, 8, 9, 12, 13, 16, 17, 20, 21, 24, 25, 28, 29}));
+        vec_vsx_st((vector unsigned char)v_dst, 0, (unsigned char *)dst_addr);
+
+        src_addr += 32;
+        dst_addr += 16;
+    }
+
+    for (i = width_adj; i < width; i++) {
+        dst[i]= src[4*i]<<6;
+    }
+    STOP_TIMER("9_vsx")
+}
+
+static void rgbaToA_c_vsx(uint8_t *_dst, const uint8_t *src, 
+                          const uint8_t *unused1, const uint8_t *unused2,
+                          int width, uint32_t *unused)
+{
+    START_TIMER
+    int16_t *dst = (int16_t *)_dst;
+    int i, width_adj;
+    vector unsigned short v_rd0, v_rd1, v_dst;
+    vector unsigned short v_FF = vec_splats((unsigned short)0x00FF);
+
+    uintptr_t src_addr = (uintptr_t)src;
+    uintptr_t dst_addr = (uintptr_t)dst;
+
+    // compute integral number of vector-length items and length of final fragment
+    width_adj = width & (~(int)0x07);
+
+    for ( i = 0; i < width_adj; i += 8) {
+        v_rd0 = vec_vsx_ld(0, (unsigned short *)(src_addr));
+        v_rd1 = vec_vsx_ld(0, (unsigned short *)(src_addr + 16));
+
+        v_rd0 = vec_sld(v_rd0, v_rd0, 13);
+        v_rd1 = vec_sld(v_rd1, v_rd1, 13);
+
+        v_rd0 = vec_and(v_rd0, v_FF);
+        v_rd1 = vec_and(v_rd1, v_FF);
+
+        v_rd0 = vec_sl(v_rd0, vec_splats((unsigned short)6));
+        v_rd1 = vec_sl(v_rd1, vec_splats((unsigned short)6));
+
+        v_dst = vec_perm(v_rd0, v_rd1, 
+                ((vector unsigned char){0, 1, 4, 5, 8, 9, 12, 13, 16, 17, 20, 21, 24, 25, 28, 29}));
+        vec_vsx_st((vector unsigned char)v_dst, 0, (unsigned char *)dst_addr);
+
+        src_addr += 32;
+        dst_addr += 16;
+    }
+
+    for (i = width_adj; i < width; i++) {
+        dst[i]= src[4*i+3]<<6;
+    }
+    STOP_TIMER("10_vsx")
+}
+
+static void palToA_c_vsx(uint8_t *_dst, const uint8_t *src, 
+                         const uint8_t *unused1, const uint8_t *unused2, 
+                         int width, uint32_t *pal)
+{
+    START_TIMER
+    int16_t *dst = (int16_t *)_dst;
+    int i, j, d, width_adj;
+    uint32_t _pal[8];
+
+    vector unsigned short v_dst;
+    vector unsigned int v_rd0, v_rd1, v_rd3, v_rd4;
+    vector unsigned char sample;
+    vector unsigned int shift1;
+    vector unsigned short shift2;
+
+    uintptr_t dst_addr = (uintptr_t)_dst;
+
+    width_adj = width & (~(int)0x07);
+
+    if(width_adj){
+        sample = ((vector unsigned char){0, 1, 4, 5, 8, 9, 12, 13, 16, 17, 20, 21, 24, 25, 28, 29});
+        shift1 = ((vector unsigned int){24, 24, 24, 24});
+        shift2 = vec_splats((unsigned short)6);
+    }
+    for ( i = 0; i < width_adj; i += 8) {
+        for( j=0; j<8; ++j)
+            _pal[j] = pal[src[j]];
+        
+        v_rd0 = vec_vsx_ld(0, (unsigned int *)_pal);
+        v_rd1 = vec_vsx_ld(0, (unsigned int *)(&_pal[4]));
+        v_rd3 = vec_sr(v_rd0, shift1);
+        v_rd4 = vec_sr(v_rd1, shift1);
+        v_rd0 = vec_perm(v_rd3, v_rd4, sample);
+        v_dst = vec_sl((vector unsigned short)v_rd0, shift2);
+        vec_vsx_st((vector unsigned char)v_dst, 0, (unsigned char *)dst_addr);
+
+        src += 8;
+        dst_addr += 16;
+
+    }
+
+    for (i = width_adj; i < width; i++) {
+        d = *src;
+        dst[i]= (pal[d] >> 24)<<6;
+        ++src;
+    }
+    STOP_TIMER("11_vsx")
+}
+
+static void palToY_c_vsx(uint8_t *_dst, const uint8_t *src, 
+                         const uint8_t *unused1, const uint8_t *unused2, 
+                         int width, uint32_t *pal)
+{
+    START_TIMER
+    int16_t *dst = (int16_t *)_dst;
+    int i, j, d, width_adj;
+    uint32_t _pal[8];
+
+    vector unsigned short v_dst;
+    vector unsigned int v_rd0, v_rd1, v_rd3, v_rd4;
+    /* low-byte mask, defined locally (assumed not provided elsewhere in this file) */
+    const vector unsigned int v_000000FF = vec_splats((unsigned int)0x000000FF);
+    vector unsigned char sample;
+    vector unsigned short shift;
+
+    uintptr_t dst_addr = (uintptr_t)_dst;
+
+    width_adj = width & (~(int)0x07);
+
+    if(width_adj){
+        sample = ((vector unsigned char){0, 1, 4, 5, 8, 9, 12, 13, 16, 17, 20, 21, 24, 25, 28, 29});
+        shift = vec_splats((unsigned short)6);
+    }
+    for ( i = 0; i < width_adj; i += 8) {
+        for( j=0; j<8; ++j)
+            _pal[j] = pal[src[j]];
+
+        v_rd0 = vec_vsx_ld(0, (unsigned int *)_pal);
+        v_rd1 = vec_vsx_ld(0, (unsigned int *)(&_pal[4]));
+        v_rd3 = vec_and(v_rd0, v_000000FF);
+        v_rd4 = vec_and(v_rd1, v_000000FF);
+        v_rd0 = vec_perm(v_rd3, v_rd4, sample);
+        v_dst = vec_sl((vector unsigned short)v_rd0, shift);
+        vec_vsx_st((vector unsigned char)v_dst, 0, (unsigned char *)dst_addr);
+
+        src += 8;
+        dst_addr += 16;
+
+    }
+
+    for (i = width_adj; i < width; i++) {
+        d= *src;
+        dst[i] = (pal[d] & 0xFF)<<6;
+        src++;
+    }
+    STOP_TIMER("12_vsx")
+}
+
+static void palToUV_c_vsx(uint8_t *_dstU, uint8_t *_dstV,
+                          const uint8_t *unused0, const uint8_t *src1, 
+                          const uint8_t *src2, int width, uint32_t *pal)
+{
+    START_TIMER
+    uint16_t *dstU = (uint16_t *)_dstU;
+    int16_t *dstV = (int16_t *)_dstV;
+    int i, j, d, width_adj;
+    uint32_t _pal[8];
+    vector unsigned short v_FF = vec_splats((unsigned short)0x00FF);
+
+    vector unsigned short v_dst, v_tmp0, v_tmp1;
+    vector unsigned int v_rd0, v_rd1, shift1, shift2;
+    vector unsigned char sample;
+    vector unsigned short shift3;
+
+    uintptr_t dstU_addr = (uintptr_t)_dstU;
+    uintptr_t dstV_addr = (uintptr_t)_dstV;
+
+
+    av_assert1(src1 == src2);
+    width_adj = width & (~(int)0x07);
+
+    if(width_adj){
+        sample = ((vector unsigned char){0, 1, 4, 5, 8, 9, 12, 13, 16, 17, 20, 21, 24, 25, 28, 29});
+        shift1 = vec_splats((unsigned int)8);
+        shift2 = vec_splats((unsigned int)16);
+        shift3 = vec_splats((unsigned short)6);
+    }
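+    /* Each palette entry packs Y, U, V, A as bytes 0..3; shift1/shift2 move
+     * the U and V bytes down and shift3 applies the same <<6 scaling as the
+     * scalar tail below. */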
+    for ( i = 0; i < width_adj; i += 8) {
+        for( j = 0; j < 8; ++j)
+            _pal[j] = pal[src1[j]];
+
+        v_rd0 = vec_vsx_ld(0, (unsigned int *)_pal);
+        v_rd1 = vec_vsx_ld(0, (unsigned int *)(&_pal[4]));
+
+        v_tmp0 = (vector unsigned short)vec_sr(v_rd0, shift1);
+        v_tmp1 = (vector unsigned short)vec_sr(v_rd1, shift1);
+        v_dst = (vector unsigned short)vec_perm(v_tmp0, v_tmp1, sample);
+        v_tmp0 = vec_and(v_dst, v_FF);
+        v_dst = vec_sl((vector unsigned short)v_tmp0, shift3);
+        vec_vsx_st((vector unsigned char)v_dst, 0, (unsigned char *)dstU_addr);
+
+        v_tmp0 = (vector unsigned short)vec_sr(v_rd0, shift2);
+        v_tmp1 = (vector unsigned short)vec_sr(v_rd1, shift2);
+        v_dst = (vector unsigned short)vec_perm(v_tmp0, v_tmp1, sample);
+        v_tmp0 = vec_and(v_dst, v_FF);
+        v_dst = vec_sl((vector unsigned short)v_tmp0, shift3);
+        vec_vsx_st((vector unsigned char)v_dst, 0, (unsigned char *)dstV_addr);
+
+        src1 += 8;
+        dstU_addr += 16;
+        dstV_addr += 16;
+
+    }
+
+    for (i = width_adj; i < width; i++) {
+        d = pal[*src1];
+        dstU[i] = (uint8_t)(d>> 8)<<6;
+        dstV[i] = (uint8_t)(d>>16)<<6;
+        src1++;
+    }
+    STOP_TIMER("13_vsx")
+}
+
+static void monowhite2Y_c_vsx(uint8_t *_dst, const uint8_t *src, 
+                              const uint8_t *unused1, const uint8_t *unused2, 
+                              int width, uint32_t *unused)
+{
+    START_TIMER
+
+    int16_t *dst = (int16_t *)_dst;
+    int i, j, width_adj;
+    vector unsigned short v_rd0, v_dst;
+
+    uintptr_t dst_addr = (uintptr_t)dst;
+
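+    /* Each source byte packs 8 pixels: splat it across all lanes and shift by
+     * 7..0 so every bit expands to an int16 sample of 0 or 16383. Full bytes
+     * are handled here; the leftover width&7 bits fall to the scalar tail. */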
+    width_adj = width >> 3;
+    for (i = 0; i < width_adj; i++) {
+        v_rd0 = vec_splats((unsigned short)~src[i]);
+
+        v_dst = vec_sr(v_rd0, ((vector unsigned short){7, 6, 5, 4, 3, 2, 1, 0}));
+        v_dst = vec_and(v_dst, vec_splats((unsigned short)0x01));
+        v_dst = vec_mul(v_dst, vec_splats((unsigned short)16383));
+        vec_vsx_st((vector unsigned char)v_dst, 0, (unsigned char *)dst_addr);
+        dst_addr += 16;
+    }
+    if(width&7){
+        int d= ~src[i];
+        for (j = 0; j < (width&7); j++)
+            dst[8*i+j]= ((d>>(7-j))&1) * 16383;
+    }
+    STOP_TIMER("14_vsx")
+}
+
+static void monoblack2Y_c_vsx(uint8_t *_dst, const uint8_t *src, 
+                              const uint8_t *unused1, const uint8_t *unused2, 
+                              int width, uint32_t *unused)
+{
+    START_TIMER
+    int16_t *dst = (int16_t *)_dst;
+    int i, j, width_adj;
+    vector unsigned short v_rd0, v_dst;
+
+    uintptr_t dst_addr = (uintptr_t)dst;
+
+    width_adj = width >> 3;
+    for (i = 0; i < width_adj; i++) {
+        v_rd0 = vec_splats((unsigned short)src[i]);
+
+        v_dst = vec_sr(v_rd0, ((vector unsigned short){7, 6, 5, 4, 3, 2, 1, 0}));
+        v_dst = vec_and(v_dst, vec_splats((unsigned short)0x01));
+        v_dst = vec_mul(v_dst, vec_splats((unsigned short)16383));
+        vec_vsx_st((vector unsigned char)v_dst, 0, (unsigned char *)dst_addr);
+        dst_addr += 16;
+    }
+    if(width&7){
+        int d= src[i];
+        for (j = 0; j < (width&7); j++)
+            dst[8*i+j]= ((d>>(7-j))&1) * 16383;
+    }
+    STOP_TIMER("15_vsx")
+}
+
+static void yuy2ToY_c_vsx(uint8_t *dst, const uint8_t *src, 
+                         const uint8_t *unused1, const uint8_t *unused2, 
+                         int width, uint32_t *unused)
+{
+    START_TIMER
+    int i, width_adj;
+
+    uintptr_t src_addr = (uintptr_t)src;
+    uintptr_t dst_addr = (uintptr_t)dst;
+
+    width_adj = width & (~(int)0x0F);
+
+    for ( i = 0; i < width_adj; i += 16) {
+        vector int v_rd0 = vec_vsx_ld(0, (int *)src_addr);
+        vector int v_rd1 = vec_vsx_ld(0, (int *)(src_addr + 16));
+
+        vector int v_dst = vec_perm(v_rd0, v_rd1, 
+                           ((vector unsigned char){0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30}));
+        vec_vsx_st((vector unsigned char)v_dst, 0, (unsigned char *)dst_addr);
+
+        src_addr += 32;
+        dst_addr += 16;
+    }
+
+    for (i = width_adj; i < width; i++) {
+        dst[i] = src[2 * i];
+    }
+    STOP_TIMER("16_vsx")
+}
+
+static void yuy2ToUV_c_vsx(uint8_t *dstU, uint8_t *dstV, 
+                           const uint8_t *unused0, const uint8_t *src1,
+                           const uint8_t *src2, int width, uint32_t *unused)
+{
+    START_TIMER
+    int i, width_adj;
+    vector int v_rd0, v_rd1, v_rd2, v_rd3, v_dst;
+    vector unsigned char sample1, sample2;
+
+    uintptr_t src_addr = (uintptr_t)src1;
+    uintptr_t dstU_addr = (uintptr_t)dstU;
+    uintptr_t dstV_addr = (uintptr_t)dstV;
+
+    width_adj = width & (~(int)0x0F);
+
+    if(width_adj){
+        sample1 = ((vector unsigned char){1, 5, 9, 13, 17, 21, 25, 29, 3, 7, 11, 15, 19, 23, 27, 31});
+        sample2 = ((vector unsigned char){3, 7, 11, 15, 19, 23, 27, 31, 1, 5, 9, 13, 17, 21, 25, 29});
+    }
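+    /* U lives at bytes 4*i+1 and V at bytes 4*i+3; the two perms collect
+     * eight of each from the 32-byte halves, and vec_sld splices the partial
+     * results into whole output vectors. */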
+    for ( i = 0; i < width_adj; i += 16) {
+        v_rd0 = vec_vsx_ld(0, (int *)src_addr);
+        v_rd1 = vec_vsx_ld(0, (int *)(src_addr + 16));
+        v_rd2 = vec_perm(v_rd0, v_rd1, sample1);
+        src_addr += 32;
+
+        v_rd0 = vec_vsx_ld(0, (int *)src_addr);
+        v_rd1 = vec_vsx_ld(0, (int *)(src_addr + 16));
+        v_rd3 = vec_perm(v_rd0, v_rd1, sample2);
+        v_dst = vec_sld(v_rd2, v_rd3, 8);
+        v_dst = vec_sld(v_dst, v_dst, 8);
+        vec_vsx_st((vector unsigned char)v_dst, 0, (unsigned char *)dstU_addr);
+        v_dst = vec_sld(v_rd3, v_rd2, 8);
+        vec_vsx_st((vector unsigned char)v_dst, 0, (unsigned char *)dstV_addr);
+
+        src_addr += 32;
+        dstU_addr += 16;
+        dstV_addr += 16;
+    }
+
+    for (i = width_adj; i < width; i++) {
+        dstU[i] = src1[4 * i + 1];
+        dstV[i] = src1[4 * i + 3];
+    }
+    STOP_TIMER("17_vsx")
+    av_assert1(src1 == src2);
+}
+
+static void yvy2ToUV_c_vsx(uint8_t *dstU, uint8_t *dstV, 
+                           const uint8_t *unused0, const uint8_t *src1,
+                           const uint8_t *src2, int width, uint32_t *unused)
+{
+    START_TIMER
+    int i, width_adj;
+    vector int v_rd0, v_rd1, v_rd2, v_rd3, v_dst;
+    vector unsigned char sample1, sample2;
+
+    uintptr_t src_addr = (uintptr_t)src1;
+    uintptr_t dstU_addr = (uintptr_t)dstU;
+    uintptr_t dstV_addr = (uintptr_t)dstV;
+
+    width_adj = width & (~(int)0x0F);
+
+    if(width_adj){
+        sample1 = ((vector unsigned char){1, 5, 9, 13, 17, 21, 25, 29, 3, 7, 11, 15, 19, 23, 27, 31});
+        sample2 = ((vector unsigned char){3, 7, 11, 15, 19, 23, 27, 31, 1, 5, 9, 13, 17, 21, 25, 29});
+    }
+    for ( i = 0; i < width_adj; i += 16) {
+        v_rd0 = vec_vsx_ld(0, (int *)src_addr);
+        v_rd1 = vec_vsx_ld(0, (int *)(src_addr + 16));
+        v_rd2 = vec_perm(v_rd0, v_rd1, sample1);
+        src_addr += 32;
+
+        v_rd0 = vec_vsx_ld(0, (int *)src_addr);
+        v_rd1 = vec_vsx_ld(0, (int *)(src_addr + 16));
+        v_rd3 = vec_perm(v_rd0, v_rd1, sample2);
+        v_dst = vec_sld(v_rd2, v_rd3, 8);
+        v_dst = vec_sld(v_dst, v_dst, 8);
+        vec_vsx_st((vector unsigned char)v_dst, 0, (unsigned char *)dstV_addr);
+        v_dst = vec_sld(v_rd3, v_rd2, 8);
+        vec_vsx_st((vector unsigned char)v_dst, 0, (unsigned char *)dstU_addr);
+
+        src_addr += 32;
+        dstU_addr += 16;
+        dstV_addr += 16;
+    }
+
+
+    for (i = width_adj; i < width; i++) {
+        dstV[i] = src1[4 * i + 1];
+        dstU[i] = src1[4 * i + 3];
+    }
+    STOP_TIMER("18_vsx")
+    av_assert1(src1 == src2);
+}
+
+static void bswap16Y_c_vsx(uint8_t *_dst, const uint8_t *_src,
+                           const uint8_t *unused1, const uint8_t *unused2, 
+                           int width, uint32_t *unused)
+{
+    START_TIMER
+
+    int i, width_adj;
+    vector unsigned short v_rd0, v_rd1, v_dst, v_shift;
+
+    const uint16_t *src = (const uint16_t *)_src;
+    uint16_t *dst       = (uint16_t *)_dst;
+    uintptr_t src_addr = (uintptr_t)src;
+    uintptr_t dst_addr = (uintptr_t)dst;
+
+    width_adj = width & (~(int)0x07);
+
+    if(width_adj)
+        v_shift = (vector unsigned short)vec_splats((unsigned short)8);
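+    /* Byte-swap each 16-bit sample by OR-ing the left- and right-shifted
+     * copies, mirroring the scalar (x>>8)|(x<<8) in the tail loop. */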
+    for ( i = 0; i < width_adj; i += 8) {
+        v_dst = vec_vsx_ld(0, (unsigned short *)src_addr);
+
+        v_rd0 = vec_sl((vector unsigned short)v_dst, v_shift);
+        v_rd1 = vec_sr((vector unsigned short)v_dst, v_shift);
+        v_dst = vec_or(v_rd0, v_rd1);
+
+        vec_vsx_st((vector unsigned char)v_dst, 0, (unsigned char *)dst_addr);
+        
+        src_addr += 16;
+        dst_addr += 16;
+    }
+
+    for (i = width_adj; i < width; i++) {
+        dst[i] = (src[i]>>8) | (src[i]<<8);
+    }
+    STOP_TIMER("19_vsx")
+}
+
+static void bswap16UV_c_vsx(uint8_t *_dstU, uint8_t *_dstV, 
+                            const uint8_t *unused0, const uint8_t *_src1, 
+                            const uint8_t *_src2, int width, uint32_t *unused)
+{
+    START_TIMER
+    int i, width_adj;
+    vector unsigned short v_rd0, v_rd1, v_dst, v_shift;
+
+    const uint16_t *src1 = (const uint16_t *)_src1,
+                   *src2 = (const uint16_t *)_src2;
+    uint16_t *dstU       = (uint16_t *)_dstU,
+             *dstV       = (uint16_t *)_dstV;
+    uintptr_t src1_addr = (uintptr_t)_src1,
+              src2_addr = (uintptr_t)_src2;
+    uintptr_t dstU_addr = (uintptr_t)dstU,
+              dstV_addr = (uintptr_t)dstV;
+
+    width_adj = width & (~(int)0x07);
+
+    if(width_adj)
+        v_shift = (vector unsigned short)vec_splats((unsigned short)8);
+    for ( i = 0; i < width_adj; i += 8) {
+        // load to dstU
+        v_dst = vec_vsx_ld(0, (unsigned short *)src1_addr);
+        v_rd0 = vec_sl((vector unsigned short)v_dst, v_shift);
+        v_rd1 = vec_sr((vector unsigned short)v_dst, v_shift);
+        v_dst = vec_or(v_rd0, v_rd1);
+        vec_vsx_st((vector unsigned char)v_dst, 0, (unsigned char *)dstU_addr);
+        // load to dstV
+        v_dst = vec_vsx_ld(0, (unsigned short *)src2_addr);
+        v_rd0 = vec_sl((vector unsigned short)v_dst, v_shift);
+        v_rd1 = vec_sr((vector unsigned short)v_dst, v_shift);
+        v_dst = vec_or(v_rd0, v_rd1);
+        vec_vsx_st((vector unsigned char)v_dst, 0, (unsigned char *)dstV_addr);
+        //
+        src1_addr += 16;
+        src2_addr += 16;
+        dstU_addr += 16;
+        dstV_addr += 16;
+    }
+    for (i = width_adj; i < width; i++) {
+        dstU[i] = (src1[i]>>8) | (src1[i]<<8);
+        dstV[i] = (src2[i]>>8) | (src2[i]<<8);
+    }
+    STOP_TIMER("20_vsx")
+}
+
+static void read_ya16le_gray_c_vsx(uint8_t *dst, const uint8_t *src, 
+                                   const uint8_t *unused1, const uint8_t *unused2, 
+                                   int width, uint32_t *unused)
+{
+    START_TIMER
+    int i, width_adj;
+    vector int v_rd0, v_rd1, v_dst;
+    vector unsigned char sample;
+
+    uintptr_t src_addr = (uintptr_t)src;
+    uintptr_t dst_addr = (uintptr_t)dst;
+
+    width_adj = width & (~(int)0x07);
+
+    if(width_adj){
+        sample = ((vector unsigned char){0, 1, 4, 5, 8, 9, 12, 13, 16, 17, 20, 21, 24, 25, 28, 29});
+    }
+    for ( i = 0; i < width_adj; i += 8) {
+        v_rd0 = vec_vsx_ld(0, (int *)src_addr);
+        v_rd1 = vec_vsx_ld(0, (int *)(src_addr + 16));
+        v_dst = vec_perm(v_rd0, v_rd1, sample);
+        
+        vec_vsx_st((vector unsigned char)v_dst, 0, (unsigned char *)dst_addr);
+
+        src_addr += 32;
+        dst_addr += 16;
+    }
+
+
+    for (i = width_adj; i < width; i++) {
+        AV_WN16(dst + i * 2, AV_RL16(src + i * 4));
+    }
+    STOP_TIMER("21_vsx")
+}
+
+static void read_ya16le_alpha_c_vsx(uint8_t *dst, const uint8_t *src, 
+                                    const uint8_t *unused1, const uint8_t *unused2, 
+                                    int width, uint32_t *unused)
+{
+    START_TIMER
+    int i, width_adj;
+    vector int v_rd0, v_rd1, v_dst;
+    vector unsigned char sample;
+
+    uintptr_t src_addr = (uintptr_t)src;
+    uintptr_t dst_addr = (uintptr_t)dst;
+
+    width_adj = width & (~(int)0x07);
+
+    if(width_adj){
+        sample = ((vector unsigned char){2, 3, 6, 7, 10, 11, 14, 15, 18, 19, 22, 23, 26, 27, 30, 31});
+    }
+    for ( i = 0; i < width_adj; i += 8) {
+        v_rd0 = vec_vsx_ld(0, (int *)src_addr);
+        v_rd1 = vec_vsx_ld(0, (int *)(src_addr + 16));
+        v_dst = vec_perm(v_rd0, v_rd1, sample);
+        
+        vec_vsx_st((vector unsigned char)v_dst, 0, (unsigned char *)dst_addr);
+
+        src_addr += 32;
+        dst_addr += 16;
+    }
+
+
+    for (i = width_adj; i < width; i++) {
+        AV_WN16(dst + i * 2, AV_RL16(src + i * 4 + 2));
+    }
+    STOP_TIMER("22_vsx")
+}
+
+static void read_ya16be_gray_c_vsx(uint8_t *dst, const uint8_t *src, 
+                                   const uint8_t *unused1, const uint8_t *unused2, 
+                                   int width, uint32_t *unused)
+{
+    START_TIMER
+
+    int i, width_adj;
+
+    vector int v_rd0, v_rd1, v_dst;
+    vector unsigned char sample;
+
+    uintptr_t src_addr = (uintptr_t)src;
+    uintptr_t dst_addr = (uintptr_t)dst;
+
+    width_adj = width & (~(int)0x07);
+
+    if(width_adj){
+        sample = ((vector unsigned char){1, 0, 5, 4, 9, 8, 13, 12, 17, 16, 21, 20, 25, 24, 29, 28});
+    }
+    for ( i = 0; i < width_adj; i += 8) {
+        v_rd0 = vec_vsx_ld(0, (int *)src_addr);
+        v_rd1 = vec_vsx_ld(0, (int *)(src_addr + 16));
+        v_dst = vec_perm(v_rd0, v_rd1, sample);
+        
+        vec_vsx_st((vector unsigned char)v_dst, 0, (unsigned char *)dst_addr);
+
+        src_addr += 32;
+        dst_addr += 16;
+    }
+
+
+    for (i = width_adj; i < width; i++) {
+        AV_WN16(dst + i * 2, AV_RB16(src + i * 4));
+    }
+    STOP_TIMER("23_vsx")
+}
+
+static void read_ya16be_alpha_c_vsx(uint8_t *dst, const uint8_t *src, 
+                                    const uint8_t *unused1, const uint8_t *unused2, 
+                                    int width, uint32_t *unused)
+{
+    START_TIMER
+    int i, width_adj;
+
+    vector int v_rd0, v_rd1, v_dst;
+    vector unsigned char sample;
+
+    uintptr_t src_addr = (uintptr_t)src;
+    uintptr_t dst_addr = (uintptr_t)dst;
+
+    width_adj = width & (~(int)0x07);
+
+    if(width_adj){
+        sample = ((vector unsigned char){3, 2, 7, 6, 11, 10, 15, 14, 19, 18, 23, 22, 27, 26, 31, 30});
+    }
+    for ( i = 0; i < width_adj; i += 8) {
+        v_rd0 = vec_vsx_ld(0, (int *)src_addr);
+        v_rd1 = vec_vsx_ld(0, (int *)(src_addr + 16));
+        v_dst = vec_perm(v_rd0, v_rd1, sample);
+        
+        vec_vsx_st((vector unsigned char)v_dst, 0, (unsigned char *)dst_addr);
+
+        src_addr += 32;
+        dst_addr += 16;
+    }
+
+
+    for (i = width_adj; i < width; i++) {
+        AV_WN16(dst + i * 2, AV_RB16(src + i * 4 + 2));
+    }
+    STOP_TIMER("24_vsx")
+
+}
+
+static void read_ayuv64le_Y_c_vsx(uint8_t *dst, const uint8_t *src, 
+                                  const uint8_t *unused0, const uint8_t *unused1, 
+                                  int width, uint32_t *unused2)
+{
+    START_TIMER
+    int i, width_adj;
+
+    vector int v_rd0, v_rd1, v_rd2, v_rd3, v_dst;
+    vector unsigned char sample1, sample2;
+
+    uintptr_t src_addr = (uintptr_t)src;
+    uintptr_t dst_addr = (uintptr_t)dst;
+
+    width_adj = width & (~(int)0x07);
+
+    if(width_adj){
+        sample1 = ((vector unsigned char){0, 0, 0, 0, 0, 0, 0, 0, 2, 3, 10, 11, 18, 19, 26, 27});
+        sample2 = ((vector unsigned char){2, 3, 10, 11, 18, 19, 26, 27, 0, 0, 0, 0, 0, 0, 0, 0});
+    }
+    for ( i = 0; i < width_adj; i += 8) {
+        v_rd0 = vec_vsx_ld(0, (int *)src_addr);
+        v_rd1 = vec_vsx_ld(0, (int *)(src_addr + 16));
+        v_rd2 = vec_vsx_ld(0, (int *)(src_addr+32));
+        v_rd3 = vec_vsx_ld(0, (int *)(src_addr + 48));
+
+        v_rd0 = vec_perm(v_rd0, v_rd1, sample1);
+        v_rd2 = vec_perm(v_rd2, v_rd3, sample2);
+        v_dst = vec_sld(v_rd2, v_rd0, 8);
+        vec_vsx_st((vector unsigned char)v_dst, 0, (unsigned char *)dst_addr);
+
+        src_addr += 64;
+        dst_addr += 16;
+    }
+
+
+    for (i = width_adj; i < width; i++) {
+        AV_WN16(dst + i * 2, AV_RL16(src + i * 8 + 2));
+    }
+    STOP_TIMER("25_vsx")
+}
+
+
+static void read_ayuv64le_UV_c_vsx(uint8_t *dstU, uint8_t *dstV, 
+                                   const uint8_t *unused0, const uint8_t *src, 
+                                   const uint8_t *unused1, int width, uint32_t *unused2)
+{
+    START_TIMER
+    int i, width_adj;
+
+    vector int v_rd0, v_rd1, v_rd2, v_rd3, v_rd4, v_rd5, v_dst;
+    vector unsigned char sample1, sample2;
+
+    uintptr_t src_addr = (uintptr_t)src;
+    uintptr_t dstU_addr = (uintptr_t)dstU;
+    uintptr_t dstV_addr = (uintptr_t)dstV;
+
+    width_adj = width & (~(int)0x07);
+
+    if(width_adj){
+        sample1 = ((vector unsigned char){6, 7, 14, 15, 22, 23, 30, 31, 4, 5, 12, 13, 20, 21, 28, 29});
+        sample2 = ((vector unsigned char){4, 5, 12, 13, 20, 21, 28, 29, 6, 7, 14, 15, 22, 23, 30, 31});
+    }
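+    /* AYUV64LE stores A,Y,U,V as little-endian 16-bit words; sample1/sample2
+     * gather the U (offset 4) and V (offset 6) words of four pixels each. */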
+    for ( i = 0; i < width_adj; i += 8) {
+        v_rd0 = vec_vsx_ld(0, (int *)src_addr);
+        v_rd1 = vec_vsx_ld(0, (int *)(src_addr + 16));
+        v_rd2 = vec_vsx_ld(0, (int *)(src_addr+32));
+        v_rd3 = vec_vsx_ld(0, (int *)(src_addr + 48));
+
+        v_rd4 = vec_perm(v_rd0, v_rd1, sample1);
+        v_rd5 = vec_perm(v_rd2, v_rd3, sample2);
+        v_dst = vec_sld(v_rd5, v_rd4, 8);
+        vec_vsx_st((vector unsigned char)v_dst, 0, (unsigned char *)dstU_addr);
+        v_dst = vec_sld(v_rd4, v_rd5, 8);
+        v_dst = vec_sld(v_dst, v_dst, 8);
+        vec_vsx_st((vector unsigned char)v_dst, 0, (unsigned char *)dstV_addr);
+
+        src_addr += 64;
+        dstU_addr += 16;
+        dstV_addr += 16;
+    }
+
+
+    for (i = width_adj; i < width; i++) {
+        AV_WN16(dstU + i * 2, AV_RL16(src + i * 8 + 4));
+        AV_WN16(dstV + i * 2, AV_RL16(src + i * 8 + 6));
+    }
+    STOP_TIMER("26_vsx")
+}
+
+static void read_ayuv64le_A_c_vsx(uint8_t *dst, const uint8_t *src, 
+                                  const uint8_t *unused0, const uint8_t *unused1, 
+                                  int width, uint32_t *unused2)
+{
+    START_TIMER
+    int i, width_adj;
+
+    vector int v_rd0, v_rd1, v_rd2, v_rd3, v_dst;
+    vector unsigned char sample1, sample2;
+
+    uintptr_t src_addr = (uintptr_t)src;
+    uintptr_t dst_addr = (uintptr_t)dst;
+
+    width_adj = width & (~(int)0x07);
+
+    if(width_adj){
+        sample1 = ((vector unsigned char){0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 8, 9, 16, 17, 24, 25});
+        sample2 = ((vector unsigned char){0, 1, 8, 9, 16, 17, 24, 25, 0, 0, 0, 0, 0, 0, 0, 0});
+    }
+    for ( i = 0; i < width_adj; i += 8) {
+        v_rd0 = vec_vsx_ld(0, (int *)src_addr);
+        v_rd1 = vec_vsx_ld(0, (int *)(src_addr + 16));
+        v_rd2 = vec_vsx_ld(0, (int *)(src_addr+32));
+        v_rd3 = vec_vsx_ld(0, (int *)(src_addr + 48));
+
+        v_rd0 = vec_perm(v_rd0, v_rd1, sample1);
+        v_rd2 = vec_perm(v_rd2, v_rd3, sample2);
+        v_dst = vec_sld(v_rd2, v_rd0, 8);
+        vec_vsx_st((vector unsigned char)v_dst, 0, (unsigned char *)dst_addr);
+
+        src_addr += 64;
+        dst_addr += 16;
+    }
+
+
+    for (i = width_adj; i < width; i++) {
+        AV_WN16(dst + i * 2, AV_RL16(src + i * 8));
+    }
+    STOP_TIMER("27_vsx")
+}
+
+/* This is almost identical to the previous function, and exists only because
+ * yuy2To(Y/UV)(dst, src + 1, ...) would have 100% unaligned accesses. */
+static void uyvyToY_c_vsx(uint8_t *dst, const uint8_t *src, 
+                          const uint8_t *unused1, const uint8_t *unused2, 
+                          int width, uint32_t *unused)
+{
+    START_TIMER
+    int i, width_adj;
+
+    vector int v_rd0, v_rd1, v_dst;
+    vector unsigned char sample1;
+
+    uintptr_t src_addr = (uintptr_t)src;
+    uintptr_t dst_addr = (uintptr_t)dst;
+
+    width_adj = width & (~(int)0x0F);
+
+    if(width_adj){
+        sample1 = ((vector unsigned char){1, 3, 5, 7, 9, 11, 13, 15, 17, 19, 21, 23, 25, 27, 29, 31});
+    }
+    for ( i = 0; i < width_adj; i += 16) {
+        v_rd0 = vec_vsx_ld(0, (int *)src_addr);
+        v_rd1 = vec_vsx_ld(0, (int *)(src_addr + 16));
+
+        v_dst = vec_perm(v_rd0, v_rd1, sample1);
+        vec_vsx_st((vector unsigned char)v_dst, 0, (unsigned char *)dst_addr);
+
+        src_addr += 32;
+        dst_addr += 16;
+    }
+
+
+    for (i = width_adj; i < width; i++) {
+        dst[i] = src[2 * i + 1];
+    }
+    STOP_TIMER("28_vsx")
+}
+
+static void uyvyToUV_c_vsx(uint8_t *dstU, uint8_t *dstV, 
+                           const uint8_t *unused0, const uint8_t *src1, 
+                           const uint8_t *src2, int width, uint32_t *unused)
+{
+    START_TIMER
+    int i, width_adj;
+
+    vector int v_rd0, v_rd1, v_rd2, v_rd3, v_rd4, v_rd5, v_dst;
+    vector unsigned char sample1, sample2;
+
+    uintptr_t src_addr = (uintptr_t)src1;
+    uintptr_t dstU_addr = (uintptr_t)dstU;
+    uintptr_t dstV_addr = (uintptr_t)dstV;
+
+    width_adj = width & (~(int)0x0F);
+
+    if(width_adj){
+        sample1 = ((vector unsigned char){2, 6, 10, 14, 18, 22, 26, 30, 0, 4, 8, 12, 16, 20, 24, 28});
+        sample2 = ((vector unsigned char){0, 4, 8, 12, 16, 20, 24, 28, 2, 6, 10, 14, 18, 22, 26, 30});
+    }
+    for ( i = 0; i < width_adj; i += 16) {
+        v_rd0 = vec_vsx_ld(0, (int *)src_addr);
+        v_rd1 = vec_vsx_ld(0, (int *)(src_addr + 16));
+        v_rd2 = vec_vsx_ld(0, (int *)(src_addr+32));
+        v_rd3 = vec_vsx_ld(0, (int *)(src_addr + 48));
+
+        v_rd4 = vec_perm(v_rd0, v_rd1, sample1);
+        v_rd5 = vec_perm(v_rd2, v_rd3, sample2);
+        v_dst = vec_sld(v_rd5, v_rd4, 8);
+        vec_vsx_st((vector unsigned char)v_dst, 0, (unsigned char *)dstU_addr);
+        v_dst = vec_sld(v_rd4, v_rd5, 8);
+        v_dst = vec_sld(v_dst, v_dst, 8);
+        vec_vsx_st((vector unsigned char)v_dst, 0, (unsigned char *)dstV_addr);
+
+        src_addr += 64;
+        dstU_addr += 16;
+        dstV_addr += 16;
+    }
+
+
+    for (i = width_adj; i < width; i++) {
+        dstU[i] = src1[4 * i + 0];
+        dstV[i] = src1[4 * i + 2];
+    }
+    STOP_TIMER("29_vsx")
+    av_assert1(src1 == src2);
+}
+
+static av_always_inline void nvXXtoUV_c_vsx(uint8_t *dst1, uint8_t *dst2,
+                                        const uint8_t *src, int width)
+{
+    START_TIMER
+    int i, width_adj;
+
+    vector int v_rd0, v_rd1, v_dst;
+    vector unsigned char sample1, sample2;
+
+    uintptr_t src_addr = (uintptr_t)src;
+    uintptr_t dst1_addr = (uintptr_t)dst1;
+    uintptr_t dst2_addr = (uintptr_t)dst2;
+
+    width_adj = width & (~(int)0x0F);
+
+    if(width_adj){
+        sample1 = ((vector unsigned char){0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30});
+        sample2 = ((vector unsigned char){1, 3, 5, 7, 9, 11, 13, 15, 17, 19, 21, 23, 25, 27, 29, 31});
+    }
+    for ( i = 0; i < width_adj; i += 16) {
+        v_rd0 = vec_vsx_ld(0, (int *)src_addr);
+        v_rd1 = vec_vsx_ld(0, (int *)(src_addr + 16));
+
+        v_dst = vec_perm(v_rd0, v_rd1, sample1);
+        vec_vsx_st((vector unsigned char)v_dst, 0, (unsigned char *)dst1_addr);
+        v_dst = vec_perm(v_rd0, v_rd1, sample2);
+        vec_vsx_st((vector unsigned char)v_dst, 0, (unsigned char *)dst2_addr);
+
+        src_addr += 32;
+        dst1_addr += 16;
+        dst2_addr += 16;
+    }
+
+    for (i = width_adj; i < width; i++) {
+        dst1[i] = src[2 * i + 0];
+        dst2[i] = src[2 * i + 1];
+    }
+    STOP_TIMER("30_vsx")
+}
+
+static void nv12ToUV_c_vsx(uint8_t *dstU, uint8_t *dstV,
+                       const uint8_t *unused0, const uint8_t *src1, const uint8_t *src2,
+                       int width, uint32_t *unused)
+{
+    nvXXtoUV_c_vsx(dstU, dstV, src1, width);
+}
+
+static void nv21ToUV_c_vsx(uint8_t *dstU, uint8_t *dstV,
+                       const uint8_t *unused0, const uint8_t *src1, const uint8_t *src2,
+                       int width, uint32_t *unused)
+{
+    nvXXtoUV_c_vsx(dstV, dstU, src1, width);
+}
+
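+/* P010 keeps its 10 significant bits in the top of each 16-bit word,
+ * so conversion to the 16-bit intermediate is a right shift by 6, done
+ * on eight samples per pass (plus a byte swap for the BE variants). */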
+static void p010LEToY_c_vsx(uint8_t *dst, const uint8_t *src, 
+                           const uint8_t *unused1, const uint8_t *unused2, 
+                           int width, uint32_t *unused)
+{
+    START_TIMER
+    int i, width_adj;
+    vector unsigned short v_rd0, v_dst, shift;
+
+    uintptr_t src_addr = (uintptr_t)src;
+    uintptr_t dst_addr = (uintptr_t)dst;
+
+    width_adj = width & (~(int)0x07);
+
+    if(width_adj)
+        shift = vec_splats((unsigned short)6);
+    for ( i = 0; i < width_adj; i += 8) {
+        v_rd0 = vec_vsx_ld(0, (unsigned short *)src_addr);
+        
+        v_dst = vec_sr(v_rd0, shift);
+        vec_vsx_st((vector unsigned char)v_dst, 0, (unsigned char *)dst_addr);
+
+        src_addr += 16;
+        dst_addr += 16;
+    }
+
+
+    for (i = width_adj; i < width; i++) {
+        AV_WN16(dst + i * 2, AV_RL16(src + i * 2) >> 6);
+    }
+    STOP_TIMER("31_vsx")
+}
+
+static void p010BEToY_c_vsx(uint8_t *dst, const uint8_t *src, 
+                            const uint8_t *unused1, const uint8_t *unused2, 
+                            int width, uint32_t *unused)
+{
+    START_TIMER
+
+    int i, width_adj;
+    vector unsigned short v_rd0, v_rd1, v_dst, shift;
+    vector unsigned char sample;
+
+    uintptr_t src_addr = (uintptr_t)src;
+    uintptr_t dst_addr = (uintptr_t)dst;
+
+    width_adj = width & (~(int)0x07);
+
+    if(width_adj){
+        sample = ((vector unsigned char){1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14});
+        shift = vec_splats((unsigned short)6);
+    }
+    for ( i = 0; i < width_adj; i += 8) {
+        v_rd0 = vec_vsx_ld(0, (unsigned short *)src_addr);
+        
+        v_rd1 = vec_perm(v_rd0, v_rd0, sample);
+        v_dst = vec_sr(v_rd1, shift);
+        vec_vsx_st((vector unsigned char)v_dst, 0, (unsigned char *)dst_addr);
+
+        src_addr += 16;
+        dst_addr += 16;
+    }
+
+    for (i = width_adj; i < width; i++) {
+        AV_WN16(dst + i * 2, AV_RB16(src + i * 2) >> 6);
+    }
+    STOP_TIMER("32_vsx")
+}
+
+static void p010LEToUV_c_vsx(uint8_t *dstU, uint8_t *dstV,
+                       const uint8_t *unused0, const uint8_t *src1, 
+                       const uint8_t *src2, int width, uint32_t *unused)
+{
+    START_TIMER
+    int i, width_adj;
+
+    vector unsigned short v_rd0, v_rd1, v_dst;
+    vector unsigned char sample1, sample2;
+    vector unsigned short shift;
+
+    uintptr_t src_addr = (uintptr_t)src1;
+    uintptr_t dstU_addr = (uintptr_t)dstU;
+    uintptr_t dstV_addr = (uintptr_t)dstV;
+
+    width_adj = width & (~(int)0x07);
+
+    if(width_adj){
+        sample1 = ((vector unsigned char){0, 1, 4, 5, 8, 9, 12, 13, 16, 17, 20, 21, 24, 25, 28, 29});
+        sample2 = ((vector unsigned char){2, 3, 6, 7, 10, 11, 14, 15, 18, 19, 22, 23, 26, 27, 30, 31});
+        shift = vec_splats((unsigned short)6);
+    }
+    for ( i = 0; i < width_adj; i += 8) {
+        v_rd0 = vec_vsx_ld(0, (unsigned short *)src_addr);
+        v_rd1 = vec_vsx_ld(0, (unsigned short *)(src_addr + 16));
+
+        v_dst = vec_perm(v_rd0, v_rd1, sample1);
+        v_dst = vec_sr(v_dst, shift);
+        vec_vsx_st((vector unsigned char)v_dst, 0, (unsigned char *)dstU_addr);
+        v_dst = vec_perm(v_rd0, v_rd1, sample2);
+        v_dst = vec_sr(v_dst, shift);
+        vec_vsx_st((vector unsigned char)v_dst, 0, (unsigned char *)dstV_addr);
+
+        src_addr += 32;
+        dstU_addr += 16;
+        dstV_addr += 16;
+    }
+
+
+    for (i = width_adj; i < width; i++) {
+        AV_WN16(dstU + i * 2, AV_RL16(src1 + i * 4 + 0) >> 6);
+        AV_WN16(dstV + i * 2, AV_RL16(src1 + i * 4 + 2) >> 6);
+    }
+    STOP_TIMER("33_vsx")
+}
+
+static void p010BEToUV_c_vsx(uint8_t *dstU, uint8_t *dstV,
+                       const uint8_t *unused0, const uint8_t *src1, 
+                       const uint8_t *src2, int width, uint32_t *unused)
+{
+    START_TIMER
+    int i, width_adj;
+
+    vector unsigned short v_rd0, v_rd1, v_dst;
+    vector unsigned char sample1, sample2;
+    vector unsigned short shift;
+
+    uintptr_t src_addr = (uintptr_t)src1;
+    uintptr_t dstU_addr = (uintptr_t)dstU;
+    uintptr_t dstV_addr = (uintptr_t)dstV;
+
+    width_adj = width & (~(int)0x07);
+
+    if(width_adj){
+        sample1 = ((vector unsigned char){1, 0, 5, 4, 9, 8, 13, 12, 17, 16, 21, 20, 25, 24, 29, 28});
+        sample2 = ((vector unsigned char){3, 2, 7, 6, 11, 10, 15, 14, 19, 18, 23, 22, 27, 26, 31, 30});
+        shift = vec_splats((unsigned short)6);
+    }
+    for ( i = 0; i < width_adj; i += 8) {
+        v_rd0 = vec_vsx_ld(0, (unsigned short *)src_addr);
+        v_rd1 = vec_vsx_ld(0, (unsigned short *)(src_addr + 16));
+
+        v_dst = vec_perm(v_rd0, v_rd1, sample1);
+        v_dst = vec_sr(v_dst, shift);
+        vec_vsx_st((vector unsigned char)v_dst, 0, (unsigned char *)dstU_addr);
+        v_dst = vec_perm(v_rd0, v_rd1, sample2);
+        v_dst = vec_sr(v_dst, shift);
+        vec_vsx_st((vector unsigned char)v_dst, 0, (unsigned char *)dstV_addr);
+
+        src_addr += 32;
+        dstU_addr += 16;
+        dstV_addr += 16;
+    }
+
+
+    for (i = width_adj; i < width; i++) {
+        AV_WN16(dstU + i * 2, AV_RB16(src1 + i * 4 + 0) >> 6);
+        AV_WN16(dstV + i * 2, AV_RB16(src1 + i * 4 + 2) >> 6);
+
+    }
+    STOP_TIMER("34_vsx")
+}
+
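+/* P016 uses all 16 bits, so chroma extraction is a pure deinterleave
+ * of the CbCr words with no shift. */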
+static void p016LEToUV_c_vsx(uint8_t *dstU, uint8_t *dstV,
+                       const uint8_t *unused0, const uint8_t *src1, 
+                       const uint8_t *src2, int width, uint32_t *unused)
+{
+    START_TIMER
+    int i, width_adj;
+
+    vector unsigned short v_rd0, v_rd1, v_dst;
+    vector unsigned char sample1, sample2;
+
+    uintptr_t src_addr = (uintptr_t)src1;
+    uintptr_t dstU_addr = (uintptr_t)dstU;
+    uintptr_t dstV_addr = (uintptr_t)dstV;
+
+    width_adj = width & (~(int)0x07);
+
+    if(width_adj){
+        sample1 = ((vector unsigned char){0, 1, 4, 5, 8, 9, 12, 13, 16, 17, 20, 21, 24, 25, 28, 29});
+        sample2 = ((vector unsigned char){2, 3, 6, 7, 10, 11, 14, 15, 18, 19, 22, 23, 26, 27, 30, 31});
+    }
+    for ( i = 0; i < width_adj; i += 8) {
+        v_rd0 = vec_vsx_ld(0, (unsigned short *)src_addr);
+        v_rd1 = vec_vsx_ld(0, (unsigned short *)(src_addr + 16));
+
+        v_dst = vec_perm(v_rd0, v_rd1, sample1);
+        vec_vsx_st((vector unsigned char)v_dst, 0, (unsigned char *)dstU_addr);
+        v_dst = vec_perm(v_rd0, v_rd1, sample2);
+        vec_vsx_st((vector unsigned char)v_dst, 0, (unsigned char *)dstV_addr);
+
+        src_addr += 32;
+        dstU_addr += 16;
+        dstV_addr += 16;
+    }
+
+
+    for (i = width_adj; i < width; i++) {
+        AV_WN16(dstU + i * 2, AV_RL16(src1 + i * 4 + 0));
+        AV_WN16(dstV + i * 2, AV_RL16(src1 + i * 4 + 2));
+    }
+    STOP_TIMER("35_vsx")
+}
+
+static void p016BEToUV_c_vsx(uint8_t *dstU, uint8_t *dstV, 
+                       const uint8_t *unused0, const uint8_t *src1, 
+                       const uint8_t *src2, int width, uint32_t *unused)
+{
+    START_TIMER
+    int i, width_adj;
+
+    vector unsigned short v_rd0, v_rd1, v_dst;
+    vector unsigned char sample1, sample2;
+
+    uintptr_t src_addr = (uintptr_t)src1;
+    uintptr_t dstU_addr = (uintptr_t)dstU;
+    uintptr_t dstV_addr = (uintptr_t)dstV;
+
+    width_adj = width & (~(int)0x07);
+
+    if(width_adj){
+        sample1 = ((vector unsigned char){1, 0, 5, 4, 9, 8, 13, 12, 17, 16, 21, 20, 25, 24, 29, 28});
+        sample2 = ((vector unsigned char){3, 2, 7, 6, 11, 10, 15, 14, 19, 18, 23, 22, 27, 26, 31, 30});
+    }
+    for ( i = 0; i < width_adj; i += 8) {
+        v_rd0 = vec_vsx_ld(0, (unsigned short *)src_addr);
+        v_rd1 = vec_vsx_ld(0, (unsigned short *)(src_addr + 16));
+
+        v_dst = vec_perm(v_rd0, v_rd1, sample1);
+        vec_vsx_st((vector unsigned char)v_dst, 0, (unsigned char *)dstU_addr);
+        v_dst = vec_perm(v_rd0, v_rd1, sample2);
+        vec_vsx_st((vector unsigned char)v_dst, 0, (unsigned char *)dstV_addr);
+
+        src_addr += 32;
+        dstU_addr += 16;
+        dstV_addr += 16;
+    }
+
+
+    for (i = width_adj; i < width; i++) {
+        AV_WN16(dstU + i * 2, AV_RB16(src1 + i * 4 + 0));
+        AV_WN16(dstV + i * 2, AV_RB16(src1 + i * 4 + 2));
+    }
+    STOP_TIMER("36_vsx")
+}
+
+#define input_pixel(pos) (isBE(origin) ? AV_RB16(pos) : AV_RL16(pos))
+
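+/* Packed 24-bit BGR/RGB: vec_perm de-interleaves the three channels
+ * from 24 input bytes, vec_mergeh/vec_mergel zero-extend them to
+ * 32 bits, and luma is a fixed-point dot product with the rgb2yuv
+ * coefficients.  Scalar reference for one pixel, identical to the
+ * tail loop below:
+ *
+ *     dst[i] = (ry*r + gy*g + by*b + (0x801 << (RGB2YUV_SHIFT - 7)))
+ *                  >> (RGB2YUV_SHIFT - 6);
+ */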
+static void bgr24ToY_c_vsx(uint8_t *_dst, const uint8_t *src, 
+                           const uint8_t *unused1, const uint8_t *unused2, 
+                           int width, uint32_t *rgb2yuv)
+{
+    START_TIMER
+    int i, width_adj;
+    vector unsigned short v_rd0, v_rd1, v_rd2, v_g, v_b, v_r;  
+    vector unsigned int v_dst1, v_dst2;
+    vector unsigned int shift1, shift2;
+    int16_t *dst = (int16_t *)_dst;
+    vector signed int v_ry, v_gy, v_by;
+    int32_t ry = rgb2yuv[RY_IDX], gy = rgb2yuv[GY_IDX], by = rgb2yuv[BY_IDX];
+    vector unsigned short v_null = vec_splats((unsigned short)0x0000);
+    vector unsigned short v_FF = vec_splats((unsigned short)0x00FF);
+
+    uintptr_t src_addr = (uintptr_t)src;
+    uintptr_t dst_addr = (uintptr_t)_dst;
+
+
+    width_adj = width&(~(int)0x07);
+
+    if(width_adj){
+        shift1 = vec_splats((unsigned int)(0x801<<(RGB2YUV_SHIFT-7)));
+        shift2 = vec_splats((unsigned int)(RGB2YUV_SHIFT-6));
+        v_ry = vec_splats((signed int)ry);
+        v_gy = vec_splats((signed int)gy);
+        v_by = vec_splats((signed int)by);
+    }
+
+    for (i = 0; i < width_adj; i+=8) {
+        v_rd0 = vec_vsx_ld(0, (unsigned short *)src_addr);
+        v_rd1 = vec_vsx_ld(0, (unsigned short *)(src_addr+16));
+
+        v_b = vec_perm(v_rd0, v_rd1, 
+              ((vector unsigned char){0, 0, 3, 0, 6, 0, 9, 0, 12, 0, 15, 0, 18, 0, 21, 0}));
+        v_g = vec_perm(v_rd0, v_rd1, 
+              ((vector unsigned char){1, 0, 4, 0, 7, 0, 10, 0, 13, 0, 16, 0, 19, 0, 22, 0}));
+        v_r = vec_perm(v_rd0, v_rd1, 
+              ((vector unsigned char){2, 0, 5, 0, 8, 0, 11, 0, 14, 0, 17, 0, 20, 0, 23, 0}));
+
+        v_b = vec_and(v_b, v_FF);
+        v_g = vec_and(v_g, v_FF);
+        v_r = vec_and(v_r, v_FF);
+
+        v_rd0 = vec_mergeh(v_b, v_null);
+        v_rd1 = vec_mergeh(v_g, v_null);
+        v_rd2 = vec_mergeh(v_r, v_null);
+
+        v_g = vec_mergel(v_g, v_null);
+        v_b = vec_mergel(v_b, v_null);
+        v_r = vec_mergel(v_r, v_null);
+
+        v_dst1 = (vector unsigned int)vec_mul((vector signed int)v_rd2, v_ry);
+        v_dst1 = (vector unsigned int)vec_add((vector signed int)v_dst1, 
+                                              vec_mul((vector signed int)v_rd1, v_gy ));
+        v_dst1 = (vector unsigned int)vec_add((vector signed int)v_dst1, 
+                                              vec_mul((vector signed int)v_rd0, v_by ));
+        v_dst1 = vec_add(v_dst1, shift1);
+        v_dst1 = vec_sr(v_dst1, shift2);
+        v_dst2 = (vector unsigned int)vec_mul((vector signed int)v_r, v_ry);
+        v_dst2 = (vector unsigned int)vec_add((vector signed int)v_dst2, 
+                                              vec_mul((vector signed int)v_g, v_gy ));
+        v_dst2 = (vector unsigned int)vec_add((vector signed int)v_dst2, 
+                                              vec_mul((vector signed int)v_b,  v_by ));
+        v_dst2 = vec_add(v_dst2, shift1);
+        v_dst2 = vec_sr(v_dst2, shift2);
+        v_dst1 = vec_perm(v_dst1, v_dst2, 
+                 ((vector unsigned char){0, 1, 4, 5, 8, 9, 12, 13, 16, 17, 20, 21, 24, 25, 28, 29}));
+        vec_vsx_st((vector unsigned char)v_dst1, 0, (unsigned char *)dst_addr); 
+
+        src_addr += 24;
+        dst_addr += 16;
+    }
+    for (i = width_adj; i < width; i++) {
+        unsigned int b   = src[3*i];
+        unsigned int g   = src[3*i + 1];
+        unsigned int r   = src[3*i + 2];
+
+        dst[i] = ((ry*r + gy*g + by*b + (0x801<<(RGB2YUV_SHIFT-7)))>>(RGB2YUV_SHIFT-6));
+    }
+    STOP_TIMER("37_vsx")
+}
+
+
+static void bgr24ToUV_c_vsx(uint8_t *_dstU, uint8_t *_dstV, 
+                            const uint8_t *unused0, const uint8_t *src1, 
+                            const uint8_t *src2, int width, uint32_t *rgb2yuv)
+{
+    START_TIMER
+    int i, width_adj;
+    vector unsigned short v_rd0, v_rd1, v_rd2, v_g, v_b, v_r;  
+    vector unsigned int v_dst1, v_dst2;
+    vector unsigned int shift1, shift2;
+    int16_t *dstU = (int16_t *)_dstU;
+    int16_t *dstV = (int16_t *)_dstV;
+    vector signed int v_ru, v_gu, v_bu, v_rv, v_gv, v_bv;
+    int32_t ru = rgb2yuv[RU_IDX], gu = rgb2yuv[GU_IDX], bu = rgb2yuv[BU_IDX];
+    int32_t rv = rgb2yuv[RV_IDX], gv = rgb2yuv[GV_IDX], bv = rgb2yuv[BV_IDX];
+    vector unsigned short v_null = vec_splats((unsigned short)0x0000);
+    vector unsigned short v_FF = vec_splats((unsigned short)0x00FF);
+
+    uintptr_t src_addr = (uintptr_t)src1;
+    uintptr_t dstU_addr = (uintptr_t)_dstU;
+    uintptr_t dstV_addr = (uintptr_t)_dstV;
+
+
+    width_adj = width&(~(int)0x07);
+
+    if(width_adj){
+        shift1 = vec_splats((unsigned int)(0x4001<<(RGB2YUV_SHIFT-7)));
+        shift2 = vec_splats((unsigned int)(RGB2YUV_SHIFT-6));
+        v_ru = vec_splats((signed int)ru);
+        v_gu = vec_splats((signed int)gu);
+        v_bu = vec_splats((signed int)bu);
+        v_rv = vec_splats((signed int)rv);
+        v_gv = vec_splats((signed int)gv);
+        v_bv = vec_splats((signed int)bv);
+    }
+
+    for (i = 0; i < width_adj; i+=8) {
+        v_rd0 = vec_vsx_ld(0, (unsigned short *)src_addr);
+        v_rd1 = vec_vsx_ld(0, (unsigned short *)(src_addr+16));
+
+        v_b = vec_perm(v_rd0, v_rd1, 
+              ((vector unsigned char){0, 0, 3, 0, 6, 0, 9, 0, 12, 0, 15, 0, 18, 0, 21, 0}));
+        v_g = vec_perm(v_rd0, v_rd1, 
+              ((vector unsigned char){1, 0, 4, 0, 7, 0, 10, 0, 13, 0, 16, 0, 19, 0, 22, 0}));
+        v_r = vec_perm(v_rd0, v_rd1, 
+              ((vector unsigned char){2, 0, 5, 0, 8, 0, 11, 0, 14, 0, 17, 0, 20, 0, 23, 0}));
+
+        v_b = vec_and(v_b, v_FF);
+        v_g = vec_and(v_g, v_FF);
+        v_r = vec_and(v_r, v_FF);
+
+        v_rd0 = vec_mergeh(v_b, v_null);
+        v_rd1 = vec_mergeh(v_g, v_null);
+        v_rd2 = vec_mergeh(v_r, v_null);
+
+        v_g = vec_mergel(v_g, v_null);
+        v_b = vec_mergel(v_b, v_null);
+        v_r = vec_mergel(v_r, v_null);
+
+        v_dst1 = (vector unsigned int)vec_mul((vector signed int)v_rd2, v_ru);
+        v_dst1 = (vector unsigned int)vec_add((vector signed int)v_dst1, 
+                                      vec_mul((vector signed int)v_rd1, v_gu ));
+        v_dst1 = (vector unsigned int)vec_add((vector signed int)v_dst1, 
+                                      vec_mul((vector signed int)v_rd0, v_bu ));
+        v_dst1 = vec_add(v_dst1, shift1);
+        v_dst1 = vec_sr(v_dst1, shift2);
+        v_dst2 = (vector unsigned int)vec_mul((vector signed int)v_r, v_ru);
+        v_dst2 = (vector unsigned int)vec_add((vector signed int)v_dst2, 
+                                      vec_mul((vector signed int)v_g, v_gu ));
+        v_dst2 = (vector unsigned int)vec_add((vector signed int)v_dst2, 
+                                      vec_mul((vector signed int)v_b,  v_bu ));
+        v_dst2 = vec_add(v_dst2, shift1);
+        v_dst2 = vec_sr(v_dst2, shift2);
+        v_dst1 = vec_perm(v_dst1, v_dst2, 
+                 ((vector unsigned char){0, 1, 4, 5, 8, 9, 12, 13, 16, 17, 20, 21, 24, 25, 28, 29}));
+        vec_vsx_st((vector unsigned char)v_dst1, 0, (unsigned char *)dstU_addr); 
+
+        v_dst1 = (vector unsigned int)vec_mul((vector signed int)v_rd2, v_rv);
+        v_dst1 = (vector unsigned int)vec_add((vector signed int)v_dst1, 
+                                      vec_mul((vector signed int)v_rd1, v_gv ));
+        v_dst1 = (vector unsigned int)vec_add((vector signed int)v_dst1, 
+                                      vec_mul((vector signed int)v_rd0, v_bv ));
+        v_dst1 = vec_add(v_dst1, shift1);
+        v_dst1 = vec_sr(v_dst1, shift2);
+        v_dst2 = (vector unsigned int)vec_mul((vector signed int)v_r, v_rv);
+        v_dst2 = (vector unsigned int)vec_add((vector signed int)v_dst2, 
+                                      vec_mul((vector signed int)v_g, v_gv ));
+        v_dst2 = (vector unsigned int)vec_add((vector signed int)v_dst2, 
+                                      vec_mul((vector signed int)v_b,  v_bv ));
+        v_dst2 = vec_add(v_dst2, shift1);
+        v_dst2 = vec_sr(v_dst2, shift2);
+        v_dst1 = vec_perm(v_dst1, v_dst2, 
+                 ((vector unsigned char){0, 1, 4, 5, 8, 9, 12, 13, 16, 17, 20, 21, 24, 25, 28, 29}));
+        vec_vsx_st((vector unsigned char)v_dst1, 0, (unsigned char *)dstV_addr); 
+
+        src_addr += 24;
+        dstU_addr += 16;
+        dstV_addr += 16;
+    }
+    for (i = width_adj; i < width; i++) {
+        int b = src1[3 * i + 0];
+        int g = src1[3 * i + 1];
+        int r = src1[3 * i + 2];
+
+        dstU[i] = (ru*r + gu*g + bu*b + (0x4001<<(RGB2YUV_SHIFT-7)))>>(RGB2YUV_SHIFT-6);
+        dstV[i] = (rv*r + gv*g + bv*b + (0x4001<<(RGB2YUV_SHIFT-7)))>>(RGB2YUV_SHIFT-6);
+    }
+    STOP_TIMER("38_vsx")  
+    av_assert1(src1 == src2);
+}
+
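+/* Horizontally subsampled chroma: each output sample is computed from
+ * the sum of two adjacent input pixels.  The perm patterns pack each
+ * channel's byte pair into one 16-bit lane, so (v & 0xFF) + (v >> 8)
+ * yields the pair sum before the fixed-point dot product. */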
+static void bgr24ToUV_half_c_vsx(uint8_t *_dstU, uint8_t *_dstV, const uint8_t *unused0, const uint8_t *src1,
+                             const uint8_t *src2, int width, uint32_t *rgb2yuv)
+{
+    START_TIMER
+    int i, width_adj;
+    vector unsigned short v_rd0, v_rd1, v_rd2, v_g, v_b, v_r;  
+    vector unsigned int v_dst1, v_dst2;
+    vector unsigned int shift1, shift2;
+    int16_t *dstU = (int16_t *)_dstU;
+    int16_t *dstV = (int16_t *)_dstV;
+    vector signed int v_ru, v_gu, v_bu, v_rv, v_gv, v_bv;
+    int32_t ru = rgb2yuv[RU_IDX], gu = rgb2yuv[GU_IDX], bu = rgb2yuv[BU_IDX];
+    int32_t rv = rgb2yuv[RV_IDX], gv = rgb2yuv[GV_IDX], bv = rgb2yuv[BV_IDX];
+    vector unsigned short v_null = vec_splats((unsigned short)0x0000);
+    vector unsigned short v_FF = vec_splats((unsigned short)0x00FF);
+
+    uintptr_t src_addr = (uintptr_t)src1;
+    uintptr_t dstU_addr = (uintptr_t)_dstU;
+    uintptr_t dstV_addr = (uintptr_t)_dstV;
+
+
+    width_adj = width&(~(int)0x07);
+
+    if(width_adj){
+        shift1 = vec_splats((unsigned int)(0x8002<<(RGB2YUV_SHIFT-7)));
+        shift2 = vec_splats((unsigned int)(RGB2YUV_SHIFT-5));
+        v_ru = vec_splats((signed int)ru);
+        v_gu = vec_splats((signed int)gu);
+        v_bu = vec_splats((signed int)bu);
+        v_rv = vec_splats((signed int)rv);
+        v_gv = vec_splats((signed int)gv);
+        v_bv = vec_splats((signed int)bv);
+    }
+
+    for (i = 0; i < width_adj; i+=8) {
+        v_rd0 = vec_vsx_ld(0, (unsigned short *)src_addr);
+        v_rd1 = vec_vsx_ld(0, (unsigned short *)(src_addr+16));
+        v_rd2 = vec_vsx_ld(0, (unsigned short *)(src_addr+32));
+
+        v_b = vec_perm(v_rd0, v_rd1, 
+              ((vector unsigned char){0, 3, 6, 9,  12, 15, 18, 21, 24, 27, 30}));
+        v_g = vec_perm(v_rd0, v_rd1, 
+              ((vector unsigned char){1, 4, 7, 10, 13, 16, 19, 22, 25, 28, 31}));
+        v_r = vec_perm(v_rd0, v_rd1, 
+              ((vector unsigned char){2, 5, 8, 11, 14, 17, 20, 23, 26, 29}));
+
+        v_b = vec_perm(v_b, v_rd2, 
+              ((vector unsigned char){0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 17, 20, 23, 26, 29}));
+        v_g = vec_perm(v_g, v_rd2, 
+              ((vector unsigned char){0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 18, 21, 24, 27, 30}));
+        v_r = vec_perm(v_r, v_rd2, 
+              ((vector unsigned char){0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 16,  19, 22, 25, 28, 31}));
+
+        v_r = vec_add(vec_and(v_r, v_FF), vec_sr(v_r, vec_splats((unsigned short)8)));
+        v_g = vec_add(vec_and(v_g, v_FF), vec_sr(v_g, vec_splats((unsigned short)8)));
+        v_b = vec_add(vec_and(v_b, v_FF), vec_sr(v_b, vec_splats((unsigned short)8)));
+
+        v_rd0 = vec_mergeh(v_r, v_null);
+        v_rd1 = vec_mergeh(v_g, v_null);
+        v_rd2 = vec_mergeh(v_b, v_null);
+
+        v_r = vec_mergel(v_r, v_null);
+        v_g = vec_mergel(v_g, v_null);
+        v_b = vec_mergel(v_b, v_null);
+
+        v_dst1 = (vector unsigned int)vec_mul((vector signed int)v_rd0, v_ru);
+        v_dst1 = (vector unsigned int)vec_add((vector signed int)v_dst1, 
+                                      vec_mul((vector signed int)v_rd1, v_gu ));
+        v_dst1 = (vector unsigned int)vec_add((vector signed int)v_dst1, 
+                                      vec_mul((vector signed int)v_rd2, v_bu ));
+        v_dst1 = vec_add(v_dst1, shift1);
+        v_dst1 = vec_sr(v_dst1, shift2);
+        v_dst2 = (vector unsigned int)vec_mul((vector signed int)v_r, v_ru);
+        v_dst2 = (vector unsigned int)vec_add((vector signed int)v_dst2, 
+                                      vec_mul((vector signed int)v_g, v_gu ));
+        v_dst2 = (vector unsigned int)vec_add((vector signed int)v_dst2, 
+                                      vec_mul((vector signed int)v_b,  v_bu ));
+        v_dst2 = vec_add(v_dst2, shift1);
+        v_dst2 = vec_sr(v_dst2, shift2);
+        v_dst1 = vec_perm(v_dst1, v_dst2, 
+                 ((vector unsigned char){0, 1, 4, 5, 8, 9, 12, 13, 16, 17, 20, 21, 24, 25, 28, 29}));
+        vec_vsx_st((vector unsigned char)v_dst1, 0, (unsigned char *)dstU_addr); 
+
+        v_dst1 = (vector unsigned int)vec_mul((vector signed int)v_rd0, v_rv);
+        v_dst1 = (vector unsigned int)vec_add((vector signed int)v_dst1, 
+                                      vec_mul((vector signed int)v_rd1, v_gv ));
+        v_dst1 = (vector unsigned int)vec_add((vector signed int)v_dst1, 
+                                      vec_mul((vector signed int)v_rd2, v_bv ));
+        v_dst1 = vec_add(v_dst1, shift1);
+        v_dst1 = vec_sr(v_dst1, shift2);
+        v_dst2 = (vector unsigned int)vec_mul((vector signed int)v_r, v_rv);
+        v_dst2 = (vector unsigned int)vec_add((vector signed int)v_dst2, 
+                                      vec_mul((vector signed int)v_g, v_gv ));
+        v_dst2 = (vector unsigned int)vec_add((vector signed int)v_dst2, 
+                                      vec_mul((vector signed int)v_b,  v_bv ));
+        v_dst2 = vec_add(v_dst2, shift1);
+        v_dst2 = vec_sr(v_dst2, shift2);
+        v_dst1 = vec_perm(v_dst1, v_dst2, 
+                 ((vector unsigned char){0, 1, 4, 5, 8, 9, 12, 13, 16, 17, 20, 21, 24, 25, 28, 29}));
+        vec_vsx_st((vector unsigned char)v_dst1, 0, (unsigned char *)dstV_addr); 
+
+        src_addr += 48;
+        dstU_addr += 16;
+        dstV_addr += 16;
+    }
+
+    for (i = width_adj; i < width; i++) {
+        int b = src1[6 * i + 0] + src1[6 * i + 3];
+        int g = src1[6 * i + 1] + src1[6 * i + 4];
+        int r = src1[6 * i + 2] + src1[6 * i + 5];
+
+        dstU[i] = (ru*r + gu*g + bu*b + (0x8002<<(RGB2YUV_SHIFT-7)))>>(RGB2YUV_SHIFT-5);
+        dstV[i] = (rv*r + gv*g + bv*b + (0x8002<<(RGB2YUV_SHIFT-7)))>>(RGB2YUV_SHIFT-5);
+    }
+    STOP_TIMER("38_half_vsx")
+    av_assert1(src1 == src2);
+}
+
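+/* Identical scheme to bgr24ToY_c_vsx with the R and B gathers
+ * swapped. */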
+static void rgb24ToY_c_vsx(uint8_t *_dst, const uint8_t *src, 
+                           const uint8_t *unused1, const uint8_t *unused2, 
+                           int width, uint32_t *rgb2yuv)
+{
+    START_TIMER
+    int i, width_adj;
+    vector unsigned short v_rd0, v_rd1, v_rd2, v_g, v_b, v_r;  
+    vector unsigned int v_dst1, v_dst2;
+    vector unsigned int shift1, shift2;
+    int16_t *dst = (int16_t *)_dst;
+    vector signed int v_ry, v_gy, v_by;
+    int32_t ry = rgb2yuv[RY_IDX], gy = rgb2yuv[GY_IDX], by = rgb2yuv[BY_IDX];
+    vector unsigned short v_null = vec_splats((unsigned short)0x0000);
+    vector unsigned short v_FF = vec_splats((unsigned short)0x00FF);
+
+    uintptr_t src_addr = (uintptr_t)src;
+    uintptr_t dst_addr = (uintptr_t)_dst;
+
+
+    width_adj = width&(~(int)0x07);
+
+    if(width_adj){
+        shift1 = vec_splats((unsigned int)(0x801<<(RGB2YUV_SHIFT-7)));
+        shift2 = vec_splats((unsigned int)(RGB2YUV_SHIFT-6));
+        v_ry = vec_splats((signed int)ry);
+        v_gy = vec_splats((signed int)gy);
+        v_by = vec_splats((signed int)by);
+    }
+
+    for (i = 0; i < width_adj; i+=8) {
+        v_rd0 = vec_vsx_ld(0, (unsigned short *)src_addr);
+        v_rd1 = vec_vsx_ld(0, (unsigned short *)(src_addr+16));
+
+        v_r = vec_perm(v_rd0, v_rd1, 
+              ((vector unsigned char){0, 0, 3, 0, 6, 0, 9, 0, 12, 0, 15, 0, 18, 0, 21, 0}));
+        v_g = vec_perm(v_rd0, v_rd1, 
+              ((vector unsigned char){1, 0, 4, 0, 7, 0, 10, 0, 13, 0, 16, 0, 19, 0, 22, 0}));
+        v_b = vec_perm(v_rd0, v_rd1, 
+              ((vector unsigned char){2, 0, 5, 0, 8, 0, 11, 0, 14, 0, 17, 0, 20, 0, 23, 0}));
+
+        v_b = vec_and(v_b, v_FF);
+        v_g = vec_and(v_g, v_FF);
+        v_r = vec_and(v_r, v_FF);
+
+        v_rd0 = vec_mergeh(v_b, v_null);
+        v_rd1 = vec_mergeh(v_g, v_null);
+        v_rd2 = vec_mergeh(v_r, v_null);
+
+        v_g = vec_mergel(v_g, v_null);
+        v_b = vec_mergel(v_b, v_null);
+        v_r = vec_mergel(v_r, v_null);
+
+        v_dst1 = (vector unsigned int)vec_mul((vector signed int)v_rd2, v_ry);
+        v_dst1 = (vector unsigned int)vec_add((vector signed int)v_dst1, 
+                                      vec_mul((vector signed int)v_rd1, v_gy ));
+        v_dst1 = (vector unsigned int)vec_add((vector signed int)v_dst1, 
+                                      vec_mul((vector signed int)v_rd0, v_by ));
+        v_dst1 = vec_add(v_dst1, shift1);
+        v_dst1 = vec_sr(v_dst1, shift2);
+        v_dst2 = (vector unsigned int)vec_mul((vector signed int)v_r, v_ry);
+        v_dst2 = (vector unsigned int)vec_add((vector signed int)v_dst2, 
+                                      vec_mul((vector signed int)v_g, v_gy ));
+        v_dst2 = (vector unsigned int)vec_add((vector signed int)v_dst2, 
+                                      vec_mul((vector signed int)v_b,  v_by ));
+        v_dst2 = vec_add(v_dst2, shift1);
+        v_dst2 = vec_sr(v_dst2, shift2);
+        v_dst1 = vec_perm(v_dst1, v_dst2, 
+                 ((vector unsigned char){0, 1, 4, 5, 8, 9, 12, 13, 16, 17, 20, 21, 24, 25, 28, 29}));
+        vec_vsx_st((vector unsigned char)v_dst1, 0, (unsigned char *)dst_addr); 
+
+        src_addr += 24;
+        dst_addr += 16;
+    }
+    for (i = width_adj; i < width; i++) {
+        unsigned int r   = src[3*i];
+        unsigned int g   = src[3*i + 1];
+        unsigned int b   = src[3*i + 2];
+
+        dst[i] = ((ry*r + gy*g + by*b + (0x801<<(RGB2YUV_SHIFT-7)))>>(RGB2YUV_SHIFT-6));
+    }
+    STOP_TIMER("39_vsx")
+}
+
+static void rgb24ToUV_c_vsx(uint8_t *_dstU, uint8_t *_dstV, 
+                            const uint8_t *unused0, const uint8_t *src1, 
+                            const uint8_t *src2, int width, uint32_t *rgb2yuv)
+{
+    START_TIMER
+    int i, width_adj;
+    vector unsigned short v_rd0, v_rd1, v_rd2, v_g, v_b, v_r;  
+    vector unsigned int v_dst1, v_dst2;
+    vector unsigned int shift1, shift2;
+    int16_t *dstU = (int16_t *)_dstU;
+    int16_t *dstV = (int16_t *)_dstV;
+    vector signed int v_ru, v_gu, v_bu, v_rv, v_gv, v_bv;
+    int32_t ru = rgb2yuv[RU_IDX], gu = rgb2yuv[GU_IDX], bu = rgb2yuv[BU_IDX];
+    int32_t rv = rgb2yuv[RV_IDX], gv = rgb2yuv[GV_IDX], bv = rgb2yuv[BV_IDX];
+    vector unsigned short v_null = vec_splats((unsigned short)0x0000);
+    vector unsigned short v_FF = vec_splats((unsigned short)0x00FF);
+
+    uintptr_t src_addr = (uintptr_t)src1;
+    uintptr_t dstU_addr = (uintptr_t)_dstU;
+    uintptr_t dstV_addr = (uintptr_t)_dstV;
+
+
+    width_adj = width&(~(int)0x07);
+
+    if(width_adj){
+        shift1 = vec_splats((unsigned int)(0x4001<<(RGB2YUV_SHIFT-7)));
+        shift2 = vec_splats((unsigned int)(RGB2YUV_SHIFT-6));
+        v_ru = vec_splats((signed int)ru);
+        v_gu = vec_splats((signed int)gu);
+        v_bu = vec_splats((signed int)bu);
+        v_rv = vec_splats((signed int)rv);
+        v_gv = vec_splats((signed int)gv);
+        v_bv = vec_splats((signed int)bv);
+    }
+
+    for (i = 0; i < width_adj; i+=8) {
+        v_rd0 = vec_vsx_ld(0, (unsigned short *)src_addr);
+        v_rd1 = vec_vsx_ld(0, (unsigned short *)(src_addr+16));
+
+        v_r = vec_perm(v_rd0, v_rd1, 
+              ((vector unsigned char){0, 0, 3, 0, 6, 0, 9, 0, 12, 0, 15, 0, 18, 0, 21, 0}));
+        v_g = vec_perm(v_rd0, v_rd1, 
+              ((vector unsigned char){1, 0, 4, 0, 7, 0, 10, 0, 13, 0, 16, 0, 19, 0, 22, 0}));
+        v_b = vec_perm(v_rd0, v_rd1, 
+              ((vector unsigned char){2, 0, 5, 0, 8, 0, 11, 0, 14, 0, 17, 0, 20, 0, 23, 0}));
+
+        v_r = vec_and(v_r, v_FF);
+        v_g = vec_and(v_g, v_FF);
+        v_b = vec_and(v_b, v_FF);
+
+        v_rd0 = vec_mergeh(v_r, v_null);
+        v_rd1 = vec_mergeh(v_g, v_null);
+        v_rd2 = vec_mergeh(v_b, v_null);
+
+        v_r = vec_mergel(v_r, v_null);
+        v_g = vec_mergel(v_g, v_null);
+        v_b = vec_mergel(v_b, v_null);
+
+        v_dst1 = (vector unsigned int)vec_mul((vector signed int)v_rd0, v_ru);
+        v_dst1 = (vector unsigned int)vec_add((vector signed int)v_dst1, 
+                                      vec_mul((vector signed int)v_rd1, v_gu ));
+        v_dst1 = (vector unsigned int)vec_add((vector signed int)v_dst1, 
+                                      vec_mul((vector signed int)v_rd2, v_bu ));
+        v_dst1 = vec_add(v_dst1, shift1);
+        v_dst1 = vec_sr(v_dst1, shift2);
+        v_dst2 = (vector unsigned int)vec_mul((vector signed int)v_r, v_ru);
+        v_dst2 = (vector unsigned int)vec_add((vector signed int)v_dst2, 
+                                      vec_mul((vector signed int)v_g, v_gu ));
+        v_dst2 = (vector unsigned int)vec_add((vector signed int)v_dst2, 
+                                      vec_mul((vector signed int)v_b,  v_bu ));
+        v_dst2 = vec_add(v_dst2, shift1);
+        v_dst2 = vec_sr(v_dst2, shift2);
+        v_dst1 = vec_perm(v_dst1, v_dst2, 
+                 ((vector unsigned char){0, 1, 4, 5, 8, 9, 12, 13, 16, 17, 20, 21, 24, 25, 28, 29}));
+        vec_vsx_st((vector unsigned char)v_dst1, 0, (unsigned char *)dstU_addr); 
+
+        v_dst1 = (vector unsigned int)vec_mul((vector signed int)v_rd0, v_rv);
+        v_dst1 = (vector unsigned int)vec_add((vector signed int)v_dst1, 
+                                      vec_mul((vector signed int)v_rd1, v_gv ));
+        v_dst1 = (vector unsigned int)vec_add((vector signed int)v_dst1, 
+                                      vec_mul((vector signed int)v_rd2, v_bv ));
+        v_dst1 = vec_add(v_dst1, shift1);
+        v_dst1 = vec_sr(v_dst1, shift2);
+        v_dst2 = (vector unsigned int)vec_mul((vector signed int)v_r, v_rv);
+        v_dst2 = (vector unsigned int)vec_add((vector signed int)v_dst2, 
+                                      vec_mul((vector signed int)v_g, v_gv ));
+        v_dst2 = (vector unsigned int)vec_add((vector signed int)v_dst2, 
+                                      vec_mul((vector signed int)v_b,  v_bv ));
+        v_dst2 = vec_add(v_dst2, shift1);
+        v_dst2 = vec_sr(v_dst2, shift2);
+        v_dst1 = vec_perm(v_dst1, v_dst2, 
+                 ((vector unsigned char){0, 1, 4, 5, 8, 9, 12, 13, 16, 17, 20, 21, 24, 25, 28, 29}));
+        vec_vsx_st((vector unsigned char)v_dst1, 0, (unsigned char *)dstV_addr); 
+
+        src_addr += 24;
+        dstU_addr += 16;
+        dstV_addr += 16;
+    }
+    for (i = width_adj; i < width; i++) {
+        int r = src1[3 * i + 0];
+        int g = src1[3 * i + 1];
+        int b = src1[3 * i + 2];
+
+        dstU[i] = (ru*r + gu*g + bu*b + (0x4001<<(RGB2YUV_SHIFT-7)))>>(RGB2YUV_SHIFT-6);
+        dstV[i] = (rv*r + gv*g + bv*b + (0x4001<<(RGB2YUV_SHIFT-7)))>>(RGB2YUV_SHIFT-6);
+    }
+    STOP_TIMER("40_vsx") 
+}
+
+static void rgb24ToUV_half_c_vsx(uint8_t *_dstU, uint8_t *_dstV, const uint8_t *unused0, const uint8_t *src1,
+                             const uint8_t *src2, int width, uint32_t *rgb2yuv)
+{
+    START_TIMER
+    int i, width_adj;
+    vector unsigned short v_rd0, v_rd1, v_rd2, v_g, v_b, v_r;  
+    vector unsigned int v_dst1, v_dst2;
+    vector unsigned int shift1, shift2;
+    int16_t *dstU = (int16_t *)_dstU;
+    int16_t *dstV = (int16_t *)_dstV;
+    vector signed int v_ru, v_gu, v_bu, v_rv, v_gv, v_bv;
+    int32_t ru = rgb2yuv[RU_IDX], gu = rgb2yuv[GU_IDX], bu = rgb2yuv[BU_IDX];
+    int32_t rv = rgb2yuv[RV_IDX], gv = rgb2yuv[GV_IDX], bv = rgb2yuv[BV_IDX];
+    vector unsigned short v_null = vec_splats((unsigned short)0x0000);
+    vector unsigned short v_FF = vec_splats((unsigned short)0x00FF);
+
+    uintptr_t src_addr = (uintptr_t)src1;
+    uintptr_t dstU_addr = (uintptr_t)_dstU;
+    uintptr_t dstV_addr = (uintptr_t)_dstV;
+
+
+    width_adj = width&(~(int)0x07);
+
+    if(width_adj){
+        shift1 = vec_splats((unsigned int)(0x8002<<(RGB2YUV_SHIFT-7)));
+        shift2 = vec_splats((unsigned int)(RGB2YUV_SHIFT-5));
+        v_ru = vec_splats((signed int)ru);
+        v_gu = vec_splats((signed int)gu);
+        v_bu = vec_splats((signed int)bu);
+        v_rv = vec_splats((signed int)rv);
+        v_gv = vec_splats((signed int)gv);
+        v_bv = vec_splats((signed int)bv);
+    }
+
+    for (i = 0; i < width_adj; i+=8) {
+        v_rd0 = vec_vsx_ld(0, (unsigned short *)src_addr);
+        v_rd1 = vec_vsx_ld(0, (unsigned short *)(src_addr+16));
+        v_rd2 = vec_vsx_ld(0, (unsigned short *)(src_addr+32));
+
+        v_r = vec_perm(v_rd0, v_rd1, 
+              ((vector unsigned char){0, 3, 6, 9,  12, 15, 18, 21, 24, 27, 30}));
+        v_g = vec_perm(v_rd0, v_rd1, 
+              ((vector unsigned char){1, 4, 7, 10, 13, 16, 19, 22, 25, 28, 31}));
+        v_b = vec_perm(v_rd0, v_rd1, 
+              ((vector unsigned char){2, 5, 8, 11, 14, 17, 20, 23, 26, 29}));
+
+        v_r = vec_perm(v_r, v_rd2, 
+              ((vector unsigned char){0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 17, 20, 23, 26, 29}));
+        v_g = vec_perm(v_g, v_rd2, 
+              ((vector unsigned char){0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 18, 21, 24, 27, 30}));
+        v_b = vec_perm(v_b, v_rd2, 
+              ((vector unsigned char){0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 16,  19, 22, 25, 28, 31}));
+
+        v_r = vec_add(vec_and(v_r, v_FF), vec_sr(v_r, vec_splats((unsigned short)8)));
+        v_g = vec_add(vec_and(v_g, v_FF), vec_sr(v_g, vec_splats((unsigned short)8)));
+        v_b = vec_add(vec_and(v_b, v_FF), vec_sr(v_b, vec_splats((unsigned short)8)));
+
+        v_rd0 = vec_mergeh(v_r, v_null);
+        v_rd1 = vec_mergeh(v_g, v_null);
+        v_rd2 = vec_mergeh(v_b, v_null);
+
+        v_r = vec_mergel(v_r, v_null);
+        v_g = vec_mergel(v_g, v_null);
+        v_b = vec_mergel(v_b, v_null);
+
+        v_dst1 = (vector unsigned int)vec_mul((vector signed int)v_rd0, v_ru);
+        v_dst1 = (vector unsigned int)vec_add((vector signed int)v_dst1, 
+                                      vec_mul((vector signed int)v_rd1, v_gu ));
+        v_dst1 = (vector unsigned int)vec_add((vector signed int)v_dst1, 
+                                      vec_mul((vector signed int)v_rd2, v_bu ));
+        v_dst1 = vec_add(v_dst1, shift1);
+        v_dst1 = vec_sr(v_dst1, shift2);
+        v_dst2 = (vector unsigned int)vec_mul((vector signed int)v_r, v_ru);
+        v_dst2 = (vector unsigned int)vec_add((vector signed int)v_dst2, 
+                                      vec_mul((vector signed int)v_g, v_gu ));
+        v_dst2 = (vector unsigned int)vec_add((vector signed int)v_dst2, 
+                                      vec_mul((vector signed int)v_b,  v_bu ));
+        v_dst2 = vec_add(v_dst2, shift1);
+        v_dst2 = vec_sr(v_dst2, shift2);
+        v_dst1 = vec_perm(v_dst1, v_dst2, 
+                 ((vector unsigned char){0, 1, 4, 5, 8, 9, 12, 13, 16, 17, 20, 21, 24, 25, 28, 29}));
+        vec_vsx_st((vector unsigned char)v_dst1, 0, (unsigned char *)dstU_addr); 
+
+        v_dst1 = (vector unsigned int)vec_mul((vector signed int)v_rd0, v_rv);
+        v_dst1 = (vector unsigned int)vec_add((vector signed int)v_dst1, 
+                                      vec_mul((vector signed int)v_rd1, v_gv ));
+        v_dst1 = (vector unsigned int)vec_add((vector signed int)v_dst1, 
+                                      vec_mul((vector signed int)v_rd2, v_bv ));
+        v_dst1 = vec_add(v_dst1, shift1);
+        v_dst1 = vec_sr(v_dst1, shift2);
+        v_dst2 = (vector unsigned int)vec_mul((vector signed int)v_r, v_rv);
+        v_dst2 = (vector unsigned int)vec_add((vector signed int)v_dst2, 
+                                      vec_mul((vector signed int)v_g, v_gv ));
+        v_dst2 = (vector unsigned int)vec_add((vector signed int)v_dst2, 
+                                      vec_mul((vector signed int)v_b,  v_bv ));
+        v_dst2 = vec_add(v_dst2, shift1);
+        v_dst2 = vec_sr(v_dst2, shift2);
+        v_dst1 = vec_perm(v_dst1, v_dst2, 
+                 ((vector unsigned char){0, 1, 4, 5, 8, 9, 12, 13, 16, 17, 20, 21, 24, 25, 28, 29}));
+        vec_vsx_st((vector unsigned char)v_dst1, 0, (unsigned char *)dstV_addr); 
+
+        src_addr += 48;
+        dstU_addr += 16;
+        dstV_addr += 16;
+    }
+
+    for (i = width_adj; i < width; i++) {
+        int r = src1[6 * i + 0] + src1[6 * i + 3];
+        int g = src1[6 * i + 1] + src1[6 * i + 4];
+        int b = src1[6 * i + 2] + src1[6 * i + 5];
+
+        dstU[i] = (ru*r + gu*g + bu*b + (0x8002<<(RGB2YUV_SHIFT-7)))>>(RGB2YUV_SHIFT-5);
+        dstV[i] = (rv*r + gv*g + bv*b + (0x8002<<(RGB2YUV_SHIFT-7)))>>(RGB2YUV_SHIFT-5);
+    }
+    STOP_TIMER("40_half_vsx")
+}
+
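+/* Planar RGB arrives as G/B/R planes in src[0]/src[1]/src[2].  Eight
+ * pixels per pass are zero-extended to 32 bits and run through the
+ * same fixed-point dot product as the packed-RGB paths above. */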
+static void planar_rgb_to_y_vsx(uint8_t *_dst, const uint8_t *src[4], 
+                                int width, int32_t *rgb2yuv)
+{
+    START_TIMER
+    int i, width_adj;
+    vector unsigned short v_rd0, v_rd1, v_rd2, v_g, v_b, v_r, v_g1, v_b1, v_r1;  
+    vector unsigned int v_dst1, v_dst2;
+    vector unsigned int shift1, shift2;
+    int16_t *dst = (int16_t *)_dst;
+    vector signed int v_ry, v_gy, v_by;
+    int32_t ry = rgb2yuv[RY_IDX], gy = rgb2yuv[GY_IDX], by = rgb2yuv[BY_IDX];
+    vector unsigned short v_null = vec_splats((unsigned short)0x0000);
+    vector unsigned short v_FF = vec_splats((unsigned short)0x00FF);
+
+    uintptr_t dst_addr = (uintptr_t)_dst;
+
+
+    width_adj = width&(~(int)0x07);
+
+    if(width_adj){
+        shift1 = vec_splats((unsigned int)(0x801<<(RGB2YUV_SHIFT-7)));
+        shift2 = vec_splats((unsigned int)(RGB2YUV_SHIFT-6));
+        v_ry = vec_splats((signed int)ry);
+        v_gy = vec_splats((signed int)gy);
+        v_by = vec_splats((signed int)by);
+    }
+
+    for (i = 0; i < width_adj; i+=8) {
+        if(i&1){
+            v_rd0 = vec_sld(v_rd0, v_rd0, 8);
+            v_rd1 = vec_sld(v_rd1, v_rd1, 8);
+            v_rd2 = vec_sld(v_rd2, v_rd2, 8); 
+        } else {
+            v_rd0 = vec_vsx_ld(0, (unsigned short *)src[0]);
+            v_rd1 = vec_vsx_ld(0, (unsigned short *)(src[1]));
+            v_rd2 = vec_vsx_ld(0, (unsigned short *)(src[2]));
+        }
+
+        v_g = vec_perm(v_rd0, v_rd0, 
+              ((vector unsigned char){0, 0, 1, 0, 2, 0, 3, 0, 4, 0, 5, 0, 6, 0, 7, 0}));
+        v_b = vec_perm(v_rd1, v_rd1, 
+              ((vector unsigned char){0, 0, 1, 0, 2, 0, 3, 0, 4, 0, 5, 0, 6, 0, 7, 0}));
+        v_r = vec_perm(v_rd2, v_rd2, 
+              ((vector unsigned char){0, 0, 1, 0, 2, 0, 3, 0, 4, 0, 5, 0, 6, 0, 7, 0}));
+
+        v_b = vec_and(v_b, v_FF);
+        v_g = vec_and(v_g, v_FF);
+        v_r = vec_and(v_r, v_FF);
+
+        v_b1 = vec_mergeh(v_b, v_null);
+        v_g1 = vec_mergeh(v_g, v_null);
+        v_r1 = vec_mergeh(v_r, v_null);
+
+        v_g = vec_mergel(v_g, v_null);
+        v_b = vec_mergel(v_b, v_null);
+        v_r = vec_mergel(v_r, v_null);
+
+        v_dst1 = (vector unsigned int)vec_mul((vector signed int)v_r1, v_ry);
+        v_dst1 = (vector unsigned int)vec_add((vector signed int)v_dst1, 
+                                      vec_mul((vector signed int)v_g1, v_gy ));
+        v_dst1 = (vector unsigned int)vec_add((vector signed int)v_dst1, 
+                                      vec_mul((vector signed int)v_b1, v_by ));
+        v_dst1 = vec_add(v_dst1, shift1);
+        v_dst1 = vec_sr(v_dst1, shift2);
+        v_dst2 = (vector unsigned int)vec_mul((vector signed int)v_r, v_ry);
+        v_dst2 = (vector unsigned int)vec_add((vector signed int)v_dst2, 
+                                      vec_mul((vector signed int)v_g, v_gy ));
+        v_dst2 = (vector unsigned int)vec_add((vector signed int)v_dst2, 
+                                      vec_mul((vector signed int)v_b,  v_by ));
+        v_dst2 = vec_add(v_dst2, shift1);
+        v_dst2 = vec_sr(v_dst2, shift2);
+        v_dst1 = vec_perm(v_dst1, v_dst2, 
+                 ((vector unsigned char){0, 1, 4, 5, 8, 9, 12, 13, 16, 17, 20, 21, 24, 25, 28, 29}));
+        vec_vsx_st((vector unsigned char)v_dst1, 0, (unsigned char *)dst_addr); 
+
+        src[0] += 8;
+        src[1] += 8;
+        src[2] += 8;
+        dst_addr += 16;
+    }
+
+    for (i = width_adj; i < width; i++) {
+        int g = src[0][0];
+        int b = src[1][0];
+        int r = src[2][0];
+        dst[i] = (ry*r + gy*g + by*b + (0x801<<(RGB2YUV_SHIFT-7))) >> (RGB2YUV_SHIFT-6);
+        ++src[0];
+        ++src[1];
+        ++src[2];
+    }
+    STOP_TIMER("41_vsx")
+}
+
+
+static void planar_rgb_to_a_vsx(uint8_t *_dst, const uint8_t *src[4], 
+                                int width, int32_t *unused)
+{
+    START_TIMER
+    int i, width_adj;
+    vector unsigned short v_rd0, v_a, v_dst;  
+    int16_t *dst = (int16_t *)_dst;
+    vector unsigned short v_FF = vec_splats((unsigned short)0x00FF);
+
+    uintptr_t dst_addr = (uintptr_t)_dst;
+
+
+    width_adj = width&(~(int)0x07);
+
+    for (i = 0; i < width_adj; i+=8) {
+        if(i&1)
+            v_rd0 = vec_sld(v_rd0, v_rd0, 8);
+        else
+            v_rd0 = vec_vsx_ld(0, (unsigned short *)src[3]);
+
+        v_a = vec_perm(v_rd0, v_rd0, 
+              ((vector unsigned char){0, 0, 1, 0, 2, 0, 3, 0, 4, 0, 5, 0, 6, 0, 7, 0}));
+        v_a = vec_and(v_a, v_FF);
+        v_dst = vec_sl(v_a, vec_splats((unsigned short)6));
+        vec_vsx_st((vector unsigned char)v_dst, 0, (unsigned char *)dst_addr); 
+
+        src[3] += 8;
+        dst_addr += 16;
+    }
+    for (i = width_adj; i < width; i++){
+        dst[i] = src[3][0] << 6;
+        ++src[3];
+    }
+    STOP_TIMER("42_vsx")
+}
+
+
+static void planar_rgb_to_uv_vsx(uint8_t *_dstU, uint8_t *_dstV, 
+                                 const uint8_t *src[4], int width, int32_t *rgb2yuv)
+{
+    START_TIMER
+    int i, width_adj;
+    vector unsigned short v_rd0, v_rd1, v_rd2, v_g, v_b, v_r, v_g1, v_b1, v_r1;  
+    vector unsigned int v_dst1, v_dst2;
+    vector unsigned int shift1, shift2;
+    uint16_t *dstU = (uint16_t *)_dstU;
+    uint16_t *dstV = (uint16_t *)_dstV;
+    vector signed int v_ru, v_gu, v_bu, v_rv, v_gv, v_bv;
+    int32_t ru = rgb2yuv[RU_IDX], gu = rgb2yuv[GU_IDX], bu = rgb2yuv[BU_IDX];
+    int32_t rv = rgb2yuv[RV_IDX], gv = rgb2yuv[GV_IDX], bv = rgb2yuv[BV_IDX];
+    vector unsigned short v_null = vec_splats((unsigned short)0x0000);
+    vector unsigned short v_FF = vec_splats((unsigned short)0x00FF);
+
+    uintptr_t dstU_addr = (uintptr_t)_dstU;
+    uintptr_t dstV_addr = (uintptr_t)_dstV;
+
+
+    width_adj = width&(~(int)0x07);
+
+    if(width_adj){
+        shift1 = vec_splats((unsigned int)(0x4001<<(RGB2YUV_SHIFT-7)));
+        shift2 = vec_splats((unsigned int)(RGB2YUV_SHIFT-6));
+        v_ru = vec_splats((signed int)ru);
+        v_gu = vec_splats((signed int)gu);
+        v_bu = vec_splats((signed int)bu);
+        v_rv = vec_splats((signed int)rv);
+        v_gv = vec_splats((signed int)gv);
+        v_bv = vec_splats((signed int)bv);
+    }
+
+    for (i = 0; i < width_adj; i+=8) {
+        if(i&1){
+            v_rd0 = vec_sld(v_rd0, v_rd0, 8);
+            v_rd1 = vec_sld(v_rd1, v_rd1, 8);
+            v_rd2 = vec_sld(v_rd2, v_rd2, 8); 
+        } else {
+            v_rd0 = vec_vsx_ld(0, (unsigned short *)src[0]);
+            v_rd1 = vec_vsx_ld(0, (unsigned short *)(src[1]));
+            v_rd2 = vec_vsx_ld(0, (unsigned short *)(src[2]));
+        }
+
+        v_g = vec_perm(v_rd0, v_rd0, 
+              ((vector unsigned char){0, 0, 1, 0, 2, 0, 3, 0, 4, 0, 5, 0, 6, 0, 7, 0}));
+        v_b = vec_perm(v_rd1, v_rd1, 
+              ((vector unsigned char){0, 0, 1, 0, 2, 0, 3, 0, 4, 0, 5, 0, 6, 0, 7, 0}));
+        v_r = vec_perm(v_rd2, v_rd2, 
+              ((vector unsigned char){0, 0, 1, 0, 2, 0, 3, 0, 4, 0, 5, 0, 6, 0, 7, 0}));
+
+        v_b = vec_and(v_b, v_FF);
+        v_g = vec_and(v_g, v_FF);
+        v_r = vec_and(v_r, v_FF);
+
+        v_b1 = vec_mergeh(v_b, v_null);
+        v_g1 = vec_mergeh(v_g, v_null);
+        v_r1 = vec_mergeh(v_r, v_null);
+
+        v_g = vec_mergel(v_g, v_null);
+        v_b = vec_mergel(v_b, v_null);
+        v_r = vec_mergel(v_r, v_null);
+
+        v_dst1 = (vector unsigned int)vec_mul((vector signed int)v_r1, v_ru);
+        v_dst1 = (vector unsigned int)vec_add((vector signed int)v_dst1, 
+                                      vec_mul((vector signed int)v_g1, v_gu ));
+        v_dst1 = (vector unsigned int)vec_add((vector signed int)v_dst1, 
+                                      vec_mul((vector signed int)v_b1, v_bu ));
+        v_dst1 = vec_add(v_dst1, shift1);
+        v_dst1 = vec_sr(v_dst1, shift2);
+        v_dst2 = (vector unsigned int)vec_mul((vector signed int)v_r, v_ru);
+        v_dst2 = (vector unsigned int)vec_add((vector signed int)v_dst2, 
+                                      vec_mul((vector signed int)v_g, v_gu ));
+        v_dst2 = (vector unsigned int)vec_add((vector signed int)v_dst2, 
+                                      vec_mul((vector signed int)v_b,  v_bu ));
+        v_dst2 = vec_add(v_dst2, shift1);
+        v_dst2 = vec_sr(v_dst2, shift2);
+        v_dst1 = vec_perm(v_dst1, v_dst2, 
+                 ((vector unsigned char){0, 1, 4, 5, 8, 9, 12, 13, 16, 17, 20, 21, 24, 25, 28, 29}));
+        vec_vsx_st((vector unsigned char)v_dst1, 0, (unsigned char *)dstU_addr);
+
+        v_dst1 = (vector unsigned int)vec_mul((vector signed int)v_r1, v_rv);
+        v_dst1 = (vector unsigned int)vec_add((vector signed int)v_dst1, 
+                                      vec_mul((vector signed int)v_g1, v_gv ));
+        v_dst1 = (vector unsigned int)vec_add((vector signed int)v_dst1, 
+                                      vec_mul((vector signed int)v_b1, v_bv ));
+        v_dst1 = vec_add(v_dst1, shift1);
+        v_dst1 = vec_sr(v_dst1, shift2);
+        v_dst2 = (vector unsigned int)vec_mul((vector signed int)v_r, v_rv);
+        v_dst2 = (vector unsigned int)vec_add((vector signed int)v_dst2, 
+                                      vec_mul((vector signed int)v_g, v_gv ));
+        v_dst2 = (vector unsigned int)vec_add((vector signed int)v_dst2, 
+                                      vec_mul((vector signed int)v_b,  v_bv ));
+        v_dst2 = vec_add(v_dst2, shift1);
+        v_dst2 = vec_sr(v_dst2, shift2);
+        v_dst1 = vec_perm(v_dst1, v_dst2, 
+                 ((vector unsigned char){0, 1, 4, 5, 8, 9, 12, 13, 16, 17, 20, 21, 24, 25, 28, 29}));
+        vec_vsx_st((vector unsigned char)v_dst1, 0, (unsigned char *)dstV_addr);
+
+        src[0] += 8;
+        src[1] += 8;
+        src[2] += 8;
+        dstU_addr += 16;
+        dstV_addr += 16;
+    }
+    for (i = width_adj; i < width; i++) {
+        int g = src[0][0];
+        int b = src[1][0];
+        int r = src[2][0];
+
+        dstU[i] = (ru*r + gu*g + bu*b + (0x4001<<(RGB2YUV_SHIFT-7))) >> (RGB2YUV_SHIFT-6);
+        dstV[i] = (rv*r + gv*g + bv*b + (0x4001<<(RGB2YUV_SHIFT-7))) >> (RGB2YUV_SHIFT-6);
+        ++src[0];
+        ++src[1];
+        ++src[2];
+
+    }
+    STOP_TIMER("43_vsx")
+}
+
+#define rdpx(src) \
+    is_be ? AV_RB16(src) : AV_RL16(src)
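+/* 9-16 bpc planar RGB.  rdpx() is the endian-aware scalar read used by
+ * the tail loops; the vector paths byte-swap big-endian input with a
+ * perm instead.  The rounding constant and the final shift both scale
+ * with the bit depth. */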
+static av_always_inline 
+void planar_rgb16_to_y_vsx(uint8_t *_dst, const uint8_t *_src[4],
+                           int width, int bpc, int is_be, int32_t *rgb2yuv)
+{
+    START_TIMER
+    int i, width_adj;
+    vector unsigned short v_g, v_b, v_r, v_g1, v_b1, v_r1;
+    vector unsigned int v_dst1, v_dst2;
+    vector unsigned int shift1, shift2;
+    int16_t *dst = (int16_t *)_dst;
+    const uint16_t **src = (const uint16_t **)_src;
+    vector signed int v_ry, v_gy, v_by;
+    int32_t ry = rgb2yuv[RY_IDX], gy = rgb2yuv[GY_IDX], by = rgb2yuv[BY_IDX];
+    int sh = bpc < 16 ? bpc : 14;
+    vector unsigned short v_null = vec_splats((unsigned short)0x0000);
+
+    uintptr_t dst_addr = (uintptr_t)_dst;
+
+
+    width_adj = width&(~(int)0x07);
+
+    if(width_adj){
+        shift1 = vec_splats((unsigned int)(33 << (RGB2YUV_SHIFT + bpc - 9)));
+        shift2 = vec_splats((unsigned int)(RGB2YUV_SHIFT + sh - 14));
+        v_ry = vec_splats((signed int)ry);
+        v_gy = vec_splats((signed int)gy);
+        v_by = vec_splats((signed int)by);
+    }
+
+    for (i = 0; i < width_adj; i+=8) {
+
+        v_g = vec_vsx_ld(0, (unsigned short *)src[0]);
+        v_b = vec_vsx_ld(0, (unsigned short *)(src[1]));
+        v_r = vec_vsx_ld(0, (unsigned short *)(src[2]));
+        if(is_be){
+            v_g = vec_perm(v_g, v_g, 
+                  ((vector unsigned char){1,0,3,2,5,4,7,6,9,8,11,10,13,12,15,14}));
+            v_b = vec_perm(v_b, v_b, 
+                  ((vector unsigned char){1,0,3,2,5,4,7,6,9,8,11,10,13,12,15,14}));
+            v_r = vec_perm(v_r, v_r, 
+                  ((vector unsigned char){1,0,3,2,5,4,7,6,9,8,11,10,13,12,15,14}));
+        }
+
+
+        v_b1 = vec_mergeh(v_b, v_null);
+        v_g1 = vec_mergeh(v_g, v_null);
+        v_r1 = vec_mergeh(v_r, v_null);
+
+        v_g = vec_mergel(v_g, v_null);
+        v_b = vec_mergel(v_b, v_null);
+        v_r = vec_mergel(v_r, v_null);
+
+        v_dst1 = (vector unsigned int)vec_mul((vector signed int)v_r1, v_ry);
+        v_dst1 = (vector unsigned int)vec_add((vector signed int)v_dst1, 
+                                      vec_mul((vector signed int)v_g1, v_gy ));
+        v_dst1 = (vector unsigned int)vec_add((vector signed int)v_dst1, 
+                                      vec_mul((vector signed int)v_b1, v_by ));
+        v_dst1 = vec_add(v_dst1, shift1);
+        v_dst1 = vec_sr(v_dst1, shift2);
+        v_dst2 = (vector unsigned int)vec_mul((vector signed int)v_r, v_ry);
+        v_dst2 = (vector unsigned int)vec_add((vector signed int)v_dst2, 
+                                      vec_mul((vector signed int)v_g, v_gy ));
+        v_dst2 = (vector unsigned int)vec_add((vector signed int)v_dst2, 
+                                      vec_mul((vector signed int)v_b,  v_by ));
+        v_dst2 = vec_add(v_dst2, shift1);
+        v_dst2 = vec_sr(v_dst2, shift2);
+        v_dst1 = vec_perm(v_dst1, v_dst2, 
+                 ((vector unsigned char){0, 1, 4, 5, 8, 9, 12, 13, 16, 17, 20, 21, 24, 25, 28, 29}));
+        vec_vsx_st((vector unsigned char)v_dst1, 0, (unsigned char *)dst_addr); 
+
+        src[0] += 8;
+        src[1] += 8;
+        src[2] += 8;
+        dst_addr += 16;
+    }
+    for (i = width_adj; i < width; i++) {
+        int g = rdpx(src[0]);
+        int b = rdpx(src[1]);
+        int r = rdpx(src[2]);
+
+        dst[i] = ((ry*r + gy*g + by*b + 
+                  (33 << (RGB2YUV_SHIFT + bpc - 9))) >> (RGB2YUV_SHIFT + sh - 14));
+        ++src[0];
+        ++src[1];
+        ++src[2];
+    }
+    STOP_TIMER("44_vsx")
+}
+// TODO
+static av_always_inline 
+void planar_rgb16_to_a_vsx(uint8_t *_dst, const uint8_t *_src[4],
+                           int width, int bpc, int is_be, int32_t *rgb2yuv)
+{
+    START_TIMER
+    int i, width_adj;
+    vector unsigned short v_rd0, v_dst, shift;
+    const uint16_t **src = (const uint16_t **)_src;
+    uint16_t *dst        = (uint16_t *)_dst;
+    int sh = bpc < 16 ? bpc : 14;
+    uintptr_t dst_addr = (uintptr_t)_dst;
+
+
+    width_adj = width&(~(int)0x07);
+    if(width_adj){
+        shift = vec_splats((unsigned short)(14 - sh));
+    }
+    for (i = 0; i < width_adj; i+=8) {
+        v_rd0 = vec_vsx_ld(0, (unsigned short *)src[3]);
+        if (is_be)
+            v_rd0 = vec_perm(v_rd0, v_rd0,
+                    ((vector unsigned char){1,0,3,2,5,4,7,6,9,8,11,10,13,12,15,14}));
+        /* Shift up to the 14-bit intermediate range in both branches,
+         * matching the scalar tail: rdpx(src[3]) << (14 - sh). */
+        v_dst = vec_sl(v_rd0, shift);
+        vec_vsx_st((vector unsigned char)v_dst, 0, (unsigned char *)dst_addr); 
+
+        src[3] += 8;
+        dst_addr += 16;
+    }
+    for (i=width_adj; i< width; i++){
+        dst[i] = rdpx(src[3]) << (14 - sh);
+        ++src[3];
+    }
+    STOP_TIMER("45_vsx")
+}
+
+static av_always_inline 
+void planar_rgb16_to_uv_vsx(uint8_t *_dstU, uint8_t *_dstV,
+                            const uint8_t *_src[4], int width,
+                            int bpc, int is_be, int32_t *rgb2yuv)
+{
+    START_TIMER
+
+    int i, width_adj;
+    vector unsigned short v_g, v_b, v_r, v_g1, v_b1, v_r1;
+    vector unsigned int v_dst1, v_dst2;
+    vector unsigned int shift1, shift2;
+    const uint16_t **src = (const uint16_t **)_src;
+    uint16_t *dstU       = (uint16_t *)_dstU;
+    uint16_t *dstV       = (uint16_t *)_dstV;
+    vector signed int v_ru, v_gu, v_bu, v_rv, v_gv, v_bv;
+    int32_t ru = rgb2yuv[RU_IDX], gu = rgb2yuv[GU_IDX], bu = rgb2yuv[BU_IDX];
+    int32_t rv = rgb2yuv[RV_IDX], gv = rgb2yuv[GV_IDX], bv = rgb2yuv[BV_IDX];
+    int sh = bpc < 16 ? bpc : 14;
+    vector unsigned short v_null = vec_splats((unsigned short)0x0000);
+
+    uintptr_t dstU_addr = (uintptr_t)_dstU;
+    uintptr_t dstV_addr = (uintptr_t)_dstV;
+
+
+    width_adj = width&(~(int)0x07);
+
+    if(width_adj){
+        shift1 = vec_splats((unsigned int)(257 << (RGB2YUV_SHIFT + bpc - 9)));
+        shift2 = vec_splats((unsigned int)(RGB2YUV_SHIFT + sh - 14));
+        v_ru = vec_splats((signed int)ru);
+        v_gu = vec_splats((signed int)gu);
+        v_bu = vec_splats((signed int)bu);
+        v_rv = vec_splats((signed int)rv);
+        v_gv = vec_splats((signed int)gv);
+        v_bv = vec_splats((signed int)bv);
+    }
+
+    for (i = 0; i < width_adj; i+=8) {
+        v_g = vec_vsx_ld(0, (unsigned short *)src[0]);
+        v_b = vec_vsx_ld(0, (unsigned short *)(src[1]));
+        v_r = vec_vsx_ld(0, (unsigned short *)(src[2]));
+        if(is_be){
+            v_g = vec_perm(v_g, v_g, 
+                  ((vector unsigned char){1,0,3,2,5,4,7,6,9,8,11,10,13,12,15,14}));
+            v_b = vec_perm(v_b, v_b, 
+                  ((vector unsigned char){1,0,3,2,5,4,7,6,9,8,11,10,13,12,15,14}));
+            v_r = vec_perm(v_r, v_r, 
+                  ((vector unsigned char){1,0,3,2,5,4,7,6,9,8,11,10,13,12,15,14}));
+        }
+
+        v_b1 = vec_mergeh(v_b, v_null);
+        v_g1 = vec_mergeh(v_g, v_null);
+        v_r1 = vec_mergeh(v_r, v_null);
+
+        v_g = vec_mergel(v_g, v_null);
+        v_b = vec_mergel(v_b, v_null);
+        v_r = vec_mergel(v_r, v_null);
+
+        v_dst1 = (vector unsigned int)vec_mul((vector signed int)v_r1, v_ru);
+        v_dst1 = (vector unsigned int)vec_add((vector signed int)v_dst1, 
+                                      vec_mul((vector signed int)v_g1, v_gu ));
+        v_dst1 = (vector unsigned int)vec_add((vector signed int)v_dst1, 
+                                      vec_mul((vector signed int)v_b1, v_bu ));
+        v_dst1 = vec_add(v_dst1, shift1);
+        v_dst1 = vec_sr(v_dst1, shift2);
+        v_dst2 = (vector unsigned int)vec_mul((vector signed int)v_r, v_ru);
+        v_dst2 = (vector unsigned int)vec_add((vector signed int)v_dst2, 
+                                      vec_mul((vector signed int)v_g, v_gu ));
+        v_dst2 = (vector unsigned int)vec_add((vector signed int)v_dst2, 
+                                      vec_mul((vector signed int)v_b,  v_bu ));
+        v_dst2 = vec_add(v_dst2, shift1);
+        v_dst2 = vec_sr(v_dst2, shift2);
+        v_dst1 = vec_perm(v_dst1, v_dst2, 
+                 ((vector unsigned char){0, 1, 4, 5, 8, 9, 12, 13, 16, 17, 20, 21, 24, 25, 28, 29}));
+        vec_vsx_st((vector unsigned char)v_dst1, 0, (unsigned char *)dstU_addr);
+
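+        /* same accumulation with the V coefficients */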
+        v_dst1 = (vector unsigned int)vec_mul((vector signed int)v_r1, v_rv);
+        v_dst1 = (vector unsigned int)vec_add((vector signed int)v_dst1, 
+                                      vec_mul((vector signed int)v_g1, v_gv ));
+        v_dst1 = (vector unsigned int)vec_add((vector signed int)v_dst1, 
+                                      vec_mul((vector signed int)v_b1, v_bv ));
+        v_dst1 = vec_add(v_dst1, shift1);
+        v_dst1 = vec_sr(v_dst1, shift2);
+        v_dst2 = (vector unsigned int)vec_mul((vector signed int)v_r, v_rv);
+        v_dst2 = (vector unsigned int)vec_add((vector signed int)v_dst2, 
+                                      vec_mul((vector signed int)v_g, v_gv ));
+        v_dst2 = (vector unsigned int)vec_add((vector signed int)v_dst2, 
+                                      vec_mul((vector signed int)v_b,  v_bv ));
+        v_dst2 = vec_add(v_dst2, shift1);
+        v_dst2 = vec_sr(v_dst2, shift2);
+        v_dst1 = vec_perm(v_dst1, v_dst2, 
+                ((vector unsigned char){0, 1, 4, 5, 8, 9, 12, 13, 16, 17, 20, 21, 24, 25, 28, 29}));
+        vec_vsx_st((vector unsigned char)v_dst1, 0, (unsigned char *)dstV_addr);  
+
+        src[0] += 8;
+        src[1] += 8;
+        src[2] += 8;
+        dstU_addr += 16;
+        dstV_addr += 16;
+    }
+    for (i = width_adj; i < width; i++) {
+        int g = rdpx(src[0]);
+        int b = rdpx(src[1]);
+        int r = rdpx(src[2]);
+
+        dstU[i] = (ru*r + gu*g + bu*b + 
+                  (257 << (RGB2YUV_SHIFT + bpc - 9))) >> (RGB2YUV_SHIFT + sh - 14);
+        dstV[i] = (rv*r + gv*g + bv*b + 
+                  (257 << (RGB2YUV_SHIFT + bpc - 9))) >> (RGB2YUV_SHIFT + sh - 14);
+        ++src[0];
+        ++src[1];
+        ++src[2];
+    }
+    STOP_TIMER("46_vsx")
+}
+#undef rdpx
+
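+/* Scalar float-gray-to-16-bit-luma conversions; VSX drafts are kept disabled
+ * below. */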
+static av_always_inline void grayf32ToY16_c_vsx(uint8_t *_dst, const uint8_t *_src, const uint8_t *unused1,
+                                            const uint8_t *unused2, int width, uint32_t *unused)
+{
+    START_TIMER
+    int i;
+    const float *src = (const float *)_src;
+    uint16_t *dst    = (uint16_t *)_dst;
+
+    for (i = 0; i < width; ++i) {
+        dst[i] = av_clip_uint16(lrintf(65535.0f * src[i]));
+    }
+    STOP_TIMER("47")
+}
+
+static av_always_inline void grayf32ToY16_bswap_c_vsx(uint8_t *_dst, const uint8_t *_src, const uint8_t *unused1,
+                                                  const uint8_t *unused2, int width, uint32_t *unused)
+{
+    START_TIMER
+    int i;
+    const uint32_t *src = (const uint32_t *)_src;
+    uint16_t *dst    = (uint16_t *)_dst;
+
+    for (i = 0; i < width; ++i) {
+        dst[i] = av_clip_uint16(lrintf(65535.0f * av_int2float(av_bswap32(src[i]))));
+    }
+    STOP_TIMER("48")
+}
+
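+/* Disabled VSX drafts of the two conversions above. Note that the unsigned
+ * clamp here maps negative inputs to 0xFFFF, whereas av_clip_uint16() maps
+ * them to 0. */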
+/*static av_always_inline 
+void grayf32ToY16_c_vsx(uint8_t *_dst, const uint8_t *_src, 
+                        const uint8_t *unused1, const uint8_t *unused2, 
+                        int width, uint32_t *unused)
+{
+    START_TIMER
+    int i, width_adj;
+    vector float v_rd0, v_rd1;
+    vector signed int v_rd00, v_rd01, v_rd02, v_rd03;
+    vector unsigned short v_dst;  
+    const float *src = (const float *)_src;
+    uint16_t *dst        = (uint16_t *)_dst;
+
+    uintptr_t dst_addr = (uintptr_t)_dst;
+    uintptr_t src_addr = (uintptr_t)_src;
+
+
+    width_adj = width&(~(int)0x07);
+
+    for (i = 0; i < width_adj; i+=8) {
+        v_rd0 = vec_vsx_ld(0, (float *)src_addr);
+        v_rd1 = vec_vsx_ld(0, (float *)(src_addr+16));
+
+        v_rd0 = vec_rint(vec_mul(v_rd0, vec_splats((float)65535.0f)));
+        v_rd1 = vec_rint(vec_mul(v_rd1, vec_splats((float)65535.0f)));
+        v_rd00 = (vector signed int)vec_cts(v_rd0, 0);
+        v_rd01 = (vector signed int)vec_cts(v_rd1, 0);
+        v_rd02 = (vector signed int)vec_cmpgt((vector unsigned int)v_rd00, 
+                                    vec_splats((unsigned int)0xFFFF));
+        v_rd03 = (vector signed int)vec_cmpgt((vector unsigned int)v_rd01, 
+                                    vec_splats((unsigned int)0xFFFF));
+        v_rd00 = vec_or(v_rd00, v_rd02);
+        v_rd01 = vec_or(v_rd01, v_rd03);
+
+        v_dst = (vector unsigned short)vec_perm(v_rd00, v_rd01, 
+                ((vector unsigned char){0, 1, 4, 5, 8, 9, 12, 13, 16, 17, 20, 21, 24, 25, 28, 29}));
+        vec_vsx_st((vector unsigned char)v_dst, 0, (unsigned char *)dst_addr); 
+
+        src_addr += 32;
+        dst_addr += 16;
+    }
+    for (i = width_adj; i < width; i++){
+        dst[i] = av_clip_uint16(lrintf(65535.0f * src[i]));
+    }
+    STOP_TIMER("47_vsx")
+}
+static av_always_inline 
+void grayf32ToY16_bswap_c_vsx(uint8_t *_dst, const uint8_t *_src, 
+                              const uint8_t *unused1, const uint8_t *unused2, 
+                              int width, uint32_t *unused)
+{
+    START_TIMER
+    int i, width_adj;
+    vector signed int v_rd0, v_rd1, v_rd2, v_rd3;
+    vector float v_rd00, v_rd01;
+    vector unsigned short v_dst;  
+    const uint32_t *src = (const uint32_t *)_src;
+    uint16_t *dst        = (uint16_t *)_dst;
+
+    uintptr_t dst_addr = (uintptr_t)_dst;
+    uintptr_t src_addr = (uintptr_t)_src;
+
+
+    width_adj = width&(~(int)0x07);
+
+    for (i = 0; i < width_adj; i+=8) {
+        v_rd0 = vec_vsx_ld(0, (int *)src_addr);
+        v_rd1 = vec_vsx_ld(0, (int *)(src_addr+16));
+
+        v_rd0 = vec_perm(v_rd0, v_rd0, 
+                ((vector unsigned char){3, 2, 1, 0, 7, 6, 5, 4, 11, 10, 9, 8, 15, 14, 13, 12}));
+        v_rd1 = vec_perm(v_rd1, v_rd1, 
+                ((vector unsigned char){3, 2, 1, 0, 7, 6, 5, 4, 11, 10, 9, 8, 15, 14, 13, 12}));
+        v_rd00 = vec_round(vec_mul((vector float)v_rd0, vec_splats((float)65535.0f)));
+        v_rd01 = vec_round(vec_mul((vector float)v_rd1, vec_splats((float)65535.0f)));
+        
+
+        v_rd0 = vec_cts(v_rd00, 0);
+        v_rd1 = vec_cts(v_rd01, 0);
+        v_rd2 = (vector signed int)vec_cmpgt((vector unsigned int)v_rd0, 
+                                   vec_splats((unsigned int)0xFFFF));
+        v_rd3 = (vector signed int)vec_cmpgt((vector unsigned int)v_rd1, 
+                                   vec_splats((unsigned int)0xFFFF));
+        v_rd0 = vec_or(v_rd0, v_rd2);
+        v_rd1 = vec_or(v_rd1, v_rd3);
+
+        
+        
+        v_dst = (vector unsigned short)vec_perm(v_rd0, v_rd1, 
+                ((vector unsigned char){0, 1, 4, 5, 8, 9, 12, 13, 16, 17, 20, 21, 24, 25, 28, 29}));
+        vec_vsx_st((vector unsigned char)v_dst, 0, (unsigned char *)dst_addr); 
+
+        src_addr += 32;
+        dst_addr += 16;
+    }
+    for (i = width_adj; i < width; i++){
+    
+        dst[i] = av_clip_uint16(lrintf(65535.0f * av_int2float(av_bswap32(src[i]))));
+    }
+    STOP_TIMER("48_vsx")
+}*/
+
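+/* Stamp out per-depth, per-endianness wrappers around the generic 16-bit
+ * planar RGB converters above. */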
+#define rgb9plus_planar_funcs_endian(nbits, endian_name, endian)                                    \
+static void planar_rgb##nbits##endian_name##_to_y_vsx(uint8_t *dst, const uint8_t *src[4],              \
+                                                  int w, int32_t *rgb2yuv)                          \
+{                                                                                                   \
+    planar_rgb16_to_y_vsx(dst, src, w, nbits, endian, rgb2yuv);                                         \
+}                                                                                                   \
+static void planar_rgb##nbits##endian_name##_to_uv_vsx(uint8_t *dstU, uint8_t *dstV,                    \
+                                                   const uint8_t *src[4], int w, int32_t *rgb2yuv)  \
+{                                                                                                   \
+    planar_rgb16_to_uv_vsx(dstU, dstV, src, w, nbits, endian, rgb2yuv);                                 \
+}
+
+
+#define rgb9plus_planar_transparency_funcs(nbits)                           \
+static void planar_rgb##nbits##le_to_a_vsx(uint8_t *dst, const uint8_t *src[4], \
+                                       int w, int32_t *rgb2yuv)             \
+{                                                                           \
+    planar_rgb16_to_a_vsx(dst, src, w, nbits, 0, rgb2yuv);                      \
+}                                                                           \
+static void planar_rgb##nbits##be_to_a_vsx(uint8_t *dst, const uint8_t *src[4], \
+                                       int w, int32_t *rgb2yuv)             \
+{                                                                           \
+    planar_rgb16_to_a_vsx(dst, src, w, nbits, 1, rgb2yuv);                      \
+}
+
+#define rgb9plus_planar_funcs(nbits)            \
+    rgb9plus_planar_funcs_endian(nbits, le, 0)  \
+    rgb9plus_planar_funcs_endian(nbits, be, 1)
+
+rgb9plus_planar_funcs(9)
+rgb9plus_planar_funcs(10)
+rgb9plus_planar_funcs(12)
+rgb9plus_planar_funcs(14)
+rgb9plus_planar_funcs(16)
+
+rgb9plus_planar_transparency_funcs(10)
+rgb9plus_planar_transparency_funcs(12)
+rgb9plus_planar_transparency_funcs(16)
+#endif //!HAVE_BIGENDIAN
+#endif //HAVE_VSX 
+av_cold void ff_sws_init_input_funcs_vsx(SwsContext *c)
+{
+#if HAVE_VSX
+#if !HAVE_BIGENDIAN
+    enum AVPixelFormat srcFormat = c->srcFormat;
+
+    if (!(av_get_cpu_flags() & AV_CPU_FLAG_VSX))
+        return;
+
+    c->chrToYV12 = NULL;
+    switch (srcFormat) {
+    case AV_PIX_FMT_YUYV422:
+        c->chrToYV12 = yuy2ToUV_c_vsx;
+        break;
+    case AV_PIX_FMT_YVYU422:
+        c->chrToYV12 = yvy2ToUV_c_vsx;
+        break;
+    case AV_PIX_FMT_UYVY422:
+        c->chrToYV12 = uyvyToUV_c_vsx;
+        break;
+    case AV_PIX_FMT_NV12:
+    case AV_PIX_FMT_NV24:
+        c->chrToYV12 = nv12ToUV_c_vsx;
+        break;
+    case AV_PIX_FMT_NV21:
+    case AV_PIX_FMT_NV42:
+        c->chrToYV12 = nv21ToUV_c_vsx;
+        break;
+    case AV_PIX_FMT_RGB8:
+    case AV_PIX_FMT_BGR8:
+    case AV_PIX_FMT_PAL8:
+    case AV_PIX_FMT_BGR4_BYTE:
+    case AV_PIX_FMT_RGB4_BYTE:
+        c->chrToYV12 = palToUV_c_vsx;
+        break;
+    case AV_PIX_FMT_GBRP9LE:
+        c->readChrPlanar = planar_rgb9le_to_uv_vsx;
+        break;
+    case AV_PIX_FMT_GBRAP10LE:
+    case AV_PIX_FMT_GBRP10LE:
+        c->readChrPlanar = planar_rgb10le_to_uv_vsx;
+        break;
+    case AV_PIX_FMT_GBRAP12LE:
+    case AV_PIX_FMT_GBRP12LE:
+        c->readChrPlanar = planar_rgb12le_to_uv_vsx;
+        break;
+    case AV_PIX_FMT_GBRP14LE:
+        c->readChrPlanar = planar_rgb14le_to_uv_vsx;
+        break;
+    case AV_PIX_FMT_GBRAP16LE:
+    case AV_PIX_FMT_GBRP16LE:
+        c->readChrPlanar = planar_rgb16le_to_uv_vsx;
+        break;
+    case AV_PIX_FMT_GBRP9BE:
+        c->readChrPlanar = planar_rgb9be_to_uv_vsx;
+        break;
+    case AV_PIX_FMT_GBRAP10BE:
+    case AV_PIX_FMT_GBRP10BE:
+        c->readChrPlanar = planar_rgb10be_to_uv_vsx;
+        break;
+    case AV_PIX_FMT_GBRAP12BE:
+    case AV_PIX_FMT_GBRP12BE:
+        c->readChrPlanar = planar_rgb12be_to_uv_vsx;
+        break;
+    case AV_PIX_FMT_GBRP14BE:
+        c->readChrPlanar = planar_rgb14be_to_uv_vsx;
+        break;
+    case AV_PIX_FMT_GBRAP16BE:
+    case AV_PIX_FMT_GBRP16BE:
+        c->readChrPlanar = planar_rgb16be_to_uv_vsx;
+        break;
+    case AV_PIX_FMT_GBRAP:
+    case AV_PIX_FMT_GBRP:
+        c->readChrPlanar = planar_rgb_to_uv_vsx;
+        break;
+    case AV_PIX_FMT_YUV420P9BE:
+    case AV_PIX_FMT_YUV422P9BE:
+    case AV_PIX_FMT_YUV444P9BE:
+    case AV_PIX_FMT_YUV420P10BE:
+    case AV_PIX_FMT_YUV422P10BE:
+    case AV_PIX_FMT_YUV440P10BE:
+    case AV_PIX_FMT_YUV444P10BE:
+    case AV_PIX_FMT_YUV420P12BE:
+    case AV_PIX_FMT_YUV422P12BE:
+    case AV_PIX_FMT_YUV440P12BE:
+    case AV_PIX_FMT_YUV444P12BE:
+    case AV_PIX_FMT_YUV420P14BE:
+    case AV_PIX_FMT_YUV422P14BE:
+    case AV_PIX_FMT_YUV444P14BE:
+    case AV_PIX_FMT_YUV420P16BE:
+    case AV_PIX_FMT_YUV422P16BE:
+    case AV_PIX_FMT_YUV444P16BE:
+
+    case AV_PIX_FMT_YUVA420P9BE:
+    case AV_PIX_FMT_YUVA422P9BE:
+    case AV_PIX_FMT_YUVA444P9BE:
+    case AV_PIX_FMT_YUVA420P10BE:
+    case AV_PIX_FMT_YUVA422P10BE:
+    case AV_PIX_FMT_YUVA444P10BE:
+    case AV_PIX_FMT_YUVA422P12BE:
+    case AV_PIX_FMT_YUVA444P12BE:
+    case AV_PIX_FMT_YUVA420P16BE:
+    case AV_PIX_FMT_YUVA422P16BE:
+    case AV_PIX_FMT_YUVA444P16BE:
+        c->chrToYV12 = bswap16UV_c_vsx;
+        break;
+    case AV_PIX_FMT_AYUV64LE:
+        c->chrToYV12 = read_ayuv64le_UV_c_vsx;
+        break;
+    case AV_PIX_FMT_P010LE:
+        c->chrToYV12 = p010LEToUV_c_vsx;
+        break;
+    case AV_PIX_FMT_P010BE:
+        c->chrToYV12 = p010BEToUV_c_vsx;
+        break;
+    case AV_PIX_FMT_P016LE:
+        c->chrToYV12 = p016LEToUV_c_vsx;
+        break;
+    case AV_PIX_FMT_P016BE:
+        c->chrToYV12 = p016BEToUV_c_vsx;
+        break;
+    }
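+    /* horizontally subsampled chroma uses the _half readers, which take two
+     * adjacent source pixels per output chroma sample */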
+    if (c->chrSrcHSubSample) {
+        switch (srcFormat) {
+        case AV_PIX_FMT_RGBA64BE:
+            c->chrToYV12 = rgb64BEToUV_half_c_vsx;
+            break;
+        case AV_PIX_FMT_RGBA64LE:
+            c->chrToYV12 = rgb64LEToUV_half_c_vsx;
+            break;
+        case AV_PIX_FMT_BGRA64BE:
+            c->chrToYV12 = bgr64BEToUV_half_c_vsx;
+            break;
+        case AV_PIX_FMT_BGRA64LE:
+            c->chrToYV12 = bgr64LEToUV_half_c_vsx;
+            break;
+        case AV_PIX_FMT_RGB48BE:
+            c->chrToYV12 = rgb48BEToUV_half_c_vsx;
+            break;
+        case AV_PIX_FMT_RGB48LE:
+            c->chrToYV12 = rgb48LEToUV_half_c_vsx;
+            break;
+        case AV_PIX_FMT_BGR48BE:
+            c->chrToYV12 = bgr48BEToUV_half_c_vsx;
+            break;
+        case AV_PIX_FMT_BGR48LE:
+            c->chrToYV12 = bgr48LEToUV_half_c_vsx;
+            break;
+        case AV_PIX_FMT_RGB32:
+            c->chrToYV12 = bgr32ToUV_half_c_vsx;
+            break;
+        case AV_PIX_FMT_RGB32_1:
+            c->chrToYV12 = bgr321ToUV_half_c_vsx;
+            break;
+        case AV_PIX_FMT_BGR24:
+            c->chrToYV12 = bgr24ToUV_half_c_vsx;
+            break;
+        case AV_PIX_FMT_BGR565LE:
+            c->chrToYV12 = bgr16leToUV_half_c_vsx;
+            break;
+        case AV_PIX_FMT_BGR565BE:
+            c->chrToYV12 = bgr16beToUV_half_c_vsx;
+            break;
+        case AV_PIX_FMT_BGR555LE:
+            c->chrToYV12 = bgr15leToUV_half_c_vsx;
+            break;
+        case AV_PIX_FMT_BGR555BE:
+            c->chrToYV12 = bgr15beToUV_half_c_vsx;
+            break;
+        case AV_PIX_FMT_GBRAP:
+        case AV_PIX_FMT_GBRP:
+            c->chrToYV12 = gbr24pToUV_half_c_vsx;
+            break;
+        case AV_PIX_FMT_BGR444LE:
+            c->chrToYV12 = bgr12leToUV_half_c_vsx;
+            break;
+        case AV_PIX_FMT_BGR444BE:
+            c->chrToYV12 = bgr12beToUV_half_c_vsx;
+            break;
+        case AV_PIX_FMT_BGR32:
+            c->chrToYV12 = rgb32ToUV_half_c_vsx;
+            break;
+        case AV_PIX_FMT_BGR32_1:
+            c->chrToYV12 = rgb321ToUV_half_c_vsx;
+            break;
+        case AV_PIX_FMT_RGB24:
+            c->chrToYV12 = rgb24ToUV_half_c_vsx;
+            break;
+        case AV_PIX_FMT_RGB565LE:
+            c->chrToYV12 = rgb16leToUV_half_c_vsx;
+            break;
+        case AV_PIX_FMT_RGB565BE:
+            c->chrToYV12 = rgb16beToUV_half_c_vsx;
+            break;
+        case AV_PIX_FMT_RGB555LE:
+            c->chrToYV12 = rgb15leToUV_half_c_vsx;
+            break;
+        case AV_PIX_FMT_RGB555BE:
+            c->chrToYV12 = rgb15beToUV_half_c_vsx;
+            break;
+        case AV_PIX_FMT_RGB444LE:
+            c->chrToYV12 = rgb12leToUV_half_c_vsx;
+            break;
+        case AV_PIX_FMT_RGB444BE:
+            c->chrToYV12 = rgb12beToUV_half_c_vsx;
+            break;
+        }
+    } else {
+        switch (srcFormat) {
+        case AV_PIX_FMT_RGBA64BE:
+            c->chrToYV12 = rgb64BEToUV_c_vsx;
+            break;
+        case AV_PIX_FMT_RGBA64LE:
+            c->chrToYV12 = rgb64LEToUV_c_vsx;
+            break;
+        case AV_PIX_FMT_BGRA64BE:
+            c->chrToYV12 = bgr64BEToUV_c_vsx;
+            break;
+        case AV_PIX_FMT_BGRA64LE:
+            c->chrToYV12 = bgr64LEToUV_c_vsx;
+            break;
+        case AV_PIX_FMT_RGB48BE:
+            c->chrToYV12 = rgb48BEToUV_c_vsx;
+            break;
+        case AV_PIX_FMT_RGB48LE:
+            c->chrToYV12 = rgb48LEToUV_c_vsx;
+            break;
+        case AV_PIX_FMT_BGR48BE:
+            c->chrToYV12 = bgr48BEToUV_c_vsx;
+            break;
+        case AV_PIX_FMT_BGR48LE:
+            c->chrToYV12 = bgr48LEToUV_c_vsx;
+            break;
+        case AV_PIX_FMT_RGB32:
+            c->chrToYV12 = bgr32ToUV_c_vsx;
+            break;
+        case AV_PIX_FMT_RGB32_1:
+            c->chrToYV12 = bgr321ToUV_c_vsx;
+            break;
+        case AV_PIX_FMT_BGR24:
+            c->chrToYV12 = bgr24ToUV_c_vsx;
+            break;
+        case AV_PIX_FMT_BGR565LE:
+            c->chrToYV12 = bgr16leToUV_c_vsx;
+            break;
+        case AV_PIX_FMT_BGR565BE:
+            c->chrToYV12 = bgr16beToUV_c_vsx;
+            break;
+        case AV_PIX_FMT_BGR555LE:
+            c->chrToYV12 = bgr15leToUV_c_vsx;
+            break;
+        case AV_PIX_FMT_BGR555BE:
+            c->chrToYV12 = bgr15beToUV_c_vsx;
+            break;
+        case AV_PIX_FMT_BGR444LE:
+            c->chrToYV12 = bgr12leToUV_c_vsx;
+            break;
+        case AV_PIX_FMT_BGR444BE:
+            c->chrToYV12 = bgr12beToUV_c_vsx;
+            break;
+        case AV_PIX_FMT_BGR32:
+            c->chrToYV12 = rgb32ToUV_c_vsx;
+            break;
+        case AV_PIX_FMT_BGR32_1:
+            c->chrToYV12 = rgb321ToUV_c_vsx;
+            break;
+        case AV_PIX_FMT_RGB24:
+            c->chrToYV12 = rgb24ToUV_c_vsx;
+            break;
+        case AV_PIX_FMT_RGB565LE:
+            c->chrToYV12 = rgb16leToUV_c_vsx;
+            break;
+        case AV_PIX_FMT_RGB565BE:
+            c->chrToYV12 = rgb16beToUV_c_vsx;
+            break;
+        case AV_PIX_FMT_RGB555LE:
+            c->chrToYV12 = rgb15leToUV_c_vsx;
+            break;
+        case AV_PIX_FMT_RGB555BE:
+            c->chrToYV12 = rgb15beToUV_c_vsx;
+            break;
+        case AV_PIX_FMT_RGB444LE:
+            c->chrToYV12 = rgb12leToUV_c_vsx;
+            break;
+        case AV_PIX_FMT_RGB444BE:
+            c->chrToYV12 = rgb12beToUV_c_vsx;
+            break;
+        }
+    }
+
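+    /* luma (and planar alpha) readers */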
+    c->lumToYV12 = NULL;
+    c->alpToYV12 = NULL;
+    switch (srcFormat) {
+    case AV_PIX_FMT_GBRP9LE:
+        c->readLumPlanar = planar_rgb9le_to_y_vsx;
+        break;
+    case AV_PIX_FMT_GBRAP10LE:
+        c->readAlpPlanar = planar_rgb10le_to_a_vsx;
+    case AV_PIX_FMT_GBRP10LE:
+        c->readLumPlanar = planar_rgb10le_to_y_vsx;
+        break;
+    case AV_PIX_FMT_GBRAP12LE:
+        c->readAlpPlanar = planar_rgb12le_to_a_vsx;
+    case AV_PIX_FMT_GBRP12LE:
+        c->readLumPlanar = planar_rgb12le_to_y_vsx;
+        break;
+    case AV_PIX_FMT_GBRP14LE:
+        c->readLumPlanar = planar_rgb14le_to_y_vsx;
+        break;
+    case AV_PIX_FMT_GBRAP16LE:
+        c->readAlpPlanar = planar_rgb16le_to_a_vsx;
+    case AV_PIX_FMT_GBRP16LE:
+        c->readLumPlanar = planar_rgb16le_to_y_vsx;
+        break;
+    case AV_PIX_FMT_GBRP9BE:
+        c->readLumPlanar = planar_rgb9be_to_y_vsx;
+        break;
+    case AV_PIX_FMT_GBRAP10BE:
+        c->readAlpPlanar = planar_rgb10be_to_a_vsx;
+    case AV_PIX_FMT_GBRP10BE:
+        c->readLumPlanar = planar_rgb10be_to_y_vsx;
+        break;
+    case AV_PIX_FMT_GBRAP12BE:
+        c->readAlpPlanar = planar_rgb12be_to_a_vsx;
+    case AV_PIX_FMT_GBRP12BE:
+        c->readLumPlanar = planar_rgb12be_to_y_vsx;
+        break;
+    case AV_PIX_FMT_GBRP14BE:
+        c->readLumPlanar = planar_rgb14be_to_y_vsx;
+        break;
+    case AV_PIX_FMT_GBRAP16BE:
+        c->readAlpPlanar = planar_rgb16be_to_a_vsx;
+    case AV_PIX_FMT_GBRP16BE:
+        c->readLumPlanar = planar_rgb16be_to_y_vsx;
+        break;
+    case AV_PIX_FMT_GBRAP:
+        c->readAlpPlanar = planar_rgb_to_a_vsx;
+    case AV_PIX_FMT_GBRP:
+        c->readLumPlanar = planar_rgb_to_y_vsx;
+        break;
+
+    case AV_PIX_FMT_YUV420P9BE:
+    case AV_PIX_FMT_YUV422P9BE:
+    case AV_PIX_FMT_YUV444P9BE:
+    case AV_PIX_FMT_YUV420P10BE:
+    case AV_PIX_FMT_YUV422P10BE:
+    case AV_PIX_FMT_YUV440P10BE:
+    case AV_PIX_FMT_YUV444P10BE:
+    case AV_PIX_FMT_YUV420P12BE:
+    case AV_PIX_FMT_YUV422P12BE:
+    case AV_PIX_FMT_YUV440P12BE:
+    case AV_PIX_FMT_YUV444P12BE:
+    case AV_PIX_FMT_YUV420P14BE:
+    case AV_PIX_FMT_YUV422P14BE:
+    case AV_PIX_FMT_YUV444P14BE:
+    case AV_PIX_FMT_YUV420P16BE:
+    case AV_PIX_FMT_YUV422P16BE:
+    case AV_PIX_FMT_YUV444P16BE:
+
+    case AV_PIX_FMT_GRAY9BE:
+    case AV_PIX_FMT_GRAY10BE:
+    case AV_PIX_FMT_GRAY12BE:
+    case AV_PIX_FMT_GRAY14BE:
+    case AV_PIX_FMT_GRAY16BE:
+
+    case AV_PIX_FMT_P016BE:
+        c->lumToYV12 = bswap16Y_c_vsx;
+        break;
+    case AV_PIX_FMT_YUVA420P9BE:
+    case AV_PIX_FMT_YUVA422P9BE:
+    case AV_PIX_FMT_YUVA444P9BE:
+    case AV_PIX_FMT_YUVA420P10BE:
+    case AV_PIX_FMT_YUVA422P10BE:
+    case AV_PIX_FMT_YUVA444P10BE:
+    case AV_PIX_FMT_YUVA422P12BE:
+    case AV_PIX_FMT_YUVA444P12BE:
+    case AV_PIX_FMT_YUVA420P16BE:
+    case AV_PIX_FMT_YUVA422P16BE:
+    case AV_PIX_FMT_YUVA444P16BE:
+        c->lumToYV12 = bswap16Y_c_vsx;
+        c->alpToYV12 = bswap16Y_c_vsx;
+        break;
+    case AV_PIX_FMT_YA16LE:
+        c->lumToYV12 = read_ya16le_gray_c_vsx;
+        break;
+    case AV_PIX_FMT_YA16BE:
+        c->lumToYV12 = read_ya16be_gray_c_vsx;
+        break;
+    case AV_PIX_FMT_AYUV64LE:
+        c->lumToYV12 = read_ayuv64le_Y_c_vsx;
+        break;
+    case AV_PIX_FMT_YUYV422:
+    case AV_PIX_FMT_YVYU422:
+    case AV_PIX_FMT_YA8:
+        c->lumToYV12 = yuy2ToY_c_vsx;
+        break;
+    case AV_PIX_FMT_UYVY422:
+        c->lumToYV12 = uyvyToY_c_vsx;
+        break;
+    case AV_PIX_FMT_BGR24:
+        c->lumToYV12 = bgr24ToY_c_vsx;
+        break;
+    case AV_PIX_FMT_BGR565LE:
+        c->lumToYV12 = bgr16leToY_c_vsx;
+        break;
+    case AV_PIX_FMT_BGR565BE:
+        c->lumToYV12 = bgr16beToY_c_vsx;
+        break;
+    case AV_PIX_FMT_BGR555LE:
+        c->lumToYV12 = bgr15leToY_c_vsx;
+        break;
+    case AV_PIX_FMT_BGR555BE:
+        c->lumToYV12 = bgr15beToY_c_vsx;
+        break;
+    case AV_PIX_FMT_BGR444LE:
+        c->lumToYV12 = bgr12leToY_c_vsx;
+        break;
+    case AV_PIX_FMT_BGR444BE:
+        c->lumToYV12 = bgr12beToY_c_vsx;
+        break;
+    case AV_PIX_FMT_RGB24:
+        c->lumToYV12 = rgb24ToY_c_vsx;
+        break;
+    case AV_PIX_FMT_RGB565LE:
+        c->lumToYV12 = rgb16leToY_c_vsx;
+        break;
+    case AV_PIX_FMT_RGB565BE:
+        c->lumToYV12 = rgb16beToY_c_vsx;
+        break;
+    case AV_PIX_FMT_RGB555LE:
+        c->lumToYV12 = rgb15leToY_c_vsx;
+        break;
+    case AV_PIX_FMT_RGB555BE:
+        c->lumToYV12 = rgb15beToY_c_vsx;
+        break;
+    case AV_PIX_FMT_RGB444LE:
+        c->lumToYV12 = rgb12leToY_c_vsx;
+        break;
+    case AV_PIX_FMT_RGB444BE:
+        c->lumToYV12 = rgb12beToY_c_vsx;
+        break;
+    case AV_PIX_FMT_RGB8:
+    case AV_PIX_FMT_BGR8:
+    case AV_PIX_FMT_PAL8:
+    case AV_PIX_FMT_BGR4_BYTE:
+    case AV_PIX_FMT_RGB4_BYTE:
+        c->lumToYV12 = palToY_c_vsx;
+        break;
+    case AV_PIX_FMT_MONOBLACK:
+        c->lumToYV12 = monoblack2Y_c_vsx;
+        break;
+    case AV_PIX_FMT_MONOWHITE:
+        c->lumToYV12 = monowhite2Y_c_vsx;
+        break;
+    case AV_PIX_FMT_RGB32:
+        c->lumToYV12 = bgr32ToY_c_vsx;
+        break;
+    case AV_PIX_FMT_RGB32_1:
+        c->lumToYV12 = bgr321ToY_c_vsx;
+        break;
+    case AV_PIX_FMT_BGR32:
+        c->lumToYV12 = rgb32ToY_c_vsx;
+        break;
+    case AV_PIX_FMT_BGR32_1:
+        c->lumToYV12 = rgb321ToY_c_vsx;
+        break;
+    case AV_PIX_FMT_RGB48BE:
+        c->lumToYV12 = rgb48BEToY_c_vsx;
+        break;
+    case AV_PIX_FMT_RGB48LE:
+        c->lumToYV12 = rgb48LEToY_c_vsx;
+        break;
+    case AV_PIX_FMT_BGR48BE:
+        c->lumToYV12 = bgr48BEToY_c_vsx;
+        break;
+    case AV_PIX_FMT_BGR48LE:
+        c->lumToYV12 = bgr48LEToY_c_vsx;
+        break;
+    case AV_PIX_FMT_RGBA64BE:
+        c->lumToYV12 = rgb64BEToY_c_vsx;
+        break;
+    case AV_PIX_FMT_RGBA64LE:
+        c->lumToYV12 = rgb64LEToY_c_vsx;
+        break;
+    case AV_PIX_FMT_BGRA64BE:
+        c->lumToYV12 = bgr64BEToY_c_vsx;
+        break;
+    case AV_PIX_FMT_BGRA64LE:
+        c->lumToYV12 = bgr64LEToY_c_vsx;
+        break;
+    case AV_PIX_FMT_P010LE:
+        c->lumToYV12 = p010LEToY_c_vsx;
+        break;
+    case AV_PIX_FMT_P010BE:
+        c->lumToYV12 = p010BEToY_c_vsx;
+        break;
+    case AV_PIX_FMT_GRAYF32LE:
+        c->lumToYV12 = grayf32ToY16_c_vsx;
+        break;
+    case AV_PIX_FMT_GRAYF32BE:
+        c->lumToYV12 = grayf32ToY16_bswap_c_vsx;
+        break;
+    }
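+    /* alpha extraction, set up only when the destination needs alpha */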
+    if (c->needAlpha) {
+        if (is16BPS(srcFormat) || isNBPS(srcFormat)) {
+            if (HAVE_BIGENDIAN == !isBE(srcFormat) && !c->readAlpPlanar)
+                c->alpToYV12 = bswap16Y_c_vsx;
+        }
+        switch (srcFormat) {
+        case AV_PIX_FMT_BGRA64LE:
+        case AV_PIX_FMT_RGBA64LE:  c->alpToYV12 = rgba64leToA_c_vsx; break;
+        case AV_PIX_FMT_BGRA64BE:
+        case AV_PIX_FMT_RGBA64BE:  c->alpToYV12 = rgba64beToA_c_vsx; break;
+        case AV_PIX_FMT_BGRA:
+        case AV_PIX_FMT_RGBA:
+            c->alpToYV12 = rgbaToA_c_vsx;
+            break;
+        case AV_PIX_FMT_ABGR:
+        case AV_PIX_FMT_ARGB:
+            c->alpToYV12 = abgrToA_c_vsx;
+            break;
+        case AV_PIX_FMT_YA8:
+            c->alpToYV12 = uyvyToY_c_vsx;
+            break;
+        case AV_PIX_FMT_YA16LE:
+            c->alpToYV12 = read_ya16le_alpha_c_vsx;
+            break;
+        case AV_PIX_FMT_YA16BE:
+            c->alpToYV12 = read_ya16be_alpha_c_vsx;
+            break;
+        case AV_PIX_FMT_AYUV64LE:
+            c->alpToYV12 = read_ayuv64le_A_c_vsx;
+            break;
+        case AV_PIX_FMT_PAL8:
+            c->alpToYV12 = palToA_c_vsx;
+            break;
+        }
+    }
+#endif //!HAVE_BIGENDIAN
+#endif //HAVE_VSX 
+}
diff --git a/libswscale/swscale.c b/libswscale/swscale.c
index 8436f05..de7d241 100644
--- a/libswscale/swscale.c
+++ b/libswscale/swscale.c
@@ -566,6 +566,8 @@  static av_cold void sws_init_swscale(SwsContext *c)
                              &c->yuv2packed2, &c->yuv2packedX, &c->yuv2anyX);
 
     ff_sws_init_input_funcs(c);
+    if (ARCH_PPC)
+        ff_sws_init_input_funcs_vsx(c);
 
     if (c->srcBpc == 8) {
         if (c->dstBpc <= 14) {
diff --git a/libswscale/swscale_internal.h b/libswscale/swscale_internal.h
index a59d127..e5f0e9d 100644
--- a/libswscale/swscale_internal.h
+++ b/libswscale/swscale_internal.h
@@ -859,6 +859,7 @@  void ff_get_unscaled_swscale_aarch64(SwsContext *c);
 SwsFunc ff_getSwsFunc(SwsContext *c);
 
 void ff_sws_init_input_funcs(SwsContext *c);
+void ff_sws_init_input_funcs_vsx(SwsContext *c);
 void ff_sws_init_output_funcs(SwsContext *c,
                               yuv2planar1_fn *yuv2plane1,
                               yuv2planarX_fn *yuv2planeX,