@@ -1124,24 +1124,25 @@ static void avc_chroma_hz_and_aver_dst_8w_msa(uint8_t *src, uint8_t *dst,
}
}
-static void avc_chroma_vt_and_aver_dst_2x2_msa(uint8_t *src, int32_t src_stride,
- uint8_t *dst, int32_t dst_stride,
- uint32_t coeff0, uint32_t coeff1)
+static void avc_chroma_vt_and_aver_dst_2x2_msa(uint8_t *src, uint8_t *dst,
+ int32_t stride, uint32_t coeff0,
+ uint32_t coeff1)
{
uint16_t out0, out1;
- uint32_t load0, load1;
v16i8 src0, src1, src2, tmp0, tmp1, res;
v16u8 dst_data = { 0 };
+ v8i16 out;
v8u16 res_r;
v16i8 coeff_vec0 = __msa_fill_b(coeff0);
v16i8 coeff_vec1 = __msa_fill_b(coeff1);
v16u8 coeff_vec = (v16u8) __msa_ilvr_b(coeff_vec0, coeff_vec1);
- LD_SB3(src, src_stride, src0, src1, src2);
- load0 = LW(dst);
- load1 = LW(dst + dst_stride);
+ LD_SB3(src, stride, src0, src1, src2);
+ out0 = LH(dst);
+ out1 = LH(dst + stride);
- INSERT_W2_UB(load0, load1, dst_data);
+ dst_data = (v16u8) __msa_insert_h((v8i16) dst_data, 0, out0);
+ dst_data = (v16u8) __msa_insert_h((v8i16) dst_data, 2, out1);
ILVR_B2_SB(src1, src0, src2, src1, tmp0, tmp1);
@@ -1151,20 +1152,20 @@ static void avc_chroma_vt_and_aver_dst_2x2_msa(uint8_t *src, int32_t src_stride,
res_r = (v8u16) __msa_srari_h((v8i16) res_r, 6);
res_r = __msa_sat_u_h(res_r, 7);
res = __msa_pckev_b((v16i8) res_r, (v16i8) res_r);
- dst_data = __msa_aver_u_b((v16u8) res, dst_data);
- out0 = __msa_copy_u_h((v8i16) dst_data, 0);
- out1 = __msa_copy_u_h((v8i16) dst_data, 2);
+ out = (v8i16) __msa_aver_u_b((v16u8) res, dst_data);
+ out0 = __msa_copy_u_h(out, 0);
+ out1 = __msa_copy_u_h(out, 2);
SH(out0, dst);
- dst += dst_stride;
+ dst += stride;
SH(out1, dst);
}
-static void avc_chroma_vt_and_aver_dst_2x4_msa(uint8_t *src, int32_t src_stride,
- uint8_t *dst, int32_t dst_stride,
- uint32_t coeff0, uint32_t coeff1)
+static void avc_chroma_vt_and_aver_dst_2x4_msa(uint8_t *src, uint8_t *dst,
+ int32_t stride, uint32_t coeff0,
+ uint32_t coeff1)
{
- uint32_t load0, load1;
+ uint16_t tp0, tp1, tp2, tp3;
v16i8 src0, src1, src2, src3, src4;
v16u8 tmp0, tmp1, tmp2, tmp3;
v8u16 res_r;
@@ -1174,19 +1175,16 @@ static void avc_chroma_vt_and_aver_dst_2x4_msa(uint8_t *src, int32_t src_stride,
v16u8 coeff_vec = (v16u8) __msa_ilvr_b(coeff_vec0, coeff_vec1);
v16u8 dst_data = { 0 };
- LD_SB5(src, src_stride, src0, src1, src2, src3, src4);
-
- load0 = LW(dst);
- load1 = LW(dst + dst_stride);
-
- dst_data = (v16u8) __msa_insert_h((v8i16) dst_data, 0, load0);
- dst_data = (v16u8) __msa_insert_h((v8i16) dst_data, 1, load1);
+ LD_SB5(src, stride, src0, src1, src2, src3, src4);
- load0 = LW(dst + 2 * dst_stride);
- load1 = LW(dst + 3 * dst_stride);
-
- dst_data = (v16u8) __msa_insert_h((v8i16) dst_data, 2, load0);
- dst_data = (v16u8) __msa_insert_h((v8i16) dst_data, 3, load1);
+ tp0 = LH(dst);
+ tp1 = LH(dst + stride);
+ tp2 = LH(dst + 2 * stride);
+ tp3 = LH(dst + 3 * stride);
+ dst_data = (v16u8) __msa_insert_h((v8i16) dst_data, 0, tp0);
+ dst_data = (v16u8) __msa_insert_h((v8i16) dst_data, 1, tp1);
+ dst_data = (v16u8) __msa_insert_h((v8i16) dst_data, 2, tp2);
+ dst_data = (v16u8) __msa_insert_h((v8i16) dst_data, 3, tp3);
ILVR_B4_UB(src1, src0, src2, src1, src3, src2, src4, src3,
tmp0, tmp1, tmp2, tmp3); @@ -1202,102 +1200,26 @@ static void avc_chroma_vt_and_aver_dst_2x4_msa(uint8_t *src, int32_t src_stride,