@@ -75,22 +75,6 @@ static const uint8_t luma_mask_arr[16 * 8] = {
DPADD_SB2_SH(tmp0_m, tmp1_m, plus20b_m, plus20b_m, out1, out2); \
}
-#define AVC_CALC_DPADD_B_6PIX_2COEFF_R_SH(vec0, vec1, vec2, vec3, vec4, vec5) \
-( { \
- v8i16 tmp1_m; \
- v16i8 tmp0_m, tmp2_m; \
- v16i8 minus5b_m = __msa_ldi_b(-5); \
- v16i8 plus20b_m = __msa_ldi_b(20); \
- \
- tmp1_m = (v8i16) __msa_ilvr_b((v16i8) vec5, (v16i8) vec0); \
- tmp1_m = __msa_hadd_s_h((v16i8) tmp1_m, (v16i8) tmp1_m); \
- \
- ILVR_B2_SB(vec4, vec1, vec3, vec2, tmp0_m, tmp2_m); \
- DPADD_SB2_SH(tmp0_m, tmp2_m, minus5b_m, plus20b_m, tmp1_m, tmp1_m); \
- \
- tmp1_m; \
-} )
-
#define AVC_CALC_DPADD_H_6PIX_2COEFF_R_SH(vec0, vec1, vec2, vec3, vec4, vec5) \
( { \
v4i32 tmp1_m; \
@@ -1157,128 +1141,6 @@ static void avc_luma_vt_qrt_and_aver_dst_16x16_msa(const uint8_t *src,
}
}
-static void avc_luma_mid_and_aver_dst_4x4_msa(const uint8_t *src,
- int32_t src_stride,
- uint8_t *dst, int32_t dst_stride)
-{
- v16i8 src0, src1, src2, src3, src4;
- v16i8 mask0, mask1, mask2;
- v8i16 hz_out0, hz_out1, hz_out2, hz_out3;
- v8i16 hz_out4, hz_out5, hz_out6, hz_out7, hz_out8;
- v8i16 res0, res1, res2, res3;
- v16u8 dst0, dst1, dst2, dst3;
- v16u8 tmp0, tmp1, tmp2, tmp3;
-
- LD_SB3(&luma_mask_arr[48], 16, mask0, mask1, mask2);
- LD_SB5(src, src_stride, src0, src1, src2, src3, src4);
- src += (5 * src_stride);
-
- XORI_B5_128_SB(src0, src1, src2, src3, src4);
-
- hz_out0 = AVC_XOR_VSHF_B_AND_APPLY_6TAP_HORIZ_FILT_SH(src0, src1,
- mask0, mask1, mask2);
- hz_out2 = AVC_XOR_VSHF_B_AND_APPLY_6TAP_HORIZ_FILT_SH(src2, src3,
- mask0, mask1, mask2);
-
- PCKOD_D2_SH(hz_out0, hz_out0, hz_out2, hz_out2, hz_out1, hz_out3);
-
- hz_out4 = AVC_HORZ_FILTER_SH(src4, src4, mask0, mask1, mask2);
-
- LD_SB4(src, src_stride, src0, src1, src2, src3);
- XORI_B4_128_SB(src0, src1, src2, src3);
-
- hz_out5 = AVC_XOR_VSHF_B_AND_APPLY_6TAP_HORIZ_FILT_SH(src0, src1,
- mask0, mask1, mask2);
- hz_out7 = AVC_XOR_VSHF_B_AND_APPLY_6TAP_HORIZ_FILT_SH(src2, src3,
- mask0, mask1, mask2);
-
- PCKOD_D2_SH(hz_out5, hz_out5, hz_out7, hz_out7, hz_out6, hz_out8);
-
- res0 = AVC_CALC_DPADD_H_6PIX_2COEFF_R_SH(hz_out0, hz_out1, hz_out2,
- hz_out3, hz_out4, hz_out5);
- res1 = AVC_CALC_DPADD_H_6PIX_2COEFF_R_SH(hz_out1, hz_out2, hz_out3,
- hz_out4, hz_out5, hz_out6);
- res2 = AVC_CALC_DPADD_H_6PIX_2COEFF_R_SH(hz_out2, hz_out3, hz_out4,
- hz_out5, hz_out6, hz_out7);
- res3 = AVC_CALC_DPADD_H_6PIX_2COEFF_R_SH(hz_out3, hz_out4, hz_out5,
- hz_out6, hz_out7, hz_out8);
- LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3);
- tmp0 = PCKEV_XORI128_UB(res0, res1);
- tmp1 = PCKEV_XORI128_UB(res2, res3);
- PCKEV_D2_UB(dst1, dst0, dst3, dst2, tmp2, tmp3);
- AVER_UB2_UB(tmp0, tmp2, tmp1, tmp3, tmp0, tmp1);
-
- ST4x4_UB(tmp0, tmp1, 0, 2, 0, 2, dst, dst_stride);
-}
-
-static void avc_luma_mid_and_aver_dst_8w_msa(const uint8_t *src,
- int32_t src_stride,
- uint8_t *dst, int32_t dst_stride,
- int32_t height)
-{
- uint32_t loop_cnt;
- v16i8 src0, src1, src2, src3, src4;
- v16i8 mask0, mask1, mask2;
- v8i16 hz_out0, hz_out1, hz_out2, hz_out3;
- v8i16 hz_out4, hz_out5, hz_out6, hz_out7, hz_out8;
- v16u8 dst0, dst1, dst2, dst3;
- v8i16 res0, res1, res2, res3;
-
- LD_SB3(&luma_mask_arr[0], 16, mask0, mask1, mask2);
-
- LD_SB5(src, src_stride, src0, src1, src2, src3, src4);
- XORI_B5_128_SB(src0, src1, src2, src3, src4);
- src += (5 * src_stride);
-
- hz_out0 = AVC_HORZ_FILTER_SH(src0, src0, mask0, mask1, mask2);
- hz_out1 = AVC_HORZ_FILTER_SH(src1, src1, mask0, mask1, mask2);
- hz_out2 = AVC_HORZ_FILTER_SH(src2, src2, mask0, mask1, mask2);
- hz_out3 = AVC_HORZ_FILTER_SH(src3, src3, mask0, mask1, mask2);
- hz_out4 = AVC_HORZ_FILTER_SH(src4, src4, mask0, mask1, mask2);
-
- for (loop_cnt = (height >> 2); loop_cnt--;) {
- LD_SB4(src, src_stride, src0, src1, src2, src3);
- XORI_B4_128_SB(src0, src1, src2, src3);
- src += (4 * src_stride);
-
- hz_out5 = AVC_HORZ_FILTER_SH(src0, src0, mask0, mask1, mask2);
- hz_out6 = AVC_HORZ_FILTER_SH(src1, src1, mask0, mask1, mask2);
- hz_out7 = AVC_HORZ_FILTER_SH(src2, src2, mask0, mask1, mask2);
- hz_out8 = AVC_HORZ_FILTER_SH(src3, src3, mask0, mask1, mask2);
-
- res0 = AVC_CALC_DPADD_H_6PIX_2COEFF_SH(hz_out0, hz_out1, hz_out2,
- hz_out3, hz_out4, hz_out5);
- res1 = AVC_CALC_DPADD_H_6PIX_2COEFF_SH(hz_out1, hz_out2, hz_out3,
- hz_out4, hz_out5, hz_out6);
- res2 = AVC_CALC_DPADD_H_6PIX_2COEFF_SH(hz_out2, hz_out3, hz_out4,
- hz_out5, hz_out6, hz_out7);
- res3 = AVC_CALC_DPADD_H_6PIX_2COEFF_SH(hz_out3, hz_out4, hz_out5,
- hz_out6, hz_out7, hz_out8);
- LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3);
- ILVR_D2_UB(dst1, dst0, dst3, dst2, dst0, dst1);
- CONVERT_UB_AVG_ST8x4_UB(res0, res1, res2, res3, dst0, dst1,
- dst, dst_stride);
-
- dst += (4 * dst_stride);
- hz_out3 = hz_out7;
- hz_out1 = hz_out5;
- hz_out5 = hz_out4;
- hz_out4 = hz_out8;
- hz_out2 = hz_out6;
- hz_out0 = hz_out5;
- }
-}
-
-static void avc_luma_mid_and_aver_dst_16x16_msa(const uint8_t *src,
- int32_t src_stride,
- uint8_t *dst,
- int32_t dst_stride)
-{
- avc_luma_mid_and_aver_dst_8w_msa(src, src_stride, dst, dst_stride, 16);
- avc_luma_mid_and_aver_dst_8w_msa(src + 8, src_stride, dst + 8, dst_stride,
- 16);
-}
-
static void avc_luma_midh_qrt_and_aver_dst_4w_msa(const uint8_t *src,
int32_t src_stride,
uint8_t *dst, @@ -1578,22 +1440,28 @@ static void avc_luma_midv_qrt_and_aver_dst_16w_msa(const uint8_t *src,