@@ -171,23 +171,27 @@ static const uint8_t luma_mask_arr[16 * 8] = {
out0_m; \
} )
-static void avc_luma_hv_qrt_4w_msa(const uint8_t *src_x, const uint8_t *src_y,
- int32_t src_stride, uint8_t *dst,
- int32_t dst_stride, int32_t height)
+static void avc_luma_hv_qrt_4x4_msa(const uint8_t *src_x, const uint8_t *src_y,
+ uint8_t *dst, int32_t stride)
{
- uint32_t loop_cnt;
- v16i8 src_hz0, src_hz1, src_hz2, src_hz3;
- v16i8 src_vt0, src_vt1, src_vt2, src_vt3, src_vt4;
- v16i8 src_vt5, src_vt6, src_vt7, src_vt8;
- v16i8 mask0, mask1, mask2;
- v8i16 hz_out0, hz_out1, vert_out0, vert_out1;
- v8i16 out0, out1;
+ const int16_t filt_const0 = 0xfb01;
+ const int16_t filt_const1 = 0x1414;
+ const int16_t filt_const2 = 0x1fb;
v16u8 out;
+ v16i8 src_hz0, src_hz1, src_hz2, src_hz3, src_vt7, src_vt8;
+ v16i8 src_vt0, src_vt1, src_vt2, src_vt3, src_vt4, src_vt5, src_vt6;
+ v16i8 src_vt10_r, src_vt32_r, src_vt54_r, src_vt76_r;
+ v16i8 mask0, mask1, mask2, filt0, filt1, filt2;
+ v8i16 hz_out0, hz_out1, vt_out0, vt_out1, out0, out1;
+
+ filt0 = (v16i8) __msa_fill_h(filt_const0);
+ filt1 = (v16i8) __msa_fill_h(filt_const1);
+ filt2 = (v16i8) __msa_fill_h(filt_const2);
LD_SB3(&luma_mask_arr[48], 16, mask0, mask1, mask2);
- LD_SB5(src_y, src_stride, src_vt0, src_vt1, src_vt2, src_vt3, src_vt4);
- src_y += (5 * src_stride);
+ LD_SB5(src_y, stride, src_vt0, src_vt1, src_vt2, src_vt3, src_vt4);
+ src_y += (5 * stride);
src_vt0 = (v16i8) __msa_insve_w((v4i32) src_vt0, 1, (v4i32) src_vt1);
src_vt1 = (v16i8) __msa_insve_w((v4i32) src_vt1, 1, (v4i32) src_vt2); @@ -196,149 +200,237 @@ static void avc_luma_hv_qrt_4w_msa(const uint8_t *src_x, const uint8_t *src_y,