@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2015 Manojkumar Bhosale (Manojkumar.Bhosale@imgtec.com)
+ * Copyright (c) 2015-2017 Manojkumar Bhosale (Manojkumar.Bhosale@imgtec.com)
*
* This file is part of FFmpeg.
*
@@ -35,12 +35,14 @@ static void hevc_loopfilter_luma_hor_msa(uint8_t *src, int32_t stride,
uint8_t *q3 = src + (stride << 1) + stride;
uint8_t flag0, flag1;
int32_t dp00, dq00, dp30, dq30, d00, d30;
+ int32_t d0030, d0434;
int32_t dp04, dq04, dp34, dq34, d04, d34;
int32_t tc0, p_is_pcm0, q_is_pcm0, beta30, beta20, tc250;
int32_t tc4, p_is_pcm4, q_is_pcm4, tc254, tmp;
uint64_t dst_val0, dst_val1;
v16u8 dst0, dst1, dst2, dst3, dst4, dst5;
v2i64 cmp0, cmp1, cmp2, p_is_pcm_vec, q_is_pcm_vec;
+ v2i64 cmp3;
v8u16 temp0, temp1;
v8i16 temp2;
v8i16 tc_pos, tc_neg;
@@ -54,62 +56,86 @@ static void hevc_loopfilter_luma_hor_msa(uint8_t *src, int32_t stride,
dq30 = abs(q2[3] - (q1[3] << 1) + q0[3]);
d00 = dp00 + dq00;
d30 = dp30 + dq30;
- p_is_pcm0 = p_is_pcm[0];
- q_is_pcm0 = q_is_pcm[0];
dp04 = abs(p2[4] - (p1[4] << 1) + p0[4]);
dq04 = abs(q2[4] - (q1[4] << 1) + q0[4]);
dp34 = abs(p2[7] - (p1[7] << 1) + p0[7]);
dq34 = abs(q2[7] - (q1[7] << 1) + q0[7]);
d04 = dp04 + dq04;
d34 = dp34 + dq34;
+
+ p_is_pcm0 = p_is_pcm[0];
p_is_pcm4 = p_is_pcm[1];
+ q_is_pcm0 = q_is_pcm[0];
q_is_pcm4 = q_is_pcm[1];
- if (!p_is_pcm0 || !p_is_pcm4 || !q_is_pcm0 || !q_is_pcm4) {
- if (!(d00 + d30 >= beta) || !(d04 + d34 >= beta)) {
- p3_src = LD_UH(p3);
- p2_src = LD_UH(p2);
- p1_src = LD_UH(p1);
- p0_src = LD_UH(p0);
- q0_src = LD_UH(q0);
- q1_src = LD_UH(q1);
- q2_src = LD_UH(q2);
- q3_src = LD_UH(q3);
-
- tc0 = tc[0];
- beta30 = beta >> 3;
- beta20 = beta >> 2;
- tc250 = ((tc0 * 5 + 1) >> 1);
- tc4 = tc[1];
- tc254 = ((tc4 * 5 + 1) >> 1);
-
- flag0 = (abs(p3[0] - p0[0]) + abs(q3[0] - q0[0]) < beta30 &&
- abs(p0[0] - q0[0]) < tc250 &&
- abs(p3[3] - p0[3]) + abs(q3[3] - q0[3]) < beta30 &&
- abs(p0[3] - q0[3]) < tc250 &&
- (d00 << 1) < beta20 && (d30 << 1) < beta20);
- cmp0 = __msa_fill_d(flag0);
-
- flag1 = (abs(p3[4] - p0[4]) + abs(q3[4] - q0[4]) < beta30 &&
- abs(p0[4] - q0[4]) < tc254 &&
- abs(p3[7] - p0[7]) + abs(q3[7] - q0[7]) < beta30 &&
- abs(p0[7] - q0[7]) < tc254 &&
- (d04 << 1) < beta20 && (d34 << 1) < beta20);
- cmp1 = __msa_fill_d(flag1);
- cmp2 = __msa_ilvev_d(cmp1, cmp0);
- cmp2 = __msa_ceqi_d(cmp2, 0);
-
- ILVR_B8_UH(zero, p3_src, zero, p2_src, zero, p1_src, zero, p0_src,
- zero, q0_src, zero, q1_src, zero, q2_src, zero, q3_src,
- p3_src, p2_src, p1_src, p0_src, q0_src, q1_src, q2_src,
- q3_src);
-
- cmp0 = (v2i64) __msa_fill_h(tc0);
- cmp1 = (v2i64) __msa_fill_h(tc4);
- tc_pos = (v8i16) __msa_ilvev_d(cmp1, cmp0);
+ cmp0 = __msa_fill_d(p_is_pcm0);
+ cmp1 = __msa_fill_d(p_is_pcm4);
+ p_is_pcm_vec = __msa_ilvev_d(cmp1, cmp0);
+ p_is_pcm_vec = __msa_ceqi_d(p_is_pcm_vec, 0);
+
+ d0030 = (d00 + d30) >= beta;
+ d0434 = (d04 + d34) >= beta;
+
+ cmp0 = (v2i64) __msa_fill_w(d0030);
+ cmp1 = (v2i64) __msa_fill_w(d0434);
+ cmp3 = (v2i64) __msa_ilvev_w((v4i32) cmp1, (v4i32) cmp0);
+ cmp3 = (v2i64) __msa_ceqi_w((v4i32) cmp3, 0);
+
+ if ((!p_is_pcm0 || !p_is_pcm4 || !q_is_pcm0 || !q_is_pcm4) &&
+ (!d0030 || !d0434)) {
+ p3_src = LD_UH(p3);
+ p2_src = LD_UH(p2);
+ p1_src = LD_UH(p1);
+ p0_src = LD_UH(p0);
+
+ cmp0 = __msa_fill_d(q_is_pcm0);
+ cmp1 = __msa_fill_d(q_is_pcm4);
+ q_is_pcm_vec = __msa_ilvev_d(cmp1, cmp0);
+ q_is_pcm_vec = __msa_ceqi_d(q_is_pcm_vec, 0);
+
+ tc0 = tc[0];
+ beta30 = beta >> 3;
+ beta20 = beta >> 2;
+ tc250 = ((tc0 * 5 + 1) >> 1);
+ tc4 = tc[1];
+ tc254 = ((tc4 * 5 + 1) >> 1);
+
+ cmp0 = (v2i64) __msa_fill_h(tc0);
+ cmp1 = (v2i64) __msa_fill_h(tc4);
+
+ ILVR_B4_UH(zero, p3_src, zero, p2_src, zero, p1_src, zero, p0_src,
+ p3_src, p2_src, p1_src, p0_src);
+ q0_src = LD_UH(q0);
+ q1_src = LD_UH(q1);
+ q2_src = LD_UH(q2);
+ q3_src = LD_UH(q3);
+
+ flag0 = abs(p3[0] - p0[0]) + abs(q3[0] - q0[0]) < beta30 &&
+ abs(p0[0] - q0[0]) < tc250;
+ flag0 = flag0 && (abs(p3[3] - p0[3]) + abs(q3[3] - q0[3]) < beta30 &&
+ abs(p0[3] - q0[3]) < tc250 && (d00 << 1) < beta20 &&
+ (d30 << 1) < beta20);
+
+ tc_pos = (v8i16) __msa_ilvev_d(cmp1, cmp0);
+ ILVR_B4_UH(zero, q0_src, zero, q1_src, zero, q2_src, zero, q3_src,
+ q0_src, q1_src, q2_src, q3_src);
+ flag1 = abs(p3[4] - p0[4]) + abs(q3[4] - q0[4]) < beta30 &&
+ abs(p0[4] - q0[4]) < tc254;
+ flag1 = flag1 && (abs(p3[7] - p0[7]) + abs(q3[7] - q0[7]) < beta30 &&
+ abs(p0[7] - q0[7]) < tc254 && (d04 << 1) < beta20 &&
+ (d34 << 1) < beta20);
+
+ cmp0 = (v2i64) __msa_fill_w(flag0);
+ cmp1 = (v2i64) __msa_fill_w(flag1);
+ cmp2 = (v2i64) __msa_ilvev_w((v4i32) cmp1, (v4i32) cmp0);
+ cmp2 = (v2i64) __msa_ceqi_w((v4i32) cmp2, 0);
+
+ if (flag0 && flag1) { /* strong only */
+ /* strong filter */
tc_pos <<= 1;
tc_neg = -tc_pos;
+ /* p part */
temp0 = (p1_src + p0_src + q0_src);
temp1 = ((p3_src + p2_src) << 1) + p2_src + temp0;
temp1 = (v8u16) __msa_srari_h((v8i16) temp1, 3); @@ -129,15 +155,11 @@ static void hevc_loopfilter_luma_hor_msa(uint8_t *src, int32_t stride,