
[FFmpeg-devel,v4] avutil/mips: refine msa macros CLIP_*.

Message ID 1565171520-26219-1-git-send-email-guxiwei-hf@loongson.cn
State New

Commit Message

guxiwei Aug. 7, 2019, 9:52 a.m. UTC
Changes are as follows (a sketch of the macro change appears after the
diffstat):
1. Remove the local variable 'out_m' in 'CLIP_SH' and store the result in the
   source vector.
2. Refine the implementation of the 'CLIP_SH_0_255' and 'CLIP_SW_0_255' macros.
   VP8 decoding performance improved by about 1.1% (from 7.03x to 7.11x).
   H264 decoding performance improved by about 0.5% (from 4.35x to 4.37x).
   Theora decoding performance improved by about 0.7% (from 5.79x to 5.83x).
3. Remove the redundant 'CLIP_SH/Wn_0_255_MAX_SATU' macros and use
   'CLIP_SH/Wn_0_255' instead, because the two macro families have identical
   effect.
---
 libavcodec/mips/h264dsp_msa.c       |  39 +++++------
 libavcodec/mips/h264idct_msa.c      |   7 +-
 libavcodec/mips/hevc_idct_msa.c     |  21 +++---
 libavcodec/mips/hevc_lpf_sao_msa.c  | 132 ++++++++++++++++++------------------
 libavcodec/mips/hevc_mc_bi_msa.c    |  44 ++++++------
 libavcodec/mips/hevc_mc_biw_msa.c   |  56 +++++++--------
 libavcodec/mips/hevc_mc_uniw_msa.c  |  40 +++++------
 libavcodec/mips/hevcpred_msa.c      |   8 +--
 libavcodec/mips/idctdsp_msa.c       |   9 +--
 libavcodec/mips/qpeldsp_msa.c       |   4 +-
 libavcodec/mips/simple_idct_msa.c   |  98 +++++++++++---------------
 libavcodec/mips/vp3dsp_idct_msa.c   |  68 +++----------------
 libavcodec/mips/vp8_idct_msa.c      |   5 +-
 libavcodec/mips/vp9_idct_msa.c      |  10 ++-
 libavutil/mips/generic_macros_msa.h | 119 +++++++++++++-------------------
 15 files changed, 280 insertions(+), 380 deletions(-)
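
For context, a minimal sketch of the shape of this change, assuming the
pre-patch definitions in libavutil/mips/generic_macros_msa.h; the exact
bodies may differ, and the _BEFORE/_AFTER names are illustrative only, not
identifiers from the patch. The point is the calling convention: the old
CLIP_SH returned the clipped value through a local 'out_m', so call sites
wrote 'x = CLIP_SH(x, min, max);', while the new macro clips the source
vector in place, and the 0..255 variants keep the saturating strategy of
the former _MAX_SATU macros.

    #include <msa.h>   /* MSA vector types and __msa_* intrinsics */

    /* Before: a GNU statement expression, used as
     *     out = CLIP_SH(in, min, max);                            */
    #define CLIP_SH_BEFORE(in, min, max)                 \
    ( {                                                  \
        v8i16 out_m;                                     \
                                                         \
        out_m = __msa_max_s_h((v8i16) min, (v8i16) in);  \
        out_m = __msa_min_s_h((v8i16) max, out_m);       \
        out_m;                                           \
    } )

    /* After: used as  CLIP_SH(in, min, max);  'in' is updated in
     * place, removing the extra copy at every call site.          */
    #define CLIP_SH_AFTER(in, min, max)                  \
    {                                                    \
        in = __msa_max_s_h((v8i16) min, (v8i16) in);     \
        in = __msa_min_s_h((v8i16) max, (v8i16) in);     \
    }

    /* The 0..255 clip keeps the saturating form: clamp the negative
     * side with maxi_s.h, then saturate to 8 bits with sat_u.h
     * (immediate 7 selects 7+1 = 8 result bits, i.e. max 255).     */
    #define CLIP_SH_0_255_AFTER(in)                      \
    {                                                    \
        in = __msa_maxi_s_h((v8i16) in, 0);              \
        in = (v8i16) __msa_sat_u_h((v8u16) in, 7);       \
    }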

Comments

Shiyou Yin Aug. 7, 2019, 10:13 a.m. UTC | #1
LGTM.

>-----Original Message-----
>From: ffmpeg-devel-bounces@ffmpeg.org [mailto:ffmpeg-devel-bounces@ffmpeg.org] On Behalf Of gxw
>Sent: Wednesday, August 7, 2019 5:52 PM
>To: ffmpeg-devel@ffmpeg.org
>Subject: [FFmpeg-devel] [PATCH v4] avutil/mips: refine msa macros CLIP_*.
>
>Changes are as follows:
>1. Remove the local variable 'out_m' in 'CLIP_SH' and store the result in the
>   source vector.
>2. Refine the implementation of the 'CLIP_SH_0_255' and 'CLIP_SW_0_255' macros.
>   VP8 decoding performance improved by about 1.1% (from 7.03x to 7.11x).
>   H264 decoding performance improved by about 0.5% (from 4.35x to 4.37x).
>   Theora decoding performance improved by about 0.7% (from 5.79x to 5.83x).
>3. Remove the redundant 'CLIP_SH/Wn_0_255_MAX_SATU' macros and use
>   'CLIP_SH/Wn_0_255' instead, because the two macro families have identical
>   effect.
>---
> libavcodec/mips/h264dsp_msa.c       |  39 +++++------
> libavcodec/mips/h264idct_msa.c      |   7 +-
> libavcodec/mips/hevc_idct_msa.c     |  21 +++---
> libavcodec/mips/hevc_lpf_sao_msa.c  | 132 ++++++++++++++++++------------------
> libavcodec/mips/hevc_mc_bi_msa.c    |  44 ++++++------
> libavcodec/mips/hevc_mc_biw_msa.c   |  56 +++++++--------
> libavcodec/mips/hevc_mc_uniw_msa.c  |  40 +++++------
> libavcodec/mips/hevcpred_msa.c      |   8 +--
> libavcodec/mips/idctdsp_msa.c       |   9 +--
> libavcodec/mips/qpeldsp_msa.c       |   4 +-
> libavcodec/mips/simple_idct_msa.c   |  98 +++++++++++---------------
> libavcodec/mips/vp3dsp_idct_msa.c   |  68 +++----------------
> libavcodec/mips/vp8_idct_msa.c      |   5 +-
> libavcodec/mips/vp9_idct_msa.c      |  10 ++-
> libavutil/mips/generic_macros_msa.h | 119 +++++++++++++-------------------
> 15 files changed, 280 insertions(+), 380 deletions(-)
>
>diff --git a/libavcodec/mips/h264dsp_msa.c b/libavcodec/mips/h264dsp_msa.c
>index c4ba8c4..dd05982 100644
>--- a/libavcodec/mips/h264dsp_msa.c
>+++ b/libavcodec/mips/h264dsp_msa.c
>@@ -413,8 +413,7 @@ static void avc_biwgt_8x8_msa(uint8_t *src, uint8_t *dst, int32_t stride,
>     tmp7 = __msa_dpadd_s_h(offset, wgt, vec7);
>     SRA_4V(tmp0, tmp1, tmp2, tmp3, denom);
>     SRA_4V(tmp4, tmp5, tmp6, tmp7, denom);
>-    CLIP_SH4_0_255(tmp0, tmp1, tmp2, tmp3);
>-    CLIP_SH4_0_255(tmp4, tmp5, tmp6, tmp7);
>+    CLIP_SH8_0_255(tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7);
>     PCKEV_B2_UB(tmp1, tmp0, tmp3, tmp2, dst0, dst1);
>     PCKEV_B2_UB(tmp5, tmp4, tmp7, tmp6, dst2, dst3);
>     ST_D8(dst0, dst1, dst2, dst3, 0, 1, 0, 1, 0, 1, 0, 1, dst, stride);
>@@ -475,8 +474,7 @@ static void avc_biwgt_8x16_msa(uint8_t *src, uint8_t *dst, int32_t stride,
>
>         SRA_4V(temp0, temp1, temp2, temp3, denom);
>         SRA_4V(temp4, temp5, temp6, temp7, denom);
>-        CLIP_SH4_0_255(temp0, temp1, temp2, temp3);
>-        CLIP_SH4_0_255(temp4, temp5, temp6, temp7);
>+        CLIP_SH8_0_255(temp0, temp1, temp2, temp3, temp4, temp5, temp6, temp7);
>         PCKEV_B4_UB(temp1, temp0, temp3, temp2, temp5, temp4, temp7, temp6,
>                     dst0, dst1, dst2, dst3);
>         ST_D8(dst0, dst1, dst2, dst3, 0, 1, 0, 1, 0, 1, 0, 1, dst, stride);
>@@ -531,7 +529,7 @@ static void avc_biwgt_8x16_msa(uint8_t *src, uint8_t *dst, int32_t stride,
>     temp = p1_or_q1_org_in << 1;                              \
>     clip3 = clip3 - temp;                                     \
>     clip3 = __msa_ave_s_h(p2_or_q2_org_in, clip3);            \
>-    clip3 = CLIP_SH(clip3, negate_tc_in, tc_in);              \
>+    CLIP_SH(clip3, negate_tc_in, tc_in);                      \
>     p1_or_q1_out = p1_or_q1_org_in + clip3;                   \
> }
>
>@@ -549,7 +547,7 @@ static void avc_biwgt_8x16_msa(uint8_t *src, uint8_t *dst, int32_t stride,
>     delta = q0_sub_p0 + p1_sub_q1;                              \
>     delta >>= 3;                                                \
>                                                                 \
>-    delta = CLIP_SH(delta, negate_threshold_in, threshold_in);  \
>+    CLIP_SH(delta, negate_threshold_in, threshold_in);          \
>                                                                 \
>     p0_or_q0_out = p0_or_q0_org_in + delta;                     \
>     q0_or_p0_out = q0_or_p0_org_in - delta;                     \
>@@ -598,7 +596,7 @@ static void avc_biwgt_8x16_msa(uint8_t *src, uint8_t *dst, int32_t stride,
>     delta = q0_sub_p0 + p1_sub_q1;                                       \
>     delta = __msa_srari_h(delta, 3);                                     \
>                                                                          \
>-    delta = CLIP_SH(delta, -tc, tc);                                     \
>+    CLIP_SH(delta, -tc, tc);                                             \
>                                                                          \
>     ILVR_B2_SH(zeros, src1, zeros, src2, res0_r, res1_r);                \
>                                                                          \
>@@ -662,7 +660,7 @@ static void avc_biwgt_8x16_msa(uint8_t *src, uint8_t *dst, int32_t stride,
>     q0_sub_p0 <<= 2;                                                       \
>     delta = q0_sub_p0 + p1_sub_q1;                                         \
>     delta = __msa_srari_h(delta, 3);                                       \
>-    delta = CLIP_SH(delta, -tc, tc);                                       \
>+    CLIP_SH(delta, -tc, tc);                                               \
>                                                                            \
>     ILVR_B2_SH(zeros, src1, zeros, src2, res0_r, res1_r);                  \
>                                                                            \
>@@ -1742,7 +1740,7 @@ static void avc_h_loop_filter_luma_mbaff_msa(uint8_t *in, int32_t stride,
>     v8i16 tc, tc_orig_r, tc_plus1;
>     v16u8 is_tc_orig1, is_tc_orig2, tc_orig = { 0 };
>     v8i16 p0_ilvr_q0, p0_add_q0, q0_sub_p0, p1_sub_q1;
>-    v8u16 src2_r, src3_r;
>+    v8i16 src2_r, src3_r;
>     v8i16 p2_r, p1_r, q2_r, q1_r;
>     v16u8 p2, q2, p0, q0;
>     v4i32 dst0, dst1;
>@@ -1840,8 +1838,8 @@ static void avc_h_loop_filter_luma_mbaff_msa(uint8_t *in, int32_t stride,
>     tc_orig_r = (v8i16) __msa_ilvr_b(zeros, (v16i8) tc_orig);
>     tc = tc_orig_r;
>
>-    p2_r = CLIP_SH(p2_r, -tc_orig_r, tc_orig_r);
>-    q2_r = CLIP_SH(q2_r, -tc_orig_r, tc_orig_r);
>+    CLIP_SH(p2_r, -tc_orig_r, tc_orig_r);
>+    CLIP_SH(q2_r, -tc_orig_r, tc_orig_r);
>
>     p2_r += p1_r;
>     q2_r += q1_r;
>@@ -1873,14 +1871,13 @@ static void avc_h_loop_filter_luma_mbaff_msa(uint8_t *in, int32_t stride,
>                                               (v16i8) is_less_than_beta2);
>     tc = (v8i16) __msa_bmnz_v((v16u8) tc, (v16u8) tc_plus1, is_less_than_beta2);
>
>-    q0_sub_p0 = CLIP_SH(q0_sub_p0, -tc, tc);
>+    CLIP_SH(q0_sub_p0, -tc, tc);
>
>-    ILVR_B2_UH(zeros, src2, zeros, src3, src2_r, src3_r);
>+    ILVR_B2_SH(zeros, src2, zeros, src3, src2_r, src3_r);
>     src2_r += q0_sub_p0;
>     src3_r -= q0_sub_p0;
>
>-    src2_r = (v8u16) CLIP_SH_0_255(src2_r);
>-    src3_r = (v8u16) CLIP_SH_0_255(src3_r);
>+    CLIP_SH2_0_255(src2_r, src3_r);
>
>     PCKEV_B2_UB(src2_r, src2_r, src3_r, src3_r, p0, q0);
>
>@@ -2510,10 +2507,8 @@ void ff_biweight_h264_pixels16_8_msa(uint8_t *dst, uint8_t *src,
>     SRA_4V(tmp4, tmp5, tmp6, tmp7, denom);
>     SRA_4V(tmp8, tmp9, tmp10, tmp11, denom);
>     SRA_4V(tmp12, tmp13, tmp14, tmp15, denom);
>-    CLIP_SH4_0_255(tmp0, tmp1, tmp2, tmp3);
>-    CLIP_SH4_0_255(tmp4, tmp5, tmp6, tmp7);
>-    CLIP_SH4_0_255(tmp8, tmp9, tmp10, tmp11);
>-    CLIP_SH4_0_255(tmp12, tmp13, tmp14, tmp15);
>+    CLIP_SH8_0_255(tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7);
>+    CLIP_SH8_0_255(tmp8, tmp9, tmp10, tmp11, tmp12, tmp13, tmp14, tmp15);
>     PCKEV_B4_UB(tmp1, tmp0, tmp3, tmp2, tmp5, tmp4, tmp7, tmp6, dst0, dst1,
>                 dst2, dst3);
>     PCKEV_B4_UB(tmp9, tmp8, tmp11, tmp10, tmp13, tmp12, tmp15, tmp14, dst4,
>@@ -2554,10 +2549,8 @@ void ff_biweight_h264_pixels16_8_msa(uint8_t *dst, uint8_t *src,
>         SRA_4V(tmp4, tmp5, tmp6, tmp7, denom);
>         SRA_4V(tmp8, tmp9, tmp10, tmp11, denom);
>         SRA_4V(tmp12, tmp13, tmp14, tmp15, denom);
>-        CLIP_SH4_0_255(tmp0, tmp1, tmp2, tmp3);
>-        CLIP_SH4_0_255(tmp4, tmp5, tmp6, tmp7);
>-        CLIP_SH4_0_255(tmp8, tmp9, tmp10, tmp11);
>-        CLIP_SH4_0_255(tmp12, tmp13, tmp14, tmp15);
>+        CLIP_SH8_0_255(tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7);
>+        CLIP_SH8_0_255(tmp8, tmp9, tmp10, tmp11, tmp12, tmp13, tmp14, tmp15);
>         PCKEV_B4_UB(tmp1, tmp0, tmp3, tmp2, tmp5, tmp4, tmp7, tmp6, dst0, dst1,
>                     dst2, dst3);
>         PCKEV_B4_UB(tmp9, tmp8, tmp11, tmp10, tmp13, tmp12, tmp15, tmp14, dst4,
>diff --git a/libavcodec/mips/h264idct_msa.c b/libavcodec/mips/h264idct_msa.c
>index 7851bfd..fbf7795 100644
>--- a/libavcodec/mips/h264idct_msa.c
>+++ b/libavcodec/mips/h264idct_msa.c
>@@ -233,8 +233,7 @@ static void avc_idct8_addblk_msa(uint8_t *dst, int16_t *src, int32_t dst_stride)
>          res0, res1, res2, res3);
>     ADD4(res4, tmp4, res5, tmp5, res6, tmp6, res7, tmp7,
>          res4, res5, res6, res7);
>-    CLIP_SH4_0_255(res0, res1, res2, res3);
>-    CLIP_SH4_0_255(res4, res5, res6, res7);
>+    CLIP_SH8_0_255(res0, res1, res2, res3, res4, res5, res6, res7);
>     PCKEV_B4_SB(res1, res0, res3, res2, res5, res4, res7, res6,
>                 dst0, dst1, dst2, dst3);
>     ST_D8(dst0, dst1, dst2, dst3, 0, 1, 0, 1, 0, 1, 0, 1, dst, dst_stride)
>@@ -263,8 +262,8 @@ static void avc_idct8_dc_addblk_msa(uint8_t *dst, int16_t *src,
>          dst0_r, dst1_r, dst2_r, dst3_r);
>     ADD4(dst4_r, dc, dst5_r, dc, dst6_r, dc, dst7_r, dc,
>          dst4_r, dst5_r, dst6_r, dst7_r);
>-    CLIP_SH4_0_255(dst0_r, dst1_r, dst2_r, dst3_r);
>-    CLIP_SH4_0_255(dst4_r, dst5_r, dst6_r, dst7_r);
>+    CLIP_SH8_0_255(dst0_r, dst1_r, dst2_r, dst3_r,
>+                   dst4_r, dst5_r, dst6_r, dst7_r);
>     PCKEV_B4_SB(dst1_r, dst0_r, dst3_r, dst2_r, dst5_r, dst4_r, dst7_r, dst6_r,
>                 dst0, dst1, dst2, dst3);
>     ST_D8(dst0, dst1, dst2, dst3, 0, 1, 0, 1, 0, 1, 0, 1, dst, dst_stride)
>diff --git a/libavcodec/mips/hevc_idct_msa.c b/libavcodec/mips/hevc_idct_msa.c
>index b14aec9..5ab6acd 100644
>--- a/libavcodec/mips/hevc_idct_msa.c
>+++ b/libavcodec/mips/hevc_idct_msa.c
>@@ -803,8 +803,9 @@ static void hevc_addblk_16x16_msa(int16_t *coeffs, uint8_t *dst, int32_t stride)
>         LD_SH4((coeffs + 8), 16, in1, in3, in5, in7);
>         coeffs += 64;
>
>-        CLIP_SH4_0_255(dst_r0, dst_l0, dst_r1, dst_l1);
>-        CLIP_SH4_0_255(dst_r2, dst_l2, dst_r3, dst_l3);
>+        CLIP_SH8_0_255(dst_r0, dst_l0, dst_r1, dst_l1,
>+                       dst_r2, dst_l2, dst_r3, dst_l3);
>+
>         PCKEV_B4_UB(dst_l0, dst_r0, dst_l1, dst_r1, dst_l2, dst_r2, dst_l3,
>                     dst_r3, dst0, dst1, dst2, dst3);
>         ST_UB4(dst0, dst1, dst2, dst3, dst, stride);
>@@ -825,8 +826,8 @@ static void hevc_addblk_16x16_msa(int16_t *coeffs, uint8_t *dst, int32_t stride)
>     dst_r3 += in6;
>     dst_l3 += in7;
>
>-    CLIP_SH4_0_255(dst_r0, dst_l0, dst_r1, dst_l1);
>-    CLIP_SH4_0_255(dst_r2, dst_l2, dst_r3, dst_l3);
>+    CLIP_SH8_0_255(dst_r0, dst_l0, dst_r1, dst_l1,
>+                   dst_r2, dst_l2, dst_r3, dst_l3);
>     PCKEV_B4_UB(dst_l0, dst_r0, dst_l1, dst_r1, dst_l2, dst_r2, dst_l3,
>                 dst_r3, dst0, dst1, dst2, dst3);
>     ST_UB4(dst0, dst1, dst2, dst3, dst, stride);
>@@ -873,8 +874,8 @@ static void hevc_addblk_32x32_msa(int16_t *coeffs, uint8_t *dst, int32_t stride)
>         LD_SH4((coeffs + 8), 16, in1, in3, in5, in7);
>         coeffs += 64;
>
>-        CLIP_SH4_0_255(dst_r0, dst_l0, dst_r1, dst_l1);
>-        CLIP_SH4_0_255(dst_r2, dst_l2, dst_r3, dst_l3);
>+        CLIP_SH8_0_255(dst_r0, dst_l0, dst_r1, dst_l1,
>+                       dst_r2, dst_l2, dst_r3, dst_l3);
>         PCKEV_B4_UB(dst_l0, dst_r0, dst_l1, dst_r1, dst_l2, dst_r2, dst_l3,
>                     dst_r3, dst0, dst1, dst2, dst3);
>         ST_UB2(dst0, dst1, dst, 16);
>@@ -905,8 +906,8 @@ static void hevc_addblk_32x32_msa(int16_t *coeffs, uint8_t *dst, int32_t stride)
>     LD_SH4(coeffs, 16, in0, in2, in4, in6);
>     LD_SH4((coeffs + 8), 16, in1, in3, in5, in7);
>
>-    CLIP_SH4_0_255(dst_r0, dst_l0, dst_r1, dst_l1);
>-    CLIP_SH4_0_255(dst_r2, dst_l2, dst_r3, dst_l3);
>+    CLIP_SH8_0_255(dst_r0, dst_l0, dst_r1, dst_l1,
>+                   dst_r2, dst_l2, dst_r3, dst_l3);
>     PCKEV_B4_UB(dst_l0, dst_r0, dst_l1, dst_r1, dst_l2, dst_r2, dst_l3,
>                 dst_r3, dst0, dst1, dst2, dst3);
>     ST_UB2(dst0, dst1, dst, 16);
>@@ -928,8 +929,8 @@ static void hevc_addblk_32x32_msa(int16_t *coeffs, uint8_t *dst, int32_t stride)
>     dst_r3 += in6;
>     dst_l3 += in7;
>
>-    CLIP_SH4_0_255(dst_r0, dst_l0, dst_r1, dst_l1);
>-    CLIP_SH4_0_255(dst_r2, dst_l2, dst_r3, dst_l3);
>+    CLIP_SH8_0_255(dst_r0, dst_l0, dst_r1, dst_l1,
>+                   dst_r2, dst_l2, dst_r3, dst_l3);
>     PCKEV_B4_UB(dst_l0, dst_r0, dst_l1, dst_r1, dst_l2, dst_r2, dst_l3,
>                 dst_r3, dst0, dst1, dst2, dst3);
>     ST_UB2(dst0, dst1, dst, 16);
>diff --git a/libavcodec/mips/hevc_lpf_sao_msa.c b/libavcodec/mips/hevc_lpf_sao_msa.c
>index 791ddb3..26663dd 100644
>--- a/libavcodec/mips/hevc_lpf_sao_msa.c
>+++ b/libavcodec/mips/hevc_lpf_sao_msa.c
>@@ -140,19 +140,19 @@ static void hevc_loopfilter_luma_hor_msa(uint8_t *src, int32_t stride,
>             temp1 = ((p3_src + p2_src) << 1) + p2_src + temp0;
>             temp1 = (v8u16) __msa_srari_h((v8i16) temp1, 3);
>             temp2 = (v8i16) (temp1 - p2_src);
>-            temp2 = CLIP_SH(temp2, tc_neg, tc_pos);
>+            CLIP_SH(temp2, tc_neg, tc_pos);
>             dst0 = (v16u8) (temp2 + (v8i16) p2_src);
>
>             temp1 = temp0 + p2_src;
>             temp1 = (v8u16) __msa_srari_h((v8i16) temp1, 2);
>             temp2 = (v8i16) (temp1 - p1_src);
>-            temp2 = CLIP_SH(temp2, tc_neg, tc_pos);
>+            CLIP_SH(temp2, tc_neg, tc_pos);
>             dst1 = (v16u8) (temp2 + (v8i16) p1_src);
>
>             temp1 = (temp0 << 1) + p2_src + q1_src;
>             temp1 = (v8u16) __msa_srari_h((v8i16) temp1, 3);
>             temp2 = (v8i16) (temp1 - p0_src);
>-            temp2 = CLIP_SH(temp2, tc_neg, tc_pos);
>+            CLIP_SH(temp2, tc_neg, tc_pos);
>             dst2 = (v16u8) (temp2 + (v8i16) p0_src);
>
>             dst0 = __msa_bmz_v(dst0, (v16u8) p2_src, (v16u8) p_is_pcm_vec);
>@@ -165,19 +165,19 @@ static void hevc_loopfilter_luma_hor_msa(uint8_t *src, int32_t stride,
>             temp1 = ((q3_src + q2_src) << 1) + q2_src + temp0;
>             temp1 = (v8u16) __msa_srari_h((v8i16) temp1, 3);
>             temp2 = (v8i16) (temp1 - q2_src);
>-            temp2 = CLIP_SH(temp2, tc_neg, tc_pos);
>+            CLIP_SH(temp2, tc_neg, tc_pos);
>             dst5 = (v16u8) (temp2 + (v8i16) q2_src);
>
>             temp1 = temp0 + q2_src;
>             temp1 = (v8u16) __msa_srari_h((v8i16) temp1, 2);
>             temp2 = (v8i16) (temp1 - q1_src);
>-            temp2 = CLIP_SH(temp2, tc_neg, tc_pos);
>+            CLIP_SH(temp2, tc_neg, tc_pos);
>             dst4 = (v16u8) (temp2 + (v8i16) q1_src);
>
>             temp1 = (temp0 << 1) + p1_src + q2_src;
>             temp1 = (v8u16) __msa_srari_h((v8i16) temp1, 3);
>             temp2 = (v8i16) (temp1 - q0_src);
>-            temp2 = CLIP_SH(temp2, tc_neg, tc_pos);
>+            CLIP_SH(temp2, tc_neg, tc_pos);
>             dst3 = (v16u8) (temp2 + (v8i16) q0_src);
>
>             dst3 = __msa_bmz_v(dst3, (v16u8) q0_src, (v16u8) q_is_pcm_vec);
>@@ -218,15 +218,15 @@ static void hevc_loopfilter_luma_hor_msa(uint8_t *src, int32_t stride,
>             abs_delta0 = __msa_add_a_h(delta0, (v8i16) zero);
>             abs_delta0 = (v8u16) abs_delta0 < temp1;
>
>-            delta0 = CLIP_SH(delta0, tc_neg, tc_pos);
>+            CLIP_SH(delta0, tc_neg, tc_pos);
>
>-            temp0 = (v8u16) (delta0 + p0_src);
>-            temp0 = (v8u16) CLIP_SH_0_255(temp0);
>-            temp0 = (v8u16) __msa_bmz_v((v16u8) temp0, (v16u8) p0_src,
>+            temp2 = (v8i16) (delta0 + p0_src);
>+            CLIP_SH_0_255(temp2);
>+            temp0 = (v8u16) __msa_bmz_v((v16u8) temp2, (v16u8) p0_src,
>                                         (v16u8) p_is_pcm_vec);
>
>             temp2 = (v8i16) (q0_src - delta0);
>-            temp2 = CLIP_SH_0_255(temp2);
>+            CLIP_SH_0_255(temp2);
>             temp2 = (v8i16) __msa_bmz_v((v16u8) temp2, (v16u8) q0_src,
>                                         (v16u8) q_is_pcm_vec);
>
>@@ -252,9 +252,9 @@ static void hevc_loopfilter_luma_hor_msa(uint8_t *src, int32_t stride,
>             delta1 -= (v8i16) p1_src;
>             delta1 += delta0;
>             delta1 >>= 1;
>-            delta1 = CLIP_SH(delta1, tc_neg, tc_pos);
>+            CLIP_SH(delta1, tc_neg, tc_pos);
>             delta1 = (v8i16) p1_src + (v8i16) delta1;
>-            delta1 = CLIP_SH_0_255(delta1);
>+            CLIP_SH_0_255(delta1);
>             delta1 = (v8i16) __msa_bmnz_v((v16u8) delta1, (v16u8) p1_src,
>                                           (v16u8) p_is_pcm_vec);
>
>@@ -262,9 +262,9 @@ static void hevc_loopfilter_luma_hor_msa(uint8_t *src, int32_t stride,
>             delta2 = delta2 - (v8i16) q1_src;
>             delta2 = delta2 - delta0;
>             delta2 = delta2 >> 1;
>-            delta2 = CLIP_SH(delta2, tc_neg, tc_pos);
>+            CLIP_SH(delta2, tc_neg, tc_pos);
>             delta2 = (v8i16) q1_src + (v8i16) delta2;
>-            delta2 = CLIP_SH_0_255(delta2);
>+            CLIP_SH_0_255(delta2);
>             delta2 = (v8i16) __msa_bmnz_v((v16u8) delta2, (v16u8) q1_src,
>                                           (v16u8) q_is_pcm_vec);
>
>@@ -298,19 +298,19 @@ static void hevc_loopfilter_luma_hor_msa(uint8_t *src, int32_t stride,
>             temp1 = ((p3_src + p2_src) << 1) + p2_src + temp0;
>             temp1 = (v8u16) __msa_srari_h((v8i16) temp1, 3);
>             temp2 = (v8i16) (temp1 - p2_src);
>-            temp2 = CLIP_SH(temp2, tc_neg, tc_pos);
>+            CLIP_SH(temp2, tc_neg, tc_pos);
>             dst0 = (v16u8) (temp2 + (v8i16) p2_src);
>
>             temp1 = temp0 + p2_src;
>             temp1 = (v8u16) __msa_srari_h((v8i16) temp1, 2);
>             temp2 = (v8i16) (temp1 - p1_src);
>-            temp2 = CLIP_SH(temp2, tc_neg, tc_pos);
>+            CLIP_SH(temp2, tc_neg, tc_pos);
>             dst1 = (v16u8) (temp2 + (v8i16) p1_src);
>
>             temp1 = (temp0 << 1) + p2_src + q1_src;
>             temp1 = (v8u16) __msa_srari_h((v8i16) temp1, 3);
>             temp2 = (v8i16) (temp1 - p0_src);
>-            temp2 = CLIP_SH(temp2, tc_neg, tc_pos);
>+            CLIP_SH(temp2, tc_neg, tc_pos);
>             dst2 = (v16u8) (temp2 + (v8i16) p0_src);
>
>             dst0 = __msa_bmz_v(dst0, (v16u8) p2_src, (v16u8) p_is_pcm_vec);
>@@ -323,19 +323,19 @@ static void hevc_loopfilter_luma_hor_msa(uint8_t *src, int32_t stride,
>             temp1 = ((q3_src + q2_src) << 1) + q2_src + temp0;
>             temp1 = (v8u16) __msa_srari_h((v8i16) temp1, 3);
>             temp2 = (v8i16) (temp1 - q2_src);
>-            temp2 = CLIP_SH(temp2, tc_neg, tc_pos);
>+            CLIP_SH(temp2, tc_neg, tc_pos);
>             dst5 = (v16u8) (temp2 + (v8i16) q2_src);
>
>             temp1 = temp0 + q2_src;
>             temp1 = (v8u16) __msa_srari_h((v8i16) temp1, 2);
>             temp2 = (v8i16) (temp1 - q1_src);
>-            temp2 = CLIP_SH(temp2, tc_neg, tc_pos);
>+            CLIP_SH(temp2, tc_neg, tc_pos);
>             dst4 = (v16u8) (temp2 + (v8i16) q1_src);
>
>             temp1 = (temp0 << 1) + p1_src + q2_src;
>             temp1 = (v8u16) __msa_srari_h((v8i16) temp1, 3);
>             temp2 = (v8i16) (temp1 - q0_src);
>-            temp2 = CLIP_SH(temp2, tc_neg, tc_pos);
>+            CLIP_SH(temp2, tc_neg, tc_pos);
>             dst3 = (v16u8) (temp2 + (v8i16) q0_src);
>
>             dst3 = __msa_bmz_v(dst3, (v16u8) q0_src, (v16u8) q_is_pcm_vec);
>@@ -362,15 +362,15 @@ static void hevc_loopfilter_luma_hor_msa(uint8_t *src, int32_t stride,
>             abs_delta0 = __msa_add_a_h(delta0, (v8i16) zero);
>             abs_delta0 = (v8u16) abs_delta0 < temp1;
>
>-            delta0 = CLIP_SH(delta0, tc_neg, tc_pos);
>+            CLIP_SH(delta0, tc_neg, tc_pos);
>
>-            temp0 = (v8u16) (delta0 + p0_src);
>-            temp0 = (v8u16) CLIP_SH_0_255(temp0);
>-            temp0 = (v8u16) __msa_bmz_v((v16u8) temp0, (v16u8) p0_src,
>+            temp2 = (v8i16) (delta0 + p0_src);
>+            CLIP_SH_0_255(temp2);
>+            temp0 = (v8u16) __msa_bmz_v((v16u8) temp2, (v16u8) p0_src,
>                                         (v16u8) p_is_pcm_vec);
>
>             temp2 = (v8i16) (q0_src - delta0);
>-            temp2 = CLIP_SH_0_255(temp2);
>+            CLIP_SH_0_255(temp2);
>             temp2 = (v8i16) __msa_bmz_v((v16u8) temp2, (v16u8) q0_src,
>                                         (v16u8) q_is_pcm_vec);
>
>@@ -394,9 +394,9 @@ static void hevc_loopfilter_luma_hor_msa(uint8_t *src, int32_t stride,
>             delta1 -= (v8i16) p1_src;
>             delta1 += delta0;
>             delta1 >>= 1;
>-            delta1 = CLIP_SH(delta1, tc_neg, tc_pos);
>+            CLIP_SH(delta1, tc_neg, tc_pos);
>             delta1 = (v8i16) p1_src + (v8i16) delta1;
>-            delta1 = CLIP_SH_0_255(delta1);
>+            CLIP_SH_0_255(delta1);
>             delta1 = (v8i16) __msa_bmnz_v((v16u8) delta1, (v16u8) p1_src,
>                                           (v16u8) p_is_pcm_vec);
>
>@@ -404,9 +404,9 @@ static void hevc_loopfilter_luma_hor_msa(uint8_t *src, int32_t stride,
>             delta2 = delta2 - (v8i16) q1_src;
>             delta2 = delta2 - delta0;
>             delta2 = delta2 >> 1;
>-            delta2 = CLIP_SH(delta2, tc_neg, tc_pos);
>+            CLIP_SH(delta2, tc_neg, tc_pos);
>             delta2 = (v8i16) q1_src + (v8i16) delta2;
>-            delta2 = CLIP_SH_0_255(delta2);
>+            CLIP_SH_0_255(delta2);
>             delta2 = (v8i16) __msa_bmnz_v((v16u8) delta2, (v16u8) q1_src,
>                                           (v16u8) q_is_pcm_vec);
>
>@@ -561,19 +561,19 @@ static void hevc_loopfilter_luma_ver_msa(uint8_t *src, int32_t stride,
>             temp1 = ((p3_src + p2_src) << 1) + p2_src + temp0;
>             temp1 = (v8u16) __msa_srari_h((v8i16) temp1, 3);
>             temp2 = (v8i16) (temp1 - p2_src);
>-            temp2 = CLIP_SH(temp2, tc_neg, tc_pos);
>+            CLIP_SH(temp2, tc_neg, tc_pos);
>             dst0 = (v16u8) (temp2 + (v8i16) p2_src);
>
>             temp1 = temp0 + p2_src;
>             temp1 = (v8u16) __msa_srari_h((v8i16) temp1, 2);
>             temp2 = (v8i16) (temp1 - p1_src);
>-            temp2 = CLIP_SH(temp2, tc_neg, tc_pos);
>+            CLIP_SH(temp2, tc_neg, tc_pos);
>             dst1 = (v16u8) (temp2 + (v8i16) p1_src);
>
>             temp1 = (temp0 << 1) + p2_src + q1_src;
>             temp1 = (v8u16) __msa_srari_h((v8i16) temp1, 3);
>             temp2 = (v8i16) (temp1 - p0_src);
>-            temp2 = CLIP_SH(temp2, tc_neg, tc_pos);
>+            CLIP_SH(temp2, tc_neg, tc_pos);
>             dst2 = (v16u8) (temp2 + (v8i16) p0_src);
>
>             dst0 = __msa_bmz_v(dst0, (v16u8) p2_src, (v16u8) p_is_pcm_vec);
>@@ -585,19 +585,19 @@ static void hevc_loopfilter_luma_ver_msa(uint8_t *src, int32_t stride,
>             temp1 = ((q3_src + q2_src) << 1) + q2_src + temp0;
>             temp1 = (v8u16) __msa_srari_h((v8i16) temp1, 3);
>             temp2 = (v8i16) (temp1 - q2_src);
>-            temp2 = CLIP_SH(temp2, tc_neg, tc_pos);
>+            CLIP_SH(temp2, tc_neg, tc_pos);
>             dst5 = (v16u8) (temp2 + (v8i16) q2_src);
>
>             temp1 = temp0 + q2_src;
>             temp1 = (v8u16) __msa_srari_h((v8i16) temp1, 2);
>             temp2 = (v8i16) (temp1 - q1_src);
>-            temp2 = CLIP_SH(temp2, tc_neg, tc_pos);
>+            CLIP_SH(temp2, tc_neg, tc_pos);
>             dst4 = (v16u8) (temp2 + (v8i16) q1_src);
>
>             temp1 = (temp0 << 1) + p1_src + q2_src;
>             temp1 = (v8u16) __msa_srari_h((v8i16) temp1, 3);
>             temp2 = (v8i16) (temp1 - q0_src);
>-            temp2 = CLIP_SH(temp2, tc_neg, tc_pos);
>+            CLIP_SH(temp2, tc_neg, tc_pos);
>             dst3 = (v16u8) (temp2 + (v8i16) q0_src);
>
>             dst3 = __msa_bmz_v(dst3, (v16u8) q0_src, (v16u8) q_is_pcm_vec);
>@@ -620,14 +620,14 @@ static void hevc_loopfilter_luma_ver_msa(uint8_t *src, int32_t stride,
>             abs_delta0 = __msa_add_a_h(delta0, (v8i16) zero);
>             abs_delta0 = (v8u16) abs_delta0 < temp1;
>
>-            delta0 = CLIP_SH(delta0, tc_neg, tc_pos);
>-            temp0 = (v8u16) (delta0 + p0_src);
>-            temp0 = (v8u16) CLIP_SH_0_255(temp0);
>-            temp0 = (v8u16) __msa_bmz_v((v16u8) temp0, (v16u8) p0_src,
>+            CLIP_SH(delta0, tc_neg, tc_pos);
>+            temp2 = (v8i16) (delta0 + p0_src);
>+            CLIP_SH_0_255(temp2);
>+            temp0 = (v8u16) __msa_bmz_v((v16u8) temp2, (v16u8) p0_src,
>                                         (v16u8) p_is_pcm_vec);
>
>             temp2 = (v8i16) (q0_src - delta0);
>-            temp2 = CLIP_SH_0_255(temp2);
>+            CLIP_SH_0_255(temp2);
>             temp2 = (v8i16) __msa_bmz_v((v16u8) temp2, (v16u8) q0_src,
>                                         (v16u8) q_is_pcm_vec);
>
>@@ -649,9 +649,9 @@ static void hevc_loopfilter_luma_ver_msa(uint8_t *src, int32_t stride,
>             delta1 -= (v8i16) p1_src;
>             delta1 += delta0;
>             delta1 >>= 1;
>-            delta1 = CLIP_SH(delta1, tc_neg, tc_pos);
>+            CLIP_SH(delta1, tc_neg, tc_pos);
>             delta1 = (v8i16) p1_src + (v8i16) delta1;
>-            delta1 = CLIP_SH_0_255(delta1);
>+            CLIP_SH_0_255(delta1);
>             delta1 = (v8i16) __msa_bmnz_v((v16u8) delta1, (v16u8) p1_src,
>                                           (v16u8) p_is_pcm_vec);
>
>@@ -659,9 +659,9 @@ static void hevc_loopfilter_luma_ver_msa(uint8_t *src, int32_t stride,
>             delta2 = delta2 - (v8i16) q1_src;
>             delta2 = delta2 - delta0;
>             delta2 = delta2 >> 1;
>-            delta2 = CLIP_SH(delta2, tc_neg, tc_pos);
>+            CLIP_SH(delta2, tc_neg, tc_pos);
>             delta2 = (v8i16) q1_src + (v8i16) delta2;
>-            delta2 = CLIP_SH_0_255(delta2);
>+            CLIP_SH_0_255(delta2);
>             delta2 = (v8i16) __msa_bmnz_v((v16u8) delta2, (v16u8) q1_src,
>                                           (v16u8) q_is_pcm_vec);
>
>@@ -726,19 +726,19 @@ static void hevc_loopfilter_luma_ver_msa(uint8_t *src, int32_t stride,
>             temp1 = ((p3_src + p2_src) << 1) + p2_src + temp0;
>             temp1 = (v8u16) __msa_srari_h((v8i16) temp1, 3);
>             temp2 = (v8i16) (temp1 - p2_src);
>-            temp2 = CLIP_SH(temp2, tc_neg, tc_pos);
>+            CLIP_SH(temp2, tc_neg, tc_pos);
>             dst0 = (v16u8) (temp2 + (v8i16) p2_src);
>
>             temp1 = temp0 + p2_src;
>             temp1 = (v8u16) __msa_srari_h((v8i16) temp1, 2);
>             temp2 = (v8i16) (temp1 - p1_src);
>-            temp2 = CLIP_SH(temp2, tc_neg, tc_pos);
>+            CLIP_SH(temp2, tc_neg, tc_pos);
>             dst1 = (v16u8) (temp2 + (v8i16) p1_src);
>
>             temp1 = (temp0 << 1) + p2_src + q1_src;
>             temp1 = (v8u16) __msa_srari_h((v8i16) temp1, 3);
>             temp2 = (v8i16) (temp1 - p0_src);
>-            temp2 = CLIP_SH(temp2, tc_neg, tc_pos);
>+            CLIP_SH(temp2, tc_neg, tc_pos);
>             dst2 = (v16u8) (temp2 + (v8i16) p0_src);
>
>             dst0 = __msa_bmz_v(dst0, (v16u8) p2_src, (v16u8) p_is_pcm_vec);
>@@ -750,19 +750,19 @@ static void hevc_loopfilter_luma_ver_msa(uint8_t *src, int32_t stride,
>             temp1 = ((q3_src + q2_src) << 1) + q2_src + temp0;
>             temp1 = (v8u16) __msa_srari_h((v8i16) temp1, 3);
>             temp2 = (v8i16) (temp1 - q2_src);
>-            temp2 = CLIP_SH(temp2, tc_neg, tc_pos);
>+            CLIP_SH(temp2, tc_neg, tc_pos);
>             dst5 = (v16u8) (temp2 + (v8i16) q2_src);
>
>             temp1 = temp0 + q2_src;
>             temp1 = (v8u16) __msa_srari_h((v8i16) temp1, 2);
>             temp2 = (v8i16) (temp1 - q1_src);
>-            temp2 = CLIP_SH(temp2, tc_neg, tc_pos);
>+            CLIP_SH(temp2, tc_neg, tc_pos);
>             dst4 = (v16u8) (temp2 + (v8i16) q1_src);
>
>             temp1 = (temp0 << 1) + p1_src + q2_src;
>             temp1 = (v8u16) __msa_srari_h((v8i16) temp1, 3);
>             temp2 = (v8i16) (temp1 - q0_src);
>-            temp2 = CLIP_SH(temp2, tc_neg, tc_pos);
>+            CLIP_SH(temp2, tc_neg, tc_pos);
>             dst3 = (v16u8) (temp2 + (v8i16) q0_src);
>
>             dst3 = __msa_bmz_v(dst3, (v16u8) q0_src, (v16u8) q_is_pcm_vec);
>@@ -785,15 +785,15 @@ static void hevc_loopfilter_luma_ver_msa(uint8_t *src, int32_t stride,
>             abs_delta0 = __msa_add_a_h(delta0, (v8i16) zero);
>             abs_delta0 = (v8u16) abs_delta0 < temp1;
>
>-            delta0 = CLIP_SH(delta0, tc_neg, tc_pos);
>+            CLIP_SH(delta0, tc_neg, tc_pos);
>
>-            temp0 = (v8u16) (delta0 + p0_src);
>-            temp0 = (v8u16) CLIP_SH_0_255(temp0);
>-            temp0 = (v8u16) __msa_bmz_v((v16u8) temp0, (v16u8) p0_src,
>+            temp2 = (v8i16) (delta0 + p0_src);
>+            CLIP_SH_0_255(temp2);
>+            temp0 = (v8u16) __msa_bmz_v((v16u8) temp2, (v16u8) p0_src,
>                                         (v16u8) p_is_pcm_vec);
>
>             temp2 = (v8i16) (q0_src - delta0);
>-            temp2 = CLIP_SH_0_255(temp2);
>+            CLIP_SH_0_255(temp2);
>             temp2 = (v8i16) __msa_bmz_v((v16u8) temp2, (v16u8) q0_src,
>                                         (v16u8) q_is_pcm_vec);
>
>@@ -815,9 +815,9 @@ static void hevc_loopfilter_luma_ver_msa(uint8_t *src, int32_t stride,
>             delta1 -= (v8i16) p1_src;
>             delta1 += delta0;
>             delta1 >>= 1;
>-            delta1 = CLIP_SH(delta1, tc_neg, tc_pos);
>+            CLIP_SH(delta1, tc_neg, tc_pos);
>             delta1 = (v8i16) p1_src + (v8i16) delta1;
>-            delta1 = CLIP_SH_0_255(delta1);
>+            CLIP_SH_0_255(delta1);
>             delta1 = (v8i16) __msa_bmnz_v((v16u8) delta1, (v16u8) p1_src,
>                                           (v16u8) p_is_pcm_vec);
>
>@@ -825,9 +825,9 @@ static void hevc_loopfilter_luma_ver_msa(uint8_t *src, int32_t stride,
>             delta2 = delta2 - (v8i16) q1_src;
>             delta2 = delta2 - delta0;
>             delta2 = delta2 >> 1;
>-            delta2 = CLIP_SH(delta2, tc_neg, tc_pos);
>+            CLIP_SH(delta2, tc_neg, tc_pos);
>             delta2 = (v8i16) q1_src + (v8i16) delta2;
>-            delta2 = CLIP_SH_0_255(delta2);
>+            CLIP_SH_0_255(delta2);
>             delta2 = (v8i16) __msa_bmnz_v((v16u8) delta2, (v16u8) q1_src,
>                                           (v16u8) q_is_pcm_vec);
>             delta1 = (v8i16) __msa_bmz_v((v16u8) delta1, (v16u8) p1_src,
>@@ -955,15 +955,15 @@ static void hevc_loopfilter_chroma_hor_msa(uint8_t *src, int32_t stride,
>         temp0 <<= 2;
>         temp0 += temp1;
>         delta = __msa_srari_h((v8i16) temp0, 3);
>-        delta = CLIP_SH(delta, tc_neg, tc_pos);
>+        CLIP_SH(delta, tc_neg, tc_pos);
>
>         temp0 = (v8i16) ((v8i16) p0 + delta);
>-        temp0 = CLIP_SH_0_255(temp0);
>+        CLIP_SH_0_255(temp0);
>         temp0 = (v8i16) __msa_bmz_v((v16u8) temp0, (v16u8) p0,
>                                     (v16u8) p_is_pcm_vec);
>
>         temp1 = (v8i16) ((v8i16) q0 - delta);
>-        temp1 = CLIP_SH_0_255(temp1);
>+        CLIP_SH_0_255(temp1);
>         temp1 = (v8i16) __msa_bmz_v((v16u8) temp1, (v16u8) q0,
>                                     (v16u8) q_is_pcm_vec);
>
>@@ -1014,15 +1014,15 @@ static void hevc_loopfilter_chroma_ver_msa(uint8_t *src, int32_t stride,
>         temp0 <<= 2;
>         temp0 += temp1;
>         delta = __msa_srari_h((v8i16) temp0, 3);
>-        delta = CLIP_SH(delta, tc_neg, tc_pos);
>+        CLIP_SH(delta, tc_neg, tc_pos);
>
>         temp0 = (v8i16) ((v8i16) p0 + delta);
>-        temp0 = CLIP_SH_0_255(temp0);
>+        CLIP_SH_0_255(temp0);
>         temp0 = (v8i16) __msa_bmz_v((v16u8) temp0, (v16u8) p0,
>                                     (v16u8) p_is_pcm_vec);
>
>         temp1 = (v8i16) ((v8i16) q0 - delta);
>-        temp1 = CLIP_SH_0_255(temp1);
>+        CLIP_SH_0_255(temp1);
>         temp1 = (v8i16) __msa_bmz_v((v16u8) temp1, (v16u8) q0,
>                                     (v16u8) q_is_pcm_vec);
>
>diff --git a/libavcodec/mips/hevc_mc_bi_msa.c b/libavcodec/mips/hevc_mc_bi_msa.c
>index 34613c8..c6c8d27 100644
>--- a/libavcodec/mips/hevc_mc_bi_msa.c
>+++ b/libavcodec/mips/hevc_mc_bi_msa.c
>@@ -48,7 +48,7 @@ static const uint8_t ff_hevc_mask_arr[16 * 2] __attribute__((aligned(0x40))) = {
> {                                                                  \
>     ADDS_SH2_SH(vec0, in0, vec1, in1, out0, out1);                 \
>     SRARI_H2_SH(out0, out1, rnd_val);                              \
>-    CLIP_SH2_0_255_MAX_SATU(out0, out1);                           \
>+    CLIP_SH2_0_255(out0, out1);                                    \
> }
>
> #define HEVC_BI_RND_CLIP4_MAX_SATU(in0, in1, in2, in3, vec0, vec1, vec2,    \
>@@ -83,7 +83,7 @@ static void hevc_bi_copy_4w_msa(uint8_t *src0_ptr,
>         dst0 <<= 6;
>         dst0 += in0;
>         dst0 = __msa_srari_h(dst0, 7);
>-        dst0 = CLIP_SH_0_255_MAX_SATU(dst0);
>+        CLIP_SH_0_255(dst0);
>
>         dst0 = (v8i16) __msa_pckev_b((v16i8) dst0, (v16i8) dst0);
>         ST_W2(dst0, 0, 1, dst, dst_stride);
>@@ -739,7 +739,7 @@ static void hevc_hz_bi_8t_12w_msa(uint8_t *src0_ptr,
>         HEVC_BI_RND_CLIP2(in0, in1, dst0, dst1, 7, dst0, dst1);
>         dst2 = __msa_adds_s_h(in2, dst2);
>         dst2 = __msa_srari_h(dst2, 7);
>-        dst2 = CLIP_SH_0_255(dst2);
>+        CLIP_SH_0_255(dst2);
>         PCKEV_B2_SH(dst1, dst0, dst2, dst2, dst0, dst1);
>
>         tmp2 = __msa_copy_s_d((v2i64) dst0, 0);
>@@ -888,7 +888,7 @@ static void hevc_hz_bi_8t_24w_msa(uint8_t *src0_ptr,
>         HEVC_BI_RND_CLIP2(in0, in1, dst0, dst1, 7, dst0, dst1);
>         dst2 = __msa_adds_s_h(dst2, in2);
>         dst2 = __msa_srari_h(dst2, 7);
>-        dst2 = CLIP_SH_0_255(dst2);
>+        CLIP_SH_0_255(dst2);
>
>         PCKEV_B2_SB(dst1, dst0, dst2, dst2, tmp0, tmp1);
>         dst_val0 = __msa_copy_u_d((v2i64) tmp1, 0);
>@@ -1726,7 +1726,7 @@ static void hevc_hv_bi_8t_4w_msa(uint8_t *src0_ptr,
>         ADDS_SH2_SH(out0, in0, out1, in1, out0, out1);
>         ADDS_SH2_SH(out0, const_vec, out1, const_vec, out0, out1);
>         SRARI_H2_SH(out0, out1, 7);
>-        CLIP_SH2_0_255_MAX_SATU(out0, out1);
>+        CLIP_SH2_0_255(out0, out1);
>         out = (v16u8) __msa_pckev_b((v16i8) out1, (v16i8) out0);
>         ST_W4(out, 0, 1, 2, 3, dst, dst_stride);
>         dst += (4 * dst_stride);
>@@ -1854,7 +1854,7 @@ static void hevc_hv_bi_8t_8multx1mult_msa(uint8_t *src0_ptr,
>             tmp = __msa_pckev_h((v8i16) dst0_l, (v8i16) dst0_r);
>             ADDS_SH2_SH(tmp, in0, tmp, const_vec, tmp, tmp);
>             tmp = __msa_srari_h(tmp, 7);
>-            tmp = CLIP_SH_0_255_MAX_SATU(tmp);
>+            CLIP_SH_0_255(tmp);
>             out = (v16u8) __msa_pckev_b((v16i8) tmp, (v16i8) tmp);
>             ST_D1(out, 0, dst_tmp);
>             dst_tmp += dst_stride;
>@@ -2000,7 +2000,7 @@ static void hevc_hv_bi_8t_12w_msa(uint8_t *src0_ptr,
>         tmp = __msa_pckev_h((v8i16) dst0_l, (v8i16) dst0_r);
>         ADDS_SH2_SH(tmp, in0, tmp, const_vec, tmp, tmp);
>         tmp = __msa_srari_h(tmp, 7);
>-        tmp = CLIP_SH_0_255_MAX_SATU(tmp);
>+        CLIP_SH_0_255(tmp);
>         out = (v16u8) __msa_pckev_b((v16i8) tmp, (v16i8) tmp);
>         ST_D1(out, 0, dst_tmp);
>         dst_tmp += dst_stride;
>@@ -2088,7 +2088,7 @@ static void hevc_hv_bi_8t_12w_msa(uint8_t *src0_ptr,
>         ADDS_SH2_SH(out0, in0, out1, in1, out0, out1);
>         ADDS_SH2_SH(out0, const_vec, out1, const_vec, out0, out1);
>         SRARI_H2_SH(out0, out1, 7);
>-        CLIP_SH2_0_255_MAX_SATU(out0, out1);
>+        CLIP_SH2_0_255(out0, out1);
>         out = (v16u8) __msa_pckev_b((v16i8) out1, (v16i8) out0);
>         ST_W4(out, 0, 1, 2, 3, dst, dst_stride);
>         dst += (4 * dst_stride);
>@@ -2215,7 +2215,7 @@ static void hevc_hz_bi_4t_4x2_msa(uint8_t *src0_ptr,
>
>     tmp0 = __msa_adds_s_h(tmp0, in0);
>     tmp0 = __msa_srari_h(tmp0, 7);
>-    tmp0 = CLIP_SH_0_255(tmp0);
>+    CLIP_SH_0_255(tmp0);
>     dst0 = __msa_pckev_b((v16i8) tmp0, (v16i8) tmp0);
>
>     ST_W2(dst0, 0, 1, dst, dst_stride);
>@@ -2943,7 +2943,7 @@ static void hevc_vt_bi_4t_4x2_msa(uint8_t *src0_ptr,
>     DPADD_SB2_SH(src2110, src4332, filt0, filt1, dst10, dst10);
>     dst10 = __msa_adds_s_h(dst10, in0);
>     dst10 = __msa_srari_h(dst10, 7);
>-    dst10 = CLIP_SH_0_255(dst10);
>+    CLIP_SH_0_255(dst10);
>
>     dst10 = (v8i16) __msa_pckev_b((v16i8) dst10, (v16i8) dst10);
>     ST_W2(dst10, 0, 1, dst, dst_stride);
>@@ -3843,7 +3843,7 @@ static void hevc_hv_bi_4t_4x2_msa(uint8_t *src0_ptr,
>     tmp = __msa_pckev_h((v8i16) dst1, (v8i16) dst0);
>     tmp = __msa_adds_s_h(tmp, in0);
>     tmp = __msa_srari_h(tmp, 7);
>-    tmp = CLIP_SH_0_255_MAX_SATU(tmp);
>+    CLIP_SH_0_255(tmp);
>     out = (v16u8) __msa_pckev_b((v16i8) tmp, (v16i8) tmp);
>     ST_W2(out, 0, 1, dst, dst_stride);
> }
>@@ -3919,7 +3919,7 @@ static void hevc_hv_bi_4t_4x4_msa(uint8_t *src0_ptr,
>     PCKEV_H2_SH(dst1, dst0, dst3, dst2, tmp0, tmp1);
>     ADDS_SH2_SH(tmp0, in0, tmp1, in1, tmp0, tmp1);
>     SRARI_H2_SH(tmp0, tmp1, 7);
>-    CLIP_SH2_0_255_MAX_SATU(tmp0, tmp1);
>+    CLIP_SH2_0_255(tmp0, tmp1);
>     out = (v16u8) __msa_pckev_b((v16i8) tmp1, (v16i8) tmp0);
>     ST_W4(out, 0, 1, 2, 3, dst, dst_stride);
> }
>@@ -4032,7 +4032,7 @@ static void hevc_hv_bi_4t_4multx8mult_msa(uint8_t *src0_ptr,
>         ADDS_SH4_SH(in0, tmp0, in1, tmp1, in2, tmp2, in3, tmp3, tmp0, tmp1,
>                     tmp2, tmp3);
>         SRARI_H4_SH(tmp0, tmp1, tmp2, tmp3, 7);
>-        CLIP_SH4_0_255_MAX_SATU(tmp0, tmp1, tmp2, tmp3);
>+        CLIP_SH4_0_255(tmp0, tmp1, tmp2, tmp3);
>         PCKEV_B2_UB(tmp1, tmp0, tmp3, tmp2, out0, out1);
>         ST_W8(out0, out1, 0, 1, 2, 3, 0, 1, 2, 3, dst, dst_stride);
>         dst += (8 * dst_stride);
>@@ -4200,7 +4200,7 @@ static void hevc_hv_bi_4t_6w_msa(uint8_t *src0_ptr,
>     ADDS_SH4_SH(in0, tmp0, in1, tmp1, in2, tmp2, in3, tmp3, tmp0, tmp1, tmp2,
>                 tmp3);
>     SRARI_H4_SH(tmp0, tmp1, tmp2, tmp3, 7);
>-    CLIP_SH4_0_255_MAX_SATU(tmp0, tmp1, tmp2, tmp3);
>+    CLIP_SH4_0_255(tmp0, tmp1, tmp2, tmp3);
>     PCKEV_B2_UB(tmp1, tmp0, tmp3, tmp2, out0, out1);
>     ST_W8(out0, out1, 0, 1, 2, 3, 0, 1, 2, 3, dst, dst_stride);
>
>@@ -4212,7 +4212,7 @@ static void hevc_hv_bi_4t_6w_msa(uint8_t *src0_ptr,
>     ADDS_SH2_SH(in4, const_vec, in5, const_vec, in4, in5);
>     ADDS_SH2_SH(in4, tmp4, in5, tmp5, tmp4, tmp5);
>     SRARI_H2_SH(tmp4, tmp5, 7);
>-    CLIP_SH2_0_255_MAX_SATU(tmp4, tmp5);
>+    CLIP_SH2_0_255(tmp4, tmp5);
>     out2 = (v16u8) __msa_pckev_b((v16i8) tmp5, (v16i8) tmp4);
>     ST_H8(out2, 0, 1, 2, 3, 4, 5, 6, 7, dst + 4, dst_stride);
> }
>@@ -4286,7 +4286,7 @@ static void hevc_hv_bi_4t_8x2_msa(uint8_t *src0_ptr,
>     PCKEV_H2_SH(dst0_l, dst0_r, dst1_l, dst1_r, tmp0, tmp1);
>     ADDS_SH2_SH(in0, tmp0, in1, tmp1, tmp0, tmp1);
>     SRARI_H2_SH(tmp0, tmp1, 7);
>-    CLIP_SH2_0_255_MAX_SATU(tmp0, tmp1);
>+    CLIP_SH2_0_255(tmp0, tmp1);
>     out = (v16u8) __msa_pckev_b((v16i8) tmp1, (v16i8) tmp0);
>     ST_D2(out, 0, 1, dst, dst_stride);
> }
>@@ -4380,7 +4380,7 @@ static void hevc_hv_bi_4t_8multx4_msa(uint8_t *src0_ptr,
>         ADDS_SH4_SH(in0, tmp0, in1, tmp1, in2, tmp2, in3, tmp3,
>                     tmp0, tmp1, tmp2, tmp3);
>         SRARI_H4_SH(tmp0, tmp1, tmp2, tmp3, 7);
>-        CLIP_SH4_0_255_MAX_SATU(tmp0, tmp1, tmp2, tmp3);
>+        CLIP_SH4_0_255(tmp0, tmp1, tmp2, tmp3);
>         PCKEV_B2_UB(tmp1, tmp0, tmp3, tmp2, out0, out1);
>         ST_D4(out0, out1, 0, 1, 0, 1, dst, dst_stride);
>         dst += 8;
>@@ -4495,8 +4495,8 @@ static void hevc_hv_bi_4t_8x6_msa(uint8_t *src0_ptr,
>     ADDS_SH2_SH(in4, tmp4, in5, tmp5, tmp4, tmp5);
>     SRARI_H4_SH(tmp0, tmp1, tmp2, tmp3, 7);
>     SRARI_H2_SH(tmp4, tmp5, 7);
>-    CLIP_SH4_0_255_MAX_SATU(tmp0, tmp1, tmp2, tmp3);
>-    CLIP_SH2_0_255_MAX_SATU(tmp4, tmp5);
>+    CLIP_SH4_0_255(tmp0, tmp1, tmp2, tmp3);
>+    CLIP_SH2_0_255(tmp4, tmp5);
>     PCKEV_B2_UB(tmp1, tmp0, tmp3, tmp2, out0, out1);
>     out2 = (v16u8) __msa_pckev_b((v16i8) tmp5, (v16i8) tmp4);
>     ST_D4(out0, out1, 0, 1, 0, 1, dst, dst_stride);
>@@ -4610,7 +4610,7 @@ static void hevc_hv_bi_4t_8multx4mult_msa(uint8_t *src0_ptr,
>             ADDS_SH4_SH(in0, tmp0, in1, tmp1, in2, tmp2, in3, tmp3,
>                         tmp0, tmp1, tmp2, tmp3);
>             SRARI_H4_SH(tmp0, tmp1, tmp2, tmp3, 7);
>-            CLIP_SH4_0_255_MAX_SATU(tmp0, tmp1, tmp2, tmp3);
>+            CLIP_SH4_0_255(tmp0, tmp1, tmp2, tmp3);
>             PCKEV_B2_UB(tmp1, tmp0, tmp3, tmp2, out0, out1);
>             ST_D4(out0, out1, 0, 1, 0, 1, dst_tmp, dst_stride);
>             dst_tmp += (4 * dst_stride);
>@@ -4760,7 +4760,7 @@ static void hevc_hv_bi_4t_12w_msa(uint8_t *src0_ptr,
>         ADDS_SH4_SH(in0, tmp0, in1, tmp1, in2, tmp2, in3, tmp3,
>                     tmp0, tmp1, tmp2, tmp3);
>         SRARI_H4_SH(tmp0, tmp1, tmp2, tmp3, 7);
>-        CLIP_SH4_0_255_MAX_SATU(tmp0, tmp1, tmp2, tmp3);
>+        CLIP_SH4_0_255(tmp0, tmp1, tmp2, tmp3);
>         PCKEV_B2_UB(tmp1, tmp0, tmp3, tmp2, out0, out1);
>         ST_D4(out0, out1, 0, 1, 0, 1, dst_tmp, dst_stride);
>         dst_tmp += (4 * dst_stride);
>@@ -4846,7 +4846,7 @@ static void hevc_hv_bi_4t_12w_msa(uint8_t *src0_ptr,
>         ADDS_SH4_SH(in0, tmp0, in1, tmp1, in2, tmp2, in3, tmp3,
>                     tmp0, tmp1, tmp2, tmp3);
>         SRARI_H4_SH(tmp0, tmp1, tmp2, tmp3, 7);
>-        CLIP_SH4_0_255_MAX_SATU(tmp0, tmp1, tmp2, tmp3);
>+        CLIP_SH4_0_255(tmp0, tmp1, tmp2, tmp3);
>         PCKEV_B2_UB(tmp1, tmp0, tmp3, tmp2, out0, out1);
>         ST_W8(out0, out1, 0, 1, 2, 3, 0, 1, 2, 3, dst, dst_stride);
>         dst += (8 * dst_stride);
>diff --git a/libavcodec/mips/hevc_mc_biw_msa.c b/libavcodec/mips/hevc_mc_biw_msa.c
>index 68f122e..f775ea8 100644
>--- a/libavcodec/mips/hevc_mc_biw_msa.c
>+++ b/libavcodec/mips/hevc_mc_biw_msa.c
>@@ -66,7 +66,7 @@ static const uint8_t ff_hevc_mask_arr[16 * 2] __attribute__((aligned(0x40))) = {
>     out1_l = __msa_dpadd_s_w(offset, (v8i16) out1_l, (v8i16) wgt);   \
>     SRAR_W4_SW(out0_r, out1_r, out0_l, out1_l, rnd);                 \
>     PCKEV_H2_SH(out0_l, out0_r, out1_l, out1_r, out0, out1);         \
>-    CLIP_SH2_0_255_MAX_SATU(out0, out1);                             \
>+    CLIP_SH2_0_255(out0, out1);                                      \
> }
>
> #define HEVC_BIW_RND_CLIP4_MAX_SATU(in0, in1, in2, in3, vec0, vec1, vec2,  \
>@@ -124,7 +124,7 @@ static void hevc_biwgt_copy_4w_msa(uint8_t *src0_ptr,
>         dst0_l = __msa_dpadd_s_w(offset_vec, (v8i16) dst0_l, weight_vec);
>         SRAR_W2_SW(dst0_r, dst0_l, rnd_vec);
>         dst0 = (v8i16) __msa_pckev_h((v8i16) dst0_l, (v8i16) dst0_r);
>-        dst0 = CLIP_SH_0_255_MAX_SATU(dst0);
>+        CLIP_SH_0_255(dst0);
>         out0 = (v16u8) __msa_pckev_b((v16i8) dst0, (v16i8) dst0);
>         ST_W2(out0, 0, 1, dst, dst_stride);
>     } else if (4 == height) {
>@@ -1069,8 +1069,8 @@ static void hevc_hz_biwgt_8t_24w_msa(uint8_t *src0_ptr,
>         dst2_l = __msa_dpadd_s_w(offset_vec, (v8i16) dst2_l,
>                                  (v8i16) weight_vec);
>         SRAR_W2_SW(dst2_r, dst2_l, rnd_vec);
>-        dst2_r = (v4i32) __msa_pckev_h((v8i16) dst2_l, (v8i16) dst2_r);
>-        out2 = CLIP_SH_0_255(dst2_r);
>+        out2 = __msa_pckev_h((v8i16) dst2_l, (v8i16) dst2_r);
>+        CLIP_SH_0_255(out2);
>
>         LD_SB2(src0_ptr, 16, src0, src1);
>         src0_ptr += src_stride;
>@@ -1100,8 +1100,8 @@ static void hevc_hz_biwgt_8t_24w_msa(uint8_t *src0_ptr,
>     dst2_r = __msa_dpadd_s_w(offset_vec, (v8i16) dst2_r, (v8i16) weight_vec);
>     dst2_l = __msa_dpadd_s_w(offset_vec, (v8i16) dst2_l, (v8i16) weight_vec);
>     SRAR_W2_SW(dst2_r, dst2_l, rnd_vec);
>-    dst2_r = (v4i32) __msa_pckev_h((v8i16) dst2_l, (v8i16) dst2_r);
>-    out2 = CLIP_SH_0_255(dst2_r);
>+    out2 = __msa_pckev_h((v8i16) dst2_l, (v8i16) dst2_r);
>+    CLIP_SH_0_255(out2);
>     PCKEV_B2_SH(out1, out0, out2, out2, out0, out2);
>     dst_val0 = __msa_copy_u_d((v2i64) out2, 0);
>     ST_SH(out0, dst);
>@@ -1674,8 +1674,8 @@ static void hevc_vt_biwgt_8t_12w_msa(uint8_t *src0_ptr,
>         dst2_l = __msa_dpadd_s_w(offset_vec, (v8i16) dst2_l,
>                                  (v8i16) weight_vec);
>         SRAR_W2_SW(dst2_r, dst2_l, rnd_vec);
>-        dst2_r = (v4i32) __msa_pckev_h((v8i16) dst2_l, (v8i16) dst2_r);
>-        out2 = CLIP_SH_0_255(dst2_r);
>+        out2 = __msa_pckev_h((v8i16) dst2_l, (v8i16) dst2_r);
>+        CLIP_SH_0_255(out2);
>         PCKEV_B2_SH(out1, out0, out2, out2, out0, out2);
>         ST_D2(out0, 0, 1, dst, dst_stride);
>         ST_W2(out2, 0, 1, dst + 8, dst_stride);
>@@ -2048,7 +2048,7 @@ static void hevc_hv_biwgt_8t_4w_msa(uint8_t *src0_ptr,
>         dst2 = __msa_dpadd_s_w(offset_vec, tmp2, weight_vec);
>         dst3 = __msa_dpadd_s_w(offset_vec, tmp3, weight_vec);
>         SRAR_W4_SW(dst0, dst1, dst2, dst3, rnd_vec);
>-        CLIP_SW4_0_255_MAX_SATU(dst0, dst1, dst2, dst3);
>+        CLIP_SW4_0_255(dst0, dst1, dst2, dst3);
>         PCKEV_H2_SH(dst1, dst0, dst3, dst2, tmp0, tmp1);
>         out = (v16u8) __msa_pckev_b((v16i8) tmp1, (v16i8) tmp0);
>         ST_W4(out, 0, 1, 2, 3, dst, dst_stride);
>@@ -2226,7 +2226,7 @@ static void hevc_hv_biwgt_8t_8multx2mult_msa(uint8_t *src0_ptr,
>             dst1_r = __msa_dpadd_s_w(offset_vec, tmp2, weight_vec);
>             dst1_l = __msa_dpadd_s_w(offset_vec, tmp3, weight_vec);
>             SRAR_W4_SW(dst0_l, dst0_r, dst1_l, dst1_r, rnd_vec);
>-            CLIP_SW4_0_255_MAX_SATU(dst0_l, dst0_r, dst1_l, dst1_r);
>+            CLIP_SW4_0_255(dst0_l, dst0_r, dst1_l, dst1_r);
>             PCKEV_H2_SH(dst0_l, dst0_r, dst1_l, dst1_r, tmp0, tmp1);
>             out = (v16u8) __msa_pckev_b((v16i8) tmp1, (v16i8) tmp0);
>             ST_D2(out, 0, 1, dst_tmp, dst_stride);
>@@ -2412,7 +2412,7 @@ static void hevc_hv_biwgt_8t_12w_msa(uint8_t *src0_ptr,
>         dst2 = __msa_dpadd_s_w(offset_vec, tmp2, weight_vec);
>         dst3 = __msa_dpadd_s_w(offset_vec, tmp3, weight_vec);
>         SRAR_W4_SW(dst1, dst0, dst3, dst2, rnd_vec);
>-        CLIP_SW4_0_255_MAX_SATU(dst1, dst0, dst3, dst2);
>+        CLIP_SW4_0_255(dst1, dst0, dst3, dst2);
>         PCKEV_H2_SH(dst1, dst0, dst3, dst2, tmp0, tmp1);
>         out = (v16u8) __msa_pckev_b((v16i8) tmp1, (v16i8) tmp0);
>         ST_D2(out, 0, 1, dst_tmp, dst_stride);
>@@ -2503,7 +2503,7 @@ static void hevc_hv_biwgt_8t_12w_msa(uint8_t *src0_ptr,
>         dst2 = __msa_dpadd_s_w(offset_vec, tmp2, weight_vec);
>         dst3 = __msa_dpadd_s_w(offset_vec, tmp3, weight_vec);
>         SRAR_W4_SW(dst0, dst1, dst2, dst3, rnd_vec);
>-        CLIP_SW4_0_255_MAX_SATU(dst0, dst1, dst2, dst3);
>+        CLIP_SW4_0_255(dst0, dst1, dst2, dst3);
>         PCKEV_H2_SH(dst1, dst0, dst3, dst2, tmp0, tmp1);
>         out = (v16u8) __msa_pckev_b((v16i8) tmp1, (v16i8) tmp0);
>         ST_W4(out, 0, 1, 2, 3, dst, dst_stride);
>@@ -2683,8 +2683,8 @@ static void hevc_hz_biwgt_4t_4x2_msa(uint8_t *src0_ptr,
>     dst0_r = __msa_dpadd_s_w(offset_vec, (v8i16) dst0_r, (v8i16) weight_vec);
>     dst0_l = __msa_dpadd_s_w(offset_vec, (v8i16) dst0_l, (v8i16) weight_vec);
>     SRAR_W2_SW(dst0_r, dst0_l, rnd_vec);
>-    dst0_r = (v4i32) __msa_pckev_h((v8i16) dst0_l, (v8i16) dst0_r);
>-    out0 = CLIP_SH_0_255(dst0_r);
>+    out0 = __msa_pckev_h((v8i16) dst0_l, (v8i16) dst0_r);
>+    CLIP_SH_0_255(out0);
>     out0 = (v8i16) __msa_pckev_b((v16i8) out0, (v16i8) out0);
>     ST_W2(out0, 0, 1, dst, dst_stride);
> }
>@@ -3554,8 +3554,8 @@ static void hevc_vt_biwgt_4t_4x2_msa(uint8_t *src0_ptr,
>     dst10_r = __msa_dpadd_s_w(offset_vec, (v8i16) dst10_r, (v8i16) weight_vec);
>     dst10_l = __msa_dpadd_s_w(offset_vec, (v8i16) dst10_l, (v8i16) weight_vec);
>     SRAR_W2_SW(dst10_r, dst10_l, rnd_vec);
>-    dst10_r = (v4i32) __msa_pckev_h((v8i16) dst10_l, (v8i16) dst10_r);
>-    out = CLIP_SH_0_255(dst10_r);
>+    out = __msa_pckev_h((v8i16) dst10_l, (v8i16) dst10_r);
>+    CLIP_SH_0_255(out);
>     out = (v8i16) __msa_pckev_b((v16i8) out, (v16i8) out);
>     ST_W2(out, 0, 1, dst, dst_stride);
> }
>@@ -4575,7 +4575,7 @@ static void hevc_hv_biwgt_4t_4x2_msa(uint8_t *src0_ptr,
>     dst1 = __msa_dpadd_s_w(offset_vec, tmp1, weight_vec);
>     SRAR_W2_SW(dst0, dst1, rnd_vec);
>     tmp = __msa_pckev_h((v8i16) dst1, (v8i16) dst0);
>-    tmp = CLIP_SH_0_255_MAX_SATU(tmp);
>+    CLIP_SH_0_255(tmp);
>     out = (v16u8) __msa_pckev_b((v16i8) tmp, (v16i8) tmp);
>     ST_W2(out, 0, 1, dst, dst_stride);
> }
>@@ -4672,7 +4672,7 @@ static void hevc_hv_biwgt_4t_4x4_msa(uint8_t *src0_ptr,
>     dst3 = __msa_dpadd_s_w(offset_vec, tmp3, weight_vec);
>     SRAR_W4_SW(dst0, dst1, dst2, dst3, rnd_vec);
>     PCKEV_H2_SH(dst1, dst0, dst3, dst2, tmp0, tmp1);
>-    CLIP_SH2_0_255_MAX_SATU(tmp0, tmp1);
>+    CLIP_SH2_0_255(tmp0, tmp1);
>     out = (v16u8) __msa_pckev_b((v16i8) tmp1, (v16i8) tmp0);
>     ST_W4(out, 0, 1, 2, 3, dst, dst_stride);
> }
>@@ -4810,7 +4810,7 @@ static void hevc_hv_biwgt_4t_4multx8mult_msa(uint8_t *src0_ptr,
>         SRAR_W4_SW(dst4, dst5, dst6, dst7, rnd_vec);
>         PCKEV_H4_SH(dst1, dst0, dst3, dst2, dst5, dst4, dst7, dst6, tmp0, tmp1,
>                     tmp2, tmp3);
>-        CLIP_SH4_0_255_MAX_SATU(tmp0, tmp1, tmp2, tmp3);
>+        CLIP_SH4_0_255(tmp0, tmp1, tmp2, tmp3);
>         PCKEV_B2_UB(tmp1, tmp0, tmp3, tmp2, out0, out1);
>         ST_W8(out0, out1, 0, 1, 2, 3, 0, 1, 2, 3, dst, dst_stride);
>         dst += (8 * dst_stride);
>@@ -5008,7 +5008,7 @@ static void hevc_hv_biwgt_4t_6w_msa(uint8_t *src0_ptr,
>     SRAR_W4_SW(dst4, dst5, dst6, dst7, rnd_vec);
>     PCKEV_H4_SH(dst1, dst0, dst3, dst2, dst5, dst4, dst7, dst6, tmp0, tmp1,
>                 tmp2, tmp3);
>-    CLIP_SH4_0_255_MAX_SATU(tmp0, tmp1, tmp2, tmp3);
>+    CLIP_SH4_0_255(tmp0, tmp1, tmp2, tmp3);
>     PCKEV_B2_UB(tmp1, tmp0, tmp3, tmp2, out0, out1);
>     ST_W8(out0, out1, 0, 1, 2, 3, 0, 1, 2, 3, dst, dst_stride);
>
>@@ -5030,7 +5030,7 @@ static void hevc_hv_biwgt_4t_6w_msa(uint8_t *src0_ptr,
>     SRAR_W4_SW(dst0, dst1, dst2, dst3, rnd_vec);
>     PCKEV_H2_SH(dst1, dst0, dst3, dst2, tmp4, tmp5);
>
>-    CLIP_SH2_0_255_MAX_SATU(tmp4, tmp5);
>+    CLIP_SH2_0_255(tmp4, tmp5);
>     out2 = (v16u8) __msa_pckev_b((v16i8) tmp5, (v16i8) tmp4);
>     ST_H8(out2, 0, 1, 2, 3, 4, 5, 6, 7, dst + 4, dst_stride);
> }
>@@ -5126,7 +5126,7 @@ static void hevc_hv_biwgt_4t_8x2_msa(uint8_t *src0_ptr,
>     dst1_l = __msa_dpadd_s_w(offset_vec, tmp3, weight_vec);
>     SRAR_W4_SW(dst0_r, dst0_l, dst1_r, dst1_l, rnd_vec);
>     PCKEV_H2_SH(dst0_l, dst0_r, dst1_l, dst1_r, tmp0, tmp1);
>-    CLIP_SH2_0_255_MAX_SATU(tmp0, tmp1);
>+    CLIP_SH2_0_255(tmp0, tmp1);
>     out = (v16u8) __msa_pckev_b((v16i8) tmp1, (v16i8) tmp0);
>     ST_D2(out, 0, 1, dst, dst_stride);
> }
>@@ -5248,7 +5248,7 @@ static void hevc_hv_biwgt_4t_8multx4_msa(uint8_t *src0_ptr,
>         SRAR_W4_SW(dst4, dst5, dst6, dst7, rnd_vec);
>         PCKEV_H4_SH(dst1, dst0, dst3, dst2, dst5, dst4, dst7, dst6,
>                     tmp0, tmp1, tmp2, tmp3);
>-        CLIP_SH4_0_255_MAX_SATU(tmp0, tmp1, tmp2, tmp3);
>+        CLIP_SH4_0_255(tmp0, tmp1, tmp2, tmp3);
>         PCKEV_B2_UB(tmp1, tmp0, tmp3, tmp2, out0, out1);
>         ST_D4(out0, out1, 0, 1, 0, 1, dst, dst_stride);
>         dst += 8;
>@@ -5387,7 +5387,7 @@ static void hevc_hv_biwgt_4t_8x6_msa(uint8_t *src0_ptr,
>     SRAR_W4_SW(dst4, dst5, dst6, dst7, rnd_vec);
>     PCKEV_H4_SH(dst1, dst0, dst3, dst2, dst5, dst4, dst7, dst6,
>                 tmp0, tmp1, tmp2, tmp3);
>-    CLIP_SH4_0_255_MAX_SATU(tmp0, tmp1, tmp2, tmp3);
>+    CLIP_SH4_0_255(tmp0, tmp1, tmp2, tmp3);
>     PCKEV_B2_UB(tmp1, tmp0, tmp3, tmp2, out0, out1);
>
>     PCKEV_H2_SW(dst4_l, dst4_r, dst5_l, dst5_r, dst0, dst1);
>@@ -5399,7 +5399,7 @@ static void hevc_hv_biwgt_4t_8x6_msa(uint8_t *src0_ptr,
>     dst3 = __msa_dpadd_s_w(offset_vec, tmp3, weight_vec);
>     SRAR_W4_SW(dst0, dst1, dst2, dst3, rnd_vec);
>     PCKEV_H2_SH(dst1, dst0, dst3, dst2, tmp4, tmp5);
>-    CLIP_SH2_0_255_MAX_SATU(tmp4, tmp5);
>+    CLIP_SH2_0_255(tmp4, tmp5);
>     out2 = (v16u8) __msa_pckev_b((v16i8) tmp5, (v16i8) tmp4);
>     ST_D4(out0, out1, 0, 1, 0, 1, dst, dst_stride);
>     ST_D2(out2, 0, 1, dst + 4 * dst_stride, dst_stride);
>@@ -5537,7 +5537,7 @@ static void hevc_hv_biwgt_4t_8multx4mult_msa(uint8_t *src0_ptr,
>             SRAR_W4_SW(dst4, dst5, dst6, dst7, rnd_vec);
>             PCKEV_H4_SH(dst1, dst0, dst3, dst2, dst5, dst4, dst7, dst6,
>                         tmp0, tmp1, tmp2, tmp3);
>-            CLIP_SH4_0_255_MAX_SATU(tmp0, tmp1, tmp2, tmp3);
>+            CLIP_SH4_0_255(tmp0, tmp1, tmp2, tmp3);
>             PCKEV_B2_UB(tmp1, tmp0, tmp3, tmp2, out0, out1);
>             ST_D4(out0, out1, 0, 1, 0, 1, dst_tmp, dst_stride);
>             dst_tmp += (4 * dst_stride);
>@@ -5724,7 +5724,7 @@ static void hevc_hv_biwgt_4t_12w_msa(uint8_t *src0_ptr,
>         SRAR_W4_SW(dst4, dst5, dst6, dst7, rnd_vec);
>         PCKEV_H4_SH(dst1, dst0, dst3, dst2, dst5, dst4, dst7, dst6,
>                     tmp0, tmp1, tmp2, tmp3);
>-        CLIP_SH4_0_255_MAX_SATU(tmp0, tmp1, tmp2, tmp3);
>+        CLIP_SH4_0_255(tmp0, tmp1, tmp2, tmp3);
>         PCKEV_B2_UB(tmp1, tmp0, tmp3, tmp2, out0, out1);
>         ST_D4(out0, out1, 0, 1, 0, 1, dst_tmp, dst_stride);
>         dst_tmp += (4 * dst_stride);
>@@ -5820,7 +5820,7 @@ static void hevc_hv_biwgt_4t_12w_msa(uint8_t *src0_ptr,
>         SRAR_W4_SW(dst4, dst5, dst6, dst7, rnd_vec);
>         PCKEV_H4_SH(dst1, dst0, dst3, dst2, dst5, dst4, dst7, dst6,
>                     tmp0, tmp1, tmp2, tmp3);
>-        CLIP_SH4_0_255_MAX_SATU(tmp0, tmp1, tmp2, tmp3);
>+        CLIP_SH4_0_255(tmp0, tmp1, tmp2, tmp3);
>         PCKEV_B2_UB(tmp1, tmp0, tmp3, tmp2, out0, out1);
>         ST_W8(out0, out1, 0, 1, 2, 3, 0, 1, 2, 3, dst, dst_stride);
>         dst += (8 * dst_stride);
>diff --git a/libavcodec/mips/hevc_mc_uniw_msa.c b/libavcodec/mips/hevc_mc_uniw_msa.c
>index cad1240..1a8c251 100644
>--- a/libavcodec/mips/hevc_mc_uniw_msa.c
>+++ b/libavcodec/mips/hevc_mc_uniw_msa.c
>@@ -41,7 +41,7 @@ static const uint8_t ff_hevc_mask_arr[16 * 2] __attribute__((aligned(0x40))) = {
>     SRAR_W4_SW(in0_r_m, in1_r_m, in0_l_m, in1_l_m, rnd_w);                    \
>     PCKEV_H2_SH(in0_l_m, in0_r_m, in1_l_m, in1_r_m, out0_h, out1_h);          \
>     ADDS_SH2_SH(out0_h, offset_h, out1_h, offset_h, out0_h, out1_h);          \
>-    CLIP_SH2_0_255_MAX_SATU(out0_h, out1_h);                                  \
>+    CLIP_SH2_0_255(out0_h, out1_h);                                           \
> }
>
> #define HEVC_UNIW_RND_CLIP4_MAX_SATU_H(in0_h, in1_h, in2_h, in3_h, wgt_w,  \
>@@ -88,7 +88,7 @@ static void hevc_uniwgt_copy_4w_msa(uint8_t *src,
>         SRAR_W2_SW(dst0_r, dst0_l, rnd_vec);
>         dst0 = __msa_pckev_h((v8i16) dst0_l, (v8i16) dst0_r);
>         dst0 += offset_vec;
>-        dst0 = CLIP_SH_0_255_MAX_SATU(dst0);
>+        CLIP_SH_0_255(dst0);
>         out0 = (v16u8) __msa_pckev_b((v16i8) dst0, (v16i8) dst0);
>         ST_W2(out0, 0, 1, dst, dst_stride);
>     } else if (4 == height) {
>@@ -1863,7 +1863,7 @@ static void hevc_hv_uniwgt_8t_4w_msa(uint8_t *src,
>         SRAR_W4_SW(dst0_r, dst1_r, dst2_r, dst3_r, rnd_vec);
>         ADD2(dst0_r, offset_vec, dst1_r, offset_vec, dst0_r, dst1_r);
>         ADD2(dst2_r, offset_vec, dst3_r, offset_vec, dst2_r, dst3_r);
>-        CLIP_SW4_0_255_MAX_SATU(dst0_r, dst1_r, dst2_r, dst3_r);
>+        CLIP_SW4_0_255(dst0_r, dst1_r, dst2_r, dst3_r);
>         PCKEV_H2_SW(dst1_r, dst0_r, dst3_r, dst2_r, dst0_r, dst1_r);
>         out = (v16u8) __msa_pckev_b((v16i8) dst1_r, (v16i8) dst0_r);
>         ST_W4(out, 0, 1, 2, 3, dst, dst_stride);
>@@ -2014,7 +2014,7 @@ static void hevc_hv_uniwgt_8t_8multx2mult_msa(uint8_t *src,
>             SRAR_W4_SW(dst0_r, dst1_r, dst0_l, dst1_l, rnd_vec);
>             ADD2(dst0_r, offset_vec, dst0_l, offset_vec, dst0_r, dst0_l);
>             ADD2(dst1_r, offset_vec, dst1_l, offset_vec, dst1_r, dst1_l);
>-            CLIP_SW4_0_255_MAX_SATU(dst0_r, dst1_r, dst0_l, dst1_l);
>+            CLIP_SW4_0_255(dst0_r, dst1_r, dst0_l, dst1_l);
>
>             PCKEV_H2_SW(dst0_l, dst0_r, dst1_l, dst1_r, dst0_r, dst1_r);
>             dst0_r = (v4i32) __msa_pckev_b((v16i8) dst1_r, (v16i8) dst0_r);
>@@ -2165,7 +2165,7 @@ static void hevc_hv_uniwgt_8t_12w_msa(uint8_t *src,
>         MUL2(dst0_r, weight_vec, dst0_l, weight_vec, dst0_r, dst0_l);
>         SRAR_W2_SW(dst0_r, dst0_l, rnd_vec);
>         ADD2(dst0_r, offset_vec, dst0_l, offset_vec, dst0_r, dst0_l);
>-        CLIP_SW2_0_255_MAX_SATU(dst0_r, dst0_l);
>+        CLIP_SW2_0_255(dst0_r, dst0_l);
>         dst0_r = (v4i32) __msa_pckev_h((v8i16) dst0_l, (v8i16) dst0_r);
>         out = (v16u8) __msa_pckev_b((v16i8) dst0_r, (v16i8) dst0_r);
>         ST_D1(out, 0, dst_tmp);
>@@ -2246,7 +2246,7 @@ static void hevc_hv_uniwgt_8t_12w_msa(uint8_t *src,
>         SRAR_W4_SW(dst0_r, dst1_r, dst2_r, dst3_r, rnd_vec);
>         ADD2(dst0_r, offset_vec, dst1_r, offset_vec, dst0_r, dst1_r);
>         ADD2(dst2_r, offset_vec, dst3_r, offset_vec, dst2_r, dst3_r);
>-        CLIP_SW4_0_255_MAX_SATU(dst0_r, dst1_r, dst2_r, dst3_r);
>+        CLIP_SW4_0_255(dst0_r, dst1_r, dst2_r, dst3_r);
>         PCKEV_H2_SW(dst1_r, dst0_r, dst3_r, dst2_r, dst0_r, dst1_r);
>         out = (v16u8) __msa_pckev_b((v16i8) dst1_r, (v16i8) dst0_r);
>         ST_W4(out, 0, 1, 2, 3, dst, dst_stride);
>@@ -2394,7 +2394,7 @@ static void hevc_hz_uniwgt_4t_4x2_msa(uint8_t *src,
>     SRAR_W2_SW(dst0_r, dst0_l, rnd_vec);
>     dst0 = __msa_pckev_h((v8i16) dst0_l, (v8i16) dst0_r);
>     dst0 = __msa_adds_s_h(dst0, offset_vec);
>-    dst0 = CLIP_SH_0_255_MAX_SATU(dst0);
>+    CLIP_SH_0_255(dst0);
>     out = (v16u8) __msa_pckev_b((v16i8) dst0, (v16i8) dst0);
>     ST_W2(out, 0, 1, dst, dst_stride);
>     dst += (4 * dst_stride);
>@@ -3295,7 +3295,7 @@ static void hevc_vt_uniwgt_4t_4x2_msa(uint8_t *src,
>     SRAR_W2_SW(dst0_r, dst0_l, rnd_vec);
>     dst0 = __msa_pckev_h((v8i16) dst0_l, (v8i16) dst0_r);
>     dst0 = __msa_adds_s_h(dst0, offset_vec);
>-    dst0 = CLIP_SH_0_255_MAX_SATU(dst0);
>+    CLIP_SH_0_255(dst0);
>     out = (v16u8) __msa_pckev_b((v16i8) dst0, (v16i8) dst0);
>     ST_W2(out, 0, 1, dst, dst_stride);
> }
>@@ -4247,7 +4247,7 @@ static void hevc_hv_uniwgt_4t_4x2_msa(uint8_t *src,
>     SRAR_W2_SW(dst0, dst1, rnd_vec);
>     tmp = __msa_pckev_h((v8i16) dst1, (v8i16) dst0);
>     tmp += offset_vec;
>-    tmp = CLIP_SH_0_255_MAX_SATU(tmp);
>+    CLIP_SH_0_255(tmp);
>     out = (v16u8) __msa_pckev_b((v16i8) tmp, (v16i8) tmp);
>     ST_W2(out, 0, 1, dst, dst_stride);
> }
>@@ -4316,7 +4316,7 @@ static void hevc_hv_uniwgt_4t_4x4_msa(uint8_t *src,
>     SRAR_W4_SW(dst0, dst1, dst2, dst3, rnd_vec);
>     PCKEV_H2_SH(dst1, dst0, dst3, dst2, tmp0, tmp1);
>     ADD2(tmp0, offset_vec, tmp1, offset_vec, tmp0, tmp1);
>-    CLIP_SH2_0_255_MAX_SATU(tmp0, tmp1);
>+    CLIP_SH2_0_255(tmp0, tmp1);
>     out = (v16u8) __msa_pckev_b((v16i8) tmp1, (v16i8) tmp0);
>     ST_W4(out, 0, 1, 2, 3, dst, dst_stride);
> }
>@@ -4417,7 +4417,7 @@ static void hevc_hv_uniwgt_4t_4multx8mult_msa(uint8_t *src,
>                     tmp2, tmp3);
>         ADD2(tmp0, offset_vec, tmp1, offset_vec, tmp0, tmp1);
>         ADD2(tmp2, offset_vec, tmp3, offset_vec, tmp2, tmp3);
>-        CLIP_SH4_0_255_MAX_SATU(tmp0, tmp1, tmp2, tmp3);
>+        CLIP_SH4_0_255(tmp0, tmp1, tmp2, tmp3);
>         PCKEV_B2_UB(tmp1, tmp0, tmp3, tmp2, out0, out1);
>         ST_W8(out0, out1, 0, 1, 2, 3, 0, 1, 2, 3, dst, dst_stride);
>         dst += (8 * dst_stride);
>@@ -4574,8 +4574,8 @@ static void hevc_hv_uniwgt_4t_6w_msa(uint8_t *src,
>     ADD2(tmp0, offset_vec, tmp1, offset_vec, tmp0, tmp1);
>     ADD2(tmp2, offset_vec, tmp3, offset_vec, tmp2, tmp3);
>     ADD2(tmp4, offset_vec, tmp5, offset_vec, tmp4, tmp5);
>-    CLIP_SH4_0_255_MAX_SATU(tmp0, tmp1, tmp2, tmp3);
>-    CLIP_SH2_0_255_MAX_SATU(tmp4, tmp5);
>+    CLIP_SH4_0_255(tmp0, tmp1, tmp2, tmp3);
>+    CLIP_SH2_0_255(tmp4, tmp5);
>     PCKEV_B3_UB(tmp1, tmp0, tmp3, tmp2, tmp5, tmp4, out0, out1, out2);
>     ST_W8(out0, out1, 0, 1, 2, 3, 0, 1, 2, 3, dst, dst_stride);
>     ST_H8(out2, 0, 1, 2, 3, 4, 5, 6, 7, dst + 4, dst_stride);
>@@ -4652,7 +4652,7 @@ static void hevc_hv_uniwgt_4t_8x2_msa(uint8_t *src,
>     SRAR_W4_SW(dst0_r, dst0_l, dst1_r, dst1_l, rnd_vec);
>     PCKEV_H2_SH(dst0_l, dst0_r, dst1_l, dst1_r, tmp0, tmp1);
>     ADD2(tmp0, offset_vec, tmp1, offset_vec, tmp0, tmp1);
>-    CLIP_SH2_0_255_MAX_SATU(tmp0, tmp1);
>+    CLIP_SH2_0_255(tmp0, tmp1);
>     out = (v16u8) __msa_pckev_b((v16i8) tmp1, (v16i8) tmp0);
>     ST_D2(out, 0, 1, dst, dst_stride);
> }
>@@ -4745,7 +4745,7 @@ static void hevc_hv_uniwgt_4t_8multx4_msa(uint8_t *src,
>                     dst3_r, tmp0, tmp1, tmp2, tmp3);
>         ADD2(tmp0, offset_vec, tmp1, offset_vec, tmp0, tmp1);
>         ADD2(tmp2, offset_vec, tmp3, offset_vec, tmp2, tmp3);
>-        CLIP_SH4_0_255_MAX_SATU(tmp0, tmp1, tmp2, tmp3);
>+        CLIP_SH4_0_255(tmp0, tmp1, tmp2, tmp3);
>         PCKEV_B2_UB(tmp1, tmp0, tmp3, tmp2, out0, out1);
>         ST_D4(out0, out1, 0, 1, 0, 1, dst, dst_stride);
>         dst += 8;
>@@ -4861,8 +4861,8 @@ static void hevc_hv_uniwgt_4t_8x6_msa(uint8_t *src,
>     ADD2(tmp0, offset_vec, tmp1, offset_vec, tmp0, tmp1);
>     ADD2(tmp2, offset_vec, tmp3, offset_vec, tmp2, tmp3);
>     ADD2(tmp4, offset_vec, tmp5, offset_vec, tmp4, tmp5);
>-    CLIP_SH4_0_255_MAX_SATU(tmp0, tmp1, tmp2, tmp3);
>-    CLIP_SH2_0_255_MAX_SATU(tmp4, tmp5);
>+    CLIP_SH4_0_255(tmp0, tmp1, tmp2, tmp3);
>+    CLIP_SH2_0_255(tmp4, tmp5);
>     PCKEV_B3_UB(tmp1, tmp0, tmp3, tmp2, tmp5, tmp4, out0, out1, out2);
>     ST_D4(out0, out1, 0, 1, 0, 1, dst, dst_stride);
>     ST_D2(out2, 0, 1, dst + 4 * dst_stride, dst_stride);
>@@ -4973,7 +4973,7 @@ static void hevc_hv_uniwgt_4t_8multx4mult_msa(uint8_t *src,
>                         dst3_r, tmp0, tmp1, tmp2, tmp3);
>             ADD2(tmp0, offset_vec, tmp1, offset_vec, tmp0, tmp1);
>             ADD2(tmp2, offset_vec, tmp3, offset_vec, tmp2, tmp3);
>-            CLIP_SH4_0_255_MAX_SATU(tmp0, tmp1, tmp2, tmp3);
>+            CLIP_SH4_0_255(tmp0, tmp1, tmp2, tmp3);
>             PCKEV_B2_UB(tmp1, tmp0, tmp3, tmp2, out0, out1);
>             ST_D4(out0, out1, 0, 1, 0, 1, dst_tmp, dst_stride);
>             dst_tmp += (4 * dst_stride);
>@@ -5120,7 +5120,7 @@ static void hevc_hv_uniwgt_4t_12w_msa(uint8_t *src,
>                     dst3_r, tmp0, tmp1, tmp2, tmp3);
>         ADD2(tmp0, offset_vec, tmp1, offset_vec, tmp0, tmp1);
>         ADD2(tmp2, offset_vec, tmp3, offset_vec, tmp2, tmp3);
>-        CLIP_SH4_0_255_MAX_SATU(tmp0, tmp1, tmp2, tmp3);
>+        CLIP_SH4_0_255(tmp0, tmp1, tmp2, tmp3);
>         PCKEV_B2_UB(tmp1, tmp0, tmp3, tmp2, out0, out1);
>         ST_D4(out0, out1, 0, 1, 0, 1, dst_tmp, dst_stride);
>         dst_tmp += (4 * dst_stride);
>@@ -5187,7 +5187,7 @@ static void hevc_hv_uniwgt_4t_12w_msa(uint8_t *src,
>                     tmp2, tmp3);
>         ADD2(tmp0, offset_vec, tmp1, offset_vec, tmp0, tmp1);
>         ADD2(tmp2, offset_vec, tmp3, offset_vec, tmp2, tmp3);
>-        CLIP_SH4_0_255_MAX_SATU(tmp0, tmp1, tmp2, tmp3);
>+        CLIP_SH4_0_255(tmp0, tmp1, tmp2, tmp3);
>         PCKEV_B2_UB(tmp1, tmp0, tmp3, tmp2, out0, out1);
>         ST_W8(out0, out1, 0, 1, 2, 3, 0, 1, 2, 3, dst, dst_stride);
>         dst += (8 * dst_stride);
>diff --git a/libavcodec/mips/hevcpred_msa.c b/libavcodec/mips/hevcpred_msa.c
>index 909b62d..f53276d 100644
>--- a/libavcodec/mips/hevcpred_msa.c
>+++ b/libavcodec/mips/hevcpred_msa.c
>@@ -83,7 +83,7 @@ static void hevc_intra_pred_vert_4x4_msa(const uint8_t *src_top,
>         vec2 -= vec0;
>         vec2 >>= 1;
>         vec2 += vec1;
>-        vec2 = CLIP_SH_0_255(vec2);
>+        CLIP_SH_0_255(vec2);
>
>         for (col = 0; col < 4; col++) {
>             dst[stride * col] = (uint8_t) vec2[col];
>@@ -122,7 +122,7 @@ static void hevc_intra_pred_vert_8x8_msa(const uint8_t *src_top,
>         vec2 -= vec0;
>         vec2 >>= 1;
>         vec2 += vec1;
>-        vec2 = CLIP_SH_0_255(vec2);
>+        CLIP_SH_0_255(vec2);
>
>         val0 = vec2[0];
>         val1 = vec2[1];
>@@ -214,7 +214,7 @@ static void hevc_intra_pred_horiz_4x4_msa(const uint8_t *src_top,
>         src0_r -= src_top_val;
>         src0_r >>= 1;
>         src0_r += src_left_val;
>-        src0_r = CLIP_SH_0_255(src0_r);
>+        CLIP_SH_0_255(src0_r);
>         src0 = __msa_pckev_b((v16i8) src0_r, (v16i8) src0_r);
>         val0 = __msa_copy_s_w((v4i32) src0, 0);
>         SW(val0, dst);
>@@ -254,7 +254,7 @@ static void hevc_intra_pred_horiz_8x8_msa(const uint8_t *src_top,
>         src0_r -= src_top_val;
>         src0_r >>= 1;
>         src0_r += src_left_val;
>-        src0_r = CLIP_SH_0_255(src0_r);
>+        CLIP_SH_0_255(src0_r);
>         src0 = __msa_pckev_b((v16i8) src0_r, (v16i8) src0_r);
>         val0 = __msa_copy_s_d((v2i64) src0, 0);
>         SD(val0, dst);
>diff --git a/libavcodec/mips/idctdsp_msa.c b/libavcodec/mips/idctdsp_msa.c
>index b29e420..b6b98dc 100644
>--- a/libavcodec/mips/idctdsp_msa.c
>+++ b/libavcodec/mips/idctdsp_msa.c
>@@ -28,8 +28,7 @@ static void put_pixels_clamped_msa(const int16_t *block, uint8_t *pixels,
>     v8i16 in0, in1, in2, in3, in4, in5, in6, in7;
>
>     LD_SH8(block, 8, in0, in1, in2, in3, in4, in5, in6, in7);
>-    CLIP_SH4_0_255(in0, in1, in2, in3);
>-    CLIP_SH4_0_255(in4, in5, in6, in7);
>+    CLIP_SH8_0_255(in0, in1, in2, in3, in4, in5, in6, in7);
>     PCKEV_B4_SH(in0, in0, in1, in1, in2, in2, in3, in3, in0, in1, in2, in3);
>     PCKEV_B4_SH(in4, in4, in5, in5, in6, in6, in7, in7, in4, in5, in6, in7);
>
>@@ -63,8 +62,7 @@ static void put_signed_pixels_clamped_msa(const int16_t *block, uint8_t *pixels,
>     in6 += 128;
>     in7 += 128;
>
>-    CLIP_SH4_0_255(in0, in1, in2, in3);
>-    CLIP_SH4_0_255(in4, in5, in6, in7);
>+    CLIP_SH8_0_255(in0, in1, in2, in3, in4, in5, in6, in7);
>     PCKEV_B4_SH(in0, in0, in1, in1, in2, in2, in3, in3, in0, in1, in2, in3);
>     PCKEV_B4_SH(in4, in4, in5, in5, in6, in6, in7, in7, in4, in5, in6, in7);
>
>@@ -109,8 +107,7 @@ static void add_pixels_clamped_msa(const int16_t *block, uint8_t *pixels,
>     in6 += (v8i16) pix6;
>     in7 += (v8i16) pix7;
>
>-    CLIP_SH4_0_255(in0, in1, in2, in3);
>-    CLIP_SH4_0_255(in4, in5, in6, in7);
>+    CLIP_SH8_0_255(in0, in1, in2, in3, in4, in5, in6, in7);
>     PCKEV_B4_SH(in0, in0, in1, in1, in2, in2, in3, in3, in0, in1, in2, in3);
>     PCKEV_B4_SH(in4, in4, in5, in5, in6, in6, in7, in7, in4, in5, in6, in7);
>
>diff --git a/libavcodec/mips/qpeldsp_msa.c b/libavcodec/mips/qpeldsp_msa.c
>index f9d1a95..c7675f1 100644
>--- a/libavcodec/mips/qpeldsp_msa.c
>+++ b/libavcodec/mips/qpeldsp_msa.c
>@@ -96,7 +96,7 @@
>     DPADD_UB2_UH(sum2_r, sum1_r, coef2, coef1, sum0_r, sum3_r);         \
>     res0_r = (v8i16) (sum0_r - sum3_r);                                 \
>     res0_r = __msa_srari_h(res0_r, 5);                                  \
>-    res0_r = CLIP_SH_0_255(res0_r);                                     \
>+    CLIP_SH_0_255(res0_r);                                              \
>     out = (v16u8) __msa_pckev_b((v16i8) res0_r, (v16i8) res0_r);        \
>                                                                         \
>     out;                                                                \
>@@ -118,7 +118,7 @@
>     res0_r = (v8i16) (sum0_r - sum3_r);                                   \
>     res0_r += 15;                                                         \
>     res0_r >>= 5;                                                         \
>-    res0_r = CLIP_SH_0_255(res0_r);                                       \
>+    CLIP_SH_0_255(res0_r);                                                \
>     out = (v16u8) __msa_pckev_b((v16i8) res0_r, (v16i8) res0_r);          \
>                                                                           \
>     out;                                                                  \
>diff --git a/libavcodec/mips/simple_idct_msa.c b/libavcodec/mips/simple_idct_msa.c
>index 8a72359..4bd3dd8 100644
>--- a/libavcodec/mips/simple_idct_msa.c
>+++ b/libavcodec/mips/simple_idct_msa.c
>@@ -336,35 +336,26 @@ static void simple_idct_put_msa(uint8_t *dst, int32_t dst_stride,
>     SRA_4V(temp2_r, temp2_l, temp3_r, temp3_l, 20);
>     SRA_4V(a3_r, a3_l, a2_r, a2_l, 20);
>     SRA_4V(a1_r, a1_l, a0_r, a0_l, 20);
>-    PCKEV_H4_SW(temp0_l, temp0_r, temp1_l, temp1_r, temp2_l, temp2_r,
>-                temp3_l, temp3_r, temp0_r, temp1_r, temp2_r, temp3_r);
>-    PCKEV_H4_SW(a0_l, a0_r, a1_l, a1_r, a2_l, a2_r, a3_l, a3_r,
>-                a0_r, a1_r, a2_r, a3_r);
>-    temp0_r = (v4i32) CLIP_SH_0_255(temp0_r);
>-    temp1_r = (v4i32) CLIP_SH_0_255(temp1_r);
>-    temp2_r = (v4i32) CLIP_SH_0_255(temp2_r);
>-    temp3_r = (v4i32) CLIP_SH_0_255(temp3_r);
>-    PCKEV_B4_SW(temp0_r, temp0_r, temp1_r, temp1_r,
>-                temp2_r, temp2_r, temp3_r, temp3_r,
>-                temp0_r, temp1_r, temp2_r, temp3_r);
>-    tmp0 = __msa_copy_u_d((v2i64) temp0_r, 1);
>-    tmp1 = __msa_copy_u_d((v2i64) temp1_r, 1);
>-    tmp2 = __msa_copy_u_d((v2i64) temp2_r, 1);
>-    tmp3 = __msa_copy_u_d((v2i64) temp3_r, 1);
>-    SD4(tmp0, tmp1, tmp2, tmp3, dst, dst_stride);
>-    dst += 4 * dst_stride;
>-    a0_r = (v4i32) CLIP_SH_0_255(a0_r);
>-    a1_r = (v4i32) CLIP_SH_0_255(a1_r);
>-    a2_r = (v4i32) CLIP_SH_0_255(a2_r);
>-    a3_r = (v4i32) CLIP_SH_0_255(a3_r);
>-    PCKEV_B4_SW(a0_r, a0_r, a1_r, a1_r,
>-                a2_r, a2_r, a3_r, a3_r, a0_r, a1_r, a2_r, a3_r);
>-    tmp3 = __msa_copy_u_d((v2i64) a0_r, 1);
>-    tmp2 = __msa_copy_u_d((v2i64) a1_r, 1);
>-    tmp1 = __msa_copy_u_d((v2i64) a2_r, 1);
>-    tmp0 = __msa_copy_u_d((v2i64) a3_r, 1);
>+    PCKEV_H4_SH(temp0_l, temp0_r, temp1_l, temp1_r, temp2_l, temp2_r,
>+                temp3_l, temp3_r, in0, in1, in2, in3);
>+    PCKEV_H4_SH(a0_l, a0_r, a1_l, a1_r, a2_l, a2_r, a3_l, a3_r,
>+                in4, in5, in6, in7);
>+    CLIP_SH4_0_255(in0, in1, in2, in3);
>+    PCKEV_B4_SH(in0, in0, in1, in1, in2, in2, in3, in3,
>+                in0, in1, in2, in3);
>+    tmp0 = __msa_copy_u_d((v2i64) in0, 1);
>+    tmp1 = __msa_copy_u_d((v2i64) in1, 1);
>+    tmp2 = __msa_copy_u_d((v2i64) in2, 1);
>+    tmp3 = __msa_copy_u_d((v2i64) in3, 1);
>     SD4(tmp0, tmp1, tmp2, tmp3, dst, dst_stride);
>-    dst += 4 * dst_stride;
>+    CLIP_SH4_0_255(in4, in5, in6, in7);
>+    PCKEV_B4_SH(in4, in4, in5, in5, in6, in6, in7, in7,
>+                in4, in5, in6, in7);
>+    tmp3 = __msa_copy_u_d((v2i64) in4, 1);
>+    tmp2 = __msa_copy_u_d((v2i64) in5, 1);
>+    tmp1 = __msa_copy_u_d((v2i64) in6, 1);
>+    tmp0 = __msa_copy_u_d((v2i64) in7, 1);
>+    SD4(tmp0, tmp1, tmp2, tmp3, dst + 4 * dst_stride, dst_stride);
> }
>
> static void simple_idct_add_msa(uint8_t *dst, int32_t dst_stride,
>@@ -516,21 +507,17 @@ static void simple_idct_add_msa(uint8_t *dst, int32_t dst_stride,
>                 temp3_l, temp3_r, temp0_r, temp1_r, temp2_r, temp3_r);
>     ILVR_B4_SW(zero, in0, zero, in1, zero, in2, zero, in3,
>                temp0_l, temp1_l, temp2_l, temp3_l);
>-    temp0_r = (v4i32) ((v8i16) (temp0_r) + (v8i16) (temp0_l));
>-    temp1_r = (v4i32) ((v8i16) (temp1_r) + (v8i16) (temp1_l));
>-    temp2_r = (v4i32) ((v8i16) (temp2_r) + (v8i16) (temp2_l));
>-    temp3_r = (v4i32) ((v8i16) (temp3_r) + (v8i16) (temp3_l));
>-    temp0_r = (v4i32) CLIP_SH_0_255(temp0_r);
>-    temp1_r = (v4i32) CLIP_SH_0_255(temp1_r);
>-    temp2_r = (v4i32) CLIP_SH_0_255(temp2_r);
>-    temp3_r = (v4i32) CLIP_SH_0_255(temp3_r);
>-    PCKEV_B4_SW(temp0_r, temp0_r, temp1_r, temp1_r,
>-                temp2_r, temp2_r, temp3_r, temp3_r,
>-                temp0_r, temp1_r, temp2_r, temp3_r);
>-    tmp0 = __msa_copy_u_d((v2i64) temp0_r, 1);
>-    tmp1 = __msa_copy_u_d((v2i64) temp1_r, 1);
>-    tmp2 = __msa_copy_u_d((v2i64) temp2_r, 1);
>-    tmp3 = __msa_copy_u_d((v2i64) temp3_r, 1);
>+    in0 = (v8i16) (temp0_r) + (v8i16) (temp0_l);
>+    in1 = (v8i16) (temp1_r) + (v8i16) (temp1_l);
>+    in2 = (v8i16) (temp2_r) + (v8i16) (temp2_l);
>+    in3 = (v8i16) (temp3_r) + (v8i16) (temp3_l);
>+    CLIP_SH4_0_255(in0, in1, in2, in3);
>+    PCKEV_B4_SH(in0, in0, in1, in1, in2, in2, in3, in3,
>+                in0, in1, in2, in3);
>+    tmp0 = __msa_copy_u_d((v2i64) in0, 1);
>+    tmp1 = __msa_copy_u_d((v2i64) in1, 1);
>+    tmp2 = __msa_copy_u_d((v2i64) in2, 1);
>+    tmp3 = __msa_copy_u_d((v2i64) in3, 1);
>     SD4(tmp0, tmp1, tmp2, tmp3, dst, dst_stride);
>
>     SRA_4V(a3_r, a3_l, a2_r, a2_l, 20);
>@@ -540,20 +527,17 @@ static void simple_idct_add_msa(uint8_t *dst, int32_t dst_stride,
>                 a0_r, a1_r, a2_r, a3_r);
>     ILVR_B4_SW(zero, in4, zero, in5, zero, in6, zero, in7,
>                a3_l, a2_l, a1_l, a0_l);
>-    a3_r = (v4i32) ((v8i16) (a3_r) + (v8i16) (a3_l));
>-    a2_r = (v4i32) ((v8i16) (a2_r) + (v8i16) (a2_l));
>-    a1_r = (v4i32) ((v8i16) (a1_r) + (v8i16) (a1_l));
>-    a0_r = (v4i32) ((v8i16) (a0_r) + (v8i16) (a0_l));
>-    a3_r = (v4i32) CLIP_SH_0_255(a3_r);
>-    a2_r = (v4i32) CLIP_SH_0_255(a2_r);
>-    a1_r = (v4i32) CLIP_SH_0_255(a1_r);
>-    a0_r = (v4i32) CLIP_SH_0_255(a0_r);
>-    PCKEV_B4_SW(a0_r, a0_r, a1_r, a1_r,
>-                a2_r, a2_r, a3_r, a3_r, a0_r, a1_r, a2_r, a3_r);
>-    tmp0 = __msa_copy_u_d((v2i64) a3_r, 1);
>-    tmp1 = __msa_copy_u_d((v2i64) a2_r, 1);
>-    tmp2 = __msa_copy_u_d((v2i64) a1_r, 1);
>-    tmp3 = __msa_copy_u_d((v2i64) a0_r, 1);
>+    in4 = (v8i16) (a3_r) + (v8i16) (a3_l);
>+    in5 = (v8i16) (a2_r) + (v8i16) (a2_l);
>+    in6 = (v8i16) (a1_r) + (v8i16) (a1_l);
>+    in7 = (v8i16) (a0_r) + (v8i16) (a0_l);
>+    CLIP_SH4_0_255(in4, in5, in6, in7);
>+    PCKEV_B4_SH(in4, in4, in5, in5, in6, in6, in7, in7,
>+                in4, in5, in6, in7);
>+    tmp0 = __msa_copy_u_d((v2i64) in4, 1);
>+    tmp1 = __msa_copy_u_d((v2i64) in5, 1);
>+    tmp2 = __msa_copy_u_d((v2i64) in6, 1);
>+    tmp3 = __msa_copy_u_d((v2i64) in7, 1);
>     SD4(tmp0, tmp1, tmp2, tmp3, dst + 4 * dst_stride, dst_stride);
> }
>
>diff --git a/libavcodec/mips/vp3dsp_idct_msa.c b/libavcodec/mips/vp3dsp_idct_msa.c
>index b2899ee..90c578f 100644
>--- a/libavcodec/mips/vp3dsp_idct_msa.c
>+++ b/libavcodec/mips/vp3dsp_idct_msa.c
>@@ -187,14 +187,7 @@ static void idct_msa(uint8_t *dst, int stride, int16_t *input, int type)
>         G += c5;
>         H += c6;
>     }
>-    A = CLIP_SW_0_255(A);
>-    B = CLIP_SW_0_255(B);
>-    C = CLIP_SW_0_255(C);
>-    D = CLIP_SW_0_255(D);
>-    E = CLIP_SW_0_255(E);
>-    F = CLIP_SW_0_255(F);
>-    G = CLIP_SW_0_255(G);
>-    H = CLIP_SW_0_255(H);
>+    CLIP_SW8_0_255(A, B, C, D, E, F, G, H);
>     sign_l = __msa_or_v((v16u8)r1_r, (v16u8)r2_r);
>     sign_l = __msa_or_v(sign_l, (v16u8)r3_r);
>     sign_l = __msa_or_v(sign_l, (v16u8)r0_l);
>@@ -205,7 +198,7 @@ static void idct_msa(uint8_t *dst, int stride, int16_t *input, int type)
>     Add = ((r0_r * cnst46341w) + (8 << 16)) >> 20;
>     if (type == 1) {
>         Bdd = Add + cnst128w;
>-        Bdd = CLIP_SW_0_255(Bdd);
>+        CLIP_SW_0_255(Bdd);
>         Ad = Bdd;
>         Bd = Bdd;
>         Cd = Bdd;
>@@ -223,14 +216,7 @@ static void idct_msa(uint8_t *dst, int stride, int16_t *input, int type)
>         Fd = Add + c5;
>         Gd = Add + c6;
>         Hd = Add + c7;
>-        Ad = CLIP_SW_0_255(Ad);
>-        Bd = CLIP_SW_0_255(Bd);
>-        Cd = CLIP_SW_0_255(Cd);
>-        Dd = CLIP_SW_0_255(Dd);
>-        Ed = CLIP_SW_0_255(Ed);
>-        Fd = CLIP_SW_0_255(Fd);
>-        Gd = CLIP_SW_0_255(Gd);
>-        Hd = CLIP_SW_0_255(Hd);
>+        CLIP_SW8_0_255(Ad, Bd, Cd, Dd, Ed, Fd, Gd, Hd);
>     }
>     Ad = (v4i32)__msa_and_v((v16u8)Ad, (v16u8)sign_t);
>     Bd = (v4i32)__msa_and_v((v16u8)Bd, (v16u8)sign_t);
>@@ -309,14 +295,7 @@ static void idct_msa(uint8_t *dst, int stride, int16_t *input, int type)
>         G += c5;
>         H += c6;
>     }
>-    A = CLIP_SW_0_255(A);
>-    B = CLIP_SW_0_255(B);
>-    C = CLIP_SW_0_255(C);
>-    D = CLIP_SW_0_255(D);
>-    E = CLIP_SW_0_255(E);
>-    F = CLIP_SW_0_255(F);
>-    G = CLIP_SW_0_255(G);
>-    H = CLIP_SW_0_255(H);
>+    CLIP_SW8_0_255(A, B, C, D, E, F, G, H);
>     sign_l = __msa_or_v((v16u8)r5_r, (v16u8)r6_r);
>     sign_l = __msa_or_v(sign_l, (v16u8)r7_r);
>     sign_l = __msa_or_v(sign_l, (v16u8)r4_l);
>@@ -327,7 +306,7 @@ static void idct_msa(uint8_t *dst, int stride, int16_t *input, int type)
>     Add = ((r4_r * cnst46341w) + (8 << 16)) >> 20;
>     if (type == 1) {
>         Bdd = Add + cnst128w;
>-        Bdd = CLIP_SW_0_255(Bdd);
>+        CLIP_SW_0_255(Bdd);
>         Ad = Bdd;
>         Bd = Bdd;
>         Cd = Bdd;
>@@ -345,14 +324,7 @@ static void idct_msa(uint8_t *dst, int stride, int16_t *input, int type)
>         Fd = Add + c5;
>         Gd = Add + c6;
>         Hd = Add + c7;
>-        Ad = CLIP_SW_0_255(Ad);
>-        Bd = CLIP_SW_0_255(Bd);
>-        Cd = CLIP_SW_0_255(Cd);
>-        Dd = CLIP_SW_0_255(Dd);
>-        Ed = CLIP_SW_0_255(Ed);
>-        Fd = CLIP_SW_0_255(Fd);
>-        Gd = CLIP_SW_0_255(Gd);
>-        Hd = CLIP_SW_0_255(Hd);
>+        CLIP_SW8_0_255(Ad, Bd, Cd, Dd, Ed, Fd, Gd, Hd);
>     }
>     Ad = (v4i32)__msa_and_v((v16u8)Ad, (v16u8)sign_t);
>     Bd = (v4i32)__msa_and_v((v16u8)Bd, (v16u8)sign_t);
>@@ -436,14 +408,7 @@ void ff_vp3_idct_dc_add_msa(uint8_t *dest, ptrdiff_t line_size, int16_t *block)
>     e5 += dc;
>     e6 += dc;
>     e7 += dc;
>-    e0 = CLIP_SW_0_255(e0);
>-    e1 = CLIP_SW_0_255(e1);
>-    e2 = CLIP_SW_0_255(e2);
>-    e3 = CLIP_SW_0_255(e3);
>-    e4 = CLIP_SW_0_255(e4);
>-    e5 = CLIP_SW_0_255(e5);
>-    e6 = CLIP_SW_0_255(e6);
>-    e7 = CLIP_SW_0_255(e7);
>+    CLIP_SW8_0_255(e0, e1, e2, e3, e4, e5, e6, e7);
>
>     /* Left part */
>     ILVL_H4_SW(zero, c0, zero, c1, zero, c2, zero, c3,
>@@ -458,14 +423,7 @@ void ff_vp3_idct_dc_add_msa(uint8_t *dest, ptrdiff_t line_size, int16_t *block)
>     r5 += dc;
>     r6 += dc;
>     r7 += dc;
>-    r0 = CLIP_SW_0_255(r0);
>-    r1 = CLIP_SW_0_255(r1);
>-    r2 = CLIP_SW_0_255(r2);
>-    r3 = CLIP_SW_0_255(r3);
>-    r4 = CLIP_SW_0_255(r4);
>-    r5 = CLIP_SW_0_255(r5);
>-    r6 = CLIP_SW_0_255(r6);
>-    r7 = CLIP_SW_0_255(r7);
>+    CLIP_SW8_0_255(r0, r1, r2, r3, r4, r5, r6, r7);
>     VSHF_B2_SB(e0, r0, e1, r1, mask, mask, d0, d1);
>     VSHF_B2_SB(e2, r2, e3, r3, mask, mask, d2, d3);
>     VSHF_B2_SB(e4, r4, e5, r5, mask, mask, d4, d5);
>@@ -516,10 +474,7 @@ void ff_vp3_v_loop_filter_msa(uint8_t *first_pixel, ptrdiff_t stride,
>     f1 += e1;
>     g0 -= e0;
>     g1 -= e1;
>-    f0 = CLIP_SW_0_255(f0);
>-    f1 = CLIP_SW_0_255(f1);
>-    g0 = CLIP_SW_0_255(g0);
>-    g1 = CLIP_SW_0_255(g1);
>+    CLIP_SW4_0_255(f0, f1, g0, g1);
>     VSHF_B2_SB(f0, f1, g0, g1, mask, mask, d1, d2);
>
>     /* Final move to first_pixel */
>@@ -563,10 +518,7 @@ void ff_vp3_h_loop_filter_msa(uint8_t *first_pixel, ptrdiff_t stride,
>     f1 += e1;
>     g0 -= e0;
>     g1 -= e1;
>-    f0 = CLIP_SW_0_255(f0);
>-    f1 = CLIP_SW_0_255(f1);
>-    g0 = CLIP_SW_0_255(g0);
>-    g1 = CLIP_SW_0_255(g1);
>+    CLIP_SW4_0_255(f0, f1, g0, g1);
>     VSHF_B2_SB(f0, g0, f1, g1, mask, mask, d1, d2);
>     /* Final move to first_pixel */
>     ST_H4(d1, 0, 1, 2, 3, first_pixel - 1, stride);
>diff --git a/libavcodec/mips/vp8_idct_msa.c b/libavcodec/mips/vp8_idct_msa.c
>index ae6fec0..ce37ca1 100644
>--- a/libavcodec/mips/vp8_idct_msa.c
>+++ b/libavcodec/mips/vp8_idct_msa.c
>@@ -71,10 +71,7 @@ void ff_vp8_idct_add_msa(uint8_t *dst, int16_t input[16], ptrdiff_t stride)
>     ILVR_H4_SW(zero, res0, zero, res1, zero, res2, zero, res3,
>                res0, res1, res2, res3);
>     ADD4(res0, vt0, res1, vt1, res2, vt2, res3, vt3, res0, res1, res2, res3);
>-    res0 = CLIP_SW_0_255(res0);
>-    res1 = CLIP_SW_0_255(res1);
>-    res2 = CLIP_SW_0_255(res2);
>-    res3 = CLIP_SW_0_255(res3);
>+    CLIP_SW4_0_255(res0, res1, res2, res3);
>     VSHF_B2_SB(res0, res1, res2, res3, mask, mask, dest0, dest1);
>     ST_W2(dest0, 0, 1, dst, stride);
>     ST_W2(dest1, 0, 1, dst + 2 * stride, stride);
>diff --git a/libavcodec/mips/vp9_idct_msa.c b/libavcodec/mips/vp9_idct_msa.c
>index 924b83d..53bfbb4 100644
>--- a/libavcodec/mips/vp9_idct_msa.c
>+++ b/libavcodec/mips/vp9_idct_msa.c
>@@ -764,13 +764,13 @@ static void vp9_iadst8x8_colcol_addblk_msa(int16_t *input, uint8_t *dst,
>
>     res0 = (v8i16) __msa_ilvr_b((v16i8) zero, (v16i8) dst0);
>     res0 += out0;
>-    res0 = CLIP_SH_0_255(res0);
>+    CLIP_SH_0_255(res0);
>     res0 = (v8i16) __msa_pckev_b((v16i8) res0, (v16i8) res0);
>     ST_D1(res0, 0, dst);
>
>     res7 = (v8i16) __msa_ilvr_b((v16i8) zero, (v16i8) dst7);
>     res7 += out7;
>-    res7 = CLIP_SH_0_255(res7);
>+    CLIP_SH_0_255(res7);
>     res7 = (v8i16) __msa_pckev_b((v16i8) res7, (v16i8) res7);
>     ST_D1(res7, 0, dst + 7 * dst_stride);
>
>@@ -1193,8 +1193,7 @@ static void vp9_idct16x16_1_add_msa(int16_t *input, uint8_t *dst,
>              res3);
>         ADD4(res4, vec, res5, vec, res6, vec, res7, vec, res4, res5, res6,
>              res7);
>-        CLIP_SH4_0_255(res0, res1, res2, res3);
>-        CLIP_SH4_0_255(res4, res5, res6, res7);
>+        CLIP_SH8_0_255(res0, res1, res2, res3, res4, res5, res6, res7);
>         PCKEV_B4_UB(res4, res0, res5, res1, res6, res2, res7, res3,
>                     tmp0, tmp1, tmp2, tmp3);
>         ST_UB4(tmp0, tmp1, tmp2, tmp3, dst, dst_stride);
>@@ -1982,8 +1981,7 @@ static void vp9_idct32x32_1_add_msa(int16_t *input, uint8_t *dst,
>              res3);
>         ADD4(res4, vec, res5, vec, res6, vec, res7, vec, res4, res5, res6,
>              res7);
>-        CLIP_SH4_0_255(res0, res1, res2, res3);
>-        CLIP_SH4_0_255(res4, res5, res6, res7);
>+        CLIP_SH8_0_255(res0, res1, res2, res3, res4, res5, res6, res7);
>         PCKEV_B4_UB(res4, res0, res5, res1, res6, res2, res7, res3,
>                     tmp0, tmp1, tmp2, tmp3);
>
>diff --git a/libavutil/mips/generic_macros_msa.h b/libavutil/mips/generic_macros_msa.h
>index 528f45e..0061dc4 100644
>--- a/libavutil/mips/generic_macros_msa.h
>+++ b/libavutil/mips/generic_macros_msa.h
>@@ -914,99 +914,78 @@
>
> /* Description : Clips all halfword elements of input vector between min & max
>                  out = ((in) < (min)) ? (min) : (((in) > (max)) ? (max) : (in))
>-   Arguments   : Inputs  - in       (input vector)
>-                         - min      (min threshold)
>-                         - max      (max threshold)
>-                 Outputs - out_m    (output vector with clipped elements)
>+   Arguments   : Inputs  - in    (input vector)
>+                         - min   (min threshold)
>+                         - max   (max threshold)
>+                 Outputs - in    (output vector with clipped elements)
>                  Return Type - signed halfword
> */
>-#define CLIP_SH(in, min, max)                           \
>-( {                                                     \
>-    v8i16 out_m;                                        \
>-                                                        \
>-    out_m = __msa_max_s_h((v8i16) min, (v8i16) in);     \
>-    out_m = __msa_min_s_h((v8i16) max, (v8i16) out_m);  \
>-    out_m;                                              \
>-} )
>+#define CLIP_SH(in, min, max)                     \
>+{                                                 \
>+    in = __msa_max_s_h((v8i16) min, (v8i16) in);  \
>+    in = __msa_min_s_h((v8i16) max, (v8i16) in);  \
>+}
>
> /* Description : Clips all signed halfword elements of input vector
>                  between 0 & 255
>-   Arguments   : Inputs  - in       (input vector)
>-                 Outputs - out_m    (output vector with clipped elements)
>-                 Return Type - signed halfword
>+   Arguments   : Inputs  - in    (input vector)
>+                 Outputs - in    (output vector with clipped elements)
>+                 Return Type - signed halfwords
> */
>-#define CLIP_SH_0_255(in)                                 \
>-( {                                                       \
>-    v8i16 max_m = __msa_ldi_h(255);                       \
>-    v8i16 out_m;                                          \
>-                                                          \
>-    out_m = __msa_maxi_s_h((v8i16) in, 0);                \
>-    out_m = __msa_min_s_h((v8i16) max_m, (v8i16) out_m);  \
>-    out_m;                                                \
>-} )
>+#define CLIP_SH_0_255(in)                       \
>+{                                               \
>+    in = __msa_maxi_s_h((v8i16) in, 0);         \
>+    in = (v8i16) __msa_sat_u_h((v8u16) in, 7);  \
>+}
>+
> #define CLIP_SH2_0_255(in0, in1)  \
> {                                 \
>-    in0 = CLIP_SH_0_255(in0);     \
>-    in1 = CLIP_SH_0_255(in1);     \
>+    CLIP_SH_0_255(in0);           \
>+    CLIP_SH_0_255(in1);           \
> }
>+
> #define CLIP_SH4_0_255(in0, in1, in2, in3)  \
> {                                           \
>     CLIP_SH2_0_255(in0, in1);               \
>     CLIP_SH2_0_255(in2, in3);               \
> }
>
>-#define CLIP_SH_0_255_MAX_SATU(in)                    \
>-( {                                                   \
>-    v8i16 out_m;                                      \
>-                                                      \
>-    out_m = __msa_maxi_s_h((v8i16) in, 0);            \
>-    out_m = (v8i16) __msa_sat_u_h((v8u16) out_m, 7);  \
>-    out_m;                                            \
>-} )
>-#define CLIP_SH2_0_255_MAX_SATU(in0, in1)  \
>-{                                          \
>-    in0 = CLIP_SH_0_255_MAX_SATU(in0);     \
>-    in1 = CLIP_SH_0_255_MAX_SATU(in1);     \
>-}
>-#define CLIP_SH4_0_255_MAX_SATU(in0, in1, in2, in3)  \
>-{                                                    \
>-    CLIP_SH2_0_255_MAX_SATU(in0, in1);               \
>-    CLIP_SH2_0_255_MAX_SATU(in2, in3);               \
>+#define CLIP_SH8_0_255(in0, in1, in2, in3,  \
>+                       in4, in5, in6, in7)  \
>+{                                           \
>+    CLIP_SH4_0_255(in0, in1, in2, in3);     \
>+    CLIP_SH4_0_255(in4, in5, in6, in7);     \
> }
>
> /* Description : Clips all signed word elements of input vector
>                  between 0 & 255
>-   Arguments   : Inputs  - in       (input vector)
>-                 Outputs - out_m    (output vector with clipped elements)
>+   Arguments   : Inputs  - in    (input vector)
>+                 Outputs - in    (output vector with clipped elements)
>                  Return Type - signed word
> */
>-#define CLIP_SW_0_255(in)                                 \
>-( {                                                       \
>-    v4i32 max_m = __msa_ldi_w(255);                       \
>-    v4i32 out_m;                                          \
>-                                                          \
>-    out_m = __msa_maxi_s_w((v4i32) in, 0);                \
>-    out_m = __msa_min_s_w((v4i32) max_m, (v4i32) out_m);  \
>-    out_m;                                                \
>-} )
>+#define CLIP_SW_0_255(in)                       \
>+{                                               \
>+    in = __msa_maxi_s_w((v4i32) in, 0);         \
>+    in = (v4i32) __msa_sat_u_w((v4u32) in, 7);  \
>+}
>
>-#define CLIP_SW_0_255_MAX_SATU(in)                    \
>-( {                                                   \
>-    v4i32 out_m;                                      \
>-                                                      \
>-    out_m = __msa_maxi_s_w((v4i32) in, 0);            \
>-    out_m = (v4i32) __msa_sat_u_w((v4u32) out_m, 7);  \
>-    out_m;                                            \
>-} )
>-#define CLIP_SW2_0_255_MAX_SATU(in0, in1)  \
>-{                                          \
>-    in0 = CLIP_SW_0_255_MAX_SATU(in0);     \
>-    in1 = CLIP_SW_0_255_MAX_SATU(in1);     \
>+#define CLIP_SW2_0_255(in0, in1)  \
>+{                                 \
>+    CLIP_SW_0_255(in0);           \
>+    CLIP_SW_0_255(in1);           \
> }
>-#define CLIP_SW4_0_255_MAX_SATU(in0, in1, in2, in3)  \
>-{                                                    \
>-    CLIP_SW2_0_255_MAX_SATU(in0, in1);               \
>-    CLIP_SW2_0_255_MAX_SATU(in2, in3);               \
>+
>+#define CLIP_SW4_0_255(in0, in1, in2, in3)  \
>+{                                           \
>+    CLIP_SW2_0_255(in0, in1);               \
>+    CLIP_SW2_0_255(in2, in3);               \
>+}
>+
>+#define CLIP_SW8_0_255(in0, in1, in2, in3,  \
>+                       in4, in5, in6, in7)  \
>+{                                           \
>+    CLIP_SW4_0_255(in0, in1, in2, in3);     \
>+    CLIP_SW4_0_255(in4, in5, in6, in7);     \
> }
>
> /* Description : Addition of 4 signed word elements
>--
>2.1.0
>
>
>_______________________________________________
>ffmpeg-devel mailing list
>ffmpeg-devel@ffmpeg.org
>https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
>
>To unsubscribe, visit link above, or email
>ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".
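The speedups quoted for change 2 line up with the shorter sequence in the generic_macros_msa.h hunk above: the 255 constant no longer has to be materialized with __msa_ldi_h before the upper clamp, since unsigned 8-bit saturation performs the min against 255 in a single instruction. Side by side, extracted from the hunk with the patch's casts kept:

    /* Old CLIP_SH_0_255 body: three MSA operations per clip,
     * one of them a constant load. */
    v8i16 max_m = __msa_ldi_h(255);
    out_m = __msa_maxi_s_h((v8i16) in, 0);
    out_m = __msa_min_s_h((v8i16) max_m, (v8i16) out_m);

    /* New CLIP_SH_0_255 body: two MSA operations, both immediate forms. */
    in = __msa_maxi_s_h((v8i16) in, 0);
    in = (v8i16) __msa_sat_u_h((v8u16) in, 7);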
Michael Niedermayer Aug. 7, 2019, 11:05 p.m. UTC | #2
On Wed, Aug 07, 2019 at 05:52:00PM +0800, gxw wrote:
> Changing details as following:
> 1. Remove the local variable 'out_m' in 'CLIP_SH' and store the result in
>    source vector.
> 2. Refine the implementation of macro 'CLIP_SH_0_255' and 'CLIP_SW_0_255'.
>    Performance of VP8 decoding has speed up about 1.1%(from 7.03x to 7.11x).
>    Performance of H264 decoding has speed up about 0.5%(from 4.35x to 4.37x).
>    Performance of Theora decoding has speed up about 0.7%(from 5.79x to 5.83x).
> 3. Remove redundant macro 'CLIP_SH/Wn_0_255_MAX_SATU' and use 'CLIP_SH/Wn_0_255'
>    instead, because there are no difference in the effect of this two macros.

Can these 3 things be split into 3 patches?
It would be clearer if each change were in its own patch

thanks

[...]
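The shape of change 1 is easiest to see as a before/after of CLIP_SH itself: a GNU statement expression that returned the clipped value becomes a plain block that updates its argument in place, which is why every call site in the diff goes from an assignment to a bare statement. A sketch taken from the patch; the _OLD/_NEW suffixes are added here purely for illustration, the patch keeps the single name CLIP_SH:

    /* Before: statement expression; the caller assigns the result,
     * e.g. delta = CLIP_SH_OLD(delta, -tc, tc); */
    #define CLIP_SH_OLD(in, min, max)                       \
    ( {                                                     \
        v8i16 out_m;                                        \
        out_m = __msa_max_s_h((v8i16) min, (v8i16) in);     \
        out_m = __msa_min_s_h((v8i16) max, (v8i16) out_m);  \
        out_m;                                              \
    } )

    /* After: plain block that clips 'in' in place; the temporary and
     * the call-site assignment both disappear,
     * e.g. CLIP_SH_NEW(delta, -tc, tc); */
    #define CLIP_SH_NEW(in, min, max)                 \
    {                                                 \
        in = __msa_max_s_h((v8i16) min, (v8i16) in);  \
        in = __msa_min_s_h((v8i16) max, (v8i16) in);  \
    }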
guxiwei Aug. 8, 2019, 1:49 a.m. UTC | #3
> From: "Michael Niedermayer" <michael@niedermayer.cc>
> Sent: 2019-08-08 07:05:13 (Thursday)
> To: "FFmpeg development discussions and patches" <ffmpeg-devel@ffmpeg.org>
> Cc:
> Subject: Re: [FFmpeg-devel] [PATCH v4] avutil/mips: refine msa macros CLIP_*.
>
> On Wed, Aug 07, 2019 at 05:52:00PM +0800, gxw wrote:
> > Changing details as following:
> > 1. Remove the local variable 'out_m' in 'CLIP_SH' and store the result in
> >    source vector.
> > 2. Refine the implementation of macro 'CLIP_SH_0_255' and 'CLIP_SW_0_255'.
> >    Performance of VP8 decoding has speed up about 1.1%(from 7.03x to 7.11x).
> >    Performance of H264 decoding has speed up about 0.5%(from 4.35x to 4.37x).
> >    Performance of Theora decoding has speed up about 0.7%(from 5.79x to 5.83x).
> > 3. Remove redundant macro 'CLIP_SH/Wn_0_255_MAX_SATU' and use 'CLIP_SH/Wn_0_255'
> >    instead, because there are no difference in the effect of this two macros.
>
> Can these 3 things be split into 3 patches?
> It would be clearer if each change were in its own patch
>
> thanks
>
> [...]
> --
> Michael     GnuPG fingerprint: 9FF2128B147EF6730BADF133611EC787040B0FAB
>
> Whats the most studid thing your enemy could do ? Blow himself up
> Whats the most studid thing you could do ? Give up your rights and
> freedom because your enemy blew himself up.

It can be split into 3 patches, but there are some benefits to keeping it as one: these macros belong to the same class and are closely related, so it is more intuitive to put them in a single patch.

thanks

This email and its attachments contain confidential information from Loongson
Technology Corporation Limited, which is intended only for the person or entity
whose address is listed above. Any use of the information contained herein in
any way (including, but not limited to, total or partial disclosure,
reproduction or dissemination) by persons other than the intended recipient(s)
is prohibited. If you receive this email in error, please notify the sender by
phone or email immediately and delete it.
Michael Niedermayer Aug. 8, 2019, 4:06 p.m. UTC | #4
On Thu, Aug 08, 2019 at 09:49:35AM +0800, 顾希伟 wrote:
> > From: "Michael Niedermayer" <michael@niedermayer.cc>
> > Sent: 2019-08-08 07:05:13 (Thursday)
> > To: "FFmpeg development discussions and patches" <ffmpeg-devel@ffmpeg.org>
> > Cc:
> > Subject: Re: [FFmpeg-devel] [PATCH v4] avutil/mips: refine msa macros CLIP_*.
> > 
> > On Wed, Aug 07, 2019 at 05:52:00PM +0800, gxw wrote:
> > > Changing details as following:
> > > 1. Remove the local variable 'out_m' in 'CLIP_SH' and store the result in
> > >    source vector.
> > > 2. Refine the implementation of macro 'CLIP_SH_0_255' and 'CLIP_SW_0_255'.
> > >    Performance of VP8 decoding has speed up about 1.1%(from 7.03x to 7.11x).
> > >    Performance of H264 decoding has speed up about 0.5%(from 4.35x to 4.37x).
> > >    Performance of Theora decoding has speed up about 0.7%(from 5.79x to 5.83x).
> > > 3. Remove redundant macro 'CLIP_SH/Wn_0_255_MAX_SATU' and use 'CLIP_SH/Wn_0_255'
> > >    instead, because there are no difference in the effect of this two macros.
> > 
> > Can these 3 things be split into 3 patches?
> > It would be clearer if each change were in its own patch
> > 
> > thanks
> > 
> > [...]
> > -- 
> > Michael     GnuPG fingerprint: 9FF2128B147EF6730BADF133611EC787040B0FAB
> > 
> > Whats the most studid thing your enemy could do ? Blow himself up
> > Whats the most studid thing you could do ? Give up your rights and
> > freedom because your enemy blew himself up.
> 
> It can be split into 3 patches, but there are some benefits to keeping it as one: these macros belong to the same class and are closely related, so it is more intuitive to put them in a single patch.

hmm
does anyone else have an opinion about this?

if not I'll apply it

Thanks


[...]
Shiyou Yin Aug. 12, 2019, 5:23 a.m. UTC | #5
>-----Original Message-----
>From: ffmpeg-devel-bounces@ffmpeg.org [mailto:ffmpeg-devel-bounces@ffmpeg.org] On Behalf Of
>Michael Niedermayer
>Sent: Friday, August 9, 2019 12:07 AM
>To: FFmpeg development discussions and patches
>Subject: Re: [FFmpeg-devel] [PATCH v4] avutil/mips: refine msa macros CLIP_*.
>
>On Thu, Aug 08, 2019 at 09:49:35AM +0800, 顾希伟 wrote:
>> > From: "Michael Niedermayer" <michael@niedermayer.cc>
>> > Sent: 2019-08-08 07:05:13 (Thursday)
>> > To: "FFmpeg development discussions and patches"
>> > <ffmpeg-devel@ffmpeg.org>
>> > Cc:
>> > Subject: Re: [FFmpeg-devel] [PATCH v4] avutil/mips: refine msa macros CLIP_*.
>> >
>> > On Wed, Aug 07, 2019 at 05:52:00PM +0800, gxw wrote:
>> > > Changing details as following:
>> > > 1. Remove the local variable 'out_m' in 'CLIP_SH' and store the result in
>> > >    source vector.
>> > > 2. Refine the implementation of macro 'CLIP_SH_0_255' and 'CLIP_SW_0_255'.
>> > >    Performance of VP8 decoding has speed up about 1.1%(from 7.03x to 7.11x).
>> > >    Performance of H264 decoding has speed up about 0.5%(from 4.35x to 4.37x).
>> > >    Performance of Theora decoding has speed up about 0.7%(from 5.79x to 5.83x).
>> > > 3. Remove redundant macro 'CLIP_SH/Wn_0_255_MAX_SATU' and use 'CLIP_SH/Wn_0_255'
>> > >    instead, because there are no difference in the effect of this two macros.
>> >
>> > Can these 3 things be split into 3 patches?
>> > It would be clearer if each change were in its own patch
>> >
>> > thanks
>> >
>> > [...]
>>
>> It can be split into 3 patches, but there are some benefits to keeping it as one: these macros belong to the
>> same class and are closely related, so it is more intuitive to put them in a single patch.
>
>hmm
>does anyone else have an opinion about this?
>
>if not I'll apply it
>

In fact, changes 2 and 3 are closely related: one new macro replaces both 'CLIP_SH/Wn_0_255' and
'CLIP_SH/Wn_0_255_MAX_SATU', so it's better to put 2 and 3 in one patch.
Change 1 belongs to the same macro family as changes 2 and 3. Putting it in the same patch is mainly
because there are too many macros pending refactoring; it's a balance between patch complexity and
patch count. So it's acceptable to me.
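The "no difference in the effect" claim behind change 3 can be sanity-checked with a scalar model of a single vector lane: once __msa_maxi_s_h has clamped negatives to 0, __msa_sat_u_h(x, 7) (unsigned saturation to 2^8 - 1 = 255) behaves exactly like the old min against 255. A standalone C sketch, exhaustive over all int16_t lane values; the helper names are invented here for illustration:

    #include <assert.h>
    #include <stdint.h>

    /* Lane model of the old CLIP_SH_0_255: clamp to 0, then min with 255. */
    static int16_t clip_min_max(int16_t x)
    {
        int16_t t = x > 0 ? x : 0;
        return t < 255 ? t : 255;
    }

    /* Lane model of the new CLIP_SH_0_255 (the old *_MAX_SATU): clamp to 0,
     * then saturate the now non-negative value to 8 bits, as
     * __msa_sat_u_h(x, 7) does per halfword. */
    static int16_t clip_max_satu(int16_t x)
    {
        uint16_t t = x > 0 ? (uint16_t) x : 0;
        return t > 255 ? 255 : (int16_t) t;
    }

    int main(void)
    {
        for (int32_t x = INT16_MIN; x <= INT16_MAX; x++)
            assert(clip_min_max((int16_t) x) == clip_max_satu((int16_t) x));
        return 0;
    }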
Michael Niedermayer Aug. 13, 2019, 9:30 a.m. UTC | #6
On Mon, Aug 12, 2019 at 01:23:09PM +0800, Shiyou Yin wrote:
> >-----Original Message-----
> >From: ffmpeg-devel-bounces@ffmpeg.org [mailto:ffmpeg-devel-bounces@ffmpeg.org] On Behalf Of
> >Michael Niedermayer
> >Sent: Friday, August 9, 2019 12:07 AM
> >To: FFmpeg development discussions and patches
> >Subject: Re: [FFmpeg-devel] [PATCH v4] avutil/mips: refine msa macros CLIP_*.
> >
> >On Thu, Aug 08, 2019 at 09:49:35AM +0800, 顾希伟 wrote:
> >> > From: "Michael Niedermayer" <michael@niedermayer.cc>
> >> > Sent: 2019-08-08 07:05:13 (Thursday)
> >> > To: "FFmpeg development discussions and patches"
> >> > <ffmpeg-devel@ffmpeg.org>
> >> > Cc:
> >> > Subject: Re: [FFmpeg-devel] [PATCH v4] avutil/mips: refine msa macros CLIP_*.
> >> >
> >> > On Wed, Aug 07, 2019 at 05:52:00PM +0800, gxw wrote:
> >> > > Changing details as following:
> >> > > 1. Remove the local variable 'out_m' in 'CLIP_SH' and store the result in
> >> > >    source vector.
> >> > > 2. Refine the implementation of macro 'CLIP_SH_0_255' and 'CLIP_SW_0_255'.
> >> > >    Performance of VP8 decoding has speed up about 1.1%(from 7.03x to 7.11x).
> >> > >    Performance of H264 decoding has speed up about 0.5%(from 4.35x to 4.37x).
> >> > >    Performance of Theora decoding has speed up about 0.7%(from 5.79x to 5.83x).
> >> > > 3. Remove redundant macro 'CLIP_SH/Wn_0_255_MAX_SATU' and use 'CLIP_SH/Wn_0_255'
> >> > >    instead, because there are no difference in the effect of this two macros.
> >> >
> >> > Can these 3 things be split into 3 patches?
> >> > It would be clearer if each change were in its own patch
> >> >
> >> > thanks
> >> >
> >> > [...]
> >>
> >> It can be split into 3 patches, but there are some benefits to keeping it as one: these macros belong to the
> >> same class and are closely related, so it is more intuitive to put them in a single patch.
> >
> >hmm
> >does anyone else have an opinion about this?
> >
> >if not I'll apply it
> >
> 
> In fact, changes 2 and 3 are closely related: one new macro replaces both 'CLIP_SH/Wn_0_255' and
> 'CLIP_SH/Wn_0_255_MAX_SATU', so it's better to put 2 and 3 in one patch.
> Change 1 belongs to the same macro family as changes 2 and 3. Putting it in the same patch is mainly
> because there are too many macros pending refactoring; it's a balance between patch complexity and
> patch count. So it's acceptable to me.

ok, will apply

thx

[...]
diff mbox

Patch

diff --git a/libavcodec/mips/h264dsp_msa.c b/libavcodec/mips/h264dsp_msa.c
index c4ba8c4..dd05982 100644
--- a/libavcodec/mips/h264dsp_msa.c
+++ b/libavcodec/mips/h264dsp_msa.c
@@ -413,8 +413,7 @@  static void avc_biwgt_8x8_msa(uint8_t *src, uint8_t *dst, int32_t stride,
     tmp7 = __msa_dpadd_s_h(offset, wgt, vec7);
     SRA_4V(tmp0, tmp1, tmp2, tmp3, denom);
     SRA_4V(tmp4, tmp5, tmp6, tmp7, denom);
-    CLIP_SH4_0_255(tmp0, tmp1, tmp2, tmp3);
-    CLIP_SH4_0_255(tmp4, tmp5, tmp6, tmp7);
+    CLIP_SH8_0_255(tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7);
     PCKEV_B2_UB(tmp1, tmp0, tmp3, tmp2, dst0, dst1);
     PCKEV_B2_UB(tmp5, tmp4, tmp7, tmp6, dst2, dst3);
     ST_D8(dst0, dst1, dst2, dst3, 0, 1, 0, 1, 0, 1, 0, 1, dst, stride);
@@ -475,8 +474,7 @@  static void avc_biwgt_8x16_msa(uint8_t *src, uint8_t *dst, int32_t stride,
 
         SRA_4V(temp0, temp1, temp2, temp3, denom);
         SRA_4V(temp4, temp5, temp6, temp7, denom);
-        CLIP_SH4_0_255(temp0, temp1, temp2, temp3);
-        CLIP_SH4_0_255(temp4, temp5, temp6, temp7);
+        CLIP_SH8_0_255(temp0, temp1, temp2, temp3, temp4, temp5, temp6, temp7);
         PCKEV_B4_UB(temp1, temp0, temp3, temp2, temp5, temp4, temp7, temp6,
                     dst0, dst1, dst2, dst3);
         ST_D8(dst0, dst1, dst2, dst3, 0, 1, 0, 1, 0, 1, 0, 1, dst, stride);
@@ -531,7 +529,7 @@  static void avc_biwgt_8x16_msa(uint8_t *src, uint8_t *dst, int32_t stride,
     temp = p1_or_q1_org_in << 1;                              \
     clip3 = clip3 - temp;                                     \
     clip3 = __msa_ave_s_h(p2_or_q2_org_in, clip3);            \
-    clip3 = CLIP_SH(clip3, negate_tc_in, tc_in);              \
+    CLIP_SH(clip3, negate_tc_in, tc_in);                      \
     p1_or_q1_out = p1_or_q1_org_in + clip3;                   \
 }
 
@@ -549,7 +547,7 @@  static void avc_biwgt_8x16_msa(uint8_t *src, uint8_t *dst, int32_t stride,
     delta = q0_sub_p0 + p1_sub_q1;                              \
     delta >>= 3;                                                \
                                                                 \
-    delta = CLIP_SH(delta, negate_threshold_in, threshold_in);  \
+    CLIP_SH(delta, negate_threshold_in, threshold_in);          \
                                                                 \
     p0_or_q0_out = p0_or_q0_org_in + delta;                     \
     q0_or_p0_out = q0_or_p0_org_in - delta;                     \
@@ -598,7 +596,7 @@  static void avc_biwgt_8x16_msa(uint8_t *src, uint8_t *dst, int32_t stride,
     delta = q0_sub_p0 + p1_sub_q1;                                       \
     delta = __msa_srari_h(delta, 3);                                     \
                                                                          \
-    delta = CLIP_SH(delta, -tc, tc);                                     \
+    CLIP_SH(delta, -tc, tc);                                             \
                                                                          \
     ILVR_B2_SH(zeros, src1, zeros, src2, res0_r, res1_r);                \
                                                                          \
@@ -662,7 +660,7 @@  static void avc_biwgt_8x16_msa(uint8_t *src, uint8_t *dst, int32_t stride,
     q0_sub_p0 <<= 2;                                                       \
     delta = q0_sub_p0 + p1_sub_q1;                                         \
     delta = __msa_srari_h(delta, 3);                                       \
-    delta = CLIP_SH(delta, -tc, tc);                                       \
+    CLIP_SH(delta, -tc, tc);                                               \
                                                                            \
     ILVR_B2_SH(zeros, src1, zeros, src2, res0_r, res1_r);                  \
                                                                            \
@@ -1742,7 +1740,7 @@  static void avc_h_loop_filter_luma_mbaff_msa(uint8_t *in, int32_t stride,
     v8i16 tc, tc_orig_r, tc_plus1;
     v16u8 is_tc_orig1, is_tc_orig2, tc_orig = { 0 };
     v8i16 p0_ilvr_q0, p0_add_q0, q0_sub_p0, p1_sub_q1;
-    v8u16 src2_r, src3_r;
+    v8i16 src2_r, src3_r;
     v8i16 p2_r, p1_r, q2_r, q1_r;
     v16u8 p2, q2, p0, q0;
     v4i32 dst0, dst1;
@@ -1840,8 +1838,8 @@  static void avc_h_loop_filter_luma_mbaff_msa(uint8_t *in, int32_t stride,
     tc_orig_r = (v8i16) __msa_ilvr_b(zeros, (v16i8) tc_orig);
     tc = tc_orig_r;
 
-    p2_r = CLIP_SH(p2_r, -tc_orig_r, tc_orig_r);
-    q2_r = CLIP_SH(q2_r, -tc_orig_r, tc_orig_r);
+    CLIP_SH(p2_r, -tc_orig_r, tc_orig_r);
+    CLIP_SH(q2_r, -tc_orig_r, tc_orig_r);
 
     p2_r += p1_r;
     q2_r += q1_r;
@@ -1873,14 +1871,13 @@  static void avc_h_loop_filter_luma_mbaff_msa(uint8_t *in, int32_t stride,
                                               (v16i8) is_less_than_beta2);
     tc = (v8i16) __msa_bmnz_v((v16u8) tc, (v16u8) tc_plus1, is_less_than_beta2);
 
-    q0_sub_p0 = CLIP_SH(q0_sub_p0, -tc, tc);
+    CLIP_SH(q0_sub_p0, -tc, tc);
 
-    ILVR_B2_UH(zeros, src2, zeros, src3, src2_r, src3_r);
+    ILVR_B2_SH(zeros, src2, zeros, src3, src2_r, src3_r);
     src2_r += q0_sub_p0;
     src3_r -= q0_sub_p0;
 
-    src2_r = (v8u16) CLIP_SH_0_255(src2_r);
-    src3_r = (v8u16) CLIP_SH_0_255(src3_r);
+    CLIP_SH2_0_255(src2_r, src3_r);
 
     PCKEV_B2_UB(src2_r, src2_r, src3_r, src3_r, p0, q0);
 
@@ -2510,10 +2507,8 @@  void ff_biweight_h264_pixels16_8_msa(uint8_t *dst, uint8_t *src,
     SRA_4V(tmp4, tmp5, tmp6, tmp7, denom);
     SRA_4V(tmp8, tmp9, tmp10, tmp11, denom);
     SRA_4V(tmp12, tmp13, tmp14, tmp15, denom);
-    CLIP_SH4_0_255(tmp0, tmp1, tmp2, tmp3);
-    CLIP_SH4_0_255(tmp4, tmp5, tmp6, tmp7);
-    CLIP_SH4_0_255(tmp8, tmp9, tmp10, tmp11);
-    CLIP_SH4_0_255(tmp12, tmp13, tmp14, tmp15);
+    CLIP_SH8_0_255(tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7);
+    CLIP_SH8_0_255(tmp8, tmp9, tmp10, tmp11, tmp12, tmp13, tmp14, tmp15);
     PCKEV_B4_UB(tmp1, tmp0, tmp3, tmp2, tmp5, tmp4, tmp7, tmp6, dst0, dst1,
                 dst2, dst3);
     PCKEV_B4_UB(tmp9, tmp8, tmp11, tmp10, tmp13, tmp12, tmp15, tmp14, dst4,
@@ -2554,10 +2549,8 @@  void ff_biweight_h264_pixels16_8_msa(uint8_t *dst, uint8_t *src,
         SRA_4V(tmp4, tmp5, tmp6, tmp7, denom);
         SRA_4V(tmp8, tmp9, tmp10, tmp11, denom);
         SRA_4V(tmp12, tmp13, tmp14, tmp15, denom);
-        CLIP_SH4_0_255(tmp0, tmp1, tmp2, tmp3);
-        CLIP_SH4_0_255(tmp4, tmp5, tmp6, tmp7);
-        CLIP_SH4_0_255(tmp8, tmp9, tmp10, tmp11);
-        CLIP_SH4_0_255(tmp12, tmp13, tmp14, tmp15);
+        CLIP_SH8_0_255(tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7);
+        CLIP_SH8_0_255(tmp8, tmp9, tmp10, tmp11, tmp12, tmp13, tmp14, tmp15);
         PCKEV_B4_UB(tmp1, tmp0, tmp3, tmp2, tmp5, tmp4, tmp7, tmp6, dst0, dst1,
                     dst2, dst3);
         PCKEV_B4_UB(tmp9, tmp8, tmp11, tmp10, tmp13, tmp12, tmp15, tmp14, dst4,
diff --git a/libavcodec/mips/h264idct_msa.c b/libavcodec/mips/h264idct_msa.c
index 7851bfd..fbf7795 100644
--- a/libavcodec/mips/h264idct_msa.c
+++ b/libavcodec/mips/h264idct_msa.c
@@ -233,8 +233,7 @@  static void avc_idct8_addblk_msa(uint8_t *dst, int16_t *src, int32_t dst_stride)
          res0, res1, res2, res3);
     ADD4(res4, tmp4, res5, tmp5, res6, tmp6, res7, tmp7,
          res4, res5, res6, res7);
-    CLIP_SH4_0_255(res0, res1, res2, res3);
-    CLIP_SH4_0_255(res4, res5, res6, res7);
+    CLIP_SH8_0_255(res0, res1, res2, res3, res4, res5, res6, res7);
     PCKEV_B4_SB(res1, res0, res3, res2, res5, res4, res7, res6,
                 dst0, dst1, dst2, dst3);
     ST_D8(dst0, dst1, dst2, dst3, 0, 1, 0, 1, 0, 1, 0, 1, dst, dst_stride)
@@ -263,8 +262,8 @@  static void avc_idct8_dc_addblk_msa(uint8_t *dst, int16_t *src,
          dst0_r, dst1_r, dst2_r, dst3_r);
     ADD4(dst4_r, dc, dst5_r, dc, dst6_r, dc, dst7_r, dc,
          dst4_r, dst5_r, dst6_r, dst7_r);
-    CLIP_SH4_0_255(dst0_r, dst1_r, dst2_r, dst3_r);
-    CLIP_SH4_0_255(dst4_r, dst5_r, dst6_r, dst7_r);
+    CLIP_SH8_0_255(dst0_r, dst1_r, dst2_r, dst3_r,
+                   dst4_r, dst5_r, dst6_r, dst7_r);
     PCKEV_B4_SB(dst1_r, dst0_r, dst3_r, dst2_r, dst5_r, dst4_r, dst7_r, dst6_r,
                 dst0, dst1, dst2, dst3);
     ST_D8(dst0, dst1, dst2, dst3, 0, 1, 0, 1, 0, 1, 0, 1, dst, dst_stride)
diff --git a/libavcodec/mips/hevc_idct_msa.c b/libavcodec/mips/hevc_idct_msa.c
index b14aec9..5ab6acd 100644
--- a/libavcodec/mips/hevc_idct_msa.c
+++ b/libavcodec/mips/hevc_idct_msa.c
@@ -803,8 +803,9 @@  static void hevc_addblk_16x16_msa(int16_t *coeffs, uint8_t *dst, int32_t stride)
         LD_SH4((coeffs + 8), 16, in1, in3, in5, in7);
         coeffs += 64;
 
-        CLIP_SH4_0_255(dst_r0, dst_l0, dst_r1, dst_l1);
-        CLIP_SH4_0_255(dst_r2, dst_l2, dst_r3, dst_l3);
+        CLIP_SH8_0_255(dst_r0, dst_l0, dst_r1, dst_l1,
+                       dst_r2, dst_l2, dst_r3, dst_l3);
+
         PCKEV_B4_UB(dst_l0, dst_r0, dst_l1, dst_r1, dst_l2, dst_r2, dst_l3,
                     dst_r3, dst0, dst1, dst2, dst3);
         ST_UB4(dst0, dst1, dst2, dst3, dst, stride);
@@ -825,8 +826,8 @@  static void hevc_addblk_16x16_msa(int16_t *coeffs, uint8_t *dst, int32_t stride)
     dst_r3 += in6;
     dst_l3 += in7;
 
-    CLIP_SH4_0_255(dst_r0, dst_l0, dst_r1, dst_l1);
-    CLIP_SH4_0_255(dst_r2, dst_l2, dst_r3, dst_l3);
+    CLIP_SH8_0_255(dst_r0, dst_l0, dst_r1, dst_l1,
+                   dst_r2, dst_l2, dst_r3, dst_l3);
     PCKEV_B4_UB(dst_l0, dst_r0, dst_l1, dst_r1, dst_l2, dst_r2, dst_l3,
                 dst_r3, dst0, dst1, dst2, dst3);
     ST_UB4(dst0, dst1, dst2, dst3, dst, stride);
@@ -873,8 +874,8 @@  static void hevc_addblk_32x32_msa(int16_t *coeffs, uint8_t *dst, int32_t stride)
         LD_SH4((coeffs + 8), 16, in1, in3, in5, in7);
         coeffs += 64;
 
-        CLIP_SH4_0_255(dst_r0, dst_l0, dst_r1, dst_l1);
-        CLIP_SH4_0_255(dst_r2, dst_l2, dst_r3, dst_l3);
+        CLIP_SH8_0_255(dst_r0, dst_l0, dst_r1, dst_l1,
+                       dst_r2, dst_l2, dst_r3, dst_l3);
         PCKEV_B4_UB(dst_l0, dst_r0, dst_l1, dst_r1, dst_l2, dst_r2, dst_l3,
                     dst_r3, dst0, dst1, dst2, dst3);
         ST_UB2(dst0, dst1, dst, 16);
@@ -905,8 +906,8 @@  static void hevc_addblk_32x32_msa(int16_t *coeffs, uint8_t *dst, int32_t stride)
     LD_SH4(coeffs, 16, in0, in2, in4, in6);
     LD_SH4((coeffs + 8), 16, in1, in3, in5, in7);
 
-    CLIP_SH4_0_255(dst_r0, dst_l0, dst_r1, dst_l1);
-    CLIP_SH4_0_255(dst_r2, dst_l2, dst_r3, dst_l3);
+    CLIP_SH8_0_255(dst_r0, dst_l0, dst_r1, dst_l1,
+                   dst_r2, dst_l2, dst_r3, dst_l3);
     PCKEV_B4_UB(dst_l0, dst_r0, dst_l1, dst_r1, dst_l2, dst_r2, dst_l3,
                 dst_r3, dst0, dst1, dst2, dst3);
     ST_UB2(dst0, dst1, dst, 16);
@@ -928,8 +929,8 @@  static void hevc_addblk_32x32_msa(int16_t *coeffs, uint8_t *dst, int32_t stride)
     dst_r3 += in6;
     dst_l3 += in7;
 
-    CLIP_SH4_0_255(dst_r0, dst_l0, dst_r1, dst_l1);
-    CLIP_SH4_0_255(dst_r2, dst_l2, dst_r3, dst_l3);
+    CLIP_SH8_0_255(dst_r0, dst_l0, dst_r1, dst_l1,
+                   dst_r2, dst_l2, dst_r3, dst_l3);
     PCKEV_B4_UB(dst_l0, dst_r0, dst_l1, dst_r1, dst_l2, dst_r2, dst_l3,
                 dst_r3, dst0, dst1, dst2, dst3);
     ST_UB2(dst0, dst1, dst, 16);
diff --git a/libavcodec/mips/hevc_lpf_sao_msa.c b/libavcodec/mips/hevc_lpf_sao_msa.c
index 791ddb3..26663dd 100644
--- a/libavcodec/mips/hevc_lpf_sao_msa.c
+++ b/libavcodec/mips/hevc_lpf_sao_msa.c
@@ -140,19 +140,19 @@  static void hevc_loopfilter_luma_hor_msa(uint8_t *src, int32_t stride,
             temp1 = ((p3_src + p2_src) << 1) + p2_src + temp0;
             temp1 = (v8u16) __msa_srari_h((v8i16) temp1, 3);
             temp2 = (v8i16) (temp1 - p2_src);
-            temp2 = CLIP_SH(temp2, tc_neg, tc_pos);
+            CLIP_SH(temp2, tc_neg, tc_pos);
             dst0 = (v16u8) (temp2 + (v8i16) p2_src);
 
             temp1 = temp0 + p2_src;
             temp1 = (v8u16) __msa_srari_h((v8i16) temp1, 2);
             temp2 = (v8i16) (temp1 - p1_src);
-            temp2 = CLIP_SH(temp2, tc_neg, tc_pos);
+            CLIP_SH(temp2, tc_neg, tc_pos);
             dst1 = (v16u8) (temp2 + (v8i16) p1_src);
 
             temp1 = (temp0 << 1) + p2_src + q1_src;
             temp1 = (v8u16) __msa_srari_h((v8i16) temp1, 3);
             temp2 = (v8i16) (temp1 - p0_src);
-            temp2 = CLIP_SH(temp2, tc_neg, tc_pos);
+            CLIP_SH(temp2, tc_neg, tc_pos);
             dst2 = (v16u8) (temp2 + (v8i16) p0_src);
 
             dst0 = __msa_bmz_v(dst0, (v16u8) p2_src, (v16u8) p_is_pcm_vec);
@@ -165,19 +165,19 @@  static void hevc_loopfilter_luma_hor_msa(uint8_t *src, int32_t stride,
             temp1 = ((q3_src + q2_src) << 1) + q2_src + temp0;
             temp1 = (v8u16) __msa_srari_h((v8i16) temp1, 3);
             temp2 = (v8i16) (temp1 - q2_src);
-            temp2 = CLIP_SH(temp2, tc_neg, tc_pos);
+            CLIP_SH(temp2, tc_neg, tc_pos);
             dst5 = (v16u8) (temp2 + (v8i16) q2_src);
 
             temp1 = temp0 + q2_src;
             temp1 = (v8u16) __msa_srari_h((v8i16) temp1, 2);
             temp2 = (v8i16) (temp1 - q1_src);
-            temp2 = CLIP_SH(temp2, tc_neg, tc_pos);
+            CLIP_SH(temp2, tc_neg, tc_pos);
             dst4 = (v16u8) (temp2 + (v8i16) q1_src);
 
             temp1 = (temp0 << 1) + p1_src + q2_src;
             temp1 = (v8u16) __msa_srari_h((v8i16) temp1, 3);
             temp2 = (v8i16) (temp1 - q0_src);
-            temp2 = CLIP_SH(temp2, tc_neg, tc_pos);
+            CLIP_SH(temp2, tc_neg, tc_pos);
             dst3 = (v16u8) (temp2 + (v8i16) q0_src);
 
             dst3 = __msa_bmz_v(dst3, (v16u8) q0_src, (v16u8) q_is_pcm_vec);
@@ -218,15 +218,15 @@  static void hevc_loopfilter_luma_hor_msa(uint8_t *src, int32_t stride,
             abs_delta0 = __msa_add_a_h(delta0, (v8i16) zero);
             abs_delta0 = (v8u16) abs_delta0 < temp1;
 
-            delta0 = CLIP_SH(delta0, tc_neg, tc_pos);
+            CLIP_SH(delta0, tc_neg, tc_pos);
 
-            temp0 = (v8u16) (delta0 + p0_src);
-            temp0 = (v8u16) CLIP_SH_0_255(temp0);
-            temp0 = (v8u16) __msa_bmz_v((v16u8) temp0, (v16u8) p0_src,
+            temp2 = (v8i16) (delta0 + p0_src);
+            CLIP_SH_0_255(temp2);
+            temp0 = (v8u16) __msa_bmz_v((v16u8) temp2, (v16u8) p0_src,
                                         (v16u8) p_is_pcm_vec);
 
             temp2 = (v8i16) (q0_src - delta0);
-            temp2 = CLIP_SH_0_255(temp2);
+            CLIP_SH_0_255(temp2);
             temp2 = (v8i16) __msa_bmz_v((v16u8) temp2, (v16u8) q0_src,
                                         (v16u8) q_is_pcm_vec);
 
@@ -252,9 +252,9 @@  static void hevc_loopfilter_luma_hor_msa(uint8_t *src, int32_t stride,
             delta1 -= (v8i16) p1_src;
             delta1 += delta0;
             delta1 >>= 1;
-            delta1 = CLIP_SH(delta1, tc_neg, tc_pos);
+            CLIP_SH(delta1, tc_neg, tc_pos);
             delta1 = (v8i16) p1_src + (v8i16) delta1;
-            delta1 = CLIP_SH_0_255(delta1);
+            CLIP_SH_0_255(delta1);
             delta1 = (v8i16) __msa_bmnz_v((v16u8) delta1, (v16u8) p1_src,
                                           (v16u8) p_is_pcm_vec);
 
@@ -262,9 +262,9 @@  static void hevc_loopfilter_luma_hor_msa(uint8_t *src, int32_t stride,
             delta2 = delta2 - (v8i16) q1_src;
             delta2 = delta2 - delta0;
             delta2 = delta2 >> 1;
-            delta2 = CLIP_SH(delta2, tc_neg, tc_pos);
+            CLIP_SH(delta2, tc_neg, tc_pos);
             delta2 = (v8i16) q1_src + (v8i16) delta2;
-            delta2 = CLIP_SH_0_255(delta2);
+            CLIP_SH_0_255(delta2);
             delta2 = (v8i16) __msa_bmnz_v((v16u8) delta2, (v16u8) q1_src,
                                           (v16u8) q_is_pcm_vec);
 
@@ -298,19 +298,19 @@  static void hevc_loopfilter_luma_hor_msa(uint8_t *src, int32_t stride,
             temp1 = ((p3_src + p2_src) << 1) + p2_src + temp0;
             temp1 = (v8u16) __msa_srari_h((v8i16) temp1, 3);
             temp2 = (v8i16) (temp1 - p2_src);
-            temp2 = CLIP_SH(temp2, tc_neg, tc_pos);
+            CLIP_SH(temp2, tc_neg, tc_pos);
             dst0 = (v16u8) (temp2 + (v8i16) p2_src);
 
             temp1 = temp0 + p2_src;
             temp1 = (v8u16) __msa_srari_h((v8i16) temp1, 2);
             temp2 = (v8i16) (temp1 - p1_src);
-            temp2 = CLIP_SH(temp2, tc_neg, tc_pos);
+            CLIP_SH(temp2, tc_neg, tc_pos);
             dst1 = (v16u8) (temp2 + (v8i16) p1_src);
 
             temp1 = (temp0 << 1) + p2_src + q1_src;
             temp1 = (v8u16) __msa_srari_h((v8i16) temp1, 3);
             temp2 = (v8i16) (temp1 - p0_src);
-            temp2 = CLIP_SH(temp2, tc_neg, tc_pos);
+            CLIP_SH(temp2, tc_neg, tc_pos);
             dst2 = (v16u8) (temp2 + (v8i16) p0_src);
 
             dst0 = __msa_bmz_v(dst0, (v16u8) p2_src, (v16u8) p_is_pcm_vec);
@@ -323,19 +323,19 @@  static void hevc_loopfilter_luma_hor_msa(uint8_t *src, int32_t stride,
             temp1 = ((q3_src + q2_src) << 1) + q2_src + temp0;
             temp1 = (v8u16) __msa_srari_h((v8i16) temp1, 3);
             temp2 = (v8i16) (temp1 - q2_src);
-            temp2 = CLIP_SH(temp2, tc_neg, tc_pos);
+            CLIP_SH(temp2, tc_neg, tc_pos);
             dst5 = (v16u8) (temp2 + (v8i16) q2_src);
 
             temp1 = temp0 + q2_src;
             temp1 = (v8u16) __msa_srari_h((v8i16) temp1, 2);
             temp2 = (v8i16) (temp1 - q1_src);
-            temp2 = CLIP_SH(temp2, tc_neg, tc_pos);
+            CLIP_SH(temp2, tc_neg, tc_pos);
             dst4 = (v16u8) (temp2 + (v8i16) q1_src);
 
             temp1 = (temp0 << 1) + p1_src + q2_src;
             temp1 = (v8u16) __msa_srari_h((v8i16) temp1, 3);
             temp2 = (v8i16) (temp1 - q0_src);
-            temp2 = CLIP_SH(temp2, tc_neg, tc_pos);
+            CLIP_SH(temp2, tc_neg, tc_pos);
             dst3 = (v16u8) (temp2 + (v8i16) q0_src);
 
             dst3 = __msa_bmz_v(dst3, (v16u8) q0_src, (v16u8) q_is_pcm_vec);
@@ -362,15 +362,15 @@  static void hevc_loopfilter_luma_hor_msa(uint8_t *src, int32_t stride,
             abs_delta0 = __msa_add_a_h(delta0, (v8i16) zero);
             abs_delta0 = (v8u16) abs_delta0 < temp1;
 
-            delta0 = CLIP_SH(delta0, tc_neg, tc_pos);
+            CLIP_SH(delta0, tc_neg, tc_pos);
 
-            temp0 = (v8u16) (delta0 + p0_src);
-            temp0 = (v8u16) CLIP_SH_0_255(temp0);
-            temp0 = (v8u16) __msa_bmz_v((v16u8) temp0, (v16u8) p0_src,
+            temp2 = (v8i16) (delta0 + p0_src);
+            CLIP_SH_0_255(temp2);
+            temp0 = (v8u16) __msa_bmz_v((v16u8) temp2, (v16u8) p0_src,
                                         (v16u8) p_is_pcm_vec);
 
             temp2 = (v8i16) (q0_src - delta0);
-            temp2 = CLIP_SH_0_255(temp2);
+            CLIP_SH_0_255(temp2);
             temp2 = (v8i16) __msa_bmz_v((v16u8) temp2, (v16u8) q0_src,
                                         (v16u8) q_is_pcm_vec);
 
@@ -394,9 +394,9 @@  static void hevc_loopfilter_luma_hor_msa(uint8_t *src, int32_t stride,
             delta1 -= (v8i16) p1_src;
             delta1 += delta0;
             delta1 >>= 1;
-            delta1 = CLIP_SH(delta1, tc_neg, tc_pos);
+            CLIP_SH(delta1, tc_neg, tc_pos);
             delta1 = (v8i16) p1_src + (v8i16) delta1;
-            delta1 = CLIP_SH_0_255(delta1);
+            CLIP_SH_0_255(delta1);
             delta1 = (v8i16) __msa_bmnz_v((v16u8) delta1, (v16u8) p1_src,
                                           (v16u8) p_is_pcm_vec);
 
@@ -404,9 +404,9 @@  static void hevc_loopfilter_luma_hor_msa(uint8_t *src, int32_t stride,
             delta2 = delta2 - (v8i16) q1_src;
             delta2 = delta2 - delta0;
             delta2 = delta2 >> 1;
-            delta2 = CLIP_SH(delta2, tc_neg, tc_pos);
+            CLIP_SH(delta2, tc_neg, tc_pos);
             delta2 = (v8i16) q1_src + (v8i16) delta2;
-            delta2 = CLIP_SH_0_255(delta2);
+            CLIP_SH_0_255(delta2);
             delta2 = (v8i16) __msa_bmnz_v((v16u8) delta2, (v16u8) q1_src,
                                           (v16u8) q_is_pcm_vec);
 
@@ -561,19 +561,19 @@  static void hevc_loopfilter_luma_ver_msa(uint8_t *src, int32_t stride,
             temp1 = ((p3_src + p2_src) << 1) + p2_src + temp0;
             temp1 = (v8u16) __msa_srari_h((v8i16) temp1, 3);
             temp2 = (v8i16) (temp1 - p2_src);
-            temp2 = CLIP_SH(temp2, tc_neg, tc_pos);
+            CLIP_SH(temp2, tc_neg, tc_pos);
             dst0 = (v16u8) (temp2 + (v8i16) p2_src);
 
             temp1 = temp0 + p2_src;
             temp1 = (v8u16) __msa_srari_h((v8i16) temp1, 2);
             temp2 = (v8i16) (temp1 - p1_src);
-            temp2 = CLIP_SH(temp2, tc_neg, tc_pos);
+            CLIP_SH(temp2, tc_neg, tc_pos);
             dst1 = (v16u8) (temp2 + (v8i16) p1_src);
 
             temp1 = (temp0 << 1) + p2_src + q1_src;
             temp1 = (v8u16) __msa_srari_h((v8i16) temp1, 3);
             temp2 = (v8i16) (temp1 - p0_src);
-            temp2 = CLIP_SH(temp2, tc_neg, tc_pos);
+            CLIP_SH(temp2, tc_neg, tc_pos);
             dst2 = (v16u8) (temp2 + (v8i16) p0_src);
 
             dst0 = __msa_bmz_v(dst0, (v16u8) p2_src, (v16u8) p_is_pcm_vec);
@@ -585,19 +585,19 @@  static void hevc_loopfilter_luma_ver_msa(uint8_t *src, int32_t stride,
             temp1 = ((q3_src + q2_src) << 1) + q2_src + temp0;
             temp1 = (v8u16) __msa_srari_h((v8i16) temp1, 3);
             temp2 = (v8i16) (temp1 - q2_src);
-            temp2 = CLIP_SH(temp2, tc_neg, tc_pos);
+            CLIP_SH(temp2, tc_neg, tc_pos);
             dst5 = (v16u8) (temp2 + (v8i16) q2_src);
 
             temp1 = temp0 + q2_src;
             temp1 = (v8u16) __msa_srari_h((v8i16) temp1, 2);
             temp2 = (v8i16) (temp1 - q1_src);
-            temp2 = CLIP_SH(temp2, tc_neg, tc_pos);
+            CLIP_SH(temp2, tc_neg, tc_pos);
             dst4 = (v16u8) (temp2 + (v8i16) q1_src);
 
             temp1 = (temp0 << 1) + p1_src + q2_src;
             temp1 = (v8u16) __msa_srari_h((v8i16) temp1, 3);
             temp2 = (v8i16) (temp1 - q0_src);
-            temp2 = CLIP_SH(temp2, tc_neg, tc_pos);
+            CLIP_SH(temp2, tc_neg, tc_pos);
             dst3 = (v16u8) (temp2 + (v8i16) q0_src);
 
             dst3 = __msa_bmz_v(dst3, (v16u8) q0_src, (v16u8) q_is_pcm_vec);
@@ -620,14 +620,14 @@  static void hevc_loopfilter_luma_ver_msa(uint8_t *src, int32_t stride,
             abs_delta0 = __msa_add_a_h(delta0, (v8i16) zero);
             abs_delta0 = (v8u16) abs_delta0 < temp1;
 
-            delta0 = CLIP_SH(delta0, tc_neg, tc_pos);
-            temp0 = (v8u16) (delta0 + p0_src);
-            temp0 = (v8u16) CLIP_SH_0_255(temp0);
-            temp0 = (v8u16) __msa_bmz_v((v16u8) temp0, (v16u8) p0_src,
+            CLIP_SH(delta0, tc_neg, tc_pos);
+            temp2 = (v8i16) (delta0 + p0_src);
+            CLIP_SH_0_255(temp2);
+            temp0 = (v8u16) __msa_bmz_v((v16u8) temp2, (v16u8) p0_src,
                                         (v16u8) p_is_pcm_vec);
 
             temp2 = (v8i16) (q0_src - delta0);
-            temp2 = CLIP_SH_0_255(temp2);
+            CLIP_SH_0_255(temp2);
             temp2 = (v8i16) __msa_bmz_v((v16u8) temp2, (v16u8) q0_src,
                                         (v16u8) q_is_pcm_vec);
 
@@ -649,9 +649,9 @@  static void hevc_loopfilter_luma_ver_msa(uint8_t *src, int32_t stride,
             delta1 -= (v8i16) p1_src;
             delta1 += delta0;
             delta1 >>= 1;
-            delta1 = CLIP_SH(delta1, tc_neg, tc_pos);
+            CLIP_SH(delta1, tc_neg, tc_pos);
             delta1 = (v8i16) p1_src + (v8i16) delta1;
-            delta1 = CLIP_SH_0_255(delta1);
+            CLIP_SH_0_255(delta1);
             delta1 = (v8i16) __msa_bmnz_v((v16u8) delta1, (v16u8) p1_src,
                                           (v16u8) p_is_pcm_vec);
 
@@ -659,9 +659,9 @@  static void hevc_loopfilter_luma_ver_msa(uint8_t *src, int32_t stride,
             delta2 = delta2 - (v8i16) q1_src;
             delta2 = delta2 - delta0;
             delta2 = delta2 >> 1;
-            delta2 = CLIP_SH(delta2, tc_neg, tc_pos);
+            CLIP_SH(delta2, tc_neg, tc_pos);
             delta2 = (v8i16) q1_src + (v8i16) delta2;
-            delta2 = CLIP_SH_0_255(delta2);
+            CLIP_SH_0_255(delta2);
             delta2 = (v8i16) __msa_bmnz_v((v16u8) delta2, (v16u8) q1_src,
                                           (v16u8) q_is_pcm_vec);
 
@@ -726,19 +726,19 @@  static void hevc_loopfilter_luma_ver_msa(uint8_t *src, int32_t stride,
             temp1 = ((p3_src + p2_src) << 1) + p2_src + temp0;
             temp1 = (v8u16) __msa_srari_h((v8i16) temp1, 3);
             temp2 = (v8i16) (temp1 - p2_src);
-            temp2 = CLIP_SH(temp2, tc_neg, tc_pos);
+            CLIP_SH(temp2, tc_neg, tc_pos);
             dst0 = (v16u8) (temp2 + (v8i16) p2_src);
 
             temp1 = temp0 + p2_src;
             temp1 = (v8u16) __msa_srari_h((v8i16) temp1, 2);
             temp2 = (v8i16) (temp1 - p1_src);
-            temp2 = CLIP_SH(temp2, tc_neg, tc_pos);
+            CLIP_SH(temp2, tc_neg, tc_pos);
             dst1 = (v16u8) (temp2 + (v8i16) p1_src);
 
             temp1 = (temp0 << 1) + p2_src + q1_src;
             temp1 = (v8u16) __msa_srari_h((v8i16) temp1, 3);
             temp2 = (v8i16) (temp1 - p0_src);
-            temp2 = CLIP_SH(temp2, tc_neg, tc_pos);
+            CLIP_SH(temp2, tc_neg, tc_pos);
             dst2 = (v16u8) (temp2 + (v8i16) p0_src);
 
             dst0 = __msa_bmz_v(dst0, (v16u8) p2_src, (v16u8) p_is_pcm_vec);
@@ -750,19 +750,19 @@  static void hevc_loopfilter_luma_ver_msa(uint8_t *src, int32_t stride,
             temp1 = ((q3_src + q2_src) << 1) + q2_src + temp0;
             temp1 = (v8u16) __msa_srari_h((v8i16) temp1, 3);
             temp2 = (v8i16) (temp1 - q2_src);
-            temp2 = CLIP_SH(temp2, tc_neg, tc_pos);
+            CLIP_SH(temp2, tc_neg, tc_pos);
             dst5 = (v16u8) (temp2 + (v8i16) q2_src);
 
             temp1 = temp0 + q2_src;
             temp1 = (v8u16) __msa_srari_h((v8i16) temp1, 2);
             temp2 = (v8i16) (temp1 - q1_src);
-            temp2 = CLIP_SH(temp2, tc_neg, tc_pos);
+            CLIP_SH(temp2, tc_neg, tc_pos);
             dst4 = (v16u8) (temp2 + (v8i16) q1_src);
 
             temp1 = (temp0 << 1) + p1_src + q2_src;
             temp1 = (v8u16) __msa_srari_h((v8i16) temp1, 3);
             temp2 = (v8i16) (temp1 - q0_src);
-            temp2 = CLIP_SH(temp2, tc_neg, tc_pos);
+            CLIP_SH(temp2, tc_neg, tc_pos);
             dst3 = (v16u8) (temp2 + (v8i16) q0_src);
 
             dst3 = __msa_bmz_v(dst3, (v16u8) q0_src, (v16u8) q_is_pcm_vec);
@@ -785,15 +785,15 @@  static void hevc_loopfilter_luma_ver_msa(uint8_t *src, int32_t stride,
             abs_delta0 = __msa_add_a_h(delta0, (v8i16) zero);
             abs_delta0 = (v8u16) abs_delta0 < temp1;
 
-            delta0 = CLIP_SH(delta0, tc_neg, tc_pos);
+            CLIP_SH(delta0, tc_neg, tc_pos);
 
-            temp0 = (v8u16) (delta0 + p0_src);
-            temp0 = (v8u16) CLIP_SH_0_255(temp0);
-            temp0 = (v8u16) __msa_bmz_v((v16u8) temp0, (v16u8) p0_src,
+            temp2 = (v8i16) (delta0 + p0_src);
+            CLIP_SH_0_255(temp2);
+            temp0 = (v8u16) __msa_bmz_v((v16u8) temp2, (v16u8) p0_src,
                                         (v16u8) p_is_pcm_vec);
 
             temp2 = (v8i16) (q0_src - delta0);
-            temp2 = CLIP_SH_0_255(temp2);
+            CLIP_SH_0_255(temp2);
             temp2 = (v8i16) __msa_bmz_v((v16u8) temp2, (v16u8) q0_src,
                                         (v16u8) q_is_pcm_vec);
 
@@ -815,9 +815,9 @@  static void hevc_loopfilter_luma_ver_msa(uint8_t *src, int32_t stride,
             delta1 -= (v8i16) p1_src;
             delta1 += delta0;
             delta1 >>= 1;
-            delta1 = CLIP_SH(delta1, tc_neg, tc_pos);
+            CLIP_SH(delta1, tc_neg, tc_pos);
             delta1 = (v8i16) p1_src + (v8i16) delta1;
-            delta1 = CLIP_SH_0_255(delta1);
+            CLIP_SH_0_255(delta1);
             delta1 = (v8i16) __msa_bmnz_v((v16u8) delta1, (v16u8) p1_src,
                                           (v16u8) p_is_pcm_vec);
 
@@ -825,9 +825,9 @@  static void hevc_loopfilter_luma_ver_msa(uint8_t *src, int32_t stride,
             delta2 = delta2 - (v8i16) q1_src;
             delta2 = delta2 - delta0;
             delta2 = delta2 >> 1;
-            delta2 = CLIP_SH(delta2, tc_neg, tc_pos);
+            CLIP_SH(delta2, tc_neg, tc_pos);
             delta2 = (v8i16) q1_src + (v8i16) delta2;
-            delta2 = CLIP_SH_0_255(delta2);
+            CLIP_SH_0_255(delta2);
             delta2 = (v8i16) __msa_bmnz_v((v16u8) delta2, (v16u8) q1_src,
                                           (v16u8) q_is_pcm_vec);
             delta1 = (v8i16) __msa_bmz_v((v16u8) delta1, (v16u8) p1_src,
@@ -955,15 +955,15 @@  static void hevc_loopfilter_chroma_hor_msa(uint8_t *src, int32_t stride,
         temp0 <<= 2;
         temp0 += temp1;
         delta = __msa_srari_h((v8i16) temp0, 3);
-        delta = CLIP_SH(delta, tc_neg, tc_pos);
+        CLIP_SH(delta, tc_neg, tc_pos);
 
         temp0 = (v8i16) ((v8i16) p0 + delta);
-        temp0 = CLIP_SH_0_255(temp0);
+        CLIP_SH_0_255(temp0);
         temp0 = (v8i16) __msa_bmz_v((v16u8) temp0, (v16u8) p0,
                                     (v16u8) p_is_pcm_vec);
 
         temp1 = (v8i16) ((v8i16) q0 - delta);
-        temp1 = CLIP_SH_0_255(temp1);
+        CLIP_SH_0_255(temp1);
         temp1 = (v8i16) __msa_bmz_v((v16u8) temp1, (v16u8) q0,
                                     (v16u8) q_is_pcm_vec);
 
@@ -1014,15 +1014,15 @@  static void hevc_loopfilter_chroma_ver_msa(uint8_t *src, int32_t stride,
         temp0 <<= 2;
         temp0 += temp1;
         delta = __msa_srari_h((v8i16) temp0, 3);
-        delta = CLIP_SH(delta, tc_neg, tc_pos);
+        CLIP_SH(delta, tc_neg, tc_pos);
 
         temp0 = (v8i16) ((v8i16) p0 + delta);
-        temp0 = CLIP_SH_0_255(temp0);
+        CLIP_SH_0_255(temp0);
         temp0 = (v8i16) __msa_bmz_v((v16u8) temp0, (v16u8) p0,
                                     (v16u8) p_is_pcm_vec);
 
         temp1 = (v8i16) ((v8i16) q0 - delta);
-        temp1 = CLIP_SH_0_255(temp1);
+        CLIP_SH_0_255(temp1);
         temp1 = (v8i16) __msa_bmz_v((v16u8) temp1, (v16u8) q0,
                                     (v16u8) q_is_pcm_vec);
 
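Every loop-filter hunk above makes the same substitution: 'CLIP_SH' no
longer yields a clipped copy that has to be assigned back, it clamps
its first argument in place, so 'temp2 = CLIP_SH(temp2, tc_neg, tc_pos)'
becomes a bare statement. A sketch of the in-place form, assuming the
standard MSA min/max intrinsics:

    #define CLIP_SH(in, min, max)                     \
    {                                                 \
        in = __msa_max_s_h((v8i16) min, (v8i16) in);  \
        in = __msa_min_s_h((v8i16) max, (v8i16) in);  \
    }

This drops the GNU statement expression and its local temporary while
emitting the same max_s.h/min_s.h pair.
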
diff --git a/libavcodec/mips/hevc_mc_bi_msa.c b/libavcodec/mips/hevc_mc_bi_msa.c
index 34613c8..c6c8d27 100644
--- a/libavcodec/mips/hevc_mc_bi_msa.c
+++ b/libavcodec/mips/hevc_mc_bi_msa.c
@@ -48,7 +48,7 @@  static const uint8_t ff_hevc_mask_arr[16 * 2] __attribute__((aligned(0x40))) = {
 {                                                                  \
     ADDS_SH2_SH(vec0, in0, vec1, in1, out0, out1);                 \
     SRARI_H2_SH(out0, out1, rnd_val);                              \
-    CLIP_SH2_0_255_MAX_SATU(out0, out1);                           \
+    CLIP_SH2_0_255(out0, out1);                                    \
 }
 
 #define HEVC_BI_RND_CLIP4_MAX_SATU(in0, in1, in2, in3, vec0, vec1, vec2,    \
@@ -83,7 +83,7 @@  static void hevc_bi_copy_4w_msa(uint8_t *src0_ptr,
         dst0 <<= 6;
         dst0 += in0;
         dst0 = __msa_srari_h(dst0, 7);
-        dst0 = CLIP_SH_0_255_MAX_SATU(dst0);
+        CLIP_SH_0_255(dst0);
 
         dst0 = (v8i16) __msa_pckev_b((v16i8) dst0, (v16i8) dst0);
         ST_W2(dst0, 0, 1, dst, dst_stride);
@@ -739,7 +739,7 @@  static void hevc_hz_bi_8t_12w_msa(uint8_t *src0_ptr,
         HEVC_BI_RND_CLIP2(in0, in1, dst0, dst1, 7, dst0, dst1);
         dst2 = __msa_adds_s_h(in2, dst2);
         dst2 = __msa_srari_h(dst2, 7);
-        dst2 = CLIP_SH_0_255(dst2);
+        CLIP_SH_0_255(dst2);
         PCKEV_B2_SH(dst1, dst0, dst2, dst2, dst0, dst1);
 
         tmp2 = __msa_copy_s_d((v2i64) dst0, 0);
@@ -888,7 +888,7 @@  static void hevc_hz_bi_8t_24w_msa(uint8_t *src0_ptr,
         HEVC_BI_RND_CLIP2(in0, in1, dst0, dst1, 7, dst0, dst1);
         dst2 = __msa_adds_s_h(dst2, in2);
         dst2 = __msa_srari_h(dst2, 7);
-        dst2 = CLIP_SH_0_255(dst2);
+        CLIP_SH_0_255(dst2);
 
         PCKEV_B2_SB(dst1, dst0, dst2, dst2, tmp0, tmp1);
         dst_val0 = __msa_copy_u_d((v2i64) tmp1, 0);
@@ -1726,7 +1726,7 @@  static void hevc_hv_bi_8t_4w_msa(uint8_t *src0_ptr,
         ADDS_SH2_SH(out0, in0, out1, in1, out0, out1);
         ADDS_SH2_SH(out0, const_vec, out1, const_vec, out0, out1);
         SRARI_H2_SH(out0, out1, 7);
-        CLIP_SH2_0_255_MAX_SATU(out0, out1);
+        CLIP_SH2_0_255(out0, out1);
         out = (v16u8) __msa_pckev_b((v16i8) out1, (v16i8) out0);
         ST_W4(out, 0, 1, 2, 3, dst, dst_stride);
         dst += (4 * dst_stride);
@@ -1854,7 +1854,7 @@  static void hevc_hv_bi_8t_8multx1mult_msa(uint8_t *src0_ptr,
             tmp = __msa_pckev_h((v8i16) dst0_l, (v8i16) dst0_r);
             ADDS_SH2_SH(tmp, in0, tmp, const_vec, tmp, tmp);
             tmp = __msa_srari_h(tmp, 7);
-            tmp = CLIP_SH_0_255_MAX_SATU(tmp);
+            CLIP_SH_0_255(tmp);
             out = (v16u8) __msa_pckev_b((v16i8) tmp, (v16i8) tmp);
             ST_D1(out, 0, dst_tmp);
             dst_tmp += dst_stride;
@@ -2000,7 +2000,7 @@  static void hevc_hv_bi_8t_12w_msa(uint8_t *src0_ptr,
         tmp = __msa_pckev_h((v8i16) dst0_l, (v8i16) dst0_r);
         ADDS_SH2_SH(tmp, in0, tmp, const_vec, tmp, tmp);
         tmp = __msa_srari_h(tmp, 7);
-        tmp = CLIP_SH_0_255_MAX_SATU(tmp);
+        CLIP_SH_0_255(tmp);
         out = (v16u8) __msa_pckev_b((v16i8) tmp, (v16i8) tmp);
         ST_D1(out, 0, dst_tmp);
         dst_tmp += dst_stride;
@@ -2088,7 +2088,7 @@  static void hevc_hv_bi_8t_12w_msa(uint8_t *src0_ptr,
         ADDS_SH2_SH(out0, in0, out1, in1, out0, out1);
         ADDS_SH2_SH(out0, const_vec, out1, const_vec, out0, out1);
         SRARI_H2_SH(out0, out1, 7);
-        CLIP_SH2_0_255_MAX_SATU(out0, out1);
+        CLIP_SH2_0_255(out0, out1);
         out = (v16u8) __msa_pckev_b((v16i8) out1, (v16i8) out0);
         ST_W4(out, 0, 1, 2, 3, dst, dst_stride);
         dst += (4 * dst_stride);
@@ -2215,7 +2215,7 @@  static void hevc_hz_bi_4t_4x2_msa(uint8_t *src0_ptr,
 
     tmp0 = __msa_adds_s_h(tmp0, in0);
     tmp0 = __msa_srari_h(tmp0, 7);
-    tmp0 = CLIP_SH_0_255(tmp0);
+    CLIP_SH_0_255(tmp0);
     dst0 = __msa_pckev_b((v16i8) tmp0, (v16i8) tmp0);
 
     ST_W2(dst0, 0, 1, dst, dst_stride);
@@ -2943,7 +2943,7 @@  static void hevc_vt_bi_4t_4x2_msa(uint8_t *src0_ptr,
     DPADD_SB2_SH(src2110, src4332, filt0, filt1, dst10, dst10);
     dst10 = __msa_adds_s_h(dst10, in0);
     dst10 = __msa_srari_h(dst10, 7);
-    dst10 = CLIP_SH_0_255(dst10);
+    CLIP_SH_0_255(dst10);
 
     dst10 = (v8i16) __msa_pckev_b((v16i8) dst10, (v16i8) dst10);
     ST_W2(dst10, 0, 1, dst, dst_stride);
@@ -3843,7 +3843,7 @@  static void hevc_hv_bi_4t_4x2_msa(uint8_t *src0_ptr,
     tmp = __msa_pckev_h((v8i16) dst1, (v8i16) dst0);
     tmp = __msa_adds_s_h(tmp, in0);
     tmp = __msa_srari_h(tmp, 7);
-    tmp = CLIP_SH_0_255_MAX_SATU(tmp);
+    CLIP_SH_0_255(tmp);
     out = (v16u8) __msa_pckev_b((v16i8) tmp, (v16i8) tmp);
     ST_W2(out, 0, 1, dst, dst_stride);
 }
@@ -3919,7 +3919,7 @@  static void hevc_hv_bi_4t_4x4_msa(uint8_t *src0_ptr,
     PCKEV_H2_SH(dst1, dst0, dst3, dst2, tmp0, tmp1);
     ADDS_SH2_SH(tmp0, in0, tmp1, in1, tmp0, tmp1);
     SRARI_H2_SH(tmp0, tmp1, 7);
-    CLIP_SH2_0_255_MAX_SATU(tmp0, tmp1);
+    CLIP_SH2_0_255(tmp0, tmp1);
     out = (v16u8) __msa_pckev_b((v16i8) tmp1, (v16i8) tmp0);
     ST_W4(out, 0, 1, 2, 3, dst, dst_stride);
 }
@@ -4032,7 +4032,7 @@  static void hevc_hv_bi_4t_4multx8mult_msa(uint8_t *src0_ptr,
         ADDS_SH4_SH(in0, tmp0, in1, tmp1, in2, tmp2, in3, tmp3, tmp0, tmp1,
                     tmp2, tmp3);
         SRARI_H4_SH(tmp0, tmp1, tmp2, tmp3, 7);
-        CLIP_SH4_0_255_MAX_SATU(tmp0, tmp1, tmp2, tmp3);
+        CLIP_SH4_0_255(tmp0, tmp1, tmp2, tmp3);
         PCKEV_B2_UB(tmp1, tmp0, tmp3, tmp2, out0, out1);
         ST_W8(out0, out1, 0, 1, 2, 3, 0, 1, 2, 3, dst, dst_stride);
         dst += (8 * dst_stride);
@@ -4200,7 +4200,7 @@  static void hevc_hv_bi_4t_6w_msa(uint8_t *src0_ptr,
     ADDS_SH4_SH(in0, tmp0, in1, tmp1, in2, tmp2, in3, tmp3, tmp0, tmp1, tmp2,
                 tmp3);
     SRARI_H4_SH(tmp0, tmp1, tmp2, tmp3, 7);
-    CLIP_SH4_0_255_MAX_SATU(tmp0, tmp1, tmp2, tmp3);
+    CLIP_SH4_0_255(tmp0, tmp1, tmp2, tmp3);
     PCKEV_B2_UB(tmp1, tmp0, tmp3, tmp2, out0, out1);
     ST_W8(out0, out1, 0, 1, 2, 3, 0, 1, 2, 3, dst, dst_stride);
 
@@ -4212,7 +4212,7 @@  static void hevc_hv_bi_4t_6w_msa(uint8_t *src0_ptr,
     ADDS_SH2_SH(in4, const_vec, in5, const_vec, in4, in5);
     ADDS_SH2_SH(in4, tmp4, in5, tmp5, tmp4, tmp5);
     SRARI_H2_SH(tmp4, tmp5, 7);
-    CLIP_SH2_0_255_MAX_SATU(tmp4, tmp5);
+    CLIP_SH2_0_255(tmp4, tmp5);
     out2 = (v16u8) __msa_pckev_b((v16i8) tmp5, (v16i8) tmp4);
     ST_H8(out2, 0, 1, 2, 3, 4, 5, 6, 7, dst + 4, dst_stride);
 }
@@ -4286,7 +4286,7 @@  static void hevc_hv_bi_4t_8x2_msa(uint8_t *src0_ptr,
     PCKEV_H2_SH(dst0_l, dst0_r, dst1_l, dst1_r, tmp0, tmp1);
     ADDS_SH2_SH(in0, tmp0, in1, tmp1, tmp0, tmp1);
     SRARI_H2_SH(tmp0, tmp1, 7);
-    CLIP_SH2_0_255_MAX_SATU(tmp0, tmp1);
+    CLIP_SH2_0_255(tmp0, tmp1);
     out = (v16u8) __msa_pckev_b((v16i8) tmp1, (v16i8) tmp0);
     ST_D2(out, 0, 1, dst, dst_stride);
 }
@@ -4380,7 +4380,7 @@  static void hevc_hv_bi_4t_8multx4_msa(uint8_t *src0_ptr,
         ADDS_SH4_SH(in0, tmp0, in1, tmp1, in2, tmp2, in3, tmp3,
                     tmp0, tmp1, tmp2, tmp3);
         SRARI_H4_SH(tmp0, tmp1, tmp2, tmp3, 7);
-        CLIP_SH4_0_255_MAX_SATU(tmp0, tmp1, tmp2, tmp3);
+        CLIP_SH4_0_255(tmp0, tmp1, tmp2, tmp3);
         PCKEV_B2_UB(tmp1, tmp0, tmp3, tmp2, out0, out1);
         ST_D4(out0, out1, 0, 1, 0, 1, dst, dst_stride);
         dst += 8;
@@ -4495,8 +4495,8 @@  static void hevc_hv_bi_4t_8x6_msa(uint8_t *src0_ptr,
     ADDS_SH2_SH(in4, tmp4, in5, tmp5, tmp4, tmp5);
     SRARI_H4_SH(tmp0, tmp1, tmp2, tmp3, 7);
     SRARI_H2_SH(tmp4, tmp5, 7);
-    CLIP_SH4_0_255_MAX_SATU(tmp0, tmp1, tmp2, tmp3);
-    CLIP_SH2_0_255_MAX_SATU(tmp4, tmp5);
+    CLIP_SH4_0_255(tmp0, tmp1, tmp2, tmp3);
+    CLIP_SH2_0_255(tmp4, tmp5);
     PCKEV_B2_UB(tmp1, tmp0, tmp3, tmp2, out0, out1);
     out2 = (v16u8) __msa_pckev_b((v16i8) tmp5, (v16i8) tmp4);
     ST_D4(out0, out1, 0, 1, 0, 1, dst, dst_stride);
@@ -4610,7 +4610,7 @@  static void hevc_hv_bi_4t_8multx4mult_msa(uint8_t *src0_ptr,
             ADDS_SH4_SH(in0, tmp0, in1, tmp1, in2, tmp2, in3, tmp3,
                         tmp0, tmp1, tmp2, tmp3);
             SRARI_H4_SH(tmp0, tmp1, tmp2, tmp3, 7);
-            CLIP_SH4_0_255_MAX_SATU(tmp0, tmp1, tmp2, tmp3);
+            CLIP_SH4_0_255(tmp0, tmp1, tmp2, tmp3);
             PCKEV_B2_UB(tmp1, tmp0, tmp3, tmp2, out0, out1);
             ST_D4(out0, out1, 0, 1, 0, 1, dst_tmp, dst_stride);
             dst_tmp += (4 * dst_stride);
@@ -4760,7 +4760,7 @@  static void hevc_hv_bi_4t_12w_msa(uint8_t *src0_ptr,
         ADDS_SH4_SH(in0, tmp0, in1, tmp1, in2, tmp2, in3, tmp3,
                     tmp0, tmp1, tmp2, tmp3);
         SRARI_H4_SH(tmp0, tmp1, tmp2, tmp3, 7);
-        CLIP_SH4_0_255_MAX_SATU(tmp0, tmp1, tmp2, tmp3);
+        CLIP_SH4_0_255(tmp0, tmp1, tmp2, tmp3);
         PCKEV_B2_UB(tmp1, tmp0, tmp3, tmp2, out0, out1);
         ST_D4(out0, out1, 0, 1, 0, 1, dst_tmp, dst_stride);
         dst_tmp += (4 * dst_stride);
@@ -4846,7 +4846,7 @@  static void hevc_hv_bi_4t_12w_msa(uint8_t *src0_ptr,
         ADDS_SH4_SH(in0, tmp0, in1, tmp1, in2, tmp2, in3, tmp3,
                     tmp0, tmp1, tmp2, tmp3);
         SRARI_H4_SH(tmp0, tmp1, tmp2, tmp3, 7);
-        CLIP_SH4_0_255_MAX_SATU(tmp0, tmp1, tmp2, tmp3);
+        CLIP_SH4_0_255(tmp0, tmp1, tmp2, tmp3);
         PCKEV_B2_UB(tmp1, tmp0, tmp3, tmp2, out0, out1);
         ST_W8(out0, out1, 0, 1, 2, 3, 0, 1, 2, 3, dst, dst_stride);
         dst += (8 * dst_stride);
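
Throughout this file the *_MAX_SATU spellings fold into the plain
CLIP_SH*_0_255 names. Both clamp signed halfwords to [0, 255], so one
implementation suffices. A sketch of the saturating form, where
__msa_sat_u_h with immediate 7 saturates each lane to 2^8 - 1 = 255:

    #define CLIP_SH_0_255(in)                       \
    {                                               \
        in = __msa_maxi_s_h((v8i16) in, 0);         \
        in = (v8i16) __msa_sat_u_h((v8u16) in, 7);  \
    }

    #define CLIP_SH2_0_255(in0, in1)  \
    {                                 \
        CLIP_SH_0_255(in0);           \
        CLIP_SH_0_255(in1);           \
    }

After the maxi_s_h step every lane is non-negative, so reinterpreting
the vector as unsigned for the saturate step is safe.
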
diff --git a/libavcodec/mips/hevc_mc_biw_msa.c b/libavcodec/mips/hevc_mc_biw_msa.c
index 68f122e..f775ea8 100644
--- a/libavcodec/mips/hevc_mc_biw_msa.c
+++ b/libavcodec/mips/hevc_mc_biw_msa.c
@@ -66,7 +66,7 @@  static const uint8_t ff_hevc_mask_arr[16 * 2] __attribute__((aligned(0x40))) = {
     out1_l = __msa_dpadd_s_w(offset, (v8i16) out1_l, (v8i16) wgt);   \
     SRAR_W4_SW(out0_r, out1_r, out0_l, out1_l, rnd);                 \
     PCKEV_H2_SH(out0_l, out0_r, out1_l, out1_r, out0, out1);         \
-    CLIP_SH2_0_255_MAX_SATU(out0, out1);                             \
+    CLIP_SH2_0_255(out0, out1);                                      \
 }
 
 #define HEVC_BIW_RND_CLIP4_MAX_SATU(in0, in1, in2, in3, vec0, vec1, vec2,  \
@@ -124,7 +124,7 @@  static void hevc_biwgt_copy_4w_msa(uint8_t *src0_ptr,
         dst0_l = __msa_dpadd_s_w(offset_vec, (v8i16) dst0_l, weight_vec);
         SRAR_W2_SW(dst0_r, dst0_l, rnd_vec);
         dst0 = (v8i16) __msa_pckev_h((v8i16) dst0_l, (v8i16) dst0_r);
-        dst0 = CLIP_SH_0_255_MAX_SATU(dst0);
+        CLIP_SH_0_255(dst0);
         out0 = (v16u8) __msa_pckev_b((v16i8) dst0, (v16i8) dst0);
         ST_W2(out0, 0, 1, dst, dst_stride);
     } else if (4 == height) {
@@ -1069,8 +1069,8 @@  static void hevc_hz_biwgt_8t_24w_msa(uint8_t *src0_ptr,
         dst2_l = __msa_dpadd_s_w(offset_vec, (v8i16) dst2_l,
                                  (v8i16) weight_vec);
         SRAR_W2_SW(dst2_r, dst2_l, rnd_vec);
-        dst2_r = (v4i32) __msa_pckev_h((v8i16) dst2_l, (v8i16) dst2_r);
-        out2 = CLIP_SH_0_255(dst2_r);
+        out2 = __msa_pckev_h((v8i16) dst2_l, (v8i16) dst2_r);
+        CLIP_SH_0_255(out2);
 
         LD_SB2(src0_ptr, 16, src0, src1);
         src0_ptr += src_stride;
@@ -1100,8 +1100,8 @@  static void hevc_hz_biwgt_8t_24w_msa(uint8_t *src0_ptr,
     dst2_r = __msa_dpadd_s_w(offset_vec, (v8i16) dst2_r, (v8i16) weight_vec);
     dst2_l = __msa_dpadd_s_w(offset_vec, (v8i16) dst2_l, (v8i16) weight_vec);
     SRAR_W2_SW(dst2_r, dst2_l, rnd_vec);
-    dst2_r = (v4i32) __msa_pckev_h((v8i16) dst2_l, (v8i16) dst2_r);
-    out2 = CLIP_SH_0_255(dst2_r);
+    out2 = __msa_pckev_h((v8i16) dst2_l, (v8i16) dst2_r);
+    CLIP_SH_0_255(out2);
     PCKEV_B2_SH(out1, out0, out2, out2, out0, out2);
     dst_val0 = __msa_copy_u_d((v2i64) out2, 0);
     ST_SH(out0, dst);
@@ -1674,8 +1674,8 @@  static void hevc_vt_biwgt_8t_12w_msa(uint8_t *src0_ptr,
         dst2_l = __msa_dpadd_s_w(offset_vec, (v8i16) dst2_l,
                                  (v8i16) weight_vec);
         SRAR_W2_SW(dst2_r, dst2_l, rnd_vec);
-        dst2_r = (v4i32) __msa_pckev_h((v8i16) dst2_l, (v8i16) dst2_r);
-        out2 = CLIP_SH_0_255(dst2_r);
+        out2 = __msa_pckev_h((v8i16) dst2_l, (v8i16) dst2_r);
+        CLIP_SH_0_255(out2);
         PCKEV_B2_SH(out1, out0, out2, out2, out0, out2);
         ST_D2(out0, 0, 1, dst, dst_stride);
         ST_W2(out2, 0, 1, dst + 8, dst_stride);
@@ -2048,7 +2048,7 @@  static void hevc_hv_biwgt_8t_4w_msa(uint8_t *src0_ptr,
         dst2 = __msa_dpadd_s_w(offset_vec, tmp2, weight_vec);
         dst3 = __msa_dpadd_s_w(offset_vec, tmp3, weight_vec);
         SRAR_W4_SW(dst0, dst1, dst2, dst3, rnd_vec);
-        CLIP_SW4_0_255_MAX_SATU(dst0, dst1, dst2, dst3);
+        CLIP_SW4_0_255(dst0, dst1, dst2, dst3);
         PCKEV_H2_SH(dst1, dst0, dst3, dst2, tmp0, tmp1);
         out = (v16u8) __msa_pckev_b((v16i8) tmp1, (v16i8) tmp0);
         ST_W4(out, 0, 1, 2, 3, dst, dst_stride);
@@ -2226,7 +2226,7 @@  static void hevc_hv_biwgt_8t_8multx2mult_msa(uint8_t *src0_ptr,
             dst1_r = __msa_dpadd_s_w(offset_vec, tmp2, weight_vec);
             dst1_l = __msa_dpadd_s_w(offset_vec, tmp3, weight_vec);
             SRAR_W4_SW(dst0_l, dst0_r, dst1_l, dst1_r, rnd_vec);
-            CLIP_SW4_0_255_MAX_SATU(dst0_l, dst0_r, dst1_l, dst1_r);
+            CLIP_SW4_0_255(dst0_l, dst0_r, dst1_l, dst1_r);
             PCKEV_H2_SH(dst0_l, dst0_r, dst1_l, dst1_r, tmp0, tmp1);
             out = (v16u8) __msa_pckev_b((v16i8) tmp1, (v16i8) tmp0);
             ST_D2(out, 0, 1, dst_tmp, dst_stride);
@@ -2412,7 +2412,7 @@  static void hevc_hv_biwgt_8t_12w_msa(uint8_t *src0_ptr,
         dst2 = __msa_dpadd_s_w(offset_vec, tmp2, weight_vec);
         dst3 = __msa_dpadd_s_w(offset_vec, tmp3, weight_vec);
         SRAR_W4_SW(dst1, dst0, dst3, dst2, rnd_vec);
-        CLIP_SW4_0_255_MAX_SATU(dst1, dst0, dst3, dst2);
+        CLIP_SW4_0_255(dst1, dst0, dst3, dst2);
         PCKEV_H2_SH(dst1, dst0, dst3, dst2, tmp0, tmp1);
         out = (v16u8) __msa_pckev_b((v16i8) tmp1, (v16i8) tmp0);
         ST_D2(out, 0, 1, dst_tmp, dst_stride);
@@ -2503,7 +2503,7 @@  static void hevc_hv_biwgt_8t_12w_msa(uint8_t *src0_ptr,
         dst2 = __msa_dpadd_s_w(offset_vec, tmp2, weight_vec);
         dst3 = __msa_dpadd_s_w(offset_vec, tmp3, weight_vec);
         SRAR_W4_SW(dst0, dst1, dst2, dst3, rnd_vec);
-        CLIP_SW4_0_255_MAX_SATU(dst0, dst1, dst2, dst3);
+        CLIP_SW4_0_255(dst0, dst1, dst2, dst3);
         PCKEV_H2_SH(dst1, dst0, dst3, dst2, tmp0, tmp1);
         out = (v16u8) __msa_pckev_b((v16i8) tmp1, (v16i8) tmp0);
         ST_W4(out, 0, 1, 2, 3, dst, dst_stride);
@@ -2683,8 +2683,8 @@  static void hevc_hz_biwgt_4t_4x2_msa(uint8_t *src0_ptr,
     dst0_r = __msa_dpadd_s_w(offset_vec, (v8i16) dst0_r, (v8i16) weight_vec);
     dst0_l = __msa_dpadd_s_w(offset_vec, (v8i16) dst0_l, (v8i16) weight_vec);
     SRAR_W2_SW(dst0_r, dst0_l, rnd_vec);
-    dst0_r = (v4i32) __msa_pckev_h((v8i16) dst0_l, (v8i16) dst0_r);
-    out0 = CLIP_SH_0_255(dst0_r);
+    out0 = __msa_pckev_h((v8i16) dst0_l, (v8i16) dst0_r);
+    CLIP_SH_0_255(out0);
     out0 = (v8i16) __msa_pckev_b((v16i8) out0, (v16i8) out0);
     ST_W2(out0, 0, 1, dst, dst_stride);
 }
@@ -3554,8 +3554,8 @@  static void hevc_vt_biwgt_4t_4x2_msa(uint8_t *src0_ptr,
     dst10_r = __msa_dpadd_s_w(offset_vec, (v8i16) dst10_r, (v8i16) weight_vec);
     dst10_l = __msa_dpadd_s_w(offset_vec, (v8i16) dst10_l, (v8i16) weight_vec);
     SRAR_W2_SW(dst10_r, dst10_l, rnd_vec);
-    dst10_r = (v4i32) __msa_pckev_h((v8i16) dst10_l, (v8i16) dst10_r);
-    out = CLIP_SH_0_255(dst10_r);
+    out = __msa_pckev_h((v8i16) dst10_l, (v8i16) dst10_r);
+    CLIP_SH_0_255(out);
     out = (v8i16) __msa_pckev_b((v16i8) out, (v16i8) out);
     ST_W2(out, 0, 1, dst, dst_stride);
 }
@@ -4575,7 +4575,7 @@  static void hevc_hv_biwgt_4t_4x2_msa(uint8_t *src0_ptr,
     dst1 = __msa_dpadd_s_w(offset_vec, tmp1, weight_vec);
     SRAR_W2_SW(dst0, dst1, rnd_vec);
     tmp = __msa_pckev_h((v8i16) dst1, (v8i16) dst0);
-    tmp = CLIP_SH_0_255_MAX_SATU(tmp);
+    CLIP_SH_0_255(tmp);
     out = (v16u8) __msa_pckev_b((v16i8) tmp, (v16i8) tmp);
     ST_W2(out, 0, 1, dst, dst_stride);
 }
@@ -4672,7 +4672,7 @@  static void hevc_hv_biwgt_4t_4x4_msa(uint8_t *src0_ptr,
     dst3 = __msa_dpadd_s_w(offset_vec, tmp3, weight_vec);
     SRAR_W4_SW(dst0, dst1, dst2, dst3, rnd_vec);
     PCKEV_H2_SH(dst1, dst0, dst3, dst2, tmp0, tmp1);
-    CLIP_SH2_0_255_MAX_SATU(tmp0, tmp1);
+    CLIP_SH2_0_255(tmp0, tmp1);
     out = (v16u8) __msa_pckev_b((v16i8) tmp1, (v16i8) tmp0);
     ST_W4(out, 0, 1, 2, 3, dst, dst_stride);
 }
@@ -4810,7 +4810,7 @@  static void hevc_hv_biwgt_4t_4multx8mult_msa(uint8_t *src0_ptr,
         SRAR_W4_SW(dst4, dst5, dst6, dst7, rnd_vec);
         PCKEV_H4_SH(dst1, dst0, dst3, dst2, dst5, dst4, dst7, dst6, tmp0, tmp1,
                     tmp2, tmp3);
-        CLIP_SH4_0_255_MAX_SATU(tmp0, tmp1, tmp2, tmp3);
+        CLIP_SH4_0_255(tmp0, tmp1, tmp2, tmp3);
         PCKEV_B2_UB(tmp1, tmp0, tmp3, tmp2, out0, out1);
         ST_W8(out0, out1, 0, 1, 2, 3, 0, 1, 2, 3, dst, dst_stride);
         dst += (8 * dst_stride);
@@ -5008,7 +5008,7 @@  static void hevc_hv_biwgt_4t_6w_msa(uint8_t *src0_ptr,
     SRAR_W4_SW(dst4, dst5, dst6, dst7, rnd_vec);
     PCKEV_H4_SH(dst1, dst0, dst3, dst2, dst5, dst4, dst7, dst6, tmp0, tmp1,
                 tmp2, tmp3);
-    CLIP_SH4_0_255_MAX_SATU(tmp0, tmp1, tmp2, tmp3);
+    CLIP_SH4_0_255(tmp0, tmp1, tmp2, tmp3);
     PCKEV_B2_UB(tmp1, tmp0, tmp3, tmp2, out0, out1);
     ST_W8(out0, out1, 0, 1, 2, 3, 0, 1, 2, 3, dst, dst_stride);
 
@@ -5030,7 +5030,7 @@  static void hevc_hv_biwgt_4t_6w_msa(uint8_t *src0_ptr,
     SRAR_W4_SW(dst0, dst1, dst2, dst3, rnd_vec);
     PCKEV_H2_SH(dst1, dst0, dst3, dst2, tmp4, tmp5);
 
-    CLIP_SH2_0_255_MAX_SATU(tmp4, tmp5);
+    CLIP_SH2_0_255(tmp4, tmp5);
     out2 = (v16u8) __msa_pckev_b((v16i8) tmp5, (v16i8) tmp4);
     ST_H8(out2, 0, 1, 2, 3, 4, 5, 6, 7, dst + 4, dst_stride);
 }
@@ -5126,7 +5126,7 @@  static void hevc_hv_biwgt_4t_8x2_msa(uint8_t *src0_ptr,
     dst1_l = __msa_dpadd_s_w(offset_vec, tmp3, weight_vec);
     SRAR_W4_SW(dst0_r, dst0_l, dst1_r, dst1_l, rnd_vec);
     PCKEV_H2_SH(dst0_l, dst0_r, dst1_l, dst1_r, tmp0, tmp1);
-    CLIP_SH2_0_255_MAX_SATU(tmp0, tmp1);
+    CLIP_SH2_0_255(tmp0, tmp1);
     out = (v16u8) __msa_pckev_b((v16i8) tmp1, (v16i8) tmp0);
     ST_D2(out, 0, 1, dst, dst_stride);
 }
@@ -5248,7 +5248,7 @@  static void hevc_hv_biwgt_4t_8multx4_msa(uint8_t *src0_ptr,
         SRAR_W4_SW(dst4, dst5, dst6, dst7, rnd_vec);
         PCKEV_H4_SH(dst1, dst0, dst3, dst2, dst5, dst4, dst7, dst6,
                     tmp0, tmp1, tmp2, tmp3);
-        CLIP_SH4_0_255_MAX_SATU(tmp0, tmp1, tmp2, tmp3);
+        CLIP_SH4_0_255(tmp0, tmp1, tmp2, tmp3);
         PCKEV_B2_UB(tmp1, tmp0, tmp3, tmp2, out0, out1);
         ST_D4(out0, out1, 0, 1, 0, 1, dst, dst_stride);
         dst += 8;
@@ -5387,7 +5387,7 @@  static void hevc_hv_biwgt_4t_8x6_msa(uint8_t *src0_ptr,
     SRAR_W4_SW(dst4, dst5, dst6, dst7, rnd_vec);
     PCKEV_H4_SH(dst1, dst0, dst3, dst2, dst5, dst4, dst7, dst6,
                 tmp0, tmp1, tmp2, tmp3);
-    CLIP_SH4_0_255_MAX_SATU(tmp0, tmp1, tmp2, tmp3);
+    CLIP_SH4_0_255(tmp0, tmp1, tmp2, tmp3);
     PCKEV_B2_UB(tmp1, tmp0, tmp3, tmp2, out0, out1);
 
     PCKEV_H2_SW(dst4_l, dst4_r, dst5_l, dst5_r, dst0, dst1);
@@ -5399,7 +5399,7 @@  static void hevc_hv_biwgt_4t_8x6_msa(uint8_t *src0_ptr,
     dst3 = __msa_dpadd_s_w(offset_vec, tmp3, weight_vec);
     SRAR_W4_SW(dst0, dst1, dst2, dst3, rnd_vec);
     PCKEV_H2_SH(dst1, dst0, dst3, dst2, tmp4, tmp5);
-    CLIP_SH2_0_255_MAX_SATU(tmp4, tmp5);
+    CLIP_SH2_0_255(tmp4, tmp5);
     out2 = (v16u8) __msa_pckev_b((v16i8) tmp5, (v16i8) tmp4);
     ST_D4(out0, out1, 0, 1, 0, 1, dst, dst_stride);
     ST_D2(out2, 0, 1, dst + 4 * dst_stride, dst_stride);
@@ -5537,7 +5537,7 @@  static void hevc_hv_biwgt_4t_8multx4mult_msa(uint8_t *src0_ptr,
             SRAR_W4_SW(dst4, dst5, dst6, dst7, rnd_vec);
             PCKEV_H4_SH(dst1, dst0, dst3, dst2, dst5, dst4, dst7, dst6,
                         tmp0, tmp1, tmp2, tmp3);
-            CLIP_SH4_0_255_MAX_SATU(tmp0, tmp1, tmp2, tmp3);
+            CLIP_SH4_0_255(tmp0, tmp1, tmp2, tmp3);
             PCKEV_B2_UB(tmp1, tmp0, tmp3, tmp2, out0, out1);
             ST_D4(out0, out1, 0, 1, 0, 1, dst_tmp, dst_stride);
             dst_tmp += (4 * dst_stride);
@@ -5724,7 +5724,7 @@  static void hevc_hv_biwgt_4t_12w_msa(uint8_t *src0_ptr,
         SRAR_W4_SW(dst4, dst5, dst6, dst7, rnd_vec);
         PCKEV_H4_SH(dst1, dst0, dst3, dst2, dst5, dst4, dst7, dst6,
                     tmp0, tmp1, tmp2, tmp3);
-        CLIP_SH4_0_255_MAX_SATU(tmp0, tmp1, tmp2, tmp3);
+        CLIP_SH4_0_255(tmp0, tmp1, tmp2, tmp3);
         PCKEV_B2_UB(tmp1, tmp0, tmp3, tmp2, out0, out1);
         ST_D4(out0, out1, 0, 1, 0, 1, dst_tmp, dst_stride);
         dst_tmp += (4 * dst_stride);
@@ -5820,7 +5820,7 @@  static void hevc_hv_biwgt_4t_12w_msa(uint8_t *src0_ptr,
         SRAR_W4_SW(dst4, dst5, dst6, dst7, rnd_vec);
         PCKEV_H4_SH(dst1, dst0, dst3, dst2, dst5, dst4, dst7, dst6,
                     tmp0, tmp1, tmp2, tmp3);
-        CLIP_SH4_0_255_MAX_SATU(tmp0, tmp1, tmp2, tmp3);
+        CLIP_SH4_0_255(tmp0, tmp1, tmp2, tmp3);
         PCKEV_B2_UB(tmp1, tmp0, tmp3, tmp2, out0, out1);
         ST_W8(out0, out1, 0, 1, 2, 3, 0, 1, 2, 3, dst, dst_stride);
         dst += (8 * dst_stride);
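
The HV biweighted paths clip 32-bit accumulators before packing, hence
the CLIP_SW*_0_255 calls above. Presumably these are the word-sized
analogue of the halfword macro, sketched here with the word variants of
the same intrinsics:

    #define CLIP_SW_0_255(in)                       \
    {                                               \
        in = __msa_maxi_s_w(in, 0);                 \
        in = (v4i32) __msa_sat_u_w((v4u32) in, 7);  \
    }

    #define CLIP_SW4_0_255(in0, in1, in2, in3)  \
    {                                           \
        CLIP_SW_0_255(in0);                     \
        CLIP_SW_0_255(in1);                     \
        CLIP_SW_0_255(in2);                     \
        CLIP_SW_0_255(in3);                     \
    }
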
diff --git a/libavcodec/mips/hevc_mc_uniw_msa.c b/libavcodec/mips/hevc_mc_uniw_msa.c
index cad1240..1a8c251 100644
--- a/libavcodec/mips/hevc_mc_uniw_msa.c
+++ b/libavcodec/mips/hevc_mc_uniw_msa.c
@@ -41,7 +41,7 @@  static const uint8_t ff_hevc_mask_arr[16 * 2] __attribute__((aligned(0x40))) = {
     SRAR_W4_SW(in0_r_m, in1_r_m, in0_l_m, in1_l_m, rnd_w);                    \
     PCKEV_H2_SH(in0_l_m, in0_r_m, in1_l_m, in1_r_m, out0_h, out1_h);          \
     ADDS_SH2_SH(out0_h, offset_h, out1_h, offset_h, out0_h, out1_h);          \
-    CLIP_SH2_0_255_MAX_SATU(out0_h, out1_h);                                  \
+    CLIP_SH2_0_255(out0_h, out1_h);                                           \
 }
 
 #define HEVC_UNIW_RND_CLIP4_MAX_SATU_H(in0_h, in1_h, in2_h, in3_h, wgt_w,  \
@@ -88,7 +88,7 @@  static void hevc_uniwgt_copy_4w_msa(uint8_t *src,
         SRAR_W2_SW(dst0_r, dst0_l, rnd_vec);
         dst0 = __msa_pckev_h((v8i16) dst0_l, (v8i16) dst0_r);
         dst0 += offset_vec;
-        dst0 = CLIP_SH_0_255_MAX_SATU(dst0);
+        CLIP_SH_0_255(dst0);
         out0 = (v16u8) __msa_pckev_b((v16i8) dst0, (v16i8) dst0);
         ST_W2(out0, 0, 1, dst, dst_stride);
     } else if (4 == height) {
@@ -1863,7 +1863,7 @@  static void hevc_hv_uniwgt_8t_4w_msa(uint8_t *src,
         SRAR_W4_SW(dst0_r, dst1_r, dst2_r, dst3_r, rnd_vec);
         ADD2(dst0_r, offset_vec, dst1_r, offset_vec, dst0_r, dst1_r);
         ADD2(dst2_r, offset_vec, dst3_r, offset_vec, dst2_r, dst3_r);
-        CLIP_SW4_0_255_MAX_SATU(dst0_r, dst1_r, dst2_r, dst3_r);
+        CLIP_SW4_0_255(dst0_r, dst1_r, dst2_r, dst3_r);
         PCKEV_H2_SW(dst1_r, dst0_r, dst3_r, dst2_r, dst0_r, dst1_r);
         out = (v16u8) __msa_pckev_b((v16i8) dst1_r, (v16i8) dst0_r);
         ST_W4(out, 0, 1, 2, 3, dst, dst_stride);
@@ -2014,7 +2014,7 @@  static void hevc_hv_uniwgt_8t_8multx2mult_msa(uint8_t *src,
             SRAR_W4_SW(dst0_r, dst1_r, dst0_l, dst1_l, rnd_vec);
             ADD2(dst0_r, offset_vec, dst0_l, offset_vec, dst0_r, dst0_l);
             ADD2(dst1_r, offset_vec, dst1_l, offset_vec, dst1_r, dst1_l);
-            CLIP_SW4_0_255_MAX_SATU(dst0_r, dst1_r, dst0_l, dst1_l);
+            CLIP_SW4_0_255(dst0_r, dst1_r, dst0_l, dst1_l);
 
             PCKEV_H2_SW(dst0_l, dst0_r, dst1_l, dst1_r, dst0_r, dst1_r);
             dst0_r = (v4i32) __msa_pckev_b((v16i8) dst1_r, (v16i8) dst0_r);
@@ -2165,7 +2165,7 @@  static void hevc_hv_uniwgt_8t_12w_msa(uint8_t *src,
         MUL2(dst0_r, weight_vec, dst0_l, weight_vec, dst0_r, dst0_l);
         SRAR_W2_SW(dst0_r, dst0_l, rnd_vec);
         ADD2(dst0_r, offset_vec, dst0_l, offset_vec, dst0_r, dst0_l);
-        CLIP_SW2_0_255_MAX_SATU(dst0_r, dst0_l);
+        CLIP_SW2_0_255(dst0_r, dst0_l);
         dst0_r = (v4i32) __msa_pckev_h((v8i16) dst0_l, (v8i16) dst0_r);
         out = (v16u8) __msa_pckev_b((v16i8) dst0_r, (v16i8) dst0_r);
         ST_D1(out, 0, dst_tmp);
@@ -2246,7 +2246,7 @@  static void hevc_hv_uniwgt_8t_12w_msa(uint8_t *src,
         SRAR_W4_SW(dst0_r, dst1_r, dst2_r, dst3_r, rnd_vec);
         ADD2(dst0_r, offset_vec, dst1_r, offset_vec, dst0_r, dst1_r);
         ADD2(dst2_r, offset_vec, dst3_r, offset_vec, dst2_r, dst3_r);
-        CLIP_SW4_0_255_MAX_SATU(dst0_r, dst1_r, dst2_r, dst3_r);
+        CLIP_SW4_0_255(dst0_r, dst1_r, dst2_r, dst3_r);
         PCKEV_H2_SW(dst1_r, dst0_r, dst3_r, dst2_r, dst0_r, dst1_r);
         out = (v16u8) __msa_pckev_b((v16i8) dst1_r, (v16i8) dst0_r);
         ST_W4(out, 0, 1, 2, 3, dst, dst_stride);
@@ -2394,7 +2394,7 @@  static void hevc_hz_uniwgt_4t_4x2_msa(uint8_t *src,
     SRAR_W2_SW(dst0_r, dst0_l, rnd_vec);
     dst0 = __msa_pckev_h((v8i16) dst0_l, (v8i16) dst0_r);
     dst0 = __msa_adds_s_h(dst0, offset_vec);
-    dst0 = CLIP_SH_0_255_MAX_SATU(dst0);
+    CLIP_SH_0_255(dst0);
     out = (v16u8) __msa_pckev_b((v16i8) dst0, (v16i8) dst0);
     ST_W2(out, 0, 1, dst, dst_stride);
     dst += (4 * dst_stride);
@@ -3295,7 +3295,7 @@  static void hevc_vt_uniwgt_4t_4x2_msa(uint8_t *src,
     SRAR_W2_SW(dst0_r, dst0_l, rnd_vec);
     dst0 = __msa_pckev_h((v8i16) dst0_l, (v8i16) dst0_r);
     dst0 = __msa_adds_s_h(dst0, offset_vec);
-    dst0 = CLIP_SH_0_255_MAX_SATU(dst0);
+    CLIP_SH_0_255(dst0);
     out = (v16u8) __msa_pckev_b((v16i8) dst0, (v16i8) dst0);
     ST_W2(out, 0, 1, dst, dst_stride);
 }
@@ -4247,7 +4247,7 @@  static void hevc_hv_uniwgt_4t_4x2_msa(uint8_t *src,
     SRAR_W2_SW(dst0, dst1, rnd_vec);
     tmp = __msa_pckev_h((v8i16) dst1, (v8i16) dst0);
     tmp += offset_vec;
-    tmp = CLIP_SH_0_255_MAX_SATU(tmp);
+    CLIP_SH_0_255(tmp);
     out = (v16u8) __msa_pckev_b((v16i8) tmp, (v16i8) tmp);
     ST_W2(out, 0, 1, dst, dst_stride);
 }
@@ -4316,7 +4316,7 @@  static void hevc_hv_uniwgt_4t_4x4_msa(uint8_t *src,
     SRAR_W4_SW(dst0, dst1, dst2, dst3, rnd_vec);
     PCKEV_H2_SH(dst1, dst0, dst3, dst2, tmp0, tmp1);
     ADD2(tmp0, offset_vec, tmp1, offset_vec, tmp0, tmp1);
-    CLIP_SH2_0_255_MAX_SATU(tmp0, tmp1);
+    CLIP_SH2_0_255(tmp0, tmp1);
     out = (v16u8) __msa_pckev_b((v16i8) tmp1, (v16i8) tmp0);
     ST_W4(out, 0, 1, 2, 3, dst, dst_stride);
 }
@@ -4417,7 +4417,7 @@  static void hevc_hv_uniwgt_4t_4multx8mult_msa(uint8_t *src,
                     tmp2, tmp3);
         ADD2(tmp0, offset_vec, tmp1, offset_vec, tmp0, tmp1);
         ADD2(tmp2, offset_vec, tmp3, offset_vec, tmp2, tmp3);
-        CLIP_SH4_0_255_MAX_SATU(tmp0, tmp1, tmp2, tmp3);
+        CLIP_SH4_0_255(tmp0, tmp1, tmp2, tmp3);
         PCKEV_B2_UB(tmp1, tmp0, tmp3, tmp2, out0, out1);
         ST_W8(out0, out1, 0, 1, 2, 3, 0, 1, 2, 3, dst, dst_stride);
         dst += (8 * dst_stride);
@@ -4574,8 +4574,8 @@  static void hevc_hv_uniwgt_4t_6w_msa(uint8_t *src,
     ADD2(tmp0, offset_vec, tmp1, offset_vec, tmp0, tmp1);
     ADD2(tmp2, offset_vec, tmp3, offset_vec, tmp2, tmp3);
     ADD2(tmp4, offset_vec, tmp5, offset_vec, tmp4, tmp5);
-    CLIP_SH4_0_255_MAX_SATU(tmp0, tmp1, tmp2, tmp3);
-    CLIP_SH2_0_255_MAX_SATU(tmp4, tmp5);
+    CLIP_SH4_0_255(tmp0, tmp1, tmp2, tmp3);
+    CLIP_SH2_0_255(tmp4, tmp5);
     PCKEV_B3_UB(tmp1, tmp0, tmp3, tmp2, tmp5, tmp4, out0, out1, out2);
     ST_W8(out0, out1, 0, 1, 2, 3, 0, 1, 2, 3, dst, dst_stride);
     ST_H8(out2, 0, 1, 2, 3, 4, 5, 6, 7, dst + 4, dst_stride);
@@ -4652,7 +4652,7 @@  static void hevc_hv_uniwgt_4t_8x2_msa(uint8_t *src,
     SRAR_W4_SW(dst0_r, dst0_l, dst1_r, dst1_l, rnd_vec);
     PCKEV_H2_SH(dst0_l, dst0_r, dst1_l, dst1_r, tmp0, tmp1);
     ADD2(tmp0, offset_vec, tmp1, offset_vec, tmp0, tmp1);
-    CLIP_SH2_0_255_MAX_SATU(tmp0, tmp1);
+    CLIP_SH2_0_255(tmp0, tmp1);
     out = (v16u8) __msa_pckev_b((v16i8) tmp1, (v16i8) tmp0);
     ST_D2(out, 0, 1, dst, dst_stride);
 }
@@ -4745,7 +4745,7 @@  static void hevc_hv_uniwgt_4t_8multx4_msa(uint8_t *src,
                     dst3_r, tmp0, tmp1, tmp2, tmp3);
         ADD2(tmp0, offset_vec, tmp1, offset_vec, tmp0, tmp1);
         ADD2(tmp2, offset_vec, tmp3, offset_vec, tmp2, tmp3);
-        CLIP_SH4_0_255_MAX_SATU(tmp0, tmp1, tmp2, tmp3);
+        CLIP_SH4_0_255(tmp0, tmp1, tmp2, tmp3);
         PCKEV_B2_UB(tmp1, tmp0, tmp3, tmp2, out0, out1);
         ST_D4(out0, out1, 0, 1, 0, 1, dst, dst_stride);
         dst += 8;
@@ -4861,8 +4861,8 @@  static void hevc_hv_uniwgt_4t_8x6_msa(uint8_t *src,
     ADD2(tmp0, offset_vec, tmp1, offset_vec, tmp0, tmp1);
     ADD2(tmp2, offset_vec, tmp3, offset_vec, tmp2, tmp3);
     ADD2(tmp4, offset_vec, tmp5, offset_vec, tmp4, tmp5);
-    CLIP_SH4_0_255_MAX_SATU(tmp0, tmp1, tmp2, tmp3);
-    CLIP_SH2_0_255_MAX_SATU(tmp4, tmp5);
+    CLIP_SH4_0_255(tmp0, tmp1, tmp2, tmp3);
+    CLIP_SH2_0_255(tmp4, tmp5);
     PCKEV_B3_UB(tmp1, tmp0, tmp3, tmp2, tmp5, tmp4, out0, out1, out2);
     ST_D4(out0, out1, 0, 1, 0, 1, dst, dst_stride);
     ST_D2(out2, 0, 1, dst + 4 * dst_stride, dst_stride);
@@ -4973,7 +4973,7 @@  static void hevc_hv_uniwgt_4t_8multx4mult_msa(uint8_t *src,
                         dst3_r, tmp0, tmp1, tmp2, tmp3);
             ADD2(tmp0, offset_vec, tmp1, offset_vec, tmp0, tmp1);
             ADD2(tmp2, offset_vec, tmp3, offset_vec, tmp2, tmp3);
-            CLIP_SH4_0_255_MAX_SATU(tmp0, tmp1, tmp2, tmp3);
+            CLIP_SH4_0_255(tmp0, tmp1, tmp2, tmp3);
             PCKEV_B2_UB(tmp1, tmp0, tmp3, tmp2, out0, out1);
             ST_D4(out0, out1, 0, 1, 0, 1, dst_tmp, dst_stride);
             dst_tmp += (4 * dst_stride);
@@ -5120,7 +5120,7 @@  static void hevc_hv_uniwgt_4t_12w_msa(uint8_t *src,
                     dst3_r, tmp0, tmp1, tmp2, tmp3);
         ADD2(tmp0, offset_vec, tmp1, offset_vec, tmp0, tmp1);
         ADD2(tmp2, offset_vec, tmp3, offset_vec, tmp2, tmp3);
-        CLIP_SH4_0_255_MAX_SATU(tmp0, tmp1, tmp2, tmp3);
+        CLIP_SH4_0_255(tmp0, tmp1, tmp2, tmp3);
         PCKEV_B2_UB(tmp1, tmp0, tmp3, tmp2, out0, out1);
         ST_D4(out0, out1, 0, 1, 0, 1, dst_tmp, dst_stride);
         dst_tmp += (4 * dst_stride);
@@ -5187,7 +5187,7 @@  static void hevc_hv_uniwgt_4t_12w_msa(uint8_t *src,
                     tmp2, tmp3);
         ADD2(tmp0, offset_vec, tmp1, offset_vec, tmp0, tmp1);
         ADD2(tmp2, offset_vec, tmp3, offset_vec, tmp2, tmp3);
-        CLIP_SH4_0_255_MAX_SATU(tmp0, tmp1, tmp2, tmp3);
+        CLIP_SH4_0_255(tmp0, tmp1, tmp2, tmp3);
         PCKEV_B2_UB(tmp1, tmp0, tmp3, tmp2, out0, out1);
         ST_W8(out0, out1, 0, 1, 2, 3, 0, 1, 2, 3, dst, dst_stride);
         dst += (8 * dst_stride);
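
Since the macros are now statements rather than expressions, every call
site in this file changes shape but not behavior. A minimal usage
contrast on a hypothetical prediction row d:

    v8i16 d;                   /* weighted, rounded prediction row */
    /* old, value-returning: d = CLIP_SH_0_255_MAX_SATU(d);        */
    /* new, in place:                                              */
    CLIP_SH_0_255(d);
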
diff --git a/libavcodec/mips/hevcpred_msa.c b/libavcodec/mips/hevcpred_msa.c
index 909b62d..f53276d 100644
--- a/libavcodec/mips/hevcpred_msa.c
+++ b/libavcodec/mips/hevcpred_msa.c
@@ -83,7 +83,7 @@  static void hevc_intra_pred_vert_4x4_msa(const uint8_t *src_top,
         vec2 -= vec0;
         vec2 >>= 1;
         vec2 += vec1;
-        vec2 = CLIP_SH_0_255(vec2);
+        CLIP_SH_0_255(vec2);
 
         for (col = 0; col < 4; col++) {
             dst[stride * col] = (uint8_t) vec2[col];
@@ -122,7 +122,7 @@  static void hevc_intra_pred_vert_8x8_msa(const uint8_t *src_top,
         vec2 -= vec0;
         vec2 >>= 1;
         vec2 += vec1;
-        vec2 = CLIP_SH_0_255(vec2);
+        CLIP_SH_0_255(vec2);
 
         val0 = vec2[0];
         val1 = vec2[1];
@@ -214,7 +214,7 @@  static void hevc_intra_pred_horiz_4x4_msa(const uint8_t *src_top,
         src0_r -= src_top_val;
         src0_r >>= 1;
         src0_r += src_left_val;
-        src0_r = CLIP_SH_0_255(src0_r);
+        CLIP_SH_0_255(src0_r);
         src0 = __msa_pckev_b((v16i8) src0_r, (v16i8) src0_r);
         val0 = __msa_copy_s_w((v4i32) src0, 0);
         SW(val0, dst);
@@ -254,7 +254,7 @@  static void hevc_intra_pred_horiz_8x8_msa(const uint8_t *src_top,
         src0_r -= src_top_val;
         src0_r >>= 1;
         src0_r += src_left_val;
-        src0_r = CLIP_SH_0_255(src0_r);
+        CLIP_SH_0_255(src0_r);
         src0 = __msa_pckev_b((v16i8) src0_r, (v16i8) src0_r);
         val0 = __msa_copy_s_d((v2i64) src0, 0);
         SD(val0, dst);
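
Each intra-prediction hunk above applies the same lane-wise edge filter
before the clip. A scalar sketch of one lane, with hypothetical names
for the three inputs:

    int nbr, corner, base;           /* one lane of vec2/vec0/vec1 */
    int t = nbr - corner;            /* vec2 -= vec0               */
    t >>= 1;                         /* vec2 >>= 1                 */
    t += base;                       /* vec2 += vec1               */
    uint8_t out = av_clip_uint8(t);  /* CLIP_SH_0_255(vec2)        */
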
diff --git a/libavcodec/mips/idctdsp_msa.c b/libavcodec/mips/idctdsp_msa.c
index b29e420..b6b98dc 100644
--- a/libavcodec/mips/idctdsp_msa.c
+++ b/libavcodec/mips/idctdsp_msa.c
@@ -28,8 +28,7 @@  static void put_pixels_clamped_msa(const int16_t *block, uint8_t *pixels,
     v8i16 in0, in1, in2, in3, in4, in5, in6, in7;
 
     LD_SH8(block, 8, in0, in1, in2, in3, in4, in5, in6, in7);
-    CLIP_SH4_0_255(in0, in1, in2, in3);
-    CLIP_SH4_0_255(in4, in5, in6, in7);
+    CLIP_SH8_0_255(in0, in1, in2, in3, in4, in5, in6, in7);
     PCKEV_B4_SH(in0, in0, in1, in1, in2, in2, in3, in3, in0, in1, in2, in3);
     PCKEV_B4_SH(in4, in4, in5, in5, in6, in6, in7, in7, in4, in5, in6, in7);
 
@@ -63,8 +62,7 @@  static void put_signed_pixels_clamped_msa(const int16_t *block, uint8_t *pixels,
     in6 += 128;
     in7 += 128;
 
-    CLIP_SH4_0_255(in0, in1, in2, in3);
-    CLIP_SH4_0_255(in4, in5, in6, in7);
+    CLIP_SH8_0_255(in0, in1, in2, in3, in4, in5, in6, in7);
     PCKEV_B4_SH(in0, in0, in1, in1, in2, in2, in3, in3, in0, in1, in2, in3);
     PCKEV_B4_SH(in4, in4, in5, in5, in6, in6, in7, in7, in4, in5, in6, in7);
 
@@ -109,8 +107,7 @@  static void add_pixels_clamped_msa(const int16_t *block, uint8_t *pixels,
     in6 += (v8i16) pix6;
     in7 += (v8i16) pix7;
 
-    CLIP_SH4_0_255(in0, in1, in2, in3);
-    CLIP_SH4_0_255(in4, in5, in6, in7);
+    CLIP_SH8_0_255(in0, in1, in2, in3, in4, in5, in6, in7);
     PCKEV_B4_SH(in0, in0, in1, in1, in2, in2, in3, in3, in0, in1, in2, in3);
     PCKEV_B4_SH(in4, in4, in5, in5, in6, in6, in7, in7, in4, in5, in6, in7);
 
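In these pixels_clamped helpers the CLIP_SH8_0_255 + PCKEV_B4 pair is
the vector form of a per-pixel clamp to unsigned 8 bits. A scalar
sketch of what the plain put variant amounts to, using libavutil's
av_clip_uint8 (stride is a hypothetical name for the row pitch):

    int i, j;
    for (j = 0; j < 8; j++) {
        for (i = 0; i < 8; i++)
            pixels[i] = av_clip_uint8(block[j * 8 + i]);
        pixels += stride;
    }
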
diff --git a/libavcodec/mips/qpeldsp_msa.c b/libavcodec/mips/qpeldsp_msa.c
index f9d1a95..c7675f1 100644
--- a/libavcodec/mips/qpeldsp_msa.c
+++ b/libavcodec/mips/qpeldsp_msa.c
@@ -96,7 +96,7 @@ 
     DPADD_UB2_UH(sum2_r, sum1_r, coef2, coef1, sum0_r, sum3_r);         \
     res0_r = (v8i16) (sum0_r - sum3_r);                                 \
     res0_r = __msa_srari_h(res0_r, 5);                                  \
-    res0_r = CLIP_SH_0_255(res0_r);                                     \
+    CLIP_SH_0_255(res0_r);                                              \
     out = (v16u8) __msa_pckev_b((v16i8) res0_r, (v16i8) res0_r);        \
                                                                         \
     out;                                                                \
@@ -118,7 +118,7 @@ 
     res0_r = (v8i16) (sum0_r - sum3_r);                                   \
     res0_r += 15;                                                         \
     res0_r >>= 5;                                                         \
-    res0_r = CLIP_SH_0_255(res0_r);                                       \
+    CLIP_SH_0_255(res0_r);                                                \
     out = (v16u8) __msa_pckev_b((v16i8) res0_r, (v16i8) res0_r);          \
                                                                           \
     out;                                                                  \
diff --git a/libavcodec/mips/simple_idct_msa.c b/libavcodec/mips/simple_idct_msa.c
index 8a72359..4bd3dd8 100644
--- a/libavcodec/mips/simple_idct_msa.c
+++ b/libavcodec/mips/simple_idct_msa.c
@@ -336,35 +336,26 @@  static void simple_idct_put_msa(uint8_t *dst, int32_t dst_stride,
     SRA_4V(temp2_r, temp2_l, temp3_r, temp3_l, 20);
     SRA_4V(a3_r, a3_l, a2_r, a2_l, 20);
     SRA_4V(a1_r, a1_l, a0_r, a0_l, 20);
-    PCKEV_H4_SW(temp0_l, temp0_r, temp1_l, temp1_r, temp2_l, temp2_r,
-                temp3_l, temp3_r, temp0_r, temp1_r, temp2_r, temp3_r);
-    PCKEV_H4_SW(a0_l, a0_r, a1_l, a1_r, a2_l, a2_r, a3_l, a3_r,
-                a0_r, a1_r, a2_r, a3_r);
-    temp0_r = (v4i32) CLIP_SH_0_255(temp0_r);
-    temp1_r = (v4i32) CLIP_SH_0_255(temp1_r);
-    temp2_r = (v4i32) CLIP_SH_0_255(temp2_r);
-    temp3_r = (v4i32) CLIP_SH_0_255(temp3_r);
-    PCKEV_B4_SW(temp0_r, temp0_r, temp1_r, temp1_r,
-                temp2_r, temp2_r, temp3_r, temp3_r,
-                temp0_r, temp1_r, temp2_r, temp3_r);
-    tmp0 = __msa_copy_u_d((v2i64) temp0_r, 1);
-    tmp1 = __msa_copy_u_d((v2i64) temp1_r, 1);
-    tmp2 = __msa_copy_u_d((v2i64) temp2_r, 1);
-    tmp3 = __msa_copy_u_d((v2i64) temp3_r, 1);
-    SD4(tmp0, tmp1, tmp2, tmp3, dst, dst_stride);
-    dst += 4 * dst_stride;
-    a0_r = (v4i32) CLIP_SH_0_255(a0_r);
-    a1_r = (v4i32) CLIP_SH_0_255(a1_r);
-    a2_r = (v4i32) CLIP_SH_0_255(a2_r);
-    a3_r = (v4i32) CLIP_SH_0_255(a3_r);
-    PCKEV_B4_SW(a0_r, a0_r, a1_r, a1_r,
-                a2_r, a2_r, a3_r, a3_r, a0_r, a1_r, a2_r, a3_r);
-    tmp3 = __msa_copy_u_d((v2i64) a0_r, 1);
-    tmp2 = __msa_copy_u_d((v2i64) a1_r, 1);
-    tmp1 = __msa_copy_u_d((v2i64) a2_r, 1);
-    tmp0 = __msa_copy_u_d((v2i64) a3_r, 1);
+    PCKEV_H4_SH(temp0_l, temp0_r, temp1_l, temp1_r, temp2_l, temp2_r,
+                temp3_l, temp3_r, in0, in1, in2, in3);
+    PCKEV_H4_SH(a0_l, a0_r, a1_l, a1_r, a2_l, a2_r, a3_l, a3_r,
+                in4, in5, in6, in7);
+    CLIP_SH4_0_255(in0, in1, in2, in3);
+    PCKEV_B4_SH(in0, in0, in1, in1, in2, in2, in3, in3,
+                in0, in1, in2, in3);
+    tmp0 = __msa_copy_u_d((v2i64) in0, 1);
+    tmp1 = __msa_copy_u_d((v2i64) in1, 1);
+    tmp2 = __msa_copy_u_d((v2i64) in2, 1);
+    tmp3 = __msa_copy_u_d((v2i64) in3, 1);
     SD4(tmp0, tmp1, tmp2, tmp3, dst, dst_stride);
-    dst += 4 * dst_stride;
+    CLIP_SH4_0_255(in4, in5, in6, in7);
+    PCKEV_B4_SH(in4, in4, in5, in5, in6, in6, in7, in7,
+                in4, in5, in6, in7);
+    tmp3 = __msa_copy_u_d((v2i64) in4, 1);
+    tmp2 = __msa_copy_u_d((v2i64) in5, 1);
+    tmp1 = __msa_copy_u_d((v2i64) in6, 1);
+    tmp0 = __msa_copy_u_d((v2i64) in7, 1);
+    SD4(tmp0, tmp1, tmp2, tmp3, dst + 4 * dst_stride, dst_stride);
 }
 
 static void simple_idct_add_msa(uint8_t *dst, int32_t dst_stride,
@@ -516,21 +507,17 @@  static void simple_idct_add_msa(uint8_t *dst, int32_t dst_stride,
                 temp3_l, temp3_r, temp0_r, temp1_r, temp2_r, temp3_r);
     ILVR_B4_SW(zero, in0, zero, in1, zero, in2, zero, in3,
                temp0_l, temp1_l, temp2_l, temp3_l);
-    temp0_r = (v4i32) ((v8i16) (temp0_r) + (v8i16) (temp0_l));
-    temp1_r = (v4i32) ((v8i16) (temp1_r) + (v8i16) (temp1_l));
-    temp2_r = (v4i32) ((v8i16) (temp2_r) + (v8i16) (temp2_l));
-    temp3_r = (v4i32) ((v8i16) (temp3_r) + (v8i16) (temp3_l));
-    temp0_r = (v4i32) CLIP_SH_0_255(temp0_r);
-    temp1_r = (v4i32) CLIP_SH_0_255(temp1_r);
-    temp2_r = (v4i32) CLIP_SH_0_255(temp2_r);
-    temp3_r = (v4i32) CLIP_SH_0_255(temp3_r);
-    PCKEV_B4_SW(temp0_r, temp0_r, temp1_r, temp1_r,
-                temp2_r, temp2_r, temp3_r, temp3_r,
-                temp0_r, temp1_r, temp2_r, temp3_r);
-    tmp0 = __msa_copy_u_d((v2i64) temp0_r, 1);
-    tmp1 = __msa_copy_u_d((v2i64) temp1_r, 1);
-    tmp2 = __msa_copy_u_d((v2i64) temp2_r, 1);
-    tmp3 = __msa_copy_u_d((v2i64) temp3_r, 1);
+    in0 = (v8i16) (temp0_r) + (v8i16) (temp0_l);
+    in1 = (v8i16) (temp1_r) + (v8i16) (temp1_l);
+    in2 = (v8i16) (temp2_r) + (v8i16) (temp2_l);
+    in3 = (v8i16) (temp3_r) + (v8i16) (temp3_l);
+    CLIP_SH4_0_255(in0, in1, in2, in3);
+    PCKEV_B4_SH(in0, in0, in1, in1, in2, in2, in3, in3,
+                in0, in1, in2, in3);
+    tmp0 = __msa_copy_u_d((v2i64) in0, 1);
+    tmp1 = __msa_copy_u_d((v2i64) in1, 1);
+    tmp2 = __msa_copy_u_d((v2i64) in2, 1);
+    tmp3 = __msa_copy_u_d((v2i64) in3, 1);
     SD4(tmp0, tmp1, tmp2, tmp3, dst, dst_stride);
 
     SRA_4V(a3_r, a3_l, a2_r, a2_l, 20);
@@ -540,20 +527,17 @@  static void simple_idct_add_msa(uint8_t *dst, int32_t dst_stride,
                 a0_r, a1_r, a2_r, a3_r);
     ILVR_B4_SW(zero, in4, zero, in5, zero, in6, zero, in7,
                a3_l, a2_l, a1_l, a0_l);
-    a3_r = (v4i32) ((v8i16) (a3_r) + (v8i16) (a3_l));
-    a2_r = (v4i32) ((v8i16) (a2_r) + (v8i16) (a2_l));
-    a1_r = (v4i32) ((v8i16) (a1_r) + (v8i16) (a1_l));
-    a0_r = (v4i32) ((v8i16) (a0_r) + (v8i16) (a0_l));
-    a3_r = (v4i32) CLIP_SH_0_255(a3_r);
-    a2_r = (v4i32) CLIP_SH_0_255(a2_r);
-    a1_r = (v4i32) CLIP_SH_0_255(a1_r);
-    a0_r = (v4i32) CLIP_SH_0_255(a0_r);
-    PCKEV_B4_SW(a0_r, a0_r, a1_r, a1_r,
-                a2_r, a2_r, a3_r, a3_r, a0_r, a1_r, a2_r, a3_r);
-    tmp0 = __msa_copy_u_d((v2i64) a3_r, 1);
-    tmp1 = __msa_copy_u_d((v2i64) a2_r, 1);
-    tmp2 = __msa_copy_u_d((v2i64) a1_r, 1);
-    tmp3 = __msa_copy_u_d((v2i64) a0_r, 1);
+    in4 = (v8i16) (a3_r) + (v8i16) (a3_l);
+    in5 = (v8i16) (a2_r) + (v8i16) (a2_l);
+    in6 = (v8i16) (a1_r) + (v8i16) (a1_l);
+    in7 = (v8i16) (a0_r) + (v8i16) (a0_l);
+    CLIP_SH4_0_255(in4, in5, in6, in7);
+    PCKEV_B4_SH(in4, in4, in5, in5, in6, in6, in7, in7,
+                in4, in5, in6, in7);
+    tmp0 = __msa_copy_u_d((v2i64) in4, 1);
+    tmp1 = __msa_copy_u_d((v2i64) in5, 1);
+    tmp2 = __msa_copy_u_d((v2i64) in6, 1);
+    tmp3 = __msa_copy_u_d((v2i64) in7, 1);
     SD4(tmp0, tmp1, tmp2, tmp3, dst + 4 * dst_stride, dst_stride);
 }
 
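For reference, a minimal scalar sketch (illustrative only, not part of the patch) of what one lane of the refactored CLIP_SH_0_255 computes: __msa_maxi_s_h clamps negatives to zero, and the following unsigned saturation to 7+1 = 8 bits caps the result at 255, matching the old maxi/min pair.

    #include <stdint.h>

    /* Scalar equivalent of one CLIP_SH_0_255 lane. */
    static inline int16_t clip_sh_0_255_scalar(int16_t in)
    {
        if (in < 0)
            in = 0;    /* __msa_maxi_s_h((v8i16) in, 0) */
        if (in > 255)
            in = 255;  /* __msa_sat_u_h((v8u16) in, 7)  */
        return in;
    }
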
diff --git a/libavcodec/mips/vp3dsp_idct_msa.c b/libavcodec/mips/vp3dsp_idct_msa.c
index b2899ee..90c578f 100644
--- a/libavcodec/mips/vp3dsp_idct_msa.c
+++ b/libavcodec/mips/vp3dsp_idct_msa.c
@@ -187,14 +187,7 @@  static void idct_msa(uint8_t *dst, int stride, int16_t *input, int type)
         G += c5;
         H += c6;
     }
-    A = CLIP_SW_0_255(A);
-    B = CLIP_SW_0_255(B);
-    C = CLIP_SW_0_255(C);
-    D = CLIP_SW_0_255(D);
-    E = CLIP_SW_0_255(E);
-    F = CLIP_SW_0_255(F);
-    G = CLIP_SW_0_255(G);
-    H = CLIP_SW_0_255(H);
+    CLIP_SW8_0_255(A, B, C, D, E, F, G, H);
     sign_l = __msa_or_v((v16u8)r1_r, (v16u8)r2_r);
     sign_l = __msa_or_v(sign_l, (v16u8)r3_r);
     sign_l = __msa_or_v(sign_l, (v16u8)r0_l);
@@ -205,7 +198,7 @@  static void idct_msa(uint8_t *dst, int stride, int16_t *input, int type)
     Add = ((r0_r * cnst46341w) + (8 << 16)) >> 20;
     if (type == 1) {
         Bdd = Add + cnst128w;
-        Bdd = CLIP_SW_0_255(Bdd);
+        CLIP_SW_0_255(Bdd);
         Ad = Bdd;
         Bd = Bdd;
         Cd = Bdd;
@@ -223,14 +216,7 @@  static void idct_msa(uint8_t *dst, int stride, int16_t *input, int type)
         Fd = Add + c5;
         Gd = Add + c6;
         Hd = Add + c7;
-        Ad = CLIP_SW_0_255(Ad);
-        Bd = CLIP_SW_0_255(Bd);
-        Cd = CLIP_SW_0_255(Cd);
-        Dd = CLIP_SW_0_255(Dd);
-        Ed = CLIP_SW_0_255(Ed);
-        Fd = CLIP_SW_0_255(Fd);
-        Gd = CLIP_SW_0_255(Gd);
-        Hd = CLIP_SW_0_255(Hd);
+        CLIP_SW8_0_255(Ad, Bd, Cd, Dd, Ed, Fd, Gd, Hd);
     }
     Ad = (v4i32)__msa_and_v((v16u8)Ad, (v16u8)sign_t);
     Bd = (v4i32)__msa_and_v((v16u8)Bd, (v16u8)sign_t);
@@ -309,14 +295,7 @@  static void idct_msa(uint8_t *dst, int stride, int16_t *input, int type)
         G += c5;
         H += c6;
     }
-    A = CLIP_SW_0_255(A);
-    B = CLIP_SW_0_255(B);
-    C = CLIP_SW_0_255(C);
-    D = CLIP_SW_0_255(D);
-    E = CLIP_SW_0_255(E);
-    F = CLIP_SW_0_255(F);
-    G = CLIP_SW_0_255(G);
-    H = CLIP_SW_0_255(H);
+    CLIP_SW8_0_255(A, B, C, D, E, F, G, H);
     sign_l = __msa_or_v((v16u8)r5_r, (v16u8)r6_r);
     sign_l = __msa_or_v(sign_l, (v16u8)r7_r);
     sign_l = __msa_or_v(sign_l, (v16u8)r4_l);
@@ -327,7 +306,7 @@  static void idct_msa(uint8_t *dst, int stride, int16_t *input, int type)
     Add = ((r4_r * cnst46341w) + (8 << 16)) >> 20;
     if (type == 1) {
         Bdd = Add + cnst128w;
-        Bdd = CLIP_SW_0_255(Bdd);
+        CLIP_SW_0_255(Bdd);
         Ad = Bdd;
         Bd = Bdd;
         Cd = Bdd;
@@ -345,14 +324,7 @@  static void idct_msa(uint8_t *dst, int stride, int16_t *input, int type)
         Fd = Add + c5;
         Gd = Add + c6;
         Hd = Add + c7;
-        Ad = CLIP_SW_0_255(Ad);
-        Bd = CLIP_SW_0_255(Bd);
-        Cd = CLIP_SW_0_255(Cd);
-        Dd = CLIP_SW_0_255(Dd);
-        Ed = CLIP_SW_0_255(Ed);
-        Fd = CLIP_SW_0_255(Fd);
-        Gd = CLIP_SW_0_255(Gd);
-        Hd = CLIP_SW_0_255(Hd);
+        CLIP_SW8_0_255(Ad, Bd, Cd, Dd, Ed, Fd, Gd, Hd);
     }
     Ad = (v4i32)__msa_and_v((v16u8)Ad, (v16u8)sign_t);
     Bd = (v4i32)__msa_and_v((v16u8)Bd, (v16u8)sign_t);
@@ -436,14 +408,7 @@  void ff_vp3_idct_dc_add_msa(uint8_t *dest, ptrdiff_t line_size, int16_t *block)
     e5 += dc;
     e6 += dc;
     e7 += dc;
-    e0 = CLIP_SW_0_255(e0);
-    e1 = CLIP_SW_0_255(e1);
-    e2 = CLIP_SW_0_255(e2);
-    e3 = CLIP_SW_0_255(e3);
-    e4 = CLIP_SW_0_255(e4);
-    e5 = CLIP_SW_0_255(e5);
-    e6 = CLIP_SW_0_255(e6);
-    e7 = CLIP_SW_0_255(e7);
+    CLIP_SW8_0_255(e0, e1, e2, e3, e4, e5, e6, e7);
 
     /* Left part */
     ILVL_H4_SW(zero, c0, zero, c1, zero, c2, zero, c3,
@@ -458,14 +423,7 @@  void ff_vp3_idct_dc_add_msa(uint8_t *dest, ptrdiff_t line_size, int16_t *block)
     r5 += dc;
     r6 += dc;
     r7 += dc;
-    r0 = CLIP_SW_0_255(r0);
-    r1 = CLIP_SW_0_255(r1);
-    r2 = CLIP_SW_0_255(r2);
-    r3 = CLIP_SW_0_255(r3);
-    r4 = CLIP_SW_0_255(r4);
-    r5 = CLIP_SW_0_255(r5);
-    r6 = CLIP_SW_0_255(r6);
-    r7 = CLIP_SW_0_255(r7);
+    CLIP_SW8_0_255(r0, r1, r2, r3, r4, r5, r6, r7);
     VSHF_B2_SB(e0, r0, e1, r1, mask, mask, d0, d1);
     VSHF_B2_SB(e2, r2, e3, r3, mask, mask, d2, d3);
     VSHF_B2_SB(e4, r4, e5, r5, mask, mask, d4, d5);
@@ -516,10 +474,7 @@  void ff_vp3_v_loop_filter_msa(uint8_t *first_pixel, ptrdiff_t stride,
     f1 += e1;
     g0 -= e0;
     g1 -= e1;
-    f0 = CLIP_SW_0_255(f0);
-    f1 = CLIP_SW_0_255(f1);
-    g0 = CLIP_SW_0_255(g0);
-    g1 = CLIP_SW_0_255(g1);
+    CLIP_SW4_0_255(f0, f1, g0, g1);
     VSHF_B2_SB(f0, f1, g0, g1, mask, mask, d1, d2);
 
     /* Final move to first_pixel */
@@ -563,10 +518,7 @@  void ff_vp3_h_loop_filter_msa(uint8_t *first_pixel, ptrdiff_t stride,
     f1 += e1;
     g0 -= e0;
     g1 -= e1;
-    f0 = CLIP_SW_0_255(f0);
-    f1 = CLIP_SW_0_255(f1);
-    g0 = CLIP_SW_0_255(g0);
-    g1 = CLIP_SW_0_255(g1);
+    CLIP_SW4_0_255(f0, f1, g0, g1);
     VSHF_B2_SB(f0, g0, f1, g1, mask, mask, d1, d2);
     /* Final move to first_pixel */
     ST_H4(d1, 0, 1, 2, 3, first_pixel - 1, stride);
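The call-site edits above follow from CLIP_SW_0_255 becoming an in-place statement macro instead of a GNU statement expression; a minimal usage sketch, assuming a v4i32 lvalue named Bdd:

    v4i32 Bdd;           /* sum to be clamped into [0, 255]   */
    /* old: Bdd = CLIP_SW_0_255(Bdd);   (returned a value)    */
    CLIP_SW_0_255(Bdd);  /* new: clips Bdd in place           */

One consequence of the new form is that the argument must be a modifiable lvalue; an expression such as CLIP_SW_0_255(a + b) would no longer compile.
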
diff --git a/libavcodec/mips/vp8_idct_msa.c b/libavcodec/mips/vp8_idct_msa.c
index ae6fec0..ce37ca1 100644
--- a/libavcodec/mips/vp8_idct_msa.c
+++ b/libavcodec/mips/vp8_idct_msa.c
@@ -71,10 +71,7 @@  void ff_vp8_idct_add_msa(uint8_t *dst, int16_t input[16], ptrdiff_t stride)
     ILVR_H4_SW(zero, res0, zero, res1, zero, res2, zero, res3,
                res0, res1, res2, res3);
     ADD4(res0, vt0, res1, vt1, res2, vt2, res3, vt3, res0, res1, res2, res3);
-    res0 = CLIP_SW_0_255(res0);
-    res1 = CLIP_SW_0_255(res1);
-    res2 = CLIP_SW_0_255(res2);
-    res3 = CLIP_SW_0_255(res3);
+    CLIP_SW4_0_255(res0, res1, res2, res3);
     VSHF_B2_SB(res0, res1, res2, res3, mask, mask, dest0, dest1);
     ST_W2(dest0, 0, 1, dst, stride);
     ST_W2(dest1, 0, 1, dst + 2 * stride, stride);
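As with the halfword variants, the grouped word clips used above are thin wrappers that simply nest the pairwise form (see the definitions added to generic_macros_msa.h below), e.g.:

    CLIP_SW4_0_255(res0, res1, res2, res3);
    /* expands to: CLIP_SW2_0_255(res0, res1);          */
    /*             CLIP_SW2_0_255(res2, res3);          */
    /* i.e. four independent in-place vector clamps,    */
    /* with no cross-lane interaction.                  */
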
diff --git a/libavcodec/mips/vp9_idct_msa.c b/libavcodec/mips/vp9_idct_msa.c
index 924b83d..53bfbb4 100644
--- a/libavcodec/mips/vp9_idct_msa.c
+++ b/libavcodec/mips/vp9_idct_msa.c
@@ -764,13 +764,13 @@  static void vp9_iadst8x8_colcol_addblk_msa(int16_t *input, uint8_t *dst,
 
     res0 = (v8i16) __msa_ilvr_b((v16i8) zero, (v16i8) dst0);
     res0 += out0;
-    res0 = CLIP_SH_0_255(res0);
+    CLIP_SH_0_255(res0);
     res0 = (v8i16) __msa_pckev_b((v16i8) res0, (v16i8) res0);
     ST_D1(res0, 0, dst);
 
     res7 = (v8i16) __msa_ilvr_b((v16i8) zero, (v16i8) dst7);
     res7 += out7;
-    res7 = CLIP_SH_0_255(res7);
+    CLIP_SH_0_255(res7);
     res7 = (v8i16) __msa_pckev_b((v16i8) res7, (v16i8) res7);
     ST_D1(res7, 0, dst + 7 * dst_stride);
 
@@ -1193,8 +1193,7 @@  static void vp9_idct16x16_1_add_msa(int16_t *input, uint8_t *dst,
              res3);
         ADD4(res4, vec, res5, vec, res6, vec, res7, vec, res4, res5, res6,
              res7);
-        CLIP_SH4_0_255(res0, res1, res2, res3);
-        CLIP_SH4_0_255(res4, res5, res6, res7);
+        CLIP_SH8_0_255(res0, res1, res2, res3, res4, res5, res6, res7);
         PCKEV_B4_UB(res4, res0, res5, res1, res6, res2, res7, res3,
                     tmp0, tmp1, tmp2, tmp3);
         ST_UB4(tmp0, tmp1, tmp2, tmp3, dst, dst_stride);
@@ -1982,8 +1981,7 @@  static void vp9_idct32x32_1_add_msa(int16_t *input, uint8_t *dst,
              res3);
         ADD4(res4, vec, res5, vec, res6, vec, res7, vec, res4, res5, res6,
              res7);
-        CLIP_SH4_0_255(res0, res1, res2, res3);
-        CLIP_SH4_0_255(res4, res5, res6, res7);
+        CLIP_SH8_0_255(res0, res1, res2, res3, res4, res5, res6, res7);
         PCKEV_B4_UB(res4, res0, res5, res1, res6, res2, res7, res3,
                     tmp0, tmp1, tmp2, tmp3);
 
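Point 3 of the commit message states the *_MAX_SATU variants were behaviorally identical to the plain clips. A hypothetical scalar self-check of that claim (exhaustive over int16_t, not part of the patch), comparing the old max/min clamp against the new max + unsigned-saturate sequence:

    #include <assert.h>
    #include <stdint.h>

    int main(void)
    {
        for (int32_t v = INT16_MIN; v <= INT16_MAX; v++) {
            /* old CLIP_SH_0_255: max with 0, then min with 255 */
            int16_t old_way = v < 0 ? 0 : (v > 255 ? 255 : (int16_t) v);
            /* new CLIP_SH_0_255: max with 0, then 8-bit unsigned saturate */
            int16_t new_way = (int16_t) v;
            if (new_way < 0)
                new_way = 0;
            if ((uint16_t) new_way > 255)
                new_way = 255;
            assert(old_way == new_way);
        }
        return 0;
    }
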
diff --git a/libavutil/mips/generic_macros_msa.h b/libavutil/mips/generic_macros_msa.h
index 528f45e..0061dc4 100644
--- a/libavutil/mips/generic_macros_msa.h
+++ b/libavutil/mips/generic_macros_msa.h
@@ -914,99 +914,78 @@ 
 
 /* Description : Clips all halfword elements of input vector between min & max
                  out = ((in) < (min)) ? (min) : (((in) > (max)) ? (max) : (in))
-   Arguments   : Inputs  - in       (input vector)
-                         - min      (min threshold)
-                         - max      (max threshold)
-                 Outputs - out_m    (output vector with clipped elements)
+   Arguments   : Inputs  - in    (input vector)
+                         - min   (min threshold)
+                         - max   (max threshold)
+                 Outputs - in    (output vector with clipped elements)
                  Return Type - signed halfword
 */
-#define CLIP_SH(in, min, max)                           \
-( {                                                     \
-    v8i16 out_m;                                        \
-                                                        \
-    out_m = __msa_max_s_h((v8i16) min, (v8i16) in);     \
-    out_m = __msa_min_s_h((v8i16) max, (v8i16) out_m);  \
-    out_m;                                              \
-} )
+#define CLIP_SH(in, min, max)                     \
+{                                                 \
+    in = __msa_max_s_h((v8i16) min, (v8i16) in);  \
+    in = __msa_min_s_h((v8i16) max, (v8i16) in);  \
+}
 
 /* Description : Clips all signed halfword elements of input vector
                  between 0 & 255
-   Arguments   : Inputs  - in       (input vector)
-                 Outputs - out_m    (output vector with clipped elements)
-                 Return Type - signed halfword
+   Arguments   : Inputs  - in    (input vector)
+                 Outputs - in    (output vector with clipped elements)
+                 Return Type - signed halfword
 */
-#define CLIP_SH_0_255(in)                                 \
-( {                                                       \
-    v8i16 max_m = __msa_ldi_h(255);                       \
-    v8i16 out_m;                                          \
-                                                          \
-    out_m = __msa_maxi_s_h((v8i16) in, 0);                \
-    out_m = __msa_min_s_h((v8i16) max_m, (v8i16) out_m);  \
-    out_m;                                                \
-} )
+#define CLIP_SH_0_255(in)                       \
+{                                               \
+    in = __msa_maxi_s_h((v8i16) in, 0);         \
+    in = (v8i16) __msa_sat_u_h((v8u16) in, 7);  \
+}
+
 #define CLIP_SH2_0_255(in0, in1)  \
 {                                 \
-    in0 = CLIP_SH_0_255(in0);     \
-    in1 = CLIP_SH_0_255(in1);     \
+    CLIP_SH_0_255(in0);           \
+    CLIP_SH_0_255(in1);           \
 }
+
 #define CLIP_SH4_0_255(in0, in1, in2, in3)  \
 {                                           \
     CLIP_SH2_0_255(in0, in1);               \
     CLIP_SH2_0_255(in2, in3);               \
 }
 
-#define CLIP_SH_0_255_MAX_SATU(in)                    \
-( {                                                   \
-    v8i16 out_m;                                      \
-                                                      \
-    out_m = __msa_maxi_s_h((v8i16) in, 0);            \
-    out_m = (v8i16) __msa_sat_u_h((v8u16) out_m, 7);  \
-    out_m;                                            \
-} )
-#define CLIP_SH2_0_255_MAX_SATU(in0, in1)  \
-{                                          \
-    in0 = CLIP_SH_0_255_MAX_SATU(in0);     \
-    in1 = CLIP_SH_0_255_MAX_SATU(in1);     \
-}
-#define CLIP_SH4_0_255_MAX_SATU(in0, in1, in2, in3)  \
-{                                                    \
-    CLIP_SH2_0_255_MAX_SATU(in0, in1);               \
-    CLIP_SH2_0_255_MAX_SATU(in2, in3);               \
+#define CLIP_SH8_0_255(in0, in1, in2, in3,  \
+                       in4, in5, in6, in7)  \
+{                                           \
+    CLIP_SH4_0_255(in0, in1, in2, in3);     \
+    CLIP_SH4_0_255(in4, in5, in6, in7);     \
 }
 
 /* Description : Clips all signed word elements of input vector
                  between 0 & 255
-   Arguments   : Inputs  - in       (input vector)
-                 Outputs - out_m    (output vector with clipped elements)
+   Arguments   : Inputs  - in    (input vector)
+                 Outputs - in    (output vector with clipped elements)
                  Return Type - signed word
 */
-#define CLIP_SW_0_255(in)                                 \
-( {                                                       \
-    v4i32 max_m = __msa_ldi_w(255);                       \
-    v4i32 out_m;                                          \
-                                                          \
-    out_m = __msa_maxi_s_w((v4i32) in, 0);                \
-    out_m = __msa_min_s_w((v4i32) max_m, (v4i32) out_m);  \
-    out_m;                                                \
-} )
+#define CLIP_SW_0_255(in)                       \
+{                                               \
+    in = __msa_maxi_s_w((v4i32) in, 0);         \
+    in = (v4i32) __msa_sat_u_w((v4u32) in, 7);  \
+}
 
-#define CLIP_SW_0_255_MAX_SATU(in)                    \
-( {                                                   \
-    v4i32 out_m;                                      \
-                                                      \
-    out_m = __msa_maxi_s_w((v4i32) in, 0);            \
-    out_m = (v4i32) __msa_sat_u_w((v4u32) out_m, 7);  \
-    out_m;                                            \
-} )
-#define CLIP_SW2_0_255_MAX_SATU(in0, in1)  \
-{                                          \
-    in0 = CLIP_SW_0_255_MAX_SATU(in0);     \
-    in1 = CLIP_SW_0_255_MAX_SATU(in1);     \
+#define CLIP_SW2_0_255(in0, in1)  \
+{                                 \
+    CLIP_SW_0_255(in0);           \
+    CLIP_SW_0_255(in1);           \
 }
-#define CLIP_SW4_0_255_MAX_SATU(in0, in1, in2, in3)  \
-{                                                    \
-    CLIP_SW2_0_255_MAX_SATU(in0, in1);               \
-    CLIP_SW2_0_255_MAX_SATU(in2, in3);               \
+
+#define CLIP_SW4_0_255(in0, in1, in2, in3)  \
+{                                           \
+    CLIP_SW2_0_255(in0, in1);               \
+    CLIP_SW2_0_255(in2, in3);               \
+}
+
+#define CLIP_SW8_0_255(in0, in1, in2, in3,  \
+                       in4, in5, in6, in7)  \
+{                                           \
+    CLIP_SW4_0_255(in0, in1, in2, in3);     \
+    CLIP_SW4_0_255(in4, in5, in6, in7);     \
 }
 
 /* Description : Addition of 4 signed word elements