diff mbox

[FFmpeg-devel,2/2] avutil/mips: [loongson] simplify macro TRANSPOSE_4H and TRANSPOSE_8B

Message ID 1536221453-15372-2-git-send-email-yinshiyou-hf@loongson.cn
State Accepted
Commit 5161f7bcfd3c2d2e6cb92e782855b7fc00bdf877
Headers show

Commit Message

Shiyou Yin Sept. 6, 2018, 8:10 a.m. UTC
Simplify macro TRANSPOSE_4H in mmiutils.h and add TRANSPOSE_8B as a common macro.
---
 libavcodec/mips/vc1dsp_mmi.c | 12 +++----
 libavcodec/mips/vp8dsp_mmi.c | 72 +++++--------------------------------
 libavutil/mips/mmiutils.h    | 84 ++++++++++++++++++++++++++++----------------
 3 files changed, 65 insertions(+), 103 deletions(-)

Comments

Shiyou Yin Sept. 7, 2018, 3:51 a.m. UTC | #1
>-----Original Message-----
>From: ffmpeg-devel-bounces@ffmpeg.org [mailto:ffmpeg-devel-bounces@ffmpeg.org] On Behalf Of
>Shiyou Yin
>Sent: Thursday, September 6, 2018 4:11 PM
>To: ffmpeg-devel@ffmpeg.org
>Subject: [FFmpeg-devel] [PATCH 2/2] avutil/mips: [loongson] simplify macro TRANSPOSE_4H and
>TRANSPOSE_8B
>
>Simplify macro TRANSPOSE_4H in mmiutils.h and add TRANSPOSE_8B as a common macro.
>---
> libavcodec/mips/vc1dsp_mmi.c | 12 +++----
> libavcodec/mips/vp8dsp_mmi.c | 72 +++++--------------------------------
> libavutil/mips/mmiutils.h    | 84 ++++++++++++++++++++++++++++----------------
> 3 files changed, 65 insertions(+), 103 deletions(-)
>
>diff --git a/libavcodec/mips/vc1dsp_mmi.c b/libavcodec/mips/vc1dsp_mmi.c
>index a439b40..80778a5 100644
>--- a/libavcodec/mips/vc1dsp_mmi.c
>+++ b/libavcodec/mips/vc1dsp_mmi.c
>@@ -248,8 +248,7 @@ void ff_vc1_inv_trans_8x8_mmi(int16_t block[64])
>                                0xfff70004, 0xfff0000f, %[ff_pw_4])
>
>         TRANSPOSE_4H(%[ftmp15], %[ftmp16], %[ftmp17], %[ftmp18],
>-                     %[ftmp1], %[ftmp2], %[ftmp3], %[ftmp4],
>-                     %[ftmp5], %[tmp0],  %[ftmp6], %[ftmp7])
>+                     %[ftmp1], %[ftmp2], %[ftmp3], %[ftmp4])
>
>         MMI_SDC1(%[ftmp15], %[dst], 0x00)
>         MMI_SDC1(%[ftmp16], %[dst], 0x10)
>@@ -257,8 +256,7 @@ void ff_vc1_inv_trans_8x8_mmi(int16_t block[64])
>         MMI_SDC1(%[ftmp18], %[dst], 0x30)
>
>         TRANSPOSE_4H(%[ftmp19], %[ftmp20], %[ftmp21], %[ftmp22],
>-                     %[ftmp1], %[ftmp2], %[ftmp3], %[ftmp4],
>-                     %[ftmp5], %[tmp0],  %[ftmp6], %[ftmp7])
>+                     %[ftmp1], %[ftmp2], %[ftmp3], %[ftmp4])
>
>         MMI_SDC1(%[ftmp19], %[dst], 0x08)
>         MMI_SDC1(%[ftmp20], %[dst], 0x18)
>@@ -301,8 +299,7 @@ void ff_vc1_inv_trans_8x8_mmi(int16_t block[64])
>                                0xfff70004, 0xfff0000f, %[ff_pw_4])
>
>         TRANSPOSE_4H(%[ftmp15], %[ftmp16], %[ftmp17], %[ftmp18],
>-                     %[ftmp1], %[ftmp2], %[ftmp3], %[ftmp4],
>-                     %[ftmp5], %[tmp0],  %[ftmp6], %[ftmp7])
>+                     %[ftmp1], %[ftmp2], %[ftmp3], %[ftmp4])
>
>         MMI_SDC1(%[ftmp15], %[dst], 0x40)
>         MMI_SDC1(%[ftmp16], %[dst], 0x50)
>@@ -310,8 +307,7 @@ void ff_vc1_inv_trans_8x8_mmi(int16_t block[64])
>         MMI_SDC1(%[ftmp18], %[dst], 0x70)
>
>         TRANSPOSE_4H(%[ftmp19], %[ftmp20], %[ftmp21], %[ftmp22],
>-                     %[ftmp1], %[ftmp2], %[ftmp3], %[ftmp4],
>-                     %[ftmp5], %[tmp0],  %[ftmp6], %[ftmp7])
>+                     %[ftmp1], %[ftmp2], %[ftmp3], %[ftmp4])
>
>         MMI_SDC1(%[ftmp19], %[dst], 0x48)
>         MMI_SDC1(%[ftmp20], %[dst], 0x58)
>diff --git a/libavcodec/mips/vp8dsp_mmi.c b/libavcodec/mips/vp8dsp_mmi.c
>index b24a87a..bd80aa1 100644
>--- a/libavcodec/mips/vp8dsp_mmi.c
>+++ b/libavcodec/mips/vp8dsp_mmi.c
>@@ -44,58 +44,6 @@
>         "punpcklbh  "#dst_r",   "#src",         %[db_2]             \n\t"   \
>         "punpckhbh  "#dst_l",   "#src",         %[db_2]             \n\t"
>
>-#define MMI_TRANSPOSE8x8_UB_UB(src_0, src_1, src_2, src_3,                  \
>-                               src_4, src_5, src_6, src_7,                  \
>-                               dst_0, dst_1, dst_2, dst_3,                  \
>-                               dst_4, dst_5, dst_6, dst_7)                  \
>-        "li         %[it_1],    0xe4                                \n\t"   \
>-        "dmtc1      %[it_1],    %[db_1]                             \n\t"   \
>-        "pshufh     %[db_2],    "#src_0",       %[db_1]             \n\t"   \
>-        "punpcklbh  "#dst_0",   "#src_0",       "#src_1"            \n\t"   \
>-        "punpckhbh  "#dst_1",   %[db_2],        "#src_1"            \n\t"   \
>-        "pshufh     %[db_2],    "#src_2",       %[db_1]             \n\t"   \
>-        "punpcklbh  "#dst_2",   "#src_2",       "#src_3"            \n\t"   \
>-        "punpckhbh  "#dst_3",   %[db_2],        "#src_3"            \n\t"   \
>-        "pshufh     %[db_2],    "#src_4",       %[db_1]             \n\t"   \
>-        "punpcklbh  "#dst_4",   "#src_4",       "#src_5"            \n\t"   \
>-        "punpckhbh  "#dst_5",   %[db_2],        "#src_5"            \n\t"   \
>-        "pshufh     %[db_2],    "#src_6",       %[db_1]             \n\t"   \
>-        "punpcklbh  "#dst_6",   "#src_6",       "#src_7"            \n\t"   \
>-        "punpckhbh  "#dst_7",   %[db_2],        "#src_7"            \n\t"   \
>-                                                                            \
>-        "pshufh     %[db_2],    "#dst_0",       %[db_1]             \n\t"   \
>-        "punpcklhw  "#dst_0",   "#dst_0",       "#dst_2"            \n\t"   \
>-        "punpckhhw  "#dst_2",   %[db_2],        "#dst_2"            \n\t"   \
>-        "pshufh     %[db_2],    "#dst_1",       %[db_1]             \n\t"   \
>-        "punpcklhw  "#dst_1",   "#dst_1",       "#dst_3"            \n\t"   \
>-        "punpckhhw  "#dst_3",   %[db_2],        "#dst_3"            \n\t"   \
>-        "pshufh     %[db_2],    "#dst_4",       %[db_1]             \n\t"   \
>-        "punpcklhw  "#dst_4",   "#dst_4",       "#dst_6"            \n\t"   \
>-        "punpckhhw  "#dst_6",   %[db_2],        "#dst_6"            \n\t"   \
>-        "pshufh     %[db_2],    "#dst_5",       %[db_1]             \n\t"   \
>-        "punpcklhw  "#dst_5",   "#dst_5",       "#dst_7"            \n\t"   \
>-        "punpckhhw  "#dst_7",   %[db_2],        "#dst_7"            \n\t"   \
>-                                                                            \
>-        "pshufh     %[db_2],    "#dst_0",       %[db_1]             \n\t"   \
>-        "punpcklwd  "#dst_0",   "#dst_0",       "#dst_4"            \n\t"   \
>-        "punpckhwd  "#dst_4",   %[db_2],        "#dst_4"            \n\t"   \
>-        "pshufh     %[db_2],    "#dst_1",       %[db_1]             \n\t"   \
>-        "punpcklwd  "#dst_1",   "#dst_1",       "#dst_5"            \n\t"   \
>-        "punpckhwd  "#dst_5",   %[db_2],        "#dst_5"            \n\t"   \
>-        "pshufh     %[db_2],    "#dst_2",       %[db_1]             \n\t"   \
>-        "punpcklwd  "#dst_2",   "#dst_2",       "#dst_6"            \n\t"   \
>-        "punpckhwd  "#dst_6",   %[db_2],        "#dst_6"            \n\t"   \
>-        "pshufh     %[db_2],    "#dst_3",       %[db_1]             \n\t"   \
>-        "punpcklwd  "#dst_3",   "#dst_3",       "#dst_7"            \n\t"   \
>-        "punpckhwd  "#dst_7",   %[db_2],        "#dst_7"            \n\t"   \
>-                                                                            \
>-        "pshufh     %[db_2],    "#dst_1",       %[db_1]             \n\t"   \
>-        "pshufh     "#dst_1",   "#dst_4",       %[db_1]             \n\t"   \
>-        "pshufh     "#dst_4",   %[db_2],        %[db_1]             \n\t"   \
>-        "pshufh     %[db_2],    "#dst_3",       %[db_1]             \n\t"   \
>-        "pshufh     "#dst_3",   "#dst_6",       %[db_1]             \n\t"   \
>-        "pshufh     "#dst_6",   %[db_2],        %[db_1]             \n\t"
>-
> #define MMI_VP8_LOOP_FILTER                                                 \
>         /* Calculation of hev */                                            \
>         "dmtc1      %[thresh],  %[ftmp3]                            \n\t"   \
>@@ -952,16 +900,14 @@ static av_always_inline void vp8_h_loop_filter8_mmi(uint8_t *dst,
>         "gsldlc1    %[q3],        0x03(%[tmp0])                   \n\t"
>         "gsldrc1    %[q3],        -0x04(%[tmp0])                  \n\t"
>         /* Matrix transpose */
>-        MMI_TRANSPOSE8x8_UB_UB(%[p3], %[p2], %[p1], %[p0],
>-                               %[q0], %[q1], %[q2], %[q3],
>-                               %[p3], %[p2], %[p1], %[p0],
>-                               %[q0], %[q1], %[q2], %[q3])
>+        TRANSPOSE_8B(%[p3], %[p2], %[p1], %[p0],
>+                     %[q0], %[q1], %[q2], %[q3],
>+                     %[ftmp1], %[ftmp2], %[ftmp3], %[ftmp4])
>         MMI_VP8_LOOP_FILTER
>         /* Matrix transpose */
>-        MMI_TRANSPOSE8x8_UB_UB(%[p3], %[p2], %[p1], %[p0],
>-                               %[q0], %[q1], %[q2], %[q3],
>-                               %[p3], %[p2], %[p1], %[p0],
>-                               %[q0], %[q1], %[q2], %[q3])
>+        TRANSPOSE_8B(%[p3], %[p2], %[p1], %[p0],
>+                     %[q0], %[q1], %[q2], %[q3],
>+                     %[ftmp1], %[ftmp2], %[ftmp3], %[ftmp4])
>         /* Move to dst */
>         "gssdlc1    %[p3],        0x03(%[dst])                    \n\t"
>         "gssdrc1    %[p3],        -0x04(%[dst])                   \n\t"
>@@ -1233,8 +1179,7 @@ void ff_vp8_idct_add_mmi(uint8_t *dst, int16_t block[16], ptrdiff_t stride)
>         MMI_SDC1(%[ftmp0], %[block], 0x18)
>
>         TRANSPOSE_4H(%[ftmp1], %[ftmp2], %[ftmp3], %[ftmp4],
>-                     %[ftmp5], %[ftmp6], %[ftmp7], %[ftmp8],
>-                     %[ftmp9], %[tmp0],  %[ftmp0], %[ftmp10])
>+                     %[ftmp5], %[ftmp6], %[ftmp7], %[ftmp8])
>
>         // t[0 4  8 12]
>         "paddh      %[ftmp5],   %[ftmp1],       %[ftmp3]            \n\t"
>@@ -1269,8 +1214,7 @@ void ff_vp8_idct_add_mmi(uint8_t *dst, int16_t block[16], ptrdiff_t stride)
>         "psrah      %[ftmp4],   %[ftmp4],       %[ftmp11]           \n\t"
>
>         TRANSPOSE_4H(%[ftmp1], %[ftmp2], %[ftmp3], %[ftmp4],
>-                     %[ftmp5], %[ftmp6], %[ftmp7], %[ftmp8],
>-                     %[ftmp9], %[tmp0],  %[ftmp0], %[ftmp10])
>+                     %[ftmp5], %[ftmp6], %[ftmp7], %[ftmp8])
>
>         MMI_LWC1(%[ftmp5], %[dst0], 0x00)
>         MMI_LWC1(%[ftmp6], %[dst1], 0x00)
>diff --git a/libavutil/mips/mmiutils.h b/libavutil/mips/mmiutils.h
>index b16edc4..76b1199 100644
>--- a/libavutil/mips/mmiutils.h
>+++ b/libavutil/mips/mmiutils.h
>@@ -250,30 +250,53 @@
>       : "memory"                                                \
>     );
>
>-#define TRANSPOSE_4H(m1, m2, m3, m4, t1, t2, t3, t4, t5, r1, zero, shift) \
>-        "li         "#r1",  0x93                                    \n\t" \
>-        "xor        "#zero","#zero","#zero"                         \n\t" \
>-        "mtc1       "#r1",  "#shift"                                \n\t" \
>-        "punpcklhw  "#t1",  "#m1",  "#zero"                         \n\t" \
>-        "punpcklhw  "#t5",  "#m2",  "#zero"                         \n\t" \
>-        "pshufh     "#t5",  "#t5",  "#shift"                        \n\t" \
>-        "or         "#t1",  "#t1",  "#t5"                           \n\t" \
>-        "punpckhhw  "#t2",  "#m1",  "#zero"                         \n\t" \
>-        "punpckhhw  "#t5",  "#m2",  "#zero"                         \n\t" \
>-        "pshufh     "#t5",  "#t5",  "#shift"                        \n\t" \
>-        "or         "#t2",  "#t2",  "#t5"                           \n\t" \
>-        "punpcklhw  "#t3",  "#m3",  "#zero"                         \n\t" \
>-        "punpcklhw  "#t5",  "#m4",  "#zero"                         \n\t" \
>-        "pshufh     "#t5",  "#t5",  "#shift"                        \n\t" \
>-        "or         "#t3",  "#t3",  "#t5"                           \n\t" \
>-        "punpckhhw  "#t4",  "#m3",  "#zero"                         \n\t" \
>-        "punpckhhw  "#t5",  "#m4",  "#zero"                         \n\t" \
>-        "pshufh     "#t5",  "#t5",  "#shift"                        \n\t" \
>-        "or         "#t4",  "#t4",  "#t5"                           \n\t" \
>-        "punpcklwd  "#m1",  "#t1",  "#t3"                           \n\t" \
>-        "punpckhwd  "#m2",  "#t1",  "#t3"                           \n\t" \
>-        "punpcklwd  "#m3",  "#t2",  "#t4"                           \n\t" \
>-        "punpckhwd  "#m4",  "#t2",  "#t4"                           \n\t"
>+/**
>+ * brief: Transpose 4X4 half word packaged data.
>+ * fr_i0, fr_i1, fr_i2, fr_i3: src & dst
>+ * fr_t0, fr_t1, fr_t2, fr_t3: temporary register
>+ */
>+#define TRANSPOSE_4H(fr_i0, fr_i1, fr_i2, fr_i3,                          \
>+                     fr_t0, fr_t1, fr_t2, fr_t3)                          \
>+        "punpcklhw  "#fr_t0",   "#fr_i0",   "#fr_i1"                \n\t" \
>+        "punpckhhw  "#fr_t1",   "#fr_i0",   "#fr_i1"                \n\t" \
>+        "punpcklhw  "#fr_t2",   "#fr_i2",   "#fr_i3"                \n\t" \
>+        "punpckhhw  "#fr_t3",   "#fr_i2",   "#fr_i3"                \n\t" \
>+        "punpcklwd  "#fr_i0",   "#fr_t0",   "#fr_t2"                \n\t" \
>+        "punpckhwd  "#fr_i1",   "#fr_t0",   "#fr_t2"                \n\t" \
>+        "punpcklwd  "#fr_i2",   "#fr_t1",   "#fr_t3"                \n\t" \
>+        "punpckhwd  "#fr_i3",   "#fr_t1",   "#fr_t3"                \n\t"
>+
>+/**
>+ * brief: Transpose 8x8 byte packaged data.
>+ * fr_i0~i7: src & dst
>+ * fr_t0~t3: temporary register
>+ */
>+#define TRANSPOSE_8B(fr_i0, fr_i1, fr_i2, fr_i3, fr_i4, fr_i5,            \
>+                     fr_i6, fr_i7, fr_t0, fr_t1, fr_t2, fr_t3)            \
>+        "punpcklbh  "#fr_t0",   "#fr_i0",   "#fr_i1"                \n\t" \
>+        "punpckhbh  "#fr_t1",   "#fr_i0",   "#fr_i1"                \n\t" \
>+        "punpcklbh  "#fr_t2",   "#fr_i2",   "#fr_i3"                \n\t" \
>+        "punpckhbh  "#fr_t3",   "#fr_i2",   "#fr_i3"                \n\t" \
>+        "punpcklbh  "#fr_i0",   "#fr_i4",   "#fr_i5"                \n\t" \
>+        "punpckhbh  "#fr_i1",   "#fr_i4",   "#fr_i5"                \n\t" \
>+        "punpcklbh  "#fr_i2",   "#fr_i6",   "#fr_i7"                \n\t" \
>+        "punpckhbh  "#fr_i3",   "#fr_i6",   "#fr_i7"                \n\t" \
>+        "punpcklhw  "#fr_i4",   "#fr_t0",   "#fr_t2"                \n\t" \
>+        "punpckhhw  "#fr_i5",   "#fr_t0",   "#fr_t2"                \n\t" \
>+        "punpcklhw  "#fr_i6",   "#fr_t1",   "#fr_t3"                \n\t" \
>+        "punpckhhw  "#fr_i7",   "#fr_t1",   "#fr_t3"                \n\t" \
>+        "punpcklhw  "#fr_t0",   "#fr_i0",   "#fr_i2"                \n\t" \
>+        "punpckhhw  "#fr_t1",   "#fr_i0",   "#fr_i2"                \n\t" \
>+        "punpcklhw  "#fr_t2",   "#fr_i1",   "#fr_i3"                \n\t" \
>+        "punpckhhw  "#fr_t3",   "#fr_i1",   "#fr_i3"                \n\t" \
>+        "punpcklwd  "#fr_i0",   "#fr_i4",   "#fr_t0"                \n\t" \
>+        "punpckhwd  "#fr_i1",   "#fr_i4",   "#fr_t0"                \n\t" \
>+        "punpcklwd  "#fr_i2",   "#fr_i5",   "#fr_t1"                \n\t" \
>+        "punpckhwd  "#fr_i3",   "#fr_i5",   "#fr_t1"                \n\t" \
>+        "punpcklwd  "#fr_i4",   "#fr_i6",   "#fr_t2"                \n\t" \
>+        "punpckhwd  "#fr_i5",   "#fr_i6",   "#fr_t2"                \n\t" \
>+        "punpcklwd  "#fr_i6",   "#fr_i7",   "#fr_t3"                \n\t" \
>+        "punpckhwd  "#fr_i7",   "#fr_i7",   "#fr_t3"                \n\t"
>
> /**
>  * brief: Parallel SRA for 8 byte packaged data.
>@@ -303,15 +326,14 @@
>         "psrlh        "#fr_t1",   "#fr_t1",   "#fr_i1"              \n\t" \
>         "packsshb     "#fr_d0",   "#fr_t0",   "#fr_t1"              \n\t"
>
>-
>-#define PSRAH_4_MMI(fp1, fp2, fp3, fp4, shift)                              \
>-        "psrah      "#fp1",     "#fp1",     "#shift"                \n\t"   \
>-        "psrah      "#fp2",     "#fp2",     "#shift"                \n\t"   \
>-        "psrah      "#fp3",     "#fp3",     "#shift"                \n\t"   \
>+#define PSRAH_4_MMI(fp1, fp2, fp3, fp4, shift)                            \
>+        "psrah      "#fp1",     "#fp1",     "#shift"                \n\t" \
>+        "psrah      "#fp2",     "#fp2",     "#shift"                \n\t" \
>+        "psrah      "#fp3",     "#fp3",     "#shift"                \n\t" \
>         "psrah      "#fp4",     "#fp4",     "#shift"                \n\t"
>
>-#define PSRAH_8_MMI(fp1, fp2, fp3, fp4, fp5, fp6, fp7, fp8, shift)          \
>-        PSRAH_4_MMI(fp1, fp2, fp3, fp4, shift)                              \
>+#define PSRAH_8_MMI(fp1, fp2, fp3, fp4, fp5, fp6, fp7, fp8, shift)        \
>+        PSRAH_4_MMI(fp1, fp2, fp3, fp4, shift)                            \
>         PSRAH_4_MMI(fp5, fp6, fp7, fp8, shift)
>
>
>--
>2.1.0

Hi Michael, could you please help review this patch?
BTW, this patch is based on the previous patch "[PATCH 2/2] avcodec/mips: [loongson] optimize vp8 decoding in vp8dsp.",
so that one should be merged first. Thank you very much.
Michael Niedermayer Sept. 9, 2018, 1:38 a.m. UTC | #2
On Fri, Sep 07, 2018 at 11:51:05AM +0800, Shiyou Yin wrote:
> >-----Original Message-----
> >From: ffmpeg-devel-bounces@ffmpeg.org [mailto:ffmpeg-devel-bounces@ffmpeg.org] On Behalf Of
> >Shiyou Yin
> >Sent: Thursday, September 6, 2018 4:11 PM
> >To: ffmpeg-devel@ffmpeg.org
> >Subject: [FFmpeg-devel] [PATCH 2/2] avutil/mips: [loongson] simplify macro TRANSPOSE_4H and
> >TRANSPOSE_8B
> >
> >Simplify macro TRANSPOSE_4H in mmiutils.h and add TRANSPOSE_8B as a common macro.
> >---
> > libavcodec/mips/vc1dsp_mmi.c | 12 +++----
> > libavcodec/mips/vp8dsp_mmi.c | 72 +++++--------------------------------
> > libavutil/mips/mmiutils.h    | 84 ++++++++++++++++++++++++++++----------------
> > 3 files changed, 65 insertions(+), 103 deletions(-)
> >
> >diff --git a/libavcodec/mips/vc1dsp_mmi.c b/libavcodec/mips/vc1dsp_mmi.c
> >index a439b40..80778a5 100644
> >--- a/libavcodec/mips/vc1dsp_mmi.c
> >+++ b/libavcodec/mips/vc1dsp_mmi.c
> >@@ -248,8 +248,7 @@ void ff_vc1_inv_trans_8x8_mmi(int16_t block[64])
> >                                0xfff70004, 0xfff0000f, %[ff_pw_4])
> >
> >         TRANSPOSE_4H(%[ftmp15], %[ftmp16], %[ftmp17], %[ftmp18],
> >-                     %[ftmp1], %[ftmp2], %[ftmp3], %[ftmp4],
> >-                     %[ftmp5], %[tmp0],  %[ftmp6], %[ftmp7])
> >+                     %[ftmp1], %[ftmp2], %[ftmp3], %[ftmp4])
> >
> >         MMI_SDC1(%[ftmp15], %[dst], 0x00)
> >         MMI_SDC1(%[ftmp16], %[dst], 0x10)
> >@@ -257,8 +256,7 @@ void ff_vc1_inv_trans_8x8_mmi(int16_t block[64])
> >         MMI_SDC1(%[ftmp18], %[dst], 0x30)
> >
> >         TRANSPOSE_4H(%[ftmp19], %[ftmp20], %[ftmp21], %[ftmp22],
> >-                     %[ftmp1], %[ftmp2], %[ftmp3], %[ftmp4],
> >-                     %[ftmp5], %[tmp0],  %[ftmp6], %[ftmp7])
> >+                     %[ftmp1], %[ftmp2], %[ftmp3], %[ftmp4])
> >
> >         MMI_SDC1(%[ftmp19], %[dst], 0x08)
> >         MMI_SDC1(%[ftmp20], %[dst], 0x18)
> >@@ -301,8 +299,7 @@ void ff_vc1_inv_trans_8x8_mmi(int16_t block[64])
> >                                0xfff70004, 0xfff0000f, %[ff_pw_4])
> >
> >         TRANSPOSE_4H(%[ftmp15], %[ftmp16], %[ftmp17], %[ftmp18],
> >-                     %[ftmp1], %[ftmp2], %[ftmp3], %[ftmp4],
> >-                     %[ftmp5], %[tmp0],  %[ftmp6], %[ftmp7])
> >+                     %[ftmp1], %[ftmp2], %[ftmp3], %[ftmp4])
> >
> >         MMI_SDC1(%[ftmp15], %[dst], 0x40)
> >         MMI_SDC1(%[ftmp16], %[dst], 0x50)
> >@@ -310,8 +307,7 @@ void ff_vc1_inv_trans_8x8_mmi(int16_t block[64])
> >         MMI_SDC1(%[ftmp18], %[dst], 0x70)
> >
> >         TRANSPOSE_4H(%[ftmp19], %[ftmp20], %[ftmp21], %[ftmp22],
> >-                     %[ftmp1], %[ftmp2], %[ftmp3], %[ftmp4],
> >-                     %[ftmp5], %[tmp0],  %[ftmp6], %[ftmp7])
> >+                     %[ftmp1], %[ftmp2], %[ftmp3], %[ftmp4])
> >
> >         MMI_SDC1(%[ftmp19], %[dst], 0x48)
> >         MMI_SDC1(%[ftmp20], %[dst], 0x58)
> >diff --git a/libavcodec/mips/vp8dsp_mmi.c b/libavcodec/mips/vp8dsp_mmi.c
> >index b24a87a..bd80aa1 100644
> >--- a/libavcodec/mips/vp8dsp_mmi.c
> >+++ b/libavcodec/mips/vp8dsp_mmi.c
> >@@ -44,58 +44,6 @@
> >         "punpcklbh  "#dst_r",   "#src",         %[db_2]             \n\t"   \
> >         "punpckhbh  "#dst_l",   "#src",         %[db_2]             \n\t"
> >
> >-#define MMI_TRANSPOSE8x8_UB_UB(src_0, src_1, src_2, src_3,                  \
> >-                               src_4, src_5, src_6, src_7,                  \
> >-                               dst_0, dst_1, dst_2, dst_3,                  \
> >-                               dst_4, dst_5, dst_6, dst_7)                  \
> >-        "li         %[it_1],    0xe4                                \n\t"   \
> >-        "dmtc1      %[it_1],    %[db_1]                             \n\t"   \
> >-        "pshufh     %[db_2],    "#src_0",       %[db_1]             \n\t"   \
> >-        "punpcklbh  "#dst_0",   "#src_0",       "#src_1"            \n\t"   \
> >-        "punpckhbh  "#dst_1",   %[db_2],        "#src_1"            \n\t"   \
> >-        "pshufh     %[db_2],    "#src_2",       %[db_1]             \n\t"   \
> >-        "punpcklbh  "#dst_2",   "#src_2",       "#src_3"            \n\t"   \
> >-        "punpckhbh  "#dst_3",   %[db_2],        "#src_3"            \n\t"   \
> >-        "pshufh     %[db_2],    "#src_4",       %[db_1]             \n\t"   \
> >-        "punpcklbh  "#dst_4",   "#src_4",       "#src_5"            \n\t"   \
> >-        "punpckhbh  "#dst_5",   %[db_2],        "#src_5"            \n\t"   \
> >-        "pshufh     %[db_2],    "#src_6",       %[db_1]             \n\t"   \
> >-        "punpcklbh  "#dst_6",   "#src_6",       "#src_7"            \n\t"   \
> >-        "punpckhbh  "#dst_7",   %[db_2],        "#src_7"            \n\t"   \
> >-                                                                            \
> >-        "pshufh     %[db_2],    "#dst_0",       %[db_1]             \n\t"   \
> >-        "punpcklhw  "#dst_0",   "#dst_0",       "#dst_2"            \n\t"   \
> >-        "punpckhhw  "#dst_2",   %[db_2],        "#dst_2"            \n\t"   \
> >-        "pshufh     %[db_2],    "#dst_1",       %[db_1]             \n\t"   \
> >-        "punpcklhw  "#dst_1",   "#dst_1",       "#dst_3"            \n\t"   \
> >-        "punpckhhw  "#dst_3",   %[db_2],        "#dst_3"            \n\t"   \
> >-        "pshufh     %[db_2],    "#dst_4",       %[db_1]             \n\t"   \
> >-        "punpcklhw  "#dst_4",   "#dst_4",       "#dst_6"            \n\t"   \
> >-        "punpckhhw  "#dst_6",   %[db_2],        "#dst_6"            \n\t"   \
> >-        "pshufh     %[db_2],    "#dst_5",       %[db_1]             \n\t"   \
> >-        "punpcklhw  "#dst_5",   "#dst_5",       "#dst_7"            \n\t"   \
> >-        "punpckhhw  "#dst_7",   %[db_2],        "#dst_7"            \n\t"   \
> >-                                                                            \
> >-        "pshufh     %[db_2],    "#dst_0",       %[db_1]             \n\t"   \
> >-        "punpcklwd  "#dst_0",   "#dst_0",       "#dst_4"            \n\t"   \
> >-        "punpckhwd  "#dst_4",   %[db_2],        "#dst_4"            \n\t"   \
> >-        "pshufh     %[db_2],    "#dst_1",       %[db_1]             \n\t"   \
> >-        "punpcklwd  "#dst_1",   "#dst_1",       "#dst_5"            \n\t"   \
> >-        "punpckhwd  "#dst_5",   %[db_2],        "#dst_5"            \n\t"   \
> >-        "pshufh     %[db_2],    "#dst_2",       %[db_1]             \n\t"   \
> >-        "punpcklwd  "#dst_2",   "#dst_2",       "#dst_6"            \n\t"   \
> >-        "punpckhwd  "#dst_6",   %[db_2],        "#dst_6"            \n\t"   \
> >-        "pshufh     %[db_2],    "#dst_3",       %[db_1]             \n\t"   \
> >-        "punpcklwd  "#dst_3",   "#dst_3",       "#dst_7"            \n\t"   \
> >-        "punpckhwd  "#dst_7",   %[db_2],        "#dst_7"            \n\t"   \
> >-                                                                            \
> >-        "pshufh     %[db_2],    "#dst_1",       %[db_1]             \n\t"   \
> >-        "pshufh     "#dst_1",   "#dst_4",       %[db_1]             \n\t"   \
> >-        "pshufh     "#dst_4",   %[db_2],        %[db_1]             \n\t"   \
> >-        "pshufh     %[db_2],    "#dst_3",       %[db_1]             \n\t"   \
> >-        "pshufh     "#dst_3",   "#dst_6",       %[db_1]             \n\t"   \
> >-        "pshufh     "#dst_6",   %[db_2],        %[db_1]             \n\t"
> >-
> > #define MMI_VP8_LOOP_FILTER                                                 \
> >         /* Calculation of hev */                                            \
> >         "dmtc1      %[thresh],  %[ftmp3]                            \n\t"   \
> >@@ -952,16 +900,14 @@ static av_always_inline void vp8_h_loop_filter8_mmi(uint8_t *dst,
> >         "gsldlc1    %[q3],        0x03(%[tmp0])                   \n\t"
> >         "gsldrc1    %[q3],        -0x04(%[tmp0])                  \n\t"
> >         /* Matrix transpose */
> >-        MMI_TRANSPOSE8x8_UB_UB(%[p3], %[p2], %[p1], %[p0],
> >-                               %[q0], %[q1], %[q2], %[q3],
> >-                               %[p3], %[p2], %[p1], %[p0],
> >-                               %[q0], %[q1], %[q2], %[q3])
> >+        TRANSPOSE_8B(%[p3], %[p2], %[p1], %[p0],
> >+                     %[q0], %[q1], %[q2], %[q3],
> >+                     %[ftmp1], %[ftmp2], %[ftmp3], %[ftmp4])
> >         MMI_VP8_LOOP_FILTER
> >         /* Matrix transpose */
> >-        MMI_TRANSPOSE8x8_UB_UB(%[p3], %[p2], %[p1], %[p0],
> >-                               %[q0], %[q1], %[q2], %[q3],
> >-                               %[p3], %[p2], %[p1], %[p0],
> >-                               %[q0], %[q1], %[q2], %[q3])
> >+        TRANSPOSE_8B(%[p3], %[p2], %[p1], %[p0],
> >+                     %[q0], %[q1], %[q2], %[q3],
> >+                     %[ftmp1], %[ftmp2], %[ftmp3], %[ftmp4])
> >         /* Move to dst */
> >         "gssdlc1    %[p3],        0x03(%[dst])                    \n\t"
> >         "gssdrc1    %[p3],        -0x04(%[dst])                   \n\t"
> >@@ -1233,8 +1179,7 @@ void ff_vp8_idct_add_mmi(uint8_t *dst, int16_t block[16], ptrdiff_t stride)
> >         MMI_SDC1(%[ftmp0], %[block], 0x18)
> >
> >         TRANSPOSE_4H(%[ftmp1], %[ftmp2], %[ftmp3], %[ftmp4],
> >-                     %[ftmp5], %[ftmp6], %[ftmp7], %[ftmp8],
> >-                     %[ftmp9], %[tmp0],  %[ftmp0], %[ftmp10])
> >+                     %[ftmp5], %[ftmp6], %[ftmp7], %[ftmp8])
> >
> >         // t[0 4  8 12]
> >         "paddh      %[ftmp5],   %[ftmp1],       %[ftmp3]            \n\t"
> >@@ -1269,8 +1214,7 @@ void ff_vp8_idct_add_mmi(uint8_t *dst, int16_t block[16], ptrdiff_t stride)
> >         "psrah      %[ftmp4],   %[ftmp4],       %[ftmp11]           \n\t"
> >
> >         TRANSPOSE_4H(%[ftmp1], %[ftmp2], %[ftmp3], %[ftmp4],
> >-                     %[ftmp5], %[ftmp6], %[ftmp7], %[ftmp8],
> >-                     %[ftmp9], %[tmp0],  %[ftmp0], %[ftmp10])
> >+                     %[ftmp5], %[ftmp6], %[ftmp7], %[ftmp8])
> >
> >         MMI_LWC1(%[ftmp5], %[dst0], 0x00)
> >         MMI_LWC1(%[ftmp6], %[dst1], 0x00)
> >diff --git a/libavutil/mips/mmiutils.h b/libavutil/mips/mmiutils.h
> >index b16edc4..76b1199 100644
> >--- a/libavutil/mips/mmiutils.h
> >+++ b/libavutil/mips/mmiutils.h
> >@@ -250,30 +250,53 @@
> >       : "memory"                                                \
> >     );
> >
> >-#define TRANSPOSE_4H(m1, m2, m3, m4, t1, t2, t3, t4, t5, r1, zero, shift) \
> >-        "li         "#r1",  0x93                                    \n\t" \
> >-        "xor        "#zero","#zero","#zero"                         \n\t" \
> >-        "mtc1       "#r1",  "#shift"                                \n\t" \
> >-        "punpcklhw  "#t1",  "#m1",  "#zero"                         \n\t" \
> >-        "punpcklhw  "#t5",  "#m2",  "#zero"                         \n\t" \
> >-        "pshufh     "#t5",  "#t5",  "#shift"                        \n\t" \
> >-        "or         "#t1",  "#t1",  "#t5"                           \n\t" \
> >-        "punpckhhw  "#t2",  "#m1",  "#zero"                         \n\t" \
> >-        "punpckhhw  "#t5",  "#m2",  "#zero"                         \n\t" \
> >-        "pshufh     "#t5",  "#t5",  "#shift"                        \n\t" \
> >-        "or         "#t2",  "#t2",  "#t5"                           \n\t" \
> >-        "punpcklhw  "#t3",  "#m3",  "#zero"                         \n\t" \
> >-        "punpcklhw  "#t5",  "#m4",  "#zero"                         \n\t" \
> >-        "pshufh     "#t5",  "#t5",  "#shift"                        \n\t" \
> >-        "or         "#t3",  "#t3",  "#t5"                           \n\t" \
> >-        "punpckhhw  "#t4",  "#m3",  "#zero"                         \n\t" \
> >-        "punpckhhw  "#t5",  "#m4",  "#zero"                         \n\t" \
> >-        "pshufh     "#t5",  "#t5",  "#shift"                        \n\t" \
> >-        "or         "#t4",  "#t4",  "#t5"                           \n\t" \
> >-        "punpcklwd  "#m1",  "#t1",  "#t3"                           \n\t" \
> >-        "punpckhwd  "#m2",  "#t1",  "#t3"                           \n\t" \
> >-        "punpcklwd  "#m3",  "#t2",  "#t4"                           \n\t" \
> >-        "punpckhwd  "#m4",  "#t2",  "#t4"                           \n\t"
> >+/**
> >+ * brief: Transpose 4X4 half word packaged data.
> >+ * fr_i0, fr_i1, fr_i2, fr_i3: src & dst
> >+ * fr_t0, fr_t1, fr_t2, fr_t3: temporary register
> >+ */
> >+#define TRANSPOSE_4H(fr_i0, fr_i1, fr_i2, fr_i3,                          \
> >+                     fr_t0, fr_t1, fr_t2, fr_t3)                          \
> >+        "punpcklhw  "#fr_t0",   "#fr_i0",   "#fr_i1"                \n\t" \
> >+        "punpckhhw  "#fr_t1",   "#fr_i0",   "#fr_i1"                \n\t" \
> >+        "punpcklhw  "#fr_t2",   "#fr_i2",   "#fr_i3"                \n\t" \
> >+        "punpckhhw  "#fr_t3",   "#fr_i2",   "#fr_i3"                \n\t" \
> >+        "punpcklwd  "#fr_i0",   "#fr_t0",   "#fr_t2"                \n\t" \
> >+        "punpckhwd  "#fr_i1",   "#fr_t0",   "#fr_t2"                \n\t" \
> >+        "punpcklwd  "#fr_i2",   "#fr_t1",   "#fr_t3"                \n\t" \
> >+        "punpckhwd  "#fr_i3",   "#fr_t1",   "#fr_t3"                \n\t"
> >+
> >+/**
> >+ * brief: Transpose 8x8 byte packaged data.
> >+ * fr_i0~i7: src & dst
> >+ * fr_t0~t3: temporary register
> >+ */
> >+#define TRANSPOSE_8B(fr_i0, fr_i1, fr_i2, fr_i3, fr_i4, fr_i5,            \
> >+                     fr_i6, fr_i7, fr_t0, fr_t1, fr_t2, fr_t3)            \
> >+        "punpcklbh  "#fr_t0",   "#fr_i0",   "#fr_i1"                \n\t" \
> >+        "punpckhbh  "#fr_t1",   "#fr_i0",   "#fr_i1"                \n\t" \
> >+        "punpcklbh  "#fr_t2",   "#fr_i2",   "#fr_i3"                \n\t" \
> >+        "punpckhbh  "#fr_t3",   "#fr_i2",   "#fr_i3"                \n\t" \
> >+        "punpcklbh  "#fr_i0",   "#fr_i4",   "#fr_i5"                \n\t" \
> >+        "punpckhbh  "#fr_i1",   "#fr_i4",   "#fr_i5"                \n\t" \
> >+        "punpcklbh  "#fr_i2",   "#fr_i6",   "#fr_i7"                \n\t" \
> >+        "punpckhbh  "#fr_i3",   "#fr_i6",   "#fr_i7"                \n\t" \
> >+        "punpcklhw  "#fr_i4",   "#fr_t0",   "#fr_t2"                \n\t" \
> >+        "punpckhhw  "#fr_i5",   "#fr_t0",   "#fr_t2"                \n\t" \
> >+        "punpcklhw  "#fr_i6",   "#fr_t1",   "#fr_t3"                \n\t" \
> >+        "punpckhhw  "#fr_i7",   "#fr_t1",   "#fr_t3"                \n\t" \
> >+        "punpcklhw  "#fr_t0",   "#fr_i0",   "#fr_i2"                \n\t" \
> >+        "punpckhhw  "#fr_t1",   "#fr_i0",   "#fr_i2"                \n\t" \
> >+        "punpcklhw  "#fr_t2",   "#fr_i1",   "#fr_i3"                \n\t" \
> >+        "punpckhhw  "#fr_t3",   "#fr_i1",   "#fr_i3"                \n\t" \
> >+        "punpcklwd  "#fr_i0",   "#fr_i4",   "#fr_t0"                \n\t" \
> >+        "punpckhwd  "#fr_i1",   "#fr_i4",   "#fr_t0"                \n\t" \
> >+        "punpcklwd  "#fr_i2",   "#fr_i5",   "#fr_t1"                \n\t" \
> >+        "punpckhwd  "#fr_i3",   "#fr_i5",   "#fr_t1"                \n\t" \
> >+        "punpcklwd  "#fr_i4",   "#fr_i6",   "#fr_t2"                \n\t" \
> >+        "punpckhwd  "#fr_i5",   "#fr_i6",   "#fr_t2"                \n\t" \
> >+        "punpcklwd  "#fr_i6",   "#fr_i7",   "#fr_t3"                \n\t" \
> >+        "punpckhwd  "#fr_i7",   "#fr_i7",   "#fr_t3"                \n\t"
> >
> > /**
> >  * brief: Parallel SRA for 8 byte packaged data.
> >@@ -303,15 +326,14 @@
> >         "psrlh        "#fr_t1",   "#fr_t1",   "#fr_i1"              \n\t" \
> >         "packsshb     "#fr_d0",   "#fr_t0",   "#fr_t1"              \n\t"
> >
> >-
> >-#define PSRAH_4_MMI(fp1, fp2, fp3, fp4, shift)                              \
> >-        "psrah      "#fp1",     "#fp1",     "#shift"                \n\t"   \
> >-        "psrah      "#fp2",     "#fp2",     "#shift"                \n\t"   \
> >-        "psrah      "#fp3",     "#fp3",     "#shift"                \n\t"   \
> >+#define PSRAH_4_MMI(fp1, fp2, fp3, fp4, shift)                            \
> >+        "psrah      "#fp1",     "#fp1",     "#shift"                \n\t" \
> >+        "psrah      "#fp2",     "#fp2",     "#shift"                \n\t" \
> >+        "psrah      "#fp3",     "#fp3",     "#shift"                \n\t" \
> >         "psrah      "#fp4",     "#fp4",     "#shift"                \n\t"
> >
> >-#define PSRAH_8_MMI(fp1, fp2, fp3, fp4, fp5, fp6, fp7, fp8, shift)          \
> >-        PSRAH_4_MMI(fp1, fp2, fp3, fp4, shift)                              \
> >+#define PSRAH_8_MMI(fp1, fp2, fp3, fp4, fp5, fp6, fp7, fp8, shift)        \
> >+        PSRAH_4_MMI(fp1, fp2, fp3, fp4, shift)                            \
> >         PSRAH_4_MMI(fp5, fp6, fp7, fp8, shift)
> >
> >
> >--
> >2.1.0
> 
> Hi Michael, could you please help review this patch?
> BTW, this patch is based on the previous patch "[PATCH 2/2] avcodec/mips: [loongson] optimize vp8 decoding in vp8dsp.",
> so that one should be merged first. Thank you very much.

will apply both

thanks

[...]
diff mbox

Patch

diff --git a/libavcodec/mips/vc1dsp_mmi.c b/libavcodec/mips/vc1dsp_mmi.c
index a439b40..80778a5 100644
--- a/libavcodec/mips/vc1dsp_mmi.c
+++ b/libavcodec/mips/vc1dsp_mmi.c
@@ -248,8 +248,7 @@  void ff_vc1_inv_trans_8x8_mmi(int16_t block[64])
                                0xfff70004, 0xfff0000f, %[ff_pw_4])
 
         TRANSPOSE_4H(%[ftmp15], %[ftmp16], %[ftmp17], %[ftmp18],
-                     %[ftmp1], %[ftmp2], %[ftmp3], %[ftmp4],
-                     %[ftmp5], %[tmp0],  %[ftmp6], %[ftmp7])
+                     %[ftmp1], %[ftmp2], %[ftmp3], %[ftmp4])
 
         MMI_SDC1(%[ftmp15], %[dst], 0x00)
         MMI_SDC1(%[ftmp16], %[dst], 0x10)
@@ -257,8 +256,7 @@  void ff_vc1_inv_trans_8x8_mmi(int16_t block[64])
         MMI_SDC1(%[ftmp18], %[dst], 0x30)
 
         TRANSPOSE_4H(%[ftmp19], %[ftmp20], %[ftmp21], %[ftmp22],
-                     %[ftmp1], %[ftmp2], %[ftmp3], %[ftmp4],
-                     %[ftmp5], %[tmp0],  %[ftmp6], %[ftmp7])
+                     %[ftmp1], %[ftmp2], %[ftmp3], %[ftmp4])
 
         MMI_SDC1(%[ftmp19], %[dst], 0x08)
         MMI_SDC1(%[ftmp20], %[dst], 0x18)
@@ -301,8 +299,7 @@  void ff_vc1_inv_trans_8x8_mmi(int16_t block[64])
                                0xfff70004, 0xfff0000f, %[ff_pw_4])
 
         TRANSPOSE_4H(%[ftmp15], %[ftmp16], %[ftmp17], %[ftmp18],
-                     %[ftmp1], %[ftmp2], %[ftmp3], %[ftmp4],
-                     %[ftmp5], %[tmp0],  %[ftmp6], %[ftmp7])
+                     %[ftmp1], %[ftmp2], %[ftmp3], %[ftmp4])
 
         MMI_SDC1(%[ftmp15], %[dst], 0x40)
         MMI_SDC1(%[ftmp16], %[dst], 0x50)
@@ -310,8 +307,7 @@  void ff_vc1_inv_trans_8x8_mmi(int16_t block[64])
         MMI_SDC1(%[ftmp18], %[dst], 0x70)
 
         TRANSPOSE_4H(%[ftmp19], %[ftmp20], %[ftmp21], %[ftmp22],
-                     %[ftmp1], %[ftmp2], %[ftmp3], %[ftmp4],
-                     %[ftmp5], %[tmp0],  %[ftmp6], %[ftmp7])
+                     %[ftmp1], %[ftmp2], %[ftmp3], %[ftmp4])
 
         MMI_SDC1(%[ftmp19], %[dst], 0x48)
         MMI_SDC1(%[ftmp20], %[dst], 0x58)
diff --git a/libavcodec/mips/vp8dsp_mmi.c b/libavcodec/mips/vp8dsp_mmi.c
index b24a87a..bd80aa1 100644
--- a/libavcodec/mips/vp8dsp_mmi.c
+++ b/libavcodec/mips/vp8dsp_mmi.c
@@ -44,58 +44,6 @@ 
         "punpcklbh  "#dst_r",   "#src",         %[db_2]             \n\t"   \
         "punpckhbh  "#dst_l",   "#src",         %[db_2]             \n\t"
 
-#define MMI_TRANSPOSE8x8_UB_UB(src_0, src_1, src_2, src_3,                  \
-                               src_4, src_5, src_6, src_7,                  \
-                               dst_0, dst_1, dst_2, dst_3,                  \
-                               dst_4, dst_5, dst_6, dst_7)                  \
-        "li         %[it_1],    0xe4                                \n\t"   \
-        "dmtc1      %[it_1],    %[db_1]                             \n\t"   \
-        "pshufh     %[db_2],    "#src_0",       %[db_1]             \n\t"   \
-        "punpcklbh  "#dst_0",   "#src_0",       "#src_1"            \n\t"   \
-        "punpckhbh  "#dst_1",   %[db_2],        "#src_1"            \n\t"   \
-        "pshufh     %[db_2],    "#src_2",       %[db_1]             \n\t"   \
-        "punpcklbh  "#dst_2",   "#src_2",       "#src_3"            \n\t"   \
-        "punpckhbh  "#dst_3",   %[db_2],        "#src_3"            \n\t"   \
-        "pshufh     %[db_2],    "#src_4",       %[db_1]             \n\t"   \
-        "punpcklbh  "#dst_4",   "#src_4",       "#src_5"            \n\t"   \
-        "punpckhbh  "#dst_5",   %[db_2],        "#src_5"            \n\t"   \
-        "pshufh     %[db_2],    "#src_6",       %[db_1]             \n\t"   \
-        "punpcklbh  "#dst_6",   "#src_6",       "#src_7"            \n\t"   \
-        "punpckhbh  "#dst_7",   %[db_2],        "#src_7"            \n\t"   \
-                                                                            \
-        "pshufh     %[db_2],    "#dst_0",       %[db_1]             \n\t"   \
-        "punpcklhw  "#dst_0",   "#dst_0",       "#dst_2"            \n\t"   \
-        "punpckhhw  "#dst_2",   %[db_2],        "#dst_2"            \n\t"   \
-        "pshufh     %[db_2],    "#dst_1",       %[db_1]             \n\t"   \
-        "punpcklhw  "#dst_1",   "#dst_1",       "#dst_3"            \n\t"   \
-        "punpckhhw  "#dst_3",   %[db_2],        "#dst_3"            \n\t"   \
-        "pshufh     %[db_2],    "#dst_4",       %[db_1]             \n\t"   \
-        "punpcklhw  "#dst_4",   "#dst_4",       "#dst_6"            \n\t"   \
-        "punpckhhw  "#dst_6",   %[db_2],        "#dst_6"            \n\t"   \
-        "pshufh     %[db_2],    "#dst_5",       %[db_1]             \n\t"   \
-        "punpcklhw  "#dst_5",   "#dst_5",       "#dst_7"            \n\t"   \
-        "punpckhhw  "#dst_7",   %[db_2],        "#dst_7"            \n\t"   \
-                                                                            \
-        "pshufh     %[db_2],    "#dst_0",       %[db_1]             \n\t"   \
-        "punpcklwd  "#dst_0",   "#dst_0",       "#dst_4"            \n\t"   \
-        "punpckhwd  "#dst_4",   %[db_2],        "#dst_4"            \n\t"   \
-        "pshufh     %[db_2],    "#dst_1",       %[db_1]             \n\t"   \
-        "punpcklwd  "#dst_1",   "#dst_1",       "#dst_5"            \n\t"   \
-        "punpckhwd  "#dst_5",   %[db_2],        "#dst_5"            \n\t"   \
-        "pshufh     %[db_2],    "#dst_2",       %[db_1]             \n\t"   \
-        "punpcklwd  "#dst_2",   "#dst_2",       "#dst_6"            \n\t"   \
-        "punpckhwd  "#dst_6",   %[db_2],        "#dst_6"            \n\t"   \
-        "pshufh     %[db_2],    "#dst_3",       %[db_1]             \n\t"   \
-        "punpcklwd  "#dst_3",   "#dst_3",       "#dst_7"            \n\t"   \
-        "punpckhwd  "#dst_7",   %[db_2],        "#dst_7"            \n\t"   \
-                                                                            \
-        "pshufh     %[db_2],    "#dst_1",       %[db_1]             \n\t"   \
-        "pshufh     "#dst_1",   "#dst_4",       %[db_1]             \n\t"   \
-        "pshufh     "#dst_4",   %[db_2],        %[db_1]             \n\t"   \
-        "pshufh     %[db_2],    "#dst_3",       %[db_1]             \n\t"   \
-        "pshufh     "#dst_3",   "#dst_6",       %[db_1]             \n\t"   \
-        "pshufh     "#dst_6",   %[db_2],        %[db_1]             \n\t"
-
 #define MMI_VP8_LOOP_FILTER                                                 \
         /* Calculation of hev */                                            \
         "dmtc1      %[thresh],  %[ftmp3]                            \n\t"   \
@@ -952,16 +900,14 @@  static av_always_inline void vp8_h_loop_filter8_mmi(uint8_t *dst,
         "gsldlc1    %[q3],        0x03(%[tmp0])                   \n\t"
         "gsldrc1    %[q3],        -0x04(%[tmp0])                  \n\t"
         /* Matrix transpose */
-        MMI_TRANSPOSE8x8_UB_UB(%[p3], %[p2], %[p1], %[p0],
-                               %[q0], %[q1], %[q2], %[q3],
-                               %[p3], %[p2], %[p1], %[p0],
-                               %[q0], %[q1], %[q2], %[q3])
+        TRANSPOSE_8B(%[p3], %[p2], %[p1], %[p0],
+                     %[q0], %[q1], %[q2], %[q3],
+                     %[ftmp1], %[ftmp2], %[ftmp3], %[ftmp4])
         MMI_VP8_LOOP_FILTER
         /* Matrix transpose */
-        MMI_TRANSPOSE8x8_UB_UB(%[p3], %[p2], %[p1], %[p0],
-                               %[q0], %[q1], %[q2], %[q3],
-                               %[p3], %[p2], %[p1], %[p0],
-                               %[q0], %[q1], %[q2], %[q3])
+        TRANSPOSE_8B(%[p3], %[p2], %[p1], %[p0],
+                     %[q0], %[q1], %[q2], %[q3],
+                     %[ftmp1], %[ftmp2], %[ftmp3], %[ftmp4])
         /* Move to dst */
         "gssdlc1    %[p3],        0x03(%[dst])                    \n\t"
         "gssdrc1    %[p3],        -0x04(%[dst])                   \n\t"
@@ -1233,8 +1179,7 @@  void ff_vp8_idct_add_mmi(uint8_t *dst, int16_t block[16], ptrdiff_t stride)
         MMI_SDC1(%[ftmp0], %[block], 0x18)
 
         TRANSPOSE_4H(%[ftmp1], %[ftmp2], %[ftmp3], %[ftmp4],
-                     %[ftmp5], %[ftmp6], %[ftmp7], %[ftmp8],
-                     %[ftmp9], %[tmp0],  %[ftmp0], %[ftmp10])
+                     %[ftmp5], %[ftmp6], %[ftmp7], %[ftmp8])
 
         // t[0 4  8 12]
         "paddh      %[ftmp5],   %[ftmp1],       %[ftmp3]            \n\t"
@@ -1269,8 +1214,7 @@  void ff_vp8_idct_add_mmi(uint8_t *dst, int16_t block[16], ptrdiff_t stride)
         "psrah      %[ftmp4],   %[ftmp4],       %[ftmp11]           \n\t"
 
         TRANSPOSE_4H(%[ftmp1], %[ftmp2], %[ftmp3], %[ftmp4],
-                     %[ftmp5], %[ftmp6], %[ftmp7], %[ftmp8],
-                     %[ftmp9], %[tmp0],  %[ftmp0], %[ftmp10])
+                     %[ftmp5], %[ftmp6], %[ftmp7], %[ftmp8])
 
         MMI_LWC1(%[ftmp5], %[dst0], 0x00)
         MMI_LWC1(%[ftmp6], %[dst1], 0x00)
diff --git a/libavutil/mips/mmiutils.h b/libavutil/mips/mmiutils.h
index b16edc4..76b1199 100644
--- a/libavutil/mips/mmiutils.h
+++ b/libavutil/mips/mmiutils.h
@@ -250,30 +250,53 @@ 
       : "memory"                                                \
     );
 
-#define TRANSPOSE_4H(m1, m2, m3, m4, t1, t2, t3, t4, t5, r1, zero, shift) \
-        "li         "#r1",  0x93                                    \n\t" \
-        "xor        "#zero","#zero","#zero"                         \n\t" \
-        "mtc1       "#r1",  "#shift"                                \n\t" \
-        "punpcklhw  "#t1",  "#m1",  "#zero"                         \n\t" \
-        "punpcklhw  "#t5",  "#m2",  "#zero"                         \n\t" \
-        "pshufh     "#t5",  "#t5",  "#shift"                        \n\t" \
-        "or         "#t1",  "#t1",  "#t5"                           \n\t" \
-        "punpckhhw  "#t2",  "#m1",  "#zero"                         \n\t" \
-        "punpckhhw  "#t5",  "#m2",  "#zero"                         \n\t" \
-        "pshufh     "#t5",  "#t5",  "#shift"                        \n\t" \
-        "or         "#t2",  "#t2",  "#t5"                           \n\t" \
-        "punpcklhw  "#t3",  "#m3",  "#zero"                         \n\t" \
-        "punpcklhw  "#t5",  "#m4",  "#zero"                         \n\t" \
-        "pshufh     "#t5",  "#t5",  "#shift"                        \n\t" \
-        "or         "#t3",  "#t3",  "#t5"                           \n\t" \
-        "punpckhhw  "#t4",  "#m3",  "#zero"                         \n\t" \
-        "punpckhhw  "#t5",  "#m4",  "#zero"                         \n\t" \
-        "pshufh     "#t5",  "#t5",  "#shift"                        \n\t" \
-        "or         "#t4",  "#t4",  "#t5"                           \n\t" \
-        "punpcklwd  "#m1",  "#t1",  "#t3"                           \n\t" \
-        "punpckhwd  "#m2",  "#t1",  "#t3"                           \n\t" \
-        "punpcklwd  "#m3",  "#t2",  "#t4"                           \n\t" \
-        "punpckhwd  "#m4",  "#t2",  "#t4"                           \n\t"
+/**
+ * brief: Transpose 4X4 half word packaged data.
+ * fr_i0, fr_i1, fr_i2, fr_i3: src & dst
+ * fr_t0, fr_t1, fr_t2, fr_t3: temporary register
+ */
+#define TRANSPOSE_4H(fr_i0, fr_i1, fr_i2, fr_i3,                          \
+                     fr_t0, fr_t1, fr_t2, fr_t3)                          \
+        "punpcklhw  "#fr_t0",   "#fr_i0",   "#fr_i1"                \n\t" \
+        "punpckhhw  "#fr_t1",   "#fr_i0",   "#fr_i1"                \n\t" \
+        "punpcklhw  "#fr_t2",   "#fr_i2",   "#fr_i3"                \n\t" \
+        "punpckhhw  "#fr_t3",   "#fr_i2",   "#fr_i3"                \n\t" \
+        "punpcklwd  "#fr_i0",   "#fr_t0",   "#fr_t2"                \n\t" \
+        "punpckhwd  "#fr_i1",   "#fr_t0",   "#fr_t2"                \n\t" \
+        "punpcklwd  "#fr_i2",   "#fr_t1",   "#fr_t3"                \n\t" \
+        "punpckhwd  "#fr_i3",   "#fr_t1",   "#fr_t3"                \n\t"
+
+/**
+ * brief: Transpose 8x8 byte packaged data.
+ * fr_i0~i7: src & dst
+ * fr_t0~t3: temporary register
+ */
+#define TRANSPOSE_8B(fr_i0, fr_i1, fr_i2, fr_i3, fr_i4, fr_i5,            \
+                     fr_i6, fr_i7, fr_t0, fr_t1, fr_t2, fr_t3)            \
+        "punpcklbh  "#fr_t0",   "#fr_i0",   "#fr_i1"                \n\t" \
+        "punpckhbh  "#fr_t1",   "#fr_i0",   "#fr_i1"                \n\t" \
+        "punpcklbh  "#fr_t2",   "#fr_i2",   "#fr_i3"                \n\t" \
+        "punpckhbh  "#fr_t3",   "#fr_i2",   "#fr_i3"                \n\t" \
+        "punpcklbh  "#fr_i0",   "#fr_i4",   "#fr_i5"                \n\t" \
+        "punpckhbh  "#fr_i1",   "#fr_i4",   "#fr_i5"                \n\t" \
+        "punpcklbh  "#fr_i2",   "#fr_i6",   "#fr_i7"                \n\t" \
+        "punpckhbh  "#fr_i3",   "#fr_i6",   "#fr_i7"                \n\t" \
+        "punpcklhw  "#fr_i4",   "#fr_t0",   "#fr_t2"                \n\t" \
+        "punpckhhw  "#fr_i5",   "#fr_t0",   "#fr_t2"                \n\t" \
+        "punpcklhw  "#fr_i6",   "#fr_t1",   "#fr_t3"                \n\t" \
+        "punpckhhw  "#fr_i7",   "#fr_t1",   "#fr_t3"                \n\t" \
+        "punpcklhw  "#fr_t0",   "#fr_i0",   "#fr_i2"                \n\t" \
+        "punpckhhw  "#fr_t1",   "#fr_i0",   "#fr_i2"                \n\t" \
+        "punpcklhw  "#fr_t2",   "#fr_i1",   "#fr_i3"                \n\t" \
+        "punpckhhw  "#fr_t3",   "#fr_i1",   "#fr_i3"                \n\t" \
+        "punpcklwd  "#fr_i0",   "#fr_i4",   "#fr_t0"                \n\t" \
+        "punpckhwd  "#fr_i1",   "#fr_i4",   "#fr_t0"                \n\t" \
+        "punpcklwd  "#fr_i2",   "#fr_i5",   "#fr_t1"                \n\t" \
+        "punpckhwd  "#fr_i3",   "#fr_i5",   "#fr_t1"                \n\t" \
+        "punpcklwd  "#fr_i4",   "#fr_i6",   "#fr_t2"                \n\t" \
+        "punpckhwd  "#fr_i5",   "#fr_i6",   "#fr_t2"                \n\t" \
+        "punpcklwd  "#fr_i6",   "#fr_i7",   "#fr_t3"                \n\t" \
+        "punpckhwd  "#fr_i7",   "#fr_i7",   "#fr_t3"                \n\t"
 
 /**
  * brief: Parallel SRA for 8 byte packaged data.
@@ -303,15 +326,14 @@ 
         "psrlh        "#fr_t1",   "#fr_t1",   "#fr_i1"              \n\t" \
         "packsshb     "#fr_d0",   "#fr_t0",   "#fr_t1"              \n\t"
 
-
-#define PSRAH_4_MMI(fp1, fp2, fp3, fp4, shift)                              \
-        "psrah      "#fp1",     "#fp1",     "#shift"                \n\t"   \
-        "psrah      "#fp2",     "#fp2",     "#shift"                \n\t"   \
-        "psrah      "#fp3",     "#fp3",     "#shift"                \n\t"   \
+#define PSRAH_4_MMI(fp1, fp2, fp3, fp4, shift)                            \
+        "psrah      "#fp1",     "#fp1",     "#shift"                \n\t" \
+        "psrah      "#fp2",     "#fp2",     "#shift"                \n\t" \
+        "psrah      "#fp3",     "#fp3",     "#shift"                \n\t" \
         "psrah      "#fp4",     "#fp4",     "#shift"                \n\t"
 
-#define PSRAH_8_MMI(fp1, fp2, fp3, fp4, fp5, fp6, fp7, fp8, shift)          \
-        PSRAH_4_MMI(fp1, fp2, fp3, fp4, shift)                              \
+#define PSRAH_8_MMI(fp1, fp2, fp3, fp4, fp5, fp6, fp7, fp8, shift)        \
+        PSRAH_4_MMI(fp1, fp2, fp3, fp4, shift)                            \
         PSRAH_4_MMI(fp5, fp6, fp7, fp8, shift)