Message ID | 1536221453-15372-2-git-send-email-yinshiyou-hf@loongson.cn |
---|---|
State | Accepted |
Commit | 5161f7bcfd3c2d2e6cb92e782855b7fc00bdf877 |
Headers | show |
>-----Original Message----- >From: ffmpeg-devel-bounces@ffmpeg.org [mailto:ffmpeg-devel-bounces@ffmpeg.org] On Behalf Of >Shiyou Yin >Sent: Thursday, September 6, 2018 4:11 PM >To: ffmpeg-devel@ffmpeg.org >Subject: [FFmpeg-devel] [PATCH 2/2] avutil/mips: [loongson] simplify macro TRANSPOSE_4H and >TRANSPOSE_8B > >Simplify macro TRANSPOSE_4H in mmiutils.h and add TRANSPOSE_8B as a common macro. >--- > libavcodec/mips/vc1dsp_mmi.c | 12 +++---- > libavcodec/mips/vp8dsp_mmi.c | 72 +++++-------------------------------- > libavutil/mips/mmiutils.h | 84 ++++++++++++++++++++++++++++---------------- > 3 files changed, 65 insertions(+), 103 deletions(-) > >diff --git a/libavcodec/mips/vc1dsp_mmi.c b/libavcodec/mips/vc1dsp_mmi.c >index a439b40..80778a5 100644 >--- a/libavcodec/mips/vc1dsp_mmi.c >+++ b/libavcodec/mips/vc1dsp_mmi.c >@@ -248,8 +248,7 @@ void ff_vc1_inv_trans_8x8_mmi(int16_t block[64]) > 0xfff70004, 0xfff0000f, %[ff_pw_4]) > > TRANSPOSE_4H(%[ftmp15], %[ftmp16], %[ftmp17], %[ftmp18], >- %[ftmp1], %[ftmp2], %[ftmp3], %[ftmp4], >- %[ftmp5], %[tmp0], %[ftmp6], %[ftmp7]) >+ %[ftmp1], %[ftmp2], %[ftmp3], %[ftmp4]) > > MMI_SDC1(%[ftmp15], %[dst], 0x00) > MMI_SDC1(%[ftmp16], %[dst], 0x10) >@@ -257,8 +256,7 @@ void ff_vc1_inv_trans_8x8_mmi(int16_t block[64]) > MMI_SDC1(%[ftmp18], %[dst], 0x30) > > TRANSPOSE_4H(%[ftmp19], %[ftmp20], %[ftmp21], %[ftmp22], >- %[ftmp1], %[ftmp2], %[ftmp3], %[ftmp4], >- %[ftmp5], %[tmp0], %[ftmp6], %[ftmp7]) >+ %[ftmp1], %[ftmp2], %[ftmp3], %[ftmp4]) > > MMI_SDC1(%[ftmp19], %[dst], 0x08) > MMI_SDC1(%[ftmp20], %[dst], 0x18) >@@ -301,8 +299,7 @@ void ff_vc1_inv_trans_8x8_mmi(int16_t block[64]) > 0xfff70004, 0xfff0000f, %[ff_pw_4]) > > TRANSPOSE_4H(%[ftmp15], %[ftmp16], %[ftmp17], %[ftmp18], >- %[ftmp1], %[ftmp2], %[ftmp3], %[ftmp4], >- %[ftmp5], %[tmp0], %[ftmp6], %[ftmp7]) >+ %[ftmp1], %[ftmp2], %[ftmp3], %[ftmp4]) > > MMI_SDC1(%[ftmp15], %[dst], 0x40) > MMI_SDC1(%[ftmp16], %[dst], 0x50) >@@ -310,8 +307,7 @@ void 
ff_vc1_inv_trans_8x8_mmi(int16_t block[64]) > MMI_SDC1(%[ftmp18], %[dst], 0x70) > > TRANSPOSE_4H(%[ftmp19], %[ftmp20], %[ftmp21], %[ftmp22], >- %[ftmp1], %[ftmp2], %[ftmp3], %[ftmp4], >- %[ftmp5], %[tmp0], %[ftmp6], %[ftmp7]) >+ %[ftmp1], %[ftmp2], %[ftmp3], %[ftmp4]) > > MMI_SDC1(%[ftmp19], %[dst], 0x48) > MMI_SDC1(%[ftmp20], %[dst], 0x58) >diff --git a/libavcodec/mips/vp8dsp_mmi.c b/libavcodec/mips/vp8dsp_mmi.c >index b24a87a..bd80aa1 100644 >--- a/libavcodec/mips/vp8dsp_mmi.c >+++ b/libavcodec/mips/vp8dsp_mmi.c >@@ -44,58 +44,6 @@ > "punpcklbh "#dst_r", "#src", %[db_2] \n\t" \ > "punpckhbh "#dst_l", "#src", %[db_2] \n\t" > >-#define MMI_TRANSPOSE8x8_UB_UB(src_0, src_1, src_2, src_3, \ >- src_4, src_5, src_6, src_7, \ >- dst_0, dst_1, dst_2, dst_3, \ >- dst_4, dst_5, dst_6, dst_7) \ >- "li %[it_1], 0xe4 \n\t" \ >- "dmtc1 %[it_1], %[db_1] \n\t" \ >- "pshufh %[db_2], "#src_0", %[db_1] \n\t" \ >- "punpcklbh "#dst_0", "#src_0", "#src_1" \n\t" \ >- "punpckhbh "#dst_1", %[db_2], "#src_1" \n\t" \ >- "pshufh %[db_2], "#src_2", %[db_1] \n\t" \ >- "punpcklbh "#dst_2", "#src_2", "#src_3" \n\t" \ >- "punpckhbh "#dst_3", %[db_2], "#src_3" \n\t" \ >- "pshufh %[db_2], "#src_4", %[db_1] \n\t" \ >- "punpcklbh "#dst_4", "#src_4", "#src_5" \n\t" \ >- "punpckhbh "#dst_5", %[db_2], "#src_5" \n\t" \ >- "pshufh %[db_2], "#src_6", %[db_1] \n\t" \ >- "punpcklbh "#dst_6", "#src_6", "#src_7" \n\t" \ >- "punpckhbh "#dst_7", %[db_2], "#src_7" \n\t" \ >- \ >- "pshufh %[db_2], "#dst_0", %[db_1] \n\t" \ >- "punpcklhw "#dst_0", "#dst_0", "#dst_2" \n\t" \ >- "punpckhhw "#dst_2", %[db_2], "#dst_2" \n\t" \ >- "pshufh %[db_2], "#dst_1", %[db_1] \n\t" \ >- "punpcklhw "#dst_1", "#dst_1", "#dst_3" \n\t" \ >- "punpckhhw "#dst_3", %[db_2], "#dst_3" \n\t" \ >- "pshufh %[db_2], "#dst_4", %[db_1] \n\t" \ >- "punpcklhw "#dst_4", "#dst_4", "#dst_6" \n\t" \ >- "punpckhhw "#dst_6", %[db_2], "#dst_6" \n\t" \ >- "pshufh %[db_2], "#dst_5", %[db_1] \n\t" \ >- "punpcklhw "#dst_5", "#dst_5", "#dst_7" \n\t" \ >- 
"punpckhhw "#dst_7", %[db_2], "#dst_7" \n\t" \ >- \ >- "pshufh %[db_2], "#dst_0", %[db_1] \n\t" \ >- "punpcklwd "#dst_0", "#dst_0", "#dst_4" \n\t" \ >- "punpckhwd "#dst_4", %[db_2], "#dst_4" \n\t" \ >- "pshufh %[db_2], "#dst_1", %[db_1] \n\t" \ >- "punpcklwd "#dst_1", "#dst_1", "#dst_5" \n\t" \ >- "punpckhwd "#dst_5", %[db_2], "#dst_5" \n\t" \ >- "pshufh %[db_2], "#dst_2", %[db_1] \n\t" \ >- "punpcklwd "#dst_2", "#dst_2", "#dst_6" \n\t" \ >- "punpckhwd "#dst_6", %[db_2], "#dst_6" \n\t" \ >- "pshufh %[db_2], "#dst_3", %[db_1] \n\t" \ >- "punpcklwd "#dst_3", "#dst_3", "#dst_7" \n\t" \ >- "punpckhwd "#dst_7", %[db_2], "#dst_7" \n\t" \ >- \ >- "pshufh %[db_2], "#dst_1", %[db_1] \n\t" \ >- "pshufh "#dst_1", "#dst_4", %[db_1] \n\t" \ >- "pshufh "#dst_4", %[db_2], %[db_1] \n\t" \ >- "pshufh %[db_2], "#dst_3", %[db_1] \n\t" \ >- "pshufh "#dst_3", "#dst_6", %[db_1] \n\t" \ >- "pshufh "#dst_6", %[db_2], %[db_1] \n\t" >- > #define MMI_VP8_LOOP_FILTER \ > /* Calculation of hev */ \ > "dmtc1 %[thresh], %[ftmp3] \n\t" \ >@@ -952,16 +900,14 @@ static av_always_inline void vp8_h_loop_filter8_mmi(uint8_t *dst, > "gsldlc1 %[q3], 0x03(%[tmp0]) \n\t" > "gsldrc1 %[q3], -0x04(%[tmp0]) \n\t" > /* Matrix transpose */ >- MMI_TRANSPOSE8x8_UB_UB(%[p3], %[p2], %[p1], %[p0], >- %[q0], %[q1], %[q2], %[q3], >- %[p3], %[p2], %[p1], %[p0], >- %[q0], %[q1], %[q2], %[q3]) >+ TRANSPOSE_8B(%[p3], %[p2], %[p1], %[p0], >+ %[q0], %[q1], %[q2], %[q3], >+ %[ftmp1], %[ftmp2], %[ftmp3], %[ftmp4]) > MMI_VP8_LOOP_FILTER > /* Matrix transpose */ >- MMI_TRANSPOSE8x8_UB_UB(%[p3], %[p2], %[p1], %[p0], >- %[q0], %[q1], %[q2], %[q3], >- %[p3], %[p2], %[p1], %[p0], >- %[q0], %[q1], %[q2], %[q3]) >+ TRANSPOSE_8B(%[p3], %[p2], %[p1], %[p0], >+ %[q0], %[q1], %[q2], %[q3], >+ %[ftmp1], %[ftmp2], %[ftmp3], %[ftmp4]) > /* Move to dst */ > "gssdlc1 %[p3], 0x03(%[dst]) \n\t" > "gssdrc1 %[p3], -0x04(%[dst]) \n\t" >@@ -1233,8 +1179,7 @@ void ff_vp8_idct_add_mmi(uint8_t *dst, int16_t block[16], ptrdiff_t stride) > 
MMI_SDC1(%[ftmp0], %[block], 0x18) > > TRANSPOSE_4H(%[ftmp1], %[ftmp2], %[ftmp3], %[ftmp4], >- %[ftmp5], %[ftmp6], %[ftmp7], %[ftmp8], >- %[ftmp9], %[tmp0], %[ftmp0], %[ftmp10]) >+ %[ftmp5], %[ftmp6], %[ftmp7], %[ftmp8]) > > // t[0 4 8 12] > "paddh %[ftmp5], %[ftmp1], %[ftmp3] \n\t" >@@ -1269,8 +1214,7 @@ void ff_vp8_idct_add_mmi(uint8_t *dst, int16_t block[16], ptrdiff_t stride) > "psrah %[ftmp4], %[ftmp4], %[ftmp11] \n\t" > > TRANSPOSE_4H(%[ftmp1], %[ftmp2], %[ftmp3], %[ftmp4], >- %[ftmp5], %[ftmp6], %[ftmp7], %[ftmp8], >- %[ftmp9], %[tmp0], %[ftmp0], %[ftmp10]) >+ %[ftmp5], %[ftmp6], %[ftmp7], %[ftmp8]) > > MMI_LWC1(%[ftmp5], %[dst0], 0x00) > MMI_LWC1(%[ftmp6], %[dst1], 0x00) >diff --git a/libavutil/mips/mmiutils.h b/libavutil/mips/mmiutils.h >index b16edc4..76b1199 100644 >--- a/libavutil/mips/mmiutils.h >+++ b/libavutil/mips/mmiutils.h >@@ -250,30 +250,53 @@ > : "memory" \ > ); > >-#define TRANSPOSE_4H(m1, m2, m3, m4, t1, t2, t3, t4, t5, r1, zero, shift) \ >- "li "#r1", 0x93 \n\t" \ >- "xor "#zero","#zero","#zero" \n\t" \ >- "mtc1 "#r1", "#shift" \n\t" \ >- "punpcklhw "#t1", "#m1", "#zero" \n\t" \ >- "punpcklhw "#t5", "#m2", "#zero" \n\t" \ >- "pshufh "#t5", "#t5", "#shift" \n\t" \ >- "or "#t1", "#t1", "#t5" \n\t" \ >- "punpckhhw "#t2", "#m1", "#zero" \n\t" \ >- "punpckhhw "#t5", "#m2", "#zero" \n\t" \ >- "pshufh "#t5", "#t5", "#shift" \n\t" \ >- "or "#t2", "#t2", "#t5" \n\t" \ >- "punpcklhw "#t3", "#m3", "#zero" \n\t" \ >- "punpcklhw "#t5", "#m4", "#zero" \n\t" \ >- "pshufh "#t5", "#t5", "#shift" \n\t" \ >- "or "#t3", "#t3", "#t5" \n\t" \ >- "punpckhhw "#t4", "#m3", "#zero" \n\t" \ >- "punpckhhw "#t5", "#m4", "#zero" \n\t" \ >- "pshufh "#t5", "#t5", "#shift" \n\t" \ >- "or "#t4", "#t4", "#t5" \n\t" \ >- "punpcklwd "#m1", "#t1", "#t3" \n\t" \ >- "punpckhwd "#m2", "#t1", "#t3" \n\t" \ >- "punpcklwd "#m3", "#t2", "#t4" \n\t" \ >- "punpckhwd "#m4", "#t2", "#t4" \n\t" >+/** >+ * brief: Transpose 4X4 half word packaged data. 
>+ * fr_i0, fr_i1, fr_i2, fr_i3: src & dst >+ * fr_t0, fr_t1, fr_t2, fr_t3: temporary register >+ */ >+#define TRANSPOSE_4H(fr_i0, fr_i1, fr_i2, fr_i3, \ >+ fr_t0, fr_t1, fr_t2, fr_t3) \ >+ "punpcklhw "#fr_t0", "#fr_i0", "#fr_i1" \n\t" \ >+ "punpckhhw "#fr_t1", "#fr_i0", "#fr_i1" \n\t" \ >+ "punpcklhw "#fr_t2", "#fr_i2", "#fr_i3" \n\t" \ >+ "punpckhhw "#fr_t3", "#fr_i2", "#fr_i3" \n\t" \ >+ "punpcklwd "#fr_i0", "#fr_t0", "#fr_t2" \n\t" \ >+ "punpckhwd "#fr_i1", "#fr_t0", "#fr_t2" \n\t" \ >+ "punpcklwd "#fr_i2", "#fr_t1", "#fr_t3" \n\t" \ >+ "punpckhwd "#fr_i3", "#fr_t1", "#fr_t3" \n\t" >+ >+/** >+ * brief: Transpose 8x8 byte packaged data. >+ * fr_i0~i7: src & dst >+ * fr_t0~t3: temporary register >+ */ >+#define TRANSPOSE_8B(fr_i0, fr_i1, fr_i2, fr_i3, fr_i4, fr_i5, \ >+ fr_i6, fr_i7, fr_t0, fr_t1, fr_t2, fr_t3) \ >+ "punpcklbh "#fr_t0", "#fr_i0", "#fr_i1" \n\t" \ >+ "punpckhbh "#fr_t1", "#fr_i0", "#fr_i1" \n\t" \ >+ "punpcklbh "#fr_t2", "#fr_i2", "#fr_i3" \n\t" \ >+ "punpckhbh "#fr_t3", "#fr_i2", "#fr_i3" \n\t" \ >+ "punpcklbh "#fr_i0", "#fr_i4", "#fr_i5" \n\t" \ >+ "punpckhbh "#fr_i1", "#fr_i4", "#fr_i5" \n\t" \ >+ "punpcklbh "#fr_i2", "#fr_i6", "#fr_i7" \n\t" \ >+ "punpckhbh "#fr_i3", "#fr_i6", "#fr_i7" \n\t" \ >+ "punpcklhw "#fr_i4", "#fr_t0", "#fr_t2" \n\t" \ >+ "punpckhhw "#fr_i5", "#fr_t0", "#fr_t2" \n\t" \ >+ "punpcklhw "#fr_i6", "#fr_t1", "#fr_t3" \n\t" \ >+ "punpckhhw "#fr_i7", "#fr_t1", "#fr_t3" \n\t" \ >+ "punpcklhw "#fr_t0", "#fr_i0", "#fr_i2" \n\t" \ >+ "punpckhhw "#fr_t1", "#fr_i0", "#fr_i2" \n\t" \ >+ "punpcklhw "#fr_t2", "#fr_i1", "#fr_i3" \n\t" \ >+ "punpckhhw "#fr_t3", "#fr_i1", "#fr_i3" \n\t" \ >+ "punpcklwd "#fr_i0", "#fr_i4", "#fr_t0" \n\t" \ >+ "punpckhwd "#fr_i1", "#fr_i4", "#fr_t0" \n\t" \ >+ "punpcklwd "#fr_i2", "#fr_i5", "#fr_t1" \n\t" \ >+ "punpckhwd "#fr_i3", "#fr_i5", "#fr_t1" \n\t" \ >+ "punpcklwd "#fr_i4", "#fr_i6", "#fr_t2" \n\t" \ >+ "punpckhwd "#fr_i5", "#fr_i6", "#fr_t2" \n\t" \ >+ "punpcklwd "#fr_i6", "#fr_i7", "#fr_t3" \n\t" \ 
>+ "punpckhwd "#fr_i7", "#fr_i7", "#fr_t3" \n\t" > > /** > * brief: Parallel SRA for 8 byte packaged data. >@@ -303,15 +326,14 @@ > "psrlh "#fr_t1", "#fr_t1", "#fr_i1" \n\t" \ > "packsshb "#fr_d0", "#fr_t0", "#fr_t1" \n\t" > >- >-#define PSRAH_4_MMI(fp1, fp2, fp3, fp4, shift) \ >- "psrah "#fp1", "#fp1", "#shift" \n\t" \ >- "psrah "#fp2", "#fp2", "#shift" \n\t" \ >- "psrah "#fp3", "#fp3", "#shift" \n\t" \ >+#define PSRAH_4_MMI(fp1, fp2, fp3, fp4, shift) \ >+ "psrah "#fp1", "#fp1", "#shift" \n\t" \ >+ "psrah "#fp2", "#fp2", "#shift" \n\t" \ >+ "psrah "#fp3", "#fp3", "#shift" \n\t" \ > "psrah "#fp4", "#fp4", "#shift" \n\t" > >-#define PSRAH_8_MMI(fp1, fp2, fp3, fp4, fp5, fp6, fp7, fp8, shift) \ >- PSRAH_4_MMI(fp1, fp2, fp3, fp4, shift) \ >+#define PSRAH_8_MMI(fp1, fp2, fp3, fp4, fp5, fp6, fp7, fp8, shift) \ >+ PSRAH_4_MMI(fp1, fp2, fp3, fp4, shift) \ > PSRAH_4_MMI(fp5, fp6, fp7, fp8, shift) > > >-- >2.1.0 Hi Michael, could you please help review this patch? BTW, this patch is based on the previous patch "[PATCH 2/2] avcodec/mips: [loongson] optimize vp8 decoding in vp8dsp", so please merge that one first. Thank you very much.
On Fri, Sep 07, 2018 at 11:51:05AM +0800, Shiyou Yin wrote: > >-----Original Message----- > >From: ffmpeg-devel-bounces@ffmpeg.org [mailto:ffmpeg-devel-bounces@ffmpeg.org] On Behalf Of > >Shiyou Yin > >Sent: Thursday, September 6, 2018 4:11 PM > >To: ffmpeg-devel@ffmpeg.org > >Subject: [FFmpeg-devel] [PATCH 2/2] avutil/mips: [loongson] simplify macro TRANSPOSE_4H and > >TRANSPOSE_8B > > > >Simplify macro TRANSPOSE_4H in mmiutils.h and add TRANSPOSE_8B as a common macro. > >--- > > libavcodec/mips/vc1dsp_mmi.c | 12 +++---- > > libavcodec/mips/vp8dsp_mmi.c | 72 +++++-------------------------------- > > libavutil/mips/mmiutils.h | 84 ++++++++++++++++++++++++++++---------------- > > 3 files changed, 65 insertions(+), 103 deletions(-) > > > >diff --git a/libavcodec/mips/vc1dsp_mmi.c b/libavcodec/mips/vc1dsp_mmi.c > >index a439b40..80778a5 100644 > >--- a/libavcodec/mips/vc1dsp_mmi.c > >+++ b/libavcodec/mips/vc1dsp_mmi.c > >@@ -248,8 +248,7 @@ void ff_vc1_inv_trans_8x8_mmi(int16_t block[64]) > > 0xfff70004, 0xfff0000f, %[ff_pw_4]) > > > > TRANSPOSE_4H(%[ftmp15], %[ftmp16], %[ftmp17], %[ftmp18], > >- %[ftmp1], %[ftmp2], %[ftmp3], %[ftmp4], > >- %[ftmp5], %[tmp0], %[ftmp6], %[ftmp7]) > >+ %[ftmp1], %[ftmp2], %[ftmp3], %[ftmp4]) > > > > MMI_SDC1(%[ftmp15], %[dst], 0x00) > > MMI_SDC1(%[ftmp16], %[dst], 0x10) > >@@ -257,8 +256,7 @@ void ff_vc1_inv_trans_8x8_mmi(int16_t block[64]) > > MMI_SDC1(%[ftmp18], %[dst], 0x30) > > > > TRANSPOSE_4H(%[ftmp19], %[ftmp20], %[ftmp21], %[ftmp22], > >- %[ftmp1], %[ftmp2], %[ftmp3], %[ftmp4], > >- %[ftmp5], %[tmp0], %[ftmp6], %[ftmp7]) > >+ %[ftmp1], %[ftmp2], %[ftmp3], %[ftmp4]) > > > > MMI_SDC1(%[ftmp19], %[dst], 0x08) > > MMI_SDC1(%[ftmp20], %[dst], 0x18) > >@@ -301,8 +299,7 @@ void ff_vc1_inv_trans_8x8_mmi(int16_t block[64]) > > 0xfff70004, 0xfff0000f, %[ff_pw_4]) > > > > TRANSPOSE_4H(%[ftmp15], %[ftmp16], %[ftmp17], %[ftmp18], > >- %[ftmp1], %[ftmp2], %[ftmp3], %[ftmp4], > >- %[ftmp5], %[tmp0], %[ftmp6], %[ftmp7]) > >+ %[ftmp1], %[ftmp2], 
%[ftmp3], %[ftmp4]) > > > > MMI_SDC1(%[ftmp15], %[dst], 0x40) > > MMI_SDC1(%[ftmp16], %[dst], 0x50) > >@@ -310,8 +307,7 @@ void ff_vc1_inv_trans_8x8_mmi(int16_t block[64]) > > MMI_SDC1(%[ftmp18], %[dst], 0x70) > > > > TRANSPOSE_4H(%[ftmp19], %[ftmp20], %[ftmp21], %[ftmp22], > >- %[ftmp1], %[ftmp2], %[ftmp3], %[ftmp4], > >- %[ftmp5], %[tmp0], %[ftmp6], %[ftmp7]) > >+ %[ftmp1], %[ftmp2], %[ftmp3], %[ftmp4]) > > > > MMI_SDC1(%[ftmp19], %[dst], 0x48) > > MMI_SDC1(%[ftmp20], %[dst], 0x58) > >diff --git a/libavcodec/mips/vp8dsp_mmi.c b/libavcodec/mips/vp8dsp_mmi.c > >index b24a87a..bd80aa1 100644 > >--- a/libavcodec/mips/vp8dsp_mmi.c > >+++ b/libavcodec/mips/vp8dsp_mmi.c > >@@ -44,58 +44,6 @@ > > "punpcklbh "#dst_r", "#src", %[db_2] \n\t" \ > > "punpckhbh "#dst_l", "#src", %[db_2] \n\t" > > > >-#define MMI_TRANSPOSE8x8_UB_UB(src_0, src_1, src_2, src_3, \ > >- src_4, src_5, src_6, src_7, \ > >- dst_0, dst_1, dst_2, dst_3, \ > >- dst_4, dst_5, dst_6, dst_7) \ > >- "li %[it_1], 0xe4 \n\t" \ > >- "dmtc1 %[it_1], %[db_1] \n\t" \ > >- "pshufh %[db_2], "#src_0", %[db_1] \n\t" \ > >- "punpcklbh "#dst_0", "#src_0", "#src_1" \n\t" \ > >- "punpckhbh "#dst_1", %[db_2], "#src_1" \n\t" \ > >- "pshufh %[db_2], "#src_2", %[db_1] \n\t" \ > >- "punpcklbh "#dst_2", "#src_2", "#src_3" \n\t" \ > >- "punpckhbh "#dst_3", %[db_2], "#src_3" \n\t" \ > >- "pshufh %[db_2], "#src_4", %[db_1] \n\t" \ > >- "punpcklbh "#dst_4", "#src_4", "#src_5" \n\t" \ > >- "punpckhbh "#dst_5", %[db_2], "#src_5" \n\t" \ > >- "pshufh %[db_2], "#src_6", %[db_1] \n\t" \ > >- "punpcklbh "#dst_6", "#src_6", "#src_7" \n\t" \ > >- "punpckhbh "#dst_7", %[db_2], "#src_7" \n\t" \ > >- \ > >- "pshufh %[db_2], "#dst_0", %[db_1] \n\t" \ > >- "punpcklhw "#dst_0", "#dst_0", "#dst_2" \n\t" \ > >- "punpckhhw "#dst_2", %[db_2], "#dst_2" \n\t" \ > >- "pshufh %[db_2], "#dst_1", %[db_1] \n\t" \ > >- "punpcklhw "#dst_1", "#dst_1", "#dst_3" \n\t" \ > >- "punpckhhw "#dst_3", %[db_2], "#dst_3" \n\t" \ > >- "pshufh %[db_2], "#dst_4", %[db_1] 
\n\t" \ > >- "punpcklhw "#dst_4", "#dst_4", "#dst_6" \n\t" \ > >- "punpckhhw "#dst_6", %[db_2], "#dst_6" \n\t" \ > >- "pshufh %[db_2], "#dst_5", %[db_1] \n\t" \ > >- "punpcklhw "#dst_5", "#dst_5", "#dst_7" \n\t" \ > >- "punpckhhw "#dst_7", %[db_2], "#dst_7" \n\t" \ > >- \ > >- "pshufh %[db_2], "#dst_0", %[db_1] \n\t" \ > >- "punpcklwd "#dst_0", "#dst_0", "#dst_4" \n\t" \ > >- "punpckhwd "#dst_4", %[db_2], "#dst_4" \n\t" \ > >- "pshufh %[db_2], "#dst_1", %[db_1] \n\t" \ > >- "punpcklwd "#dst_1", "#dst_1", "#dst_5" \n\t" \ > >- "punpckhwd "#dst_5", %[db_2], "#dst_5" \n\t" \ > >- "pshufh %[db_2], "#dst_2", %[db_1] \n\t" \ > >- "punpcklwd "#dst_2", "#dst_2", "#dst_6" \n\t" \ > >- "punpckhwd "#dst_6", %[db_2], "#dst_6" \n\t" \ > >- "pshufh %[db_2], "#dst_3", %[db_1] \n\t" \ > >- "punpcklwd "#dst_3", "#dst_3", "#dst_7" \n\t" \ > >- "punpckhwd "#dst_7", %[db_2], "#dst_7" \n\t" \ > >- \ > >- "pshufh %[db_2], "#dst_1", %[db_1] \n\t" \ > >- "pshufh "#dst_1", "#dst_4", %[db_1] \n\t" \ > >- "pshufh "#dst_4", %[db_2], %[db_1] \n\t" \ > >- "pshufh %[db_2], "#dst_3", %[db_1] \n\t" \ > >- "pshufh "#dst_3", "#dst_6", %[db_1] \n\t" \ > >- "pshufh "#dst_6", %[db_2], %[db_1] \n\t" > >- > > #define MMI_VP8_LOOP_FILTER \ > > /* Calculation of hev */ \ > > "dmtc1 %[thresh], %[ftmp3] \n\t" \ > >@@ -952,16 +900,14 @@ static av_always_inline void vp8_h_loop_filter8_mmi(uint8_t *dst, > > "gsldlc1 %[q3], 0x03(%[tmp0]) \n\t" > > "gsldrc1 %[q3], -0x04(%[tmp0]) \n\t" > > /* Matrix transpose */ > >- MMI_TRANSPOSE8x8_UB_UB(%[p3], %[p2], %[p1], %[p0], > >- %[q0], %[q1], %[q2], %[q3], > >- %[p3], %[p2], %[p1], %[p0], > >- %[q0], %[q1], %[q2], %[q3]) > >+ TRANSPOSE_8B(%[p3], %[p2], %[p1], %[p0], > >+ %[q0], %[q1], %[q2], %[q3], > >+ %[ftmp1], %[ftmp2], %[ftmp3], %[ftmp4]) > > MMI_VP8_LOOP_FILTER > > /* Matrix transpose */ > >- MMI_TRANSPOSE8x8_UB_UB(%[p3], %[p2], %[p1], %[p0], > >- %[q0], %[q1], %[q2], %[q3], > >- %[p3], %[p2], %[p1], %[p0], > >- %[q0], %[q1], %[q2], %[q3]) > >+ TRANSPOSE_8B(%[p3], 
%[p2], %[p1], %[p0], > >+ %[q0], %[q1], %[q2], %[q3], > >+ %[ftmp1], %[ftmp2], %[ftmp3], %[ftmp4]) > > /* Move to dst */ > > "gssdlc1 %[p3], 0x03(%[dst]) \n\t" > > "gssdrc1 %[p3], -0x04(%[dst]) \n\t" > >@@ -1233,8 +1179,7 @@ void ff_vp8_idct_add_mmi(uint8_t *dst, int16_t block[16], ptrdiff_t stride) > > MMI_SDC1(%[ftmp0], %[block], 0x18) > > > > TRANSPOSE_4H(%[ftmp1], %[ftmp2], %[ftmp3], %[ftmp4], > >- %[ftmp5], %[ftmp6], %[ftmp7], %[ftmp8], > >- %[ftmp9], %[tmp0], %[ftmp0], %[ftmp10]) > >+ %[ftmp5], %[ftmp6], %[ftmp7], %[ftmp8]) > > > > // t[0 4 8 12] > > "paddh %[ftmp5], %[ftmp1], %[ftmp3] \n\t" > >@@ -1269,8 +1214,7 @@ void ff_vp8_idct_add_mmi(uint8_t *dst, int16_t block[16], ptrdiff_t stride) > > "psrah %[ftmp4], %[ftmp4], %[ftmp11] \n\t" > > > > TRANSPOSE_4H(%[ftmp1], %[ftmp2], %[ftmp3], %[ftmp4], > >- %[ftmp5], %[ftmp6], %[ftmp7], %[ftmp8], > >- %[ftmp9], %[tmp0], %[ftmp0], %[ftmp10]) > >+ %[ftmp5], %[ftmp6], %[ftmp7], %[ftmp8]) > > > > MMI_LWC1(%[ftmp5], %[dst0], 0x00) > > MMI_LWC1(%[ftmp6], %[dst1], 0x00) > >diff --git a/libavutil/mips/mmiutils.h b/libavutil/mips/mmiutils.h > >index b16edc4..76b1199 100644 > >--- a/libavutil/mips/mmiutils.h > >+++ b/libavutil/mips/mmiutils.h > >@@ -250,30 +250,53 @@ > > : "memory" \ > > ); > > > >-#define TRANSPOSE_4H(m1, m2, m3, m4, t1, t2, t3, t4, t5, r1, zero, shift) \ > >- "li "#r1", 0x93 \n\t" \ > >- "xor "#zero","#zero","#zero" \n\t" \ > >- "mtc1 "#r1", "#shift" \n\t" \ > >- "punpcklhw "#t1", "#m1", "#zero" \n\t" \ > >- "punpcklhw "#t5", "#m2", "#zero" \n\t" \ > >- "pshufh "#t5", "#t5", "#shift" \n\t" \ > >- "or "#t1", "#t1", "#t5" \n\t" \ > >- "punpckhhw "#t2", "#m1", "#zero" \n\t" \ > >- "punpckhhw "#t5", "#m2", "#zero" \n\t" \ > >- "pshufh "#t5", "#t5", "#shift" \n\t" \ > >- "or "#t2", "#t2", "#t5" \n\t" \ > >- "punpcklhw "#t3", "#m3", "#zero" \n\t" \ > >- "punpcklhw "#t5", "#m4", "#zero" \n\t" \ > >- "pshufh "#t5", "#t5", "#shift" \n\t" \ > >- "or "#t3", "#t3", "#t5" \n\t" \ > >- "punpckhhw "#t4", "#m3", "#zero" 
\n\t" \ > >- "punpckhhw "#t5", "#m4", "#zero" \n\t" \ > >- "pshufh "#t5", "#t5", "#shift" \n\t" \ > >- "or "#t4", "#t4", "#t5" \n\t" \ > >- "punpcklwd "#m1", "#t1", "#t3" \n\t" \ > >- "punpckhwd "#m2", "#t1", "#t3" \n\t" \ > >- "punpcklwd "#m3", "#t2", "#t4" \n\t" \ > >- "punpckhwd "#m4", "#t2", "#t4" \n\t" > >+/** > >+ * brief: Transpose 4X4 half word packaged data. > >+ * fr_i0, fr_i1, fr_i2, fr_i3: src & dst > >+ * fr_t0, fr_t1, fr_t2, fr_t3: temporary register > >+ */ > >+#define TRANSPOSE_4H(fr_i0, fr_i1, fr_i2, fr_i3, \ > >+ fr_t0, fr_t1, fr_t2, fr_t3) \ > >+ "punpcklhw "#fr_t0", "#fr_i0", "#fr_i1" \n\t" \ > >+ "punpckhhw "#fr_t1", "#fr_i0", "#fr_i1" \n\t" \ > >+ "punpcklhw "#fr_t2", "#fr_i2", "#fr_i3" \n\t" \ > >+ "punpckhhw "#fr_t3", "#fr_i2", "#fr_i3" \n\t" \ > >+ "punpcklwd "#fr_i0", "#fr_t0", "#fr_t2" \n\t" \ > >+ "punpckhwd "#fr_i1", "#fr_t0", "#fr_t2" \n\t" \ > >+ "punpcklwd "#fr_i2", "#fr_t1", "#fr_t3" \n\t" \ > >+ "punpckhwd "#fr_i3", "#fr_t1", "#fr_t3" \n\t" > >+ > >+/** > >+ * brief: Transpose 8x8 byte packaged data. 
> >+ * fr_i0~i7: src & dst > >+ * fr_t0~t3: temporary register > >+ */ > >+#define TRANSPOSE_8B(fr_i0, fr_i1, fr_i2, fr_i3, fr_i4, fr_i5, \ > >+ fr_i6, fr_i7, fr_t0, fr_t1, fr_t2, fr_t3) \ > >+ "punpcklbh "#fr_t0", "#fr_i0", "#fr_i1" \n\t" \ > >+ "punpckhbh "#fr_t1", "#fr_i0", "#fr_i1" \n\t" \ > >+ "punpcklbh "#fr_t2", "#fr_i2", "#fr_i3" \n\t" \ > >+ "punpckhbh "#fr_t3", "#fr_i2", "#fr_i3" \n\t" \ > >+ "punpcklbh "#fr_i0", "#fr_i4", "#fr_i5" \n\t" \ > >+ "punpckhbh "#fr_i1", "#fr_i4", "#fr_i5" \n\t" \ > >+ "punpcklbh "#fr_i2", "#fr_i6", "#fr_i7" \n\t" \ > >+ "punpckhbh "#fr_i3", "#fr_i6", "#fr_i7" \n\t" \ > >+ "punpcklhw "#fr_i4", "#fr_t0", "#fr_t2" \n\t" \ > >+ "punpckhhw "#fr_i5", "#fr_t0", "#fr_t2" \n\t" \ > >+ "punpcklhw "#fr_i6", "#fr_t1", "#fr_t3" \n\t" \ > >+ "punpckhhw "#fr_i7", "#fr_t1", "#fr_t3" \n\t" \ > >+ "punpcklhw "#fr_t0", "#fr_i0", "#fr_i2" \n\t" \ > >+ "punpckhhw "#fr_t1", "#fr_i0", "#fr_i2" \n\t" \ > >+ "punpcklhw "#fr_t2", "#fr_i1", "#fr_i3" \n\t" \ > >+ "punpckhhw "#fr_t3", "#fr_i1", "#fr_i3" \n\t" \ > >+ "punpcklwd "#fr_i0", "#fr_i4", "#fr_t0" \n\t" \ > >+ "punpckhwd "#fr_i1", "#fr_i4", "#fr_t0" \n\t" \ > >+ "punpcklwd "#fr_i2", "#fr_i5", "#fr_t1" \n\t" \ > >+ "punpckhwd "#fr_i3", "#fr_i5", "#fr_t1" \n\t" \ > >+ "punpcklwd "#fr_i4", "#fr_i6", "#fr_t2" \n\t" \ > >+ "punpckhwd "#fr_i5", "#fr_i6", "#fr_t2" \n\t" \ > >+ "punpcklwd "#fr_i6", "#fr_i7", "#fr_t3" \n\t" \ > >+ "punpckhwd "#fr_i7", "#fr_i7", "#fr_t3" \n\t" > > > > /** > > * brief: Parallel SRA for 8 byte packaged data. 
> >@@ -303,15 +326,14 @@ > > "psrlh "#fr_t1", "#fr_t1", "#fr_i1" \n\t" \ > > "packsshb "#fr_d0", "#fr_t0", "#fr_t1" \n\t" > > > >- > >-#define PSRAH_4_MMI(fp1, fp2, fp3, fp4, shift) \ > >- "psrah "#fp1", "#fp1", "#shift" \n\t" \ > >- "psrah "#fp2", "#fp2", "#shift" \n\t" \ > >- "psrah "#fp3", "#fp3", "#shift" \n\t" \ > >+#define PSRAH_4_MMI(fp1, fp2, fp3, fp4, shift) \ > >+ "psrah "#fp1", "#fp1", "#shift" \n\t" \ > >+ "psrah "#fp2", "#fp2", "#shift" \n\t" \ > >+ "psrah "#fp3", "#fp3", "#shift" \n\t" \ > > "psrah "#fp4", "#fp4", "#shift" \n\t" > > > >-#define PSRAH_8_MMI(fp1, fp2, fp3, fp4, fp5, fp6, fp7, fp8, shift) \ > >- PSRAH_4_MMI(fp1, fp2, fp3, fp4, shift) \ > >+#define PSRAH_8_MMI(fp1, fp2, fp3, fp4, fp5, fp6, fp7, fp8, shift) \ > >+ PSRAH_4_MMI(fp1, fp2, fp3, fp4, shift) \ > > PSRAH_4_MMI(fp5, fp6, fp7, fp8, shift) > > > > > >-- > >2.1.0 > > Hi Michael, could you please help to review this patch. > BTW, this patch was based on the previous patch" [PATCH 2/2] avcodec/mips: [loongson] optimize vp8 decoding in vp8dsp.", > you'd better merge it first. Thank you very much. will apply both thanks [...]
diff --git a/libavcodec/mips/vc1dsp_mmi.c b/libavcodec/mips/vc1dsp_mmi.c index a439b40..80778a5 100644 --- a/libavcodec/mips/vc1dsp_mmi.c +++ b/libavcodec/mips/vc1dsp_mmi.c @@ -248,8 +248,7 @@ void ff_vc1_inv_trans_8x8_mmi(int16_t block[64]) 0xfff70004, 0xfff0000f, %[ff_pw_4]) TRANSPOSE_4H(%[ftmp15], %[ftmp16], %[ftmp17], %[ftmp18], - %[ftmp1], %[ftmp2], %[ftmp3], %[ftmp4], - %[ftmp5], %[tmp0], %[ftmp6], %[ftmp7]) + %[ftmp1], %[ftmp2], %[ftmp3], %[ftmp4]) MMI_SDC1(%[ftmp15], %[dst], 0x00) MMI_SDC1(%[ftmp16], %[dst], 0x10) @@ -257,8 +256,7 @@ void ff_vc1_inv_trans_8x8_mmi(int16_t block[64]) MMI_SDC1(%[ftmp18], %[dst], 0x30) TRANSPOSE_4H(%[ftmp19], %[ftmp20], %[ftmp21], %[ftmp22], - %[ftmp1], %[ftmp2], %[ftmp3], %[ftmp4], - %[ftmp5], %[tmp0], %[ftmp6], %[ftmp7]) + %[ftmp1], %[ftmp2], %[ftmp3], %[ftmp4]) MMI_SDC1(%[ftmp19], %[dst], 0x08) MMI_SDC1(%[ftmp20], %[dst], 0x18) @@ -301,8 +299,7 @@ void ff_vc1_inv_trans_8x8_mmi(int16_t block[64]) 0xfff70004, 0xfff0000f, %[ff_pw_4]) TRANSPOSE_4H(%[ftmp15], %[ftmp16], %[ftmp17], %[ftmp18], - %[ftmp1], %[ftmp2], %[ftmp3], %[ftmp4], - %[ftmp5], %[tmp0], %[ftmp6], %[ftmp7]) + %[ftmp1], %[ftmp2], %[ftmp3], %[ftmp4]) MMI_SDC1(%[ftmp15], %[dst], 0x40) MMI_SDC1(%[ftmp16], %[dst], 0x50) @@ -310,8 +307,7 @@ void ff_vc1_inv_trans_8x8_mmi(int16_t block[64]) MMI_SDC1(%[ftmp18], %[dst], 0x70) TRANSPOSE_4H(%[ftmp19], %[ftmp20], %[ftmp21], %[ftmp22], - %[ftmp1], %[ftmp2], %[ftmp3], %[ftmp4], - %[ftmp5], %[tmp0], %[ftmp6], %[ftmp7]) + %[ftmp1], %[ftmp2], %[ftmp3], %[ftmp4]) MMI_SDC1(%[ftmp19], %[dst], 0x48) MMI_SDC1(%[ftmp20], %[dst], 0x58) diff --git a/libavcodec/mips/vp8dsp_mmi.c b/libavcodec/mips/vp8dsp_mmi.c index b24a87a..bd80aa1 100644 --- a/libavcodec/mips/vp8dsp_mmi.c +++ b/libavcodec/mips/vp8dsp_mmi.c @@ -44,58 +44,6 @@ "punpcklbh "#dst_r", "#src", %[db_2] \n\t" \ "punpckhbh "#dst_l", "#src", %[db_2] \n\t" -#define MMI_TRANSPOSE8x8_UB_UB(src_0, src_1, src_2, src_3, \ - src_4, src_5, src_6, src_7, \ - dst_0, dst_1, dst_2, dst_3, \ - 
dst_4, dst_5, dst_6, dst_7) \ - "li %[it_1], 0xe4 \n\t" \ - "dmtc1 %[it_1], %[db_1] \n\t" \ - "pshufh %[db_2], "#src_0", %[db_1] \n\t" \ - "punpcklbh "#dst_0", "#src_0", "#src_1" \n\t" \ - "punpckhbh "#dst_1", %[db_2], "#src_1" \n\t" \ - "pshufh %[db_2], "#src_2", %[db_1] \n\t" \ - "punpcklbh "#dst_2", "#src_2", "#src_3" \n\t" \ - "punpckhbh "#dst_3", %[db_2], "#src_3" \n\t" \ - "pshufh %[db_2], "#src_4", %[db_1] \n\t" \ - "punpcklbh "#dst_4", "#src_4", "#src_5" \n\t" \ - "punpckhbh "#dst_5", %[db_2], "#src_5" \n\t" \ - "pshufh %[db_2], "#src_6", %[db_1] \n\t" \ - "punpcklbh "#dst_6", "#src_6", "#src_7" \n\t" \ - "punpckhbh "#dst_7", %[db_2], "#src_7" \n\t" \ - \ - "pshufh %[db_2], "#dst_0", %[db_1] \n\t" \ - "punpcklhw "#dst_0", "#dst_0", "#dst_2" \n\t" \ - "punpckhhw "#dst_2", %[db_2], "#dst_2" \n\t" \ - "pshufh %[db_2], "#dst_1", %[db_1] \n\t" \ - "punpcklhw "#dst_1", "#dst_1", "#dst_3" \n\t" \ - "punpckhhw "#dst_3", %[db_2], "#dst_3" \n\t" \ - "pshufh %[db_2], "#dst_4", %[db_1] \n\t" \ - "punpcklhw "#dst_4", "#dst_4", "#dst_6" \n\t" \ - "punpckhhw "#dst_6", %[db_2], "#dst_6" \n\t" \ - "pshufh %[db_2], "#dst_5", %[db_1] \n\t" \ - "punpcklhw "#dst_5", "#dst_5", "#dst_7" \n\t" \ - "punpckhhw "#dst_7", %[db_2], "#dst_7" \n\t" \ - \ - "pshufh %[db_2], "#dst_0", %[db_1] \n\t" \ - "punpcklwd "#dst_0", "#dst_0", "#dst_4" \n\t" \ - "punpckhwd "#dst_4", %[db_2], "#dst_4" \n\t" \ - "pshufh %[db_2], "#dst_1", %[db_1] \n\t" \ - "punpcklwd "#dst_1", "#dst_1", "#dst_5" \n\t" \ - "punpckhwd "#dst_5", %[db_2], "#dst_5" \n\t" \ - "pshufh %[db_2], "#dst_2", %[db_1] \n\t" \ - "punpcklwd "#dst_2", "#dst_2", "#dst_6" \n\t" \ - "punpckhwd "#dst_6", %[db_2], "#dst_6" \n\t" \ - "pshufh %[db_2], "#dst_3", %[db_1] \n\t" \ - "punpcklwd "#dst_3", "#dst_3", "#dst_7" \n\t" \ - "punpckhwd "#dst_7", %[db_2], "#dst_7" \n\t" \ - \ - "pshufh %[db_2], "#dst_1", %[db_1] \n\t" \ - "pshufh "#dst_1", "#dst_4", %[db_1] \n\t" \ - "pshufh "#dst_4", %[db_2], %[db_1] \n\t" \ - "pshufh %[db_2], "#dst_3", 
%[db_1] \n\t" \ - "pshufh "#dst_3", "#dst_6", %[db_1] \n\t" \ - "pshufh "#dst_6", %[db_2], %[db_1] \n\t" - #define MMI_VP8_LOOP_FILTER \ /* Calculation of hev */ \ "dmtc1 %[thresh], %[ftmp3] \n\t" \ @@ -952,16 +900,14 @@ static av_always_inline void vp8_h_loop_filter8_mmi(uint8_t *dst, "gsldlc1 %[q3], 0x03(%[tmp0]) \n\t" "gsldrc1 %[q3], -0x04(%[tmp0]) \n\t" /* Matrix transpose */ - MMI_TRANSPOSE8x8_UB_UB(%[p3], %[p2], %[p1], %[p0], - %[q0], %[q1], %[q2], %[q3], - %[p3], %[p2], %[p1], %[p0], - %[q0], %[q1], %[q2], %[q3]) + TRANSPOSE_8B(%[p3], %[p2], %[p1], %[p0], + %[q0], %[q1], %[q2], %[q3], + %[ftmp1], %[ftmp2], %[ftmp3], %[ftmp4]) MMI_VP8_LOOP_FILTER /* Matrix transpose */ - MMI_TRANSPOSE8x8_UB_UB(%[p3], %[p2], %[p1], %[p0], - %[q0], %[q1], %[q2], %[q3], - %[p3], %[p2], %[p1], %[p0], - %[q0], %[q1], %[q2], %[q3]) + TRANSPOSE_8B(%[p3], %[p2], %[p1], %[p0], + %[q0], %[q1], %[q2], %[q3], + %[ftmp1], %[ftmp2], %[ftmp3], %[ftmp4]) /* Move to dst */ "gssdlc1 %[p3], 0x03(%[dst]) \n\t" "gssdrc1 %[p3], -0x04(%[dst]) \n\t" @@ -1233,8 +1179,7 @@ void ff_vp8_idct_add_mmi(uint8_t *dst, int16_t block[16], ptrdiff_t stride) MMI_SDC1(%[ftmp0], %[block], 0x18) TRANSPOSE_4H(%[ftmp1], %[ftmp2], %[ftmp3], %[ftmp4], - %[ftmp5], %[ftmp6], %[ftmp7], %[ftmp8], - %[ftmp9], %[tmp0], %[ftmp0], %[ftmp10]) + %[ftmp5], %[ftmp6], %[ftmp7], %[ftmp8]) // t[0 4 8 12] "paddh %[ftmp5], %[ftmp1], %[ftmp3] \n\t" @@ -1269,8 +1214,7 @@ void ff_vp8_idct_add_mmi(uint8_t *dst, int16_t block[16], ptrdiff_t stride) "psrah %[ftmp4], %[ftmp4], %[ftmp11] \n\t" TRANSPOSE_4H(%[ftmp1], %[ftmp2], %[ftmp3], %[ftmp4], - %[ftmp5], %[ftmp6], %[ftmp7], %[ftmp8], - %[ftmp9], %[tmp0], %[ftmp0], %[ftmp10]) + %[ftmp5], %[ftmp6], %[ftmp7], %[ftmp8]) MMI_LWC1(%[ftmp5], %[dst0], 0x00) MMI_LWC1(%[ftmp6], %[dst1], 0x00) diff --git a/libavutil/mips/mmiutils.h b/libavutil/mips/mmiutils.h index b16edc4..76b1199 100644 --- a/libavutil/mips/mmiutils.h +++ b/libavutil/mips/mmiutils.h @@ -250,30 +250,53 @@ : "memory" \ ); -#define 
TRANSPOSE_4H(m1, m2, m3, m4, t1, t2, t3, t4, t5, r1, zero, shift) \ - "li "#r1", 0x93 \n\t" \ - "xor "#zero","#zero","#zero" \n\t" \ - "mtc1 "#r1", "#shift" \n\t" \ - "punpcklhw "#t1", "#m1", "#zero" \n\t" \ - "punpcklhw "#t5", "#m2", "#zero" \n\t" \ - "pshufh "#t5", "#t5", "#shift" \n\t" \ - "or "#t1", "#t1", "#t5" \n\t" \ - "punpckhhw "#t2", "#m1", "#zero" \n\t" \ - "punpckhhw "#t5", "#m2", "#zero" \n\t" \ - "pshufh "#t5", "#t5", "#shift" \n\t" \ - "or "#t2", "#t2", "#t5" \n\t" \ - "punpcklhw "#t3", "#m3", "#zero" \n\t" \ - "punpcklhw "#t5", "#m4", "#zero" \n\t" \ - "pshufh "#t5", "#t5", "#shift" \n\t" \ - "or "#t3", "#t3", "#t5" \n\t" \ - "punpckhhw "#t4", "#m3", "#zero" \n\t" \ - "punpckhhw "#t5", "#m4", "#zero" \n\t" \ - "pshufh "#t5", "#t5", "#shift" \n\t" \ - "or "#t4", "#t4", "#t5" \n\t" \ - "punpcklwd "#m1", "#t1", "#t3" \n\t" \ - "punpckhwd "#m2", "#t1", "#t3" \n\t" \ - "punpcklwd "#m3", "#t2", "#t4" \n\t" \ - "punpckhwd "#m4", "#t2", "#t4" \n\t" +/** + * brief: Transpose 4X4 half word packaged data. + * fr_i0, fr_i1, fr_i2, fr_i3: src & dst + * fr_t0, fr_t1, fr_t2, fr_t3: temporary register + */ +#define TRANSPOSE_4H(fr_i0, fr_i1, fr_i2, fr_i3, \ + fr_t0, fr_t1, fr_t2, fr_t3) \ + "punpcklhw "#fr_t0", "#fr_i0", "#fr_i1" \n\t" \ + "punpckhhw "#fr_t1", "#fr_i0", "#fr_i1" \n\t" \ + "punpcklhw "#fr_t2", "#fr_i2", "#fr_i3" \n\t" \ + "punpckhhw "#fr_t3", "#fr_i2", "#fr_i3" \n\t" \ + "punpcklwd "#fr_i0", "#fr_t0", "#fr_t2" \n\t" \ + "punpckhwd "#fr_i1", "#fr_t0", "#fr_t2" \n\t" \ + "punpcklwd "#fr_i2", "#fr_t1", "#fr_t3" \n\t" \ + "punpckhwd "#fr_i3", "#fr_t1", "#fr_t3" \n\t" + +/** + * brief: Transpose 8x8 byte packaged data. 
+ * fr_i0~i7: src & dst + * fr_t0~t3: temporary register + */ +#define TRANSPOSE_8B(fr_i0, fr_i1, fr_i2, fr_i3, fr_i4, fr_i5, \ + fr_i6, fr_i7, fr_t0, fr_t1, fr_t2, fr_t3) \ + "punpcklbh "#fr_t0", "#fr_i0", "#fr_i1" \n\t" \ + "punpckhbh "#fr_t1", "#fr_i0", "#fr_i1" \n\t" \ + "punpcklbh "#fr_t2", "#fr_i2", "#fr_i3" \n\t" \ + "punpckhbh "#fr_t3", "#fr_i2", "#fr_i3" \n\t" \ + "punpcklbh "#fr_i0", "#fr_i4", "#fr_i5" \n\t" \ + "punpckhbh "#fr_i1", "#fr_i4", "#fr_i5" \n\t" \ + "punpcklbh "#fr_i2", "#fr_i6", "#fr_i7" \n\t" \ + "punpckhbh "#fr_i3", "#fr_i6", "#fr_i7" \n\t" \ + "punpcklhw "#fr_i4", "#fr_t0", "#fr_t2" \n\t" \ + "punpckhhw "#fr_i5", "#fr_t0", "#fr_t2" \n\t" \ + "punpcklhw "#fr_i6", "#fr_t1", "#fr_t3" \n\t" \ + "punpckhhw "#fr_i7", "#fr_t1", "#fr_t3" \n\t" \ + "punpcklhw "#fr_t0", "#fr_i0", "#fr_i2" \n\t" \ + "punpckhhw "#fr_t1", "#fr_i0", "#fr_i2" \n\t" \ + "punpcklhw "#fr_t2", "#fr_i1", "#fr_i3" \n\t" \ + "punpckhhw "#fr_t3", "#fr_i1", "#fr_i3" \n\t" \ + "punpcklwd "#fr_i0", "#fr_i4", "#fr_t0" \n\t" \ + "punpckhwd "#fr_i1", "#fr_i4", "#fr_t0" \n\t" \ + "punpcklwd "#fr_i2", "#fr_i5", "#fr_t1" \n\t" \ + "punpckhwd "#fr_i3", "#fr_i5", "#fr_t1" \n\t" \ + "punpcklwd "#fr_i4", "#fr_i6", "#fr_t2" \n\t" \ + "punpckhwd "#fr_i5", "#fr_i6", "#fr_t2" \n\t" \ + "punpcklwd "#fr_i6", "#fr_i7", "#fr_t3" \n\t" \ + "punpckhwd "#fr_i7", "#fr_i7", "#fr_t3" \n\t" /** * brief: Parallel SRA for 8 byte packaged data. 
@@ -303,15 +326,14 @@ "psrlh "#fr_t1", "#fr_t1", "#fr_i1" \n\t" \ "packsshb "#fr_d0", "#fr_t0", "#fr_t1" \n\t" - -#define PSRAH_4_MMI(fp1, fp2, fp3, fp4, shift) \ - "psrah "#fp1", "#fp1", "#shift" \n\t" \ - "psrah "#fp2", "#fp2", "#shift" \n\t" \ - "psrah "#fp3", "#fp3", "#shift" \n\t" \ +#define PSRAH_4_MMI(fp1, fp2, fp3, fp4, shift) \ + "psrah "#fp1", "#fp1", "#shift" \n\t" \ + "psrah "#fp2", "#fp2", "#shift" \n\t" \ + "psrah "#fp3", "#fp3", "#shift" \n\t" \ "psrah "#fp4", "#fp4", "#shift" \n\t" -#define PSRAH_8_MMI(fp1, fp2, fp3, fp4, fp5, fp6, fp7, fp8, shift) \ - PSRAH_4_MMI(fp1, fp2, fp3, fp4, shift) \ +#define PSRAH_8_MMI(fp1, fp2, fp3, fp4, fp5, fp6, fp7, fp8, shift) \ + PSRAH_4_MMI(fp1, fp2, fp3, fp4, shift) \ PSRAH_4_MMI(fp5, fp6, fp7, fp8, shift)