diff mbox

[FFmpeg-devel] avcodec/mips: [loongson] optimize vp8 decoding in vp8dsp.

Message ID 1529393760-21426-1-git-send-email-yinshiyou-hf@loongson.cn
State Superseded
Headers show

Commit Message

殷时友 June 19, 2018, 7:36 a.m. UTC
From: gxw <guxiwei-hf@loongson.cn>

Optimize vp8 loop filter with mmi, 4 functions optimized:
1. ff_vp8_h_loop_filter8uv_mmi.
2. ff_vp8_v_loop_filter8uv_mmi.
3. ff_vp8_h_loop_filter16_mmi.
4. ff_vp8_v_loop_filter16_mmi.

VP8 decoding speed improved by about 50% (from 73 fps to 110 fps, tested on a Loongson 3A3000).

Change-Id: Iba567c7ab4c6a284b5e8ccbc567575448a508350
Signed-off-by: gxw <guxiwei-hf@loongson.cn>
Signed-off-by: Shiyou Yin <yinshiyou-hf@loongson.cn>
---
 libavcodec/mips/vp8dsp_mmi.c | 1216 ++++++++++++++----------------------------
 1 file changed, 396 insertions(+), 820 deletions(-)

Comments

殷时友 July 5, 2018, 12:02 p.m. UTC | #1
Is anyone reviewing my patch? I sent 9 patches about 20 days ago, but only one of them received a reply (on June 14).
Michael Niedermayer July 5, 2018, 8:26 p.m. UTC | #2
On Thu, Jul 05, 2018 at 08:02:01PM +0800, yinshiyou-hf@loongson.cn wrote:
> 
> Is there anyone reviewing my patch? I sent 9 patchs about 20 days ago, but only one patch received reply at June 14. 

There's a maintainer listed for MIPS in MAINTAINERS;
he should review the patches. If he has no time, then someone else should
maybe help him.
Also, it may make sense to split maintainership of the 2 MIPS implementations
if that would make reviews work better, but it still requires someone to do
the review.

I don't know MIPS asm well enough to do an optimal review.

thanks

[...]
Manojkumar Bhosale July 6, 2018, 6:08 a.m. UTC | #3
Hi yinshiyou-hf,

Sorry for not responding to your patches.
As I am no longer part of MIPS, you may want to contact MIPS and ask them to assign an appropriate maintainer.

Copying David Lau from MIPS.

Thanks,
Manoj


-----Original Message-----
From: ffmpeg-devel [mailto:ffmpeg-devel-bounces@ffmpeg.org] On Behalf Of Michael Niedermayer
Sent: Friday, July 6, 2018 1:56 AM
To: FFmpeg development discussions and patches
Subject: Re: [FFmpeg-devel] [PATCH] avcodec/mips: [loongson] optimize vp8 decoding in vp8dsp.

On Thu, Jul 05, 2018 at 08:02:01PM +0800, yinshiyou-hf@loongson.cn wrote:
> 
> Is there anyone reviewing my patch? I sent 9 patchs about 20 days ago, but only one patch received reply at June 14. 

Theres a maintainer listed for mips in MAINTAINERs he should review the patches. If he has no time then someone else should maybe help him Also it may make sense to split maintainership of the 2 MIPS implementations if that would make reviews work better but it still requires someone to do the review.

I dont know mips asm well enough to do a optimal review

thanks

[...]
殷时友 July 16, 2018, 10:01 a.m. UTC | #4
> > 

> > Is there anyone reviewing my patch? I sent 9 patchs about 20 days ago, but only one patch received reply at June 14. 

> 

> Theres a maintainer listed for mips in MAINTAINERs

> he should review the patches. If he has no time then someone else should

> maybe help him

> Also it may make sense to split maintainership of the 2 MIPS implementations

> if that would make reviews work better but it still requires someone to do

> the review.

> 

> I dont know mips asm well enough to do a optimal review


Thank you very much for your reply.
I have received an email from the MIPS maintainer; he said he is no longer part of MIPS. What a pity.
diff mbox

Patch

diff --git a/libavcodec/mips/vp8dsp_mmi.c b/libavcodec/mips/vp8dsp_mmi.c
index f972628..e279148 100644
--- a/libavcodec/mips/vp8dsp_mmi.c
+++ b/libavcodec/mips/vp8dsp_mmi.c
@@ -25,6 +25,251 @@ 
 #include "constants.h"
 #include "libavutil/mips/mmiutils.h"
 
+#define DECLARE_DOUBLE_1            double db_1     /* FP scratch variable behind asm operand [db_1] */
+#define DECLARE_DOUBLE_2            double db_2     /* FP scratch variable behind asm operand [db_2] */
+#define DECLARE_UINT32_T            uint32_t  it_1  /* integer scratch variable behind asm operand [it_1] */
+#define RESTRICT_ASM_DOUBLE_1       [db_1]"=&f"(db_1)  /* early-clobber FP output binding for db_1 */
+#define RESTRICT_ASM_DOUBLE_2       [db_2]"=&f"(db_2)  /* early-clobber FP output binding for db_2 */
+#define RESTRICT_ASM_UINT32_T       [it_1]"=&r"(it_1)  /* early-clobber GPR output binding for it_1 */
+
+#define MMI_PSRAB(src, size) /* src = each signed byte of src >> size */    \
+        "li         %[it_1],    0x08                                \n\t"   \
+        "dmtc1      %[it_1],    %[db_1]                             \n\t"   \
+        "psllh      %[db_2],    "#src",         %[db_1]             \n\t"   /* low byte of each halfword -> high byte */ \
+        PTR_ADDU    "%[it_1],   "#size",        %[it_1]             \n\t"   /* shift count = size + 8 (size is a GPR operand) */ \
+        "dmtc1      %[it_1],    %[db_1]                             \n\t"   \
+        "psrah      %[db_2],    %[db_2],        %[db_1]             \n\t"   /* db_2: shifted low bytes, sign bits above them */ \
+        "psrah      "#src",     "#src",         %[db_1]             \n\t"   /* shifted high bytes, now sitting in the low byte */ \
+        PTR_SUBU    "%[it_1],   %[it_1],        "#size"             \n\t"   /* shift count back to 8 */ \
+        "dmtc1      %[it_1],    %[db_1]                             \n\t"   \
+        "psllh      "#src",     "#src",         %[db_1]             \n\t"   /* high-byte results back in place, low byte = 0 */ \
+        "xor        %[db_1],    %[db_1],        %[db_1]             \n\t"   \
+        "li         %[it_1],    0x00ff00ff                          \n\t"   \
+        "dmtc1      %[it_1],    %[db_1]                             \n\t"   \
+        "punpcklwd  %[db_1],    %[db_1],        %[db_1]             \n\t"   /* replicate the low word of db_1 into both halves */ \
+        "and        %[db_2],    %[db_2],        %[db_1]             \n\t"   /* drop the sign bits above the low-byte results */ \
+        "paddb      "#src",     "#src",         %[db_2]             \n\t"   /* merge high-byte and low-byte results */
+
+#define MMI_PSRLB(src, size) /* src = each unsigned byte of src >> size */  \
+        "li         %[it_1],    0x08                                \n\t"   \
+        "dmtc1      %[it_1],    %[db_1]                             \n\t"   \
+        "psllh      %[db_2],    "#src",         %[db_1]             \n\t"   /* low byte of each halfword -> high byte */ \
+        PTR_ADDU    "%[it_1],   "#size",        %[it_1]             \n\t"   /* shift count = size + 8 (size is a GPR operand) */ \
+        "dmtc1      %[it_1],    %[db_1]                             \n\t"   \
+        "psrlh      %[db_2],    %[db_2],        %[db_1]             \n\t"   /* db_2: shifted low bytes, high byte zero */ \
+        "psrlh      "#src",     "#src",         %[db_1]             \n\t"   /* shifted high bytes, now sitting in the low byte */ \
+        PTR_SUBU    "%[it_1],   %[it_1],        "#size"             \n\t"   /* shift count back to 8 */ \
+        "dmtc1      %[it_1],    %[db_1]                             \n\t"   \
+        "psllh      "#src",     "#src",         %[db_1]             \n\t"   /* high-byte results back in place, low byte = 0 */ \
+        "paddb      "#src",     "#src",         %[db_2]             \n\t"   /* merge high-byte and low-byte results */
+
+#define MMI_PCMPGTUB(dst, src1, src2) /* dst = per-byte unsigned src1 > src2 */ \
+        "pcmpeqb    %[db_1],    "#src1",        "#src2"             \n\t"   /* db_1 = (src1 == src2) */ \
+        "pmaxub     %[db_2],    "#src1",        "#src2"             \n\t"   \
+        "pcmpeqb    %[db_2],    %[db_2],        "#src1"             \n\t"   /* db_2 = (max == src1), i.e. src1 >= src2 */ \
+        "xor        "#dst",     %[db_2],        %[db_1]             \n\t"   /* >= and not ==  ->  strictly greater */
+
+#define MMI_BTOH(dst_l, dst_r, src) /* sign-extend the bytes of src to halfwords */ \
+        "xor        %[db_1],    %[db_1],        %[db_1]             \n\t"   \
+        "pcmpgtb    %[db_2],    %[db_1],        "#src"              \n\t"   /* db_2 = (0 > src) per byte: the sign-extension byte */ \
+        "punpcklbh  "#dst_r",   "#src",         %[db_2]             \n\t"   /* dst_r: low half of src; dst_r must not alias src */ \
+        "punpckhbh  "#dst_l",   "#src",         %[db_2]             \n\t"   /* dst_l: high half of src */
+
+#define MMI_TRANSPOSE8x8_UB_UB(src_0, src_1, src_2, src_3,                  \
+                               src_4, src_5, src_6, src_7,                  \
+                               dst_0, dst_1, dst_2, dst_3,                  \
+                               dst_4, dst_5, dst_6, dst_7) /* 8x8 byte transpose; callers pass dst_n == src_n */ \
+        "li         %[it_1],    0xe4                                \n\t"   /* NOTE(review): 0xe4 looks like the identity selector, making pshufh a register copy - confirm */ \
+        "dmtc1      %[it_1],    %[db_1]                             \n\t"   \
+        "pshufh     %[db_2],    "#src_0",       %[db_1]             \n\t"   /* Stage 1: interleave bytes of row pairs (0,1) (2,3) (4,5) (6,7) */ \
+        "punpcklbh  "#dst_0",   "#src_0",       "#src_1"            \n\t"   \
+        "punpckhbh  "#dst_1",   %[db_2],        "#src_1"            \n\t"   \
+        "pshufh     %[db_2],    "#src_2",       %[db_1]             \n\t"   \
+        "punpcklbh  "#dst_2",   "#src_2",       "#src_3"            \n\t"   \
+        "punpckhbh  "#dst_3",   %[db_2],        "#src_3"            \n\t"   \
+        "pshufh     %[db_2],    "#src_4",       %[db_1]             \n\t"   \
+        "punpcklbh  "#dst_4",   "#src_4",       "#src_5"            \n\t"   \
+        "punpckhbh  "#dst_5",   %[db_2],        "#src_5"            \n\t"   \
+        "pshufh     %[db_2],    "#src_6",       %[db_1]             \n\t"   \
+        "punpcklbh  "#dst_6",   "#src_6",       "#src_7"            \n\t"   \
+        "punpckhbh  "#dst_7",   %[db_2],        "#src_7"            \n\t"   \
+        /* Stage 2: interleave halfwords of (0,2) (1,3) (4,6) (5,7) */      \
+        "pshufh     %[db_2],    "#dst_0",       %[db_1]             \n\t"   \
+        "punpcklhw  "#dst_0",   "#dst_0",       "#dst_2"            \n\t"   \
+        "punpckhhw  "#dst_2",   %[db_2],        "#dst_2"            \n\t"   \
+        "pshufh     %[db_2],    "#dst_1",       %[db_1]             \n\t"   \
+        "punpcklhw  "#dst_1",   "#dst_1",       "#dst_3"            \n\t"   \
+        "punpckhhw  "#dst_3",   %[db_2],        "#dst_3"            \n\t"   \
+        "pshufh     %[db_2],    "#dst_4",       %[db_1]             \n\t"   \
+        "punpcklhw  "#dst_4",   "#dst_4",       "#dst_6"            \n\t"   \
+        "punpckhhw  "#dst_6",   %[db_2],        "#dst_6"            \n\t"   \
+        "pshufh     %[db_2],    "#dst_5",       %[db_1]             \n\t"   \
+        "punpcklhw  "#dst_5",   "#dst_5",       "#dst_7"            \n\t"   \
+        "punpckhhw  "#dst_7",   %[db_2],        "#dst_7"            \n\t"   \
+        /* Stage 3: interleave words of (0,4) (1,5) (2,6) (3,7) */          \
+        "pshufh     %[db_2],    "#dst_0",       %[db_1]             \n\t"   \
+        "punpcklwd  "#dst_0",   "#dst_0",       "#dst_4"            \n\t"   \
+        "punpckhwd  "#dst_4",   %[db_2],        "#dst_4"            \n\t"   \
+        "pshufh     %[db_2],    "#dst_1",       %[db_1]             \n\t"   \
+        "punpcklwd  "#dst_1",   "#dst_1",       "#dst_5"            \n\t"   \
+        "punpckhwd  "#dst_5",   %[db_2],        "#dst_5"            \n\t"   \
+        "pshufh     %[db_2],    "#dst_2",       %[db_1]             \n\t"   \
+        "punpcklwd  "#dst_2",   "#dst_2",       "#dst_6"            \n\t"   \
+        "punpckhwd  "#dst_6",   %[db_2],        "#dst_6"            \n\t"   \
+        "pshufh     %[db_2],    "#dst_3",       %[db_1]             \n\t"   \
+        "punpcklwd  "#dst_3",   "#dst_3",       "#dst_7"            \n\t"   \
+        "punpckhwd  "#dst_7",   %[db_2],        "#dst_7"            \n\t"   \
+        /* Reorder: swap dst_1 <-> dst_4 and dst_3 <-> dst_6 via db_2 */    \
+        "pshufh     %[db_2],    "#dst_1",       %[db_1]             \n\t"   \
+        "pshufh     "#dst_1",   "#dst_4",       %[db_1]             \n\t"   \
+        "pshufh     "#dst_4",   %[db_2],        %[db_1]             \n\t"   \
+        "pshufh     %[db_2],    "#dst_3",       %[db_1]             \n\t"   \
+        "pshufh     "#dst_3",   "#dst_6",       %[db_1]             \n\t"   \
+        "pshufh     "#dst_6",   %[db_2],        %[db_1]             \n\t"
+
+#define MMI_VP8_LOOP_FILTER /* VP8 normal MB-edge loop filter, 8 pixels */  \
+        /* hev = max(|p1-p0|, |q1-q0|) > thresh, per byte (0xff / 0x00) */  \
+        "dmtc1      %[thresh],  %[ftmp3]                            \n\t"   \
+        "punpcklbh  %[ftmp3],   %[ftmp3],       %[ftmp3]            \n\t"   \
+        "punpcklhw  %[ftmp3],   %[ftmp3],       %[ftmp3]            \n\t"   \
+        "punpcklwd  %[ftmp3],   %[ftmp3],       %[ftmp3]            \n\t"   \
+        "pasubub    %[ftmp0],   %[p1],          %[p0]               \n\t"   \
+        "pasubub    %[ftmp1],   %[q1],          %[q0]               \n\t"   \
+        "pmaxub     %[ftmp0],   %[ftmp0],       %[ftmp1]            \n\t"   \
+        MMI_PCMPGTUB(%[hev], %[ftmp0], %[ftmp3])                            \
+        /* mask: 2*|p0-q0| + |p1-q1|/2 <= e and neighbour diffs <= i */     \
+        "pasubub    %[ftmp1],   %[p0],          %[q0]               \n\t"   \
+        "paddusb    %[ftmp1],   %[ftmp1],       %[ftmp1]            \n\t"   \
+        "pasubub    %[ftmp2],   %[p1],          %[q1]               \n\t"   \
+        "li         %[tmp0],    0x01                                \n\t"   \
+        MMI_PSRLB(%[ftmp2],  %[tmp0])                                       \
+        "paddusb    %[ftmp1],   %[ftmp1],       %[ftmp2]            \n\t"   \
+        "dmtc1      %[e],       %[ftmp3]                            \n\t"   \
+        "punpcklbh  %[ftmp3],   %[ftmp3],       %[ftmp3]            \n\t"   \
+        "punpcklhw  %[ftmp3],   %[ftmp3],       %[ftmp3]            \n\t"   \
+        "punpcklwd  %[ftmp3],   %[ftmp3],       %[ftmp3]            \n\t"   \
+        MMI_PCMPGTUB(%[mask], %[ftmp1], %[ftmp3])                           \
+        "pmaxub     %[mask],    %[mask],        %[ftmp0]            \n\t"   \
+        "pasubub    %[ftmp1],   %[p3],          %[p2]               \n\t"   \
+        "pasubub    %[ftmp2],   %[p2],          %[p1]               \n\t"   \
+        "pmaxub     %[ftmp1],   %[ftmp1],       %[ftmp2]            \n\t"   \
+        "pmaxub     %[mask],    %[mask],        %[ftmp1]            \n\t"   \
+        "pasubub    %[ftmp1],   %[q3],          %[q2]               \n\t"   \
+        "pasubub    %[ftmp2],   %[q2],          %[q1]               \n\t"   \
+        "pmaxub     %[ftmp1],   %[ftmp1],       %[ftmp2]            \n\t"   \
+        "pmaxub     %[mask],    %[mask],        %[ftmp1]            \n\t"   \
+        "dmtc1      %[i],       %[ftmp3]                            \n\t"   \
+        "punpcklbh  %[ftmp3],   %[ftmp3],       %[ftmp3]            \n\t"   \
+        "punpcklhw  %[ftmp3],   %[ftmp3],       %[ftmp3]            \n\t"   \
+        "punpcklwd  %[ftmp3],   %[ftmp3],       %[ftmp3]            \n\t"   \
+        MMI_PCMPGTUB(%[mask], %[mask], %[ftmp3])                            \
+        /* mask = ~mask; pcmpeqw gives all-ones in all 64 bits */           \
+        "pcmpeqw    %[ftmp3],   %[ftmp3],       %[ftmp3]            \n\t"   \
+        "xor        %[mask],    %[mask],        %[ftmp3]            \n\t"   \
+        /* VP8_MBFILTER: xor pixels with 0x80 to work on signed bytes */    \
+        "li         %[tmp0],    0x80808080                          \n\t"   \
+        "dmtc1      %[tmp0],    %[ftmp7]                            \n\t"   \
+        "punpcklwd  %[ftmp7],   %[ftmp7],       %[ftmp7]            \n\t"   \
+        "xor        %[p2],      %[p2],          %[ftmp7]            \n\t"   \
+        "xor        %[p1],      %[p1],          %[ftmp7]            \n\t"   \
+        "xor        %[p0],      %[p0],          %[ftmp7]            \n\t"   \
+        "xor        %[q0],      %[q0],          %[ftmp7]            \n\t"   \
+        "xor        %[q1],      %[q1],          %[ftmp7]            \n\t"   \
+        "xor        %[q2],      %[q2],          %[ftmp7]            \n\t"   \
+        "psubsb     %[ftmp4],   %[p1],          %[q1]               \n\t"   \
+        "psubb      %[ftmp5],   %[q0],          %[p0]               \n\t"   \
+        "li         %[tmp0],    0x00030003                          \n\t"   \
+        "dmtc1      %[tmp0],    %[ftmp6]                            \n\t"   \
+        "punpcklwd  %[ftmp6],   %[ftmp6],       %[ftmp6]            \n\t"   \
+        MMI_BTOH(%[ftmp1],  %[ftmp0],  %[ftmp5])                            \
+        MMI_BTOH(%[ftmp3],  %[ftmp2],  %[ftmp4])                            \
+        /* Right part (low 4 bytes): 3 * (q0 - p0) + (p1 - q1) */           \
+        "pmullh     %[ftmp0],   %[ftmp0],       %[ftmp6]            \n\t"   \
+        "paddh      %[ftmp0],   %[ftmp2],       %[ftmp0]            \n\t"   \
+        /* Left part (high 4 bytes): same computation */                    \
+        "pmullh     %[ftmp1],   %[ftmp1],       %[ftmp6]            \n\t"   \
+        "paddh      %[ftmp1],   %[ftmp3],       %[ftmp1]            \n\t"   \
+        /* Combine left and right part, saturating back to bytes */         \
+        "packsshb   %[ftmp1],   %[ftmp0],       %[ftmp1]            \n\t"   \
+        "and        %[ftmp1],   %[ftmp1],       %[mask]             \n\t"   \
+        "and        %[ftmp2],   %[ftmp1],       %[hev]              \n\t"   \
+        "li         %[tmp0],    0x04040404                          \n\t"   \
+        "dmtc1      %[tmp0],    %[ftmp0]                            \n\t"   \
+        "punpcklwd  %[ftmp0],   %[ftmp0],       %[ftmp0]            \n\t"   \
+        "paddsb     %[ftmp3],   %[ftmp2],       %[ftmp0]            \n\t"   \
+        "li         %[tmp0],    0x03                                \n\t"   \
+        MMI_PSRAB(%[ftmp3],  %[tmp0])                                       \
+        "li         %[tmp0],    0x03030303                          \n\t"   \
+        "dmtc1      %[tmp0],    %[ftmp0]                            \n\t"   \
+        "punpcklwd  %[ftmp0],   %[ftmp0],       %[ftmp0]            \n\t"   \
+        "paddsb     %[ftmp4],   %[ftmp2],       %[ftmp0]            \n\t"   \
+        "li         %[tmp0],    0x03                                \n\t"   \
+        MMI_PSRAB(%[ftmp4],  %[tmp0])                                       \
+        "psubsb     %[q0],      %[q0],          %[ftmp3]            \n\t"   \
+        "paddsb     %[p0],      %[p0],          %[ftmp4]            \n\t"   \
+        /* filt_val &= ~hev */                                              \
+        "pcmpeqw    %[ftmp0],   %[ftmp0],       %[ftmp0]            \n\t"   \
+        "xor        %[hev],     %[hev],         %[ftmp0]            \n\t"   \
+        "and        %[ftmp1],   %[ftmp1],       %[hev]              \n\t"   \
+        MMI_BTOH(%[ftmp5],  %[ftmp6],  %[ftmp1])                            \
+        "li         %[tmp0],    0x07                                \n\t"   \
+        "dmtc1      %[tmp0],    %[ftmp2]                            \n\t"   \
+        "li         %[tmp0],    0x001b001b                          \n\t"   \
+        "dmtc1      %[tmp0],    %[ftmp1]                            \n\t"   \
+        "punpcklwd  %[ftmp1],   %[ftmp1],       %[ftmp1]            \n\t"   \
+        "li         %[tmp0],    0x003f003f                          \n\t"   \
+        "dmtc1      %[tmp0],    %[ftmp0]                            \n\t"   \
+        "punpcklwd  %[ftmp0],   %[ftmp0],       %[ftmp0]            \n\t"   \
+        /* Right part: (27 * filt_val + 63) >> 7 */                         \
+        "pmullh     %[ftmp3],   %[ftmp6],       %[ftmp1]            \n\t"   \
+        "paddh      %[ftmp3],   %[ftmp3],       %[ftmp0]            \n\t"   \
+        "psrah      %[ftmp3],   %[ftmp3],       %[ftmp2]            \n\t"   \
+        /* Left part */                                                     \
+        "pmullh     %[ftmp4],   %[ftmp5],       %[ftmp1]            \n\t"   \
+        "paddh      %[ftmp4],   %[ftmp4],       %[ftmp0]            \n\t"   \
+        "psrah      %[ftmp4],   %[ftmp4],       %[ftmp2]            \n\t"   \
+        /* Combine left and right part; apply to q0/p0 and undo 0x80 */     \
+        "packsshb   %[ftmp4],   %[ftmp3],       %[ftmp4]            \n\t"   \
+        "psubsb     %[q0],      %[q0],          %[ftmp4]            \n\t"   \
+        "xor        %[q0],      %[q0],          %[ftmp7]            \n\t"   \
+        "paddsb     %[p0],      %[p0],          %[ftmp4]            \n\t"   \
+        "xor        %[p0],      %[p0],          %[ftmp7]            \n\t"   \
+        "li         %[tmp0],    0x00120012                          \n\t"   \
+        "dmtc1      %[tmp0],    %[ftmp1]                            \n\t"   \
+        "punpcklwd  %[ftmp1],   %[ftmp1],       %[ftmp1]            \n\t"   \
+        /* Right part: (18 * filt_val + 63) >> 7 */                         \
+        "pmullh     %[ftmp3],   %[ftmp6],       %[ftmp1]            \n\t"   \
+        "paddh      %[ftmp3],   %[ftmp3],       %[ftmp0]            \n\t"   \
+        "psrah      %[ftmp3],   %[ftmp3],       %[ftmp2]            \n\t"   \
+        /* Left part */                                                     \
+        "pmullh     %[ftmp4],   %[ftmp5],       %[ftmp1]            \n\t"   \
+        "paddh      %[ftmp4],   %[ftmp4],       %[ftmp0]            \n\t"   \
+        "psrah      %[ftmp4],   %[ftmp4],       %[ftmp2]            \n\t"   \
+        /* Combine left and right part; apply to q1/p1 and undo 0x80 */     \
+        "packsshb   %[ftmp4],   %[ftmp3],       %[ftmp4]            \n\t"   \
+        "psubsb     %[q1],      %[q1],          %[ftmp4]            \n\t"   \
+        "xor        %[q1],      %[q1],          %[ftmp7]            \n\t"   \
+        "paddsb     %[p1],      %[p1],          %[ftmp4]            \n\t"   \
+        "xor        %[p1],      %[p1],          %[ftmp7]            \n\t"   \
+        "li         %[tmp0],    0x03                                \n\t"   \
+        "dmtc1      %[tmp0],    %[ftmp1]                            \n\t"   \
+        /* Right part: (9 * filt_val + 63) >> 7, 9x as (x << 3) + x */      \
+        "psllh      %[ftmp3],   %[ftmp6],       %[ftmp1]            \n\t"   \
+        "paddh      %[ftmp3],   %[ftmp3],       %[ftmp6]            \n\t"   \
+        "paddh      %[ftmp3],   %[ftmp3],       %[ftmp0]            \n\t"   \
+        "psrah      %[ftmp3],   %[ftmp3],       %[ftmp2]            \n\t"   \
+        /* Left part */                                                     \
+        "psllh      %[ftmp4],   %[ftmp5],       %[ftmp1]            \n\t"   \
+        "paddh      %[ftmp4],   %[ftmp4],       %[ftmp5]            \n\t"   \
+        "paddh      %[ftmp4],   %[ftmp4],       %[ftmp0]            \n\t"   \
+        "psrah      %[ftmp4],   %[ftmp4],       %[ftmp2]            \n\t"   \
+        /* Combine left and right part; apply to q2/p2 and undo 0x80 */     \
+        "packsshb   %[ftmp4],   %[ftmp3],       %[ftmp4]            \n\t"   \
+        "psubsb     %[q2],      %[q2],          %[ftmp4]            \n\t"   \
+        "xor        %[q2],      %[q2],          %[ftmp7]            \n\t"   \
+        "paddsb     %[p2],      %[p2],          %[ftmp4]            \n\t"   \
+        "xor        %[p2],      %[p2],          %[ftmp7]            \n\t"
+
 #define PUT_VP8_EPEL4_H6_MMI(src, dst)                                      \
         MMI_ULWC1(%[ftmp1], src, 0x00)                                      \
         "punpcklbh  %[ftmp2],   %[ftmp1],       %[ftmp0]            \n\t"   \
@@ -475,29 +720,6 @@  DECLARE_ALIGNED(8, static const uint64_t, fourtap_subpel_filters[7][6]) = {
     0x007b007b007b007b, 0x0006000600060006, 0x0000000000000000}
 };
 
-#if 0
-#define FILTER_6TAP(src, F, stride)                                           \
-    cm[(F[2] * src[x + 0 * stride] - F[1] * src[x - 1 * stride] +             \
-        F[0] * src[x - 2 * stride] + F[3] * src[x + 1 * stride] -             \
-        F[4] * src[x + 2 * stride] + F[5] * src[x + 3 * stride] + 64) >> 7]
-
-#define FILTER_4TAP(src, F, stride)                                           \
-    cm[(F[2] * src[x + 0 * stride] - F[1] * src[x - 1 * stride] +             \
-        F[3] * src[x + 1 * stride] - F[4] * src[x + 2 * stride] + 64) >> 7]
-
-static const uint8_t subpel_filters[7][6] = {
-    { 0,  6, 123,  12,  1, 0 },
-    { 2, 11, 108,  36,  8, 1 },
-    { 0,  9,  93,  50,  6, 0 },
-    { 3, 16,  77,  77, 16, 3 },
-    { 0,  6,  50,  93,  9, 0 },
-    { 1,  8,  36, 108, 11, 2 },
-    { 0,  1,  12, 123,  6, 0 },
-};
-
-#define MUL_20091(a) ((((a) * 20091) >> 16) + (a))
-#define MUL_35468(a)  (((a) * 35468) >> 16)
-#endif
 
 #define clip_int8(n) (cm[(n) + 0x80] - 0x80)
 static av_always_inline void vp8_filter_common_is4tap(uint8_t *p,
@@ -621,15 +843,71 @@  static av_always_inline int vp8_normal_limit(uint8_t *p, ptrdiff_t stride,
 static av_always_inline void vp8_v_loop_filter8_mmi(uint8_t *dst,
         ptrdiff_t stride, int flim_E, int flim_I, int hev_thresh)
 {
-    int i;
-
-    for (i = 0; i < 8; i++)
-        if (vp8_normal_limit(dst + i * 1, stride, flim_E, flim_I)) {
-            if (hev(dst + i * 1, stride, hev_thresh))
-                vp8_filter_common_is4tap(dst + i * 1, stride);
-            else
-                filter_mbedge(dst + i * 1, stride);
-        }
+    double ftmp[18];    /* storage bound to the 18 "f" register operands below */
+    uint32_t tmp[1];    /* scratch GPR; NOTE(review): also receives dst +/- stride row addresses - confirm a 32-bit C type is adequate on 64-bit ABIs */
+    DECLARE_DOUBLE_1;   /* db_1: scratch used inside the MMI_* helper macros */
+    DECLARE_DOUBLE_2;   /* db_2: scratch used inside the MMI_* helper macros */
+    DECLARE_UINT32_T;   /* it_1: integer scratch used inside the MMI_* helper macros */
+    __asm__ volatile(   /* filters across the edge between row dst - stride (p0) and row dst (q0), 8 columns at once */
+        /* Load eight 8-byte rows: q0 at dst, p0..p3 above it, q1..q3 below it */
+        "gsldlc1    %[q0],      0x07(%[dst])                      \n\t"
+        "gsldrc1    %[q0],      0x00(%[dst])                      \n\t"
+        PTR_SUBU    "%[tmp0],   %[dst],         %[stride]         \n\t"
+        "gsldlc1    %[p0],      0x07(%[tmp0])                     \n\t"
+        "gsldrc1    %[p0],      0x00(%[tmp0])                     \n\t"
+        PTR_SUBU    "%[tmp0],   %[tmp0],        %[stride]         \n\t"
+        "gsldlc1    %[p1],      0x07(%[tmp0])                     \n\t"
+        "gsldrc1    %[p1],      0x00(%[tmp0])                     \n\t"
+        PTR_SUBU    "%[tmp0],   %[tmp0],        %[stride]         \n\t"
+        "gsldlc1    %[p2],      0x07(%[tmp0])                     \n\t"
+        "gsldrc1    %[p2],      0x00(%[tmp0])                     \n\t"
+        PTR_SUBU    "%[tmp0],   %[tmp0],        %[stride]         \n\t"
+        "gsldlc1    %[p3],      0x07(%[tmp0])                     \n\t"
+        "gsldrc1    %[p3],      0x00(%[tmp0])                     \n\t"
+        PTR_ADDU    "%[tmp0],   %[dst],         %[stride]         \n\t"
+        "gsldlc1    %[q1],      0x07(%[tmp0])                     \n\t"
+        "gsldrc1    %[q1],      0x00(%[tmp0])                     \n\t"
+        PTR_ADDU    "%[tmp0],   %[tmp0],        %[stride]         \n\t"
+        "gsldlc1    %[q2],      0x07(%[tmp0])                     \n\t"
+        "gsldrc1    %[q2],      0x00(%[tmp0])                     \n\t"
+        PTR_ADDU    "%[tmp0],   %[tmp0],        %[stride]         \n\t"
+        "gsldlc1    %[q3],      0x07(%[tmp0])                     \n\t"
+        "gsldrc1    %[q3],      0x00(%[tmp0])                     \n\t"
+        MMI_VP8_LOOP_FILTER
+        /* Store the six rows p2..q2 back; the p3 and q3 rows are not written */
+        "gssdlc1    %[q0],      0x07(%[dst])                      \n\t"
+        "gssdrc1    %[q0],      0x00(%[dst])                      \n\t"
+        PTR_SUBU    "%[tmp0],   %[dst],         %[stride]         \n\t"
+        "gssdlc1    %[p0],      0x07(%[tmp0])                     \n\t"
+        "gssdrc1    %[p0],      0x00(%[tmp0])                     \n\t"
+        PTR_SUBU    "%[tmp0],   %[tmp0],        %[stride]         \n\t"
+        "gssdlc1    %[p1],      0x07(%[tmp0])                     \n\t"
+        "gssdrc1    %[p1],      0x00(%[tmp0])                     \n\t"
+        PTR_SUBU    "%[tmp0],   %[tmp0],        %[stride]         \n\t"
+        "gssdlc1    %[p2],      0x07(%[tmp0])                     \n\t"
+        "gssdrc1    %[p2],      0x00(%[tmp0])                     \n\t"
+        PTR_ADDU    "%[tmp0],   %[dst],         %[stride]         \n\t"
+        "gssdlc1    %[q1],      0x07(%[tmp0])                     \n\t"
+        "gssdrc1    %[q1],      0x00(%[tmp0])                     \n\t"
+        PTR_ADDU    "%[tmp0],   %[tmp0],        %[stride]         \n\t"
+        "gssdlc1    %[q2],      0x07(%[tmp0])                     \n\t"
+        "gssdrc1    %[q2],      0x00(%[tmp0])                     \n\t"
+        : [p3]"=&f"(ftmp[0]),       [p2]"=&f"(ftmp[1]),
+          [p1]"=&f"(ftmp[2]),       [p0]"=&f"(ftmp[3]),
+          [q0]"=&f"(ftmp[4]),       [q1]"=&f"(ftmp[5]),
+          [q2]"=&f"(ftmp[6]),       [q3]"=&f"(ftmp[7]),
+          [ftmp0]"=&f"(ftmp[8]),    [ftmp1]"=&f"(ftmp[9]),
+          [ftmp2]"=&f"(ftmp[10]),   [ftmp3]"=&f"(ftmp[11]),
+          [hev]"=&f"(ftmp[12]),     [mask]"=&f"(ftmp[13]),
+          [ftmp4]"=&f"(ftmp[14]),   [ftmp5]"=&f"(ftmp[15]),
+          [ftmp6]"=&f"(ftmp[16]),   [ftmp7]"=&f"(ftmp[17]),
+          [dst]"+&r"(dst),          [tmp0]"=&r"(tmp[0]),
+          RESTRICT_ASM_DOUBLE_1,    RESTRICT_ASM_DOUBLE_2,
+          RESTRICT_ASM_UINT32_T
+        : [e]"r"((mips_reg)flim_E), [thresh]"r"((mips_reg)hev_thresh),
+          [i]"r"((mips_reg)flim_I), [stride]"r"((mips_reg)stride)
+        : "memory"  /* the asm reads and writes pixels through dst */
+    );
 }
 
 static av_always_inline void vp8_v_loop_filter8_inner_mmi(uint8_t *dst,
@@ -650,15 +928,87 @@  static av_always_inline void vp8_v_loop_filter8_inner_mmi(uint8_t *dst,
 static av_always_inline void vp8_h_loop_filter8_mmi(uint8_t *dst,
         ptrdiff_t stride, int flim_E, int flim_I, int hev_thresh)
 {
-    int i;
-
-    for (i = 0; i < 8; i++)
-        if (vp8_normal_limit(dst + i * stride, 1, flim_E, flim_I)) {
-            if (hev(dst + i * stride, 1, hev_thresh))
-                vp8_filter_common_is4tap(dst + i * stride, 1);
-            else
-                filter_mbedge(dst + i * stride, 1);
-        }
+    double ftmp[18];    /* storage bound to the 18 "f" register operands below */
+    uint32_t tmp[1];    /* scratch GPR; NOTE(review): also receives dst + n * stride row addresses - confirm a 32-bit C type is adequate on 64-bit ABIs */
+    DECLARE_DOUBLE_1;   /* db_1: scratch used inside the MMI_* helper macros */
+    DECLARE_DOUBLE_2;   /* db_2: scratch used inside the MMI_* helper macros */
+    DECLARE_UINT32_T;   /* it_1: integer scratch used inside the MMI_* helper macros */
+    __asm__ volatile(   /* filters across the edge between columns dst[-1] and dst[0], 8 rows at once */
+        /* Load 8 rows of 8 bytes, each spanning dst[-4] .. dst[3] of its row */
+        "gsldlc1    %[p3],        0x03(%[dst])                    \n\t"
+        "gsldrc1    %[p3],        -0x04(%[dst])                   \n\t"
+        PTR_ADDU    "%[tmp0],     %[dst],           %[stride]     \n\t"
+        "gsldlc1    %[p2],        0x03(%[tmp0])                   \n\t"
+        "gsldrc1    %[p2],        -0x04(%[tmp0])                  \n\t"
+        PTR_ADDU    "%[tmp0],     %[tmp0],          %[stride]     \n\t"
+        "gsldlc1    %[p1],        0x03(%[tmp0])                   \n\t"
+        "gsldrc1    %[p1],        -0x04(%[tmp0])                  \n\t"
+        PTR_ADDU    "%[tmp0],     %[tmp0],          %[stride]     \n\t"
+        "gsldlc1    %[p0],        0x03(%[tmp0])                   \n\t"
+        "gsldrc1    %[p0],        -0x04(%[tmp0])                  \n\t"
+        PTR_ADDU    "%[tmp0],     %[tmp0],          %[stride]     \n\t"
+        "gsldlc1    %[q0],        0x03(%[tmp0])                   \n\t"
+        "gsldrc1    %[q0],        -0x04(%[tmp0])                  \n\t"
+        PTR_ADDU    "%[tmp0],     %[tmp0],          %[stride]     \n\t"
+        "gsldlc1    %[q1],        0x03(%[tmp0])                   \n\t"
+        "gsldrc1    %[q1],        -0x04(%[tmp0])                  \n\t"
+        PTR_ADDU    "%[tmp0],     %[tmp0],          %[stride]     \n\t"
+        "gsldlc1    %[q2],        0x03(%[tmp0])                   \n\t"
+        "gsldrc1    %[q2],        -0x04(%[tmp0])                  \n\t"
+        PTR_ADDU    "%[tmp0],     %[tmp0],          %[stride]     \n\t"
+        "gsldlc1    %[q3],        0x03(%[tmp0])                   \n\t"
+        "gsldrc1    %[q3],        -0x04(%[tmp0])                  \n\t"
+        /* Transpose in place so p3..q3 each hold one pixel column for the filter */
+        MMI_TRANSPOSE8x8_UB_UB(%[p3], %[p2], %[p1], %[p0],
+                               %[q0], %[q1], %[q2], %[q3],
+                               %[p3], %[p2], %[p1], %[p0],
+                               %[q0], %[q1], %[q2], %[q3])
+        MMI_VP8_LOOP_FILTER
+        /* Transpose back so each register holds one image row again */
+        MMI_TRANSPOSE8x8_UB_UB(%[p3], %[p2], %[p1], %[p0],
+                               %[q0], %[q1], %[q2], %[q3],
+                               %[p3], %[p2], %[p1], %[p0],
+                               %[q0], %[q1], %[q2], %[q3])
+        /* Store all 8 rows back, advancing dst itself by stride after each row */
+        "gssdlc1    %[p3],        0x03(%[dst])                    \n\t"
+        "gssdrc1    %[p3],        -0x04(%[dst])                   \n\t"
+        PTR_ADDU    "%[dst],      %[dst],           %[stride]     \n\t"
+        "gssdlc1    %[p2],        0x03(%[dst])                    \n\t"
+        "gssdrc1    %[p2],        -0x04(%[dst])                   \n\t"
+        PTR_ADDU    "%[dst],      %[dst],           %[stride]     \n\t"
+        "gssdlc1    %[p1],        0x03(%[dst])                    \n\t"
+        "gssdrc1    %[p1],        -0x04(%[dst])                   \n\t"
+        PTR_ADDU    "%[dst],      %[dst],           %[stride]     \n\t"
+        "gssdlc1    %[p0],        0x03(%[dst])                    \n\t"
+        "gssdrc1    %[p0],        -0x04(%[dst])                   \n\t"
+        PTR_ADDU    "%[dst],      %[dst],           %[stride]     \n\t"
+        "gssdlc1    %[q0],        0x03(%[dst])                    \n\t"
+        "gssdrc1    %[q0],        -0x04(%[dst])                   \n\t"
+        PTR_ADDU    "%[dst],      %[dst],           %[stride]     \n\t"
+        "gssdlc1    %[q1],        0x03(%[dst])                    \n\t"
+        "gssdrc1    %[q1],        -0x04(%[dst])                   \n\t"
+        PTR_ADDU    "%[dst],      %[dst],           %[stride]     \n\t"
+        "gssdlc1    %[q2],        0x03(%[dst])                    \n\t"
+        "gssdrc1    %[q2],        -0x04(%[dst])                   \n\t"
+        PTR_ADDU    "%[dst],      %[dst],           %[stride]     \n\t"
+        "gssdlc1    %[q3],        0x03(%[dst])                    \n\t"
+        "gssdrc1    %[q3],        -0x04(%[dst])                   \n\t"
+        : [p3]"=&f"(ftmp[0]),       [p2]"=&f"(ftmp[1]),
+          [p1]"=&f"(ftmp[2]),       [p0]"=&f"(ftmp[3]),
+          [q0]"=&f"(ftmp[4]),       [q1]"=&f"(ftmp[5]),
+          [q2]"=&f"(ftmp[6]),       [q3]"=&f"(ftmp[7]),
+          [ftmp0]"=&f"(ftmp[8]),    [ftmp1]"=&f"(ftmp[9]),
+          [ftmp2]"=&f"(ftmp[10]),   [ftmp3]"=&f"(ftmp[11]),
+          [hev]"=&f"(ftmp[12]),     [mask]"=&f"(ftmp[13]),
+          [ftmp4]"=&f"(ftmp[14]),   [ftmp5]"=&f"(ftmp[15]),
+          [ftmp6]"=&f"(ftmp[16]),   [ftmp7]"=&f"(ftmp[17]),
+          [dst]"+&r"(dst),          [tmp0]"=&r"(tmp[0]),
+          RESTRICT_ASM_DOUBLE_1,    RESTRICT_ASM_DOUBLE_2,
+          RESTRICT_ASM_UINT32_T
+        : [e]"r"((mips_reg)flim_E), [thresh]"r"((mips_reg)hev_thresh),
+          [i]"r"((mips_reg)flim_I), [stride]"r"((mips_reg)stride)
+        : "memory"  /* the asm reads and writes pixels through dst */
+    );
 }
 
 static av_always_inline void vp8_h_loop_filter8_inner_mmi(uint8_t *dst,
@@ -678,7 +1028,6 @@  static av_always_inline void vp8_h_loop_filter8_inner_mmi(uint8_t *dst,
 
 void ff_vp8_luma_dc_wht_mmi(int16_t block[4][4][16], int16_t dc[16])
 {
-#if 1
     double ftmp[8];
     DECLARE_VAR_ALL64;
 
@@ -740,74 +1089,6 @@  void ff_vp8_luma_dc_wht_mmi(int16_t block[4][4][16], int16_t dc[16])
         : [dc]"r"((uint8_t *)dc)
         : "memory"
     );
-#else
-    int t00, t01, t02, t03, t10, t11, t12, t13, t20, t21, t22, t23, t30, t31, t32, t33;
-
-    t00 = dc[0] + dc[12];
-    t10 = dc[1] + dc[13];
-    t20 = dc[2] + dc[14];
-    t30 = dc[3] + dc[15];
-
-    t03 = dc[0] - dc[12];
-    t13 = dc[1] - dc[13];
-    t23 = dc[2] - dc[14];
-    t33 = dc[3] - dc[15];
-
-    t01 = dc[4] + dc[ 8];
-    t11 = dc[5] + dc[ 9];
-    t21 = dc[6] + dc[10];
-    t31 = dc[7] + dc[11];
-
-    t02 = dc[4] - dc[ 8];
-    t12 = dc[5] - dc[ 9];
-    t22 = dc[6] - dc[10];
-    t32 = dc[7] - dc[11];
-
-    dc[ 0] = t00 + t01;
-    dc[ 1] = t10 + t11;
-    dc[ 2] = t20 + t21;
-    dc[ 3] = t30 + t31;
-
-    dc[ 4] = t03 + t02;
-    dc[ 5] = t13 + t12;
-    dc[ 6] = t23 + t22;
-    dc[ 7] = t33 + t32;
-
-    dc[ 8] = t00 - t01;
-    dc[ 9] = t10 - t11;
-    dc[10] = t20 - t21;
-    dc[11] = t30 - t31;
-
-    dc[12] = t03 - t02;
-    dc[13] = t13 - t12;
-    dc[14] = t23 - t22;
-    dc[15] = t33 - t32;
-
-    block[0][0][0] = (dc[0] + dc[3] + 3 + dc[1] + dc[2]) >> 3;
-    block[0][1][0] = (dc[0] - dc[3] + 3 + dc[1] - dc[2]) >> 3;
-    block[0][2][0] = (dc[0] + dc[3] + 3 - dc[1] - dc[2]) >> 3;
-    block[0][3][0] = (dc[0] - dc[3] + 3 - dc[1] + dc[2]) >> 3;
-
-    block[1][0][0] = (dc[4] + dc[7] + 3 + dc[5] + dc[6]) >> 3;
-    block[1][1][0] = (dc[4] - dc[7] + 3 + dc[5] - dc[6]) >> 3;
-    block[1][2][0] = (dc[4] + dc[7] + 3 - dc[5] - dc[6]) >> 3;
-    block[1][3][0] = (dc[4] - dc[7] + 3 - dc[5] + dc[6]) >> 3;
-
-    block[2][0][0] = (dc[8] + dc[11] + 3 + dc[9] + dc[10]) >> 3;
-    block[2][1][0] = (dc[8] - dc[11] + 3 + dc[9] - dc[10]) >> 3;
-    block[2][2][0] = (dc[8] + dc[11] + 3 - dc[9] - dc[10]) >> 3;
-    block[2][3][0] = (dc[8] - dc[11] + 3 - dc[9] + dc[10]) >> 3;
-
-    block[3][0][0] = (dc[12] + dc[15] + 3 + dc[13] + dc[14]) >> 3;
-    block[3][1][0] = (dc[12] - dc[15] + 3 + dc[13] - dc[14]) >> 3;
-    block[3][2][0] = (dc[12] + dc[15] + 3 - dc[13] - dc[14]) >> 3;
-    block[3][3][0] = (dc[12] - dc[15] + 3 - dc[13] + dc[14]) >> 3;
-
-    AV_ZERO64(dc + 0);
-    AV_ZERO64(dc + 4);
-    AV_ZERO64(dc + 8);
-    AV_ZERO64(dc + 12);
-#endif
 }
 
 void ff_vp8_luma_dc_wht_dc_mmi(int16_t block[4][4][16], int16_t dc[16])
@@ -836,7 +1117,6 @@  void ff_vp8_luma_dc_wht_dc_mmi(int16_t block[4][4][16], int16_t dc[16])
 
 void ff_vp8_idct_add_mmi(uint8_t *dst, int16_t block[16], ptrdiff_t stride)
 {
-#if 1
     DECLARE_ALIGNED(8, const uint64_t, ff_ph_4e7b) = {0x4e7b4e7b4e7b4e7bULL};
     DECLARE_ALIGNED(8, const uint64_t, ff_ph_22a3) = {0x22a322a322a322a3ULL};
     double ftmp[12];
@@ -968,44 +1248,10 @@  void ff_vp8_idct_add_mmi(uint8_t *dst, int16_t block[16], ptrdiff_t stride)
           [ff_ph_4e7b]"f"(ff_ph_4e7b),      [ff_ph_22a3]"f"(ff_ph_22a3)
         : "memory"
     );
-#else
-    int i, t0, t1, t2, t3;
-    int16_t tmp[16];
-
-    for (i = 0; i < 4; i++) {
-        t0 = block[0 + i] + block[8 + i];
-        t1 = block[0 + i] - block[8 + i];
-        t2 = MUL_35468(block[4 + i]) - MUL_20091(block[12 + i]);
-        t3 = MUL_20091(block[4 + i]) + MUL_35468(block[12 + i]);
-        block[ 0 + i] = 0;
-        block[ 4 + i] = 0;
-        block[ 8 + i] = 0;
-        block[12 + i] = 0;
-
-        tmp[i * 4 + 0] = t0 + t3;
-        tmp[i * 4 + 1] = t1 + t2;
-        tmp[i * 4 + 2] = t1 - t2;
-        tmp[i * 4 + 3] = t0 - t3;
-    }
-
-    for (i = 0; i < 4; i++) {
-        t0 = tmp[0 + i] + tmp[8 + i];
-        t1 = tmp[0 + i] - tmp[8 + i];
-        t2 = MUL_35468(tmp[4 + i]) - MUL_20091(tmp[12 + i]);
-        t3 = MUL_20091(tmp[4 + i]) + MUL_35468(tmp[12 + i]);
-
-        dst[0] = av_clip_uint8(dst[0] + ((t0 + t3 + 4) >> 3));
-        dst[1] = av_clip_uint8(dst[1] + ((t1 + t2 + 4) >> 3));
-        dst[2] = av_clip_uint8(dst[2] + ((t1 - t2 + 4) >> 3));
-        dst[3] = av_clip_uint8(dst[3] + ((t0 - t3 + 4) >> 3));
-        dst   += stride;
-    }
-#endif
 }
 
 void ff_vp8_idct_dc_add_mmi(uint8_t *dst, int16_t block[16], ptrdiff_t stride)
 {
-#if 1
     int dc = (block[0] + 4) >> 3;
     double ftmp[6];
     DECLARE_VAR_LOW32;
@@ -1046,19 +1292,6 @@  void ff_vp8_idct_dc_add_mmi(uint8_t *dst, int16_t block[16], ptrdiff_t stride)
           [dc]"r"(dc)
         : "memory"
     );
-#else
-    int i, dc = (block[0] + 4) >> 3;
-
-    block[0] = 0;
-
-    for (i = 0; i < 4; i++) {
-        dst[0] = av_clip_uint8(dst[0] + dc);
-        dst[1] = av_clip_uint8(dst[1] + dc);
-        dst[2] = av_clip_uint8(dst[2] + dc);
-        dst[3] = av_clip_uint8(dst[3] + dc);
-        dst   += stride;
-    }
-#endif
 }
 
 void ff_vp8_idct_dc_add4y_mmi(uint8_t *dst, int16_t block[4][16],
@@ -1083,29 +1316,16 @@  void ff_vp8_idct_dc_add4uv_mmi(uint8_t *dst, int16_t block[4][16],
 void ff_vp8_v_loop_filter16_mmi(uint8_t *dst, ptrdiff_t stride, int flim_E,
         int flim_I, int hev_thresh)
 {
-    int i;
-
-    for (i = 0; i < 16; i++)
-        if (vp8_normal_limit(dst + i * 1, stride, flim_E, flim_I)) {
-            if (hev(dst + i * 1, stride, hev_thresh))
-                vp8_filter_common_is4tap(dst + i * 1, stride);
-            else
-                filter_mbedge(dst + i * 1, stride);
-        }
+    vp8_v_loop_filter8_mmi(dst, stride, flim_E, flim_I, hev_thresh); /* columns 0..7 of the 16-wide edge */
+    vp8_v_loop_filter8_mmi(dst + 8, stride, flim_E, flim_I, hev_thresh); /* columns 8..15 */
 }
 
 void ff_vp8_h_loop_filter16_mmi(uint8_t *dst, ptrdiff_t stride, int flim_E,
         int flim_I, int hev_thresh)
 {
-    int i;
-
-    for (i = 0; i < 16; i++)
-        if (vp8_normal_limit(dst + i * stride, 1, flim_E, flim_I)) {
-            if (hev(dst + i * stride, 1, hev_thresh))
-                vp8_filter_common_is4tap(dst + i * stride, 1);
-            else
-                filter_mbedge(dst + i * stride, 1);
-        }
+    vp8_h_loop_filter8_mmi(dst, stride, flim_E, flim_I, hev_thresh); /* rows 0..7 of the 16-row edge */
+    vp8_h_loop_filter8_mmi(dst + 8 * stride, stride, flim_E, flim_I,
+                           hev_thresh); /* rows 8..15 */
 }
 
 void ff_vp8_v_loop_filter8uv_mmi(uint8_t *dstU, uint8_t *dstV, ptrdiff_t stride,
@@ -1188,7 +1408,6 @@  void ff_vp8_h_loop_filter_simple_mmi(uint8_t *dst, ptrdiff_t stride, int flim)
 void ff_put_vp8_pixels16_mmi(uint8_t *dst, ptrdiff_t dststride, uint8_t *src,
         ptrdiff_t srcstride, int h, int x, int y)
 {
-#if 1
     double ftmp[2];
     uint64_t tmp[2];
     mips_reg addr[2];
@@ -1224,18 +1443,11 @@  void ff_put_vp8_pixels16_mmi(uint8_t *dst, ptrdiff_t dststride, uint8_t *src,
           [srcstride]"r"((mips_reg)srcstride)
         : "memory"
     );
-#else
-    int i;
-
-    for (i = 0; i < h; i++, dst += dststride, src += srcstride)
-        memcpy(dst, src, 16);
-#endif
 }
 
 void ff_put_vp8_pixels8_mmi(uint8_t *dst, ptrdiff_t dststride, uint8_t *src,
         ptrdiff_t srcstride, int h, int x, int y)
 {
-#if 1
     double ftmp[1];
     uint64_t tmp[1];
     mips_reg addr[2];
@@ -1264,18 +1476,11 @@  void ff_put_vp8_pixels8_mmi(uint8_t *dst, ptrdiff_t dststride, uint8_t *src,
           [srcstride]"r"((mips_reg)srcstride)
         : "memory"
     );
-#else
-    int i;
-
-    for (i = 0; i < h; i++, dst += dststride, src += srcstride)
-        memcpy(dst, src, 8);
-#endif
 }
 
 void ff_put_vp8_pixels4_mmi(uint8_t *dst, ptrdiff_t dststride, uint8_t *src,
         ptrdiff_t srcstride, int h, int x, int y)
 {
-#if 1
     double ftmp[1];
     uint64_t tmp[1];
     mips_reg addr[2];
@@ -1304,18 +1509,11 @@  void ff_put_vp8_pixels4_mmi(uint8_t *dst, ptrdiff_t dststride, uint8_t *src,
           [srcstride]"r"((mips_reg)srcstride)
         : "memory"
     );
-#else
-    int i;
-
-    for (i = 0; i < h; i++, dst += dststride, src += srcstride)
-        memcpy(dst, src, 4);
-#endif
 }
 
 void ff_put_vp8_epel16_h4_mmi(uint8_t *dst, ptrdiff_t dststride, uint8_t *src,
         ptrdiff_t srcstride, int h, int mx, int my)
 {
-#if 1
     const uint64_t *filter = fourtap_subpel_filters[mx - 1];
     double ftmp[9];
     uint32_t tmp[1];
@@ -1375,24 +1573,11 @@  void ff_put_vp8_epel16_h4_mmi(uint8_t *dst, ptrdiff_t dststride, uint8_t *src,
           [filter3]"f"(filter[3]),          [filter4]"f"(filter[4])
         : "memory"
     );
-#else
-    const uint8_t *filter = subpel_filters[mx - 1];
-    const uint8_t *cm     = ff_crop_tab + MAX_NEG_CROP;
-    int x, y;
-
-    for (y = 0; y < h; y++) {
-        for (x = 0; x < 16; x++)
-            dst[x] = FILTER_4TAP(src, filter, 1);
-        dst += dststride;
-        src += srcstride;
-    }
-#endif
 }
 
 void ff_put_vp8_epel8_h4_mmi(uint8_t *dst, ptrdiff_t dststride, uint8_t *src,
         ptrdiff_t srcstride, int h, int mx, int my)
 {
-#if 1
     const uint64_t *filter = fourtap_subpel_filters[mx - 1];
     double ftmp[9];
     uint32_t tmp[1];
@@ -1436,24 +1621,11 @@  void ff_put_vp8_epel8_h4_mmi(uint8_t *dst, ptrdiff_t dststride, uint8_t *src,
           [filter3]"f"(filter[3]),          [filter4]"f"(filter[4])
         : "memory"
     );
-#else
-    const uint8_t *filter = subpel_filters[mx - 1];
-    const uint8_t *cm     = ff_crop_tab + MAX_NEG_CROP;
-    int x, y;
-
-    for (y = 0; y < h; y++) {
-        for (x = 0; x < 8; x++)
-            dst[x] = FILTER_4TAP(src, filter, 1);
-        dst += dststride;
-        src += srcstride;
-    }
-#endif
 }
 
 void ff_put_vp8_epel4_h4_mmi(uint8_t *dst, ptrdiff_t dststride, uint8_t *src,
         ptrdiff_t srcstride, int h, int mx, int my)
 {
-#if 1
     const uint64_t *filter = fourtap_subpel_filters[mx - 1];
     double ftmp[6];
     uint32_t tmp[1];
@@ -1491,24 +1663,11 @@  void ff_put_vp8_epel4_h4_mmi(uint8_t *dst, ptrdiff_t dststride, uint8_t *src,
           [filter3]"f"(filter[3]),          [filter4]"f"(filter[4])
         : "memory"
     );
-#else
-    const uint8_t *filter = subpel_filters[mx - 1];
-    const uint8_t *cm     = ff_crop_tab + MAX_NEG_CROP;
-    int x, y;
-
-    for (y = 0; y < h; y++) {
-        for (x = 0; x < 4; x++)
-            dst[x] = FILTER_4TAP(src, filter, 1);
-        dst += dststride;
-        src += srcstride;
-    }
-#endif
 }
 
 void ff_put_vp8_epel16_h6_mmi(uint8_t *dst, ptrdiff_t dststride, uint8_t *src,
         ptrdiff_t srcstride, int h, int mx, int my)
 {
-#if 1
     const uint64_t *filter = fourtap_subpel_filters[mx - 1];
     double ftmp[9];
     uint32_t tmp[1];
@@ -1569,24 +1728,11 @@  void ff_put_vp8_epel16_h6_mmi(uint8_t *dst, ptrdiff_t dststride, uint8_t *src,
           [filter4]"f"(filter[4]),          [filter5]"f"(filter[5])
         : "memory"
     );
-#else
-    const uint8_t *filter = subpel_filters[mx - 1];
-    const uint8_t *cm     = ff_crop_tab + MAX_NEG_CROP;
-    int x, y;
-
-    for (y = 0; y < h; y++) {
-        for (x = 0; x < 16; x++)
-            dst[x] = FILTER_6TAP(src, filter, 1);
-        dst += dststride;
-        src += srcstride;
-    }
-#endif
 }
 
 void ff_put_vp8_epel8_h6_mmi(uint8_t *dst, ptrdiff_t dststride, uint8_t *src,
         ptrdiff_t srcstride, int h, int mx, int my)
 {
-#if 1
     const uint64_t *filter = fourtap_subpel_filters[mx - 1];
     double ftmp[9];
     uint32_t tmp[1];
@@ -1631,24 +1777,11 @@  void ff_put_vp8_epel8_h6_mmi(uint8_t *dst, ptrdiff_t dststride, uint8_t *src,
           [filter4]"f"(filter[4]),          [filter5]"f"(filter[5])
         : "memory"
     );
-#else
-    const uint8_t *filter = subpel_filters[mx - 1];
-    const uint8_t *cm     = ff_crop_tab + MAX_NEG_CROP;
-    int x, y;
-
-    for (y = 0; y < h; y++) {
-        for (x = 0; x < 8; x++)
-            dst[x] = FILTER_6TAP(src, filter, 1);
-        dst += dststride;
-        src += srcstride;
-    }
-#endif
 }
 
 void ff_put_vp8_epel4_h6_mmi(uint8_t *dst, ptrdiff_t dststride, uint8_t *src,
         ptrdiff_t srcstride, int h, int mx, int my)
 {
-#if 1
     const uint64_t *filter = fourtap_subpel_filters[mx - 1];
     double ftmp[6];
     uint32_t tmp[1];
@@ -1687,24 +1820,11 @@  void ff_put_vp8_epel4_h6_mmi(uint8_t *dst, ptrdiff_t dststride, uint8_t *src,
           [filter4]"f"(filter[4]),          [filter5]"f"(filter[5])
         : "memory"
     );
-#else
-    const uint8_t *filter = subpel_filters[mx - 1];
-    const uint8_t *cm     = ff_crop_tab + MAX_NEG_CROP;
-    int x, y;
-
-    for (y = 0; y < h; y++) {
-        for (x = 0; x < 4; x++)
-            dst[x] = FILTER_6TAP(src, filter, 1);
-        dst += dststride;
-        src += srcstride;
-    }
-#endif
 }
 
 void ff_put_vp8_epel16_v4_mmi(uint8_t *dst, ptrdiff_t dststride, uint8_t *src,
         ptrdiff_t srcstride, int h, int mx, int my)
 {
-#if 1
     const uint64_t *filter = fourtap_subpel_filters[my - 1];
     double ftmp[9];
     uint32_t tmp[1];
@@ -1765,24 +1885,11 @@  void ff_put_vp8_epel16_v4_mmi(uint8_t *dst, ptrdiff_t dststride, uint8_t *src,
           [filter3]"f"(filter[3]),          [filter4]"f"(filter[4])
         : "memory"
     );
-#else
-    const uint8_t *filter = subpel_filters[my - 1];
-    const uint8_t *cm     = ff_crop_tab + MAX_NEG_CROP;
-    int x, y;
-
-    for (y = 0; y < h; y++) {
-        for (x = 0; x < 16; x++)
-            dst[x] = FILTER_4TAP(src, filter, srcstride);
-        dst += dststride;
-        src += srcstride;
-    }
-#endif
 }
 
 void ff_put_vp8_epel8_v4_mmi(uint8_t *dst, ptrdiff_t dststride, uint8_t *src,
         ptrdiff_t srcstride, int h, int mx, int my)
 {
-#if 1
     const uint64_t *filter = fourtap_subpel_filters[my - 1];
     double ftmp[9];
     uint32_t tmp[1];
@@ -1828,24 +1935,11 @@  void ff_put_vp8_epel8_v4_mmi(uint8_t *dst, ptrdiff_t dststride, uint8_t *src,
           [filter3]"f"(filter[3]),          [filter4]"f"(filter[4])
         : "memory"
     );
-#else
-    const uint8_t *filter = subpel_filters[my - 1];
-    const uint8_t *cm     = ff_crop_tab + MAX_NEG_CROP;
-    int x, y;
-
-    for (y = 0; y < h; y++) {
-        for (x = 0; x < 8; x++)
-            dst[x] = FILTER_4TAP(src, filter, srcstride);
-        dst += dststride;
-        src += srcstride;
-    }
-#endif
 }
 
 void ff_put_vp8_epel4_v4_mmi(uint8_t *dst, ptrdiff_t dststride, uint8_t *src,
         ptrdiff_t srcstride, int h, int mx, int my)
 {
-#if 1
     const uint64_t *filter = fourtap_subpel_filters[my - 1];
     double ftmp[6];
     uint32_t tmp[1];
@@ -1885,24 +1979,11 @@  void ff_put_vp8_epel4_v4_mmi(uint8_t *dst, ptrdiff_t dststride, uint8_t *src,
           [filter3]"f"(filter[3]),          [filter4]"f"(filter[4])
         : "memory"
     );
-#else
-    const uint8_t *filter = subpel_filters[my - 1];
-    const uint8_t *cm     = ff_crop_tab + MAX_NEG_CROP;
-    int x, y;
-
-    for (y = 0; y < h; y++) {
-        for (x = 0; x < 4; x++)
-            dst[x] = FILTER_4TAP(src, filter, srcstride);
-        dst += dststride;
-        src += srcstride;
-    }
-#endif
 }
 
 void ff_put_vp8_epel16_v6_mmi(uint8_t *dst, ptrdiff_t dststride, uint8_t *src,
         ptrdiff_t srcstride, int h, int mx, int my)
 {
-#if 1
     const uint64_t *filter = fourtap_subpel_filters[my - 1];
     double ftmp[9];
     uint32_t tmp[1];
@@ -1964,24 +2045,11 @@  void ff_put_vp8_epel16_v6_mmi(uint8_t *dst, ptrdiff_t dststride, uint8_t *src,
           [filter4]"f"(filter[4]),          [filter5]"f"(filter[5])
         : "memory"
     );
-#else
-    const uint8_t *filter = subpel_filters[my - 1];
-    const uint8_t *cm     = ff_crop_tab + MAX_NEG_CROP;
-    int x, y;
-
-    for (y = 0; y < h; y++) {
-        for (x = 0; x < 16; x++)
-            dst[x] = FILTER_6TAP(src, filter, srcstride);
-        dst += dststride;
-        src += srcstride;
-    }
-#endif
 }
 
 void ff_put_vp8_epel8_v6_mmi(uint8_t *dst, ptrdiff_t dststride, uint8_t *src,
         ptrdiff_t srcstride, int h, int mx, int my)
 {
-#if 1
     const uint64_t *filter = fourtap_subpel_filters[my - 1];
     double ftmp[9];
     uint32_t tmp[1];
@@ -2028,24 +2096,11 @@  void ff_put_vp8_epel8_v6_mmi(uint8_t *dst, ptrdiff_t dststride, uint8_t *src,
           [filter4]"f"(filter[4]),          [filter5]"f"(filter[5])
         : "memory"
     );
-#else
-    const uint8_t *filter = subpel_filters[my - 1];
-    const uint8_t *cm     = ff_crop_tab + MAX_NEG_CROP;
-    int x, y;
-
-    for (y = 0; y < h; y++) {
-        for (x = 0; x < 8; x++)
-            dst[x] = FILTER_6TAP(src, filter, srcstride);
-        dst += dststride;
-        src += srcstride;
-    }
-#endif
 }
 
 void ff_put_vp8_epel4_v6_mmi(uint8_t *dst, ptrdiff_t dststride, uint8_t *src,
         ptrdiff_t srcstride, int h, int mx, int my)
 {
-#if 1
     const uint64_t *filter = fourtap_subpel_filters[my - 1];
     double ftmp[6];
     uint32_t tmp[1];
@@ -2086,24 +2141,11 @@  void ff_put_vp8_epel4_v6_mmi(uint8_t *dst, ptrdiff_t dststride, uint8_t *src,
           [filter4]"f"(filter[4]),          [filter5]"f"(filter[5])
         : "memory"
     );
-#else
-    const uint8_t *filter = subpel_filters[my - 1];
-    const uint8_t *cm     = ff_crop_tab + MAX_NEG_CROP;
-    int x, y;
-
-    for (y = 0; y < h; y++) {
-        for (x = 0; x < 4; x++)
-            dst[x] = FILTER_6TAP(src, filter, srcstride);
-        dst += dststride;
-        src += srcstride;
-    }
-#endif
 }
 
 void ff_put_vp8_epel16_h4v4_mmi(uint8_t *dst, ptrdiff_t dststride, uint8_t *src,
         ptrdiff_t srcstride, int h, int mx, int my)
 {
-#if 1
     DECLARE_ALIGNED(8, uint8_t, tmp_array[560]);
     uint8_t *tmp = tmp_array;
 
@@ -2111,38 +2153,11 @@  void ff_put_vp8_epel16_h4v4_mmi(uint8_t *dst, ptrdiff_t dststride, uint8_t *src,
     ff_put_vp8_epel16_h4_mmi(tmp, 16, src, srcstride, h + 3, mx, my);
     tmp = tmp_array + 16;
     ff_put_vp8_epel16_v4_mmi(dst, dststride, tmp, 16, h, mx, my);
-#else
-    const uint8_t *filter = subpel_filters[mx - 1];
-    const uint8_t *cm     = ff_crop_tab + MAX_NEG_CROP;
-    int x, y;
-    uint8_t tmp_array[560];
-    uint8_t *tmp = tmp_array;
-
-    src -= srcstride;
-
-    for (y = 0; y < h + 3; y++) {
-        for (x = 0; x < 16; x++)
-            tmp[x] = FILTER_4TAP(src, filter, 1);
-        tmp += 16;
-        src += srcstride;
-    }
-
-    tmp    = tmp_array + 16;
-    filter = subpel_filters[my - 1];
-
-    for (y = 0; y < h; y++) {
-        for (x = 0; x < 16; x++)
-            dst[x] = FILTER_4TAP(tmp, filter, 16);
-        dst += dststride;
-        tmp += 16;
-    }
-#endif
 }
 
 void ff_put_vp8_epel8_h4v4_mmi(uint8_t *dst, ptrdiff_t dststride, uint8_t *src,
         ptrdiff_t srcstride, int h, int mx, int my)
 {
-#if 1
     DECLARE_ALIGNED(8, uint8_t, tmp_array[152]);
     uint8_t *tmp = tmp_array;
 
@@ -2150,38 +2165,11 @@  void ff_put_vp8_epel8_h4v4_mmi(uint8_t *dst, ptrdiff_t dststride, uint8_t *src,
     ff_put_vp8_epel8_h4_mmi(tmp, 8, src, srcstride, h + 3, mx, my);
     tmp = tmp_array + 8;
     ff_put_vp8_epel8_v4_mmi(dst, dststride, tmp, 8, h, mx, my);
-#else
-    const uint8_t *filter = subpel_filters[mx - 1];
-    const uint8_t *cm     = ff_crop_tab + MAX_NEG_CROP;
-    int x, y;
-    uint8_t tmp_array[152];
-    uint8_t *tmp = tmp_array;
-
-    src -= srcstride;
-
-    for (y = 0; y < h + 3; y++) {
-        for (x = 0; x < 8; x++)
-            tmp[x] = FILTER_4TAP(src, filter, 1);
-        tmp += 8;
-        src += srcstride;
-    }
-
-    tmp    = tmp_array + 8;
-    filter = subpel_filters[my - 1];
-
-    for (y = 0; y < h; y++) {
-        for (x = 0; x < 8; x++)
-            dst[x] = FILTER_4TAP(tmp, filter, 8);
-        dst += dststride;
-        tmp += 8;
-    }
-#endif
 }
 
 void ff_put_vp8_epel4_h4v4_mmi(uint8_t *dst, ptrdiff_t dststride, uint8_t *src,
         ptrdiff_t srcstride, int h, int mx, int my)
 {
-#if 1
     DECLARE_ALIGNED(4, uint8_t, tmp_array[44]);
     uint8_t *tmp = tmp_array;
 
@@ -2189,37 +2177,11 @@  void ff_put_vp8_epel4_h4v4_mmi(uint8_t *dst, ptrdiff_t dststride, uint8_t *src,
     ff_put_vp8_epel4_h4_mmi(tmp, 4, src, srcstride, h + 3, mx, my);
     tmp = tmp_array + 4;
     ff_put_vp8_epel4_v4_mmi(dst, dststride, tmp, 4, h, mx, my);
-#else
-    const uint8_t *filter = subpel_filters[mx - 1];
-    const uint8_t *cm     = ff_crop_tab + MAX_NEG_CROP;
-    int x, y;
-    uint8_t tmp_array[44];
-    uint8_t *tmp = tmp_array;
-
-    src -= srcstride;
-
-    for (y = 0; y < h + 3; y++) {
-        for (x = 0; x < 4; x++)
-            tmp[x] = FILTER_4TAP(src, filter, 1);
-        tmp += 4;
-        src += srcstride;
-    }
-    tmp    = tmp_array + 4;
-    filter = subpel_filters[my - 1];
-
-    for (y = 0; y < h; y++) {
-        for (x = 0; x < 4; x++)
-            dst[x] = FILTER_4TAP(tmp, filter, 4);
-        dst += dststride;
-        tmp += 4;
-    }
-#endif
 }
 
 void ff_put_vp8_epel16_h4v6_mmi(uint8_t *dst, ptrdiff_t dststride, uint8_t *src,
         ptrdiff_t srcstride, int h, int mx, int my)
 {
-#if 1
     DECLARE_ALIGNED(8, uint8_t, tmp_array[592]);
     uint8_t *tmp = tmp_array;
 
@@ -2227,38 +2189,11 @@  void ff_put_vp8_epel16_h4v6_mmi(uint8_t *dst, ptrdiff_t dststride, uint8_t *src,
     ff_put_vp8_epel16_h4_mmi(tmp, 16, src, srcstride, h + 5, mx, my);
     tmp    = tmp_array + 32;
     ff_put_vp8_epel16_v6_mmi(dst, dststride, tmp, 16, h, mx, my);
-#else
-    const uint8_t *filter = subpel_filters[mx - 1];
-    const uint8_t *cm     = ff_crop_tab + MAX_NEG_CROP;
-    int x, y;
-    uint8_t tmp_array[592];
-    uint8_t *tmp = tmp_array;
-
-    src -= 2 * srcstride;
-
-    for (y = 0; y < h + 5; y++) {
-        for (x = 0; x < 16; x++)
-            tmp[x] = FILTER_4TAP(src, filter, 1);
-        tmp += 16;
-        src += srcstride;
-    }
-
-    tmp    = tmp_array + 32;
-    filter = subpel_filters[my - 1];
-
-    for (y = 0; y < h; y++) {
-        for (x = 0; x < 16; x++)
-            dst[x] = FILTER_6TAP(tmp, filter, 16);
-        dst += dststride;
-        tmp += 16;
-    }
-#endif
 }
 
 void ff_put_vp8_epel8_h4v6_mmi(uint8_t *dst, ptrdiff_t dststride, uint8_t *src,
         ptrdiff_t srcstride, int h, int mx, int my)
 {
-#if 1
     DECLARE_ALIGNED(8, uint8_t, tmp_array[168]);
     uint8_t *tmp = tmp_array;
 
@@ -2266,38 +2201,11 @@  void ff_put_vp8_epel8_h4v6_mmi(uint8_t *dst, ptrdiff_t dststride, uint8_t *src,
     ff_put_vp8_epel8_h4_mmi(tmp, 8, src, srcstride, h + 5, mx, my);
     tmp    = tmp_array + 16;
     ff_put_vp8_epel8_v6_mmi(dst, dststride, tmp, 8, h, mx, my);
-#else
-    const uint8_t *filter = subpel_filters[mx - 1];
-    const uint8_t *cm     = ff_crop_tab + MAX_NEG_CROP;
-    int x, y;
-    uint8_t tmp_array[168];
-    uint8_t *tmp = tmp_array;
-
-    src -= 2 * srcstride;
-
-    for (y = 0; y < h + 5; y++) {
-        for (x = 0; x < 8; x++)
-            tmp[x] = FILTER_4TAP(src, filter, 1);
-        tmp += 8;
-        src += srcstride;
-    }
-
-    tmp    = tmp_array + 16;
-    filter = subpel_filters[my - 1];
-
-    for (y = 0; y < h; y++) {
-        for (x = 0; x < 8; x++)
-            dst[x] = FILTER_6TAP(tmp, filter, 8);
-        dst += dststride;
-        tmp += 8;
-    }
-#endif
 }
 
 void ff_put_vp8_epel4_h4v6_mmi(uint8_t *dst, ptrdiff_t dststride, uint8_t *src,
         ptrdiff_t srcstride, int h, int mx, int my)
 {
-#if 1
     DECLARE_ALIGNED(4, uint8_t, tmp_array[52]);
     uint8_t *tmp = tmp_array;
 
@@ -2305,38 +2213,11 @@  void ff_put_vp8_epel4_h4v6_mmi(uint8_t *dst, ptrdiff_t dststride, uint8_t *src,
     ff_put_vp8_epel4_h4_mmi(tmp, 4, src, srcstride, h + 5, mx, my);
     tmp    = tmp_array + 8;
     ff_put_vp8_epel4_v6_mmi(dst, dststride, tmp, 4, h, mx, my);
-#else
-    const uint8_t *filter = subpel_filters[mx - 1];
-    const uint8_t *cm     = ff_crop_tab + MAX_NEG_CROP;
-    int x, y;
-    uint8_t tmp_array[52];
-    uint8_t *tmp = tmp_array;
-
-    src -= 2 * srcstride;
-
-    for (y = 0; y < h + 5; y++) {
-        for (x = 0; x < 4; x++)
-            tmp[x] = FILTER_4TAP(src, filter, 1);
-        tmp += 4;
-        src += srcstride;
-    }
-
-    tmp    = tmp_array + 8;
-    filter = subpel_filters[my - 1];
-
-    for (y = 0; y < h; y++) {
-        for (x = 0; x < 4; x++)
-            dst[x] = FILTER_6TAP(tmp, filter, 4);
-        dst += dststride;
-        tmp += 4;
-    }
-#endif
 }
 
 void ff_put_vp8_epel16_h6v4_mmi(uint8_t *dst, ptrdiff_t dststride, uint8_t *src,
         ptrdiff_t srcstride, int h, int mx, int my)
 {
-#if 1
     DECLARE_ALIGNED(8, uint8_t, tmp_array[560]);
     uint8_t *tmp = tmp_array;
 
@@ -2344,38 +2225,11 @@  void ff_put_vp8_epel16_h6v4_mmi(uint8_t *dst, ptrdiff_t dststride, uint8_t *src,
     ff_put_vp8_epel16_h6_mmi(tmp, 16, src, srcstride, h + 3, mx, my);
     tmp    = tmp_array + 16;
     ff_put_vp8_epel16_v4_mmi(dst, dststride, tmp, 16, h, mx, my);
-#else
-    const uint8_t *filter = subpel_filters[mx - 1];
-    const uint8_t *cm     = ff_crop_tab + MAX_NEG_CROP;
-    int x, y;
-    uint8_t tmp_array[560];
-    uint8_t *tmp = tmp_array;
-
-    src -= srcstride;
-
-    for (y = 0; y < h + 3; y++) {
-        for (x = 0; x < 16; x++)
-            tmp[x] = FILTER_6TAP(src, filter, 1);
-        tmp += 16;
-        src += srcstride;
-    }
-
-    tmp    = tmp_array + 16;
-    filter = subpel_filters[my - 1];
-
-    for (y = 0; y < h; y++) {
-        for (x = 0; x < 16; x++)
-            dst[x] = FILTER_4TAP(tmp, filter, 16);
-        dst += dststride;
-        tmp += 16;
-    }
-#endif
 }
 
 void ff_put_vp8_epel8_h6v4_mmi(uint8_t *dst, ptrdiff_t dststride, uint8_t *src,
         ptrdiff_t srcstride, int h, int mx, int my)
 {
-#if 1
     DECLARE_ALIGNED(8, uint8_t, tmp_array[152]);
     uint8_t *tmp = tmp_array;
 
@@ -2383,38 +2237,11 @@  void ff_put_vp8_epel8_h6v4_mmi(uint8_t *dst, ptrdiff_t dststride, uint8_t *src,
     ff_put_vp8_epel8_h6_mmi(tmp, 8, src, srcstride, h + 3, mx, my);
     tmp    = tmp_array + 8;
     ff_put_vp8_epel8_v4_mmi(dst, dststride, tmp, 8, h, mx, my);
-#else
-    const uint8_t *filter = subpel_filters[mx - 1];
-    const uint8_t *cm     = ff_crop_tab + MAX_NEG_CROP;
-    int x, y;
-    uint8_t tmp_array[152];
-    uint8_t *tmp = tmp_array;
-
-    src -= srcstride;
-
-    for (y = 0; y < h + 3; y++) {
-        for (x = 0; x < 8; x++)
-            tmp[x] = FILTER_6TAP(src, filter, 1);
-        tmp += 8;
-        src += srcstride;
-    }
-
-    tmp    = tmp_array + 8;
-    filter = subpel_filters[my - 1];
-
-    for (y = 0; y < h; y++) {
-        for (x = 0; x < 8; x++)
-            dst[x] = FILTER_4TAP(tmp, filter, 8);
-        dst += dststride;
-        tmp += 8;
-    }
-#endif
 }
 
 void ff_put_vp8_epel4_h6v4_mmi(uint8_t *dst, ptrdiff_t dststride, uint8_t *src,
         ptrdiff_t srcstride, int h, int mx, int my)
 {
-#if 1
     DECLARE_ALIGNED(4, uint8_t, tmp_array[44]);
     uint8_t *tmp = tmp_array;
 
@@ -2422,38 +2249,11 @@  void ff_put_vp8_epel4_h6v4_mmi(uint8_t *dst, ptrdiff_t dststride, uint8_t *src,
     ff_put_vp8_epel4_h6_mmi(tmp, 4, src, srcstride, h + 3, mx, my);
     tmp    = tmp_array + 4;
     ff_put_vp8_epel4_v4_mmi(dst, dststride, tmp, 4, h, mx, my);
-#else
-    const uint8_t *filter = subpel_filters[mx - 1];
-    const uint8_t *cm     = ff_crop_tab + MAX_NEG_CROP;
-    int x, y;
-    uint8_t tmp_array[44];
-    uint8_t *tmp = tmp_array;
-
-    src -= srcstride;
-
-    for (y = 0; y < h + 3; y++) {
-        for (x = 0; x < 4; x++)
-            tmp[x] = FILTER_6TAP(src, filter, 1);
-        tmp += 4;
-        src += srcstride;
-    }
-
-    tmp    = tmp_array + 4;
-    filter = subpel_filters[my - 1];
-
-    for (y = 0; y < h; y++) {
-        for (x = 0; x < 4; x++)
-            dst[x] = FILTER_4TAP(tmp, filter, 4);
-        dst += dststride;
-        tmp += 4;
-    }
-#endif
 }
 
 void ff_put_vp8_epel16_h6v6_mmi(uint8_t *dst, ptrdiff_t dststride, uint8_t *src,
         ptrdiff_t srcstride, int h, int mx, int my)
 {
-#if 1
     DECLARE_ALIGNED(8, uint8_t, tmp_array[592]);
     uint8_t *tmp = tmp_array;
 
@@ -2461,38 +2261,11 @@  void ff_put_vp8_epel16_h6v6_mmi(uint8_t *dst, ptrdiff_t dststride, uint8_t *src,
     ff_put_vp8_epel16_h6_mmi(tmp, 16, src, srcstride, h + 5, mx, my);
     tmp    = tmp_array + 32;
     ff_put_vp8_epel16_v6_mmi(dst, dststride, tmp, 16, h, mx, my);
-#else
-    const uint8_t *filter = subpel_filters[mx - 1];
-    const uint8_t *cm     = ff_crop_tab + MAX_NEG_CROP;
-    int x, y;
-    uint8_t tmp_array[592];
-    uint8_t *tmp = tmp_array;
-
-    src -= 2 * srcstride;
-
-    for (y = 0; y < h + 5; y++) {
-        for (x = 0; x < 16; x++)
-            tmp[x] = FILTER_6TAP(src, filter, 1);
-        tmp += 16;
-        src += srcstride;
-    }
-
-    tmp    = tmp_array + 32;
-    filter = subpel_filters[my - 1];
-
-    for (y = 0; y < h; y++) {
-        for (x = 0; x < 16; x++)
-            dst[x] = FILTER_6TAP(tmp, filter, 16);
-        dst += dststride;
-        tmp += 16;
-    }
-#endif
 }
 
 void ff_put_vp8_epel8_h6v6_mmi(uint8_t *dst, ptrdiff_t dststride, uint8_t *src,
         ptrdiff_t srcstride, int h, int mx, int my)
 {
-#if 1
     DECLARE_ALIGNED(8, uint8_t, tmp_array[168]);
     uint8_t *tmp = tmp_array;
 
@@ -2500,38 +2273,11 @@  void ff_put_vp8_epel8_h6v6_mmi(uint8_t *dst, ptrdiff_t dststride, uint8_t *src,
     ff_put_vp8_epel8_h6_mmi(tmp, 8, src, srcstride, h + 5, mx, my);
     tmp    = tmp_array + 16;
     ff_put_vp8_epel8_v6_mmi(dst, dststride, tmp, 8, h, mx, my);
-#else
-    const uint8_t *filter = subpel_filters[mx - 1];
-    const uint8_t *cm     = ff_crop_tab + MAX_NEG_CROP;
-    int x, y;
-    uint8_t tmp_array[168];
-    uint8_t *tmp = tmp_array;
-
-    src -= 2 * srcstride;
-
-    for (y = 0; y < h + 5; y++) {
-        for (x = 0; x < 8; x++)
-            tmp[x] = FILTER_6TAP(src, filter, 1);
-        tmp += 8;
-        src += srcstride;
-    }
-
-    tmp    = tmp_array + 16;
-    filter = subpel_filters[my - 1];
-
-    for (y = 0; y < h; y++) {
-        for (x = 0; x < 8; x++)
-            dst[x] = FILTER_6TAP(tmp, filter, 8);
-        dst += dststride;
-        tmp += 8;
-    }
-#endif
 }
 
 void ff_put_vp8_epel4_h6v6_mmi(uint8_t *dst, ptrdiff_t dststride, uint8_t *src,
         ptrdiff_t srcstride, int h, int mx, int my)
 {
-#if 1
     DECLARE_ALIGNED(4, uint8_t, tmp_array[52]);
     uint8_t *tmp = tmp_array;
 
@@ -2539,38 +2285,11 @@  void ff_put_vp8_epel4_h6v6_mmi(uint8_t *dst, ptrdiff_t dststride, uint8_t *src,
     ff_put_vp8_epel4_h6_mmi(tmp, 4, src, srcstride, h + 5, mx, my);
     tmp    = tmp_array + 8;
     ff_put_vp8_epel4_v6_mmi(dst, dststride, tmp, 4, h, mx, my);
-#else
-    const uint8_t *filter = subpel_filters[mx - 1];
-    const uint8_t *cm     = ff_crop_tab + MAX_NEG_CROP;
-    int x, y;
-    uint8_t tmp_array[52];
-    uint8_t *tmp = tmp_array;
-
-    src -= 2 * srcstride;
-
-    for (y = 0; y < h + 5; y++) {
-        for (x = 0; x < 4; x++)
-            tmp[x] = FILTER_6TAP(src, filter, 1);
-        tmp += 4;
-        src += srcstride;
-    }
-
-    tmp    = tmp_array + 8;
-    filter = subpel_filters[my - 1];
-
-    for (y = 0; y < h; y++) {
-        for (x = 0; x < 4; x++)
-            dst[x] = FILTER_6TAP(tmp, filter, 4);
-        dst += dststride;
-        tmp += 4;
-    }
-#endif
 }
 
 void ff_put_vp8_bilinear16_h_mmi(uint8_t *dst, ptrdiff_t dstride, uint8_t *src,
         ptrdiff_t sstride, int h, int mx, int my)
 {
-#if 1
     int a = 8 - mx, b = mx;
     double ftmp[7];
     uint32_t tmp[1];
@@ -2630,23 +2349,11 @@  void ff_put_vp8_bilinear16_h_mmi(uint8_t *dst, ptrdiff_t dstride, uint8_t *src,
           [ff_pw_4]"f"(ff_pw_4)
         : "memory"
     );
-#else
-    int a = 8 - mx, b = mx;
-    int x, y;
-
-    for (y = 0; y < h; y++) {
-        for (x = 0; x < 16; x++)
-            dst[x] = (a * src[x] + b * src[x + 1] + 4) >> 3;
-        dst += dstride;
-        src += sstride;
-    }
-#endif
 }
 
 void ff_put_vp8_bilinear16_v_mmi(uint8_t *dst, ptrdiff_t dstride, uint8_t *src,
         ptrdiff_t sstride, int h, int mx, int my)
 {
-#if 1
     int c = 8 - my, d = my;
     double ftmp[7];
     uint32_t tmp[1];
@@ -2698,57 +2405,21 @@  void ff_put_vp8_bilinear16_v_mmi(uint8_t *dst, ptrdiff_t dstride, uint8_t *src,
           [ff_pw_4]"f"(ff_pw_4)
         : "memory"
     );
-#else
-    int c = 8 - my, d = my;
-    int x, y;
-
-    for (y = 0; y < h; y++) {
-        for (x = 0; x < 16; x++)
-            dst[x] = (c * src[x] + d * src[x + sstride] + 4) >> 3;
-        dst += dstride;
-        src += sstride;
-    }
-#endif
 }
 
 void ff_put_vp8_bilinear16_hv_mmi(uint8_t *dst, ptrdiff_t dstride, uint8_t *src,
         ptrdiff_t sstride, int h, int mx, int my)
 {
-#if 1
     DECLARE_ALIGNED(8, uint8_t, tmp_array[528]);
     uint8_t *tmp = tmp_array;
 
     ff_put_vp8_bilinear16_h_mmi(tmp, 16, src, sstride, h + 1, mx, my);
     ff_put_vp8_bilinear16_v_mmi(dst, dstride, tmp, 16, h, mx, my);
-#else
-    int a = 8 - mx, b = mx;
-    int c = 8 - my, d = my;
-    int x, y;
-    uint8_t tmp_array[528];
-    uint8_t *tmp = tmp_array;
-
-    for (y = 0; y < h + 1; y++) {
-        for (x = 0; x < 16; x++)
-            tmp[x] = (a * src[x] + b * src[x + 1] + 4) >> 3;
-        tmp += 16;
-        src += sstride;
-    }
-
-    tmp = tmp_array;
-
-    for (y = 0; y < h; y++) {
-        for (x = 0; x < 16; x++)
-            dst[x] = (c * tmp[x] + d * tmp[x + 16] + 4) >> 3;
-        dst += dstride;
-        tmp += 16;
-    }
-#endif
 }
 
 void ff_put_vp8_bilinear8_h_mmi(uint8_t *dst, ptrdiff_t dstride, uint8_t *src,
         ptrdiff_t sstride, int h, int mx, int my)
 {
-#if 1
     int a = 8 - mx, b = mx;
     double ftmp[7];
     uint32_t tmp[1];
@@ -2792,23 +2463,11 @@  void ff_put_vp8_bilinear8_h_mmi(uint8_t *dst, ptrdiff_t dstride, uint8_t *src,
           [ff_pw_4]"f"(ff_pw_4)
         : "memory"
     );
-#else
-    int a = 8 - mx, b = mx;
-    int x, y;
-
-    for (y = 0; y < h; y++) {
-        for (x = 0; x < 8; x++)
-            dst[x] = (a * src[x] + b * src[x + 1] + 4) >> 3;
-        dst += dstride;
-        src += sstride;
-    }
-#endif
 }
 
 void ff_put_vp8_bilinear8_v_mmi(uint8_t *dst, ptrdiff_t dstride, uint8_t *src,
         ptrdiff_t sstride, int h, int mx, int my)
 {
-#if 1
     int c = 8 - my, d = my;
     double ftmp[7];
     uint32_t tmp[1];
@@ -2854,57 +2513,21 @@  void ff_put_vp8_bilinear8_v_mmi(uint8_t *dst, ptrdiff_t dstride, uint8_t *src,
           [ff_pw_4]"f"(ff_pw_4)
         : "memory"
     );
-#else
-    int c = 8 - my, d = my;
-    int x, y;
-
-    for (y = 0; y < h; y++) {
-        for (x = 0; x < 8; x++)
-            dst[x] = (c * src[x] + d * src[x + sstride] + 4) >> 3;
-        dst += dstride;
-        src += sstride;
-    }
-#endif
 }
 
 void ff_put_vp8_bilinear8_hv_mmi(uint8_t *dst, ptrdiff_t dstride, uint8_t *src,
         ptrdiff_t sstride, int h, int mx, int my)
 {
-#if 1
     DECLARE_ALIGNED(8, uint8_t, tmp_array[136]);
     uint8_t *tmp = tmp_array;
 
     ff_put_vp8_bilinear8_h_mmi(tmp, 8, src, sstride, h + 1, mx, my);
     ff_put_vp8_bilinear8_v_mmi(dst, dstride, tmp, 8, h, mx, my);
-#else
-    int a = 8 - mx, b = mx;
-    int c = 8 - my, d = my;
-    int x, y;
-    uint8_t tmp_array[136];
-    uint8_t *tmp = tmp_array;
-
-    for (y = 0; y < h + 1; y++) {
-        for (x = 0; x < 8; x++)
-            tmp[x] = (a * src[x] + b * src[x + 1] + 4) >> 3;
-        tmp += 8;
-        src += sstride;
-    }
-
-    tmp = tmp_array;
-
-    for (y = 0; y < h; y++) {
-        for (x = 0; x < 8; x++)
-            dst[x] = (c * tmp[x] + d * tmp[x + 8] + 4) >> 3;
-        dst += dstride;
-        tmp += 8;
-    }
-#endif
 }
 
 void ff_put_vp8_bilinear4_h_mmi(uint8_t *dst, ptrdiff_t dstride, uint8_t *src,
         ptrdiff_t sstride, int h, int mx, int my)
 {
-#if 1
     int a = 8 - mx, b = mx;
     double ftmp[5];
     uint32_t tmp[1];
@@ -2945,23 +2568,11 @@  void ff_put_vp8_bilinear4_h_mmi(uint8_t *dst, ptrdiff_t dstride, uint8_t *src,
           [ff_pw_4]"f"(ff_pw_4)
         : "memory"
     );
-#else
-    int a = 8 - mx, b = mx;
-    int x, y;
-
-    for (y = 0; y < h; y++) {
-        for (x = 0; x < 4; x++)
-            dst[x] = (a * src[x] + b * src[x + 1] + 4) >> 3;
-        dst += dstride;
-        src += sstride;
-    }
-#endif
 }
 
 void ff_put_vp8_bilinear4_v_mmi(uint8_t *dst, ptrdiff_t dstride, uint8_t *src,
         ptrdiff_t sstride, int h, int mx, int my)
 {
-#if 1
     int c = 8 - my, d = my;
     double ftmp[7];
     uint32_t tmp[1];
@@ -3004,49 +2615,14 @@  void ff_put_vp8_bilinear4_v_mmi(uint8_t *dst, ptrdiff_t dstride, uint8_t *src,
           [ff_pw_4]"f"(ff_pw_4)
         : "memory"
     );
-#else
-    int c = 8 - my, d = my;
-    int x, y;
-
-    for (y = 0; y < h; y++) {
-        for (x = 0; x < 4; x++)
-            dst[x] = (c * src[x] + d * src[x + sstride] + 4) >> 3;
-        dst += dstride;
-        src += sstride;
-    }
-#endif
 }
 
 void ff_put_vp8_bilinear4_hv_mmi(uint8_t *dst, ptrdiff_t dstride, uint8_t *src,
         ptrdiff_t sstride, int h, int mx, int my)
 {
-#if 1
     DECLARE_ALIGNED(4, uint8_t, tmp_array[36]);
     uint8_t *tmp = tmp_array;
 
     ff_put_vp8_bilinear4_h_mmi(tmp, 4, src, sstride, h + 1, mx, my);
     ff_put_vp8_bilinear4_v_mmi(dst, dstride, tmp, 4, h, mx, my);
-#else
-    int a = 8 - mx, b = mx;
-    int c = 8 - my, d = my;
-    int x, y;
-    uint8_t tmp_array[36];
-    uint8_t *tmp = tmp_array;
-
-    for (y = 0; y < h + 1; y++) {
-        for (x = 0; x < 4; x++)
-            tmp[x] = (a * src[x] + b * src[x + 1] + 4) >> 3;
-        tmp += 4;
-        src += sstride;
-    }
-
-    tmp = tmp_array;
-
-    for (y = 0; y < h; y++) {
-        for (x = 0; x < 4; x++)
-            dst[x] = (c * tmp[x] + d * tmp[x + 4] + 4) >> 3;
-        dst += dstride;
-        tmp += 4;
-    }
-#endif
 }