Message ID | 1563866874-10116-1-git-send-email-yinshiyou-hf@loongson.cn
State      | Superseded
Why is "block" not aligned? Does the code for other architectures also use unaligned instructions for these? On 23.07.2019, at 09:27, Shiyou Yin <yinshiyou-hf@loongson.cn> wrote: > Ensure the address accesed by gssqc1/gslqc1 are 16-bits memory-aligned. > --- > libavcodec/mips/h264dsp_mmi.c | 48 +++++++++++++----------------------- > libavcodec/mips/simple_idct_mmi.c | 51 +++++++++++++++++++++++++-------------- > libavutil/mips/mmiutils.h | 2 +- > 3 files changed, 51 insertions(+), 50 deletions(-) > > diff --git a/libavcodec/mips/h264dsp_mmi.c b/libavcodec/mips/h264dsp_mmi.c > index ac65a20..a85d782 100644 > --- a/libavcodec/mips/h264dsp_mmi.c > +++ b/libavcodec/mips/h264dsp_mmi.c > @@ -38,6 +38,11 @@ void ff_h264_add_pixels4_8_mmi(uint8_t *dst, int16_t *src, int stride) > MMI_LDC1(%[ftmp2], %[src], 0x08) > MMI_LDC1(%[ftmp3], %[src], 0x10) > MMI_LDC1(%[ftmp4], %[src], 0x18) > + /* memset(src, 0, 32); */ > + MMI_USDC1(%[ftmp0], %[src], 0x00) > + MMI_USDC1(%[ftmp0], %[src], 0x08) > + MMI_USDC1(%[ftmp0], %[src], 0x10) > + MMI_USDC1(%[ftmp0], %[src], 0x18) > MMI_ULWC1(%[ftmp5], %[dst0], 0x00) > MMI_ULWC1(%[ftmp6], %[dst1], 0x00) > MMI_ULWC1(%[ftmp7], %[dst2], 0x00) > @@ -58,11 +63,6 @@ void ff_h264_add_pixels4_8_mmi(uint8_t *dst, int16_t *src, int stride) > MMI_SWC1(%[ftmp2], %[dst1], 0x00) > MMI_SWC1(%[ftmp3], %[dst2], 0x00) > MMI_SWC1(%[ftmp4], %[dst3], 0x00) > - > - /* memset(src, 0, 32); */ > - "xor %[ftmp0], %[ftmp0], %[ftmp0] \n\t" > - "gssqc1 %[ftmp0], %[ftmp0], 0x00(%[src]) \n\t" > - "gssqc1 %[ftmp0], %[ftmp0], 0x10(%[src]) \n\t" > : [ftmp0]"=&f"(ftmp[0]), [ftmp1]"=&f"(ftmp[1]), > [ftmp2]"=&f"(ftmp[2]), [ftmp3]"=&f"(ftmp[3]), > [ftmp4]"=&f"(ftmp[4]), [ftmp5]"=&f"(ftmp[5]), > @@ -85,15 +85,21 @@ void ff_h264_idct_add_8_mmi(uint8_t *dst, int16_t *block, int stride) > DECLARE_VAR_ADDRT; > > __asm__ volatile ( > - "dli %[tmp0], 0x01 \n\t" > MMI_LDC1(%[ftmp0], %[block], 0x00) > - "mtc1 %[tmp0], %[ftmp8] \n\t" > MMI_LDC1(%[ftmp1], %[block], 0x08) > - "dli %[tmp0], 0x06 \n\t" > MMI_LDC1(%[ftmp2], %[block], 0x10) > + MMI_LDC1(%[ftmp3], %[block], 0x18) > + /* memset(block, 0, 32) */ > + "xor %[ftmp4], %[ftmp4], %[ftmp4] \n\t" > + MMI_USDC1(%[ftmp4], %[block], 0x00) > + MMI_USDC1(%[ftmp4], %[block], 0x08) > + MMI_USDC1(%[ftmp4], %[block], 0x10) > + MMI_USDC1(%[ftmp4], %[block], 0x18) > + "dli %[tmp0], 0x01 \n\t" > + "mtc1 %[tmp0], %[ftmp8] \n\t" > + "dli %[tmp0], 0x06 \n\t" > "mtc1 %[tmp0], %[ftmp9] \n\t" > "psrah %[ftmp4], %[ftmp1], %[ftmp8] \n\t" > - MMI_LDC1(%[ftmp3], %[block], 0x18) > "psrah %[ftmp5], %[ftmp3], %[ftmp8] \n\t" > "psubh %[ftmp4], %[ftmp4], %[ftmp3] \n\t" > "paddh %[ftmp5], %[ftmp5], %[ftmp1] \n\t" > @@ -121,15 +127,11 @@ void ff_h264_idct_add_8_mmi(uint8_t *dst, int16_t *block, int stride) > "paddh %[ftmp10], %[ftmp3], %[ftmp1] \n\t" > "psubh %[ftmp1], %[ftmp1], %[ftmp3] \n\t" > "paddh %[ftmp11], %[ftmp4], %[ftmp5] \n\t" > - "xor %[ftmp7], %[ftmp7], %[ftmp7] \n\t" > "psubh %[ftmp5], %[ftmp5], %[ftmp4] \n\t" > - MMI_SDC1(%[ftmp7], %[block], 0x00) > - MMI_SDC1(%[ftmp7], %[block], 0x08) > - MMI_SDC1(%[ftmp7], %[block], 0x10) > - MMI_SDC1(%[ftmp7], %[block], 0x18) > MMI_ULWC1(%[ftmp2], %[dst], 0x00) > - "psrah %[ftmp3], %[ftmp10], %[ftmp9] \n\t" > MMI_LWXC1(%[ftmp0], %[dst], %[stride], 0x00) > + "xor %[ftmp7], %[ftmp7], %[ftmp7] \n\t" > + "psrah %[ftmp3], %[ftmp10], %[ftmp9] \n\t" > "psrah %[ftmp4], %[ftmp11], %[ftmp9] \n\t" > "punpcklbh %[ftmp2], %[ftmp2], %[ftmp7] \n\t" > "punpcklbh %[ftmp0], %[ftmp0], %[ftmp7] \n\t" > @@ -153,11 +155,6 @@ void ff_h264_idct_add_8_mmi(uint8_t 
*dst, int16_t *block, int stride) > MMI_SWC1(%[ftmp2], %[dst], 0x00) > "packushb %[ftmp0], %[ftmp0], %[ftmp7] \n\t" > MMI_SWXC1(%[ftmp0], %[dst], %[stride], 0x00) > - > - /* memset(block, 0, 32) */ > - "xor %[ftmp0], %[ftmp0], %[ftmp0] \n\t" > - "gssqc1 %[ftmp0], %[ftmp0], 0x00(%[block]) \n\t" > - "gssqc1 %[ftmp0], %[ftmp0], 0x10(%[block]) \n\t" > : [ftmp0]"=&f"(ftmp[0]), [ftmp1]"=&f"(ftmp[1]), > [ftmp2]"=&f"(ftmp[2]), [ftmp3]"=&f"(ftmp[3]), > [ftmp4]"=&f"(ftmp[4]), [ftmp5]"=&f"(ftmp[5]), > @@ -620,17 +617,6 @@ void ff_h264_idct8_add_8_mmi(uint8_t *dst, int16_t *block, int stride) > MMI_SWC1(%[ftmp6], %[addr0], 0x00) > MMI_SWXC1(%[ftmp7], %[addr0], %[stride], 0x00) > PTR_ADDIU "$29, $29, 0x20 \n\t" > - > - /* memset(block, 0, 128) */ > - "xor %[ftmp0], %[ftmp0], %[ftmp0] \n\t" > - "gssqc1 %[ftmp0], %[ftmp0], 0x00(%[block]) \n\t" > - "gssqc1 %[ftmp0], %[ftmp0], 0x10(%[block]) \n\t" > - "gssqc1 %[ftmp0], %[ftmp0], 0x20(%[block]) \n\t" > - "gssqc1 %[ftmp0], %[ftmp0], 0x30(%[block]) \n\t" > - "gssqc1 %[ftmp0], %[ftmp0], 0x40(%[block]) \n\t" > - "gssqc1 %[ftmp0], %[ftmp0], 0x50(%[block]) \n\t" > - "gssqc1 %[ftmp0], %[ftmp0], 0x60(%[block]) \n\t" > - "gssqc1 %[ftmp0], %[ftmp0], 0x70(%[block]) \n\t" > : [ftmp0]"=&f"(ftmp[0]), [ftmp1]"=&f"(ftmp[1]), > [ftmp2]"=&f"(ftmp[2]), [ftmp3]"=&f"(ftmp[3]), > [ftmp4]"=&f"(ftmp[4]), [ftmp5]"=&f"(ftmp[5]), > diff --git a/libavcodec/mips/simple_idct_mmi.c b/libavcodec/mips/simple_idct_mmi.c > index 7f4bb74..f54f9ea 100644 > --- a/libavcodec/mips/simple_idct_mmi.c > +++ b/libavcodec/mips/simple_idct_mmi.c > @@ -39,7 +39,7 @@ > #define COL_SHIFT 20 > #define DC_SHIFT 3 > > -DECLARE_ALIGNED(8, const int16_t, W_arr)[46] = { > +DECLARE_ALIGNED(16, const int16_t, W_arr)[46] = { > W4, W2, W4, W6, > W1, W3, W5, W7, > W4, W6, -W4, -W2, > @@ -147,14 +147,22 @@ void ff_simple_idct_8_mmi(int16_t *block) > "gslqc1 $f25, $f24, 0x30(%[w_arr]) \n\t" > "gslqc1 $f17, $f16, 0x40(%[w_arr]) \n\t" > /* load source in block */ > - "gslqc1 $f1, $f0, 0x00(%[block]) \n\t" > - "gslqc1 $f3, $f2, 0x10(%[block]) \n\t" > - "gslqc1 $f5, $f4, 0x20(%[block]) \n\t" > - "gslqc1 $f7, $f6, 0x30(%[block]) \n\t" > - "gslqc1 $f9, $f8, 0x40(%[block]) \n\t" > - "gslqc1 $f11, $f10, 0x50(%[block]) \n\t" > - "gslqc1 $f13, $f12, 0x60(%[block]) \n\t" > - "gslqc1 $f15, $f14, 0x70(%[block]) \n\t" > + MMI_ULDC1($f0, %[block], 0x00) > + MMI_ULDC1($f1, %[block], 0x08) > + MMI_ULDC1($f2, %[block], 0x10) > + MMI_ULDC1($f3, %[block], 0x18) > + MMI_ULDC1($f4, %[block], 0x20) > + MMI_ULDC1($f5, %[block], 0x28) > + MMI_ULDC1($f6, %[block], 0x30) > + MMI_ULDC1($f7, %[block], 0x38) > + MMI_ULDC1($f8, %[block], 0x40) > + MMI_ULDC1($f9, %[block], 0x48) > + MMI_ULDC1($f10, %[block], 0x50) > + MMI_ULDC1($f11, %[block], 0x58) > + MMI_ULDC1($f12, %[block], 0x60) > + MMI_ULDC1($f13, %[block], 0x68) > + MMI_ULDC1($f14, %[block], 0x70) > + MMI_ULDC1($f15, %[block], 0x78) > > /* $9: mask ; $f17: ROW_SHIFT */ > "dmfc1 $9, $f17 \n\t" > @@ -394,15 +402,22 @@ void ff_simple_idct_8_mmi(int16_t *block) > "punpcklwd $f11, $f27, $f29 \n\t" > "punpckhwd $f15, $f27, $f29 \n\t" > /* Store */ > - "gssqc1 $f1, $f0, 0x00(%[block]) \n\t" > - "gssqc1 $f5, $f4, 0x10(%[block]) \n\t" > - "gssqc1 $f9, $f8, 0x20(%[block]) \n\t" > - "gssqc1 $f13, $f12, 0x30(%[block]) \n\t" > - "gssqc1 $f3, $f2, 0x40(%[block]) \n\t" > - "gssqc1 $f7, $f6, 0x50(%[block]) \n\t" > - "gssqc1 $f11, $f10, 0x60(%[block]) \n\t" > - "gssqc1 $f15, $f14, 0x70(%[block]) \n\t" > - > + MMI_USDC1($f0, %[block], 0X00) > + MMI_USDC1($f1, %[block], 0X08) > + MMI_USDC1($f4, %[block], 0X10) 
> + MMI_USDC1($f5, %[block], 0X18) > + MMI_USDC1($f8, %[block], 0X20) > + MMI_USDC1($f9, %[block], 0X28) > + MMI_USDC1($f12, %[block], 0X30) > + MMI_USDC1($f13, %[block], 0X38) > + MMI_USDC1($f2, %[block], 0X40) > + MMI_USDC1($f3, %[block], 0X48) > + MMI_USDC1($f6, %[block], 0X50) > + MMI_USDC1($f7, %[block], 0X58) > + MMI_USDC1($f10, %[block], 0X60) > + MMI_USDC1($f11, %[block], 0X68) > + MMI_USDC1($f14, %[block], 0X70) > + MMI_USDC1($f15, %[block], 0X78) > : [block]"+&r"(block) > : [w_arr]"r"(W_arr) > : "memory" > diff --git a/libavutil/mips/mmiutils.h b/libavutil/mips/mmiutils.h > index 05f6b31..bfa6d8b 100644 > --- a/libavutil/mips/mmiutils.h > +++ b/libavutil/mips/mmiutils.h > @@ -205,7 +205,7 @@ > * backup register > */ > #define BACKUP_REG \ > - double temp_backup_reg[8]; \ > + double __attribute__ ((aligned (16))) temp_backup_reg[8]; \ > if (_MIPS_SIM == _ABI64) \ > __asm__ volatile ( \ > "gssqc1 $f25, $f24, 0x00(%[temp]) \n\t" \ > -- > 2.1.0 > > > _______________________________________________ > ffmpeg-devel mailing list > ffmpeg-devel@ffmpeg.org > https://ffmpeg.org/mailman/listinfo/ffmpeg-devel > > To unsubscribe, visit link above, or email > ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".
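For context on the question above: gslqc1/gssqc1 are Loongson 128-bit quad-word
load/store instructions, so the address they access must be 16-byte aligned,
while the MMI_ULDC1/MMI_USDC1 macros used in the patch are their unaligned
64-bit counterparts. One quick way to test whether 'block' really is 16-byte
aligned on the call paths in question is a temporary assertion like the sketch
below (illustrative only, not part of the patch; the helper name is made up):

    #include <assert.h>
    #include <stdint.h>

    /* Hypothetical debug helper: abort if 'block' is not 16-byte aligned,
     * i.e. if an aligned gslqc1/gssqc1 access to it would be invalid. */
    static inline void check_block_alignment(const int16_t *block)
    {
        assert(((uintptr_t)block & 0x0F) == 0);
    }

Dropping such a check into ff_h264_idct_add_8_mmi() and ff_simple_idct_8_mmi()
during a test run would answer the question empirically.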
>Why is "block" not aligned? Does the code for other architectures also use unaligned instructions for >these? Thank you for reminding me. After checking the struct H264SliceContext and function call process, 'block' is find out as 16-bit aligned. There are some refines in this patch, I will upload them in a new patch and only keep the following changes in this patch(V2). >> diff --git a/libavcodec/mips/simple_idct_mmi.c b/libavcodec/mips/simple_idct_mmi.c >> index 7f4bb74..f54f9ea 100644 >> --- a/libavcodec/mips/simple_idct_mmi.c >> +++ b/libavcodec/mips/simple_idct_mmi.c >> @@ -39,7 +39,7 @@ >> #define COL_SHIFT 20 >> #define DC_SHIFT 3 >> >> -DECLARE_ALIGNED(8, const int16_t, W_arr)[46] = { >> +DECLARE_ALIGNED(16, const int16_t, W_arr)[46] = { >> W4, W2, W4, W6, >> W1, W3, W5, W7, >> W4, W6, -W4, -W2, >> diff --git a/libavutil/mips/mmiutils.h b/libavutil/mips/mmiutils.h >> index 05f6b31..bfa6d8b 100644 >> --- a/libavutil/mips/mmiutils.h >> +++ b/libavutil/mips/mmiutils.h >> @@ -205,7 +205,7 @@ >> * backup register >> */ >> #define BACKUP_REG \ >> - double temp_backup_reg[8]; \ >> + double __attribute__ ((aligned (16))) temp_backup_reg[8]; \ >> if (_MIPS_SIM == _ABI64) \ >> __asm__ volatile ( \ >> "gssqc1 $f25, $f24, 0x00(%[temp]) \n\t" \ >> -- >> 2.1.0 >ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".