[FFmpeg-devel,2/2] avcodec/x86/mpegenc: support transpose permuation type

Submitted by James Darnley on June 16, 2017, 1:53 p.m.

Details

Message ID 20170616135328.14495-2-jdarnley@obe.tv
State Accepted
Commit e3db94302c795a955d97566eaba8a0b0dfef788c
Headers show

Commit Message

James Darnley June 16, 2017, 1:53 p.m.
---
 libavcodec/x86/mpegvideoenc_template.c | 47 +++++++++++++++++++++++++++++++++-
 1 file changed, 46 insertions(+), 1 deletion(-)

Comments

Michael Niedermayer June 16, 2017, 3:48 p.m.
On Fri, Jun 16, 2017 at 03:53:28PM +0200, James Darnley wrote:
> ---
>  libavcodec/x86/mpegvideoenc_template.c | 47 +++++++++++++++++++++++++++++++++-
>  1 file changed, 46 insertions(+), 1 deletion(-)

Are these 2 with the other patchset intended to give exactly the same
output ?

for example: (with the 2 patches and the previously problematic one)
./ffmpeg -i ~/tickets/5311/TOON.AVI -flags +bitexact file.avi
./ffmpeg-try -i ~/tickets/5311/TOON.AVI -flags +bitexact file-new.avi

-rw-r----- 1 michael michael 9124822 Jun 16 17:45 file.avi
-rw-r----- 1 michael michael 9124874 Jun 16 17:46 file-new.avi

source file should be at:
https://drive.google.com/open?id=0B3_pEBoLs0faaFY0ME92SDA1VEU
(link from ticket)

[...]
James Darnley June 16, 2017, 5:32 p.m.
On 2017-06-16 17:48, Michael Niedermayer wrote:
> On Fri, Jun 16, 2017 at 03:53:28PM +0200, James Darnley wrote:
>> ---
>>  libavcodec/x86/mpegvideoenc_template.c | 47 +++++++++++++++++++++++++++++++++-
>>  1 file changed, 46 insertions(+), 1 deletion(-)
> 
> Are these 2 with the other patchset intended to give exactly the same
> output ?
> 
> for example: (with the 2 patches and the previously problematic one)
> ./ffmpeg -i ~/tickets/5311/TOON.AVI -flags +bitexact file.avi
> ./ffmpeg-try -i ~/tickets/5311/TOON.AVI -flags +bitexact file-new.avi
> 
> -rw-r----- 1 michael michael 9124822 Jun 16 17:45 file.avi
> -rw-r----- 1 michael michael 9124874 Jun 16 17:46 file-new.avi

Thank you.  The intention was that all 8 patches should give identical
output to either the C or the MMX or both.  Ronald says he has modeled
his "hacks" based on the C because it is much clearer than the MMX.

Barring any remaining bugs in my/our new code we would expect it to be
identical to the C.  With this sample I do infarct get identical
results.  For you I think these should give identical results
> ./ffmpeg-try -i ~/tickets/5311/TOON.AVI -flags +bitexact -idct simple file.avi
> ./ffmpeg-try -i ~/tickets/5311/TOON.AVI -flags +bitexact -idct simpleauto file.avi

The first should use the C and the second should use the new SSE2 (or
AVX depending on your CPU).

This could be an issue with the MMX function raising its head.  MMX was
the old default choice.  Ronald and I are investigating.

Patch hide | download patch | download mbox

diff --git a/libavcodec/x86/mpegvideoenc_template.c b/libavcodec/x86/mpegvideoenc_template.c
index 3ce72e1367..1201be514b 100644
--- a/libavcodec/x86/mpegvideoenc_template.c
+++ b/libavcodec/x86/mpegvideoenc_template.c
@@ -366,12 +366,57 @@  static int RENAME(dct_quantize)(MpegEncContext *s,
         block[0x3D] = temp_block[0x3D]; block[0x36] = temp_block[0x36];
         block[0x2F] = temp_block[0x2F]; block[0x37] = temp_block[0x37];
         block[0x3E] = temp_block[0x3E]; block[0x3F] = temp_block[0x3F];
+    } else if (s->idsp.perm_type == FF_IDCT_PERM_TRANSPOSE) {
+        if(last_non_zero_p1 <= 1) goto end;
+        block[0x08] = temp_block[0x01];
+        block[0x01] = temp_block[0x08]; block[0x02] = temp_block[0x10];
+        if(last_non_zero_p1 <= 4) goto end;
+        block[0x09] = temp_block[0x09]; block[0x10] = temp_block[0x02];
+        block[0x18] = temp_block[0x03];
+        if(last_non_zero_p1 <= 7) goto end;
+        block[0x11] = temp_block[0x0A]; block[0x0A] = temp_block[0x11];
+        block[0x03] = temp_block[0x18]; block[0x04] = temp_block[0x20];
+        if(last_non_zero_p1 <= 11) goto end;
+        block[0x0B] = temp_block[0x19];
+        block[0x12] = temp_block[0x12]; block[0x19] = temp_block[0x0B];
+        block[0x20] = temp_block[0x04]; block[0x28] = temp_block[0x05];
+        if(last_non_zero_p1 <= 16) goto end;
+        block[0x21] = temp_block[0x0C]; block[0x1A] = temp_block[0x13];
+        block[0x13] = temp_block[0x1A]; block[0x0C] = temp_block[0x21];
+        block[0x05] = temp_block[0x28]; block[0x06] = temp_block[0x30];
+        block[0x0D] = temp_block[0x29]; block[0x14] = temp_block[0x22];
+        if(last_non_zero_p1 <= 24) goto end;
+        block[0x1B] = temp_block[0x1B]; block[0x22] = temp_block[0x14];
+        block[0x29] = temp_block[0x0D]; block[0x30] = temp_block[0x06];
+        block[0x38] = temp_block[0x07]; block[0x31] = temp_block[0x0E];
+        block[0x2A] = temp_block[0x15]; block[0x23] = temp_block[0x1C];
+        if(last_non_zero_p1 <= 32) goto end;
+        block[0x1C] = temp_block[0x23]; block[0x15] = temp_block[0x2A];
+        block[0x0E] = temp_block[0x31]; block[0x07] = temp_block[0x38];
+        block[0x0F] = temp_block[0x39]; block[0x16] = temp_block[0x32];
+        block[0x1D] = temp_block[0x2B]; block[0x24] = temp_block[0x24];
+        if(last_non_zero_p1 <= 40) goto end;
+        block[0x2B] = temp_block[0x1D]; block[0x32] = temp_block[0x16];
+        block[0x39] = temp_block[0x0F]; block[0x3A] = temp_block[0x17];
+        block[0x33] = temp_block[0x1E]; block[0x2C] = temp_block[0x25];
+        block[0x25] = temp_block[0x2C]; block[0x1E] = temp_block[0x33];
+        if(last_non_zero_p1 <= 48) goto end;
+        block[0x17] = temp_block[0x3A]; block[0x1F] = temp_block[0x3B];
+        block[0x26] = temp_block[0x34]; block[0x2D] = temp_block[0x2D];
+        block[0x34] = temp_block[0x26]; block[0x3B] = temp_block[0x1F];
+        block[0x3C] = temp_block[0x27]; block[0x35] = temp_block[0x2E];
+        if(last_non_zero_p1 <= 56) goto end;
+        block[0x2E] = temp_block[0x35]; block[0x27] = temp_block[0x3C];
+        block[0x2F] = temp_block[0x3D]; block[0x36] = temp_block[0x36];
+        block[0x3D] = temp_block[0x2F]; block[0x3E] = temp_block[0x37];
+        block[0x37] = temp_block[0x3E]; block[0x3F] = temp_block[0x3F];
     } else {
         av_log(s, AV_LOG_DEBUG, "s->idsp.perm_type: %d\n",
                 (int)s->idsp.perm_type);
         av_assert0(s->idsp.perm_type == FF_IDCT_PERM_NONE ||
                 s->idsp.perm_type == FF_IDCT_PERM_LIBMPEG2 ||
-                s->idsp.perm_type == FF_IDCT_PERM_SIMPLE);
+                s->idsp.perm_type == FF_IDCT_PERM_SIMPLE ||
+                s->idsp.perm_type == FF_IDCT_PERM_TRANSPOSE);
     }
     end:
     return last_non_zero_p1 - 1;