[FFmpeg-devel,10/11] avcodec/x86: add an 8-bit simple IDCT function based on the x86-64 high depth functions

Submitted by James Darnley on June 19, 2017, 3:11 p.m.

Details

Message ID 20170619151104.31273-11-jdarnley@obe.tv
State New
Headers show

Commit Message

James Darnley June 19, 2017, 3:11 p.m.
Includes add/put functions

Rounding contributed by Ronald S. Bultje
---
 libavcodec/tests/x86/dct.c                |  2 +
 libavcodec/x86/idctdsp_init.c             | 23 ++++++++
 libavcodec/x86/simple_idct.h              |  9 +++
 libavcodec/x86/simple_idct10.asm          | 92 +++++++++++++++++++++++++++++++
 libavcodec/x86/simple_idct10_template.asm |  6 +-
 5 files changed, 130 insertions(+), 2 deletions(-)

Comments

Michael Niedermayer June 19, 2017, 8:32 p.m.
On Mon, Jun 19, 2017 at 05:11:03PM +0200, James Darnley wrote:
> Includes add/put functions
> 
> Rounding contributed by Ronald S. Bultje
> ---
>  libavcodec/tests/x86/dct.c                |  2 +
>  libavcodec/x86/idctdsp_init.c             | 23 ++++++++
>  libavcodec/x86/simple_idct.h              |  9 +++
>  libavcodec/x86/simple_idct10.asm          | 92 +++++++++++++++++++++++++++++++
>  libavcodec/x86/simple_idct10_template.asm |  6 +-
>  5 files changed, 130 insertions(+), 2 deletions(-)

this changes the output of:
./ffmpeg -an -i ~/tickets/4400/cartest_supers.mov -flags +bitexact out-ref.avi

ls -alF out-ref.avi out.avi
-rw-r----- 1 michael michael 761042 Jun 19 22:29 out.avi
-rw-r----- 1 michael michael 761044 Jun 19 22:29 out-ref.avi

file should be this: http://samples.ffmpeg.org/ffmpeg-bugs/trac/ticket4400/cartest_supers.mov
or here:
http://electronrotoscope.com/ffmpeg/cartest_supers.mov

[...]
Ronald S. Bultje June 20, 2017, 1:03 p.m.
Hi,

On Mon, Jun 19, 2017 at 4:32 PM, Michael Niedermayer <michael@niedermayer.cc
> wrote:

> On Mon, Jun 19, 2017 at 05:11:03PM +0200, James Darnley wrote:
> > Includes add/put functions
> >
> > Rounding contributed by Ronald S. Bultje
> > ---
> >  libavcodec/tests/x86/dct.c                |  2 +
> >  libavcodec/x86/idctdsp_init.c             | 23 ++++++++
> >  libavcodec/x86/simple_idct.h              |  9 +++
> >  libavcodec/x86/simple_idct10.asm          | 92
> +++++++++++++++++++++++++++++++
> >  libavcodec/x86/simple_idct10_template.asm |  6 +-
> >  5 files changed, 130 insertions(+), 2 deletions(-)
>
> this changes the output of:
> ./ffmpeg -an -i ~/tickets/4400/cartest_supers.mov -flags +bitexact
> out-ref.avi
>
> ls -alF out-ref.avi out.avi
> -rw-r----- 1 michael michael 761042 Jun 19 22:29 out.avi
> -rw-r----- 1 michael michael 761044 Jun 19 22:29 out-ref.avi


This is because you're comparing the non-bitexact mmx IDCT (which is
enabled even if the bitexact flag is set) with the bitexact sse2 IDCT.

Compare (without this patch) the C IDCT ("simple"):

./ffmpeg -an -i ~/Downloads/cartest_supers.mov -idct simple -flags
+bitexact /tmp/out-ref-simple.avi
-rw-r--r--  1 ronaldbultje  wheel  831994 Jun 20 08:56
/tmp/out-ref-simple.avi

with the MMX IDCT ("simplemmx", which is selected by "auto" and enabled by
default):

./ffmpeg -an -i ~/Downloads/cartest_supers.mov -flags +bitexact
/tmp/out-ref.avi
or
./ffmpeg -an -i ~/Downloads/cartest_supers.mov -idct simplemmx -flags
+bitexact /tmp/out-ref.avi
or
./ffmpeg -an -i ~/Downloads/cartest_supers.mov -idct simpleauto -flags
+bitexact /tmp/out-ref.avi
or
./ffmpeg -an -i ~/Downloads/cartest_supers.mov -idct auto -flags +bitexact
/tmp/out-ref.avi
-rw-r--r--  1 ronaldbultje  wheel  831998 Jun 20 08:54 /tmp/out-ref.avi

After this patch, all of these (simplemmx, simpleauto, simple, auto) will
refer to SSE2 instead of MMX, thus making their results identical to the C
version again.

Ronald
James Darnley June 20, 2017, 4:46 p.m.
On 2017-06-19 17:11, James Darnley wrote:
> diff --git a/libavcodec/x86/simple_idct10_template.asm b/libavcodec/x86/simple_idct10_template.asm
> index 51baf84c82..02fd445ec0 100644
> --- a/libavcodec/x86/simple_idct10_template.asm
> +++ b/libavcodec/x86/simple_idct10_template.asm
> @@ -258,6 +258,10 @@
>  
>      IDCT_1D     %1, %2, %8
>  %elif %2 == 11
> +    ; This copies the DC-only shortcut.  When there is only a DC coefficient the
> +    ; C shifts the value and splats it to all coeffs rather than multiplying and
> +    ; doing the full IDCT.  This causes a difference on 8-bit because the
> +    ; coefficient is 16383 rather than 16384 (which you can get with shifting).
>      por     m1, m8, m13
>      por     m1, m12
>      por     m1, [blockq+ 16]       ; { row[1] }[0-7]
> @@ -293,8 +297,6 @@
>      por  m9, m6
>      pand m10, m5
>      por  m10, m6
> -    pand m3, m5
> -    por  m3, m6
>  %else
>      IDCT_1D     %1, %2
>  %endif
> 

Now I see where these went.  I've moved these to the previous commit
which added the DC-only hack and as I said earlier I will push that one
soon.
Michael Niedermayer June 20, 2017, 11:13 p.m.
On Mon, Jun 19, 2017 at 05:11:03PM +0200, James Darnley wrote:
> Includes add/put functions
> 
> Rounding contributed by Ronald S. Bultje
> ---
>  libavcodec/tests/x86/dct.c                |  2 +
>  libavcodec/x86/idctdsp_init.c             | 23 ++++++++
>  libavcodec/x86/simple_idct.h              |  9 +++
>  libavcodec/x86/simple_idct10.asm          | 92 +++++++++++++++++++++++++++++++
>  libavcodec/x86/simple_idct10_template.asm |  6 +-
>  5 files changed, 130 insertions(+), 2 deletions(-)

breaks:

./ffmpeg -i matrixbench_mpeg2.mpg -an -vframes 1  -vf format=gbrp,spp=1:63,format=gbrp -qscale 1 file.avi



[...]
Ronald S. Bultje June 21, 2017, 12:24 a.m.
Hi,

On Tue, Jun 20, 2017 at 7:13 PM, Michael Niedermayer <michael@niedermayer.cc
> wrote:

> On Mon, Jun 19, 2017 at 05:11:03PM +0200, James Darnley wrote:
> > Includes add/put functions
> >
> > Rounding contributed by Ronald S. Bultje
> > ---
> >  libavcodec/tests/x86/dct.c                |  2 +
> >  libavcodec/x86/idctdsp_init.c             | 23 ++++++++
> >  libavcodec/x86/simple_idct.h              |  9 +++
> >  libavcodec/x86/simple_idct10.asm          | 92
> +++++++++++++++++++++++++++++++
> >  libavcodec/x86/simple_idct10_template.asm |  6 +-
> >  5 files changed, 130 insertions(+), 2 deletions(-)
>
> breaks:
>
> ./ffmpeg -i matrixbench_mpeg2.mpg -an -vframes 1  -vf
> format=gbrp,spp=1:63,format=gbrp -qscale 1 file.avi


Please define "breaks".

Ronald
Michael Niedermayer June 21, 2017, 12:33 a.m.
On Tue, Jun 20, 2017 at 08:24:55PM -0400, Ronald S. Bultje wrote:
> Hi,
> 
> On Tue, Jun 20, 2017 at 7:13 PM, Michael Niedermayer <michael@niedermayer.cc
> > wrote:
> 
> > On Mon, Jun 19, 2017 at 05:11:03PM +0200, James Darnley wrote:
> > > Includes add/put functions
> > >
> > > Rounding contributed by Ronald S. Bultje
> > > ---
> > >  libavcodec/tests/x86/dct.c                |  2 +
> > >  libavcodec/x86/idctdsp_init.c             | 23 ++++++++
> > >  libavcodec/x86/simple_idct.h              |  9 +++
> > >  libavcodec/x86/simple_idct10.asm          | 92
> > +++++++++++++++++++++++++++++++
> > >  libavcodec/x86/simple_idct10_template.asm |  6 +-
> > >  5 files changed, 130 insertions(+), 2 deletions(-)
> >
> > breaks:
> >
> > ./ffmpeg -i matrixbench_mpeg2.mpg -an -vframes 1  -vf
> > format=gbrp,spp=1:63,format=gbrp -qscale 1 file.avi
> 
> 
> Please define "breaks".

it looked like the blocks where somehow using the wrong scantable
that is recognizable but clearly not correct looking

[...]
Ronald S. Bultje June 24, 2017, 12:11 p.m.
Hi,

On Tue, Jun 20, 2017 at 7:13 PM, Michael Niedermayer <michael@niedermayer.cc
> wrote:

> On Mon, Jun 19, 2017 at 05:11:03PM +0200, James Darnley wrote:
> > Includes add/put functions
> >
> > Rounding contributed by Ronald S. Bultje
> > ---
> >  libavcodec/tests/x86/dct.c                |  2 +
> >  libavcodec/x86/idctdsp_init.c             | 23 ++++++++
> >  libavcodec/x86/simple_idct.h              |  9 +++
> >  libavcodec/x86/simple_idct10.asm          | 92
> +++++++++++++++++++++++++++++++
> >  libavcodec/x86/simple_idct10_template.asm |  6 +-
> >  5 files changed, 130 insertions(+), 2 deletions(-)
>
> breaks:
>
> ./ffmpeg -i matrixbench_mpeg2.mpg -an -vframes 1  -vf
> format=gbrp,spp=1:63,format=gbrp -qscale 1 file.avi


Fixed in 97f7f831691f2a2bddbd258bcbe332516d64a91b.

Ronald
Michael Niedermayer June 24, 2017, 7:27 p.m.
On Mon, Jun 19, 2017 at 05:11:03PM +0200, James Darnley wrote:
> Includes add/put functions
> 
> Rounding contributed by Ronald S. Bultje
> ---
>  libavcodec/tests/x86/dct.c                |  2 +
>  libavcodec/x86/idctdsp_init.c             | 23 ++++++++
>  libavcodec/x86/simple_idct.h              |  9 +++
>  libavcodec/x86/simple_idct10.asm          | 92 +++++++++++++++++++++++++++++++
>  libavcodec/x86/simple_idct10_template.asm |  6 +-
>  5 files changed, 130 insertions(+), 2 deletions(-)

Sorry for the delay, testing this took me a bit longer than expected
as many files change slightly and looking at differences manually
is manual work ...

This patch changes the default IDCT on x86(64), which is intended IIUC
It also changes the IDCT when simplemmx is set

but on x86-32 simplemmx does after this patch not produce the same
result as simplemmx on x86-64.

iam not sure but
maybe the changed code should enable on FF_IDCT_SIMPLE instead of
FF_IDCT_SIMPLEMMX ?
whats your oppinion on this ?
the next patch would add FF_IDCT_SIMPLE but it also leaves
FF_IDCT_SIMPLEMMX

[...]
Ronald S. Bultje June 24, 2017, 10:30 p.m.
Hi,

On Sat, Jun 24, 2017 at 3:27 PM, Michael Niedermayer <michael@niedermayer.cc
> wrote:

> On Mon, Jun 19, 2017 at 05:11:03PM +0200, James Darnley wrote:
> > Includes add/put functions
> >
> > Rounding contributed by Ronald S. Bultje
> > ---
> >  libavcodec/tests/x86/dct.c                |  2 +
> >  libavcodec/x86/idctdsp_init.c             | 23 ++++++++
> >  libavcodec/x86/simple_idct.h              |  9 +++
> >  libavcodec/x86/simple_idct10.asm          | 92
> +++++++++++++++++++++++++++++++
> >  libavcodec/x86/simple_idct10_template.asm |  6 +-
> >  5 files changed, 130 insertions(+), 2 deletions(-)
>
> Sorry for the delay, testing this took me a bit longer than expected
> as many files change slightly and looking at differences manually
> is manual work ...
>

Understood, and thanks for taking the time to do the testing.


> This patch changes the default IDCT on x86(64), which is intended IIUC
> It also changes the IDCT when simplemmx is set
>
> but on x86-32 simplemmx does after this patch not produce the same
> result as simplemmx on x86-64.
>
> iam not sure but
> maybe the changed code should enable on FF_IDCT_SIMPLE instead of
> FF_IDCT_SIMPLEMMX ?
> whats your oppinion on this ?
> the next patch would add FF_IDCT_SIMPLE but it also leaves
> FF_IDCT_SIMPLEMMX


 That's a good point, I also considered that question (not so much the
32bit vs. 64bit, but the mmx vs. sse2). The question is basically what
simplemmx means. Is it the exact binary result of the mmx function? Or is
it a way of saying "almost simple, but with some rounding diffs because
mmx"?

If the second, then simple is a superset of simplemmx. If the first, then
we should remove simplemmx from the list of "supported" idcts for the
sse2/avx functions. I have no preference (I assumed it meant the first),
but if you'd prefer to use the second meaning, then that's an easy
modification to make and it won't practically have any impact for most use
cases I think...

Ronald
Ronald S. Bultje June 24, 2017, 10:33 p.m.
Hi,

On Sat, Jun 24, 2017 at 6:30 PM, Ronald S. Bultje <rsbultje@gmail.com>
wrote:

> If the second, then simple is a superset of simplemmx. If the first, then
> we should remove simplemmx from the list of "supported" idcts for the
> sse2/avx functions. I have no preference (I assumed it meant the first)
>

Just to explain this a little more: I assumed the first, because the C
function is assigned if simplemmx is set but cpuflags is set to 0 (or we're
not on x86). However, that could also be considered a fallback so I
understand both points of view.

Ronald
Michael Niedermayer June 25, 2017, 7:27 p.m.
On Sat, Jun 24, 2017 at 06:30:26PM -0400, Ronald S. Bultje wrote:
> Hi,
> 
> On Sat, Jun 24, 2017 at 3:27 PM, Michael Niedermayer <michael@niedermayer.cc
> > wrote:
> 
> > On Mon, Jun 19, 2017 at 05:11:03PM +0200, James Darnley wrote:
> > > Includes add/put functions
> > >
> > > Rounding contributed by Ronald S. Bultje
> > > ---
> > >  libavcodec/tests/x86/dct.c                |  2 +
> > >  libavcodec/x86/idctdsp_init.c             | 23 ++++++++
> > >  libavcodec/x86/simple_idct.h              |  9 +++
> > >  libavcodec/x86/simple_idct10.asm          | 92
> > +++++++++++++++++++++++++++++++
> > >  libavcodec/x86/simple_idct10_template.asm |  6 +-
> > >  5 files changed, 130 insertions(+), 2 deletions(-)
> >
> > Sorry for the delay, testing this took me a bit longer than expected
> > as many files change slightly and looking at differences manually
> > is manual work ...
> >
> 
> Understood, and thanks for taking the time to do the testing.
> 
> 
> > This patch changes the default IDCT on x86(64), which is intended IIUC
> > It also changes the IDCT when simplemmx is set
> >
> > but on x86-32 simplemmx does after this patch not produce the same
> > result as simplemmx on x86-64.
> >
> > iam not sure but
> > maybe the changed code should enable on FF_IDCT_SIMPLE instead of
> > FF_IDCT_SIMPLEMMX ?
> > whats your oppinion on this ?
> > the next patch would add FF_IDCT_SIMPLE but it also leaves
> > FF_IDCT_SIMPLEMMX
> 
> 
>  That's a good point, I also considered that question (not so much the
> 32bit vs. 64bit, but the mmx vs. sse2). The question is basically what
> simplemmx means. Is it the exact binary result of the mmx function? Or is
> it a way of saying "almost simple, but with some rounding diffs because
> mmx"?
> 
> If the second, then simple is a superset of simplemmx. If the first, then
> we should remove simplemmx from the list of "supported" idcts for the
> sse2/avx functions. I have no preference (I assumed it meant the first),
> but if you'd prefer to use the second meaning, then that's an easy
> modification to make and it won't practically have any impact for most use
> cases I think...

I didnt think about meaning, rather more about practice.
if someone reports any issue using "simplemmx" and bitexact and
that fails to be reproduced it could be confusing.
This is especially plausible when the bug is not idct rounding but
a bug in a later stage just triggered by specific output from the idct

also potential future fate tests of simplemmx or other simd idcts
require there to be a way to select a specific idct output

no strong oppinion on this ...

[...]
--
Michael     GnuPG fingerprint: 9FF2128B147EF6730BADF133611EC787040B0FAB

If you fake or manipulate statistics in a paper in physics you will never
get a job again.
If you fake or manipulate statistics in a paper in medicin you will get
a job for life at the pharma industry.
James Darnley June 26, 2017, 12:20 p.m.
On 2017-06-25 21:27, Michael Niedermayer wrote:
> On Sat, Jun 24, 2017 at 06:30:26PM -0400, Ronald S. Bultje wrote:
>> Hi,
>>
>> On Sat, Jun 24, 2017 at 3:27 PM, Michael Niedermayer <michael@niedermayer.cc wrote:
>>
>>> This patch changes the default IDCT on x86(64), which is intended IIUC
>>> It also changes the IDCT when simplemmx is set
>>>
>>> but on x86-32 simplemmx does after this patch not produce the same
>>> result as simplemmx on x86-64.
>>>
>>> iam not sure but
>>> maybe the changed code should enable on FF_IDCT_SIMPLE instead of
>>> FF_IDCT_SIMPLEMMX ?
>>> whats your oppinion on this ?
>>> the next patch would add FF_IDCT_SIMPLE but it also leaves
>>> FF_IDCT_SIMPLEMMX
>>
>>
>>  That's a good point, I also considered that question (not so much the
>> 32bit vs. 64bit, but the mmx vs. sse2). The question is basically what
>> simplemmx means. Is it the exact binary result of the mmx function? Or is
>> it a way of saying "almost simple, but with some rounding diffs because
>> mmx"?
>>
>> If the second, then simple is a superset of simplemmx. If the first, then
>> we should remove simplemmx from the list of "supported" idcts for the
>> sse2/avx functions. I have no preference (I assumed it meant the first),
>> but if you'd prefer to use the second meaning, then that's an easy
>> modification to make and it won't practically have any impact for most use
>> cases I think...
> 
> I didnt think about meaning, rather more about practice.
> if someone reports any issue using "simplemmx" and bitexact and
> that fails to be reproduced it could be confusing.
> This is especially plausible when the bug is not idct rounding but
> a bug in a later stage just triggered by specific output from the idct
> 
> also potential future fate tests of simplemmx or other simd idcts
> require there to be a way to select a specific idct output
> 
> no strong oppinion on this ...

I admit I haven't considered whether I should be using this with
simplemmx.  I could change the code so that the new code isn't used for it.

If simplemmx is supposed to be its own algorithm available to anyone who
might wish to use it then I think that an error should occur when MMX is
not available.

Since the current behaviour is to have simple as the catch-all fallback
I will leave the code as is.  auto, simpleauto, simplemmx, and simple
will now all use the new code.

We can discuss these points all you want but I intend to push the
remaining 3 patches Soon(TM).  I have still not tried Gramner's
suggestion so you have some time to object and block.
Ronald S. Bultje June 26, 2017, 1:03 p.m.
Hi,

On Sun, Jun 25, 2017 at 3:27 PM, Michael Niedermayer <michael@niedermayer.cc
> wrote:

> On Sat, Jun 24, 2017 at 06:30:26PM -0400, Ronald S. Bultje wrote:
> > Hi,
> >
> > On Sat, Jun 24, 2017 at 3:27 PM, Michael Niedermayer
> <michael@niedermayer.cc
> > > wrote:
> >
> > > On Mon, Jun 19, 2017 at 05:11:03PM +0200, James Darnley wrote:
> > > > Includes add/put functions
> > > >
> > > > Rounding contributed by Ronald S. Bultje
> > > > ---
> > > >  libavcodec/tests/x86/dct.c                |  2 +
> > > >  libavcodec/x86/idctdsp_init.c             | 23 ++++++++
> > > >  libavcodec/x86/simple_idct.h              |  9 +++
> > > >  libavcodec/x86/simple_idct10.asm          | 92
> > > +++++++++++++++++++++++++++++++
> > > >  libavcodec/x86/simple_idct10_template.asm |  6 +-
> > > >  5 files changed, 130 insertions(+), 2 deletions(-)
> > >
> > > Sorry for the delay, testing this took me a bit longer than expected
> > > as many files change slightly and looking at differences manually
> > > is manual work ...
> > >
> >
> > Understood, and thanks for taking the time to do the testing.
> >
> >
> > > This patch changes the default IDCT on x86(64), which is intended IIUC
> > > It also changes the IDCT when simplemmx is set
> > >
> > > but on x86-32 simplemmx does after this patch not produce the same
> > > result as simplemmx on x86-64.
> > >
> > > iam not sure but
> > > maybe the changed code should enable on FF_IDCT_SIMPLE instead of
> > > FF_IDCT_SIMPLEMMX ?
> > > whats your oppinion on this ?
> > > the next patch would add FF_IDCT_SIMPLE but it also leaves
> > > FF_IDCT_SIMPLEMMX
> >
> >
> >  That's a good point, I also considered that question (not so much the
> > 32bit vs. 64bit, but the mmx vs. sse2). The question is basically what
> > simplemmx means. Is it the exact binary result of the mmx function? Or is
> > it a way of saying "almost simple, but with some rounding diffs because
> > mmx"?
> >
> > If the second, then simple is a superset of simplemmx. If the first, then
> > we should remove simplemmx from the list of "supported" idcts for the
> > sse2/avx functions. I have no preference (I assumed it meant the first),
> > but if you'd prefer to use the second meaning, then that's an easy
> > modification to make and it won't practically have any impact for most
> use
> > cases I think...
>
> I didnt think about meaning, rather more about practice.
> if someone reports any issue using "simplemmx" and bitexact and
> that fails to be reproduced it could be confusing.
> This is especially plausible when the bug is not idct rounding but
> a bug in a later stage just triggered by specific output from the idct
>

Agreed. I'll leave it to James to decide on a final approach since it's his
patch. :-). It sounds like we're both fine with either approach.


> also potential future fate tests of simplemmx or other simd idcts
> require there to be a way to select a specific idct output


This is true, but don't forget that -cpuflags can also be used to cycle
between the various -idct flavours. (This requires some trial and error,
but it's not that many cpuflag-combinations.)

Ronald
Michael Niedermayer June 27, 2017, 3:34 p.m.
On Mon, Jun 26, 2017 at 02:20:03PM +0200, James Darnley wrote:
> On 2017-06-25 21:27, Michael Niedermayer wrote:
> > On Sat, Jun 24, 2017 at 06:30:26PM -0400, Ronald S. Bultje wrote:
> >> Hi,
> >>
> >> On Sat, Jun 24, 2017 at 3:27 PM, Michael Niedermayer <michael@niedermayer.cc wrote:
> >>
> >>> This patch changes the default IDCT on x86(64), which is intended IIUC
> >>> It also changes the IDCT when simplemmx is set
> >>>
> >>> but on x86-32 simplemmx does after this patch not produce the same
> >>> result as simplemmx on x86-64.
> >>>
> >>> iam not sure but
> >>> maybe the changed code should enable on FF_IDCT_SIMPLE instead of
> >>> FF_IDCT_SIMPLEMMX ?
> >>> whats your oppinion on this ?
> >>> the next patch would add FF_IDCT_SIMPLE but it also leaves
> >>> FF_IDCT_SIMPLEMMX
> >>
> >>
> >>  That's a good point, I also considered that question (not so much the
> >> 32bit vs. 64bit, but the mmx vs. sse2). The question is basically what
> >> simplemmx means. Is it the exact binary result of the mmx function? Or is
> >> it a way of saying "almost simple, but with some rounding diffs because
> >> mmx"?
> >>
> >> If the second, then simple is a superset of simplemmx. If the first, then
> >> we should remove simplemmx from the list of "supported" idcts for the
> >> sse2/avx functions. I have no preference (I assumed it meant the first),
> >> but if you'd prefer to use the second meaning, then that's an easy
> >> modification to make and it won't practically have any impact for most use
> >> cases I think...
> > 
> > I didnt think about meaning, rather more about practice.
> > if someone reports any issue using "simplemmx" and bitexact and
> > that fails to be reproduced it could be confusing.
> > This is especially plausible when the bug is not idct rounding but
> > a bug in a later stage just triggered by specific output from the idct
> > 
> > also potential future fate tests of simplemmx or other simd idcts
> > require there to be a way to select a specific idct output
> > 
> > no strong oppinion on this ...
> 
> I admit I haven't considered whether I should be using this with
> simplemmx.  I could change the code so that the new code isn't used for it.
> 

> If simplemmx is supposed to be its own algorithm available to anyone who
> might wish to use it then I think that an error should occur when MMX is
> not available.

yes, that would make sense


> 
> Since the current behaviour is to have simple as the catch-all fallback
> I will leave the code as is.  auto, simpleauto, simplemmx, and simple
> will now all use the new code.
> 
> We can discuss these points all you want but I intend to push the
> remaining 3 patches Soon(TM).  I have still not tried Gramner's
> suggestion so you have some time to object and block.
> 
> _______________________________________________
> ffmpeg-devel mailing list
> ffmpeg-devel@ffmpeg.org
> http://ffmpeg.org/mailman/listinfo/ffmpeg-devel

Patch hide | download patch | download mbox

diff --git a/libavcodec/tests/x86/dct.c b/libavcodec/tests/x86/dct.c
index 34f5b8767b..317d973f9f 100644
--- a/libavcodec/tests/x86/dct.c
+++ b/libavcodec/tests/x86/dct.c
@@ -88,10 +88,12 @@  static const struct algo idct_tab_arch[] = {
 #if HAVE_YASM
 #if ARCH_X86_64
 #if HAVE_SSE2_EXTERNAL
+    { "SIMPLE8-SSE2",   ff_simple_idct8_sse2,  FF_IDCT_PERM_TRANSPOSE, AV_CPU_FLAG_SSE2},
     { "SIMPLE10-SSE2",  ff_simple_idct10_sse2, FF_IDCT_PERM_TRANSPOSE, AV_CPU_FLAG_SSE2},
     { "SIMPLE12-SSE2",  ff_simple_idct12_sse2, FF_IDCT_PERM_TRANSPOSE, AV_CPU_FLAG_SSE2, 1 },
 #endif
 #if HAVE_AVX_EXTERNAL
+    { "SIMPLE8-AVX",    ff_simple_idct8_avx,   FF_IDCT_PERM_TRANSPOSE, AV_CPU_FLAG_AVX},
     { "SIMPLE10-AVX",   ff_simple_idct10_avx,  FF_IDCT_PERM_TRANSPOSE, AV_CPU_FLAG_AVX},
     { "SIMPLE12-AVX",   ff_simple_idct12_avx,  FF_IDCT_PERM_TRANSPOSE, AV_CPU_FLAG_AVX,  1 },
 #endif
diff --git a/libavcodec/x86/idctdsp_init.c b/libavcodec/x86/idctdsp_init.c
index f1c915aa00..9da60d1a1e 100644
--- a/libavcodec/x86/idctdsp_init.c
+++ b/libavcodec/x86/idctdsp_init.c
@@ -94,9 +94,32 @@  av_cold void ff_idctdsp_init_x86(IDCTDSPContext *c, AVCodecContext *avctx,
                 c->idct_add  = ff_simple_idct_add_sse2;
                 c->perm_type = FF_IDCT_PERM_SIMPLE;
         }
+
+        if (ARCH_X86_64 &&
+            !high_bit_depth &&
+            avctx->lowres == 0 &&
+            (avctx->idct_algo == FF_IDCT_AUTO ||
+                avctx->idct_algo == FF_IDCT_SIMPLEAUTO ||
+                avctx->idct_algo == FF_IDCT_SIMPLEMMX)) {
+                c->idct      = ff_simple_idct8_sse2;
+                c->idct_put  = ff_simple_idct8_put_sse2;
+                c->idct_add  = ff_simple_idct8_add_sse2;
+                c->perm_type = FF_IDCT_PERM_TRANSPOSE;
+        }
     }
 
     if (ARCH_X86_64 && avctx->lowres == 0) {
+        if (EXTERNAL_AVX(cpu_flags) &&
+            !high_bit_depth &&
+            (avctx->idct_algo == FF_IDCT_AUTO ||
+                avctx->idct_algo == FF_IDCT_SIMPLEAUTO ||
+                avctx->idct_algo == FF_IDCT_SIMPLEMMX)) {
+                c->idct      = ff_simple_idct8_avx;
+                c->idct_put  = ff_simple_idct8_put_avx;
+                c->idct_add  = ff_simple_idct8_add_avx;
+                c->perm_type = FF_IDCT_PERM_TRANSPOSE;
+        }
+
         if (avctx->bits_per_raw_sample == 10 &&
             (avctx->idct_algo == FF_IDCT_AUTO ||
              avctx->idct_algo == FF_IDCT_SIMPLEAUTO ||
diff --git a/libavcodec/x86/simple_idct.h b/libavcodec/x86/simple_idct.h
index d17ef6a462..9b64cfe9bc 100644
--- a/libavcodec/x86/simple_idct.h
+++ b/libavcodec/x86/simple_idct.h
@@ -29,6 +29,15 @@  void ff_simple_idct_put_mmx(uint8_t *dest, ptrdiff_t line_size, int16_t *block);
 void ff_simple_idct_add_sse2(uint8_t *dest, ptrdiff_t line_size, int16_t *block);
 void ff_simple_idct_put_sse2(uint8_t *dest, ptrdiff_t line_size, int16_t *block);
 
+void ff_simple_idct8_sse2(int16_t *block);
+void ff_simple_idct8_avx(int16_t *block);
+
+void ff_simple_idct8_put_sse2(uint8_t *dest, ptrdiff_t line_size, int16_t *block);
+void ff_simple_idct8_put_avx(uint8_t *dest, ptrdiff_t line_size, int16_t *block);
+
+void ff_simple_idct8_add_sse2(uint8_t *dest, ptrdiff_t line_size, int16_t *block);
+void ff_simple_idct8_add_avx(uint8_t *dest, ptrdiff_t line_size, int16_t *block);
+
 void ff_simple_idct10_sse2(int16_t *block);
 void ff_simple_idct10_avx(int16_t *block);
 
diff --git a/libavcodec/x86/simple_idct10.asm b/libavcodec/x86/simple_idct10.asm
index b492303a57..069bb61378 100644
--- a/libavcodec/x86/simple_idct10.asm
+++ b/libavcodec/x86/simple_idct10.asm
@@ -31,11 +31,14 @@  SECTION_RODATA
 
 cextern pw_2
 cextern pw_16
+cextern pw_32
 cextern pw_1023
 cextern pw_4095
+pd_round_11: times 4 dd 1<<(11-1)
 pd_round_12: times 4 dd 1<<(12-1)
 pd_round_15: times 4 dd 1<<(15-1)
 pd_round_19: times 4 dd 1<<(19-1)
+pd_round_20: times 4 dd 1<<(20-1)
 
 %macro CONST_DEC  3
 const %1
@@ -77,8 +80,97 @@  CONST_DEC  w3_min_w7_lo,    W3sh2_lo, -W7sh2
 
 SECTION .text
 
+%macro STORE_HI_LO 12
+    movq   %1, %9
+    movq   %3, %10
+    movq   %5, %11
+    movq   %7, %12
+    movhps %2, %9
+    movhps %4, %10
+    movhps %6, %11
+    movhps %8, %12
+%endmacro
+
+%macro LOAD_ZXBW_8 16
+    pmovzxbw %1, %9
+    pmovzxbw %2, %10
+    pmovzxbw %3, %11
+    pmovzxbw %4, %12
+    pmovzxbw %5, %13
+    pmovzxbw %6, %14
+    pmovzxbw %7, %15
+    pmovzxbw %8, %16
+%endmacro
+
+%macro LOAD_ZXBW_4 9
+    movh %1, %5
+    movh %2, %6
+    movh %3, %7
+    movh %4, %8
+    punpcklbw %1, %9
+    punpcklbw %2, %9
+    punpcklbw %3, %9
+    punpcklbw %4, %9
+%endmacro
+
+%define PASS4ROWS(base, stride, stride3) \
+    [base], [base + stride], [base + 2*stride], [base + stride3]
+
 %macro idct_fn 0
 
+define_constants _lo
+
+cglobal simple_idct8, 1, 1, 16, 32, block
+    IDCT_FN    "", 11, pw_32, 20, "store"
+RET
+
+cglobal simple_idct8_put, 3, 4, 16, 32, pixels, lsize, block
+    IDCT_FN    "", 11, pw_32, 20
+    lea       r3, [3*lsizeq]
+    lea       r2, [pixelsq + r3]
+    packuswb  m8, m0
+    packuswb  m1, m2
+    packuswb  m4, m11
+    packuswb  m9, m10
+    STORE_HI_LO PASS8ROWS(pixelsq, r2, lsizeq, r3), m8, m1, m4, m9
+RET
+
+cglobal simple_idct8_add, 3, 4, 16, 32, pixels, lsize, block
+    IDCT_FN    "", 11, pw_32, 20
+    lea r2, [3*lsizeq]
+    %if cpuflag(sse4)
+        lea r3, [pixelsq + r2]
+        LOAD_ZXBW_8 m3, m5, m6, m7, m12, m13, m14, m15, PASS8ROWS(pixelsq, r3, lsizeq, r2)
+        paddsw m8, m3
+        paddsw m0, m5
+        paddsw m1, m6
+        paddsw m2, m7
+        paddsw m4, m12
+        paddsw m11, m13
+        paddsw m9, m14
+        paddsw m10, m15
+    %else
+        pxor m12, m12
+        LOAD_ZXBW_4 m3, m5, m6, m7, PASS4ROWS(pixelsq, lsizeq, r2), m12
+        paddsw m8, m3
+        paddsw m0, m5
+        paddsw m1, m6
+        paddsw m2, m7
+        lea r3, [pixelsq + 4*lsizeq]
+        LOAD_ZXBW_4 m3, m5, m6, m7, PASS4ROWS(r3, lsizeq, r2), m12
+        paddsw m4, m3
+        paddsw m11, m5
+        paddsw m9, m6
+        paddsw m10, m7
+        lea r3, [pixelsq + r2]
+    %endif
+    packuswb  m8, m0
+    packuswb  m1, m2
+    packuswb  m4, m11
+    packuswb  m9, m10
+    STORE_HI_LO PASS8ROWS(pixelsq, r3, lsizeq, r2), m8, m1, m4, m9
+RET
+
 define_constants _hi
 
 cglobal simple_idct10, 1, 1, 16, block
diff --git a/libavcodec/x86/simple_idct10_template.asm b/libavcodec/x86/simple_idct10_template.asm
index 51baf84c82..02fd445ec0 100644
--- a/libavcodec/x86/simple_idct10_template.asm
+++ b/libavcodec/x86/simple_idct10_template.asm
@@ -258,6 +258,10 @@ 
 
     IDCT_1D     %1, %2, %8
 %elif %2 == 11
+    ; This copies the DC-only shortcut.  When there is only a DC coefficient the
+    ; C shifts the value and splats it to all coeffs rather than multiplying and
+    ; doing the full IDCT.  This causes a difference on 8-bit because the
+    ; coefficient is 16383 rather than 16384 (which you can get with shifting).
     por     m1, m8, m13
     por     m1, m12
     por     m1, [blockq+ 16]       ; { row[1] }[0-7]
@@ -293,8 +297,6 @@ 
     por  m9, m6
     pand m10, m5
     por  m10, m6
-    pand m3, m5
-    por  m3, m6
 %else
     IDCT_1D     %1, %2
 %endif