Message ID | 20170619151104.31273-11-jdarnley@obe.tv |
---|---|
State | New |
Headers | show |
On Mon, Jun 19, 2017 at 05:11:03PM +0200, James Darnley wrote: > Includes add/put functions > > Rounding contributed by Ronald S. Bultje > --- > libavcodec/tests/x86/dct.c | 2 + > libavcodec/x86/idctdsp_init.c | 23 ++++++++ > libavcodec/x86/simple_idct.h | 9 +++ > libavcodec/x86/simple_idct10.asm | 92 +++++++++++++++++++++++++++++++ > libavcodec/x86/simple_idct10_template.asm | 6 +- > 5 files changed, 130 insertions(+), 2 deletions(-) this changes the output of: ./ffmpeg -an -i ~/tickets/4400/cartest_supers.mov -flags +bitexact out-ref.avi ls -alF out-ref.avi out.avi -rw-r----- 1 michael michael 761042 Jun 19 22:29 out.avi -rw-r----- 1 michael michael 761044 Jun 19 22:29 out-ref.avi file should be this: http://samples.ffmpeg.org/ffmpeg-bugs/trac/ticket4400/cartest_supers.mov or here: http://electronrotoscope.com/ffmpeg/cartest_supers.mov [...]
Hi, On Mon, Jun 19, 2017 at 4:32 PM, Michael Niedermayer <michael@niedermayer.cc > wrote: > On Mon, Jun 19, 2017 at 05:11:03PM +0200, James Darnley wrote: > > Includes add/put functions > > > > Rounding contributed by Ronald S. Bultje > > --- > > libavcodec/tests/x86/dct.c | 2 + > > libavcodec/x86/idctdsp_init.c | 23 ++++++++ > > libavcodec/x86/simple_idct.h | 9 +++ > > libavcodec/x86/simple_idct10.asm | 92 > +++++++++++++++++++++++++++++++ > > libavcodec/x86/simple_idct10_template.asm | 6 +- > > 5 files changed, 130 insertions(+), 2 deletions(-) > > this changes the output of: > ./ffmpeg -an -i ~/tickets/4400/cartest_supers.mov -flags +bitexact > out-ref.avi > > ls -alF out-ref.avi out.avi > -rw-r----- 1 michael michael 761042 Jun 19 22:29 out.avi > -rw-r----- 1 michael michael 761044 Jun 19 22:29 out-ref.avi This is because you're comparing the non-bitexact mmx IDCT (which is enabled even if the bitexact flag is set) with the bitexact sse2 IDCT. Compare (without this patch) the C IDCT ("simple"): ./ffmpeg -an -i ~/Downloads/cartest_supers.mov -idct simple -flags +bitexact /tmp/out-ref-simple.avi -rw-r--r-- 1 ronaldbultje wheel 831994 Jun 20 08:56 /tmp/out-ref-simple.avi with the MMX IDCT ("simplemmx", which is selected by "auto" and enabled by default): ./ffmpeg -an -i ~/Downloads/cartest_supers.mov -flags +bitexact /tmp/out-ref.avi or ./ffmpeg -an -i ~/Downloads/cartest_supers.mov -idct simplemmx -flags +bitexact /tmp/out-ref.avi or ./ffmpeg -an -i ~/Downloads/cartest_supers.mov -idct simpleauto -flags +bitexact /tmp/out-ref.avi or ./ffmpeg -an -i ~/Downloads/cartest_supers.mov -idct auto -flags +bitexact /tmp/out-ref.avi -rw-r--r-- 1 ronaldbultje wheel 831998 Jun 20 08:54 /tmp/out-ref.avi After this patch, all of these (simplemmx, simpleauto, simple, auto) will refer to SSE2 instead of MMX, thus making their results identical to the C version again. Ronald
On 2017-06-19 17:11, James Darnley wrote: > diff --git a/libavcodec/x86/simple_idct10_template.asm b/libavcodec/x86/simple_idct10_template.asm > index 51baf84c82..02fd445ec0 100644 > --- a/libavcodec/x86/simple_idct10_template.asm > +++ b/libavcodec/x86/simple_idct10_template.asm > @@ -258,6 +258,10 @@ > > IDCT_1D %1, %2, %8 > %elif %2 == 11 > + ; This copies the DC-only shortcut. When there is only a DC coefficient the > + ; C shifts the value and splats it to all coeffs rather than multiplying and > + ; doing the full IDCT. This causes a difference on 8-bit because the > + ; coefficient is 16383 rather than 16384 (which you can get with shifting). > por m1, m8, m13 > por m1, m12 > por m1, [blockq+ 16] ; { row[1] }[0-7] > @@ -293,8 +297,6 @@ > por m9, m6 > pand m10, m5 > por m10, m6 > - pand m3, m5 > - por m3, m6 > %else > IDCT_1D %1, %2 > %endif > Now I see where these went. I've moved these to the previous commit which added the DC-only hack and as I said earlier I will push that one soon.
On Mon, Jun 19, 2017 at 05:11:03PM +0200, James Darnley wrote: > Includes add/put functions > > Rounding contributed by Ronald S. Bultje > --- > libavcodec/tests/x86/dct.c | 2 + > libavcodec/x86/idctdsp_init.c | 23 ++++++++ > libavcodec/x86/simple_idct.h | 9 +++ > libavcodec/x86/simple_idct10.asm | 92 +++++++++++++++++++++++++++++++ > libavcodec/x86/simple_idct10_template.asm | 6 +- > 5 files changed, 130 insertions(+), 2 deletions(-) breaks: ./ffmpeg -i matrixbench_mpeg2.mpg -an -vframes 1 -vf format=gbrp,spp=1:63,format=gbrp -qscale 1 file.avi [...]
Hi, On Tue, Jun 20, 2017 at 7:13 PM, Michael Niedermayer <michael@niedermayer.cc > wrote: > On Mon, Jun 19, 2017 at 05:11:03PM +0200, James Darnley wrote: > > Includes add/put functions > > > > Rounding contributed by Ronald S. Bultje > > --- > > libavcodec/tests/x86/dct.c | 2 + > > libavcodec/x86/idctdsp_init.c | 23 ++++++++ > > libavcodec/x86/simple_idct.h | 9 +++ > > libavcodec/x86/simple_idct10.asm | 92 > +++++++++++++++++++++++++++++++ > > libavcodec/x86/simple_idct10_template.asm | 6 +- > > 5 files changed, 130 insertions(+), 2 deletions(-) > > breaks: > > ./ffmpeg -i matrixbench_mpeg2.mpg -an -vframes 1 -vf > format=gbrp,spp=1:63,format=gbrp -qscale 1 file.avi Please define "breaks". Ronald
On Tue, Jun 20, 2017 at 08:24:55PM -0400, Ronald S. Bultje wrote: > Hi, > > On Tue, Jun 20, 2017 at 7:13 PM, Michael Niedermayer <michael@niedermayer.cc > > wrote: > > > On Mon, Jun 19, 2017 at 05:11:03PM +0200, James Darnley wrote: > > > Includes add/put functions > > > > > > Rounding contributed by Ronald S. Bultje > > > --- > > > libavcodec/tests/x86/dct.c | 2 + > > > libavcodec/x86/idctdsp_init.c | 23 ++++++++ > > > libavcodec/x86/simple_idct.h | 9 +++ > > > libavcodec/x86/simple_idct10.asm | 92 > > +++++++++++++++++++++++++++++++ > > > libavcodec/x86/simple_idct10_template.asm | 6 +- > > > 5 files changed, 130 insertions(+), 2 deletions(-) > > > > breaks: > > > > ./ffmpeg -i matrixbench_mpeg2.mpg -an -vframes 1 -vf > > format=gbrp,spp=1:63,format=gbrp -qscale 1 file.avi > > > Please define "breaks". it looked like the blocks where somehow using the wrong scantable that is recognizable but clearly not correct looking [...]
Hi, On Tue, Jun 20, 2017 at 7:13 PM, Michael Niedermayer <michael@niedermayer.cc > wrote: > On Mon, Jun 19, 2017 at 05:11:03PM +0200, James Darnley wrote: > > Includes add/put functions > > > > Rounding contributed by Ronald S. Bultje > > --- > > libavcodec/tests/x86/dct.c | 2 + > > libavcodec/x86/idctdsp_init.c | 23 ++++++++ > > libavcodec/x86/simple_idct.h | 9 +++ > > libavcodec/x86/simple_idct10.asm | 92 > +++++++++++++++++++++++++++++++ > > libavcodec/x86/simple_idct10_template.asm | 6 +- > > 5 files changed, 130 insertions(+), 2 deletions(-) > > breaks: > > ./ffmpeg -i matrixbench_mpeg2.mpg -an -vframes 1 -vf > format=gbrp,spp=1:63,format=gbrp -qscale 1 file.avi Fixed in 97f7f831691f2a2bddbd258bcbe332516d64a91b. Ronald
On Mon, Jun 19, 2017 at 05:11:03PM +0200, James Darnley wrote: > Includes add/put functions > > Rounding contributed by Ronald S. Bultje > --- > libavcodec/tests/x86/dct.c | 2 + > libavcodec/x86/idctdsp_init.c | 23 ++++++++ > libavcodec/x86/simple_idct.h | 9 +++ > libavcodec/x86/simple_idct10.asm | 92 +++++++++++++++++++++++++++++++ > libavcodec/x86/simple_idct10_template.asm | 6 +- > 5 files changed, 130 insertions(+), 2 deletions(-) Sorry for the delay, testing this took me a bit longer than expected as many files change slightly and looking at differences manually is manual work ... This patch changes the default IDCT on x86(64), which is intended IIUC It also changes the IDCT when simplemmx is set but on x86-32 simplemmx does after this patch not produce the same result as simplemmx on x86-64. iam not sure but maybe the changed code should enable on FF_IDCT_SIMPLE instead of FF_IDCT_SIMPLEMMX ? whats your oppinion on this ? the next patch would add FF_IDCT_SIMPLE but it also leaves FF_IDCT_SIMPLEMMX [...]
Hi, On Sat, Jun 24, 2017 at 3:27 PM, Michael Niedermayer <michael@niedermayer.cc > wrote: > On Mon, Jun 19, 2017 at 05:11:03PM +0200, James Darnley wrote: > > Includes add/put functions > > > > Rounding contributed by Ronald S. Bultje > > --- > > libavcodec/tests/x86/dct.c | 2 + > > libavcodec/x86/idctdsp_init.c | 23 ++++++++ > > libavcodec/x86/simple_idct.h | 9 +++ > > libavcodec/x86/simple_idct10.asm | 92 > +++++++++++++++++++++++++++++++ > > libavcodec/x86/simple_idct10_template.asm | 6 +- > > 5 files changed, 130 insertions(+), 2 deletions(-) > > Sorry for the delay, testing this took me a bit longer than expected > as many files change slightly and looking at differences manually > is manual work ... > Understood, and thanks for taking the time to do the testing. > This patch changes the default IDCT on x86(64), which is intended IIUC > It also changes the IDCT when simplemmx is set > > but on x86-32 simplemmx does after this patch not produce the same > result as simplemmx on x86-64. > > iam not sure but > maybe the changed code should enable on FF_IDCT_SIMPLE instead of > FF_IDCT_SIMPLEMMX ? > whats your oppinion on this ? > the next patch would add FF_IDCT_SIMPLE but it also leaves > FF_IDCT_SIMPLEMMX That's a good point, I also considered that question (not so much the 32bit vs. 64bit, but the mmx vs. sse2). The question is basically what simplemmx means. Is it the exact binary result of the mmx function? Or is it a way of saying "almost simple, but with some rounding diffs because mmx"? If the second, then simple is a superset of simplemmx. If the first, then we should remove simplemmx from the list of "supported" idcts for the sse2/avx functions. I have no preference (I assumed it meant the first), but if you'd prefer to use the second meaning, then that's an easy modification to make and it won't practically have any impact for most use cases I think... Ronald
Hi, On Sat, Jun 24, 2017 at 6:30 PM, Ronald S. Bultje <rsbultje@gmail.com> wrote: > If the second, then simple is a superset of simplemmx. If the first, then > we should remove simplemmx from the list of "supported" idcts for the > sse2/avx functions. I have no preference (I assumed it meant the first) > Just to explain this a little more: I assumed the first, because the C function is assigned if simplemmx is set but cpuflags is set to 0 (or we're not on x86). However, that could also be considered a fallback so I understand both points of view. Ronald
On Sat, Jun 24, 2017 at 06:30:26PM -0400, Ronald S. Bultje wrote: > Hi, > > On Sat, Jun 24, 2017 at 3:27 PM, Michael Niedermayer <michael@niedermayer.cc > > wrote: > > > On Mon, Jun 19, 2017 at 05:11:03PM +0200, James Darnley wrote: > > > Includes add/put functions > > > > > > Rounding contributed by Ronald S. Bultje > > > --- > > > libavcodec/tests/x86/dct.c | 2 + > > > libavcodec/x86/idctdsp_init.c | 23 ++++++++ > > > libavcodec/x86/simple_idct.h | 9 +++ > > > libavcodec/x86/simple_idct10.asm | 92 > > +++++++++++++++++++++++++++++++ > > > libavcodec/x86/simple_idct10_template.asm | 6 +- > > > 5 files changed, 130 insertions(+), 2 deletions(-) > > > > Sorry for the delay, testing this took me a bit longer than expected > > as many files change slightly and looking at differences manually > > is manual work ... > > > > Understood, and thanks for taking the time to do the testing. > > > > This patch changes the default IDCT on x86(64), which is intended IIUC > > It also changes the IDCT when simplemmx is set > > > > but on x86-32 simplemmx does after this patch not produce the same > > result as simplemmx on x86-64. > > > > iam not sure but > > maybe the changed code should enable on FF_IDCT_SIMPLE instead of > > FF_IDCT_SIMPLEMMX ? > > whats your oppinion on this ? > > the next patch would add FF_IDCT_SIMPLE but it also leaves > > FF_IDCT_SIMPLEMMX > > > That's a good point, I also considered that question (not so much the > 32bit vs. 64bit, but the mmx vs. sse2). The question is basically what > simplemmx means. Is it the exact binary result of the mmx function? Or is > it a way of saying "almost simple, but with some rounding diffs because > mmx"? > > If the second, then simple is a superset of simplemmx. If the first, then > we should remove simplemmx from the list of "supported" idcts for the > sse2/avx functions. I have no preference (I assumed it meant the first), > but if you'd prefer to use the second meaning, then that's an easy > modification to make and it won't practically have any impact for most use > cases I think... I didnt think about meaning, rather more about practice. if someone reports any issue using "simplemmx" and bitexact and that fails to be reproduced it could be confusing. This is especially plausible when the bug is not idct rounding but a bug in a later stage just triggered by specific output from the idct also potential future fate tests of simplemmx or other simd idcts require there to be a way to select a specific idct output no strong oppinion on this ... [...] -- Michael GnuPG fingerprint: 9FF2128B147EF6730BADF133611EC787040B0FAB If you fake or manipulate statistics in a paper in physics you will never get a job again. If you fake or manipulate statistics in a paper in medicin you will get a job for life at the pharma industry.
On 2017-06-25 21:27, Michael Niedermayer wrote: > On Sat, Jun 24, 2017 at 06:30:26PM -0400, Ronald S. Bultje wrote: >> Hi, >> >> On Sat, Jun 24, 2017 at 3:27 PM, Michael Niedermayer <michael@niedermayer.cc wrote: >> >>> This patch changes the default IDCT on x86(64), which is intended IIUC >>> It also changes the IDCT when simplemmx is set >>> >>> but on x86-32 simplemmx does after this patch not produce the same >>> result as simplemmx on x86-64. >>> >>> iam not sure but >>> maybe the changed code should enable on FF_IDCT_SIMPLE instead of >>> FF_IDCT_SIMPLEMMX ? >>> whats your oppinion on this ? >>> the next patch would add FF_IDCT_SIMPLE but it also leaves >>> FF_IDCT_SIMPLEMMX >> >> >> That's a good point, I also considered that question (not so much the >> 32bit vs. 64bit, but the mmx vs. sse2). The question is basically what >> simplemmx means. Is it the exact binary result of the mmx function? Or is >> it a way of saying "almost simple, but with some rounding diffs because >> mmx"? >> >> If the second, then simple is a superset of simplemmx. If the first, then >> we should remove simplemmx from the list of "supported" idcts for the >> sse2/avx functions. I have no preference (I assumed it meant the first), >> but if you'd prefer to use the second meaning, then that's an easy >> modification to make and it won't practically have any impact for most use >> cases I think... > > I didnt think about meaning, rather more about practice. > if someone reports any issue using "simplemmx" and bitexact and > that fails to be reproduced it could be confusing. > This is especially plausible when the bug is not idct rounding but > a bug in a later stage just triggered by specific output from the idct > > also potential future fate tests of simplemmx or other simd idcts > require there to be a way to select a specific idct output > > no strong oppinion on this ... I admit I haven't considered whether I should be using this with simplemmx. I could change the code so that the new code isn't used for it. If simplemmx is supposed to be its own algorithm available to anyone who might wish to use it then I think that an error should occur when MMX is not available. Since the current behaviour is to have simple as the catch-all fallback I will leave the code as is. auto, simpleauto, simplemmx, and simple will now all use the new code. We can discuss these points all you want but I intend to push the remaining 3 patches Soon(TM). I have still not tried Gramner's suggestion so you have some time to object and block.
Hi, On Sun, Jun 25, 2017 at 3:27 PM, Michael Niedermayer <michael@niedermayer.cc > wrote: > On Sat, Jun 24, 2017 at 06:30:26PM -0400, Ronald S. Bultje wrote: > > Hi, > > > > On Sat, Jun 24, 2017 at 3:27 PM, Michael Niedermayer > <michael@niedermayer.cc > > > wrote: > > > > > On Mon, Jun 19, 2017 at 05:11:03PM +0200, James Darnley wrote: > > > > Includes add/put functions > > > > > > > > Rounding contributed by Ronald S. Bultje > > > > --- > > > > libavcodec/tests/x86/dct.c | 2 + > > > > libavcodec/x86/idctdsp_init.c | 23 ++++++++ > > > > libavcodec/x86/simple_idct.h | 9 +++ > > > > libavcodec/x86/simple_idct10.asm | 92 > > > +++++++++++++++++++++++++++++++ > > > > libavcodec/x86/simple_idct10_template.asm | 6 +- > > > > 5 files changed, 130 insertions(+), 2 deletions(-) > > > > > > Sorry for the delay, testing this took me a bit longer than expected > > > as many files change slightly and looking at differences manually > > > is manual work ... > > > > > > > Understood, and thanks for taking the time to do the testing. > > > > > > > This patch changes the default IDCT on x86(64), which is intended IIUC > > > It also changes the IDCT when simplemmx is set > > > > > > but on x86-32 simplemmx does after this patch not produce the same > > > result as simplemmx on x86-64. > > > > > > iam not sure but > > > maybe the changed code should enable on FF_IDCT_SIMPLE instead of > > > FF_IDCT_SIMPLEMMX ? > > > whats your oppinion on this ? > > > the next patch would add FF_IDCT_SIMPLE but it also leaves > > > FF_IDCT_SIMPLEMMX > > > > > > That's a good point, I also considered that question (not so much the > > 32bit vs. 64bit, but the mmx vs. sse2). The question is basically what > > simplemmx means. Is it the exact binary result of the mmx function? Or is > > it a way of saying "almost simple, but with some rounding diffs because > > mmx"? > > > > If the second, then simple is a superset of simplemmx. If the first, then > > we should remove simplemmx from the list of "supported" idcts for the > > sse2/avx functions. I have no preference (I assumed it meant the first), > > but if you'd prefer to use the second meaning, then that's an easy > > modification to make and it won't practically have any impact for most > use > > cases I think... > > I didnt think about meaning, rather more about practice. > if someone reports any issue using "simplemmx" and bitexact and > that fails to be reproduced it could be confusing. > This is especially plausible when the bug is not idct rounding but > a bug in a later stage just triggered by specific output from the idct > Agreed. I'll leave it to James to decide on a final approach since it's his patch. :-). It sounds like we're both fine with either approach. > also potential future fate tests of simplemmx or other simd idcts > require there to be a way to select a specific idct output This is true, but don't forget that -cpuflags can also be used to cycle between the various -idct flavours. (This requires some trial and error, but it's not that many cpuflag-combinations.) Ronald
On Mon, Jun 26, 2017 at 02:20:03PM +0200, James Darnley wrote: > On 2017-06-25 21:27, Michael Niedermayer wrote: > > On Sat, Jun 24, 2017 at 06:30:26PM -0400, Ronald S. Bultje wrote: > >> Hi, > >> > >> On Sat, Jun 24, 2017 at 3:27 PM, Michael Niedermayer <michael@niedermayer.cc wrote: > >> > >>> This patch changes the default IDCT on x86(64), which is intended IIUC > >>> It also changes the IDCT when simplemmx is set > >>> > >>> but on x86-32 simplemmx does after this patch not produce the same > >>> result as simplemmx on x86-64. > >>> > >>> iam not sure but > >>> maybe the changed code should enable on FF_IDCT_SIMPLE instead of > >>> FF_IDCT_SIMPLEMMX ? > >>> whats your oppinion on this ? > >>> the next patch would add FF_IDCT_SIMPLE but it also leaves > >>> FF_IDCT_SIMPLEMMX > >> > >> > >> That's a good point, I also considered that question (not so much the > >> 32bit vs. 64bit, but the mmx vs. sse2). The question is basically what > >> simplemmx means. Is it the exact binary result of the mmx function? Or is > >> it a way of saying "almost simple, but with some rounding diffs because > >> mmx"? > >> > >> If the second, then simple is a superset of simplemmx. If the first, then > >> we should remove simplemmx from the list of "supported" idcts for the > >> sse2/avx functions. I have no preference (I assumed it meant the first), > >> but if you'd prefer to use the second meaning, then that's an easy > >> modification to make and it won't practically have any impact for most use > >> cases I think... > > > > I didnt think about meaning, rather more about practice. > > if someone reports any issue using "simplemmx" and bitexact and > > that fails to be reproduced it could be confusing. > > This is especially plausible when the bug is not idct rounding but > > a bug in a later stage just triggered by specific output from the idct > > > > also potential future fate tests of simplemmx or other simd idcts > > require there to be a way to select a specific idct output > > > > no strong oppinion on this ... > > I admit I haven't considered whether I should be using this with > simplemmx. I could change the code so that the new code isn't used for it. > > If simplemmx is supposed to be its own algorithm available to anyone who > might wish to use it then I think that an error should occur when MMX is > not available. yes, that would make sense > > Since the current behaviour is to have simple as the catch-all fallback > I will leave the code as is. auto, simpleauto, simplemmx, and simple > will now all use the new code. > > We can discuss these points all you want but I intend to push the > remaining 3 patches Soon(TM). I have still not tried Gramner's > suggestion so you have some time to object and block. > > _______________________________________________ > ffmpeg-devel mailing list > ffmpeg-devel@ffmpeg.org > http://ffmpeg.org/mailman/listinfo/ffmpeg-devel
diff --git a/libavcodec/tests/x86/dct.c b/libavcodec/tests/x86/dct.c index 34f5b8767b..317d973f9f 100644 --- a/libavcodec/tests/x86/dct.c +++ b/libavcodec/tests/x86/dct.c @@ -88,10 +88,12 @@ static const struct algo idct_tab_arch[] = { #if HAVE_YASM #if ARCH_X86_64 #if HAVE_SSE2_EXTERNAL + { "SIMPLE8-SSE2", ff_simple_idct8_sse2, FF_IDCT_PERM_TRANSPOSE, AV_CPU_FLAG_SSE2}, { "SIMPLE10-SSE2", ff_simple_idct10_sse2, FF_IDCT_PERM_TRANSPOSE, AV_CPU_FLAG_SSE2}, { "SIMPLE12-SSE2", ff_simple_idct12_sse2, FF_IDCT_PERM_TRANSPOSE, AV_CPU_FLAG_SSE2, 1 }, #endif #if HAVE_AVX_EXTERNAL + { "SIMPLE8-AVX", ff_simple_idct8_avx, FF_IDCT_PERM_TRANSPOSE, AV_CPU_FLAG_AVX}, { "SIMPLE10-AVX", ff_simple_idct10_avx, FF_IDCT_PERM_TRANSPOSE, AV_CPU_FLAG_AVX}, { "SIMPLE12-AVX", ff_simple_idct12_avx, FF_IDCT_PERM_TRANSPOSE, AV_CPU_FLAG_AVX, 1 }, #endif diff --git a/libavcodec/x86/idctdsp_init.c b/libavcodec/x86/idctdsp_init.c index f1c915aa00..9da60d1a1e 100644 --- a/libavcodec/x86/idctdsp_init.c +++ b/libavcodec/x86/idctdsp_init.c @@ -94,9 +94,32 @@ av_cold void ff_idctdsp_init_x86(IDCTDSPContext *c, AVCodecContext *avctx, c->idct_add = ff_simple_idct_add_sse2; c->perm_type = FF_IDCT_PERM_SIMPLE; } + + if (ARCH_X86_64 && + !high_bit_depth && + avctx->lowres == 0 && + (avctx->idct_algo == FF_IDCT_AUTO || + avctx->idct_algo == FF_IDCT_SIMPLEAUTO || + avctx->idct_algo == FF_IDCT_SIMPLEMMX)) { + c->idct = ff_simple_idct8_sse2; + c->idct_put = ff_simple_idct8_put_sse2; + c->idct_add = ff_simple_idct8_add_sse2; + c->perm_type = FF_IDCT_PERM_TRANSPOSE; + } } if (ARCH_X86_64 && avctx->lowres == 0) { + if (EXTERNAL_AVX(cpu_flags) && + !high_bit_depth && + (avctx->idct_algo == FF_IDCT_AUTO || + avctx->idct_algo == FF_IDCT_SIMPLEAUTO || + avctx->idct_algo == FF_IDCT_SIMPLEMMX)) { + c->idct = ff_simple_idct8_avx; + c->idct_put = ff_simple_idct8_put_avx; + c->idct_add = ff_simple_idct8_add_avx; + c->perm_type = FF_IDCT_PERM_TRANSPOSE; + } + if (avctx->bits_per_raw_sample == 10 && (avctx->idct_algo == FF_IDCT_AUTO || avctx->idct_algo == FF_IDCT_SIMPLEAUTO || diff --git a/libavcodec/x86/simple_idct.h b/libavcodec/x86/simple_idct.h index d17ef6a462..9b64cfe9bc 100644 --- a/libavcodec/x86/simple_idct.h +++ b/libavcodec/x86/simple_idct.h @@ -29,6 +29,15 @@ void ff_simple_idct_put_mmx(uint8_t *dest, ptrdiff_t line_size, int16_t *block); void ff_simple_idct_add_sse2(uint8_t *dest, ptrdiff_t line_size, int16_t *block); void ff_simple_idct_put_sse2(uint8_t *dest, ptrdiff_t line_size, int16_t *block); +void ff_simple_idct8_sse2(int16_t *block); +void ff_simple_idct8_avx(int16_t *block); + +void ff_simple_idct8_put_sse2(uint8_t *dest, ptrdiff_t line_size, int16_t *block); +void ff_simple_idct8_put_avx(uint8_t *dest, ptrdiff_t line_size, int16_t *block); + +void ff_simple_idct8_add_sse2(uint8_t *dest, ptrdiff_t line_size, int16_t *block); +void ff_simple_idct8_add_avx(uint8_t *dest, ptrdiff_t line_size, int16_t *block); + void ff_simple_idct10_sse2(int16_t *block); void ff_simple_idct10_avx(int16_t *block); diff --git a/libavcodec/x86/simple_idct10.asm b/libavcodec/x86/simple_idct10.asm index b492303a57..069bb61378 100644 --- a/libavcodec/x86/simple_idct10.asm +++ b/libavcodec/x86/simple_idct10.asm @@ -31,11 +31,14 @@ SECTION_RODATA cextern pw_2 cextern pw_16 +cextern pw_32 cextern pw_1023 cextern pw_4095 +pd_round_11: times 4 dd 1<<(11-1) pd_round_12: times 4 dd 1<<(12-1) pd_round_15: times 4 dd 1<<(15-1) pd_round_19: times 4 dd 1<<(19-1) +pd_round_20: times 4 dd 1<<(20-1) %macro CONST_DEC 3 const %1 @@ -77,8 +80,97 @@ CONST_DEC w3_min_w7_lo, W3sh2_lo, -W7sh2 SECTION .text +%macro STORE_HI_LO 12 + movq %1, %9 + movq %3, %10 + movq %5, %11 + movq %7, %12 + movhps %2, %9 + movhps %4, %10 + movhps %6, %11 + movhps %8, %12 +%endmacro + +%macro LOAD_ZXBW_8 16 + pmovzxbw %1, %9 + pmovzxbw %2, %10 + pmovzxbw %3, %11 + pmovzxbw %4, %12 + pmovzxbw %5, %13 + pmovzxbw %6, %14 + pmovzxbw %7, %15 + pmovzxbw %8, %16 +%endmacro + +%macro LOAD_ZXBW_4 9 + movh %1, %5 + movh %2, %6 + movh %3, %7 + movh %4, %8 + punpcklbw %1, %9 + punpcklbw %2, %9 + punpcklbw %3, %9 + punpcklbw %4, %9 +%endmacro + +%define PASS4ROWS(base, stride, stride3) \ + [base], [base + stride], [base + 2*stride], [base + stride3] + %macro idct_fn 0 +define_constants _lo + +cglobal simple_idct8, 1, 1, 16, 32, block + IDCT_FN "", 11, pw_32, 20, "store" +RET + +cglobal simple_idct8_put, 3, 4, 16, 32, pixels, lsize, block + IDCT_FN "", 11, pw_32, 20 + lea r3, [3*lsizeq] + lea r2, [pixelsq + r3] + packuswb m8, m0 + packuswb m1, m2 + packuswb m4, m11 + packuswb m9, m10 + STORE_HI_LO PASS8ROWS(pixelsq, r2, lsizeq, r3), m8, m1, m4, m9 +RET + +cglobal simple_idct8_add, 3, 4, 16, 32, pixels, lsize, block + IDCT_FN "", 11, pw_32, 20 + lea r2, [3*lsizeq] + %if cpuflag(sse4) + lea r3, [pixelsq + r2] + LOAD_ZXBW_8 m3, m5, m6, m7, m12, m13, m14, m15, PASS8ROWS(pixelsq, r3, lsizeq, r2) + paddsw m8, m3 + paddsw m0, m5 + paddsw m1, m6 + paddsw m2, m7 + paddsw m4, m12 + paddsw m11, m13 + paddsw m9, m14 + paddsw m10, m15 + %else + pxor m12, m12 + LOAD_ZXBW_4 m3, m5, m6, m7, PASS4ROWS(pixelsq, lsizeq, r2), m12 + paddsw m8, m3 + paddsw m0, m5 + paddsw m1, m6 + paddsw m2, m7 + lea r3, [pixelsq + 4*lsizeq] + LOAD_ZXBW_4 m3, m5, m6, m7, PASS4ROWS(r3, lsizeq, r2), m12 + paddsw m4, m3 + paddsw m11, m5 + paddsw m9, m6 + paddsw m10, m7 + lea r3, [pixelsq + r2] + %endif + packuswb m8, m0 + packuswb m1, m2 + packuswb m4, m11 + packuswb m9, m10 + STORE_HI_LO PASS8ROWS(pixelsq, r3, lsizeq, r2), m8, m1, m4, m9 +RET + define_constants _hi cglobal simple_idct10, 1, 1, 16, block diff --git a/libavcodec/x86/simple_idct10_template.asm b/libavcodec/x86/simple_idct10_template.asm index 51baf84c82..02fd445ec0 100644 --- a/libavcodec/x86/simple_idct10_template.asm +++ b/libavcodec/x86/simple_idct10_template.asm @@ -258,6 +258,10 @@ IDCT_1D %1, %2, %8 %elif %2 == 11 + ; This copies the DC-only shortcut. When there is only a DC coefficient the + ; C shifts the value and splats it to all coeffs rather than multiplying and + ; doing the full IDCT. This causes a difference on 8-bit because the + ; coefficient is 16383 rather than 16384 (which you can get with shifting). por m1, m8, m13 por m1, m12 por m1, [blockq+ 16] ; { row[1] }[0-7] @@ -293,8 +297,6 @@ por m9, m6 pand m10, m5 por m10, m6 - pand m3, m5 - por m3, m6 %else IDCT_1D %1, %2 %endif