| Field | Value |
|---|---|
| Message ID | 20170405015328.5476-5-jdarnley@obe.tv |
| State | Superseded |
| Headers | show |
On 4/4/2017 10:53 PM, James Darnley wrote: > Haswell: > - 1.11x faster (522±0.4 vs. 469±1.8 decicycles) compared with mmxext > > Skylake-U: > - 1.21x faster (671±5.5 vs. 555±1.4 decicycles) compared with mmxext Again, you should add an SSE2 version first, then an AVX one if it's measurably faster than the SSE2 one. > --- > libavcodec/x86/h264_idct.asm | 33 ++++++++++++++++++++++++++++++++- > libavcodec/x86/h264dsp_init.c | 3 +++ > 2 files changed, 35 insertions(+), 1 deletion(-) > > diff --git a/libavcodec/x86/h264_idct.asm b/libavcodec/x86/h264_idct.asm > index bc4dce4..24fb4d2 100644 > --- a/libavcodec/x86/h264_idct.asm > +++ b/libavcodec/x86/h264_idct.asm > @@ -65,7 +65,15 @@ SECTION .text > > IDCT4_1D w, 0, 1, 2, 3, 4, 5 > mova m6, [pw_32] > - TRANSPOSE4x4W 0, 1, 2, 3, 4 > + %if mmsize == 8 > + TRANSPOSE4x4W 0, 1, 2, 3, 4 > + %else > + punpcklwd m0, m1 > + punpcklwd m2, m3 > + SBUTTERFLY dq, 0, 2, 4 > + MOVHL m1, m0 > + MOVHL m3, m2 > + %endif > paddw m0, m6 > IDCT4_1D w, 0, 1, 2, 3, 4, 5 > pxor m7, m7 > @@ -1131,3 +1139,26 @@ INIT_MMX mmx > IDCT_DC_DEQUANT 0 > INIT_MMX sse2 > IDCT_DC_DEQUANT 7 > + > +INIT_XMM avx > + > +; %unmacro STORE_DIFFx2 8 ; remove macro from x86util.asm but yasm doesn't have this yet > +%macro STORE_DIFFx2 8 ; add1, add2, reg1, reg2, zero, shift, source, stride > + movd %3, [%7] > + movd %4, [%7+%8] > + psraw %1, %6 > + psraw %2, %6 > + punpcklbw %3, %5 > + punpcklbw %4, %5 > + paddw %3, %1 > + paddw %4, %2 > + packuswb %3, %5 > + packuswb %4, %5 > + movd [%7], %3 > + movd [%7+%8], %4 > +%endmacro > + > +cglobal h264_idct_add_8, 3, 3, 8, dst_, block_, stride_ > + movsxdifnidn stride_q, stride_d > + IDCT4_ADD dst_q, block_q, stride_q > +RET > diff --git a/libavcodec/x86/h264dsp_init.c b/libavcodec/x86/h264dsp_init.c > index 0643b37..8ba085f 100644 > --- a/libavcodec/x86/h264dsp_init.c > +++ b/libavcodec/x86/h264dsp_init.c > @@ -32,6 +32,7 @@ void ff_h264_idct ## NUM ## _add_ ## DEPTH ## _ ## OPT(uint8_t *dst, \ > int stride); > > 
IDCT_ADD_FUNC(, 8, mmx) > +IDCT_ADD_FUNC(, 8, avx) > IDCT_ADD_FUNC(, 10, sse2) > IDCT_ADD_FUNC(_dc, 8, mmxext) > IDCT_ADD_FUNC(_dc, 10, mmxext) > @@ -337,6 +338,8 @@ av_cold void ff_h264dsp_init_x86(H264DSPContext *c, const int bit_depth, > c->h264_h_loop_filter_chroma = ff_deblock_h_chroma422_8_avx; > c->h264_h_loop_filter_chroma_intra = ff_deblock_h_chroma422_intra_8_avx; > } > + > + c->h264_idct_add = ff_h264_idct_add_8_avx; > } > } else if (bit_depth == 10) { > if (EXTERNAL_MMXEXT(cpu_flags)) { >
On 2017-04-05 05:44, James Almer wrote: > On 4/4/2017 10:53 PM, James Darnley wrote: >> Haswell: >> - 1.11x faster (522±0.4 vs. 469±1.8 decicycles) compared with mmxext >> >> Skylake-U: >> - 1.21x faster (671±5.5 vs. 555±1.4 decicycles) compared with mmxext > > Again, you should add an SSE2 version first, then an AVX one if it's > measurably faster than the SSE2 one. On a Yorkfield sse2 is barely faster: 1.02x faster (728±2.1 vs. 710±3.9 decicycles). So 1 or 2 cycles On a Skylake-U sse2 is most of the speedup: 1.15x faster (661±2.2 vs 573±1.9). Then avx gains a mere 3 cycles: 547±0.5 On a Haswell sse2 provides only half the speedup: - sse2: 1.06x faster (525±2.5 vs 497±1.0 decicycles) - avx: 1.06x faster (497±1.0 vs 468±1.2 decicycles) (All on 64-bit Linux) On Nehalem and 64-bit Windows sse2 is slower: 0.92x faster (597±3.0 vs. 650±9.3 decicycles) And on that note I should probably recheck the deblock patches I pushed a little while ago. So... SSE2 for this function, yay or nay?
On 4/6/2017 12:34 PM, James Darnley wrote: > On 2017-04-05 05:44, James Almer wrote: >> On 4/4/2017 10:53 PM, James Darnley wrote: >>> Haswell: >>> - 1.11x faster (522±0.4 vs. 469±1.8 decicycles) compared with mmxext >>> >>> Skylake-U: >>> - 1.21x faster (671±5.5 vs. 555±1.4 decicycles) compared with mmxext >> >> Again, you should add an SSE2 version first, then an AVX one if it's >> measurably faster than the SSE2 one. > > On a Yorkfield sse2 is barely faster: 1.02x faster (728±2.1 vs. 710±3.9 > decicycles). So 1 or 2 cycles > > On a Skylake-U sse2 is most of the speedup: 1.15x faster (661±2.2 vs > 573±1.9). Then avx gains a mere 3 cycles: 547±0.5 > > On a Haswell sse2 provides only half the speedup: > - sse2: 1.06x faster (525±2.5 vs 497±1.0 decicycles) > - avx: 1.06x faster (497±1.0 vs 468±1.2 decicycles) > > (All on 64-bit Linux) > > On Nehalem and 64-bit Windows sse2 is slower: 0.92x faster (597±3.0 vs. > 650±9.3 decicycles) Slower than what? MMX or AVX? Your numbers are really confusing. Could you post the actual numbers for each function instead of doing comparisons? ff_h264_idct_add_8_mmx = ??? cycles ff_h264_idct_add_8_sse2 = ??? cycles ff_h264_idct_add_8_avx = ??? cycles Does checkasm support these functions? Maybe you could just run that and paste the results you get, which would be easier and faster. > > And on that note I should probably recheck the deblock patches I pushed > a little while ago. > > So... SSE2 for this function, yay or nay? By default, always add SSE2 if it's measurably faster than MMX, especially when you take advantage of the wider XMM regs, like you're doing here. What you need to check is if adding AVX is worth it on top of SSE2 when you're not taking advantage of the wider YMM regs, like it happens here.
On 2017-04-06 18:06, James Almer wrote: > Your numbers are really confusing. Could you post the actual numbers for > each function instead of doing comparisons? These figures are the actual numbers! Using the figures from Haswell above: > ff_h264_idct_add_8_mmx = 52 cycles > ff_h264_idct_add_8_sse2 = 49 cycles > ff_h264_idct_add_8_avx = 46 cycles Coming back to this draft I saved I removed a fair bit of ranting and cut it down to the essential point. Also, I forgot about the Pentium I tested previous patches on. I added SSE2. From that commit message: > Kaby Lake Pentium: > - ff_h264_idct_add_8_sse2: ~1.18x faster than mmxext > - ff_h264_idct_dc_add_8_sse2: ~1.07x faster than mmxext
diff --git a/libavcodec/x86/h264_idct.asm b/libavcodec/x86/h264_idct.asm index bc4dce4..24fb4d2 100644 --- a/libavcodec/x86/h264_idct.asm +++ b/libavcodec/x86/h264_idct.asm @@ -65,7 +65,15 @@ SECTION .text IDCT4_1D w, 0, 1, 2, 3, 4, 5 mova m6, [pw_32] - TRANSPOSE4x4W 0, 1, 2, 3, 4 + %if mmsize == 8 + TRANSPOSE4x4W 0, 1, 2, 3, 4 + %else + punpcklwd m0, m1 + punpcklwd m2, m3 + SBUTTERFLY dq, 0, 2, 4 + MOVHL m1, m0 + MOVHL m3, m2 + %endif paddw m0, m6 IDCT4_1D w, 0, 1, 2, 3, 4, 5 pxor m7, m7 @@ -1131,3 +1139,26 @@ INIT_MMX mmx IDCT_DC_DEQUANT 0 INIT_MMX sse2 IDCT_DC_DEQUANT 7 + +INIT_XMM avx + +; %unmacro STORE_DIFFx2 8 ; remove macro from x86util.asm but yasm doesn't have this yet +%macro STORE_DIFFx2 8 ; add1, add2, reg1, reg2, zero, shift, source, stride + movd %3, [%7] + movd %4, [%7+%8] + psraw %1, %6 + psraw %2, %6 + punpcklbw %3, %5 + punpcklbw %4, %5 + paddw %3, %1 + paddw %4, %2 + packuswb %3, %5 + packuswb %4, %5 + movd [%7], %3 + movd [%7+%8], %4 +%endmacro + +cglobal h264_idct_add_8, 3, 3, 8, dst_, block_, stride_ + movsxdifnidn stride_q, stride_d + IDCT4_ADD dst_q, block_q, stride_q +RET diff --git a/libavcodec/x86/h264dsp_init.c b/libavcodec/x86/h264dsp_init.c index 0643b37..8ba085f 100644 --- a/libavcodec/x86/h264dsp_init.c +++ b/libavcodec/x86/h264dsp_init.c @@ -32,6 +32,7 @@ void ff_h264_idct ## NUM ## _add_ ## DEPTH ## _ ## OPT(uint8_t *dst, \ int stride); IDCT_ADD_FUNC(, 8, mmx) +IDCT_ADD_FUNC(, 8, avx) IDCT_ADD_FUNC(, 10, sse2) IDCT_ADD_FUNC(_dc, 8, mmxext) IDCT_ADD_FUNC(_dc, 10, mmxext) @@ -337,6 +338,8 @@ av_cold void ff_h264dsp_init_x86(H264DSPContext *c, const int bit_depth, c->h264_h_loop_filter_chroma = ff_deblock_h_chroma422_8_avx; c->h264_h_loop_filter_chroma_intra = ff_deblock_h_chroma422_intra_8_avx; } + + c->h264_idct_add = ff_h264_idct_add_8_avx; } } else if (bit_depth == 10) { if (EXTERNAL_MMXEXT(cpu_flags)) {