Message ID | 20170405015328.5476-6-jdarnley@obe.tv |
---|---|
State | Superseded |
Headers | show |
On 4/4/2017 10:53 PM, James Darnley wrote: > Haswell: > - 1.02x faster (405±0.7 vs. 397±0.8 decicycles) compared with mmxext > > Skylake-U: > - 1.06x faster (498±1.8 vs. 470±1.3 decicycles) compared with mmxext > --- > libavcodec/x86/h264_idct.asm | 20 ++++++++++++++++++++ > libavcodec/x86/h264dsp_init.c | 2 ++ > 2 files changed, 22 insertions(+) > > diff --git a/libavcodec/x86/h264_idct.asm b/libavcodec/x86/h264_idct.asm > index 24fb4d2..7fd57d3 100644 > --- a/libavcodec/x86/h264_idct.asm > +++ b/libavcodec/x86/h264_idct.asm > @@ -1158,7 +1158,27 @@ INIT_XMM avx > movd [%7+%8], %4 > %endmacro > > +%macro DC_ADD_INIT 1 > + add %1d, 32 > + sar %1d, 6 > + movd m0, %1d > + SPLATW m0, m0, 0 Considering DC_ADD_MMXEXT_OP works with dwords, a single pshuflw should be enough. This macro calls two instructions to fill the entire XMM register, and there's no need for that. You could for that matter try to optimize DC_ADD_MMXEXT_OP a bit, combining said dwords with punpk* into fewer registers to reduce the amount of padd* and psub* needed afterwards. See ADD_RES_MMX_4_8 in hevc_add_res.asm And again, SSE2 first, AVX only if measurably faster. But since you're not making use of the wider XMM regs here at all, the only chips that will see any real speed up are those slow in mmx (like Skylake seems to be). 
> + lea %1, [3*stride_q] > + pxor m1, m1 > + psubw m1, m0 > + packuswb m0, m0 > + packuswb m1, m1 > +%endmacro > + > cglobal h264_idct_add_8, 3, 3, 8, dst_, block_, stride_ > movsxdifnidn stride_q, stride_d > IDCT4_ADD dst_q, block_q, stride_q > RET > + > +cglobal h264_idct_dc_add_8, 3, 4, 0, dst_, block_, stride_ > + movsxdifnidn stride_q, stride_d > + movsx r3d, word [block_q] > + mov dword [block_q], 0 > + DC_ADD_INIT r3 > + DC_ADD_MMXEXT_OP movd, dst_q, stride_q, r3 > +RET > diff --git a/libavcodec/x86/h264dsp_init.c b/libavcodec/x86/h264dsp_init.c > index 8ba085f..bf74937 100644 > --- a/libavcodec/x86/h264dsp_init.c > +++ b/libavcodec/x86/h264dsp_init.c > @@ -35,6 +35,7 @@ IDCT_ADD_FUNC(, 8, mmx) > IDCT_ADD_FUNC(, 8, avx) > IDCT_ADD_FUNC(, 10, sse2) > IDCT_ADD_FUNC(_dc, 8, mmxext) > +IDCT_ADD_FUNC(_dc, 8, avx) > IDCT_ADD_FUNC(_dc, 10, mmxext) > IDCT_ADD_FUNC(8_dc, 8, mmxext) > IDCT_ADD_FUNC(8_dc, 10, sse2) > @@ -340,6 +341,7 @@ av_cold void ff_h264dsp_init_x86(H264DSPContext *c, const int bit_depth, > } > > c->h264_idct_add = ff_h264_idct_add_8_avx; > + c->h264_idct_dc_add = ff_h264_idct_dc_add_8_avx; > } > } else if (bit_depth == 10) { > if (EXTERNAL_MMXEXT(cpu_flags)) { >
On 2017-04-05 06:05, James Almer wrote: > On 4/4/2017 10:53 PM, James Darnley wrote: >> Haswell: >> - 1.02x faster (405±0.7 vs. 397±0.8 decicycles) compared with mmxext >> >> Skylake-U: >> - 1.06x faster (498±1.8 vs. 470±1.3 decicycles) compared with mmxext >> --- >> libavcodec/x86/h264_idct.asm | 20 ++++++++++++++++++++ >> libavcodec/x86/h264dsp_init.c | 2 ++ >> 2 files changed, 22 insertions(+) >> >> diff --git a/libavcodec/x86/h264_idct.asm b/libavcodec/x86/h264_idct.asm >> index 24fb4d2..7fd57d3 100644 >> --- a/libavcodec/x86/h264_idct.asm >> +++ b/libavcodec/x86/h264_idct.asm >> @@ -1158,7 +1158,27 @@ INIT_XMM avx >> movd [%7+%8], %4 >> %endmacro >> >> +%macro DC_ADD_INIT 1 >> + add %1d, 32 >> + sar %1d, 6 >> + movd m0, %1d >> + SPLATW m0, m0, 0 > > Considering DC_ADD_MMXEXT_OP works with dwords, a single pshuflw should be > enough. This macro calls two instructions to fill the entire XMM register, > and there's no need for that. Noted, I made that change but it doesn't seem to change much in terms of performance. > You could for that matter try to optimize DC_ADD_MMXEXT_OP a bit, combining > said dwords with punpk* into fewer registers to reduce the amount of padd* > and psub* needed afterwards. See ADD_RES_MMX_4_8 in hevc_add_res.asm Noted. Maybe in the future. > And again, SSE2 first, AVX only if measurably faster. But since you're not > making use of the wider XMM regs here at all, the only chips that will see > any real speed up are those slow in mmx (like Skylake seems to be). Yorkfield gets no benefit from sse2 (575±0.4 vs. 574±0.3 decicycles). Haswell gets most of its benefit from sse2 (404±0.6 vs. 390±0.3 vs. 388±0.3). Skylake-U gets all of its speedup from sse2 (533±3.0 vs 488±2.0 vs 497±1.4). Nehalem and 64-bit also get no benefit from sse2. Again: SSE2 yay or nay? Maybe I should just drop this; I'm not sure 5 cycles is worth it. (I will now go and modify my script to divide the recorded decicycle count by 10.) 
>> +cglobal h264_idct_dc_add_8, 3, 4, 0, dst_, block_, stride_ ^ Fixed this bug.
On 4/6/2017 1:01 PM, James Darnley wrote: > On 2017-04-05 06:05, James Almer wrote: >> On 4/4/2017 10:53 PM, James Darnley wrote: >>> Haswell: >>> - 1.02x faster (405±0.7 vs. 397±0.8 decicycles) compared with mmxext >>> >>> Skylake-U: >>> - 1.06x faster (498±1.8 vs. 470±1.3 decicycles) compared with mmxext >>> --- >>> libavcodec/x86/h264_idct.asm | 20 ++++++++++++++++++++ >>> libavcodec/x86/h264dsp_init.c | 2 ++ >>> 2 files changed, 22 insertions(+) >>> >>> diff --git a/libavcodec/x86/h264_idct.asm b/libavcodec/x86/h264_idct.asm >>> index 24fb4d2..7fd57d3 100644 >>> --- a/libavcodec/x86/h264_idct.asm >>> +++ b/libavcodec/x86/h264_idct.asm >>> @@ -1158,7 +1158,27 @@ INIT_XMM avx >>> movd [%7+%8], %4 >>> %endmacro >>> >>> +%macro DC_ADD_INIT 1 >>> + add %1d, 32 >>> + sar %1d, 6 >>> + movd m0, %1d >>> + SPLATW m0, m0, 0 >> >> Considering DC_ADD_MMXEXT_OP works with dwords, a single pshuflw should be >> enough. This macro calls two instructions to fill the entire XMM register, >> and there's no need for that. > > Noted, I made that change butit doesn't seemto change much in terms of > performance. > >> You could for that matter try to optimize DC_ADD_MMXEXT_OP a bit, combining >> said dwords with punpk* into fewer registers to reduce the amount of padd* >> and psub* needed afterwards. See ADD_RES_MMX_4_8 in hevc_add_res.asm > > Noted. Maybe in the future. > >> And again, SSE2 first, AVX only if measurably faster. But since you're not >> making use of the wider XMM regs here at all, the only chips that will see >> any real speed up are those slow in mmx (like Skylake seems to be). > > Yorkfield gets no benefit from sse2 (575±0.4 vs. 574±0.3 decicycles). > Haswell gets most of its benefit from sse2 (404±0.6 vs. 390±0.3 vs. > 388±0.3). > Skylake-U gets all of its speedup from sse2 (533±3.0 vs 488±2.0 vs 497±1.4). > > Nehalem and 64-bit also gets no benefit from sse2. > > Again: SSE2 yay or nay? Maybe I should just drop this; I'm not sure 5 > cycles is worth it. 
Yes, add SSE2 as it's the source of pretty much any speedup. AVX seems to not make a difference on top of SSE2 if I'm reading your numbers right (as I asked in the previous reply, please post actual numbers for each version), so you can just skip it. > > (I will now go and modify my script to divide the recorded decicycle > count by 10.) > >>> +cglobal h264_idct_dc_add_8, 3, 4, 0, dst_, block_, stride_ > ^ > Fixed this bug. > > _______________________________________________ > ffmpeg-devel mailing list > ffmpeg-devel@ffmpeg.org > http://ffmpeg.org/mailman/listinfo/ffmpeg-devel >
diff --git a/libavcodec/x86/h264_idct.asm b/libavcodec/x86/h264_idct.asm index 24fb4d2..7fd57d3 100644 --- a/libavcodec/x86/h264_idct.asm +++ b/libavcodec/x86/h264_idct.asm @@ -1158,7 +1158,27 @@ INIT_XMM avx movd [%7+%8], %4 %endmacro +%macro DC_ADD_INIT 1 + add %1d, 32 + sar %1d, 6 + movd m0, %1d + SPLATW m0, m0, 0 + lea %1, [3*stride_q] + pxor m1, m1 + psubw m1, m0 + packuswb m0, m0 + packuswb m1, m1 +%endmacro + cglobal h264_idct_add_8, 3, 3, 8, dst_, block_, stride_ movsxdifnidn stride_q, stride_d IDCT4_ADD dst_q, block_q, stride_q RET + +cglobal h264_idct_dc_add_8, 3, 4, 0, dst_, block_, stride_ + movsxdifnidn stride_q, stride_d + movsx r3d, word [block_q] + mov dword [block_q], 0 + DC_ADD_INIT r3 + DC_ADD_MMXEXT_OP movd, dst_q, stride_q, r3 +RET diff --git a/libavcodec/x86/h264dsp_init.c b/libavcodec/x86/h264dsp_init.c index 8ba085f..bf74937 100644 --- a/libavcodec/x86/h264dsp_init.c +++ b/libavcodec/x86/h264dsp_init.c @@ -35,6 +35,7 @@ IDCT_ADD_FUNC(, 8, mmx) IDCT_ADD_FUNC(, 8, avx) IDCT_ADD_FUNC(, 10, sse2) IDCT_ADD_FUNC(_dc, 8, mmxext) +IDCT_ADD_FUNC(_dc, 8, avx) IDCT_ADD_FUNC(_dc, 10, mmxext) IDCT_ADD_FUNC(8_dc, 8, mmxext) IDCT_ADD_FUNC(8_dc, 10, sse2) @@ -340,6 +341,7 @@ av_cold void ff_h264dsp_init_x86(H264DSPContext *c, const int bit_depth, } c->h264_idct_add = ff_h264_idct_add_8_avx; + c->h264_idct_dc_add = ff_h264_idct_dc_add_8_avx; } } else if (bit_depth == 10) { if (EXTERNAL_MMXEXT(cpu_flags)) {