Message ID | 20170626173432.17248-1-onemda@gmail.com |
---|---|
State | New |
Headers | show |
On 6/26/2017 2:34 PM, Paul B Mahol wrote: > Signed-off-by: Paul B Mahol <onemda@gmail.com> > --- > libavcodec/proresdec_lgpl.c | 8 ++++++-- > 1 file changed, 6 insertions(+), 2 deletions(-) > > diff --git a/libavcodec/proresdec_lgpl.c b/libavcodec/proresdec_lgpl.c > index bc5bdb5..e91abaf 100644 > --- a/libavcodec/proresdec_lgpl.c > +++ b/libavcodec/proresdec_lgpl.c > @@ -34,6 +34,7 @@ > > #include "libavutil/intmath.h" > #include "avcodec.h" > +#include "blockdsp.h" > #include "idctdsp.h" > #include "internal.h" > #include "proresdata.h" > @@ -52,6 +53,7 @@ typedef struct ProresThreadData { > } ProresThreadData; > > typedef struct ProresContext { > + BlockDSPContext bdsp; > ProresDSPContext dsp; > AVFrame *frame; > ScanTable scantable; > @@ -86,6 +88,7 @@ static av_cold int decode_init(AVCodecContext *avctx) > ctx->slice_data = NULL; > > avctx->bits_per_raw_sample = PRORES_BITS_PER_SAMPLE; > + ff_blockdsp_init(&ctx->bdsp, avctx); > ff_proresdsp_init(&ctx->dsp, avctx); > > ctx->scantable_type = -1; // set scantable type to uninitialized > @@ -431,11 +434,12 @@ static int decode_slice_plane(ProresContext *ctx, ProresThreadData *td, > { > GetBitContext gb; > int16_t *block_ptr; > - int mb_num, blocks_per_slice, ret; > + int i, mb_num, blocks_per_slice, ret; > > blocks_per_slice = mbs_per_slice * blocks_per_mb; > > - memset(td->blocks, 0, 8 * 4 * 64 * sizeof(*td->blocks)); > + for (i = 0; i < blocks_per_slice; i++) > + ctx->bdsp.clear_block(td->blocks + (i << 6)); Is this intended for speed, or to actually clear all blocks instead of just a few? Because I'm not sure this is going to be faster, especially if you're not using clear_blocks. Any relatively recent libc will use AVX internally on new x86 CPUs that will most likely be faster than calling SSE clear_block in a loop. > > init_get_bits(&gb, buf, data_size << 3); > >
On 6/26/17, James Almer <jamrial@gmail.com> wrote: > On 6/26/2017 2:34 PM, Paul B Mahol wrote: >> Signed-off-by: Paul B Mahol <onemda@gmail.com> >> --- >> libavcodec/proresdec_lgpl.c | 8 ++++++-- >> 1 file changed, 6 insertions(+), 2 deletions(-) >> >> diff --git a/libavcodec/proresdec_lgpl.c b/libavcodec/proresdec_lgpl.c >> index bc5bdb5..e91abaf 100644 >> --- a/libavcodec/proresdec_lgpl.c >> +++ b/libavcodec/proresdec_lgpl.c >> @@ -34,6 +34,7 @@ >> >> #include "libavutil/intmath.h" >> #include "avcodec.h" >> +#include "blockdsp.h" >> #include "idctdsp.h" >> #include "internal.h" >> #include "proresdata.h" >> @@ -52,6 +53,7 @@ typedef struct ProresThreadData { >> } ProresThreadData; >> >> typedef struct ProresContext { >> + BlockDSPContext bdsp; >> ProresDSPContext dsp; >> AVFrame *frame; >> ScanTable scantable; >> @@ -86,6 +88,7 @@ static av_cold int decode_init(AVCodecContext *avctx) >> ctx->slice_data = NULL; >> >> avctx->bits_per_raw_sample = PRORES_BITS_PER_SAMPLE; >> + ff_blockdsp_init(&ctx->bdsp, avctx); >> ff_proresdsp_init(&ctx->dsp, avctx); >> >> ctx->scantable_type = -1; // set scantable type to uninitialized >> @@ -431,11 +434,12 @@ static int decode_slice_plane(ProresContext *ctx, >> ProresThreadData *td, >> { >> GetBitContext gb; >> int16_t *block_ptr; >> - int mb_num, blocks_per_slice, ret; >> + int i, mb_num, blocks_per_slice, ret; >> >> blocks_per_slice = mbs_per_slice * blocks_per_mb; >> >> - memset(td->blocks, 0, 8 * 4 * 64 * sizeof(*td->blocks)); >> + for (i = 0; i < blocks_per_slice; i++) >> + ctx->bdsp.clear_block(td->blocks + (i << 6)); > > Is this intended for speed, or to actually clear all blocks instead of > just a few? Because I'm not sure this is going to be faster, especially > if you're not using clear_blocks. > Any relatively recent libc will use AVX internally on new x86 CPUs that > will most likely be faster than calling SSE clear_block in a loop. Another prores decoder uses it. And apparently it makes it faster. Feel free to try.
diff --git a/libavcodec/proresdec_lgpl.c b/libavcodec/proresdec_lgpl.c index bc5bdb5..e91abaf 100644 --- a/libavcodec/proresdec_lgpl.c +++ b/libavcodec/proresdec_lgpl.c @@ -34,6 +34,7 @@ #include "libavutil/intmath.h" #include "avcodec.h" +#include "blockdsp.h" #include "idctdsp.h" #include "internal.h" #include "proresdata.h" @@ -52,6 +53,7 @@ typedef struct ProresThreadData { } ProresThreadData; typedef struct ProresContext { + BlockDSPContext bdsp; ProresDSPContext dsp; AVFrame *frame; ScanTable scantable; @@ -86,6 +88,7 @@ static av_cold int decode_init(AVCodecContext *avctx) ctx->slice_data = NULL; avctx->bits_per_raw_sample = PRORES_BITS_PER_SAMPLE; + ff_blockdsp_init(&ctx->bdsp, avctx); ff_proresdsp_init(&ctx->dsp, avctx); ctx->scantable_type = -1; // set scantable type to uninitialized @@ -431,11 +434,12 @@ static int decode_slice_plane(ProresContext *ctx, ProresThreadData *td, { GetBitContext gb; int16_t *block_ptr; - int mb_num, blocks_per_slice, ret; + int i, mb_num, blocks_per_slice, ret; blocks_per_slice = mbs_per_slice * blocks_per_mb; - memset(td->blocks, 0, 8 * 4 * 64 * sizeof(*td->blocks)); + for (i = 0; i < blocks_per_slice; i++) + ctx->bdsp.clear_block(td->blocks + (i << 6)); init_get_bits(&gb, buf, data_size << 3);
Signed-off-by: Paul B Mahol <onemda@gmail.com> --- libavcodec/proresdec_lgpl.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-)