[FFmpeg-devel] avcodec/proresdec_lgpl: make use of BlockDSP

Submitted by Paul B Mahol on June 26, 2017, 5:34 p.m.

Details

Message ID 20170626173432.17248-1-onemda@gmail.com
State New
Headers show

Commit Message

Paul B Mahol June 26, 2017, 5:34 p.m.
Signed-off-by: Paul B Mahol <onemda@gmail.com>
---
 libavcodec/proresdec_lgpl.c | 8 ++++++--
 1 file changed, 6 insertions(+), 2 deletions(-)

Comments

James Almer June 26, 2017, 5:57 p.m.
On 6/26/2017 2:34 PM, Paul B Mahol wrote:
> Signed-off-by: Paul B Mahol <onemda@gmail.com>
> ---
>  libavcodec/proresdec_lgpl.c | 8 ++++++--
>  1 file changed, 6 insertions(+), 2 deletions(-)
> 
> diff --git a/libavcodec/proresdec_lgpl.c b/libavcodec/proresdec_lgpl.c
> index bc5bdb5..e91abaf 100644
> --- a/libavcodec/proresdec_lgpl.c
> +++ b/libavcodec/proresdec_lgpl.c
> @@ -34,6 +34,7 @@
>  
>  #include "libavutil/intmath.h"
>  #include "avcodec.h"
> +#include "blockdsp.h"
>  #include "idctdsp.h"
>  #include "internal.h"
>  #include "proresdata.h"
> @@ -52,6 +53,7 @@ typedef struct ProresThreadData {
>  } ProresThreadData;
>  
>  typedef struct ProresContext {
> +    BlockDSPContext bdsp;
>      ProresDSPContext dsp;
>      AVFrame    *frame;
>      ScanTable  scantable;
> @@ -86,6 +88,7 @@ static av_cold int decode_init(AVCodecContext *avctx)
>      ctx->slice_data       = NULL;
>  
>      avctx->bits_per_raw_sample = PRORES_BITS_PER_SAMPLE;
> +    ff_blockdsp_init(&ctx->bdsp, avctx);
>      ff_proresdsp_init(&ctx->dsp, avctx);
>  
>      ctx->scantable_type = -1;   // set scantable type to uninitialized
> @@ -431,11 +434,12 @@ static int decode_slice_plane(ProresContext *ctx, ProresThreadData *td,
>  {
>      GetBitContext gb;
>      int16_t *block_ptr;
> -    int mb_num, blocks_per_slice, ret;
> +    int i, mb_num, blocks_per_slice, ret;
>  
>      blocks_per_slice = mbs_per_slice * blocks_per_mb;
>  
> -    memset(td->blocks, 0, 8 * 4 * 64 * sizeof(*td->blocks));
> +    for (i = 0; i < blocks_per_slice; i++)
> +        ctx->bdsp.clear_block(td->blocks + (i << 6));

Is this intended for speed, or to actually clear all blocks instead of
just a few? Because I'm not sure this is going to be faster, especially
if you're not using clear_blocks.
Any relatively recent libc will use AVX internally for memset on new x86
CPUs, which will most likely be faster than calling SSE clear_block in a loop.

>  
>      init_get_bits(&gb, buf, data_size << 3);
>  
>
Paul B Mahol June 26, 2017, 6:11 p.m.
On 6/26/17, James Almer <jamrial@gmail.com> wrote:
> On 6/26/2017 2:34 PM, Paul B Mahol wrote:
>> Signed-off-by: Paul B Mahol <onemda@gmail.com>
>> ---
>>  libavcodec/proresdec_lgpl.c | 8 ++++++--
>>  1 file changed, 6 insertions(+), 2 deletions(-)
>>
>> diff --git a/libavcodec/proresdec_lgpl.c b/libavcodec/proresdec_lgpl.c
>> index bc5bdb5..e91abaf 100644
>> --- a/libavcodec/proresdec_lgpl.c
>> +++ b/libavcodec/proresdec_lgpl.c
>> @@ -34,6 +34,7 @@
>>
>>  #include "libavutil/intmath.h"
>>  #include "avcodec.h"
>> +#include "blockdsp.h"
>>  #include "idctdsp.h"
>>  #include "internal.h"
>>  #include "proresdata.h"
>> @@ -52,6 +53,7 @@ typedef struct ProresThreadData {
>>  } ProresThreadData;
>>
>>  typedef struct ProresContext {
>> +    BlockDSPContext bdsp;
>>      ProresDSPContext dsp;
>>      AVFrame    *frame;
>>      ScanTable  scantable;
>> @@ -86,6 +88,7 @@ static av_cold int decode_init(AVCodecContext *avctx)
>>      ctx->slice_data       = NULL;
>>
>>      avctx->bits_per_raw_sample = PRORES_BITS_PER_SAMPLE;
>> +    ff_blockdsp_init(&ctx->bdsp, avctx);
>>      ff_proresdsp_init(&ctx->dsp, avctx);
>>
>>      ctx->scantable_type = -1;   // set scantable type to uninitialized
>> @@ -431,11 +434,12 @@ static int decode_slice_plane(ProresContext *ctx,
>> ProresThreadData *td,
>>  {
>>      GetBitContext gb;
>>      int16_t *block_ptr;
>> -    int mb_num, blocks_per_slice, ret;
>> +    int i, mb_num, blocks_per_slice, ret;
>>
>>      blocks_per_slice = mbs_per_slice * blocks_per_mb;
>>
>> -    memset(td->blocks, 0, 8 * 4 * 64 * sizeof(*td->blocks));
>> +    for (i = 0; i < blocks_per_slice; i++)
>> +        ctx->bdsp.clear_block(td->blocks + (i << 6));
>
> Is this intended for speed, or to actually clear all blocks instead of
> just a few? Because I'm not sure this is going to be faster, especially
> if you're not using clear_blocks.
> Any relatively recent libc will use AVX internally on new x86 CPUs that
> will most likely be faster than calling SSE clear_block in a loop.

Another ProRes decoder uses it, and apparently that makes it faster.
Feel free to try it.

Patch hide | download patch | download mbox

diff --git a/libavcodec/proresdec_lgpl.c b/libavcodec/proresdec_lgpl.c
index bc5bdb5..e91abaf 100644
--- a/libavcodec/proresdec_lgpl.c
+++ b/libavcodec/proresdec_lgpl.c
@@ -34,6 +34,7 @@ 
 
 #include "libavutil/intmath.h"
 #include "avcodec.h"
+#include "blockdsp.h"
 #include "idctdsp.h"
 #include "internal.h"
 #include "proresdata.h"
@@ -52,6 +53,7 @@  typedef struct ProresThreadData {
 } ProresThreadData;
 
 typedef struct ProresContext {
+    BlockDSPContext bdsp;
     ProresDSPContext dsp;
     AVFrame    *frame;
     ScanTable  scantable;
@@ -86,6 +88,7 @@  static av_cold int decode_init(AVCodecContext *avctx)
     ctx->slice_data       = NULL;
 
     avctx->bits_per_raw_sample = PRORES_BITS_PER_SAMPLE;
+    ff_blockdsp_init(&ctx->bdsp, avctx);
     ff_proresdsp_init(&ctx->dsp, avctx);
 
     ctx->scantable_type = -1;   // set scantable type to uninitialized
@@ -431,11 +434,12 @@  static int decode_slice_plane(ProresContext *ctx, ProresThreadData *td,
 {
     GetBitContext gb;
     int16_t *block_ptr;
-    int mb_num, blocks_per_slice, ret;
+    int i, mb_num, blocks_per_slice, ret;
 
     blocks_per_slice = mbs_per_slice * blocks_per_mb;
 
-    memset(td->blocks, 0, 8 * 4 * 64 * sizeof(*td->blocks));
+    for (i = 0; i < blocks_per_slice; i++)
+        ctx->bdsp.clear_block(td->blocks + (i << 6));
 
     init_get_bits(&gb, buf, data_size << 3);