diff mbox series

[FFmpeg-devel,RFC/PATCH] bitpacked_dec: Optimization for bitpacked_dec decoder performance

Message ID 1683323657-20687-1-git-send-email-dheitmueller@ltnglobal.com
State New
Headers show
Series [FFmpeg-devel,RFC/PATCH] bitpacked_dec: Optimization for bitpacked_dec decoder performance | expand

Checks

Context Check Description
yinshiyou/make_loongarch64 success Make finished
yinshiyou/make_fate_loongarch64 success Make fate finished
andriy/make_x86 success Make finished
andriy/make_fate_x86 success Make fate finished

Commit Message

Devin Heitmueller May 5, 2023, 9:54 p.m. UTC
Rework the code a bit to speed up the 10-bit bitpacked decoding
routine.  This is probably about as fast as I can get it without
switching to assembly language.

Demonstratable with:

./ffmpeg -f lavfi -i "smptehdbars=size=3840x2160" -c bitpacked -f image2 -frames:v 1 source.yuv
./ffmpeg -f bitpacked -pix_fmt yuv422p10le -s 3840x2160 -c:v bitpacked -i source.yuv -pix_fmt yuv422p10le out.yuv

On my development system, it went from 80ms for a 2160p frame
down to 20ms (i.e. a 4X speedup).  Good enough for now, I hope...

Signed-off-by: Devin Heitmueller <dheitmueller@ltnglobal.com>
---
 libavcodec/bitpacked_dec.c | 17 +++++++----------
 1 file changed, 7 insertions(+), 10 deletions(-)

Comments

Lance Wang May 6, 2023, 11:32 a.m. UTC | #1
On Sat, May 6, 2023 at 4:58 AM Devin Heitmueller <
devin.heitmueller@ltnglobal.com> wrote:

> Rework the code a bit to speed up the 10-bit bitpacked decoding
> routine.  This is probably about as fast as I can get it without
> switching to assembly language.
>
> Demonstratable with:
>
> ./ffmpeg -f lavfi -i "smptehdbars=size=3840x2160" -c bitpacked -f image2
> -frames:v 1 source.yuv
> ./ffmpeg -f bitpacked -pix_fmt yuv422p10le -s 3840x2160 -c:v bitpacked -i
> source.yuv -pix_fmt yuv422p10le out.yuv
>
> On my development system, it went from 80ms for a 2160p frame
> down to 20ms (i.e. a 4X speedup).  Good enough for now, I hope...
>
>
FYI, on my development system, I run two time for the original and modified
version and no obvious difference:
./ffmpeg -f lavfi -i "smptehdbars=size=3840x2160" -c bitpacked -frames:v 25
source.yuv
time ./ffmpeg -f bitpacked -pix_fmt yuv422p10le -s 3840x2160 -c:v bitpacked
-i source.yuv -pix_fmt yuv422p10le out.yuv
frame=   25 fps=0.0 q=-0.0 Lsize=  810000kB time=00:00:00.96
bitrate=6912000.0kbits/s speed=1.13x

real 0m0.961s
user 0m1.086s
sys 0m1.360s

frame=   25 fps=0.0 q=-0.0 Lsize=  810000kB time=00:00:00.96
bitrate=6912000.0kbits/s speed=1.16x

real 0m0.936s
user 0m1.358s
sys 0m1.350s

after apply the patch:
frame=   25 fps=0.0 q=-0.0 Lsize=  810000kB time=00:00:00.96
bitrate=6912000.0kbits/s speed=1.14x

real 0m0.953s
user 0m0.906s
sys 0m1.438s

frame=   25 fps=0.0 q=-0.0 Lsize=  810000kB time=00:00:00.96
bitrate=6912000.0kbits/s speed=1.17x

real 0m0.922s
user 0m0.926s
sys 0m1.066s



> Signed-off-by: Devin Heitmueller <dheitmueller@ltnglobal.com>
> ---
>  libavcodec/bitpacked_dec.c | 17 +++++++----------
>  1 file changed, 7 insertions(+), 10 deletions(-)
>
> diff --git a/libavcodec/bitpacked_dec.c b/libavcodec/bitpacked_dec.c
> index a1ffef1..96aba27 100644
> --- a/libavcodec/bitpacked_dec.c
> +++ b/libavcodec/bitpacked_dec.c
> @@ -28,7 +28,6 @@
>
>  #include "avcodec.h"
>  #include "codec_internal.h"
> -#include "get_bits.h"
>  #include "libavutil/imgutils.h"
>  #include "thread.h"
>
> @@ -65,7 +64,7 @@ static int bitpacked_decode_yuv422p10(AVCodecContext
> *avctx, AVFrame *frame,
>  {
>      uint64_t frame_size = (uint64_t)avctx->width *
> (uint64_t)avctx->height * 20;
>      uint64_t packet_size = (uint64_t)avpkt->size * 8;
> -    GetBitContext bc;
> +    uint8_t *src;
>      uint16_t *y, *u, *v;
>      int ret, i, j;
>
> @@ -79,20 +78,18 @@ static int bitpacked_decode_yuv422p10(AVCodecContext
> *avctx, AVFrame *frame,
>      if (avctx->width % 2)
>          return AVERROR_PATCHWELCOME;
>
> -    ret = init_get_bits(&bc, avpkt->data, avctx->width * avctx->height *
> 20);
> -    if (ret)
> -        return ret;
> -
> +    src = avpkt->data;
>      for (i = 0; i < avctx->height; i++) {
>          y = (uint16_t*)(frame->data[0] + i * frame->linesize[0]);
>          u = (uint16_t*)(frame->data[1] + i * frame->linesize[1]);
>          v = (uint16_t*)(frame->data[2] + i * frame->linesize[2]);
>
>          for (j = 0; j < avctx->width; j += 2) {
> -            *u++ = get_bits(&bc, 10);
> -            *y++ = get_bits(&bc, 10);
> -            *v++ = get_bits(&bc, 10);
> -            *y++ = get_bits(&bc, 10);
> +            *u++ = (src[0] << 2) | (src[1] >> 6);
> +            *y++ = ((src[1] << 4) | (src[2] >> 4)) & 0x3ff;
> +            *v++ = ((src[2] << 6) | (src[3] >> 2)) & 0x3ff;
> +            *y++ = ((src[3] << 8) | (src[4]))      & 0x3ff;
> +            src += 5;
>          }
>      }
>
> --
> 1.8.3.1
>
> _______________________________________________
> ffmpeg-devel mailing list
> ffmpeg-devel@ffmpeg.org
> https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
>
> To unsubscribe, visit link above, or email
> ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".
>
Devin Heitmueller May 6, 2023, 11:49 a.m. UTC | #2
Hi Lance,

On Sat, May 6, 2023 at 7:32 AM Lance Wang <lance.lmwang@gmail.com> wrote:
> FYI, on my development system, I run two time for the original and modified
> version and no obvious difference:

Simply running "time" against the binary isn't an accurate way to
measure a 60ms difference for a single frame being processed.  For any
such execution of ffmpeg the bulk of the time is spent loading the
application and in this case loading the 20MB file from disk into
memory.

In my case I added instrumentation to the decoder to measure how much
time it took to perform the actual decode operation.  I discarded the
patch already since it was like six lines of code and I did the work
several weeks ago, but if there is really a dispute about the
performance benefit I can obviously recreate it (as can anyone who
wants to benchmark it themselves).  You would just have to insert a
couple of calls to gettimeofday() in libavcodec/bitpacked_dec.c before
and after the decoding operation.

This is one of those cases where you won't notice the performance
difference doing any operation once, but it becomes important when
you're processing a live RTP stream of 2160p59 video at 10.4 Gbps.

Devin
Paul B Mahol May 6, 2023, 11:52 a.m. UTC | #3
On Sat, May 6, 2023 at 1:32 PM Lance Wang <lance.lmwang@gmail.com> wrote:

> On Sat, May 6, 2023 at 4:58 AM Devin Heitmueller <
> devin.heitmueller@ltnglobal.com> wrote:
>
> > Rework the code a bit to speed up the 10-bit bitpacked decoding
> > routine.  This is probably about as fast as I can get it without
> > switching to assembly language.
> >
> > Demonstratable with:
> >
> > ./ffmpeg -f lavfi -i "smptehdbars=size=3840x2160" -c bitpacked -f image2
> > -frames:v 1 source.yuv
> > ./ffmpeg -f bitpacked -pix_fmt yuv422p10le -s 3840x2160 -c:v bitpacked -i
> > source.yuv -pix_fmt yuv422p10le out.yuv
> >
> > On my development system, it went from 80ms for a 2160p frame
> > down to 20ms (i.e. a 4X speedup).  Good enough for now, I hope...
> >
> >
> FYI, on my development system, I run two time for the original and modified
> version and no obvious difference:
> ./ffmpeg -f lavfi -i "smptehdbars=size=3840x2160" -c bitpacked -frames:v 25
> source.yuv
> time ./ffmpeg -f bitpacked -pix_fmt yuv422p10le -s 3840x2160 -c:v bitpacked
> -i source.yuv -pix_fmt yuv422p10le out.yuv
> frame=   25 fps=0.0 q=-0.0 Lsize=  810000kB time=00:00:00.96
> bitrate=6912000.0kbits/s speed=1.13x
>
> real 0m0.961s
> user 0m1.086s
> sys 0m1.360s
>
> frame=   25 fps=0.0 q=-0.0 Lsize=  810000kB time=00:00:00.96
> bitrate=6912000.0kbits/s speed=1.16x
>
> real 0m0.936s
> user 0m1.358s
> sys 0m1.350s
>
> after apply the patch:
> frame=   25 fps=0.0 q=-0.0 Lsize=  810000kB time=00:00:00.96
> bitrate=6912000.0kbits/s speed=1.14x
>
> real 0m0.953s
> user 0m0.906s
> sys 0m1.438s
>
> frame=   25 fps=0.0 q=-0.0 Lsize=  810000kB time=00:00:00.96
> bitrate=6912000.0kbits/s speed=1.17x
>
> real 0m0.922s
> user 0m0.926s
> sys 0m1.066s
>

Only 25 frames?
This is flawed.


>
>
>
> > Signed-off-by: Devin Heitmueller <dheitmueller@ltnglobal.com>
> > ---
> >  libavcodec/bitpacked_dec.c | 17 +++++++----------
> >  1 file changed, 7 insertions(+), 10 deletions(-)
> >
> > diff --git a/libavcodec/bitpacked_dec.c b/libavcodec/bitpacked_dec.c
> > index a1ffef1..96aba27 100644
> > --- a/libavcodec/bitpacked_dec.c
> > +++ b/libavcodec/bitpacked_dec.c
> > @@ -28,7 +28,6 @@
> >
> >  #include "avcodec.h"
> >  #include "codec_internal.h"
> > -#include "get_bits.h"
> >  #include "libavutil/imgutils.h"
> >  #include "thread.h"
> >
> > @@ -65,7 +64,7 @@ static int bitpacked_decode_yuv422p10(AVCodecContext
> > *avctx, AVFrame *frame,
> >  {
> >      uint64_t frame_size = (uint64_t)avctx->width *
> > (uint64_t)avctx->height * 20;
> >      uint64_t packet_size = (uint64_t)avpkt->size * 8;
> > -    GetBitContext bc;
> > +    uint8_t *src;
> >      uint16_t *y, *u, *v;
> >      int ret, i, j;
> >
> > @@ -79,20 +78,18 @@ static int bitpacked_decode_yuv422p10(AVCodecContext
> > *avctx, AVFrame *frame,
> >      if (avctx->width % 2)
> >          return AVERROR_PATCHWELCOME;
> >
> > -    ret = init_get_bits(&bc, avpkt->data, avctx->width * avctx->height *
> > 20);
> > -    if (ret)
> > -        return ret;
> > -
> > +    src = avpkt->data;
> >      for (i = 0; i < avctx->height; i++) {
> >          y = (uint16_t*)(frame->data[0] + i * frame->linesize[0]);
> >          u = (uint16_t*)(frame->data[1] + i * frame->linesize[1]);
> >          v = (uint16_t*)(frame->data[2] + i * frame->linesize[2]);
> >
> >          for (j = 0; j < avctx->width; j += 2) {
> > -            *u++ = get_bits(&bc, 10);
> > -            *y++ = get_bits(&bc, 10);
> > -            *v++ = get_bits(&bc, 10);
> > -            *y++ = get_bits(&bc, 10);
> > +            *u++ = (src[0] << 2) | (src[1] >> 6);
> > +            *y++ = ((src[1] << 4) | (src[2] >> 4)) & 0x3ff;
> > +            *v++ = ((src[2] << 6) | (src[3] >> 2)) & 0x3ff;
> > +            *y++ = ((src[3] << 8) | (src[4]))      & 0x3ff;
> > +            src += 5;
> >          }
> >      }
> >
> > --
> > 1.8.3.1
> >
> > _______________________________________________
> > ffmpeg-devel mailing list
> > ffmpeg-devel@ffmpeg.org
> > https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
> >
> > To unsubscribe, visit link above, or email
> > ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".
> >
> _______________________________________________
> ffmpeg-devel mailing list
> ffmpeg-devel@ffmpeg.org
> https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
>
> To unsubscribe, visit link above, or email
> ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".
>
Devin Heitmueller May 6, 2023, 12:13 p.m. UTC | #4
I added some instrumentation via the attached patch.  You can see the
benefits here:

Before=1683378057.243350 After 1683378057.264239
Before=1683378083.335424 After 1683378083.356440
Before=1683378089.675400 After 1683378089.696512
Before=1683378151.792324 After 1683378151.813579
21 ms per run

After patch:
Before=1683378222.167796 After 1683378222.175760
Before=1683378233.131416 After 1683378233.139326
Before=1683378243.591895 After 1683378243.599840
8 ms per run

Note: this is a different platform than I did the original development
on, and apparently the improvement on this particular box is only 2.5x
rather than 4x.

Devin


On Sat, May 6, 2023 at 7:53 AM Paul B Mahol <onemda@gmail.com> wrote:
>
> On Sat, May 6, 2023 at 1:32 PM Lance Wang <lance.lmwang@gmail.com> wrote:
>
> > On Sat, May 6, 2023 at 4:58 AM Devin Heitmueller <
> > devin.heitmueller@ltnglobal.com> wrote:
> >
> > > Rework the code a bit to speed up the 10-bit bitpacked decoding
> > > routine.  This is probably about as fast as I can get it without
> > > switching to assembly language.
> > >
> > > Demonstratable with:
> > >
> > > ./ffmpeg -f lavfi -i "smptehdbars=size=3840x2160" -c bitpacked -f image2
> > > -frames:v 1 source.yuv
> > > ./ffmpeg -f bitpacked -pix_fmt yuv422p10le -s 3840x2160 -c:v bitpacked -i
> > > source.yuv -pix_fmt yuv422p10le out.yuv
> > >
> > > On my development system, it went from 80ms for a 2160p frame
> > > down to 20ms (i.e. a 4X speedup).  Good enough for now, I hope...
> > >
> > >
> > FYI, on my development system, I run two time for the original and modified
> > version and no obvious difference:
> > ./ffmpeg -f lavfi -i "smptehdbars=size=3840x2160" -c bitpacked -frames:v 25
> > source.yuv
> > time ./ffmpeg -f bitpacked -pix_fmt yuv422p10le -s 3840x2160 -c:v bitpacked
> > -i source.yuv -pix_fmt yuv422p10le out.yuv
> > frame=   25 fps=0.0 q=-0.0 Lsize=  810000kB time=00:00:00.96
> > bitrate=6912000.0kbits/s speed=1.13x
> >
> > real 0m0.961s
> > user 0m1.086s
> > sys 0m1.360s
> >
> > frame=   25 fps=0.0 q=-0.0 Lsize=  810000kB time=00:00:00.96
> > bitrate=6912000.0kbits/s speed=1.16x
> >
> > real 0m0.936s
> > user 0m1.358s
> > sys 0m1.350s
> >
> > after apply the patch:
> > frame=   25 fps=0.0 q=-0.0 Lsize=  810000kB time=00:00:00.96
> > bitrate=6912000.0kbits/s speed=1.14x
> >
> > real 0m0.953s
> > user 0m0.906s
> > sys 0m1.438s
> >
> > frame=   25 fps=0.0 q=-0.0 Lsize=  810000kB time=00:00:00.96
> > bitrate=6912000.0kbits/s speed=1.17x
> >
> > real 0m0.922s
> > user 0m0.926s
> > sys 0m1.066s
> >
>
> Only 25 frames?
> This is flawed.
>
>
> >
> >
> >
> > > Signed-off-by: Devin Heitmueller <dheitmueller@ltnglobal.com>
> > > ---
> > >  libavcodec/bitpacked_dec.c | 17 +++++++----------
> > >  1 file changed, 7 insertions(+), 10 deletions(-)
> > >
> > > diff --git a/libavcodec/bitpacked_dec.c b/libavcodec/bitpacked_dec.c
> > > index a1ffef1..96aba27 100644
> > > --- a/libavcodec/bitpacked_dec.c
> > > +++ b/libavcodec/bitpacked_dec.c
> > > @@ -28,7 +28,6 @@
> > >
> > >  #include "avcodec.h"
> > >  #include "codec_internal.h"
> > > -#include "get_bits.h"
> > >  #include "libavutil/imgutils.h"
> > >  #include "thread.h"
> > >
> > > @@ -65,7 +64,7 @@ static int bitpacked_decode_yuv422p10(AVCodecContext
> > > *avctx, AVFrame *frame,
> > >  {
> > >      uint64_t frame_size = (uint64_t)avctx->width *
> > > (uint64_t)avctx->height * 20;
> > >      uint64_t packet_size = (uint64_t)avpkt->size * 8;
> > > -    GetBitContext bc;
> > > +    uint8_t *src;
> > >      uint16_t *y, *u, *v;
> > >      int ret, i, j;
> > >
> > > @@ -79,20 +78,18 @@ static int bitpacked_decode_yuv422p10(AVCodecContext
> > > *avctx, AVFrame *frame,
> > >      if (avctx->width % 2)
> > >          return AVERROR_PATCHWELCOME;
> > >
> > > -    ret = init_get_bits(&bc, avpkt->data, avctx->width * avctx->height *
> > > 20);
> > > -    if (ret)
> > > -        return ret;
> > > -
> > > +    src = avpkt->data;
> > >      for (i = 0; i < avctx->height; i++) {
> > >          y = (uint16_t*)(frame->data[0] + i * frame->linesize[0]);
> > >          u = (uint16_t*)(frame->data[1] + i * frame->linesize[1]);
> > >          v = (uint16_t*)(frame->data[2] + i * frame->linesize[2]);
> > >
> > >          for (j = 0; j < avctx->width; j += 2) {
> > > -            *u++ = get_bits(&bc, 10);
> > > -            *y++ = get_bits(&bc, 10);
> > > -            *v++ = get_bits(&bc, 10);
> > > -            *y++ = get_bits(&bc, 10);
> > > +            *u++ = (src[0] << 2) | (src[1] >> 6);
> > > +            *y++ = ((src[1] << 4) | (src[2] >> 4)) & 0x3ff;
> > > +            *v++ = ((src[2] << 6) | (src[3] >> 2)) & 0x3ff;
> > > +            *y++ = ((src[3] << 8) | (src[4]))      & 0x3ff;
> > > +            src += 5;
> > >          }
> > >      }
> > >
> > > --
> > > 1.8.3.1
> > >
> > > _______________________________________________
> > > ffmpeg-devel mailing list
> > > ffmpeg-devel@ffmpeg.org
> > > https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
> > >
> > > To unsubscribe, visit link above, or email
> > > ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".
> > >
> > _______________________________________________
> > ffmpeg-devel mailing list
> > ffmpeg-devel@ffmpeg.org
> > https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
> >
> > To unsubscribe, visit link above, or email
> > ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".
> >
> _______________________________________________
> ffmpeg-devel mailing list
> ffmpeg-devel@ffmpeg.org
> https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
>
> To unsubscribe, visit link above, or email
> ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".
James Almer May 6, 2023, 12:16 p.m. UTC | #5
On 5/6/2023 9:13 AM, Devin Heitmueller wrote:
> I added some instrumentation via the attached patch.  You can see the
> benefits here:
> 
> Before=1683378057.243350 After 1683378057.264239
> Before=1683378083.335424 After 1683378083.356440
> Before=1683378089.675400 After 1683378089.696512
> Before=1683378151.792324 After 1683378151.813579
> 21 ms per run
> 
> After patch:
> Before=1683378222.167796 After 1683378222.175760
> Before=1683378233.131416 After 1683378233.139326
> Before=1683378243.591895 After 1683378243.599840
> 8 ms per run
> 
> Note: this is a different platform than I did the original development
> on, and apparently the improvement on this particular box is only 2.5x
> rather than 4x.
> 
> Devin

Can you bench with the START_TIMER and STOP_TIMER macros in timer.h?
Also, define CACHED_BITSTREAM_READER in bitpacked_dec.c before including 
git_bits.h and test the actual implementation again, to see if it makes 
any difference.

> 
> 
> On Sat, May 6, 2023 at 7:53 AM Paul B Mahol <onemda@gmail.com> wrote:
>>
>> On Sat, May 6, 2023 at 1:32 PM Lance Wang <lance.lmwang@gmail.com> wrote:
>>
>>> On Sat, May 6, 2023 at 4:58 AM Devin Heitmueller <
>>> devin.heitmueller@ltnglobal.com> wrote:
>>>
>>>> Rework the code a bit to speed up the 10-bit bitpacked decoding
>>>> routine.  This is probably about as fast as I can get it without
>>>> switching to assembly language.
>>>>
>>>> Demonstratable with:
>>>>
>>>> ./ffmpeg -f lavfi -i "smptehdbars=size=3840x2160" -c bitpacked -f image2
>>>> -frames:v 1 source.yuv
>>>> ./ffmpeg -f bitpacked -pix_fmt yuv422p10le -s 3840x2160 -c:v bitpacked -i
>>>> source.yuv -pix_fmt yuv422p10le out.yuv
>>>>
>>>> On my development system, it went from 80ms for a 2160p frame
>>>> down to 20ms (i.e. a 4X speedup).  Good enough for now, I hope...
>>>>
>>>>
>>> FYI, on my development system, I run two time for the original and modified
>>> version and no obvious difference:
>>> ./ffmpeg -f lavfi -i "smptehdbars=size=3840x2160" -c bitpacked -frames:v 25
>>> source.yuv
>>> time ./ffmpeg -f bitpacked -pix_fmt yuv422p10le -s 3840x2160 -c:v bitpacked
>>> -i source.yuv -pix_fmt yuv422p10le out.yuv
>>> frame=   25 fps=0.0 q=-0.0 Lsize=  810000kB time=00:00:00.96
>>> bitrate=6912000.0kbits/s speed=1.13x
>>>
>>> real 0m0.961s
>>> user 0m1.086s
>>> sys 0m1.360s
>>>
>>> frame=   25 fps=0.0 q=-0.0 Lsize=  810000kB time=00:00:00.96
>>> bitrate=6912000.0kbits/s speed=1.16x
>>>
>>> real 0m0.936s
>>> user 0m1.358s
>>> sys 0m1.350s
>>>
>>> after apply the patch:
>>> frame=   25 fps=0.0 q=-0.0 Lsize=  810000kB time=00:00:00.96
>>> bitrate=6912000.0kbits/s speed=1.14x
>>>
>>> real 0m0.953s
>>> user 0m0.906s
>>> sys 0m1.438s
>>>
>>> frame=   25 fps=0.0 q=-0.0 Lsize=  810000kB time=00:00:00.96
>>> bitrate=6912000.0kbits/s speed=1.17x
>>>
>>> real 0m0.922s
>>> user 0m0.926s
>>> sys 0m1.066s
>>>
>>
>> Only 25 frames?
>> This is flawed.
>>
>>
>>>
>>>
>>>
>>>> Signed-off-by: Devin Heitmueller <dheitmueller@ltnglobal.com>
>>>> ---
>>>>   libavcodec/bitpacked_dec.c | 17 +++++++----------
>>>>   1 file changed, 7 insertions(+), 10 deletions(-)
>>>>
>>>> diff --git a/libavcodec/bitpacked_dec.c b/libavcodec/bitpacked_dec.c
>>>> index a1ffef1..96aba27 100644
>>>> --- a/libavcodec/bitpacked_dec.c
>>>> +++ b/libavcodec/bitpacked_dec.c
>>>> @@ -28,7 +28,6 @@
>>>>
>>>>   #include "avcodec.h"
>>>>   #include "codec_internal.h"
>>>> -#include "get_bits.h"
>>>>   #include "libavutil/imgutils.h"
>>>>   #include "thread.h"
>>>>
>>>> @@ -65,7 +64,7 @@ static int bitpacked_decode_yuv422p10(AVCodecContext
>>>> *avctx, AVFrame *frame,
>>>>   {
>>>>       uint64_t frame_size = (uint64_t)avctx->width *
>>>> (uint64_t)avctx->height * 20;
>>>>       uint64_t packet_size = (uint64_t)avpkt->size * 8;
>>>> -    GetBitContext bc;
>>>> +    uint8_t *src;
>>>>       uint16_t *y, *u, *v;
>>>>       int ret, i, j;
>>>>
>>>> @@ -79,20 +78,18 @@ static int bitpacked_decode_yuv422p10(AVCodecContext
>>>> *avctx, AVFrame *frame,
>>>>       if (avctx->width % 2)
>>>>           return AVERROR_PATCHWELCOME;
>>>>
>>>> -    ret = init_get_bits(&bc, avpkt->data, avctx->width * avctx->height *
>>>> 20);
>>>> -    if (ret)
>>>> -        return ret;
>>>> -
>>>> +    src = avpkt->data;
>>>>       for (i = 0; i < avctx->height; i++) {
>>>>           y = (uint16_t*)(frame->data[0] + i * frame->linesize[0]);
>>>>           u = (uint16_t*)(frame->data[1] + i * frame->linesize[1]);
>>>>           v = (uint16_t*)(frame->data[2] + i * frame->linesize[2]);
>>>>
>>>>           for (j = 0; j < avctx->width; j += 2) {
>>>> -            *u++ = get_bits(&bc, 10);
>>>> -            *y++ = get_bits(&bc, 10);
>>>> -            *v++ = get_bits(&bc, 10);
>>>> -            *y++ = get_bits(&bc, 10);
>>>> +            *u++ = (src[0] << 2) | (src[1] >> 6);
>>>> +            *y++ = ((src[1] << 4) | (src[2] >> 4)) & 0x3ff;
>>>> +            *v++ = ((src[2] << 6) | (src[3] >> 2)) & 0x3ff;
>>>> +            *y++ = ((src[3] << 8) | (src[4]))      & 0x3ff;
>>>> +            src += 5;
>>>>           }
>>>>       }
>>>>
>>>> --
>>>> 1.8.3.1
>>>>
>>>> _______________________________________________
>>>> ffmpeg-devel mailing list
>>>> ffmpeg-devel@ffmpeg.org
>>>> https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
>>>>
>>>> To unsubscribe, visit link above, or email
>>>> ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".
>>>>
>>> _______________________________________________
>>> ffmpeg-devel mailing list
>>> ffmpeg-devel@ffmpeg.org
>>> https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
>>>
>>> To unsubscribe, visit link above, or email
>>> ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".
>>>
>> _______________________________________________
>> ffmpeg-devel mailing list
>> ffmpeg-devel@ffmpeg.org
>> https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
>>
>> To unsubscribe, visit link above, or email
>> ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".
> 
> 
> 
> 
> _______________________________________________
> ffmpeg-devel mailing list
> ffmpeg-devel@ffmpeg.org
> https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
> 
> To unsubscribe, visit link above, or email
> ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".
Devin Heitmueller May 6, 2023, 12:40 p.m. UTC | #6
On Sat, May 6, 2023 at 8:16 AM James Almer <jamrial@gmail.com> wrote:
> Can you bench with the START_TIMER and STOP_TIMER macros in timer.h?
> Also, define CACHED_BITSTREAM_READER in bitpacked_dec.c before including
> git_bits.h and test the actual implementation again, to see if it makes
> any difference.

Original code:
671661910 decicycles in bitpacked_dec,       1 runs,      0 skips
669736380 decicycles in bitpacked_dec,       1 runs,      0 skips
669370700 decicycles in bitpacked_dec,       1 runs,      0 skips

Original code with CACHED_BITSTREAM_READER defined
352599030 decicycles in bitpacked_dec,       1 runs,      0 skips
336163810 decicycles in bitpacked_dec,       1 runs,      0 skips
344628350 decicycles in bitpacked_dec,       1 runs,      0 skips

My proposed versioned:
257353330 decicycles in bitpacked_dec,       1 runs,      0 skips
271527000 decicycles in bitpacked_dec,       1 runs,      0 skips
252701500 decicycles in bitpacked_dec,       1 runs,      0 skips

Devin
Lance Wang May 10, 2023, 11:16 a.m. UTC | #7
On Sat, May 6, 2023 at 8:41 PM Devin Heitmueller <
devin.heitmueller@ltnglobal.com> wrote:

> On Sat, May 6, 2023 at 8:16 AM James Almer <jamrial@gmail.com> wrote:
> > Can you bench with the START_TIMER and STOP_TIMER macros in timer.h?
> > Also, define CACHED_BITSTREAM_READER in bitpacked_dec.c before including
> > git_bits.h and test the actual implementation again, to see if it makes
> > any difference.
>
> Original code:
> 671661910 decicycles in bitpacked_dec,       1 runs,      0 skips
> 669736380 decicycles in bitpacked_dec,       1 runs,      0 skips
> 669370700 decicycles in bitpacked_dec,       1 runs,      0 skips
>
> Original code with CACHED_BITSTREAM_READER defined
> 352599030 decicycles in bitpacked_dec,       1 runs,      0 skips
> 336163810 decicycles in bitpacked_dec,       1 runs,      0 skips
> 344628350 decicycles in bitpacked_dec,       1 runs,      0 skips
>
> My proposed versioned:
> 257353330 decicycles in bitpacked_dec,       1 runs,      0 skips
> 271527000 decicycles in bitpacked_dec,       1 runs,      0 skips
> 252701500 decicycles in bitpacked_dec,       1 runs,      0 skips
>
>
Yes, it's show better performance, so LGTM if nobody have plan to optimize
the bitstream
function.

Devin
>
> --
> Devin Heitmueller, Senior Software Engineer
> LTN Global Communications
> o: +1 (301) 363-1001
> w: https://ltnglobal.com  e: devin.heitmueller@ltnglobal.com
> _______________________________________________
> ffmpeg-devel mailing list
> ffmpeg-devel@ffmpeg.org
> https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
>
> To unsubscribe, visit link above, or email
> ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".
>
Marton Balint May 11, 2023, 10:20 p.m. UTC | #8
On Wed, 10 May 2023, Lance Wang wrote:

> On Sat, May 6, 2023 at 8:41 PM Devin Heitmueller <
> devin.heitmueller@ltnglobal.com> wrote:
>
>> On Sat, May 6, 2023 at 8:16 AM James Almer <jamrial@gmail.com> wrote:
>> > Can you bench with the START_TIMER and STOP_TIMER macros in timer.h?
>> > Also, define CACHED_BITSTREAM_READER in bitpacked_dec.c before including
>> > git_bits.h and test the actual implementation again, to see if it makes
>> > any difference.
>>
>> Original code:
>> 671661910 decicycles in bitpacked_dec,       1 runs,      0 skips
>> 669736380 decicycles in bitpacked_dec,       1 runs,      0 skips
>> 669370700 decicycles in bitpacked_dec,       1 runs,      0 skips
>>
>> Original code with CACHED_BITSTREAM_READER defined
>> 352599030 decicycles in bitpacked_dec,       1 runs,      0 skips
>> 336163810 decicycles in bitpacked_dec,       1 runs,      0 skips
>> 344628350 decicycles in bitpacked_dec,       1 runs,      0 skips
>>
>> My proposed versioned:
>> 257353330 decicycles in bitpacked_dec,       1 runs,      0 skips
>> 271527000 decicycles in bitpacked_dec,       1 runs,      0 skips
>> 252701500 decicycles in bitpacked_dec,       1 runs,      0 skips
>>
>>
> Yes, it's show better performance, so LGTM if nobody have plan to optimize
> the bitstream
> function.

Actually the cached bitstream reader was faster here than the manual 
approach:

./ffmpeg -stream_loop 128 -threads 1 -f bitpacked -pix_fmt yuv422p10le -s 3840x2160 -c:v bitpacked -i source.yuv -pix_fmt yuv422p10le -f null none -loglevel error

Old code:

821050920 decicycles in bitpacked,       1 runs,      0 skips
815402160 decicycles in bitpacked,       2 runs,      0 skips
814108410 decicycles in bitpacked,       4 runs,      0 skips
814213800 decicycles in bitpacked,       8 runs,      0 skips
815048325 decicycles in bitpacked,      16 runs,      0 skips
812866713 decicycles in bitpacked,      32 runs,      0 skips
809186523 decicycles in bitpacked,      64 runs,      0 skips
808317601 decicycles in bitpacked,     128 runs,      0 skips

With the patch:

379879920 decicycles in bitpacked,       1 runs,      0 skips
387491580 decicycles in bitpacked,       2 runs,      0 skips
397720260 decicycles in bitpacked,       4 runs,      0 skips
389581560 decicycles in bitpacked,       8 runs,      0 skips
381820635 decicycles in bitpacked,      16 runs,      0 skips
379791675 decicycles in bitpacked,      32 runs,      0 skips
379246303 decicycles in bitpacked,      64 runs,      0 skips
379221671 decicycles in bitpacked,     128 runs,      0 skips

Old code and #defined CACHED_BITSTREAM_READER 1

345122280 decicycles in bitpacked,       1 runs,      0 skips
343663020 decicycles in bitpacked,       2 runs,      0 skips
343372680 decicycles in bitpacked,       4 runs,      0 skips
342554535 decicycles in bitpacked,       8 runs,      0 skips
340816522 decicycles in bitpacked,      16 runs,      0 skips
340225672 decicycles in bitpacked,      32 runs,      0 skips
340283520 decicycles in bitpacked,      64 runs,      0 skips
339643105 decicycles in bitpacked,     128 runs,      0 skips

Regards,
Marton
Devin Heitmueller May 12, 2023, 3:26 p.m. UTC | #9
On Thu, May 11, 2023 at 6:20 PM Marton Balint <cus@passwd.hu> wrote:
> Actually the cached bitstream reader was faster here than the manual
> approach:
>
> ./ffmpeg -stream_loop 128 -threads 1 -f bitpacked -pix_fmt yuv422p10le -s 3840x2160 -c:v bitpacked -i source.yuv -pix_fmt yuv422p10le -f null none -loglevel error
>
> Old code:
>
> 821050920 decicycles in bitpacked,       1 runs,      0 skips
> 815402160 decicycles in bitpacked,       2 runs,      0 skips
> 814108410 decicycles in bitpacked,       4 runs,      0 skips
> 814213800 decicycles in bitpacked,       8 runs,      0 skips
> 815048325 decicycles in bitpacked,      16 runs,      0 skips
> 812866713 decicycles in bitpacked,      32 runs,      0 skips
> 809186523 decicycles in bitpacked,      64 runs,      0 skips
> 808317601 decicycles in bitpacked,     128 runs,      0 skips
>
> With the patch:
>
> 379879920 decicycles in bitpacked,       1 runs,      0 skips
> 387491580 decicycles in bitpacked,       2 runs,      0 skips
> 397720260 decicycles in bitpacked,       4 runs,      0 skips
> 389581560 decicycles in bitpacked,       8 runs,      0 skips
> 381820635 decicycles in bitpacked,      16 runs,      0 skips
> 379791675 decicycles in bitpacked,      32 runs,      0 skips
> 379246303 decicycles in bitpacked,      64 runs,      0 skips
> 379221671 decicycles in bitpacked,     128 runs,      0 skips
>
> Old code and #defined CACHED_BITSTREAM_READER 1
>
> 345122280 decicycles in bitpacked,       1 runs,      0 skips
> 343663020 decicycles in bitpacked,       2 runs,      0 skips
> 343372680 decicycles in bitpacked,       4 runs,      0 skips
> 342554535 decicycles in bitpacked,       8 runs,      0 skips
> 340816522 decicycles in bitpacked,      16 runs,      0 skips
> 340225672 decicycles in bitpacked,      32 runs,      0 skips
> 340283520 decicycles in bitpacked,      64 runs,      0 skips
> 339643105 decicycles in bitpacked,     128 runs,      0 skips

I don't have a good explanation for this.  I could speculate that some
of it comes down to the processor architecture, how much onboard cache
it has, gcc version (and what sort of optimization/vectorization it
does, if any), etc.  In my case I was testing on Haswell and Skylake
(both with 12MB cache) with gcc 4.8.

I would welcome feedback from others.

Looking at the code to libavcodec/git_bits.h, it might also be worth
looking at setting #define LONG_BITSTREAM_READER, as that might speed
things up as well for such large files.

Devin
Paul B Mahol June 12, 2023, 4:05 p.m. UTC | #10
On Fri, May 12, 2023 at 12:20 AM Marton Balint <cus@passwd.hu> wrote:

>
>
> On Wed, 10 May 2023, Lance Wang wrote:
>
> > On Sat, May 6, 2023 at 8:41 PM Devin Heitmueller <
> > devin.heitmueller@ltnglobal.com> wrote:
> >
> >> On Sat, May 6, 2023 at 8:16 AM James Almer <jamrial@gmail.com> wrote:
> >> > Can you bench with the START_TIMER and STOP_TIMER macros in timer.h?
> >> > Also, define CACHED_BITSTREAM_READER in bitpacked_dec.c before
> including
> >> > git_bits.h and test the actual implementation again, to see if it
> makes
> >> > any difference.
> >>
> >> Original code:
> >> 671661910 decicycles in bitpacked_dec,       1 runs,      0 skips
> >> 669736380 decicycles in bitpacked_dec,       1 runs,      0 skips
> >> 669370700 decicycles in bitpacked_dec,       1 runs,      0 skips
> >>
> >> Original code with CACHED_BITSTREAM_READER defined
> >> 352599030 decicycles in bitpacked_dec,       1 runs,      0 skips
> >> 336163810 decicycles in bitpacked_dec,       1 runs,      0 skips
> >> 344628350 decicycles in bitpacked_dec,       1 runs,      0 skips
> >>
> >> My proposed versioned:
> >> 257353330 decicycles in bitpacked_dec,       1 runs,      0 skips
> >> 271527000 decicycles in bitpacked_dec,       1 runs,      0 skips
> >> 252701500 decicycles in bitpacked_dec,       1 runs,      0 skips
> >>
> >>
> > Yes, it's show better performance, so LGTM if nobody have plan to
> optimize
> > the bitstream
> > function.
>
> Actually the cached bitstream reader was faster here than the manual
> approach:
>
> ./ffmpeg -stream_loop 128 -threads 1 -f bitpacked -pix_fmt yuv422p10le -s
> 3840x2160 -c:v bitpacked -i source.yuv -pix_fmt yuv422p10le -f null none
> -loglevel error
>
> Old code:
>
> 821050920 decicycles in bitpacked,       1 runs,      0 skips
> 815402160 decicycles in bitpacked,       2 runs,      0 skips
> 814108410 decicycles in bitpacked,       4 runs,      0 skips
> 814213800 decicycles in bitpacked,       8 runs,      0 skips
> 815048325 decicycles in bitpacked,      16 runs,      0 skips
> 812866713 decicycles in bitpacked,      32 runs,      0 skips
> 809186523 decicycles in bitpacked,      64 runs,      0 skips
> 808317601 decicycles in bitpacked,     128 runs,      0 skips
>
> With the patch:
>
> 379879920 decicycles in bitpacked,       1 runs,      0 skips
> 387491580 decicycles in bitpacked,       2 runs,      0 skips
> 397720260 decicycles in bitpacked,       4 runs,      0 skips
> 389581560 decicycles in bitpacked,       8 runs,      0 skips
> 381820635 decicycles in bitpacked,      16 runs,      0 skips
> 379791675 decicycles in bitpacked,      32 runs,      0 skips
> 379246303 decicycles in bitpacked,      64 runs,      0 skips
> 379221671 decicycles in bitpacked,     128 runs,      0 skips
>
> Old code and #defined CACHED_BITSTREAM_READER 1
>
> 345122280 decicycles in bitpacked,       1 runs,      0 skips
> 343663020 decicycles in bitpacked,       2 runs,      0 skips
> 343372680 decicycles in bitpacked,       4 runs,      0 skips
> 342554535 decicycles in bitpacked,       8 runs,      0 skips
> 340816522 decicycles in bitpacked,      16 runs,      0 skips
> 340225672 decicycles in bitpacked,      32 runs,      0 skips
> 340283520 decicycles in bitpacked,      64 runs,      0 skips
> 339643105 decicycles in bitpacked,     128 runs,      0 skips
>

Could someone send patch for this ?


>
> Regards,
> Marton
> _______________________________________________
> ffmpeg-devel mailing list
> ffmpeg-devel@ffmpeg.org
> https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
>
> To unsubscribe, visit link above, or email
> ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".
>
diff mbox series

Patch

diff --git a/libavcodec/bitpacked_dec.c b/libavcodec/bitpacked_dec.c
index a1ffef1..96aba27 100644
--- a/libavcodec/bitpacked_dec.c
+++ b/libavcodec/bitpacked_dec.c
@@ -28,7 +28,6 @@ 
 
 #include "avcodec.h"
 #include "codec_internal.h"
-#include "get_bits.h"
 #include "libavutil/imgutils.h"
 #include "thread.h"
 
@@ -65,7 +64,7 @@  static int bitpacked_decode_yuv422p10(AVCodecContext *avctx, AVFrame *frame,
 {
     uint64_t frame_size = (uint64_t)avctx->width * (uint64_t)avctx->height * 20;
     uint64_t packet_size = (uint64_t)avpkt->size * 8;
-    GetBitContext bc;
+    uint8_t *src;
     uint16_t *y, *u, *v;
     int ret, i, j;
 
@@ -79,20 +78,18 @@  static int bitpacked_decode_yuv422p10(AVCodecContext *avctx, AVFrame *frame,
     if (avctx->width % 2)
         return AVERROR_PATCHWELCOME;
 
-    ret = init_get_bits(&bc, avpkt->data, avctx->width * avctx->height * 20);
-    if (ret)
-        return ret;
-
+    src = avpkt->data;
     for (i = 0; i < avctx->height; i++) {
         y = (uint16_t*)(frame->data[0] + i * frame->linesize[0]);
         u = (uint16_t*)(frame->data[1] + i * frame->linesize[1]);
         v = (uint16_t*)(frame->data[2] + i * frame->linesize[2]);
 
         for (j = 0; j < avctx->width; j += 2) {
-            *u++ = get_bits(&bc, 10);
-            *y++ = get_bits(&bc, 10);
-            *v++ = get_bits(&bc, 10);
-            *y++ = get_bits(&bc, 10);
+            *u++ = (src[0] << 2) | (src[1] >> 6);
+            *y++ = ((src[1] << 4) | (src[2] >> 4)) & 0x3ff;
+            *v++ = ((src[2] << 6) | (src[3] >> 2)) & 0x3ff;
+            *y++ = ((src[3] << 8) | (src[4]))      & 0x3ff;
+            src += 5;
         }
     }