diff mbox series

[FFmpeg-devel] MULTI VLC decoding boost

Message ID CAPYw7P4LYXDZw070aFxFZFtNuJcX9ZKLfR4ov2026JWykACsVQ@mail.gmail.com
State New
Headers show
Series [FFmpeg-devel] MULTI VLC decoding boost | expand

Checks

Context Check Description
yinshiyou/make_loongarch64 fail Make failed
andriy/make_x86 fail Make failed

Commit Message

Paul B Mahol Aug. 28, 2023, 5:36 p.m. UTC
Patches attached.

Thanks for kurosu for pointing unmerged branches.

The UNCACHED_PATH for x86_32 is broken with this for 2 codecs it touches.
Fix is trivial, to be fixed later.

Comments

Paul B Mahol Sept. 4, 2023, 4:08 p.m. UTC | #1
Will apply soon.
Michael Niedermayer Sept. 14, 2023, 10:01 p.m. UTC | #2
On Mon, Aug 28, 2023 at 07:36:17PM +0200, Paul B Mahol wrote:
> Patches attached.
> 
> Thanks for kurosu for pointing unmerged branches.
> 
> The UNCACHED_PATH for x86_32 is broken with this for 2 codecs it touches.
> Fix is trivial, to be fixed later.
[...]

> +int ff_init_vlc_multi_from_lengths(VLC *vlc, VLC_MULTI *multi, int nb_bits, int nb_elems,
> +		                   int nb_codes, const int8_t *lens, int lens_wrap,
> +                                   const void *symbols, int symbols_wrap, int symbols_size,
> +                                   int offset, int flags, void *logctx)
> +{
> +    VLCcode localbuf[LOCALBUF_ELEMS], *buf = localbuf;
> +    uint64_t code;
> +    int ret, j, len_max = FFMIN(32, 3 * nb_bits);
> +
> +    ret = vlc_common_init(vlc, nb_bits, nb_codes, &buf, flags);
> +    if (ret < 0)
> +        return ret;
> +
> +    multi->table = av_malloc(sizeof(*multi->table) << nb_bits);
> +    if (!multi->table)
> +        return AVERROR(ENOMEM);
> +
> +    j = code = 0;
> +    for (int i = 0; i < nb_codes; i++, lens += lens_wrap) {
> +        int len = *lens;
> +        if (len > 0) {
> +            unsigned sym;
> +
> +            buf[j].bits = len;
> +            if (symbols)
> +                GET_DATA(sym, symbols, i, symbols_wrap, symbols_size)
> +            else
> +                sym = i;
> +            buf[j].symbol = sym + offset;
> +            buf[j++].code = code;
> +        } else if (len <  0) {
> +            len = -len;
> +        } else
> +            continue;
> +        if (len > len_max || code & ((1U << (32 - len)) - 1)) {
> +            av_log(logctx, AV_LOG_ERROR, "Invalid VLC (length %u)\n", len);
> +            goto fail;
> +        }
> +        code += 1U << (32 - len);
> +        if (code > UINT32_MAX + 1ULL) {
> +            av_log(logctx, AV_LOG_ERROR, "Overdetermined VLC tree\n");
> +            goto fail;
> +        }
> +    }
> +    ret = vlc_common_end(vlc, nb_bits, j, buf, flags, localbuf);
> +    if (ret < 0)
> +        goto fail;
> +    return vlc_multi_gen(multi->table, vlc, nb_elems, j, nb_bits, buf, logctx);
> +fail:
> +    if (buf != localbuf)
> +        av_free(buf);
> +    return AVERROR_INVALIDDATA;
> +}

this is copy and pasted from

int ff_vlc_init_from_lengths(VLC *vlc, int nb_bits, int nb_codes,

leading to code duplication, it would be better if you could
factor the duplication out

thx

[...]
Paul B Mahol Sept. 14, 2023, 10:05 p.m. UTC | #3
On Fri, Sep 15, 2023 at 12:01 AM Michael Niedermayer <michael@niedermayer.cc>
wrote:

> On Mon, Aug 28, 2023 at 07:36:17PM +0200, Paul B Mahol wrote:
> > Patches attached.
> >
> > Thanks for kurosu for pointing unmerged branches.
> >
> > The UNCACHED_PATH for x86_32 is broken with this for 2 codecs it touches.
> > Fix is trivial, to be fixed later.
> [...]
>
> > +int ff_init_vlc_multi_from_lengths(VLC *vlc, VLC_MULTI *multi, int
> nb_bits, int nb_elems,
> > +                                int nb_codes, const int8_t *lens, int
> lens_wrap,
> > +                                   const void *symbols, int
> symbols_wrap, int symbols_size,
> > +                                   int offset, int flags, void *logctx)
> > +{
> > +    VLCcode localbuf[LOCALBUF_ELEMS], *buf = localbuf;
> > +    uint64_t code;
> > +    int ret, j, len_max = FFMIN(32, 3 * nb_bits);
> > +
> > +    ret = vlc_common_init(vlc, nb_bits, nb_codes, &buf, flags);
> > +    if (ret < 0)
> > +        return ret;
> > +
> > +    multi->table = av_malloc(sizeof(*multi->table) << nb_bits);
> > +    if (!multi->table)
> > +        return AVERROR(ENOMEM);
> > +
> > +    j = code = 0;
> > +    for (int i = 0; i < nb_codes; i++, lens += lens_wrap) {
> > +        int len = *lens;
> > +        if (len > 0) {
> > +            unsigned sym;
> > +
> > +            buf[j].bits = len;
> > +            if (symbols)
> > +                GET_DATA(sym, symbols, i, symbols_wrap, symbols_size)
> > +            else
> > +                sym = i;
> > +            buf[j].symbol = sym + offset;
> > +            buf[j++].code = code;
> > +        } else if (len <  0) {
> > +            len = -len;
> > +        } else
> > +            continue;
> > +        if (len > len_max || code & ((1U << (32 - len)) - 1)) {
> > +            av_log(logctx, AV_LOG_ERROR, "Invalid VLC (length %u)\n",
> len);
> > +            goto fail;
> > +        }
> > +        code += 1U << (32 - len);
> > +        if (code > UINT32_MAX + 1ULL) {
> > +            av_log(logctx, AV_LOG_ERROR, "Overdetermined VLC tree\n");
> > +            goto fail;
> > +        }
> > +    }
> > +    ret = vlc_common_end(vlc, nb_bits, j, buf, flags, localbuf);
> > +    if (ret < 0)
> > +        goto fail;
> > +    return vlc_multi_gen(multi->table, vlc, nb_elems, j, nb_bits, buf,
> logctx);
> > +fail:
> > +    if (buf != localbuf)
> > +        av_free(buf);
> > +    return AVERROR_INVALIDDATA;
> > +}
>
> this is copy and pasted from
>
> int ff_vlc_init_from_lengths(VLC *vlc, int nb_bits, int nb_codes,
>
> leading to code duplication, it would be better if you could
> factor the duplication out
>
> thx
>

-1


>
> [...]
>
> --
> Michael     GnuPG fingerprint: 9FF2128B147EF6730BADF133611EC787040B0FAB
>
> "You are 36 times more likely to die in a bathtub than at the hands of a
> terrorist. Also, you are 2.5 times more likely to become a president and
> 2 times more likely to become an astronaut, than to die in a terrorist
> attack." -- Thoughty2
>
> _______________________________________________
> ffmpeg-devel mailing list
> ffmpeg-devel@ffmpeg.org
> https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
>
> To unsubscribe, visit link above, or email
> ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".
>
Michael Niedermayer Oct. 22, 2023, 6:01 p.m. UTC | #4
On Mon, Aug 28, 2023 at 07:36:17PM +0200, Paul B Mahol wrote:
> Patches attached.
> 
> Thanks for kurosu for pointing unmerged branches.
> 


[...]

> +static void add_level(VLC_MULTI_ELEM *table, const int nb_elems,
> +		      const int num, const int numbits,
> +                      const VLCcode *buf,
> +                      uint32_t curcode, int curlen,
> +                      int curlimit, int curlevel,
> +                      const int minlen, const int max,
> +                      unsigned* levelcnt, VLC_MULTI_ELEM *info)
> +{

> +    if (nb_elems > 256 && curlevel > 2)
> +        return; // No room

this and


> +    for (int i = num-1; i > max; i--) {
> +        for (int j = 0; j < 2; j++) {
> +            int newlimit, sym;
> +            int t = j ? i-1 : i;
> +            int l = buf[t].bits;
> +            uint32_t code;
> +
> +            sym = buf[t].symbol;
> +            if (l > curlimit)
> +                return;
> +            code = curcode + (buf[t].code >> curlen);
> +            newlimit = curlimit - l;
> +            l  += curlen;
> +            if (nb_elems>256) AV_WN16(info->val+2*curlevel, sym);
> +            else info->val[curlevel] = sym&0xFF;
> +
> +            if (curlevel) { // let's not add single entries
> +                uint32_t val = code >> (32 - numbits);
> +                uint32_t  nb = val + (1U << (numbits - l));
> +                info->len = l;
> +                info->num = curlevel+1;
> +                for (; val < nb; val++)
> +                    AV_COPY64(table+val, info);
> +                levelcnt[curlevel-1]++;
> +            }
> +

> +            if (curlevel+1 < VLC_MULTI_MAX_SYMBOLS && newlimit >= minlen) {

this are 2 checks doing the same thing for 8 and 16 bit
what mess is this ?
for 8bit we have VLC_MULTI_MAX_SYMBOLS space (6) in the array so we skip beyond that
for 16bit we have VLC_MULTI_MAX_SYMBOLS/2 space which is 3 and the skip instead
is inside add_level() above with hardcoded litteral number
(nb_elems > 256 is a check for if its 8 or 16bit)

why is such totally hacked up code pushed with standing objections and no
review ?

yes, ill fix this one but i have the feeling this code has more surprises


> +                add_level(table, nb_elems, num, numbits, buf,
> +                          code, l, newlimit, curlevel+1,
> +                          minlen, max, levelcnt, info);
> +            }
> +        }
> +    }
> +}

[...]
diff mbox series

Patch

From 4250d74dad2bfb4c8d01fc26c9635c56293fc74c Mon Sep 17 00:00:00 2001
From: Christophe Gisquet <christophe.gisquet@gmail.com>
Date: Sun, 9 Jul 2017 12:56:35 +0000
Subject: [PATCH 3/3] avcodec/utvideodec: add vlc multi support

Faster decoding, by average 50% faster overall.

Signed-off-by: Paul B Mahol <onemda@gmail.com>
---
 libavcodec/utvideo.h    |  1 +
 libavcodec/utvideodec.c | 91 ++++++++++++++++++++---------------------
 2 files changed, 45 insertions(+), 47 deletions(-)

diff --git a/libavcodec/utvideo.h b/libavcodec/utvideo.h
index 9da9329ff3..d274b6586d 100644
--- a/libavcodec/utvideo.h
+++ b/libavcodec/utvideo.h
@@ -80,6 +80,7 @@  typedef struct UtvideoContext {
 
     ptrdiff_t slice_stride;
     uint8_t *slice_bits, *slice_buffer[4];
+    void    *buffer;
     int      slice_bits_size;
 
     const uint8_t *packed_stream[4][256];
diff --git a/libavcodec/utvideodec.c b/libavcodec/utvideodec.c
index 1f00c58950..0b0352b7ec 100644
--- a/libavcodec/utvideodec.c
+++ b/libavcodec/utvideodec.c
@@ -46,7 +46,7 @@  typedef struct HuffEntry {
 } HuffEntry;
 
 static int build_huff(UtvideoContext *c, const uint8_t *src, VLC *vlc,
-                      int *fsym, unsigned nb_elems)
+                      VLC_MULTI *multi, int *fsym, unsigned nb_elems)
 {
     int i;
     HuffEntry he[1024];
@@ -82,11 +82,34 @@  static int build_huff(UtvideoContext *c, const uint8_t *src, VLC *vlc,
         he[--codes_count[bits[i]]] = (HuffEntry) { bits[i], i };
 
 #define VLC_BITS 11
-    return ff_init_vlc_from_lengths(vlc, VLC_BITS, codes_count[0],
+    return ff_init_vlc_multi_from_lengths(vlc, multi, VLC_BITS, nb_elems, codes_count[0],
                                     &he[0].len, sizeof(*he),
                                     &he[0].sym, sizeof(*he), 2, 0, 0, c->avctx);
 }
 
+#define READ_PLANE(b, end) \
+{ \
+    buf = !use_pred ? dest : c->buffer; \
+    for (i = 0; CACHED_BITSTREAM_READER && i < width-end && get_bits_left(&gb) > 0;) {\
+        ret = get_vlc_multi(&gb, (uint8_t *)buf + i * b, multi.table, \
+                            vlc.table, VLC_BITS, 3); \
+        if (ret > 0) \
+            i += ret; \
+        if (ret <= 0) \
+            goto fail; \
+    } \
+    for (; i < width && get_bits_left(&gb) > 0; i++) \
+        buf[i] = get_vlc2(&gb, vlc.table, VLC_BITS, 3); \
+    if (use_pred) { \
+        if (b == 2) \
+            c->llviddsp.add_left_pred_int16((uint16_t *)dest, (const uint16_t *)buf, 0x3ff, width, prev); \
+        else \
+            c->llviddsp.add_left_pred((uint8_t *)dest, (const uint8_t *)buf, width, prev); \
+    } \
+    prev = dest[width-1]; \
+    dest += stride; \
+}
+
 static int decode_plane10(UtvideoContext *c, int plane_no,
                           uint16_t *dst, ptrdiff_t stride,
                           int width, int height,
@@ -95,11 +118,12 @@  static int decode_plane10(UtvideoContext *c, int plane_no,
 {
     int i, j, slice, pix, ret;
     int sstart, send;
+    VLC_MULTI multi;
     VLC vlc;
     GetBitContext gb;
     int prev, fsym;
 
-    if ((ret = build_huff(c, huff, &vlc, &fsym, 1024)) < 0) {
+    if ((ret = build_huff(c, huff, &vlc, &multi, &fsym, 1024)) < 0) {
         av_log(c->avctx, AV_LOG_ERROR, "Cannot build Huffman codes\n");
         return ret;
     }
@@ -131,7 +155,7 @@  static int decode_plane10(UtvideoContext *c, int plane_no,
 
     send = 0;
     for (slice = 0; slice < c->slices; slice++) {
-        uint16_t *dest;
+        uint16_t *dest, *buf;
         int slice_data_start, slice_data_end, slice_size;
 
         sstart = send;
@@ -156,37 +180,20 @@  static int decode_plane10(UtvideoContext *c, int plane_no,
         init_get_bits(&gb, c->slice_bits, slice_size * 8);
 
         prev = 0x200;
-        for (j = sstart; j < send; j++) {
-            for (i = 0; i < width; i++) {
-                pix = get_vlc2(&gb, vlc.table, VLC_BITS, 3);
-                if (pix < 0) {
-                    av_log(c->avctx, AV_LOG_ERROR, "Decoding error\n");
-                    goto fail;
-                }
-                if (use_pred) {
-                    prev += pix;
-                    prev &= 0x3FF;
-                    pix   = prev;
-                }
-                dest[i] = pix;
-            }
-            dest += stride;
-            if (get_bits_left(&gb) < 0) {
-                av_log(c->avctx, AV_LOG_ERROR,
-                        "Slice decoding ran out of bits\n");
-                goto fail;
-            }
-        }
+        for (j = sstart; j < send; j++)
+            READ_PLANE(2, 3)
         if (get_bits_left(&gb) > 32)
             av_log(c->avctx, AV_LOG_WARNING,
                    "%d bits left after decoding slice\n", get_bits_left(&gb));
     }
 
     ff_free_vlc(&vlc);
+    ff_free_vlc_multi(&multi);
 
     return 0;
 fail:
     ff_free_vlc(&vlc);
+    ff_free_vlc_multi(&multi);
     return AVERROR_INVALIDDATA;
 }
 
@@ -207,6 +214,7 @@  static int decode_plane(UtvideoContext *c, int plane_no,
 {
     int i, j, slice, pix;
     int sstart, send;
+    VLC_MULTI multi;
     VLC vlc;
     GetBitContext gb;
     int ret, prev, fsym;
@@ -259,7 +267,7 @@  static int decode_plane(UtvideoContext *c, int plane_no,
         return 0;
     }
 
-    if (build_huff(c, src, &vlc, &fsym, 256)) {
+    if (build_huff(c, src, &vlc, &multi, &fsym, 256)) {
         av_log(c->avctx, AV_LOG_ERROR, "Cannot build Huffman codes\n");
         return AVERROR_INVALIDDATA;
     }
@@ -292,7 +300,7 @@  static int decode_plane(UtvideoContext *c, int plane_no,
 
     send = 0;
     for (slice = 0; slice < c->slices; slice++) {
-        uint8_t *dest;
+        uint8_t *dest, *buf;
         int slice_data_start, slice_data_end, slice_size;
 
         sstart = send;
@@ -317,36 +325,20 @@  static int decode_plane(UtvideoContext *c, int plane_no,
         init_get_bits(&gb, c->slice_bits, slice_size * 8);
 
         prev = 0x80;
-        for (j = sstart; j < send; j++) {
-            for (i = 0; i < width; i++) {
-                pix = get_vlc2(&gb, vlc.table, VLC_BITS, 3);
-                if (pix < 0) {
-                    av_log(c->avctx, AV_LOG_ERROR, "Decoding error\n");
-                    goto fail;
-                }
-                if (use_pred) {
-                    prev += pix;
-                    pix   = prev;
-                }
-                dest[i] = pix;
-            }
-            if (get_bits_left(&gb) < 0) {
-                av_log(c->avctx, AV_LOG_ERROR,
-                        "Slice decoding ran out of bits\n");
-                goto fail;
-            }
-            dest += stride;
-        }
+        for (j = sstart; j < send; j++)
+            READ_PLANE(1, 5)
         if (get_bits_left(&gb) > 32)
             av_log(c->avctx, AV_LOG_WARNING,
                    "%d bits left after decoding slice\n", get_bits_left(&gb));
     }
 
     ff_free_vlc(&vlc);
+    ff_free_vlc_multi(&multi);
 
     return 0;
 fail:
     ff_free_vlc(&vlc);
+    ff_free_vlc_multi(&multi);
     return AVERROR_INVALIDDATA;
 }
 
@@ -992,6 +984,10 @@  static av_cold int decode_init(AVCodecContext *avctx)
         return AVERROR_INVALIDDATA;
     }
 
+    c->buffer = av_calloc(avctx->width, c->pro?2:1);
+    if (!c->buffer)
+        return AVERROR(ENOMEM);
+
     av_pix_fmt_get_chroma_sub_sample(avctx->pix_fmt, &h_shift, &v_shift);
     if ((avctx->width  & ((1<<h_shift)-1)) ||
         (avctx->height & ((1<<v_shift)-1))) {
@@ -1047,6 +1043,7 @@  static av_cold int decode_end(AVCodecContext *avctx)
     UtvideoContext * const c = avctx->priv_data;
 
     av_freep(&c->slice_bits);
+    av_freep(&c->buffer);
 
     return 0;
 }
-- 
2.39.1