[FFmpeg-devel] patch: the fastest video decoder ever? :)

Submitted by u-9iep@aetey.se on Jan. 27, 2017, 2:56 p.m.

Details

Message ID 20170127145655.GI1516@example.net
State New
Headers show

Commit Message

u-9iep@aetey.se Jan. 27, 2017, 2:56 p.m.
Here comes a slightly better version, with rounding to nearest instead
of rounding down at color bits truncation. I was unable to reliably measure
any speed difference compared to the simplistic former version.

The output quality now fully corresponds to the capabilities of rgb565
and the decoding is 12 - 17% faster than to rgb24.

The most remarkable gain of course is by skipping any extra format
conversion when on an rgb565 screen.

Posting the patch for perusal of possible interested parties.

A runtime output format option might be worth to be added,
and possibly support for more output formats as well?

Cheers,
Rune

On Fri, Jan 06, 2017 at 02:13:41PM +0100, u-9iep@aetey.se wrote:
> Hello,
> 
> On slow hardware a 16-bit framebuffer depth makes a remarkable difference
> in performance and still provides a good look.
> 
> When videos are to be played on slow terminals there is a single
> best speed performer, cinepak (tell me if you know a better codec in this
> respect, the only comparable one I know of is VQA-15, but there is no
> reasonable encoder for it, nor a safe decoder).
> 
> Unfortunately cinepak as present in ffmpeg yields rgb24 which has to be
> repacked depending on the framebuffer format.
> 
> The attached patch makes cinepak to provide rgb565 in native endianness.
> This cuts in half (!) the mplayer CPU consumption on i686 with Xorg
> in the corresponding mode/depth, compared to decoding to rgb24.
> 
> The optimization is possible because cinepak is a VQ codec and pixel
> format packing at codebook fill is much more efficient than at a later
> stage. (Thanks Michael for the suggestion, long ago).
> 
> Of course the picture quality is slightly affected if this decoder version
> is used with larger framebuffer depths. A run-time switch would be to
> prefer, if this optimization against all odds would be welcome upstream :)
> 
> I am posting this as a proof of concept, lacking run-time format
> switching. As said, this patch yields half CPU consumption or double
> speed at decoding for rgb565 terminals.

Patch hide | download patch | download mbox

--- libavcodec/cinepak.c.rgb24	2017-01-27 11:54:31.316799141 +0100
+++ libavcodec/cinepak.c	2017-01-27 12:25:33.674095602 +0100
@@ -31,6 +31,8 @@ 
  *
  * Cinepak colorspace support (c) 2013 Rl, Aetey Global Technologies AB
  * @author Cinepak colorspace, Rl, Aetey Global Technologies AB
+ * Cinepak rgb565 format (c) 2017 Rl, Aetey Global Technologies AB
+ * @author Cinepak rgb565, Rl, Aetey Global Technologies AB
  */
 
 #include <stdio.h>
@@ -42,8 +44,12 @@ 
 #include "avcodec.h"
 #include "internal.h"
 
+/* feel free to replace with a better mapping implementation
+ * (keeping in mind slow, not very "intelligent" hardware)
+ */
+#define PACK_RGB_RGB565(r,g,b) (((av_clip_uint8((r)+4)>>3)<<11)|((av_clip_uint8((g)+2)>>2)<<5)|(av_clip_uint8((b)+4)>>3))
 
-typedef uint8_t cvid_codebook[12];
+typedef uint16_t cvid_codebook[4]; /* in the native endian rgb565 format */
 
 #define MAX_STRIPS      32
 
@@ -73,13 +79,13 @@ 
     uint32_t pal[256];
 } CinepakContext;
 
-static void cinepak_decode_codebook (cvid_codebook *codebook,
+static void cinepak_decode_codebook (cvid_codebook *codebook, int palette_video,
                                      int chunk_id, int size, const uint8_t *data)
 {
     const uint8_t *eod = (data + size);
     uint32_t flag, mask;
     int      i, n;
-    uint8_t *p;
+    uint16_t *p;
 
     /* check if this chunk contains 4- or 6-element vectors */
     n    = (chunk_id & 0x04) ? 4 : 6;
@@ -98,33 +104,36 @@ 
         }
 
         if (!(chunk_id & 0x01) || (flag & mask)) {
-            int k, kk;
+            int k;
 
             if ((data + n) > eod)
                 break;
 
-            for (k = 0; k < 4; ++k) {
-                int r = *data++;
-                for (kk = 0; kk < 3; ++kk)
-                    *p++ = r;
-            }
-            if (n == 6) {
-                int r, g, b, u, v;
+            if (n == 4)
+/* 8-bit palette indexes or otherwise grayscale values,
+ * they need different handling */
+                if (palette_video)
+                    for (k = 0; k < 4; ++k)
+                        *p++ = *data++;
+                else
+                    for (k = 0; k < 4; ++k) {
+                        int r = *data++;
+                        *p++ = PACK_RGB_RGB565(r,r,r);
+                    }
+            else {
+                int y[4], u, v;
+                for (k = 0; k < 4; ++k)
+/* 8-bit luminance values */
+                    y[k] = *data++;
                 u = *(int8_t *)data++;
                 v = *(int8_t *)data++;
-                p -= 12;
-                for(k=0; k<4; ++k) {
-                    r = *p++ + v*2;
-                    g = *p++ - (u/2) - v;
-                    b = *p   + u*2;
-                    p -= 2;
-                    *p++ = av_clip_uint8(r);
-                    *p++ = av_clip_uint8(g);
-                    *p++ = av_clip_uint8(b);
-                }
+                for(k=0; k<4; ++k)
+                    *p++ = PACK_RGB_RGB565(y[k] + v*2,
+                                           y[k] - (u/2) - v,
+                                           y[k] + u*2);
             }
         } else {
-            p += 12;
+            p += 4;
         }
     }
 }
@@ -134,8 +143,8 @@ 
 {
     const uint8_t   *eod = (data + size);
     uint32_t         flag, mask;
-    uint8_t         *cb0, *cb1, *cb2, *cb3;
-    int             x, y;
+    uint16_t        *cb0, *cb1, *cb2, *cb3;
+    int              x, y;
     char            *ip0, *ip1, *ip2, *ip3;
 
     flag = 0;
@@ -145,7 +154,7 @@ 
 
 /* take care of y dimension not being multiple of 4, such streams exist */
         ip0 = ip1 = ip2 = ip3 = s->frame->data[0] +
-          (s->palette_video?strip->x1:strip->x1*3) + (y * s->frame->linesize[0]);
+          (s->palette_video?strip->x1:strip->x1*2) + (y * s->frame->linesize[0]);
         if(s->avctx->height - y > 1) {
             ip1 = ip0 + s->frame->linesize[0];
             if(s->avctx->height - y > 2) {
@@ -181,29 +190,25 @@ 
                 }
 
                 if ((chunk_id & 0x02) || (~flag & mask)) {
-                    uint8_t *p;
+                    uint16_t *p;
                     if (data >= eod)
                         return AVERROR_INVALIDDATA;
 
                     p = strip->v1_codebook[*data++];
                     if (s->palette_video) {
-                        ip3[0] = ip3[1] = ip2[0] = ip2[1] = p[6];
-                        ip3[2] = ip3[3] = ip2[2] = ip2[3] = p[9];
+                        ip3[0] = ip3[1] = ip2[0] = ip2[1] = p[2];
+                        ip3[2] = ip3[3] = ip2[2] = ip2[3] = p[3];
                         ip1[0] = ip1[1] = ip0[0] = ip0[1] = p[0];
-                        ip1[2] = ip1[3] = ip0[2] = ip0[3] = p[3];
+                        ip1[2] = ip1[3] = ip0[2] = ip0[3] = p[1];
                     } else {
-                        p += 6;
-                        memcpy(ip3 + 0, p, 3); memcpy(ip3 + 3, p, 3);
-                        memcpy(ip2 + 0, p, 3); memcpy(ip2 + 3, p, 3);
-                        p += 3; /* ... + 9 */
-                        memcpy(ip3 + 6, p, 3); memcpy(ip3 + 9, p, 3);
-                        memcpy(ip2 + 6, p, 3); memcpy(ip2 + 9, p, 3);
-                        p -= 9; /* ... + 0 */
-                        memcpy(ip1 + 0, p, 3); memcpy(ip1 + 3, p, 3);
-                        memcpy(ip0 + 0, p, 3); memcpy(ip0 + 3, p, 3);
-                        p += 3; /* ... + 3 */
-                        memcpy(ip1 + 6, p, 3); memcpy(ip1 + 9, p, 3);
-                        memcpy(ip0 + 6, p, 3); memcpy(ip0 + 9, p, 3);
+                        * (uint16_t *)ip3    = *((uint16_t *)ip3+1) =
+                        * (uint16_t *)ip2    = *((uint16_t *)ip2+1) = p[2];
+                        *((uint16_t *)ip3+2) = *((uint16_t *)ip3+3) =
+                        *((uint16_t *)ip2+2) = *((uint16_t *)ip2+3) = p[3];
+                        * (uint16_t *)ip1    = *((uint16_t *)ip1+1) =
+                        * (uint16_t *)ip0    = *((uint16_t *)ip0+1) = p[0];
+                        *((uint16_t *)ip1+2) = *((uint16_t *)ip1+3) =
+                        *((uint16_t *)ip0+2) = *((uint16_t *)ip0+3) = p[1];
                     }
 
                 } else if (flag & mask) {
@@ -217,34 +222,34 @@ 
                     if (s->palette_video) {
                         uint8_t *p;
                         p = ip3;
-                        *p++ = cb2[6];
-                        *p++ = cb2[9];
-                        *p++ = cb3[6];
-                        *p   = cb3[9];
+                        *p++ = cb2[2];
+                        *p++ = cb2[3];
+                        *p++ = cb3[2];
+                        *p   = cb3[3];
                         p = ip2;
                         *p++ = cb2[0];
-                        *p++ = cb2[3];
+                        *p++ = cb2[1];
                         *p++ = cb3[0];
-                        *p   = cb3[3];
+                        *p   = cb3[1];
                         p = ip1;
-                        *p++ = cb0[6];
-                        *p++ = cb0[9];
-                        *p++ = cb1[6];
-                        *p   = cb1[9];
+                        *p++ = cb0[2];
+                        *p++ = cb0[3];
+                        *p++ = cb1[2];
+                        *p   = cb1[3];
                         p = ip0;
                         *p++ = cb0[0];
-                        *p++ = cb0[3];
+                        *p++ = cb0[1];
                         *p++ = cb1[0];
-                        *p   = cb1[3];
+                        *p   = cb1[1];
                     } else {
-                        memcpy(ip3 + 0, cb2 + 6, 6);
-                        memcpy(ip3 + 6, cb3 + 6, 6);
-                        memcpy(ip2 + 0, cb2 + 0, 6);
-                        memcpy(ip2 + 6, cb3 + 0, 6);
-                        memcpy(ip1 + 0, cb0 + 6, 6);
-                        memcpy(ip1 + 6, cb1 + 6, 6);
-                        memcpy(ip0 + 0, cb0 + 0, 6);
-                        memcpy(ip0 + 6, cb1 + 0, 6);
+                        memcpy(ip3 + 0, cb2 + 2, 4);
+                        memcpy(ip3 + 4, cb3 + 2, 4);
+                        memcpy(ip2 + 0, cb2 + 0, 4);
+                        memcpy(ip2 + 4, cb3 + 0, 4);
+                        memcpy(ip1 + 0, cb0 + 2, 4);
+                        memcpy(ip1 + 4, cb1 + 2, 4);
+                        memcpy(ip0 + 0, cb0 + 0, 4);
+                        memcpy(ip0 + 4, cb1 + 0, 4);
                     }
 
                 }
@@ -254,8 +259,8 @@ 
                 ip0 += 4;  ip1 += 4;
                 ip2 += 4;  ip3 += 4;
             } else {
-                ip0 += 12;  ip1 += 12;
-                ip2 += 12;  ip3 += 12;
+                ip0 += 8;  ip1 += 8;
+                ip2 += 8;  ip3 += 8;
             }
         }
     }
@@ -290,16 +295,16 @@ 
         case 0x21:
         case 0x24:
         case 0x25:
-            cinepak_decode_codebook (strip->v4_codebook, chunk_id,
-                chunk_size, data);
+            cinepak_decode_codebook (strip->v4_codebook, s->palette_video,
+                chunk_id, chunk_size, data);
             break;
 
         case 0x22:
         case 0x23:
         case 0x26:
         case 0x27:
-            cinepak_decode_codebook (strip->v1_codebook, chunk_id,
-                chunk_size, data);
+            cinepak_decode_codebook (strip->v1_codebook, s->palette_video,
+                chunk_id, chunk_size, data);
             break;
 
         case 0x30:
@@ -412,10 +417,16 @@ 
 
     s->sega_film_skip_bytes = -1;  /* uninitialized state */
 
-    // check for paletted data
+    /* check for paletted data */
     if (avctx->bits_per_coded_sample != 8) {
         s->palette_video = 0;
-        avctx->pix_fmt = AV_PIX_FMT_RGB24;
+/* we choose to produce video data in a certain 16-bit format,
+ * for the sake of video output on slow hardware with this native format
+ * FIXME: this should be made selectable via an option, not hardcoded,
+ * FIXME: to make the same binary usable with different hardware,
+ * FIXME: for the moment we make this a compile-time choice by using
+ * FIXME: a certain version of the cinepak.c file */
+        avctx->pix_fmt = AV_PIX_FMT_RGB565;
     } else {
         s->palette_video = 1;
         avctx->pix_fmt = AV_PIX_FMT_PAL8;