[FFmpeg-devel] patch: the fastest video decoder ever? :)

Message ID	20170127145655.GI1516@example.net
State	New
Headers	show Delivered-To: ffmpegpatchwork@gmail.com Received-SPF: pass (google.com: domain of ffmpeg-devel-bounces@ffmpeg.org designates 79.124.17.100 as permitted sender) client-ip=79.124.17.100; Date: Fri, 27 Jan 2017 15:56:55 +0100 From: u-9iep@aetey.se To: FFmpeg development discussions and patches <ffmpeg-devel@ffmpeg.org> Message-ID: <20170127145655.GI1516@example.net> References: <20170106131341.GO5271@example.net> MIME-Version: 1.0 Content-Type: multipart/mixed; boundary="u3/rZRmxL6MmkK24" Content-Disposition: inline In-Reply-To: <20170106131341.GO5271@example.net> Subject: Re: [FFmpeg-devel] patch: the fastest video decoder ever? :) Precedence: list Reply-To: FFmpeg development discussions and patches <ffmpeg-devel@ffmpeg.org> Errors-To: ffmpeg-devel-bounces@ffmpeg.org Sender: "ffmpeg-devel" <ffmpeg-devel-bounces@ffmpeg.org>

--- libavcodec/cinepak.c.rgb24 2017-01-27 11:54:31.316799141 +0100 +++ libavcodec/cinepak.c 2017-01-27 12:25:33.674095602 +0100 @@ -31,6 +31,8 @@ * * Cinepak colorspace support (c) 2013 Rl, Aetey Global Technologies AB * @author Cinepak colorspace, Rl, Aetey Global Technologies AB + * Cinepak rgb565 format (c) 2017 Rl, Aetey Global Technologies AB + * @author Cinepak rgb565, Rl, Aetey Global Technologies AB */ #include <stdio.h> @@ -42,8 +44,12 @@ #include "avcodec.h" #include "internal.h" +/* feel free to replace with a better mapping implementation + * (keeping in mind slow, not very "intelligent" hardware) + */ +#define PACK_RGB_RGB565(r,g,b) (((av_clip_uint8((r)+4)>>3)<<11)|((av_clip_uint8((g)+2)>>2)<<5)|(av_clip_uint8((b)+4)>>3)) -typedef uint8_t cvid_codebook[12]; +typedef uint16_t cvid_codebook[4]; /* in the native endian rgb565 format */ #define MAX_STRIPS 32 @@ -73,13 +79,13 @@ uint32_t pal[256]; } CinepakContext; -static void cinepak_decode_codebook (cvid_codebook *codebook, +static void cinepak_decode_codebook (cvid_codebook *codebook, int palette_video, int chunk_id, int size, const uint8_t *data) { const uint8_t *eod = (data + size); uint32_t flag, mask; int i, n; - uint8_t *p; + uint16_t *p; /* check if this chunk contains 4- or 6-element vectors */ n = (chunk_id & 0x04) ? 4 : 6; @@ -98,33 +104,36 @@ } if (!(chunk_id & 0x01) || (flag & mask)) { - int k, kk; + int k; if ((data + n) > eod) break; - for (k = 0; k < 4; ++k) { - int r = *data++; - for (kk = 0; kk < 3; ++kk) - *p++ = r; - } - if (n == 6) { - int r, g, b, u, v; + if (n == 4) +/* 8-bit palette indexes or otherwise grayscale values, + * they need different handling */ + if (palette_video) + for (k = 0; k < 4; ++k) + *p++ = *data++; + else + for (k = 0; k < 4; ++k) { + int r = *data++; + *p++ = PACK_RGB_RGB565(r,r,r); + } + else { + int y[4], u, v; + for (k = 0; k < 4; ++k) +/* 8-bit luminance values */ + y[k] = *data++; u = *(int8_t *)data++; v = *(int8_t *)data++; - p -= 12; - for(k=0; k<4; ++k) { - r = *p++ + v*2; - g = *p++ - (u/2) - v; - b = *p + u*2; - p -= 2; - *p++ = av_clip_uint8(r); - *p++ = av_clip_uint8(g); - *p++ = av_clip_uint8(b); - } + for(k=0; k<4; ++k) + *p++ = PACK_RGB_RGB565(y[k] + v*2, + y[k] - (u/2) - v, + y[k] + u*2); } } else { - p += 12; + p += 4; } } } @@ -134,8 +143,8 @@ { const uint8_t *eod = (data + size); uint32_t flag, mask; - uint8_t *cb0, *cb1, *cb2, *cb3; - int x, y; + uint16_t *cb0, *cb1, *cb2, *cb3; + int x, y; char *ip0, *ip1, *ip2, *ip3; flag = 0; @@ -145,7 +154,7 @@ /* take care of y dimension not being multiple of 4, such streams exist */ ip0 = ip1 = ip2 = ip3 = s->frame->data[0] + - (s->palette_video?strip->x1:strip->x1*3) + (y * s->frame->linesize[0]); + (s->palette_video?strip->x1:strip->x1*2) + (y * s->frame->linesize[0]); if(s->avctx->height - y > 1) { ip1 = ip0 + s->frame->linesize[0]; if(s->avctx->height - y > 2) { @@ -181,29 +190,25 @@ } if ((chunk_id & 0x02) || (~flag & mask)) { - uint8_t *p; + uint16_t *p; if (data >= eod) return AVERROR_INVALIDDATA; p = strip->v1_codebook[*data++]; if (s->palette_video) { - ip3[0] = ip3[1] = ip2[0] = ip2[1] = p[6]; - ip3[2] = ip3[3] = ip2[2] = ip2[3] = p[9]; + ip3[0] = ip3[1] = ip2[0] = ip2[1] = p[2]; + ip3[2] = ip3[3] = ip2[2] = ip2[3] = p[3]; ip1[0] = ip1[1] = ip0[0] = ip0[1] = p[0]; - ip1[2] = ip1[3] = ip0[2] = ip0[3] = p[3]; + ip1[2] = ip1[3] = ip0[2] = ip0[3] = p[1]; } else { - p += 6; - memcpy(ip3 + 0, p, 3); memcpy(ip3 + 3, p, 3); - memcpy(ip2 + 0, p, 3); memcpy(ip2 + 3, p, 3); - p += 3; /* ... + 9 */ - memcpy(ip3 + 6, p, 3); memcpy(ip3 + 9, p, 3); - memcpy(ip2 + 6, p, 3); memcpy(ip2 + 9, p, 3); - p -= 9; /* ... + 0 */ - memcpy(ip1 + 0, p, 3); memcpy(ip1 + 3, p, 3); - memcpy(ip0 + 0, p, 3); memcpy(ip0 + 3, p, 3); - p += 3; /* ... + 3 */ - memcpy(ip1 + 6, p, 3); memcpy(ip1 + 9, p, 3); - memcpy(ip0 + 6, p, 3); memcpy(ip0 + 9, p, 3); + * (uint16_t *)ip3 = *((uint16_t *)ip3+1) = + * (uint16_t *)ip2 = *((uint16_t *)ip2+1) = p[2]; + *((uint16_t *)ip3+2) = *((uint16_t *)ip3+3) = + *((uint16_t *)ip2+2) = *((uint16_t *)ip2+3) = p[3]; + * (uint16_t *)ip1 = *((uint16_t *)ip1+1) = + * (uint16_t *)ip0 = *((uint16_t *)ip0+1) = p[0]; + *((uint16_t *)ip1+2) = *((uint16_t *)ip1+3) = + *((uint16_t *)ip0+2) = *((uint16_t *)ip0+3) = p[1]; } } else if (flag & mask) { @@ -217,34 +222,34 @@ if (s->palette_video) { uint8_t *p; p = ip3; - *p++ = cb2[6]; - *p++ = cb2[9]; - *p++ = cb3[6]; - *p = cb3[9]; + *p++ = cb2[2]; + *p++ = cb2[3]; + *p++ = cb3[2]; + *p = cb3[3]; p = ip2; *p++ = cb2[0]; - *p++ = cb2[3]; + *p++ = cb2[1]; *p++ = cb3[0]; - *p = cb3[3]; + *p = cb3[1]; p = ip1; - *p++ = cb0[6]; - *p++ = cb0[9]; - *p++ = cb1[6]; - *p = cb1[9]; + *p++ = cb0[2]; + *p++ = cb0[3]; + *p++ = cb1[2]; + *p = cb1[3]; p = ip0; *p++ = cb0[0]; - *p++ = cb0[3]; + *p++ = cb0[1]; *p++ = cb1[0]; - *p = cb1[3]; + *p = cb1[1]; } else { - memcpy(ip3 + 0, cb2 + 6, 6); - memcpy(ip3 + 6, cb3 + 6, 6); - memcpy(ip2 + 0, cb2 + 0, 6); - memcpy(ip2 + 6, cb3 + 0, 6); - memcpy(ip1 + 0, cb0 + 6, 6); - memcpy(ip1 + 6, cb1 + 6, 6); - memcpy(ip0 + 0, cb0 + 0, 6); - memcpy(ip0 + 6, cb1 + 0, 6); + memcpy(ip3 + 0, cb2 + 2, 4); + memcpy(ip3 + 4, cb3 + 2, 4); + memcpy(ip2 + 0, cb2 + 0, 4); + memcpy(ip2 + 4, cb3 + 0, 4); + memcpy(ip1 + 0, cb0 + 2, 4); + memcpy(ip1 + 4, cb1 + 2, 4); + memcpy(ip0 + 0, cb0 + 0, 4); + memcpy(ip0 + 4, cb1 + 0, 4); } } @@ -254,8 +259,8 @@ ip0 += 4; ip1 += 4; ip2 += 4; ip3 += 4; } else { - ip0 += 12; ip1 += 12; - ip2 += 12; ip3 += 12; + ip0 += 8; ip1 += 8; + ip2 += 8; ip3 += 8; } } } @@ -290,16 +295,16 @@ case 0x21: case 0x24: case 0x25: - cinepak_decode_codebook (strip->v4_codebook, chunk_id, - chunk_size, data); + cinepak_decode_codebook (strip->v4_codebook, s->palette_video, + chunk_id, chunk_size, data); break; case 0x22: case 0x23: case 0x26: case 0x27: - cinepak_decode_codebook (strip->v1_codebook, chunk_id, - chunk_size, data); + cinepak_decode_codebook (strip->v1_codebook, s->palette_video, + chunk_id, chunk_size, data); break; case 0x30: @@ -412,10 +417,16 @@ s->sega_film_skip_bytes = -1; /* uninitialized state */ - // check for paletted data + /* check for paletted data */ if (avctx->bits_per_coded_sample != 8) { s->palette_video = 0; - avctx->pix_fmt = AV_PIX_FMT_RGB24; +/* we choose to produce video data in a certain 16-bit format, + * for the sake of video output on slow hardware with this native format + * FIXME: this should be made selectable via an option, not hardcoded, + * FIXME: to make the same binary usable with different hardware, + * FIXME: for the moment we make this a compile-time choice by using + * FIXME: a certain version of the cinepak.c file */ + avctx->pix_fmt = AV_PIX_FMT_RGB565; } else { s->palette_video = 1; avctx->pix_fmt = AV_PIX_FMT_PAL8;

[FFmpeg-devel] patch: the fastest video decoder ever? :)

Commit Message

Patch