[FFmpeg-devel] Cinepak: speed up decoding several-fold, depending on the scenario, by supporting multiple output pixel formats.

Message ID	20170205112430.GH1516@example.net
State	New
Headers	show Delivered-To: ffmpegpatchwork@gmail.com Received-SPF: pass (google.com: domain of ffmpeg-devel-bounces@ffmpeg.org designates 79.124.17.100 as permitted sender) client-ip=79.124.17.100; Date: Sun, 5 Feb 2017 12:24:30 +0100 From: u-9iep@aetey.se To: FFmpeg development discussions and patches <ffmpeg-devel@ffmpeg.org> Message-ID: <20170205112430.GH1516@example.net> MIME-Version: 1.0 Content-Type: multipart/mixed; boundary="Y4VYWfdFdHwFhC6D" Content-Disposition: inline Subject: [FFmpeg-devel] [PATCH] Cinepak: speed up decoding several-fold, depending on the scenario, by supporting multiple output pixel formats. Precedence: list Reply-To: FFmpeg development discussions and patches <ffmpeg-devel@ffmpeg.org> Errors-To: ffmpeg-devel-bounces@ffmpeg.org Sender: "ffmpeg-devel" <ffmpeg-devel-bounces@ffmpeg.org>

diff --git a/libavcodec/cinepak.c b/libavcodec/cinepak.c index d657e9c0c1..386ce98299 100644 --- a/libavcodec/cinepak.c +++ b/libavcodec/cinepak.c @@ -31,6 +31,8 @@ * * Cinepak colorspace support (c) 2013 Rl, Aetey Global Technologies AB * @author Cinepak colorspace, Rl, Aetey Global Technologies AB + * Extra output formats / optimizations (c) 2017 Rl, Aetey Global Technologies AB + * @author Extra output formats / optimizations, Rl, Aetey Global Technologies AB */ #include <stdio.h> @@ -39,23 +41,54 @@ #include "libavutil/common.h" #include "libavutil/intreadwrite.h" +#include "libavutil/opt.h" +/* #include "libavutil/avassert.h" */ #include "avcodec.h" #include "internal.h" +/* rounding to nearest; truncation would be slightly faster + * but it noticeably affects the picture quality; + * unless we become extremely desperate to use every single cycle + * we do not bother implementing a choice -- rl */ +#define PACK_RGB_RGB565(r,g,b) (((av_clip_uint8((r)+4)>>3)<<11)|((av_clip_uint8((g)+2)>>2)<<5)|(av_clip_uint8((b)+4)>>3)) -typedef uint8_t cvid_codebook[12]; +/* + * more "desperate/ultimate" optimization possibilites: + * - possibly (hardly?) spare a cycle or two by not ensuring to stay + * inside the frame at vector decoding (the frame is allocated with + * a margin for us as an extra precaution, we can as well use this) + * - skip filling in opacity when it is not needed by the data consumer, + * in many cases rgb32 is almost as fast as rgb565, with full quality, + * improving its speed can make sense + * + * possible style improvements: + * - rewrite the codebook/vector chunk parsing to isolate + * the flags logic (the variable length coding) from the rest, + * so that it could be templated safely - a tiny piece but + * it is being repeated many times + */ + +typedef union cvid_codebook { + uint32_t rgb32[256][ 4]; + uint8_t rgb24[256][12]; + uint16_t rgb565[256][ 4]; + uint8_t yuv420[256][ 6]; + uint8_t pal8[256][ 4]; +} cvid_codebook; -#define MAX_STRIPS 32 +#define MAX_STRIPS 32 /* an arbitrary limit -- rl */ typedef struct cvid_strip { uint16_t id; uint16_t x1, y1; uint16_t x2, y2; - cvid_codebook v4_codebook[256]; - cvid_codebook v1_codebook[256]; + cvid_codebook v4_codebook; + cvid_codebook v1_codebook; } cvid_strip; -typedef struct CinepakContext { +typedef struct CinepakContext CinepakContext; +struct CinepakContext { + const AVClass *class; AVCodecContext *avctx; AVFrame *frame; @@ -71,24 +104,111 @@ typedef struct CinepakContext { int sega_film_skip_bytes; uint32_t pal[256]; -} CinepakContext; -static void cinepak_decode_codebook (cvid_codebook *codebook, - int chunk_id, int size, const uint8_t *data) + void (*decode_codebook)(CinepakContext *s, + cvid_codebook *codebook, int chunk_id, + int size, const uint8_t *data); + int (*decode_vectors)(CinepakContext *s, cvid_strip *strip, + int chunk_id, int size, const uint8_t *data); +/* options */ + enum AVPixelFormat out_pixfmt; +}; + +#define OFFSET(x) offsetof(CinepakContext, x) +#define VD AV_OPT_FLAG_VIDEO_PARAM | AV_OPT_FLAG_DECODING_PARAM +static const AVOption options[] = { +{"output_pixel_format", "set output pixel format: rgb24/rgb32/rgb565/yuv420p/pal8; yuv420p is approximate", OFFSET(out_pixfmt), AV_OPT_TYPE_PIXEL_FMT, {.i64=AV_PIX_FMT_NONE}, -1, INT_MAX, VD }, + { NULL }, +}; + +static const AVClass cinepak_class = { + .class_name = "cinepak decoder", + .item_name = av_default_item_name, + .option = options, + .version = LIBAVUTIL_VERSION_INT, +}; + +static void cinepak_decode_codebook_rgb32 (CinepakContext *s, + cvid_codebook *codebook, int chunk_id, int size, const uint8_t *data) +{ + const uint8_t *eod = (data + size); + uint32_t flag, mask; + int i, n; + uint32_t *p; + int palette_video = s->palette_video; + int selective_update = (chunk_id & 0x01); + + /* check if this chunk contains 4- or 6-element vectors */ + n = (chunk_id & 0x04) ? 4 : 6; + flag = 0; + mask = 0; + + p = codebook->rgb32[0]; + for (i=0; i < 256; i++) { + if (selective_update && !(mask >>= 1)) { + if ((data + 4) > eod) + break; + + flag = AV_RB32 (data); + data += 4; + mask = 0x80000000; + } + + if (!selective_update || (flag & mask)) { + int k; + + if ((data + n) > eod) + break; + + if (n == 4) + if (palette_video) + for (k = 0; k < 4; ++k) + *p++ = s->pal[*data++]; /* this is easy */ + else + for (k = 0; k < 4; ++k) { + int r = *data++; +/* in some situations we might not have to set opacity */ + *p++ = /**/ (255<<24)| /**/ (r<<16)|(r<<8)|r; + } + else { /* n == 6 */ + int y, u, v; + u = (int8_t)data[4]; + v = (int8_t)data[5]; + for(k=0; k<4; ++k) { + y = *data++; +/* in some situations we might not have to set opacity */ + *p++ = /**/ (255<<24)| /**/ +/* here the cinepak color space excels */ + (av_clip_uint8(y + v*2)<<16)| + (av_clip_uint8(y - (u/2) - v)<<8)| + av_clip_uint8(y + u*2); + } + data += 2; + } + } else { + p += 4; + } + } +} + +static void cinepak_decode_codebook_rgb24 (CinepakContext *s, + cvid_codebook *codebook, int chunk_id, int size, const uint8_t *data) { const uint8_t *eod = (data + size); uint32_t flag, mask; int i, n; uint8_t *p; + int palette_video = s->palette_video; + int selective_update = (chunk_id & 0x01); /* check if this chunk contains 4- or 6-element vectors */ n = (chunk_id & 0x04) ? 4 : 6; flag = 0; mask = 0; - p = codebook[0]; + p = codebook->rgb24[0]; for (i=0; i < 256; i++) { - if ((chunk_id & 0x01) && !(mask >>= 1)) { + if (selective_update && !(mask >>= 1)) { if ((data + 4) > eod) break; @@ -97,31 +217,38 @@ static void cinepak_decode_codebook (cvid_codebook *codebook, mask = 0x80000000; } - if (!(chunk_id & 0x01) || (flag & mask)) { + if (!selective_update || (flag & mask)) { int k, kk; if ((data + n) > eod) break; - for (k = 0; k < 4; ++k) { - int r = *data++; - for (kk = 0; kk < 3; ++kk) - *p++ = r; - } - if (n == 6) { - int r, g, b, u, v; - u = *(int8_t *)data++; - v = *(int8_t *)data++; - p -= 12; + if (n == 4) + if (palette_video) + for (k = 0; k < 4; ++k) { + uint32_t r = s->pal[*data++]; + *p++ = (r>>16)&0xff; + *p++ = (r>>8) &0xff; + *p++ = r &0xff; + } + else + for (k = 0; k < 4; ++k) { + int r = *data++; + for (kk = 0; kk < 3; ++kk) + *p++ = r; + } + else { /* n == 6 */ + int y, u, v; + u = (int8_t)data[4]; + v = (int8_t)data[5]; for(k=0; k<4; ++k) { - r = *p++ + v*2; - g = *p++ - (u/2) - v; - b = *p + u*2; - p -= 2; - *p++ = av_clip_uint8(r); - *p++ = av_clip_uint8(g); - *p++ = av_clip_uint8(b); + y = *data++; +/* here the cinepak color space excels */ + *p++ = av_clip_uint8(y + v*2); + *p++ = av_clip_uint8(y - (u/2) - v); + *p++ = av_clip_uint8(y + u*2); } + data += 2; } } else { p += 12; @@ -129,14 +256,296 @@ static void cinepak_decode_codebook (cvid_codebook *codebook, } } -static int cinepak_decode_vectors (CinepakContext *s, cvid_strip *strip, +static void cinepak_decode_codebook_rgb565 (CinepakContext *s, + cvid_codebook *codebook, int chunk_id, int size, const uint8_t *data) +{ + const uint8_t *eod = (data + size); + uint32_t flag, mask; + int i, n; + uint16_t *p; + int palette_video = s->palette_video; + int selective_update = (chunk_id & 0x01); + + /* check if this chunk contains 4- or 6-element vectors */ + n = (chunk_id & 0x04) ? 4 : 6; + flag = 0; + mask = 0; + + p = codebook->rgb565[0]; + for (i=0; i < 256; i++) { + if (selective_update && !(mask >>= 1)) { + if ((data + 4) > eod) + break; + + flag = AV_RB32 (data); + data += 4; + mask = 0x80000000; + } + + if (!selective_update || (flag & mask)) { + int k; + + if ((data + n) > eod) + break; + + if (n == 4) + if (palette_video) + for (k = 0; k < 4; ++k) { + uint32_t r = s->pal[*data++]; + *p++ = PACK_RGB_RGB565((r>>16)&0xff, + (r>>8)&0xff, + r&0xff); + } + else + for (k = 0; k < 4; ++k) { + int r = *data++; + *p++ = PACK_RGB_RGB565(r,r,r); + } + else { /* n == 6 */ + int y, u, v; + u = (int8_t)data[4]; + v = (int8_t)data[5]; + for(k=0; k<4; ++k) { + y = *data++; +/* here the cinepak color space excels */ + *p++ = PACK_RGB_RGB565(y + v*2, + y - (u/2) - v, + y + u*2); + } + data += 2; + } + } else { + p += 4; + } + } +} + +/* a simplistic version to begin with, it is also fast -- rl */ +static void cinepak_decode_codebook_yuv420 (CinepakContext *s, + cvid_codebook *codebook, int chunk_id, int size, const uint8_t *data) +{ + const uint8_t *eod = (data + size); + uint32_t flag, mask; + int i, n; + uint8_t *p; + int palette_video = s->palette_video; + int selective_update = (chunk_id & 0x01); + + /* check if this chunk contains 4- or 6-element vectors */ + n = (chunk_id & 0x04) ? 4 : 6; + flag = 0; + mask = 0; + + p = codebook->yuv420[0]; + for (i=0; i < 256; i++) { + if (selective_update && !(mask >>= 1)) { + if ((data + 4) > eod) + break; + + flag = AV_RB32 (data); + data += 4; + mask = 0x80000000; + } + + if (!selective_update || (flag & mask)) { + int k; + + if ((data + n) > eod) + break; + + if (n == 4) + if (palette_video) { +/* here we have kind of "more" data than the output format can express */ + int r, g, b, u = 0, v = 0; + for (k = 0; k < 4; ++k) { + uint32_t rr = s->pal[*data++]; + r = (rr>>16)&0xff; + g = (rr>>8) &0xff; + b = rr &0xff; +/* calculate the components (https://en.wikipedia.org/wiki/YUV) */ + *p++ = ((r*66+g*129+b*25+128)>>8)+16; + u += (-r*38-g*74+b*112+128)>>8; + v += (r*112-g*94-b*18+128)>>8; + } + *p++ = (u+2)/4+128; + *p++ = (v+2)/4+128; + } else { /* grayscale, easy */ + for (k = 0; k < 4; ++k) { + *p++ = *data++; + } + *p++ = 128; + *p++ = 128; + } + else { /* n == 6 */ +/* here we'd have to handle double format conversion + * Cinepak=>rgb24 and then rgb24=>yuv420p, which can not be shortcut; + * for the moment just copying as-is, for simplicity and speed, + * color will be slightly off but not much */ + *p++ = *data++; + *p++ = *data++; + *p++ = *data++; + *p++ = *data++; + *p++ = *data++ + 128; + *p++ = *data++ + 128; + } + } else { + p += 6; + } + } +} + +/* here we do not expect anything besides palettized video, + * nor check the data for validity, which should be ok + * as long as we do not write beyond the bounds */ +static void cinepak_decode_codebook_pal8 (CinepakContext *s, + cvid_codebook *codebook, int chunk_id, int size, const uint8_t *data) +{ + const uint8_t *eod = (data + size); + uint32_t flag, mask; + int i; + uint8_t *p; + int selective_update = (chunk_id & 0x01); + +#define PAL8_VECTOR_LENGTH 4 +/* av_assert0(chunk_id & 0x04); */ + + flag = 0; + mask = 0; + + p = codebook->pal8[0]; + for (i=0; i < 256; i++) { + if (selective_update && !(mask >>= 1)) { + if ((data + 4) > eod) + break; + + flag = AV_RB32 (data); + data += 4; + mask = 0x80000000; + } + + if (!selective_update || (flag & mask)) { + int k; + + if ((data + PAL8_VECTOR_LENGTH) > eod) + break; + + for (k = 0; k < 4; ++k) + *p++ = *data++; + } else { + p += 4; + } + } +} + +static int cinepak_decode_vectors_rgb32 (CinepakContext *s, cvid_strip *strip, + int chunk_id, int size, const uint8_t *data) +{ + const uint8_t *eod = (data + size); + uint32_t flag, mask; + uint32_t *cb0, *cb1, *cb2, *cb3; + int x, y; + char *ip0, *ip1, *ip2, *ip3; + int selective_update = (chunk_id & 0x01); + int v1_only = (chunk_id & 0x02); + + flag = 0; + mask = 0; + + for (y=strip->y1; y < strip->y2; y+=4) { + +/* take care of y dimension not being multiple of 4, such streams exist */ + ip0 = ip1 = ip2 = ip3 = s->frame->data[0] + + strip->x1*4 + y*s->frame->linesize[0]; + if(s->avctx->height - y > 1) { + ip1 = ip0 + s->frame->linesize[0]; + if(s->avctx->height - y > 2) { + ip2 = ip1 + s->frame->linesize[0]; + if(s->avctx->height - y > 3) { + ip3 = ip2 + s->frame->linesize[0]; + } + } + } +/* to get the correct picture for not-multiple-of-4 cases let us fill each + * block from the bottom up, thus possibly overwriting the bottommost line + * more than once but ending with the correct data in place + * (instead of in-loop checking) */ + + for (x=strip->x1; x < strip->x2; x+=4) { + if (selective_update && !(mask >>= 1)) { + if ((data + 4) > eod) + return AVERROR_INVALIDDATA; + + flag = AV_RB32 (data); + data += 4; + mask = 0x80000000; + } + + if (!selective_update || (flag & mask)) { + if (!v1_only && !(mask >>= 1)) { + if ((data + 4) > eod) + return AVERROR_INVALIDDATA; + + flag = AV_RB32 (data); + data += 4; + mask = 0x80000000; + } + + if (v1_only || (~flag & mask)) { + uint32_t *p; + if (data >= eod) + return AVERROR_INVALIDDATA; + + p = strip->v1_codebook.rgb32[*data++] + 2; /* ... + 8 */ + memcpy(ip3 + 0, p, 4); memcpy(ip3 + 4, p, 4); + memcpy(ip2 + 0, p, 4); memcpy(ip2 + 4, p, 4); + p += 1; /* ... + 12 */ + memcpy(ip3 + 8, p, 4); memcpy(ip3 + 12, p, 4); + memcpy(ip2 + 8, p, 4); memcpy(ip2 + 12, p, 4); + p -= 3; /* ... + 0 */ + memcpy(ip1 + 0, p, 4); memcpy(ip1 + 4, p, 4); + memcpy(ip0 + 0, p, 4); memcpy(ip0 + 4, p, 4); + p += 1; /* ... + 4 */ + memcpy(ip1 + 8, p, 4); memcpy(ip1 + 12, p, 4); + memcpy(ip0 + 8, p, 4); memcpy(ip0 + 12, p, 4); + + } else if (flag & mask) { + if ((data + 4) > eod) + return AVERROR_INVALIDDATA; + + cb0 = strip->v4_codebook.rgb32[*data++]; + cb1 = strip->v4_codebook.rgb32[*data++]; + cb2 = strip->v4_codebook.rgb32[*data++]; + cb3 = strip->v4_codebook.rgb32[*data++]; + memcpy(ip3 + 0, cb2 + 2, 8); + memcpy(ip3 + 8, cb3 + 2, 8); + memcpy(ip2 + 0, cb2 + 0, 8); + memcpy(ip2 + 8, cb3 + 0, 8); + memcpy(ip1 + 0, cb0 + 2, 8); + memcpy(ip1 + 8, cb1 + 2, 8); + memcpy(ip0 + 0, cb0 + 0, 8); + memcpy(ip0 + 8, cb1 + 0, 8); + + } + } + + ip0 += 16; ip1 += 16; + ip2 += 16; ip3 += 16; + } + } + + return 0; +} + +static int cinepak_decode_vectors_rgb24 (CinepakContext *s, cvid_strip *strip, int chunk_id, int size, const uint8_t *data) { const uint8_t *eod = (data + size); uint32_t flag, mask; uint8_t *cb0, *cb1, *cb2, *cb3; - int x, y; + int x, y; char *ip0, *ip1, *ip2, *ip3; + int selective_update = (chunk_id & 0x01); + int v1_only = (chunk_id & 0x02); flag = 0; mask = 0; @@ -145,7 +554,7 @@ static int cinepak_decode_vectors (CinepakContext *s, cvid_strip *strip, /* take care of y dimension not being multiple of 4, such streams exist */ ip0 = ip1 = ip2 = ip3 = s->frame->data[0] + - (s->palette_video?strip->x1:strip->x1*3) + (y * s->frame->linesize[0]); + strip->x1*3 + y*s->frame->linesize[0]; if(s->avctx->height - y > 1) { ip1 = ip0 + s->frame->linesize[0]; if(s->avctx->height - y > 2) { @@ -161,7 +570,7 @@ static int cinepak_decode_vectors (CinepakContext *s, cvid_strip *strip, * (instead of in-loop checking) */ for (x=strip->x1; x < strip->x2; x+=4) { - if ((chunk_id & 0x01) && !(mask >>= 1)) { + if (selective_update && !(mask >>= 1)) { if ((data + 4) > eod) return AVERROR_INVALIDDATA; @@ -170,8 +579,8 @@ static int cinepak_decode_vectors (CinepakContext *s, cvid_strip *strip, mask = 0x80000000; } - if (!(chunk_id & 0x01) || (flag & mask)) { - if (!(chunk_id & 0x02) && !(mask >>= 1)) { + if (!selective_update || (flag & mask)) { + if (!v1_only && !(mask >>= 1)) { if ((data + 4) > eod) return AVERROR_INVALIDDATA; @@ -180,83 +589,356 @@ static int cinepak_decode_vectors (CinepakContext *s, cvid_strip *strip, mask = 0x80000000; } - if ((chunk_id & 0x02) || (~flag & mask)) { + if (v1_only || (~flag & mask)) { uint8_t *p; if (data >= eod) return AVERROR_INVALIDDATA; - p = strip->v1_codebook[*data++]; - if (s->palette_video) { - ip3[0] = ip3[1] = ip2[0] = ip2[1] = p[6]; - ip3[2] = ip3[3] = ip2[2] = ip2[3] = p[9]; - ip1[0] = ip1[1] = ip0[0] = ip0[1] = p[0]; - ip1[2] = ip1[3] = ip0[2] = ip0[3] = p[3]; - } else { - p += 6; - memcpy(ip3 + 0, p, 3); memcpy(ip3 + 3, p, 3); - memcpy(ip2 + 0, p, 3); memcpy(ip2 + 3, p, 3); - p += 3; /* ... + 9 */ - memcpy(ip3 + 6, p, 3); memcpy(ip3 + 9, p, 3); - memcpy(ip2 + 6, p, 3); memcpy(ip2 + 9, p, 3); - p -= 9; /* ... + 0 */ - memcpy(ip1 + 0, p, 3); memcpy(ip1 + 3, p, 3); - memcpy(ip0 + 0, p, 3); memcpy(ip0 + 3, p, 3); - p += 3; /* ... + 3 */ - memcpy(ip1 + 6, p, 3); memcpy(ip1 + 9, p, 3); - memcpy(ip0 + 6, p, 3); memcpy(ip0 + 9, p, 3); - } + p = strip->v1_codebook.rgb24[*data++] + 6; + memcpy(ip3 + 0, p, 3); memcpy(ip3 + 3, p, 3); + memcpy(ip2 + 0, p, 3); memcpy(ip2 + 3, p, 3); + p += 3; /* ... + 9 */ + memcpy(ip3 + 6, p, 3); memcpy(ip3 + 9, p, 3); + memcpy(ip2 + 6, p, 3); memcpy(ip2 + 9, p, 3); + p -= 9; /* ... + 0 */ + memcpy(ip1 + 0, p, 3); memcpy(ip1 + 3, p, 3); + memcpy(ip0 + 0, p, 3); memcpy(ip0 + 3, p, 3); + p += 3; /* ... + 3 */ + memcpy(ip1 + 6, p, 3); memcpy(ip1 + 9, p, 3); + memcpy(ip0 + 6, p, 3); memcpy(ip0 + 9, p, 3); } else if (flag & mask) { if ((data + 4) > eod) return AVERROR_INVALIDDATA; - cb0 = strip->v4_codebook[*data++]; - cb1 = strip->v4_codebook[*data++]; - cb2 = strip->v4_codebook[*data++]; - cb3 = strip->v4_codebook[*data++]; - if (s->palette_video) { - uint8_t *p; - p = ip3; - *p++ = cb2[6]; - *p++ = cb2[9]; - *p++ = cb3[6]; - *p = cb3[9]; - p = ip2; - *p++ = cb2[0]; - *p++ = cb2[3]; - *p++ = cb3[0]; - *p = cb3[3]; - p = ip1; - *p++ = cb0[6]; - *p++ = cb0[9]; - *p++ = cb1[6]; - *p = cb1[9]; - p = ip0; - *p++ = cb0[0]; - *p++ = cb0[3]; - *p++ = cb1[0]; - *p = cb1[3]; - } else { - memcpy(ip3 + 0, cb2 + 6, 6); - memcpy(ip3 + 6, cb3 + 6, 6); - memcpy(ip2 + 0, cb2 + 0, 6); - memcpy(ip2 + 6, cb3 + 0, 6); - memcpy(ip1 + 0, cb0 + 6, 6); - memcpy(ip1 + 6, cb1 + 6, 6); - memcpy(ip0 + 0, cb0 + 0, 6); - memcpy(ip0 + 6, cb1 + 0, 6); - } + cb0 = strip->v4_codebook.rgb24[*data++]; + cb1 = strip->v4_codebook.rgb24[*data++]; + cb2 = strip->v4_codebook.rgb24[*data++]; + cb3 = strip->v4_codebook.rgb24[*data++]; + memcpy(ip3 + 0, cb2 + 6, 6); + memcpy(ip3 + 6, cb3 + 6, 6); + memcpy(ip2 + 0, cb2 + 0, 6); + memcpy(ip2 + 6, cb3 + 0, 6); + memcpy(ip1 + 0, cb0 + 6, 6); + memcpy(ip1 + 6, cb1 + 6, 6); + memcpy(ip0 + 0, cb0 + 0, 6); + memcpy(ip0 + 6, cb1 + 0, 6); } } - if (s->palette_video) { - ip0 += 4; ip1 += 4; - ip2 += 4; ip3 += 4; - } else { - ip0 += 12; ip1 += 12; - ip2 += 12; ip3 += 12; + ip0 += 12; ip1 += 12; + ip2 += 12; ip3 += 12; + } + } + + return 0; +} + +static int cinepak_decode_vectors_rgb565 (CinepakContext *s, cvid_strip *strip, + int chunk_id, int size, const uint8_t *data) +{ + const uint8_t *eod = (data + size); + uint32_t flag, mask; + uint16_t *cb0, *cb1, *cb2, *cb3; + int x, y; + char *ip0, *ip1, *ip2, *ip3; + int selective_update = (chunk_id & 0x01); + int v1_only = (chunk_id & 0x02); + + flag = 0; + mask = 0; + + for (y=strip->y1; y < strip->y2; y+=4) { + +/* take care of y dimension not being multiple of 4, such streams exist */ + ip0 = ip1 = ip2 = ip3 = s->frame->data[0] + + strip->x1*2 + y*s->frame->linesize[0]; + if(s->avctx->height - y > 1) { + ip1 = ip0 + s->frame->linesize[0]; + if(s->avctx->height - y > 2) { + ip2 = ip1 + s->frame->linesize[0]; + if(s->avctx->height - y > 3) { + ip3 = ip2 + s->frame->linesize[0]; + } + } + } +/* to get the correct picture for not-multiple-of-4 cases let us fill each + * block from the bottom up, thus possibly overwriting the bottommost line + * more than once but ending with the correct data in place + * (instead of in-loop checking) */ + + for (x=strip->x1; x < strip->x2; x+=4) { + if (selective_update && !(mask >>= 1)) { + if ((data + 4) > eod) + return AVERROR_INVALIDDATA; + + flag = AV_RB32 (data); + data += 4; + mask = 0x80000000; + } + + if (!selective_update || (flag & mask)) { + if (!v1_only && !(mask >>= 1)) { + if ((data + 4) > eod) + return AVERROR_INVALIDDATA; + + flag = AV_RB32 (data); + data += 4; + mask = 0x80000000; + } + + if (v1_only || (~flag & mask)) { + uint16_t *p; + if (data >= eod) + return AVERROR_INVALIDDATA; + + p = strip->v1_codebook.rgb565[*data++]; + * (uint16_t *)ip3 = *((uint16_t *)ip3+1) = + * (uint16_t *)ip2 = *((uint16_t *)ip2+1) = p[2]; + *((uint16_t *)ip3+2) = *((uint16_t *)ip3+3) = + *((uint16_t *)ip2+2) = *((uint16_t *)ip2+3) = p[3]; + * (uint16_t *)ip1 = *((uint16_t *)ip1+1) = + * (uint16_t *)ip0 = *((uint16_t *)ip0+1) = p[0]; + *((uint16_t *)ip1+2) = *((uint16_t *)ip1+3) = + *((uint16_t *)ip0+2) = *((uint16_t *)ip0+3) = p[1]; + + } else if (flag & mask) { + if ((data + 4) > eod) + return AVERROR_INVALIDDATA; + + cb0 = strip->v4_codebook.rgb565[*data++]; + cb1 = strip->v4_codebook.rgb565[*data++]; + cb2 = strip->v4_codebook.rgb565[*data++]; + cb3 = strip->v4_codebook.rgb565[*data++]; + memcpy(ip3 + 0, cb2 + 2, 4); + memcpy(ip3 + 4, cb3 + 2, 4); + memcpy(ip2 + 0, cb2 + 0, 4); + memcpy(ip2 + 4, cb3 + 0, 4); + memcpy(ip1 + 0, cb0 + 2, 4); + memcpy(ip1 + 4, cb1 + 2, 4); + memcpy(ip0 + 0, cb0 + 0, 4); + memcpy(ip0 + 4, cb1 + 0, 4); + + } } + + ip0 += 8; ip1 += 8; + ip2 += 8; ip3 += 8; + } + } + + return 0; +} + +static int cinepak_decode_vectors_yuv420 (CinepakContext *s, cvid_strip *strip, + int chunk_id, int size, const uint8_t *data) +{ + const uint8_t *eod = (data + size); + uint32_t flag, mask; + uint8_t *cb0, *cb1, *cb2, *cb3; + int x, y; + char *ip0, *ip1, *ip2, *ip3, + *up01, *up23, *vp01, *vp23; + int selective_update = (chunk_id & 0x01); + int v1_only = (chunk_id & 0x02); + + flag = 0; + mask = 0; + + for (y=strip->y1; y < strip->y2; y+=4) { + +/* take care of y dimension not being multiple of 4, such streams exist */ + ip0 = ip1 = ip2 = ip3 = s->frame->data[0] + + strip->x1*3 + y*s->frame->linesize[0]; + up01 = up23 = s->frame->data[1] + strip->x1 + y/2*s->frame->linesize[1]; + vp01 = vp23 = s->frame->data[2] + strip->x1 + y/2*s->frame->linesize[2]; + if(s->avctx->height - y > 1) { + ip1 = ip0 + s->frame->linesize[0]; + if(s->avctx->height - y > 2) { + ip2 = ip1 + s->frame->linesize[0]; + up23 = up01 + s->frame->linesize[1]; + vp23 = vp01 + s->frame->linesize[2]; + if(s->avctx->height - y > 3) { + ip3 = ip2 + s->frame->linesize[0]; + } + } + } + +/* to get the correct picture for not-multiple-of-4 cases let us fill each + * block from the bottom up, thus possibly overwriting the bottommost line + * more than once but ending with the correct data in place + * (instead of in-loop checking) */ + + for (x=strip->x1; x < strip->x2; x+=4) { + if (selective_update && !(mask >>= 1)) { + if ((data + 4) > eod) + return AVERROR_INVALIDDATA; + + flag = AV_RB32 (data); + data += 4; + mask = 0x80000000; + } + + if (!selective_update || (flag & mask)) { + if (!v1_only && !(mask >>= 1)) { + if ((data + 4) > eod) + return AVERROR_INVALIDDATA; + + flag = AV_RB32 (data); + data += 4; + mask = 0x80000000; + } + + if (v1_only || (~flag & mask)) { + uint8_t *p; + if (data >= eod) + return AVERROR_INVALIDDATA; + + p = strip->v1_codebook.yuv420[*data++]; + ip3[0] = ip3[1] = ip2[0] = ip2[1] = p[2]; + ip3[2] = ip3[3] = ip2[2] = ip2[3] = p[3]; + ip1[0] = ip1[1] = ip0[0] = ip0[1] = p[0]; + ip1[2] = ip1[3] = ip0[2] = ip0[3] = p[1]; + p += 4; + up01[0] = up01[1] = up23[0] = up23[1] = *p++; + vp01[0] = vp01[1] = vp23[0] = vp23[1] = *p++; + + } else if (flag & mask) { + if ((data + 4) > eod) + return AVERROR_INVALIDDATA; + + cb0 = strip->v4_codebook.yuv420[*data++]; + cb1 = strip->v4_codebook.yuv420[*data++]; + cb2 = strip->v4_codebook.yuv420[*data++]; + cb3 = strip->v4_codebook.yuv420[*data++]; + memcpy(ip3 + 0, cb2 + 2, 2); + memcpy(ip3 + 2, cb3 + 2, 2); + memcpy(ip2 + 0, cb2 + 0, 2); + memcpy(ip2 + 2, cb3 + 0, 2); + memcpy(ip1 + 0, cb0 + 2, 2); + memcpy(ip1 + 2, cb1 + 2, 2); + memcpy(ip0 + 0, cb0 + 0, 2); + memcpy(ip0 + 2, cb1 + 0, 2); + cb0 += 4; cb1 += 4; cb2 += 4; cb3 += 4; + up23[0] = *cb2++; vp23[0] = *cb2; + up23[1] = *cb3++; vp23[1] = *cb3; + up01[0] = *cb0++; vp01[0] = *cb0; + up01[1] = *cb1++; vp01[1] = *cb1; + + } + } + + ip0 += 4; ip1 += 4; + ip2 += 4; ip3 += 4; + up01 += 2; up23 += 2; + vp01 += 2; vp23 += 2; + } + } + + return 0; +} + +static int cinepak_decode_vectors_pal8 (CinepakContext *s, cvid_strip *strip, + int chunk_id, int size, const uint8_t *data) +{ + const uint8_t *eod = (data + size); + uint32_t flag, mask; + uint8_t *cb0, *cb1, *cb2, *cb3; + int x, y; + char *ip0, *ip1, *ip2, *ip3; + int selective_update = (chunk_id & 0x01); + int v1_only = (chunk_id & 0x02); + + flag = 0; + mask = 0; + + for (y=strip->y1; y < strip->y2; y+=4) { + +/* take care of y dimension not being multiple of 4, such streams exist */ + ip0 = ip1 = ip2 = ip3 = s->frame->data[0] + + strip->x1 + y*s->frame->linesize[0]; + if(s->avctx->height - y > 1) { + ip1 = ip0 + s->frame->linesize[0]; + if(s->avctx->height - y > 2) { + ip2 = ip1 + s->frame->linesize[0]; + if(s->avctx->height - y > 3) { + ip3 = ip2 + s->frame->linesize[0]; + } + } + } + +/* to get the correct picture for not-multiple-of-4 cases let us fill each + * block from the bottom up, thus possibly overwriting the bottommost line + * more than once but ending with the correct data in place + * (instead of in-loop checking) */ + + for (x=strip->x1; x < strip->x2; x+=4) { + if (selective_update && !(mask >>= 1)) { + if ((data + 4) > eod) + return AVERROR_INVALIDDATA; + + flag = AV_RB32 (data); + data += 4; + mask = 0x80000000; + } + + if (!selective_update || (flag & mask)) { + if (!v1_only && !(mask >>= 1)) { + if ((data + 4) > eod) + return AVERROR_INVALIDDATA; + + flag = AV_RB32 (data); + data += 4; + mask = 0x80000000; + } + + if (v1_only || (~flag & mask)) { + uint8_t *p; + if (data >= eod) + return AVERROR_INVALIDDATA; + + p = strip->v1_codebook.pal8[*data++]; + ip3[0] = ip3[1] = ip2[0] = ip2[1] = p[2]; + ip3[2] = ip3[3] = ip2[2] = ip2[3] = p[3]; + ip1[0] = ip1[1] = ip0[0] = ip0[1] = p[0]; + ip1[2] = ip1[3] = ip0[2] = ip0[3] = p[1]; + + } else if (flag & mask) { + uint8_t *p; + if ((data + 4) > eod) + return AVERROR_INVALIDDATA; + + cb0 = strip->v4_codebook.pal8[*data++]; + cb1 = strip->v4_codebook.pal8[*data++]; + cb2 = strip->v4_codebook.pal8[*data++]; + cb3 = strip->v4_codebook.pal8[*data++]; + p = ip3; + *p++ = cb2[2]; + *p++ = cb2[3]; + *p++ = cb3[2]; + *p = cb3[3]; + p = ip2; + *p++ = cb2[0]; + *p++ = cb2[1]; + *p++ = cb3[0]; + *p = cb3[1]; + p = ip1; + *p++ = cb0[2]; + *p++ = cb0[3]; + *p++ = cb1[2]; + *p = cb1[3]; + p = ip0; + *p++ = cb0[0]; + *p++ = cb0[1]; + *p++ = cb1[0]; + *p = cb1[1]; + + } + } + + ip0 += 4; ip1 += 4; + ip2 += 4; ip3 += 4; } } @@ -290,22 +972,22 @@ static int cinepak_decode_strip (CinepakContext *s, case 0x21: case 0x24: case 0x25: - cinepak_decode_codebook (strip->v4_codebook, chunk_id, - chunk_size, data); + s->decode_codebook(s, &strip->v4_codebook, + chunk_id, chunk_size, data); break; case 0x22: case 0x23: case 0x26: case 0x27: - cinepak_decode_codebook (strip->v1_codebook, chunk_id, - chunk_size, data); + s->decode_codebook (s, &strip->v1_codebook, + chunk_id, chunk_size, data); break; case 0x30: case 0x31: case 0x32: - return cinepak_decode_vectors (s, strip, chunk_id, + return s->decode_vectors (s, strip, chunk_id, chunk_size, data); } @@ -385,9 +1067,9 @@ static int cinepak_decode (CinepakContext *s) strip_size = ((s->data + strip_size) > eod) ? (eod - s->data) : strip_size; if ((i > 0) && !(frame_flags & 0x01)) { - memcpy (s->strips[i].v4_codebook, s->strips[i-1].v4_codebook, + memcpy (&s->strips[i].v4_codebook, &s->strips[i-1].v4_codebook, sizeof(s->strips[i].v4_codebook)); - memcpy (s->strips[i].v1_codebook, s->strips[i-1].v1_codebook, + memcpy (&s->strips[i].v1_codebook, &s->strips[i-1].v1_codebook, sizeof(s->strips[i].v1_codebook)); } @@ -402,9 +1084,34 @@ static int cinepak_decode (CinepakContext *s) return 0; } +/* given a palettized input */ +static const enum AVPixelFormat ff_cinepak_pixfmt_list[] = { + AV_PIX_FMT_RGB24, + AV_PIX_FMT_RGB32, + AV_PIX_FMT_RGB565, + AV_PIX_FMT_YUV420P, + AV_PIX_FMT_PAL8, /* only when input is palettized */ + AV_PIX_FMT_NONE +}; + +/* given a non-palettized input */ +static const enum AVPixelFormat ff_cinepak_pixfmt_list_2[] = { + AV_PIX_FMT_RGB24, + AV_PIX_FMT_RGB32, + AV_PIX_FMT_RGB565, + AV_PIX_FMT_YUV420P, + AV_PIX_FMT_NONE +}; + static av_cold int cinepak_decode_init(AVCodecContext *avctx) { CinepakContext *s = avctx->priv_data; +#ifdef CINEPAK_DECODE_PIXFMT_OVERRIDE + char *out_fmt_override = getenv("CINEPAK_DECODE_PIXFMT_OVERRIDE"); +#endif + +/* we take advantage of VQ to efficiently support + * multiple output formats */ s->avctx = avctx; s->width = (avctx->width + 3) & ~3; @@ -412,13 +1119,84 @@ static av_cold int cinepak_decode_init(AVCodecContext *avctx) s->sega_film_skip_bytes = -1; /* uninitialized state */ - // check for paletted data - if (avctx->bits_per_coded_sample != 8) { - s->palette_video = 0; - avctx->pix_fmt = AV_PIX_FMT_RGB24; - } else { - s->palette_video = 1; - avctx->pix_fmt = AV_PIX_FMT_PAL8; + /* check for paletted data */ + s->palette_video = (avctx->bits_per_coded_sample == 8); +/* av_log(avctx, AV_LOG_INFO, "this is %sa palette video\n", s->palette_video?"":"not "); */ + +/* + * Checking an environment variable for out-of-band control + * of the output pixel format: + * + * get_format() does _not_ help when you can not modify the applications + * to use it, let alone to use it appropriately under varying practical + * curcumstances, there is no general criteria capable to choose + * the most suitable pixel format for each case; + * that's why the availability of an out-of-band control channel + * is extremely useful, sometimes there is no alternative [sic] + * + * leaving this disabled by default, to avoid any surprises, because + * it is not part of the official ABI -- rl */ + +#ifdef CINEPAK_DECODE_PIXFMT_OVERRIDE + if (out_fmt_override && *out_fmt_override) { + if ( !strcmp(out_fmt_override, "rgb32")) { + avctx->pix_fmt = AV_PIX_FMT_RGB32; + } else if (!strcmp(out_fmt_override, "rgb24")) { + avctx->pix_fmt = AV_PIX_FMT_RGB24; + } else if (!strcmp(out_fmt_override, "rgb565")) { + avctx->pix_fmt = AV_PIX_FMT_RGB565; + } else if (!strcmp(out_fmt_override, "yuv420p")) { + avctx->pix_fmt = AV_PIX_FMT_YUV420P; + } else if (!strcmp(out_fmt_override, "pal8")) { + avctx->pix_fmt = AV_PIX_FMT_PAL8; + } else { + av_log(avctx, AV_LOG_ERROR, "Unsupported pixel format override '%s'\n", + out_fmt_override); + return AVERROR(EINVAL); + } + } else +#endif + if (s->out_pixfmt != AV_PIX_FMT_NONE) /* the option is set to something */ + avctx->pix_fmt = s->out_pixfmt; + else + if (s->palette_video) + avctx->pix_fmt = ff_get_format(avctx, ff_cinepak_pixfmt_list); + else + avctx->pix_fmt = ff_get_format(avctx, ff_cinepak_pixfmt_list_2); + + switch (avctx->pix_fmt) { + case AV_PIX_FMT_RGB32: +/* av_log(avctx, AV_LOG_INFO, "Codec output pixel format is rgb32\n"); */ + s->decode_codebook = cinepak_decode_codebook_rgb32; + s->decode_vectors = cinepak_decode_vectors_rgb32; + break; + case AV_PIX_FMT_RGB24: +/* av_log(avctx, AV_LOG_INFO, "Codec output pixel format is rgb24\n"); */ + s->decode_codebook = cinepak_decode_codebook_rgb24; + s->decode_vectors = cinepak_decode_vectors_rgb24; + break; + case AV_PIX_FMT_RGB565: +/* av_log(avctx, AV_LOG_INFO, "Codec output pixel format is rgb565\n"); */ + s->decode_codebook = cinepak_decode_codebook_rgb565; + s->decode_vectors = cinepak_decode_vectors_rgb565; + break; + case AV_PIX_FMT_YUV420P: +/* av_log(avctx, AV_LOG_INFO, "Codec output pixel format is approximate yuv420p\n"); */ + s->decode_codebook = cinepak_decode_codebook_yuv420; + s->decode_vectors = cinepak_decode_vectors_yuv420; + break; + case AV_PIX_FMT_PAL8: + if (!s->palette_video) { + av_log(avctx, AV_LOG_ERROR, "Palettized output not supported without palettized input\n"); + return AVERROR(EINVAL); + } +/* av_log(avctx, AV_LOG_INFO, "Codec output pixel format is pal8\n"); */ + s->decode_codebook = cinepak_decode_codebook_pal8; + s->decode_vectors = cinepak_decode_vectors_pal8; + break; + default: + av_log(avctx, AV_LOG_ERROR, "Unsupported pixel format %d\n", avctx->pix_fmt); + return AVERROR(EINVAL); } s->frame = av_frame_alloc(); @@ -457,7 +1235,7 @@ static int cinepak_decode_frame(AVCodecContext *avctx, av_log(avctx, AV_LOG_ERROR, "cinepak_decode failed\n"); } - if (s->palette_video) + if (avctx->pix_fmt == AV_PIX_FMT_PAL8) memcpy (s->frame->data[1], s->pal, AVPALETTE_SIZE); if ((ret = av_frame_ref(data, s->frame)) < 0) @@ -488,4 +1266,6 @@ AVCodec ff_cinepak_decoder = { .close = cinepak_decode_end, .decode = cinepak_decode_frame, .capabilities = AV_CODEC_CAP_DR1, + .pix_fmts = ff_cinepak_pixfmt_list, + .priv_class = &cinepak_class, };

[FFmpeg-devel] Cinepak: speed up decoding several-fold, depending on the scenario, by supporting multiple output pixel formats.

Commit Message

Comments

Patch