[FFmpeg-devel,1/2] Re: deduplicated [PATCH] Cinepak: speed up decoding several-fold, depending on the scenario, by supporting multiple output pixel formats.

Message ID	20170225200358.GG32749@example.net
State	New
Headers	show Delivered-To: ffmpegpatchwork@gmail.com Received-SPF: pass (google.com: domain of ffmpeg-devel-bounces@ffmpeg.org designates 79.124.17.100 as permitted sender) client-ip=79.124.17.100; Date: Sat, 25 Feb 2017 21:03:58 +0100 From: u-9iep@aetey.se To: FFmpeg development discussions and patches <ffmpeg-devel@ffmpeg.org> Message-ID: <20170225200358.GG32749@example.net> References: <20170213131945.GG5776@nb4> <CAPYw7P4s06ARd=+eBw=nnz6yZgXpoPCa6YE5V-86_jF8erq9yw@mail.gmail.com> <20170213175139.GB32749@example.net> <20170214065146.0810cb48@debian> <20170214085154.GD32749@example.net> <20170214100003.013390c4@debian> <CAPYw7P6vOLKix3OYr0VCxCsB_h+AokvdOmNHPZ_eMs-pSCjMJA@mail.gmail.com> <20170214111404.GE32749@example.net> <CAEEMt2=shogv+wN09G1D_xvLDV2PJ44F1eHsuq7=f3sM7UfiyQ@mail.gmail.com> <20170215110155.GG32749@example.net> MIME-Version: 1.0 Content-Type: multipart/mixed; boundary="5vNYLRcllDrimb99" Content-Disposition: inline In-Reply-To: <20170215110155.GG32749@example.net> Subject: [FFmpeg-devel] [PATCH 1/2] Re: deduplicated [PATCH] Cinepak: speed up decoding several-fold, depending on the scenario, by supporting multiple output pixel formats. Precedence: list Reply-To: FFmpeg development discussions and patches <ffmpeg-devel@ffmpeg.org> Errors-To: ffmpeg-devel-bounces@ffmpeg.org Sender: "ffmpeg-devel" <ffmpeg-devel-bounces@ffmpeg.org>

diff --git a/libavcodec/cinepak.c b/libavcodec/cinepak.c index d657e9c0c1..97836b3ab1 100644 --- a/libavcodec/cinepak.c +++ b/libavcodec/cinepak.c @@ -31,6 +31,8 @@ * * Cinepak colorspace support (c) 2013 Rl, Aetey Global Technologies AB * @author Cinepak colorspace, Rl, Aetey Global Technologies AB + * Extra output formats / optimizations (c) 2017 Rl, Aetey Global Technologies AB + * @author Extra output formats / optimizations, Rl, Aetey Global Technologies AB */ #include <stdio.h> @@ -39,23 +41,49 @@ #include "libavutil/common.h" #include "libavutil/intreadwrite.h" +#include "libavutil/opt.h" +#include "libavutil/pixdesc.h" #include "avcodec.h" #include "internal.h" +/* rounding to nearest; truncation would be slightly faster + * but it noticeably affects the picture quality; + * unless we become extremely desperate to use every single cycle + * we do not bother implementing a choice -- rl */ +#define PACK_RGB_RGB565(r,g,b) (((av_clip_uint8((r)+4)>>3)<<11)|((av_clip_uint8((g)+2)>>2)<<5)|(av_clip_uint8((b)+4)>>3)) -typedef uint8_t cvid_codebook[12]; +/* + * more "desperate/ultimate" optimization possibilites: + * - possibly (hardly?) spare a cycle or two by not ensuring to stay + * inside the frame at vector decoding (the frame is allocated with + * a margin for us as an extra precaution, we can as well use this) + * - skip filling in opacity when it is not needed by the data consumer, + * in many cases rgb32 is almost as fast as rgb565, with full quality, + * improving its speed can make sense + * - add "fast rgb565" with truncation instead of rounding + */ + +typedef union cvid_codebook { + uint32_t rgb32[256][ 4]; + uint8_t rgb24[256][12]; + uint16_t rgb565[256][ 4]; + uint8_t yuv420p[256][ 6]; + uint8_t pal8[256][ 4]; +} cvid_codebook; -#define MAX_STRIPS 32 +#define MAX_STRIPS 32 /* an arbitrary limit -- rl */ typedef struct cvid_strip { uint16_t id; uint16_t x1, y1; uint16_t x2, y2; - cvid_codebook v4_codebook[256]; - cvid_codebook v1_codebook[256]; + cvid_codebook v4_codebook; + cvid_codebook v1_codebook; } cvid_strip; -typedef struct CinepakContext { +typedef struct CinepakContext CinepakContext; +struct CinepakContext { + const AVClass *class; AVCodecContext *avctx; AVFrame *frame; @@ -71,57 +99,259 @@ typedef struct CinepakContext { int sega_film_skip_bytes; uint32_t pal[256]; -} CinepakContext; -static void cinepak_decode_codebook (cvid_codebook *codebook, - int chunk_id, int size, const uint8_t *data) -{ - const uint8_t *eod = (data + size); - uint32_t flag, mask; - int i, n; - uint8_t *p; - - /* check if this chunk contains 4- or 6-element vectors */ - n = (chunk_id & 0x04) ? 4 : 6; - flag = 0; - mask = 0; - - p = codebook[0]; - for (i=0; i < 256; i++) { - if ((chunk_id & 0x01) && !(mask >>= 1)) { - if ((data + 4) > eod) - break; - - flag = AV_RB32 (data); - data += 4; - mask = 0x80000000; + void (*decode_codebook)(CinepakContext *s, + cvid_codebook *codebook, int chunk_id, + int size, const uint8_t *data); + int (*decode_vectors)(CinepakContext *s, cvid_strip *strip, + int chunk_id, int size, const uint8_t *data); +/* options */ + enum AVPixelFormat out_pixfmt; +}; + +#define OFFSET(x) offsetof(CinepakContext, x) +#define VD AV_OPT_FLAG_VIDEO_PARAM | AV_OPT_FLAG_DECODING_PARAM +static const AVOption options[] = { +{"output_pixel_format", "set output pixel format like rgb24/rgb32/rgb565/yuv420p/pal8; yuv420p is approximate", OFFSET(out_pixfmt), AV_OPT_TYPE_PIXEL_FMT, {.i64=AV_PIX_FMT_NONE}, -1, INT_MAX, VD }, + { NULL }, +}; + +static const AVClass cinepak_class = { + .class_name = "cinepak decoder", + .item_name = av_default_item_name, + .option = options, + .version = LIBAVUTIL_VERSION_INT, +}; + +/* this is an attempt to reduce code duplication + * feel free to do this in a more elegant fashion, but keep the speed + * -- rl */ +#define CODEBOOK_PROLOGUE(pixel_format) \ +static void cinepak_decode_codebook_##pixel_format (CinepakContext *s,\ + cvid_codebook *codebook, int chunk_id, int size, const uint8_t *data)\ +{\ + const uint8_t *eod;\ + uint32_t flag, mask;\ + int selective_update;\ + int i;\ + +#define DECODE_CODEBOOK(pixel_format) \ +CODEBOOK_PROLOGUE(pixel_format) \ + int n;\ + int palette_video;\ + +#define DECODE_CODEBOOK_PAL8 \ +CODEBOOK_PROLOGUE(pal8) \ + +#define CODEBOOK_STREAM_PARSING \ + for (i=0; i < 256; i++) {\ + if (selective_update && !(mask >>= 1)) {\ + if ((data + 4) > eod)\ + break;\ +\ + flag = AV_RB32 (data);\ + data += 4;\ + mask = 0x80000000;\ + }\ +\ + if (!selective_update || (flag & mask)) {\ + int k;\ +\ + if ((data + n) > eod)\ + break;\ + +#define CODEBOOK_INTRO \ + selective_update = (chunk_id & 0x01);\ + eod = (data + size);\ + flag = 0;\ + mask = 0;\ + +#define CODEBOOK_FULL_COLOR \ + /* check if this chunk contains 4- or 6-element vectors */\ + n = (chunk_id & 0x04) ? 4 : 6;\ + palette_video = s->palette_video;\ + CODEBOOK_INTRO\ + CODEBOOK_STREAM_PARSING\ + +#define DECODE_VECTORS(pixel_format) \ +static int cinepak_decode_vectors_##pixel_format (CinepakContext *s, cvid_strip *strip,\ + int chunk_id, int size, const uint8_t *data)\ +{\ + const uint8_t *eod;\ + uint32_t flag, mask;\ + int x, y;\ + char *ip0, *ip1, *ip2, *ip3;\ + int selective_update;\ + int v1_only;\ + +#define VECTOR_INTRO \ + CODEBOOK_INTRO\ + v1_only = (chunk_id & 0x02);\ +\ + for (y=strip->y1; y < strip->y2; y+=4) {\ + +#define VECTOR_STREAM_PARSING \ + for (x=strip->x1; x < strip->x2; x+=4) {\ + if (selective_update && !(mask >>= 1)) {\ + if ((data + 4) > eod)\ + return AVERROR_INVALIDDATA;\ +\ + flag = AV_RB32 (data);\ + data += 4;\ + mask = 0x80000000;\ + }\ +\ + if (!selective_update || (flag & mask)) {\ + if (!v1_only && !(mask >>= 1)) {\ + if ((data + 4) > eod)\ + return AVERROR_INVALIDDATA;\ +\ + flag = AV_RB32 (data);\ + data += 4;\ + mask = 0x80000000;\ + }\ +\ + if (v1_only || (~flag & mask)) {\ + POINTER_TYPE *p;\ + if (data >= eod)\ + return AVERROR_INVALIDDATA;\ + +#define VECTOR_DO \ +/* take care of y dimension not being multiple of 4, such streams exist */\ + if(s->avctx->height - y > 1) {\ + ip1 = ip0 + s->frame->linesize[0];\ + if(s->avctx->height - y > 2) {\ + ip2 = ip1 + s->frame->linesize[0];\ + if(s->avctx->height - y > 3) {\ + ip3 = ip2 + s->frame->linesize[0];\ + }\ + }\ + }\ +/* to get the correct picture for not-multiple-of-4 cases let us fill each\ + * block from the bottom up, thus possibly overwriting the bottommost line\ + * more than once but ending with the correct data in place\ + * (instead of in-loop checking) */\ + VECTOR_STREAM_PARSING\ + +DECODE_CODEBOOK(rgb32) + uint32_t *p = codebook->rgb32[0]; + + CODEBOOK_FULL_COLOR + + if (n == 4) + if (palette_video) + for (k = 0; k < 4; ++k) + *p++ = s->pal[*data++]; /* this is easy */ + else + for (k = 0; k < 4; ++k) { + int r = *data++; +/* in some situations we might not have to set opacity */ + *p++ = /**/ (255<<24)| /**/ (r<<16)|(r<<8)|r; + } + else { /* n == 6 */ + int y, u, v; + u = (int8_t)data[4]; + v = (int8_t)data[5]; + for(k=0; k<4; ++k) { + y = *data++; +/* in some situations we might not have to set opacity */ + *p++ = /**/ (255<<24)| /**/ +/* here the cinepak color space excels */ + (av_clip_uint8(y + v*2)<<16)| + (av_clip_uint8(y - (u/2) - v)<<8)| + av_clip_uint8(y + u*2); + } + data += 2; + } + } else { + p += 4; } + } +} - if (!(chunk_id & 0x01) || (flag & mask)) { - int k, kk; +DECODE_VECTORS(rgb32) + uint32_t *cb0, *cb1, *cb2, *cb3; - if ((data + n) > eod) - break; + VECTOR_INTRO - for (k = 0; k < 4; ++k) { - int r = *data++; - for (kk = 0; kk < 3; ++kk) - *p++ = r; + ip0 = ip1 = ip2 = ip3 = s->frame->data[0] + + strip->x1*4 + y*s->frame->linesize[0]; +#define POINTER_TYPE uint32_t + VECTOR_DO +#undef POINTER_TYPE + + p = strip->v1_codebook.rgb32[*data++] + 2; /* ... + 8 */ + memcpy(ip3 + 0, p, 4); memcpy(ip3 + 4, p, 4); + memcpy(ip2 + 0, p, 4); memcpy(ip2 + 4, p, 4); + p += 1; /* ... + 12 */ + memcpy(ip3 + 8, p, 4); memcpy(ip3 + 12, p, 4); + memcpy(ip2 + 8, p, 4); memcpy(ip2 + 12, p, 4); + p -= 3; /* ... + 0 */ + memcpy(ip1 + 0, p, 4); memcpy(ip1 + 4, p, 4); + memcpy(ip0 + 0, p, 4); memcpy(ip0 + 4, p, 4); + p += 1; /* ... + 4 */ + memcpy(ip1 + 8, p, 4); memcpy(ip1 + 12, p, 4); + memcpy(ip0 + 8, p, 4); memcpy(ip0 + 12, p, 4); + + } else if (flag & mask) { + if ((data + 4) > eod) + return AVERROR_INVALIDDATA; + + cb0 = strip->v4_codebook.rgb32[*data++]; + cb1 = strip->v4_codebook.rgb32[*data++]; + cb2 = strip->v4_codebook.rgb32[*data++]; + cb3 = strip->v4_codebook.rgb32[*data++]; + memcpy(ip3 + 0, cb2 + 2, 8); + memcpy(ip3 + 8, cb3 + 2, 8); + memcpy(ip2 + 0, cb2 + 0, 8); + memcpy(ip2 + 8, cb3 + 0, 8); + memcpy(ip1 + 0, cb0 + 2, 8); + memcpy(ip1 + 8, cb1 + 2, 8); + memcpy(ip0 + 0, cb0 + 0, 8); + memcpy(ip0 + 8, cb1 + 0, 8); + + } } - if (n == 6) { - int r, g, b, u, v; - u = *(int8_t *)data++; - v = *(int8_t *)data++; - p -= 12; + + ip0 += 16; ip1 += 16; + ip2 += 16; ip3 += 16; + } + } + + return 0; +} + +DECODE_CODEBOOK(rgb24) + uint8_t *p = codebook->rgb24[0]; + + CODEBOOK_FULL_COLOR + + if (n == 4) + if (palette_video) + for (k = 0; k < 4; ++k) { + uint32_t r = s->pal[*data++]; + *p++ = (r>>16)&0xff; + *p++ = (r>>8) &0xff; + *p++ = r &0xff; + } + else + for (k = 0; k < 4; ++k) { + int kk, r = *data++; + for (kk = 0; kk < 3; ++kk) + *p++ = r; + } + else { /* n == 6 */ + int y, u, v; + u = (int8_t)data[4]; + v = (int8_t)data[5]; for(k=0; k<4; ++k) { - r = *p++ + v*2; - g = *p++ - (u/2) - v; - b = *p + u*2; - p -= 2; - *p++ = av_clip_uint8(r); - *p++ = av_clip_uint8(g); - *p++ = av_clip_uint8(b); + y = *data++; +/* here the cinepak color space excels */ + *p++ = av_clip_uint8(y + v*2); + *p++ = av_clip_uint8(y - (u/2) - v); + *p++ = av_clip_uint8(y + u*2); } + data += 2; } } else { p += 12; @@ -129,134 +359,344 @@ static void cinepak_decode_codebook (cvid_codebook *codebook, } } -static int cinepak_decode_vectors (CinepakContext *s, cvid_strip *strip, - int chunk_id, int size, const uint8_t *data) -{ - const uint8_t *eod = (data + size); - uint32_t flag, mask; +DECODE_VECTORS(rgb24) uint8_t *cb0, *cb1, *cb2, *cb3; - int x, y; - char *ip0, *ip1, *ip2, *ip3; - flag = 0; - mask = 0; + VECTOR_INTRO + + ip0 = ip1 = ip2 = ip3 = s->frame->data[0] + + strip->x1*3 + y*s->frame->linesize[0]; + +#define POINTER_TYPE uint8_t + VECTOR_DO +#undef POINTER_TYPE + + p = strip->v1_codebook.rgb24[*data++] + 6; + memcpy(ip3 + 0, p, 3); memcpy(ip3 + 3, p, 3); + memcpy(ip2 + 0, p, 3); memcpy(ip2 + 3, p, 3); + p += 3; /* ... + 9 */ + memcpy(ip3 + 6, p, 3); memcpy(ip3 + 9, p, 3); + memcpy(ip2 + 6, p, 3); memcpy(ip2 + 9, p, 3); + p -= 9; /* ... + 0 */ + memcpy(ip1 + 0, p, 3); memcpy(ip1 + 3, p, 3); + memcpy(ip0 + 0, p, 3); memcpy(ip0 + 3, p, 3); + p += 3; /* ... + 3 */ + memcpy(ip1 + 6, p, 3); memcpy(ip1 + 9, p, 3); + memcpy(ip0 + 6, p, 3); memcpy(ip0 + 9, p, 3); + + } else if (flag & mask) { + if ((data + 4) > eod) + return AVERROR_INVALIDDATA; + + cb0 = strip->v4_codebook.rgb24[*data++]; + cb1 = strip->v4_codebook.rgb24[*data++]; + cb2 = strip->v4_codebook.rgb24[*data++]; + cb3 = strip->v4_codebook.rgb24[*data++]; + memcpy(ip3 + 0, cb2 + 6, 6); + memcpy(ip3 + 6, cb3 + 6, 6); + memcpy(ip2 + 0, cb2 + 0, 6); + memcpy(ip2 + 6, cb3 + 0, 6); + memcpy(ip1 + 0, cb0 + 6, 6); + memcpy(ip1 + 6, cb1 + 6, 6); + memcpy(ip0 + 0, cb0 + 0, 6); + memcpy(ip0 + 6, cb1 + 0, 6); + + } + } + + ip0 += 12; ip1 += 12; + ip2 += 12; ip3 += 12; + } + } + + return 0; +} + +DECODE_CODEBOOK(rgb565) + uint16_t *p = codebook->rgb565[0]; - for (y=strip->y1; y < strip->y2; y+=4) { + CODEBOOK_FULL_COLOR + + if (n == 4) + if (palette_video) + for (k = 0; k < 4; ++k) { + uint32_t r = s->pal[*data++]; + *p++ = PACK_RGB_RGB565((r>>16)&0xff, + (r>>8)&0xff, + r&0xff); + } + else + for (k = 0; k < 4; ++k) { + int r = *data++; + *p++ = PACK_RGB_RGB565(r,r,r); + } + else { /* n == 6 */ + int y, u, v; + u = (int8_t)data[4]; + v = (int8_t)data[5]; + for(k=0; k<4; ++k) { + y = *data++; +/* here the cinepak color space excels */ + *p++ = PACK_RGB_RGB565(y + v*2, + y - (u/2) - v, + y + u*2); + } + data += 2; + } + } else { + p += 4; + } + } +} + +DECODE_VECTORS(rgb565) + uint16_t *cb0, *cb1, *cb2, *cb3; + + VECTOR_INTRO -/* take care of y dimension not being multiple of 4, such streams exist */ ip0 = ip1 = ip2 = ip3 = s->frame->data[0] + - (s->palette_video?strip->x1:strip->x1*3) + (y * s->frame->linesize[0]); + strip->x1*2 + y*s->frame->linesize[0]; + +#define POINTER_TYPE uint16_t + VECTOR_DO +#undef POINTER_TYPE + + p = strip->v1_codebook.rgb565[*data++]; + * (uint16_t *)ip3 = *((uint16_t *)ip3+1) = + * (uint16_t *)ip2 = *((uint16_t *)ip2+1) = p[2]; + *((uint16_t *)ip3+2) = *((uint16_t *)ip3+3) = + *((uint16_t *)ip2+2) = *((uint16_t *)ip2+3) = p[3]; + * (uint16_t *)ip1 = *((uint16_t *)ip1+1) = + * (uint16_t *)ip0 = *((uint16_t *)ip0+1) = p[0]; + *((uint16_t *)ip1+2) = *((uint16_t *)ip1+3) = + *((uint16_t *)ip0+2) = *((uint16_t *)ip0+3) = p[1]; + + } else if (flag & mask) { + if ((data + 4) > eod) + return AVERROR_INVALIDDATA; + + cb0 = strip->v4_codebook.rgb565[*data++]; + cb1 = strip->v4_codebook.rgb565[*data++]; + cb2 = strip->v4_codebook.rgb565[*data++]; + cb3 = strip->v4_codebook.rgb565[*data++]; + memcpy(ip3 + 0, cb2 + 2, 4); + memcpy(ip3 + 4, cb3 + 2, 4); + memcpy(ip2 + 0, cb2 + 0, 4); + memcpy(ip2 + 4, cb3 + 0, 4); + memcpy(ip1 + 0, cb0 + 2, 4); + memcpy(ip1 + 4, cb1 + 2, 4); + memcpy(ip0 + 0, cb0 + 0, 4); + memcpy(ip0 + 4, cb1 + 0, 4); + + } + } + + ip0 += 8; ip1 += 8; + ip2 += 8; ip3 += 8; + } + } + + return 0; +} + +/* a simplistic version to begin with, it is also fast -- rl */ +DECODE_CODEBOOK(yuv420p) + uint8_t *p = codebook->yuv420p[0]; + + CODEBOOK_FULL_COLOR + + if (n == 4) + if (palette_video) { +/* here we have kind of "more" data than the output format can express */ + int r, g, b, u = 0, v = 0; + for (k = 0; k < 4; ++k) { + uint32_t rr = s->pal[*data++]; + r = (rr>>16)&0xff; + g = (rr>>8) &0xff; + b = rr &0xff; +/* calculate the components (https://en.wikipedia.org/wiki/YUV) */ + *p++ = ((r*66+g*129+b*25+128)>>8)+16; + u += (-r*38-g*74+b*112+128)>>8; + v += (r*112-g*94-b*18+128)>>8; + } + *p++ = (u+2)/4+128; + *p++ = (v+2)/4+128; + } else { /* grayscale, easy */ + for (k = 0; k < 4; ++k) { + *p++ = *data++; + } + *p++ = 128; + *p++ = 128; + } + else { /* n == 6 */ +/* here we'd have to handle double format conversion + * Cinepak=>rgb24 and then rgb24=>yuv420p, which can not be shortcut; + * for the moment just copying as-is, for simplicity and speed, + * color will be slightly off but not much */ + *p++ = *data++; + *p++ = *data++; + *p++ = *data++; + *p++ = *data++; + *p++ = *data++ + 128; + *p++ = *data++ + 128; + } + } else { + p += 6; + } + } +} + +DECODE_VECTORS(yuv420p) + uint8_t *cb0, *cb1, *cb2, *cb3; + char *up01, *up23, *vp01, *vp23; + + VECTOR_INTRO + + ip0 = ip1 = ip2 = ip3 = s->frame->data[0] + + strip->x1*3 + y*s->frame->linesize[0]; + up01 = up23 = s->frame->data[1] + strip->x1 + y/2*s->frame->linesize[1]; + vp01 = vp23 = s->frame->data[2] + strip->x1 + y/2*s->frame->linesize[2]; if(s->avctx->height - y > 1) { ip1 = ip0 + s->frame->linesize[0]; if(s->avctx->height - y > 2) { ip2 = ip1 + s->frame->linesize[0]; + up23 = up01 + s->frame->linesize[1]; + vp23 = vp01 + s->frame->linesize[2]; if(s->avctx->height - y > 3) { ip3 = ip2 + s->frame->linesize[0]; } } } + /* to get the correct picture for not-multiple-of-4 cases let us fill each * block from the bottom up, thus possibly overwriting the bottommost line * more than once but ending with the correct data in place * (instead of in-loop checking) */ - for (x=strip->x1; x < strip->x2; x+=4) { - if ((chunk_id & 0x01) && !(mask >>= 1)) { - if ((data + 4) > eod) - return AVERROR_INVALIDDATA; +#define POINTER_TYPE uint8_t + VECTOR_STREAM_PARSING +#undef POINTER_TYPE - flag = AV_RB32 (data); - data += 4; - mask = 0x80000000; - } + p = strip->v1_codebook.yuv420p[*data++]; + ip3[0] = ip3[1] = ip2[0] = ip2[1] = p[2]; + ip3[2] = ip3[3] = ip2[2] = ip2[3] = p[3]; + ip1[0] = ip1[1] = ip0[0] = ip0[1] = p[0]; + ip1[2] = ip1[3] = ip0[2] = ip0[3] = p[1]; + p += 4; + up01[0] = up01[1] = up23[0] = up23[1] = *p++; + vp01[0] = vp01[1] = vp23[0] = vp23[1] = *p++; - if (!(chunk_id & 0x01) || (flag & mask)) { - if (!(chunk_id & 0x02) && !(mask >>= 1)) { + } else if (flag & mask) { if ((data + 4) > eod) return AVERROR_INVALIDDATA; - flag = AV_RB32 (data); - data += 4; - mask = 0x80000000; + cb0 = strip->v4_codebook.yuv420p[*data++]; + cb1 = strip->v4_codebook.yuv420p[*data++]; + cb2 = strip->v4_codebook.yuv420p[*data++]; + cb3 = strip->v4_codebook.yuv420p[*data++]; + memcpy(ip3 + 0, cb2 + 2, 2); + memcpy(ip3 + 2, cb3 + 2, 2); + memcpy(ip2 + 0, cb2 + 0, 2); + memcpy(ip2 + 2, cb3 + 0, 2); + memcpy(ip1 + 0, cb0 + 2, 2); + memcpy(ip1 + 2, cb1 + 2, 2); + memcpy(ip0 + 0, cb0 + 0, 2); + memcpy(ip0 + 2, cb1 + 0, 2); + cb0 += 4; cb1 += 4; cb2 += 4; cb3 += 4; + up23[0] = *cb2++; vp23[0] = *cb2; + up23[1] = *cb3++; vp23[1] = *cb3; + up01[0] = *cb0++; vp01[0] = *cb0; + up01[1] = *cb1++; vp01[1] = *cb1; + } + } - if ((chunk_id & 0x02) || (~flag & mask)) { - uint8_t *p; - if (data >= eod) - return AVERROR_INVALIDDATA; + ip0 += 4; ip1 += 4; + ip2 += 4; ip3 += 4; + up01 += 2; up23 += 2; + vp01 += 2; vp23 += 2; + } + } - p = strip->v1_codebook[*data++]; - if (s->palette_video) { - ip3[0] = ip3[1] = ip2[0] = ip2[1] = p[6]; - ip3[2] = ip3[3] = ip2[2] = ip2[3] = p[9]; - ip1[0] = ip1[1] = ip0[0] = ip0[1] = p[0]; - ip1[2] = ip1[3] = ip0[2] = ip0[3] = p[3]; - } else { - p += 6; - memcpy(ip3 + 0, p, 3); memcpy(ip3 + 3, p, 3); - memcpy(ip2 + 0, p, 3); memcpy(ip2 + 3, p, 3); - p += 3; /* ... + 9 */ - memcpy(ip3 + 6, p, 3); memcpy(ip3 + 9, p, 3); - memcpy(ip2 + 6, p, 3); memcpy(ip2 + 9, p, 3); - p -= 9; /* ... + 0 */ - memcpy(ip1 + 0, p, 3); memcpy(ip1 + 3, p, 3); - memcpy(ip0 + 0, p, 3); memcpy(ip0 + 3, p, 3); - p += 3; /* ... + 3 */ - memcpy(ip1 + 6, p, 3); memcpy(ip1 + 9, p, 3); - memcpy(ip0 + 6, p, 3); memcpy(ip0 + 9, p, 3); - } + return 0; +} + +/* here we do not expect anything besides palettized video, + * nor check the data for validity, which should be ok, + * to the best of our knowledge we do not write beyond the bounds */ +DECODE_CODEBOOK_PAL8 + uint8_t *p = codebook->pal8[0]; + +#define PAL8_VECTOR_LENGTH 4 +#define n PAL8_VECTOR_LENGTH +/* here we assume but do not have to assert: av_assert0(chunk_id & 0x04); */ + + CODEBOOK_INTRO + CODEBOOK_STREAM_PARSING + +#undef n + + for (k = 0; k < 4; ++k) + *p++ = *data++; + } else { + p += 4; + } + } +} + +DECODE_VECTORS(pal8) + uint8_t *cb0, *cb1, *cb2, *cb3; + + VECTOR_INTRO + + ip0 = ip1 = ip2 = ip3 = s->frame->data[0] + + strip->x1 + y*s->frame->linesize[0]; + +#define POINTER_TYPE uint8_t + VECTOR_DO +#undef POINTER_TYPE + + p = strip->v1_codebook.pal8[*data++]; + ip3[0] = ip3[1] = ip2[0] = ip2[1] = p[2]; + ip3[2] = ip3[3] = ip2[2] = ip2[3] = p[3]; + ip1[0] = ip1[1] = ip0[0] = ip0[1] = p[0]; + ip1[2] = ip1[3] = ip0[2] = ip0[3] = p[1]; } else if (flag & mask) { + uint8_t *p; if ((data + 4) > eod) return AVERROR_INVALIDDATA; - cb0 = strip->v4_codebook[*data++]; - cb1 = strip->v4_codebook[*data++]; - cb2 = strip->v4_codebook[*data++]; - cb3 = strip->v4_codebook[*data++]; - if (s->palette_video) { - uint8_t *p; - p = ip3; - *p++ = cb2[6]; - *p++ = cb2[9]; - *p++ = cb3[6]; - *p = cb3[9]; - p = ip2; - *p++ = cb2[0]; - *p++ = cb2[3]; - *p++ = cb3[0]; - *p = cb3[3]; - p = ip1; - *p++ = cb0[6]; - *p++ = cb0[9]; - *p++ = cb1[6]; - *p = cb1[9]; - p = ip0; - *p++ = cb0[0]; - *p++ = cb0[3]; - *p++ = cb1[0]; - *p = cb1[3]; - } else { - memcpy(ip3 + 0, cb2 + 6, 6); - memcpy(ip3 + 6, cb3 + 6, 6); - memcpy(ip2 + 0, cb2 + 0, 6); - memcpy(ip2 + 6, cb3 + 0, 6); - memcpy(ip1 + 0, cb0 + 6, 6); - memcpy(ip1 + 6, cb1 + 6, 6); - memcpy(ip0 + 0, cb0 + 0, 6); - memcpy(ip0 + 6, cb1 + 0, 6); - } + cb0 = strip->v4_codebook.pal8[*data++]; + cb1 = strip->v4_codebook.pal8[*data++]; + cb2 = strip->v4_codebook.pal8[*data++]; + cb3 = strip->v4_codebook.pal8[*data++]; + p = ip3; + *p++ = cb2[2]; + *p++ = cb2[3]; + *p++ = cb3[2]; + *p = cb3[3]; + p = ip2; + *p++ = cb2[0]; + *p++ = cb2[1]; + *p++ = cb3[0]; + *p = cb3[1]; + p = ip1; + *p++ = cb0[2]; + *p++ = cb0[3]; + *p++ = cb1[2]; + *p = cb1[3]; + p = ip0; + *p++ = cb0[0]; + *p++ = cb0[1]; + *p++ = cb1[0]; + *p = cb1[1]; } } - if (s->palette_video) { - ip0 += 4; ip1 += 4; - ip2 += 4; ip3 += 4; - } else { - ip0 += 12; ip1 += 12; - ip2 += 12; ip3 += 12; - } + ip0 += 4; ip1 += 4; + ip2 += 4; ip3 += 4; } } @@ -290,22 +730,22 @@ static int cinepak_decode_strip (CinepakContext *s, case 0x21: case 0x24: case 0x25: - cinepak_decode_codebook (strip->v4_codebook, chunk_id, - chunk_size, data); + s->decode_codebook(s, &strip->v4_codebook, + chunk_id, chunk_size, data); break; case 0x22: case 0x23: case 0x26: case 0x27: - cinepak_decode_codebook (strip->v1_codebook, chunk_id, - chunk_size, data); + s->decode_codebook (s, &strip->v1_codebook, + chunk_id, chunk_size, data); break; case 0x30: case 0x31: case 0x32: - return cinepak_decode_vectors (s, strip, chunk_id, + return s->decode_vectors (s, strip, chunk_id, chunk_size, data); } @@ -385,9 +825,9 @@ static int cinepak_decode (CinepakContext *s) strip_size = ((s->data + strip_size) > eod) ? (eod - s->data) : strip_size; if ((i > 0) && !(frame_flags & 0x01)) { - memcpy (s->strips[i].v4_codebook, s->strips[i-1].v4_codebook, + memcpy (&s->strips[i].v4_codebook, &s->strips[i-1].v4_codebook, sizeof(s->strips[i].v4_codebook)); - memcpy (s->strips[i].v1_codebook, s->strips[i-1].v1_codebook, + memcpy (&s->strips[i].v1_codebook, &s->strips[i-1].v1_codebook, sizeof(s->strips[i].v1_codebook)); } @@ -402,25 +842,72 @@ static int cinepak_decode (CinepakContext *s) return 0; } +/* given a palettized input */ +static const enum AVPixelFormat pixfmt_list[] = { + AV_PIX_FMT_RGB24, + AV_PIX_FMT_RGB32, + AV_PIX_FMT_RGB565, + AV_PIX_FMT_YUV420P, + AV_PIX_FMT_PAL8, /* only when input is palettized */ + AV_PIX_FMT_NONE +}; + +/* given a non-palettized input */ +static const enum AVPixelFormat pixfmt_list_2[] = { + AV_PIX_FMT_RGB24, + AV_PIX_FMT_RGB32, + AV_PIX_FMT_RGB565, + AV_PIX_FMT_YUV420P, + AV_PIX_FMT_NONE +}; + static av_cold int cinepak_decode_init(AVCodecContext *avctx) { CinepakContext *s = avctx->priv_data; +/* we take advantage of VQ to efficiently support + * multiple output formats */ + s->avctx = avctx; s->width = (avctx->width + 3) & ~3; s->height = (avctx->height + 3) & ~3; s->sega_film_skip_bytes = -1; /* uninitialized state */ - // check for paletted data - if (avctx->bits_per_coded_sample != 8) { - s->palette_video = 0; - avctx->pix_fmt = AV_PIX_FMT_RGB24; - } else { - s->palette_video = 1; - avctx->pix_fmt = AV_PIX_FMT_PAL8; + /* check for paletted data */ + s->palette_video = (avctx->bits_per_coded_sample == 8); + + if (s->out_pixfmt != AV_PIX_FMT_NONE) /* the option is set to something */ + avctx->pix_fmt = s->out_pixfmt; + else + if (s->palette_video) + avctx->pix_fmt = ff_get_format(avctx, pixfmt_list); + else + avctx->pix_fmt = ff_get_format(avctx, pixfmt_list_2); + +#define DECODE_TO(pixel_format) \ + s->decode_codebook = cinepak_decode_codebook_##pixel_format;\ + s->decode_vectors = cinepak_decode_vectors_##pixel_format;\ + break;\ + + switch (avctx->pix_fmt) { + case AV_PIX_FMT_RGB32: DECODE_TO(rgb32) + case AV_PIX_FMT_RGB24: DECODE_TO(rgb24) + case AV_PIX_FMT_RGB565: DECODE_TO(rgb565) + case AV_PIX_FMT_YUV420P: DECODE_TO(yuv420p) + case AV_PIX_FMT_PAL8: + if (!s->palette_video) { + av_log(avctx, AV_LOG_ERROR, "Palettized output not supported without palettized input\n"); + return AVERROR(EINVAL); + } + DECODE_TO(pal8) + default: + av_log(avctx, AV_LOG_ERROR, "Unsupported pixel format %s\n", av_get_pix_fmt_name(avctx->pix_fmt)); + return AVERROR(EINVAL); } +#undef DECODE_TO + s->frame = av_frame_alloc(); if (!s->frame) return AVERROR(ENOMEM); @@ -457,7 +944,7 @@ static int cinepak_decode_frame(AVCodecContext *avctx, av_log(avctx, AV_LOG_ERROR, "cinepak_decode failed\n"); } - if (s->palette_video) + if (avctx->pix_fmt == AV_PIX_FMT_PAL8) memcpy (s->frame->data[1], s->pal, AVPALETTE_SIZE); if ((ret = av_frame_ref(data, s->frame)) < 0) @@ -488,4 +975,5 @@ AVCodec ff_cinepak_decoder = { .close = cinepak_decode_end, .decode = cinepak_decode_frame, .capabilities = AV_CODEC_CAP_DR1, + .priv_class = &cinepak_class, };

[FFmpeg-devel,1/2] Re: deduplicated [PATCH] Cinepak: speed up decoding several-fold, depending on the scenario, by supporting multiple output pixel formats.

Commit Message

Comments

Patch