From 2b6ac4f7093157533b7f279a78a73bfabeb98cf0 Mon Sep 17 00:00:00 2001
From: Paul B Mahol <onemda@gmail.com>
Date: Tue, 15 Aug 2023 21:13:59 +0200
Subject: [PATCH] avcodec/tta: switch to planar sample formats
Makes decoding a few percent faster.
Also fix code style while here.
Signed-off-by: Paul B Mahol <onemda@gmail.com>
---
libavcodec/tta.c | 167 +++++++++++++++++++++++++++++++----------------
1 file changed, 109 insertions(+), 58 deletions(-)
@@ -55,7 +55,7 @@ typedef struct TTAContext {
unsigned data_length;
int frame_length, last_frame_length;
- int32_t *decode_buffer;
+ int32_t **decode_buffer;
uint8_t crc_pass[8];
uint8_t *pass;
@@ -107,10 +107,16 @@ static int allocate_buffers(AVCodecContext *avctx)
TTAContext *s = avctx->priv_data;
if (s->bps < 3) {
- s->decode_buffer = av_calloc(s->frame_length,
- sizeof(*s->decode_buffer) * s->channels);
+ s->decode_buffer = av_calloc(s->channels, sizeof(*s->decode_buffer));
if (!s->decode_buffer)
return AVERROR(ENOMEM);
+
+ for (int ch = 0; ch < s->channels; ch++) {
+ s->decode_buffer[ch] = av_calloc(s->frame_length,
+ sizeof(*s->decode_buffer[ch]));
+ if (!s->decode_buffer[ch])
+ return AVERROR(ENOMEM);
+ }
} else
s->decode_buffer = NULL;
s->ch_ctx = av_malloc_array(avctx->ch_layout.nb_channels, sizeof(*s->ch_ctx));
@@ -181,14 +187,14 @@ static av_cold int tta_decode_init(AVCodecContext * avctx)
}
switch(s->bps) {
- case 1: avctx->sample_fmt = AV_SAMPLE_FMT_U8; break;
+ case 1: avctx->sample_fmt = AV_SAMPLE_FMT_U8P; break;
case 2:
- avctx->sample_fmt = AV_SAMPLE_FMT_S16;
+ avctx->sample_fmt = AV_SAMPLE_FMT_S16P;
break;
case 3:
- avctx->sample_fmt = AV_SAMPLE_FMT_S32;
+ avctx->sample_fmt = AV_SAMPLE_FMT_S32P;
break;
- //case 4: avctx->sample_fmt = AV_SAMPLE_FMT_S32; break;
+ //case 4: avctx->sample_fmt = AV_SAMPLE_FMT_S32P; break;
default:
av_log(avctx, AV_LOG_ERROR, "Invalid/unsupported sample format.\n");
return AVERROR_INVALIDDATA;
@@ -231,10 +237,10 @@ static int tta_decode_frame(AVCodecContext *avctx, AVFrame *frame,
const uint8_t *buf = avpkt->data;
int buf_size = avpkt->size;
TTAContext *s = avctx->priv_data;
+ const int bps = s->bps;
GetBitContext gb;
int i, ret;
int cur_chan = 0, framelen = s->frame_length;
- uint32_t *p;
if (avctx->err_recognition & AV_EF_CRCCHECK) {
if (buf_size < 4 ||
@@ -251,14 +257,13 @@ static int tta_decode_frame(AVCodecContext *avctx, AVFrame *frame,
return ret;
// decode directly to output buffer for 24-bit sample format
- if (s->bps == 3)
- s->decode_buffer = (int32_t *)frame->data[0];
+ if (bps == 3)
+ s->decode_buffer = (int32_t **)frame->extended_data;
// init per channel states
for (i = 0; i < s->channels; i++) {
TTAFilter *filter = &s->ch_ctx[i].filter;
- s->ch_ctx[i].predictor = 0;
- ff_tta_filter_init(filter, ff_tta_filter_configs[s->bps-1]);
+ ff_tta_filter_init(filter, ff_tta_filter_configs[bps-1]);
if (s->format == FORMAT_ENCRYPTED) {
int i;
for (i = 0; i < 8; i++)
@@ -268,9 +273,8 @@ static int tta_decode_frame(AVCodecContext *avctx, AVFrame *frame,
}
i = 0;
- for (p = s->decode_buffer; (int32_t*)p < s->decode_buffer + (framelen * s->channels); p++) {
- int32_t *predictor = &s->ch_ctx[cur_chan].predictor;
- TTAFilter *filter = &s->ch_ctx[cur_chan].filter;
+ for (int j = 0; j < framelen * s->channels; j++) {
+ int32_t *p = s->decode_buffer[cur_chan] + i;
TTARice *rice = &s->ch_ctx[cur_chan].rice;
uint32_t unary, depth, k;
int32_t value;
@@ -306,44 +310,24 @@ static int tta_decode_frame(AVCodecContext *avctx, AVFrame *frame,
rice->sum1 += value - (rice->sum1 >> 4);
if (rice->k1 > 0 && rice->sum1 < ff_tta_shift_16[rice->k1])
rice->k1--;
- else if(rice->sum1 > ff_tta_shift_16[rice->k1 + 1])
+ else if (rice->sum1 > ff_tta_shift_16[rice->k1 + 1])
rice->k1++;
value += ff_tta_shift_1[rice->k0];
default:
rice->sum0 += value - (rice->sum0 >> 4);
if (rice->k0 > 0 && rice->sum0 < ff_tta_shift_16[rice->k0])
rice->k0--;
- else if(rice->sum0 > ff_tta_shift_16[rice->k0 + 1])
+ else if (rice->sum0 > ff_tta_shift_16[rice->k0 + 1])
rice->k0++;
}
// extract coded value
*p = 1 + ((value >> 1) ^ ((value & 1) - 1));
- // run hybrid filter
- s->dsp.filter_process(filter->qm, filter->dx, filter->dl, &filter->error, p,
- filter->shift, filter->round);
-
- // fixed order prediction
-#define PRED(x, k) (int32_t)((((uint64_t)(x) << (k)) - (x)) >> (k))
- switch (s->bps) {
- case 1: *p += PRED(*predictor, 4); break;
- case 2:
- case 3: *p += PRED(*predictor, 5); break;
- case 4: *p += *predictor; break;
- }
- *predictor = *p;
-
// flip channels
if (cur_chan < (s->channels-1))
cur_chan++;
else {
- // decorrelate in case of multiple channels
- if (s->channels > 1) {
- int32_t *r = p - 1;
- for (*p += *r / 2; r > (int32_t*)p - s->channels; r--)
- *r = *(r + 1) - *r;
- }
cur_chan = 0;
i++;
// check for last frame
@@ -354,6 +338,64 @@ static int tta_decode_frame(AVCodecContext *avctx, AVFrame *frame,
}
}
+ // run hybrid filter
+ for (int ch = 0; ch < s->channels; ch++) {
+ TTAFilter *filter = &s->ch_ctx[ch].filter;
+ const int32_t shift = filter->shift;
+ const int32_t round = filter->round;
+ int32_t *p = s->decode_buffer[ch];
+ int32_t error = filter->error;
+ int32_t *qm = filter->qm;
+ int32_t *dx = filter->dx;
+ int32_t *dl = filter->dl;
+
+ for (int n = 0; n < framelen; n++) {
+ s->dsp.filter_process(qm, dx, dl,
+ &error, &p[n],
+ shift, round);
+ }
+ }
+
+ // fixed order prediction
+#define PRED(x, k) (int32_t)((((uint64_t)(x) << (k)) - (x)) >> (k))
+ for (int ch = 0; ch < s->channels; ch++) {
+ int32_t *p = s->decode_buffer[ch];
+ int32_t predictor = 0;
+
+ switch (bps) {
+ case 1:
+ for (int n = 0; n < framelen; n++) {
+ p[n] += PRED(predictor, 4);
+ predictor = p[n];
+ }
+ break;
+ case 2:
+ case 3:
+ for (int n = 0; n < framelen; n++) {
+ p[n] += PRED(predictor, 5);
+ predictor = p[n];
+ }
+ break;
+ }
+ }
+
+ // decorrelate in case of multiple channels
+ if (s->channels > 1) {
+ int32_t *a = s->decode_buffer[s->channels-1];
+ int32_t *b = s->decode_buffer[s->channels-2];
+
+ for (int n = 0; n < framelen; n++)
+ a[n] += b[n] / 2;
+
+ for (int ch = s->channels - 1; ch >= 1; ch--) {
+ int32_t *b = s->decode_buffer[ch-1];
+ int32_t *c = s->decode_buffer[ch ];
+
+ for (int n = 0; n < framelen; n++)
+ b[n] = c[n] - b[n];
+ }
+ }
+
align_get_bits(&gb);
if (get_bits_left(&gb) < 32) {
ret = AVERROR_INVALIDDATA;
@@ -362,31 +404,34 @@ static int tta_decode_frame(AVCodecContext *avctx, AVFrame *frame,
skip_bits_long(&gb, 32); // frame crc
// convert to output buffer
- switch (s->bps) {
- case 1: {
- uint8_t *samples = (uint8_t *)frame->data[0];
- p = s->decode_buffer;
- for (i = 0; i < framelen * s->channels; i++)
- samples[i] = p[i] + 0x80;
- break;
+ switch (bps) {
+ case 1:
+ for (int ch = 0; ch < s->channels; ch++) {
+ uint8_t *samples = (uint8_t *)frame->extended_data[ch];
+ int32_t *p = s->decode_buffer[ch];
+ for (i = 0; i < framelen; i++)
+ samples[i] = p[i] + 0x80;
}
- case 2: {
- int16_t *samples = (int16_t *)frame->data[0];
- p = s->decode_buffer;
- for (i = 0; i < framelen * s->channels; i++)
- samples[i] = p[i];
break;
+ case 2:
+ for (int ch = 0; ch < s->channels; ch++) {
+ int16_t *samples = (int16_t *)frame->extended_data[ch];
+ int32_t *p = s->decode_buffer[ch];
+ for (i = 0; i < framelen; i++)
+ samples[i] = p[i];
}
- case 3: {
- // shift samples for 24-bit sample format
- int32_t *samples = (int32_t *)frame->data[0];
+ break;
+ case 3:
+ for (int ch = 0; ch < s->channels; ch++) {
+ // shift samples for 24-bit sample format
+ int32_t *samples = (int32_t *)frame->extended_data[ch];
- for (i = 0; i < framelen * s->channels; i++)
- samples[i] = samples[i] * 256U;
+ for (i = 0; i < framelen; i++)
+ samples[i] = samples[i] * 256U;
+ }
// reset decode buffer
s->decode_buffer = NULL;
break;
- }
}
*got_frame_ptr = 1;
@@ -394,16 +439,22 @@ static int tta_decode_frame(AVCodecContext *avctx, AVFrame *frame,
return buf_size;
error:
// reset decode buffer
- if (s->bps == 3)
+ if (bps == 3)
s->decode_buffer = NULL;
return ret;
}
-static av_cold int tta_decode_close(AVCodecContext *avctx) {
+static av_cold int tta_decode_close(AVCodecContext *avctx)
+{
TTAContext *s = avctx->priv_data;
- if (s->bps < 3)
+ if (s->bps < 3) {
+ if (s->decode_buffer) {
+ for (int ch = 0; ch < s->channels; ch++)
+ av_freep(&s->decode_buffer[ch]);
+ }
av_freep(&s->decode_buffer);
+ }
s->decode_buffer = NULL;
av_freep(&s->ch_ctx);
--
2.39.1