diff mbox series

[FFmpeg-devel] tta decoder improvements

Message ID CAPYw7P6of-NO0ZgXR8hv1tHZ_T2HJfdjU_xo6LA2Cim88MqPHA@mail.gmail.com
State New
Headers show
Series [FFmpeg-devel] tta decoder improvements | expand

Checks

Context Check Description
yinshiyou/make_loongarch64 success Make finished
yinshiyou/make_fate_loongarch64 fail Make fate failed
andriy/make_x86 success Make finished
andriy/make_fate_x86 fail Make fate failed

Commit Message

Paul B Mahol Aug. 16, 2023, 10:47 a.m. UTC
Patch attached.

Comments

Michael Niedermayer Aug. 16, 2023, 4:49 p.m. UTC | #1
On Wed, Aug 16, 2023 at 12:47:36PM +0200, Paul B Mahol wrote:
> Patch attached.

>  tta.c |  167 +++++++++++++++++++++++++++++++++++++++++++-----------------------
>  1 file changed, 109 insertions(+), 58 deletions(-)
> a289f05b1ebb50604675a11894e254108e276714  0001-avcodec-tta-switch-to-planar-sample-formats.patch
> From 2b6ac4f7093157533b7f279a78a73bfabeb98cf0 Mon Sep 17 00:00:00 2001
> From: Paul B Mahol <onemda@gmail.com>
> Date: Tue, 15 Aug 2023 21:13:59 +0200
> Subject: [PATCH] avcodec/tta: switch to planar sample formats
> 
> Makes decoding few percent faster.

great


> Also fix code style while here.

great too

but these should be 2 separate patches
as we can also see below, the style changes are unrelated to the functional change
and are not limited to lines that are otherwise modified


[...]
> @@ -306,44 +310,24 @@ static int tta_decode_frame(AVCodecContext *avctx, AVFrame *frame,
>              rice->sum1 += value - (rice->sum1 >> 4);
>              if (rice->k1 > 0 && rice->sum1 < ff_tta_shift_16[rice->k1])
>                  rice->k1--;
> -            else if(rice->sum1 > ff_tta_shift_16[rice->k1 + 1])
> +            else if (rice->sum1 > ff_tta_shift_16[rice->k1 + 1])
>                  rice->k1++;
>              value += ff_tta_shift_1[rice->k0];
>          default:
>              rice->sum0 += value - (rice->sum0 >> 4);
>              if (rice->k0 > 0 && rice->sum0 < ff_tta_shift_16[rice->k0])
>                  rice->k0--;
> -            else if(rice->sum0 > ff_tta_shift_16[rice->k0 + 1])
> +            else if (rice->sum0 > ff_tta_shift_16[rice->k0 + 1])
>                  rice->k0++;
>          }
[...]
>  }
>
> -static av_cold int tta_decode_close(AVCodecContext *avctx) {
> +static av_cold int tta_decode_close(AVCodecContext *avctx)
> +{
>      TTAContext *s = avctx->priv_data;
>


[...]
Michael Niedermayer Aug. 16, 2023, 6:14 p.m. UTC | #2
On Wed, Aug 16, 2023 at 12:47:36PM +0200, Paul B Mahol wrote:
> Patch attached.

>  tta.c |  167 +++++++++++++++++++++++++++++++++++++++++++-----------------------
>  1 file changed, 109 insertions(+), 58 deletions(-)
> a289f05b1ebb50604675a11894e254108e276714  0001-avcodec-tta-switch-to-planar-sample-formats.patch
> From 2b6ac4f7093157533b7f279a78a73bfabeb98cf0 Mon Sep 17 00:00:00 2001
> From: Paul B Mahol <onemda@gmail.com>
> Date: Tue, 15 Aug 2023 21:13:59 +0200
> Subject: [PATCH] avcodec/tta: switch to planar sample formats
> 
> Makes decoding few percent faster.
> Also fix code style while here.
> 
> Signed-off-by: Paul B Mahol <onemda@gmail.com>
> ---
>  libavcodec/tta.c | 167 +++++++++++++++++++++++++++++++----------------
>  1 file changed, 109 insertions(+), 58 deletions(-)

breaks fate-lossless-tta

Stream mapping:
  Stream #0:0 -> #0:0 (tta (native) -> pcm_s16le (native))
The filters 'Parsed_anull_0' and 'format_out_0_0' do not have a common format and automatic conversion is disabled.
[af#0:0 @ 0x561fd8f3f280] Error reinitializing filters!
Failed to inject frame into filter network: Invalid argument
Error while filtering: Invalid argument
[out#0/crc @ 0x561fd8f3c840] Nothing was written into output file, because at least one of its streams received no packets.
size=       0kB time=N/A bitrate=N/A speed=N/A
Conversion failed!
threads=1
tests/Makefile:308: recipe for target 'fate-lossless-tta' failed
make: *** [fate-lossless-tta] Error 234

[...]
diff mbox series

Patch

From 2b6ac4f7093157533b7f279a78a73bfabeb98cf0 Mon Sep 17 00:00:00 2001
From: Paul B Mahol <onemda@gmail.com>
Date: Tue, 15 Aug 2023 21:13:59 +0200
Subject: [PATCH] avcodec/tta: switch to planar sample formats

Makes decoding few percent faster.
Also fix code style while here.

Signed-off-by: Paul B Mahol <onemda@gmail.com>
---
 libavcodec/tta.c | 167 +++++++++++++++++++++++++++++++----------------
 1 file changed, 109 insertions(+), 58 deletions(-)

diff --git a/libavcodec/tta.c b/libavcodec/tta.c
index 3e89571f16..6add4106d3 100644
--- a/libavcodec/tta.c
+++ b/libavcodec/tta.c
@@ -55,7 +55,7 @@  typedef struct TTAContext {
     unsigned data_length;
     int frame_length, last_frame_length;
 
-    int32_t *decode_buffer;
+    int32_t **decode_buffer;
 
     uint8_t crc_pass[8];
     uint8_t *pass;
@@ -107,10 +107,16 @@  static int allocate_buffers(AVCodecContext *avctx)
     TTAContext *s = avctx->priv_data;
 
     if (s->bps < 3) {
-        s->decode_buffer = av_calloc(s->frame_length,
-                                     sizeof(*s->decode_buffer) * s->channels);
+        s->decode_buffer = av_calloc(s->channels, sizeof(*s->decode_buffer));
         if (!s->decode_buffer)
             return AVERROR(ENOMEM);
+
+        for (int ch = 0; ch < s->channels; ch++) {
+            s->decode_buffer[ch] = av_calloc(s->frame_length,
+                                             sizeof(*s->decode_buffer[ch]));
+            if (!s->decode_buffer[ch])
+                return AVERROR(ENOMEM);
+        }
     } else
         s->decode_buffer = NULL;
     s->ch_ctx = av_malloc_array(avctx->ch_layout.nb_channels, sizeof(*s->ch_ctx));
@@ -181,14 +187,14 @@  static av_cold int tta_decode_init(AVCodecContext * avctx)
         }
 
         switch(s->bps) {
-        case 1: avctx->sample_fmt = AV_SAMPLE_FMT_U8; break;
+        case 1: avctx->sample_fmt = AV_SAMPLE_FMT_U8P; break;
         case 2:
-            avctx->sample_fmt = AV_SAMPLE_FMT_S16;
+            avctx->sample_fmt = AV_SAMPLE_FMT_S16P;
             break;
         case 3:
-            avctx->sample_fmt = AV_SAMPLE_FMT_S32;
+            avctx->sample_fmt = AV_SAMPLE_FMT_S32P;
             break;
-        //case 4: avctx->sample_fmt = AV_SAMPLE_FMT_S32; break;
+        //case 4: avctx->sample_fmt = AV_SAMPLE_FMT_S32P; break;
         default:
             av_log(avctx, AV_LOG_ERROR, "Invalid/unsupported sample format.\n");
             return AVERROR_INVALIDDATA;
@@ -231,10 +237,10 @@  static int tta_decode_frame(AVCodecContext *avctx, AVFrame *frame,
     const uint8_t *buf = avpkt->data;
     int buf_size = avpkt->size;
     TTAContext *s = avctx->priv_data;
+    const int bps = s->bps;
     GetBitContext gb;
     int i, ret;
     int cur_chan = 0, framelen = s->frame_length;
-    uint32_t *p;
 
     if (avctx->err_recognition & AV_EF_CRCCHECK) {
         if (buf_size < 4 ||
@@ -251,14 +257,13 @@  static int tta_decode_frame(AVCodecContext *avctx, AVFrame *frame,
         return ret;
 
     // decode directly to output buffer for 24-bit sample format
-    if (s->bps == 3)
-        s->decode_buffer = (int32_t *)frame->data[0];
+    if (bps == 3)
+        s->decode_buffer = (int32_t **)frame->extended_data;
 
     // init per channel states
     for (i = 0; i < s->channels; i++) {
         TTAFilter *filter = &s->ch_ctx[i].filter;
-        s->ch_ctx[i].predictor = 0;
-        ff_tta_filter_init(filter, ff_tta_filter_configs[s->bps-1]);
+        ff_tta_filter_init(filter, ff_tta_filter_configs[bps-1]);
         if (s->format == FORMAT_ENCRYPTED) {
             int i;
             for (i = 0; i < 8; i++)
@@ -268,9 +273,8 @@  static int tta_decode_frame(AVCodecContext *avctx, AVFrame *frame,
     }
 
     i = 0;
-    for (p = s->decode_buffer; (int32_t*)p < s->decode_buffer + (framelen * s->channels); p++) {
-        int32_t *predictor = &s->ch_ctx[cur_chan].predictor;
-        TTAFilter *filter = &s->ch_ctx[cur_chan].filter;
+    for (int j = 0; j < framelen * s->channels; j++) {
+        int32_t *p = s->decode_buffer[cur_chan] + i;
         TTARice *rice = &s->ch_ctx[cur_chan].rice;
         uint32_t unary, depth, k;
         int32_t value;
@@ -306,44 +310,24 @@  static int tta_decode_frame(AVCodecContext *avctx, AVFrame *frame,
             rice->sum1 += value - (rice->sum1 >> 4);
             if (rice->k1 > 0 && rice->sum1 < ff_tta_shift_16[rice->k1])
                 rice->k1--;
-            else if(rice->sum1 > ff_tta_shift_16[rice->k1 + 1])
+            else if (rice->sum1 > ff_tta_shift_16[rice->k1 + 1])
                 rice->k1++;
             value += ff_tta_shift_1[rice->k0];
         default:
             rice->sum0 += value - (rice->sum0 >> 4);
             if (rice->k0 > 0 && rice->sum0 < ff_tta_shift_16[rice->k0])
                 rice->k0--;
-            else if(rice->sum0 > ff_tta_shift_16[rice->k0 + 1])
+            else if (rice->sum0 > ff_tta_shift_16[rice->k0 + 1])
                 rice->k0++;
         }
 
         // extract coded value
         *p = 1 + ((value >> 1) ^ ((value & 1) - 1));
 
-        // run hybrid filter
-        s->dsp.filter_process(filter->qm, filter->dx, filter->dl, &filter->error, p,
-                              filter->shift, filter->round);
-
-        // fixed order prediction
-#define PRED(x, k) (int32_t)((((uint64_t)(x) << (k)) - (x)) >> (k))
-        switch (s->bps) {
-        case 1: *p += PRED(*predictor, 4); break;
-        case 2:
-        case 3: *p += PRED(*predictor, 5); break;
-        case 4: *p +=      *predictor;     break;
-        }
-        *predictor = *p;
-
         // flip channels
         if (cur_chan < (s->channels-1))
             cur_chan++;
         else {
-            // decorrelate in case of multiple channels
-            if (s->channels > 1) {
-                int32_t *r = p - 1;
-                for (*p += *r / 2; r > (int32_t*)p - s->channels; r--)
-                    *r = *(r + 1) - *r;
-            }
             cur_chan = 0;
             i++;
             // check for last frame
@@ -354,6 +338,64 @@  static int tta_decode_frame(AVCodecContext *avctx, AVFrame *frame,
         }
     }
 
+    // run hybrid filter
+    for (int ch = 0; ch < s->channels; ch++) {
+        TTAFilter *filter = &s->ch_ctx[ch].filter;
+        const int32_t shift = filter->shift;
+        const int32_t round = filter->round;
+        int32_t *p = s->decode_buffer[ch];
+        int32_t error = filter->error;
+        int32_t *qm = filter->qm;
+        int32_t *dx = filter->dx;
+        int32_t *dl = filter->dl;
+
+        for (int n = 0; n < framelen; n++) {
+            s->dsp.filter_process(qm, dx, dl,
+                                  &error, &p[n],
+                                  shift, round);
+        }
+    }
+
+    // fixed order prediction
+#define PRED(x, k) (int32_t)((((uint64_t)(x) << (k)) - (x)) >> (k))
+    for (int ch = 0; ch < s->channels; ch++) {
+        int32_t *p = s->decode_buffer[ch];
+        int32_t predictor = 0;
+
+        switch (bps) {
+        case 1:
+            for (int n = 0; n < framelen; n++) {
+                p[n] += PRED(predictor, 4);
+                predictor = p[n];
+            }
+            break;
+        case 2:
+        case 3:
+            for (int n = 0; n < framelen; n++) {
+                p[n] += PRED(predictor, 5);
+                predictor = p[n];
+            }
+            break;
+        }
+    }
+
+    // decorrelate in case of multiple channels
+    if (s->channels > 1) {
+        int32_t *a = s->decode_buffer[s->channels-1];
+        int32_t *b = s->decode_buffer[s->channels-2];
+
+        for (int n = 0; n < framelen; n++)
+            a[n] += b[n] / 2;
+
+        for (int ch = s->channels - 1; ch >= 1; ch--) {
+            int32_t *b = s->decode_buffer[ch-1];
+            int32_t *c = s->decode_buffer[ch  ];
+
+            for (int n = 0; n < framelen; n++)
+                b[n] = c[n] - b[n];
+        }
+    }
+
     align_get_bits(&gb);
     if (get_bits_left(&gb) < 32) {
         ret = AVERROR_INVALIDDATA;
@@ -362,31 +404,34 @@  static int tta_decode_frame(AVCodecContext *avctx, AVFrame *frame,
     skip_bits_long(&gb, 32); // frame crc
 
     // convert to output buffer
-    switch (s->bps) {
-    case 1: {
-        uint8_t *samples = (uint8_t *)frame->data[0];
-        p = s->decode_buffer;
-        for (i = 0; i < framelen * s->channels; i++)
-            samples[i] = p[i] + 0x80;
-        break;
+    switch (bps) {
+    case 1:
+        for (int ch = 0; ch < s->channels; ch++) {
+            uint8_t *samples = (uint8_t *)frame->extended_data[ch];
+            int32_t *p = s->decode_buffer[ch];
+            for (i = 0; i < framelen; i++)
+                samples[i] = p[i] + 0x80;
         }
-    case 2: {
-        int16_t *samples = (int16_t *)frame->data[0];
-        p = s->decode_buffer;
-        for (i = 0; i < framelen * s->channels; i++)
-            samples[i] = p[i];
         break;
+    case 2:
+        for (int ch = 0; ch < s->channels; ch++) {
+            int16_t *samples = (int16_t *)frame->extended_data[ch];
+            int32_t *p = s->decode_buffer[ch];
+            for (i = 0; i < framelen; i++)
+                samples[i] = p[i];
         }
-    case 3: {
-        // shift samples for 24-bit sample format
-        int32_t *samples = (int32_t *)frame->data[0];
+        break;
+    case 3:
+        for (int ch = 0; ch < s->channels; ch++) {
+            // shift samples for 24-bit sample format
+            int32_t *samples = (int32_t *)frame->extended_data[ch];
 
-        for (i = 0; i < framelen * s->channels; i++)
-            samples[i] = samples[i] * 256U;
+            for (i = 0; i < framelen; i++)
+                samples[i] = samples[i] * 256U;
+        }
         // reset decode buffer
         s->decode_buffer = NULL;
         break;
-        }
     }
 
     *got_frame_ptr = 1;
@@ -394,16 +439,22 @@  static int tta_decode_frame(AVCodecContext *avctx, AVFrame *frame,
     return buf_size;
 error:
     // reset decode buffer
-    if (s->bps == 3)
+    if (bps == 3)
         s->decode_buffer = NULL;
     return ret;
 }
 
-static av_cold int tta_decode_close(AVCodecContext *avctx) {
+static av_cold int tta_decode_close(AVCodecContext *avctx)
+{
     TTAContext *s = avctx->priv_data;
 
-    if (s->bps < 3)
+    if (s->bps < 3) {
+        if (s->decode_buffer) {
+            for (int ch = 0; ch < s->channels; ch++)
+                av_freep(&s->decode_buffer[ch]);
+        }
         av_freep(&s->decode_buffer);
+    }
     s->decode_buffer = NULL;
     av_freep(&s->ch_ctx);
 
-- 
2.39.1