[FFmpeg-devel,2/2] avcodec/vorbisenc: Implement transient detection in Vorbis encoder

Submitted by Tyler Jones on March 27, 2017, 2:58 a.m.

Details

Message ID 20170327025833.GA1669@tdjones879
State New
Headers show

Commit Message

Tyler Jones March 27, 2017, 2:58 a.m.
The existing AAC psychoacoustic system is used to detect transients within the
vorbis encoder. This is useful, in general, as an initial step in later utilizing
a complex psychoacoustic model for the vorbis encoder, but more specifically
allows the cancellation of pre-echo effects that frequently occur with this
codec.

Signed-off-by: Tyler Jones <tdjones879@gmail.com>
---
 libavcodec/psymodel.c  |  1 +
 libavcodec/vorbisenc.c | 60 ++++++++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 61 insertions(+)

Comments

Michael Niedermayer March 27, 2017, 12:35 p.m.
On Sun, Mar 26, 2017 at 08:58:33PM -0600, Tyler Jones wrote:
> The existing AAC psychoacoustic system is used to detect transients within the
> vorbis encoder. This is useful, in general, as an initial step in later utilizing
> a complex psychoacoustic model for the vorbis encoder, but more specifically
> allows the cancellation of pre-echo effects that frequently occur with this
> codec.
> 
> Signed-off-by: Tyler Jones <tdjones879@gmail.com>
> ---
>  libavcodec/psymodel.c  |  1 +
>  libavcodec/vorbisenc.c | 60 ++++++++++++++++++++++++++++++++++++++++++++++++++
>  2 files changed, 61 insertions(+)

This changes the result of
make fate

stddev: 6521.39 PSNR: 20.04 MAXDIFF:55190 bytes:  1675800/  1675264
stddev: |6521.39 - 296| >= 30
Test vorbis-encode failed. Look at tests/data/fate/vorbis-encode.err for details.
make: *** [fate-vorbis-encode] Error 1

If the change is intended, the reference must be updated.

[...]
Rostislav Pehlivanov March 27, 2017, 9:45 p.m.
On 27 March 2017 at 03:58, Tyler Jones <tdjones879@gmail.com> wrote:

> The existing AAC psychoacoustic system is used to detect transients within
> the
> vorbis encoder. This is useful, in general, as an initial step in later
> utilizing
> a complex psychoacoustic model for the vorbis encoder, but more
> specifically
> allows the cancellation of pre-echo effects that frequently occur with this
> codec.
>
> Signed-off-by: Tyler Jones <tdjones879@gmail.com>
> ---
>  libavcodec/psymodel.c  |  1 +
>  libavcodec/vorbisenc.c | 60 ++++++++++++++++++++++++++++++++++++++++++++++++++
>  2 files changed, 61 insertions(+)
>
> diff --git a/libavcodec/psymodel.c b/libavcodec/psymodel.c
> index 2b5f111..38831ce 100644
> --- a/libavcodec/psymodel.c
> +++ b/libavcodec/psymodel.c
> @@ -62,6 +62,7 @@ av_cold int ff_psy_init(FFPsyContext *ctx,
> AVCodecContext *avctx, int num_lens,
>
>      switch (ctx->avctx->codec_id) {
>      case AV_CODEC_ID_AAC:
> +    case AV_CODEC_ID_VORBIS:
>          ctx->model = &ff_aac_psy_model;
>          break;
>      }
> diff --git a/libavcodec/vorbisenc.c b/libavcodec/vorbisenc.c
> index 2974ca2..e4ec822 100644
> --- a/libavcodec/vorbisenc.c
> +++ b/libavcodec/vorbisenc.c
> @@ -33,6 +33,8 @@
>  #include "vorbis.h"
>  #include "vorbis_enc_data.h"
>
> +#include "psymodel.h"
> +
>  #define BITSTREAM_WRITER_LE
>  #include "put_bits.h"
>
> @@ -126,6 +128,9 @@ typedef struct vorbis_enc_context {
>      vorbis_enc_mode *modes;
>
>      int64_t next_pts;
> +
> +    FFPsyContext psy;
> +    struct FFPsyPreprocessContext* psypp;
>  } vorbis_enc_context;
>
>  #define MAX_CHANNELS     2
> @@ -1024,10 +1029,38 @@ static int vorbis_encode_frame(AVCodecContext
> *avctx, AVPacket *avpkt,
>      vorbis_enc_context *venc = avctx->priv_data;
>      float **audio = frame ? (float **)frame->extended_data : NULL;
>      int samples = frame ? frame->nb_samples : 0;
> +    float *samples2, *la, *overlap;
>      vorbis_enc_mode *mode;
>      vorbis_enc_mapping *mapping;
>      PutBitContext pb;
>      int i, ret;
> +    int start_ch, ch, chans, cur_channel;
> +    FFPsyWindowInfo windows[MAX_CHANNELS];
> +    enum WindowSequence window_sequence[MAX_CHANNELS];
> +
> +    if (!avctx->frame_number)
> +        return 0;
> +
> +    if (venc->psypp)
> +        ff_psy_preprocess(venc->psypp, audio, venc->channels);
> +
> +    if (frame) {
> +        start_ch = 0;
> +        cur_channel = 0;
> +        for (i = 0; i < venc->channels - 1; i++) {
> +            FFPsyWindowInfo* wi = windows + start_ch;
> +            chans = 2;
> +            for (ch = 0; ch < 2; ch++) {
> +                cur_channel = start_ch + ch;
> +                overlap = &audio[cur_channel][0];
> +                samples2 = overlap + 1024;
> +                la = samples2 + (448+64);
> +                wi[ch] = venc->psy.model->window(&venc->psy, samples2,
> la,
> +                                                 cur_channel,
> window_sequence[0]);
>

window_sequence[0] must point to the previous frame's type, not the current.
You'll need to add enum WindowSequence window_sequence[MAX_CHANNELS]; to the
main encoder context.

After that, check the wi[i] for EIGHT_SHORT, and if it is EIGHT_SHORT, you'll
need to modify the encoder to do 8 small transforms and signal that so that
the decoder knows what to do.


> +            }
> +            start_ch += chans;
> +        }
> +    }
>
>      if (!apply_window_and_mdct(venc, audio, samples))
>          return 0;
> @@ -1158,7 +1191,10 @@ static av_cold int vorbis_encode_close(AVCodecContext
> *avctx)
>
>      ff_mdct_end(&venc->mdct[0]);
>      ff_mdct_end(&venc->mdct[1]);
> +    ff_psy_end(&venc->psy);
>
> +    if (venc->psypp)
> +        ff_psy_preprocess_end(venc->psypp);
>      av_freep(&avctx->extradata);
>
>      return 0 ;
> @@ -1168,6 +1204,10 @@ static av_cold int vorbis_encode_init(AVCodecContext
> *avctx)
>  {
>      vorbis_enc_context *venc = avctx->priv_data;
>      int ret;
> +    const uint8_t *sizes[MAX_CHANNELS];
> +    uint8_t grouping[MAX_CHANNELS];
> +    int lengths[MAX_CHANNELS];
> +    int samplerate_index;
>
>      if (avctx->channels != 2) {
>          av_log(avctx, AV_LOG_ERROR, "Current FFmpeg Vorbis encoder only
> supports 2 channels.\n");
> @@ -1190,6 +1230,26 @@ static av_cold int vorbis_encode_init(AVCodecContext
> *avctx)
>
>      avctx->frame_size = 1 << (venc->log2_blocksize[0] - 1);
>
> +    for (samplerate_index = 0; samplerate_index < 16; samplerate_index++)
> +        if (avctx->sample_rate == mpeg4audio_sample_rates[
> samplerate_index])
> +            break;
> +    if (samplerate_index == 16 ||
> +        samplerate_index >= ff_vorbis_swb_size_1024_len ||
> +        samplerate_index >= ff_vorbis_swb_size_128_len)
> +        av_log(avctx, AV_LOG_ERROR, "Unsupported sample rate %d\n",
> avctx->sample_rate);
> +
> +    sizes[0]   = ff_vorbis_swb_size_1024[samplerate_index];
> +    sizes[1]   = ff_vorbis_swb_size_128[samplerate_index];
> +    lengths[0] = ff_vorbis_num_swb_1024[samplerate_index];
> +    lengths[1] = ff_vorbis_num_swb_128[samplerate_index];
> +    grouping[0] = 1;
> +
> +    if ((ret = ff_psy_init(&venc->psy, avctx, 2,
> +                           sizes, lengths,
> +                           1, grouping)) < 0)
> +        goto error;
> +    venc->psypp = ff_psy_preprocess_init(avctx);
> +
>      return 0;
>  error:
>      vorbis_encode_close(avctx);
> --
> 2.7.4
>
> _______________________________________________
> ffmpeg-devel mailing list
> ffmpeg-devel@ffmpeg.org
> http://ffmpeg.org/mailman/listinfo/ffmpeg-devel
>

Patch hide | download patch | download mbox

diff --git a/libavcodec/psymodel.c b/libavcodec/psymodel.c
index 2b5f111..38831ce 100644
--- a/libavcodec/psymodel.c
+++ b/libavcodec/psymodel.c
@@ -62,6 +62,7 @@  av_cold int ff_psy_init(FFPsyContext *ctx, AVCodecContext *avctx, int num_lens,
 
     switch (ctx->avctx->codec_id) {
     case AV_CODEC_ID_AAC:
+    case AV_CODEC_ID_VORBIS:
         ctx->model = &ff_aac_psy_model;
         break;
     }
diff --git a/libavcodec/vorbisenc.c b/libavcodec/vorbisenc.c
index 2974ca2..e4ec822 100644
--- a/libavcodec/vorbisenc.c
+++ b/libavcodec/vorbisenc.c
@@ -33,6 +33,8 @@ 
 #include "vorbis.h"
 #include "vorbis_enc_data.h"
 
+#include "psymodel.h"
+
 #define BITSTREAM_WRITER_LE
 #include "put_bits.h"
 
@@ -126,6 +128,9 @@  typedef struct vorbis_enc_context {
     vorbis_enc_mode *modes;
 
     int64_t next_pts;
+
+    FFPsyContext psy;
+    struct FFPsyPreprocessContext* psypp;
 } vorbis_enc_context;
 
 #define MAX_CHANNELS     2
@@ -1024,10 +1029,38 @@  static int vorbis_encode_frame(AVCodecContext *avctx, AVPacket *avpkt,
     vorbis_enc_context *venc = avctx->priv_data;
     float **audio = frame ? (float **)frame->extended_data : NULL;
     int samples = frame ? frame->nb_samples : 0;
+    float *samples2, *la, *overlap;
     vorbis_enc_mode *mode;
     vorbis_enc_mapping *mapping;
     PutBitContext pb;
     int i, ret;
+    int start_ch, ch, chans, cur_channel;
+    FFPsyWindowInfo windows[MAX_CHANNELS];
+    enum WindowSequence window_sequence[MAX_CHANNELS];
+
+    if (!avctx->frame_number)
+        return 0;
+
+    if (venc->psypp)
+        ff_psy_preprocess(venc->psypp, audio, venc->channels);
+
+    if (frame) {
+        start_ch = 0;
+        cur_channel = 0;
+        for (i = 0; i < venc->channels - 1; i++) {
+            FFPsyWindowInfo* wi = windows + start_ch;
+            chans = 2;
+            for (ch = 0; ch < 2; ch++) {
+                cur_channel = start_ch + ch;
+                overlap = &audio[cur_channel][0];
+                samples2 = overlap + 1024;
+                la = samples2 + (448+64);
+                wi[ch] = venc->psy.model->window(&venc->psy, samples2, la,
+                                                 cur_channel, window_sequence[0]);
+            }
+            start_ch += chans;
+        }
+    }
 
     if (!apply_window_and_mdct(venc, audio, samples))
         return 0;
@@ -1158,7 +1191,10 @@  static av_cold int vorbis_encode_close(AVCodecContext *avctx)
 
     ff_mdct_end(&venc->mdct[0]);
     ff_mdct_end(&venc->mdct[1]);
+    ff_psy_end(&venc->psy);
 
+    if (venc->psypp)
+        ff_psy_preprocess_end(venc->psypp);
     av_freep(&avctx->extradata);
 
     return 0 ;
@@ -1168,6 +1204,10 @@  static av_cold int vorbis_encode_init(AVCodecContext *avctx)
 {
     vorbis_enc_context *venc = avctx->priv_data;
     int ret;
+    const uint8_t *sizes[MAX_CHANNELS];
+    uint8_t grouping[MAX_CHANNELS];
+    int lengths[MAX_CHANNELS];
+    int samplerate_index;
 
     if (avctx->channels != 2) {
         av_log(avctx, AV_LOG_ERROR, "Current FFmpeg Vorbis encoder only supports 2 channels.\n");
@@ -1190,6 +1230,26 @@  static av_cold int vorbis_encode_init(AVCodecContext *avctx)
 
     avctx->frame_size = 1 << (venc->log2_blocksize[0] - 1);
 
+    for (samplerate_index = 0; samplerate_index < 16; samplerate_index++)
+        if (avctx->sample_rate == mpeg4audio_sample_rates[samplerate_index])
+            break;
+    if (samplerate_index == 16 ||
+        samplerate_index >= ff_vorbis_swb_size_1024_len ||
+        samplerate_index >= ff_vorbis_swb_size_128_len)
+        av_log(avctx, AV_LOG_ERROR, "Unsupported sample rate %d\n", avctx->sample_rate);
+
+    sizes[0]   = ff_vorbis_swb_size_1024[samplerate_index];
+    sizes[1]   = ff_vorbis_swb_size_128[samplerate_index];
+    lengths[0] = ff_vorbis_num_swb_1024[samplerate_index];
+    lengths[1] = ff_vorbis_num_swb_128[samplerate_index];
+    grouping[0] = 1;
+
+    if ((ret = ff_psy_init(&venc->psy, avctx, 2,
+                           sizes, lengths,
+                           1, grouping)) < 0)
+        goto error;
+    venc->psypp = ff_psy_preprocess_init(avctx);
+
     return 0;
 error:
     vorbis_encode_close(avctx);