Message ID | 20170727212245.GA32538@tdjones.localdomain |
---|---|
State | Superseded |
Headers | show |
On 7/27/2017 6:22 PM, Tyler Jones wrote: > The encoder will attempt to determine the existence of transient > signals by applying a 4th order highpass filter to remove dominant > low frequency waveforms. Frames are then split up into blocks > where the variance is calculated and compared with blocks from > the previous frame. A preecho is only likely to be noticeable when > relatively quiet audio is followed by a loud transient signal. > > Signed-off-by: Tyler Jones <tdjones879@gmail.com> > --- > V3: Use normal float notation > Don't check before freeing NULL pointers > Remove unnecessary includes > > V2: Provide proper prefix for non-static function > > libavcodec/Makefile | 2 +- > libavcodec/vorbisenc.c | 27 +++++++-- > libavcodec/vorbispsy.c | 148 +++++++++++++++++++++++++++++++++++++++++++++++++ > libavcodec/vorbispsy.h | 79 ++++++++++++++++++++++++++ > 4 files changed, 251 insertions(+), 5 deletions(-) > create mode 100644 libavcodec/vorbispsy.c > create mode 100644 libavcodec/vorbispsy.h > > diff --git a/libavcodec/Makefile b/libavcodec/Makefile > index 357fa1a361..08acbc723e 100644 > --- a/libavcodec/Makefile > +++ b/libavcodec/Makefile > @@ -611,7 +611,7 @@ OBJS-$(CONFIG_VMNC_DECODER) += vmnc.o > OBJS-$(CONFIG_VORBIS_DECODER) += vorbisdec.o vorbisdsp.o vorbis.o \ > vorbis_data.o > OBJS-$(CONFIG_VORBIS_ENCODER) += vorbisenc.o vorbis.o \ > - vorbis_data.o > + vorbis_data.o vorbispsy.o > OBJS-$(CONFIG_VP3_DECODER) += vp3.o > OBJS-$(CONFIG_VP5_DECODER) += vp5.o vp56.o vp56data.o vp56rac.o > OBJS-$(CONFIG_VP6_DECODER) += vp6.o vp56.o vp56data.o \ > diff --git a/libavcodec/vorbisenc.c b/libavcodec/vorbisenc.c > index bf21a3b1ff..1330b1b376 100644 > --- a/libavcodec/vorbisenc.c > +++ b/libavcodec/vorbisenc.c > @@ -33,6 +33,7 @@ > #include "mathops.h" > #include "vorbis.h" > #include "vorbis_enc_data.h" > +#include "vorbispsy.h" > > #include "audio_frame_queue.h" > #include "libavfilter/bufferqueue.h" > @@ -136,6 +137,7 @@ typedef struct vorbis_enc_context { > int64_t next_pts; > > AVFloatDSPContext *fdsp; > + VorbisPsyContext *vpctx; Why a pointer? I don't see the benefit. It means an unnecessary malloc and free call. > } vorbis_enc_context; > > #define MAX_CHANNELS 2 > @@ -272,11 +274,12 @@ static int create_vorbis_context(vorbis_enc_context *venc, > vorbis_enc_floor *fc; > vorbis_enc_residue *rc; > vorbis_enc_mapping *mc; > - int i, book, ret; > + int i, book, ret, blocks; > > venc->channels = avctx->channels; > venc->sample_rate = avctx->sample_rate; > - venc->log2_blocksize[0] = venc->log2_blocksize[1] = 11; > + venc->log2_blocksize[0] = 8; > + venc->log2_blocksize[1] = 11; > > venc->ncodebooks = FF_ARRAY_ELEMS(cvectors); > venc->codebooks = av_malloc(sizeof(vorbis_enc_codebook) * venc->ncodebooks); > @@ -464,6 +467,12 @@ static int create_vorbis_context(vorbis_enc_context *venc, > if ((ret = dsp_init(avctx, venc)) < 0) > return ret; > > + blocks = 1 << (venc->log2_blocksize[1] - venc->log2_blocksize[0]); > + venc->vpctx = av_mallocz(sizeof(VorbisPsyContext)); > + if (!venc->vpctx || (ret = ff_psy_vorbis_init(venc->vpctx, venc->sample_rate, > + venc->channels, blocks)) < 0) > + return AVERROR(ENOMEM); > + > return 0; > } > > @@ -1078,15 +1087,17 @@ static void move_audio(vorbis_enc_context *venc, int sf_size) > av_frame_free(&cur); > } > venc->have_saved = 1; > - memcpy(venc->scratch, venc->samples, 2 * venc->channels * frame_size); > + memcpy(venc->scratch, venc->samples, sizeof(float) * venc->channels * 2 * frame_size); > } > > static int vorbis_encode_frame(AVCodecContext *avctx, AVPacket *avpkt, > const AVFrame *frame, int *got_packet_ptr) > { > vorbis_enc_context *venc = avctx->priv_data; > - int i, ret, need_more; > + int i, ret, need_more, ch; > + int curr_win = 1; > int frame_size = 1 << (venc->log2_blocksize[1] - 1); > + int block_size = 1 << (venc->log2_blocksize[0] - 1); > vorbis_enc_mode *mode; > vorbis_enc_mapping *mapping; > PutBitContext pb; > @@ -1121,6 +1132,13 @@ static int vorbis_encode_frame(AVCodecContext *avctx, AVPacket *avpkt, > > move_audio(venc, avctx->frame_size); > > + for (ch = 0; ch < venc->channels; ch++) { > + float *scratch = venc->scratch + 2 * ch * frame_size + frame_size; > + > + if (!ff_psy_vorbis_block_frame(venc->vpctx, scratch, ch, frame_size, block_size)) > + curr_win = 0; > + } > + > if (!apply_window_and_mdct(venc)) > return 0; > > @@ -1252,6 +1270,7 @@ static av_cold int vorbis_encode_close(AVCodecContext *avctx) > ff_mdct_end(&venc->mdct[1]); > ff_af_queue_close(&venc->afq); > ff_bufqueue_discard_all(&venc->bufqueue); > + ff_psy_vorbis_close(venc->vpctx); You should pass a pointer to venc->vpctx instead, regardless of what you do with the comment above. > > av_freep(&avctx->extradata); > > diff --git a/libavcodec/vorbispsy.c b/libavcodec/vorbispsy.c > new file mode 100644 > index 0000000000..ef48f6ac8c > --- /dev/null > +++ b/libavcodec/vorbispsy.c > @@ -0,0 +1,148 @@ > +/* > + * Vorbis encoder psychoacoustic model > + * Copyright (C) 2017 Tyler Jones > + * > + * This file is part of FFmpeg. > + * > + * FFmpeg is free software; you can redistribute it and/or > + * modify it under the terms of the GNU Lesser General Public > + * License as published by the Free Software Foundation; either > + * version 2.1 of the License, or (at your option) any later version. > + * > + * FFmpeg is distributed in the hope that it will be useful, > + * but WITHOUT ANY WARRANTY; without even the implied warranty of > + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU > + * Lesser General Public License for more details. > + * > + * You should have received a copy of the GNU Lesser General Public > + * License along with FFmpeg; if not, write to the Free Software > + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA > + */ > + > +#include "libavutil/ffmath.h" > + > +#include "avcodec.h" > +#include "vorbispsy.h" > + > +/** > + * Generate the coefficients for a highpass biquad filter > + * > + * @param filter Instance of biquad filter to be initialized > + * @param Fs Input's sampling frequency > + * @param Fc Critical frequency for samples to be passed > + * @param Q Quality factor > + */ > +static av_cold void biquad_filter_init(IIRFilter *filter, int Fs, int Fc, float Q) > +{ > + float k = tan(M_PI * Fc / Fs); > + float normalize = 1 / (1 + k / Q + k * k); > + > + filter->b[0] = normalize; > + filter->b[1] = -2 * normalize; > + filter->b[2] = normalize; > + > + filter->a[0] = 1; > + filter->a[1] = 2 * (k * k - 1) * normalize; > + filter->a[2] = (1 - k / Q + k * k) * normalize; > +} > + > +/** > + * Direct Form II implementation for a second order digital filter > + * > + * @param filter Filter to be applied to input samples > + * @param in Single input sample to be filtered > + * @param delay Array of IIR feedback values > + * @return Filtered sample > + */ > +static float apply_filter(IIRFilter *filter, float in, float *delay) > +{ > + float ret, w; > + > + w = filter->a[0] * in - filter->a[1] * delay[0] - filter->a[2] * delay[1]; > + ret = filter->b[0] * w + filter->b[1] * delay[0] + filter->b[2] * delay[1]; > + > + delay[1] = delay[0]; > + delay[0] = w; > + > + return ret; > +} > + > +/** > + * Calculate the variance of a block of samples > + * > + * @param in Array of input samples > + * @param length Number of input samples being analyzed > + * @return The variance for the current block > + */ > +static float variance(const float *in, int length) > +{ > + int i; > + float mean = 0.0f, square_sum = 0.0f; > + > + for (i = 0; i < length; i++) { > + mean += in[i]; > + square_sum += in[i] * in[i]; Can't you use AVFloatDSPContext's scalarproduct_float for square_sum? The constrains are lax. 16 byte alignment for in and length a multiple of 4. You can pad the buffer if needed to achieve that. > + } > + > + mean /= length; > + return (square_sum - length * mean * mean) / (length - 1); > +} > + > +av_cold int ff_psy_vorbis_init(VorbisPsyContext *vpctx, int sample_rate, > + int channels, int blocks) > +{ > + int crit_freq; > + float Q[2] = {.54, 1.31}; // Quality values for maximally flat cascaded filters const float Q[2] > + > + vpctx->filter_delay = av_mallocz_array(4 * channels, sizeof(vpctx->filter_delay[0])); > + if (!vpctx->filter_delay) > + return AVERROR(ENOMEM); > + > + crit_freq = sample_rate / 4; > + biquad_filter_init(&vpctx->filter[0], sample_rate, crit_freq, Q[0]); > + biquad_filter_init(&vpctx->filter[1], sample_rate, crit_freq, Q[1]); > + > + vpctx->variance = av_mallocz_array(channels * blocks, sizeof(vpctx->variance[0])); > + if (!vpctx->variance) > + return AVERROR(ENOMEM); > + > + vpctx->preecho_thresh = 100.0f; > + > + return 0; > +} > + > +int ff_psy_vorbis_block_frame(VorbisPsyContext *vpctx, float *audio, > + int ch, int frame_size, int block_size) > +{ > + int i, block_flag = 1; > + int blocks = frame_size / block_size; > + float last_var; > + const float eps = 0.0001f; > + float *var = vpctx->variance + ch * blocks; > + > + for (i = 0; i < frame_size; i++) { > + apply_filter(&vpctx->filter[0], audio[i], vpctx->filter_delay + 4 * ch); > + apply_filter(&vpctx->filter[1], audio[i], vpctx->filter_delay + 4 * ch + 2); > + } > + > + for (i = 0; i < blocks; i++) { > + last_var = var[i]; > + var[i] = variance(audio + i * block_size, block_size); > + > + /* A small constant is added to the threshold in order to prevent false > + * transients from being detected when quiet sounds follow near-silence */ > + if (var[i] > vpctx->preecho_thresh * last_var + eps) > + block_flag = 0; > + } > + > + return block_flag; > +} > + > +av_cold void ff_psy_vorbis_close(VorbisPsyContext *vpctx) > +{ > + if (vpctx) { > + av_freep(&vpctx->filter_delay); > + av_freep(&vpctx->variance); > + } > + av_freep(&vpctx); > +} > diff --git a/libavcodec/vorbispsy.h b/libavcodec/vorbispsy.h > new file mode 100644 > index 0000000000..021a5e8a28 > --- /dev/null > +++ b/libavcodec/vorbispsy.h > @@ -0,0 +1,79 @@ > +/* > + * Vorbis encoder psychoacoustic model > + * Copyright (C) 2017 Tyler Jones > + * > + * This file is part of FFmpeg. > + * > + * FFmpeg is free software; you can redistribute it and/or > + * modify it under the terms of the GNU Lesser General Public > + * License as published by the Free Software Foundation; either > + * version 2.1 of the License, or (at your option) any later version. > + * > + * FFmpeg is distributed in the hope that it will be useful, > + * but WITHOUT ANY WARRANTY; without even the implied warranty of > + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU > + * Lesser General Public License for more details. > + * > + * You should have received a copy of the GNU Lesser General Public > + * License along with FFmpeg; if not, write to the Free Software > + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA > + */ > + > +/** > + * @file > + * Vorbis psychoacoustic model > + */ > + > +#ifndef AVCODEC_VORBISPSY_H > +#define AVCODEC_VORBISPSY_H > + > +#include "libavutil/attributes.h" > + > +/** > + * Second order IIR Filter > + */ > +typedef struct IIRFilter { > + float b[3]; ///< Normalized cofficients for numerator of transfer function > + float a[3]; ///< Normalized coefficiets for denominator of transfer function > +} IIRFilter; > + > +typedef struct VorbisPsyContext { > + IIRFilter filter[2]; > + float *filter_delay; ///< Direct Form II delay registers for each channel > + float *variance; ///< Saved variances from previous sub-blocks for each channel > + float preecho_thresh; ///< Threshold for determining prescence of a preecho > +} VorbisPsyContext; > + > +/** > + * Initializes the psychoacoustic model context > + * > + * @param vpctx Uninitialized pointer to the model context > + * @param sample_rate Input audio sample rate > + * @param channels Number of channels being analyzed > + * @param blocks Number of short blocks for every frame of input > + * @return 0 on success, negative on failure > + */ > +av_cold int ff_psy_vorbis_init(VorbisPsyContext *vpctx, int sample_rate, > + int channels, int blocks); > + > +/** > + * Suggest the type of block to use for encoding the current frame > + * > + * Each frame of input is passed through a highpass filter to remove dominant > + * low-frequency waveforms and the variance of each short block of input is > + * then calculated. If the variance over this block is significantly more than > + * blocks from the previous frame, a transient signal is likely present. > + * > + * @param audio Pointer to the current channel's input samples > + * @param ch Current channel being analyzed > + * @param frame_size Size of a full frame, i.e. the size of the long block > + * @param block_size Size of the short block > + * @return The correct blockflag to use for encoding, 0 short and 1 long > + */ > +int ff_psy_vorbis_block_frame(VorbisPsyContext *vpctx, float *audio, > + int ch, int frame_size, int block_size); > +/** > + * Closes and frees the memory used by the psychoacoustic model > + */ > +av_cold void ff_psy_vorbis_close(VorbisPsyContext *vpctx); > +#endif /* AVCODEC_VORBISPSY_H */ > > > > _______________________________________________ > ffmpeg-devel mailing list > ffmpeg-devel@ffmpeg.org > http://ffmpeg.org/mailman/listinfo/ffmpeg-devel >
> > --- a/libavcodec/vorbisenc.c > > +++ b/libavcodec/vorbisenc.c > > @@ -33,6 +33,7 @@ > > #include "mathops.h" > > #include "vorbis.h" > > #include "vorbis_enc_data.h" > > +#include "vorbispsy.h" > > > > #include "audio_frame_queue.h" > > #include "libavfilter/bufferqueue.h" > > @@ -136,6 +137,7 @@ typedef struct vorbis_enc_context { > > int64_t next_pts; > > > > AVFloatDSPContext *fdsp; > > + VorbisPsyContext *vpctx; > > Why a pointer? I don't see the benefit. It means an unnecessary malloc > and free call. You're probably right. It's changed now. > > @@ -1252,6 +1270,7 @@ static av_cold int vorbis_encode_close(AVCodecContext *avctx) > > ff_mdct_end(&venc->mdct[1]); > > ff_af_queue_close(&venc->afq); > > ff_bufqueue_discard_all(&venc->bufqueue); > > + ff_psy_vorbis_close(venc->vpctx); > > You should pass a pointer to venc->vpctx instead, regardless of what you > do with the comment above. I'm not sure I understand what you mean. It is passing a pointer to a VorbisPsyContext, please see the prototype: av_cold void ff_psy_vorbis_close(VorbisPsyContext *vpctx); > > +/** > > + * Calculate the variance of a block of samples > > + * > > + * @param in Array of input samples > > + * @param length Number of input samples being analyzed > > + * @return The variance for the current block > > + */ > > +static float variance(const float *in, int length) > > +{ > > + int i; > > + float mean = 0.0f, square_sum = 0.0f; > > + > > + for (i = 0; i < length; i++) { > > + mean += in[i]; > > + square_sum += in[i] * in[i]; > > Can't you use AVFloatDSPContext's scalarproduct_float for square_sum? > The constrains are lax. 16 byte alignment for in and length a multiple > of 4. You can pad the buffer if needed to achieve that. You are correct, it is switched over now. > > + } > > + > > + mean /= length; > > + return (square_sum - length * mean * mean) / (length - 1); > > +} > > + > > +av_cold int ff_psy_vorbis_init(VorbisPsyContext *vpctx, int sample_rate, > > + int channels, int blocks) > > +{ > > + int crit_freq; > > + float Q[2] = {.54, 1.31}; // Quality values for maximally flat cascaded filters > > const float Q[2] Fixed. Thank you for catching these mistakes and providing suggestions. A new version of this patch will be sent soon. Thanks again, Tyler Jones
On 7/28/2017 11:59 AM, Tyler Jones wrote: >>> --- a/libavcodec/vorbisenc.c >>> +++ b/libavcodec/vorbisenc.c >>> @@ -33,6 +33,7 @@ >>> #include "mathops.h" >>> #include "vorbis.h" >>> #include "vorbis_enc_data.h" >>> +#include "vorbispsy.h" >>> >>> #include "audio_frame_queue.h" >>> #include "libavfilter/bufferqueue.h" >>> @@ -136,6 +137,7 @@ typedef struct vorbis_enc_context { >>> int64_t next_pts; >>> >>> AVFloatDSPContext *fdsp; >>> + VorbisPsyContext *vpctx; >> >> Why a pointer? I don't see the benefit. It means an unnecessary malloc >> and free call. > > You're probably right. It's changed now. > >>> @@ -1252,6 +1270,7 @@ static av_cold int vorbis_encode_close(AVCodecContext *avctx) >>> ff_mdct_end(&venc->mdct[1]); >>> ff_af_queue_close(&venc->afq); >>> ff_bufqueue_discard_all(&venc->bufqueue); >>> + ff_psy_vorbis_close(venc->vpctx); >> >> You should pass a pointer to venc->vpctx instead, regardless of what you >> do with the comment above. > > I'm not sure I understand what you mean. It is passing a pointer to a > VorbisPsyContext, please see the prototype: > > av_cold void ff_psy_vorbis_close(VorbisPsyContext *vpctx); For this version (but not V4) it should have been a pointer to the pointer in vorbis_enc_context, that is **vpctx. Otherwise, the av_freep() call in ff_psy_vorbis_close() would have zeroed a copy of said pointer. > >>> +/** >>> + * Calculate the variance of a block of samples >>> + * >>> + * @param in Array of input samples >>> + * @param length Number of input samples being analyzed >>> + * @return The variance for the current block >>> + */ >>> +static float variance(const float *in, int length) >>> +{ >>> + int i; >>> + float mean = 0.0f, square_sum = 0.0f; >>> + >>> + for (i = 0; i < length; i++) { >>> + mean += in[i]; >>> + square_sum += in[i] * in[i]; >> >> Can't you use AVFloatDSPContext's scalarproduct_float for square_sum? >> The constrains are lax. 16 byte alignment for in and length a multiple >> of 4. You can pad the buffer if needed to achieve that. > > You are correct, it is switched over now. > >>> + } >>> + >>> + mean /= length; >>> + return (square_sum - length * mean * mean) / (length - 1); >>> +} >>> + >>> +av_cold int ff_psy_vorbis_init(VorbisPsyContext *vpctx, int sample_rate, >>> + int channels, int blocks) >>> +{ >>> + int crit_freq; >>> + float Q[2] = {.54, 1.31}; // Quality values for maximally flat cascaded filters >> >> const float Q[2] > > Fixed. > > Thank you for catching these mistakes and providing suggestions. A new version > of this patch will be sent soon. > > Thanks again, > > Tyler Jones > > > > _______________________________________________ > ffmpeg-devel mailing list > ffmpeg-devel@ffmpeg.org > http://ffmpeg.org/mailman/listinfo/ffmpeg-devel >
diff --git a/libavcodec/Makefile b/libavcodec/Makefile index 357fa1a361..08acbc723e 100644 --- a/libavcodec/Makefile +++ b/libavcodec/Makefile @@ -611,7 +611,7 @@ OBJS-$(CONFIG_VMNC_DECODER) += vmnc.o OBJS-$(CONFIG_VORBIS_DECODER) += vorbisdec.o vorbisdsp.o vorbis.o \ vorbis_data.o OBJS-$(CONFIG_VORBIS_ENCODER) += vorbisenc.o vorbis.o \ - vorbis_data.o + vorbis_data.o vorbispsy.o OBJS-$(CONFIG_VP3_DECODER) += vp3.o OBJS-$(CONFIG_VP5_DECODER) += vp5.o vp56.o vp56data.o vp56rac.o OBJS-$(CONFIG_VP6_DECODER) += vp6.o vp56.o vp56data.o \ diff --git a/libavcodec/vorbisenc.c b/libavcodec/vorbisenc.c index bf21a3b1ff..1330b1b376 100644 --- a/libavcodec/vorbisenc.c +++ b/libavcodec/vorbisenc.c @@ -33,6 +33,7 @@ #include "mathops.h" #include "vorbis.h" #include "vorbis_enc_data.h" +#include "vorbispsy.h" #include "audio_frame_queue.h" #include "libavfilter/bufferqueue.h" @@ -136,6 +137,7 @@ typedef struct vorbis_enc_context { int64_t next_pts; AVFloatDSPContext *fdsp; + VorbisPsyContext *vpctx; } vorbis_enc_context; #define MAX_CHANNELS 2 @@ -272,11 +274,12 @@ static int create_vorbis_context(vorbis_enc_context *venc, vorbis_enc_floor *fc; vorbis_enc_residue *rc; vorbis_enc_mapping *mc; - int i, book, ret; + int i, book, ret, blocks; venc->channels = avctx->channels; venc->sample_rate = avctx->sample_rate; - venc->log2_blocksize[0] = venc->log2_blocksize[1] = 11; + venc->log2_blocksize[0] = 8; + venc->log2_blocksize[1] = 11; venc->ncodebooks = FF_ARRAY_ELEMS(cvectors); venc->codebooks = av_malloc(sizeof(vorbis_enc_codebook) * venc->ncodebooks); @@ -464,6 +467,12 @@ static int create_vorbis_context(vorbis_enc_context *venc, if ((ret = dsp_init(avctx, venc)) < 0) return ret; + blocks = 1 << (venc->log2_blocksize[1] - venc->log2_blocksize[0]); + venc->vpctx = av_mallocz(sizeof(VorbisPsyContext)); + if (!venc->vpctx || (ret = ff_psy_vorbis_init(venc->vpctx, venc->sample_rate, + venc->channels, blocks)) < 0) + return AVERROR(ENOMEM); + return 0; } @@ -1078,15 +1087,17 @@ static void move_audio(vorbis_enc_context *venc, int sf_size) av_frame_free(&cur); } venc->have_saved = 1; - memcpy(venc->scratch, venc->samples, 2 * venc->channels * frame_size); + memcpy(venc->scratch, venc->samples, sizeof(float) * venc->channels * 2 * frame_size); } static int vorbis_encode_frame(AVCodecContext *avctx, AVPacket *avpkt, const AVFrame *frame, int *got_packet_ptr) { vorbis_enc_context *venc = avctx->priv_data; - int i, ret, need_more; + int i, ret, need_more, ch; + int curr_win = 1; int frame_size = 1 << (venc->log2_blocksize[1] - 1); + int block_size = 1 << (venc->log2_blocksize[0] - 1); vorbis_enc_mode *mode; vorbis_enc_mapping *mapping; PutBitContext pb; @@ -1121,6 +1132,13 @@ static int vorbis_encode_frame(AVCodecContext *avctx, AVPacket *avpkt, move_audio(venc, avctx->frame_size); + for (ch = 0; ch < venc->channels; ch++) { + float *scratch = venc->scratch + 2 * ch * frame_size + frame_size; + + if (!ff_psy_vorbis_block_frame(venc->vpctx, scratch, ch, frame_size, block_size)) + curr_win = 0; + } + if (!apply_window_and_mdct(venc)) return 0; @@ -1252,6 +1270,7 @@ static av_cold int vorbis_encode_close(AVCodecContext *avctx) ff_mdct_end(&venc->mdct[1]); ff_af_queue_close(&venc->afq); ff_bufqueue_discard_all(&venc->bufqueue); + ff_psy_vorbis_close(venc->vpctx); av_freep(&avctx->extradata); diff --git a/libavcodec/vorbispsy.c b/libavcodec/vorbispsy.c new file mode 100644 index 0000000000..ef48f6ac8c --- /dev/null +++ b/libavcodec/vorbispsy.c @@ -0,0 +1,148 @@ +/* + * Vorbis encoder psychoacoustic model + * Copyright (C) 2017 Tyler Jones + * + * This file is part of FFmpeg. + * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#include "libavutil/ffmath.h" + +#include "avcodec.h" +#include "vorbispsy.h" + +/** + * Generate the coefficients for a highpass biquad filter + * + * @param filter Instance of biquad filter to be initialized + * @param Fs Input's sampling frequency + * @param Fc Critical frequency for samples to be passed + * @param Q Quality factor + */ +static av_cold void biquad_filter_init(IIRFilter *filter, int Fs, int Fc, float Q) +{ + float k = tan(M_PI * Fc / Fs); + float normalize = 1 / (1 + k / Q + k * k); + + filter->b[0] = normalize; + filter->b[1] = -2 * normalize; + filter->b[2] = normalize; + + filter->a[0] = 1; + filter->a[1] = 2 * (k * k - 1) * normalize; + filter->a[2] = (1 - k / Q + k * k) * normalize; +} + +/** + * Direct Form II implementation for a second order digital filter + * + * @param filter Filter to be applied to input samples + * @param in Single input sample to be filtered + * @param delay Array of IIR feedback values + * @return Filtered sample + */ +static float apply_filter(IIRFilter *filter, float in, float *delay) +{ + float ret, w; + + w = filter->a[0] * in - filter->a[1] * delay[0] - filter->a[2] * delay[1]; + ret = filter->b[0] * w + filter->b[1] * delay[0] + filter->b[2] * delay[1]; + + delay[1] = delay[0]; + delay[0] = w; + + return ret; +} + +/** + * Calculate the variance of a block of samples + * + * @param in Array of input samples + * @param length Number of input samples being analyzed + * @return The variance for the current block + */ +static float variance(const float *in, int length) +{ + int i; + float mean = 0.0f, square_sum = 0.0f; + + for (i = 0; i < length; i++) { + mean += in[i]; + square_sum += in[i] * in[i]; + } + + mean /= length; + return (square_sum - length * mean * mean) / (length - 1); +} + +av_cold int ff_psy_vorbis_init(VorbisPsyContext *vpctx, int sample_rate, + int channels, int blocks) +{ + int crit_freq; + float Q[2] = {.54, 1.31}; // Quality values for maximally flat cascaded filters + + vpctx->filter_delay = av_mallocz_array(4 * channels, sizeof(vpctx->filter_delay[0])); + if (!vpctx->filter_delay) + return AVERROR(ENOMEM); + + crit_freq = sample_rate / 4; + biquad_filter_init(&vpctx->filter[0], sample_rate, crit_freq, Q[0]); + biquad_filter_init(&vpctx->filter[1], sample_rate, crit_freq, Q[1]); + + vpctx->variance = av_mallocz_array(channels * blocks, sizeof(vpctx->variance[0])); + if (!vpctx->variance) + return AVERROR(ENOMEM); + + vpctx->preecho_thresh = 100.0f; + + return 0; +} + +int ff_psy_vorbis_block_frame(VorbisPsyContext *vpctx, float *audio, + int ch, int frame_size, int block_size) +{ + int i, block_flag = 1; + int blocks = frame_size / block_size; + float last_var; + const float eps = 0.0001f; + float *var = vpctx->variance + ch * blocks; + + for (i = 0; i < frame_size; i++) { + apply_filter(&vpctx->filter[0], audio[i], vpctx->filter_delay + 4 * ch); + apply_filter(&vpctx->filter[1], audio[i], vpctx->filter_delay + 4 * ch + 2); + } + + for (i = 0; i < blocks; i++) { + last_var = var[i]; + var[i] = variance(audio + i * block_size, block_size); + + /* A small constant is added to the threshold in order to prevent false + * transients from being detected when quiet sounds follow near-silence */ + if (var[i] > vpctx->preecho_thresh * last_var + eps) + block_flag = 0; + } + + return block_flag; +} + +av_cold void ff_psy_vorbis_close(VorbisPsyContext *vpctx) +{ + if (vpctx) { + av_freep(&vpctx->filter_delay); + av_freep(&vpctx->variance); + } + av_freep(&vpctx); +} diff --git a/libavcodec/vorbispsy.h b/libavcodec/vorbispsy.h new file mode 100644 index 0000000000..021a5e8a28 --- /dev/null +++ b/libavcodec/vorbispsy.h @@ -0,0 +1,79 @@ +/* + * Vorbis encoder psychoacoustic model + * Copyright (C) 2017 Tyler Jones + * + * This file is part of FFmpeg. + * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +/** + * @file + * Vorbis psychoacoustic model + */ + +#ifndef AVCODEC_VORBISPSY_H +#define AVCODEC_VORBISPSY_H + +#include "libavutil/attributes.h" + +/** + * Second order IIR Filter + */ +typedef struct IIRFilter { + float b[3]; ///< Normalized cofficients for numerator of transfer function + float a[3]; ///< Normalized coefficiets for denominator of transfer function +} IIRFilter; + +typedef struct VorbisPsyContext { + IIRFilter filter[2]; + float *filter_delay; ///< Direct Form II delay registers for each channel + float *variance; ///< Saved variances from previous sub-blocks for each channel + float preecho_thresh; ///< Threshold for determining prescence of a preecho +} VorbisPsyContext; + +/** + * Initializes the psychoacoustic model context + * + * @param vpctx Uninitialized pointer to the model context + * @param sample_rate Input audio sample rate + * @param channels Number of channels being analyzed + * @param blocks Number of short blocks for every frame of input + * @return 0 on success, negative on failure + */ +av_cold int ff_psy_vorbis_init(VorbisPsyContext *vpctx, int sample_rate, + int channels, int blocks); + +/** + * Suggest the type of block to use for encoding the current frame + * + * Each frame of input is passed through a highpass filter to remove dominant + * low-frequency waveforms and the variance of each short block of input is + * then calculated. If the variance over this block is significantly more than + * blocks from the previous frame, a transient signal is likely present. + * + * @param audio Pointer to the current channel's input samples + * @param ch Current channel being analyzed + * @param frame_size Size of a full frame, i.e. the size of the long block + * @param block_size Size of the short block + * @return The correct blockflag to use for encoding, 0 short and 1 long + */ +int ff_psy_vorbis_block_frame(VorbisPsyContext *vpctx, float *audio, + int ch, int frame_size, int block_size); +/** + * Closes and frees the memory used by the psychoacoustic model + */ +av_cold void ff_psy_vorbis_close(VorbisPsyContext *vpctx); +#endif /* AVCODEC_VORBISPSY_H */
The encoder will attempt to determine the existence of transient signals by applying a 4th order highpass filter to remove dominant low frequency waveforms. Frames are then split up into blocks where the variance is calculated and compared with blocks from the previous frame. A preecho is only likely to be noticeable when relatively quiet audio is followed by a loud transient signal. Signed-off-by: Tyler Jones <tdjones879@gmail.com> --- V3: Use normal float notation Don't check before freeing NULL pointers Remove unnecessary includes V2: Provide proper prefix for non-static function libavcodec/Makefile | 2 +- libavcodec/vorbisenc.c | 27 +++++++-- libavcodec/vorbispsy.c | 148 +++++++++++++++++++++++++++++++++++++++++++++++++ libavcodec/vorbispsy.h | 79 ++++++++++++++++++++++++++ 4 files changed, 251 insertions(+), 5 deletions(-) create mode 100644 libavcodec/vorbispsy.c create mode 100644 libavcodec/vorbispsy.h