[FFmpeg-devel,1/6] avcodec/vorbisenc: Add pre-echo detection

Message ID	20170822012307.6019-2-tdjones879@gmail.com
State	New
Headers	show Delivered-To: ffmpegpatchwork@gmail.com Received-SPF: pass (google.com: domain of ffmpeg-devel-bounces@ffmpeg.org designates 79.124.17.100 as permitted sender) client-ip=79.124.17.100; From: Tyler Jones <tdjones879@gmail.com> To: ffmpeg-devel@ffmpeg.org Date: Mon, 21 Aug 2017 19:23:02 -0600 Message-Id: <20170822012307.6019-2-tdjones879@gmail.com> In-Reply-To: <20170822012307.6019-1-tdjones879@gmail.com> References: <20170822012307.6019-1-tdjones879@gmail.com> Subject: [FFmpeg-devel] [PATCH 1/6] avcodec/vorbisenc: Add pre-echo detection Precedence: list Reply-To: FFmpeg development discussions and patches <ffmpeg-devel@ffmpeg.org> MIME-Version: 1.0 Content-Type: text/plain; charset="utf-8" Content-Transfer-Encoding: base64 Errors-To: ffmpeg-devel-bounces@ffmpeg.org Sender: "ffmpeg-devel" <ffmpeg-devel-bounces@ffmpeg.org>

diff --git a/libavcodec/Makefile b/libavcodec/Makefile index 982d7f5179..315c403c9c 100644 --- a/libavcodec/Makefile +++ b/libavcodec/Makefile @@ -611,7 +611,7 @@ OBJS-$(CONFIG_VMNC_DECODER) += vmnc.o OBJS-$(CONFIG_VORBIS_DECODER) += vorbisdec.o vorbisdsp.o vorbis.o \ vorbis_data.o OBJS-$(CONFIG_VORBIS_ENCODER) += vorbisenc.o vorbis.o \ - vorbis_data.o + vorbis_data.o vorbispsy.o OBJS-$(CONFIG_VP3_DECODER) += vp3.o OBJS-$(CONFIG_VP5_DECODER) += vp5.o vp56.o vp56data.o vp56rac.o OBJS-$(CONFIG_VP6_DECODER) += vp6.o vp56.o vp56data.o \ diff --git a/libavcodec/vorbisenc.c b/libavcodec/vorbisenc.c index bf21a3b1ff..6da5f012c2 100644 --- a/libavcodec/vorbisenc.c +++ b/libavcodec/vorbisenc.c @@ -33,6 +33,7 @@ #include "mathops.h" #include "vorbis.h" #include "vorbis_enc_data.h" +#include "vorbispsy.h" #include "audio_frame_queue.h" #include "libavfilter/bufferqueue.h" @@ -136,6 +137,7 @@ typedef struct vorbis_enc_context { int64_t next_pts; AVFloatDSPContext *fdsp; + VorbisPsyContext vpctx; } vorbis_enc_context; #define MAX_CHANNELS 2 @@ -272,11 +274,12 @@ static int create_vorbis_context(vorbis_enc_context *venc, vorbis_enc_floor *fc; vorbis_enc_residue *rc; vorbis_enc_mapping *mc; - int i, book, ret; + int i, book, ret, blocks; venc->channels = avctx->channels; venc->sample_rate = avctx->sample_rate; - venc->log2_blocksize[0] = venc->log2_blocksize[1] = 11; + venc->log2_blocksize[0] = 8; + venc->log2_blocksize[1] = 11; venc->ncodebooks = FF_ARRAY_ELEMS(cvectors); venc->codebooks = av_malloc(sizeof(vorbis_enc_codebook) * venc->ncodebooks); @@ -464,6 +467,11 @@ static int create_vorbis_context(vorbis_enc_context *venc, if ((ret = dsp_init(avctx, venc)) < 0) return ret; + blocks = 1 << (venc->log2_blocksize[1] - venc->log2_blocksize[0]); + if ((ret = ff_psy_vorbis_init(&venc->vpctx, venc->sample_rate, + venc->channels, blocks, venc->fdsp)) < 0) + return ret; + return 0; } @@ -1078,15 +1086,17 @@ static void move_audio(vorbis_enc_context *venc, int sf_size) 
av_frame_free(&cur); } venc->have_saved = 1; - memcpy(venc->scratch, venc->samples, 2 * venc->channels * frame_size); + memcpy(venc->scratch, venc->samples, sizeof(float) * venc->channels * 2 * frame_size); } static int vorbis_encode_frame(AVCodecContext *avctx, AVPacket *avpkt, const AVFrame *frame, int *got_packet_ptr) { vorbis_enc_context *venc = avctx->priv_data; - int i, ret, need_more; + int i, ret, need_more, ch; + int curr_win = 1; int frame_size = 1 << (venc->log2_blocksize[1] - 1); + int block_size = 1 << (venc->log2_blocksize[0] - 1); vorbis_enc_mode *mode; vorbis_enc_mapping *mapping; PutBitContext pb; @@ -1121,6 +1131,14 @@ static int vorbis_encode_frame(AVCodecContext *avctx, AVPacket *avpkt, move_audio(venc, avctx->frame_size); + for (ch = 0; ch < venc->channels; ch++) { + float *scratch = venc->scratch + 2 * ch * frame_size + frame_size; + + if (!ff_psy_vorbis_block_frame(&venc->vpctx, scratch, ch, + frame_size, block_size)) + curr_win = 0; + } + if (!apply_window_and_mdct(venc)) return 0; @@ -1252,6 +1270,7 @@ static av_cold int vorbis_encode_close(AVCodecContext *avctx) ff_mdct_end(&venc->mdct[1]); ff_af_queue_close(&venc->afq); ff_bufqueue_discard_all(&venc->bufqueue); + ff_psy_vorbis_close(&venc->vpctx); av_freep(&avctx->extradata); diff --git a/libavcodec/vorbispsy.c b/libavcodec/vorbispsy.c new file mode 100644 index 0000000000..ab2d41f62f --- /dev/null +++ b/libavcodec/vorbispsy.c @@ -0,0 +1,147 @@ +/* + * Vorbis encoder psychoacoustic model + * Copyright (C) 2017 Tyler Jones + * + * This file is part of FFmpeg. + * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. 
+ * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#include "libavutil/ffmath.h" + +#include "avcodec.h" +#include "vorbispsy.h" + +/** + * Generate the coefficients for a highpass biquad filter + * + * @param filter Instance of biquad filter to be initialized + * @param Fs Input's sampling frequency + * @param Fc Critical frequency for samples to be passed + * @param Q Quality factor + */ +static av_cold void biquad_filter_init(IIRFilter *filter, int Fs, int Fc, float Q) +{ + float k = tan(M_PI * Fc / Fs); + float normalize = 1 / (1 + k / Q + k * k); + + filter->b[0] = normalize; + filter->b[1] = -2 * normalize; + filter->b[2] = normalize; + + filter->a[0] = 1; + filter->a[1] = 2 * (k * k - 1) * normalize; + filter->a[2] = (1 - k / Q + k * k) * normalize; +} + +/** + * Direct Form II implementation for a second order digital filter + * + * @param filter Filter to be applied to input samples + * @param in Single input sample to be filtered + * @param delay Array of IIR feedback values + * @return Filtered sample + */ +static float apply_filter(IIRFilter *filter, float in, float *delay) +{ + float ret, w; + + w = filter->a[0] * in - filter->a[1] * delay[0] - filter->a[2] * delay[1]; + ret = filter->b[0] * w + filter->b[1] * delay[0] + filter->b[2] * delay[1]; + + delay[1] = delay[0]; + delay[0] = w; + + return ret; +} + +/** + * Calculate the variance of a block of samples + * + * @param in Array of input samples + * @param length Number of input samples being analyzed + * @return The variance for the current block + */ +static float 
variance(const float *in, int length, AVFloatDSPContext *fdsp) +{ + int i; + float mean = 0.0f, square_sum = 0.0f; + + for (i = 0; i < length; i++) { + mean += in[i]; + } + + square_sum = fdsp->scalarproduct_float(in, in, length); + + mean /= length; + return (square_sum - length * mean * mean) / (length - 1); +} + +av_cold int ff_psy_vorbis_init(VorbisPsyContext *vpctx, int sample_rate, + int channels, int blocks, AVFloatDSPContext *fdsp) +{ + int crit_freq; + const float Q[2] = {.54, 1.31}; // Quality values for maximally flat cascaded filters + + vpctx->filter_delay = av_mallocz_array(4 * channels, sizeof(vpctx->filter_delay[0])); + if (!vpctx->filter_delay) + return AVERROR(ENOMEM); + + crit_freq = sample_rate / 4; + biquad_filter_init(&vpctx->filter[0], sample_rate, crit_freq, Q[0]); + biquad_filter_init(&vpctx->filter[1], sample_rate, crit_freq, Q[1]); + + vpctx->variance = av_mallocz_array(channels * blocks, sizeof(vpctx->variance[0])); + if (!vpctx->variance) + return AVERROR(ENOMEM); + + vpctx->preecho_thresh = 100.0f; + vpctx->fdsp = fdsp; + + return 0; +} + +int ff_psy_vorbis_block_frame(VorbisPsyContext *vpctx, float *audio, + int ch, int frame_size, int block_size) +{ + int i, block_flag = 1; + int blocks = frame_size / block_size; + float last_var; + const float eps = 0.0001f; + float *var = vpctx->variance + ch * blocks; + + for (i = 0; i < frame_size; i++) { /* NOTE(review): the values returned by apply_filter() are discarded and audio[] is never written, so variance() below sees the unfiltered samples; both stages are also fed audio[i] rather than being cascaded -- confirm intent */ + apply_filter(&vpctx->filter[0], audio[i], vpctx->filter_delay + 4 * ch); + apply_filter(&vpctx->filter[1], audio[i], vpctx->filter_delay + 4 * ch + 2); + } + + for (i = 0; i < blocks; i++) { + last_var = var[i]; + var[i] = variance(audio + i * block_size, block_size, vpctx->fdsp); + + /* A small constant is added to the threshold in order to prevent false + * transients from being detected when quiet sounds follow near-silence */ + if (var[i] > vpctx->preecho_thresh * last_var + eps) + block_flag = 0; + } + + return block_flag; +} + +av_cold void ff_psy_vorbis_close(VorbisPsyContext *vpctx) +{ 
+ av_freep(&vpctx->filter_delay); + av_freep(&vpctx->variance); +} diff --git a/libavcodec/vorbispsy.h b/libavcodec/vorbispsy.h new file mode 100644 index 0000000000..93a03fd8ca --- /dev/null +++ b/libavcodec/vorbispsy.h @@ -0,0 +1,82 @@ +/* + * Vorbis encoder psychoacoustic model + * Copyright (C) 2017 Tyler Jones + * + * This file is part of FFmpeg. + * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +/** + * @file + * Vorbis psychoacoustic model + */ + +#ifndef AVCODEC_VORBISPSY_H +#define AVCODEC_VORBISPSY_H + +#include "libavutil/attributes.h" +#include "libavutil/float_dsp.h" + +/** + * Second order IIR Filter + */ +typedef struct IIRFilter { + float b[3]; ///< Normalized coefficients for numerator of transfer function + float a[3]; ///< Normalized coefficients for denominator of transfer function +} IIRFilter; + +typedef struct VorbisPsyContext { + AVFloatDSPContext *fdsp; + IIRFilter filter[2]; + float *filter_delay; ///< Direct Form II delay registers for each channel + float *variance; ///< Saved variances from previous sub-blocks for each channel + float preecho_thresh; ///< Threshold for determining presence of a preecho +} VorbisPsyContext; + +/** + * Initializes the psychoacoustic model context + * + * @param vpctx Uninitialized pointer to the model context + * @param 
sample_rate Input audio sample rate + * @param channels Number of channels being analyzed + * @param blocks Number of short blocks for every frame of input + * @param fdsp Parent context's AVFloatDSPContext + * @return 0 on success, negative on failure + */ +av_cold int ff_psy_vorbis_init(VorbisPsyContext *vpctx, int sample_rate, + int channels, int blocks, AVFloatDSPContext *fdsp); + +/** + * Suggest the type of block to use for encoding the current frame + * + * Each frame of input is passed through a highpass filter to remove dominant + * low-frequency waveforms and the variance of each short block of input is + * then calculated. If the variance over this block is significantly more than + * blocks from the previous frame, a transient signal is likely present. + * + * @param audio Pointer to the current channel's input samples + * @param ch Current channel being analyzed + * @param frame_size Size of a full frame, i.e. the size of the long block + * @param block_size Size of the short block + * @return The correct blockflag to use for encoding, 0 short and 1 long + */ +int ff_psy_vorbis_block_frame(VorbisPsyContext *vpctx, float *audio, + int ch, int frame_size, int block_size); +/** + * Closes and frees the memory used by the psychoacoustic model + */ +av_cold void ff_psy_vorbis_close(VorbisPsyContext *vpctx); +#endif /* AVCODEC_VORBISPSY_H */

[FFmpeg-devel,1/6] avcodec/vorbisenc: Add pre-echo detection

Commit Message

Comments

Patch