[FFmpeg-devel,2/2] opusenc: (WIP) add a new psychoacoustic system for the native Opus encoder

Message ID	CAE9qxYAuteExYz2eOrgn7F95-OUeYd-AYdESncgUACQy8AR1AA@mail.gmail.com
State	New
Headers	show Delivered-To: ffmpegpatchwork@gmail.com Received-SPF: pass (google.com: domain of ffmpeg-devel-bounces@ffmpeg.org designates 79.124.17.100 as permitted sender) client-ip=79.124.17.100; MIME-Version: 1.0 In-Reply-To: <20170412222635.26793-2-atomnuker@gmail.com> References: <20170412222635.26793-1-atomnuker@gmail.com> <20170412222635.26793-2-atomnuker@gmail.com> From: Rostislav Pehlivanov <atomnuker@gmail.com> Date: Sat, 23 Sep 2017 00:46:48 +0100 Message-ID: <CAE9qxYAuteExYz2eOrgn7F95-OUeYd-AYdESncgUACQy8AR1AA@mail.gmail.com> To: FFmpeg development discussions and patches <ffmpeg-devel@ffmpeg.org> Content-Type: multipart/mixed; boundary="001a114d8a109814d60559cfd2cf" Subject: Re: [FFmpeg-devel] [PATCH 2/2] opusenc: (WIP) add a new psychoacoustic system for the native Opus encoder Precedence: list Reply-To: FFmpeg development discussions and patches <ffmpeg-devel@ffmpeg.org> Errors-To: ffmpeg-devel-bounces@ffmpeg.org Sender: "ffmpeg-devel" <ffmpeg-devel-bounces@ffmpeg.org>

From 62dcd1e7e4db391c30c3893fde82bc205ebb7bfe Mon Sep 17 00:00:00 2001 From: Rostislav Pehlivanov <atomnuker@gmail.com> Date: Sat, 23 Sep 2017 00:38:37 +0100 Subject: [PATCH] opusenc: implement a psychoacoustic system This commit implements a psychoacoustic system for the native Opus encoder. It's unlike any other psychoacoustic system known since it's capable of using a lookahead to make better choices on how to treat the current frame and how many bits to allocate for it (and future frames). Also, whilst the main bulk of the analysis function has to run in a single thread, the per-frame analysis functions do not modify the main psychoacoustic context, so in the future it will be fairly trivial to run those as slice threads. Signed-off-by: Rostislav Pehlivanov <atomnuker@gmail.com> --- libavcodec/Makefile | 3 +- libavcodec/opus_celt.h | 6 + libavcodec/opusenc.c | 270 ++++++++++++---------- libavcodec/opusenc.h | 56 +++++ libavcodec/opusenc_psy.c | 556 +++++++++++++++++++++++++++++++++++++++++++++ libavcodec/opusenc_psy.h | 104 +++++++++ libavcodec/opusenc_utils.h | 82 +++++++ 7 files changed, 951 insertions(+), 126 deletions(-) create mode 100644 libavcodec/opusenc.h create mode 100644 libavcodec/opusenc_psy.c create mode 100644 libavcodec/opusenc_psy.h create mode 100644 libavcodec/opusenc_utils.h diff --git a/libavcodec/Makefile b/libavcodec/Makefile index b0c39ac040..ff862f2c81 100644 --- a/libavcodec/Makefile +++ b/libavcodec/Makefile @@ -454,7 +454,8 @@ OBJS-$(CONFIG_NUV_DECODER) += nuv.o rtjpeg.o OBJS-$(CONFIG_ON2AVC_DECODER) += on2avc.o on2avcdata.o OBJS-$(CONFIG_OPUS_DECODER) += opusdec.o opus.o opus_celt.o opus_rc.o \ opus_pvq.o opus_silk.o opustab.o vorbis_data.o -OBJS-$(CONFIG_OPUS_ENCODER) += opusenc.o opus_rc.o opustab.o opus_pvq.o +OBJS-$(CONFIG_OPUS_ENCODER) += opusenc.o opus_rc.o opustab.o opus_pvq.o \ + opusenc_psy.o OBJS-$(CONFIG_PAF_AUDIO_DECODER) += pafaudio.o OBJS-$(CONFIG_PAF_VIDEO_DECODER) += pafvideo.o OBJS-$(CONFIG_PAM_DECODER) += pnmdec.o 
pnm.o diff --git a/libavcodec/opus_celt.h b/libavcodec/opus_celt.h index 31299912bd..45d50ab27b 100644 --- a/libavcodec/opus_celt.h +++ b/libavcodec/opus_celt.h @@ -120,6 +120,12 @@ struct CeltFrame { uint32_t seed; enum CeltSpread spread; + /* Encoder PF coeffs */ + int pf_octave; + int pf_period; + int pf_tapset; + float pf_gain; + /* Bit allocation */ int framebits; int remaining; diff --git a/libavcodec/opusenc.c b/libavcodec/opusenc.c index 8f2da4a7ba..79d20dc6e6 100644 --- a/libavcodec/opusenc.c +++ b/libavcodec/opusenc.c @@ -19,8 +19,9 @@ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA */ -#include "opus_celt.h" +#include "opusenc.h" #include "opus_pvq.h" +#include "opusenc_psy.h" #include "opustab.h" #include "libavutil/float_dsp.h" @@ -29,28 +30,10 @@ #include "bytestream.h" #include "audio_frame_queue.h" -/* Determines the maximum delay the psychoacoustic system will use for lookahead */ -#define FF_BUFQUEUE_SIZE 145 -#include "libavfilter/bufferqueue.h" - -#define OPUS_MAX_LOOKAHEAD ((FF_BUFQUEUE_SIZE - 1)*2.5f) - -#define OPUS_MAX_CHANNELS 2 - -/* 120 ms / 2.5 ms = 48 frames (extremely improbable, but the encoder'll work) */ -#define OPUS_MAX_FRAMES_PER_PACKET 48 - -#define OPUS_BLOCK_SIZE(x) (2 * 15 * (1 << ((x) + 2))) - -#define OPUS_SAMPLES_TO_BLOCK_SIZE(x) (ff_log2((x) / (2 * 15)) - 2) - -typedef struct OpusEncOptions { - float max_delay_ms; -} OpusEncOptions; - typedef struct OpusEncContext { AVClass *av_class; OpusEncOptions options; + OpusPsyContext psyctx; AVCodecContext *avctx; AudioFrameQueue afq; AVFloatDSPContext *dsp; @@ -58,10 +41,10 @@ typedef struct OpusEncContext { CeltPVQ *pvq; struct FFBufQueue bufqueue; - enum OpusMode mode; - enum OpusBandwidth bandwidth; - int pkt_framesize; - int pkt_frames; + uint8_t enc_id[64]; + int enc_id_bits; + + OpusPacketInfo packet; int channels; @@ -100,18 +83,18 @@ static int opus_gen_toc(OpusEncContext *s, uint8_t *toc, int *size, int *fsize_n { { 3, 7, 11, 0, 0 }, { 0, 
0, 0, 0, 0 }, { 0, 0, 0, 0, 0 } }, /* 40 ms */ { { 4, 8, 12, 0, 0 }, { 0, 0, 0, 0, 0 }, { 0, 0, 0, 0, 0 } }, /* 60 ms */ }; - int cfg = toc_cfg[s->pkt_framesize][s->mode][s->bandwidth]; + int cfg = toc_cfg[s->packet.framesize][s->packet.mode][s->packet.bandwidth]; *fsize_needed = 0; if (!cfg) return 1; - if (s->pkt_frames == 2) { /* 2 packets */ + if (s->packet.frames == 2) { /* 2 packets */ if (s->frame[0].framebits == s->frame[1].framebits) { /* same size */ tmp = 0x1; } else { /* different size */ tmp = 0x2; *fsize_needed = 1; /* put frame sizes in the packet */ } - } else if (s->pkt_frames > 2) { + } else if (s->packet.frames > 2) { tmp = 0x3; extended_toc = 1; } @@ -119,10 +102,11 @@ static int opus_gen_toc(OpusEncContext *s, uint8_t *toc, int *size, int *fsize_n tmp |= (cfg - 1) << 3; /* codec configuration */ *toc++ = tmp; if (extended_toc) { - for (i = 0; i < (s->pkt_frames - 1); i++) + for (i = 0; i < (s->packet.frames - 1); i++) *fsize_needed |= (s->frame[i].framebits != s->frame[i + 1].framebits); - tmp = (*fsize_needed) << 7; /* vbr flag */ - tmp |= s->pkt_frames; /* frame number - can be 0 as well */ + tmp = (*fsize_needed) << 7; /* vbr flag */ + tmp |= (0) << 6; /* padding flag */ + tmp |= s->packet.frames; *toc++ = tmp; } *size = 1 + extended_toc; @@ -134,7 +118,7 @@ static void celt_frame_setup_input(OpusEncContext *s, CeltFrame *f) int sf, ch; AVFrame *cur = NULL; const int subframesize = s->avctx->frame_size; - int subframes = OPUS_BLOCK_SIZE(s->pkt_framesize) / subframesize; + int subframes = OPUS_BLOCK_SIZE(s->packet.framesize) / subframesize; cur = ff_bufqueue_get(&s->bufqueue); @@ -174,7 +158,7 @@ static void celt_apply_preemph_filter(OpusEncContext *s, CeltFrame *f) { int i, sf, ch; const int subframesize = s->avctx->frame_size; - const int subframes = OPUS_BLOCK_SIZE(s->pkt_framesize) / subframesize; + const int subframes = OPUS_BLOCK_SIZE(s->packet.framesize) / subframesize; /* Filter overlap */ for (ch = 0; ch < f->channels; ch++) { @@ 
-207,7 +191,7 @@ static void celt_apply_preemph_filter(OpusEncContext *s, CeltFrame *f) /* Create the window and do the mdct */ static void celt_frame_mdct(OpusEncContext *s, CeltFrame *f) { - int t, ch; + int i, j, t, ch; float *win = s->scratch, *temp = s->scratch + 1920; if (f->transient) { @@ -245,12 +229,6 @@ static void celt_frame_mdct(OpusEncContext *s, CeltFrame *f) s->mdct[f->size]->mdct(s->mdct[f->size], b->coeffs, win, 1); } } -} - -/* Fills the bands and normalizes them */ -static void celt_frame_map_norm_bands(OpusEncContext *s, CeltFrame *f) -{ - int i, j, ch; for (ch = 0; ch < f->channels; ch++) { CeltBlock *block = &f->block[ch]; @@ -304,7 +282,7 @@ static void celt_enc_tf(OpusRangeCoder *rc, CeltFrame *f) f->tf_change[i] = ff_celt_tf_select[f->size][f->transient][tf_select][f->tf_change[i]]; } -static void ff_celt_enc_bitalloc(OpusRangeCoder *rc, CeltFrame *f) +void ff_celt_enc_bitalloc(OpusRangeCoder *rc, CeltFrame *f) { int i, j, low, high, total, done, bandbits, remaining, tbits_8ths; int skip_startband = f->start_band; @@ -324,6 +302,8 @@ static void ff_celt_enc_bitalloc(OpusRangeCoder *rc, CeltFrame *f) /* Tell the spread to the decoder */ if (opus_rc_tell(rc) + 4 <= f->framebits) ff_opus_rc_enc_cdf(rc, f->spread, ff_celt_model_spread); + else + f->spread = CELT_SPREAD_NORMAL; /* Generate static allocation caps */ for (i = 0; i < CELT_MAX_BANDS; i++) { @@ -629,6 +609,43 @@ static void ff_celt_enc_bitalloc(OpusRangeCoder *rc, CeltFrame *f) } } +static void celt_enc_quant_pfilter(OpusRangeCoder *rc, CeltFrame *f) +{ + float gain = f->pf_gain; + int i, txval, octave = f->pf_octave, period = f->pf_period, tapset = f->pf_tapset; + + ff_opus_rc_enc_log(rc, f->pfilter, 1); + if (!f->pfilter) + return; + + /* Octave */ + txval = FFMIN(octave, 6); + ff_opus_rc_enc_uint(rc, txval, 6); + octave = txval; + /* Period */ + txval = av_clip(period - (16 << octave) + 1, 0, (1 << (4 + octave)) - 1); + ff_opus_rc_put_raw(rc, period, 4 + octave); + period = txval 
+ (16 << octave) - 1; + /* Gain */ + txval = FFMIN(((int)(gain / 0.09375f)) - 1, 7); + ff_opus_rc_put_raw(rc, txval, 3); + gain = 0.09375f * (txval + 1); + /* Tapset */ + if ((opus_rc_tell(rc) + 2) <= f->framebits) + ff_opus_rc_enc_cdf(rc, tapset, ff_celt_model_tapset); + else + tapset = 0; + /* Finally create the coeffs */ + for (i = 0; i < 2; i++) { + CeltBlock *block = &f->block[i]; + + block->pf_period_new = FFMAX(period, CELT_POSTFILTER_MINPERIOD); + block->pf_gains_new[0] = gain * ff_celt_postfilter_taps[tapset][0]; + block->pf_gains_new[1] = gain * ff_celt_postfilter_taps[tapset][1]; + block->pf_gains_new[2] = gain * ff_celt_postfilter_taps[tapset][2]; + } +} + static void exp_quant_coarse(OpusRangeCoder *rc, CeltFrame *f, float last_energy[][CELT_MAX_BANDS], int intra) { @@ -819,39 +836,64 @@ static void celt_quant_bands(OpusRangeCoder *rc, CeltFrame *f) } } -static void celt_encode_frame(OpusEncContext *s, OpusRangeCoder *rc, CeltFrame *f) +static void celt_encode_frame(OpusEncContext *s, OpusRangeCoder *rc, + CeltFrame *f, int index) { int i, ch; + ff_opus_rc_enc_init(rc); + + ff_opus_psy_celt_frame_init(&s->psyctx, f, index); + celt_frame_setup_input(s, f); + + if (f->silence) { + if (f->framebits >= 16) + ff_opus_rc_enc_log(rc, 1, 15); /* Silence (if using explicit singalling) */ + for (ch = 0; ch < s->channels; ch++) + memset(s->last_quantized_energy[ch], 0.0f, sizeof(float)*CELT_MAX_BANDS); + return; + } + + /* Filters */ celt_apply_preemph_filter(s, f); if (f->pfilter) { - /* Not implemented */ + ff_opus_rc_enc_log(rc, 0, 15); + celt_enc_quant_pfilter(rc, f); } + + /* Transform */ celt_frame_mdct(s, f); - celt_frame_map_norm_bands(s, f); - ff_opus_rc_enc_log(rc, f->silence, 15); + /* Need to handle transient/non-transient switches at any point during analysis */ + while (ff_opus_psy_celt_frame_process(&s->psyctx, f, index)) + celt_frame_mdct(s, f); - if (!f->start_band && opus_rc_tell(rc) + 16 <= f->framebits) - ff_opus_rc_enc_log(rc, f->pfilter, 1); 
+ ff_opus_rc_enc_init(rc); - if (f->pfilter) { - /* Not implemented */ - } + /* Silence */ + ff_opus_rc_enc_log(rc, 0, 15); + + /* Pitch filter */ + if (!f->start_band && opus_rc_tell(rc) + 16 <= f->framebits) + celt_enc_quant_pfilter(rc, f); + /* Transient flag */ if (f->size && opus_rc_tell(rc) + 3 <= f->framebits) ff_opus_rc_enc_log(rc, f->transient, 3); + /* Main encoding */ celt_quant_coarse(rc, f, s->last_quantized_energy); celt_enc_tf (rc, f); ff_celt_enc_bitalloc(rc, f); celt_quant_fine (rc, f); celt_quant_bands (rc, f); + /* Anticollapse bit */ if (f->anticollapse_needed) ff_opus_rc_put_raw(rc, f->anticollapse, 1); + /* Final per-band energy adjustments from leftover bits */ celt_quant_final(s, rc, f); for (ch = 0; ch < f->channels; ch++) { @@ -861,49 +903,11 @@ static void celt_encode_frame(OpusEncContext *s, OpusRangeCoder *rc, CeltFrame * } } -static void ff_opus_psy_process(OpusEncContext *s, int end, int *need_more) +static inline int write_opuslacing(uint8_t *dst, int v) { - int max_delay_samples = (s->options.max_delay_ms*s->avctx->sample_rate)/1000; - int max_bsize = FFMIN(OPUS_SAMPLES_TO_BLOCK_SIZE(max_delay_samples), CELT_BLOCK_960); - - s->pkt_frames = 1; - s->pkt_framesize = max_bsize; - s->mode = OPUS_MODE_CELT; - s->bandwidth = OPUS_BANDWIDTH_FULLBAND; - - *need_more = s->bufqueue.available*s->avctx->frame_size < (max_delay_samples + CELT_OVERLAP); - /* Don't request more if we start being flushed with NULL frames */ - *need_more = !end && *need_more; -} - -static void ff_opus_psy_celt_frame_setup(OpusEncContext *s, CeltFrame *f, int index) -{ - int frame_size = OPUS_BLOCK_SIZE(s->pkt_framesize); - - f->avctx = s->avctx; - f->dsp = s->dsp; - f->pvq = s->pvq; - f->start_band = (s->mode == OPUS_MODE_HYBRID) ? 
17 : 0; - f->end_band = ff_celt_band_end[s->bandwidth]; - f->channels = s->channels; - f->size = s->pkt_framesize; - - /* Decisions */ - f->silence = 0; - f->pfilter = 0; - f->transient = 0; - f->tf_select = 0; - f->anticollapse = 0; - f->alloc_trim = 5; - f->skip_band_floor = f->end_band; - f->intensity_stereo = f->end_band; - f->dual_stereo = 0; - f->spread = CELT_SPREAD_NORMAL; - memset(f->tf_change, 0, sizeof(int)*CELT_MAX_BANDS); - memset(f->alloc_boost, 0, sizeof(int)*CELT_MAX_BANDS); - - f->blocks = f->transient ? frame_size/CELT_OVERLAP : 1; - f->framebits = FFALIGN(lrintf((double)s->avctx->bit_rate/(s->avctx->sample_rate/frame_size)), 8); + dst[0] = FFMIN(v - FFALIGN(v - 255, 4), v); + dst[1] = v - dst[0] >> 2; + return 1 + (v >= 252); } static void opus_packet_assembler(OpusEncContext *s, AVPacket *avpkt) @@ -913,8 +917,18 @@ static void opus_packet_assembler(OpusEncContext *s, AVPacket *avpkt) /* Write toc */ opus_gen_toc(s, avpkt->data, &offset, &fsize_needed); - for (i = 0; i < s->pkt_frames; i++) { - ff_opus_rc_enc_end(&s->rc[i], avpkt->data + offset, s->frame[i].framebits >> 3); + /* Frame sizes if needed */ + if (fsize_needed) { + for (i = 0; i < s->packet.frames - 1; i++) { + offset += write_opuslacing(avpkt->data + offset, + s->frame[i].framebits >> 3); + } + } + + /* Packets */ + for (i = 0; i < s->packet.frames; i++) { + ff_opus_rc_enc_end(&s->rc[i], avpkt->data + offset, + s->frame[i].framebits >> 3); offset += s->frame[i].framebits >> 3; } @@ -946,29 +960,27 @@ static int opus_encode_frame(AVCodecContext *avctx, AVPacket *avpkt, const AVFrame *frame, int *got_packet_ptr) { OpusEncContext *s = avctx->priv_data; - int i, ret, frame_size, need_more, alloc_size = 0; + int i, ret, frame_size, alloc_size = 0; if (frame) { /* Add new frame to queue */ if ((ret = ff_af_queue_add(&s->afq, frame)) < 0) return ret; ff_bufqueue_add(avctx, &s->bufqueue, av_frame_clone(frame)); } else { + ff_opus_psy_signal_eof(&s->psyctx); if (!s->afq.remaining_samples) 
return 0; /* We've been flushed and there's nothing left to encode */ } /* Run the psychoacoustic system */ - ff_opus_psy_process(s, !frame, &need_more); - - /* Get more samples for lookahead/encoding */ - if (need_more) + if (ff_opus_psy_process(&s->psyctx, &s->packet)) return 0; - frame_size = OPUS_BLOCK_SIZE(s->pkt_framesize); + frame_size = OPUS_BLOCK_SIZE(s->packet.framesize); if (!frame) { /* This can go negative, that's not a problem, we only pad if positive */ - int pad_empty = s->pkt_frames*(frame_size/s->avctx->frame_size) - s->bufqueue.available + 1; + int pad_empty = s->packet.frames*(frame_size/s->avctx->frame_size) - s->bufqueue.available + 1; /* Pad with empty 2.5 ms frames to whatever framesize was decided, * this should only happen at the very last flush frame. The frames * allocated here will be freed (because they have no other references) @@ -981,15 +993,13 @@ static int opus_encode_frame(AVCodecContext *avctx, AVPacket *avpkt, } } - for (i = 0; i < s->pkt_frames; i++) { - ff_opus_rc_enc_init(&s->rc[i]); - ff_opus_psy_celt_frame_setup(s, &s->frame[i], i); - celt_encode_frame(s, &s->rc[i], &s->frame[i]); + for (i = 0; i < s->packet.frames; i++) { + celt_encode_frame(s, &s->rc[i], &s->frame[i], i); alloc_size += s->frame[i].framebits >> 3; } /* Worst case toc + the frame lengths if needed */ - alloc_size += 2 + s->pkt_frames*2; + alloc_size += 2 + s->packet.frames*2; if ((ret = ff_alloc_packet2(avctx, avpkt, alloc_size, 0)) < 0) return ret; @@ -997,13 +1007,16 @@ static int opus_encode_frame(AVCodecContext *avctx, AVPacket *avpkt, /* Assemble packet */ opus_packet_assembler(s, avpkt); + /* Update the psychoacoustic system */ + ff_opus_psy_postencode_update(&s->psyctx, s->frame, s->rc); + /* Remove samples from queue and skip if needed */ - ff_af_queue_remove(&s->afq, s->pkt_frames*frame_size, &avpkt->pts, &avpkt->duration); - if (s->pkt_frames*frame_size > avpkt->duration) { + ff_af_queue_remove(&s->afq, s->packet.frames*frame_size, &avpkt->pts, 
&avpkt->duration); + if (s->packet.frames*frame_size > avpkt->duration) { uint8_t *side = av_packet_new_side_data(avpkt, AV_PKT_DATA_SKIP_SAMPLES, 10); if (!side) return AVERROR(ENOMEM); - AV_WL32(&side[4], s->pkt_frames*frame_size - avpkt->duration + 120); + AV_WL32(&side[4], s->packet.frames*frame_size - avpkt->duration + 120); } *got_packet_ptr = 1; @@ -1024,6 +1037,7 @@ static av_cold int opus_encode_end(AVCodecContext *avctx) av_freep(&s->frame); av_freep(&s->rc); ff_af_queue_close(&s->afq); + ff_opus_psy_end(&s->psyctx); ff_bufqueue_discard_all(&s->bufqueue); av_freep(&avctx->extradata); @@ -1032,7 +1046,7 @@ static av_cold int opus_encode_end(AVCodecContext *avctx) static av_cold int opus_encode_init(AVCodecContext *avctx) { - int i, ch, ret; + int i, ch, ret, max_frames; OpusEncContext *s = avctx->priv_data; s->avctx = avctx; @@ -1057,14 +1071,6 @@ static av_cold int opus_encode_init(AVCodecContext *avctx) avctx->bit_rate = clipped_rate; } - /* Frame structs and range coder buffers */ - s->frame = av_malloc(OPUS_MAX_FRAMES_PER_PACKET*sizeof(CeltFrame)); - if (!s->frame) - return AVERROR(ENOMEM); - s->rc = av_malloc(OPUS_MAX_FRAMES_PER_PACKET*sizeof(OpusRangeCoder)); - if (!s->rc) - return AVERROR(ENOMEM); - /* Extradata */ avctx->extradata_size = 19; avctx->extradata = av_malloc(avctx->extradata_size + AV_INPUT_BUFFER_PADDING_SIZE); @@ -1085,27 +1091,41 @@ static av_cold int opus_encode_init(AVCodecContext *avctx) if ((ret = ff_mdct15_init(&s->mdct[i], 0, i + 3, 68 << (CELT_BLOCK_NB - 1 - i)))) return AVERROR(ENOMEM); - for (i = 0; i < OPUS_MAX_FRAMES_PER_PACKET; i++) { - s->frame[i].block[0].emph_coeff = s->frame[i].block[1].emph_coeff = 0.0f; - s->frame[i].seed = 0; - } - /* Zero out previous energy (matters for inter first frame) */ for (ch = 0; ch < s->channels; ch++) - for (i = 0; i < CELT_MAX_BANDS; i++) - s->last_quantized_energy[ch][i] = 0.0f; + memset(s->last_quantized_energy[ch], 0.0f, sizeof(float)*CELT_MAX_BANDS); /* Allocate an empty frame to 
use as overlap for the first frame of audio */ ff_bufqueue_add(avctx, &s->bufqueue, spawn_empty_frame(s)); if (!ff_bufqueue_peek(&s->bufqueue, 0)) return AVERROR(ENOMEM); + if ((ret = ff_opus_psy_init(&s->psyctx, s->avctx, &s->bufqueue, &s->options))) + return ret; + + /* Frame structs and range coder buffers */ + max_frames = ceilf(FFMIN(s->options.max_delay_ms, 120.0f)/2.5f); + s->frame = av_malloc(max_frames*sizeof(CeltFrame)); + if (!s->frame) + return AVERROR(ENOMEM); + s->rc = av_malloc(max_frames*sizeof(OpusRangeCoder)); + if (!s->rc) + return AVERROR(ENOMEM); + + for (i = 0; i < max_frames; i++) { + s->frame[i].dsp = s->dsp; + s->frame[i].avctx = s->avctx; + s->frame[i].seed = 0; + s->frame[i].pvq = s->pvq; + s->frame[i].block[0].emph_coeff = s->frame[i].block[1].emph_coeff = 0.0f; + } + return 0; } #define OPUSENC_FLAGS AV_OPT_FLAG_ENCODING_PARAM | AV_OPT_FLAG_AUDIO_PARAM static const AVOption opusenc_options[] = { - { "opus_delay", "Maximum delay (and lookahead) in milliseconds", offsetof(OpusEncContext, options.max_delay_ms), AV_OPT_TYPE_FLOAT, { .dbl = OPUS_MAX_LOOKAHEAD }, 2.5f, OPUS_MAX_LOOKAHEAD, OPUSENC_FLAGS }, + { "opus_delay", "Maximum delay in milliseconds", offsetof(OpusEncContext, options.max_delay_ms), AV_OPT_TYPE_FLOAT, { .dbl = OPUS_MAX_LOOKAHEAD }, 2.5f, OPUS_MAX_LOOKAHEAD, OPUSENC_FLAGS, "max_delay_ms" }, { NULL }, }; diff --git a/libavcodec/opusenc.h b/libavcodec/opusenc.h new file mode 100644 index 0000000000..3273d0a9a2 --- /dev/null +++ b/libavcodec/opusenc.h @@ -0,0 +1,56 @@ +/* + * Opus encoder + * Copyright (c) 2017 Rostislav Pehlivanov <atomnuker@gmail.com> + * + * This file is part of FFmpeg. + * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. 
+ * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#ifndef AVCODEC_OPUSENC_H +#define AVCODEC_OPUSENC_H + +#include "internal.h" +#include "opus_celt.h" + +/* Determines the maximum delay the psychoacoustic system will use for lookahead */ +#define FF_BUFQUEUE_SIZE 145 +#include "libavfilter/bufferqueue.h" + +#define OPUS_MAX_LOOKAHEAD ((FF_BUFQUEUE_SIZE - 1)*2.5f) + +#define OPUS_MAX_CHANNELS 2 + +/* 120 ms / 2.5 ms = 48 frames (extremely improbable, but the encoder'll work) */ +#define OPUS_MAX_FRAMES_PER_PACKET 48 + +#define OPUS_BLOCK_SIZE(x) (2 * 15 * (1 << ((x) + 2))) + +#define OPUS_SAMPLES_TO_BLOCK_SIZE(x) (ff_log2((x) / (2 * 15)) - 2) + +typedef struct OpusEncOptions { + float max_delay_ms; +} OpusEncOptions; + +typedef struct OpusPacketInfo { + enum OpusMode mode; + enum OpusBandwidth bandwidth; + int framesize; + int frames; +} OpusPacketInfo; + +void ff_celt_enc_bitalloc(OpusRangeCoder *rc, CeltFrame *f); + +#endif /* AVCODEC_OPUSENC_H */ diff --git a/libavcodec/opusenc_psy.c b/libavcodec/opusenc_psy.c new file mode 100644 index 0000000000..7c356fc568 --- /dev/null +++ b/libavcodec/opusenc_psy.c @@ -0,0 +1,556 @@ +/* + * Opus encoder + * Copyright (c) 2017 Rostislav Pehlivanov <atomnuker@gmail.com> + * + * This file is part of FFmpeg. + * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. 
+ * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#include "opusenc_psy.h" +#include "opus_pvq.h" +#include "opustab.h" +#include "mdct15.h" +#include "libavutil/qsort.h" + +/* Populate metrics without taking into consideration neighbouring steps */ +static void step_collect_psy_metrics(OpusPsyContext *s, int index) +{ + int silence = 0, ch, i, j; + OpusPsyStep *st = s->steps[index]; + + st->index = index; + + for (ch = 0; ch < s->avctx->channels; ch++) { + const int lap_size = (1 << s->bsize_analysis); + for (i = 1; i <= FFMIN(lap_size, index); i++) { + const int offset = i*120; + AVFrame *cur = ff_bufqueue_peek(s->bufqueue, index - i); + memcpy(&s->scratch[offset], cur->extended_data[ch], cur->nb_samples*sizeof(float)); + } + for (i = 0; i < lap_size; i++) { + const int offset = i*120 + lap_size; + AVFrame *cur = ff_bufqueue_peek(s->bufqueue, index + i); + memcpy(&s->scratch[offset], cur->extended_data[ch], cur->nb_samples*sizeof(float)); + } + + s->dsp->vector_fmul(s->scratch, s->scratch, s->window[s->bsize_analysis], + (OPUS_BLOCK_SIZE(s->bsize_analysis) << 1)); + + s->mdct[s->bsize_analysis]->mdct(s->mdct[s->bsize_analysis], st->coeffs[ch], s->scratch, 1); + + for (i = 0; i < CELT_MAX_BANDS; i++) + st->bands[ch][i] = &st->coeffs[ch][ff_celt_freq_bands[i] << s->bsize_analysis]; + } + + for (ch = 0; ch < s->avctx->channels; ch++) { + for (i = 0; i < CELT_MAX_BANDS; i++) { + float avg_c_s, energy = 0.0f, dist_dev = 0.0f; + const int range = ff_celt_freq_range[i] << s->bsize_analysis; + const float *coeffs = st->bands[ch][i]; + for (j = 0; 
j < range; j++) + energy += coeffs[j]*coeffs[j]; + + st->energy[ch][i] += sqrtf(energy); + silence |= !!st->energy[ch][i]; + avg_c_s = energy / range; + + for (j = 0; j < range; j++) { + const float c_s = coeffs[j]*coeffs[j]; + dist_dev = (avg_c_s - c_s)*(avg_c_s - c_s); + } + + st->tone[ch][i] += sqrtf(dist_dev); + } + } + + st->silence = !silence; + + if (s->avctx->channels > 1) { + for (i = 0; i < CELT_MAX_BANDS; i++) { + float incompat = 0.0f; + const float *coeffs1 = st->bands[0][i]; + const float *coeffs2 = st->bands[1][i]; + const int range = ff_celt_freq_range[i] << s->bsize_analysis; + for (j = 0; j < range; j++) + incompat += (coeffs1[j] - coeffs2[j])*(coeffs1[j] - coeffs2[j]); + st->stereo[i] = sqrtf(incompat); + } + } + + for (ch = 0; ch < s->avctx->channels; ch++) { + for (i = 0; i < CELT_MAX_BANDS; i++) { + OpusBandExcitation *ex = &s->ex[ch][i]; + float bp_e = bessel_filter(&s->bfilter_lo[ch][i], st->energy[ch][i]); + bp_e = bessel_filter(&s->bfilter_hi[ch][i], bp_e); + bp_e *= bp_e; + if (bp_e > ex->excitation) { + st->change_amp[ch][i] = bp_e - ex->excitation; + st->total_change += st->change_amp[ch][i]; + ex->excitation = ex->excitation_init = bp_e; + ex->excitation_dist = 0.0f; + } + if (ex->excitation > 0.0f) { + ex->excitation -= av_clipf((1/expf(ex->excitation_dist)), ex->excitation_init/20, ex->excitation_init/1.09); + ex->excitation = FFMAX(ex->excitation, 0.0f); + ex->excitation_dist += 1.0f; + } + } + } +} + +static void search_for_change_points(OpusPsyContext *s, float tgt_change, + int offset_s, int offset_e, int resolution, + int level) +{ + int i; + float c_change = 0.0f; + if ((offset_e - offset_s) <= resolution) + return; + for (i = offset_s; i < offset_e; i++) { + c_change += s->steps[i]->total_change; + if (c_change > tgt_change) + break; + } + if (i == offset_e) + return; + search_for_change_points(s, tgt_change / 2.0f, offset_s, i + 0, resolution, level + 1); + s->inflection_points[s->inflection_points_count++] = i; + 
search_for_change_points(s, tgt_change / 2.0f, i + 1, offset_e, resolution, level + 1); +} + +static int flush_silent_frames(OpusPsyContext *s) +{ + int fsize, silent_frames; + + for (silent_frames = 0; silent_frames < s->buffered_steps; silent_frames++) + if (!s->steps[silent_frames]->silence) + break; + if (--silent_frames < 0) + return 0; + + for (fsize = CELT_BLOCK_960; fsize > CELT_BLOCK_120; fsize--) { + if ((1 << fsize) > silent_frames) + continue; + s->p.frames = FFMIN(silent_frames / (1 << fsize), 48 >> fsize); + s->p.framesize = fsize; + return 1; + } + + return 0; +} + +/* Main function which decides frame size and frames per current packet */ +static void psy_output_groups(OpusPsyContext *s) +{ + int max_delay_samples = (s->options->max_delay_ms*s->avctx->sample_rate)/1000; + int max_bsize = FFMIN(OPUS_SAMPLES_TO_BLOCK_SIZE(max_delay_samples), CELT_BLOCK_960); + + /* These don't change for now */ + s->p.mode = OPUS_MODE_CELT; + s->p.bandwidth = OPUS_BANDWIDTH_FULLBAND; + + /* Flush silent frames ASAP */ + if (s->steps[0]->silence && flush_silent_frames(s)) + return; + + s->p.framesize = FFMIN(max_bsize, CELT_BLOCK_960); + s->p.frames = 1; +} + +int ff_opus_psy_process(OpusPsyContext *s, OpusPacketInfo *p) +{ + int i; + float total_energy_change = 0.0f; + + if (s->buffered_steps < s->max_steps && !s->eof) { + const int awin = (1 << s->bsize_analysis); + if (++s->steps_to_process >= awin) { + step_collect_psy_metrics(s, s->buffered_steps - awin + 1); + s->steps_to_process = 0; + } + if ((++s->buffered_steps) < s->max_steps) + return 1; + } + + for (i = 0; i < s->buffered_steps; i++) + total_energy_change += s->steps[i]->total_change; + + search_for_change_points(s, total_energy_change / 2.0f, 0, + s->buffered_steps, 1, 0); + + psy_output_groups(s); + + p->frames = s->p.frames; + p->framesize = s->p.framesize; + p->mode = s->p.mode; + p->bandwidth = s->p.bandwidth; + + return 0; +} + +void ff_opus_psy_celt_frame_init(OpusPsyContext *s, CeltFrame *f, int 
index) +{ + int i, neighbouring_points = 0, start_offset = 0; + int radius = (1 << s->p.framesize), step_offset = radius*index; + int silence = 1; + + f->start_band = (s->p.mode == OPUS_MODE_HYBRID) ? 17 : 0; + f->end_band = ff_celt_band_end[s->p.bandwidth]; + f->channels = s->avctx->channels; + f->size = s->p.framesize; + + for (i = 0; i < (1 << f->size); i++) + silence &= s->steps[index*(1 << f->size) + i]->silence; + + f->silence = silence; + if (f->silence) { + f->framebits = 0; /* Otherwise the silence flag eats up 16(!) bits */ + return; + } + + for (i = 0; i < s->inflection_points_count; i++) { + if (s->inflection_points[i] >= step_offset) { + start_offset = i; + break; + } + } + + for (i = start_offset; i < FFMIN(radius, s->inflection_points_count - start_offset); i++) { + if (s->inflection_points[i] < (step_offset + radius)) { + neighbouring_points++; + } + } + + /* Transient flagging */ + f->transient = neighbouring_points > 0; + f->blocks = f->transient ? OPUS_BLOCK_SIZE(s->p.framesize)/CELT_OVERLAP : 1; + + /* Some sane defaults */ + f->pfilter = 0; + f->pf_gain = 0.5f; + f->pf_octave = 2; + f->pf_period = 1; + f->pf_tapset = 2; + + /* More sane defaults */ + f->tf_select = 0; + f->anticollapse = 1; + f->alloc_trim = 5; + f->skip_band_floor = f->end_band; + f->intensity_stereo = f->end_band; + f->dual_stereo = 0; + f->spread = CELT_SPREAD_NORMAL; + memset(f->tf_change, 0, sizeof(int)*CELT_MAX_BANDS); + memset(f->alloc_boost, 0, sizeof(int)*CELT_MAX_BANDS); +} + +static void celt_gauge_psy_weight(OpusPsyContext *s, OpusPsyStep **start, + CeltFrame *f_out) +{ + int i, f, ch; + int frame_size = OPUS_BLOCK_SIZE(s->p.framesize); + float rate, frame_bits = 0; + + /* Used for the global ROTATE flag */ + float tonal = 0.0f; + + /* Pseudo-weights */ + float band_score[CELT_MAX_BANDS] = { 0 }; + float max_score = 1.0f; + + /* Pass one - one loop around each band, computing unquant stuff */ + for (i = 0; i < CELT_MAX_BANDS; i++) { + float weight = 0.0f; + float 
tonal_contrib = 0.0f; + for (f = 0; f < (1 << s->p.framesize); f++) { + weight = start[f]->stereo[i]; + for (ch = 0; ch < s->avctx->channels; ch++) { + weight += start[f]->change_amp[ch][i] + start[f]->tone[ch][i] + start[f]->energy[ch][i]; + tonal_contrib += start[f]->tone[ch][i]; + } + } + tonal += tonal_contrib; + band_score[i] = weight; + } + + tonal /= (float)CELT_MAX_BANDS; + + for (i = 0; i < CELT_MAX_BANDS; i++) { + if (band_score[i] > max_score) + max_score = band_score[i]; + } + + for (i = 0; i < CELT_MAX_BANDS; i++) { + f_out->alloc_boost[i] = (int)((band_score[i]/max_score)*3.0f); + frame_bits += band_score[i]*8.0f; + } + + tonal /= 1333136.0f; + f_out->spread = av_clip(lrintf(tonal), 0, 3); + + rate = ((float)s->avctx->bit_rate) + frame_bits*frame_size*16; + rate *= s->lambda; + rate /= s->avctx->sample_rate/frame_size; + + f_out->framebits = lrintf(rate); + f_out->framebits = FFMIN(f_out->framebits, OPUS_MAX_PACKET_SIZE*8); + f_out->framebits = FFALIGN(f_out->framebits, 8); +} + +static int bands_dist(OpusPsyContext *s, CeltFrame *f, float *total_dist) +{ + int i, tdist = 0.0f; + OpusRangeCoder dump; + + ff_opus_rc_enc_init(&dump); + ff_celt_enc_bitalloc(&dump, f); + + for (i = 0; i < CELT_MAX_BANDS; i++) { + float bits = 0.0f; + float dist = f->pvq->band_cost(f->pvq, f, &dump, i, &bits, s->lambda); + tdist += dist; + } + + *total_dist = tdist; + + return 0; +} + +static void celt_search_for_dual_stereo(OpusPsyContext *s, CeltFrame *f) +{ + float td1, td2; + f->dual_stereo = 0; + bands_dist(s, f, &td1); + f->dual_stereo = 1; + bands_dist(s, f, &td2); + + f->dual_stereo = td2 < td1; + s->dual_stereo_used += td2 < td1; +} + +static void celt_search_for_intensity(OpusPsyContext *s, CeltFrame *f) +{ + int i, best_band = CELT_MAX_BANDS - 1; + float dist, best_dist = FLT_MAX; + + /* TODO: fix, make some heuristic up here using the lambda value */ + float end_band = 0; + + for (i = f->end_band; i >= end_band; i--) { + f->intensity_stereo = i; + bands_dist(s, 
f, &dist); + if (best_dist > dist) { + best_dist = dist; + best_band = i; + } + } + + f->intensity_stereo = best_band; + s->avg_is_band = (s->avg_is_band + f->intensity_stereo)/2.0f; +} + +static int celt_search_for_tf(OpusPsyContext *s, OpusPsyStep **start, CeltFrame *f) +{ + int i, j, k, cway, config[2][CELT_MAX_BANDS] = { { 0 } }; + float score[2] = { 0 }; + + for (cway = 0; cway < 2; cway++) { + int mag[2]; + int base = f->transient ? 120 : 960; + + for (int i = 0; i < 2; i++) { + int c = ff_celt_tf_select[f->size][f->transient][cway][i]; + mag[i] = c < 0 ? base >> FFABS(c) : base << FFABS(c); + } + + for (i = 0; i < CELT_MAX_BANDS; i++) { + float iscore0 = 0.0f; + float iscore1 = 0.0f; + for (j = 0; j < (1 << f->size); j++) { + for (k = 0; k < s->avctx->channels; k++) { + iscore0 += start[j]->tone[k][i]*start[j]->change_amp[k][i]/mag[0]; + iscore1 += start[j]->tone[k][i]*start[j]->change_amp[k][i]/mag[1]; + } + } + config[cway][i] = FFABS(iscore0 - 1.0f) < FFABS(iscore1 - 1.0f); + score[cway] += config[cway][i] ? iscore1 : iscore0; + } + } + + f->tf_select = score[0] < score[1]; + memcpy(f->tf_change, config[f->tf_select], sizeof(int)*CELT_MAX_BANDS); + + return 0; +} + +int ff_opus_psy_celt_frame_process(OpusPsyContext *s, CeltFrame *f, int index) +{ + int start_transient_flag = f->transient; + OpusPsyStep **start = &s->steps[index * (1 << s->p.framesize)]; + + if (f->silence) + return 0; + + celt_gauge_psy_weight(s, start, f); + celt_search_for_intensity(s, f); + celt_search_for_dual_stereo(s, f); + celt_search_for_tf(s, start, f); + + if (f->transient != start_transient_flag) { + f->blocks = f->transient ? 
OPUS_BLOCK_SIZE(s->p.framesize)/CELT_OVERLAP : 1; + s->redo_analysis = 1; + return 1; + } + + s->redo_analysis = 0; + + return 0; +} + +void ff_opus_psy_postencode_update(OpusPsyContext *s, CeltFrame *f, OpusRangeCoder *rc) +{ + int i, frame_size = OPUS_BLOCK_SIZE(s->p.framesize); + int steps_out = s->p.frames*(frame_size/120); + void *tmp[FF_BUFQUEUE_SIZE]; + float ideal_fbits; + + for (i = 0; i < steps_out; i++) + memset(s->steps[i], 0, sizeof(OpusPsyStep)); + + for (i = 0; i < s->max_steps; i++) + tmp[i] = s->steps[i]; + + for (i = 0; i < s->max_steps; i++) { + const int i_new = i - steps_out; + s->steps[i_new < 0 ? s->max_steps + i_new : i_new] = tmp[i]; + } + + for (i = steps_out; i < s->buffered_steps; i++) + s->steps[i]->index -= steps_out; + + ideal_fbits = s->avctx->bit_rate/(s->avctx->sample_rate/frame_size); + + for (i = 0; i < s->p.frames; i++) { + s->avg_is_band += f[i].intensity_stereo; + s->lambda *= ideal_fbits / f[i].framebits; + } + + s->avg_is_band /= (s->p.frames + 1); + + s->cs_num = 0; + s->steps_to_process = 0; + s->buffered_steps -= steps_out; + s->total_packets_out += s->p.frames; + s->inflection_points_count = 0; +} + +av_cold int ff_opus_psy_init(OpusPsyContext *s, AVCodecContext *avctx, + struct FFBufQueue *bufqueue, OpusEncOptions *options) +{ + int i, ch, ret; + + s->redo_analysis = 0; + s->lambda = 1.0f; + s->options = options; + s->avctx = avctx; + s->bufqueue = bufqueue; + s->max_steps = ceilf(s->options->max_delay_ms/2.5f); + s->bsize_analysis = CELT_BLOCK_960; + s->avg_is_band = CELT_MAX_BANDS - 1; + s->inflection_points_count = 0; + + s->inflection_points = av_mallocz(sizeof(*s->inflection_points)*s->max_steps); + if (!s->inflection_points) { + ret = AVERROR(ENOMEM); + goto fail; + } + + s->dsp = avpriv_float_dsp_alloc(avctx->flags & AV_CODEC_FLAG_BITEXACT); + if (!s->dsp) { + ret = AVERROR(ENOMEM); + goto fail; + } + + for (ch = 0; ch < s->avctx->channels; ch++) { + for (i = 0; i < CELT_MAX_BANDS; i++) { + 
bessel_init(&s->bfilter_hi[ch][i], 1.0f, 19.0f, 100.0f, 1); + bessel_init(&s->bfilter_lo[ch][i], 1.0f, 20.0f, 100.0f, 0); + } + } + + for (i = 0; i < s->max_steps; i++) { + s->steps[i] = av_mallocz(sizeof(OpusPsyStep)); + if (!s->steps[i]) { + ret = AVERROR(ENOMEM); + goto fail; + } + } + + for (i = 0; i < CELT_BLOCK_NB; i++) { + float tmp; + const int len = OPUS_BLOCK_SIZE(i); + s->window[i] = av_malloc(2*len*sizeof(float)); + if (!s->window[i]) { + ret = AVERROR(ENOMEM); + goto fail; + } + ff_generate_window_func(s->window[i], 2*len, WFUNC_SINE, &tmp); + if ((ret = ff_mdct15_init(&s->mdct[i], 0, i + 3, 68 << (CELT_BLOCK_NB - 1 - i)))) + goto fail; + } + + return 0; + +fail: + av_freep(&s->inflection_points); + av_freep(&s->dsp); + + for (i = 0; i < CELT_BLOCK_NB; i++) { + ff_mdct15_uninit(&s->mdct[i]); + av_freep(&s->window[i]); + } + + for (i = 0; i < s->max_steps; i++) + av_freep(&s->steps[i]); + + return ret; +} + +void ff_opus_psy_signal_eof(OpusPsyContext *s) +{ + s->eof = 1; +} + +av_cold int ff_opus_psy_end(OpusPsyContext *s) +{ + int i; + + av_freep(&s->inflection_points); + av_freep(&s->dsp); + + for (i = 0; i < CELT_BLOCK_NB; i++) { + ff_mdct15_uninit(&s->mdct[i]); + av_freep(&s->window[i]); + } + + for (i = 0; i < s->max_steps; i++) + av_freep(&s->steps[i]); + + av_log(s->avctx, AV_LOG_INFO, "Average Intensity Stereo band: %0.1f\n", s->avg_is_band); + av_log(s->avctx, AV_LOG_INFO, "Dual Stereo used: %0.2f%%\n", ((float)s->dual_stereo_used/s->total_packets_out)*100.0f); + + return 0; +} diff --git a/libavcodec/opusenc_psy.h b/libavcodec/opusenc_psy.h new file mode 100644 index 0000000000..b91e4f1b8b --- /dev/null +++ b/libavcodec/opusenc_psy.h @@ -0,0 +1,104 @@ +/* + * Opus encoder + * Copyright (c) 2017 Rostislav Pehlivanov <atomnuker@gmail.com> + * + * This file is part of FFmpeg. 
+ * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#ifndef AVCODEC_OPUSENC_PSY_H +#define AVCODEC_OPUSENC_PSY_H + +#include "opusenc.h" +#include "opusenc_utils.h" +#include "libavfilter/window_func.h" + +/* Each step is 2.5ms */ +typedef struct OpusPsyStep { + int index; /* Current index */ + int silence; + float energy[OPUS_MAX_CHANNELS][CELT_MAX_BANDS]; /* Masking effects included */ + float tone[OPUS_MAX_CHANNELS][CELT_MAX_BANDS]; /* Tonality */ + float stereo[CELT_MAX_BANDS]; /* IS/MS compatibility */ + float change_amp[OPUS_MAX_CHANNELS][CELT_MAX_BANDS]; /* Jump over last frame */ + float total_change; /* Total change */ + + float *bands[OPUS_MAX_CHANNELS][CELT_MAX_BANDS]; + float coeffs[OPUS_MAX_CHANNELS][OPUS_BLOCK_SIZE(CELT_BLOCK_960)]; +} OpusPsyStep; + +typedef struct OpusBandExcitation { + float excitation; + float excitation_dist; + float excitation_init; +} OpusBandExcitation; + +typedef struct PsyChain { + int start; + int end; +} PsyChain; + +typedef struct OpusPsyContext { + AVCodecContext *avctx; + AVFloatDSPContext *dsp; + struct FFBufQueue *bufqueue; + OpusEncOptions *options; + + PsyChain cs[128]; + int cs_num; + + OpusBandExcitation ex[OPUS_MAX_CHANNELS][CELT_MAX_BANDS]; + FFBesselFilter bfilter_lo[OPUS_MAX_CHANNELS][CELT_MAX_BANDS]; + FFBesselFilter 
bfilter_hi[OPUS_MAX_CHANNELS][CELT_MAX_BANDS]; + + OpusPsyStep *steps[FF_BUFQUEUE_SIZE + 1]; + int max_steps; + + float *window[CELT_BLOCK_NB]; + MDCT15Context *mdct[CELT_BLOCK_NB]; + int bsize_analysis; + + DECLARE_ALIGNED(32, float, scratch)[2048]; + + /* Stats */ + float rc_waste; + float avg_is_band; + int64_t dual_stereo_used; + int64_t total_packets_out; + + /* State */ + FFBesselFilter lambda_lp; + OpusPacketInfo p; + int redo_analysis; + int buffered_steps; + int steps_to_process; + int eof; + float lambda; + int *inflection_points; + int inflection_points_count; +} OpusPsyContext; + +int ff_opus_psy_process (OpusPsyContext *s, OpusPacketInfo *p); +void ff_opus_psy_celt_frame_init (OpusPsyContext *s, CeltFrame *f, int index); +int ff_opus_psy_celt_frame_process(OpusPsyContext *s, CeltFrame *f, int index); +void ff_opus_psy_postencode_update (OpusPsyContext *s, CeltFrame *f, OpusRangeCoder *rc); + +int ff_opus_psy_init(OpusPsyContext *s, AVCodecContext *avctx, + struct FFBufQueue *bufqueue, OpusEncOptions *options); +void ff_opus_psy_signal_eof(OpusPsyContext *s); +int ff_opus_psy_end(OpusPsyContext *s); + +#endif /* AVCODEC_OPUSENC_PSY_H */ diff --git a/libavcodec/opusenc_utils.h b/libavcodec/opusenc_utils.h new file mode 100644 index 0000000000..8b9c5bffaf --- /dev/null +++ b/libavcodec/opusenc_utils.h @@ -0,0 +1,82 @@ +/* + * Opus encoder + * Copyright (c) 2017 Rostislav Pehlivanov <atomnuker@gmail.com> + * + * This file is part of FFmpeg. + * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. 
+ * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#include "opus.h" + +typedef struct FFBesselFilter { + float a[3]; + float b[2]; + float x[3]; + float y[3]; +} FFBesselFilter; + +/* Fills the coefficients, returns 1 if filter will be unstable */ +static inline int bessel_reinit(FFBesselFilter *s, float n, float f0, float fs, + int highpass) +{ + int unstable; + float c, cfreq, w0, k1, k2; + + if (!highpass) { + c = (1.0f/sqrtf(sqrtf(pow(2.0f, 1.0f/n) - 3.0f/4.0f) - 0.5f))/sqrtf(3.0f); + cfreq = c*f0/fs; + unstable = (cfreq <= 0.0f || cfreq >= 1.0f/4.0f); + } else { + c = sqrtf(3.0f)*sqrtf(sqrtf(pow(2.0f, 1.0f/n) - 3.0f/4.0f) - 0.5f); + cfreq = 0.5f - c*f0/fs; + unstable = (cfreq <= 3.0f/8.0f || cfreq >= 1.0f/2.0f); + } + + w0 = tanf(M_PI*cfreq); + k1 = 3.0f * w0; + k2 = 3.0f * w0; + + s->a[0] = k2/(1.0f + k1 + k2); + s->a[1] = 2.0f * s->a[0]; + s->a[2] = s->a[0]; + s->b[0] = 2.0f * s->a[0] * (1.0f/k2 - 1.0f); + s->b[1] = 1.0f - (s->a[0] + s->a[1] + s->a[2] + s->b[0]); + + if (highpass) { + s->a[1] *= -1; + s->b[0] *= -1; + } + + return unstable; +} + +static inline int bessel_init(FFBesselFilter *s, float n, float f0, float fs, + int highpass) +{ + memset(s, 0, sizeof(FFBesselFilter)); + return bessel_reinit(s, n, f0, fs, highpass); +} + +static inline float bessel_filter(FFBesselFilter *s, float x) +{ + s->x[2] = s->x[1]; + s->x[1] = s->x[0]; + s->x[0] = x; + s->y[2] = s->y[1]; + s->y[1] = s->y[0]; + s->y[0] = s->a[0]*s->x[0] + s->a[1]*s->x[1] + s->a[2]*s->x[2] + s->b[0]*s->y[1] + s->b[1]*s->y[2]; + return s->y[0]; +} -- 2.14.1.821.g8fa685d3b7

[FFmpeg-devel,2/2] opusenc: (WIP) add a new psychoacoustic system for the native Opus encoder

Commit Message

Comments

Patch