diff mbox series

[FFmpeg-devel,RFC] avformat/rtpdec: Audio level RTP extension RFC6464

Message ID 20240211053038.74908-1-jon@jonb.org
State New
Headers show
Series [FFmpeg-devel,RFC] avformat/rtpdec: Audio level RTP extension RFC6464 | expand

Checks

Context Check Description
yinshiyou/make_loongarch64 success Make finished
yinshiyou/make_fate_loongarch64 success Make fate finished
andriy/make_x86 success Make finished
andriy/make_fate_x86 success Make fate finished

Commit Message

jon@jonb.org Feb. 11, 2024, 5:30 a.m. UTC
From: Jonathan Baudanza <jon@jonb.org>

libwebrtc attaches the audio level (in dBov) and the VAD status to each RTP
packet using the ssrc-audio-level header extension defined by RFC 6464.

This patch exports both values as packet side data.

I've been using this patch in production for about a year on live audio RTP
streams to detect when users are speaking without needing to decode the audio
data.
---
 libavcodec/avpacket.c |  1 +
 libavcodec/defs.h     | 15 ++++++++
 libavcodec/packet.h   |  5 +++
 libavformat/rtpdec.c  | 87 +++++++++++++++++++++++++++++++++++++++++++
 libavformat/rtpdec.h  |  5 +++
 libavformat/rtsp.c    | 16 ++++++++
 libavformat/rtsp.h    |  2 +
 7 files changed, 131 insertions(+)
diff mbox series

Patch

diff --git a/libavcodec/avpacket.c b/libavcodec/avpacket.c
index e118bbaad1..73e0341bf7 100644
--- a/libavcodec/avpacket.c
+++ b/libavcodec/avpacket.c
@@ -305,6 +305,7 @@  const char *av_packet_side_data_name(enum AVPacketSideDataType type)
     case AV_PKT_DATA_IAMF_MIX_GAIN_PARAM:        return "IAMF Mix Gain Parameter Data";
     case AV_PKT_DATA_IAMF_DEMIXING_INFO_PARAM:   return "IAMF Demixing Info Parameter Data";
     case AV_PKT_DATA_IAMF_RECON_GAIN_INFO_PARAM: return "IAMF Recon Gain Info Parameter Data";
+    case AV_PKT_DATA_SSRC_AUDIO_LEVEL:           return "RTP SSRC Audio Level";
     }
     return NULL;
 }
diff --git a/libavcodec/defs.h b/libavcodec/defs.h
index 00d840ec19..87e8814760 100644
--- a/libavcodec/defs.h
+++ b/libavcodec/defs.h
@@ -323,6 +323,21 @@  typedef struct AVProducerReferenceTime {
     int flags;
 } AVProducerReferenceTime;
 
+/**
+ * Per-packet audio level from the ssrc-audio-level RTP header extension (RFC 6464).
+ */
+typedef struct AVAudioLevel {
+    /**
+     * Audio level of this packet in dBov, from -127 (quietest) to 0 (loudest).
+     */
+    int8_t level;
+
+    /**
+     * 1 if the sender flagged this packet as containing voice (VAD bit), else 0.
+     */
+    int voice;
+} AVAudioLevel;
+
 /**
  * Encode extradata length to a buffer. Used by xiph codecs.
  *
diff --git a/libavcodec/packet.h b/libavcodec/packet.h
index 8558ae849e..f7f1deb6e0 100644
--- a/libavcodec/packet.h
+++ b/libavcodec/packet.h
@@ -330,6 +330,11 @@  enum AVPacketSideDataType {
     */
     AV_PKT_DATA_AMBIENT_VIEWING_ENVIRONMENT,
 
+    /**
+     * Audio Level and VAD data from the RTP header extension as defined by RFC 6464.
+     */
+    AV_PKT_DATA_SSRC_AUDIO_LEVEL,
+
     /**
      * The number of side data types.
      * This is not part of the public API/ABI in the sense that it may
diff --git a/libavformat/rtpdec.c b/libavformat/rtpdec.c
index fa7544cc07..479ea2e245 100644
--- a/libavformat/rtpdec.c
+++ b/libavformat/rtpdec.c
@@ -694,6 +694,79 @@  static void finalize_packet(RTPDemuxContext *s, AVPacket *pkt, uint32_t timestam
                    s->base_timestamp;
 }
 
+
+/**
+ * Locate one element inside an RTP header extension (RFC 5285/8285).
+ *
+ * @param id  local identifier of the wanted element
+ * @param buf start of the header extension: 16-bit profile word, 16-bit
+ *            length in 32-bit words, then the elements. The caller must have
+ *            checked that the whole extension lies inside the packet.
+ * @param len if not NULL, receives the element's data size in bytes, or 0
+ *            when NULL is returned
+ * @return pointer to the element's data (at least one byte), or NULL if the
+ *         element is absent, empty, truncated, or the format is unknown
+ */
+static const uint8_t *find_header_ext_data(int id, const uint8_t *buf, uint8_t *len)
+{
+    const uint8_t *p = buf + 4;
+    int buflen = AV_RB16(buf + 2) * 4;
+    int idx = 0;
+    int hdr_len, this_id, this_len;
+
+    if (len != NULL)
+        *len = 0;
+
+    if (buf[0] == 0xbe && buf[1] == 0xde) {
+        // One-byte element headers: 4-bit id, 4-bit (length - 1).
+        hdr_len = 1;
+    } else if (buf[0] == 0x10 && (buf[1] & 0xf0) == 0) {
+        // Two-byte element headers: 8-bit id, 8-bit length. The profile word
+        // is 0x100 in its upper 12 bits; the low 4 bits are application
+        // defined ("appbits") and must not take part in the match.
+        hdr_len = 2;
+    } else {
+        return NULL;
+    }
+
+    while (idx + 1 < buflen) {
+        if (p[idx] == 0) {
+            idx++; // skip padding
+            continue;
+        }
+
+        if (hdr_len == 1) {
+            this_id  = p[idx] >> 4;
+            this_len = (p[idx] & 0xf) + 1;
+
+            // Id 15 is reserved in the one-byte form only; parsing of the
+            // whole extension stops there.
+            if (this_id == 15)
+                break;
+        } else {
+            this_id  = p[idx];
+            this_len = p[idx + 1];
+        }
+
+        if (this_id == id) {
+            // Reject truncated elements, and empty ones (two-byte form only):
+            // callers dereference the result, and an empty element's data
+            // pointer may already lie past the extension.
+            if (this_len == 0 || this_len > buflen - idx - hdr_len)
+                break;
+
+            if (len != NULL)
+                *len = this_len;
+            return p + idx + hdr_len;
+        }
+
+        idx += hdr_len + this_len;
+    }
+
+    return NULL;
+}
+
+
 static int rtp_parse_packet_internal(RTPDemuxContext *s, AVPacket *pkt,
                                      const uint8_t *buf, int len)
 {
@@ -703,6 +776,7 @@  static int rtp_parse_packet_internal(RTPDemuxContext *s, AVPacket *pkt,
     AVStream *st;
     uint32_t timestamp;
     int rv = 0;
+    const uint8_t *audio_level_data = NULL;
 
     csrc         = buf[0] & 0x0f;
     ext          = buf[0] & 0x10;
@@ -753,6 +827,11 @@  static int rtp_parse_packet_internal(RTPDemuxContext *s, AVPacket *pkt,
 
         if (len < ext)
             return -1;
+
+        if (s->ssrc_audio_level_ext_id) {
+            audio_level_data = find_header_ext_data(s->ssrc_audio_level_ext_id, buf, NULL);
+        }
+
         // skip past RTP header extension
         len -= ext;
         buf += ext;
@@ -774,6 +853,14 @@  static int rtp_parse_packet_internal(RTPDemuxContext *s, AVPacket *pkt,
     // now perform timestamp things....
     finalize_packet(s, pkt, timestamp);
 
+    if (audio_level_data) {
+        AVAudioLevel *side_data = (struct AVAudioLevel *)av_packet_new_side_data(pkt, AV_PKT_DATA_SSRC_AUDIO_LEVEL, sizeof(AVAudioLevel));
+        if (side_data) {
+            side_data->voice = ((*audio_level_data & 0x80) == 0x80);
+            side_data->level = -(*audio_level_data & 0x7f);
+        }
+    }
+
     return rv;
 }
 
diff --git a/libavformat/rtpdec.h b/libavformat/rtpdec.h
index 5a02e72dc2..91a338200a 100644
--- a/libavformat/rtpdec.h
+++ b/libavformat/rtpdec.h
@@ -188,6 +188,11 @@  struct RTPDemuxContext {
     /* dynamic payload stuff */
     const RTPDynamicProtocolHandler *handler;
     PayloadContext *dynamic_protocol_context;
+
+    /**
+     * RFC 6464 header extension id
+     */
+    int ssrc_audio_level_ext_id;
 };
 
 /**
diff --git a/libavformat/rtsp.c b/libavformat/rtsp.c
index c7d9b48684..63bc67fdf7 100644
--- a/libavformat/rtsp.c
+++ b/libavformat/rtsp.c
@@ -691,6 +691,21 @@  static void sdp_parse_line(AVFormatContext *s, SDPParseState *s1,
                     }
                 }
             }
+        } else if (av_strstart(p, "extmap:", &p)) {
+            char *end;
+            int id;
+            id = strtol(p, &end, 10);
+            if (p == end) {
+                break;
+            }
+            p = end;
+
+            get_word(buf1, sizeof(buf1), &p);
+
+            if (!strcmp(buf1, "urn:ietf:params:rtp-hdrext:ssrc-audio-level")) {
+                rtsp_st = rt->rtsp_streams[rt->nb_rtsp_streams - 1];
+                rtsp_st->ssrc_audio_level_ext_id = id;
+            }
         } else {
             if (rt->server_type == RTSP_SERVER_WMS)
                 ff_wms_parse_sdp_a_line(s, p);
@@ -868,6 +883,7 @@  int ff_rtsp_open_transport_ctx(AVFormatContext *s, RTSPStream *rtsp_st)
                s->iformat) {
         RTPDemuxContext *rtpctx = rtsp_st->transport_priv;
         rtpctx->ssrc = rtsp_st->ssrc;
+        rtpctx->ssrc_audio_level_ext_id = rtsp_st->ssrc_audio_level_ext_id;
         if (rtsp_st->dynamic_handler) {
             ff_rtp_parse_set_dynamic_protocol(rtsp_st->transport_priv,
                                               rtsp_st->dynamic_protocol_context,
diff --git a/libavformat/rtsp.h b/libavformat/rtsp.h
index 83b2e3f4fb..4315bbe2c8 100644
--- a/libavformat/rtsp.h
+++ b/libavformat/rtsp.h
@@ -483,6 +483,8 @@  typedef struct RTSPStream {
 
     char crypto_suite[40];
     char crypto_params[100];
+
+    int ssrc_audio_level_ext_id;
 } RTSPStream;
 
 void ff_rtsp_parse_line(AVFormatContext *s,