diff mbox

[FFmpeg-devel,1/3] mpegvideo_parser: implement parsing of the picture structure field

Message ID 20180211143752.7851-2-jeebjp@gmail.com
State New
Headers show

Commit Message

Jan Ekström Feb. 11, 2018, 2:37 p.m. UTC
From: Masaki Tanaka <maki.rxrz@gmail.com>

Lets one receive the proper field order from pictures coded in
field picture mode, until now forcibly set to BFF.
---
 libavcodec/mpegvideo_parser.c | 16 +++++++++++++++-
 1 file changed, 15 insertions(+), 1 deletion(-)

Comments

Michael Niedermayer Feb. 11, 2018, 5:38 p.m. UTC | #1
On Sun, Feb 11, 2018 at 04:37:50PM +0200, Jan Ekström wrote:
> From: Masaki Tanaka <maki.rxrz@gmail.com>
> 
> Lets one receive the proper field order from pictures coded in
> field picture mode, until now forcibly set to BFF.
> ---
>  libavcodec/mpegvideo_parser.c | 16 +++++++++++++++-
>  1 file changed, 15 insertions(+), 1 deletion(-)

probably ok

[...]
Yusuke Nakamura Feb. 11, 2018, 8:37 p.m. UTC | #2
2018-02-11 23:37 GMT+09:00 Jan Ekström <jeebjp@gmail.com>:

> From: Masaki Tanaka <maki.rxrz@gmail.com>
>
> Lets one receive the proper field order from pictures coded in
> field picture mode, until now forcibly set to BFF.
> ---
>  libavcodec/mpegvideo_parser.c | 16 +++++++++++++++-
>  1 file changed, 15 insertions(+), 1 deletion(-)
>
> diff --git a/libavcodec/mpegvideo_parser.c b/libavcodec/mpegvideo_parser.c
> index be240b6890..3406346a8b 100644
> --- a/libavcodec/mpegvideo_parser.c
> +++ b/libavcodec/mpegvideo_parser.c
> @@ -41,7 +41,7 @@ static void mpegvideo_extract_headers(AVCodecParserContext
> *s,
>      uint32_t start_code;
>      int frame_rate_index, ext_type, bytes_left;
>      int frame_rate_ext_n, frame_rate_ext_d;
> -    int top_field_first, repeat_first_field, progressive_frame;
> +    int picture_structure, top_field_first, repeat_first_field,
> progressive_frame;
>      int horiz_size_ext, vert_size_ext, bit_rate_ext;
>      int did_set_size=0;
>      int set_dim_ret = 0;
> @@ -51,6 +51,7 @@ static void mpegvideo_extract_headers(AVCodecParserContext
> *s,
>      enum AVPixelFormat pix_fmt = AV_PIX_FMT_NONE;
>  //FIXME replace the crap with get_bits()
>      s->repeat_pict = 0;
> +    s->picture_structure = AV_PICTURE_STRUCTURE_UNKNOWN;
>
>      while (buf < buf_end) {
>          start_code= -1;
> @@ -114,6 +115,7 @@ static void mpegvideo_extract_headers(AVCodecParserContext
> *s,
>                      break;
>                  case 0x8: /* picture coding extension */
>                      if (bytes_left >= 5) {
> +                        picture_structure = buf[2] & 0x03;
>                          top_field_first = buf[3] & (1 << 7);
>                          repeat_first_field = buf[3] & (1 << 1);
>                          progressive_frame = buf[4] & (1 << 7);
> @@ -138,6 +140,18 @@ static void mpegvideo_extract_headers(AVCodecParserContext
> *s,
>                                  s->field_order = AV_FIELD_BB;
>                          } else
>                              s->field_order = AV_FIELD_PROGRESSIVE;
> +
> +                        switch (picture_structure) {
> +                        case PICT_TOP_FIELD:
> +                            s->picture_structure =
> AV_PICTURE_STRUCTURE_TOP_FIELD;
> +                            break;
> +                        case PICT_BOTTOM_FIELD:
> +                            s->picture_structure =
> AV_PICTURE_STRUCTURE_BOTTOM_FIELD;
> +                            break;
> +                        case PICT_FRAME:
> +                            s->picture_structure =
> AV_PICTURE_STRUCTURE_FRAME;
> +                            break;
> +                        }
>

Libavcodec handles MPEG-2 Video packet per frame but not picture, and the
parser stops immediately after parsing the first slice of the picture. So,
the parser returns per full frame but picture_structure says it's a field
picture. This is evil and  this is the reason why I hesitate to post this
patch. I'm interested in how other devs consider this evil solution. I
think the parser should parse the second field too if encountering a field
coded picture, and treat the packet as frame coded and set field_order
appropriate.


>                      }
>                      break;
>                  }
> --
> 2.14.3
>
> _______________________________________________
> ffmpeg-devel mailing list
> ffmpeg-devel@ffmpeg.org
> http://ffmpeg.org/mailman/listinfo/ffmpeg-devel
>
Michael Niedermayer Feb. 11, 2018, 10:23 p.m. UTC | #3
On Mon, Feb 12, 2018 at 05:37:32AM +0900, Yusuke Nakamura wrote:
> 2018-02-11 23:37 GMT+09:00 Jan Ekström <jeebjp@gmail.com>:
> 
> > From: Masaki Tanaka <maki.rxrz@gmail.com>
> >
> > Lets one receive the proper field order from pictures coded in
> > field picture mode, until now forcibly set to BFF.
> > ---
> >  libavcodec/mpegvideo_parser.c | 16 +++++++++++++++-
> >  1 file changed, 15 insertions(+), 1 deletion(-)
> >
> > diff --git a/libavcodec/mpegvideo_parser.c b/libavcodec/mpegvideo_parser.c
> > index be240b6890..3406346a8b 100644
> > --- a/libavcodec/mpegvideo_parser.c
> > +++ b/libavcodec/mpegvideo_parser.c
> > @@ -41,7 +41,7 @@ static void mpegvideo_extract_headers(AVCodecParserContext
> > *s,
> >      uint32_t start_code;
> >      int frame_rate_index, ext_type, bytes_left;
> >      int frame_rate_ext_n, frame_rate_ext_d;
> > -    int top_field_first, repeat_first_field, progressive_frame;
> > +    int picture_structure, top_field_first, repeat_first_field,
> > progressive_frame;
> >      int horiz_size_ext, vert_size_ext, bit_rate_ext;
> >      int did_set_size=0;
> >      int set_dim_ret = 0;
> > @@ -51,6 +51,7 @@ static void mpegvideo_extract_headers(AVCodecParserContext
> > *s,
> >      enum AVPixelFormat pix_fmt = AV_PIX_FMT_NONE;
> >  //FIXME replace the crap with get_bits()
> >      s->repeat_pict = 0;
> > +    s->picture_structure = AV_PICTURE_STRUCTURE_UNKNOWN;
> >
> >      while (buf < buf_end) {
> >          start_code= -1;
> > @@ -114,6 +115,7 @@ static void mpegvideo_extract_headers(AVCodecParserContext
> > *s,
> >                      break;
> >                  case 0x8: /* picture coding extension */
> >                      if (bytes_left >= 5) {
> > +                        picture_structure = buf[2] & 0x03;
> >                          top_field_first = buf[3] & (1 << 7);
> >                          repeat_first_field = buf[3] & (1 << 1);
> >                          progressive_frame = buf[4] & (1 << 7);
> > @@ -138,6 +140,18 @@ static void mpegvideo_extract_headers(AVCodecParserContext
> > *s,
> >                                  s->field_order = AV_FIELD_BB;
> >                          } else
> >                              s->field_order = AV_FIELD_PROGRESSIVE;
> > +
> > +                        switch (picture_structure) {
> > +                        case PICT_TOP_FIELD:
> > +                            s->picture_structure =
> > AV_PICTURE_STRUCTURE_TOP_FIELD;
> > +                            break;
> > +                        case PICT_BOTTOM_FIELD:
> > +                            s->picture_structure =
> > AV_PICTURE_STRUCTURE_BOTTOM_FIELD;
> > +                            break;
> > +                        case PICT_FRAME:
> > +                            s->picture_structure =
> > AV_PICTURE_STRUCTURE_FRAME;
> > +                            break;
> > +                        }
> >
> 
> Libavcodec handles MPEG-2 Video packet per frame but not picture, and the
> parser stops immediately after parsing the first slice of the picture. So,
> the parser returns per full frame but picture_structure says it's a field
> picture. This is evil and  this is the reason why I hesitate to post this
> patch. I'm interested in how other devs consider this evil solution. I
> think the parser should parse the second field too if encountering a field
> coded picture, and treat the packet as frame coded and set field_order
> appropriate.

I think a better API is needed to export the picture_structure correctly.

With the API here, i think unspecified is better than delcaring field pictures
to be frames. The field value is also what decoders would use, they have to
2 field pics arent the same as a frame picture

[...]
Jan Ekström Feb. 12, 2018, 1:13 a.m. UTC | #4
On Mon, Feb 12, 2018 at 12:23 AM, Michael Niedermayer
<michael@niedermayer.cc> wrote:
>
> I think a better API is needed to export the picture_structure correctly.
>

I might be misunderstanding the problem at hand, but I'm not sure if a
better API is required right now in the sense that if we define that:
* the demuxer and/or parser should return a decode'able coding unit
(whether or not it can actually be decoded depends on the state of
things). In case of field coded pictures this would be one coded
field, if I understand correctly.
* and, if the decoder then needs two coded field pictures to generate
a combed together "frame" - so be it. The new decoding/encoding APIs
let you have a non-synchronized amount of input VS output.

The primary part of course being that we shouldn't be ignoring the
other field picture in the parser if they are in the same PES packet,
for example.

> With the API here, i think unspecified is better than delcaring field pictures
> to be frames. The field value is also what decoders would use, they have to
> 2 field pics arent the same as a frame picture
>

Yes, if we are not going to be separating them in the parser as two
decode'able units, then we at the very least would have to parse both
and set the field order flags for that packet of two field pictures
correctly.
Michael Niedermayer Feb. 13, 2018, 12:13 p.m. UTC | #5
Hi

On Mon, Feb 12, 2018 at 03:13:46AM +0200, Jan Ekström wrote:
> On Mon, Feb 12, 2018 at 12:23 AM, Michael Niedermayer
> <michael@niedermayer.cc> wrote:
> >
> > I think a better API is needed to export the picture_structure correctly.
> >
> 
> I might be misunderstanding the problem at hand, but I'm not sure if a
> better API is required right now in the sense that if we define that:

> * the demuxer and/or parser should return a decode'able coding unit

> (whether or not it can actually be decoded depends on the state of
> things). In case of field coded pictures this would be one coded
> field, if I understand correctly.

Whats a "decode'able coding unit" ?
A frame ? a field ? a slice ? an access unit ? a group of pictures ?

Should the user be able to choose at which level the parser should
split packets depening on her needs ?

currently and in the past the parser output was what was most convenient
to use for decoders and muxers internally, that was a "frame" when 
everything can be packetized as frames (no unpaired fields) or fields
when unpaired fields where possibly. In almost all parsers thats
also identical to an access unit.

If there are 2 fields in a packet that can be as 2 field pictures or
as a interlaced frame coded in a way thats inseperable. Then you have
2 timestamps really and might have information associated with each
field in principle. Our API doesnt have a place to put the information
for the 2nd field anywhere. (which is together with picture_structure
what i meant with needing a better API ...)

This exists more so if you would split at GOP level or wanted information
about slices from a parser returning fields.
(if a future API would support this)

spliting at slice level is for example usefull for network transmission
so that transmitted units are more aligned to slices.



> * and, if the decoder then needs two coded field pictures to generate
> a combed together "frame" - so be it. The new decoding/encoding APIs
> let you have a non-synchronized amount of input VS output.

We have several decoders that can be fed with input at lower granularity
like slices since a long time ago. So iam not sure how any "new APIs" are
related here


[...]
Jan Ekström Feb. 13, 2018, 6:39 p.m. UTC | #6
On Tue, Feb 13, 2018 at 2:13 PM, Michael Niedermayer
<michael@niedermayer.cc> wrote:
> ...
> If there are 2 fields in a packet that can be as 2 field pictures or
> as a interlaced frame coded in a way thats inseperable. Then you have
> 2 timestamps really and might have information associated with each
> field in principle. Our API doesnt have a place to put the information
> for the 2nd field anywhere. (which is together with picture_structure
> what i meant with needing a better API ...)
>

OK, so they are inseparable? Then it makes sense to pass them
together. Until now I only had the information that the two pictures
were in the same PES packet.

In that case the best thing we can do is set the field order correctly
for the set by parsing both field pictures' information. I just
wondered if the two field pictures are separately coded pictures, just
like with PAFF in AVC or how interlacing is generally done with HEVC
(which we even decode as separate fields right now).

Thus, I was just trying to query if the two field pictures were indeed
two separate images, and thus could be properly tagged and passed
through, without the latter picture apparently getting ignored. For
that at this point of time no new APIs would be needed, only work with
the parser.

>
> We have several decoders that can be fed with input at lower granularity
> like slices since a long time ago. So iam not sure how any "new APIs" are
> related here
>

Yes, although I think the feed/receive APIs make it work both ways
(you can feed more or receive more).

Jan
Michael Niedermayer Feb. 13, 2018, 6:55 p.m. UTC | #7
On Tue, Feb 13, 2018 at 08:39:09PM +0200, Jan Ekström wrote:
> On Tue, Feb 13, 2018 at 2:13 PM, Michael Niedermayer
> <michael@niedermayer.cc> wrote:
> > ...
> > If there are 2 fields in a packet that can be as 2 field pictures or
> > as a interlaced frame coded in a way thats inseperable. Then you have
> > 2 timestamps really and might have information associated with each
> > field in principle. Our API doesnt have a place to put the information
> > for the 2nd field anywhere. (which is together with picture_structure
> > what i meant with needing a better API ...)
> >
> 
> OK, so they are inseparable? Then it makes sense to pass them
> together. Until now I only had the information that the two pictures
> were in the same PES packet.
> 
> In that case the best thing we can do is set the field order correctly
> for the set by parsing both field pictures' information. I just
> wondered if the two field pictures are separately coded pictures, just
> like with PAFF in AVC or how interlacing is generally done with HEVC
> (which we even decode as separate fields right now).
> 
> Thus, I was just trying to query if the two field pictures were indeed
> two separate images, and thus could be properly tagged and passed
> through, without the latter picture apparently getting ignored. For
> that at this point of time no new APIs would be needed, only work with
> the parser.

interlaced video can be stored in more than one way, either in a inseperable
way or seperable field pictures.

Abother inseparable case is for example H263 PB frames, coding 2 frames
together. 
For all these cases parsers should idealy be able to output information
about both 


> 
> >
> > We have several decoders that can be fed with input at lower granularity
> > like slices since a long time ago. So iam not sure how any "new APIs" are
> > related here
> >
> 
> Yes, although I think the feed/receive APIs make it work both ways
> (you can feed more or receive more).
> 
> Jan
> _______________________________________________
> ffmpeg-devel mailing list
> ffmpeg-devel@ffmpeg.org
> http://ffmpeg.org/mailman/listinfo/ffmpeg-devel
diff mbox

Patch

diff --git a/libavcodec/mpegvideo_parser.c b/libavcodec/mpegvideo_parser.c
index be240b6890..3406346a8b 100644
--- a/libavcodec/mpegvideo_parser.c
+++ b/libavcodec/mpegvideo_parser.c
@@ -41,7 +41,7 @@  static void mpegvideo_extract_headers(AVCodecParserContext *s,
     uint32_t start_code;
     int frame_rate_index, ext_type, bytes_left;
     int frame_rate_ext_n, frame_rate_ext_d;
-    int top_field_first, repeat_first_field, progressive_frame;
+    int picture_structure, top_field_first, repeat_first_field, progressive_frame;
     int horiz_size_ext, vert_size_ext, bit_rate_ext;
     int did_set_size=0;
     int set_dim_ret = 0;
@@ -51,6 +51,7 @@  static void mpegvideo_extract_headers(AVCodecParserContext *s,
     enum AVPixelFormat pix_fmt = AV_PIX_FMT_NONE;
 //FIXME replace the crap with get_bits()
     s->repeat_pict = 0;
+    s->picture_structure = AV_PICTURE_STRUCTURE_UNKNOWN;
 
     while (buf < buf_end) {
         start_code= -1;
@@ -114,6 +115,7 @@  static void mpegvideo_extract_headers(AVCodecParserContext *s,
                     break;
                 case 0x8: /* picture coding extension */
                     if (bytes_left >= 5) {
+                        picture_structure = buf[2] & 0x03;
                         top_field_first = buf[3] & (1 << 7);
                         repeat_first_field = buf[3] & (1 << 1);
                         progressive_frame = buf[4] & (1 << 7);
@@ -138,6 +140,18 @@  static void mpegvideo_extract_headers(AVCodecParserContext *s,
                                 s->field_order = AV_FIELD_BB;
                         } else
                             s->field_order = AV_FIELD_PROGRESSIVE;
+
+                        switch (picture_structure) {
+                        case PICT_TOP_FIELD:
+                            s->picture_structure = AV_PICTURE_STRUCTURE_TOP_FIELD;
+                            break;
+                        case PICT_BOTTOM_FIELD:
+                            s->picture_structure = AV_PICTURE_STRUCTURE_BOTTOM_FIELD;
+                            break;
+                        case PICT_FRAME:
+                            s->picture_structure = AV_PICTURE_STRUCTURE_FRAME;
+                            break;
+                        }
                     }
                     break;
                 }