diff mbox

[FFmpeg-devel] lavd/xcbgrab: do not try to create refcounted packets.

Message ID 20161023122937.21378-1-george@nsup.org
State Accepted
Commit 0bd1be65e88d6a4f367e698d7a2b105424eb1905
Headers show

Commit Message

Nicolas George Oct. 23, 2016, 12:29 p.m. UTC
The framework will allocate a buffer and copy the data to it,
that takes time. But it avoids constantly creating and
destroying the shared memory segment, and that saves more time.

On my setup,
from ~200 to ~300 FPS at full screen (1920×1200),
from ~1400 to ~3300 at smaller size (640×480),
similar to legacy x11grab.

Plus, shared memory segments are a scarce resource,
allocating potentially many is a bad idea.

Note: if the application were to drop all references to the
buffer before the next call to av_read_frame(), then passing
the shared memory segment as a refcounted buffer would be
even more efficient, but it is hard to guarantee, and it does
not happen with the ffmpeg command-line tool. Using a small
number of preallocated buffers and resorting to a copy when
the pool is exhausted would be a solution to get the best
of both worlds.

Signed-off-by: Nicolas George <george@nsup.org>
---
 libavdevice/xcbgrab.c | 65 +++++++++++++++++++++++++++------------------------
 1 file changed, 35 insertions(+), 30 deletions(-)

Comments

Michael Niedermayer Oct. 25, 2016, 5:28 p.m. UTC | #1
On Sun, Oct 23, 2016 at 02:29:37PM +0200, Nicolas George wrote:
> The framework will allocate a buffer and copy the data to it,
> that takes time. But it avoids constently creating and
> destroyng the shared memory segment, and that saves more time.
> 
> On my setup,
> from ~200 to ~300 FPS at full screen (1920×1200),
> from ~1400 to ~3300 at smaller size (640×480),
> similar to legacy x11grab.
> 
> Plus, shared memory segments are a scarce resource,
> allocating potentially many is a bad idea.
> 
> Note: if the application were to drop all references to the
> buffer before the next call to av_read_frame(), then passing
> the shared memory segment as a refcounted buffer would be
> even more efficient, but it is hard to guarantee, and it does
> not happen with the ffmpeg command-line tool. Using a small
> number of preallocated buffers and resorting to a copy when
> the pool is exhausted would be a solution to get the better
> of both worlds.
> 
> Signed-off-by: Nicolas George <george@nsup.org>
> ---
>  libavdevice/xcbgrab.c | 65 +++++++++++++++++++++++++++------------------------
>  1 file changed, 35 insertions(+), 30 deletions(-)

Tested-by: Michael
126fps -> 141 here

[...]
Clément Bœsch Oct. 25, 2016, 5:47 p.m. UTC | #2
On Sun, Oct 23, 2016 at 02:29:37PM +0200, Nicolas George wrote:
> The framework will allocate a buffer and copy the data to it,
> that takes time. But it avoids constently creating and
> destroyng the shared memory segment, and that saves more time.
> 
> On my setup,
> from ~200 to ~300 FPS at full screen (1920×1200),

./ffmpeg -framerate 10000 -f x11grab -video_size hd1080 -i :0.0 -t 20 -f null -

before: fps=324
after:  fps=627
Clément Bœsch Oct. 25, 2016, 5:56 p.m. UTC | #3
On Sun, Oct 23, 2016 at 02:29:37PM +0200, Nicolas George wrote:
> The framework will allocate a buffer and copy the data to it,
> that takes time. But it avoids constently creating and
> destroyng the shared memory segment, and that saves more time.
> 
> On my setup,
> from ~200 to ~300 FPS at full screen (1920×1200),
> from ~1400 to ~3300 at smaller size (640×480),
> similar to legacy x11grab.
> 
> Plus, shared memory segments are a scarce resource,
> allocating potentially many is a bad idea.
> 
> Note: if the application were to drop all references to the
> buffer before the next call to av_read_frame(), then passing
> the shared memory segment as a refcounted buffer would be
> even more efficient, but it is hard to guarantee, and it does
> not happen with the ffmpeg command-line tool. Using a small
> number of preallocated buffers and resorting to a copy when
> the pool is exhausted would be a solution to get the better
> of both worlds.
> 
> Signed-off-by: Nicolas George <george@nsup.org>
> ---
>  libavdevice/xcbgrab.c | 65 +++++++++++++++++++++++++++------------------------
>  1 file changed, 35 insertions(+), 30 deletions(-)
> 
> diff --git a/libavdevice/xcbgrab.c b/libavdevice/xcbgrab.c
> index 9da46c8..702e66c 100644
> --- a/libavdevice/xcbgrab.c
> +++ b/libavdevice/xcbgrab.c
> @@ -49,6 +49,8 @@
>  typedef struct XCBGrabContext {
>      const AVClass *class;
>  
> +    uint8_t *buffer;
> +
>      xcb_connection_t *conn;
>      xcb_screen_t *screen;
>      xcb_window_t window;
> @@ -219,22 +221,16 @@ static int check_shm(xcb_connection_t *conn)
>      return 0;
>  }
>  
> -static void dealloc_shm(void *unused, uint8_t *data)
> -{
> -    shmdt(data);
> -}
> -
> -static int xcbgrab_frame_shm(AVFormatContext *s, AVPacket *pkt)
> +static int allocate_shm(AVFormatContext *s)
>  {
>      XCBGrabContext *c = s->priv_data;
> -    xcb_shm_get_image_cookie_t iq;
> -    xcb_shm_get_image_reply_t *img;
> -    xcb_drawable_t drawable = c->screen->root;
> -    uint8_t *data;
>      int size = c->frame_size + AV_INPUT_BUFFER_PADDING_SIZE;
> -    int id   = shmget(IPC_PRIVATE, size, IPC_CREAT | 0777);
> -    xcb_generic_error_t *e = NULL;
> +    uint8_t *data;
> +    int id;
>  
> +    if (c->buffer)
> +        return 0;
> +    id = shmget(IPC_PRIVATE, size, IPC_CREAT | 0777);
>      if (id == -1) {
>          char errbuf[1024];
>          int err = AVERROR(errno);
> @@ -243,15 +239,31 @@ static int xcbgrab_frame_shm(AVFormatContext *s, AVPacket *pkt)
>                 size, errbuf);
>          return err;
>      }
> -
>      xcb_shm_attach(c->conn, c->segment, id, 0);
> +    data = shmat(id, NULL, 0);
> +    shmctl(id, IPC_RMID, 0);
> +    if ((intptr_t)data == -1 || !data)
> +        return AVERROR(errno);
> +    c->buffer = data;
> +    return 0;
> +}
> +
> +static int xcbgrab_frame_shm(AVFormatContext *s, AVPacket *pkt)
> +{
> +    XCBGrabContext *c = s->priv_data;
> +    xcb_shm_get_image_cookie_t iq;
> +    xcb_shm_get_image_reply_t *img;
> +    xcb_drawable_t drawable = c->screen->root;
> +    xcb_generic_error_t *e = NULL;
> +    int ret;
> +
> +    ret = allocate_shm(s);
> +    if (ret < 0)
> +        return ret;
>  
>      iq = xcb_shm_get_image(c->conn, drawable,
>                             c->x, c->y, c->width, c->height, ~0,
>                             XCB_IMAGE_FORMAT_Z_PIXMAP, c->segment, 0);
> -
> -    xcb_shm_detach(c->conn, c->segment);
> -
>      img = xcb_shm_get_image_reply(c->conn, iq, &e);
>  
>      xcb_flush(c->conn);
> @@ -264,25 +276,12 @@ static int xcbgrab_frame_shm(AVFormatContext *s, AVPacket *pkt)
>                 e->response_type, e->error_code,
>                 e->sequence, e->resource_id, e->minor_code, e->major_code);
>  
> -        shmctl(id, IPC_RMID, 0);
>          return AVERROR(EACCES);
>      }
>  
>      free(img);
>  
> -    data = shmat(id, NULL, 0);
> -    shmctl(id, IPC_RMID, 0);
> -
> -    if ((intptr_t)data == -1)
> -        return AVERROR(errno);
> -
> -    pkt->buf = av_buffer_create(data, size, dealloc_shm, NULL, 0);
> -    if (!pkt->buf) {
> -        shmdt(data);
> -        return AVERROR(ENOMEM);
> -    }
> -
> -    pkt->data = pkt->buf->data;
> +    pkt->data = c->buffer;
>      pkt->size = c->frame_size;

Sorry if this is a dumb question but: can you describe what happens if the
previous packet still held the same pkt->data = c->buffer?

That is, when and how does the buffer copy happen?

(no need for an av_buffer_create with RO flag?)
Nicolas George Oct. 25, 2016, 5:58 p.m. UTC | #4
Le quartidi 4 brumaire, an CCXXV, Clement Boesch a écrit :
> > The framework will allocate a buffer and copy the data to it,
> > that takes time.

> Sorry if this is a dumb question but: can you describe what happens if the
> previous packet still held the same pkt->data = c->buffer?
> 
> That is, when and how the buffer copy does happen?
> 
> (no need for a av_buffer_create with RO flag?)

Not dumb, but the answer was in the first sentence of the commit message.

The corresponding code is in ff_read_packet():

        if (!pkt->buf) {
            AVPacket tmp = { 0 };
            ret = av_packet_ref(&tmp, pkt);
            if (ret < 0)
                return ret;
            *pkt = tmp;
        }

And av_packet_ref() creates a refcounted buffer if the given one is not
refcounted.

Regards,
Clément Bœsch Oct. 25, 2016, 6:03 p.m. UTC | #5
On Tue, Oct 25, 2016 at 07:58:56PM +0200, Nicolas George wrote:
> Le quartidi 4 brumaire, an CCXXV, Clement Boesch a écrit :
> > > The framework will allocate a buffer and copy the data to it,
> > > that takes time.
> 
> > Sorry if this is a dumb question but: can you describe what happens if the
> > previous packet still held the same pkt->data = c->buffer?
> > 
> > That is, when and how the buffer copy does happen?
> > 
> > (no need for a av_buffer_create with RO flag?)
> 
> Not dumb, but the answer was in the first sentence of the commit message.
> 

I was wondering where and how,

> The corresponding code is in ff_read_packet():
> 
>         if (!pkt->buf) {
>             AVPacket tmp = { 0 };
>             ret = av_packet_ref(&tmp, pkt);
>             if (ret < 0)
>                 return ret;
>             *pkt = tmp;
>         }
> 
> And av_packet_ref() creates a refcounted buffer if the given one is not
> refcounted.

I see, that makes sense. Thanks for clarifying.
Sven C. Dack Oct. 25, 2016, 6:10 p.m. UTC | #6
On 23/10/16 13:29, Nicolas George wrote:
> The framework will allocate a buffer and copy the data to it,
> that takes time. But it avoids constently creating and
> destroyng the shared memory segment, and that saves more time.
>
> On my setup,
> from ~200 to ~300 FPS at full screen (1920×1200),
> from ~1400 to ~3300 at smaller size (640×480),
> similar to legacy x11grab.
>
> Plus, shared memory segments are a scarce resource,
> allocating potentially many is a bad idea.
>
> Note: if the application were to drop all references to the
> buffer before the next call to av_read_frame(), then passing
> the shared memory segment as a refcounted buffer would be
> even more efficient, but it is hard to guarantee, and it does
> not happen with the ffmpeg command-line tool. Using a small
> number of preallocated buffers and resorting to a copy when
> the pool is exhausted would be a solution to get the better
> of both worlds.

192fps -> 315fps (+64%)

Sven
Andy Furniss Oct. 27, 2016, 2:09 p.m. UTC | #7
Nicolas George wrote:
> The framework will allocate a buffer and copy the data to it,
> that takes time. But it avoids constently creating and
> destroyng the shared memory segment, and that saves more time.
>
> On my setup,
> from ~200 to ~300 FPS at full screen (1920×1200),
> from ~1400 to ~3300 at smaller size (640×480),
> similar to legacy x11grab.

Nice, thanks for doing this.

Beats legacy by a couple of fps on my old CPU and combined
with nv12 conversion makes the difference between being able
to do 1080p60 and not.

> Plus, shared memory segments are a scarce resource,
> allocating potentially many is a bad idea.
>
> Note: if the application were to drop all references to the
> buffer before the next call to av_read_frame(), then passing
> the shared memory segment as a refcounted buffer would be
> even more efficient, but it is hard to guarantee, and it does
> not happen with the ffmpeg command-line tool. Using a small
> number of preallocated buffers and resorting to a copy when
> the pool is exhausted would be a solution to get the better
> of both worlds.

Next 2160p60 :-) only joking, though for those with GPUs that do
shader BGR0 -> 420 CSC I guess it could make a difference - but then
in that case it would be even better if they could avoid  having to
copy over and back all together.
Nicolas George Nov. 3, 2016, 8:29 p.m. UTC | #8
Le sextidi 6 brumaire, an CCXXV, Andy Furniss a écrit :
> Beats legacy by a couple of fps on my old CPU and combined
> with nv12 conversion makes the difference between being able
> to do 1080p60 and not.

Thanks to everyone for all the testing.

I do not think I can take credit for beating the legacy implementation.
I suspect it is just a consequence of using xcb instead of Xlib.

Regards,
diff mbox

Patch

diff --git a/libavdevice/xcbgrab.c b/libavdevice/xcbgrab.c
index 9da46c8..702e66c 100644
--- a/libavdevice/xcbgrab.c
+++ b/libavdevice/xcbgrab.c
@@ -49,6 +49,8 @@ 
 typedef struct XCBGrabContext {
     const AVClass *class;
 
+    uint8_t *buffer;
+
     xcb_connection_t *conn;
     xcb_screen_t *screen;
     xcb_window_t window;
@@ -219,22 +221,16 @@  static int check_shm(xcb_connection_t *conn)
     return 0;
 }
 
-static void dealloc_shm(void *unused, uint8_t *data)
-{
-    shmdt(data);
-}
-
-static int xcbgrab_frame_shm(AVFormatContext *s, AVPacket *pkt)
+static int allocate_shm(AVFormatContext *s)
 {
     XCBGrabContext *c = s->priv_data;
-    xcb_shm_get_image_cookie_t iq;
-    xcb_shm_get_image_reply_t *img;
-    xcb_drawable_t drawable = c->screen->root;
-    uint8_t *data;
     int size = c->frame_size + AV_INPUT_BUFFER_PADDING_SIZE;
-    int id   = shmget(IPC_PRIVATE, size, IPC_CREAT | 0777);
-    xcb_generic_error_t *e = NULL;
+    uint8_t *data;
+    int id;
 
+    if (c->buffer)
+        return 0;
+    id = shmget(IPC_PRIVATE, size, IPC_CREAT | 0777);
     if (id == -1) {
         char errbuf[1024];
         int err = AVERROR(errno);
@@ -243,15 +239,31 @@  static int xcbgrab_frame_shm(AVFormatContext *s, AVPacket *pkt)
                size, errbuf);
         return err;
     }
-
     xcb_shm_attach(c->conn, c->segment, id, 0);
+    data = shmat(id, NULL, 0);
+    shmctl(id, IPC_RMID, 0);
+    if ((intptr_t)data == -1 || !data)
+        return AVERROR(errno);
+    c->buffer = data;
+    return 0;
+}
+
+static int xcbgrab_frame_shm(AVFormatContext *s, AVPacket *pkt)
+{
+    XCBGrabContext *c = s->priv_data;
+    xcb_shm_get_image_cookie_t iq;
+    xcb_shm_get_image_reply_t *img;
+    xcb_drawable_t drawable = c->screen->root;
+    xcb_generic_error_t *e = NULL;
+    int ret;
+
+    ret = allocate_shm(s);
+    if (ret < 0)
+        return ret;
 
     iq = xcb_shm_get_image(c->conn, drawable,
                            c->x, c->y, c->width, c->height, ~0,
                            XCB_IMAGE_FORMAT_Z_PIXMAP, c->segment, 0);
-
-    xcb_shm_detach(c->conn, c->segment);
-
     img = xcb_shm_get_image_reply(c->conn, iq, &e);
 
     xcb_flush(c->conn);
@@ -264,25 +276,12 @@  static int xcbgrab_frame_shm(AVFormatContext *s, AVPacket *pkt)
                e->response_type, e->error_code,
                e->sequence, e->resource_id, e->minor_code, e->major_code);
 
-        shmctl(id, IPC_RMID, 0);
         return AVERROR(EACCES);
     }
 
     free(img);
 
-    data = shmat(id, NULL, 0);
-    shmctl(id, IPC_RMID, 0);
-
-    if ((intptr_t)data == -1)
-        return AVERROR(errno);
-
-    pkt->buf = av_buffer_create(data, size, dealloc_shm, NULL, 0);
-    if (!pkt->buf) {
-        shmdt(data);
-        return AVERROR(ENOMEM);
-    }
-
-    pkt->data = pkt->buf->data;
+    pkt->data = c->buffer;
     pkt->size = c->frame_size;
 
     return 0;
@@ -436,6 +435,12 @@  static av_cold int xcbgrab_read_close(AVFormatContext *s)
 {
     XCBGrabContext *ctx = s->priv_data;
 
+#if CONFIG_LIBXCB_SHM
+    if (ctx->buffer) {
+        shmdt(ctx->buffer);
+    }
+#endif
+
     xcb_disconnect(ctx->conn);
 
     return 0;