diff mbox

[FFmpeg-devel] swscale: add unscaled copy from yuv420p10 to p010

Message ID 20160901174938.7366-1-timo@rothenpieler.org
State Superseded
Headers show

Commit Message

Timo Rothenpieler Sept. 1, 2016, 5:49 p.m. UTC
---
 libswscale/swscale_unscaled.c | 42 ++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 42 insertions(+)

Comments

Oliver Collyer Sept. 1, 2016, 6:52 p.m. UTC | #1
Just sticking my head above the parapet, but shouldn’t things like...

> +            for (x = 0; x < c->srcW / 2; x++) {
> +                dstUV[x*2  ] = src[1][x] << 6;
> +                dstUV[x*2+1] = src[2][x] << 6;
> +            }

…be more efficiently written as...

uint16_t* tdstUV = dstUV;
uint16_t* tsrc1 = src[1];
uint16_t* tsrc2 = src[2];
for (x = c->srcW / 2; x > 0; x--) {
    *tdstUV++ = *tsrc1++ << 6;
    *tdstUV++ = *tsrc2++ << 6;
}

…or is that really old-school and a modern compiler does all that when optimising?

Or is readability considered more important than marginal gains in performance?

Oliver (time travelling from the 1980s)

> On 1 Sep 2016, at 20:49, Timo Rothenpieler <timo@rothenpieler.org> wrote:
> 
> ---
> libswscale/swscale_unscaled.c | 42 ++++++++++++++++++++++++++++++++++++++++++
> 1 file changed, 42 insertions(+)
> 
> diff --git a/libswscale/swscale_unscaled.c b/libswscale/swscale_unscaled.c
> index b231abe..f47e1f4 100644
> --- a/libswscale/swscale_unscaled.c
> +++ b/libswscale/swscale_unscaled.c
> @@ -197,6 +197,43 @@ static int nv12ToPlanarWrapper(SwsContext *c, const uint8_t *src[],
>     return srcSliceH;
> }
> 
> +static int planarToP010Wrapper(SwsContext *c, const uint8_t *src8[],
> +                               int srcStride[], int srcSliceY,
> +                               int srcSliceH, uint8_t *dstParam8[],
> +                               int dstStride[])
> +{
> +    uint16_t *src[] = {
> +        (uint16_t*)(src8[0] + srcStride[0] * srcSliceY),
> +        (uint16_t*)(src8[1] + srcStride[1] * srcSliceY),
> +        (uint16_t*)(src8[2] + srcStride[2] * srcSliceY)
> +    };
> +    uint16_t *dstY = (uint16_t*)(dstParam8[0] + dstStride[0] * srcSliceY);
> +    uint16_t *dstUV = (uint16_t*)(dstParam8[1] + dstStride[1] * srcSliceY / 2);
> +    int x, y;
> +
> +    av_assert0(!(srcStride[0] % 2 || srcStride[1] % 2 || srcStride[2] % 2 ||
> +                 dstStride[0] % 2 || dstStride[1] % 2));
> +
> +    for (y = srcSliceY; y < srcSliceY + srcSliceH; y++) {
> +        if (!(y & 1)) {
> +            for (x = 0; x < c->srcW / 2; x++) {
> +                dstUV[x*2  ] = src[1][x] << 6;
> +                dstUV[x*2+1] = src[2][x] << 6;
> +            }
> +            src[1] += srcStride[1] / 2;
> +            src[2] += srcStride[2] / 2;
> +            dstUV += dstStride[1] / 2;
> +        }
> +        for (x = 0; x < c->srcW; x++) {
> +            dstY[x] = src[0][x] << 6;
> +        }
> +        src[0] += srcStride[0] / 2;
> +        dstY += dstStride[0] / 2;
> +    }
> +
> +    return srcSliceH;
> +}
> +
> static int planarToYuy2Wrapper(SwsContext *c, const uint8_t *src[],
>                                int srcStride[], int srcSliceY, int srcSliceH,
>                                uint8_t *dstParam[], int dstStride[])
> @@ -1600,6 +1637,11 @@ void ff_get_unscaled_swscale(SwsContext *c)
>         !(flags & SWS_ACCURATE_RND) && (c->dither == SWS_DITHER_BAYER || c->dither == SWS_DITHER_AUTO) && !(dstH & 1)) {
>         c->swscale = ff_yuv2rgb_get_func_ptr(c);
>     }
> +    /* yuv420p10le_to_p010le */
> +    if ((srcFormat == AV_PIX_FMT_YUV420P10 || srcFormat == AV_PIX_FMT_YUVA420P10) &&
> +        dstFormat == AV_PIX_FMT_P010) {
> +        c->swscale = planarToP010Wrapper;
> +    }
> 
>     if (srcFormat == AV_PIX_FMT_YUV410P && !(dstH & 3) &&
>         (dstFormat == AV_PIX_FMT_YUV420P || dstFormat == AV_PIX_FMT_YUVA420P) &&
> -- 
> 2.9.2
> 
> _______________________________________________
> ffmpeg-devel mailing list
> ffmpeg-devel@ffmpeg.org
> http://ffmpeg.org/mailman/listinfo/ffmpeg-devel
Michael Niedermayer Sept. 1, 2016, 9:07 p.m. UTC | #2
On Thu, Sep 01, 2016 at 07:49:38PM +0200, Timo Rothenpieler wrote:
> ---
>  libswscale/swscale_unscaled.c | 42 ++++++++++++++++++++++++++++++++++++++++++
>  1 file changed, 42 insertions(+)
>
> diff --git a/libswscale/swscale_unscaled.c b/libswscale/swscale_unscaled.c
> index b231abe..f47e1f4 100644
> --- a/libswscale/swscale_unscaled.c
> +++ b/libswscale/swscale_unscaled.c
> @@ -197,6 +197,43 @@ static int nv12ToPlanarWrapper(SwsContext *c, const uint8_t *src[],
>      return srcSliceH;
>  }
>  
> +static int planarToP010Wrapper(SwsContext *c, const uint8_t *src8[],
> +                               int srcStride[], int srcSliceY,
> +                               int srcSliceH, uint8_t *dstParam8[],
> +                               int dstStride[])
> +{

> +    uint16_t *src[] = {
> +        (uint16_t*)(src8[0] + srcStride[0] * srcSliceY),
> +        (uint16_t*)(src8[1] + srcStride[1] * srcSliceY),
> +        (uint16_t*)(src8[2] + srcStride[2] * srcSliceY)

this looks odd, why is this needed ?


[...]
Timo Rothenpieler Sept. 2, 2016, 8:38 a.m. UTC | #3
>> +    uint16_t *src[] = {
>> +        (uint16_t*)(src8[0] + srcStride[0] * srcSliceY),
>> +        (uint16_t*)(src8[1] + srcStride[1] * srcSliceY),
>> +        (uint16_t*)(src8[2] + srcStride[2] * srcSliceY)
> 
> this looks odd, why is this needed ?
> 

Without it, every

dstY[x] = src[0][x] << 6;

would turn into

dstY[x] = ((uint16_t*)(src8[0] + srcStride[0] * srcSliceY))[x] << 6;


So it improves readability and possibly moves some repeated calculations
out of the loop.
Could also just be 3 independent variables srcY/srcU/srcV, if the array
is what looks odd.
Michael Niedermayer Sept. 2, 2016, 9:02 a.m. UTC | #4
On Fri, Sep 02, 2016 at 10:38:39AM +0200, Timo Rothenpieler wrote:
> >> +    uint16_t *src[] = {
> >> +        (uint16_t*)(src8[0] + srcStride[0] * srcSliceY),
> >> +        (uint16_t*)(src8[1] + srcStride[1] * srcSliceY),
> >> +        (uint16_t*)(src8[2] + srcStride[2] * srcSliceY)
> > 
> > this looks odd, why is this needed ?
> > 
> 
> Without it, every
> 
> dstY[x] = src[0][x] << 6;
> 
> would turn into
> 
> dstY[x] = ((uint16_t*)(src8[0] + srcStride[0] * srcSliceY))[x] << 6;

you misunderstood me, why do you add srcSliceY? isnt src* already
pointing to the right spot ?

[...]
Timo Rothenpieler Sept. 2, 2016, 9:08 a.m. UTC | #5
Am 02.09.2016 um 11:02 schrieb Michael Niedermayer:
> On Fri, Sep 02, 2016 at 10:38:39AM +0200, Timo Rothenpieler wrote:
>>>> +    uint16_t *src[] = {
>>>> +        (uint16_t*)(src8[0] + srcStride[0] * srcSliceY),
>>>> +        (uint16_t*)(src8[1] + srcStride[1] * srcSliceY),
>>>> +        (uint16_t*)(src8[2] + srcStride[2] * srcSliceY)
>>>
>>> this looks odd, why is this needed ?
>>>
>>
>> Without it, every
>>
>> dstY[x] = src[0][x] << 6;
>>
>> would turn into
>>
>> dstY[x] = ((uint16_t*)(src8[0] + srcStride[0] * srcSliceY))[x] << 6;
> 
> you misunderstood me, why do you add srcSliceY? isnt src* already
> pointing to the right spot ?

Looking at the other functions, it indeed seems like it is.
Thanks, completely missed that.
Timo Rothenpieler Sept. 2, 2016, 9:12 a.m. UTC | #6
> Just sticking my head above the parapet, but shouldn’t things like...
> 
>> +            for (x = 0; x < c->srcW / 2; x++) {
>> +                dstUV[x*2  ] = src[1][x] << 6;
>> +                dstUV[x*2+1] = src[2][x] << 6;
>> +            }
> 
> …be more efficiently written as...
> 
> uint16_t* tdstUV = dstUV;
> uint16_t* tsrc1 = src[1];
> uint16_t* tsrc2 = src[2];
> for (x = c->srcW / 2; x > 0; x--) {
>     *tdstUV++ = *tsrc1++ << 6;
>     *tdstUV++ = *tsrc2++ << 6;
> }
> 
> …or is that really old-school and a modern compiler does all that when optimising?
> 
> Or is readability considered more important than marginal gains in performance?
> 
> Oliver (time travelling from the 1980s)

You would still have to add the remaining stride.
The linesize is usually larger than the width, so each line is properly
aligned.

So with your code, you'd still need something like

dstUV += dstStride[1] / 2 - 2 * x;
src[2] += srcStride[1] / 2 - x;
src[2] += srcStride[1] / 2 - x;

after it.
Oliver Collyer Sept. 2, 2016, 9:16 a.m. UTC | #7
> On 2 Sep 2016, at 12:12, Timo Rothenpieler <timo@rothenpieler.org> wrote:
> 
>> Just sticking my head above the parapet, but shouldn’t things like...
>> 
>>> +            for (x = 0; x < c->srcW / 2; x++) {
>>> +                dstUV[x*2  ] = src[1][x] << 6;
>>> +                dstUV[x*2+1] = src[2][x] << 6;
>>> +            }
>> 
>> …be more efficiently written as...
>> 
>> uint16_t* tdstUV = dstUV;
>> uint16_t* tsrc1 = src[1];
>> uint16_t* tsrc2 = src[2];
>> for (x = c->srcW / 2; x > 0; x--) {
>>    *tdstUV++ = *tsrc1++ << 6;
>>    *tdstUV++ = *tsrc2++ << 6;
>> }
>> 
>> …or is that really old-school and a modern compiler does all that when optimising?
>> 
>> Or is readability considered more important than marginal gains in performance?
>> 
>> Oliver (time travelling from the 1980s)
> 
> You would still have to add the remaining stride.
> The linesize is usually larger than the width, so each line is properly
> aligned.
> 
> So with your code, you'd still need something like
> 
> dstUV += dstStride[1] / 2 - 2 * x;
> src[2] += srcStride[1] / 2 - x;
> src[2] += srcStride[1] / 2 - x;
> 
> after it.

No, the lines after it remain unchanged - only the temporary variables are looping along the x.

src[1] += srcStride[1] / 2;
src[2] += srcStride[2] / 2;
dstUV += dstStride[1] / 2;

> _______________________________________________
> ffmpeg-devel mailing list
> ffmpeg-devel@ffmpeg.org
> http://ffmpeg.org/mailman/listinfo/ffmpeg-devel
Timo Rothenpieler Sept. 2, 2016, 9:55 a.m. UTC | #8
>>>
>>> …or is that really old-school and a modern compiler does all that when optimising?
>>>
>>> Or is readability considered more important than marginal gains in performance?
>>>
>>> Oliver (time travelling from the 1980s)
>>
>> You would still have to add the remaining stride.
>> The linesize is usually larger than the width, so each line is properly
>> aligned.
>>
>> So with your code, you'd still need something like
>>
>> dstUV += dstStride[1] / 2 - 2 * x;
>> src[2] += srcStride[1] / 2 - x;
>> src[2] += srcStride[1] / 2 - x;
>>
>> after it.
> 
> No, the lines after it remain unchanged - only the temporary variables are looping along the x.
> 
> src[1] += srcStride[1] / 2;
> src[2] += srcStride[2] / 2;
> dstUV += dstStride[1] / 2;


It is indeed very slightly faster.

Old:
[bench @ 0x2cbfb20] t:0.006181 avg:0.006270 max:0.013702 min:0.006080
New:
[bench @ 0x33bcb20] t:0.006195 avg:0.006225 max:0.013718 min:0.006060

It seems to be 0.5ms faster on average.
diff mbox

Patch

diff --git a/libswscale/swscale_unscaled.c b/libswscale/swscale_unscaled.c
index b231abe..f47e1f4 100644
--- a/libswscale/swscale_unscaled.c
+++ b/libswscale/swscale_unscaled.c
@@ -197,6 +197,43 @@  static int nv12ToPlanarWrapper(SwsContext *c, const uint8_t *src[],
     return srcSliceH;
 }
 
+static int planarToP010Wrapper(SwsContext *c, const uint8_t *src8[],
+                               int srcStride[], int srcSliceY,
+                               int srcSliceH, uint8_t *dstParam8[],
+                               int dstStride[])
+{
+    uint16_t *src[] = {
+        (uint16_t*)(src8[0] + srcStride[0] * srcSliceY),
+        (uint16_t*)(src8[1] + srcStride[1] * srcSliceY),
+        (uint16_t*)(src8[2] + srcStride[2] * srcSliceY)
+    };
+    uint16_t *dstY = (uint16_t*)(dstParam8[0] + dstStride[0] * srcSliceY);
+    uint16_t *dstUV = (uint16_t*)(dstParam8[1] + dstStride[1] * srcSliceY / 2);
+    int x, y;
+
+    av_assert0(!(srcStride[0] % 2 || srcStride[1] % 2 || srcStride[2] % 2 ||
+                 dstStride[0] % 2 || dstStride[1] % 2));
+
+    for (y = srcSliceY; y < srcSliceY + srcSliceH; y++) {
+        if (!(y & 1)) {
+            for (x = 0; x < c->srcW / 2; x++) {
+                dstUV[x*2  ] = src[1][x] << 6;
+                dstUV[x*2+1] = src[2][x] << 6;
+            }
+            src[1] += srcStride[1] / 2;
+            src[2] += srcStride[2] / 2;
+            dstUV += dstStride[1] / 2;
+        }
+        for (x = 0; x < c->srcW; x++) {
+            dstY[x] = src[0][x] << 6;
+        }
+        src[0] += srcStride[0] / 2;
+        dstY += dstStride[0] / 2;
+    }
+
+    return srcSliceH;
+}
+
 static int planarToYuy2Wrapper(SwsContext *c, const uint8_t *src[],
                                int srcStride[], int srcSliceY, int srcSliceH,
                                uint8_t *dstParam[], int dstStride[])
@@ -1600,6 +1637,11 @@  void ff_get_unscaled_swscale(SwsContext *c)
         !(flags & SWS_ACCURATE_RND) && (c->dither == SWS_DITHER_BAYER || c->dither == SWS_DITHER_AUTO) && !(dstH & 1)) {
         c->swscale = ff_yuv2rgb_get_func_ptr(c);
     }
+    /* yuv420p10le_to_p010le */
+    if ((srcFormat == AV_PIX_FMT_YUV420P10 || srcFormat == AV_PIX_FMT_YUVA420P10) &&
+        dstFormat == AV_PIX_FMT_P010) {
+        c->swscale = planarToP010Wrapper;
+    }
 
     if (srcFormat == AV_PIX_FMT_YUV410P && !(dstH & 3) &&
         (dstFormat == AV_PIX_FMT_YUV420P || dstFormat == AV_PIX_FMT_YUVA420P) &&