Message ID | 20160901174938.7366-1-timo@rothenpieler.org |
---|---|
State | Superseded |
Headers | show |
Just sticking my head above the parapet, but shouldn’t things like... > + for (x = 0; x < c->srcW / 2; x++) { > + dstUV[x*2 ] = src[1][x] << 6; > + dstUV[x*2+1] = src[2][x] << 6; > + } …be more efficiently written as... uint16_t* tdstUV = dstUV; uint16_t* tsrc1 = src[1]; uint16_t* tsrc2 = src[2]; for (x = c->srcW / 2; x > 0; x--) { *tdstUV++ = *tsrc1++ << 6; *tdstUV++ = *tsrc2++ << 6; } …or is that really old-school and a modern compiler does all that when optimising? Or is readability considered more important than marginal gains in performance? Oliver (time travelling from the 1980s) > On 1 Sep 2016, at 20:49, Timo Rothenpieler <timo@rothenpieler.org> wrote: > > --- > libswscale/swscale_unscaled.c | 42 ++++++++++++++++++++++++++++++++++++++++++ > 1 file changed, 42 insertions(+) > > diff --git a/libswscale/swscale_unscaled.c b/libswscale/swscale_unscaled.c > index b231abe..f47e1f4 100644 > --- a/libswscale/swscale_unscaled.c > +++ b/libswscale/swscale_unscaled.c > @@ -197,6 +197,43 @@ static int nv12ToPlanarWrapper(SwsContext *c, const uint8_t *src[], > return srcSliceH; > } > > +static int planarToP010Wrapper(SwsContext *c, const uint8_t *src8[], > + int srcStride[], int srcSliceY, > + int srcSliceH, uint8_t *dstParam8[], > + int dstStride[]) > +{ > + uint16_t *src[] = { > + (uint16_t*)(src8[0] + srcStride[0] * srcSliceY), > + (uint16_t*)(src8[1] + srcStride[1] * srcSliceY), > + (uint16_t*)(src8[2] + srcStride[2] * srcSliceY) > + }; > + uint16_t *dstY = (uint16_t*)(dstParam8[0] + dstStride[0] * srcSliceY); > + uint16_t *dstUV = (uint16_t*)(dstParam8[1] + dstStride[1] * srcSliceY / 2); > + int x, y; > + > + av_assert0(!(srcStride[0] % 2 || srcStride[1] % 2 || srcStride[2] % 2 || > + dstStride[0] % 2 || dstStride[1] % 2)); > + > + for (y = srcSliceY; y < srcSliceY + srcSliceH; y++) { > + if (!(y & 1)) { > + for (x = 0; x < c->srcW / 2; x++) { > + dstUV[x*2 ] = src[1][x] << 6; > + dstUV[x*2+1] = src[2][x] << 6; > + } > + src[1] += srcStride[1] / 2; > + src[2] += srcStride[2] / 2; > + dstUV += dstStride[1] / 2; > + } > + for (x = 0; x < c->srcW; x++) { > + dstY[x] = src[0][x] << 6; > + } > + src[0] += srcStride[0] / 2; > + dstY += dstStride[0] / 2; > + } > + > + return srcSliceH; > +} > + > static int planarToYuy2Wrapper(SwsContext *c, const uint8_t *src[], > int srcStride[], int srcSliceY, int srcSliceH, > uint8_t *dstParam[], int dstStride[]) > @@ -1600,6 +1637,11 @@ void ff_get_unscaled_swscale(SwsContext *c) > !(flags & SWS_ACCURATE_RND) && (c->dither == SWS_DITHER_BAYER || c->dither == SWS_DITHER_AUTO) && !(dstH & 1)) { > c->swscale = ff_yuv2rgb_get_func_ptr(c); > } > + /* yuv420p10le_to_p010le */ > + if ((srcFormat == AV_PIX_FMT_YUV420P10 || srcFormat == AV_PIX_FMT_YUVA420P10) && > + dstFormat == AV_PIX_FMT_P010) { > + c->swscale = planarToP010Wrapper; > + } > > if (srcFormat == AV_PIX_FMT_YUV410P && !(dstH & 3) && > (dstFormat == AV_PIX_FMT_YUV420P || dstFormat == AV_PIX_FMT_YUVA420P) && > -- > 2.9.2 > > _______________________________________________ > ffmpeg-devel mailing list > ffmpeg-devel@ffmpeg.org > http://ffmpeg.org/mailman/listinfo/ffmpeg-devel
On Thu, Sep 01, 2016 at 07:49:38PM +0200, Timo Rothenpieler wrote: > --- > libswscale/swscale_unscaled.c | 42 ++++++++++++++++++++++++++++++++++++++++++ > 1 file changed, 42 insertions(+) > > diff --git a/libswscale/swscale_unscaled.c b/libswscale/swscale_unscaled.c > index b231abe..f47e1f4 100644 > --- a/libswscale/swscale_unscaled.c > +++ b/libswscale/swscale_unscaled.c > @@ -197,6 +197,43 @@ static int nv12ToPlanarWrapper(SwsContext *c, const uint8_t *src[], > return srcSliceH; > } > > +static int planarToP010Wrapper(SwsContext *c, const uint8_t *src8[], > + int srcStride[], int srcSliceY, > + int srcSliceH, uint8_t *dstParam8[], > + int dstStride[]) > +{ > + uint16_t *src[] = { > + (uint16_t*)(src8[0] + srcStride[0] * srcSliceY), > + (uint16_t*)(src8[1] + srcStride[1] * srcSliceY), > + (uint16_t*)(src8[2] + srcStride[2] * srcSliceY) this looks odd, why is this needed ? [...]
>> + uint16_t *src[] = { >> + (uint16_t*)(src8[0] + srcStride[0] * srcSliceY), >> + (uint16_t*)(src8[1] + srcStride[1] * srcSliceY), >> + (uint16_t*)(src8[2] + srcStride[2] * srcSliceY) > > this looks odd, why is this needed ? > Without it, every dstY[x] = src[0][x] << 6; would turn into dstY[x] = ((uint16_t*)(src8[0] + srcStride[0] * srcSliceY))[x] << 6; So it improves readability and possibly moves some repeated calculations out of the loop. Could also just be 3 independent variables srcY/srcU/srcV, if the array is what looks odd.
On Fri, Sep 02, 2016 at 10:38:39AM +0200, Timo Rothenpieler wrote: > >> + uint16_t *src[] = { > >> + (uint16_t*)(src8[0] + srcStride[0] * srcSliceY), > >> + (uint16_t*)(src8[1] + srcStride[1] * srcSliceY), > >> + (uint16_t*)(src8[2] + srcStride[2] * srcSliceY) > > > > this looks odd, why is this needed ? > > > > Without it, every > > dstY[x] = src[0][x] << 6; > > would turn into > > dstY[x] = ((uint16_t*)(src8[0] + srcStride[0] * srcSliceY))[x] << 6; you misunderstood me, why do you add srcSliceY? isnt src* already pointing to the right spot ? [...]
Am 02.09.2016 um 11:02 schrieb Michael Niedermayer: > On Fri, Sep 02, 2016 at 10:38:39AM +0200, Timo Rothenpieler wrote: >>>> + uint16_t *src[] = { >>>> + (uint16_t*)(src8[0] + srcStride[0] * srcSliceY), >>>> + (uint16_t*)(src8[1] + srcStride[1] * srcSliceY), >>>> + (uint16_t*)(src8[2] + srcStride[2] * srcSliceY) >>> >>> this looks odd, why is this needed ? >>> >> >> Without it, every >> >> dstY[x] = src[0][x] << 6; >> >> would turn into >> >> dstY[x] = ((uint16_t*)(src8[0] + srcStride[0] * srcSliceY))[x] << 6; > > you misunderstood me, why do you add srcSliceY? isnt src* already > pointing to the right spot ? Looking at the other functions, it indeed seems like it is. Thanks, completely missed that.
> Just sticking my head above the parapet, but shouldn’t things like... > >> + for (x = 0; x < c->srcW / 2; x++) { >> + dstUV[x*2 ] = src[1][x] << 6; >> + dstUV[x*2+1] = src[2][x] << 6; >> + } > > …be more efficiently written as... > > uint16_t* tdstUV = dstUV; > uint16_t* tsrc1 = src[1]; > uint16_t* tsrc2 = src[2]; > for (x = c->srcW / 2; x > 0; x--) { > *tdstUV++ = *tsrc1++ << 6; > *tdstUV++ = *tsrc2++ << 6; > } > > …or is that really old-school and a modern compiler does all that when optimising? > > Or is readability considered more important than marginal gains in performance? > > Oliver (time travelling from the 1980s) You would still have to add the remaining stride. The linesize is usually larger than the width, so each line is properly aligned. So with your code, you'd still need something like dstUV += dstStride[1] / 2 - 2 * x; src[2] += srcStride[1] / 2 - x; src[2] += srcStride[1] / 2 - x; after it.
> On 2 Sep 2016, at 12:12, Timo Rothenpieler <timo@rothenpieler.org> wrote: > >> Just sticking my head above the parapet, but shouldn’t things like... >> >>> + for (x = 0; x < c->srcW / 2; x++) { >>> + dstUV[x*2 ] = src[1][x] << 6; >>> + dstUV[x*2+1] = src[2][x] << 6; >>> + } >> >> …be more efficiently written as... >> >> uint16_t* tdstUV = dstUV; >> uint16_t* tsrc1 = src[1]; >> uint16_t* tsrc2 = src[2]; >> for (x = c->srcW / 2; x > 0; x--) { >> *tdstUV++ = *tsrc1++ << 6; >> *tdstUV++ = *tsrc2++ << 6; >> } >> >> …or is that really old-school and a modern compiler does all that when optimising? >> >> Or is readability considered more important than marginal gains in performance? >> >> Oliver (time travelling from the 1980s) > > You would still have to add the remaining stride. > The linesize is usually larger than the width, so each line is properly > aligned. > > So with your code, you'd still need something like > > dstUV += dstStride[1] / 2 - 2 * x; > src[2] += srcStride[1] / 2 - x; > src[2] += srcStride[1] / 2 - x; > > after it. No, the lines after it remain unchanged - only the temporary variables are looping along the x. src[1] += srcStride[1] / 2; src[2] += srcStride[2] / 2; dstUV += dstStride[1] / 2; > _______________________________________________ > ffmpeg-devel mailing list > ffmpeg-devel@ffmpeg.org > http://ffmpeg.org/mailman/listinfo/ffmpeg-devel
>>> >>> …or is that really old-school and a modern compiler does all that when optimising? >>> >>> Or is readability considered more important than marginal gains in performance? >>> >>> Oliver (time travelling from the 1980s) >> >> You would still have to add the remaining stride. >> The linesize is usually larger than the width, so each line is properly >> aligned. >> >> So with your code, you'd still need something like >> >> dstUV += dstStride[1] / 2 - 2 * x; >> src[2] += srcStride[1] / 2 - x; >> src[2] += srcStride[1] / 2 - x; >> >> after it. > > No, the lines after it remain unchanged - only the temporary variables are looping along the x. > > src[1] += srcStride[1] / 2; > src[2] += srcStride[2] / 2; > dstUV += dstStride[1] / 2; It is indeed very slightly faster. Old: [bench @ 0x2cbfb20] t:0.006181 avg:0.006270 max:0.013702 min:0.006080 New: [bench @ 0x33bcb20] t:0.006195 avg:0.006225 max:0.013718 min:0.006060 It seems to be 0.5ms faster on average.
diff --git a/libswscale/swscale_unscaled.c b/libswscale/swscale_unscaled.c index b231abe..f47e1f4 100644 --- a/libswscale/swscale_unscaled.c +++ b/libswscale/swscale_unscaled.c @@ -197,6 +197,43 @@ static int nv12ToPlanarWrapper(SwsContext *c, const uint8_t *src[], return srcSliceH; } +static int planarToP010Wrapper(SwsContext *c, const uint8_t *src8[], + int srcStride[], int srcSliceY, + int srcSliceH, uint8_t *dstParam8[], + int dstStride[]) +{ + uint16_t *src[] = { + (uint16_t*)(src8[0] + srcStride[0] * srcSliceY), + (uint16_t*)(src8[1] + srcStride[1] * srcSliceY), + (uint16_t*)(src8[2] + srcStride[2] * srcSliceY) + }; + uint16_t *dstY = (uint16_t*)(dstParam8[0] + dstStride[0] * srcSliceY); + uint16_t *dstUV = (uint16_t*)(dstParam8[1] + dstStride[1] * srcSliceY / 2); + int x, y; + + av_assert0(!(srcStride[0] % 2 || srcStride[1] % 2 || srcStride[2] % 2 || + dstStride[0] % 2 || dstStride[1] % 2)); + + for (y = srcSliceY; y < srcSliceY + srcSliceH; y++) { + if (!(y & 1)) { + for (x = 0; x < c->srcW / 2; x++) { + dstUV[x*2 ] = src[1][x] << 6; + dstUV[x*2+1] = src[2][x] << 6; + } + src[1] += srcStride[1] / 2; + src[2] += srcStride[2] / 2; + dstUV += dstStride[1] / 2; + } + for (x = 0; x < c->srcW; x++) { + dstY[x] = src[0][x] << 6; + } + src[0] += srcStride[0] / 2; + dstY += dstStride[0] / 2; + } + + return srcSliceH; +} + static int planarToYuy2Wrapper(SwsContext *c, const uint8_t *src[], int srcStride[], int srcSliceY, int srcSliceH, uint8_t *dstParam[], int dstStride[]) @@ -1600,6 +1637,11 @@ void ff_get_unscaled_swscale(SwsContext *c) !(flags & SWS_ACCURATE_RND) && (c->dither == SWS_DITHER_BAYER || c->dither == SWS_DITHER_AUTO) && !(dstH & 1)) { c->swscale = ff_yuv2rgb_get_func_ptr(c); } + /* yuv420p10le_to_p010le */ + if ((srcFormat == AV_PIX_FMT_YUV420P10 || srcFormat == AV_PIX_FMT_YUVA420P10) && + dstFormat == AV_PIX_FMT_P010) { + c->swscale = planarToP010Wrapper; + } if (srcFormat == AV_PIX_FMT_YUV410P && !(dstH & 3) && (dstFormat == AV_PIX_FMT_YUV420P || dstFormat == AV_PIX_FMT_YUVA420P) &&