Message ID | 20211114025653.654-2-mindmark@gmail.com |
---|---|
State | New |
Headers | show |
Series | [FFmpeg-devel,v2,1/2] swscale/input: unify grayf32 funcs with rgbf32 funcs | expand |
Context | Check | Description |
---|---|---|
andriy/make_x86 | success | Make finished |
andriy/make_fate_x86 | success | Make fate finished |
andriy/make_ppc | success | Make finished |
andriy/make_fate_ppc | success | Make fate finished |
On 11/13/2021 11:56 PM, mindmark@gmail.com wrote: > From: Mark Reid <mindmark@gmail.com> > > if the float pixel * 65535.0f > 2147483647.0f > lrintf may overfow and return negative values, depending on implementation. > nan and +/-inf values may also be implementation defined > > clamp the value first so lrintf so, always works. > > values <=0.0f, -inf, nan = 0.0f > values >=1.0f, +inf = 1.0f > > the clamping adds some performance overhead, but using a inline function > seems help the compiler optimize on the compiliers I tested. > > old timings > 213920 UNITS in planar_rgbf32le_to_uv, 1 runs, 0 skips > 218830 UNITS in planar_rgbf32le_to_uv, 2 runs, 0 skips > 223285 UNITS in planar_rgbf32le_to_uv, 4 runs, 0 skips > 215405 UNITS in planar_rgbf32le_to_uv, 8 runs, 0 skips > 208920 UNITS in planar_rgbf32le_to_uv, 16 runs, 0 skips > 205115 UNITS in planar_rgbf32le_to_uv, 32 runs, 0 skips > 212220 UNITS in planar_rgbf32le_to_uv, 64 runs, 0 skips > > 216440 UNITS in planar_rgbf32be_to_uv, 1 runs, 0 skips > 222450 UNITS in planar_rgbf32be_to_uv, 2 runs, 0 skips > 228780 UNITS in planar_rgbf32be_to_uv, 4 runs, 0 skips > 226900 UNITS in planar_rgbf32be_to_uv, 8 runs, 0 skips > 223168 UNITS in planar_rgbf32be_to_uv, 16 runs, 0 skips > 249340 UNITS in planar_rgbf32be_to_uv, 32 runs, 0 skips > 233746 UNITS in planar_rgbf32be_to_uv, 64 runs, 0 skips > > 173360 UNITS in planar_rgbf32le_to_y, 1 runs, 0 skips > 179970 UNITS in planar_rgbf32le_to_y, 2 runs, 0 skips > 182960 UNITS in planar_rgbf32le_to_y, 4 runs, 0 skips > 177040 UNITS in planar_rgbf32le_to_y, 8 runs, 0 skips > 170351 UNITS in planar_rgbf32le_to_y, 16 runs, 0 skips > 167136 UNITS in planar_rgbf32le_to_y, 32 runs, 0 skips > 165821 UNITS in planar_rgbf32le_to_y, 64 runs, 0 skips > > 181040 UNITS in planar_rgbf32be_to_y, 1 runs, 0 skips > 182920 UNITS in planar_rgbf32be_to_y, 2 runs, 0 skips > 180935 UNITS in planar_rgbf32be_to_y, 4 runs, 0 skips > 180897 UNITS in planar_rgbf32be_to_y, 8 runs, 0 skips > 179640 UNITS in planar_rgbf32be_to_y, 16 runs, 0 skips > 178912 UNITS in planar_rgbf32be_to_y, 32 runs, 0 skips > 177983 UNITS in planar_rgbf32be_to_y, 64 runs, 0 skips > > new timings > 228860 UNITS in planar_rgbf32le_to_uv, 1 runs, 0 skips > 232400 UNITS in planar_rgbf32le_to_uv, 2 runs, 0 skips > 237270 UNITS in planar_rgbf32le_to_uv, 4 runs, 0 skips > 229992 UNITS in planar_rgbf32le_to_uv, 8 runs, 0 skips > 222270 UNITS in planar_rgbf32le_to_uv, 16 runs, 0 skips > 218896 UNITS in planar_rgbf32le_to_uv, 32 runs, 0 skips > 216938 UNITS in planar_rgbf32le_to_uv, 64 runs, 0 skips > > 232340 UNITS in planar_rgbf32be_to_uv, 1 runs, 0 skips > 231830 UNITS in planar_rgbf32be_to_uv, 2 runs, 0 skips > 242235 UNITS in planar_rgbf32be_to_uv, 4 runs, 0 skips > 235210 UNITS in planar_rgbf32be_to_uv, 8 runs, 0 skips > 229040 UNITS in planar_rgbf32be_to_uv, 16 runs, 0 skips > 224996 UNITS in planar_rgbf32be_to_uv, 32 runs, 0 skips > 223581 UNITS in planar_rgbf32be_to_uv, 64 runs, 0 skips > > 179220 UNITS in planar_rgbf32le_to_y, 1 runs, 0 skips > 174790 UNITS in planar_rgbf32le_to_y, 2 runs, 0 skips > 182630 UNITS in planar_rgbf32le_to_y, 4 runs, 0 skips > 183002 UNITS in planar_rgbf32le_to_y, 8 runs, 0 skips > 181005 UNITS in planar_rgbf32le_to_y, 16 runs, 0 skips > 179390 UNITS in planar_rgbf32le_to_y, 32 runs, 0 skips > 192476 UNITS in planar_rgbf32le_to_y, 64 runs, 0 skips > > 195620 UNITS in planar_rgbf32be_to_y, 1 runs, 0 skips > 195860 UNITS in planar_rgbf32be_to_y, 2 runs, 0 skips > 198700 UNITS in planar_rgbf32be_to_y, 4 runs, 0 skips > 197252 UNITS in planar_rgbf32be_to_y, 8 runs, 0 skips > 195702 UNITS in planar_rgbf32be_to_y, 16 runs, 0 skips > 194853 UNITS in planar_rgbf32be_to_y, 32 runs, 0 skips > 194459 UNITS in planar_rgbf32be_to_y, 64 runs, 0 skips > --- > libswscale/input.c | 21 +++++++++++++-------- > 1 file changed, 13 insertions(+), 8 deletions(-) > > diff --git a/libswscale/input.c b/libswscale/input.c > index 90efdd2ffc..2a13846abe 100644 > --- a/libswscale/input.c > +++ b/libswscale/input.c > @@ -966,6 +966,11 @@ static av_always_inline void planar_rgb16_to_uv(uint8_t *_dstU, uint8_t *_dstV, > > #define rdpx(src) (is_be ? av_int2float(AV_RB32(src)): av_int2float(AV_RL32(src))) > > +static av_always_inline float clampf(float x, float min, float max) Use av_clipf() instead. > +{ > + return FFMIN(FFMAX(x, min), max); > +} > + > static av_always_inline void planar_rgbf32_to_a(uint8_t *_dst, const uint8_t *_src[4], int width, int is_be, int32_t *rgb2yuv) > { > int i; > @@ -973,7 +978,7 @@ static av_always_inline void planar_rgbf32_to_a(uint8_t *_dst, const uint8_t *_s > uint16_t *dst = (uint16_t *)_dst; > > for (i = 0; i < width; i++) { > - dst[i] = av_clip_uint16(lrintf(65535.0f * rdpx(src[3] + i))); > + dst[i] = lrintf(clampf(65535.0f * rdpx(src[3] + i), 0.0f, 65535.0f)); > } > } > > @@ -987,9 +992,9 @@ static av_always_inline void planar_rgbf32_to_uv(uint8_t *_dstU, uint8_t *_dstV, > int32_t rv = rgb2yuv[RV_IDX], gv = rgb2yuv[GV_IDX], bv = rgb2yuv[BV_IDX]; > > for (i = 0; i < width; i++) { > - int g = av_clip_uint16(lrintf(65535.0f * rdpx(src[0] + i))); > - int b = av_clip_uint16(lrintf(65535.0f * rdpx(src[1] + i))); > - int r = av_clip_uint16(lrintf(65535.0f * rdpx(src[2] + i))); > + int g = lrintf(clampf(65535.0f * rdpx(src[0] + i), 0.0f, 65535.0f)); > + int b = lrintf(clampf(65535.0f * rdpx(src[1] + i), 0.0f, 65535.0f)); > + int r = lrintf(clampf(65535.0f * rdpx(src[2] + i), 0.0f, 65535.0f)); > > dstU[i] = (ru*r + gu*g + bu*b + (0x10001 << (RGB2YUV_SHIFT - 1))) >> RGB2YUV_SHIFT; > dstV[i] = (rv*r + gv*g + bv*b + (0x10001 << (RGB2YUV_SHIFT - 1))) >> RGB2YUV_SHIFT; > @@ -1005,9 +1010,9 @@ static av_always_inline void planar_rgbf32_to_y(uint8_t *_dst, const uint8_t *_s > int32_t ry = rgb2yuv[RY_IDX], gy = rgb2yuv[GY_IDX], by = rgb2yuv[BY_IDX]; > > for (i = 0; i < width; i++) { > - int g = av_clip_uint16(lrintf(65535.0f * rdpx(src[0] + i))); > - int b = av_clip_uint16(lrintf(65535.0f * rdpx(src[1] + i))); > - int r = av_clip_uint16(lrintf(65535.0f * rdpx(src[2] + i))); > + int g = lrintf(clampf(65535.0f * rdpx(src[0] + i), 0.0f, 65535.0f)); > + int b = lrintf(clampf(65535.0f * rdpx(src[1] + i), 0.0f, 65535.0f)); > + int r = lrintf(clampf(65535.0f * rdpx(src[2] + i), 0.0f, 65535.0f)); > > dst[i] = (ry*r + gy*g + by*b + (0x2001 << (RGB2YUV_SHIFT - 1))) >> RGB2YUV_SHIFT; > } > @@ -1021,7 +1026,7 @@ static av_always_inline void grayf32ToY16_c(uint8_t *_dst, const uint8_t *_src, > uint16_t *dst = (uint16_t *)_dst; > > for (i = 0; i < width; ++i){ > - dst[i] = av_clip_uint16(lrintf(65535.0f * rdpx(src + i))); > + dst[i] = lrintf(clampf(65535.0f * rdpx(src + i), 0.0f, 65535.0f)); > } > } > > -- > 2.31.1.windows.1 > > _______________________________________________ > ffmpeg-devel mailing list > ffmpeg-devel@ffmpeg.org > https://ffmpeg.org/mailman/listinfo/ffmpeg-devel > > To unsubscribe, visit link above, or email > ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe". >
On Sat, Nov 13, 2021 at 7:02 PM James Almer <jamrial@gmail.com> wrote: > On 11/13/2021 11:56 PM, mindmark@gmail.com wrote: > > From: Mark Reid <mindmark@gmail.com> > > > > if the float pixel * 65535.0f > 2147483647.0f > > lrintf may overfow and return negative values, depending on > implementation. > > nan and +/-inf values may also be implementation defined > > > > clamp the value first so lrintf so, always works. > > > > values <=0.0f, -inf, nan = 0.0f > > values >=1.0f, +inf = 1.0f > > > > the clamping adds some performance overhead, but using a inline function > > seems help the compiler optimize on the compiliers I tested. > > > > old timings > > 213920 UNITS in planar_rgbf32le_to_uv, 1 runs, 0 skips > > 218830 UNITS in planar_rgbf32le_to_uv, 2 runs, 0 skips > > 223285 UNITS in planar_rgbf32le_to_uv, 4 runs, 0 skips > > 215405 UNITS in planar_rgbf32le_to_uv, 8 runs, 0 skips > > 208920 UNITS in planar_rgbf32le_to_uv, 16 runs, 0 skips > > 205115 UNITS in planar_rgbf32le_to_uv, 32 runs, 0 skips > > 212220 UNITS in planar_rgbf32le_to_uv, 64 runs, 0 skips > > > > 216440 UNITS in planar_rgbf32be_to_uv, 1 runs, 0 skips > > 222450 UNITS in planar_rgbf32be_to_uv, 2 runs, 0 skips > > 228780 UNITS in planar_rgbf32be_to_uv, 4 runs, 0 skips > > 226900 UNITS in planar_rgbf32be_to_uv, 8 runs, 0 skips > > 223168 UNITS in planar_rgbf32be_to_uv, 16 runs, 0 skips > > 249340 UNITS in planar_rgbf32be_to_uv, 32 runs, 0 skips > > 233746 UNITS in planar_rgbf32be_to_uv, 64 runs, 0 skips > > > > 173360 UNITS in planar_rgbf32le_to_y, 1 runs, 0 skips > > 179970 UNITS in planar_rgbf32le_to_y, 2 runs, 0 skips > > 182960 UNITS in planar_rgbf32le_to_y, 4 runs, 0 skips > > 177040 UNITS in planar_rgbf32le_to_y, 8 runs, 0 skips > > 170351 UNITS in planar_rgbf32le_to_y, 16 runs, 0 skips > > 167136 UNITS in planar_rgbf32le_to_y, 32 runs, 0 skips > > 165821 UNITS in planar_rgbf32le_to_y, 64 runs, 0 skips > > > > 181040 UNITS in planar_rgbf32be_to_y, 1 runs, 0 skips > > 182920 UNITS in planar_rgbf32be_to_y, 2 runs, 0 skips > > 180935 UNITS in planar_rgbf32be_to_y, 4 runs, 0 skips > > 180897 UNITS in planar_rgbf32be_to_y, 8 runs, 0 skips > > 179640 UNITS in planar_rgbf32be_to_y, 16 runs, 0 skips > > 178912 UNITS in planar_rgbf32be_to_y, 32 runs, 0 skips > > 177983 UNITS in planar_rgbf32be_to_y, 64 runs, 0 skips > > > > new timings > > 228860 UNITS in planar_rgbf32le_to_uv, 1 runs, 0 skips > > 232400 UNITS in planar_rgbf32le_to_uv, 2 runs, 0 skips > > 237270 UNITS in planar_rgbf32le_to_uv, 4 runs, 0 skips > > 229992 UNITS in planar_rgbf32le_to_uv, 8 runs, 0 skips > > 222270 UNITS in planar_rgbf32le_to_uv, 16 runs, 0 skips > > 218896 UNITS in planar_rgbf32le_to_uv, 32 runs, 0 skips > > 216938 UNITS in planar_rgbf32le_to_uv, 64 runs, 0 skips > > > > 232340 UNITS in planar_rgbf32be_to_uv, 1 runs, 0 skips > > 231830 UNITS in planar_rgbf32be_to_uv, 2 runs, 0 skips > > 242235 UNITS in planar_rgbf32be_to_uv, 4 runs, 0 skips > > 235210 UNITS in planar_rgbf32be_to_uv, 8 runs, 0 skips > > 229040 UNITS in planar_rgbf32be_to_uv, 16 runs, 0 skips > > 224996 UNITS in planar_rgbf32be_to_uv, 32 runs, 0 skips > > 223581 UNITS in planar_rgbf32be_to_uv, 64 runs, 0 skips > > > > 179220 UNITS in planar_rgbf32le_to_y, 1 runs, 0 skips > > 174790 UNITS in planar_rgbf32le_to_y, 2 runs, 0 skips > > 182630 UNITS in planar_rgbf32le_to_y, 4 runs, 0 skips > > 183002 UNITS in planar_rgbf32le_to_y, 8 runs, 0 skips > > 181005 UNITS in planar_rgbf32le_to_y, 16 runs, 0 skips > > 179390 UNITS in planar_rgbf32le_to_y, 32 runs, 0 skips > > 192476 UNITS in planar_rgbf32le_to_y, 64 runs, 0 skips > > > > 195620 UNITS in planar_rgbf32be_to_y, 1 runs, 0 skips > > 195860 UNITS in planar_rgbf32be_to_y, 2 runs, 0 skips > > 198700 UNITS in planar_rgbf32be_to_y, 4 runs, 0 skips > > 197252 UNITS in planar_rgbf32be_to_y, 8 runs, 0 skips > > 195702 UNITS in planar_rgbf32be_to_y, 16 runs, 0 skips > > 194853 UNITS in planar_rgbf32be_to_y, 32 runs, 0 skips > > 194459 UNITS in planar_rgbf32be_to_y, 64 runs, 0 skips > > --- > > libswscale/input.c | 21 +++++++++++++-------- > > 1 file changed, 13 insertions(+), 8 deletions(-) > > > > diff --git a/libswscale/input.c b/libswscale/input.c > > index 90efdd2ffc..2a13846abe 100644 > > --- a/libswscale/input.c > > +++ b/libswscale/input.c > > @@ -966,6 +966,11 @@ static av_always_inline void > planar_rgb16_to_uv(uint8_t *_dstU, uint8_t *_dstV, > > > > #define rdpx(src) (is_be ? av_int2float(AV_RB32(src)): > av_int2float(AV_RL32(src))) > > > > +static av_always_inline float clampf(float x, float min, float max) > > Use av_clipf() instead. > Thanks. I should have thought of that, I'll send a version > > > +{ > > + return FFMIN(FFMAX(x, min), max); > > +} > > + > > static av_always_inline void planar_rgbf32_to_a(uint8_t *_dst, const > uint8_t *_src[4], int width, int is_be, int32_t *rgb2yuv) > > { > > int i; > > @@ -973,7 +978,7 @@ static av_always_inline void > planar_rgbf32_to_a(uint8_t *_dst, const uint8_t *_s > > uint16_t *dst = (uint16_t *)_dst; > > > > for (i = 0; i < width; i++) { > > - dst[i] = av_clip_uint16(lrintf(65535.0f * rdpx(src[3] + i))); > > + dst[i] = lrintf(clampf(65535.0f * rdpx(src[3] + i), 0.0f, > 65535.0f)); > > } > > } > > > > @@ -987,9 +992,9 @@ static av_always_inline void > planar_rgbf32_to_uv(uint8_t *_dstU, uint8_t *_dstV, > > int32_t rv = rgb2yuv[RV_IDX], gv = rgb2yuv[GV_IDX], bv = > rgb2yuv[BV_IDX]; > > > > for (i = 0; i < width; i++) { > > - int g = av_clip_uint16(lrintf(65535.0f * rdpx(src[0] + i))); > > - int b = av_clip_uint16(lrintf(65535.0f * rdpx(src[1] + i))); > > - int r = av_clip_uint16(lrintf(65535.0f * rdpx(src[2] + i))); > > + int g = lrintf(clampf(65535.0f * rdpx(src[0] + i), 0.0f, > 65535.0f)); > > + int b = lrintf(clampf(65535.0f * rdpx(src[1] + i), 0.0f, > 65535.0f)); > > + int r = lrintf(clampf(65535.0f * rdpx(src[2] + i), 0.0f, > 65535.0f)); > > > > dstU[i] = (ru*r + gu*g + bu*b + (0x10001 << (RGB2YUV_SHIFT - > 1))) >> RGB2YUV_SHIFT; > > dstV[i] = (rv*r + gv*g + bv*b + (0x10001 << (RGB2YUV_SHIFT - > 1))) >> RGB2YUV_SHIFT; > > @@ -1005,9 +1010,9 @@ static av_always_inline void > planar_rgbf32_to_y(uint8_t *_dst, const uint8_t *_s > > int32_t ry = rgb2yuv[RY_IDX], gy = rgb2yuv[GY_IDX], by = > rgb2yuv[BY_IDX]; > > > > for (i = 0; i < width; i++) { > > - int g = av_clip_uint16(lrintf(65535.0f * rdpx(src[0] + i))); > > - int b = av_clip_uint16(lrintf(65535.0f * rdpx(src[1] + i))); > > - int r = av_clip_uint16(lrintf(65535.0f * rdpx(src[2] + i))); > > + int g = lrintf(clampf(65535.0f * rdpx(src[0] + i), 0.0f, > 65535.0f)); > > + int b = lrintf(clampf(65535.0f * rdpx(src[1] + i), 0.0f, > 65535.0f)); > > + int r = lrintf(clampf(65535.0f * rdpx(src[2] + i), 0.0f, > 65535.0f)); > > > > dst[i] = (ry*r + gy*g + by*b + (0x2001 << (RGB2YUV_SHIFT - > 1))) >> RGB2YUV_SHIFT; > > } > > @@ -1021,7 +1026,7 @@ static av_always_inline void > grayf32ToY16_c(uint8_t *_dst, const uint8_t *_src, > > uint16_t *dst = (uint16_t *)_dst; > > > > for (i = 0; i < width; ++i){ > > - dst[i] = av_clip_uint16(lrintf(65535.0f * rdpx(src + i))); > > + dst[i] = lrintf(clampf(65535.0f * rdpx(src + i), 0.0f, > 65535.0f)); > > } > > } > > > > -- > > 2.31.1.windows.1 > > > > _______________________________________________ > > ffmpeg-devel mailing list > > ffmpeg-devel@ffmpeg.org > > https://ffmpeg.org/mailman/listinfo/ffmpeg-devel > > > > To unsubscribe, visit link above, or email > > ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe". > > > > _______________________________________________ > ffmpeg-devel mailing list > ffmpeg-devel@ffmpeg.org > https://ffmpeg.org/mailman/listinfo/ffmpeg-devel > > To unsubscribe, visit link above, or email > ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe". >
On Sat, Nov 13, 2021 at 7:24 PM Mark Reid <mindmark@gmail.com> wrote: > > > On Sat, Nov 13, 2021 at 7:02 PM James Almer <jamrial@gmail.com> wrote: > >> On 11/13/2021 11:56 PM, mindmark@gmail.com wrote: >> > From: Mark Reid <mindmark@gmail.com> >> > >> > if the float pixel * 65535.0f > 2147483647.0f >> > lrintf may overfow and return negative values, depending on >> implementation. >> > nan and +/-inf values may also be implementation defined >> > >> > clamp the value first so lrintf so, always works. >> > >> > values <=0.0f, -inf, nan = 0.0f >> > values >=1.0f, +inf = 1.0f >> > >> > the clamping adds some performance overhead, but using a inline function >> > seems help the compiler optimize on the compiliers I tested. >> > >> > old timings >> > 213920 UNITS in planar_rgbf32le_to_uv, 1 runs, 0 skips >> > 218830 UNITS in planar_rgbf32le_to_uv, 2 runs, 0 skips >> > 223285 UNITS in planar_rgbf32le_to_uv, 4 runs, 0 skips >> > 215405 UNITS in planar_rgbf32le_to_uv, 8 runs, 0 skips >> > 208920 UNITS in planar_rgbf32le_to_uv, 16 runs, 0 skips >> > 205115 UNITS in planar_rgbf32le_to_uv, 32 runs, 0 skips >> > 212220 UNITS in planar_rgbf32le_to_uv, 64 runs, 0 skips >> > >> > 216440 UNITS in planar_rgbf32be_to_uv, 1 runs, 0 skips >> > 222450 UNITS in planar_rgbf32be_to_uv, 2 runs, 0 skips >> > 228780 UNITS in planar_rgbf32be_to_uv, 4 runs, 0 skips >> > 226900 UNITS in planar_rgbf32be_to_uv, 8 runs, 0 skips >> > 223168 UNITS in planar_rgbf32be_to_uv, 16 runs, 0 skips >> > 249340 UNITS in planar_rgbf32be_to_uv, 32 runs, 0 skips >> > 233746 UNITS in planar_rgbf32be_to_uv, 64 runs, 0 skips >> > >> > 173360 UNITS in planar_rgbf32le_to_y, 1 runs, 0 skips >> > 179970 UNITS in planar_rgbf32le_to_y, 2 runs, 0 skips >> > 182960 UNITS in planar_rgbf32le_to_y, 4 runs, 0 skips >> > 177040 UNITS in planar_rgbf32le_to_y, 8 runs, 0 skips >> > 170351 UNITS in planar_rgbf32le_to_y, 16 runs, 0 skips >> > 167136 UNITS in planar_rgbf32le_to_y, 32 runs, 0 skips >> > 165821 UNITS in planar_rgbf32le_to_y, 64 runs, 0 skips >> > >> > 181040 UNITS in planar_rgbf32be_to_y, 1 runs, 0 skips >> > 182920 UNITS in planar_rgbf32be_to_y, 2 runs, 0 skips >> > 180935 UNITS in planar_rgbf32be_to_y, 4 runs, 0 skips >> > 180897 UNITS in planar_rgbf32be_to_y, 8 runs, 0 skips >> > 179640 UNITS in planar_rgbf32be_to_y, 16 runs, 0 skips >> > 178912 UNITS in planar_rgbf32be_to_y, 32 runs, 0 skips >> > 177983 UNITS in planar_rgbf32be_to_y, 64 runs, 0 skips >> > >> > new timings >> > 228860 UNITS in planar_rgbf32le_to_uv, 1 runs, 0 skips >> > 232400 UNITS in planar_rgbf32le_to_uv, 2 runs, 0 skips >> > 237270 UNITS in planar_rgbf32le_to_uv, 4 runs, 0 skips >> > 229992 UNITS in planar_rgbf32le_to_uv, 8 runs, 0 skips >> > 222270 UNITS in planar_rgbf32le_to_uv, 16 runs, 0 skips >> > 218896 UNITS in planar_rgbf32le_to_uv, 32 runs, 0 skips >> > 216938 UNITS in planar_rgbf32le_to_uv, 64 runs, 0 skips >> > >> > 232340 UNITS in planar_rgbf32be_to_uv, 1 runs, 0 skips >> > 231830 UNITS in planar_rgbf32be_to_uv, 2 runs, 0 skips >> > 242235 UNITS in planar_rgbf32be_to_uv, 4 runs, 0 skips >> > 235210 UNITS in planar_rgbf32be_to_uv, 8 runs, 0 skips >> > 229040 UNITS in planar_rgbf32be_to_uv, 16 runs, 0 skips >> > 224996 UNITS in planar_rgbf32be_to_uv, 32 runs, 0 skips >> > 223581 UNITS in planar_rgbf32be_to_uv, 64 runs, 0 skips >> > >> > 179220 UNITS in planar_rgbf32le_to_y, 1 runs, 0 skips >> > 174790 UNITS in planar_rgbf32le_to_y, 2 runs, 0 skips >> > 182630 UNITS in planar_rgbf32le_to_y, 4 runs, 0 skips >> > 183002 UNITS in planar_rgbf32le_to_y, 8 runs, 0 skips >> > 181005 UNITS in planar_rgbf32le_to_y, 16 runs, 0 skips >> > 179390 UNITS in planar_rgbf32le_to_y, 32 runs, 0 skips >> > 192476 UNITS in planar_rgbf32le_to_y, 64 runs, 0 skips >> > >> > 195620 UNITS in planar_rgbf32be_to_y, 1 runs, 0 skips >> > 195860 UNITS in planar_rgbf32be_to_y, 2 runs, 0 skips >> > 198700 UNITS in planar_rgbf32be_to_y, 4 runs, 0 skips >> > 197252 UNITS in planar_rgbf32be_to_y, 8 runs, 0 skips >> > 195702 UNITS in planar_rgbf32be_to_y, 16 runs, 0 skips >> > 194853 UNITS in planar_rgbf32be_to_y, 32 runs, 0 skips >> > 194459 UNITS in planar_rgbf32be_to_y, 64 runs, 0 skips >> > --- >> > libswscale/input.c | 21 +++++++++++++-------- >> > 1 file changed, 13 insertions(+), 8 deletions(-) >> > >> > diff --git a/libswscale/input.c b/libswscale/input.c >> > index 90efdd2ffc..2a13846abe 100644 >> > --- a/libswscale/input.c >> > +++ b/libswscale/input.c >> > @@ -966,6 +966,11 @@ static av_always_inline void >> planar_rgb16_to_uv(uint8_t *_dstU, uint8_t *_dstV, >> > >> > #define rdpx(src) (is_be ? av_int2float(AV_RB32(src)): >> av_int2float(AV_RL32(src))) >> > >> > +static av_always_inline float clampf(float x, float min, float max) >> >> Use av_clipf() instead. >> > > Thanks. I should have thought of that, I'll send a version > > I don't think av_clipf works for the NAN case. With my clamp function I'm exploiting that MAX changes the value that is passed MIN and that NAN > 0.0f == false. so the FFMAX(NAN, 0.0f) is 0.0f It's a subtle thing, but it maps NAN to zero. The x86 asm version of av_clipf will work if maxss is done before minss, but the c version returns NAN if the value is NAN. I'll see if anything breaks if I change the behavior of av_clipf > >> > +{ >> > + return FFMIN(FFMAX(x, min), max); >> > +} >> > + >> > static av_always_inline void planar_rgbf32_to_a(uint8_t *_dst, const >> uint8_t *_src[4], int width, int is_be, int32_t *rgb2yuv) >> > { >> > int i; >> > @@ -973,7 +978,7 @@ static av_always_inline void >> planar_rgbf32_to_a(uint8_t *_dst, const uint8_t *_s >> > uint16_t *dst = (uint16_t *)_dst; >> > >> > for (i = 0; i < width; i++) { >> > - dst[i] = av_clip_uint16(lrintf(65535.0f * rdpx(src[3] + i))); >> > + dst[i] = lrintf(clampf(65535.0f * rdpx(src[3] + i), 0.0f, >> 65535.0f)); >> > } >> > } >> > >> > @@ -987,9 +992,9 @@ static av_always_inline void >> planar_rgbf32_to_uv(uint8_t *_dstU, uint8_t *_dstV, >> > int32_t rv = rgb2yuv[RV_IDX], gv = rgb2yuv[GV_IDX], bv = >> rgb2yuv[BV_IDX]; >> > >> > for (i = 0; i < width; i++) { >> > - int g = av_clip_uint16(lrintf(65535.0f * rdpx(src[0] + i))); >> > - int b = av_clip_uint16(lrintf(65535.0f * rdpx(src[1] + i))); >> > - int r = av_clip_uint16(lrintf(65535.0f * rdpx(src[2] + i))); >> > + int g = lrintf(clampf(65535.0f * rdpx(src[0] + i), 0.0f, >> 65535.0f)); >> > + int b = lrintf(clampf(65535.0f * rdpx(src[1] + i), 0.0f, >> 65535.0f)); >> > + int r = lrintf(clampf(65535.0f * rdpx(src[2] + i), 0.0f, >> 65535.0f)); >> > >> > dstU[i] = (ru*r + gu*g + bu*b + (0x10001 << (RGB2YUV_SHIFT - >> 1))) >> RGB2YUV_SHIFT; >> > dstV[i] = (rv*r + gv*g + bv*b + (0x10001 << (RGB2YUV_SHIFT - >> 1))) >> RGB2YUV_SHIFT; >> > @@ -1005,9 +1010,9 @@ static av_always_inline void >> planar_rgbf32_to_y(uint8_t *_dst, const uint8_t *_s >> > int32_t ry = rgb2yuv[RY_IDX], gy = rgb2yuv[GY_IDX], by = >> rgb2yuv[BY_IDX]; >> > >> > for (i = 0; i < width; i++) { >> > - int g = av_clip_uint16(lrintf(65535.0f * rdpx(src[0] + i))); >> > - int b = av_clip_uint16(lrintf(65535.0f * rdpx(src[1] + i))); >> > - int r = av_clip_uint16(lrintf(65535.0f * rdpx(src[2] + i))); >> > + int g = lrintf(clampf(65535.0f * rdpx(src[0] + i), 0.0f, >> 65535.0f)); >> > + int b = lrintf(clampf(65535.0f * rdpx(src[1] + i), 0.0f, >> 65535.0f)); >> > + int r = lrintf(clampf(65535.0f * rdpx(src[2] + i), 0.0f, >> 65535.0f)); >> > >> > dst[i] = (ry*r + gy*g + by*b + (0x2001 << (RGB2YUV_SHIFT - >> 1))) >> RGB2YUV_SHIFT; >> > } >> > @@ -1021,7 +1026,7 @@ static av_always_inline void >> grayf32ToY16_c(uint8_t *_dst, const uint8_t *_src, >> > uint16_t *dst = (uint16_t *)_dst; >> > >> > for (i = 0; i < width; ++i){ >> > - dst[i] = av_clip_uint16(lrintf(65535.0f * rdpx(src + i))); >> > + dst[i] = lrintf(clampf(65535.0f * rdpx(src + i), 0.0f, >> 65535.0f)); >> > } >> > } >> > >> > -- >> > 2.31.1.windows.1 >> > >> > _______________________________________________ >> > ffmpeg-devel mailing list >> > ffmpeg-devel@ffmpeg.org >> > https://ffmpeg.org/mailman/listinfo/ffmpeg-devel >> > >> > To unsubscribe, visit link above, or email >> > ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe". >> > >> >> _______________________________________________ >> ffmpeg-devel mailing list >> ffmpeg-devel@ffmpeg.org >> https://ffmpeg.org/mailman/listinfo/ffmpeg-devel >> >> To unsubscribe, visit link above, or email >> ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe". >> >
diff --git a/libswscale/input.c b/libswscale/input.c index 90efdd2ffc..2a13846abe 100644 --- a/libswscale/input.c +++ b/libswscale/input.c @@ -966,6 +966,11 @@ static av_always_inline void planar_rgb16_to_uv(uint8_t *_dstU, uint8_t *_dstV, #define rdpx(src) (is_be ? av_int2float(AV_RB32(src)): av_int2float(AV_RL32(src))) +static av_always_inline float clampf(float x, float min, float max) +{ + return FFMIN(FFMAX(x, min), max); +} + static av_always_inline void planar_rgbf32_to_a(uint8_t *_dst, const uint8_t *_src[4], int width, int is_be, int32_t *rgb2yuv) { int i; @@ -973,7 +978,7 @@ static av_always_inline void planar_rgbf32_to_a(uint8_t *_dst, const uint8_t *_s uint16_t *dst = (uint16_t *)_dst; for (i = 0; i < width; i++) { - dst[i] = av_clip_uint16(lrintf(65535.0f * rdpx(src[3] + i))); + dst[i] = lrintf(clampf(65535.0f * rdpx(src[3] + i), 0.0f, 65535.0f)); } } @@ -987,9 +992,9 @@ static av_always_inline void planar_rgbf32_to_uv(uint8_t *_dstU, uint8_t *_dstV, int32_t rv = rgb2yuv[RV_IDX], gv = rgb2yuv[GV_IDX], bv = rgb2yuv[BV_IDX]; for (i = 0; i < width; i++) { - int g = av_clip_uint16(lrintf(65535.0f * rdpx(src[0] + i))); - int b = av_clip_uint16(lrintf(65535.0f * rdpx(src[1] + i))); - int r = av_clip_uint16(lrintf(65535.0f * rdpx(src[2] + i))); + int g = lrintf(clampf(65535.0f * rdpx(src[0] + i), 0.0f, 65535.0f)); + int b = lrintf(clampf(65535.0f * rdpx(src[1] + i), 0.0f, 65535.0f)); + int r = lrintf(clampf(65535.0f * rdpx(src[2] + i), 0.0f, 65535.0f)); dstU[i] = (ru*r + gu*g + bu*b + (0x10001 << (RGB2YUV_SHIFT - 1))) >> RGB2YUV_SHIFT; dstV[i] = (rv*r + gv*g + bv*b + (0x10001 << (RGB2YUV_SHIFT - 1))) >> RGB2YUV_SHIFT; @@ -1005,9 +1010,9 @@ static av_always_inline void planar_rgbf32_to_y(uint8_t *_dst, const uint8_t *_s int32_t ry = rgb2yuv[RY_IDX], gy = rgb2yuv[GY_IDX], by = rgb2yuv[BY_IDX]; for (i = 0; i < width; i++) { - int g = av_clip_uint16(lrintf(65535.0f * rdpx(src[0] + i))); - int b = av_clip_uint16(lrintf(65535.0f * rdpx(src[1] + i))); - int r = av_clip_uint16(lrintf(65535.0f * rdpx(src[2] + i))); + int g = lrintf(clampf(65535.0f * rdpx(src[0] + i), 0.0f, 65535.0f)); + int b = lrintf(clampf(65535.0f * rdpx(src[1] + i), 0.0f, 65535.0f)); + int r = lrintf(clampf(65535.0f * rdpx(src[2] + i), 0.0f, 65535.0f)); dst[i] = (ry*r + gy*g + by*b + (0x2001 << (RGB2YUV_SHIFT - 1))) >> RGB2YUV_SHIFT; } @@ -1021,7 +1026,7 @@ static av_always_inline void grayf32ToY16_c(uint8_t *_dst, const uint8_t *_src, uint16_t *dst = (uint16_t *)_dst; for (i = 0; i < width; ++i){ - dst[i] = av_clip_uint16(lrintf(65535.0f * rdpx(src + i))); + dst[i] = lrintf(clampf(65535.0f * rdpx(src + i), 0.0f, 65535.0f)); } }
From: Mark Reid <mindmark@gmail.com> if the float pixel * 65535.0f > 2147483647.0f lrintf may overfow and return negative values, depending on implementation. nan and +/-inf values may also be implementation defined clamp the value first so lrintf so, always works. values <=0.0f, -inf, nan = 0.0f values >=1.0f, +inf = 1.0f the clamping adds some performance overhead, but using a inline function seems help the compiler optimize on the compiliers I tested. old timings 213920 UNITS in planar_rgbf32le_to_uv, 1 runs, 0 skips 218830 UNITS in planar_rgbf32le_to_uv, 2 runs, 0 skips 223285 UNITS in planar_rgbf32le_to_uv, 4 runs, 0 skips 215405 UNITS in planar_rgbf32le_to_uv, 8 runs, 0 skips 208920 UNITS in planar_rgbf32le_to_uv, 16 runs, 0 skips 205115 UNITS in planar_rgbf32le_to_uv, 32 runs, 0 skips 212220 UNITS in planar_rgbf32le_to_uv, 64 runs, 0 skips 216440 UNITS in planar_rgbf32be_to_uv, 1 runs, 0 skips 222450 UNITS in planar_rgbf32be_to_uv, 2 runs, 0 skips 228780 UNITS in planar_rgbf32be_to_uv, 4 runs, 0 skips 226900 UNITS in planar_rgbf32be_to_uv, 8 runs, 0 skips 223168 UNITS in planar_rgbf32be_to_uv, 16 runs, 0 skips 249340 UNITS in planar_rgbf32be_to_uv, 32 runs, 0 skips 233746 UNITS in planar_rgbf32be_to_uv, 64 runs, 0 skips 173360 UNITS in planar_rgbf32le_to_y, 1 runs, 0 skips 179970 UNITS in planar_rgbf32le_to_y, 2 runs, 0 skips 182960 UNITS in planar_rgbf32le_to_y, 4 runs, 0 skips 177040 UNITS in planar_rgbf32le_to_y, 8 runs, 0 skips 170351 UNITS in planar_rgbf32le_to_y, 16 runs, 0 skips 167136 UNITS in planar_rgbf32le_to_y, 32 runs, 0 skips 165821 UNITS in planar_rgbf32le_to_y, 64 runs, 0 skips 181040 UNITS in planar_rgbf32be_to_y, 1 runs, 0 skips 182920 UNITS in planar_rgbf32be_to_y, 2 runs, 0 skips 180935 UNITS in planar_rgbf32be_to_y, 4 runs, 0 skips 180897 UNITS in planar_rgbf32be_to_y, 8 runs, 0 skips 179640 UNITS in planar_rgbf32be_to_y, 16 runs, 0 skips 178912 UNITS in planar_rgbf32be_to_y, 32 runs, 0 skips 177983 UNITS in planar_rgbf32be_to_y, 64 runs, 0 skips new timings 228860 UNITS in planar_rgbf32le_to_uv, 1 runs, 0 skips 232400 UNITS in planar_rgbf32le_to_uv, 2 runs, 0 skips 237270 UNITS in planar_rgbf32le_to_uv, 4 runs, 0 skips 229992 UNITS in planar_rgbf32le_to_uv, 8 runs, 0 skips 222270 UNITS in planar_rgbf32le_to_uv, 16 runs, 0 skips 218896 UNITS in planar_rgbf32le_to_uv, 32 runs, 0 skips 216938 UNITS in planar_rgbf32le_to_uv, 64 runs, 0 skips 232340 UNITS in planar_rgbf32be_to_uv, 1 runs, 0 skips 231830 UNITS in planar_rgbf32be_to_uv, 2 runs, 0 skips 242235 UNITS in planar_rgbf32be_to_uv, 4 runs, 0 skips 235210 UNITS in planar_rgbf32be_to_uv, 8 runs, 0 skips 229040 UNITS in planar_rgbf32be_to_uv, 16 runs, 0 skips 224996 UNITS in planar_rgbf32be_to_uv, 32 runs, 0 skips 223581 UNITS in planar_rgbf32be_to_uv, 64 runs, 0 skips 179220 UNITS in planar_rgbf32le_to_y, 1 runs, 0 skips 174790 UNITS in planar_rgbf32le_to_y, 2 runs, 0 skips 182630 UNITS in planar_rgbf32le_to_y, 4 runs, 0 skips 183002 UNITS in planar_rgbf32le_to_y, 8 runs, 0 skips 181005 UNITS in planar_rgbf32le_to_y, 16 runs, 0 skips 179390 UNITS in planar_rgbf32le_to_y, 32 runs, 0 skips 192476 UNITS in planar_rgbf32le_to_y, 64 runs, 0 skips 195620 UNITS in planar_rgbf32be_to_y, 1 runs, 0 skips 195860 UNITS in planar_rgbf32be_to_y, 2 runs, 0 skips 198700 UNITS in planar_rgbf32be_to_y, 4 runs, 0 skips 197252 UNITS in planar_rgbf32be_to_y, 8 runs, 0 skips 195702 UNITS in planar_rgbf32be_to_y, 16 runs, 0 skips 194853 UNITS in planar_rgbf32be_to_y, 32 runs, 0 skips 194459 UNITS in planar_rgbf32be_to_y, 64 runs, 0 skips --- libswscale/input.c | 21 +++++++++++++-------- 1 file changed, 13 insertions(+), 8 deletions(-) -- 2.31.1.windows.1