diff mbox series

[FFmpeg-devel,v2,2/2] swscale/input: clamp rgbf32 values before lrintf

Message ID 20211114025653.654-2-mindmark@gmail.com
State New
Headers show
Series [FFmpeg-devel,v2,1/2] swscale/input: unify grayf32 funcs with rgbf32 funcs
Related show

Checks

Context Check Description
andriy/make_x86 success Make finished
andriy/make_fate_x86 success Make fate finished
andriy/make_ppc success Make finished
andriy/make_fate_ppc success Make fate finished

Commit Message

Mark Reid Nov. 14, 2021, 2:56 a.m. UTC
From: Mark Reid <mindmark@gmail.com>

if the float pixel * 65535.0f > 2147483647.0f
lrintf may overfow and return negative values, depending on implementation.
nan and +/-inf values may also be implementation defined

clamp the value first so lrintf so, always works.

values <=0.0f, -inf, nan = 0.0f
values >=1.0f, +inf      = 1.0f

the clamping adds some performance overhead, but using a inline function
seems help the compiler optimize on the compiliers I tested.

old timings
 213920 UNITS in planar_rgbf32le_to_uv,       1 runs,      0 skips
 218830 UNITS in planar_rgbf32le_to_uv,       2 runs,      0 skips
 223285 UNITS in planar_rgbf32le_to_uv,       4 runs,      0 skips
 215405 UNITS in planar_rgbf32le_to_uv,       8 runs,      0 skips
 208920 UNITS in planar_rgbf32le_to_uv,      16 runs,      0 skips
 205115 UNITS in planar_rgbf32le_to_uv,      32 runs,      0 skips
 212220 UNITS in planar_rgbf32le_to_uv,      64 runs,      0 skips

 216440 UNITS in planar_rgbf32be_to_uv,       1 runs,      0 skips
 222450 UNITS in planar_rgbf32be_to_uv,       2 runs,      0 skips
 228780 UNITS in planar_rgbf32be_to_uv,       4 runs,      0 skips
 226900 UNITS in planar_rgbf32be_to_uv,       8 runs,      0 skips
 223168 UNITS in planar_rgbf32be_to_uv,      16 runs,      0 skips
 249340 UNITS in planar_rgbf32be_to_uv,      32 runs,      0 skips
 233746 UNITS in planar_rgbf32be_to_uv,      64 runs,      0 skips

 173360 UNITS in planar_rgbf32le_to_y,       1 runs,      0 skips
 179970 UNITS in planar_rgbf32le_to_y,       2 runs,      0 skips
 182960 UNITS in planar_rgbf32le_to_y,       4 runs,      0 skips
 177040 UNITS in planar_rgbf32le_to_y,       8 runs,      0 skips
 170351 UNITS in planar_rgbf32le_to_y,      16 runs,      0 skips
 167136 UNITS in planar_rgbf32le_to_y,      32 runs,      0 skips
 165821 UNITS in planar_rgbf32le_to_y,      64 runs,      0 skips

 181040 UNITS in planar_rgbf32be_to_y,       1 runs,      0 skips
 182920 UNITS in planar_rgbf32be_to_y,       2 runs,      0 skips
 180935 UNITS in planar_rgbf32be_to_y,       4 runs,      0 skips
 180897 UNITS in planar_rgbf32be_to_y,       8 runs,      0 skips
 179640 UNITS in planar_rgbf32be_to_y,      16 runs,      0 skips
 178912 UNITS in planar_rgbf32be_to_y,      32 runs,      0 skips
 177983 UNITS in planar_rgbf32be_to_y,      64 runs,      0 skips

new timings
 228860 UNITS in planar_rgbf32le_to_uv,       1 runs,      0 skips
 232400 UNITS in planar_rgbf32le_to_uv,       2 runs,      0 skips
 237270 UNITS in planar_rgbf32le_to_uv,       4 runs,      0 skips
 229992 UNITS in planar_rgbf32le_to_uv,       8 runs,      0 skips
 222270 UNITS in planar_rgbf32le_to_uv,      16 runs,      0 skips
 218896 UNITS in planar_rgbf32le_to_uv,      32 runs,      0 skips
 216938 UNITS in planar_rgbf32le_to_uv,      64 runs,      0 skips

 232340 UNITS in planar_rgbf32be_to_uv,       1 runs,      0 skips
 231830 UNITS in planar_rgbf32be_to_uv,       2 runs,      0 skips
 242235 UNITS in planar_rgbf32be_to_uv,       4 runs,      0 skips
 235210 UNITS in planar_rgbf32be_to_uv,       8 runs,      0 skips
 229040 UNITS in planar_rgbf32be_to_uv,      16 runs,      0 skips
 224996 UNITS in planar_rgbf32be_to_uv,      32 runs,      0 skips
 223581 UNITS in planar_rgbf32be_to_uv,      64 runs,      0 skips

 179220 UNITS in planar_rgbf32le_to_y,       1 runs,      0 skips
 174790 UNITS in planar_rgbf32le_to_y,       2 runs,      0 skips
 182630 UNITS in planar_rgbf32le_to_y,       4 runs,      0 skips
 183002 UNITS in planar_rgbf32le_to_y,       8 runs,      0 skips
 181005 UNITS in planar_rgbf32le_to_y,      16 runs,      0 skips
 179390 UNITS in planar_rgbf32le_to_y,      32 runs,      0 skips
 192476 UNITS in planar_rgbf32le_to_y,      64 runs,      0 skips

 195620 UNITS in planar_rgbf32be_to_y,       1 runs,      0 skips
 195860 UNITS in planar_rgbf32be_to_y,       2 runs,      0 skips
 198700 UNITS in planar_rgbf32be_to_y,       4 runs,      0 skips
 197252 UNITS in planar_rgbf32be_to_y,       8 runs,      0 skips
 195702 UNITS in planar_rgbf32be_to_y,      16 runs,      0 skips
 194853 UNITS in planar_rgbf32be_to_y,      32 runs,      0 skips
 194459 UNITS in planar_rgbf32be_to_y,      64 runs,      0 skips
---
 libswscale/input.c | 21 +++++++++++++--------
 1 file changed, 13 insertions(+), 8 deletions(-)

--
2.31.1.windows.1

Comments

James Almer Nov. 14, 2021, 3:02 a.m. UTC | #1
On 11/13/2021 11:56 PM, mindmark@gmail.com wrote:
> From: Mark Reid <mindmark@gmail.com>
> 
> if the float pixel * 65535.0f > 2147483647.0f
> lrintf may overfow and return negative values, depending on implementation.
> nan and +/-inf values may also be implementation defined
> 
> clamp the value first so lrintf so, always works.
> 
> values <=0.0f, -inf, nan = 0.0f
> values >=1.0f, +inf      = 1.0f
> 
> the clamping adds some performance overhead, but using a inline function
> seems help the compiler optimize on the compiliers I tested.
> 
> old timings
>   213920 UNITS in planar_rgbf32le_to_uv,       1 runs,      0 skips
>   218830 UNITS in planar_rgbf32le_to_uv,       2 runs,      0 skips
>   223285 UNITS in planar_rgbf32le_to_uv,       4 runs,      0 skips
>   215405 UNITS in planar_rgbf32le_to_uv,       8 runs,      0 skips
>   208920 UNITS in planar_rgbf32le_to_uv,      16 runs,      0 skips
>   205115 UNITS in planar_rgbf32le_to_uv,      32 runs,      0 skips
>   212220 UNITS in planar_rgbf32le_to_uv,      64 runs,      0 skips
> 
>   216440 UNITS in planar_rgbf32be_to_uv,       1 runs,      0 skips
>   222450 UNITS in planar_rgbf32be_to_uv,       2 runs,      0 skips
>   228780 UNITS in planar_rgbf32be_to_uv,       4 runs,      0 skips
>   226900 UNITS in planar_rgbf32be_to_uv,       8 runs,      0 skips
>   223168 UNITS in planar_rgbf32be_to_uv,      16 runs,      0 skips
>   249340 UNITS in planar_rgbf32be_to_uv,      32 runs,      0 skips
>   233746 UNITS in planar_rgbf32be_to_uv,      64 runs,      0 skips
> 
>   173360 UNITS in planar_rgbf32le_to_y,       1 runs,      0 skips
>   179970 UNITS in planar_rgbf32le_to_y,       2 runs,      0 skips
>   182960 UNITS in planar_rgbf32le_to_y,       4 runs,      0 skips
>   177040 UNITS in planar_rgbf32le_to_y,       8 runs,      0 skips
>   170351 UNITS in planar_rgbf32le_to_y,      16 runs,      0 skips
>   167136 UNITS in planar_rgbf32le_to_y,      32 runs,      0 skips
>   165821 UNITS in planar_rgbf32le_to_y,      64 runs,      0 skips
> 
>   181040 UNITS in planar_rgbf32be_to_y,       1 runs,      0 skips
>   182920 UNITS in planar_rgbf32be_to_y,       2 runs,      0 skips
>   180935 UNITS in planar_rgbf32be_to_y,       4 runs,      0 skips
>   180897 UNITS in planar_rgbf32be_to_y,       8 runs,      0 skips
>   179640 UNITS in planar_rgbf32be_to_y,      16 runs,      0 skips
>   178912 UNITS in planar_rgbf32be_to_y,      32 runs,      0 skips
>   177983 UNITS in planar_rgbf32be_to_y,      64 runs,      0 skips
> 
> new timings
>   228860 UNITS in planar_rgbf32le_to_uv,       1 runs,      0 skips
>   232400 UNITS in planar_rgbf32le_to_uv,       2 runs,      0 skips
>   237270 UNITS in planar_rgbf32le_to_uv,       4 runs,      0 skips
>   229992 UNITS in planar_rgbf32le_to_uv,       8 runs,      0 skips
>   222270 UNITS in planar_rgbf32le_to_uv,      16 runs,      0 skips
>   218896 UNITS in planar_rgbf32le_to_uv,      32 runs,      0 skips
>   216938 UNITS in planar_rgbf32le_to_uv,      64 runs,      0 skips
> 
>   232340 UNITS in planar_rgbf32be_to_uv,       1 runs,      0 skips
>   231830 UNITS in planar_rgbf32be_to_uv,       2 runs,      0 skips
>   242235 UNITS in planar_rgbf32be_to_uv,       4 runs,      0 skips
>   235210 UNITS in planar_rgbf32be_to_uv,       8 runs,      0 skips
>   229040 UNITS in planar_rgbf32be_to_uv,      16 runs,      0 skips
>   224996 UNITS in planar_rgbf32be_to_uv,      32 runs,      0 skips
>   223581 UNITS in planar_rgbf32be_to_uv,      64 runs,      0 skips
> 
>   179220 UNITS in planar_rgbf32le_to_y,       1 runs,      0 skips
>   174790 UNITS in planar_rgbf32le_to_y,       2 runs,      0 skips
>   182630 UNITS in planar_rgbf32le_to_y,       4 runs,      0 skips
>   183002 UNITS in planar_rgbf32le_to_y,       8 runs,      0 skips
>   181005 UNITS in planar_rgbf32le_to_y,      16 runs,      0 skips
>   179390 UNITS in planar_rgbf32le_to_y,      32 runs,      0 skips
>   192476 UNITS in planar_rgbf32le_to_y,      64 runs,      0 skips
> 
>   195620 UNITS in planar_rgbf32be_to_y,       1 runs,      0 skips
>   195860 UNITS in planar_rgbf32be_to_y,       2 runs,      0 skips
>   198700 UNITS in planar_rgbf32be_to_y,       4 runs,      0 skips
>   197252 UNITS in planar_rgbf32be_to_y,       8 runs,      0 skips
>   195702 UNITS in planar_rgbf32be_to_y,      16 runs,      0 skips
>   194853 UNITS in planar_rgbf32be_to_y,      32 runs,      0 skips
>   194459 UNITS in planar_rgbf32be_to_y,      64 runs,      0 skips
> ---
>   libswscale/input.c | 21 +++++++++++++--------
>   1 file changed, 13 insertions(+), 8 deletions(-)
> 
> diff --git a/libswscale/input.c b/libswscale/input.c
> index 90efdd2ffc..2a13846abe 100644
> --- a/libswscale/input.c
> +++ b/libswscale/input.c
> @@ -966,6 +966,11 @@ static av_always_inline void planar_rgb16_to_uv(uint8_t *_dstU, uint8_t *_dstV,
> 
>   #define rdpx(src) (is_be ? av_int2float(AV_RB32(src)): av_int2float(AV_RL32(src)))
> 
> +static av_always_inline float clampf(float x, float min, float max)

Use av_clipf() instead.

> +{
> +    return FFMIN(FFMAX(x, min), max);
> +}
> +
>   static av_always_inline void planar_rgbf32_to_a(uint8_t *_dst, const uint8_t *_src[4], int width, int is_be, int32_t *rgb2yuv)
>   {
>       int i;
> @@ -973,7 +978,7 @@ static av_always_inline void planar_rgbf32_to_a(uint8_t *_dst, const uint8_t *_s
>       uint16_t *dst        = (uint16_t *)_dst;
> 
>       for (i = 0; i < width; i++) {
> -        dst[i] = av_clip_uint16(lrintf(65535.0f * rdpx(src[3] + i)));
> +        dst[i] = lrintf(clampf(65535.0f * rdpx(src[3] + i), 0.0f, 65535.0f));
>       }
>   }
> 
> @@ -987,9 +992,9 @@ static av_always_inline void planar_rgbf32_to_uv(uint8_t *_dstU, uint8_t *_dstV,
>       int32_t rv = rgb2yuv[RV_IDX], gv = rgb2yuv[GV_IDX], bv = rgb2yuv[BV_IDX];
> 
>       for (i = 0; i < width; i++) {
> -        int g = av_clip_uint16(lrintf(65535.0f * rdpx(src[0] + i)));
> -        int b = av_clip_uint16(lrintf(65535.0f * rdpx(src[1] + i)));
> -        int r = av_clip_uint16(lrintf(65535.0f * rdpx(src[2] + i)));
> +        int g = lrintf(clampf(65535.0f * rdpx(src[0] + i), 0.0f, 65535.0f));
> +        int b = lrintf(clampf(65535.0f * rdpx(src[1] + i), 0.0f, 65535.0f));
> +        int r = lrintf(clampf(65535.0f * rdpx(src[2] + i), 0.0f, 65535.0f));
> 
>           dstU[i] = (ru*r + gu*g + bu*b + (0x10001 << (RGB2YUV_SHIFT - 1))) >> RGB2YUV_SHIFT;
>           dstV[i] = (rv*r + gv*g + bv*b + (0x10001 << (RGB2YUV_SHIFT - 1))) >> RGB2YUV_SHIFT;
> @@ -1005,9 +1010,9 @@ static av_always_inline void planar_rgbf32_to_y(uint8_t *_dst, const uint8_t *_s
>       int32_t ry = rgb2yuv[RY_IDX], gy = rgb2yuv[GY_IDX], by = rgb2yuv[BY_IDX];
> 
>       for (i = 0; i < width; i++) {
> -        int g = av_clip_uint16(lrintf(65535.0f * rdpx(src[0] + i)));
> -        int b = av_clip_uint16(lrintf(65535.0f * rdpx(src[1] + i)));
> -        int r = av_clip_uint16(lrintf(65535.0f * rdpx(src[2] + i)));
> +        int g = lrintf(clampf(65535.0f * rdpx(src[0] + i), 0.0f, 65535.0f));
> +        int b = lrintf(clampf(65535.0f * rdpx(src[1] + i), 0.0f, 65535.0f));
> +        int r = lrintf(clampf(65535.0f * rdpx(src[2] + i), 0.0f, 65535.0f));
> 
>           dst[i] = (ry*r + gy*g + by*b + (0x2001 << (RGB2YUV_SHIFT - 1))) >> RGB2YUV_SHIFT;
>       }
> @@ -1021,7 +1026,7 @@ static av_always_inline void grayf32ToY16_c(uint8_t *_dst, const uint8_t *_src,
>       uint16_t *dst    = (uint16_t *)_dst;
> 
>       for (i = 0; i < width; ++i){
> -        dst[i] = av_clip_uint16(lrintf(65535.0f * rdpx(src + i)));
> +        dst[i] = lrintf(clampf(65535.0f * rdpx(src + i), 0.0f,  65535.0f));
>       }
>   }
> 
> --
> 2.31.1.windows.1
> 
> _______________________________________________
> ffmpeg-devel mailing list
> ffmpeg-devel@ffmpeg.org
> https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
> 
> To unsubscribe, visit link above, or email
> ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".
>
Mark Reid Nov. 14, 2021, 3:24 a.m. UTC | #2
On Sat, Nov 13, 2021 at 7:02 PM James Almer <jamrial@gmail.com> wrote:

> On 11/13/2021 11:56 PM, mindmark@gmail.com wrote:
> > From: Mark Reid <mindmark@gmail.com>
> >
> > if the float pixel * 65535.0f > 2147483647.0f
> > lrintf may overfow and return negative values, depending on
> implementation.
> > nan and +/-inf values may also be implementation defined
> >
> > clamp the value first so lrintf so, always works.
> >
> > values <=0.0f, -inf, nan = 0.0f
> > values >=1.0f, +inf      = 1.0f
> >
> > the clamping adds some performance overhead, but using a inline function
> > seems help the compiler optimize on the compiliers I tested.
> >
> > old timings
> >   213920 UNITS in planar_rgbf32le_to_uv,       1 runs,      0 skips
> >   218830 UNITS in planar_rgbf32le_to_uv,       2 runs,      0 skips
> >   223285 UNITS in planar_rgbf32le_to_uv,       4 runs,      0 skips
> >   215405 UNITS in planar_rgbf32le_to_uv,       8 runs,      0 skips
> >   208920 UNITS in planar_rgbf32le_to_uv,      16 runs,      0 skips
> >   205115 UNITS in planar_rgbf32le_to_uv,      32 runs,      0 skips
> >   212220 UNITS in planar_rgbf32le_to_uv,      64 runs,      0 skips
> >
> >   216440 UNITS in planar_rgbf32be_to_uv,       1 runs,      0 skips
> >   222450 UNITS in planar_rgbf32be_to_uv,       2 runs,      0 skips
> >   228780 UNITS in planar_rgbf32be_to_uv,       4 runs,      0 skips
> >   226900 UNITS in planar_rgbf32be_to_uv,       8 runs,      0 skips
> >   223168 UNITS in planar_rgbf32be_to_uv,      16 runs,      0 skips
> >   249340 UNITS in planar_rgbf32be_to_uv,      32 runs,      0 skips
> >   233746 UNITS in planar_rgbf32be_to_uv,      64 runs,      0 skips
> >
> >   173360 UNITS in planar_rgbf32le_to_y,       1 runs,      0 skips
> >   179970 UNITS in planar_rgbf32le_to_y,       2 runs,      0 skips
> >   182960 UNITS in planar_rgbf32le_to_y,       4 runs,      0 skips
> >   177040 UNITS in planar_rgbf32le_to_y,       8 runs,      0 skips
> >   170351 UNITS in planar_rgbf32le_to_y,      16 runs,      0 skips
> >   167136 UNITS in planar_rgbf32le_to_y,      32 runs,      0 skips
> >   165821 UNITS in planar_rgbf32le_to_y,      64 runs,      0 skips
> >
> >   181040 UNITS in planar_rgbf32be_to_y,       1 runs,      0 skips
> >   182920 UNITS in planar_rgbf32be_to_y,       2 runs,      0 skips
> >   180935 UNITS in planar_rgbf32be_to_y,       4 runs,      0 skips
> >   180897 UNITS in planar_rgbf32be_to_y,       8 runs,      0 skips
> >   179640 UNITS in planar_rgbf32be_to_y,      16 runs,      0 skips
> >   178912 UNITS in planar_rgbf32be_to_y,      32 runs,      0 skips
> >   177983 UNITS in planar_rgbf32be_to_y,      64 runs,      0 skips
> >
> > new timings
> >   228860 UNITS in planar_rgbf32le_to_uv,       1 runs,      0 skips
> >   232400 UNITS in planar_rgbf32le_to_uv,       2 runs,      0 skips
> >   237270 UNITS in planar_rgbf32le_to_uv,       4 runs,      0 skips
> >   229992 UNITS in planar_rgbf32le_to_uv,       8 runs,      0 skips
> >   222270 UNITS in planar_rgbf32le_to_uv,      16 runs,      0 skips
> >   218896 UNITS in planar_rgbf32le_to_uv,      32 runs,      0 skips
> >   216938 UNITS in planar_rgbf32le_to_uv,      64 runs,      0 skips
> >
> >   232340 UNITS in planar_rgbf32be_to_uv,       1 runs,      0 skips
> >   231830 UNITS in planar_rgbf32be_to_uv,       2 runs,      0 skips
> >   242235 UNITS in planar_rgbf32be_to_uv,       4 runs,      0 skips
> >   235210 UNITS in planar_rgbf32be_to_uv,       8 runs,      0 skips
> >   229040 UNITS in planar_rgbf32be_to_uv,      16 runs,      0 skips
> >   224996 UNITS in planar_rgbf32be_to_uv,      32 runs,      0 skips
> >   223581 UNITS in planar_rgbf32be_to_uv,      64 runs,      0 skips
> >
> >   179220 UNITS in planar_rgbf32le_to_y,       1 runs,      0 skips
> >   174790 UNITS in planar_rgbf32le_to_y,       2 runs,      0 skips
> >   182630 UNITS in planar_rgbf32le_to_y,       4 runs,      0 skips
> >   183002 UNITS in planar_rgbf32le_to_y,       8 runs,      0 skips
> >   181005 UNITS in planar_rgbf32le_to_y,      16 runs,      0 skips
> >   179390 UNITS in planar_rgbf32le_to_y,      32 runs,      0 skips
> >   192476 UNITS in planar_rgbf32le_to_y,      64 runs,      0 skips
> >
> >   195620 UNITS in planar_rgbf32be_to_y,       1 runs,      0 skips
> >   195860 UNITS in planar_rgbf32be_to_y,       2 runs,      0 skips
> >   198700 UNITS in planar_rgbf32be_to_y,       4 runs,      0 skips
> >   197252 UNITS in planar_rgbf32be_to_y,       8 runs,      0 skips
> >   195702 UNITS in planar_rgbf32be_to_y,      16 runs,      0 skips
> >   194853 UNITS in planar_rgbf32be_to_y,      32 runs,      0 skips
> >   194459 UNITS in planar_rgbf32be_to_y,      64 runs,      0 skips
> > ---
> >   libswscale/input.c | 21 +++++++++++++--------
> >   1 file changed, 13 insertions(+), 8 deletions(-)
> >
> > diff --git a/libswscale/input.c b/libswscale/input.c
> > index 90efdd2ffc..2a13846abe 100644
> > --- a/libswscale/input.c
> > +++ b/libswscale/input.c
> > @@ -966,6 +966,11 @@ static av_always_inline void
> planar_rgb16_to_uv(uint8_t *_dstU, uint8_t *_dstV,
> >
> >   #define rdpx(src) (is_be ? av_int2float(AV_RB32(src)):
> av_int2float(AV_RL32(src)))
> >
> > +static av_always_inline float clampf(float x, float min, float max)
>
> Use av_clipf() instead.
>

Thanks. I should have thought of that, I'll send a version


>
> > +{
> > +    return FFMIN(FFMAX(x, min), max);
> > +}
> > +
> >   static av_always_inline void planar_rgbf32_to_a(uint8_t *_dst, const
> uint8_t *_src[4], int width, int is_be, int32_t *rgb2yuv)
> >   {
> >       int i;
> > @@ -973,7 +978,7 @@ static av_always_inline void
> planar_rgbf32_to_a(uint8_t *_dst, const uint8_t *_s
> >       uint16_t *dst        = (uint16_t *)_dst;
> >
> >       for (i = 0; i < width; i++) {
> > -        dst[i] = av_clip_uint16(lrintf(65535.0f * rdpx(src[3] + i)));
> > +        dst[i] = lrintf(clampf(65535.0f * rdpx(src[3] + i), 0.0f,
> 65535.0f));
> >       }
> >   }
> >
> > @@ -987,9 +992,9 @@ static av_always_inline void
> planar_rgbf32_to_uv(uint8_t *_dstU, uint8_t *_dstV,
> >       int32_t rv = rgb2yuv[RV_IDX], gv = rgb2yuv[GV_IDX], bv =
> rgb2yuv[BV_IDX];
> >
> >       for (i = 0; i < width; i++) {
> > -        int g = av_clip_uint16(lrintf(65535.0f * rdpx(src[0] + i)));
> > -        int b = av_clip_uint16(lrintf(65535.0f * rdpx(src[1] + i)));
> > -        int r = av_clip_uint16(lrintf(65535.0f * rdpx(src[2] + i)));
> > +        int g = lrintf(clampf(65535.0f * rdpx(src[0] + i), 0.0f,
> 65535.0f));
> > +        int b = lrintf(clampf(65535.0f * rdpx(src[1] + i), 0.0f,
> 65535.0f));
> > +        int r = lrintf(clampf(65535.0f * rdpx(src[2] + i), 0.0f,
> 65535.0f));
> >
> >           dstU[i] = (ru*r + gu*g + bu*b + (0x10001 << (RGB2YUV_SHIFT -
> 1))) >> RGB2YUV_SHIFT;
> >           dstV[i] = (rv*r + gv*g + bv*b + (0x10001 << (RGB2YUV_SHIFT -
> 1))) >> RGB2YUV_SHIFT;
> > @@ -1005,9 +1010,9 @@ static av_always_inline void
> planar_rgbf32_to_y(uint8_t *_dst, const uint8_t *_s
> >       int32_t ry = rgb2yuv[RY_IDX], gy = rgb2yuv[GY_IDX], by =
> rgb2yuv[BY_IDX];
> >
> >       for (i = 0; i < width; i++) {
> > -        int g = av_clip_uint16(lrintf(65535.0f * rdpx(src[0] + i)));
> > -        int b = av_clip_uint16(lrintf(65535.0f * rdpx(src[1] + i)));
> > -        int r = av_clip_uint16(lrintf(65535.0f * rdpx(src[2] + i)));
> > +        int g = lrintf(clampf(65535.0f * rdpx(src[0] + i), 0.0f,
> 65535.0f));
> > +        int b = lrintf(clampf(65535.0f * rdpx(src[1] + i), 0.0f,
> 65535.0f));
> > +        int r = lrintf(clampf(65535.0f * rdpx(src[2] + i), 0.0f,
> 65535.0f));
> >
> >           dst[i] = (ry*r + gy*g + by*b + (0x2001 << (RGB2YUV_SHIFT -
> 1))) >> RGB2YUV_SHIFT;
> >       }
> > @@ -1021,7 +1026,7 @@ static av_always_inline void
> grayf32ToY16_c(uint8_t *_dst, const uint8_t *_src,
> >       uint16_t *dst    = (uint16_t *)_dst;
> >
> >       for (i = 0; i < width; ++i){
> > -        dst[i] = av_clip_uint16(lrintf(65535.0f * rdpx(src + i)));
> > +        dst[i] = lrintf(clampf(65535.0f * rdpx(src + i), 0.0f,
> 65535.0f));
> >       }
> >   }
> >
> > --
> > 2.31.1.windows.1
> >
> > _______________________________________________
> > ffmpeg-devel mailing list
> > ffmpeg-devel@ffmpeg.org
> > https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
> >
> > To unsubscribe, visit link above, or email
> > ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".
> >
>
> _______________________________________________
> ffmpeg-devel mailing list
> ffmpeg-devel@ffmpeg.org
> https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
>
> To unsubscribe, visit link above, or email
> ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".
>
Mark Reid Nov. 14, 2021, 5:24 a.m. UTC | #3
On Sat, Nov 13, 2021 at 7:24 PM Mark Reid <mindmark@gmail.com> wrote:

>
>
> On Sat, Nov 13, 2021 at 7:02 PM James Almer <jamrial@gmail.com> wrote:
>
>> On 11/13/2021 11:56 PM, mindmark@gmail.com wrote:
>> > From: Mark Reid <mindmark@gmail.com>
>> >
>> > if the float pixel * 65535.0f > 2147483647.0f
>> > lrintf may overfow and return negative values, depending on
>> implementation.
>> > nan and +/-inf values may also be implementation defined
>> >
>> > clamp the value first so lrintf so, always works.
>> >
>> > values <=0.0f, -inf, nan = 0.0f
>> > values >=1.0f, +inf      = 1.0f
>> >
>> > the clamping adds some performance overhead, but using a inline function
>> > seems help the compiler optimize on the compiliers I tested.
>> >
>> > old timings
>> >   213920 UNITS in planar_rgbf32le_to_uv,       1 runs,      0 skips
>> >   218830 UNITS in planar_rgbf32le_to_uv,       2 runs,      0 skips
>> >   223285 UNITS in planar_rgbf32le_to_uv,       4 runs,      0 skips
>> >   215405 UNITS in planar_rgbf32le_to_uv,       8 runs,      0 skips
>> >   208920 UNITS in planar_rgbf32le_to_uv,      16 runs,      0 skips
>> >   205115 UNITS in planar_rgbf32le_to_uv,      32 runs,      0 skips
>> >   212220 UNITS in planar_rgbf32le_to_uv,      64 runs,      0 skips
>> >
>> >   216440 UNITS in planar_rgbf32be_to_uv,       1 runs,      0 skips
>> >   222450 UNITS in planar_rgbf32be_to_uv,       2 runs,      0 skips
>> >   228780 UNITS in planar_rgbf32be_to_uv,       4 runs,      0 skips
>> >   226900 UNITS in planar_rgbf32be_to_uv,       8 runs,      0 skips
>> >   223168 UNITS in planar_rgbf32be_to_uv,      16 runs,      0 skips
>> >   249340 UNITS in planar_rgbf32be_to_uv,      32 runs,      0 skips
>> >   233746 UNITS in planar_rgbf32be_to_uv,      64 runs,      0 skips
>> >
>> >   173360 UNITS in planar_rgbf32le_to_y,       1 runs,      0 skips
>> >   179970 UNITS in planar_rgbf32le_to_y,       2 runs,      0 skips
>> >   182960 UNITS in planar_rgbf32le_to_y,       4 runs,      0 skips
>> >   177040 UNITS in planar_rgbf32le_to_y,       8 runs,      0 skips
>> >   170351 UNITS in planar_rgbf32le_to_y,      16 runs,      0 skips
>> >   167136 UNITS in planar_rgbf32le_to_y,      32 runs,      0 skips
>> >   165821 UNITS in planar_rgbf32le_to_y,      64 runs,      0 skips
>> >
>> >   181040 UNITS in planar_rgbf32be_to_y,       1 runs,      0 skips
>> >   182920 UNITS in planar_rgbf32be_to_y,       2 runs,      0 skips
>> >   180935 UNITS in planar_rgbf32be_to_y,       4 runs,      0 skips
>> >   180897 UNITS in planar_rgbf32be_to_y,       8 runs,      0 skips
>> >   179640 UNITS in planar_rgbf32be_to_y,      16 runs,      0 skips
>> >   178912 UNITS in planar_rgbf32be_to_y,      32 runs,      0 skips
>> >   177983 UNITS in planar_rgbf32be_to_y,      64 runs,      0 skips
>> >
>> > new timings
>> >   228860 UNITS in planar_rgbf32le_to_uv,       1 runs,      0 skips
>> >   232400 UNITS in planar_rgbf32le_to_uv,       2 runs,      0 skips
>> >   237270 UNITS in planar_rgbf32le_to_uv,       4 runs,      0 skips
>> >   229992 UNITS in planar_rgbf32le_to_uv,       8 runs,      0 skips
>> >   222270 UNITS in planar_rgbf32le_to_uv,      16 runs,      0 skips
>> >   218896 UNITS in planar_rgbf32le_to_uv,      32 runs,      0 skips
>> >   216938 UNITS in planar_rgbf32le_to_uv,      64 runs,      0 skips
>> >
>> >   232340 UNITS in planar_rgbf32be_to_uv,       1 runs,      0 skips
>> >   231830 UNITS in planar_rgbf32be_to_uv,       2 runs,      0 skips
>> >   242235 UNITS in planar_rgbf32be_to_uv,       4 runs,      0 skips
>> >   235210 UNITS in planar_rgbf32be_to_uv,       8 runs,      0 skips
>> >   229040 UNITS in planar_rgbf32be_to_uv,      16 runs,      0 skips
>> >   224996 UNITS in planar_rgbf32be_to_uv,      32 runs,      0 skips
>> >   223581 UNITS in planar_rgbf32be_to_uv,      64 runs,      0 skips
>> >
>> >   179220 UNITS in planar_rgbf32le_to_y,       1 runs,      0 skips
>> >   174790 UNITS in planar_rgbf32le_to_y,       2 runs,      0 skips
>> >   182630 UNITS in planar_rgbf32le_to_y,       4 runs,      0 skips
>> >   183002 UNITS in planar_rgbf32le_to_y,       8 runs,      0 skips
>> >   181005 UNITS in planar_rgbf32le_to_y,      16 runs,      0 skips
>> >   179390 UNITS in planar_rgbf32le_to_y,      32 runs,      0 skips
>> >   192476 UNITS in planar_rgbf32le_to_y,      64 runs,      0 skips
>> >
>> >   195620 UNITS in planar_rgbf32be_to_y,       1 runs,      0 skips
>> >   195860 UNITS in planar_rgbf32be_to_y,       2 runs,      0 skips
>> >   198700 UNITS in planar_rgbf32be_to_y,       4 runs,      0 skips
>> >   197252 UNITS in planar_rgbf32be_to_y,       8 runs,      0 skips
>> >   195702 UNITS in planar_rgbf32be_to_y,      16 runs,      0 skips
>> >   194853 UNITS in planar_rgbf32be_to_y,      32 runs,      0 skips
>> >   194459 UNITS in planar_rgbf32be_to_y,      64 runs,      0 skips
>> > ---
>> >   libswscale/input.c | 21 +++++++++++++--------
>> >   1 file changed, 13 insertions(+), 8 deletions(-)
>> >
>> > diff --git a/libswscale/input.c b/libswscale/input.c
>> > index 90efdd2ffc..2a13846abe 100644
>> > --- a/libswscale/input.c
>> > +++ b/libswscale/input.c
>> > @@ -966,6 +966,11 @@ static av_always_inline void
>> planar_rgb16_to_uv(uint8_t *_dstU, uint8_t *_dstV,
>> >
>> >   #define rdpx(src) (is_be ? av_int2float(AV_RB32(src)):
>> av_int2float(AV_RL32(src)))
>> >
>> > +static av_always_inline float clampf(float x, float min, float max)
>>
>> Use av_clipf() instead.
>>
>
> Thanks. I should have thought of that, I'll send a version
>
>

I don't think av_clipf works for the NAN case.
With my clamp function I'm exploiting that MAX changes the value that is
passed MIN and that NAN > 0.0f == false.
so the FFMAX(NAN, 0.0f) is 0.0f
It's a subtle thing, but it maps NAN to zero.
The x86 asm version of av_clipf will work if maxss is done before minss,
but the c version returns NAN if the value is NAN.

I'll see if anything breaks if I change the behavior of av_clipf


>
>> > +{
>> > +    return FFMIN(FFMAX(x, min), max);
>> > +}
>> > +
>> >   static av_always_inline void planar_rgbf32_to_a(uint8_t *_dst, const
>> uint8_t *_src[4], int width, int is_be, int32_t *rgb2yuv)
>> >   {
>> >       int i;
>> > @@ -973,7 +978,7 @@ static av_always_inline void
>> planar_rgbf32_to_a(uint8_t *_dst, const uint8_t *_s
>> >       uint16_t *dst        = (uint16_t *)_dst;
>> >
>> >       for (i = 0; i < width; i++) {
>> > -        dst[i] = av_clip_uint16(lrintf(65535.0f * rdpx(src[3] + i)));
>> > +        dst[i] = lrintf(clampf(65535.0f * rdpx(src[3] + i), 0.0f,
>> 65535.0f));
>> >       }
>> >   }
>> >
>> > @@ -987,9 +992,9 @@ static av_always_inline void
>> planar_rgbf32_to_uv(uint8_t *_dstU, uint8_t *_dstV,
>> >       int32_t rv = rgb2yuv[RV_IDX], gv = rgb2yuv[GV_IDX], bv =
>> rgb2yuv[BV_IDX];
>> >
>> >       for (i = 0; i < width; i++) {
>> > -        int g = av_clip_uint16(lrintf(65535.0f * rdpx(src[0] + i)));
>> > -        int b = av_clip_uint16(lrintf(65535.0f * rdpx(src[1] + i)));
>> > -        int r = av_clip_uint16(lrintf(65535.0f * rdpx(src[2] + i)));
>> > +        int g = lrintf(clampf(65535.0f * rdpx(src[0] + i), 0.0f,
>> 65535.0f));
>> > +        int b = lrintf(clampf(65535.0f * rdpx(src[1] + i), 0.0f,
>> 65535.0f));
>> > +        int r = lrintf(clampf(65535.0f * rdpx(src[2] + i), 0.0f,
>> 65535.0f));
>> >
>> >           dstU[i] = (ru*r + gu*g + bu*b + (0x10001 << (RGB2YUV_SHIFT -
>> 1))) >> RGB2YUV_SHIFT;
>> >           dstV[i] = (rv*r + gv*g + bv*b + (0x10001 << (RGB2YUV_SHIFT -
>> 1))) >> RGB2YUV_SHIFT;
>> > @@ -1005,9 +1010,9 @@ static av_always_inline void
>> planar_rgbf32_to_y(uint8_t *_dst, const uint8_t *_s
>> >       int32_t ry = rgb2yuv[RY_IDX], gy = rgb2yuv[GY_IDX], by =
>> rgb2yuv[BY_IDX];
>> >
>> >       for (i = 0; i < width; i++) {
>> > -        int g = av_clip_uint16(lrintf(65535.0f * rdpx(src[0] + i)));
>> > -        int b = av_clip_uint16(lrintf(65535.0f * rdpx(src[1] + i)));
>> > -        int r = av_clip_uint16(lrintf(65535.0f * rdpx(src[2] + i)));
>> > +        int g = lrintf(clampf(65535.0f * rdpx(src[0] + i), 0.0f,
>> 65535.0f));
>> > +        int b = lrintf(clampf(65535.0f * rdpx(src[1] + i), 0.0f,
>> 65535.0f));
>> > +        int r = lrintf(clampf(65535.0f * rdpx(src[2] + i), 0.0f,
>> 65535.0f));
>> >
>> >           dst[i] = (ry*r + gy*g + by*b + (0x2001 << (RGB2YUV_SHIFT -
>> 1))) >> RGB2YUV_SHIFT;
>> >       }
>> > @@ -1021,7 +1026,7 @@ static av_always_inline void
>> grayf32ToY16_c(uint8_t *_dst, const uint8_t *_src,
>> >       uint16_t *dst    = (uint16_t *)_dst;
>> >
>> >       for (i = 0; i < width; ++i){
>> > -        dst[i] = av_clip_uint16(lrintf(65535.0f * rdpx(src + i)));
>> > +        dst[i] = lrintf(clampf(65535.0f * rdpx(src + i), 0.0f,
>> 65535.0f));
>> >       }
>> >   }
>> >
>> > --
>> > 2.31.1.windows.1
>> >
>> > _______________________________________________
>> > ffmpeg-devel mailing list
>> > ffmpeg-devel@ffmpeg.org
>> > https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
>> >
>> > To unsubscribe, visit link above, or email
>> > ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".
>> >
>>
>> _______________________________________________
>> ffmpeg-devel mailing list
>> ffmpeg-devel@ffmpeg.org
>> https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
>>
>> To unsubscribe, visit link above, or email
>> ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".
>>
>
diff mbox series

Patch

diff --git a/libswscale/input.c b/libswscale/input.c
index 90efdd2ffc..2a13846abe 100644
--- a/libswscale/input.c
+++ b/libswscale/input.c
@@ -966,6 +966,11 @@  static av_always_inline void planar_rgb16_to_uv(uint8_t *_dstU, uint8_t *_dstV,

 #define rdpx(src) (is_be ? av_int2float(AV_RB32(src)): av_int2float(AV_RL32(src)))

+static av_always_inline float clampf(float x, float min, float max)
+{
+    return FFMIN(FFMAX(x, min), max);
+}
+
 static av_always_inline void planar_rgbf32_to_a(uint8_t *_dst, const uint8_t *_src[4], int width, int is_be, int32_t *rgb2yuv)
 {
     int i;
@@ -973,7 +978,7 @@  static av_always_inline void planar_rgbf32_to_a(uint8_t *_dst, const uint8_t *_s
     uint16_t *dst        = (uint16_t *)_dst;

     for (i = 0; i < width; i++) {
-        dst[i] = av_clip_uint16(lrintf(65535.0f * rdpx(src[3] + i)));
+        dst[i] = lrintf(clampf(65535.0f * rdpx(src[3] + i), 0.0f, 65535.0f));
     }
 }

@@ -987,9 +992,9 @@  static av_always_inline void planar_rgbf32_to_uv(uint8_t *_dstU, uint8_t *_dstV,
     int32_t rv = rgb2yuv[RV_IDX], gv = rgb2yuv[GV_IDX], bv = rgb2yuv[BV_IDX];

     for (i = 0; i < width; i++) {
-        int g = av_clip_uint16(lrintf(65535.0f * rdpx(src[0] + i)));
-        int b = av_clip_uint16(lrintf(65535.0f * rdpx(src[1] + i)));
-        int r = av_clip_uint16(lrintf(65535.0f * rdpx(src[2] + i)));
+        int g = lrintf(clampf(65535.0f * rdpx(src[0] + i), 0.0f, 65535.0f));
+        int b = lrintf(clampf(65535.0f * rdpx(src[1] + i), 0.0f, 65535.0f));
+        int r = lrintf(clampf(65535.0f * rdpx(src[2] + i), 0.0f, 65535.0f));

         dstU[i] = (ru*r + gu*g + bu*b + (0x10001 << (RGB2YUV_SHIFT - 1))) >> RGB2YUV_SHIFT;
         dstV[i] = (rv*r + gv*g + bv*b + (0x10001 << (RGB2YUV_SHIFT - 1))) >> RGB2YUV_SHIFT;
@@ -1005,9 +1010,9 @@  static av_always_inline void planar_rgbf32_to_y(uint8_t *_dst, const uint8_t *_s
     int32_t ry = rgb2yuv[RY_IDX], gy = rgb2yuv[GY_IDX], by = rgb2yuv[BY_IDX];

     for (i = 0; i < width; i++) {
-        int g = av_clip_uint16(lrintf(65535.0f * rdpx(src[0] + i)));
-        int b = av_clip_uint16(lrintf(65535.0f * rdpx(src[1] + i)));
-        int r = av_clip_uint16(lrintf(65535.0f * rdpx(src[2] + i)));
+        int g = lrintf(clampf(65535.0f * rdpx(src[0] + i), 0.0f, 65535.0f));
+        int b = lrintf(clampf(65535.0f * rdpx(src[1] + i), 0.0f, 65535.0f));
+        int r = lrintf(clampf(65535.0f * rdpx(src[2] + i), 0.0f, 65535.0f));

         dst[i] = (ry*r + gy*g + by*b + (0x2001 << (RGB2YUV_SHIFT - 1))) >> RGB2YUV_SHIFT;
     }
@@ -1021,7 +1026,7 @@  static av_always_inline void grayf32ToY16_c(uint8_t *_dst, const uint8_t *_src,
     uint16_t *dst    = (uint16_t *)_dst;

     for (i = 0; i < width; ++i){
-        dst[i] = av_clip_uint16(lrintf(65535.0f * rdpx(src + i)));
+        dst[i] = lrintf(clampf(65535.0f * rdpx(src + i), 0.0f,  65535.0f));
     }
 }