diff mbox series

[FFmpeg-devel,v3,1/4] swscale/input: add rgbaf32 input support

Message ID 20221103040010.1134-2-mindmark@gmail.com
State New
Headers show
Series swscale rgbaf32 input/output support | expand

Checks

Context Check Description
yinshiyou/make_loongarch64 success Make finished
yinshiyou/make_fate_loongarch64 success Make fate finished

Commit Message

Mark Reid Nov. 3, 2022, 4 a.m. UTC
From: Mark Reid <mindmark@gmail.com>

---
 libswscale/input.c | 172 +++++++++++++++++++++++++++++++++++++++++++++
 libswscale/utils.c |   4 ++
 2 files changed, 176 insertions(+)

Comments

Michael Niedermayer Nov. 13, 2022, 9:24 p.m. UTC | #1
On Wed, Nov 02, 2022 at 09:00:07PM -0700, mindmark@gmail.com wrote:
> From: Mark Reid <mindmark@gmail.com>
> 
> ---
>  libswscale/input.c | 172 +++++++++++++++++++++++++++++++++++++++++++++
>  libswscale/utils.c |   4 ++
>  2 files changed, 176 insertions(+)
> 
> diff --git a/libswscale/input.c b/libswscale/input.c
> index 7ff7bfaa01..4683284b0b 100644
> --- a/libswscale/input.c
> +++ b/libswscale/input.c
> @@ -1284,6 +1284,136 @@ static void rgbaf16##endian_name##ToA_c(uint8_t *_dst, const uint8_t *_src, cons
>  rgbaf16_funcs_endian(le, 0)
>  rgbaf16_funcs_endian(be, 1)
>  
> +#define rdpx(src) (is_be ? av_int2float(AV_RB32(&src)): av_int2float(AV_RL32(&src)))
> +
> +static av_always_inline void rgbaf32ToUV_half_endian(uint16_t *dstU, uint16_t *dstV, int is_be,
> +                                                     const float *src, int width,
> +                                                     int32_t *rgb2yuv, int comp)
> +{
> +    int32_t ru = rgb2yuv[RU_IDX], gu = rgb2yuv[GU_IDX], bu = rgb2yuv[BU_IDX];
> +    int32_t rv = rgb2yuv[RV_IDX], gv = rgb2yuv[GV_IDX], bv = rgb2yuv[BV_IDX];
> +    int i;
> +    for (i = 0; i < width; i++) {

> +        int r = (lrintf(av_clipf(65535.0f * rdpx(src[i*(comp*2)+0]), 0.0f, 65535.0f)) +
> +                 lrintf(av_clipf(65535.0f * rdpx(src[i*(comp*2)+4]), 0.0f, 65535.0f))) >> 1;
> +        int g = (lrintf(av_clipf(65535.0f * rdpx(src[i*(comp*2)+1]), 0.0f, 65535.0f)) +
> +                 lrintf(av_clipf(65535.0f * rdpx(src[i*(comp*2)+5]), 0.0f, 65535.0f))) >> 1;
> +        int b = (lrintf(av_clipf(65535.0f * rdpx(src[i*(comp*2)+2]), 0.0f, 65535.0f)) +
> +                 lrintf(av_clipf(65535.0f * rdpx(src[i*(comp*2)+6]), 0.0f, 65535.0f))) >> 1;
> +
> +        dstU[i] = (ru*r + gu*g + bu*b + (0x10001<<(RGB2YUV_SHIFT-1))) >> RGB2YUV_SHIFT;
> +        dstV[i] = (rv*r + gv*g + bv*b + (0x10001<<(RGB2YUV_SHIFT-1))) >> RGB2YUV_SHIFT;

I would expect this sort of code to use 2 lrintf() and 2 av_clipf() not 6



> +    }
> +}
> +
> +static av_always_inline void rgbaf32ToUV_endian(uint16_t *dstU, uint16_t *dstV, int is_be,
> +                                                const float *src, int width,
> +                                                int32_t *rgb2yuv, int comp)
> +{
> +    int32_t ru = rgb2yuv[RU_IDX], gu = rgb2yuv[GU_IDX], bu = rgb2yuv[BU_IDX];
> +    int32_t rv = rgb2yuv[RV_IDX], gv = rgb2yuv[GV_IDX], bv = rgb2yuv[BV_IDX];
> +    int i;
> +    for (i = 0; i < width; i++) {
> +        int r = lrintf(av_clipf(65535.0f * rdpx(src[i*comp+0]), 0.0f, 65535.0f));
> +        int g = lrintf(av_clipf(65535.0f * rdpx(src[i*comp+1]), 0.0f, 65535.0f));
> +        int b = lrintf(av_clipf(65535.0f * rdpx(src[i*comp+2]), 0.0f, 65535.0f));
> +
> +        dstU[i] = (ru*r + gu*g + bu*b + (0x10001<<(RGB2YUV_SHIFT-1))) >> RGB2YUV_SHIFT;
> +        dstV[i] = (rv*r + gv*g + bv*b + (0x10001<<(RGB2YUV_SHIFT-1))) >> RGB2YUV_SHIFT;
> +    }
> +}
> +

> +static av_always_inline void rgbaf32ToY_endian(uint16_t *dst, const float *src, int is_be,
> +                                               int width, int32_t *rgb2yuv, int comp)
> +{
> +    int32_t ry = rgb2yuv[RY_IDX], gy = rgb2yuv[GY_IDX], by = rgb2yuv[BY_IDX];
> +    int i;
> +    for (i = 0; i < width; i++) {
> +        int r = lrintf(av_clipf(65535.0f * rdpx(src[i*comp+0]), 0.0f, 65535.0f));
> +        int g = lrintf(av_clipf(65535.0f * rdpx(src[i*comp+1]), 0.0f, 65535.0f));
> +        int b = lrintf(av_clipf(65535.0f * rdpx(src[i*comp+2]), 0.0f, 65535.0f));
> +

> +        dst[i] = (ry*r + gy*g + by*b + (0x2001<<(RGB2YUV_SHIFT-1))) >> RGB2YUV_SHIFT;

there is one output so there should be only need for one clip and one float->int

thx

[...]
Mark Reid Nov. 14, 2022, 1:50 a.m. UTC | #2
On Sun, Nov 13, 2022 at 1:25 PM Michael Niedermayer <michael@niedermayer.cc>
wrote:

> On Wed, Nov 02, 2022 at 09:00:07PM -0700, mindmark@gmail.com wrote:
> > From: Mark Reid <mindmark@gmail.com>
> >
> > ---
> >  libswscale/input.c | 172 +++++++++++++++++++++++++++++++++++++++++++++
> >  libswscale/utils.c |   4 ++
> >  2 files changed, 176 insertions(+)
> >
> > diff --git a/libswscale/input.c b/libswscale/input.c
> > index 7ff7bfaa01..4683284b0b 100644
> > --- a/libswscale/input.c
> > +++ b/libswscale/input.c
> > @@ -1284,6 +1284,136 @@ static void rgbaf16##endian_name##ToA_c(uint8_t
> *_dst, const uint8_t *_src, cons
> >  rgbaf16_funcs_endian(le, 0)
> >  rgbaf16_funcs_endian(be, 1)
> >
> > +#define rdpx(src) (is_be ? av_int2float(AV_RB32(&src)):
> av_int2float(AV_RL32(&src)))
> > +
> > +static av_always_inline void rgbaf32ToUV_half_endian(uint16_t *dstU,
> uint16_t *dstV, int is_be,
> > +                                                     const float *src,
> int width,
> > +                                                     int32_t *rgb2yuv,
> int comp)
> > +{
> > +    int32_t ru = rgb2yuv[RU_IDX], gu = rgb2yuv[GU_IDX], bu =
> rgb2yuv[BU_IDX];
> > +    int32_t rv = rgb2yuv[RV_IDX], gv = rgb2yuv[GV_IDX], bv =
> rgb2yuv[BV_IDX];
> > +    int i;
> > +    for (i = 0; i < width; i++) {
>
> > +        int r = (lrintf(av_clipf(65535.0f * rdpx(src[i*(comp*2)+0]),
> 0.0f, 65535.0f)) +
> > +                 lrintf(av_clipf(65535.0f * rdpx(src[i*(comp*2)+4]),
> 0.0f, 65535.0f))) >> 1;
> > +        int g = (lrintf(av_clipf(65535.0f * rdpx(src[i*(comp*2)+1]),
> 0.0f, 65535.0f)) +
> > +                 lrintf(av_clipf(65535.0f * rdpx(src[i*(comp*2)+5]),
> 0.0f, 65535.0f))) >> 1;
> > +        int b = (lrintf(av_clipf(65535.0f * rdpx(src[i*(comp*2)+2]),
> 0.0f, 65535.0f)) +
> > +                 lrintf(av_clipf(65535.0f * rdpx(src[i*(comp*2)+6]),
> 0.0f, 65535.0f))) >> 1;
> > +
> > +        dstU[i] = (ru*r + gu*g + bu*b + (0x10001<<(RGB2YUV_SHIFT-1)))
> >> RGB2YUV_SHIFT;
> > +        dstV[i] = (rv*r + gv*g + bv*b + (0x10001<<(RGB2YUV_SHIFT-1)))
> >> RGB2YUV_SHIFT;
>
> I would expect this sort of code to use 2 lrintf() and 2 av_clipf() not 6
>
>
ya it is a bit excessive, I'll just remove the _half conversions for now,
they aren't strictly necessary as far as I can tell.


>
> > +    }
> > +}
> > +
> > +static av_always_inline void rgbaf32ToUV_endian(uint16_t *dstU,
> uint16_t *dstV, int is_be,
> > +                                                const float *src, int
> width,
> > +                                                int32_t *rgb2yuv, int
> comp)
> > +{
> > +    int32_t ru = rgb2yuv[RU_IDX], gu = rgb2yuv[GU_IDX], bu =
> rgb2yuv[BU_IDX];
> > +    int32_t rv = rgb2yuv[RV_IDX], gv = rgb2yuv[GV_IDX], bv =
> rgb2yuv[BV_IDX];
> > +    int i;
> > +    for (i = 0; i < width; i++) {
> > +        int r = lrintf(av_clipf(65535.0f * rdpx(src[i*comp+0]), 0.0f,
> 65535.0f));
> > +        int g = lrintf(av_clipf(65535.0f * rdpx(src[i*comp+1]), 0.0f,
> 65535.0f));
> > +        int b = lrintf(av_clipf(65535.0f * rdpx(src[i*comp+2]), 0.0f,
> 65535.0f));
> > +
> > +        dstU[i] = (ru*r + gu*g + bu*b + (0x10001<<(RGB2YUV_SHIFT-1)))
> >> RGB2YUV_SHIFT;
> > +        dstV[i] = (rv*r + gv*g + bv*b + (0x10001<<(RGB2YUV_SHIFT-1)))
> >> RGB2YUV_SHIFT;
> > +    }
> > +}
> > +
>
> > +static av_always_inline void rgbaf32ToY_endian(uint16_t *dst, const
> float *src, int is_be,
> > +                                               int width, int32_t
> *rgb2yuv, int comp)
> > +{
> > +    int32_t ry = rgb2yuv[RY_IDX], gy = rgb2yuv[GY_IDX], by =
> rgb2yuv[BY_IDX];
> > +    int i;
> > +    for (i = 0; i < width; i++) {
> > +        int r = lrintf(av_clipf(65535.0f * rdpx(src[i*comp+0]), 0.0f,
> 65535.0f));
> > +        int g = lrintf(av_clipf(65535.0f * rdpx(src[i*comp+1]), 0.0f,
> 65535.0f));
> > +        int b = lrintf(av_clipf(65535.0f * rdpx(src[i*comp+2]), 0.0f,
> 65535.0f));
> > +
>
> > +        dst[i] = (ry*r + gy*g + by*b + (0x2001<<(RGB2YUV_SHIFT-1))) >>
> RGB2YUV_SHIFT;
>
> there is one output so there should be only need for one clip and one
> float->int
>

This is matching the f32 planar version. I think I was paranoid about
things being bitexact for tests and that's why it's currently being done
this way.
I'll see what happens if I introduce more float operations, could I perhaps
do this in a later patch? some asm might have to change too.


> thx
>
> [...]
> --
> Michael     GnuPG fingerprint: 9FF2128B147EF6730BADF133611EC787040B0FAB
>
> Any man who breaks a law that conscience tells him is unjust and willingly
> accepts the penalty by staying in jail in order to arouse the conscience
> of
> the community on the injustice of the law is at that moment expressing the
> very highest respect for law. - Martin Luther King Jr
> _______________________________________________
> ffmpeg-devel mailing list
> ffmpeg-devel@ffmpeg.org
> https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
>
> To unsubscribe, visit link above, or email
> ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".
>
Michael Niedermayer Nov. 14, 2022, 9:07 p.m. UTC | #3
On Sun, Nov 13, 2022 at 05:50:37PM -0800, Mark Reid wrote:
> On Sun, Nov 13, 2022 at 1:25 PM Michael Niedermayer <michael@niedermayer.cc>
> wrote:
> 
> > On Wed, Nov 02, 2022 at 09:00:07PM -0700, mindmark@gmail.com wrote:
> > > From: Mark Reid <mindmark@gmail.com>
> > >
> > > ---
> > >  libswscale/input.c | 172 +++++++++++++++++++++++++++++++++++++++++++++
> > >  libswscale/utils.c |   4 ++
> > >  2 files changed, 176 insertions(+)
> > >
> > > diff --git a/libswscale/input.c b/libswscale/input.c
> > > index 7ff7bfaa01..4683284b0b 100644
> > > --- a/libswscale/input.c
> > > +++ b/libswscale/input.c
> > > @@ -1284,6 +1284,136 @@ static void rgbaf16##endian_name##ToA_c(uint8_t
> > *_dst, const uint8_t *_src, cons
> > >  rgbaf16_funcs_endian(le, 0)
> > >  rgbaf16_funcs_endian(be, 1)
> > >
> > > +#define rdpx(src) (is_be ? av_int2float(AV_RB32(&src)):
> > av_int2float(AV_RL32(&src)))
> > > +
> > > +static av_always_inline void rgbaf32ToUV_half_endian(uint16_t *dstU,
> > uint16_t *dstV, int is_be,
> > > +                                                     const float *src,
> > int width,
> > > +                                                     int32_t *rgb2yuv,
> > int comp)
> > > +{
> > > +    int32_t ru = rgb2yuv[RU_IDX], gu = rgb2yuv[GU_IDX], bu =
> > rgb2yuv[BU_IDX];
> > > +    int32_t rv = rgb2yuv[RV_IDX], gv = rgb2yuv[GV_IDX], bv =
> > rgb2yuv[BV_IDX];
> > > +    int i;
> > > +    for (i = 0; i < width; i++) {
> >
> > > +        int r = (lrintf(av_clipf(65535.0f * rdpx(src[i*(comp*2)+0]),
> > 0.0f, 65535.0f)) +
> > > +                 lrintf(av_clipf(65535.0f * rdpx(src[i*(comp*2)+4]),
> > 0.0f, 65535.0f))) >> 1;
> > > +        int g = (lrintf(av_clipf(65535.0f * rdpx(src[i*(comp*2)+1]),
> > 0.0f, 65535.0f)) +
> > > +                 lrintf(av_clipf(65535.0f * rdpx(src[i*(comp*2)+5]),
> > 0.0f, 65535.0f))) >> 1;
> > > +        int b = (lrintf(av_clipf(65535.0f * rdpx(src[i*(comp*2)+2]),
> > 0.0f, 65535.0f)) +
> > > +                 lrintf(av_clipf(65535.0f * rdpx(src[i*(comp*2)+6]),
> > 0.0f, 65535.0f))) >> 1;
> > > +
> > > +        dstU[i] = (ru*r + gu*g + bu*b + (0x10001<<(RGB2YUV_SHIFT-1)))
> > >> RGB2YUV_SHIFT;
> > > +        dstV[i] = (rv*r + gv*g + bv*b + (0x10001<<(RGB2YUV_SHIFT-1)))
> > >> RGB2YUV_SHIFT;
> >
> > I would expect this sort of code to use 2 lrintf() and 2 av_clipf() not 6
> >
> >
> ya it is a bit excessive, I'll just remove the _half conversions for now,
> they aren't strictly necessary as far as I can tell.

Do you see a problem with just factoring them out?
It shouldn't be hard to reorder the operations.


> 
> 
> >
> > > +    }
> > > +}
> > > +
> > > +static av_always_inline void rgbaf32ToUV_endian(uint16_t *dstU,
> > uint16_t *dstV, int is_be,
> > > +                                                const float *src, int
> > width,
> > > +                                                int32_t *rgb2yuv, int
> > comp)
> > > +{
> > > +    int32_t ru = rgb2yuv[RU_IDX], gu = rgb2yuv[GU_IDX], bu =
> > rgb2yuv[BU_IDX];
> > > +    int32_t rv = rgb2yuv[RV_IDX], gv = rgb2yuv[GV_IDX], bv =
> > rgb2yuv[BV_IDX];
> > > +    int i;
> > > +    for (i = 0; i < width; i++) {
> > > +        int r = lrintf(av_clipf(65535.0f * rdpx(src[i*comp+0]), 0.0f,
> > 65535.0f));
> > > +        int g = lrintf(av_clipf(65535.0f * rdpx(src[i*comp+1]), 0.0f,
> > 65535.0f));
> > > +        int b = lrintf(av_clipf(65535.0f * rdpx(src[i*comp+2]), 0.0f,
> > 65535.0f));
> > > +
> > > +        dstU[i] = (ru*r + gu*g + bu*b + (0x10001<<(RGB2YUV_SHIFT-1)))
> > >> RGB2YUV_SHIFT;
> > > +        dstV[i] = (rv*r + gv*g + bv*b + (0x10001<<(RGB2YUV_SHIFT-1)))
> > >> RGB2YUV_SHIFT;
> > > +    }
> > > +}
> > > +
> >
> > > +static av_always_inline void rgbaf32ToY_endian(uint16_t *dst, const
> > float *src, int is_be,
> > > +                                               int width, int32_t
> > *rgb2yuv, int comp)
> > > +{
> > > +    int32_t ry = rgb2yuv[RY_IDX], gy = rgb2yuv[GY_IDX], by =
> > rgb2yuv[BY_IDX];
> > > +    int i;
> > > +    for (i = 0; i < width; i++) {
> > > +        int r = lrintf(av_clipf(65535.0f * rdpx(src[i*comp+0]), 0.0f,
> > 65535.0f));
> > > +        int g = lrintf(av_clipf(65535.0f * rdpx(src[i*comp+1]), 0.0f,
> > 65535.0f));
> > > +        int b = lrintf(av_clipf(65535.0f * rdpx(src[i*comp+2]), 0.0f,
> > 65535.0f));
> > > +
> >
> > > +        dst[i] = (ry*r + gy*g + by*b + (0x2001<<(RGB2YUV_SHIFT-1))) >>
> > RGB2YUV_SHIFT;
> >
> > there is one output so there should be only need for one clip and one
> > float->int
> >
> 
> This is matching the f32 planar version. I think I was paranoid about
> things being bitexact for tests and that's why it's currently being done
> this way.
> I'll see what happens if I introduce more float operations, could I perhaps
> do this in a later patch? some asm might have to change too.

Of course, it can be a separate patch in the set. Maybe f32 planar can be changed
at the same time.

thx

[...]
Mark Reid Nov. 14, 2022, 10:23 p.m. UTC | #4
On Mon, Nov 14, 2022 at 1:08 PM Michael Niedermayer <michael@niedermayer.cc>
wrote:

> On Sun, Nov 13, 2022 at 05:50:37PM -0800, Mark Reid wrote:
> > On Sun, Nov 13, 2022 at 1:25 PM Michael Niedermayer <
> michael@niedermayer.cc>
> > wrote:
> >
> > > On Wed, Nov 02, 2022 at 09:00:07PM -0700, mindmark@gmail.com wrote:
> > > > From: Mark Reid <mindmark@gmail.com>
> > > >
> > > > ---
> > > >  libswscale/input.c | 172
> +++++++++++++++++++++++++++++++++++++++++++++
> > > >  libswscale/utils.c |   4 ++
> > > >  2 files changed, 176 insertions(+)
> > > >
> > > > diff --git a/libswscale/input.c b/libswscale/input.c
> > > > index 7ff7bfaa01..4683284b0b 100644
> > > > --- a/libswscale/input.c
> > > > +++ b/libswscale/input.c
> > > > @@ -1284,6 +1284,136 @@ static void
> rgbaf16##endian_name##ToA_c(uint8_t
> > > *_dst, const uint8_t *_src, cons
> > > >  rgbaf16_funcs_endian(le, 0)
> > > >  rgbaf16_funcs_endian(be, 1)
> > > >
> > > > +#define rdpx(src) (is_be ? av_int2float(AV_RB32(&src)):
> > > av_int2float(AV_RL32(&src)))
> > > > +
> > > > +static av_always_inline void rgbaf32ToUV_half_endian(uint16_t *dstU,
> > > uint16_t *dstV, int is_be,
> > > > +                                                     const float
> *src,
> > > int width,
> > > > +                                                     int32_t
> *rgb2yuv,
> > > int comp)
> > > > +{
> > > > +    int32_t ru = rgb2yuv[RU_IDX], gu = rgb2yuv[GU_IDX], bu =
> > > rgb2yuv[BU_IDX];
> > > > +    int32_t rv = rgb2yuv[RV_IDX], gv = rgb2yuv[GV_IDX], bv =
> > > rgb2yuv[BV_IDX];
> > > > +    int i;
> > > > +    for (i = 0; i < width; i++) {
> > >
> > > > +        int r = (lrintf(av_clipf(65535.0f * rdpx(src[i*(comp*2)+0]),
> > > 0.0f, 65535.0f)) +
> > > > +                 lrintf(av_clipf(65535.0f * rdpx(src[i*(comp*2)+4]),
> > > 0.0f, 65535.0f))) >> 1;
> > > > +        int g = (lrintf(av_clipf(65535.0f * rdpx(src[i*(comp*2)+1]),
> > > 0.0f, 65535.0f)) +
> > > > +                 lrintf(av_clipf(65535.0f * rdpx(src[i*(comp*2)+5]),
> > > 0.0f, 65535.0f))) >> 1;
> > > > +        int b = (lrintf(av_clipf(65535.0f * rdpx(src[i*(comp*2)+2]),
> > > 0.0f, 65535.0f)) +
> > > > +                 lrintf(av_clipf(65535.0f * rdpx(src[i*(comp*2)+6]),
> > > 0.0f, 65535.0f))) >> 1;
> > > > +
> > > > +        dstU[i] = (ru*r + gu*g + bu*b +
> (0x10001<<(RGB2YUV_SHIFT-1)))
> > > >> RGB2YUV_SHIFT;
> > > > +        dstV[i] = (rv*r + gv*g + bv*b +
> (0x10001<<(RGB2YUV_SHIFT-1)))
> > > >> RGB2YUV_SHIFT;
> > >
> > > I would expect this sort of code to use 2 lrintf() and 2 av_clipf()
> not 6
> > >
> > >
> > ya it is a bit excessive, I'll just remove the _half conversions for now,
> > they aren't strictly necessary as far as I can tell.
>
> Do you see a problem with just factoring them out?
> It shouldn't be hard to reorder the operations.
>

It's just fate checksums and float math that make me apprehensive :p.
hmm this code path doesn't actually seem to get tested by fate.
Now that I relook at it, the indexing looks wrong for the 3 channel formats
too.


>
> >
> >
> > >
> > > > +    }
> > > > +}
> > > > +
> > > > +static av_always_inline void rgbaf32ToUV_endian(uint16_t *dstU,
> > > uint16_t *dstV, int is_be,
> > > > +                                                const float *src,
> int
> > > width,
> > > > +                                                int32_t *rgb2yuv,
> int
> > > comp)
> > > > +{
> > > > +    int32_t ru = rgb2yuv[RU_IDX], gu = rgb2yuv[GU_IDX], bu =
> > > rgb2yuv[BU_IDX];
> > > > +    int32_t rv = rgb2yuv[RV_IDX], gv = rgb2yuv[GV_IDX], bv =
> > > rgb2yuv[BV_IDX];
> > > > +    int i;
> > > > +    for (i = 0; i < width; i++) {
> > > > +        int r = lrintf(av_clipf(65535.0f * rdpx(src[i*comp+0]),
> 0.0f,
> > > 65535.0f));
> > > > +        int g = lrintf(av_clipf(65535.0f * rdpx(src[i*comp+1]),
> 0.0f,
> > > 65535.0f));
> > > > +        int b = lrintf(av_clipf(65535.0f * rdpx(src[i*comp+2]),
> 0.0f,
> > > 65535.0f));
> > > > +
> > > > +        dstU[i] = (ru*r + gu*g + bu*b +
> (0x10001<<(RGB2YUV_SHIFT-1)))
> > > >> RGB2YUV_SHIFT;
> > > > +        dstV[i] = (rv*r + gv*g + bv*b +
> (0x10001<<(RGB2YUV_SHIFT-1)))
> > > >> RGB2YUV_SHIFT;
> > > > +    }
> > > > +}
> > > > +
> > >
> > > > +static av_always_inline void rgbaf32ToY_endian(uint16_t *dst, const
> > > float *src, int is_be,
> > > > +                                               int width, int32_t
> > > *rgb2yuv, int comp)
> > > > +{
> > > > +    int32_t ry = rgb2yuv[RY_IDX], gy = rgb2yuv[GY_IDX], by =
> > > rgb2yuv[BY_IDX];
> > > > +    int i;
> > > > +    for (i = 0; i < width; i++) {
> > > > +        int r = lrintf(av_clipf(65535.0f * rdpx(src[i*comp+0]),
> 0.0f,
> > > 65535.0f));
> > > > +        int g = lrintf(av_clipf(65535.0f * rdpx(src[i*comp+1]),
> 0.0f,
> > > 65535.0f));
> > > > +        int b = lrintf(av_clipf(65535.0f * rdpx(src[i*comp+2]),
> 0.0f,
> > > 65535.0f));
> > > > +
> > >
> > > > +        dst[i] = (ry*r + gy*g + by*b + (0x2001<<(RGB2YUV_SHIFT-1)))
> >>
> > > RGB2YUV_SHIFT;
> > >
> > > there is one output so there should be only need for one clip and one
> > > float->int
> > >
> >
> > This is matching the f32 planar version. I think I was paranoid about
> > things being bitexact for tests and that's why it's currently being done
> > this way.
> > I'll see what happens if I introduce more float operations, could I
> perhaps
> > do this in a later patch? some asm might have to change too.
>
> Of course, it can be a separate patch in the set. Maybe f32 planar can be changed
> at the same time.
>

great, I'll do that change together in a later patch.


>
> thx
>
> [...]
> --
> Michael     GnuPG fingerprint: 9FF2128B147EF6730BADF133611EC787040B0FAB
>
> In a rich man's house there is no place to spit but his face.
> -- Diogenes of Sinope
> _______________________________________________
> ffmpeg-devel mailing list
> ffmpeg-devel@ffmpeg.org
> https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
>
> To unsubscribe, visit link above, or email
> ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".
>
diff mbox series

Patch

diff --git a/libswscale/input.c b/libswscale/input.c
index 7ff7bfaa01..4683284b0b 100644
--- a/libswscale/input.c
+++ b/libswscale/input.c
@@ -1284,6 +1284,136 @@  static void rgbaf16##endian_name##ToA_c(uint8_t *_dst, const uint8_t *_src, cons
 rgbaf16_funcs_endian(le, 0)
 rgbaf16_funcs_endian(be, 1)
 
+#define rdpx(src) (is_be ? av_int2float(AV_RB32(&src)): av_int2float(AV_RL32(&src)))
+/* Average each horizontal pair of packed float RGB(A) pixels and convert to 16-bit U/V; comp = floats per pixel. */
+static av_always_inline void rgbaf32ToUV_half_endian(uint16_t *dstU, uint16_t *dstV, int is_be,
+                                                     const float *src, int width,
+                                                     int32_t *rgb2yuv, int comp)
+{
+    int32_t ru = rgb2yuv[RU_IDX], gu = rgb2yuv[GU_IDX], bu = rgb2yuv[BU_IDX];
+    int32_t rv = rgb2yuv[RV_IDX], gv = rgb2yuv[GV_IDX], bv = rgb2yuv[BV_IDX];
+    int i;
+    for (i = 0; i < width; i++) { /* pair i starts at i*comp*2; its second pixel is comp floats later (not always 4) */
+        int r = (lrintf(av_clipf(65535.0f * rdpx(src[i*(comp*2)+0]), 0.0f, 65535.0f)) +
+                 lrintf(av_clipf(65535.0f * rdpx(src[i*(comp*2)+comp+0]), 0.0f, 65535.0f))) >> 1;
+        int g = (lrintf(av_clipf(65535.0f * rdpx(src[i*(comp*2)+1]), 0.0f, 65535.0f)) +
+                 lrintf(av_clipf(65535.0f * rdpx(src[i*(comp*2)+comp+1]), 0.0f, 65535.0f))) >> 1;
+        int b = (lrintf(av_clipf(65535.0f * rdpx(src[i*(comp*2)+2]), 0.0f, 65535.0f)) +
+                 lrintf(av_clipf(65535.0f * rdpx(src[i*(comp*2)+comp+2]), 0.0f, 65535.0f))) >> 1;
+        /* 0x10001 << (RGB2YUV_SHIFT-1) adds 0x8000 (after the final shift) plus half-unit rounding. */
+        dstU[i] = (ru*r + gu*g + bu*b + (0x10001<<(RGB2YUV_SHIFT-1))) >> RGB2YUV_SHIFT;
+        dstV[i] = (rv*r + gv*g + bv*b + (0x10001<<(RGB2YUV_SHIFT-1))) >> RGB2YUV_SHIFT;
+    }
+}
+/* Convert packed float RGB(A) pixels to 16-bit U/V, one output sample per input pixel; comp = floats per pixel. */
+static av_always_inline void rgbaf32ToUV_endian(uint16_t *dstU, uint16_t *dstV, int is_be,
+                                                const float *src, int width,
+                                                int32_t *rgb2yuv, int comp)
+{
+    int32_t ru = rgb2yuv[RU_IDX], gu = rgb2yuv[GU_IDX], bu = rgb2yuv[BU_IDX];
+    int32_t rv = rgb2yuv[RV_IDX], gv = rgb2yuv[GV_IDX], bv = rgb2yuv[BV_IDX];
+    int i;
+    for (i = 0; i < width; i++) { /* each component: read in is_be byte order, scale by 65535, clip, round */
+        int r = lrintf(av_clipf(65535.0f * rdpx(src[i*comp+0]), 0.0f, 65535.0f));
+        int g = lrintf(av_clipf(65535.0f * rdpx(src[i*comp+1]), 0.0f, 65535.0f));
+        int b = lrintf(av_clipf(65535.0f * rdpx(src[i*comp+2]), 0.0f, 65535.0f));
+        /* 0x10001 << (RGB2YUV_SHIFT-1) adds 0x8000 (after the final shift) plus half-unit rounding. */
+        dstU[i] = (ru*r + gu*g + bu*b + (0x10001<<(RGB2YUV_SHIFT-1))) >> RGB2YUV_SHIFT;
+        dstV[i] = (rv*r + gv*g + bv*b + (0x10001<<(RGB2YUV_SHIFT-1))) >> RGB2YUV_SHIFT;
+    }
+}
+/* Convert packed float RGB(A) pixels to 16-bit luma, one output sample per input pixel; comp = floats per pixel. */
+static av_always_inline void rgbaf32ToY_endian(uint16_t *dst, const float *src, int is_be,
+                                               int width, int32_t *rgb2yuv, int comp)
+{
+    int32_t ry = rgb2yuv[RY_IDX], gy = rgb2yuv[GY_IDX], by = rgb2yuv[BY_IDX];
+    int i;
+    for (i = 0; i < width; i++) { /* each component: read in is_be byte order, scale by 65535, clip, round */
+        int r = lrintf(av_clipf(65535.0f * rdpx(src[i*comp+0]), 0.0f, 65535.0f));
+        int g = lrintf(av_clipf(65535.0f * rdpx(src[i*comp+1]), 0.0f, 65535.0f));
+        int b = lrintf(av_clipf(65535.0f * rdpx(src[i*comp+2]), 0.0f, 65535.0f));
+        /* 0x2001 << (RGB2YUV_SHIFT-1) adds 0x1000 (after the final shift) plus half-unit rounding. */
+        dst[i] = (ry*r + gy*g + by*b + (0x2001<<(RGB2YUV_SHIFT-1))) >> RGB2YUV_SHIFT;
+    }
+}
+/* Extract the 4th float of every 4-float pixel, scale by 65535, clip to [0, 65535] and round to 16 bits. */
+static av_always_inline void rgbaf32ToA_endian(uint16_t *dst, const float *src, int is_be,
+                                               int width, void *opq) /* opq is not used in this body */
+{
+    int i;
+    for (i=0; i<width; i++) {
+        dst[i] = lrintf(av_clipf(65535.0f * rdpx(src[i*4+3]), 0.0f, 65535.0f));
+    }
+}
+
+#undef rdpx
+/* Generate the rgbf32 (comp = 3) and rgbaf32 (comp = 4) *_c wrappers for one byte order; endian is passed on as is_be. */
+#define rgbaf32_funcs_endian(endian_name, endian)                                                         \
+static void rgbf32##endian_name##ToUV_half_c(uint8_t *_dstU, uint8_t *_dstV, const uint8_t *unused,       \
+                                              const uint8_t *src1, const uint8_t *src2,                   \
+                                              int width, uint32_t *rgb2yuv, void *opq)                    \
+{                                                                                                         \
+    const float *src = (const float*)src1;                                                                \
+    uint16_t *dstU = (uint16_t*)_dstU;                                                                    \
+    uint16_t *dstV = (uint16_t*)_dstV;                                                                    \
+    av_assert1(src1==src2);                                                                               \
+    rgbaf32ToUV_half_endian(dstU, dstV, endian, src, width, rgb2yuv, 3);                                  \
+}                                                                                                         \
+static void rgbf32##endian_name##ToUV_c(uint8_t *_dstU, uint8_t *_dstV, const uint8_t *unused,            \
+                                         const uint8_t *src1, const uint8_t *src2,                        \
+                                         int width, uint32_t *rgb2yuv, void *opq)                         \
+{                                                                                                         \
+    const float *src = (const float*)src1;                                                                \
+    uint16_t *dstU = (uint16_t*)_dstU;                                                                    \
+    uint16_t *dstV = (uint16_t*)_dstV;                                                                    \
+    av_assert1(src1==src2);                                                                               \
+    rgbaf32ToUV_endian(dstU, dstV, endian, src, width, rgb2yuv, 3);                                       \
+}                                                                                                         \
+static void rgbf32##endian_name##ToY_c(uint8_t *_dst, const uint8_t *_src, const uint8_t *unused0,        \
+                                        const uint8_t *unused1, int width, uint32_t *rgb2yuv, void *opq)  \
+{                                                                                                         \
+    const float *src = (const float*)_src;                                                                \
+    uint16_t *dst = (uint16_t*)_dst;                                                                      \
+    rgbaf32ToY_endian(dst, src, endian, width, rgb2yuv, 3);                                               \
+}                                                                                                         \
+static void rgbaf32##endian_name##ToUV_half_c(uint8_t *_dstU, uint8_t *_dstV, const uint8_t *unused,      \
+                                              const uint8_t *src1, const uint8_t *src2,                   \
+                                              int width, uint32_t *rgb2yuv, void *opq)                    \
+{                                                                                                         \
+    const float *src = (const float*)src1;                                                                \
+    uint16_t *dstU = (uint16_t*)_dstU;                                                                    \
+    uint16_t *dstV = (uint16_t*)_dstV;                                                                    \
+    av_assert1(src1==src2);                                                                               \
+    rgbaf32ToUV_half_endian(dstU, dstV, endian, src, width, rgb2yuv, 4);                                  \
+}                                                                                                         \
+static void rgbaf32##endian_name##ToUV_c(uint8_t *_dstU, uint8_t *_dstV, const uint8_t *unused,           \
+                                         const uint8_t *src1, const uint8_t *src2,                        \
+                                         int width, uint32_t *rgb2yuv, void *opq)                         \
+{                                                                                                         \
+    const float *src = (const float*)src1;                                                                \
+    uint16_t *dstU = (uint16_t*)_dstU;                                                                    \
+    uint16_t *dstV = (uint16_t*)_dstV;                                                                    \
+    av_assert1(src1==src2);                                                                               \
+    rgbaf32ToUV_endian(dstU, dstV, endian, src, width, rgb2yuv, 4);                                       \
+}                                                                                                         \
+static void rgbaf32##endian_name##ToY_c(uint8_t *_dst, const uint8_t *_src, const uint8_t *unused0,       \
+                                        const uint8_t *unused1, int width, uint32_t *rgb2yuv, void *opq)  \
+{                                                                                                         \
+    const float *src = (const float*)_src;                                                                \
+    uint16_t *dst = (uint16_t*)_dst;                                                                      \
+    rgbaf32ToY_endian(dst, src, endian, width, rgb2yuv, 4);                                               \
+}                                                                                                         \
+static void rgbaf32##endian_name##ToA_c(uint8_t *_dst, const uint8_t *_src, const uint8_t *unused0,       \
+                                        const uint8_t *unused1, int width, uint32_t *unused2, void *opq)  \
+{                                                                                                         \
+    const float *src = (const float*)_src;                                                                \
+    uint16_t *dst = (uint16_t*)_dst;                                                                      \
+    rgbaf32ToA_endian(dst, src, endian, width, opq);                                                      \
+}
+/* NOTE(review): the wrappers pass uint32_t *rgb2yuv to helpers declared with int32_t * -- confirm whether a cast is wanted. */
+rgbaf32_funcs_endian(le, 0) /* little-endian variants: is_be = 0 */
+rgbaf32_funcs_endian(be, 1) /* big-endian variants: is_be = 1 */
+
 av_cold void ff_sws_init_input_funcs(SwsContext *c)
 {
     enum AVPixelFormat srcFormat = c->srcFormat;
@@ -1570,6 +1700,18 @@  av_cold void ff_sws_init_input_funcs(SwsContext *c)
         case AV_PIX_FMT_RGBAF16LE:
             c->chrToYV12 = rgbaf16leToUV_half_c;
             break;
+        case AV_PIX_FMT_RGBF32BE:
+            c->chrToYV12 = rgbf32beToUV_half_c;
+            break;
+        case AV_PIX_FMT_RGBAF32BE:
+            c->chrToYV12 = rgbaf32beToUV_half_c;
+            break;
+        case AV_PIX_FMT_RGBF32LE:
+            c->chrToYV12 = rgbf32leToUV_half_c;
+            break;
+        case AV_PIX_FMT_RGBAF32LE:
+            c->chrToYV12 = rgbaf32leToUV_half_c;
+            break;
         }
     } else {
         switch (srcFormat) {
@@ -1663,6 +1805,18 @@  av_cold void ff_sws_init_input_funcs(SwsContext *c)
         case AV_PIX_FMT_RGBAF16LE:
             c->chrToYV12 = rgbaf16leToUV_c;
             break;
+        case AV_PIX_FMT_RGBF32BE:
+            c->chrToYV12 = rgbf32beToUV_c;
+            break;
+        case AV_PIX_FMT_RGBAF32BE:
+            c->chrToYV12 = rgbaf32beToUV_c;
+            break;
+        case AV_PIX_FMT_RGBF32LE:
+            c->chrToYV12 = rgbf32leToUV_c;
+            break;
+        case AV_PIX_FMT_RGBAF32LE:
+            c->chrToYV12 = rgbaf32leToUV_c;
+            break;
         }
     }
 
@@ -1973,6 +2127,18 @@  av_cold void ff_sws_init_input_funcs(SwsContext *c)
     case AV_PIX_FMT_RGBAF16LE:
         c->lumToYV12 = rgbaf16leToY_c;
         break;
+    case AV_PIX_FMT_RGBF32BE:
+        c->lumToYV12 = rgbf32beToY_c;
+        break;
+    case AV_PIX_FMT_RGBAF32BE:
+        c->lumToYV12 = rgbaf32beToY_c;
+        break;
+    case AV_PIX_FMT_RGBF32LE:
+        c->lumToYV12 = rgbf32leToY_c;
+        break;
+    case AV_PIX_FMT_RGBAF32LE:
+        c->lumToYV12 = rgbaf32leToY_c;
+        break;
     }
     if (c->needAlpha) {
         if (is16BPS(srcFormat) || isNBPS(srcFormat)) {
@@ -1998,6 +2164,12 @@  av_cold void ff_sws_init_input_funcs(SwsContext *c)
         case AV_PIX_FMT_RGBAF16LE:
             c->alpToYV12 = rgbaf16leToA_c;
             break;
+        case AV_PIX_FMT_RGBAF32BE:
+            c->alpToYV12 = rgbaf32beToA_c;
+            break;
+        case AV_PIX_FMT_RGBAF32LE:
+            c->alpToYV12 = rgbaf32leToA_c;
+            break;
         case AV_PIX_FMT_YA8:
             c->alpToYV12 = uyvyToY_c;
             break;
diff --git a/libswscale/utils.c b/libswscale/utils.c
index 45baa22b23..6da1f21e25 100644
--- a/libswscale/utils.c
+++ b/libswscale/utils.c
@@ -266,6 +266,10 @@  static const FormatEntry format_entries[] = {
     [AV_PIX_FMT_VUYX]        = { 1, 1 },
     [AV_PIX_FMT_RGBAF16BE]   = { 1, 0 },
     [AV_PIX_FMT_RGBAF16LE]   = { 1, 0 },
+    [AV_PIX_FMT_RGBF32BE]    = { 1, 0 },
+    [AV_PIX_FMT_RGBF32LE]    = { 1, 0 },
+    [AV_PIX_FMT_RGBAF32BE]   = { 1, 0 },
+    [AV_PIX_FMT_RGBAF32LE]   = { 1, 0 },
     [AV_PIX_FMT_XV30LE]      = { 1, 1 },
     [AV_PIX_FMT_XV36LE]      = { 1, 1 },
 };