diff mbox series

[FFmpeg-devel,3/3] swscale/output: Don't call av_pix_fmt_desc_get() in a loop

Message ID GV1P250MB073751CB4FFB76280DD1A6708F409@GV1P250MB0737.EURP250.PROD.OUTLOOK.COM
State Superseded
Headers show
Series [FFmpeg-devel,1/2] swscale/input: Remove spec-incompliant '; ' | expand

Checks

Context Check Description
andriy/make_x86 success Make finished
andriy/make_fate_x86 success Make fate finished

Commit Message

Andreas Rheinhardt Sept. 8, 2022, 3:31 a.m. UTC
Up until now, libswscale/output.c used a macro to write
an output pixel which involved a call to av_pix_fmt_desc_get()
to find out whether the output pixel format is BE or LE
despite this being known at compile-time (there are templates
per pixfmt). Even worse, these calls are made in a loop,
so that e.g. there are eight calls to av_pix_fmt_desc_get()
for every pixel processed in yuv2rgba64_X_c_template()
for 64bit RGB formats.

This commit modifies these macros to ensure that isBE()
is evaluated at compile-time. This saved 41184B of .text
for me (GCC 11.2, -O3).

Signed-off-by: Andreas Rheinhardt <andreas.rheinhardt@outlook.com>
---
This must be the lowest-hanging fruit in the whole codebase.
Two other questions: Why do all these functions in swscale_internal.h
take an enum AVPixelFormat instead of accepting an AVPixFmtDescriptor?
And would making av_pix_fmt_desc_get() av_const be beneficial?

 libswscale/output.c | 101 +++++++++++++++++++++++++-------------------
 1 file changed, 58 insertions(+), 43 deletions(-)

Comments

Paul B Mahol Sept. 16, 2022, 8:39 a.m. UTC | #1
On 9/8/22, Andreas Rheinhardt <andreas.rheinhardt@outlook.com> wrote:
> Up until now, libswscale/output.c used a macro to write
> an output pixel which involved a call to av_pix_fmt_desc_get()
> to find out whether the input pixel format is BE or LE
> despite this being known at compile-time (there are templates
> per pixfmt). Even worse, these calls are made in a loop,
> so that e.g. there are eight calls to av_pix_fmt_desc_get()
> for every pixel processed in yuv2rgba64_X_c_template()
> for 64bit RGB formats.
>

LGTM for whole set.

Got a nice speed boost for the non-SIMD-optimized conversions in swscale.

> This commit modifies these macros to ensure that isBE()
> is evaluated at compile-time. This saved 41184B of .text
> for me (GCC 11.2, -O3).
>
> Signed-off-by: Andreas Rheinhardt <andreas.rheinhardt@outlook.com>
> ---
> This must be the lowest-hanging fruit in the whole codebase.
> Two other question: Why do all these functions in swscale_internal.h
> take an enum AVPixelFormat instead of accepting an AVPixFmtDescriptor?
> And would making av_pix_fmt_desc_get() av_const be beneficial?
>
>  libswscale/output.c | 101 +++++++++++++++++++++++++-------------------
>  1 file changed, 58 insertions(+), 43 deletions(-)
>
> diff --git a/libswscale/output.c b/libswscale/output.c
> index 40a4476c6d..590334eb57 100644
> --- a/libswscale/output.c
> +++ b/libswscale/output.c
> @@ -919,7 +919,7 @@ YUV2PACKEDWRAPPER(yuv2, 422, uyvy422,
> AV_PIX_FMT_UYVY422)
>  #define R_B ((target == AV_PIX_FMT_RGB48LE || target == AV_PIX_FMT_RGB48BE
> || target == AV_PIX_FMT_RGBA64LE || target == AV_PIX_FMT_RGBA64BE) ? R : B)
>  #define B_R ((target == AV_PIX_FMT_RGB48LE || target == AV_PIX_FMT_RGB48BE
> || target == AV_PIX_FMT_RGBA64LE || target == AV_PIX_FMT_RGBA64BE) ? B : R)
>  #define output_pixel(pos, val) \
> -    if (isBE(target)) { \
> +    if (is_be) { \
>          AV_WB16(pos, val); \
>      } else { \
>          AV_WL16(pos, val); \
> @@ -931,7 +931,8 @@ yuv2ya16_X_c_template(SwsContext *c, const int16_t
> *lumFilter,
>                          const int16_t *chrFilter, const int32_t
> **unused_chrUSrc,
>                          const int32_t **unused_chrVSrc, int
> unused_chrFilterSize,
>                          const int32_t **alpSrc, uint16_t *dest, int dstW,
> -                        int y, enum AVPixelFormat target, int
> unused_hasAlpha, int unused_eightbytes)
> +                        int y, enum AVPixelFormat target,
> +                        int unused_hasAlpha, int unused_eightbytes, int
> is_be)
>  {
>      int hasAlpha = !!alpSrc;
>      int i;
> @@ -968,7 +969,8 @@ yuv2ya16_2_c_template(SwsContext *c, const int32_t
> *buf[2],
>                          const int32_t *unused_ubuf[2], const int32_t
> *unused_vbuf[2],
>                          const int32_t *abuf[2], uint16_t *dest, int dstW,
>                          int yalpha, int unused_uvalpha, int y,
> -                        enum AVPixelFormat target, int unused_hasAlpha, int
> unused_eightbytes)
> +                        enum AVPixelFormat target, int unused_hasAlpha,
> +                        int unused_eightbytes, int is_be)
>  {
>      int hasAlpha = abuf && abuf[0] && abuf[1];
>      const int32_t *buf0  = buf[0],  *buf1  = buf[1],
> @@ -999,7 +1001,8 @@ static av_always_inline void
>  yuv2ya16_1_c_template(SwsContext *c, const int32_t *buf0,
>                          const int32_t *unused_ubuf[2], const int32_t
> *unused_vbuf[2],
>                          const int32_t *abuf0, uint16_t *dest, int dstW,
> -                        int unused_uvalpha, int y, enum AVPixelFormat
> target, int unused_hasAlpha, int unused_eightbytes)
> +                        int unused_uvalpha, int y, enum AVPixelFormat
> target,
> +                        int unused_hasAlpha, int unused_eightbytes, int
> is_be)
>  {
>      int hasAlpha = !!abuf0;
>      int i;
> @@ -1027,7 +1030,8 @@ yuv2rgba64_X_c_template(SwsContext *c, const int16_t
> *lumFilter,
>                         const int16_t *chrFilter, const int32_t **chrUSrc,
>                         const int32_t **chrVSrc, int chrFilterSize,
>                         const int32_t **alpSrc, uint16_t *dest, int dstW,
> -                       int y, enum AVPixelFormat target, int hasAlpha, int
> eightbytes)
> +                       int y, enum AVPixelFormat target, int hasAlpha, int
> eightbytes,
> +                       int is_be)
>  {
>      int i;
>      int A1 = 0xffff<<14, A2 = 0xffff<<14;
> @@ -1108,7 +1112,8 @@ yuv2rgba64_2_c_template(SwsContext *c, const int32_t
> *buf[2],
>                         const int32_t *ubuf[2], const int32_t *vbuf[2],
>                         const int32_t *abuf[2], uint16_t *dest, int dstW,
>                         int yalpha, int uvalpha, int y,
> -                       enum AVPixelFormat target, int hasAlpha, int
> eightbytes)
> +                       enum AVPixelFormat target, int hasAlpha, int
> eightbytes,
> +                       int is_be)
>  {
>      const int32_t *buf0  = buf[0],  *buf1  = buf[1],
>                    *ubuf0 = ubuf[0], *ubuf1 = ubuf[1],
> @@ -1172,7 +1177,8 @@ static av_always_inline void
>  yuv2rgba64_1_c_template(SwsContext *c, const int32_t *buf0,
>                         const int32_t *ubuf[2], const int32_t *vbuf[2],
>                         const int32_t *abuf0, uint16_t *dest, int dstW,
> -                       int uvalpha, int y, enum AVPixelFormat target, int
> hasAlpha, int eightbytes)
> +                       int uvalpha, int y, enum AVPixelFormat target,
> +                       int hasAlpha, int eightbytes, int is_be)
>  {
>      const int32_t *ubuf0 = ubuf[0], *vbuf0 = vbuf[0];
>      int i;
> @@ -1277,7 +1283,8 @@ yuv2rgba64_full_X_c_template(SwsContext *c, const
> int16_t *lumFilter,
>                         const int16_t *chrFilter, const int32_t **chrUSrc,
>                         const int32_t **chrVSrc, int chrFilterSize,
>                         const int32_t **alpSrc, uint16_t *dest, int dstW,
> -                       int y, enum AVPixelFormat target, int hasAlpha, int
> eightbytes)
> +                       int y, enum AVPixelFormat target, int hasAlpha,
> +                       int eightbytes, int is_be)
>  {
>      int i;
>      int A = 0xffff<<14;
> @@ -1340,7 +1347,8 @@ yuv2rgba64_full_2_c_template(SwsContext *c, const
> int32_t *buf[2],
>                         const int32_t *ubuf[2], const int32_t *vbuf[2],
>                         const int32_t *abuf[2], uint16_t *dest, int dstW,
>                         int yalpha, int uvalpha, int y,
> -                       enum AVPixelFormat target, int hasAlpha, int
> eightbytes)
> +                       enum AVPixelFormat target, int hasAlpha, int
> eightbytes,
> +                       int is_be)
>  {
>      const int32_t *buf0  = buf[0],  *buf1  = buf[1],
>                    *ubuf0 = ubuf[0], *ubuf1 = ubuf[1],
> @@ -1391,7 +1399,8 @@ static av_always_inline void
>  yuv2rgba64_full_1_c_template(SwsContext *c, const int32_t *buf0,
>                         const int32_t *ubuf[2], const int32_t *vbuf[2],
>                         const int32_t *abuf0, uint16_t *dest, int dstW,
> -                       int uvalpha, int y, enum AVPixelFormat target, int
> hasAlpha, int eightbytes)
> +                       int uvalpha, int y, enum AVPixelFormat target,
> +                       int hasAlpha, int eightbytes, int is_be)
>  {
>      const int32_t *ubuf0 = ubuf[0], *vbuf0 = vbuf[0];
>      int i;
> @@ -1468,7 +1477,11 @@ yuv2rgba64_full_1_c_template(SwsContext *c, const
> int32_t *buf0,
>  #undef r_b
>  #undef b_r
>
> -#define YUV2PACKED16WRAPPER(name, base, ext, fmt, hasAlpha, eightbytes) \
> +#define IS_BE_LE 0
> +#define IS_BE_BE 1
> +#define IS_BE(BE_LE) IS_BE_ ## BE_LE
> +
> +#define YUV2PACKED16WRAPPER_0(name, base, ext, fmt, is_be, hasAlpha,
> eightbytes) \
>  static void name ## ext ## _X_c(SwsContext *c, const int16_t *lumFilter, \
>                          const int16_t **_lumSrc, int lumFilterSize, \
>                          const int16_t *chrFilter, const int16_t **_chrUSrc,
> \
> @@ -1483,7 +1496,7 @@ static void name ## ext ## _X_c(SwsContext *c, const
> int16_t *lumFilter, \
>      uint16_t *dest = (uint16_t *) _dest; \
>      name ## base ## _X_c_template(c, lumFilter, lumSrc, lumFilterSize, \
>                            chrFilter, chrUSrc, chrVSrc, chrFilterSize, \
> -                          alpSrc, dest, dstW, y, fmt, hasAlpha,
> eightbytes); \
> +                          alpSrc, dest, dstW, y, fmt, hasAlpha, eightbytes,
> is_be); \
>  } \
>   \
>  static void name ## ext ## _2_c(SwsContext *c, const int16_t *_buf[2], \
> @@ -1497,7 +1510,7 @@ static void name ## ext ## _2_c(SwsContext *c, const
> int16_t *_buf[2], \
>                    **abuf = (const int32_t **) _abuf; \
>      uint16_t *dest = (uint16_t *) _dest; \
>      name ## base ## _2_c_template(c, buf, ubuf, vbuf, abuf, \
> -                          dest, dstW, yalpha, uvalpha, y, fmt, hasAlpha,
> eightbytes); \
> +                          dest, dstW, yalpha, uvalpha, y, fmt, hasAlpha,
> eightbytes, is_be); \
>  } \
>   \
>  static void name ## ext ## _1_c(SwsContext *c, const int16_t *_buf0, \
> @@ -1511,36 +1524,38 @@ static void name ## ext ## _1_c(SwsContext *c, const
> int16_t *_buf0, \
>                    *abuf0 = (const int32_t *)  _abuf0; \
>      uint16_t *dest = (uint16_t *) _dest; \
>      name ## base ## _1_c_template(c, buf0, ubuf, vbuf, abuf0, dest, \
> -                                  dstW, uvalpha, y, fmt, hasAlpha,
> eightbytes); \
> -}
> -
> -YUV2PACKED16WRAPPER(yuv2, rgba64, rgb48be, AV_PIX_FMT_RGB48BE, 0, 0)
> -YUV2PACKED16WRAPPER(yuv2, rgba64, rgb48le, AV_PIX_FMT_RGB48LE, 0, 0)
> -YUV2PACKED16WRAPPER(yuv2, rgba64, bgr48be, AV_PIX_FMT_BGR48BE, 0, 0)
> -YUV2PACKED16WRAPPER(yuv2, rgba64, bgr48le, AV_PIX_FMT_BGR48LE, 0, 0)
> -YUV2PACKED16WRAPPER(yuv2, rgba64, rgba64be, AV_PIX_FMT_RGBA64BE, 1, 1)
> -YUV2PACKED16WRAPPER(yuv2, rgba64, rgba64le, AV_PIX_FMT_RGBA64LE, 1, 1)
> -YUV2PACKED16WRAPPER(yuv2, rgba64, rgbx64be, AV_PIX_FMT_RGBA64BE, 0, 1)
> -YUV2PACKED16WRAPPER(yuv2, rgba64, rgbx64le, AV_PIX_FMT_RGBA64LE, 0, 1)
> -YUV2PACKED16WRAPPER(yuv2, rgba64, bgra64be, AV_PIX_FMT_BGRA64BE, 1, 1)
> -YUV2PACKED16WRAPPER(yuv2, rgba64, bgra64le, AV_PIX_FMT_BGRA64LE, 1, 1)
> -YUV2PACKED16WRAPPER(yuv2, rgba64, bgrx64be, AV_PIX_FMT_BGRA64BE, 0, 1)
> -YUV2PACKED16WRAPPER(yuv2, rgba64, bgrx64le, AV_PIX_FMT_BGRA64LE, 0, 1)
> -YUV2PACKED16WRAPPER(yuv2, ya16, ya16be, AV_PIX_FMT_YA16BE, 1, 0)
> -YUV2PACKED16WRAPPER(yuv2, ya16, ya16le, AV_PIX_FMT_YA16LE, 1, 0)
> -
> -YUV2PACKED16WRAPPER(yuv2, rgba64_full, rgb48be_full, AV_PIX_FMT_RGB48BE, 0,
> 0)
> -YUV2PACKED16WRAPPER(yuv2, rgba64_full, rgb48le_full, AV_PIX_FMT_RGB48LE, 0,
> 0)
> -YUV2PACKED16WRAPPER(yuv2, rgba64_full, bgr48be_full, AV_PIX_FMT_BGR48BE, 0,
> 0)
> -YUV2PACKED16WRAPPER(yuv2, rgba64_full, bgr48le_full, AV_PIX_FMT_BGR48LE, 0,
> 0)
> -YUV2PACKED16WRAPPER(yuv2, rgba64_full, rgba64be_full, AV_PIX_FMT_RGBA64BE,
> 1, 1)
> -YUV2PACKED16WRAPPER(yuv2, rgba64_full, rgba64le_full, AV_PIX_FMT_RGBA64LE,
> 1, 1)
> -YUV2PACKED16WRAPPER(yuv2, rgba64_full, rgbx64be_full, AV_PIX_FMT_RGBA64BE,
> 0, 1)
> -YUV2PACKED16WRAPPER(yuv2, rgba64_full, rgbx64le_full, AV_PIX_FMT_RGBA64LE,
> 0, 1)
> -YUV2PACKED16WRAPPER(yuv2, rgba64_full, bgra64be_full, AV_PIX_FMT_BGRA64BE,
> 1, 1)
> -YUV2PACKED16WRAPPER(yuv2, rgba64_full, bgra64le_full, AV_PIX_FMT_BGRA64LE,
> 1, 1)
> -YUV2PACKED16WRAPPER(yuv2, rgba64_full, bgrx64be_full, AV_PIX_FMT_BGRA64BE,
> 0, 1)
> -YUV2PACKED16WRAPPER(yuv2, rgba64_full, bgrx64le_full, AV_PIX_FMT_BGRA64LE,
> 0, 1)
> +                                  dstW, uvalpha, y, fmt, hasAlpha,
> eightbytes, is_be); \
> +}
> +#define YUV2PACKED16WRAPPER(name, base, ext, fmt, endianness, hasAlpha,
> eightbytes) \
> +    YUV2PACKED16WRAPPER_0(name, base, ext, fmt ## endianness,
> IS_BE(endianness), hasAlpha, eightbytes)
> +
> +YUV2PACKED16WRAPPER(yuv2, rgba64, rgb48be, AV_PIX_FMT_RGB48, BE, 0, 0)
> +YUV2PACKED16WRAPPER(yuv2, rgba64, rgb48le, AV_PIX_FMT_RGB48, LE, 0, 0)
> +YUV2PACKED16WRAPPER(yuv2, rgba64, bgr48be, AV_PIX_FMT_BGR48, BE, 0, 0)
> +YUV2PACKED16WRAPPER(yuv2, rgba64, bgr48le, AV_PIX_FMT_BGR48, LE, 0, 0)
> +YUV2PACKED16WRAPPER(yuv2, rgba64, rgba64be, AV_PIX_FMT_RGBA64, BE, 1, 1)
> +YUV2PACKED16WRAPPER(yuv2, rgba64, rgba64le, AV_PIX_FMT_RGBA64, LE, 1, 1)
> +YUV2PACKED16WRAPPER(yuv2, rgba64, rgbx64be, AV_PIX_FMT_RGBA64, BE, 0, 1)
> +YUV2PACKED16WRAPPER(yuv2, rgba64, rgbx64le, AV_PIX_FMT_RGBA64, LE, 0, 1)
> +YUV2PACKED16WRAPPER(yuv2, rgba64, bgra64be, AV_PIX_FMT_BGRA64, BE, 1, 1)
> +YUV2PACKED16WRAPPER(yuv2, rgba64, bgra64le, AV_PIX_FMT_BGRA64, LE, 1, 1)
> +YUV2PACKED16WRAPPER(yuv2, rgba64, bgrx64be, AV_PIX_FMT_BGRA64, BE, 0, 1)
> +YUV2PACKED16WRAPPER(yuv2, rgba64, bgrx64le, AV_PIX_FMT_BGRA64, LE, 0, 1)
> +YUV2PACKED16WRAPPER(yuv2, ya16, ya16be, AV_PIX_FMT_YA16, BE, 1, 0)
> +YUV2PACKED16WRAPPER(yuv2, ya16, ya16le, AV_PIX_FMT_YA16, LE, 1, 0)
> +
> +YUV2PACKED16WRAPPER(yuv2, rgba64_full, rgb48be_full, AV_PIX_FMT_RGB48, BE,
> 0, 0)
> +YUV2PACKED16WRAPPER(yuv2, rgba64_full, rgb48le_full, AV_PIX_FMT_RGB48, LE,
> 0, 0)
> +YUV2PACKED16WRAPPER(yuv2, rgba64_full, bgr48be_full, AV_PIX_FMT_BGR48, BE,
> 0, 0)
> +YUV2PACKED16WRAPPER(yuv2, rgba64_full, bgr48le_full, AV_PIX_FMT_BGR48, LE,
> 0, 0)
> +YUV2PACKED16WRAPPER(yuv2, rgba64_full, rgba64be_full, AV_PIX_FMT_RGBA64,
> BE, 1, 1)
> +YUV2PACKED16WRAPPER(yuv2, rgba64_full, rgba64le_full, AV_PIX_FMT_RGBA64,
> LE, 1, 1)
> +YUV2PACKED16WRAPPER(yuv2, rgba64_full, rgbx64be_full, AV_PIX_FMT_RGBA64,
> BE, 0, 1)
> +YUV2PACKED16WRAPPER(yuv2, rgba64_full, rgbx64le_full, AV_PIX_FMT_RGBA64,
> LE, 0, 1)
> +YUV2PACKED16WRAPPER(yuv2, rgba64_full, bgra64be_full, AV_PIX_FMT_BGRA64,
> BE, 1, 1)
> +YUV2PACKED16WRAPPER(yuv2, rgba64_full, bgra64le_full, AV_PIX_FMT_BGRA64,
> LE, 1, 1)
> +YUV2PACKED16WRAPPER(yuv2, rgba64_full, bgrx64be_full, AV_PIX_FMT_BGRA64,
> BE, 0, 1)
> +YUV2PACKED16WRAPPER(yuv2, rgba64_full, bgrx64le_full, AV_PIX_FMT_BGRA64,
> LE, 0, 1)
>
>  /*
>   * Write out 2 RGB pixels in the target pixel format. This function takes
> a
> --
> 2.34.1
>
> _______________________________________________
> ffmpeg-devel mailing list
> ffmpeg-devel@ffmpeg.org
> https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
>
> To unsubscribe, visit link above, or email
> ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".
>
diff mbox series

Patch

diff --git a/libswscale/output.c b/libswscale/output.c
index 40a4476c6d..590334eb57 100644
--- a/libswscale/output.c
+++ b/libswscale/output.c
@@ -919,7 +919,7 @@  YUV2PACKEDWRAPPER(yuv2, 422, uyvy422, AV_PIX_FMT_UYVY422)
 #define R_B ((target == AV_PIX_FMT_RGB48LE || target == AV_PIX_FMT_RGB48BE || target == AV_PIX_FMT_RGBA64LE || target == AV_PIX_FMT_RGBA64BE) ? R : B)
 #define B_R ((target == AV_PIX_FMT_RGB48LE || target == AV_PIX_FMT_RGB48BE || target == AV_PIX_FMT_RGBA64LE || target == AV_PIX_FMT_RGBA64BE) ? B : R)
 #define output_pixel(pos, val) \
-    if (isBE(target)) { \
+    if (is_be) { \
         AV_WB16(pos, val); \
     } else { \
         AV_WL16(pos, val); \
@@ -931,7 +931,8 @@  yuv2ya16_X_c_template(SwsContext *c, const int16_t *lumFilter,
                         const int16_t *chrFilter, const int32_t **unused_chrUSrc,
                         const int32_t **unused_chrVSrc, int unused_chrFilterSize,
                         const int32_t **alpSrc, uint16_t *dest, int dstW,
-                        int y, enum AVPixelFormat target, int unused_hasAlpha, int unused_eightbytes)
+                        int y, enum AVPixelFormat target,
+                        int unused_hasAlpha, int unused_eightbytes, int is_be)
 {
     int hasAlpha = !!alpSrc;
     int i;
@@ -968,7 +969,8 @@  yuv2ya16_2_c_template(SwsContext *c, const int32_t *buf[2],
                         const int32_t *unused_ubuf[2], const int32_t *unused_vbuf[2],
                         const int32_t *abuf[2], uint16_t *dest, int dstW,
                         int yalpha, int unused_uvalpha, int y,
-                        enum AVPixelFormat target, int unused_hasAlpha, int unused_eightbytes)
+                        enum AVPixelFormat target, int unused_hasAlpha,
+                        int unused_eightbytes, int is_be)
 {
     int hasAlpha = abuf && abuf[0] && abuf[1];
     const int32_t *buf0  = buf[0],  *buf1  = buf[1],
@@ -999,7 +1001,8 @@  static av_always_inline void
 yuv2ya16_1_c_template(SwsContext *c, const int32_t *buf0,
                         const int32_t *unused_ubuf[2], const int32_t *unused_vbuf[2],
                         const int32_t *abuf0, uint16_t *dest, int dstW,
-                        int unused_uvalpha, int y, enum AVPixelFormat target, int unused_hasAlpha, int unused_eightbytes)
+                        int unused_uvalpha, int y, enum AVPixelFormat target,
+                        int unused_hasAlpha, int unused_eightbytes, int is_be)
 {
     int hasAlpha = !!abuf0;
     int i;
@@ -1027,7 +1030,8 @@  yuv2rgba64_X_c_template(SwsContext *c, const int16_t *lumFilter,
                        const int16_t *chrFilter, const int32_t **chrUSrc,
                        const int32_t **chrVSrc, int chrFilterSize,
                        const int32_t **alpSrc, uint16_t *dest, int dstW,
-                       int y, enum AVPixelFormat target, int hasAlpha, int eightbytes)
+                       int y, enum AVPixelFormat target, int hasAlpha, int eightbytes,
+                       int is_be)
 {
     int i;
     int A1 = 0xffff<<14, A2 = 0xffff<<14;
@@ -1108,7 +1112,8 @@  yuv2rgba64_2_c_template(SwsContext *c, const int32_t *buf[2],
                        const int32_t *ubuf[2], const int32_t *vbuf[2],
                        const int32_t *abuf[2], uint16_t *dest, int dstW,
                        int yalpha, int uvalpha, int y,
-                       enum AVPixelFormat target, int hasAlpha, int eightbytes)
+                       enum AVPixelFormat target, int hasAlpha, int eightbytes,
+                       int is_be)
 {
     const int32_t *buf0  = buf[0],  *buf1  = buf[1],
                   *ubuf0 = ubuf[0], *ubuf1 = ubuf[1],
@@ -1172,7 +1177,8 @@  static av_always_inline void
 yuv2rgba64_1_c_template(SwsContext *c, const int32_t *buf0,
                        const int32_t *ubuf[2], const int32_t *vbuf[2],
                        const int32_t *abuf0, uint16_t *dest, int dstW,
-                       int uvalpha, int y, enum AVPixelFormat target, int hasAlpha, int eightbytes)
+                       int uvalpha, int y, enum AVPixelFormat target,
+                       int hasAlpha, int eightbytes, int is_be)
 {
     const int32_t *ubuf0 = ubuf[0], *vbuf0 = vbuf[0];
     int i;
@@ -1277,7 +1283,8 @@  yuv2rgba64_full_X_c_template(SwsContext *c, const int16_t *lumFilter,
                        const int16_t *chrFilter, const int32_t **chrUSrc,
                        const int32_t **chrVSrc, int chrFilterSize,
                        const int32_t **alpSrc, uint16_t *dest, int dstW,
-                       int y, enum AVPixelFormat target, int hasAlpha, int eightbytes)
+                       int y, enum AVPixelFormat target, int hasAlpha,
+                       int eightbytes, int is_be)
 {
     int i;
     int A = 0xffff<<14;
@@ -1340,7 +1347,8 @@  yuv2rgba64_full_2_c_template(SwsContext *c, const int32_t *buf[2],
                        const int32_t *ubuf[2], const int32_t *vbuf[2],
                        const int32_t *abuf[2], uint16_t *dest, int dstW,
                        int yalpha, int uvalpha, int y,
-                       enum AVPixelFormat target, int hasAlpha, int eightbytes)
+                       enum AVPixelFormat target, int hasAlpha, int eightbytes,
+                       int is_be)
 {
     const int32_t *buf0  = buf[0],  *buf1  = buf[1],
                   *ubuf0 = ubuf[0], *ubuf1 = ubuf[1],
@@ -1391,7 +1399,8 @@  static av_always_inline void
 yuv2rgba64_full_1_c_template(SwsContext *c, const int32_t *buf0,
                        const int32_t *ubuf[2], const int32_t *vbuf[2],
                        const int32_t *abuf0, uint16_t *dest, int dstW,
-                       int uvalpha, int y, enum AVPixelFormat target, int hasAlpha, int eightbytes)
+                       int uvalpha, int y, enum AVPixelFormat target,
+                       int hasAlpha, int eightbytes, int is_be)
 {
     const int32_t *ubuf0 = ubuf[0], *vbuf0 = vbuf[0];
     int i;
@@ -1468,7 +1477,11 @@  yuv2rgba64_full_1_c_template(SwsContext *c, const int32_t *buf0,
 #undef r_b
 #undef b_r
 
-#define YUV2PACKED16WRAPPER(name, base, ext, fmt, hasAlpha, eightbytes) \
+#define IS_BE_LE 0
+#define IS_BE_BE 1
+#define IS_BE(BE_LE) IS_BE_ ## BE_LE
+
+#define YUV2PACKED16WRAPPER_0(name, base, ext, fmt, is_be, hasAlpha, eightbytes) \
 static void name ## ext ## _X_c(SwsContext *c, const int16_t *lumFilter, \
                         const int16_t **_lumSrc, int lumFilterSize, \
                         const int16_t *chrFilter, const int16_t **_chrUSrc, \
@@ -1483,7 +1496,7 @@  static void name ## ext ## _X_c(SwsContext *c, const int16_t *lumFilter, \
     uint16_t *dest = (uint16_t *) _dest; \
     name ## base ## _X_c_template(c, lumFilter, lumSrc, lumFilterSize, \
                           chrFilter, chrUSrc, chrVSrc, chrFilterSize, \
-                          alpSrc, dest, dstW, y, fmt, hasAlpha, eightbytes); \
+                          alpSrc, dest, dstW, y, fmt, hasAlpha, eightbytes, is_be); \
 } \
  \
 static void name ## ext ## _2_c(SwsContext *c, const int16_t *_buf[2], \
@@ -1497,7 +1510,7 @@  static void name ## ext ## _2_c(SwsContext *c, const int16_t *_buf[2], \
                   **abuf = (const int32_t **) _abuf; \
     uint16_t *dest = (uint16_t *) _dest; \
     name ## base ## _2_c_template(c, buf, ubuf, vbuf, abuf, \
-                          dest, dstW, yalpha, uvalpha, y, fmt, hasAlpha, eightbytes); \
+                          dest, dstW, yalpha, uvalpha, y, fmt, hasAlpha, eightbytes, is_be); \
 } \
  \
 static void name ## ext ## _1_c(SwsContext *c, const int16_t *_buf0, \
@@ -1511,36 +1524,38 @@  static void name ## ext ## _1_c(SwsContext *c, const int16_t *_buf0, \
                   *abuf0 = (const int32_t *)  _abuf0; \
     uint16_t *dest = (uint16_t *) _dest; \
     name ## base ## _1_c_template(c, buf0, ubuf, vbuf, abuf0, dest, \
-                                  dstW, uvalpha, y, fmt, hasAlpha, eightbytes); \
-}
-
-YUV2PACKED16WRAPPER(yuv2, rgba64, rgb48be, AV_PIX_FMT_RGB48BE, 0, 0)
-YUV2PACKED16WRAPPER(yuv2, rgba64, rgb48le, AV_PIX_FMT_RGB48LE, 0, 0)
-YUV2PACKED16WRAPPER(yuv2, rgba64, bgr48be, AV_PIX_FMT_BGR48BE, 0, 0)
-YUV2PACKED16WRAPPER(yuv2, rgba64, bgr48le, AV_PIX_FMT_BGR48LE, 0, 0)
-YUV2PACKED16WRAPPER(yuv2, rgba64, rgba64be, AV_PIX_FMT_RGBA64BE, 1, 1)
-YUV2PACKED16WRAPPER(yuv2, rgba64, rgba64le, AV_PIX_FMT_RGBA64LE, 1, 1)
-YUV2PACKED16WRAPPER(yuv2, rgba64, rgbx64be, AV_PIX_FMT_RGBA64BE, 0, 1)
-YUV2PACKED16WRAPPER(yuv2, rgba64, rgbx64le, AV_PIX_FMT_RGBA64LE, 0, 1)
-YUV2PACKED16WRAPPER(yuv2, rgba64, bgra64be, AV_PIX_FMT_BGRA64BE, 1, 1)
-YUV2PACKED16WRAPPER(yuv2, rgba64, bgra64le, AV_PIX_FMT_BGRA64LE, 1, 1)
-YUV2PACKED16WRAPPER(yuv2, rgba64, bgrx64be, AV_PIX_FMT_BGRA64BE, 0, 1)
-YUV2PACKED16WRAPPER(yuv2, rgba64, bgrx64le, AV_PIX_FMT_BGRA64LE, 0, 1)
-YUV2PACKED16WRAPPER(yuv2, ya16, ya16be, AV_PIX_FMT_YA16BE, 1, 0)
-YUV2PACKED16WRAPPER(yuv2, ya16, ya16le, AV_PIX_FMT_YA16LE, 1, 0)
-
-YUV2PACKED16WRAPPER(yuv2, rgba64_full, rgb48be_full, AV_PIX_FMT_RGB48BE, 0, 0)
-YUV2PACKED16WRAPPER(yuv2, rgba64_full, rgb48le_full, AV_PIX_FMT_RGB48LE, 0, 0)
-YUV2PACKED16WRAPPER(yuv2, rgba64_full, bgr48be_full, AV_PIX_FMT_BGR48BE, 0, 0)
-YUV2PACKED16WRAPPER(yuv2, rgba64_full, bgr48le_full, AV_PIX_FMT_BGR48LE, 0, 0)
-YUV2PACKED16WRAPPER(yuv2, rgba64_full, rgba64be_full, AV_PIX_FMT_RGBA64BE, 1, 1)
-YUV2PACKED16WRAPPER(yuv2, rgba64_full, rgba64le_full, AV_PIX_FMT_RGBA64LE, 1, 1)
-YUV2PACKED16WRAPPER(yuv2, rgba64_full, rgbx64be_full, AV_PIX_FMT_RGBA64BE, 0, 1)
-YUV2PACKED16WRAPPER(yuv2, rgba64_full, rgbx64le_full, AV_PIX_FMT_RGBA64LE, 0, 1)
-YUV2PACKED16WRAPPER(yuv2, rgba64_full, bgra64be_full, AV_PIX_FMT_BGRA64BE, 1, 1)
-YUV2PACKED16WRAPPER(yuv2, rgba64_full, bgra64le_full, AV_PIX_FMT_BGRA64LE, 1, 1)
-YUV2PACKED16WRAPPER(yuv2, rgba64_full, bgrx64be_full, AV_PIX_FMT_BGRA64BE, 0, 1)
-YUV2PACKED16WRAPPER(yuv2, rgba64_full, bgrx64le_full, AV_PIX_FMT_BGRA64LE, 0, 1)
+                                  dstW, uvalpha, y, fmt, hasAlpha, eightbytes, is_be); \
+}
+#define YUV2PACKED16WRAPPER(name, base, ext, fmt, endianness, hasAlpha, eightbytes) \
+    YUV2PACKED16WRAPPER_0(name, base, ext, fmt ## endianness, IS_BE(endianness), hasAlpha, eightbytes)
+
+YUV2PACKED16WRAPPER(yuv2, rgba64, rgb48be, AV_PIX_FMT_RGB48, BE, 0, 0)
+YUV2PACKED16WRAPPER(yuv2, rgba64, rgb48le, AV_PIX_FMT_RGB48, LE, 0, 0)
+YUV2PACKED16WRAPPER(yuv2, rgba64, bgr48be, AV_PIX_FMT_BGR48, BE, 0, 0)
+YUV2PACKED16WRAPPER(yuv2, rgba64, bgr48le, AV_PIX_FMT_BGR48, LE, 0, 0)
+YUV2PACKED16WRAPPER(yuv2, rgba64, rgba64be, AV_PIX_FMT_RGBA64, BE, 1, 1)
+YUV2PACKED16WRAPPER(yuv2, rgba64, rgba64le, AV_PIX_FMT_RGBA64, LE, 1, 1)
+YUV2PACKED16WRAPPER(yuv2, rgba64, rgbx64be, AV_PIX_FMT_RGBA64, BE, 0, 1)
+YUV2PACKED16WRAPPER(yuv2, rgba64, rgbx64le, AV_PIX_FMT_RGBA64, LE, 0, 1)
+YUV2PACKED16WRAPPER(yuv2, rgba64, bgra64be, AV_PIX_FMT_BGRA64, BE, 1, 1)
+YUV2PACKED16WRAPPER(yuv2, rgba64, bgra64le, AV_PIX_FMT_BGRA64, LE, 1, 1)
+YUV2PACKED16WRAPPER(yuv2, rgba64, bgrx64be, AV_PIX_FMT_BGRA64, BE, 0, 1)
+YUV2PACKED16WRAPPER(yuv2, rgba64, bgrx64le, AV_PIX_FMT_BGRA64, LE, 0, 1)
+YUV2PACKED16WRAPPER(yuv2, ya16, ya16be, AV_PIX_FMT_YA16, BE, 1, 0)
+YUV2PACKED16WRAPPER(yuv2, ya16, ya16le, AV_PIX_FMT_YA16, LE, 1, 0)
+
+YUV2PACKED16WRAPPER(yuv2, rgba64_full, rgb48be_full, AV_PIX_FMT_RGB48, BE, 0, 0)
+YUV2PACKED16WRAPPER(yuv2, rgba64_full, rgb48le_full, AV_PIX_FMT_RGB48, LE, 0, 0)
+YUV2PACKED16WRAPPER(yuv2, rgba64_full, bgr48be_full, AV_PIX_FMT_BGR48, BE, 0, 0)
+YUV2PACKED16WRAPPER(yuv2, rgba64_full, bgr48le_full, AV_PIX_FMT_BGR48, LE, 0, 0)
+YUV2PACKED16WRAPPER(yuv2, rgba64_full, rgba64be_full, AV_PIX_FMT_RGBA64, BE, 1, 1)
+YUV2PACKED16WRAPPER(yuv2, rgba64_full, rgba64le_full, AV_PIX_FMT_RGBA64, LE, 1, 1)
+YUV2PACKED16WRAPPER(yuv2, rgba64_full, rgbx64be_full, AV_PIX_FMT_RGBA64, BE, 0, 1)
+YUV2PACKED16WRAPPER(yuv2, rgba64_full, rgbx64le_full, AV_PIX_FMT_RGBA64, LE, 0, 1)
+YUV2PACKED16WRAPPER(yuv2, rgba64_full, bgra64be_full, AV_PIX_FMT_BGRA64, BE, 1, 1)
+YUV2PACKED16WRAPPER(yuv2, rgba64_full, bgra64le_full, AV_PIX_FMT_BGRA64, LE, 1, 1)
+YUV2PACKED16WRAPPER(yuv2, rgba64_full, bgrx64be_full, AV_PIX_FMT_BGRA64, BE, 0, 1)
+YUV2PACKED16WRAPPER(yuv2, rgba64_full, bgrx64le_full, AV_PIX_FMT_BGRA64, LE, 0, 1)
 
 /*
  * Write out 2 RGB pixels in the target pixel format. This function takes a