diff mbox series

[FFmpeg-devel,v3,4/4] swscale/bswapdsp: copy over bswapdsp from avcodec

Message ID 20221213024216.259-4-mindmark@gmail.com
State New
Headers show
Series [FFmpeg-devel,v3,1/4] avcodec/bswapdsp: remove unused cextern | expand

Checks

Context Check Description
yinshiyou/make_loongarch64 success Make finished
yinshiyou/make_fate_loongarch64 success Make fate finished
andriy/make_x86 success Make finished
andriy/make_fate_x86 success Make fate finished

Commit Message

Mark Reid Dec. 13, 2022, 2:42 a.m. UTC
From: Mark Reid <mindmark@gmail.com>

There are some places in input.c that could use it too,
but they aren't currently being passed the SwsContext.
---
 libswscale/Makefile              |   1 +
 libswscale/bswapdsp.c            |  59 ++++++++++++
 libswscale/bswapdsp.h            |  66 +++++++++++++
 libswscale/output.c              |  36 +++----
 libswscale/riscv/Makefile        |   7 +-
 libswscale/riscv/bswapdsp_init.c |  46 +++++++++
 libswscale/riscv/bswapdsp_rvb.S  |  68 +++++++++++++
 libswscale/riscv/bswapdsp_rvv.S  |  62 ++++++++++++
 libswscale/swscale_internal.h    |   3 +
 libswscale/swscale_unscaled.c    |  26 ++---
 libswscale/utils.c               |   2 +
 libswscale/x86/Makefile          |   6 +-
 libswscale/x86/bswapdsp.asm      | 157 +++++++++++++++++++++++++++++++
 libswscale/x86/bswapdsp_init.c   |  40 ++++++++
 14 files changed, 537 insertions(+), 42 deletions(-)
 create mode 100644 libswscale/bswapdsp.c
 create mode 100644 libswscale/bswapdsp.h
 create mode 100644 libswscale/riscv/bswapdsp_init.c
 create mode 100644 libswscale/riscv/bswapdsp_rvb.S
 create mode 100644 libswscale/riscv/bswapdsp_rvv.S
 create mode 100644 libswscale/x86/bswapdsp.asm
 create mode 100644 libswscale/x86/bswapdsp_init.c
diff mbox series

Patch

diff --git a/libswscale/Makefile b/libswscale/Makefile
index 757997b401..4a916739c3 100644
--- a/libswscale/Makefile
+++ b/libswscale/Makefile
@@ -6,6 +6,7 @@  HEADERS = swscale.h                                                     \
           version_major.h                                               \
 
 OBJS = alphablend.o                                     \
+       bswapdsp.o                                       \
        hscale.o                                         \
        hscale_fast_bilinear.o                           \
        gamma.o                                          \
diff --git a/libswscale/bswapdsp.c b/libswscale/bswapdsp.c
new file mode 100644
index 0000000000..a164d89a76
--- /dev/null
+++ b/libswscale/bswapdsp.c
@@ -0,0 +1,59 @@ 
+/*
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include <stdint.h>
+
+#include "libavutil/attributes.h"
+#include "libavutil/bswap.h"
+#include "bswapdsp.h"
+
+static void bswap32_buf(uint32_t *dst, const uint32_t *src, int len) /* portable C version */
+{
+    int i;
+
+    for (i = 0; i + 8 <= len; i += 8) { /* unrolled: 8 elements per pass */
+        dst[i + 0] = av_bswap32(src[i + 0]);
+        dst[i + 1] = av_bswap32(src[i + 1]);
+        dst[i + 2] = av_bswap32(src[i + 2]);
+        dst[i + 3] = av_bswap32(src[i + 3]);
+        dst[i + 4] = av_bswap32(src[i + 4]);
+        dst[i + 5] = av_bswap32(src[i + 5]);
+        dst[i + 6] = av_bswap32(src[i + 6]);
+        dst[i + 7] = av_bswap32(src[i + 7]);
+    }
+    for (; i < len; i++) /* tail: remaining elements, one at a time */
+        dst[i + 0] = av_bswap32(src[i + 0]);
+}
+
+static void bswap16_buf(uint16_t *dst, const uint16_t *src, int len) /* portable C version */
+{
+    for (; len > 0; len--) /* no-op for len <= 0, matching bswap32_buf */
+        *dst++ = av_bswap16(*src++);
+}
+
+av_cold void ff_sws_bswapdsp_init(BswapDSPContext *c)
+{
+    c->bswap32_buf = bswap32_buf; /* C defaults, installed unconditionally */
+    c->bswap16_buf = bswap16_buf;
+
+#if ARCH_RISCV /* the arch init may replace the pointers set above */
+    ff_sws_bswapdsp_init_riscv(c);
+#elif ARCH_X86
+    ff_sws_bswapdsp_init_x86(c);
+#endif
+}
diff --git a/libswscale/bswapdsp.h b/libswscale/bswapdsp.h
new file mode 100644
index 0000000000..f2e12d1b8f
--- /dev/null
+++ b/libswscale/bswapdsp.h
@@ -0,0 +1,66 @@ 
+/*
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#ifndef SWSCALE_BSWAPDSP_H
+#define SWSCALE_BSWAPDSP_H
+
+#include <stdint.h>
+
+/**
+ * @file
+ * Optimized buffer byte swapping routines.
+ */
+
+typedef struct BswapDSPContext {
+    /**
+     * Byte swap 32 bit elements in a buffer.
+     *
+     * @param dst Destination buffer.
+     * @param src Source buffer, may be the same as dst.
+     * @param len The number of elements in the buffer.
+     *
+     */
+    /** @{ */
+    void (*bswap32_buf)(uint32_t *dst, const uint32_t *src, int len);
+    /** @} */
+
+    /**
+     * Byte swap 16 bit elements in a buffer.
+     *
+     * @param dst Destination buffer.
+     * @param src Source buffer, may be the same as dst.
+     * @param len The number of elements in the buffer.
+     *
+     */
+    /** @{ */
+    void (*bswap16_buf)(uint16_t *dst, const uint16_t *src, int len);
+    /** @} */
+} BswapDSPContext;
+
+
+/**
+ * Initialize BswapDSPContext function pointers.
+ *
+ * @param c pointer to BswapDSPContext
+ *
+ */
+void ff_sws_bswapdsp_init(BswapDSPContext *c);
+void ff_sws_bswapdsp_init_riscv(BswapDSPContext *c); /* RISC-V overrides, called from ff_sws_bswapdsp_init() */
+void ff_sws_bswapdsp_init_x86(BswapDSPContext *c); /* x86 overrides, called from ff_sws_bswapdsp_init() */
+
+#endif /* SWSCALE_BSWAPDSP_H */
diff --git a/libswscale/output.c b/libswscale/output.c
index 5c85bff971..cd44081e3d 100644
--- a/libswscale/output.c
+++ b/libswscale/output.c
@@ -2313,13 +2313,11 @@  yuv2gbrp_full_X_c(SwsContext *c, const int16_t *lumFilter,
         }
     }
     if (SH != 22 && (!isBE(c->dstFormat)) != (!HAVE_BIGENDIAN)) {
-        for (i = 0; i < dstW; i++) {
-            dest16[0][i] = av_bswap16(dest16[0][i]);
-            dest16[1][i] = av_bswap16(dest16[1][i]);
-            dest16[2][i] = av_bswap16(dest16[2][i]);
-            if (hasAlpha)
-                dest16[3][i] = av_bswap16(dest16[3][i]);
-        }
+        c->bsdsp.bswap16_buf(dest16[0], dest16[0], dstW);
+        c->bsdsp.bswap16_buf(dest16[1], dest16[1], dstW);
+        c->bsdsp.bswap16_buf(dest16[2], dest16[2], dstW);
+        if (hasAlpha)
+            c->bsdsp.bswap16_buf(dest16[3], dest16[3], dstW);
     }
 }
 
@@ -2385,13 +2383,11 @@  yuv2gbrp16_full_X_c(SwsContext *c, const int16_t *lumFilter,
             dest16[3][i] = av_clip_uintp2(A, 30) >> 14;
     }
     if ((!isBE(c->dstFormat)) != (!HAVE_BIGENDIAN)) {
-        for (i = 0; i < dstW; i++) {
-            dest16[0][i] = av_bswap16(dest16[0][i]);
-            dest16[1][i] = av_bswap16(dest16[1][i]);
-            dest16[2][i] = av_bswap16(dest16[2][i]);
-            if (hasAlpha)
-                dest16[3][i] = av_bswap16(dest16[3][i]);
-        }
+        c->bsdsp.bswap16_buf(dest16[0], dest16[0], dstW);
+        c->bsdsp.bswap16_buf(dest16[1], dest16[1], dstW);
+        c->bsdsp.bswap16_buf(dest16[2], dest16[2], dstW);
+        if (hasAlpha)
+            c->bsdsp.bswap16_buf(dest16[3], dest16[3], dstW);
     }
 }
 
@@ -2461,13 +2457,11 @@  yuv2gbrpf32_full_X_c(SwsContext *c, const int16_t *lumFilter,
             dest32[3][i] = av_float2int(float_mult * (float)(av_clip_uintp2(A, 30) >> 14));
     }
     if ((!isBE(c->dstFormat)) != (!HAVE_BIGENDIAN)) {
-        for (i = 0; i < dstW; i++) {
-            dest32[0][i] = av_bswap32(dest32[0][i]);
-            dest32[1][i] = av_bswap32(dest32[1][i]);
-            dest32[2][i] = av_bswap32(dest32[2][i]);
-            if (hasAlpha)
-                dest32[3][i] = av_bswap32(dest32[3][i]);
-        }
+        c->bsdsp.bswap32_buf(dest32[0], dest32[0], dstW);
+        c->bsdsp.bswap32_buf(dest32[1], dest32[1], dstW);
+        c->bsdsp.bswap32_buf(dest32[2], dest32[2], dstW);
+        if (hasAlpha)
+            c->bsdsp.bswap32_buf(dest32[3], dest32[3], dstW);
     }
 }
 
diff --git a/libswscale/riscv/Makefile b/libswscale/riscv/Makefile
index 214d877b62..bb33f2b707 100644
--- a/libswscale/riscv/Makefile
+++ b/libswscale/riscv/Makefile
@@ -1,2 +1,5 @@ 
-OBJS += riscv/rgb2rgb.o
-RVV-OBJS += riscv/rgb2rgb_rvv.o
+OBJS += riscv/bswapdsp_init.o \
+        riscv/bswapdsp_rvb.o \
+        riscv/rgb2rgb.o
+RVV-OBJS += riscv/bswapdsp_rvv.o \
+            riscv/rgb2rgb_rvv.o
diff --git a/libswscale/riscv/bswapdsp_init.c b/libswscale/riscv/bswapdsp_init.c
new file mode 100644
index 0000000000..9fedcde3fa
--- /dev/null
+++ b/libswscale/riscv/bswapdsp_init.c
@@ -0,0 +1,46 @@ 
+/*
+ * Copyright © 2022 Rémi Denis-Courmont.
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include <stdint.h>
+
+#include "config.h"
+#include "libavutil/attributes.h"
+#include "libavutil/cpu.h"
+#include "libswscale/bswapdsp.h"
+
+void ff_sws_bswap32_buf_rvb(uint32_t *dst, const uint32_t *src, int len);
+void ff_sws_bswap32_buf_rvv(uint32_t *dst, const uint32_t *src, int len);
+void ff_sws_bswap16_buf_rvv(uint16_t *dst, const uint16_t *src, int len);
+
+av_cold void ff_sws_bswapdsp_init_riscv(BswapDSPContext *c)
+{
+    int cpu_flags = av_get_cpu_flags();
+
+#if (__riscv_xlen >= 64) /* the RVB routine is only assembled for XLEN >= 64 */
+    if (cpu_flags & AV_CPU_FLAG_RVB_BASIC)
+        c->bswap32_buf = ff_sws_bswap32_buf_rvb;
+#endif
+#if HAVE_RVV
+    if (cpu_flags & AV_CPU_FLAG_RVV_I32) { /* checked last, so RVV replaces RVB */
+        c->bswap32_buf = ff_sws_bswap32_buf_rvv;
+        c->bswap16_buf = ff_sws_bswap16_buf_rvv;
+    }
+#endif
+}
diff --git a/libswscale/riscv/bswapdsp_rvb.S b/libswscale/riscv/bswapdsp_rvb.S
new file mode 100644
index 0000000000..92edbce7cd
--- /dev/null
+++ b/libswscale/riscv/bswapdsp_rvb.S
@@ -0,0 +1,68 @@ 
+/*
+ * Copyright © 2022 Rémi Denis-Courmont.
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "config.h"
+#include "libavutil/riscv/asm.S"
+
+#if (__riscv_xlen >= 64)
+func ff_sws_bswap32_buf_rvb, zbb
+        andi    t0, a1, 4
+        beqz    t0, 1f
+        /* Align a1 (input) to 64-bit. NOTE(review): runs even when len == 0; presumably callers pass len > 0 -- confirm */
+        lwu     t0, (a1)
+        addi    a0, a0, 4
+        rev8    t0, t0
+        addi    a2, a2, -1
+        srli    t0, t0, __riscv_xlen - 32 /* rev8 reverses the whole register; move the word back down */
+        addi    a1, a1, 4
+        sw      t0, -4(a0)
+1:
+        andi    a3, a2, -2 /* a3 = remaining count rounded down to even */
+        sh2add  a2, a2, a0 /* a2 = a0 + 4 * remaining count (end of dst) */
+        beqz    a3, 3f
+        sh2add  a3, a3, a0 /* a3 = a0 + 4 * even count (end of paired loop) */
+2:      /* 2 elements (64 bits) at a time on a 64-bit boundary */
+        ld      t0,  (a1)
+        addi    a0, a0, 8
+        rev8    t0, t0
+#if (__riscv_xlen == 64)
+        srli    t2, t0, 32
+        sw      t0, -4(a0)
+#else
+        srli    t1, t0, __riscv_xlen - 64
+        srli    t2, t0, __riscv_xlen - 32
+        sw      t1, -4(a0)
+#endif
+        addi    a1, a1, 8
+        sw      t2, -8(a0)
+        bne     a0, a3, 2b
+3:
+        beq     a0, a2, 5f /* done if dst already reached its end */
+4:      /* Process last element */
+        lwu     t0, (a1)
+        addi    a0, a0, 4
+        rev8    t0, t0
+        addi    a1, a1, 4
+        srli    t0, t0, __riscv_xlen - 32
+        sw      t0, -4(a0)
+5:
+        ret
+#endif
diff --git a/libswscale/riscv/bswapdsp_rvv.S b/libswscale/riscv/bswapdsp_rvv.S
new file mode 100644
index 0000000000..923cb9bc9d
--- /dev/null
+++ b/libswscale/riscv/bswapdsp_rvv.S
@@ -0,0 +1,62 @@ 
+/*
+ * Copyright © 2022 Rémi Denis-Courmont.
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "config.h"
+#include "libavutil/riscv/asm.S"
+
+func ff_sws_bswap32_buf_rvv, zve32x
+        li      t4, 4 /* store stride in bytes: one 32-bit element */
+        addi    t1, a0, 1
+        addi    t2, a0, 2
+        addi    t3, a0, 3 /* t1..t3 = dst + 1..3, one base per output byte */
+1:
+        vsetvli    t0, a2, e8, m1, ta, ma /* t0 = elements handled this pass */
+        vlseg4e8.v v8, (a1) /* v8..v11 = bytes 0..3 of each element */
+        sub        a2, a2, t0
+        sh2add     a1, t0, a1
+        vsse8.v    v8, (t3), t4 /* source byte 0 -> destination byte 3 */
+        sh2add     t3, t0, t3
+        vsse8.v    v9, (t2), t4
+        sh2add     t2, t0, t2
+        vsse8.v    v10, (t1), t4
+        sh2add     t1, t0, t1
+        vsse8.v    v11, (a0), t4 /* source byte 3 -> destination byte 0 */
+        sh2add     a0, t0, a0
+        bnez       a2, 1b
+
+        ret
+endfunc
+
+func ff_sws_bswap16_buf_rvv, zve32x
+        li      t2, 2 /* store stride in bytes: one 16-bit element */
+        addi    t1, a0, 1 /* t1 = dst + 1, base for the second output byte */
+1:
+        vsetvli    t0, a2, e8, m1, ta, ma /* t0 = elements handled this pass */
+        vlseg2e8.v v8, (a1) /* v8/v9 = bytes 0/1 of each element */
+        sub        a2, a2, t0
+        sh1add     a1, t0, a1
+        vsse8.v    v8, (t1), t2 /* source byte 0 -> destination byte 1 */
+        sh1add     t1, t0, t1
+        vsse8.v    v9, (a0), t2 /* source byte 1 -> destination byte 0 */
+        sh1add     a0, t0, a0
+        bnez       a2, 1b
+
+        ret
+endfunc
diff --git a/libswscale/swscale_internal.h b/libswscale/swscale_internal.h
index abeebbb002..30ce4907cc 100644
--- a/libswscale/swscale_internal.h
+++ b/libswscale/swscale_internal.h
@@ -36,6 +36,7 @@ 
 #include "libavutil/slicethread.h"
 #include "libavutil/ppc/util_altivec.h"
 #include "libavutil/half2float.h"
+#include "bswapdsp.h"
 
 #define STR(s) AV_TOSTRING(s) // AV_STRINGIFY is too long
 
@@ -682,6 +683,8 @@  typedef struct SwsContext {
     atomic_int   data_unaligned_warned;
 
     Half2FloatTables *h2f_tables;
+
+    BswapDSPContext bsdsp;
 } SwsContext;
 //FIXME check init (where 0)
 
diff --git a/libswscale/swscale_unscaled.c b/libswscale/swscale_unscaled.c
index 9af2e7ecc3..0010ab24d1 100644
--- a/libswscale/swscale_unscaled.c
+++ b/libswscale/swscale_unscaled.c
@@ -468,7 +468,7 @@  static int bswap_16bpc(SwsContext *c, const uint8_t *src[],
                               int srcStride[], int srcSliceY, int srcSliceH,
                               uint8_t *dst[], int dstStride[])
 {
-    int i, j, p;
+    int i, p;
 
     for (p = 0; p < 4; p++) {
         int srcstr = srcStride[p] / 2;
@@ -480,9 +480,7 @@  static int bswap_16bpc(SwsContext *c, const uint8_t *src[],
             continue;
         dstPtr += (srcSliceY >> c->chrDstVSubSample) * dststr;
         for (i = 0; i < (srcSliceH >> c->chrDstVSubSample); i++) {
-            for (j = 0; j < min_stride; j++) {
-                dstPtr[j] = av_bswap16(srcPtr[j]);
-            }
+            c->bsdsp.bswap16_buf(dstPtr, srcPtr, min_stride);
             srcPtr += srcstr;
             dstPtr += dststr;
         }
@@ -495,7 +493,7 @@  static int bswap_32bpc(SwsContext *c, const uint8_t *src[],
                               int srcStride[], int srcSliceY, int srcSliceH,
                               uint8_t *dst[], int dstStride[])
 {
-    int i, j, p;
+    int i, p;
 
     for (p = 0; p < 4; p++) {
         int srcstr = srcStride[p] / 4;
@@ -507,9 +505,7 @@  static int bswap_32bpc(SwsContext *c, const uint8_t *src[],
             continue;
         dstPtr += (srcSliceY >> c->chrDstVSubSample) * dststr;
         for (i = 0; i < (srcSliceH >> c->chrDstVSubSample); i++) {
-            for (j = 0; j < min_stride; j++) {
-                dstPtr[j] = av_bswap32(srcPtr[j]);
-            }
+            c->bsdsp.bswap32_buf(dstPtr, srcPtr, min_stride);
             srcPtr += srcstr;
             dstPtr += dststr;
         }
@@ -1616,19 +1612,17 @@  static int rgbToRgbWrapper(SwsContext *c, const uint8_t *src[], int srcStride[],
             conv(srcPtr, dstPtr + dstStride[0] * srcSliceY,
                  (srcSliceH - 1) * srcStride[0] + c->srcW * srcBpp);
         else {
-            int i, j;
+            int i;
             dstPtr += dstStride[0] * srcSliceY;
 
             for (i = 0; i < srcSliceH; i++) {
                 if(src_bswap) {
-                    for(j=0; j<c->srcW; j++)
-                        ((uint16_t*)c->formatConvBuffer)[j] = av_bswap16(((uint16_t*)srcPtr)[j]);
+                    c->bsdsp.bswap16_buf((uint16_t*)c->formatConvBuffer, (uint16_t*)srcPtr, c->srcW);
                     conv(c->formatConvBuffer, dstPtr, c->srcW * srcBpp);
                 }else
                     conv(srcPtr, dstPtr, c->srcW * srcBpp);
                 if(dst_bswap)
-                    for(j=0; j<c->srcW; j++)
-                        ((uint16_t*)dstPtr)[j] = av_bswap16(((uint16_t*)dstPtr)[j]);
+                    c->bsdsp.bswap16_buf((uint16_t*)dstPtr, (uint16_t*)dstPtr, c->srcW);
                 srcPtr += srcStride[0];
                 dstPtr += dstStride[0];
             }
@@ -1932,16 +1926,14 @@  static int planarCopyWrapper(SwsContext *c, const uint8_t *src[],
                       isBE(c->srcFormat) != isBE(c->dstFormat)) {
 
                 for (i = 0; i < height; i++) {
-                    for (j = 0; j < length; j++)
-                        ((uint16_t *) dstPtr)[j] = av_bswap16(((const uint16_t *) srcPtr)[j]);
+                    c->bsdsp.bswap16_buf((uint16_t *)dstPtr, (const uint16_t *)srcPtr, length);
                     srcPtr += srcStride[plane];
                     dstPtr += dstStride[plane];
                 }
             } else if (isFloat(c->srcFormat) && isFloat(c->dstFormat) &&
                        isBE(c->srcFormat) != isBE(c->dstFormat)) { /* swap float plane */
                 for (i = 0; i < height; i++) {
-                    for (j = 0; j < length; j++)
-                        ((uint32_t *) dstPtr)[j] = av_bswap32(((const uint32_t *) srcPtr)[j]);
+                    c->bsdsp.bswap32_buf((uint32_t *)dstPtr, (const uint32_t *)srcPtr, length);
                     srcPtr += srcStride[plane];
                     dstPtr += dstStride[plane];
                 }
diff --git a/libswscale/utils.c b/libswscale/utils.c
index 90734f66ef..3ff8c2f84f 100644
--- a/libswscale/utils.c
+++ b/libswscale/utils.c
@@ -1921,6 +1921,8 @@  static av_cold int sws_init_single_context(SwsContext *c, SwsFilter *srcFilter,
         return 0;
     }
 
+    ff_sws_bswapdsp_init(&c->bsdsp);
+
     /* unscaled special cases */
     if (unscaled && !usesHFilter && !usesVFilter &&
         (c->srcRange == c->dstRange || isAnyRGB(dstFormat) ||
diff --git a/libswscale/x86/Makefile b/libswscale/x86/Makefile
index 68391494be..f7cdda5a25 100644
--- a/libswscale/x86/Makefile
+++ b/libswscale/x86/Makefile
@@ -1,6 +1,7 @@ 
 $(SUBDIR)x86/swscale_mmx.o: CFLAGS += $(NOREDZONE_FLAGS)
 
-OBJS                            += x86/rgb2rgb.o                        \
+OBJS                            += x86/bswapdsp_init.o                  \
+                                   x86/rgb2rgb.o                        \
                                    x86/swscale.o                        \
                                    x86/yuv2rgb.o                        \
 
@@ -8,7 +9,8 @@  MMX-OBJS                        += x86/hscale_fast_bilinear_simd.o      \
 
 OBJS-$(CONFIG_XMM_CLOBBER_TEST) += x86/w64xmmtest.o
 
-X86ASM-OBJS                     += x86/input.o                          \
+X86ASM-OBJS                     += x86/bswapdsp.o                       \
+                                   x86/input.o                          \
                                    x86/output.o                         \
                                    x86/scale.o                          \
                                    x86/scale_avx2.o                          \
diff --git a/libswscale/x86/bswapdsp.asm b/libswscale/x86/bswapdsp.asm
new file mode 100644
index 0000000000..84fc6fb000
--- /dev/null
+++ b/libswscale/x86/bswapdsp.asm
@@ -0,0 +1,157 @@ 
+;******************************************************************************
+;* optimized bswap buffer functions
+;* Copyright (c) 2008 Loren Merritt
+;* Copyright (c) 2003-2013 Michael Niedermayer
+;* Copyright (c) 2013 Daniel Kang
+;*
+;* This file is part of FFmpeg.
+;*
+;* FFmpeg is free software; you can redistribute it and/or
+;* modify it under the terms of the GNU Lesser General Public
+;* License as published by the Free Software Foundation; either
+;* version 2.1 of the License, or (at your option) any later version.
+;*
+;* FFmpeg is distributed in the hope that it will be useful,
+;* but WITHOUT ANY WARRANTY; without even the implied warranty of
+;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+;* Lesser General Public License for more details.
+;*
+;* You should have received a copy of the GNU Lesser General Public
+;* License along with FFmpeg; if not, write to the Free Software
+;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+;******************************************************************************
+
+%include "libavutil/x86/x86util.asm"
+
+SECTION_RODATA
+pb_bswap32: db 3, 2, 1, 0, 7, 6, 5, 4, 11, 10, 9, 8, 15, 14, 13, 12
+
+SECTION .text
+
+; %1 = a (aligned) or u (unaligned), used as the mov suffix and in the label names
+%macro BSWAP_LOOPS  1
+    mov      r3d, r2d               ; keep the element count in r3d
+    sar      r2d, 3                 ; r2d = count / 8
+    jz       .left4_%1
+%if cpuflag(avx2)
+    sar      r2d, 1                 ; two mmsize registers per pass, so halve again
+    jz       .left8_%1
+%endif
+.loop8_%1:
+    mov%1    m0, [r1 +  0]
+    mov%1    m1, [r1 + mmsize]
+%if cpuflag(ssse3)||cpuflag(avx2)
+    pshufb   m0, m2
+    pshufb   m1, m2
+    mov%1    [r0 +  0], m0
+    mov%1    [r0 + mmsize], m1
+%else
+    pshuflw  m0, m0, 10110001b      ; swap the two 16-bit halves of each dword
+    pshuflw  m1, m1, 10110001b
+    pshufhw  m0, m0, 10110001b
+    pshufhw  m1, m1, 10110001b
+    mova     m2, m0
+    mova     m3, m1
+    psllw    m0, 8                  ; then swap the two bytes of every word
+    psllw    m1, 8
+    psrlw    m2, 8
+    psrlw    m3, 8
+    por      m2, m0
+    por      m3, m1
+    mov%1    [r0 +  0], m2
+    mov%1    [r0 + 16], m3
+%endif
+    add      r0, mmsize*2
+    add      r1, mmsize*2
+    dec      r2d
+    jnz      .loop8_%1
+%if cpuflag(avx2)
+.left8_%1:
+    mov      r2d, r3d
+    test     r3d, 8                 ; one more group of 8 elements?
+    jz       .left4_%1
+    mov%1    m0, [r1]
+    pshufb   m0, m2
+    mov%1    [r0 +  0], m0
+    add r1, mmsize
+    add r0, mmsize
+%endif
+.left4_%1:
+    mov      r2d, r3d               ; restore the count for the tail code at .left
+    test     r3d, 4                 ; one more group of 4 elements?
+    jz       .left
+    mov%1    xm0, [r1]
+%if cpuflag(ssse3)
+    pshufb   xm0, xm2
+    mov%1    [r0], xm0
+%else
+    pshuflw  m0, m0, 10110001b
+    pshufhw  m0, m0, 10110001b
+    mova     m2, m0
+    psllw    m0, 8
+    psrlw    m2, 8
+    por      m2, m0
+    mov%1    [r0], m2
+%endif
+    add      r1, 16
+    add      r0, 16
+%endmacro
+
+; void ff_sws_bswap32_buf_<opt>(uint32_t *dst, const uint32_t *src, int w);
+%macro BSWAP32_BUF 0
+%if cpuflag(ssse3)||cpuflag(avx2)
+cglobal sws_bswap32_buf, 3,4,3
+    mov      r3, r1
+    VBROADCASTI128  m2, [pb_bswap32]
+%else
+cglobal sws_bswap32_buf, 3,4,5
+    mov      r3, r1
+%endif
+    or       r3, r0
+    test     r3, mmsize - 1
+    jz       .start_align
+    BSWAP_LOOPS  u
+    jmp      .left
+.start_align:
+    BSWAP_LOOPS  a
+.left:
+%if cpuflag(ssse3)
+    test     r2d, 2
+    jz       .left1
+    movq     xm0, [r1]
+    pshufb   xm0, xm2
+    movq     [r0], xm0
+    add      r1, 8
+    add      r0, 8
+.left1:
+    test     r2d, 1
+    jz       .end
+    mov      r2d, [r1]
+    bswap    r2d
+    mov      [r0], r2d
+%else
+    and      r2d, 3
+    jz       .end
+.loop2:
+    mov      r3d, [r1]
+    bswap    r3d
+    mov      [r0], r3d
+    add      r1, 4
+    add      r0, 4
+    dec      r2d
+    jnz      .loop2
+%endif
+.end:
+    RET
+%endmacro
+
+INIT_XMM sse2
+BSWAP32_BUF
+
+INIT_XMM ssse3
+BSWAP32_BUF
+
+%if HAVE_AVX2_EXTERNAL
+INIT_YMM avx2
+BSWAP32_BUF
+%endif
diff --git a/libswscale/x86/bswapdsp_init.c b/libswscale/x86/bswapdsp_init.c
new file mode 100644
index 0000000000..57f2944db7
--- /dev/null
+++ b/libswscale/x86/bswapdsp_init.c
@@ -0,0 +1,40 @@ 
+/*
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include <stdint.h>
+
+#include "libavutil/attributes.h"
+#include "libavutil/cpu.h"
+#include "libavutil/x86/cpu.h"
+#include "libswscale/bswapdsp.h"
+
+void ff_sws_bswap32_buf_sse2(uint32_t *dst, const uint32_t *src, int w);
+void ff_sws_bswap32_buf_ssse3(uint32_t *dst, const uint32_t *src, int w);
+void ff_sws_bswap32_buf_avx2(uint32_t *dst, const uint32_t *src, int w);
+
+av_cold void ff_sws_bswapdsp_init_x86(BswapDSPContext *c)
+{
+    int cpu_flags = av_get_cpu_flags();
+
+    if (EXTERNAL_SSE2(cpu_flags)) /* each later match below overrides this assignment */
+        c->bswap32_buf = ff_sws_bswap32_buf_sse2;
+    if (EXTERNAL_SSSE3(cpu_flags))
+        c->bswap32_buf = ff_sws_bswap32_buf_ssse3;
+    if (EXTERNAL_AVX2_FAST(cpu_flags))
+        c->bswap32_buf = ff_sws_bswap32_buf_avx2;
+}