@@ -6,6 +6,7 @@ HEADERS = swscale.h \
version_major.h \
OBJS = alphablend.o \
+ bswapdsp.o \
hscale.o \
hscale_fast_bilinear.o \
gamma.o \
new file mode 100644
@@ -0,0 +1,59 @@
+/*
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include <stdint.h>
+
+#include "config.h"
+#include "libavutil/attributes.h"
+#include "libavutil/bswap.h"
+#include "bswapdsp.h"
+
+/**
+ * Portable C implementation: byte swap every 32-bit element of a buffer.
+ *
+ * Elements are read and written one at a time in ascending order, so dst
+ * may be the same buffer as src.
+ *
+ * @param dst destination buffer
+ * @param src source buffer
+ * @param len number of 32-bit elements; nothing is done when len <= 0
+ */
+static void bswap32_buf(uint32_t *dst, const uint32_t *src, int len)
+{
+    int pos = 0, k;
+
+    /* Bulk part: eight elements per outer iteration. */
+    while (pos + 8 <= len) {
+        for (k = 0; k < 8; k++)
+            dst[pos + k] = av_bswap32(src[pos + k]);
+        pos += 8;
+    }
+
+    /* Tail: whatever is left (at most seven elements). */
+    while (pos < len) {
+        dst[pos] = av_bswap32(src[pos]);
+        pos++;
+    }
+}
+
+/**
+ * Portable C implementation: byte swap every 16-bit element of a buffer.
+ *
+ * Elements are read and written one at a time in ascending order, so dst
+ * may be the same buffer as src.
+ *
+ * @param dst destination buffer
+ * @param src source buffer
+ * @param len number of 16-bit elements; nothing is done when len <= 0
+ */
+static void bswap16_buf(uint16_t *dst, const uint16_t *src, int len)
+{
+    int i;
+
+    /*
+     * Counted loop instead of "while (len--)": a negative len is then a
+     * no-op, as in bswap32_buf(), rather than a loop that walks far past
+     * both buffers.
+     */
+    for (i = 0; i < len; i++)
+        dst[i] = av_bswap16(src[i]);
+}
+
+/**
+ * Fill a BswapDSPContext with the C implementations, then give the
+ * architecture-specific initializer (when one is compiled in) the chance
+ * to replace them.
+ *
+ * @param c context whose function pointers are set
+ */
+av_cold void ff_sws_bswapdsp_init(BswapDSPContext *c)
+{
+    /* Generic fallbacks first, so both pointers are always valid. */
+    c->bswap16_buf = bswap16_buf;
+    c->bswap32_buf = bswap32_buf;
+
+    /* At most one of these runs; it only overrides what it can do faster. */
+#if ARCH_RISCV
+    ff_sws_bswapdsp_init_riscv(c);
+#elif ARCH_X86
+    ff_sws_bswapdsp_init_x86(c);
+#endif
+}
new file mode 100644
@@ -0,0 +1,66 @@
+/*
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#ifndef SWSCALE_BSWAPDSP_H
+#define SWSCALE_BSWAPDSP_H
+
+#include <stdint.h>
+
+/**
+ * @file
+ * Optimized buffer byte swapping routines.
+ */
+
+typedef struct BswapDSPContext {
+ /**
+ * Byte swap 32 bit elements in a buffer.
+ *
+ * @param dst Destination buffer.
+ * @param src Source buffer, may be the same as dst.
+ * @param len The number of 32 bit elements (not bytes) in the buffer.
+ */
+ /** @{ */
+ void (*bswap32_buf)(uint32_t *dst, const uint32_t *src, int len);
+ /** @} */
+
+ /**
+ * Byte swap 16 bit elements in a buffer.
+ *
+ * @param dst Destination buffer.
+ * @param src Source buffer, may be the same as dst.
+ * @param len The number of 16 bit elements (not bytes) in the buffer.
+ */
+ /** @{ */
+ void (*bswap16_buf)(uint16_t *dst, const uint16_t *src, int len);
+ /** @} */
+} BswapDSPContext;
+
+
+/**
+ * Initialize BswapDSPContext function pointers.
+ *
+ * ff_sws_bswapdsp_init() installs the C implementations and then calls the
+ * RISC-V or x86 variant when built for that architecture. Those variants
+ * only replace the pointers for which the CPU flags select a faster routine,
+ * and expect the C defaults to be in place already.
+ *
+ * @param c pointer to BswapDSPContext
+ */
+void ff_sws_bswapdsp_init(BswapDSPContext *c);
+void ff_sws_bswapdsp_init_riscv(BswapDSPContext *c);
+void ff_sws_bswapdsp_init_x86(BswapDSPContext *c);
+
+#endif /* SWSCALE_BSWAPDSP_H */
@@ -2313,13 +2313,11 @@ yuv2gbrp_full_X_c(SwsContext *c, const int16_t *lumFilter,
}
}
if (SH != 22 && (!isBE(c->dstFormat)) != (!HAVE_BIGENDIAN)) {
- for (i = 0; i < dstW; i++) {
- dest16[0][i] = av_bswap16(dest16[0][i]);
- dest16[1][i] = av_bswap16(dest16[1][i]);
- dest16[2][i] = av_bswap16(dest16[2][i]);
- if (hasAlpha)
- dest16[3][i] = av_bswap16(dest16[3][i]);
- }
+ c->bsdsp.bswap16_buf(dest16[0], dest16[0], dstW);
+ c->bsdsp.bswap16_buf(dest16[1], dest16[1], dstW);
+ c->bsdsp.bswap16_buf(dest16[2], dest16[2], dstW);
+ if (hasAlpha)
+ c->bsdsp.bswap16_buf(dest16[3], dest16[3], dstW);
}
}
@@ -2385,13 +2383,11 @@ yuv2gbrp16_full_X_c(SwsContext *c, const int16_t *lumFilter,
dest16[3][i] = av_clip_uintp2(A, 30) >> 14;
}
if ((!isBE(c->dstFormat)) != (!HAVE_BIGENDIAN)) {
- for (i = 0; i < dstW; i++) {
- dest16[0][i] = av_bswap16(dest16[0][i]);
- dest16[1][i] = av_bswap16(dest16[1][i]);
- dest16[2][i] = av_bswap16(dest16[2][i]);
- if (hasAlpha)
- dest16[3][i] = av_bswap16(dest16[3][i]);
- }
+ c->bsdsp.bswap16_buf(dest16[0], dest16[0], dstW);
+ c->bsdsp.bswap16_buf(dest16[1], dest16[1], dstW);
+ c->bsdsp.bswap16_buf(dest16[2], dest16[2], dstW);
+ if (hasAlpha)
+ c->bsdsp.bswap16_buf(dest16[3], dest16[3], dstW);
}
}
@@ -2461,13 +2457,11 @@ yuv2gbrpf32_full_X_c(SwsContext *c, const int16_t *lumFilter,
dest32[3][i] = av_float2int(float_mult * (float)(av_clip_uintp2(A, 30) >> 14));
}
if ((!isBE(c->dstFormat)) != (!HAVE_BIGENDIAN)) {
- for (i = 0; i < dstW; i++) {
- dest32[0][i] = av_bswap32(dest32[0][i]);
- dest32[1][i] = av_bswap32(dest32[1][i]);
- dest32[2][i] = av_bswap32(dest32[2][i]);
- if (hasAlpha)
- dest32[3][i] = av_bswap32(dest32[3][i]);
- }
+ c->bsdsp.bswap32_buf(dest32[0], dest32[0], dstW);
+ c->bsdsp.bswap32_buf(dest32[1], dest32[1], dstW);
+ c->bsdsp.bswap32_buf(dest32[2], dest32[2], dstW);
+ if (hasAlpha)
+ c->bsdsp.bswap32_buf(dest32[3], dest32[3], dstW);
}
}
@@ -1,2 +1,5 @@
-OBJS += riscv/rgb2rgb.o
-RVV-OBJS += riscv/rgb2rgb_rvv.o
+OBJS += riscv/bswapdsp_init.o \
+ riscv/bswapdsp_rvb.o \
+ riscv/rgb2rgb.o
+RVV-OBJS += riscv/bswapdsp_rvv.o \
+ riscv/rgb2rgb_rvv.o
new file mode 100644
@@ -0,0 +1,46 @@
+/*
+ * Copyright © 2022 Rémi Denis-Courmont.
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include <stdint.h>
+
+#include "config.h"
+#include "libavutil/attributes.h"
+#include "libavutil/cpu.h"
+#include "libswscale/bswapdsp.h"
+
+void ff_sws_bswap32_buf_rvb(uint32_t *dst, const uint32_t *src, int len);
+void ff_sws_bswap32_buf_rvv(uint32_t *dst, const uint32_t *src, int len);
+void ff_sws_bswap16_buf_rvv(uint16_t *dst, const uint16_t *src, int len);
+
+/**
+ * Override the byte-swap routines with RISC-V assembly where the CPU allows.
+ *
+ * Pointers that no branch below assigns keep the value already stored in c.
+ * The vector routines are assigned last, so they take precedence over the
+ * scalar bit-manipulation one.
+ *
+ * @param c context whose function pointers may be replaced
+ */
+av_cold void ff_sws_bswapdsp_init_riscv(BswapDSPContext *c)
+{
+    int cpu_flags = av_get_cpu_flags();
+
+    /*
+     * All three assembly routines compute addresses with sh1add/sh2add,
+     * which belong to the Zba address-generation extension, so each one
+     * additionally requires AV_CPU_FLAG_RVB_ADDR.
+     */
+#if (__riscv_xlen >= 64)
+    if ((cpu_flags & AV_CPU_FLAG_RVB_BASIC) &&
+        (cpu_flags & AV_CPU_FLAG_RVB_ADDR))
+        c->bswap32_buf = ff_sws_bswap32_buf_rvb;
+#endif
+#if HAVE_RVV
+    if ((cpu_flags & AV_CPU_FLAG_RVV_I32) &&
+        (cpu_flags & AV_CPU_FLAG_RVB_ADDR)) {
+        c->bswap32_buf = ff_sws_bswap32_buf_rvv;
+        c->bswap16_buf = ff_sws_bswap16_buf_rvv;
+    }
+#endif
+}
new file mode 100644
@@ -0,0 +1,68 @@
+/*
+ * Copyright © 2022 Rémi Denis-Courmont.
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "config.h"
+#include "libavutil/riscv/asm.S"
+
+#if (__riscv_xlen >= 64)
+/*
+ * void ff_sws_bswap32_buf_rvb(uint32_t *dst, const uint32_t *src, int len);
+ *
+ * a0 = dst, a1 = src, a2 = len (number of 32-bit elements).
+ *
+ * Byte swaps with rev8. rev8 reverses the whole register, so after a 32-bit
+ * load the swapped value sits in the upper bits and is shifted back down
+ * before being stored. Once src is 64-bit aligned, two elements are handled
+ * per 64-bit load; dst is only ever written with 32-bit stores.
+ */
+func ff_sws_bswap32_buf_rvb, zbb
+        /* Nothing to do for len <= 0. Without this check the alignment
+         * step below would still consume one element and leave a negative
+         * count behind. */
+        blez    a2, 5f
+        andi    t0, a1, 4
+        beqz    t0, 1f
+        /* Align a1 (input) to 64-bit by handling one element on its own */
+        lwu     t0, (a1)
+        addi    a0, a0, 4
+        rev8    t0, t0
+        addi    a2, a2, -1
+        srli    t0, t0, __riscv_xlen - 32
+        addi    a1, a1, 4
+        sw      t0, -4(a0)
+1:
+        andi    a3, a2, -2      /* a3 = remaining count rounded down to even */
+        sh2add  a2, a2, a0      /* a2 = dst end pointer */
+        beqz    a3, 3f
+        sh2add  a3, a3, a0      /* a3 = dst pointer where the pair loop stops */
+2:      /* 2 elements (64 bits) at a time on a 64-bit boundary */
+        ld      t0, (a1)
+        addi    a0, a0, 8
+        rev8    t0, t0
+        /* After rev8 the first element is above the second one: t2 gets the
+         * first (stored at -8), the other half is stored at -4. */
+#if (__riscv_xlen == 64)
+        srli    t2, t0, 32
+        sw      t0, -4(a0)
+#else
+        srli    t1, t0, __riscv_xlen - 64
+        srli    t2, t0, __riscv_xlen - 32
+        sw      t1, -4(a0)
+#endif
+        addi    a1, a1, 8
+        sw      t2, -8(a0)
+        bne     a0, a3, 2b
+3:
+        beq     a0, a2, 5f      /* no odd element left */
+4:      /* Process last element */
+        lwu     t0, (a1)
+        addi    a0, a0, 4
+        rev8    t0, t0
+        addi    a1, a1, 4
+        srli    t0, t0, __riscv_xlen - 32
+        sw      t0, -4(a0)
+5:
+        ret
+endfunc
+#endif
new file mode 100644
@@ -0,0 +1,62 @@
+/*
+ * Copyright © 2022 Rémi Denis-Courmont.
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "config.h"
+#include "libavutil/riscv/asm.S"
+
+func ff_sws_bswap32_buf_rvv, zve32x /* a0 = dst, a1 = src, a2 = len in 32-bit elements */
+ li t4, 4 /* store stride: 4 bytes between the same byte of consecutive elements */
+ addi t1, a0, 1 /* t1, t2, t3 = addresses of bytes 1, 2, 3 of the first dst element */
+ addi t2, a0, 2
+ addi t3, a0, 3
+1:
+ vsetvli t0, a2, e8, m1, ta, ma /* t0 = vl, the number of elements handled in this pass */
+ vlseg4e8.v v8, (a1) /* de-interleave: v8..v11 = byte 0..3 of each source element */
+ sub a2, a2, t0 /* elements still to do */
+ sh2add a1, t0, a1 /* src += 4 * t0 (sh2add is a Zba instruction) */
+ vsse8.v v8, (t3), t4 /* source byte 0 -> destination byte 3 */
+ sh2add t3, t0, t3
+ vsse8.v v9, (t2), t4 /* source byte 1 -> destination byte 2 */
+ sh2add t2, t0, t2
+ vsse8.v v10, (t1), t4 /* source byte 2 -> destination byte 1 */
+ sh2add t1, t0, t1
+ vsse8.v v11, (a0), t4 /* source byte 3 -> destination byte 0 */
+ sh2add a0, t0, a0
+ bnez a2, 1b /* all loads of a pass precede its stores, so dst may equal src */
+
+ ret
+endfunc
+
+func ff_sws_bswap16_buf_rvv, zve32x /* a0 = dst, a1 = src, a2 = len in 16-bit elements */
+ li t2, 2 /* store stride: 2 bytes between the same byte of consecutive elements */
+ addi t1, a0, 1 /* t1 = address of the second byte of the first dst element */
+1:
+ vsetvli t0, a2, e8, m1, ta, ma /* t0 = vl, the number of elements handled in this pass */
+ vlseg2e8.v v8, (a1) /* de-interleave: v8 = first byte, v9 = second byte of each source element */
+ sub a2, a2, t0 /* elements still to do */
+ sh1add a1, t0, a1 /* src += 2 * t0 (sh1add is a Zba instruction) */
+ vsse8.v v8, (t1), t2 /* first source byte -> second destination byte */
+ sh1add t1, t0, t1
+ vsse8.v v9, (a0), t2 /* second source byte -> first destination byte */
+ sh1add a0, t0, a0
+ bnez a2, 1b /* all loads of a pass precede its stores, so dst may equal src */
+
+ ret
+endfunc
@@ -36,6 +36,7 @@
#include "libavutil/slicethread.h"
#include "libavutil/ppc/util_altivec.h"
#include "libavutil/half2float.h"
+#include "bswapdsp.h"
#define STR(s) AV_TOSTRING(s) // AV_STRINGIFY is too long
@@ -682,6 +683,8 @@ typedef struct SwsContext {
atomic_int data_unaligned_warned;
Half2FloatTables *h2f_tables;
+
+ BswapDSPContext bsdsp;
} SwsContext;
//FIXME check init (where 0)
@@ -468,7 +468,7 @@ static int bswap_16bpc(SwsContext *c, const uint8_t *src[],
int srcStride[], int srcSliceY, int srcSliceH,
uint8_t *dst[], int dstStride[])
{
- int i, j, p;
+ int i, p;
for (p = 0; p < 4; p++) {
int srcstr = srcStride[p] / 2;
@@ -480,9 +480,7 @@ static int bswap_16bpc(SwsContext *c, const uint8_t *src[],
continue;
dstPtr += (srcSliceY >> c->chrDstVSubSample) * dststr;
for (i = 0; i < (srcSliceH >> c->chrDstVSubSample); i++) {
- for (j = 0; j < min_stride; j++) {
- dstPtr[j] = av_bswap16(srcPtr[j]);
- }
+ c->bsdsp.bswap16_buf(dstPtr, srcPtr, min_stride);
srcPtr += srcstr;
dstPtr += dststr;
}
@@ -495,7 +493,7 @@ static int bswap_32bpc(SwsContext *c, const uint8_t *src[],
int srcStride[], int srcSliceY, int srcSliceH,
uint8_t *dst[], int dstStride[])
{
- int i, j, p;
+ int i, p;
for (p = 0; p < 4; p++) {
int srcstr = srcStride[p] / 4;
@@ -507,9 +505,7 @@ static int bswap_32bpc(SwsContext *c, const uint8_t *src[],
continue;
dstPtr += (srcSliceY >> c->chrDstVSubSample) * dststr;
for (i = 0; i < (srcSliceH >> c->chrDstVSubSample); i++) {
- for (j = 0; j < min_stride; j++) {
- dstPtr[j] = av_bswap32(srcPtr[j]);
- }
+ c->bsdsp.bswap32_buf(dstPtr, srcPtr, min_stride);
srcPtr += srcstr;
dstPtr += dststr;
}
@@ -1616,19 +1612,17 @@ static int rgbToRgbWrapper(SwsContext *c, const uint8_t *src[], int srcStride[],
conv(srcPtr, dstPtr + dstStride[0] * srcSliceY,
(srcSliceH - 1) * srcStride[0] + c->srcW * srcBpp);
else {
- int i, j;
+ int i;
dstPtr += dstStride[0] * srcSliceY;
for (i = 0; i < srcSliceH; i++) {
if(src_bswap) {
- for(j=0; j<c->srcW; j++)
- ((uint16_t*)c->formatConvBuffer)[j] = av_bswap16(((uint16_t*)srcPtr)[j]);
+ c->bsdsp.bswap16_buf((uint16_t*)c->formatConvBuffer, (uint16_t*)srcPtr, c->srcW);
conv(c->formatConvBuffer, dstPtr, c->srcW * srcBpp);
}else
conv(srcPtr, dstPtr, c->srcW * srcBpp);
if(dst_bswap)
- for(j=0; j<c->srcW; j++)
- ((uint16_t*)dstPtr)[j] = av_bswap16(((uint16_t*)dstPtr)[j]);
+ c->bsdsp.bswap16_buf((uint16_t*)dstPtr, (uint16_t*)dstPtr, c->srcW);
srcPtr += srcStride[0];
dstPtr += dstStride[0];
}
@@ -1932,16 +1926,14 @@ static int planarCopyWrapper(SwsContext *c, const uint8_t *src[],
isBE(c->srcFormat) != isBE(c->dstFormat)) {
for (i = 0; i < height; i++) {
- for (j = 0; j < length; j++)
- ((uint16_t *) dstPtr)[j] = av_bswap16(((const uint16_t *) srcPtr)[j]);
+ c->bsdsp.bswap16_buf((uint16_t *)dstPtr, (const uint16_t *)srcPtr, length);
srcPtr += srcStride[plane];
dstPtr += dstStride[plane];
}
} else if (isFloat(c->srcFormat) && isFloat(c->dstFormat) &&
isBE(c->srcFormat) != isBE(c->dstFormat)) { /* swap float plane */
for (i = 0; i < height; i++) {
- for (j = 0; j < length; j++)
- ((uint32_t *) dstPtr)[j] = av_bswap32(((const uint32_t *) srcPtr)[j]);
+ c->bsdsp.bswap32_buf((uint32_t *)dstPtr, (const uint32_t *)srcPtr, length);
srcPtr += srcStride[plane];
dstPtr += dstStride[plane];
}
@@ -1921,6 +1921,8 @@ static av_cold int sws_init_single_context(SwsContext *c, SwsFilter *srcFilter,
return 0;
}
+ ff_sws_bswapdsp_init(&c->bsdsp);
+
/* unscaled special cases */
if (unscaled && !usesHFilter && !usesVFilter &&
(c->srcRange == c->dstRange || isAnyRGB(dstFormat) ||
@@ -1,6 +1,7 @@
$(SUBDIR)x86/swscale_mmx.o: CFLAGS += $(NOREDZONE_FLAGS)
-OBJS += x86/rgb2rgb.o \
+OBJS += x86/bswapdsp_init.o \
+ x86/rgb2rgb.o \
x86/swscale.o \
x86/yuv2rgb.o \
@@ -8,7 +9,8 @@ MMX-OBJS += x86/hscale_fast_bilinear_simd.o \
OBJS-$(CONFIG_XMM_CLOBBER_TEST) += x86/w64xmmtest.o
-X86ASM-OBJS += x86/input.o \
+X86ASM-OBJS += x86/bswapdsp.o \
+ x86/input.o \
x86/output.o \
x86/scale.o \
x86/scale_avx2.o \
new file mode 100644
@@ -0,0 +1,157 @@
+;******************************************************************************
+;* optimized bswap buffer functions
+;* Copyright (c) 2008 Loren Merritt
+;* Copyright (c) 2003-2013 Michael Niedermayer
+;* Copyright (c) 2013 Daniel Kang
+;*
+;* This file is part of FFmpeg.
+;*
+;* FFmpeg is free software; you can redistribute it and/or
+;* modify it under the terms of the GNU Lesser General Public
+;* License as published by the Free Software Foundation; either
+;* version 2.1 of the License, or (at your option) any later version.
+;*
+;* FFmpeg is distributed in the hope that it will be useful,
+;* but WITHOUT ANY WARRANTY; without even the implied warranty of
+;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+;* Lesser General Public License for more details.
+;*
+;* You should have received a copy of the GNU Lesser General Public
+;* License along with FFmpeg; if not, write to the Free Software
+;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+;******************************************************************************
+
+%include "libavutil/x86/x86util.asm"
+
+SECTION_RODATA
+pb_bswap32: db 3, 2, 1, 0, 7, 6, 5, 4, 11, 10, 9, 8, 15, 14, 13, 12 ; pshufb control: reverses the 4 bytes of each 32-bit element in a 16-byte lane
+
+SECTION .text
+
+; %1 = aligned/unaligned (a or u): appended to "mov" to select mova/movu and to the local label names
+%macro BSWAP_LOOPS 1
+ mov r3d, r2d ; keep the full element count in r3d
+ sar r2d, 3 ; r2d = count / 8
+ jz .left4_%1 ; fewer than 8 elements: no full-vector work
+%if cpuflag(avx2)
+ sar r2d, 1 ; with ymm registers one iteration covers 16 elements
+ jz .left8_%1 ; 8..15 elements: only the single-vector step applies
+%endif
+.loop8_%1: ; main loop: two vectors (2 * mmsize bytes) per iteration
+ mov%1 m0, [r1 + 0]
+ mov%1 m1, [r1 + mmsize]
+%if cpuflag(ssse3)||cpuflag(avx2)
+ pshufb m0, m2 ; m2 holds the pb_bswap32 mask, loaded by the caller of this macro
+ pshufb m1, m2
+ mov%1 [r0 + 0], m0
+ mov%1 [r0 + mmsize], m1
+%else
+ pshuflw m0, m0, 10110001b ; SSE2 only: first swap the two 16-bit halves of every dword...
+ pshuflw m1, m1, 10110001b
+ pshufhw m0, m0, 10110001b
+ pshufhw m1, m1, 10110001b
+ mova m2, m0
+ mova m3, m1
+ psllw m0, 8 ; ...then swap the two bytes inside every 16-bit word via shifts and OR
+ psllw m1, 8
+ psrlw m2, 8
+ psrlw m3, 8
+ por m2, m0
+ por m3, m1
+ mov%1 [r0 + 0], m2
+ mov%1 [r0 + 16], m3
+%endif
+ add r0, mmsize*2
+ add r1, mmsize*2
+ dec r2d
+ jnz .loop8_%1
+%if cpuflag(avx2)
+.left8_%1: ; AVX2 only: one more full ymm vector (8 elements) if bit 3 of the count is set
+ mov r2d, r3d
+ test r3d, 8
+ jz .left4_%1
+ mov%1 m0, [r1]
+ pshufb m0, m2
+ mov%1 [r0 + 0], m0
+ add r1, mmsize
+ add r0, mmsize
+%endif
+.left4_%1: ; one 16-byte vector (4 elements) if bit 2 of the count is set
+ mov r2d, r3d ; restore the full count for the tail code after the macro
+ test r3d, 4
+ jz .left ; .left is not defined here; the code using this macro provides it
+ mov%1 xm0, [r1]
+%if cpuflag(ssse3)
+ pshufb xm0, xm2
+ mov%1 [r0], xm0
+%else
+ pshuflw m0, m0, 10110001b ; same two-step SSE2 swap as in the main loop
+ pshufhw m0, m0, 10110001b
+ mova m2, m0
+ psllw m0, 8
+ psrlw m2, 8
+ por m2, m0
+ mov%1 [r0], m2
+%endif
+ add r1, 16
+ add r0, 16
+%endmacro
+
+; void bswap32_buf(uint32_t *dst, const uint32_t *src, int w); r0 = dst, r1 = src, r2 = w (count of 32-bit elements)
+%macro BSWAP32_BUF 0
+%if cpuflag(ssse3)||cpuflag(avx2)
+cglobal sws_bswap32_buf, 3,4,3 ; 3 arguments, 4 general-purpose registers, 3 vector registers
+ mov r3, r1
+ VBROADCASTI128 m2, [pb_bswap32] ; m2 = byte-reversal mask for pshufb
+%else
+cglobal sws_bswap32_buf, 3,4,5 ; SSE2 path needs extra vector registers for the shift/OR swap
+ mov r3, r1
+%endif
+ or r3, r0 ; combine both addresses so a single test covers src and dst
+ test r3, mmsize - 1
+ jz .start_align ; both pointers are mmsize-aligned: use aligned moves
+ BSWAP_LOOPS u
+ jmp .left
+.start_align:
+ BSWAP_LOOPS a
+.left: ; tail: r2d holds the full count again, only its low two bits matter from here on
+%if cpuflag(ssse3)
+ test r2d, 2
+ jz .left1
+ movq xm0, [r1] ; two elements at once through the low 8 bytes of a vector
+ pshufb xm0, xm2
+ movq [r0], xm0
+ add r1, 8
+ add r0, 8
+.left1:
+ test r2d, 1
+ jz .end
+ mov r2d, [r1] ; last single element with the scalar bswap instruction
+ bswap r2d
+ mov [r0], r2d
+%else
+ and r2d, 3 ; SSE2: up to three leftover elements, one scalar bswap each
+ jz .end
+.loop2:
+ mov r3d, [r1]
+ bswap r3d
+ mov [r0], r3d
+ add r1, 4
+ add r0, 4
+ dec r2d
+ jnz .loop2
+%endif
+.end:
+ RET
+%endmacro
+
+INIT_XMM sse2 ; 128-bit vectors, shuffle/shift/OR swap; declared in C as ff_sws_bswap32_buf_sse2
+BSWAP32_BUF
+
+INIT_XMM ssse3 ; 128-bit vectors, pshufb swap; declared in C as ff_sws_bswap32_buf_ssse3
+BSWAP32_BUF
+
+%if HAVE_AVX2_EXTERNAL
+INIT_YMM avx2 ; 256-bit vectors, pshufb swap; declared in C as ff_sws_bswap32_buf_avx2
+BSWAP32_BUF
+%endif
new file mode 100644
@@ -0,0 +1,40 @@
+/*
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include <stdint.h>
+
+#include "libavutil/attributes.h"
+#include "libavutil/cpu.h"
+#include "libavutil/x86/cpu.h"
+#include "libswscale/bswapdsp.h"
+
+void ff_sws_bswap32_buf_sse2(uint32_t *dst, const uint32_t *src, int w);
+void ff_sws_bswap32_buf_ssse3(uint32_t *dst, const uint32_t *src, int w);
+void ff_sws_bswap32_buf_avx2(uint32_t *dst, const uint32_t *src, int w);
+
+/**
+ * Override the 32-bit byte-swap routine with x86 SIMD assembly.
+ *
+ * Only bswap32_buf is touched; bswap16_buf keeps whatever c already holds.
+ * When none of the instruction sets below is usable, bswap32_buf is left
+ * unchanged as well.
+ *
+ * @param c context whose bswap32_buf pointer may be replaced
+ */
+av_cold void ff_sws_bswapdsp_init_x86(BswapDSPContext *c)
+{
+    const int flags = av_get_cpu_flags();
+
+    /* Take the most capable variant that is available. */
+    if (EXTERNAL_AVX2_FAST(flags))
+        c->bswap32_buf = ff_sws_bswap32_buf_avx2;
+    else if (EXTERNAL_SSSE3(flags))
+        c->bswap32_buf = ff_sws_bswap32_buf_ssse3;
+    else if (EXTERNAL_SSE2(flags))
+        c->bswap32_buf = ff_sws_bswap32_buf_sse2;
+}
From: Mark Reid <mindmark@gmail.com> There are some places in input.c that could use it too but they aren't currently being passed the SwsContext --- libswscale/Makefile | 1 + libswscale/bswapdsp.c | 59 ++++++++++++ libswscale/bswapdsp.h | 66 +++++++++++++ libswscale/output.c | 36 +++---- libswscale/riscv/Makefile | 7 +- libswscale/riscv/bswapdsp_init.c | 46 +++++++++ libswscale/riscv/bswapdsp_rvb.S | 68 +++++++++++++ libswscale/riscv/bswapdsp_rvv.S | 62 ++++++++++++ libswscale/swscale_internal.h | 3 + libswscale/swscale_unscaled.c | 26 ++--- libswscale/utils.c | 2 + libswscale/x86/Makefile | 6 +- libswscale/x86/bswapdsp.asm | 157 +++++++++++++++++++++++++++++++ libswscale/x86/bswapdsp_init.c | 40 ++++++++ 14 files changed, 537 insertions(+), 42 deletions(-) create mode 100644 libswscale/bswapdsp.c create mode 100644 libswscale/bswapdsp.h create mode 100644 libswscale/riscv/bswapdsp_init.c create mode 100644 libswscale/riscv/bswapdsp_rvb.S create mode 100644 libswscale/riscv/bswapdsp_rvv.S create mode 100644 libswscale/x86/bswapdsp.asm create mode 100644 libswscale/x86/bswapdsp_init.c