@@ -152,6 +152,12 @@ void ff_shuffle_bytes_1230_avx2(const uint8_t *src, uint8_t *dst, int src_size);
void ff_shuffle_bytes_3012_avx2(const uint8_t *src, uint8_t *dst, int src_size);
void ff_shuffle_bytes_3210_avx2(const uint8_t *src, uint8_t *dst, int src_size);
+void ff_shuffle_bytes_2103_avx512(const uint8_t *src, uint8_t *dst, int src_size);
+void ff_shuffle_bytes_0321_avx512(const uint8_t *src, uint8_t *dst, int src_size);
+void ff_shuffle_bytes_1230_avx512(const uint8_t *src, uint8_t *dst, int src_size);
+void ff_shuffle_bytes_3012_avx512(const uint8_t *src, uint8_t *dst, int src_size);
+void ff_shuffle_bytes_3210_avx512(const uint8_t *src, uint8_t *dst, int src_size);
+
void ff_uyvytoyuv422_sse2(uint8_t *ydst, uint8_t *udst, uint8_t *vdst,
const uint8_t *src, int width, int height,
int lumStride, int chromStride, int srcStride);
@@ -200,6 +206,13 @@ av_cold void rgb2rgb_init_x86(void)
shuffle_bytes_3012 = ff_shuffle_bytes_3012_avx2;
shuffle_bytes_3210 = ff_shuffle_bytes_3210_avx2;
}
+ if (EXTERNAL_AVX512(cpu_flags)) {
+ shuffle_bytes_0321 = ff_shuffle_bytes_0321_avx512;
+ shuffle_bytes_2103 = ff_shuffle_bytes_2103_avx512;
+ shuffle_bytes_1230 = ff_shuffle_bytes_1230_avx512;
+ shuffle_bytes_3012 = ff_shuffle_bytes_3012_avx512;
+ shuffle_bytes_3210 = ff_shuffle_bytes_3210_avx512;
+ }
if (EXTERNAL_AVX(cpu_flags)) {
uyvytoyuv422 = ff_uyvytoyuv422_avx;
}
@@ -168,6 +168,14 @@ SHUFFLE_BYTES 1, 2, 3, 0
SHUFFLE_BYTES 3, 0, 1, 2
SHUFFLE_BYTES 3, 2, 1, 0
%endif
+%if HAVE_AVX512_EXTERNAL
+INIT_ZMM avx512
+SHUFFLE_BYTES 2, 1, 0, 3
+SHUFFLE_BYTES 0, 3, 2, 1
+SHUFFLE_BYTES 1, 2, 3, 0
+SHUFFLE_BYTES 3, 0, 1, 2
+SHUFFLE_BYTES 3, 2, 1, 0
+%endif
%endif
;-----------------------------------------------------------------------------------------------
Performance data (less is better):
    shuffle_bytes_avx2      0.94288
    shuffle_bytes_avx512    0.60049

Signed-off-by: Wu Jianhua <jianhua.wu@intel.com>
---
 libswscale/x86/rgb2rgb.c     | 13 +++++++++++++
 libswscale/x86/rgb_2_rgb.asm |  8 ++++++++
 2 files changed, 21 insertions(+)