@@ -167,6 +167,9 @@ void ff_uyvytoyuv422_avx(uint8_t *ydst, uint8_t *udst, uint8_t *vdst,
void ff_uyvytoyuv422_avx2(uint8_t *ydst, uint8_t *udst, uint8_t *vdst,
const uint8_t *src, int width, int height,
int lumStride, int chromStride, int srcStride);
+void ff_uyvytoyuv422_avx512(uint8_t *ydst, uint8_t *udst, uint8_t *vdst,
+ const uint8_t *src, int width, int height,
+ int lumStride, int chromStride, int srcStride);
#endif
av_cold void rgb2rgb_init_x86(void)
@@ -222,5 +225,8 @@ av_cold void rgb2rgb_init_x86(void)
if (EXTERNAL_AVX2_FAST(cpu_flags)) {
uyvytoyuv422 = ff_uyvytoyuv422_avx2;
}
+ if (EXTERNAL_AVX512(cpu_flags)) {
+ uyvytoyuv422 = ff_uyvytoyuv422_avx512;
+ }
#endif
}
@@ -31,7 +31,9 @@ pb_shuffle0321: db 0, 3, 2, 1, 4, 7, 6, 5, 8, 11, 10, 9, 12, 15, 14, 13
pb_shuffle1230: db 1, 2, 3, 0, 5, 6, 7, 4, 9, 10, 11, 8, 13, 14, 15, 12
pb_shuffle3012: db 3, 0, 1, 2, 7, 4, 5, 6, 11, 8, 9, 10, 15, 12, 13, 14
pb_shuffle3210: db 3, 2, 1, 0, 7, 6, 5, 4, 11, 10, 9, 8, 15, 14, 13, 12
+pd_permd512_uv: dd 0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15
pd_permd256_uv: dd 0, 4, 1, 5, 2, 6, 3, 7
+pq_permq512_yy: dq 0, 2, 4, 6, 1, 3, 5, 7
SECTION .text
@@ -193,7 +195,11 @@ SHUFFLE_BYTES 3, 2, 1, 0
%macro UYVY_TO_YUV422 0
cglobal uyvytoyuv422, 9, 14, 8, ydst, udst, vdst, src, w, h, lum_stride, chrom_stride, src_stride, wtwo, whalf, tmp, x, back_w
pxor m0, m0
+%if mmsize == 64
+ vpternlogd m1, m1, m1, 0xff
+%else
pcmpeqw m1, m1
+%endif
psrlw m1, 8
movsxdifnidn wq, wd
@@ -212,6 +218,9 @@ cglobal uyvytoyuv422, 9, 14, 8, ydst, udst, vdst, src, w, h, lum_stride, chrom_s
%if mmsize == 32
movu m15, [pd_permd256_uv]
+%elif mmsize == 64
+ movu m14, [pq_permq512_yy]
+ movu m15, [pd_permd512_uv]
%endif
.loop_line:
@@ -265,6 +274,7 @@ cglobal uyvytoyuv422, 9, 14, 8, ydst, udst, vdst, src, w, h, lum_stride, chrom_s
packuswb m6, m7 ; YYYY YYYY...
VPERM q, 32, m6, m6, 0xd8
+ VPERM q, 64, m6, m14, m6
movu [ydstq + wq], m6
@@ -277,6 +287,7 @@ cglobal uyvytoyuv422, 9, 14, 8, ydst, udst, vdst, src, w, h, lum_stride, chrom_s
packuswb m6, m7 ; YYYY YYYY...
VPERM q, 32, m6, m6, 0xd8
+ VPERM q, 64, m6, m14, m6
movu [ydstq + wq + mmsize], m6
@@ -295,6 +306,7 @@ cglobal uyvytoyuv422, 9, 14, 8, ydst, udst, vdst, src, w, h, lum_stride, chrom_s
packuswb m6, m7 ; UUUU
VPERM d, 32, m6, m15, m6
+ VPERM d, 64, m6, m15, m6
movu [udstq + whalfq], m6
@@ -304,6 +316,7 @@ cglobal uyvytoyuv422, 9, 14, 8, ydst, udst, vdst, src, w, h, lum_stride, chrom_s
packuswb m2, m4 ; VVVV
VPERM d, 32, m2, m15, m2
+ VPERM d, 64, m2, m15, m2
movu [vdstq + whalfq], m2
@@ -340,4 +353,8 @@ UYVY_TO_YUV422
INIT_YMM avx2
UYVY_TO_YUV422
%endif
+%if HAVE_AVX512_EXTERNAL
+INIT_ZMM avx512
+UYVY_TO_YUV422
+%endif
%endif
By accelerating uyvytoyuv422 with AVX512, it can run faster. Performance data (less is better): uyvytoyuv422_avx2 0.27915 uyvytoyuv422_avx512 0.16442 Signed-off-by: Wu Jianhua <jianhua.wu@intel.com> --- libswscale/x86/rgb2rgb.c | 6 ++++++ libswscale/x86/rgb_2_rgb.asm | 17 +++++++++++++++++ 2 files changed, 23 insertions(+)