Message ID | 20210227002723.23754-1-onemda@gmail.com |
---|---|
State | Superseded |
Headers | show |
Series | [FFmpeg-devel,1] avcodec/x86: add cfhdenc SIMD |
Related | show |

Context | Check | Description |
---|---|---|
andriy/x86_make | success | Make finished |
andriy/x86_make_fate | success | Make fate finished |
andriy/PPC64_make | success | Make finished |
andriy/PPC64_make_fate | success | Make fate finished |
On 2/26/2021 9:27 PM, Paul B Mahol wrote: > Signed-off-by: Paul B Mahol <onemda@gmail.com> > --- > libavcodec/cfhdencdsp.c | 3 + > libavcodec/x86/Makefile | 2 + > libavcodec/x86/cfhdencdsp.asm | 431 +++++++++++++++++++++++++++++++ > libavcodec/x86/cfhdencdsp_init.c | 48 ++++ > 4 files changed, 484 insertions(+) > create mode 100644 libavcodec/x86/cfhdencdsp.asm > create mode 100644 libavcodec/x86/cfhdencdsp_init.c > > diff --git a/libavcodec/cfhdencdsp.c b/libavcodec/cfhdencdsp.c > index 0becb76d1d..b979e9e09a 100644 > --- a/libavcodec/cfhdencdsp.c > +++ b/libavcodec/cfhdencdsp.c > @@ -73,4 +73,7 @@ av_cold void ff_cfhdencdsp_init(CFHDEncDSPContext *c) > { > c->horiz_filter = horiz_filter; > c->vert_filter = vert_filter; > + > + if (ARCH_X86) > + ff_cfhdencdsp_init_x86(c); > } > diff --git a/libavcodec/x86/Makefile b/libavcodec/x86/Makefile > index 884dc0c759..6361161180 100644 > --- a/libavcodec/x86/Makefile > +++ b/libavcodec/x86/Makefile > @@ -51,6 +51,7 @@ OBJS-$(CONFIG_ALAC_DECODER) += x86/alacdsp_init.o > OBJS-$(CONFIG_APNG_DECODER) += x86/pngdsp_init.o > OBJS-$(CONFIG_CAVS_DECODER) += x86/cavsdsp.o > OBJS-$(CONFIG_CFHD_DECODER) += x86/cfhddsp_init.o > +OBJS-$(CONFIG_CFHD_ENCODER) += x86/cfhdencdsp_init.o > OBJS-$(CONFIG_DCA_DECODER) += x86/dcadsp_init.o x86/synth_filter_init.o > OBJS-$(CONFIG_DNXHD_ENCODER) += x86/dnxhdenc_init.o > OBJS-$(CONFIG_EXR_DECODER) += x86/exrdsp_init.o > @@ -154,6 +155,7 @@ X86ASM-OBJS-$(CONFIG_ADPCM_G722_ENCODER) += x86/g722dsp.o > X86ASM-OBJS-$(CONFIG_ALAC_DECODER) += x86/alacdsp.o > X86ASM-OBJS-$(CONFIG_APNG_DECODER) += x86/pngdsp.o > X86ASM-OBJS-$(CONFIG_CAVS_DECODER) += x86/cavsidct.o > +X86ASM-OBJS-$(CONFIG_CFHD_ENCODER) += x86/cfhdencdsp.o > X86ASM-OBJS-$(CONFIG_CFHD_DECODER) += x86/cfhddsp.o > X86ASM-OBJS-$(CONFIG_DCA_DECODER) += x86/dcadsp.o x86/synth_filter.o > X86ASM-OBJS-$(CONFIG_DIRAC_DECODER) += x86/diracdsp.o \ > diff --git a/libavcodec/x86/cfhdencdsp.asm b/libavcodec/x86/cfhdencdsp.asm > new file mode 100644 > index 
0000000000..be51c77c46 > --- /dev/null > +++ b/libavcodec/x86/cfhdencdsp.asm > @@ -0,0 +1,431 @@ > +;****************************************************************************** > +;* x86-optimized functions for the CFHD encoder > +;* Copyright (c) 2021 Paul B Mahol > +;* > +;* This file is part of FFmpeg. > +;* > +;* FFmpeg is free software; you can redistribute it and/or > +;* modify it under the terms of the GNU Lesser General Public > +;* License as published by the Free Software Foundation; either > +;* version 2.1 of the License, or (at your option) any later version. > +;* > +;* FFmpeg is distributed in the hope that it will be useful, > +;* but WITHOUT ANY WARRANTY; without even the implied warranty of > +;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU > +;* Lesser General Public License for more details. > +;* > +;* You should have received a copy of the GNU Lesser General Public > +;* License along with FFmpeg; if not, write to the Free Software > +;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA > +;****************************************************************************** > + > +%include "libavutil/x86/x86util.asm" > + > +SECTION_RODATA > + > +pw_p1_n1: dw 1, -1, 1, -1, 1, -1, 1, -1 > +pw_n1_p1: dw -1, 1, -1, 1, -1, 1, -1, 1 > +pw_p5_n11: dw 5, -11, 5, -11, 5, -11, 5, -11 > +pw_n5_p11: dw -5, 11, -5, 11, -5, 11, -5, 11 > +pw_p11_n5: dw 11, -5, 11, -5, 11, -5, 11, -5 > +pw_n11_p5: dw -11, 5, -11, 5, -11, 5, -11, 5 > +pd_4: times 4 dd 4 > +pw_n4: times 8 dw -4 > +pw_n1: times 8 dw -1 cextern pw_m1 > +cextern pw_1 > +cextern pw_4 > + > +SECTION .text > + > +%if ARCH_X86_64 > +INIT_XMM sse2 > +cglobal cfhdenc_horiz_filter, 8, 10, 11, input, low, high, istride, lwidth, hwidth, width, y, x, temp > + shl istrided, 1 > + shl lwidthd, 1 > + shl hwidthd, 1 These are ptrdiff_t, so you can use the q suffix just fine. 
> + mova m7, [pd_4] > + mova m8, [pw_1] > + mova m9, [pw_n1] > + mova m10,[pw_p1_n1] > + neg yq This one is int, so: movsxdifnidn yq, yd neg yq > +.looph: > + movsx xq, word [inputq] > + > + movsx tempq, word [inputq + 2] > + add tempq, xq > + > + movd xm0, tempd > + packssdw m0, m0 > + pextrw tempd, xm0, 0 movd tempd, m0 There's no reason to use pextrw if you're going to read only 16 bits right below, then discard the value. > + mov word [lowq], tempw > + > + movsx xq, word [inputq] > + imul xq, 5 > + movsx tempq, word [inputq + 2] > + imul tempq, -11 > + add tempq, xq > + > + movsx xq, word [inputq + 4] > + imul xq, 4 > + add tempq, xq > + > + movsx xq, word [inputq + 6] > + imul xq, 4 > + add tempq, xq > + > + movsx xq, word [inputq + 8] > + imul xq, -1 > + add tempq, xq > + > + movsx xq, word [inputq + 10] > + imul xq, -1 > + add tempq, xq > + > + add tempq, 4 > + sar tempq, 3 > + > + movd xm0, tempd > + packssdw m0, m0 > + pextrw tempd, xm0, 0 Same. > + mov word [highq], tempw > + > + mov xq, 2 > + > +.loopw: > + movu m0, [inputq + xq * 2] > + movu m1, [inputq + xq * 2 + mmsize] > + > + pmaddwd m0, m8 > + pmaddwd m1, m8 > + > + packssdw m0, m1 > + movu [lowq+xq], m0 > + > + movu m2, [inputq + xq * 2 - 4] > + movu m3, [inputq + xq * 2 - 4 + mmsize] > + > + pmaddwd m2, m9 > + pmaddwd m3, m9 > + > + movu m0, [inputq + xq * 2 + 4] > + movu m1, [inputq + xq * 2 + 4 + mmsize] > + > + pmaddwd m0, m8 > + pmaddwd m1, m8 > + > + paddd m0, m2 > + paddd m1, m3 > + > + paddd m0, m7 > + paddd m1, m7 > + > + psrad m0, 3 > + psrad m1, 3 > + > + movu m5, [inputq + xq * 2 + 0] > + movu m6, [inputq + xq * 2 + mmsize] > + > + pmaddwd m5, m10 > + pmaddwd m6, m10 > + > + paddd m0, m5 > + paddd m1, m6 > + > + packssdw m0, m1 > + movu [highq+xq], m0 > + > + add xq, mmsize > + cmp xq, widthq width is an int, so use movsxdifnidn on it alongside y. > + jl .loopw > + > + add lowq, widthq > + add highq, widthq > + add inputq, widthq > + add inputq, widthq Twice? 
If this is correct, you could use lea. > + > + movsx xq, word [inputq - 4] > + movsx tempq, word [inputq - 2] > + add tempq, xq > + > + movd xm0, tempd > + packssdw m0, m0 > + pextrw tempd, xm0, 0 also movd. > + mov word [lowq-2], tempw > + > + movsx tempq, word [inputq - 4] > + imul tempq, 11 > + movsx xq, word [inputq - 2] > + imul xq, -5 > + add tempq, xq > + > + movsx xq, word [inputq - 6] > + imul xq, -4 > + add tempq, xq > + > + movsx xq, word [inputq - 8] > + imul xq, -4 > + add tempq, xq > + > + movsx xq, word [inputq - 10] > + add tempq, xq > + > + movsx xq, word [inputq - 12] > + add tempq, xq > + > + add tempq, 4 > + sar tempq, 3 > + > + movd xm0, tempd > + packssdw m0, m0 > + pextrw tempd, xm0, 0 Same. > + mov word [highq-2], tempw > + > + sub inputq, widthq > + sub inputq, widthq > + sub highq, widthq > + sub lowq, widthq > + > + add lowq, lwidthq > + add highq, hwidthq > + add inputq, istrideq > + add yq, 1 > + jl .looph > + > + RET > +%endif > + > +%if ARCH_X86_64 > +INIT_XMM sse2 > +cglobal cfhdenc_vert_filter, 8, 11, 14, input, low, high, istride, lwidth, hwidth, width, height, x, y, pos > + shl istrided, 1 > + shl widthd, 1 > + > + sub heightq, 2 heightd > + > + xor xq, xq > + > + mova m7, [pd_4] > + mova m8, [pw_1] > + mova m9, [pw_n1] > + mova m10,[pw_p1_n1] > + mova m11,[pw_n1_p1] > + mova m12,[pw_4] > + mova m13,[pw_n4] > +.loopw: > + mov yq, 2 > + > + mov posq, xq > + movu m0, [inputq + posq] > + add posq, istrideq > + movu m1, [inputq + posq] > + > + paddsw m0, m1 > + > + movu [lowq + xq], m0 > + > + mov posq, xq > + > + movu m0, [inputq + posq] > + add posq, istrideq > + movu m1, [inputq + posq] > + add posq, istrideq > + movu m2, [inputq + posq] > + add posq, istrideq > + movu m3, [inputq + posq] > + add posq, istrideq > + movu m4, [inputq + posq] > + add posq, istrideq > + movu m5, [inputq + posq] > + > + mova m6, m0 > + punpcklwd m0, m1 > + punpckhwd m1, m6 > + > + mova m6, m2 > + punpcklwd m2, m3 > + punpckhwd m3, m6 > + > + mova m6, m4 
> + punpcklwd m4, m5 > + punpckhwd m5, m6 > + > + pmaddwd m0, [pw_p5_n11] > + pmaddwd m1, [pw_n11_p5] > + pmaddwd m2, m12 > + pmaddwd m3, m12 > + pmaddwd m4, m9 > + pmaddwd m5, m9 > + > + paddd m0, m2 > + paddd m1, m3 > + paddd m0, m4 > + paddd m1, m5 > + > + paddd m0, m7 > + paddd m1, m7 > + > + psrad m0, 3 > + psrad m1, 3 > + packssdw m0, m1 > + > + movu [highq + xq], m0 > + > +.looph: > + > + mov posq, istrideq > + imul posq, yq > + add posq, xq > + > + movu m0, [inputq + posq] > + > + add posq, istrideq > + movu m1, [inputq + posq] > + > + paddsw m0, m1 > + > + mov posq, lwidthq > + imul posq, yq > + add posq, xq > + > + movu [lowq + posq], m0 > + > + add yq, -2 > + > + mov posq, istrideq > + imul posq, yq > + add posq, xq > + > + movu m0, [inputq + posq] > + add posq, istrideq > + movu m1, [inputq + posq] > + add posq, istrideq > + movu m2, [inputq + posq] > + add posq, istrideq > + movu m3, [inputq + posq] > + add posq, istrideq > + movu m4, [inputq + posq] > + add posq, istrideq > + movu m5, [inputq + posq] > + > + add yq, 2 > + > + mova m6, m0 > + punpcklwd m0, m1 > + punpckhwd m1, m6 > + > + mova m6, m2 > + punpcklwd m2, m3 > + punpckhwd m3, m6 > + > + mova m6, m4 > + punpcklwd m4, m5 > + punpckhwd m5, m6 > + > + pmaddwd m0, m9 > + pmaddwd m1, m9 > + pmaddwd m2, m10 > + pmaddwd m3, m11 > + pmaddwd m4, m8 > + pmaddwd m5, m8 > + > + paddd m0, m4 > + paddd m1, m5 > + > + paddd m0, m7 > + paddd m1, m7 > + > + psrad m0, 3 > + psrad m1, 3 > + paddd m0, m2 > + paddd m1, m3 > + packssdw m0, m1 > + > + mov posq, hwidthq > + imul posq, yq > + add posq, xq > + > + movu [highq + posq], m0 > + > + add yq, 2 > + cmp yq, heightq > + jl .looph > + > + mov posq, istrideq > + imul posq, yq > + add posq, xq > + > + movu m0, [inputq + posq] > + add posq, istrideq > + movu m1, [inputq + posq] > + > + paddsw m0, m1 > + > + mov posq, lwidthq > + imul posq, yq > + add posq, xq > + > + movu [lowq + posq], m0 > + > + sub yq, 4 > + > + mov posq, istrideq > + imul posq, yq > + add 
posq, xq > + > + movu m0, [inputq + posq] > + add posq, istrideq > + movu m1, [inputq + posq] > + add posq, istrideq > + movu m2, [inputq + posq] > + add posq, istrideq > + movu m3, [inputq + posq] > + add posq, istrideq > + movu m4, [inputq + posq] > + add posq, istrideq > + movu m5, [inputq + posq] > + > + add yq, 4 > + > + mova m6, m0 > + punpcklwd m0, m1 > + punpckhwd m1, m6 > + > + mova m6, m2 > + punpcklwd m2, m3 > + punpckhwd m3, m6 > + > + mova m6, m4 > + punpcklwd m4, m5 > + punpckhwd m5, m6 > + > + pmaddwd m0, m8 > + pmaddwd m1, m8 > + pmaddwd m2, m13 > + pmaddwd m3, m13 > + pmaddwd m4, [pw_p11_n5] > + pmaddwd m5, [pw_n5_p11] > + > + paddd m4, m2 > + paddd m5, m3 > + > + paddd m4, m0 > + paddd m5, m1 > + > + paddd m4, m7 > + paddd m5, m7 > + > + psrad m4, 3 > + psrad m5, 3 > + packssdw m4, m5 > + > + mov posq, hwidthq > + imul posq, yq > + add posq, xq > + > + movu [highq + posq], m4 > + > + add xq, mmsize > + cmp xq, widthq > + jl .loopw > + RET > +%endif > diff --git a/libavcodec/x86/cfhdencdsp_init.c b/libavcodec/x86/cfhdencdsp_init.c > new file mode 100644 > index 0000000000..28f1dd504d > --- /dev/null > +++ b/libavcodec/x86/cfhdencdsp_init.c > @@ -0,0 +1,48 @@ > +/* > + * Copyright (c) 2021 Paul B Mahol > + * > + * This file is part of FFmpeg. > + * > + * FFmpeg is free software; you can redistribute it and/or > + * modify it under the terms of the GNU Lesser General Public > + * License as published by the Free Software Foundation; either > + * version 2.1 of the License, or (at your option) any later version. > + * > + * FFmpeg is distributed in the hope that it will be useful, > + * but WITHOUT ANY WARRANTY; without even the implied warranty of > + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU > + * Lesser General Public License for more details. 
> + * > + * You should have received a copy of the GNU Lesser General Public > + * License along with FFmpeg; if not, write to the Free Software > + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA > + */ > + > +#include <stdint.h> > + > +#include "libavutil/attributes.h" > +#include "libavutil/cpu.h" > +#include "libavutil/x86/cpu.h" > +#include "libavcodec/avcodec.h" > +#include "libavcodec/cfhdencdsp.h" > + > +void ff_cfhdenc_horiz_filter_sse2(int16_t *input, int16_t *low, int16_t *high, > + ptrdiff_t in_stride, ptrdiff_t low_stride, > + ptrdiff_t high_stride, > + int width, int height); > +void ff_cfhdenc_vert_filter_sse2(int16_t *input, int16_t *low, int16_t *high, > + ptrdiff_t in_stride, ptrdiff_t low_stride, > + ptrdiff_t high_stride, > + int width, int height); > + > +av_cold void ff_cfhdencdsp_init_x86(CFHDEncDSPContext *c) > +{ > + int cpu_flags = av_get_cpu_flags(); > + > +#if ARCH_X86_64 > + if (EXTERNAL_SSE2(cpu_flags)) { > + c->horiz_filter = ff_cfhdenc_horiz_filter_sse2; > + c->vert_filter = ff_cfhdenc_vert_filter_sse2; > + } > +#endif > +} >
diff --git a/libavcodec/cfhdencdsp.c b/libavcodec/cfhdencdsp.c index 0becb76d1d..b979e9e09a 100644 --- a/libavcodec/cfhdencdsp.c +++ b/libavcodec/cfhdencdsp.c @@ -73,4 +73,7 @@ av_cold void ff_cfhdencdsp_init(CFHDEncDSPContext *c) { c->horiz_filter = horiz_filter; c->vert_filter = vert_filter; + + if (ARCH_X86) + ff_cfhdencdsp_init_x86(c); } diff --git a/libavcodec/x86/Makefile b/libavcodec/x86/Makefile index 884dc0c759..6361161180 100644 --- a/libavcodec/x86/Makefile +++ b/libavcodec/x86/Makefile @@ -51,6 +51,7 @@ OBJS-$(CONFIG_ALAC_DECODER) += x86/alacdsp_init.o OBJS-$(CONFIG_APNG_DECODER) += x86/pngdsp_init.o OBJS-$(CONFIG_CAVS_DECODER) += x86/cavsdsp.o OBJS-$(CONFIG_CFHD_DECODER) += x86/cfhddsp_init.o +OBJS-$(CONFIG_CFHD_ENCODER) += x86/cfhdencdsp_init.o OBJS-$(CONFIG_DCA_DECODER) += x86/dcadsp_init.o x86/synth_filter_init.o OBJS-$(CONFIG_DNXHD_ENCODER) += x86/dnxhdenc_init.o OBJS-$(CONFIG_EXR_DECODER) += x86/exrdsp_init.o @@ -154,6 +155,7 @@ X86ASM-OBJS-$(CONFIG_ADPCM_G722_ENCODER) += x86/g722dsp.o X86ASM-OBJS-$(CONFIG_ALAC_DECODER) += x86/alacdsp.o X86ASM-OBJS-$(CONFIG_APNG_DECODER) += x86/pngdsp.o X86ASM-OBJS-$(CONFIG_CAVS_DECODER) += x86/cavsidct.o +X86ASM-OBJS-$(CONFIG_CFHD_ENCODER) += x86/cfhdencdsp.o X86ASM-OBJS-$(CONFIG_CFHD_DECODER) += x86/cfhddsp.o X86ASM-OBJS-$(CONFIG_DCA_DECODER) += x86/dcadsp.o x86/synth_filter.o X86ASM-OBJS-$(CONFIG_DIRAC_DECODER) += x86/diracdsp.o \ diff --git a/libavcodec/x86/cfhdencdsp.asm b/libavcodec/x86/cfhdencdsp.asm new file mode 100644 index 0000000000..be51c77c46 --- /dev/null +++ b/libavcodec/x86/cfhdencdsp.asm @@ -0,0 +1,431 @@ +;****************************************************************************** +;* x86-optimized functions for the CFHD encoder +;* Copyright (c) 2021 Paul B Mahol +;* +;* This file is part of FFmpeg. 
+;* +;* FFmpeg is free software; you can redistribute it and/or +;* modify it under the terms of the GNU Lesser General Public +;* License as published by the Free Software Foundation; either +;* version 2.1 of the License, or (at your option) any later version. +;* +;* FFmpeg is distributed in the hope that it will be useful, +;* but WITHOUT ANY WARRANTY; without even the implied warranty of +;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +;* Lesser General Public License for more details. +;* +;* You should have received a copy of the GNU Lesser General Public +;* License along with FFmpeg; if not, write to the Free Software +;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA +;****************************************************************************** + +%include "libavutil/x86/x86util.asm" + +SECTION_RODATA + +pw_p1_n1: dw 1, -1, 1, -1, 1, -1, 1, -1 +pw_n1_p1: dw -1, 1, -1, 1, -1, 1, -1, 1 +pw_p5_n11: dw 5, -11, 5, -11, 5, -11, 5, -11 +pw_n5_p11: dw -5, 11, -5, 11, -5, 11, -5, 11 +pw_p11_n5: dw 11, -5, 11, -5, 11, -5, 11, -5 +pw_n11_p5: dw -11, 5, -11, 5, -11, 5, -11, 5 +pd_4: times 4 dd 4 +pw_n4: times 8 dw -4 +pw_n1: times 8 dw -1 +cextern pw_1 +cextern pw_4 + +SECTION .text + +%if ARCH_X86_64 +INIT_XMM sse2 +cglobal cfhdenc_horiz_filter, 8, 10, 11, input, low, high, istride, lwidth, hwidth, width, y, x, temp + shl istrided, 1 + shl lwidthd, 1 + shl hwidthd, 1 + mova m7, [pd_4] + mova m8, [pw_1] + mova m9, [pw_n1] + mova m10,[pw_p1_n1] + neg yq +.looph: + movsx xq, word [inputq] + + movsx tempq, word [inputq + 2] + add tempq, xq + + movd xm0, tempd + packssdw m0, m0 + pextrw tempd, xm0, 0 + mov word [lowq], tempw + + movsx xq, word [inputq] + imul xq, 5 + movsx tempq, word [inputq + 2] + imul tempq, -11 + add tempq, xq + + movsx xq, word [inputq + 4] + imul xq, 4 + add tempq, xq + + movsx xq, word [inputq + 6] + imul xq, 4 + add tempq, xq + + movsx xq, word [inputq + 8] + imul xq, -1 + add tempq, xq + + 
movsx xq, word [inputq + 10] + imul xq, -1 + add tempq, xq + + add tempq, 4 + sar tempq, 3 + + movd xm0, tempd + packssdw m0, m0 + pextrw tempd, xm0, 0 + mov word [highq], tempw + + mov xq, 2 + +.loopw: + movu m0, [inputq + xq * 2] + movu m1, [inputq + xq * 2 + mmsize] + + pmaddwd m0, m8 + pmaddwd m1, m8 + + packssdw m0, m1 + movu [lowq+xq], m0 + + movu m2, [inputq + xq * 2 - 4] + movu m3, [inputq + xq * 2 - 4 + mmsize] + + pmaddwd m2, m9 + pmaddwd m3, m9 + + movu m0, [inputq + xq * 2 + 4] + movu m1, [inputq + xq * 2 + 4 + mmsize] + + pmaddwd m0, m8 + pmaddwd m1, m8 + + paddd m0, m2 + paddd m1, m3 + + paddd m0, m7 + paddd m1, m7 + + psrad m0, 3 + psrad m1, 3 + + movu m5, [inputq + xq * 2 + 0] + movu m6, [inputq + xq * 2 + mmsize] + + pmaddwd m5, m10 + pmaddwd m6, m10 + + paddd m0, m5 + paddd m1, m6 + + packssdw m0, m1 + movu [highq+xq], m0 + + add xq, mmsize + cmp xq, widthq + jl .loopw + + add lowq, widthq + add highq, widthq + add inputq, widthq + add inputq, widthq + + movsx xq, word [inputq - 4] + movsx tempq, word [inputq - 2] + add tempq, xq + + movd xm0, tempd + packssdw m0, m0 + pextrw tempd, xm0, 0 + mov word [lowq-2], tempw + + movsx tempq, word [inputq - 4] + imul tempq, 11 + movsx xq, word [inputq - 2] + imul xq, -5 + add tempq, xq + + movsx xq, word [inputq - 6] + imul xq, -4 + add tempq, xq + + movsx xq, word [inputq - 8] + imul xq, -4 + add tempq, xq + + movsx xq, word [inputq - 10] + add tempq, xq + + movsx xq, word [inputq - 12] + add tempq, xq + + add tempq, 4 + sar tempq, 3 + + movd xm0, tempd + packssdw m0, m0 + pextrw tempd, xm0, 0 + mov word [highq-2], tempw + + sub inputq, widthq + sub inputq, widthq + sub highq, widthq + sub lowq, widthq + + add lowq, lwidthq + add highq, hwidthq + add inputq, istrideq + add yq, 1 + jl .looph + + RET +%endif + +%if ARCH_X86_64 +INIT_XMM sse2 +cglobal cfhdenc_vert_filter, 8, 11, 14, input, low, high, istride, lwidth, hwidth, width, height, x, y, pos + shl istrided, 1 + shl widthd, 1 + + sub heightq, 2 + + xor 
xq, xq + + mova m7, [pd_4] + mova m8, [pw_1] + mova m9, [pw_n1] + mova m10,[pw_p1_n1] + mova m11,[pw_n1_p1] + mova m12,[pw_4] + mova m13,[pw_n4] +.loopw: + mov yq, 2 + + mov posq, xq + movu m0, [inputq + posq] + add posq, istrideq + movu m1, [inputq + posq] + + paddsw m0, m1 + + movu [lowq + xq], m0 + + mov posq, xq + + movu m0, [inputq + posq] + add posq, istrideq + movu m1, [inputq + posq] + add posq, istrideq + movu m2, [inputq + posq] + add posq, istrideq + movu m3, [inputq + posq] + add posq, istrideq + movu m4, [inputq + posq] + add posq, istrideq + movu m5, [inputq + posq] + + mova m6, m0 + punpcklwd m0, m1 + punpckhwd m1, m6 + + mova m6, m2 + punpcklwd m2, m3 + punpckhwd m3, m6 + + mova m6, m4 + punpcklwd m4, m5 + punpckhwd m5, m6 + + pmaddwd m0, [pw_p5_n11] + pmaddwd m1, [pw_n11_p5] + pmaddwd m2, m12 + pmaddwd m3, m12 + pmaddwd m4, m9 + pmaddwd m5, m9 + + paddd m0, m2 + paddd m1, m3 + paddd m0, m4 + paddd m1, m5 + + paddd m0, m7 + paddd m1, m7 + + psrad m0, 3 + psrad m1, 3 + packssdw m0, m1 + + movu [highq + xq], m0 + +.looph: + + mov posq, istrideq + imul posq, yq + add posq, xq + + movu m0, [inputq + posq] + + add posq, istrideq + movu m1, [inputq + posq] + + paddsw m0, m1 + + mov posq, lwidthq + imul posq, yq + add posq, xq + + movu [lowq + posq], m0 + + add yq, -2 + + mov posq, istrideq + imul posq, yq + add posq, xq + + movu m0, [inputq + posq] + add posq, istrideq + movu m1, [inputq + posq] + add posq, istrideq + movu m2, [inputq + posq] + add posq, istrideq + movu m3, [inputq + posq] + add posq, istrideq + movu m4, [inputq + posq] + add posq, istrideq + movu m5, [inputq + posq] + + add yq, 2 + + mova m6, m0 + punpcklwd m0, m1 + punpckhwd m1, m6 + + mova m6, m2 + punpcklwd m2, m3 + punpckhwd m3, m6 + + mova m6, m4 + punpcklwd m4, m5 + punpckhwd m5, m6 + + pmaddwd m0, m9 + pmaddwd m1, m9 + pmaddwd m2, m10 + pmaddwd m3, m11 + pmaddwd m4, m8 + pmaddwd m5, m8 + + paddd m0, m4 + paddd m1, m5 + + paddd m0, m7 + paddd m1, m7 + + psrad m0, 3 + psrad m1, 3 + 
paddd m0, m2 + paddd m1, m3 + packssdw m0, m1 + + mov posq, hwidthq + imul posq, yq + add posq, xq + + movu [highq + posq], m0 + + add yq, 2 + cmp yq, heightq + jl .looph + + mov posq, istrideq + imul posq, yq + add posq, xq + + movu m0, [inputq + posq] + add posq, istrideq + movu m1, [inputq + posq] + + paddsw m0, m1 + + mov posq, lwidthq + imul posq, yq + add posq, xq + + movu [lowq + posq], m0 + + sub yq, 4 + + mov posq, istrideq + imul posq, yq + add posq, xq + + movu m0, [inputq + posq] + add posq, istrideq + movu m1, [inputq + posq] + add posq, istrideq + movu m2, [inputq + posq] + add posq, istrideq + movu m3, [inputq + posq] + add posq, istrideq + movu m4, [inputq + posq] + add posq, istrideq + movu m5, [inputq + posq] + + add yq, 4 + + mova m6, m0 + punpcklwd m0, m1 + punpckhwd m1, m6 + + mova m6, m2 + punpcklwd m2, m3 + punpckhwd m3, m6 + + mova m6, m4 + punpcklwd m4, m5 + punpckhwd m5, m6 + + pmaddwd m0, m8 + pmaddwd m1, m8 + pmaddwd m2, m13 + pmaddwd m3, m13 + pmaddwd m4, [pw_p11_n5] + pmaddwd m5, [pw_n5_p11] + + paddd m4, m2 + paddd m5, m3 + + paddd m4, m0 + paddd m5, m1 + + paddd m4, m7 + paddd m5, m7 + + psrad m4, 3 + psrad m5, 3 + packssdw m4, m5 + + mov posq, hwidthq + imul posq, yq + add posq, xq + + movu [highq + posq], m4 + + add xq, mmsize + cmp xq, widthq + jl .loopw + RET +%endif diff --git a/libavcodec/x86/cfhdencdsp_init.c b/libavcodec/x86/cfhdencdsp_init.c new file mode 100644 index 0000000000..28f1dd504d --- /dev/null +++ b/libavcodec/x86/cfhdencdsp_init.c @@ -0,0 +1,48 @@ +/* + * Copyright (c) 2021 Paul B Mahol + * + * This file is part of FFmpeg. + * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. 
+ * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#include <stdint.h> + +#include "libavutil/attributes.h" +#include "libavutil/cpu.h" +#include "libavutil/x86/cpu.h" +#include "libavcodec/avcodec.h" +#include "libavcodec/cfhdencdsp.h" + +void ff_cfhdenc_horiz_filter_sse2(int16_t *input, int16_t *low, int16_t *high, + ptrdiff_t in_stride, ptrdiff_t low_stride, + ptrdiff_t high_stride, + int width, int height); +void ff_cfhdenc_vert_filter_sse2(int16_t *input, int16_t *low, int16_t *high, + ptrdiff_t in_stride, ptrdiff_t low_stride, + ptrdiff_t high_stride, + int width, int height); + +av_cold void ff_cfhdencdsp_init_x86(CFHDEncDSPContext *c) +{ + int cpu_flags = av_get_cpu_flags(); + +#if ARCH_X86_64 + if (EXTERNAL_SSE2(cpu_flags)) { + c->horiz_filter = ff_cfhdenc_horiz_filter_sse2; + c->vert_filter = ff_cfhdenc_vert_filter_sse2; + } +#endif +}
```
Signed-off-by: Paul B Mahol <onemda@gmail.com>
---
 libavcodec/cfhdencdsp.c          |   3 +
 libavcodec/x86/Makefile          |   2 +
 libavcodec/x86/cfhdencdsp.asm    | 431 +++++++++++++++++++++++++++++++
 libavcodec/x86/cfhdencdsp_init.c |  48 ++++
 4 files changed, 484 insertions(+)
 create mode 100644 libavcodec/x86/cfhdencdsp.asm
 create mode 100644 libavcodec/x86/cfhdencdsp_init.c
```