Message ID | 20191021154504.14753-1-onemda@gmail.com |
---|---|
State | Superseded |
Headers | show |
On 10/21/2019 12:45 PM, Paul B Mahol wrote: > Signed-off-by: Paul B Mahol <onemda@gmail.com> > --- > libavfilter/transpose.h | 10 +++ > libavfilter/vf_transpose.c | 18 ++--- > libavfilter/x86/Makefile | 2 + > libavfilter/x86/vf_transpose.asm | 104 ++++++++++++++++++++++++++++ > libavfilter/x86/vf_transpose_init.c | 49 +++++++++++++ > 5 files changed, 174 insertions(+), 9 deletions(-) > create mode 100644 libavfilter/x86/vf_transpose.asm > create mode 100644 libavfilter/x86/vf_transpose_init.c > > diff --git a/libavfilter/transpose.h b/libavfilter/transpose.h > index aa262b9487..f73a42864f 100644 > --- a/libavfilter/transpose.h > +++ b/libavfilter/transpose.h > @@ -34,4 +34,14 @@ enum TransposeDir { > TRANSPOSE_VFLIP, > }; > > +typedef struct TransVtable { > + void (*transpose_8x8)(uint8_t *src, ptrdiff_t src_linesize, > + uint8_t *dst, ptrdiff_t dst_linesize); > + void (*transpose_block)(uint8_t *src, ptrdiff_t src_linesize, > + uint8_t *dst, ptrdiff_t dst_linesize, > + int w, int h); > +} TransVtable; > + > +void ff_transpose_init_x86(TransVtable *v, int pixstep); > + > #endif > diff --git a/libavfilter/vf_transpose.c b/libavfilter/vf_transpose.c > index dd54947bd9..16ac6c311a 100644 > --- a/libavfilter/vf_transpose.c > +++ b/libavfilter/vf_transpose.c > @@ -40,14 +40,6 @@ > #include "video.h" > #include "transpose.h" > > -typedef struct TransVtable { > - void (*transpose_8x8)(uint8_t *src, ptrdiff_t src_linesize, > - uint8_t *dst, ptrdiff_t dst_linesize); > - void (*transpose_block)(uint8_t *src, ptrdiff_t src_linesize, > - uint8_t *dst, ptrdiff_t dst_linesize, > - int w, int h); > -} TransVtable; > - > typedef struct TransContext { > const AVClass *class; > int hsub, vsub; > @@ -243,7 +235,15 @@ static int config_props_output(AVFilterLink *outlink) > } > } > > - av_log(ctx, AV_LOG_VERBOSE, > + if (ARCH_X86) { > + for (int i = 0; i < 4; i++) { > + TransVtable *v = &s->vtables[i]; > + > + ff_transpose_init_x86(v, s->pixsteps[i]); > + } > + } > + > + av_log(ctx, 
AV_LOG_VERBOSE, > "w:%d h:%d dir:%d -> w:%d h:%d rotation:%s vflip:%d\n", > inlink->w, inlink->h, s->dir, outlink->w, outlink->h, > s->dir == 1 || s->dir == 3 ? "clockwise" : "counterclockwise", > diff --git a/libavfilter/x86/Makefile b/libavfilter/x86/Makefile > index 06f832e36c..8d97e46c3f 100644 > --- a/libavfilter/x86/Makefile > +++ b/libavfilter/x86/Makefile > @@ -31,6 +31,7 @@ OBJS-$(CONFIG_STEREO3D_FILTER) += x86/vf_stereo3d_init.o > OBJS-$(CONFIG_TBLEND_FILTER) += x86/vf_blend_init.o > OBJS-$(CONFIG_THRESHOLD_FILTER) += x86/vf_threshold_init.o > OBJS-$(CONFIG_TINTERLACE_FILTER) += x86/vf_tinterlace_init.o > +OBJS-$(CONFIG_TRANSPOSE_FILTER) += x86/vf_transpose_init.o > OBJS-$(CONFIG_VOLUME_FILTER) += x86/af_volume_init.o > OBJS-$(CONFIG_V360_FILTER) += x86/vf_v360_init.o > OBJS-$(CONFIG_W3FDIF_FILTER) += x86/vf_w3fdif_init.o > @@ -69,6 +70,7 @@ X86ASM-OBJS-$(CONFIG_STEREO3D_FILTER) += x86/vf_stereo3d.o > X86ASM-OBJS-$(CONFIG_TBLEND_FILTER) += x86/vf_blend.o > X86ASM-OBJS-$(CONFIG_THRESHOLD_FILTER) += x86/vf_threshold.o > X86ASM-OBJS-$(CONFIG_TINTERLACE_FILTER) += x86/vf_interlace.o > +X86ASM-OBJS-$(CONFIG_TRANSPOSE_FILTER) += x86/vf_transpose.o > X86ASM-OBJS-$(CONFIG_VOLUME_FILTER) += x86/af_volume.o > X86ASM-OBJS-$(CONFIG_V360_FILTER) += x86/vf_v360.o > X86ASM-OBJS-$(CONFIG_W3FDIF_FILTER) += x86/vf_w3fdif.o > diff --git a/libavfilter/x86/vf_transpose.asm b/libavfilter/x86/vf_transpose.asm > new file mode 100644 > index 0000000000..6d925d5d97 > --- /dev/null > +++ b/libavfilter/x86/vf_transpose.asm > @@ -0,0 +1,104 @@ > +;***************************************************************************** > +;* x86-optimized functions for transpose filter > +;* > +;* Copyright (C) 2019 Paul B Mahol > +;* > +;* This file is part of FFmpeg. 
> +;* > +;* FFmpeg is free software; you can redistribute it and/or > +;* modify it under the terms of the GNU Lesser General Public > +;* License as published by the Free Software Foundation; either > +;* version 2.1 of the License, or (at your option) any later version. > +;* > +;* FFmpeg is distributed in the hope that it will be useful, > +;* but WITHOUT ANY WARRANTY; without even the implied warranty of > +;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU > +;* Lesser General Public License for more details. > +;* > +;* You should have received a copy of the GNU Lesser General Public > +;* License along with FFmpeg; if not, write to the Free Software > +;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA > +;****************************************************************************** > + > +%include "libavutil/x86/x86util.asm" > + > +SECTION .text > + > +;------------------------------------------------------------------------------ > +; void ff_transpose_8x8(uint8_t *src, ptrdiff_t src_linesize, > +; uint8_t *dst, ptrdiff_t dst_linesize) > +;------------------------------------------------------------------------------ > + > +INIT_XMM sse4 > +cglobal transpose_8x8_8, 4,4,8, src, src_linesize, dst, dst_linesize > + movu m0, [srcq] > + add srcq, src_linesizeq > + movu m1, [srcq] > + add srcq, src_linesizeq > + movu m2, [srcq] > + add srcq, src_linesizeq > + movu m3, [srcq] > + add srcq, src_linesizeq > + movu m4, [srcq] > + add srcq, src_linesizeq > + movu m5, [srcq] > + add srcq, src_linesizeq > + movu m6, [srcq] > + add srcq, src_linesizeq > + movu m7, [srcq] > + > + TRANSPOSE_8X8B 0, 1, 2, 3, 4, 5, 6, 7 > + > + movq [dstq], m0 > + add dstq, dst_linesizeq > + movq [dstq], m1 > + add dstq, dst_linesizeq > + movq [dstq], m2 > + add dstq, dst_linesizeq > + movq [dstq], m3 > + add dstq, dst_linesizeq > + movq [dstq], m4 > + add dstq, dst_linesizeq > + movq [dstq], m5 > + add dstq, dst_linesizeq > + movq [dstq], m6 > 
+ add dstq, dst_linesizeq > + movq [dstq], m7 > + RET lea linesize3q, [src_linesizeq*3] movu m0, [srcq+src_linesizeq*0] movu m1, [srcq+src_linesizeq*1] movu m2, [srcq+src_linesizeq*2] movu m3, [srcq+linesize3q] lea srcq, [srcq+src_linesizeq*4] movu m4, [srcq+src_linesizeq*0] movu m5, [srcq+src_linesizeq*1] movu m6, [srcq+src_linesizeq*2] movu m7, [srcq+linesize3q] TRANSPOSE_8X8B 0, 1, 2, 3, 4, 5, 6, 7 lea linesize3q, [dst_linesizeq*3] movu [dstq+dst_linesizeq*0], m0 movu [dstq+dst_linesizeq*1], m1 movu [dstq+dst_linesizeq*2], m2 movu [dstq+linesize3q], m3 lea dstq, [dstq+dst_linesizeq*4] movu [dstq+dst_linesizeq*0], m4 movu [dstq+dst_linesizeq*1], m5 movu [dstq+dst_linesizeq*2], m6 movu [dstq+linesize3q], m7 RET Then something similar for the function below. > + > +INIT_XMM sse4 > +cglobal transpose_8x8_16, 4,4,9, src, src_linesize, dst, dst_linesize > + movu m0, [srcq] > + add srcq, src_linesizeq > + movu m1, [srcq] > + add srcq, src_linesizeq > + movu m2, [srcq] > + add srcq, src_linesizeq > + movu m3, [srcq] > + add srcq, src_linesizeq > + movu m4, [srcq] > + add srcq, src_linesizeq > + movu m5, [srcq] > + add srcq, src_linesizeq > + movu m6, [srcq] > + add srcq, src_linesizeq > + movu m7, [srcq] > + > + TRANSPOSE8x8W 0, 1, 2, 3, 4, 5, 6, 7, 8 > + > + movu [dstq], m0 > + add dstq, dst_linesizeq > + movu [dstq], m1 > + add dstq, dst_linesizeq > + movu [dstq], m2 > + add dstq, dst_linesizeq > + movu [dstq], m3 > + add dstq, dst_linesizeq > + movu [dstq], m4 > + add dstq, dst_linesizeq > + movu [dstq], m5 > + add dstq, dst_linesizeq > + movu [dstq], m6 > + add dstq, dst_linesizeq > + movu [dstq], m7 > + RET > diff --git a/libavfilter/x86/vf_transpose_init.c b/libavfilter/x86/vf_transpose_init.c > new file mode 100644 > index 0000000000..4f5acd5e56 > --- /dev/null > +++ b/libavfilter/x86/vf_transpose_init.c > @@ -0,0 +1,49 @@ > +/* > + * Copyright (C) 2019 Paul B Mahol > + * > + * This file is part of FFmpeg. 
> + * > + * FFmpeg is free software; you can redistribute it and/or > + * modify it under the terms of the GNU Lesser General Public > + * License as published by the Free Software Foundation; either > + * version 2.1 of the License, or (at your option) any later version. > + * > + * FFmpeg is distributed in the hope that it will be useful, > + * but WITHOUT ANY WARRANTY; without even the implied warranty of > + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU > + * Lesser General Public License for more details. > + * > + * You should have received a copy of the GNU Lesser General Public > + * License along with FFmpeg; if not, write to the Free Software > + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA > + */ > + > +#include "libavutil/attributes.h" > +#include "libavutil/cpu.h" > +#include "libavutil/mem.h" > +#include "libavutil/x86/asm.h" > +#include "libavutil/x86/cpu.h" > +#include "libavfilter/transpose.h" > + > +void ff_transpose_8x8_8_sse4(uint8_t *src, > + ptrdiff_t src_linesize, > + uint8_t *dst, > + ptrdiff_t dst_linesize); > + > +void ff_transpose_8x8_16_sse4(uint8_t *src, > + ptrdiff_t src_linesize, > + uint8_t *dst, > + ptrdiff_t dst_linesize); > + > +av_cold void ff_transpose_init_x86(TransVtable *v, int pixstep) > +{ > + int cpu_flags = av_get_cpu_flags(); > + > + if (EXTERNAL_SSE4(cpu_flags) && pixstep == 1) { > + v->transpose_8x8 = ff_transpose_8x8_8_sse4; > + } > + > + if (EXTERNAL_SSE4(cpu_flags) && pixstep == 2) { > + v->transpose_8x8 = ff_transpose_8x8_16_sse4; > + } > +} >
On 10/21/2019 12:45 PM, Paul B Mahol wrote: > Signed-off-by: Paul B Mahol <onemda@gmail.com> > --- > libavfilter/transpose.h | 10 +++ > libavfilter/vf_transpose.c | 18 ++--- > libavfilter/x86/Makefile | 2 + > libavfilter/x86/vf_transpose.asm | 104 ++++++++++++++++++++++++++++ > libavfilter/x86/vf_transpose_init.c | 49 +++++++++++++ > 5 files changed, 174 insertions(+), 9 deletions(-) > create mode 100644 libavfilter/x86/vf_transpose.asm > create mode 100644 libavfilter/x86/vf_transpose_init.c > > diff --git a/libavfilter/transpose.h b/libavfilter/transpose.h > index aa262b9487..f73a42864f 100644 > --- a/libavfilter/transpose.h > +++ b/libavfilter/transpose.h > @@ -34,4 +34,14 @@ enum TransposeDir { > TRANSPOSE_VFLIP, > }; > > +typedef struct TransVtable { > + void (*transpose_8x8)(uint8_t *src, ptrdiff_t src_linesize, > + uint8_t *dst, ptrdiff_t dst_linesize); > + void (*transpose_block)(uint8_t *src, ptrdiff_t src_linesize, > + uint8_t *dst, ptrdiff_t dst_linesize, > + int w, int h); > +} TransVtable; > + > +void ff_transpose_init_x86(TransVtable *v, int pixstep); > + > #endif > diff --git a/libavfilter/vf_transpose.c b/libavfilter/vf_transpose.c > index dd54947bd9..16ac6c311a 100644 > --- a/libavfilter/vf_transpose.c > +++ b/libavfilter/vf_transpose.c > @@ -40,14 +40,6 @@ > #include "video.h" > #include "transpose.h" > > -typedef struct TransVtable { > - void (*transpose_8x8)(uint8_t *src, ptrdiff_t src_linesize, > - uint8_t *dst, ptrdiff_t dst_linesize); > - void (*transpose_block)(uint8_t *src, ptrdiff_t src_linesize, > - uint8_t *dst, ptrdiff_t dst_linesize, > - int w, int h); > -} TransVtable; > - > typedef struct TransContext { > const AVClass *class; > int hsub, vsub; > @@ -243,7 +235,15 @@ static int config_props_output(AVFilterLink *outlink) > } > } > > - av_log(ctx, AV_LOG_VERBOSE, > + if (ARCH_X86) { > + for (int i = 0; i < 4; i++) { > + TransVtable *v = &s->vtables[i]; > + > + ff_transpose_init_x86(v, s->pixsteps[i]); > + } > + } > + > + av_log(ctx, 
AV_LOG_VERBOSE, > "w:%d h:%d dir:%d -> w:%d h:%d rotation:%s vflip:%d\n", > inlink->w, inlink->h, s->dir, outlink->w, outlink->h, > s->dir == 1 || s->dir == 3 ? "clockwise" : "counterclockwise", > diff --git a/libavfilter/x86/Makefile b/libavfilter/x86/Makefile > index 06f832e36c..8d97e46c3f 100644 > --- a/libavfilter/x86/Makefile > +++ b/libavfilter/x86/Makefile > @@ -31,6 +31,7 @@ OBJS-$(CONFIG_STEREO3D_FILTER) += x86/vf_stereo3d_init.o > OBJS-$(CONFIG_TBLEND_FILTER) += x86/vf_blend_init.o > OBJS-$(CONFIG_THRESHOLD_FILTER) += x86/vf_threshold_init.o > OBJS-$(CONFIG_TINTERLACE_FILTER) += x86/vf_tinterlace_init.o > +OBJS-$(CONFIG_TRANSPOSE_FILTER) += x86/vf_transpose_init.o > OBJS-$(CONFIG_VOLUME_FILTER) += x86/af_volume_init.o > OBJS-$(CONFIG_V360_FILTER) += x86/vf_v360_init.o > OBJS-$(CONFIG_W3FDIF_FILTER) += x86/vf_w3fdif_init.o > @@ -69,6 +70,7 @@ X86ASM-OBJS-$(CONFIG_STEREO3D_FILTER) += x86/vf_stereo3d.o > X86ASM-OBJS-$(CONFIG_TBLEND_FILTER) += x86/vf_blend.o > X86ASM-OBJS-$(CONFIG_THRESHOLD_FILTER) += x86/vf_threshold.o > X86ASM-OBJS-$(CONFIG_TINTERLACE_FILTER) += x86/vf_interlace.o > +X86ASM-OBJS-$(CONFIG_TRANSPOSE_FILTER) += x86/vf_transpose.o > X86ASM-OBJS-$(CONFIG_VOLUME_FILTER) += x86/af_volume.o > X86ASM-OBJS-$(CONFIG_V360_FILTER) += x86/vf_v360.o > X86ASM-OBJS-$(CONFIG_W3FDIF_FILTER) += x86/vf_w3fdif.o > diff --git a/libavfilter/x86/vf_transpose.asm b/libavfilter/x86/vf_transpose.asm > new file mode 100644 > index 0000000000..6d925d5d97 > --- /dev/null > +++ b/libavfilter/x86/vf_transpose.asm > @@ -0,0 +1,104 @@ > +;***************************************************************************** > +;* x86-optimized functions for transpose filter > +;* > +;* Copyright (C) 2019 Paul B Mahol > +;* > +;* This file is part of FFmpeg. 
> +;* > +;* FFmpeg is free software; you can redistribute it and/or > +;* modify it under the terms of the GNU Lesser General Public > +;* License as published by the Free Software Foundation; either > +;* version 2.1 of the License, or (at your option) any later version. > +;* > +;* FFmpeg is distributed in the hope that it will be useful, > +;* but WITHOUT ANY WARRANTY; without even the implied warranty of > +;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU > +;* Lesser General Public License for more details. > +;* > +;* You should have received a copy of the GNU Lesser General Public > +;* License along with FFmpeg; if not, write to the Free Software > +;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA > +;****************************************************************************** > + > +%include "libavutil/x86/x86util.asm" > + > +SECTION .text > + > +;------------------------------------------------------------------------------ > +; void ff_transpose_8x8(uint8_t *src, ptrdiff_t src_linesize, > +; uint8_t *dst, ptrdiff_t dst_linesize) > +;------------------------------------------------------------------------------ > + > +INIT_XMM sse4 > +cglobal transpose_8x8_8, 4,4,8, src, src_linesize, dst, dst_linesize > + movu m0, [srcq] > + add srcq, src_linesizeq > + movu m1, [srcq] > + add srcq, src_linesizeq > + movu m2, [srcq] > + add srcq, src_linesizeq > + movu m3, [srcq] > + add srcq, src_linesizeq > + movu m4, [srcq] > + add srcq, src_linesizeq > + movu m5, [srcq] > + add srcq, src_linesizeq > + movu m6, [srcq] > + add srcq, src_linesizeq > + movu m7, [srcq] > + > + TRANSPOSE_8X8B 0, 1, 2, 3, 4, 5, 6, 7 > + > + movq [dstq], m0 > + add dstq, dst_linesizeq > + movq [dstq], m1 > + add dstq, dst_linesizeq > + movq [dstq], m2 > + add dstq, dst_linesizeq > + movq [dstq], m3 > + add dstq, dst_linesizeq > + movq [dstq], m4 > + add dstq, dst_linesizeq > + movq [dstq], m5 > + add dstq, dst_linesizeq > + movq [dstq], m6 > 
+ add dstq, dst_linesizeq > + movq [dstq], m7 > + RET > + > +INIT_XMM sse4 > +cglobal transpose_8x8_16, 4,4,9, src, src_linesize, dst, dst_linesize > + movu m0, [srcq] > + add srcq, src_linesizeq > + movu m1, [srcq] > + add srcq, src_linesizeq > + movu m2, [srcq] > + add srcq, src_linesizeq > + movu m3, [srcq] > + add srcq, src_linesizeq > + movu m4, [srcq] > + add srcq, src_linesizeq > + movu m5, [srcq] > + add srcq, src_linesizeq > + movu m6, [srcq] > + add srcq, src_linesizeq > + movu m7, [srcq] > + > + TRANSPOSE8x8W 0, 1, 2, 3, 4, 5, 6, 7, 8 For x86_32 this needs memory arguments (either reserved stack space, or the dst buffer when it's aligned); otherwise it will not compile. If you don't want to do that, just wrap this one function in x86_64 preprocessor checks, here and below. > + > + movu [dstq], m0 > + add dstq, dst_linesizeq > + movu [dstq], m1 > + add dstq, dst_linesizeq > + movu [dstq], m2 > + add dstq, dst_linesizeq > + movu [dstq], m3 > + add dstq, dst_linesizeq > + movu [dstq], m4 > + add dstq, dst_linesizeq > + movu [dstq], m5 > + add dstq, dst_linesizeq > + movu [dstq], m6 > + add dstq, dst_linesizeq > + movu [dstq], m7 > + RET > diff --git a/libavfilter/x86/vf_transpose_init.c b/libavfilter/x86/vf_transpose_init.c > new file mode 100644 > index 0000000000..4f5acd5e56 > --- /dev/null > +++ b/libavfilter/x86/vf_transpose_init.c > @@ -0,0 +1,49 @@ > +/* > + * Copyright (C) 2019 Paul B Mahol > + * > + * This file is part of FFmpeg. > + * > + * FFmpeg is free software; you can redistribute it and/or > + * modify it under the terms of the GNU Lesser General Public > + * License as published by the Free Software Foundation; either > + * version 2.1 of the License, or (at your option) any later version. > + * > + * FFmpeg is distributed in the hope that it will be useful, > + * but WITHOUT ANY WARRANTY; without even the implied warranty of > + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
See the GNU > + * Lesser General Public License for more details. > + * > + * You should have received a copy of the GNU Lesser General Public > + * License along with FFmpeg; if not, write to the Free Software > + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA > + */ > + > +#include "libavutil/attributes.h" > +#include "libavutil/cpu.h" > +#include "libavutil/mem.h" > +#include "libavutil/x86/asm.h" > +#include "libavutil/x86/cpu.h" > +#include "libavfilter/transpose.h" > + > +void ff_transpose_8x8_8_sse4(uint8_t *src, > + ptrdiff_t src_linesize, > + uint8_t *dst, > + ptrdiff_t dst_linesize); > + > +void ff_transpose_8x8_16_sse4(uint8_t *src, > + ptrdiff_t src_linesize, > + uint8_t *dst, > + ptrdiff_t dst_linesize); > + > +av_cold void ff_transpose_init_x86(TransVtable *v, int pixstep) > +{ > + int cpu_flags = av_get_cpu_flags(); > + > + if (EXTERNAL_SSE4(cpu_flags) && pixstep == 1) { > + v->transpose_8x8 = ff_transpose_8x8_8_sse4; > + } > + > + if (EXTERNAL_SSE4(cpu_flags) && pixstep == 2) { > + v->transpose_8x8 = ff_transpose_8x8_16_sse4; > + } > +} >
On 10/21/2019 1:36 PM, James Almer wrote: > On 10/21/2019 12:45 PM, Paul B Mahol wrote: >> Signed-off-by: Paul B Mahol <onemda@gmail.com> >> --- >> libavfilter/transpose.h | 10 +++ >> libavfilter/vf_transpose.c | 18 ++--- >> libavfilter/x86/Makefile | 2 + >> libavfilter/x86/vf_transpose.asm | 104 ++++++++++++++++++++++++++++ >> libavfilter/x86/vf_transpose_init.c | 49 +++++++++++++ >> 5 files changed, 174 insertions(+), 9 deletions(-) >> create mode 100644 libavfilter/x86/vf_transpose.asm >> create mode 100644 libavfilter/x86/vf_transpose_init.c >> >> diff --git a/libavfilter/transpose.h b/libavfilter/transpose.h >> index aa262b9487..f73a42864f 100644 >> --- a/libavfilter/transpose.h >> +++ b/libavfilter/transpose.h >> @@ -34,4 +34,14 @@ enum TransposeDir { >> TRANSPOSE_VFLIP, >> }; >> >> +typedef struct TransVtable { >> + void (*transpose_8x8)(uint8_t *src, ptrdiff_t src_linesize, >> + uint8_t *dst, ptrdiff_t dst_linesize); >> + void (*transpose_block)(uint8_t *src, ptrdiff_t src_linesize, >> + uint8_t *dst, ptrdiff_t dst_linesize, >> + int w, int h); >> +} TransVtable; >> + >> +void ff_transpose_init_x86(TransVtable *v, int pixstep); >> + >> #endif >> diff --git a/libavfilter/vf_transpose.c b/libavfilter/vf_transpose.c >> index dd54947bd9..16ac6c311a 100644 >> --- a/libavfilter/vf_transpose.c >> +++ b/libavfilter/vf_transpose.c >> @@ -40,14 +40,6 @@ >> #include "video.h" >> #include "transpose.h" >> >> -typedef struct TransVtable { >> - void (*transpose_8x8)(uint8_t *src, ptrdiff_t src_linesize, >> - uint8_t *dst, ptrdiff_t dst_linesize); >> - void (*transpose_block)(uint8_t *src, ptrdiff_t src_linesize, >> - uint8_t *dst, ptrdiff_t dst_linesize, >> - int w, int h); >> -} TransVtable; >> - >> typedef struct TransContext { >> const AVClass *class; >> int hsub, vsub; >> @@ -243,7 +235,15 @@ static int config_props_output(AVFilterLink *outlink) >> } >> } >> >> - av_log(ctx, AV_LOG_VERBOSE, >> + if (ARCH_X86) { >> + for (int i = 0; i < 4; i++) { >> + TransVtable *v = 
&s->vtables[i]; >> + >> + ff_transpose_init_x86(v, s->pixsteps[i]); >> + } >> + } >> + >> + av_log(ctx, AV_LOG_VERBOSE, >> "w:%d h:%d dir:%d -> w:%d h:%d rotation:%s vflip:%d\n", >> inlink->w, inlink->h, s->dir, outlink->w, outlink->h, >> s->dir == 1 || s->dir == 3 ? "clockwise" : "counterclockwise", >> diff --git a/libavfilter/x86/Makefile b/libavfilter/x86/Makefile >> index 06f832e36c..8d97e46c3f 100644 >> --- a/libavfilter/x86/Makefile >> +++ b/libavfilter/x86/Makefile >> @@ -31,6 +31,7 @@ OBJS-$(CONFIG_STEREO3D_FILTER) += x86/vf_stereo3d_init.o >> OBJS-$(CONFIG_TBLEND_FILTER) += x86/vf_blend_init.o >> OBJS-$(CONFIG_THRESHOLD_FILTER) += x86/vf_threshold_init.o >> OBJS-$(CONFIG_TINTERLACE_FILTER) += x86/vf_tinterlace_init.o >> +OBJS-$(CONFIG_TRANSPOSE_FILTER) += x86/vf_transpose_init.o >> OBJS-$(CONFIG_VOLUME_FILTER) += x86/af_volume_init.o >> OBJS-$(CONFIG_V360_FILTER) += x86/vf_v360_init.o >> OBJS-$(CONFIG_W3FDIF_FILTER) += x86/vf_w3fdif_init.o >> @@ -69,6 +70,7 @@ X86ASM-OBJS-$(CONFIG_STEREO3D_FILTER) += x86/vf_stereo3d.o >> X86ASM-OBJS-$(CONFIG_TBLEND_FILTER) += x86/vf_blend.o >> X86ASM-OBJS-$(CONFIG_THRESHOLD_FILTER) += x86/vf_threshold.o >> X86ASM-OBJS-$(CONFIG_TINTERLACE_FILTER) += x86/vf_interlace.o >> +X86ASM-OBJS-$(CONFIG_TRANSPOSE_FILTER) += x86/vf_transpose.o >> X86ASM-OBJS-$(CONFIG_VOLUME_FILTER) += x86/af_volume.o >> X86ASM-OBJS-$(CONFIG_V360_FILTER) += x86/vf_v360.o >> X86ASM-OBJS-$(CONFIG_W3FDIF_FILTER) += x86/vf_w3fdif.o >> diff --git a/libavfilter/x86/vf_transpose.asm b/libavfilter/x86/vf_transpose.asm >> new file mode 100644 >> index 0000000000..6d925d5d97 >> --- /dev/null >> +++ b/libavfilter/x86/vf_transpose.asm >> @@ -0,0 +1,104 @@ >> +;***************************************************************************** >> +;* x86-optimized functions for transpose filter >> +;* >> +;* Copyright (C) 2019 Paul B Mahol >> +;* >> +;* This file is part of FFmpeg. 
>> +;* >> +;* FFmpeg is free software; you can redistribute it and/or >> +;* modify it under the terms of the GNU Lesser General Public >> +;* License as published by the Free Software Foundation; either >> +;* version 2.1 of the License, or (at your option) any later version. >> +;* >> +;* FFmpeg is distributed in the hope that it will be useful, >> +;* but WITHOUT ANY WARRANTY; without even the implied warranty of >> +;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU >> +;* Lesser General Public License for more details. >> +;* >> +;* You should have received a copy of the GNU Lesser General Public >> +;* License along with FFmpeg; if not, write to the Free Software >> +;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA >> +;****************************************************************************** >> + >> +%include "libavutil/x86/x86util.asm" >> + >> +SECTION .text >> + >> +;------------------------------------------------------------------------------ >> +; void ff_transpose_8x8(uint8_t *src, ptrdiff_t src_linesize, >> +; uint8_t *dst, ptrdiff_t dst_linesize) >> +;------------------------------------------------------------------------------ >> + >> +INIT_XMM sse4 >> +cglobal transpose_8x8_8, 4,4,8, src, src_linesize, dst, dst_linesize >> + movu m0, [srcq] >> + add srcq, src_linesizeq >> + movu m1, [srcq] >> + add srcq, src_linesizeq >> + movu m2, [srcq] >> + add srcq, src_linesizeq >> + movu m3, [srcq] >> + add srcq, src_linesizeq >> + movu m4, [srcq] >> + add srcq, src_linesizeq >> + movu m5, [srcq] >> + add srcq, src_linesizeq >> + movu m6, [srcq] >> + add srcq, src_linesizeq >> + movu m7, [srcq] >> + >> + TRANSPOSE_8X8B 0, 1, 2, 3, 4, 5, 6, 7 This macro seems to only use the lower eight bytes of each register, for both input and output. So i think you can replace the movu's with movq's. 
>> + >> + movq [dstq], m0 >> + add dstq, dst_linesizeq >> + movq [dstq], m1 >> + add dstq, dst_linesizeq >> + movq [dstq], m2 >> + add dstq, dst_linesizeq >> + movq [dstq], m3 >> + add dstq, dst_linesizeq >> + movq [dstq], m4 >> + add dstq, dst_linesizeq >> + movq [dstq], m5 >> + add dstq, dst_linesizeq >> + movq [dstq], m6 >> + add dstq, dst_linesizeq >> + movq [dstq], m7 >> + RET > > lea linesize3q, [src_linesizeq*3] > movu m0, [srcq+src_linesizeq*0] > movu m1, [srcq+src_linesizeq*1] > movu m2, [srcq+src_linesizeq*2] > movu m3, [srcq+linesize3q] > lea srcq, [srcq+src_linesizeq*4] > movu m4, [srcq+src_linesizeq*0] > movu m5, [srcq+src_linesizeq*1] > movu m6, [srcq+src_linesizeq*2] > movu m7, [srcq+linesize3q] > > TRANSPOSE_8X8B 0, 1, 2, 3, 4, 5, 6, 7 > > lea linesize3q, [dst_linesizeq*3] > movu [dstq+dst_linesizeq*0], m0 > movu [dstq+dst_linesizeq*1], m1 > movu [dstq+dst_linesizeq*2], m2 > movu [dstq+linesize3q], m3 > lea dstq, [dstq+dst_linesizeq*4] > movu [dstq+dst_linesizeq*0], m4 > movu [dstq+dst_linesizeq*1], m5 > movu [dstq+dst_linesizeq*2], m6 > movu [dstq+linesize3q], m7 > RET Also, i obviously screwed up by using movu here.
diff --git a/libavfilter/transpose.h b/libavfilter/transpose.h index aa262b9487..f73a42864f 100644 --- a/libavfilter/transpose.h +++ b/libavfilter/transpose.h @@ -34,4 +34,14 @@ enum TransposeDir { TRANSPOSE_VFLIP, }; +typedef struct TransVtable { + void (*transpose_8x8)(uint8_t *src, ptrdiff_t src_linesize, + uint8_t *dst, ptrdiff_t dst_linesize); + void (*transpose_block)(uint8_t *src, ptrdiff_t src_linesize, + uint8_t *dst, ptrdiff_t dst_linesize, + int w, int h); +} TransVtable; + +void ff_transpose_init_x86(TransVtable *v, int pixstep); + #endif diff --git a/libavfilter/vf_transpose.c b/libavfilter/vf_transpose.c index dd54947bd9..16ac6c311a 100644 --- a/libavfilter/vf_transpose.c +++ b/libavfilter/vf_transpose.c @@ -40,14 +40,6 @@ #include "video.h" #include "transpose.h" -typedef struct TransVtable { - void (*transpose_8x8)(uint8_t *src, ptrdiff_t src_linesize, - uint8_t *dst, ptrdiff_t dst_linesize); - void (*transpose_block)(uint8_t *src, ptrdiff_t src_linesize, - uint8_t *dst, ptrdiff_t dst_linesize, - int w, int h); -} TransVtable; - typedef struct TransContext { const AVClass *class; int hsub, vsub; @@ -243,7 +235,15 @@ static int config_props_output(AVFilterLink *outlink) } } - av_log(ctx, AV_LOG_VERBOSE, + if (ARCH_X86) { + for (int i = 0; i < 4; i++) { + TransVtable *v = &s->vtables[i]; + + ff_transpose_init_x86(v, s->pixsteps[i]); + } + } + + av_log(ctx, AV_LOG_VERBOSE, "w:%d h:%d dir:%d -> w:%d h:%d rotation:%s vflip:%d\n", inlink->w, inlink->h, s->dir, outlink->w, outlink->h, s->dir == 1 || s->dir == 3 ? 
"clockwise" : "counterclockwise", diff --git a/libavfilter/x86/Makefile b/libavfilter/x86/Makefile index 06f832e36c..8d97e46c3f 100644 --- a/libavfilter/x86/Makefile +++ b/libavfilter/x86/Makefile @@ -31,6 +31,7 @@ OBJS-$(CONFIG_STEREO3D_FILTER) += x86/vf_stereo3d_init.o OBJS-$(CONFIG_TBLEND_FILTER) += x86/vf_blend_init.o OBJS-$(CONFIG_THRESHOLD_FILTER) += x86/vf_threshold_init.o OBJS-$(CONFIG_TINTERLACE_FILTER) += x86/vf_tinterlace_init.o +OBJS-$(CONFIG_TRANSPOSE_FILTER) += x86/vf_transpose_init.o OBJS-$(CONFIG_VOLUME_FILTER) += x86/af_volume_init.o OBJS-$(CONFIG_V360_FILTER) += x86/vf_v360_init.o OBJS-$(CONFIG_W3FDIF_FILTER) += x86/vf_w3fdif_init.o @@ -69,6 +70,7 @@ X86ASM-OBJS-$(CONFIG_STEREO3D_FILTER) += x86/vf_stereo3d.o X86ASM-OBJS-$(CONFIG_TBLEND_FILTER) += x86/vf_blend.o X86ASM-OBJS-$(CONFIG_THRESHOLD_FILTER) += x86/vf_threshold.o X86ASM-OBJS-$(CONFIG_TINTERLACE_FILTER) += x86/vf_interlace.o +X86ASM-OBJS-$(CONFIG_TRANSPOSE_FILTER) += x86/vf_transpose.o X86ASM-OBJS-$(CONFIG_VOLUME_FILTER) += x86/af_volume.o X86ASM-OBJS-$(CONFIG_V360_FILTER) += x86/vf_v360.o X86ASM-OBJS-$(CONFIG_W3FDIF_FILTER) += x86/vf_w3fdif.o diff --git a/libavfilter/x86/vf_transpose.asm b/libavfilter/x86/vf_transpose.asm new file mode 100644 index 0000000000..6d925d5d97 --- /dev/null +++ b/libavfilter/x86/vf_transpose.asm @@ -0,0 +1,104 @@ +;***************************************************************************** +;* x86-optimized functions for transpose filter +;* +;* Copyright (C) 2019 Paul B Mahol +;* +;* This file is part of FFmpeg. +;* +;* FFmpeg is free software; you can redistribute it and/or +;* modify it under the terms of the GNU Lesser General Public +;* License as published by the Free Software Foundation; either +;* version 2.1 of the License, or (at your option) any later version. +;* +;* FFmpeg is distributed in the hope that it will be useful, +;* but WITHOUT ANY WARRANTY; without even the implied warranty of +;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
See the GNU +;* Lesser General Public License for more details. +;* +;* You should have received a copy of the GNU Lesser General Public +;* License along with FFmpeg; if not, write to the Free Software +;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA +;****************************************************************************** + +%include "libavutil/x86/x86util.asm" + +SECTION .text + +;------------------------------------------------------------------------------ +; void ff_transpose_8x8(uint8_t *src, ptrdiff_t src_linesize, +; uint8_t *dst, ptrdiff_t dst_linesize) +;------------------------------------------------------------------------------ + +INIT_XMM sse4 +cglobal transpose_8x8_8, 4,4,8, src, src_linesize, dst, dst_linesize + movu m0, [srcq] + add srcq, src_linesizeq + movu m1, [srcq] + add srcq, src_linesizeq + movu m2, [srcq] + add srcq, src_linesizeq + movu m3, [srcq] + add srcq, src_linesizeq + movu m4, [srcq] + add srcq, src_linesizeq + movu m5, [srcq] + add srcq, src_linesizeq + movu m6, [srcq] + add srcq, src_linesizeq + movu m7, [srcq] + + TRANSPOSE_8X8B 0, 1, 2, 3, 4, 5, 6, 7 + + movq [dstq], m0 + add dstq, dst_linesizeq + movq [dstq], m1 + add dstq, dst_linesizeq + movq [dstq], m2 + add dstq, dst_linesizeq + movq [dstq], m3 + add dstq, dst_linesizeq + movq [dstq], m4 + add dstq, dst_linesizeq + movq [dstq], m5 + add dstq, dst_linesizeq + movq [dstq], m6 + add dstq, dst_linesizeq + movq [dstq], m7 + RET + +INIT_XMM sse4 +cglobal transpose_8x8_16, 4,4,9, src, src_linesize, dst, dst_linesize + movu m0, [srcq] + add srcq, src_linesizeq + movu m1, [srcq] + add srcq, src_linesizeq + movu m2, [srcq] + add srcq, src_linesizeq + movu m3, [srcq] + add srcq, src_linesizeq + movu m4, [srcq] + add srcq, src_linesizeq + movu m5, [srcq] + add srcq, src_linesizeq + movu m6, [srcq] + add srcq, src_linesizeq + movu m7, [srcq] + + TRANSPOSE8x8W 0, 1, 2, 3, 4, 5, 6, 7, 8 + + movu [dstq], m0 + add dstq, dst_linesizeq + movu [dstq], 
m1 + add dstq, dst_linesizeq + movu [dstq], m2 + add dstq, dst_linesizeq + movu [dstq], m3 + add dstq, dst_linesizeq + movu [dstq], m4 + add dstq, dst_linesizeq + movu [dstq], m5 + add dstq, dst_linesizeq + movu [dstq], m6 + add dstq, dst_linesizeq + movu [dstq], m7 + RET diff --git a/libavfilter/x86/vf_transpose_init.c b/libavfilter/x86/vf_transpose_init.c new file mode 100644 index 0000000000..4f5acd5e56 --- /dev/null +++ b/libavfilter/x86/vf_transpose_init.c @@ -0,0 +1,49 @@ +/* + * Copyright (C) 2019 Paul B Mahol + * + * This file is part of FFmpeg. + * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. 
+ * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#include "libavutil/attributes.h" +#include "libavutil/cpu.h" +#include "libavutil/mem.h" +#include "libavutil/x86/asm.h" +#include "libavutil/x86/cpu.h" +#include "libavfilter/transpose.h" + +void ff_transpose_8x8_8_sse4(uint8_t *src, + ptrdiff_t src_linesize, + uint8_t *dst, + ptrdiff_t dst_linesize); + +void ff_transpose_8x8_16_sse4(uint8_t *src, + ptrdiff_t src_linesize, + uint8_t *dst, + ptrdiff_t dst_linesize); + +av_cold void ff_transpose_init_x86(TransVtable *v, int pixstep) +{ + int cpu_flags = av_get_cpu_flags(); + + if (EXTERNAL_SSE4(cpu_flags) && pixstep == 1) { + v->transpose_8x8 = ff_transpose_8x8_8_sse4; + } + + if (EXTERNAL_SSE4(cpu_flags) && pixstep == 2) { + v->transpose_8x8 = ff_transpose_8x8_16_sse4; + } +}
Signed-off-by: Paul B Mahol <onemda@gmail.com> --- libavfilter/transpose.h | 10 +++ libavfilter/vf_transpose.c | 18 ++--- libavfilter/x86/Makefile | 2 + libavfilter/x86/vf_transpose.asm | 104 ++++++++++++++++++++++++++++ libavfilter/x86/vf_transpose_init.c | 49 +++++++++++++ 5 files changed, 174 insertions(+), 9 deletions(-) create mode 100644 libavfilter/x86/vf_transpose.asm create mode 100644 libavfilter/x86/vf_transpose_init.c