[FFmpeg-devel] avfilter/vf_transpose: add x86 SIMD

Submitted by Paul B Mahol on Oct. 21, 2019, 3:45 p.m.

Details

Message ID 20191021154504.14753-1-onemda@gmail.com
State New
Headers show

Commit Message

Paul B Mahol Oct. 21, 2019, 3:45 p.m.
Signed-off-by: Paul B Mahol <onemda@gmail.com>
---
 libavfilter/transpose.h             |  10 +++
 libavfilter/vf_transpose.c          |  18 ++---
 libavfilter/x86/Makefile            |   2 +
 libavfilter/x86/vf_transpose.asm    | 104 ++++++++++++++++++++++++++++
 libavfilter/x86/vf_transpose_init.c |  49 +++++++++++++
 5 files changed, 174 insertions(+), 9 deletions(-)
 create mode 100644 libavfilter/x86/vf_transpose.asm
 create mode 100644 libavfilter/x86/vf_transpose_init.c

Comments

James Almer Oct. 21, 2019, 4:36 p.m.
On 10/21/2019 12:45 PM, Paul B Mahol wrote:
> Signed-off-by: Paul B Mahol <onemda@gmail.com>
> ---
>  libavfilter/transpose.h             |  10 +++
>  libavfilter/vf_transpose.c          |  18 ++---
>  libavfilter/x86/Makefile            |   2 +
>  libavfilter/x86/vf_transpose.asm    | 104 ++++++++++++++++++++++++++++
>  libavfilter/x86/vf_transpose_init.c |  49 +++++++++++++
>  5 files changed, 174 insertions(+), 9 deletions(-)
>  create mode 100644 libavfilter/x86/vf_transpose.asm
>  create mode 100644 libavfilter/x86/vf_transpose_init.c
> 
> diff --git a/libavfilter/transpose.h b/libavfilter/transpose.h
> index aa262b9487..f73a42864f 100644
> --- a/libavfilter/transpose.h
> +++ b/libavfilter/transpose.h
> @@ -34,4 +34,14 @@ enum TransposeDir {
>      TRANSPOSE_VFLIP,
>  };
>  
> +typedef struct TransVtable {
> +    void (*transpose_8x8)(uint8_t *src, ptrdiff_t src_linesize,
> +                          uint8_t *dst, ptrdiff_t dst_linesize);
> +    void (*transpose_block)(uint8_t *src, ptrdiff_t src_linesize,
> +                            uint8_t *dst, ptrdiff_t dst_linesize,
> +                            int w, int h);
> +} TransVtable;
> +
> +void ff_transpose_init_x86(TransVtable *v, int pixstep);
> +
>  #endif
> diff --git a/libavfilter/vf_transpose.c b/libavfilter/vf_transpose.c
> index dd54947bd9..16ac6c311a 100644
> --- a/libavfilter/vf_transpose.c
> +++ b/libavfilter/vf_transpose.c
> @@ -40,14 +40,6 @@
>  #include "video.h"
>  #include "transpose.h"
>  
> -typedef struct TransVtable {
> -    void (*transpose_8x8)(uint8_t *src, ptrdiff_t src_linesize,
> -                          uint8_t *dst, ptrdiff_t dst_linesize);
> -    void (*transpose_block)(uint8_t *src, ptrdiff_t src_linesize,
> -                            uint8_t *dst, ptrdiff_t dst_linesize,
> -                            int w, int h);
> -} TransVtable;
> -
>  typedef struct TransContext {
>      const AVClass *class;
>      int hsub, vsub;
> @@ -243,7 +235,15 @@ static int config_props_output(AVFilterLink *outlink)
>          }
>      }
>  
> -    av_log(ctx, AV_LOG_VERBOSE,
> +    if (ARCH_X86) {
> +        for (int i = 0; i < 4; i++) {
> +            TransVtable *v = &s->vtables[i];
> +
> +            ff_transpose_init_x86(v, s->pixsteps[i]);
> +        }
> +    }
> +
> +     av_log(ctx, AV_LOG_VERBOSE,
>             "w:%d h:%d dir:%d -> w:%d h:%d rotation:%s vflip:%d\n",
>             inlink->w, inlink->h, s->dir, outlink->w, outlink->h,
>             s->dir == 1 || s->dir == 3 ? "clockwise" : "counterclockwise",
> diff --git a/libavfilter/x86/Makefile b/libavfilter/x86/Makefile
> index 06f832e36c..8d97e46c3f 100644
> --- a/libavfilter/x86/Makefile
> +++ b/libavfilter/x86/Makefile
> @@ -31,6 +31,7 @@ OBJS-$(CONFIG_STEREO3D_FILTER)               += x86/vf_stereo3d_init.o
>  OBJS-$(CONFIG_TBLEND_FILTER)                 += x86/vf_blend_init.o
>  OBJS-$(CONFIG_THRESHOLD_FILTER)              += x86/vf_threshold_init.o
>  OBJS-$(CONFIG_TINTERLACE_FILTER)             += x86/vf_tinterlace_init.o
> +OBJS-$(CONFIG_TRANSPOSE_FILTER)              += x86/vf_transpose_init.o
>  OBJS-$(CONFIG_VOLUME_FILTER)                 += x86/af_volume_init.o
>  OBJS-$(CONFIG_V360_FILTER)                   += x86/vf_v360_init.o
>  OBJS-$(CONFIG_W3FDIF_FILTER)                 += x86/vf_w3fdif_init.o
> @@ -69,6 +70,7 @@ X86ASM-OBJS-$(CONFIG_STEREO3D_FILTER)        += x86/vf_stereo3d.o
>  X86ASM-OBJS-$(CONFIG_TBLEND_FILTER)          += x86/vf_blend.o
>  X86ASM-OBJS-$(CONFIG_THRESHOLD_FILTER)       += x86/vf_threshold.o
>  X86ASM-OBJS-$(CONFIG_TINTERLACE_FILTER)      += x86/vf_interlace.o
> +X86ASM-OBJS-$(CONFIG_TRANSPOSE_FILTER)       += x86/vf_transpose.o
>  X86ASM-OBJS-$(CONFIG_VOLUME_FILTER)          += x86/af_volume.o
>  X86ASM-OBJS-$(CONFIG_V360_FILTER)            += x86/vf_v360.o
>  X86ASM-OBJS-$(CONFIG_W3FDIF_FILTER)          += x86/vf_w3fdif.o
> diff --git a/libavfilter/x86/vf_transpose.asm b/libavfilter/x86/vf_transpose.asm
> new file mode 100644
> index 0000000000..6d925d5d97
> --- /dev/null
> +++ b/libavfilter/x86/vf_transpose.asm
> @@ -0,0 +1,104 @@
> +;*****************************************************************************
> +;* x86-optimized functions for transpose filter
> +;*
> +;* Copyright (C) 2019 Paul B Mahol
> +;*
> +;* This file is part of FFmpeg.
> +;*
> +;* FFmpeg is free software; you can redistribute it and/or
> +;* modify it under the terms of the GNU Lesser General Public
> +;* License as published by the Free Software Foundation; either
> +;* version 2.1 of the License, or (at your option) any later version.
> +;*
> +;* FFmpeg is distributed in the hope that it will be useful,
> +;* but WITHOUT ANY WARRANTY; without even the implied warranty of
> +;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> +;* Lesser General Public License for more details.
> +;*
> +;* You should have received a copy of the GNU Lesser General Public
> +;* License along with FFmpeg; if not, write to the Free Software
> +;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
> +;******************************************************************************
> +
> +%include "libavutil/x86/x86util.asm"
> +
> +SECTION .text
> +
> +;------------------------------------------------------------------------------
> +; void ff_transpose_8x8(uint8_t *src, ptrdiff_t src_linesize,
> +;                       uint8_t *dst, ptrdiff_t dst_linesize)
> +;------------------------------------------------------------------------------
> +
> +INIT_XMM sse4
> +cglobal transpose_8x8_8, 4,4,8, src, src_linesize, dst, dst_linesize
> +    movu    m0, [srcq]
> +    add    srcq, src_linesizeq
> +    movu    m1, [srcq]
> +    add    srcq, src_linesizeq
> +    movu    m2, [srcq]
> +    add    srcq, src_linesizeq
> +    movu    m3, [srcq]
> +    add    srcq, src_linesizeq
> +    movu    m4, [srcq]
> +    add    srcq, src_linesizeq
> +    movu    m5, [srcq]
> +    add    srcq, src_linesizeq
> +    movu    m6, [srcq]
> +    add    srcq, src_linesizeq
> +    movu    m7, [srcq]
> +
> +    TRANSPOSE_8X8B 0, 1, 2, 3, 4, 5, 6, 7
> +
> +    movq [dstq], m0
> +    add    dstq, dst_linesizeq
> +    movq [dstq], m1
> +    add    dstq, dst_linesizeq
> +    movq [dstq], m2
> +    add    dstq, dst_linesizeq
> +    movq [dstq], m3
> +    add    dstq, dst_linesizeq
> +    movq [dstq], m4
> +    add    dstq, dst_linesizeq
> +    movq [dstq], m5
> +    add    dstq, dst_linesizeq
> +    movq [dstq], m6
> +    add    dstq, dst_linesizeq
> +    movq [dstq], m7
> +    RET

    lea linesize3q, [src_linesizeq*3]
    movu            m0, [srcq+src_linesizeq*0]
    movu            m1, [srcq+src_linesizeq*1]
    movu            m2, [srcq+src_linesizeq*2]
    movu            m3, [srcq+linesize3q]
    lea           srcq, [srcq+src_linesizeq*4]
    movu            m4, [srcq+src_linesizeq*0]
    movu            m5, [srcq+src_linesizeq*1]
    movu            m6, [srcq+src_linesizeq*2]
    movu            m7, [srcq+linesize3q]

    TRANSPOSE_8X8B 0, 1, 2, 3, 4, 5, 6, 7

    lea linesize3q, [dst_linesizeq*3]
    movu [dstq+dst_linesizeq*0], m0
    movu [dstq+dst_linesizeq*1], m1
    movu [dstq+dst_linesizeq*2], m2
    movu      [dstq+linesize3q], m3
    lea                    dstq, [dstq+dst_linesizeq*4]
    movu [dstq+dst_linesizeq*0], m4
    movu [dstq+dst_linesizeq*1], m5
    movu [dstq+dst_linesizeq*2], m6
    movu      [dstq+linesize3q], m7
    RET

Then something similar for the function below.

> +
> +INIT_XMM sse4
> +cglobal transpose_8x8_16, 4,4,9, src, src_linesize, dst, dst_linesize
> +    movu    m0, [srcq]
> +    add    srcq, src_linesizeq
> +    movu    m1, [srcq]
> +    add    srcq, src_linesizeq
> +    movu    m2, [srcq]
> +    add    srcq, src_linesizeq
> +    movu    m3, [srcq]
> +    add    srcq, src_linesizeq
> +    movu    m4, [srcq]
> +    add    srcq, src_linesizeq
> +    movu    m5, [srcq]
> +    add    srcq, src_linesizeq
> +    movu    m6, [srcq]
> +    add    srcq, src_linesizeq
> +    movu    m7, [srcq]
> +
> +    TRANSPOSE8x8W 0, 1, 2, 3, 4, 5, 6, 7, 8
> +
> +    movu [dstq], m0
> +    add    dstq, dst_linesizeq
> +    movu [dstq], m1
> +    add    dstq, dst_linesizeq
> +    movu [dstq], m2
> +    add    dstq, dst_linesizeq
> +    movu [dstq], m3
> +    add    dstq, dst_linesizeq
> +    movu [dstq], m4
> +    add    dstq, dst_linesizeq
> +    movu [dstq], m5
> +    add    dstq, dst_linesizeq
> +    movu [dstq], m6
> +    add    dstq, dst_linesizeq
> +    movu [dstq], m7
> +    RET
> diff --git a/libavfilter/x86/vf_transpose_init.c b/libavfilter/x86/vf_transpose_init.c
> new file mode 100644
> index 0000000000..4f5acd5e56
> --- /dev/null
> +++ b/libavfilter/x86/vf_transpose_init.c
> @@ -0,0 +1,49 @@
> +/*
> + * Copyright (C) 2019 Paul B Mahol
> + *
> + * This file is part of FFmpeg.
> + *
> + * FFmpeg is free software; you can redistribute it and/or
> + * modify it under the terms of the GNU Lesser General Public
> + * License as published by the Free Software Foundation; either
> + * version 2.1 of the License, or (at your option) any later version.
> + *
> + * FFmpeg is distributed in the hope that it will be useful,
> + * but WITHOUT ANY WARRANTY; without even the implied warranty of
> + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> + * Lesser General Public License for more details.
> + *
> + * You should have received a copy of the GNU Lesser General Public
> + * License along with FFmpeg; if not, write to the Free Software
> + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
> + */
> +
> +#include "libavutil/attributes.h"
> +#include "libavutil/cpu.h"
> +#include "libavutil/mem.h"
> +#include "libavutil/x86/asm.h"
> +#include "libavutil/x86/cpu.h"
> +#include "libavfilter/transpose.h"
> +
> +void ff_transpose_8x8_8_sse4(uint8_t *src,
> +                             ptrdiff_t src_linesize,
> +                             uint8_t *dst,
> +                             ptrdiff_t dst_linesize);
> +
> +void ff_transpose_8x8_16_sse4(uint8_t *src,
> +                              ptrdiff_t src_linesize,
> +                              uint8_t *dst,
> +                              ptrdiff_t dst_linesize);
> +
> +av_cold void ff_transpose_init_x86(TransVtable *v, int pixstep)
> +{
> +    int cpu_flags = av_get_cpu_flags();
> +
> +    if (EXTERNAL_SSE4(cpu_flags) && pixstep == 1) {
> +        v->transpose_8x8 = ff_transpose_8x8_8_sse4;
> +    }
> +
> +    if (EXTERNAL_SSE4(cpu_flags) && pixstep == 2) {
> +        v->transpose_8x8 = ff_transpose_8x8_16_sse4;
> +    }
> +}
>
James Almer Oct. 21, 2019, 5:01 p.m.
On 10/21/2019 12:45 PM, Paul B Mahol wrote:
> Signed-off-by: Paul B Mahol <onemda@gmail.com>
> ---
>  libavfilter/transpose.h             |  10 +++
>  libavfilter/vf_transpose.c          |  18 ++---
>  libavfilter/x86/Makefile            |   2 +
>  libavfilter/x86/vf_transpose.asm    | 104 ++++++++++++++++++++++++++++
>  libavfilter/x86/vf_transpose_init.c |  49 +++++++++++++
>  5 files changed, 174 insertions(+), 9 deletions(-)
>  create mode 100644 libavfilter/x86/vf_transpose.asm
>  create mode 100644 libavfilter/x86/vf_transpose_init.c
> 
> diff --git a/libavfilter/transpose.h b/libavfilter/transpose.h
> index aa262b9487..f73a42864f 100644
> --- a/libavfilter/transpose.h
> +++ b/libavfilter/transpose.h
> @@ -34,4 +34,14 @@ enum TransposeDir {
>      TRANSPOSE_VFLIP,
>  };
>  
> +typedef struct TransVtable {
> +    void (*transpose_8x8)(uint8_t *src, ptrdiff_t src_linesize,
> +                          uint8_t *dst, ptrdiff_t dst_linesize);
> +    void (*transpose_block)(uint8_t *src, ptrdiff_t src_linesize,
> +                            uint8_t *dst, ptrdiff_t dst_linesize,
> +                            int w, int h);
> +} TransVtable;
> +
> +void ff_transpose_init_x86(TransVtable *v, int pixstep);
> +
>  #endif
> diff --git a/libavfilter/vf_transpose.c b/libavfilter/vf_transpose.c
> index dd54947bd9..16ac6c311a 100644
> --- a/libavfilter/vf_transpose.c
> +++ b/libavfilter/vf_transpose.c
> @@ -40,14 +40,6 @@
>  #include "video.h"
>  #include "transpose.h"
>  
> -typedef struct TransVtable {
> -    void (*transpose_8x8)(uint8_t *src, ptrdiff_t src_linesize,
> -                          uint8_t *dst, ptrdiff_t dst_linesize);
> -    void (*transpose_block)(uint8_t *src, ptrdiff_t src_linesize,
> -                            uint8_t *dst, ptrdiff_t dst_linesize,
> -                            int w, int h);
> -} TransVtable;
> -
>  typedef struct TransContext {
>      const AVClass *class;
>      int hsub, vsub;
> @@ -243,7 +235,15 @@ static int config_props_output(AVFilterLink *outlink)
>          }
>      }
>  
> -    av_log(ctx, AV_LOG_VERBOSE,
> +    if (ARCH_X86) {
> +        for (int i = 0; i < 4; i++) {
> +            TransVtable *v = &s->vtables[i];
> +
> +            ff_transpose_init_x86(v, s->pixsteps[i]);
> +        }
> +    }
> +
> +     av_log(ctx, AV_LOG_VERBOSE,
>             "w:%d h:%d dir:%d -> w:%d h:%d rotation:%s vflip:%d\n",
>             inlink->w, inlink->h, s->dir, outlink->w, outlink->h,
>             s->dir == 1 || s->dir == 3 ? "clockwise" : "counterclockwise",
> diff --git a/libavfilter/x86/Makefile b/libavfilter/x86/Makefile
> index 06f832e36c..8d97e46c3f 100644
> --- a/libavfilter/x86/Makefile
> +++ b/libavfilter/x86/Makefile
> @@ -31,6 +31,7 @@ OBJS-$(CONFIG_STEREO3D_FILTER)               += x86/vf_stereo3d_init.o
>  OBJS-$(CONFIG_TBLEND_FILTER)                 += x86/vf_blend_init.o
>  OBJS-$(CONFIG_THRESHOLD_FILTER)              += x86/vf_threshold_init.o
>  OBJS-$(CONFIG_TINTERLACE_FILTER)             += x86/vf_tinterlace_init.o
> +OBJS-$(CONFIG_TRANSPOSE_FILTER)              += x86/vf_transpose_init.o
>  OBJS-$(CONFIG_VOLUME_FILTER)                 += x86/af_volume_init.o
>  OBJS-$(CONFIG_V360_FILTER)                   += x86/vf_v360_init.o
>  OBJS-$(CONFIG_W3FDIF_FILTER)                 += x86/vf_w3fdif_init.o
> @@ -69,6 +70,7 @@ X86ASM-OBJS-$(CONFIG_STEREO3D_FILTER)        += x86/vf_stereo3d.o
>  X86ASM-OBJS-$(CONFIG_TBLEND_FILTER)          += x86/vf_blend.o
>  X86ASM-OBJS-$(CONFIG_THRESHOLD_FILTER)       += x86/vf_threshold.o
>  X86ASM-OBJS-$(CONFIG_TINTERLACE_FILTER)      += x86/vf_interlace.o
> +X86ASM-OBJS-$(CONFIG_TRANSPOSE_FILTER)       += x86/vf_transpose.o
>  X86ASM-OBJS-$(CONFIG_VOLUME_FILTER)          += x86/af_volume.o
>  X86ASM-OBJS-$(CONFIG_V360_FILTER)            += x86/vf_v360.o
>  X86ASM-OBJS-$(CONFIG_W3FDIF_FILTER)          += x86/vf_w3fdif.o
> diff --git a/libavfilter/x86/vf_transpose.asm b/libavfilter/x86/vf_transpose.asm
> new file mode 100644
> index 0000000000..6d925d5d97
> --- /dev/null
> +++ b/libavfilter/x86/vf_transpose.asm
> @@ -0,0 +1,104 @@
> +;*****************************************************************************
> +;* x86-optimized functions for transpose filter
> +;*
> +;* Copyright (C) 2019 Paul B Mahol
> +;*
> +;* This file is part of FFmpeg.
> +;*
> +;* FFmpeg is free software; you can redistribute it and/or
> +;* modify it under the terms of the GNU Lesser General Public
> +;* License as published by the Free Software Foundation; either
> +;* version 2.1 of the License, or (at your option) any later version.
> +;*
> +;* FFmpeg is distributed in the hope that it will be useful,
> +;* but WITHOUT ANY WARRANTY; without even the implied warranty of
> +;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> +;* Lesser General Public License for more details.
> +;*
> +;* You should have received a copy of the GNU Lesser General Public
> +;* License along with FFmpeg; if not, write to the Free Software
> +;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
> +;******************************************************************************
> +
> +%include "libavutil/x86/x86util.asm"
> +
> +SECTION .text
> +
> +;------------------------------------------------------------------------------
> +; void ff_transpose_8x8(uint8_t *src, ptrdiff_t src_linesize,
> +;                       uint8_t *dst, ptrdiff_t dst_linesize)
> +;------------------------------------------------------------------------------
> +
> +INIT_XMM sse4
> +cglobal transpose_8x8_8, 4,4,8, src, src_linesize, dst, dst_linesize
> +    movu    m0, [srcq]
> +    add    srcq, src_linesizeq
> +    movu    m1, [srcq]
> +    add    srcq, src_linesizeq
> +    movu    m2, [srcq]
> +    add    srcq, src_linesizeq
> +    movu    m3, [srcq]
> +    add    srcq, src_linesizeq
> +    movu    m4, [srcq]
> +    add    srcq, src_linesizeq
> +    movu    m5, [srcq]
> +    add    srcq, src_linesizeq
> +    movu    m6, [srcq]
> +    add    srcq, src_linesizeq
> +    movu    m7, [srcq]
> +
> +    TRANSPOSE_8X8B 0, 1, 2, 3, 4, 5, 6, 7
> +
> +    movq [dstq], m0
> +    add    dstq, dst_linesizeq
> +    movq [dstq], m1
> +    add    dstq, dst_linesizeq
> +    movq [dstq], m2
> +    add    dstq, dst_linesizeq
> +    movq [dstq], m3
> +    add    dstq, dst_linesizeq
> +    movq [dstq], m4
> +    add    dstq, dst_linesizeq
> +    movq [dstq], m5
> +    add    dstq, dst_linesizeq
> +    movq [dstq], m6
> +    add    dstq, dst_linesizeq
> +    movq [dstq], m7
> +    RET
> +
> +INIT_XMM sse4
> +cglobal transpose_8x8_16, 4,4,9, src, src_linesize, dst, dst_linesize
> +    movu    m0, [srcq]
> +    add    srcq, src_linesizeq
> +    movu    m1, [srcq]
> +    add    srcq, src_linesizeq
> +    movu    m2, [srcq]
> +    add    srcq, src_linesizeq
> +    movu    m3, [srcq]
> +    add    srcq, src_linesizeq
> +    movu    m4, [srcq]
> +    add    srcq, src_linesizeq
> +    movu    m5, [srcq]
> +    add    srcq, src_linesizeq
> +    movu    m6, [srcq]
> +    add    srcq, src_linesizeq
> +    movu    m7, [srcq]
> +
> +    TRANSPOSE8x8W 0, 1, 2, 3, 4, 5, 6, 7, 8

For x86_32 this needs memory arguments (either reserved stack space, or
the dst buffer when it's aligned); otherwise it will not compile.

If you don't want to do it, then just wrap this one function with x86_64
preprocessor checks here and below.

> +
> +    movu [dstq], m0
> +    add    dstq, dst_linesizeq
> +    movu [dstq], m1
> +    add    dstq, dst_linesizeq
> +    movu [dstq], m2
> +    add    dstq, dst_linesizeq
> +    movu [dstq], m3
> +    add    dstq, dst_linesizeq
> +    movu [dstq], m4
> +    add    dstq, dst_linesizeq
> +    movu [dstq], m5
> +    add    dstq, dst_linesizeq
> +    movu [dstq], m6
> +    add    dstq, dst_linesizeq
> +    movu [dstq], m7
> +    RET
> diff --git a/libavfilter/x86/vf_transpose_init.c b/libavfilter/x86/vf_transpose_init.c
> new file mode 100644
> index 0000000000..4f5acd5e56
> --- /dev/null
> +++ b/libavfilter/x86/vf_transpose_init.c
> @@ -0,0 +1,49 @@
> +/*
> + * Copyright (C) 2019 Paul B Mahol
> + *
> + * This file is part of FFmpeg.
> + *
> + * FFmpeg is free software; you can redistribute it and/or
> + * modify it under the terms of the GNU Lesser General Public
> + * License as published by the Free Software Foundation; either
> + * version 2.1 of the License, or (at your option) any later version.
> + *
> + * FFmpeg is distributed in the hope that it will be useful,
> + * but WITHOUT ANY WARRANTY; without even the implied warranty of
> + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> + * Lesser General Public License for more details.
> + *
> + * You should have received a copy of the GNU Lesser General Public
> + * License along with FFmpeg; if not, write to the Free Software
> + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
> + */
> +
> +#include "libavutil/attributes.h"
> +#include "libavutil/cpu.h"
> +#include "libavutil/mem.h"
> +#include "libavutil/x86/asm.h"
> +#include "libavutil/x86/cpu.h"
> +#include "libavfilter/transpose.h"
> +
> +void ff_transpose_8x8_8_sse4(uint8_t *src,
> +                             ptrdiff_t src_linesize,
> +                             uint8_t *dst,
> +                             ptrdiff_t dst_linesize);
> +
> +void ff_transpose_8x8_16_sse4(uint8_t *src,
> +                              ptrdiff_t src_linesize,
> +                              uint8_t *dst,
> +                              ptrdiff_t dst_linesize);
> +
> +av_cold void ff_transpose_init_x86(TransVtable *v, int pixstep)
> +{
> +    int cpu_flags = av_get_cpu_flags();
> +
> +    if (EXTERNAL_SSE4(cpu_flags) && pixstep == 1) {
> +        v->transpose_8x8 = ff_transpose_8x8_8_sse4;
> +    }
> +
> +    if (EXTERNAL_SSE4(cpu_flags) && pixstep == 2) {
> +        v->transpose_8x8 = ff_transpose_8x8_16_sse4;
> +    }
> +}
>
James Almer Oct. 21, 2019, 5:29 p.m.
On 10/21/2019 1:36 PM, James Almer wrote:
> On 10/21/2019 12:45 PM, Paul B Mahol wrote:
>> Signed-off-by: Paul B Mahol <onemda@gmail.com>
>> ---
>>  libavfilter/transpose.h             |  10 +++
>>  libavfilter/vf_transpose.c          |  18 ++---
>>  libavfilter/x86/Makefile            |   2 +
>>  libavfilter/x86/vf_transpose.asm    | 104 ++++++++++++++++++++++++++++
>>  libavfilter/x86/vf_transpose_init.c |  49 +++++++++++++
>>  5 files changed, 174 insertions(+), 9 deletions(-)
>>  create mode 100644 libavfilter/x86/vf_transpose.asm
>>  create mode 100644 libavfilter/x86/vf_transpose_init.c
>>
>> diff --git a/libavfilter/transpose.h b/libavfilter/transpose.h
>> index aa262b9487..f73a42864f 100644
>> --- a/libavfilter/transpose.h
>> +++ b/libavfilter/transpose.h
>> @@ -34,4 +34,14 @@ enum TransposeDir {
>>      TRANSPOSE_VFLIP,
>>  };
>>  
>> +typedef struct TransVtable {
>> +    void (*transpose_8x8)(uint8_t *src, ptrdiff_t src_linesize,
>> +                          uint8_t *dst, ptrdiff_t dst_linesize);
>> +    void (*transpose_block)(uint8_t *src, ptrdiff_t src_linesize,
>> +                            uint8_t *dst, ptrdiff_t dst_linesize,
>> +                            int w, int h);
>> +} TransVtable;
>> +
>> +void ff_transpose_init_x86(TransVtable *v, int pixstep);
>> +
>>  #endif
>> diff --git a/libavfilter/vf_transpose.c b/libavfilter/vf_transpose.c
>> index dd54947bd9..16ac6c311a 100644
>> --- a/libavfilter/vf_transpose.c
>> +++ b/libavfilter/vf_transpose.c
>> @@ -40,14 +40,6 @@
>>  #include "video.h"
>>  #include "transpose.h"
>>  
>> -typedef struct TransVtable {
>> -    void (*transpose_8x8)(uint8_t *src, ptrdiff_t src_linesize,
>> -                          uint8_t *dst, ptrdiff_t dst_linesize);
>> -    void (*transpose_block)(uint8_t *src, ptrdiff_t src_linesize,
>> -                            uint8_t *dst, ptrdiff_t dst_linesize,
>> -                            int w, int h);
>> -} TransVtable;
>> -
>>  typedef struct TransContext {
>>      const AVClass *class;
>>      int hsub, vsub;
>> @@ -243,7 +235,15 @@ static int config_props_output(AVFilterLink *outlink)
>>          }
>>      }
>>  
>> -    av_log(ctx, AV_LOG_VERBOSE,
>> +    if (ARCH_X86) {
>> +        for (int i = 0; i < 4; i++) {
>> +            TransVtable *v = &s->vtables[i];
>> +
>> +            ff_transpose_init_x86(v, s->pixsteps[i]);
>> +        }
>> +    }
>> +
>> +     av_log(ctx, AV_LOG_VERBOSE,
>>             "w:%d h:%d dir:%d -> w:%d h:%d rotation:%s vflip:%d\n",
>>             inlink->w, inlink->h, s->dir, outlink->w, outlink->h,
>>             s->dir == 1 || s->dir == 3 ? "clockwise" : "counterclockwise",
>> diff --git a/libavfilter/x86/Makefile b/libavfilter/x86/Makefile
>> index 06f832e36c..8d97e46c3f 100644
>> --- a/libavfilter/x86/Makefile
>> +++ b/libavfilter/x86/Makefile
>> @@ -31,6 +31,7 @@ OBJS-$(CONFIG_STEREO3D_FILTER)               += x86/vf_stereo3d_init.o
>>  OBJS-$(CONFIG_TBLEND_FILTER)                 += x86/vf_blend_init.o
>>  OBJS-$(CONFIG_THRESHOLD_FILTER)              += x86/vf_threshold_init.o
>>  OBJS-$(CONFIG_TINTERLACE_FILTER)             += x86/vf_tinterlace_init.o
>> +OBJS-$(CONFIG_TRANSPOSE_FILTER)              += x86/vf_transpose_init.o
>>  OBJS-$(CONFIG_VOLUME_FILTER)                 += x86/af_volume_init.o
>>  OBJS-$(CONFIG_V360_FILTER)                   += x86/vf_v360_init.o
>>  OBJS-$(CONFIG_W3FDIF_FILTER)                 += x86/vf_w3fdif_init.o
>> @@ -69,6 +70,7 @@ X86ASM-OBJS-$(CONFIG_STEREO3D_FILTER)        += x86/vf_stereo3d.o
>>  X86ASM-OBJS-$(CONFIG_TBLEND_FILTER)          += x86/vf_blend.o
>>  X86ASM-OBJS-$(CONFIG_THRESHOLD_FILTER)       += x86/vf_threshold.o
>>  X86ASM-OBJS-$(CONFIG_TINTERLACE_FILTER)      += x86/vf_interlace.o
>> +X86ASM-OBJS-$(CONFIG_TRANSPOSE_FILTER)       += x86/vf_transpose.o
>>  X86ASM-OBJS-$(CONFIG_VOLUME_FILTER)          += x86/af_volume.o
>>  X86ASM-OBJS-$(CONFIG_V360_FILTER)            += x86/vf_v360.o
>>  X86ASM-OBJS-$(CONFIG_W3FDIF_FILTER)          += x86/vf_w3fdif.o
>> diff --git a/libavfilter/x86/vf_transpose.asm b/libavfilter/x86/vf_transpose.asm
>> new file mode 100644
>> index 0000000000..6d925d5d97
>> --- /dev/null
>> +++ b/libavfilter/x86/vf_transpose.asm
>> @@ -0,0 +1,104 @@
>> +;*****************************************************************************
>> +;* x86-optimized functions for transpose filter
>> +;*
>> +;* Copyright (C) 2019 Paul B Mahol
>> +;*
>> +;* This file is part of FFmpeg.
>> +;*
>> +;* FFmpeg is free software; you can redistribute it and/or
>> +;* modify it under the terms of the GNU Lesser General Public
>> +;* License as published by the Free Software Foundation; either
>> +;* version 2.1 of the License, or (at your option) any later version.
>> +;*
>> +;* FFmpeg is distributed in the hope that it will be useful,
>> +;* but WITHOUT ANY WARRANTY; without even the implied warranty of
>> +;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
>> +;* Lesser General Public License for more details.
>> +;*
>> +;* You should have received a copy of the GNU Lesser General Public
>> +;* License along with FFmpeg; if not, write to the Free Software
>> +;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
>> +;******************************************************************************
>> +
>> +%include "libavutil/x86/x86util.asm"
>> +
>> +SECTION .text
>> +
>> +;------------------------------------------------------------------------------
>> +; void ff_transpose_8x8(uint8_t *src, ptrdiff_t src_linesize,
>> +;                       uint8_t *dst, ptrdiff_t dst_linesize)
>> +;------------------------------------------------------------------------------
>> +
>> +INIT_XMM sse4
>> +cglobal transpose_8x8_8, 4,4,8, src, src_linesize, dst, dst_linesize
>> +    movu    m0, [srcq]
>> +    add    srcq, src_linesizeq
>> +    movu    m1, [srcq]
>> +    add    srcq, src_linesizeq
>> +    movu    m2, [srcq]
>> +    add    srcq, src_linesizeq
>> +    movu    m3, [srcq]
>> +    add    srcq, src_linesizeq
>> +    movu    m4, [srcq]
>> +    add    srcq, src_linesizeq
>> +    movu    m5, [srcq]
>> +    add    srcq, src_linesizeq
>> +    movu    m6, [srcq]
>> +    add    srcq, src_linesizeq
>> +    movu    m7, [srcq]
>> +
>> +    TRANSPOSE_8X8B 0, 1, 2, 3, 4, 5, 6, 7

This macro seems to only use the lower eight bytes of each register, for
both input and output, so I think you can replace the movu's with movq's.

>> +
>> +    movq [dstq], m0
>> +    add    dstq, dst_linesizeq
>> +    movq [dstq], m1
>> +    add    dstq, dst_linesizeq
>> +    movq [dstq], m2
>> +    add    dstq, dst_linesizeq
>> +    movq [dstq], m3
>> +    add    dstq, dst_linesizeq
>> +    movq [dstq], m4
>> +    add    dstq, dst_linesizeq
>> +    movq [dstq], m5
>> +    add    dstq, dst_linesizeq
>> +    movq [dstq], m6
>> +    add    dstq, dst_linesizeq
>> +    movq [dstq], m7
>> +    RET
> 
>     lea linesize3q, [src_linesizeq*3]
>     movu            m0, [srcq+src_linesizeq*0]
>     movu            m1, [srcq+src_linesizeq*1]
>     movu            m2, [srcq+src_linesizeq*2]
>     movu            m3, [srcq+linesize3q]
>     lea           srcq, [srcq+src_linesizeq*4]
>     movu            m4, [srcq+src_linesizeq*0]
>     movu            m5, [srcq+src_linesizeq*1]
>     movu            m6, [srcq+src_linesizeq*2]
>     movu            m7, [srcq+linesize3q]
> 
>     TRANSPOSE_8X8B 0, 1, 2, 3, 4, 5, 6, 7
> 
>     lea linesize3q, [dst_linesizeq*3]
>     movu [dstq+dst_linesizeq*0], m0
>     movu [dstq+dst_linesizeq*1], m1
>     movu [dstq+dst_linesizeq*2], m2
>     movu      [dstq+linesize3q], m3
>     lea                    dstq, [dstq+dst_linesizeq*4]
>     movu [dstq+dst_linesizeq*0], m4
>     movu [dstq+dst_linesizeq*1], m5
>     movu [dstq+dst_linesizeq*2], m6
>     movu      [dstq+linesize3q], m7
>     RET

Also, I obviously screwed up by using movu here.

Patch hide | download patch | download mbox

diff --git a/libavfilter/transpose.h b/libavfilter/transpose.h
index aa262b9487..f73a42864f 100644
--- a/libavfilter/transpose.h
+++ b/libavfilter/transpose.h
@@ -34,4 +34,14 @@  enum TransposeDir {
     TRANSPOSE_VFLIP,
 };
 
+typedef struct TransVtable {
+    void (*transpose_8x8)(uint8_t *src, ptrdiff_t src_linesize,
+                          uint8_t *dst, ptrdiff_t dst_linesize);
+    void (*transpose_block)(uint8_t *src, ptrdiff_t src_linesize,
+                            uint8_t *dst, ptrdiff_t dst_linesize,
+                            int w, int h);
+} TransVtable;
+
+void ff_transpose_init_x86(TransVtable *v, int pixstep);
+
 #endif
diff --git a/libavfilter/vf_transpose.c b/libavfilter/vf_transpose.c
index dd54947bd9..16ac6c311a 100644
--- a/libavfilter/vf_transpose.c
+++ b/libavfilter/vf_transpose.c
@@ -40,14 +40,6 @@ 
 #include "video.h"
 #include "transpose.h"
 
-typedef struct TransVtable {
-    void (*transpose_8x8)(uint8_t *src, ptrdiff_t src_linesize,
-                          uint8_t *dst, ptrdiff_t dst_linesize);
-    void (*transpose_block)(uint8_t *src, ptrdiff_t src_linesize,
-                            uint8_t *dst, ptrdiff_t dst_linesize,
-                            int w, int h);
-} TransVtable;
-
 typedef struct TransContext {
     const AVClass *class;
     int hsub, vsub;
@@ -243,7 +235,15 @@  static int config_props_output(AVFilterLink *outlink)
         }
     }
 
-    av_log(ctx, AV_LOG_VERBOSE,
+    if (ARCH_X86) {
+        for (int i = 0; i < 4; i++) {
+            TransVtable *v = &s->vtables[i];
+
+            ff_transpose_init_x86(v, s->pixsteps[i]);
+        }
+    }
+
+     av_log(ctx, AV_LOG_VERBOSE,
            "w:%d h:%d dir:%d -> w:%d h:%d rotation:%s vflip:%d\n",
            inlink->w, inlink->h, s->dir, outlink->w, outlink->h,
            s->dir == 1 || s->dir == 3 ? "clockwise" : "counterclockwise",
diff --git a/libavfilter/x86/Makefile b/libavfilter/x86/Makefile
index 06f832e36c..8d97e46c3f 100644
--- a/libavfilter/x86/Makefile
+++ b/libavfilter/x86/Makefile
@@ -31,6 +31,7 @@  OBJS-$(CONFIG_STEREO3D_FILTER)               += x86/vf_stereo3d_init.o
 OBJS-$(CONFIG_TBLEND_FILTER)                 += x86/vf_blend_init.o
 OBJS-$(CONFIG_THRESHOLD_FILTER)              += x86/vf_threshold_init.o
 OBJS-$(CONFIG_TINTERLACE_FILTER)             += x86/vf_tinterlace_init.o
+OBJS-$(CONFIG_TRANSPOSE_FILTER)              += x86/vf_transpose_init.o
 OBJS-$(CONFIG_VOLUME_FILTER)                 += x86/af_volume_init.o
 OBJS-$(CONFIG_V360_FILTER)                   += x86/vf_v360_init.o
 OBJS-$(CONFIG_W3FDIF_FILTER)                 += x86/vf_w3fdif_init.o
@@ -69,6 +70,7 @@  X86ASM-OBJS-$(CONFIG_STEREO3D_FILTER)        += x86/vf_stereo3d.o
 X86ASM-OBJS-$(CONFIG_TBLEND_FILTER)          += x86/vf_blend.o
 X86ASM-OBJS-$(CONFIG_THRESHOLD_FILTER)       += x86/vf_threshold.o
 X86ASM-OBJS-$(CONFIG_TINTERLACE_FILTER)      += x86/vf_interlace.o
+X86ASM-OBJS-$(CONFIG_TRANSPOSE_FILTER)       += x86/vf_transpose.o
 X86ASM-OBJS-$(CONFIG_VOLUME_FILTER)          += x86/af_volume.o
 X86ASM-OBJS-$(CONFIG_V360_FILTER)            += x86/vf_v360.o
 X86ASM-OBJS-$(CONFIG_W3FDIF_FILTER)          += x86/vf_w3fdif.o
diff --git a/libavfilter/x86/vf_transpose.asm b/libavfilter/x86/vf_transpose.asm
new file mode 100644
index 0000000000..6d925d5d97
--- /dev/null
+++ b/libavfilter/x86/vf_transpose.asm
@@ -0,0 +1,104 @@ 
+;*****************************************************************************
+;* x86-optimized functions for transpose filter
+;*
+;* Copyright (C) 2019 Paul B Mahol
+;*
+;* This file is part of FFmpeg.
+;*
+;* FFmpeg is free software; you can redistribute it and/or
+;* modify it under the terms of the GNU Lesser General Public
+;* License as published by the Free Software Foundation; either
+;* version 2.1 of the License, or (at your option) any later version.
+;*
+;* FFmpeg is distributed in the hope that it will be useful,
+;* but WITHOUT ANY WARRANTY; without even the implied warranty of
+;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+;* Lesser General Public License for more details.
+;*
+;* You should have received a copy of the GNU Lesser General Public
+;* License along with FFmpeg; if not, write to the Free Software
+;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+;******************************************************************************
+
+%include "libavutil/x86/x86util.asm"
+
+SECTION .text
+
+;------------------------------------------------------------------------------
+; void ff_transpose_8x8_{8,16}_sse4(uint8_t *src, ptrdiff_t src_linesize,
+;                                   uint8_t *dst, ptrdiff_t dst_linesize)
+;------------------------------------------------------------------------------
+
+INIT_XMM sse4 ; NOTE(review): body looks SSE2-only; suffix kept for the C init
+cglobal transpose_8x8_8, 4,4,8, src, src_linesize, dst, dst_linesize
+    movq    m0, [srcq]              ; a row of the block is 8 bytes; a 16-byte
+    add     srcq, src_linesizeq     ; movu would read 8 bytes past it
+    movq    m1, [srcq]
+    add     srcq, src_linesizeq
+    movq    m2, [srcq]
+    add     srcq, src_linesizeq
+    movq    m3, [srcq]
+    add     srcq, src_linesizeq
+    movq    m4, [srcq]
+    add     srcq, src_linesizeq
+    movq    m5, [srcq]
+    add     srcq, src_linesizeq
+    movq    m6, [srcq]
+    add     srcq, src_linesizeq
+    movq    m7, [srcq]              ; last row: srcq is not advanced again
+
+    TRANSPOSE_8X8B 0, 1, 2, 3, 4, 5, 6, 7
+
+    movq    [dstq], m0              ; write back 8 bytes per transposed row
+    add     dstq, dst_linesizeq
+    movq    [dstq], m1
+    add     dstq, dst_linesizeq
+    movq    [dstq], m2
+    add     dstq, dst_linesizeq
+    movq    [dstq], m3
+    add     dstq, dst_linesizeq
+    movq    [dstq], m4
+    add     dstq, dst_linesizeq
+    movq    [dstq], m5
+    add     dstq, dst_linesizeq
+    movq    [dstq], m6
+    add     dstq, dst_linesizeq
+    movq    [dstq], m7
+    RET
+
+; 8x8 transpose of 16-bit samples: each row fills one XMM register, so the
+; 16-byte loads/stores stay inside the block. TRANSPOSE8x8W needs a ninth
+; register as scratch; x86-32 has only xmm0-7, so two 16-byte stack slots
+; are reserved and handed to the macro there instead.
+INIT_XMM sse4
+cglobal transpose_8x8_16, 4,5,9, ARCH_X86_32 * 32, src, src_linesize, dst, dst_linesize, linesize3
+    ; load the eight source rows
+    lea     linesize3q, [src_linesizeq * 3]
+    movu    m0, [srcq + src_linesizeq * 0]
+    movu    m1, [srcq + src_linesizeq * 1]
+    movu    m2, [srcq + src_linesizeq * 2]
+    movu    m3, [srcq + linesize3q]
+    lea     srcq, [srcq + src_linesizeq * 4]
+    movu    m4, [srcq + src_linesizeq * 0]
+    movu    m5, [srcq + src_linesizeq * 1]
+    movu    m6, [srcq + src_linesizeq * 2]
+    movu    m7, [srcq + linesize3q]
+
+%if ARCH_X86_64
+    TRANSPOSE8x8W 0, 1, 2, 3, 4, 5, 6, 7, 8
+%else
+    TRANSPOSE8x8W 0, 1, 2, 3, 4, 5, 6, 7, [rsp], [rsp + 16]
+%endif
+
+    ; store the eight transposed rows
+    lea     linesize3q, [dst_linesizeq * 3]
+    movu    [dstq + dst_linesizeq * 0], m0
+    movu    [dstq + dst_linesizeq * 1], m1
+    movu    [dstq + dst_linesizeq * 2], m2
+    movu    [dstq + linesize3q], m3
+    lea     dstq, [dstq + dst_linesizeq * 4]
+    movu    [dstq + dst_linesizeq * 0], m4
+    movu    [dstq + dst_linesizeq * 1], m5
+    movu    [dstq + dst_linesizeq * 2], m6
+    movu    [dstq + linesize3q], m7
+    RET
diff --git a/libavfilter/x86/vf_transpose_init.c b/libavfilter/x86/vf_transpose_init.c
new file mode 100644
index 0000000000..4f5acd5e56
--- /dev/null
+++ b/libavfilter/x86/vf_transpose_init.c
@@ -0,0 +1,49 @@ 
+/*
+ * Copyright (C) 2019 Paul B Mahol
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "libavutil/attributes.h"
+#include "libavutil/cpu.h"
+#include "libavutil/mem.h"
+#include "libavutil/x86/asm.h"
+#include "libavutil/x86/cpu.h"
+#include "libavfilter/transpose.h"
+
+/* 8x8 block transposes implemented in x86/vf_transpose.asm.
+ * src/dst point at the first row of the block; linesizes are byte strides. */
+void ff_transpose_8x8_8_sse4(uint8_t *src, ptrdiff_t src_linesize,
+                             uint8_t *dst, ptrdiff_t dst_linesize);
+
+/* Same contract, for blocks whose samples are 2 bytes wide. */
+void ff_transpose_8x8_16_sse4(uint8_t *src, ptrdiff_t src_linesize,
+                              uint8_t *dst, ptrdiff_t dst_linesize);
+
+
+av_cold void ff_transpose_init_x86(TransVtable *v, int pixstep)
+{
+    const int flags = av_get_cpu_flags();
+
+    /* Only 1- and 2-byte pixel steps have a kernel; otherwise v is untouched. */
+    if (EXTERNAL_SSE4(flags)) {
+        if (pixstep == 1)
+            v->transpose_8x8 = ff_transpose_8x8_8_sse4;
+        else if (pixstep == 2)
+            v->transpose_8x8 = ff_transpose_8x8_16_sse4;
+    }
+}