diff mbox

[FFmpeg-devel,1/2] x86/vf_blend: add sse and ssse3 extremity functions

Message ID 20170627154553.4800-1-jamrial@gmail.com
State Accepted
Commit fa50d9360ba36ba2ee8f85f2c59e8d6af20e833a
Headers show

Commit Message

James Almer June 27, 2017, 3:45 p.m. UTC
Signed-off-by: James Almer <jamrial@gmail.com>
---
 libavfilter/x86/vf_blend.asm    | 25 +++++++++++++++++++++++++
 libavfilter/x86/vf_blend_init.c |  4 ++++
 tests/checkasm/vf_blend.c       |  1 +
 3 files changed, 30 insertions(+)

Comments

Paul B Mahol June 27, 2017, 3:50 p.m. UTC | #1
On 6/27/17, James Almer <jamrial@gmail.com> wrote:
> Signed-off-by: James Almer <jamrial@gmail.com>
> ---
>  libavfilter/x86/vf_blend.asm    | 25 +++++++++++++++++++++++++
>  libavfilter/x86/vf_blend_init.c |  4 ++++
>  tests/checkasm/vf_blend.c       |  1 +
>  3 files changed, 30 insertions(+)
>

LGTM. I have a couple more blend modes to add, which might not be
SIMDable.
James Almer June 27, 2017, 4:52 p.m. UTC | #2
On 6/27/2017 12:50 PM, Paul B Mahol wrote:
> On 6/27/17, James Almer <jamrial@gmail.com> wrote:
>> Signed-off-by: James Almer <jamrial@gmail.com>
>> ---
>>  libavfilter/x86/vf_blend.asm    | 25 +++++++++++++++++++++++++
>>  libavfilter/x86/vf_blend_init.c |  4 ++++
>>  tests/checkasm/vf_blend.c       |  1 +
>>  3 files changed, 30 insertions(+)
>>
> 
> LGTM. I have a couple more blend modes to add, which might not be
> SIMDable.

Pushed.
Ivan Kalvachev June 27, 2017, 11:19 p.m. UTC | #3
On 6/27/17, James Almer <jamrial@gmail.com> wrote:
> Signed-off-by: James Almer <jamrial@gmail.com>
> ---
>  libavfilter/x86/vf_blend.asm    | 25 +++++++++++++++++++++++++
>  libavfilter/x86/vf_blend_init.c |  4 ++++
>  tests/checkasm/vf_blend.c       |  1 +
>  3 files changed, 30 insertions(+)
>
> diff --git a/libavfilter/x86/vf_blend.asm b/libavfilter/x86/vf_blend.asm
> index 33b1ad1496..25f6f5affc 100644
> --- a/libavfilter/x86/vf_blend.asm
> +++ b/libavfilter/x86/vf_blend.asm
> @@ -286,6 +286,31 @@ BLEND_INIT difference, 3
>      jl .loop
>  BLEND_END
>
> +BLEND_INIT extremity, 8
> +    pxor       m2, m2
> +    mova       m4, [pw_255]
> +.nextrow:
> +    mov        xq, widthq
> +
> +    .loop:
> +        movu            m0, [topq + xq]
> +        movu            m1, [bottomq + xq]
> +        punpckhbw       m5, m0, m2
> +        punpcklbw       m0, m2
> +        punpckhbw       m6, m1, m2
> +        punpcklbw       m1, m2
> +        psubw           m3, m4, m0
> +        psubw           m7, m4, m5
> +        psubw           m3, m1
> +        psubw           m7, m6
> +        ABS1            m3, m1
> +        ABS1            m7, m6

Minor nitpick.

There is an ABS2 macro that takes 4 parameters and performs
two interleaved ABS1 operations, which is (hopefully) faster on sse2.
It should generate exactly the same code on ssse3.
James Almer June 27, 2017, 11:46 p.m. UTC | #4
On 6/27/2017 8:19 PM, Ivan Kalvachev wrote:
> On 6/27/17, James Almer <jamrial@gmail.com> wrote:
>> Signed-off-by: James Almer <jamrial@gmail.com>
>> ---
>>  libavfilter/x86/vf_blend.asm    | 25 +++++++++++++++++++++++++
>>  libavfilter/x86/vf_blend_init.c |  4 ++++
>>  tests/checkasm/vf_blend.c       |  1 +
>>  3 files changed, 30 insertions(+)
>>
>> diff --git a/libavfilter/x86/vf_blend.asm b/libavfilter/x86/vf_blend.asm
>> index 33b1ad1496..25f6f5affc 100644
>> --- a/libavfilter/x86/vf_blend.asm
>> +++ b/libavfilter/x86/vf_blend.asm
>> @@ -286,6 +286,31 @@ BLEND_INIT difference, 3
>>      jl .loop
>>  BLEND_END
>>
>> +BLEND_INIT extremity, 8
>> +    pxor       m2, m2
>> +    mova       m4, [pw_255]
>> +.nextrow:
>> +    mov        xq, widthq
>> +
>> +    .loop:
>> +        movu            m0, [topq + xq]
>> +        movu            m1, [bottomq + xq]
>> +        punpckhbw       m5, m0, m2
>> +        punpcklbw       m0, m2
>> +        punpckhbw       m6, m1, m2
>> +        punpcklbw       m1, m2
>> +        psubw           m3, m4, m0
>> +        psubw           m7, m4, m5
>> +        psubw           m3, m1
>> +        psubw           m7, m6
>> +        ABS1            m3, m1
>> +        ABS1            m7, m6
> 
> Minor nitpick.
> 
> There is an ABS2 macro that takes 4 parameters and performs
> two interleaved ABS1 operations, which is (hopefully) faster on sse2.
> It should generate exactly the same code on ssse3.

Ah nice, pushed a change to use them. Thanks.
diff mbox

Patch

diff --git a/libavfilter/x86/vf_blend.asm b/libavfilter/x86/vf_blend.asm
index 33b1ad1496..25f6f5affc 100644
--- a/libavfilter/x86/vf_blend.asm
+++ b/libavfilter/x86/vf_blend.asm
@@ -286,6 +286,31 @@  BLEND_INIT difference, 3
     jl .loop
 BLEND_END
 
+BLEND_INIT extremity, 8                     ; "extremity" blend kernel; per byte the loop below computes |pw_255 - top - bottom| (ABS1 assumed to be abs — see below)
+    pxor       m2, m2                       ; m2 = 0: zero source for the byte->word unpacks below
+    mova       m4, [pw_255]                 ; m4 = pw_255 constant (defined outside this hunk; presumably 255 in every word — confirm)
+.nextrow:                                   ; row entry point; the branch back here is not in this hunk (presumably in BLEND_END — confirm)
+    mov        xq, widthq                   ; restart the column offset from widthq for this row
+
+    .loop:
+        movu            m0, [topq + xq]     ; unaligned load of mmsize top bytes
+        movu            m1, [bottomq + xq]  ; unaligned load of mmsize bottom bytes
+        punpckhbw       m5, m0, m2          ; m5 = high half of top, interleaved with zero bytes (zero-extended to words)
+        punpcklbw       m0, m2              ; m0 = low half of top, zero-extended to words
+        punpckhbw       m6, m1, m2          ; m6 = high half of bottom, zero-extended to words
+        punpcklbw       m1, m2              ; m1 = low half of bottom, zero-extended to words
+        psubw           m3, m4, m0          ; m3 = m4 - top (low half)
+        psubw           m7, m4, m5          ; m7 = m4 - top (high half)
+        psubw           m3, m1              ; m3 -= bottom (low half); may be negative as a signed word
+        psubw           m7, m6              ; m7 -= bottom (high half)
+        ABS1            m3, m1              ; ABS1 is defined outside this hunk; presumably m3 = |m3| with m1 as scratch — confirm
+        ABS1            m7, m6              ; presumably m7 = |m7| with m6 as scratch — confirm
+        packuswb        m3, m7              ; narrow words to bytes with unsigned saturation: m3 -> low half, m7 -> high half
+        mova   [dstq + xq], m3              ; aligned store of the result; assumes dstq + xq is mmsize-aligned — confirm
+        add             xq, mmsize          ; advance one vector; sets the flags tested by jl
+    jl .loop                                ; repeat while xq is still < 0 (signed). NOTE(review): implies widthq is negative here, presumably arranged by BLEND_INIT — confirm
+BLEND_END
+
 BLEND_INIT negation, 5
     pxor       m2, m2
     mova       m4, [pw_255]
diff --git a/libavfilter/x86/vf_blend_init.c b/libavfilter/x86/vf_blend_init.c
index 96fe3d8baa..71f9b0a685 100644
--- a/libavfilter/x86/vf_blend_init.c
+++ b/libavfilter/x86/vf_blend_init.c
@@ -47,6 +47,8 @@  BLEND_FUNC(subtract, sse2)
 BLEND_FUNC(xor, sse2)
 BLEND_FUNC(difference, sse2)
 BLEND_FUNC(difference, ssse3)
+BLEND_FUNC(extremity, sse2)
+BLEND_FUNC(extremity, ssse3)
 BLEND_FUNC(negation, sse2)
 BLEND_FUNC(negation, ssse3)
 
@@ -72,12 +74,14 @@  av_cold void ff_blend_init_x86(FilterParams *param, int is_16bit)
         case BLEND_SUBTRACT: param->blend = ff_blend_subtract_sse2; break;
         case BLEND_XOR:      param->blend = ff_blend_xor_sse2;      break;
         case BLEND_DIFFERENCE: param->blend = ff_blend_difference_sse2; break;
+        case BLEND_EXTREMITY:  param->blend = ff_blend_extremity_sse2; break;
         case BLEND_NEGATION:   param->blend = ff_blend_negation_sse2;   break;
         }
     }
     if (EXTERNAL_SSSE3(cpu_flags) && param->opacity == 1 && !is_16bit) {
         switch (param->mode) {
         case BLEND_DIFFERENCE: param->blend = ff_blend_difference_ssse3; break;
+        case BLEND_EXTREMITY:  param->blend = ff_blend_extremity_ssse3; break;
         case BLEND_NEGATION:   param->blend = ff_blend_negation_ssse3;   break;
         }
     }
diff --git a/tests/checkasm/vf_blend.c b/tests/checkasm/vf_blend.c
index aa568c0de0..4e018ac69e 100644
--- a/tests/checkasm/vf_blend.c
+++ b/tests/checkasm/vf_blend.c
@@ -117,6 +117,7 @@  void checkasm_check_blend(void)
     check_and_report(subtract, BLEND_SUBTRACT)
     check_and_report(xor, BLEND_XOR)
     check_and_report(difference, BLEND_DIFFERENCE)
+    check_and_report(extremity, BLEND_EXTREMITY)
     check_and_report(negation, BLEND_NEGATION)
 
     report("8bit");