Message ID | 20170627154553.4800-1-jamrial@gmail.com |
---|---|
State | Accepted |
Commit | fa50d9360ba36ba2ee8f85f2c59e8d6af20e833a |
Headers | show |
On 6/27/17, James Almer <jamrial@gmail.com> wrote: > Signed-off-by: James Almer <jamrial@gmail.com> > --- > libavfilter/x86/vf_blend.asm | 25 +++++++++++++++++++++++++ > libavfilter/x86/vf_blend_init.c | 4 ++++ > tests/checkasm/vf_blend.c | 1 + > 3 files changed, 30 insertions(+) > LGTM, I have a couple more blend modes to add which might not be SIMDable.
On 6/27/2017 12:50 PM, Paul B Mahol wrote: > On 6/27/17, James Almer <jamrial@gmail.com> wrote: >> Signed-off-by: James Almer <jamrial@gmail.com> >> --- >> libavfilter/x86/vf_blend.asm | 25 +++++++++++++++++++++++++ >> libavfilter/x86/vf_blend_init.c | 4 ++++ >> tests/checkasm/vf_blend.c | 1 + >> 3 files changed, 30 insertions(+) >> > > LGTM, I have couple more blend modes to add which might not be > SIMDable. Pushed.
On 6/27/17, James Almer <jamrial@gmail.com> wrote: > Signed-off-by: James Almer <jamrial@gmail.com> > --- > libavfilter/x86/vf_blend.asm | 25 +++++++++++++++++++++++++ > libavfilter/x86/vf_blend_init.c | 4 ++++ > tests/checkasm/vf_blend.c | 1 + > 3 files changed, 30 insertions(+) > > diff --git a/libavfilter/x86/vf_blend.asm b/libavfilter/x86/vf_blend.asm > index 33b1ad1496..25f6f5affc 100644 > --- a/libavfilter/x86/vf_blend.asm > +++ b/libavfilter/x86/vf_blend.asm > @@ -286,6 +286,31 @@ BLEND_INIT difference, 3 > jl .loop > BLEND_END > > +BLEND_INIT extremity, 8 > + pxor m2, m2 > + mova m4, [pw_255] > +.nextrow: > + mov xq, widthq > + > + .loop: > + movu m0, [topq + xq] > + movu m1, [bottomq + xq] > + punpckhbw m5, m0, m2 > + punpcklbw m0, m2 > + punpckhbw m6, m1, m2 > + punpcklbw m1, m2 > + psubw m3, m4, m0 > + psubw m7, m4, m5 > + psubw m3, m1 > + psubw m7, m6 > + ABS1 m3, m1 > + ABS1 m7, m6 Minor nitpick: there is also ABS2, which takes 4 parameters and performs two interleaved ABS1 operations, which are (hopefully) faster on SSE2. It should generate exactly the same code on SSSE3.
On 6/27/2017 8:19 PM, Ivan Kalvachev wrote: > On 6/27/17, James Almer <jamrial@gmail.com> wrote: >> Signed-off-by: James Almer <jamrial@gmail.com> >> --- >> libavfilter/x86/vf_blend.asm | 25 +++++++++++++++++++++++++ >> libavfilter/x86/vf_blend_init.c | 4 ++++ >> tests/checkasm/vf_blend.c | 1 + >> 3 files changed, 30 insertions(+) >> >> diff --git a/libavfilter/x86/vf_blend.asm b/libavfilter/x86/vf_blend.asm >> index 33b1ad1496..25f6f5affc 100644 >> --- a/libavfilter/x86/vf_blend.asm >> +++ b/libavfilter/x86/vf_blend.asm >> @@ -286,6 +286,31 @@ BLEND_INIT difference, 3 >> jl .loop >> BLEND_END >> >> +BLEND_INIT extremity, 8 >> + pxor m2, m2 >> + mova m4, [pw_255] >> +.nextrow: >> + mov xq, widthq >> + >> + .loop: >> + movu m0, [topq + xq] >> + movu m1, [bottomq + xq] >> + punpckhbw m5, m0, m2 >> + punpcklbw m0, m2 >> + punpckhbw m6, m1, m2 >> + punpcklbw m1, m2 >> + psubw m3, m4, m0 >> + psubw m7, m4, m5 >> + psubw m3, m1 >> + psubw m7, m6 >> + ABS1 m3, m1 >> + ABS1 m7, m6 > > Minor nitpick. > > There exists ABS2 that takes 4 parameters and that does > two interleaved ABS1 , that are (hopefully) faster on sse2. > It should generate exactly the same code on ssse3. Ah nice, pushed a change to use them. Thanks.
diff --git a/libavfilter/x86/vf_blend.asm b/libavfilter/x86/vf_blend.asm index 33b1ad1496..25f6f5affc 100644 --- a/libavfilter/x86/vf_blend.asm +++ b/libavfilter/x86/vf_blend.asm @@ -286,6 +286,31 @@ BLEND_INIT difference, 3 jl .loop BLEND_END +BLEND_INIT extremity, 8 + pxor m2, m2 + mova m4, [pw_255] +.nextrow: + mov xq, widthq + + .loop: + movu m0, [topq + xq] + movu m1, [bottomq + xq] + punpckhbw m5, m0, m2 + punpcklbw m0, m2 + punpckhbw m6, m1, m2 + punpcklbw m1, m2 + psubw m3, m4, m0 + psubw m7, m4, m5 + psubw m3, m1 + psubw m7, m6 + ABS1 m3, m1 + ABS1 m7, m6 + packuswb m3, m7 + mova [dstq + xq], m3 + add xq, mmsize + jl .loop +BLEND_END + BLEND_INIT negation, 5 pxor m2, m2 mova m4, [pw_255] diff --git a/libavfilter/x86/vf_blend_init.c b/libavfilter/x86/vf_blend_init.c index 96fe3d8baa..71f9b0a685 100644 --- a/libavfilter/x86/vf_blend_init.c +++ b/libavfilter/x86/vf_blend_init.c @@ -47,6 +47,8 @@ BLEND_FUNC(subtract, sse2) BLEND_FUNC(xor, sse2) BLEND_FUNC(difference, sse2) BLEND_FUNC(difference, ssse3) +BLEND_FUNC(extremity, sse2) +BLEND_FUNC(extremity, ssse3) BLEND_FUNC(negation, sse2) BLEND_FUNC(negation, ssse3) @@ -72,12 +74,14 @@ av_cold void ff_blend_init_x86(FilterParams *param, int is_16bit) case BLEND_SUBTRACT: param->blend = ff_blend_subtract_sse2; break; case BLEND_XOR: param->blend = ff_blend_xor_sse2; break; case BLEND_DIFFERENCE: param->blend = ff_blend_difference_sse2; break; + case BLEND_EXTREMITY: param->blend = ff_blend_extremity_sse2; break; case BLEND_NEGATION: param->blend = ff_blend_negation_sse2; break; } } if (EXTERNAL_SSSE3(cpu_flags) && param->opacity == 1 && !is_16bit) { switch (param->mode) { case BLEND_DIFFERENCE: param->blend = ff_blend_difference_ssse3; break; + case BLEND_EXTREMITY: param->blend = ff_blend_extremity_ssse3; break; case BLEND_NEGATION: param->blend = ff_blend_negation_ssse3; break; } } diff --git a/tests/checkasm/vf_blend.c b/tests/checkasm/vf_blend.c index aa568c0de0..4e018ac69e 100644 --- 
a/tests/checkasm/vf_blend.c +++ b/tests/checkasm/vf_blend.c @@ -117,6 +117,7 @@ void checkasm_check_blend(void) check_and_report(subtract, BLEND_SUBTRACT) check_and_report(xor, BLEND_XOR) check_and_report(difference, BLEND_DIFFERENCE) + check_and_report(extremity, BLEND_EXTREMITY) check_and_report(negation, BLEND_NEGATION) report("8bit");
Signed-off-by: James Almer <jamrial@gmail.com> --- libavfilter/x86/vf_blend.asm | 25 +++++++++++++++++++++++++ libavfilter/x86/vf_blend_init.c | 4 ++++ tests/checkasm/vf_blend.c | 1 + 3 files changed, 30 insertions(+)