| Message ID | 20240901130935.5887-1-ramiro.polla@gmail.com |
|---|---|
| State | New |
| Headers | show |
| Series | [FFmpeg-devel,1/3] swscale/x86/rgb2rgb: fix deinterleaveBytes for unaligned dst pointers \| expand |
| Context | Check | Description |
|---|---|---|
| yinshiyou/make_loongarch64 | success | Make finished |
| yinshiyou/make_fate_loongarch64 | success | Make fate finished |
| andriy/make_x86 | success | Make finished |
| andriy/make_fate_x86 | success | Make fate finished |
On Sun, Sep 1, 2024 at 3:09 PM Ramiro Polla <ramiro.polla@gmail.com> wrote: > > --- > libswscale/x86/input.asm | 15 +++++++++------ > 1 file changed, 9 insertions(+), 6 deletions(-) > > diff --git a/libswscale/x86/input.asm b/libswscale/x86/input.asm > index 21cd8b37fd..516e4384b1 100644 > --- a/libswscale/x86/input.asm > +++ b/libswscale/x86/input.asm > @@ -736,11 +736,11 @@ cglobal %2ToUV, 4, 5, %1, dstU, dstV, unused, src, w > packuswb m2, m3 ; (byte) { U0, ..., U15 } > packuswb m0, m1 ; (byte) { V0, ..., V15 } > %ifidn %2, nv12 > - mova [dstUq+wq], m2 > - mova [dstVq+wq], m0 > + mov%1 [dstUq+wq], m2 > + mov%1 [dstVq+wq], m0 > %else ; nv21 > - mova [dstVq+wq], m2 > - mova [dstUq+wq], m0 > + mov%1 [dstVq+wq], m2 > + mov%1 [dstUq+wq], m0 > %endif ; nv12/21 > add wq, mmsize > jl .loop_%1 > @@ -750,15 +750,18 @@ cglobal %2ToUV, 4, 5, %1, dstU, dstV, unused, src, w > ; %1 = nr. of XMM registers > ; %2 = nv12 or nv21 > %macro NVXX_TO_UV_FN 2 > -cglobal %2ToUV, 4, 5, %1, dstU, dstV, unused, src, w > +cglobal %2ToUV, 4, 5, %1, dstU, dstV, tmp, src, w > %if ARCH_X86_64 > movsxd wq, dword r5m > %else ; x86-32 > mov wq, r5m > %endif > + mov tmpq, srcq > + or tmpq, dstUq > + or tmpq, dstVq > add dstUq, wq > add dstVq, wq > - test srcq, 15 > + test tmpq, 15 > lea srcq, [srcq+wq*2] > pcmpeqb m5, m5 ; (byte) { 0xff } x 16 > psrlw m5, 8 ; (word) { 0x00ff } x 8 > -- > 2.30.2 > I'll apply this patchset in a few days if there are no comments.
On Tue, Sep 3, 2024 at 5:42 PM Ramiro Polla <ramiro.polla@gmail.com> wrote: > On Sun, Sep 1, 2024 at 3:09 PM Ramiro Polla <ramiro.polla@gmail.com> wrote: > > > > --- > > libswscale/x86/input.asm | 15 +++++++++------ > > 1 file changed, 9 insertions(+), 6 deletions(-) > > > > diff --git a/libswscale/x86/input.asm b/libswscale/x86/input.asm > > index 21cd8b37fd..516e4384b1 100644 > > --- a/libswscale/x86/input.asm > > +++ b/libswscale/x86/input.asm > > @@ -736,11 +736,11 @@ cglobal %2ToUV, 4, 5, %1, dstU, dstV, unused, src, w > > packuswb m2, m3 ; (byte) { U0, ..., U15 } > > packuswb m0, m1 ; (byte) { V0, ..., V15 } > > %ifidn %2, nv12 > > - mova [dstUq+wq], m2 > > - mova [dstVq+wq], m0 > > + mov%1 [dstUq+wq], m2 > > + mov%1 [dstVq+wq], m0 > > %else ; nv21 > > - mova [dstVq+wq], m2 > > - mova [dstUq+wq], m0 > > + mov%1 [dstVq+wq], m2 > > + mov%1 [dstUq+wq], m0 > > %endif ; nv12/21 > > add wq, mmsize > > jl .loop_%1 > > @@ -750,15 +750,18 @@ cglobal %2ToUV, 4, 5, %1, dstU, dstV, unused, src, w > > ; %1 = nr. of XMM registers > > ; %2 = nv12 or nv21 > > %macro NVXX_TO_UV_FN 2 > > -cglobal %2ToUV, 4, 5, %1, dstU, dstV, unused, src, w > > +cglobal %2ToUV, 4, 5, %1, dstU, dstV, tmp, src, w > > %if ARCH_X86_64 > > movsxd wq, dword r5m > > %else ; x86-32 > > mov wq, r5m > > %endif > > + mov tmpq, srcq > > + or tmpq, dstUq > > + or tmpq, dstVq > > add dstUq, wq > > add dstVq, wq > > - test srcq, 15 > > + test tmpq, 15 > > lea srcq, [srcq+wq*2] > > pcmpeqb m5, m5 ; (byte) { 0xff } x 16 > > psrlw m5, 8 ; (word) { 0x00ff } x 8 > > -- > > 2.30.2 > > > > I'll apply this patchset in a few days if there are no comments. Pushed.
diff --git a/libswscale/x86/input.asm b/libswscale/x86/input.asm index 21cd8b37fd..516e4384b1 100644 --- a/libswscale/x86/input.asm +++ b/libswscale/x86/input.asm @@ -736,11 +736,11 @@ cglobal %2ToUV, 4, 5, %1, dstU, dstV, unused, src, w packuswb m2, m3 ; (byte) { U0, ..., U15 } packuswb m0, m1 ; (byte) { V0, ..., V15 } %ifidn %2, nv12 - mova [dstUq+wq], m2 - mova [dstVq+wq], m0 + mov%1 [dstUq+wq], m2 + mov%1 [dstVq+wq], m0 %else ; nv21 - mova [dstVq+wq], m2 - mova [dstUq+wq], m0 + mov%1 [dstVq+wq], m2 + mov%1 [dstUq+wq], m0 %endif ; nv12/21 add wq, mmsize jl .loop_%1 @@ -750,15 +750,18 @@ cglobal %2ToUV, 4, 5, %1, dstU, dstV, unused, src, w ; %1 = nr. of XMM registers ; %2 = nv12 or nv21 %macro NVXX_TO_UV_FN 2 -cglobal %2ToUV, 4, 5, %1, dstU, dstV, unused, src, w +cglobal %2ToUV, 4, 5, %1, dstU, dstV, tmp, src, w %if ARCH_X86_64 movsxd wq, dword r5m %else ; x86-32 mov wq, r5m %endif + mov tmpq, srcq + or tmpq, dstUq + or tmpq, dstVq add dstUq, wq add dstVq, wq - test srcq, 15 + test tmpq, 15 lea srcq, [srcq+wq*2] pcmpeqb m5, m5 ; (byte) { 0xff } x 16 psrlw m5, 8 ; (word) { 0x00ff } x 8