diff mbox series

[FFmpeg-devel,1/3] swscale/x86/rgb2rgb: fix deinterleaveBytes for unaligned dst pointers

Message ID 20240901130935.5887-1-ramiro.polla@gmail.com
State New
Headers show
Series [FFmpeg-devel,1/3] swscale/x86/rgb2rgb: fix deinterleaveBytes for unaligned dst pointers | expand

Checks

| Context | Check | Description |
|---|---|---|
| yinshiyou/make_loongarch64 | success | Make finished |
| yinshiyou/make_fate_loongarch64 | success | Make fate finished |
| andriy/make_x86 | success | Make finished |
| andriy/make_fate_x86 | success | Make fate finished |

Commit Message

Ramiro Polla Sept. 1, 2024, 1:09 p.m. UTC
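[Editor's note: the patch was posted without a commit message body. Per the diff, it changes the U/V stores in the `.loop_%1` loop from `mova` to `mov%1`, and makes NVXX_TO_UV_FN test the low four bits of (src | dstU | dstV), computed in the formerly unused third-argument register, instead of testing src alone.]
---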
 libswscale/x86/input.asm | 15 +++++++++------
 1 file changed, 9 insertions(+), 6 deletions(-)

Comments

Ramiro Polla Sept. 3, 2024, 3:42 p.m. UTC | #1
On Sun, Sep 1, 2024 at 3:09 PM Ramiro Polla <ramiro.polla@gmail.com> wrote:
>
> ---
>  libswscale/x86/input.asm | 15 +++++++++------
>  1 file changed, 9 insertions(+), 6 deletions(-)
>
> diff --git a/libswscale/x86/input.asm b/libswscale/x86/input.asm
> index 21cd8b37fd..516e4384b1 100644
> --- a/libswscale/x86/input.asm
> +++ b/libswscale/x86/input.asm
> @@ -736,11 +736,11 @@ cglobal %2ToUV, 4, 5, %1, dstU, dstV, unused, src, w
>      packuswb       m2, m3                 ; (byte) { U0, ..., U15 }
>      packuswb       m0, m1                 ; (byte) { V0, ..., V15 }
>  %ifidn %2, nv12
> -    mova   [dstUq+wq], m2
> -    mova   [dstVq+wq], m0
> +    mov%1  [dstUq+wq], m2
> +    mov%1  [dstVq+wq], m0
>  %else ; nv21
> -    mova   [dstVq+wq], m2
> -    mova   [dstUq+wq], m0
> +    mov%1  [dstVq+wq], m2
> +    mov%1  [dstUq+wq], m0
>  %endif ; nv12/21
>      add            wq, mmsize
>      jl .loop_%1
> @@ -750,15 +750,18 @@ cglobal %2ToUV, 4, 5, %1, dstU, dstV, unused, src, w
>  ; %1 = nr. of XMM registers
>  ; %2 = nv12 or nv21
>  %macro NVXX_TO_UV_FN 2
> -cglobal %2ToUV, 4, 5, %1, dstU, dstV, unused, src, w
> +cglobal %2ToUV, 4, 5, %1, dstU, dstV, tmp, src, w
>  %if ARCH_X86_64
>      movsxd         wq, dword r5m
>  %else ; x86-32
>      mov            wq, r5m
>  %endif
> +    mov          tmpq, srcq
> +    or           tmpq, dstUq
> +    or           tmpq, dstVq
>      add         dstUq, wq
>      add         dstVq, wq
> -    test         srcq, 15
> +    test         tmpq, 15
>      lea          srcq, [srcq+wq*2]
>      pcmpeqb        m5, m5                 ; (byte) { 0xff } x 16
>      psrlw          m5, 8                  ; (word) { 0x00ff } x 8
> --
> 2.30.2
>

I'll apply this patchset in a few days if there are no comments.
Ramiro Polla Sept. 6, 2024, 9:15 p.m. UTC | #2
On Tue, Sep 3, 2024 at 5:42 PM Ramiro Polla <ramiro.polla@gmail.com> wrote:
> On Sun, Sep 1, 2024 at 3:09 PM Ramiro Polla <ramiro.polla@gmail.com> wrote:
> >
> > ---
> >  libswscale/x86/input.asm | 15 +++++++++------
> >  1 file changed, 9 insertions(+), 6 deletions(-)
> >
> > diff --git a/libswscale/x86/input.asm b/libswscale/x86/input.asm
> > index 21cd8b37fd..516e4384b1 100644
> > --- a/libswscale/x86/input.asm
> > +++ b/libswscale/x86/input.asm
> > @@ -736,11 +736,11 @@ cglobal %2ToUV, 4, 5, %1, dstU, dstV, unused, src, w
> >      packuswb       m2, m3                 ; (byte) { U0, ..., U15 }
> >      packuswb       m0, m1                 ; (byte) { V0, ..., V15 }
> >  %ifidn %2, nv12
> > -    mova   [dstUq+wq], m2
> > -    mova   [dstVq+wq], m0
> > +    mov%1  [dstUq+wq], m2
> > +    mov%1  [dstVq+wq], m0
> >  %else ; nv21
> > -    mova   [dstVq+wq], m2
> > -    mova   [dstUq+wq], m0
> > +    mov%1  [dstVq+wq], m2
> > +    mov%1  [dstUq+wq], m0
> >  %endif ; nv12/21
> >      add            wq, mmsize
> >      jl .loop_%1
> > @@ -750,15 +750,18 @@ cglobal %2ToUV, 4, 5, %1, dstU, dstV, unused, src, w
> >  ; %1 = nr. of XMM registers
> >  ; %2 = nv12 or nv21
> >  %macro NVXX_TO_UV_FN 2
> > -cglobal %2ToUV, 4, 5, %1, dstU, dstV, unused, src, w
> > +cglobal %2ToUV, 4, 5, %1, dstU, dstV, tmp, src, w
> >  %if ARCH_X86_64
> >      movsxd         wq, dword r5m
> >  %else ; x86-32
> >      mov            wq, r5m
> >  %endif
> > +    mov          tmpq, srcq
> > +    or           tmpq, dstUq
> > +    or           tmpq, dstVq
> >      add         dstUq, wq
> >      add         dstVq, wq
> > -    test         srcq, 15
> > +    test         tmpq, 15
> >      lea          srcq, [srcq+wq*2]
> >      pcmpeqb        m5, m5                 ; (byte) { 0xff } x 16
> >      psrlw          m5, 8                  ; (word) { 0x00ff } x 8
> > --
> > 2.30.2
> >
>
> I'll apply this patchset in a few days if there are no comments.

Pushed.
diff mbox series

Patch

diff --git a/libswscale/x86/input.asm b/libswscale/x86/input.asm
index 21cd8b37fd..516e4384b1 100644
--- a/libswscale/x86/input.asm
+++ b/libswscale/x86/input.asm
@@ -736,11 +736,11 @@  cglobal %2ToUV, 4, 5, %1, dstU, dstV, unused, src, w
     packuswb       m2, m3                 ; (byte) { U0, ..., U15 }
     packuswb       m0, m1                 ; (byte) { V0, ..., V15 }
 %ifidn %2, nv12
-    mova   [dstUq+wq], m2
-    mova   [dstVq+wq], m0
+    mov%1  [dstUq+wq], m2
+    mov%1  [dstVq+wq], m0
 %else ; nv21
-    mova   [dstVq+wq], m2
-    mova   [dstUq+wq], m0
+    mov%1  [dstVq+wq], m2
+    mov%1  [dstUq+wq], m0
 %endif ; nv12/21
     add            wq, mmsize
     jl .loop_%1
@@ -750,15 +750,18 @@  cglobal %2ToUV, 4, 5, %1, dstU, dstV, unused, src, w
 ; %1 = nr. of XMM registers
 ; %2 = nv12 or nv21
 %macro NVXX_TO_UV_FN 2
-cglobal %2ToUV, 4, 5, %1, dstU, dstV, unused, src, w
+cglobal %2ToUV, 4, 5, %1, dstU, dstV, tmp, src, w
 %if ARCH_X86_64
     movsxd         wq, dword r5m
 %else ; x86-32
     mov            wq, r5m
 %endif
+    mov          tmpq, srcq
+    or           tmpq, dstUq
+    or           tmpq, dstVq
     add         dstUq, wq
     add         dstVq, wq
-    test         srcq, 15
+    test         tmpq, 15
     lea          srcq, [srcq+wq*2]
     pcmpeqb        m5, m5                 ; (byte) { 0xff } x 16
     psrlw          m5, 8                  ; (word) { 0x00ff } x 8