Message ID | CAOmVQXHESPuyOiaWG243DfmVTEuqGjp4f59qT_1wQMsdtBoxtA@mail.gmail.com |
---|---|
State | New |
Headers | show |
On Tue, May 09, 2017 at 11:08:48PM +0200, Matthieu Bouron wrote: > On Sun, May 7, 2017 at 11:05 AM, Matthieu Bouron <matthieu.bouron@gmail.com> > wrote: > > > > > > > Le 2 mai 2017 12:01 PM, "Benoit Fouet" <benoit.fouet@free.fr> a écrit : > > > > Hi, > > > > > > On 28/04/2017 21:58, Matthieu Bouron wrote: > > > Untested: fixes ticket #6324. > > > --- > > > libavcodec/aarch64/simple_idct_neon.S | 12 ++++++------ > > > 1 file changed, 6 insertions(+), 6 deletions(-) > > > > > > diff --git a/libavcodec/aarch64/simple_idct_neon.S > > b/libavcodec/aarch64/simple_idct_neon.S > > > index 52273420f9..d31f72a609 100644 > > > --- a/libavcodec/aarch64/simple_idct_neon.S > > > +++ b/libavcodec/aarch64/simple_idct_neon.S > > > @@ -61,19 +61,19 @@ endconst > > > br x10 > > > .endm > > > > > > -.macro smull1 a b c > > > +.macro smull1 a, b, c > > > smull \a, \b, \c > > > .endm > > > > > > -.macro smlal1 a b c > > > +.macro smlal1 a, b, c > > > smlal \a, \b, \c > > > .endm > > > > > > -.macro smlsl1 a b c > > > +.macro smlsl1 a, b, c > > > smlsl \a, \b, \c > > > .endm > > > > > > -.macro idct_col4_top y1 y2 y3 y4 i l > > > +.macro idct_col4_top y1, y2, y3, y4, i, l > > > smull\i v7.4S, \y3\().\l, z2 > > > smull\i v16.4S, \y3\().\l, z6 > > > smull\i v17.4S, \y2\().\l, z1 > > > @@ -91,7 +91,7 @@ endconst > > > smlsl\i v6.4S, \y4\().\l, z5 > > > .endm > > > > > > -.macro idct_row4_neon y1 y2 y3 y4 pass > > > +.macro idct_row4_neon y1, y2, y3, y4, pass > > > ld1 {\y1\().2D-\y2\().2D}, [x2], #32 > > > movi v23.4S, #1<<2, lsl #8 > > > orr v5.16B, \y1\().16B, \y2\().16B > > > @@ -153,7 +153,7 @@ endconst > > > trn2 \y4\().4S, v17.4S, v19.4S > > > .endm > > > > > > -.macro declare_idct_col4_neon i l > > > +.macro declare_idct_col4_neon i, l > > > function idct_col4_neon\i > > > dup v23.4H, z4c > > > .if \i == 1 > > > > Sounds sane, but shouldn't we be doing this for all instances of > > multiple arguments macros without commas? > > > > > > Sure, I may have missed some. 
I will work again on this patch on Tuesday > > as I will have access to an apple machine (and hopefully fix the build > > without gas-preprocessor). > > > > Sorry for the delay, > > Matthieu > > > > > Updated patch attached: > * add missing commas to separate macro arguments > * passes .4H/.8H as macro arguments instead of 4H/8H (the latter form > being interpreted as a hexadecimal value, i.e. 4/8). > From e27ac0f3a8b6436a7530ee5c5c514bfdfac4a558 Mon Sep 17 00:00:00 2001 > From: Matthieu Bouron <matthieu.bouron@gmail.com> > Date: Fri, 28 Apr 2017 21:58:55 +0200 > Subject: [PATCH] lavc/aarch64/simple_idct: fix iOS build without > gas-preprocessor > MIME-Version: 1.0 > Content-Type: text/plain; charset=UTF-8 > Content-Transfer-Encoding: 8bit > > Separates macro arguments with commas and passes .4H/.8H as macro > arguments instead of 4H/8H (the later form being interpreted as an > hexadecimal value). > > Fixes ticket #6324. > > Suggested-by: Martin Storsjö <martin@martin.st> > --- > libavcodec/aarch64/simple_idct_neon.S | 74 +++++++++++++++++------------------ > 1 file changed, 37 insertions(+), 37 deletions(-) > > diff --git a/libavcodec/aarch64/simple_idct_neon.S b/libavcodec/aarch64/simple_idct_neon.S > index 52273420f9..92987985d2 100644 > --- a/libavcodec/aarch64/simple_idct_neon.S > +++ b/libavcodec/aarch64/simple_idct_neon.S > @@ -61,37 +61,37 @@ endconst > br x10 > .endm > > -.macro smull1 a b c > +.macro smull1 a, b, c > smull \a, \b, \c > .endm > > -.macro smlal1 a b c > +.macro smlal1 a, b, c > smlal \a, \b, \c > .endm > > -.macro smlsl1 a b c > +.macro smlsl1 a, b, c > smlsl \a, \b, \c > .endm > > -.macro idct_col4_top y1 y2 y3 y4 i l > - smull\i v7.4S, \y3\().\l, z2 > - smull\i v16.4S, \y3\().\l, z6 > - smull\i v17.4S, \y2\().\l, z1 > +.macro idct_col4_top y1, y2, y3, y4, i, l > + smull\i v7.4S, \y3\l, z2 > + smull\i v16.4S, \y3\l, z6 > + smull\i v17.4S, \y2\l, z1 > add v19.4S, v23.4S, v7.4S > - smull\i v18.4S, \y2\().\l, z3 > + smull\i v18.4S, \y2\l, z3
> add v20.4S, v23.4S, v16.4S > - smull\i v5.4S, \y2\().\l, z5 > + smull\i v5.4S, \y2\l, z5 > sub v21.4S, v23.4S, v16.4S > - smull\i v6.4S, \y2\().\l, z7 > + smull\i v6.4S, \y2\l, z7 > sub v22.4S, v23.4S, v7.4S > > - smlal\i v17.4S, \y4\().\l, z3 > - smlsl\i v18.4S, \y4\().\l, z7 > - smlsl\i v5.4S, \y4\().\l, z1 > - smlsl\i v6.4S, \y4\().\l, z5 > + smlal\i v17.4S, \y4\l, z3 > + smlsl\i v18.4S, \y4\l, z7 > + smlsl\i v5.4S, \y4\l, z1 > + smlsl\i v6.4S, \y4\l, z5 > .endm > > -.macro idct_row4_neon y1 y2 y3 y4 pass > +.macro idct_row4_neon y1, y2, y3, y4, pass > ld1 {\y1\().2D-\y2\().2D}, [x2], #32 > movi v23.4S, #1<<2, lsl #8 > orr v5.16B, \y1\().16B, \y2\().16B > @@ -101,7 +101,7 @@ endconst > mov x3, v5.D[1] > smlal v23.4S, \y1\().4H, z4 > > - idct_col4_top \y1 \y2 \y3 \y4 1 4H > + idct_col4_top \y1, \y2, \y3, \y4, 1, .4H > > cmp x3, #0 > beq \pass\()f > @@ -153,7 +153,7 @@ endconst > trn2 \y4\().4S, v17.4S, v19.4S > .endm > > -.macro declare_idct_col4_neon i l > +.macro declare_idct_col4_neon i, l > function idct_col4_neon\i > dup v23.4H, z4c > .if \i == 1 > @@ -164,14 +164,14 @@ function idct_col4_neon\i > .endif > smull v23.4S, v23.4H, z4 > > - idct_col4_top v24 v25 v26 v27 \i \l > + idct_col4_top v24, v25, v26, v27, \i, \l > > mov x4, v28.D[\i - 1] > mov x5, v29.D[\i - 1] > cmp x4, #0 > beq 1f > > - smull\i v7.4S, v28.\l, z4 > + smull\i v7.4S, v28\l, z4 > add v19.4S, v19.4S, v7.4S > sub v20.4S, v20.4S, v7.4S > sub v21.4S, v21.4S, v7.4S > @@ -181,17 +181,17 @@ function idct_col4_neon\i > cmp x5, #0 > beq 2f > > - smlal\i v17.4S, v29.\l, z5 > - smlsl\i v18.4S, v29.\l, z1 > - smlal\i v5.4S, v29.\l, z7 > - smlal\i v6.4S, v29.\l, z3 > + smlal\i v17.4S, v29\l, z5 > + smlsl\i v18.4S, v29\l, z1 > + smlal\i v5.4S, v29\l, z7 > + smlal\i v6.4S, v29\l, z3 > > 2: mov x5, v31.D[\i - 1] > cmp x4, #0 > beq 3f > > - smull\i v7.4S, v30.\l, z6 > - smull\i v16.4S, v30.\l, z2 > + smull\i v7.4S, v30\l, z6 > + smull\i v16.4S, v30\l, z2 > add v19.4S, v19.4S, v7.4S > sub v22.4S, v22.4S, 
v7.4S > sub v20.4S, v20.4S, v16.4S > @@ -200,10 +200,10 @@ function idct_col4_neon\i > 3: cmp x5, #0 > beq 4f > > - smlal\i v17.4S, v31.\l, z7 > - smlsl\i v18.4S, v31.\l, z5 > - smlal\i v5.4S, v31.\l, z3 > - smlsl\i v6.4S, v31.\l, z1 > + smlal\i v17.4S, v31\l, z7 > + smlsl\i v18.4S, v31\l, z5 > + smlal\i v5.4S, v31\l, z3 > + smlsl\i v6.4S, v31\l, z1 > > 4: addhn v7.4H, v19.4S, v17.4S > addhn2 v7.8H, v20.4S, v18.4S > @@ -219,14 +219,14 @@ function idct_col4_neon\i > endfunc > .endm > > -declare_idct_col4_neon 1 4H > -declare_idct_col4_neon 2 8H > +declare_idct_col4_neon 1, .4H > +declare_idct_col4_neon 2, .8H > > function ff_simple_idct_put_neon, export=1 > idct_start x2 > > - idct_row4_neon v24 v25 v26 v27 1 > - idct_row4_neon v28 v29 v30 v31 2 > + idct_row4_neon v24, v25, v26, v27, 1 > + idct_row4_neon v28, v29, v30, v31, 2 > bl idct_col4_neon1 > > sqshrun v1.8B, v7.8H, #COL_SHIFT-16 > @@ -263,8 +263,8 @@ endfunc > function ff_simple_idct_add_neon, export=1 > idct_start x2 > > - idct_row4_neon v24 v25 v26 v27 1 > - idct_row4_neon v28 v29 v30 v31 2 > + idct_row4_neon v24, v25, v26, v27, 1 > + idct_row4_neon v28, v29, v30, v31, 2 > bl idct_col4_neon1 > > sshr v1.8H, V7.8H, #COL_SHIFT-16 > @@ -328,8 +328,8 @@ function ff_simple_idct_neon, export=1 > idct_start x0 > > mov x2, x0 > - idct_row4_neon v24 v25 v26 v27 1 > - idct_row4_neon v28 v29 v30 v31 2 > + idct_row4_neon v24, v25, v26, v27, 1 > + idct_row4_neon v28, v29, v30, v31, 2 > add x2, x2, #-128 > bl idct_col4_neon1 > > -- > 2.12.0 > If there is no objection, I will push the patch tomorrow. Matthieu
On Wed, May 10, 2017 at 08:23:02PM +0200, Matthieu Bouron wrote: > On Tue, May 09, 2017 at 11:08:48PM +0200, Matthieu Bouron wrote: > > On Sun, May 7, 2017 at 11:05 AM, Matthieu Bouron <matthieu.bouron@gmail.com> > > wrote: > > > > > > > > > > > Le 2 mai 2017 12:01 PM, "Benoit Fouet" <benoit.fouet@free.fr> a écrit : > > > > > > Hi, > > > > > > > > > On 28/04/2017 21:58, Matthieu Bouron wrote: > > > > Untested: fixes ticket #6324. > > > > --- > > > > libavcodec/aarch64/simple_idct_neon.S | 12 ++++++------ > > > > 1 file changed, 6 insertions(+), 6 deletions(-) > > > > > > > > diff --git a/libavcodec/aarch64/simple_idct_neon.S > > > b/libavcodec/aarch64/simple_idct_neon.S > > > > index 52273420f9..d31f72a609 100644 > > > > --- a/libavcodec/aarch64/simple_idct_neon.S > > > > +++ b/libavcodec/aarch64/simple_idct_neon.S > > > > @@ -61,19 +61,19 @@ endconst > > > > br x10 > > > > .endm > > > > > > > > -.macro smull1 a b c > > > > +.macro smull1 a, b, c > > > > smull \a, \b, \c > > > > .endm > > > > > > > > -.macro smlal1 a b c > > > > +.macro smlal1 a, b, c > > > > smlal \a, \b, \c > > > > .endm > > > > > > > > -.macro smlsl1 a b c > > > > +.macro smlsl1 a, b, c > > > > smlsl \a, \b, \c > > > > .endm > > > > > > > > -.macro idct_col4_top y1 y2 y3 y4 i l > > > > +.macro idct_col4_top y1, y2, y3, y4, i, l > > > > smull\i v7.4S, \y3\().\l, z2 > > > > smull\i v16.4S, \y3\().\l, z6 > > > > smull\i v17.4S, \y2\().\l, z1 > > > > @@ -91,7 +91,7 @@ endconst > > > > smlsl\i v6.4S, \y4\().\l, z5 > > > > .endm > > > > > > > > -.macro idct_row4_neon y1 y2 y3 y4 pass > > > > +.macro idct_row4_neon y1, y2, y3, y4, pass > > > > ld1 {\y1\().2D-\y2\().2D}, [x2], #32 > > > > movi v23.4S, #1<<2, lsl #8 > > > > orr v5.16B, \y1\().16B, \y2\().16B > > > > @@ -153,7 +153,7 @@ endconst > > > > trn2 \y4\().4S, v17.4S, v19.4S > > > > .endm > > > > > > > > -.macro declare_idct_col4_neon i l > > > > +.macro declare_idct_col4_neon i, l > > > > function idct_col4_neon\i > > > > dup v23.4H, z4c > > > > 
.if \i == 1 > > > > > > Sounds sane, but shouldn't we be doing this for all instances of > > > multiple arguments macros without commas? > > > > > > > > > Sure, I may have missed some. I will work again on this patch on Tuesday > > > as I will have access to an apple machine (and hopefully fix the build > > > without gas-preprocessor). > > > > > > Sorry for the delay, > > > Matthieu > > > > > > > > Updated patch attached: > > * add missing commas to separate macro arguments > > * passes .4H/.8H as macro arguments instead of 4H/8H (the latter form > > being interpreted as a hexadecimal value, i.e. 4/8). > > > From e27ac0f3a8b6436a7530ee5c5c514bfdfac4a558 Mon Sep 17 00:00:00 2001 > > From: Matthieu Bouron <matthieu.bouron@gmail.com> > > Date: Fri, 28 Apr 2017 21:58:55 +0200 > > Subject: [PATCH] lavc/aarch64/simple_idct: fix iOS build without > > gas-preprocessor > > MIME-Version: 1.0 > > Content-Type: text/plain; charset=UTF-8 > > Content-Transfer-Encoding: 8bit > > > > Separates macro arguments with commas and passes .4H/.8H as macro > > arguments instead of 4H/8H (the later form being interpreted as an > > hexadecimal value). > > > > Fixes ticket #6324.
> > > > Suggested-by: Martin Storsjö <martin@martin.st> > > --- > > libavcodec/aarch64/simple_idct_neon.S | 74 +++++++++++++++++------------------ > > 1 file changed, 37 insertions(+), 37 deletions(-) > > > > diff --git a/libavcodec/aarch64/simple_idct_neon.S b/libavcodec/aarch64/simple_idct_neon.S > > index 52273420f9..92987985d2 100644 > > --- a/libavcodec/aarch64/simple_idct_neon.S > > +++ b/libavcodec/aarch64/simple_idct_neon.S > > @@ -61,37 +61,37 @@ endconst > > br x10 > > .endm > > > > -.macro smull1 a b c > > +.macro smull1 a, b, c > > smull \a, \b, \c > > .endm > > > > -.macro smlal1 a b c > > +.macro smlal1 a, b, c > > smlal \a, \b, \c > > .endm > > > > -.macro smlsl1 a b c > > +.macro smlsl1 a, b, c > > smlsl \a, \b, \c > > .endm > > > > -.macro idct_col4_top y1 y2 y3 y4 i l > > - smull\i v7.4S, \y3\().\l, z2 > > - smull\i v16.4S, \y3\().\l, z6 > > - smull\i v17.4S, \y2\().\l, z1 > > +.macro idct_col4_top y1, y2, y3, y4, i, l > > + smull\i v7.4S, \y3\l, z2 > > + smull\i v16.4S, \y3\l, z6 > > + smull\i v17.4S, \y2\l, z1 > > add v19.4S, v23.4S, v7.4S > > - smull\i v18.4S, \y2\().\l, z3 > > + smull\i v18.4S, \y2\l, z3 > > add v20.4S, v23.4S, v16.4S > > - smull\i v5.4S, \y2\().\l, z5 > > + smull\i v5.4S, \y2\l, z5 > > sub v21.4S, v23.4S, v16.4S > > - smull\i v6.4S, \y2\().\l, z7 > > + smull\i v6.4S, \y2\l, z7 > > sub v22.4S, v23.4S, v7.4S > > > > - smlal\i v17.4S, \y4\().\l, z3 > > - smlsl\i v18.4S, \y4\().\l, z7 > > - smlsl\i v5.4S, \y4\().\l, z1 > > - smlsl\i v6.4S, \y4\().\l, z5 > > + smlal\i v17.4S, \y4\l, z3 > > + smlsl\i v18.4S, \y4\l, z7 > > + smlsl\i v5.4S, \y4\l, z1 > > + smlsl\i v6.4S, \y4\l, z5 > > .endm > > > > -.macro idct_row4_neon y1 y2 y3 y4 pass > > +.macro idct_row4_neon y1, y2, y3, y4, pass > > ld1 {\y1\().2D-\y2\().2D}, [x2], #32 > > movi v23.4S, #1<<2, lsl #8 > > orr v5.16B, \y1\().16B, \y2\().16B > > @@ -101,7 +101,7 @@ endconst > > mov x3, v5.D[1] > > smlal v23.4S, \y1\().4H, z4 > > > > - idct_col4_top \y1 \y2 \y3 \y4 1 4H > > +
idct_col4_top \y1, \y2, \y3, \y4, 1, .4H > > > > cmp x3, #0 > > beq \pass\()f > > @@ -153,7 +153,7 @@ endconst > > trn2 \y4\().4S, v17.4S, v19.4S > > .endm > > > > -.macro declare_idct_col4_neon i l > > +.macro declare_idct_col4_neon i, l > > function idct_col4_neon\i > > dup v23.4H, z4c > > .if \i == 1 > > @@ -164,14 +164,14 @@ function idct_col4_neon\i > > .endif > > smull v23.4S, v23.4H, z4 > > > > - idct_col4_top v24 v25 v26 v27 \i \l > > + idct_col4_top v24, v25, v26, v27, \i, \l > > > > mov x4, v28.D[\i - 1] > > mov x5, v29.D[\i - 1] > > cmp x4, #0 > > beq 1f > > > > - smull\i v7.4S, v28.\l, z4 > > + smull\i v7.4S, v28\l, z4 > > add v19.4S, v19.4S, v7.4S > > sub v20.4S, v20.4S, v7.4S > > sub v21.4S, v21.4S, v7.4S > > @@ -181,17 +181,17 @@ function idct_col4_neon\i > > cmp x5, #0 > > beq 2f > > > > - smlal\i v17.4S, v29.\l, z5 > > - smlsl\i v18.4S, v29.\l, z1 > > - smlal\i v5.4S, v29.\l, z7 > > - smlal\i v6.4S, v29.\l, z3 > > + smlal\i v17.4S, v29\l, z5 > > + smlsl\i v18.4S, v29\l, z1 > > + smlal\i v5.4S, v29\l, z7 > > + smlal\i v6.4S, v29\l, z3 > > > > 2: mov x5, v31.D[\i - 1] > > cmp x4, #0 > > beq 3f > > > > - smull\i v7.4S, v30.\l, z6 > > - smull\i v16.4S, v30.\l, z2 > > + smull\i v7.4S, v30\l, z6 > > + smull\i v16.4S, v30\l, z2 > > add v19.4S, v19.4S, v7.4S > > sub v22.4S, v22.4S, v7.4S > > sub v20.4S, v20.4S, v16.4S > > @@ -200,10 +200,10 @@ function idct_col4_neon\i > > 3: cmp x5, #0 > > beq 4f > > > > - smlal\i v17.4S, v31.\l, z7 > > - smlsl\i v18.4S, v31.\l, z5 > > - smlal\i v5.4S, v31.\l, z3 > > - smlsl\i v6.4S, v31.\l, z1 > > + smlal\i v17.4S, v31\l, z7 > > + smlsl\i v18.4S, v31\l, z5 > > + smlal\i v5.4S, v31\l, z3 > > + smlsl\i v6.4S, v31\l, z1 > > > > 4: addhn v7.4H, v19.4S, v17.4S > > addhn2 v7.8H, v20.4S, v18.4S > > @@ -219,14 +219,14 @@ function idct_col4_neon\i > > endfunc > > .endm > > > > -declare_idct_col4_neon 1 4H > > -declare_idct_col4_neon 2 8H > > +declare_idct_col4_neon 1, .4H > > +declare_idct_col4_neon 2, .8H > > > > function 
ff_simple_idct_put_neon, export=1 > > idct_start x2 > > > > - idct_row4_neon v24 v25 v26 v27 1 > > - idct_row4_neon v28 v29 v30 v31 2 > > + idct_row4_neon v24, v25, v26, v27, 1 > > + idct_row4_neon v28, v29, v30, v31, 2 > > bl idct_col4_neon1 > > > > sqshrun v1.8B, v7.8H, #COL_SHIFT-16 > > @@ -263,8 +263,8 @@ endfunc > > function ff_simple_idct_add_neon, export=1 > > idct_start x2 > > > > - idct_row4_neon v24 v25 v26 v27 1 > > - idct_row4_neon v28 v29 v30 v31 2 > > + idct_row4_neon v24, v25, v26, v27, 1 > > + idct_row4_neon v28, v29, v30, v31, 2 > > bl idct_col4_neon1 > > > > sshr v1.8H, V7.8H, #COL_SHIFT-16 > > @@ -328,8 +328,8 @@ function ff_simple_idct_neon, export=1 > > idct_start x0 > > > > mov x2, x0 > > - idct_row4_neon v24 v25 v26 v27 1 > > - idct_row4_neon v28 v29 v30 v31 2 > > + idct_row4_neon v24, v25, v26, v27, 1 > > + idct_row4_neon v28, v29, v30, v31, 2 > > add x2, x2, #-128 > > bl idct_col4_neon1 > > > > -- > > 2.12.0 > > > > If there is no objection, I will push the patch tomorrow. Patch applied.
From e27ac0f3a8b6436a7530ee5c5c514bfdfac4a558 Mon Sep 17 00:00:00 2001 From: Matthieu Bouron <matthieu.bouron@gmail.com> Date: Fri, 28 Apr 2017 21:58:55 +0200 Subject: [PATCH] lavc/aarch64/simple_idct: fix iOS build without gas-preprocessor MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Separates macro arguments with commas and passes .4H/.8H as macro arguments instead of 4H/8H (the later form being interpreted as an hexadecimal value). Fixes ticket #6324. Suggested-by: Martin Storsjö <martin@martin.st> --- libavcodec/aarch64/simple_idct_neon.S | 74 +++++++++++++++++------------------ 1 file changed, 37 insertions(+), 37 deletions(-) diff --git a/libavcodec/aarch64/simple_idct_neon.S b/libavcodec/aarch64/simple_idct_neon.S index 52273420f9..92987985d2 100644 --- a/libavcodec/aarch64/simple_idct_neon.S +++ b/libavcodec/aarch64/simple_idct_neon.S @@ -61,37 +61,37 @@ endconst br x10 .endm -.macro smull1 a b c +.macro smull1 a, b, c smull \a, \b, \c .endm -.macro smlal1 a b c +.macro smlal1 a, b, c smlal \a, \b, \c .endm -.macro smlsl1 a b c +.macro smlsl1 a, b, c smlsl \a, \b, \c .endm -.macro idct_col4_top y1 y2 y3 y4 i l - smull\i v7.4S, \y3\().\l, z2 - smull\i v16.4S, \y3\().\l, z6 - smull\i v17.4S, \y2\().\l, z1 +.macro idct_col4_top y1, y2, y3, y4, i, l + smull\i v7.4S, \y3\l, z2 + smull\i v16.4S, \y3\l, z6 + smull\i v17.4S, \y2\l, z1 add v19.4S, v23.4S, v7.4S - smull\i v18.4S, \y2\().\l, z3 + smull\i v18.4S, \y2\l, z3 add v20.4S, v23.4S, v16.4S - smull\i v5.4S, \y2\().\l, z5 + smull\i v5.4S, \y2\l, z5 sub v21.4S, v23.4S, v16.4S - smull\i v6.4S, \y2\().\l, z7 + smull\i v6.4S, \y2\l, z7 sub v22.4S, v23.4S, v7.4S - smlal\i v17.4S, \y4\().\l, z3 - smlsl\i v18.4S, \y4\().\l, z7 - smlsl\i v5.4S, \y4\().\l, z1 - smlsl\i v6.4S, \y4\().\l, z5 + smlal\i v17.4S, \y4\l, z3 + smlsl\i v18.4S, \y4\l, z7 + smlsl\i v5.4S, \y4\l, z1 + smlsl\i v6.4S, \y4\l, z5 .endm -.macro idct_row4_neon y1 y2 y3 y4 pass +.macro idct_row4_neon y1, y2,
y3, y4, pass ld1 {\y1\().2D-\y2\().2D}, [x2], #32 movi v23.4S, #1<<2, lsl #8 orr v5.16B, \y1\().16B, \y2\().16B @@ -101,7 +101,7 @@ endconst mov x3, v5.D[1] smlal v23.4S, \y1\().4H, z4 - idct_col4_top \y1 \y2 \y3 \y4 1 4H + idct_col4_top \y1, \y2, \y3, \y4, 1, .4H cmp x3, #0 beq \pass\()f @@ -153,7 +153,7 @@ endconst trn2 \y4\().4S, v17.4S, v19.4S .endm -.macro declare_idct_col4_neon i l +.macro declare_idct_col4_neon i, l function idct_col4_neon\i dup v23.4H, z4c .if \i == 1 @@ -164,14 +164,14 @@ function idct_col4_neon\i .endif smull v23.4S, v23.4H, z4 - idct_col4_top v24 v25 v26 v27 \i \l + idct_col4_top v24, v25, v26, v27, \i, \l mov x4, v28.D[\i - 1] mov x5, v29.D[\i - 1] cmp x4, #0 beq 1f - smull\i v7.4S, v28.\l, z4 + smull\i v7.4S, v28\l, z4 add v19.4S, v19.4S, v7.4S sub v20.4S, v20.4S, v7.4S sub v21.4S, v21.4S, v7.4S @@ -181,17 +181,17 @@ function idct_col4_neon\i cmp x5, #0 beq 2f - smlal\i v17.4S, v29.\l, z5 - smlsl\i v18.4S, v29.\l, z1 - smlal\i v5.4S, v29.\l, z7 - smlal\i v6.4S, v29.\l, z3 + smlal\i v17.4S, v29\l, z5 + smlsl\i v18.4S, v29\l, z1 + smlal\i v5.4S, v29\l, z7 + smlal\i v6.4S, v29\l, z3 2: mov x5, v31.D[\i - 1] cmp x4, #0 beq 3f - smull\i v7.4S, v30.\l, z6 - smull\i v16.4S, v30.\l, z2 + smull\i v7.4S, v30\l, z6 + smull\i v16.4S, v30\l, z2 add v19.4S, v19.4S, v7.4S sub v22.4S, v22.4S, v7.4S sub v20.4S, v20.4S, v16.4S @@ -200,10 +200,10 @@ function idct_col4_neon\i 3: cmp x5, #0 beq 4f - smlal\i v17.4S, v31.\l, z7 - smlsl\i v18.4S, v31.\l, z5 - smlal\i v5.4S, v31.\l, z3 - smlsl\i v6.4S, v31.\l, z1 + smlal\i v17.4S, v31\l, z7 + smlsl\i v18.4S, v31\l, z5 + smlal\i v5.4S, v31\l, z3 + smlsl\i v6.4S, v31\l, z1 4: addhn v7.4H, v19.4S, v17.4S addhn2 v7.8H, v20.4S, v18.4S @@ -219,14 +219,14 @@ function idct_col4_neon\i endfunc .endm -declare_idct_col4_neon 1 4H -declare_idct_col4_neon 2 8H +declare_idct_col4_neon 1, .4H +declare_idct_col4_neon 2, .8H function ff_simple_idct_put_neon, export=1 idct_start x2 - idct_row4_neon v24 v25 v26 v27 1 -
idct_row4_neon v28 v29 v30 v31 2 + idct_row4_neon v24, v25, v26, v27, 1 + idct_row4_neon v28, v29, v30, v31, 2 bl idct_col4_neon1 sqshrun v1.8B, v7.8H, #COL_SHIFT-16 @@ -263,8 +263,8 @@ endfunc function ff_simple_idct_add_neon, export=1 idct_start x2 - idct_row4_neon v24 v25 v26 v27 1 - idct_row4_neon v28 v29 v30 v31 2 + idct_row4_neon v24, v25, v26, v27, 1 + idct_row4_neon v28, v29, v30, v31, 2 bl idct_col4_neon1 sshr v1.8H, V7.8H, #COL_SHIFT-16 @@ -328,8 +328,8 @@ function ff_simple_idct_neon, export=1 idct_start x0 mov x2, x0 - idct_row4_neon v24 v25 v26 v27 1 - idct_row4_neon v28 v29 v30 v31 2 + idct_row4_neon v24, v25, v26, v27, 1 + idct_row4_neon v28, v29, v30, v31, 2 add x2, x2, #-128 bl idct_col4_neon1 -- 2.12.0