Message ID | 20170614113912.23045-1-matthieu.bouron@gmail.com |
---|---|
State | Accepted |
Commit | 204008354f7f18fa9e64a6d487f65495f1cc9885 |
Headers | show |
On Wed, Jun 14, 2017 at 01:39:12PM +0200, Matthieu Bouron wrote: > --- > libavcodec/aarch64/simple_idct_neon.S | 28 ++++++++++++++-------------- > 1 file changed, 14 insertions(+), 14 deletions(-) > > diff --git a/libavcodec/aarch64/simple_idct_neon.S b/libavcodec/aarch64/simple_idct_neon.S > index 5bd31e5be9..5e4d021a97 100644 > --- a/libavcodec/aarch64/simple_idct_neon.S > +++ b/libavcodec/aarch64/simple_idct_neon.S > @@ -92,10 +92,10 @@ endconst > .endm > > .macro idct_row4_neon y1, y2, y3, y4, pass > - ld1 {\y1\().2D-\y2\().2D}, [x2], #32 > + ld1 {\y1\().2D,\y2\().2D}, [x2], #32 > movi v23.4S, #1<<2, lsl #8 > orr v5.16B, \y1\().16B, \y2\().16B > - ld1 {\y3\().2D, \y4\().2D}, [x2], #32 > + ld1 {\y3\().2D,\y4\().2D}, [x2], #32 > orr v6.16B, \y3\().16B, \y4\().16B > orr v5.16B, v5.16B, v6.16B > mov x3, v5.D[1] > @@ -104,7 +104,7 @@ endconst > idct_col4_top \y1, \y2, \y3, \y4, 1, .4H > > cmp x3, #0 > - beq \pass\()f > + b.eq \pass\()f > > smull2 v7.4S, \y1\().8H, z4 > smlal2 v17.4S, \y2\().8H, z5 > @@ -169,7 +169,7 @@ function idct_col4_neon\i > mov x4, v28.D[\i - 1] > mov x5, v29.D[\i - 1] > cmp x4, #0 > - beq 1f > + b.eq 1f > > smull\i v7.4S, v28\l, z4 > add v19.4S, v19.4S, v7.4S > @@ -179,7 +179,7 @@ function idct_col4_neon\i > > 1: mov x4, v30.D[\i - 1] > cmp x5, #0 > - beq 2f > + b.eq 2f > > smlal\i v17.4S, v29\l, z5 > smlsl\i v18.4S, v29\l, z1 > @@ -188,7 +188,7 @@ function idct_col4_neon\i > > 2: mov x5, v31.D[\i - 1] > cmp x4, #0 > - beq 3f > + b.eq 3f > > smull\i v7.4S, v30\l, z6 > smull\i v16.4S, v30\l, z2 > @@ -198,7 +198,7 @@ function idct_col4_neon\i > add v21.4S, v21.4S, v16.4S > > 3: cmp x5, #0 > - beq 4f > + b.eq 4f > > smlal\i v17.4S, v31\l, z7 > smlsl\i v18.4S, v31\l, z5 > @@ -267,14 +267,14 @@ function ff_simple_idct_add_neon, export=1 > idct_row4_neon v28, v29, v30, v31, 2 > bl idct_col4_neon1 > > - sshr v1.8H, V7.8H, #COL_SHIFT-16 > + sshr v1.8H, v7.8H, #COL_SHIFT-16 > sshr v2.8H, v16.8H, #COL_SHIFT-16 > sshr v3.8H, v17.8H, #COL_SHIFT-16 > sshr v4.8H, v18.8H, #COL_SHIFT-16 > > bl idct_col4_neon2 > > - sshr v7.8H, V7.8H, #COL_SHIFT-16 > + sshr v7.8H, v7.8H, #COL_SHIFT-16 > sshr v16.8H, v16.8H, #COL_SHIFT-16 > sshr v17.8H, v17.8H, #COL_SHIFT-16 > sshr v18.8H, v18.8H, #COL_SHIFT-16 > @@ -330,7 +330,7 @@ function ff_simple_idct_neon, export=1 > mov x2, x0 > idct_row4_neon v24, v25, v26, v27, 1 > idct_row4_neon v28, v29, v30, v31, 2 > - add x2, x2, #-128 > + sub x2, x2, #128 > bl idct_col4_neon1 > > sshr v1.8H, v7.8H, #COL_SHIFT-16 > @@ -347,16 +347,16 @@ function ff_simple_idct_neon, export=1 > > zip1 v23.2D, v1.2D, v7.2D > zip2 v24.2D, v1.2D, v7.2D > - st1 {v23.2D,V24.2D}, [x2], #32 > + st1 {v23.2D,v24.2D}, [x2], #32 > zip1 v25.2D, v2.2D, v16.2D > zip2 v26.2D, v2.2D, v16.2D > - st1 {v25.2D,V26.2D}, [x2], #32 > + st1 {v25.2D,v26.2D}, [x2], #32 > zip1 v27.2D, v3.2D, v17.2D > zip2 v28.2D, v3.2D, v17.2D > - st1 {v27.2D,V28.2D}, [x2], #32 > + st1 {v27.2D,v28.2D}, [x2], #32 > zip1 v29.2D, v4.2D, v18.2D > zip2 v30.2D, v4.2D, v18.2D > - st1 {v29.2D,V30.2D}, [x2], #32 > + st1 {v29.2D,v30.2D}, [x2], #32 > > idct_end > endfunc > -- > 2.13.1 > I will push the patch in a few hours in there is no objection.
On Wed, Jun 14, 2017 at 06:17:54PM +0200, Matthieu Bouron wrote: > On Wed, Jun 14, 2017 at 01:39:12PM +0200, Matthieu Bouron wrote: > > --- > > libavcodec/aarch64/simple_idct_neon.S | 28 ++++++++++++++-------------- > > 1 file changed, 14 insertions(+), 14 deletions(-) > > > > diff --git a/libavcodec/aarch64/simple_idct_neon.S b/libavcodec/aarch64/simple_idct_neon.S > > index 5bd31e5be9..5e4d021a97 100644 > > --- a/libavcodec/aarch64/simple_idct_neon.S > > +++ b/libavcodec/aarch64/simple_idct_neon.S > > @@ -92,10 +92,10 @@ endconst > > .endm > > > > .macro idct_row4_neon y1, y2, y3, y4, pass > > - ld1 {\y1\().2D-\y2\().2D}, [x2], #32 > > + ld1 {\y1\().2D,\y2\().2D}, [x2], #32 > > movi v23.4S, #1<<2, lsl #8 > > orr v5.16B, \y1\().16B, \y2\().16B > > - ld1 {\y3\().2D, \y4\().2D}, [x2], #32 > > + ld1 {\y3\().2D,\y4\().2D}, [x2], #32 > > orr v6.16B, \y3\().16B, \y4\().16B > > orr v5.16B, v5.16B, v6.16B > > mov x3, v5.D[1] > > @@ -104,7 +104,7 @@ endconst > > idct_col4_top \y1, \y2, \y3, \y4, 1, .4H > > > > cmp x3, #0 > > - beq \pass\()f > > + b.eq \pass\()f > > > > smull2 v7.4S, \y1\().8H, z4 > > smlal2 v17.4S, \y2\().8H, z5 > > @@ -169,7 +169,7 @@ function idct_col4_neon\i > > mov x4, v28.D[\i - 1] > > mov x5, v29.D[\i - 1] > > cmp x4, #0 > > - beq 1f > > + b.eq 1f > > > > smull\i v7.4S, v28\l, z4 > > add v19.4S, v19.4S, v7.4S > > @@ -179,7 +179,7 @@ function idct_col4_neon\i > > > > 1: mov x4, v30.D[\i - 1] > > cmp x5, #0 > > - beq 2f > > + b.eq 2f > > > > smlal\i v17.4S, v29\l, z5 > > smlsl\i v18.4S, v29\l, z1 > > @@ -188,7 +188,7 @@ function idct_col4_neon\i > > > > 2: mov x5, v31.D[\i - 1] > > cmp x4, #0 > > - beq 3f > > + b.eq 3f > > > > smull\i v7.4S, v30\l, z6 > > smull\i v16.4S, v30\l, z2 > > @@ -198,7 +198,7 @@ function idct_col4_neon\i > > add v21.4S, v21.4S, v16.4S > > > > 3: cmp x5, #0 > > - beq 4f > > + b.eq 4f > > > > smlal\i v17.4S, v31\l, z7 > > smlsl\i v18.4S, v31\l, z5 > > @@ -267,14 +267,14 @@ function ff_simple_idct_add_neon, export=1 > > idct_row4_neon v28, v29, v30, v31, 2 > > bl idct_col4_neon1 > > > > - sshr v1.8H, V7.8H, #COL_SHIFT-16 > > + sshr v1.8H, v7.8H, #COL_SHIFT-16 > > sshr v2.8H, v16.8H, #COL_SHIFT-16 > > sshr v3.8H, v17.8H, #COL_SHIFT-16 > > sshr v4.8H, v18.8H, #COL_SHIFT-16 > > > > bl idct_col4_neon2 > > > > - sshr v7.8H, V7.8H, #COL_SHIFT-16 > > + sshr v7.8H, v7.8H, #COL_SHIFT-16 > > sshr v16.8H, v16.8H, #COL_SHIFT-16 > > sshr v17.8H, v17.8H, #COL_SHIFT-16 > > sshr v18.8H, v18.8H, #COL_SHIFT-16 > > @@ -330,7 +330,7 @@ function ff_simple_idct_neon, export=1 > > mov x2, x0 > > idct_row4_neon v24, v25, v26, v27, 1 > > idct_row4_neon v28, v29, v30, v31, 2 > > - add x2, x2, #-128 > > + sub x2, x2, #128 > > bl idct_col4_neon1 > > > > sshr v1.8H, v7.8H, #COL_SHIFT-16 > > @@ -347,16 +347,16 @@ function ff_simple_idct_neon, export=1 > > > > zip1 v23.2D, v1.2D, v7.2D > > zip2 v24.2D, v1.2D, v7.2D > > - st1 {v23.2D,V24.2D}, [x2], #32 > > + st1 {v23.2D,v24.2D}, [x2], #32 > > zip1 v25.2D, v2.2D, v16.2D > > zip2 v26.2D, v2.2D, v16.2D > > - st1 {v25.2D,V26.2D}, [x2], #32 > > + st1 {v25.2D,v26.2D}, [x2], #32 > > zip1 v27.2D, v3.2D, v17.2D > > zip2 v28.2D, v3.2D, v17.2D > > - st1 {v27.2D,V28.2D}, [x2], #32 > > + st1 {v27.2D,v28.2D}, [x2], #32 > > zip1 v29.2D, v4.2D, v18.2D > > zip2 v30.2D, v4.2D, v18.2D > > - st1 {v29.2D,V30.2D}, [x2], #32 > > + st1 {v29.2D,v30.2D}, [x2], #32 > > > > idct_end > > endfunc > > -- > > 2.13.1 > > > > I will push the patch in a few hours in there is no objection. Patch applied.
diff --git a/libavcodec/aarch64/simple_idct_neon.S b/libavcodec/aarch64/simple_idct_neon.S index 5bd31e5be9..5e4d021a97 100644 --- a/libavcodec/aarch64/simple_idct_neon.S +++ b/libavcodec/aarch64/simple_idct_neon.S @@ -92,10 +92,10 @@ endconst .endm .macro idct_row4_neon y1, y2, y3, y4, pass - ld1 {\y1\().2D-\y2\().2D}, [x2], #32 + ld1 {\y1\().2D,\y2\().2D}, [x2], #32 movi v23.4S, #1<<2, lsl #8 orr v5.16B, \y1\().16B, \y2\().16B - ld1 {\y3\().2D, \y4\().2D}, [x2], #32 + ld1 {\y3\().2D,\y4\().2D}, [x2], #32 orr v6.16B, \y3\().16B, \y4\().16B orr v5.16B, v5.16B, v6.16B mov x3, v5.D[1] @@ -104,7 +104,7 @@ endconst idct_col4_top \y1, \y2, \y3, \y4, 1, .4H cmp x3, #0 - beq \pass\()f + b.eq \pass\()f smull2 v7.4S, \y1\().8H, z4 smlal2 v17.4S, \y2\().8H, z5 @@ -169,7 +169,7 @@ function idct_col4_neon\i mov x4, v28.D[\i - 1] mov x5, v29.D[\i - 1] cmp x4, #0 - beq 1f + b.eq 1f smull\i v7.4S, v28\l, z4 add v19.4S, v19.4S, v7.4S @@ -179,7 +179,7 @@ function idct_col4_neon\i 1: mov x4, v30.D[\i - 1] cmp x5, #0 - beq 2f + b.eq 2f smlal\i v17.4S, v29\l, z5 smlsl\i v18.4S, v29\l, z1 @@ -188,7 +188,7 @@ function idct_col4_neon\i 2: mov x5, v31.D[\i - 1] cmp x4, #0 - beq 3f + b.eq 3f smull\i v7.4S, v30\l, z6 smull\i v16.4S, v30\l, z2 @@ -198,7 +198,7 @@ function idct_col4_neon\i add v21.4S, v21.4S, v16.4S 3: cmp x5, #0 - beq 4f + b.eq 4f smlal\i v17.4S, v31\l, z7 smlsl\i v18.4S, v31\l, z5 @@ -267,14 +267,14 @@ function ff_simple_idct_add_neon, export=1 idct_row4_neon v28, v29, v30, v31, 2 bl idct_col4_neon1 - sshr v1.8H, V7.8H, #COL_SHIFT-16 + sshr v1.8H, v7.8H, #COL_SHIFT-16 sshr v2.8H, v16.8H, #COL_SHIFT-16 sshr v3.8H, v17.8H, #COL_SHIFT-16 sshr v4.8H, v18.8H, #COL_SHIFT-16 bl idct_col4_neon2 - sshr v7.8H, V7.8H, #COL_SHIFT-16 + sshr v7.8H, v7.8H, #COL_SHIFT-16 sshr v16.8H, v16.8H, #COL_SHIFT-16 sshr v17.8H, v17.8H, #COL_SHIFT-16 sshr v18.8H, v18.8H, #COL_SHIFT-16 @@ -330,7 +330,7 @@ function ff_simple_idct_neon, export=1 mov x2, x0 idct_row4_neon v24, v25, v26, v27, 1 idct_row4_neon v28, v29, v30, v31, 2 - add x2, x2, #-128 + sub x2, x2, #128 bl idct_col4_neon1 sshr v1.8H, v7.8H, #COL_SHIFT-16 @@ -347,16 +347,16 @@ function ff_simple_idct_neon, export=1 zip1 v23.2D, v1.2D, v7.2D zip2 v24.2D, v1.2D, v7.2D - st1 {v23.2D,V24.2D}, [x2], #32 + st1 {v23.2D,v24.2D}, [x2], #32 zip1 v25.2D, v2.2D, v16.2D zip2 v26.2D, v2.2D, v16.2D - st1 {v25.2D,V26.2D}, [x2], #32 + st1 {v25.2D,v26.2D}, [x2], #32 zip1 v27.2D, v3.2D, v17.2D zip2 v28.2D, v3.2D, v17.2D - st1 {v27.2D,V28.2D}, [x2], #32 + st1 {v27.2D,v28.2D}, [x2], #32 zip1 v29.2D, v4.2D, v18.2D zip2 v30.2D, v4.2D, v18.2D - st1 {v29.2D,V30.2D}, [x2], #32 + st1 {v29.2D,v30.2D}, [x2], #32 idct_end endfunc