[FFmpeg-devel,1/6] initial alignment corrections for xmm registers

Submitted by James Darnley on June 3, 2017, 12:18 a.m.

Details

Message ID 20170603001809.13960-2-jdarnley@obe.tv
State New
Headers show

Commit Message

James Darnley June 3, 2017, 12:18 a.m.
---
 libavcodec/x86/simple_idct.asm | 47 ++++++++++++++++++++++++++++++------------
 1 file changed, 34 insertions(+), 13 deletions(-)

Comments

Michael Niedermayer June 3, 2017, 5:11 p.m.
On Sat, Jun 03, 2017 at 02:18:04AM +0200, James Darnley wrote:
> ---
>  libavcodec/x86/simple_idct.asm | 47 ++++++++++++++++++++++++++++++------------
>  1 file changed, 34 insertions(+), 13 deletions(-)

should be ok

minor cosmetic misalignment below:

[...]

> @@ -582,7 +603,7 @@ SECTION .text
>      pmaddwd         mm2, mm7            ; -C4R4+C4R0     -C4r4+C4r0
>      movq            mm7, [coeffs + 32]  ; C6     C2      C6      C2
>      pmaddwd         mm7, mm3            ; C6R6+C2R2      C6r6+C2r2
> -    pmaddwd         mm3, [coeffs + 40]  ; -C2R6+C6R2     -C2r6+C6r2
> +    pmaddwd         mm3, [coeffs + 128] ; -C2R6+C6R2     -C2r6+C6r2
>      paddd           mm7, mm1            ; A0             a0
>      paddd           mm1, mm1            ; 2C0            2c0
>      psubd           mm1, mm7            ; A3             a3
> -- 
> 2.12.2
> 
> _______________________________________________
> ffmpeg-devel mailing list
> ffmpeg-devel@ffmpeg.org
> http://ffmpeg.org/mailman/listinfo/ffmpeg-devel

Patch hide | download patch | download mbox

diff --git a/libavcodec/x86/simple_idct.asm b/libavcodec/x86/simple_idct.asm
index 6fedbb5784..b5d05ca653 100644
--- a/libavcodec/x86/simple_idct.asm
+++ b/libavcodec/x86/simple_idct.asm
@@ -29,8 +29,8 @@  SECTION_RODATA
 
 cextern pb_80
 
-wm1010: dw 0, 0xffff, 0, 0xffff
 d40000: dd 4 << 16, 0
+wm1010: dw 0, 0xffff, 0, 0xffff
 
 ; 23170.475006
 ; 22725.260826
@@ -53,30 +53,51 @@  d40000: dd 4 << 16, 0
 %define ROW_SHIFT 11
 %define COL_SHIFT 20 ; 6
 
-coeffs:
+rounding:
+    dw 1 << (ROW_SHIFT - 1), 0
     dw 1 << (ROW_SHIFT - 1), 0
     dw 1 << (ROW_SHIFT - 1), 0
+    dw 1 << (ROW_SHIFT - 1), 0
+
+coeffs:
+    dw 1 << (ROW_SHIFT - 1), 1
+    dw 1 << (ROW_SHIFT - 1), 0
     dw 1 << (ROW_SHIFT - 1), 1
     dw 1 << (ROW_SHIFT - 1), 0
 
+    ; coeffs + 16
     dw C4,  C4,  C4,  C4
     dw C4, -C4,  C4, -C4
 
+    ; coeffs + 32
     dw C2,  C6,  C2,  C6
     dw C6, -C2,  C6, -C2
 
+    ; coeffs + 48
     dw C1,  C3,  C1,  C3
     dw C5,  C7,  C5,  C7
 
+    ; coeffs + 64
     dw C3, -C7,  C3, -C7
     dw -C1, -C5, -C1, -C5
 
+    ; coeffs + 80
     dw C5, -C1,  C5, -C1
     dw C7,  C3,  C7,  C3
 
+    ; coeffs + 96
     dw C7, -C5,  C7, -C5
     dw C3, -C1,  C3, -C1
 
+    ; for alignment
+    ; coeffs + 112
+    dw C3, -C1,  C3, -C1
+    times 8 db 0
+
+    ; coeffs + 128
+    dw C6, -C2,  C6, -C2
+
+
 SECTION .text
 
 %macro DC_COND_IDCT 7
@@ -103,13 +124,13 @@  SECTION .text
     pmaddwd         mm1, mm6            ; -C2R6+C6R2     -C2r6+C6r2
     movq            mm7, [coeffs + 48]  ; C3     C1      C3      C1
     pmaddwd         mm7, mm2            ; C3R3+C1R1      C3r3+C1r1
-    paddd           mm4, [coeffs + 8]
+    paddd           mm4, [rounding + 16]
     movq            mm6, mm4            ; C4R4+C4R0      C4r4+C4r0
     paddd           mm4, mm5            ; A0             a0
     psubd           mm6, mm5            ; A3             a3
     movq            mm5, [coeffs + 56]  ; C7     C5      C7      C5
     pmaddwd         mm5, mm3            ; C7R7+C5R5      C7r7+C5r5
-    paddd           mm0, [coeffs + 8]
+    paddd           mm0, [rounding + 16]
     paddd           mm1, mm0            ; A1             a1
     paddd           mm0, mm0
     psubd           mm0, mm1            ; A2             a2
@@ -139,7 +160,7 @@  SECTION .text
     pmaddwd         mm1, [coeffs + 96]  ; -C5R3+C7R1     -C5r3+C7r1
     pmaddwd         mm7, mm3            ; C3R7+C7R5      C3r7+C7r5
     movq            mm2, mm0            ; A2             a2
-    pmaddwd         mm3, [coeffs + 104] ; -C1R7+C3R5     -C1r7+C3r5
+    pmaddwd         mm3, [coeffs + 112] ; -C1R7+C3R5     -C1r7+C3r5
     paddd           mm4, mm7            ; B2             b2
     paddd           mm2, mm4            ; A2+B2          a2+b2
     psubd           mm0, mm4            ; a2-B2          a2-b2
@@ -191,13 +212,13 @@  SECTION .text
     pmaddwd         mm1, mm6            ; -C2R6+C6R2     -C2r6+C6r2
     movq            mm7, [coeffs + 48]  ; C3     C1      C3      C1
     pmaddwd         mm7, mm2            ; C3R3+C1R1      C3r3+C1r1
-    paddd           mm4, [coeffs]
+    paddd           mm4, [rounding]
     movq            mm6, mm4            ; C4R4+C4R0      C4r4+C4r0
     paddd           mm4, mm5            ; A0             a0
     psubd           mm6, mm5            ; A3             a3
     movq            mm5, [coeffs + 56]  ; C7     C5      C7      C5
     pmaddwd         mm5, mm3            ; C7R7+C5R5      C7r7+C5r5
-    paddd           mm0, [coeffs]
+    paddd           mm0, [rounding]
     paddd           mm1, mm0            ; A1             a1
     paddd           mm0, mm0
     psubd           mm0, mm1            ; A2             a2
@@ -227,7 +248,7 @@  SECTION .text
     pmaddwd         mm1, [coeffs + 96]  ; -C5R3+C7R1     -C5r3+C7r1
     pmaddwd         mm7, mm3            ; C3R7+C7R5      C3r7+C7r5
     movq            mm2, mm0            ; A2             a2
-    pmaddwd         mm3, [coeffs + 104] ; -C1R7+C3R5     -C1r7+C3r5
+    pmaddwd         mm3, [coeffs + 112] ; -C1R7+C3R5     -C1r7+C3r5
     paddd           mm4, mm7            ; B2             b2
     paddd           mm2, mm4            ; A2+B2          a2+b2
     psubd           mm0, mm4            ; a2-B2          a2-b2
@@ -298,7 +319,7 @@  SECTION .text
     pmaddwd         mm0, [coeffs + 96]  ; -C5R3+C7R1     -C5r3+C7r1
     pmaddwd         mm7, mm3            ; C3R7+C7R5      C3r7+C7r5
     movq            mm2, mm5            ; A2             a2
-    pmaddwd         mm3, [coeffs + 104] ; -C1R7+C3R5     -C1r7+C3r5
+    pmaddwd         mm3, [coeffs + 112] ; -C1R7+C3R5     -C1r7+C3r5
     paddd           mm4, mm7            ; B2             b2
     paddd           mm2, mm4            ; A2+B2          a2+b2
     psubd           mm5, mm4            ; a2-B2          a2-b2
@@ -363,7 +384,7 @@  SECTION .text
     movq            mm1, [coeffs + 88]  ; C3     C7      C3      C7
     pmaddwd         mm1, mm3            ; C3R7+C7R5      C3r7+C7r5
     movq            mm2, mm5            ; A2             a2
-    pmaddwd         mm3, [coeffs + 104] ; -C1R7+C3R5     -C1r7+C3r5
+    pmaddwd         mm3, [coeffs + 112] ; -C1R7+C3R5     -C1r7+C3r5
     paddd           mm2, mm1            ; A2+B2          a2+b2
     psubd           mm5, mm1            ; a2-B2          a2-b2
     psrad           mm2, %6
@@ -417,7 +438,7 @@  SECTION .text
     movq            mm1, [coeffs + 88]  ; C3     C7      C3      C7
     pmaddwd         mm1, mm3            ; C3R7+C7R5      C3r7+C7r5
     movq            mm2, mm5            ; A2             a2
-    pmaddwd         mm3, [coeffs + 104] ; -C1R7+C3R5     -C1r7+C3r5
+    pmaddwd         mm3, [coeffs + 112] ; -C1R7+C3R5     -C1r7+C3r5
     paddd           mm2, mm1            ; A2+B2          a2+b2
     psubd           mm5, mm1            ; a2-B2          a2-b2
     psrad           mm2, %6
@@ -481,7 +502,7 @@  SECTION .text
     pmaddwd         mm0, [coeffs + 96]  ; -C5R3+C7R1     -C5r3+C7r1
     pmaddwd         mm7, mm3            ; C3R7+C7R5      C3r7+C7r5
     movq            mm2, mm5            ; A2             a2
-    pmaddwd         mm3, [coeffs + 104] ; -C1R7+C3R5     -C1r7+C3r5
+    pmaddwd         mm3, [coeffs + 112] ; -C1R7+C3R5     -C1r7+C3r5
     paddd           mm4, mm7            ; B2             b2
     paddd           mm2, mm4            ; A2+B2          a2+b2
     psubd           mm5, mm4            ; a2-B2          a2-b2
@@ -582,7 +603,7 @@  SECTION .text
     pmaddwd         mm2, mm7            ; -C4R4+C4R0     -C4r4+C4r0
     movq            mm7, [coeffs + 32]  ; C6     C2      C6      C2
     pmaddwd         mm7, mm3            ; C6R6+C2R2      C6r6+C2r2
-    pmaddwd         mm3, [coeffs + 40]  ; -C2R6+C6R2     -C2r6+C6r2
+    pmaddwd         mm3, [coeffs + 128] ; -C2R6+C6R2     -C2r6+C6r2
     paddd           mm7, mm1            ; A0             a0
     paddd           mm1, mm1            ; 2C0            2c0
     psubd           mm1, mm7            ; A3             a3