[FFmpeg-devel] x86/vvc_alf: use the x86inc instruction macros

Message ID 20240521135256.3111-1-jamrial@gmail.com
State Accepted
Commit 3d1597d3e2ff9a6625af23be63131142a910d403
Series [FFmpeg-devel] x86/vvc_alf: use the x86inc instruction macros

Checks

Context Check Description
yinshiyou/make_loongarch64 success Make finished
yinshiyou/make_fate_loongarch64 success Make fate finished

Commit Message

James Almer May 21, 2024, 1:52 p.m. UTC
Let its magic figure out the correct mnemonic based on target instruction set.

Signed-off-by: James Almer <jamrial@gmail.com>
---
 libavcodec/x86/vvc/vvc_alf.asm | 202 ++++++++++++++++-----------------
 1 file changed, 101 insertions(+), 101 deletions(-)
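
For context, x86inc.asm (shared between FFmpeg and x264) wraps most SIMD mnemonics in macros, so the plain mnemonic written in the source expands to either the legacy-SSE or the VEX-encoded form depending on the active INIT_* mode. A minimal sketch of the mechanism, not taken from this patch (the function name and operation are illustrative):

    %include "libavutil/x86/x86util.asm"   ; pulls in x86inc.asm and its macros

    SECTION .text

    ; add two vectors of 16-bit words; one body, two instantiations
    %macro ADD_WORDS_FN 0
    cglobal add_words, 3, 3, 2, dst, a, b
        movu             m0, [aq]
        movu             m1, [bq]
        paddw            m0, m1      ; assembles as "paddw" under INIT_XMM sse2
                                     ; and as "vpaddw" under INIT_YMM avx2
        movu         [dstq], m0
        RET
    %endmacro

    INIT_XMM sse2
    ADD_WORDS_FN                     ; xmm registers, legacy SSE encodings

    INIT_YMM avx2
    ADD_WORDS_FN                     ; ymm registers, VEX encodings

Hand-written v-prefixed mnemonics bypass this dispatch, which is what the patch below removes.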

Comments

Wu Jianhua May 21, 2024, 4:29 p.m. UTC | #1
> From: ffmpeg-devel <ffmpeg-devel-bounces@ffmpeg.org> on behalf of James Almer <jamrial@gmail.com>
> Sent: May 21, 2024 6:52
> To: ffmpeg-devel@ffmpeg.org
> Subject: [FFmpeg-devel] [PATCH] x86/vvc_alf: use the x86inc instruction macros
> 
> Let its magic figure out the correct mnemonic based on target instruction set.
> 
> Signed-off-by: James Almer <jamrial@gmail.com>
> ---
>  libavcodec/x86/vvc/vvc_alf.asm | 202 ++++++++++++++++-----------------
>  1 file changed, 101 insertions(+), 101 deletions(-)

I tested this patch and LGTM. Thanks for updating them. 

And would it be better to add avcodec to the path of the commit message?

Thanks,
Jianhua
Nuo Mi May 22, 2024, 12:58 p.m. UTC | #2
On Wed, May 22, 2024 at 12:29 AM Wu Jianhua <toqsxw@outlook.com> wrote:

> > From: ffmpeg-devel <ffmpeg-devel-bounces@ffmpeg.org> on behalf of James Almer <jamrial@gmail.com>
> > Sent: May 21, 2024 6:52
> > To: ffmpeg-devel@ffmpeg.org
> > Subject: [FFmpeg-devel] [PATCH] x86/vvc_alf: use the x86inc instruction macros
> >
> > Let its magic figure out the correct mnemonic based on target
> instruction set.
> >
> > Signed-off-by: James Almer <jamrial@gmail.com>
> > ---
> >  libavcodec/x86/vvc/vvc_alf.asm | 202 ++++++++++++++++-----------------
> >  1 file changed, 101 insertions(+), 101 deletions(-)
>
> I tested this patch and LGTM. Thanks for updating them.
> 
> And would it be better to add avcodec to the path of the commit message?
>
Hi Jianhua,
vvc is clear in this case.

Applied,
thank you, Jianhua and James

> Thanks,
> Jianhua
> _______________________________________________
> ffmpeg-devel mailing list
> ffmpeg-devel@ffmpeg.org
> https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
>
> To unsubscribe, visit link above, or email
> ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".
>

Patch

diff --git a/libavcodec/x86/vvc/vvc_alf.asm b/libavcodec/x86/vvc/vvc_alf.asm
index b3d118962f..71e821c27b 100644
--- a/libavcodec/x86/vvc/vvc_alf.asm
+++ b/libavcodec/x86/vvc/vvc_alf.asm
@@ -73,15 +73,15 @@  SECTION .text
     ;m%2 = 07 06 05 04
     ;m%3 = 11 10 09 08
 
-    vshufpd                 m%5, m%1, m%2, 0011b        ;06 02 05 01
-    vshufpd                 m%6, m%3, m%5, 1001b        ;06 10 01 09
+    shufpd                  m%5, m%1, m%2, 0011b        ;06 02 05 01
+    shufpd                  m%6, m%3, m%5, 1001b        ;06 10 01 09
 
-    vshufpd                 m%1, m%1, m%6, 1100b        ;06 03 09 00
-    vshufpd                 m%2, m%2, m%6, 0110b        ;10 07 01 04
-    vshufpd                 m%3, m%3, m%5, 0110b        ;02 11 05 08
+    shufpd                  m%1, m%1, m%6, 1100b        ;06 03 09 00
+    shufpd                  m%2, m%2, m%6, 0110b        ;10 07 01 04
+    shufpd                  m%3, m%3, m%5, 0110b        ;02 11 05 08
 
     vpermpd                 m%1, m%1, 01111000b         ;09 06 03 00
-    vshufpd                 m%2, m%2, m%2, 1001b        ;10 07 04 01
+    shufpd                  m%2, m%2, m%2, 1001b        ;10 07 04 01
     vpermpd                 m%3, m%3, 10000111b         ;11 08 05 02
 %endmacro
 
@@ -125,21 +125,21 @@  SECTION .text
     pxor             m11, m11
     psubw            m11, m12                                ;-clip
 
-    vpsubw            m9, m2
+    psubw             m9, m2
     CLIPW             m9, m11, m12
 
-    vpsubw           m10, m2
+    psubw            m10, m2
     CLIPW            m10, m11, m12
 
-    vpunpckhwd       m13, m9, m10
-    vpunpcklwd        m9, m9, m10
+    punpckhwd        m13, m9, m10
+    punpcklwd         m9, m9, m10
 
     pshufb           m12, filters, [param_shuffe_ %+ i]       ;filter
-    vpunpcklwd       m10, m12, m12
-    vpunpckhwd       m12, m12, m12
+    punpcklwd        m10, m12, m12
+    punpckhwd        m12, m12, m12
 
-    vpmaddwd          m9, m10
-    vpmaddwd         m12, m13
+    pmaddwd           m9, m10
+    pmaddwd          m12, m13
 
     paddd             m0, m9
     paddd             m1, m12
@@ -268,17 +268,17 @@  SECTION .text
     je         %%near_vb
 %endif
 %%no_vb:
-    vpsrad            m0, SHIFT
-    vpsrad            m1, SHIFT
+    psrad             m0, SHIFT
+    psrad             m1, SHIFT
     jmp      %%shift_end
 %%near_vb:
     vpbroadcastd      m9, [dd448]
     paddd             m0, m9
     paddd             m1, m9
-    vpsrad            m0, SHIFT + 3
-    vpsrad            m1, SHIFT + 3
+    psrad             m0, SHIFT + 3
+    psrad             m1, SHIFT + 3
 %%shift_end:
-    vpackssdw         m0, m0, m1
+    packssdw          m0, m0, m1
 %endmacro
 
 ; FILTER_VB(line)
@@ -320,7 +320,7 @@  SECTION .text
 %if ps == 2
     movu      %1, %2
 %else
-    vpmovzxbw %1, %2
+    pmovzxbw  %1, %2
 %endif
 %endmacro
 
@@ -329,7 +329,7 @@  SECTION .text
     %if ps == 2
         movu         %1, m%2
     %else
-        vpackuswb   m%2, m%2
+        packuswb    m%2, m%2
         vpermq      m%2, m%2, 0x8
         movu         %1, xm%2
     %endif
@@ -489,43 +489,43 @@  cglobal vvc_alf_classify_grad_%1bpc, 6, 14, 16, gradient_sum, src, src_stride, w
         LOAD_PIXELS       m6, [s2q + 2 * ps]
         LOAD_PIXELS       m7, [s3q + 2 * ps]
 
-        vpblendw          m8, m0, m1, 0xaa             ; nw
-        vpblendw          m9, m0, m5, 0x55             ; n
-        vpblendw         m10, m4, m5, 0xaa             ; ne
-        vpblendw         m11, m1, m2, 0xaa             ; w
-        vpblendw         m12, m5, m6, 0xaa             ; e
-        vpblendw         m13, m2, m3, 0xaa             ; sw
-        vpblendw         m14, m2, m7, 0x55             ; s
+        pblendw           m8, m0, m1, 0xaa             ; nw
+        pblendw           m9, m0, m5, 0x55             ; n
+        pblendw          m10, m4, m5, 0xaa             ; ne
+        pblendw          m11, m1, m2, 0xaa             ; w
+        pblendw          m12, m5, m6, 0xaa             ; e
+        pblendw          m13, m2, m3, 0xaa             ; sw
+        pblendw          m14, m2, m7, 0x55             ; s
 
-        vpblendw          m0, m1, m6, 0x55
-        vpaddw            m0, m0                       ; c
+        pblendw           m0, m1, m6, 0x55
+        paddw             m0, m0                       ; c
 
         movu              m1, [CLASSIFY_SHUFFE]
         pshufb            m1, m0, m1                   ; d
 
-        vpaddw            m9, m14                      ; n + s
-        vpsubw            m9, m0                       ; (n + s) - c
-        vpabsw            m9, m9                       ; ver
+        paddw             m9, m14                      ; n + s
+        psubw             m9, m0                       ; (n + s) - c
+        pabsw             m9, m9                       ; ver
 
-        vpaddw           m11, m12                      ; w + e
-        vpsubw           m11, m1                       ; (w + e) - d
-        vpabsw           m11, m11                      ; hor
+        paddw            m11, m12                      ; w + e
+        psubw            m11, m1                       ; (w + e) - d
+        pabsw            m11, m11                      ; hor
 
-        vpblendw         m14, m6, m7, 0xaa             ; se
-        vpaddw            m8, m14                      ; nw + se
-        vpsubw            m8, m1                       ; (nw + se) - d
-        vpabsw            m8, m8                       ; di0
+        pblendw          m14, m6, m7, 0xaa             ; se
+        paddw             m8, m14                      ; nw + se
+        psubw             m8, m1                       ; (nw + se) - d
+        pabsw             m8, m8                       ; di0
 
-        vpaddw           m10, m13                      ; ne + sw
-        vpsubw           m10, m1                       ; (nw + se) - d
-        vpabsw           m10, m10                      ; di1
+        paddw            m10, m13                      ; ne + sw
+        psubw            m10, m1                       ; (nw + se) - d
+        pabsw            m10, m10                      ; di1
 
         phaddw            m9,  m11                     ; vh,  each word represent 2x2 pixels
         phaddw            m8,  m10                     ; di,  each word represent 2x2 pixels
         phaddw            m0,  m9, m8                  ; all = each word represent 4x2 pixels, order is v_h_d0_d1 x 4
 
         vinserti128      m15, m15, xm0, 1
-        vpblendw          m1,  m0, m15, 0xaa           ; t
+        pblendw           m1,  m0, m15, 0xaa           ; t
 
         phaddw            m1,  m0                      ; each word represent 8x2 pixels, adjacent word share 4x2 pixels
 
@@ -594,7 +594,7 @@  cglobal vvc_alf_classify_grad_%1bpc, 6, 14, 16, gradient_sum, src, src_stride, w
     vpbroadcastd     m13, xm13
     movd            xm12, vb_posd
     vpbroadcastd     m12, xm12
-    vpcmpeqd         m13, m12       ; y == vb_pos
+    pcmpeqd          m13, m12       ; y == vb_pos
     pandn            m13, m11       ; y != vb_pos
 
     vpbroadcastd     m14, [dw3]
@@ -603,23 +603,23 @@  cglobal vvc_alf_classify_grad_%1bpc, 6, 14, 16, gradient_sum, src, src_stride, w
     pblendvb          m3, m15, [gradq + sum_stride3q], m13
 
     ; extent to dword to avoid overflow
-    vpunpcklwd        m4, m0, m15
-    vpunpckhwd        m5, m0, m15
-    vpunpcklwd        m6, m1, m15
-    vpunpckhwd        m7, m1, m15
-    vpunpcklwd        m8, m2, m15
-    vpunpckhwd        m9, m2, m15
-    vpunpcklwd       m10, m3, m15
-    vpunpckhwd       m11, m3, m15
-
-    vpaddd            m0, m4, m6
-    vpaddd            m1, m5, m7
-    vpaddd            m2, m8, m10
-    vpaddd            m3, m9, m11
+    punpcklwd         m4, m0, m15
+    punpckhwd         m5, m0, m15
+    punpcklwd         m6, m1, m15
+    punpckhwd         m7, m1, m15
+    punpcklwd         m8, m2, m15
+    punpckhwd         m9, m2, m15
+    punpcklwd        m10, m3, m15
+    punpckhwd        m11, m3, m15
+
+    paddd             m0, m4, m6
+    paddd             m1, m5, m7
+    paddd             m2, m8, m10
+    paddd             m3, m9, m11
 
     ; sum of the first row
-    vpaddd            m0, m2           ; low
-    vpaddd            m1, m3           ; high
+    paddd             m0, m2           ; low
+    paddd             m1, m3           ; high
 
     lea            gradq, [gradq + 2 * sum_strideq]
 
@@ -629,65 +629,65 @@  cglobal vvc_alf_classify_grad_%1bpc, 6, 14, 16, gradient_sum, src, src_stride, w
     movu             m12, [gradq + 2 * sum_strideq]
     movu             m13, [gradq + sum_stride3q]
 
-    vpunpcklwd        m4,  m10, m15
-    vpunpckhwd        m5,  m10, m15
-    vpunpcklwd        m6,  m11, m15
-    vpunpckhwd        m7,  m11, m15
-    vpunpcklwd        m8,  m12, m15
-    vpunpckhwd        m9,  m12, m15
-    vpunpcklwd       m10,  m13, m15
-    vpunpckhwd       m11,  m13, m15
+    punpcklwd         m4,  m10, m15
+    punpckhwd         m5,  m10, m15
+    punpcklwd         m6,  m11, m15
+    punpckhwd         m7,  m11, m15
+    punpcklwd         m8,  m12, m15
+    punpckhwd         m9,  m12, m15
+    punpcklwd        m10,  m13, m15
+    punpckhwd        m11,  m13, m15
 
-    vpaddd            m2, m4, m6
-    vpaddd            m3, m5, m7
-    vpaddd            m4, m8, m10
-    vpaddd            m5, m9, m11
+    paddd             m2, m4, m6
+    paddd             m3, m5, m7
+    paddd             m4, m8, m10
+    paddd             m5, m9, m11
 
     ; sum of the second row
-    vpaddd            m2, m4        ; low
-    vpaddd            m3, m5        ; high
+    paddd             m2, m4        ; low
+    paddd             m3, m5        ; high
 
-    vpunpckldq        m4, m0, m2
-    vpunpckhdq        m5, m0, m2
-    vpunpckldq        m6, m1, m3
-    vpunpckhdq        m7, m1, m3
+    punpckldq         m4, m0, m2
+    punpckhdq         m5, m0, m2
+    punpckldq         m6, m1, m3
+    punpckhdq         m7, m1, m3
 
     ; each dword represent 4x2 alf blocks
     ; the order is 01452367
-    vpunpckldq        m0, m4, m6         ; sum_v
-    vpunpckhdq        m1, m4, m6         ; sum_h
-    vpunpckldq        m2, m5, m7         ; sum_d0
-    vpunpckhdq        m3, m5, m7         ; sum_d1
+    punpckldq         m0, m4, m6         ; sum_v
+    punpckhdq         m1, m4, m6         ; sum_h
+    punpckldq         m2, m5, m7         ; sum_d0
+    punpckhdq         m3, m5, m7         ; sum_d1
 
-    vpcmpgtd          m4, m0, m1         ; dir_hv - 1
-    vpmaxsd           m5, m0, m1         ; hv1
-    vpminsd           m6, m0, m1         ; hv0
+    pcmpgtd           m4, m0, m1         ; dir_hv - 1
+    pmaxsd            m5, m0, m1         ; hv1
+    pminsd            m6, m0, m1         ; hv0
 
-    vpaddd            m0, m1;            ; sum_hv
+    paddd             m0, m1;            ; sum_hv
 
-    vpcmpgtd          m7, m2, m3         ; dir_d - 1
-    vpmaxsd           m8, m2, m3         ; d1
-    vpminsd           m9, m2, m3         ; d0
+    pcmpgtd           m7, m2, m3         ; dir_d - 1
+    pmaxsd            m8, m2, m3         ; d1
+    pminsd            m9, m2, m3         ; d0
 
     ; *transpose_idx = dir_d * 2 + dir_hv;
     vpbroadcastd     m10, [dw3]
-    vpaddd           m11, m7, m7
-    vpaddd           m11, m4
-    vpaddd           m10, m11
+    paddd            m11, m7, m7
+    paddd            m11, m4
+    paddd            m10, m11
     vpermq           m10, m10, 11011000b
     SAVE_CLASSIFY_PARAM transpose_idx, 10
 
-    vpsrlq           m10, m8, 32
-    vpsrlq           m11, m6, 32
+    psrlq            m10, m8, 32
+    psrlq            m11, m6, 32
     pmuldq           m12, m10, m11       ; d1 * hv0 high
-    vpsrlq            m1,  m9, 32
-    vpsrlq            m2,  m5, 32
+    psrlq             m1,  m9, 32
+    psrlq             m2,  m5, 32
     pmuldq            m3,  m1, m2        ; d0 * hv1 high
-    vpcmpgtq         m10, m12, m3        ; dir1 - 1 high
+    pcmpgtq          m10, m12, m3        ; dir1 - 1 high
 
     pmuldq            m1, m8, m6         ; d1 * hv0 low
     pmuldq            m2, m9, m5         ; d0 * hv1 low
-    vpcmpgtq          m1, m2             ; dir1 - 1 low
+    pcmpgtq           m1, m2             ; dir1 - 1 low
 
     vpblendd          m1, m1, m10, 0xaa  ; dir1 - 1
 
@@ -698,9 +698,9 @@  cglobal vvc_alf_classify_grad_%1bpc, 6, 14, 16, gradient_sum, src, src_stride, w
     vpbroadcastd      m5, xm5
 
     ;*class_idx = arg_var[av_clip_uintp2(sum_hv * ac >> (BIT_DEPTH - 1), 4)];
-    vpmulld           m0, m14            ; sum_hv * ac
+    pmulld            m0, m14            ; sum_hv * ac
     vpsrlvd           m0, m0, m5
-    vpminsd           m0, [dd15]
+    pminsd            m0, [dd15]
     movu              m6, [ARG_VAR_SHUFFE]
     pshufb            m6, m0             ; class_idx
 
@@ -716,7 +716,7 @@  cglobal vvc_alf_classify_grad_%1bpc, 6, 14, 16, gradient_sum, src, src_stride, w
     paddd             m6,  m7            ; class_idx
 
     paddd             m8, m2, m2
-    vpslld            m9, m3, 3
+    pslld             m9, m3, 3
     paddd             m9, m3
     pcmpgtd           m8, m9             ; hvd1 * 2 > 9 * hvd0
     pand              m8, m10