diff mbox series

[FFmpeg-devel,v4,2/2,GSoC,2024] tests/checkasm: Add check_vvc_sad to vvc_mc.c

Message ID 20240520004210.11489-4-chen.stonechen@gmail.com
State New
Headers show
Series [FFmpeg-devel,v4,1/2,GSoC,2024] libavcodec/x86/vvc: Add AVX2 DMVR SAD functions for VVC | expand

Checks

Context Check Description
yinshiyou/make_loongarch64 success Make finished
yinshiyou/make_fate_loongarch64 success Make fate finished

Commit Message

Stone Chen May 20, 2024, 12:42 a.m. UTC
Adds checkasm for DMVR SAD AVX2 implementation.

Benchmarks ( AMD 7940HS )
vvc_sad_8x8_c: 70.0
vvc_sad_8x8_avx2: 10.0
vvc_sad_16x16_c: 280.0
vvc_sad_16x16_avx2: 20.0
vvc_sad_32x32_c: 1020.0
vvc_sad_32x32_avx2: 70.0
vvc_sad_64x64_c: 3560.0
vvc_sad_64x64_avx2: 270.0
vvc_sad_128x128_c: 13760.0
vvc_sad_128x128_avx2: 1070.0
---
 tests/checkasm/vvc_mc.c | 38 ++++++++++++++++++++++++++++++++++++++
 1 file changed, 38 insertions(+)

Comments

Rémi Denis-Courmont May 21, 2024, 5:12 a.m. UTC | #1
Hi,

Le 20 mai 2024 03:42:03 GMT+03:00, Stone Chen <chen.stonechen@gmail.com> a écrit :
>Adds checkasm for DMVR SAD AVX2 implementation.
>
>Benchmarks ( AMD 7940HS )
>vvc_sad_8x8_c: 70.0
>vvc_sad_8x8_avx2: 10.0
>vvc_sad_16x16_c: 280.0
>vvc_sad_16x16_avx2: 20.0
>vvc_sad_32x32_c: 1020.0
>vvc_sad_32x32_avx2: 70.0
>vvc_sad_64x64_c: 3560.0
>vvc_sad_64x64_avx2: 270.0
>vvc_sad_128x128_c: 13760.0
>vvc_sad_128x128_avx2: 1070.0
>---
> tests/checkasm/vvc_mc.c | 38 ++++++++++++++++++++++++++++++++++++++
> 1 file changed, 38 insertions(+)

VVC benchmarks have increased checksam runtime by at least an order of magnitude. It's become so prohibitively slow that I could not even get to the end.

This is not an acceptable situation and impedes non-VVC assembler work 

Please fix this before you add any new VVC tests. In the mean time:

-1 / Nack all VVC checksam from my behalf.


>diff --git a/tests/checkasm/vvc_mc.c b/tests/checkasm/vvc_mc.c
>index 97f57cb401..e251400bfc 100644
>--- a/tests/checkasm/vvc_mc.c
>+++ b/tests/checkasm/vvc_mc.c
>@@ -322,8 +322,46 @@ static void check_avg(void)
>     report("avg");
> }
> 
>+static void check_vvc_sad(void)
>+{
>+    const int bit_depth = 10;
>+    VVCDSPContext c;
>+    LOCAL_ALIGNED_32(uint16_t, src0, [MAX_CTU_SIZE * MAX_CTU_SIZE * 4]);
>+    LOCAL_ALIGNED_32(uint16_t, src1, [MAX_CTU_SIZE * MAX_CTU_SIZE * 4]);
>+    declare_func(int, const int16_t *src0, const int16_t *src1, int dx, int dy, int block_w, int block_h);
>+
>+    ff_vvc_dsp_init(&c, bit_depth);
>+    memset(src0, 0, MAX_CTU_SIZE * MAX_CTU_SIZE * 4);
>+    memset(src1, 0, MAX_CTU_SIZE * MAX_CTU_SIZE * 4);
>+
>+    randomize_pixels(src0, src1, MAX_CTU_SIZE * MAX_CTU_SIZE * 2);
>+     for (int h = 8; h <= MAX_CTU_SIZE; h *= 2) {
>+        for (int w = 8; w <= MAX_CTU_SIZE; w *= 2) {
>+            for(int offy = 0; offy <= 4; offy++) {
>+                for(int offx = 0; offx <= 4; offx++) {
>+                    if(check_func(c.inter.sad, "vvc_sad_%dx%d", w, h)) {
>+                        int result0;
>+                        int result1;
>+
>+                        result0 =  call_ref(src0 + PIXEL_STRIDE * 2 + 2, src1 + PIXEL_STRIDE * 2 + 2, offx, offy, w, h);
>+                        result1 =  call_new(src0 + PIXEL_STRIDE * 2 + 2, src1 + PIXEL_STRIDE * 2 + 2, offx, offy, w, h);
>+
>+                        if (result1 != result0)
>+                            fail();
>+                        if(w == h && offx == 0 && offy == 0)
>+                            bench_new(src0 + PIXEL_STRIDE * 2 + 2, src1 + PIXEL_STRIDE * 2 + 2, offx, offy, w, h);
>+                    }
>+                }
>+            }
>+        }
>+     }
>+
>+    report("check_vvc_sad");
>+}
>+
> void checkasm_check_vvc_mc(void)
> {
>+    check_vvc_sad();
>     check_put_vvc_luma();
>     check_put_vvc_luma_uni();
>     check_put_vvc_chroma();
Martin Storsjö May 21, 2024, 6:37 a.m. UTC | #2
On Tue, 21 May 2024, Rémi Denis-Courmont wrote:

> Hi,
>
> Le 20 mai 2024 03:42:03 GMT+03:00, Stone Chen <chen.stonechen@gmail.com> a écrit :
>> Adds checkasm for DMVR SAD AVX2 implementation.
>>
>> Benchmarks ( AMD 7940HS )
>> vvc_sad_8x8_c: 70.0
>> vvc_sad_8x8_avx2: 10.0
>> vvc_sad_16x16_c: 280.0
>> vvc_sad_16x16_avx2: 20.0
>> vvc_sad_32x32_c: 1020.0
>> vvc_sad_32x32_avx2: 70.0
>> vvc_sad_64x64_c: 3560.0
>> vvc_sad_64x64_avx2: 270.0
>> vvc_sad_128x128_c: 13760.0
>> vvc_sad_128x128_avx2: 1070.0
>> ---
>> tests/checkasm/vvc_mc.c | 38 ++++++++++++++++++++++++++++++++++++++
>> 1 file changed, 38 insertions(+)
>
> VVC benchmarks have increased checksam runtime by at least an order of 
> magnitude. It's become so prohibitively slow that I could not even get 
> to the end.
>
> This is not an acceptable situation and impedes non-VVC assembler work

I don't quite understand; whenever benchmarking anything in checkasm, I 
would always run e.g. "checkasm --test=ac3dsp 
--bench=ac3_sum_square_bufferfly_float", limiting the total running of 
tests to a specific module, and only benchmarking a subset of the run 
functions. (The --bench parameter specifies a prefix; only functions 
matching that prefix gets benchmarked.)

Without limiting the scope with a --test parameter, checkasm benchmarking 
has always been prohibitively slow for me - so I don't think there's 
anything new here? If you were lucky enough to be able to do a full run of 
checkasm with benchmarks of all functions before, that sounds like an 
exception to me, not a reason to limit adding new tests?

That said I'm not familiar with the VVC tests in checkasm, perhaps they 
benchmark things excessively. But I don't see how that would impede work 
on other DSP functions in any way?

// Martin
Rémi Denis-Courmont May 21, 2024, 8:47 a.m. UTC | #3
Le 21 mai 2024 09:37:18 GMT+03:00, "Martin Storsjö" <martin@martin.st> a écrit :
>On Tue, 21 May 2024, Rémi Denis-Courmont wrote:
>
>> Hi,
>> 
>> Le 20 mai 2024 03:42:03 GMT+03:00, Stone Chen <chen.stonechen@gmail.com> a écrit :
>>> Adds checkasm for DMVR SAD AVX2 implementation.
>>> 
>>> Benchmarks ( AMD 7940HS )
>>> vvc_sad_8x8_c: 70.0
>>> vvc_sad_8x8_avx2: 10.0
>>> vvc_sad_16x16_c: 280.0
>>> vvc_sad_16x16_avx2: 20.0
>>> vvc_sad_32x32_c: 1020.0
>>> vvc_sad_32x32_avx2: 70.0
>>> vvc_sad_64x64_c: 3560.0
>>> vvc_sad_64x64_avx2: 270.0
>>> vvc_sad_128x128_c: 13760.0
>>> vvc_sad_128x128_avx2: 1070.0
>>> ---
>>> tests/checkasm/vvc_mc.c | 38 ++++++++++++++++++++++++++++++++++++++
>>> 1 file changed, 38 insertions(+)
>> 
>> VVC benchmarks have increased checksam runtime by at least an order of magnitude. It's become so prohibitively slow that I could not even get to the end.
>> 
>> This is not an acceptable situation and impedes non-VVC assembler work
>
>I don't quite understand; whenever benchmarking anything in checkasm, I would always run e.g. "checkasm --test=ac3dsp --bench=ac3_sum_square_bufferfly_float", limiting the total running of tests to a specific module, and only benchmarking a subset of the run functions. (The --bench parameter specifies a prefix; only functions matching that prefix gets benchmarked.)

Sure that's how you do it when you're working on a specific new optimisation. Now we're trying to compare 128-bit and 256-bit vectors for *all* existing functions to see which ones need to be reworked.

That used to work (in 30 minutes on K230, 5 minutes on Zen 2, IIRC). Now it's effectively broken and that's not acceptable'

>
>Without limiting the scope with a --test parameter, checkasm benchmarking has always been prohibitively slow for me - so I don't think there's anything new here?

As said, it seems to be literally an order of magnitude slower than before if not worse.

>That said I'm not familiar with the VVC tests in checkasm, perhaps they benchmark things excessively. But I don't see how that would impede work on other DSP functions in any way?

James also complained about the same thing before I.
Martin Storsjö May 21, 2024, 10:12 a.m. UTC | #4
On Tue, 21 May 2024, Rémi Denis-Courmont wrote:

>
>
> Le 21 mai 2024 09:37:18 GMT+03:00, "Martin Storsjö" <martin@martin.st> a écrit :
>> On Tue, 21 May 2024, Rémi Denis-Courmont wrote:
>>
>>> Hi,
>>> 
>>> VVC benchmarks have increased checksam runtime by at least an order of 
>>> magnitude. It's become so prohibitively slow that I could not even get 
>>> to the end.
>>> 
>>> This is not an acceptable situation and impedes non-VVC assembler work
>>
>> I don't quite understand; whenever benchmarking anything in checkasm, I 
>> would always run e.g. "checkasm --test=ac3dsp 
>> --bench=ac3_sum_square_bufferfly_float", limiting the total running of 
>> tests to a specific module, and only benchmarking a subset of the run 
>> functions. (The --bench parameter specifies a prefix; only functions 
>> matching that prefix gets benchmarked.)
>
> Sure that's how you do it when you're working on a specific new 
> optimisation. Now we're trying to compare 128-bit and 256-bit vectors 
> for *all* existing functions to see which ones need to be reworked.
>
> That used to work (in 30 minutes on K230, 5 minutes on Zen 2, IIRC). Now 
> it's effectively broken and that's not acceptable'

Ah, I see. Ok, that's a reasonable thing to do I guess.

(It's of course possible to speed it up further by only testing specific 
--test=foo cases where you know you have riscv assembly worth 
benchmarking, but if it was doable in a tolerable amount of time before, 
that shouldn't be needed.)

>> Without limiting the scope with a --test parameter, checkasm 
>> benchmarking has always been prohibitively slow for me - so I don't 
>> think there's anything new here?
>
> As said, it seems to be literally an order of magnitude slower than 
> before if not worse.
>
>> That said I'm not familiar with the VVC tests in checkasm, perhaps they 
>> benchmark things excessively. But I don't see how that would impede 
>> work on other DSP functions in any way?
>
> James also complained about the same thing before I.

Indeed, the tests in vvc_alf group seem to do excessive benchmarking 
(benchmarking every width/height combination between 4 and 128, in 
increments of 4). I sent a patch to cut this down to a reasonable amount.

Overall, I would expect the vvc checkasm tests to take a notable amount of 
time. Dav1d's checkasm takes twice as long to run as ffmpeg's, and it's 
probably a reasonable to assume that vvc is roughly of the same level of 
complexity as av1, so it's probably expected that ffmpeg's checkasm 
runtime at least doubles, once all vvc routines are integrated in 
checkasm.

But the tests in vvc_alf indeed had an entirely unreasonable amount of 
benchmarking hooked up, and that should indeed be fixed, e.g. with the 
patch I just sent.

// Martin
Ronald S. Bultje May 21, 2024, 2:35 p.m. UTC | #5
Hi,

On Sun, May 19, 2024 at 8:55 PM Stone Chen <chen.stonechen@gmail.com> wrote:

> Adds checkasm for DMVR SAD AVX2 implementation.
>
> Benchmarks ( AMD 7940HS )
> vvc_sad_8x8_c: 70.0
> vvc_sad_8x8_avx2: 10.0
> vvc_sad_16x16_c: 280.0
> vvc_sad_16x16_avx2: 20.0
> vvc_sad_32x32_c: 1020.0
> vvc_sad_32x32_avx2: 70.0
> vvc_sad_64x64_c: 3560.0
> vvc_sad_64x64_avx2: 270.0
> vvc_sad_128x128_c: 13760.0
> vvc_sad_128x128_avx2: 1070.0
> ---
>  tests/checkasm/vvc_mc.c | 38 ++++++++++++++++++++++++++++++++++++++
>  1 file changed, 38 insertions(+)
>

It appears Remi's performance concerns have been addressed separately, so
this patch is good to go.

Ronald
diff mbox series

Patch

diff --git a/tests/checkasm/vvc_mc.c b/tests/checkasm/vvc_mc.c
index 97f57cb401..e251400bfc 100644
--- a/tests/checkasm/vvc_mc.c
+++ b/tests/checkasm/vvc_mc.c
@@ -322,8 +322,46 @@  static void check_avg(void)
     report("avg");
 }
 
+static void check_vvc_sad(void)
+{
+    const int bit_depth = 10;
+    VVCDSPContext c;
+    LOCAL_ALIGNED_32(uint16_t, src0, [MAX_CTU_SIZE * MAX_CTU_SIZE * 4]);
+    LOCAL_ALIGNED_32(uint16_t, src1, [MAX_CTU_SIZE * MAX_CTU_SIZE * 4]);
+    declare_func(int, const int16_t *src0, const int16_t *src1, int dx, int dy, int block_w, int block_h);
+
+    ff_vvc_dsp_init(&c, bit_depth);
+    memset(src0, 0, MAX_CTU_SIZE * MAX_CTU_SIZE * 4);
+    memset(src1, 0, MAX_CTU_SIZE * MAX_CTU_SIZE * 4);
+
+    randomize_pixels(src0, src1, MAX_CTU_SIZE * MAX_CTU_SIZE * 2);
+     for (int h = 8; h <= MAX_CTU_SIZE; h *= 2) {
+        for (int w = 8; w <= MAX_CTU_SIZE; w *= 2) {
+            for(int offy = 0; offy <= 4; offy++) {
+                for(int offx = 0; offx <= 4; offx++) {
+                    if(check_func(c.inter.sad, "vvc_sad_%dx%d", w, h)) {
+                        int result0;
+                        int result1;
+
+                        result0 =  call_ref(src0 + PIXEL_STRIDE * 2 + 2, src1 + PIXEL_STRIDE * 2 + 2, offx, offy, w, h);
+                        result1 =  call_new(src0 + PIXEL_STRIDE * 2 + 2, src1 + PIXEL_STRIDE * 2 + 2, offx, offy, w, h);
+
+                        if (result1 != result0)
+                            fail();
+                        if(w == h && offx == 0 && offy == 0)
+                            bench_new(src0 + PIXEL_STRIDE * 2 + 2, src1 + PIXEL_STRIDE * 2 + 2, offx, offy, w, h);
+                    }
+                }
+            }
+        }
+     }
+
+    report("check_vvc_sad");
+}
+
 void checkasm_check_vvc_mc(void)
 {
+    check_vvc_sad();
     check_put_vvc_luma();
     check_put_vvc_luma_uni();
     check_put_vvc_chroma();