diff mbox series

[FFmpeg-devel] lavu/tx: make 32-bit fixed-point transforms more bitexact

Message ID NYLIg0x--3-9@lynne.ee
State New
Headers show
Series [FFmpeg-devel] lavu/tx: make 32-bit fixed-point transforms more bitexact | expand

Checks

Context Check Description
yinshiyou/make_loongarch64 success Make finished
yinshiyou/make_fate_loongarch64 fail Make fate failed
andriy/make_x86 success Make finished
andriy/make_fate_x86 fail Make fate failed

Commit Message

Lynne June 20, 2023, 12:57 a.m. UTC
Using the sqrt/cos/sin approximations we have, the only parts left
which may be inexact are multiplies and divisions in some transforms.

Patch attached.

Comments

Martin Storsjö June 20, 2023, 8:53 a.m. UTC | #1
On Tue, 20 Jun 2023, Lynne wrote:

> Using the sqrt/cos/sin approximations we have, the only parts left
> which may be inexact are multiplies and divisions in some transforms.

This seems to help somewhat, but there are still cases of inexactness 
somewhere.

The contents of the tables that are initialized here do become bitexact 
(at least across some of the configs whose output otherwise disagrees with 
the reference), but despite that, the final output still differs.

With the test references generated on linux/x86_64 compiled with GCC, run 
on an Intel CPU, I get the following set of machines that either agree or 
disagree with the reference:

matching
- linux x86_64 gcc11 Intel
- linux aarch64 gcc12 on Apple M1
- linux aarch64 clang10 Neoverse N1
- linux aarch64 gcc9 Neoverse N1
- linux armv7 gcc9 Neoverse N1

disagreeing
- macos x86_64 clang Xcode14 Intel
- mingw x86_64 clang trunk Dragonboard
- macos aarch64 clang Xcode12 Apple M1
- macos aarch64 clang Xcode14 Apple M1
- linux i686 gcc11 Intel
- mingw aarch64 clang trunk Dragonboard
- linux aarch64 gcc7 Dragonboard
- mingw armv7 clang trunk Dragonboard
- mingw i686 clang trunk Intel
- mingw i686 clang trunk -march=i686 Intel


The configs that are easiest to reproduce are probably the ones on macOS 
on Apple M1, or macOS on x86_64 if you happen to have access to that, or 
GCC/i686 on Linux (just configure with --extra-cflags=-m32 
--extra-ldflags=-m32).

// Martin
diff mbox series

Patch

From b2fd8fde86d421109d7922ded7b4691384af2214 Mon Sep 17 00:00:00 2001
From: Lynne <dev@lynne.ee>
Date: Tue, 20 Jun 2023 02:47:17 +0200
Subject: [PATCH] lavu/tx: make 32-bit fixed-point transforms more bitexact

Using the sqrt/cos/sin approximations we have, the only parts left
which may be inexact are multiplies and divisions in some transforms.
---
 libavutil/tx_priv.h               |  2 ++
 libavutil/tx_template.c           | 38 +++++++++++++++++++++++++++----
 tests/fate/ac3.mak                |  2 +-
 tests/ref/fate/unknown_layout-ac3 |  2 +-
 tests/ref/lavf/rm                 |  2 +-
 5 files changed, 38 insertions(+), 8 deletions(-)

diff --git a/libavutil/tx_priv.h b/libavutil/tx_priv.h
index d5ff8e1421..2f056a777c 100644
--- a/libavutil/tx_priv.h
+++ b/libavutil/tx_priv.h
@@ -110,6 +110,8 @@  typedef void TXComplex;
 
 #elif defined(TX_INT32)
 
+#include "softfloat.h"
+
 /* Properly rounds the result */
 #define CMUL(dre, dim, are, aim, bre, bim)             \
     do {                                               \
diff --git a/libavutil/tx_template.c b/libavutil/tx_template.c
index 983de75a47..719dae2440 100644
--- a/libavutil/tx_template.c
+++ b/libavutil/tx_template.c
@@ -60,6 +60,17 @@  typedef struct FFTabInitData {
     int factors[TX_MAX_SUB]; /* Must be sorted high -> low */
 } FFTabInitData;
 
+#if defined(TX_INT32)
+static TXSample COS_GEN(double freq)
+{
+    int c_f, s_f;
+    av_sincos_sf(llrintf(freq * (1 << 30) / M_PI), &s_f, &c_f);
+    return av_clip64(((int64_t)c_f) << 1, INT32_MIN, INT32_MAX);
+}
+#else
+#define COS_GEN cos
+#endif
+
 #define SR_TABLE(len)                                              \
 static av_cold void TX_TAB(ff_tx_init_tab_ ##len)(void)            \
 {                                                                  \
@@ -67,7 +78,7 @@  static av_cold void TX_TAB(ff_tx_init_tab_ ##len)(void)            \
     TXSample *tab = TX_TAB(ff_tx_tab_ ##len);                      \
                                                                    \
     for (int i = 0; i < len/4; i++)                                \
-        *tab++ = RESCALE(cos(i*freq));                             \
+        *tab++ = COS_GEN(i*freq);                                  \
                                                                    \
     *tab = 0;                                                      \
 }
@@ -1903,22 +1914,39 @@  int TX_TAB(ff_tx_mdct_gen_exp)(AVTXContext *s, int *pre_tab)
 {
     int off = 0;
     int len4 = s->len >> 1;
-    double scale = s->scale_d;
-    const double theta = (scale < 0 ? len4 : 0) + 1.0/8.0;
+    const double theta = (s->scale_d < 0 ? len4 : 0) + 1.0/8.0;
     size_t alloc = pre_tab ? 2*len4 : len4;
 
+#if defined(TX_INT32)
+    int scale = llrintf(fabs(s->scale_d) * (1 << 30));
+    SoftFloat scale_sf = av_int2sf(scale, 30);
+    scale_sf = av_sqrt_sf(scale_sf);
+#else
+    double scale = sqrt(fabs(s->scale_d));
+#endif
+
     if (!(s->exp = av_malloc_array(alloc, sizeof(*s->exp))))
         return AVERROR(ENOMEM);
 
-    scale = sqrt(fabs(scale));
-
     if (pre_tab)
         off = len4;
 
     for (int i = 0; i < len4; i++) {
         const double alpha = M_PI_2 * (i + theta) / len4;
+
+#if defined(TX_INT32)
+        int c_f, s_f;
+        SoftFloat cos_sf, sin_sf;
+        av_sincos_sf(llrintf(alpha * (1 << 30) / M_PI), &s_f, &c_f);
+        cos_sf = av_int2sf(c_f, 30);
+        sin_sf = av_int2sf(s_f, 30);
+        cos_sf = av_mul_sf(cos_sf, scale_sf);
+        sin_sf = av_mul_sf(sin_sf, scale_sf);
+        s->exp[off + i] = (TXComplex){ av_sf2int(cos_sf, 30) << 1, av_sf2int(sin_sf, 30) << 1 };
+#else
         s->exp[off + i] = (TXComplex){ RESCALE(cos(alpha) * scale),
                                        RESCALE(sin(alpha) * scale) };
+#endif
     }
 
     if (pre_tab)
diff --git a/tests/fate/ac3.mak b/tests/fate/ac3.mak
index 2dfd59dfb1..85766e82c7 100644
--- a/tests/fate/ac3.mak
+++ b/tests/fate/ac3.mak
@@ -89,7 +89,7 @@  fate-ac3-fixed-encode: tests/data/asynth-44100-2.wav
 fate-ac3-fixed-encode: SRC = $(TARGET_PATH)/tests/data/asynth-44100-2.wav
 fate-ac3-fixed-encode: CMD = md5 -i $(SRC) -c ac3_fixed -ab 128k -f ac3 -flags +bitexact -af aresample
 fate-ac3-fixed-encode: CMP = oneline
-fate-ac3-fixed-encode: REF = e9d78bca187b4bbafc4512bcea8efd3e
+fate-ac3-fixed-encode: REF = 3c1781a78ba3ea653c145798511644eb
 
 FATE_EAC3-$(call ALLYES, EAC3_DEMUXER EAC3_MUXER EAC3_CORE_BSF) += fate-eac3-core-bsf
 fate-eac3-core-bsf: CMD = md5pipe -i $(TARGET_SAMPLES)/eac3/the_great_wall_7.1.eac3 -c:a copy -bsf:a eac3_core -fflags +bitexact -f eac3
diff --git a/tests/ref/fate/unknown_layout-ac3 b/tests/ref/fate/unknown_layout-ac3
index a694c52899..c535c4ff05 100644
--- a/tests/ref/fate/unknown_layout-ac3
+++ b/tests/ref/fate/unknown_layout-ac3
@@ -1 +1 @@ 
-ff7e25844b3cb6abb571ef7e226cbafa
+c40992cfc42a620b592e03153a74ff68
diff --git a/tests/ref/lavf/rm b/tests/ref/lavf/rm
index 62251380cf..dc7b9ed57b 100644
--- a/tests/ref/lavf/rm
+++ b/tests/ref/lavf/rm
@@ -1,2 +1,2 @@ 
-a7b0ac6e5131bbf662a07ccc82ab8618 *tests/data/lavf/lavf.rm
+b471964c3f313ed33245ef3e56f144c0 *tests/data/lavf/lavf.rm
 346424 tests/data/lavf/lavf.rm
-- 
2.40.1