diff mbox series

[FFmpeg-devel] avfilter/vf_ssim: improve precision

Message ID 20200131090454.17838-1-onemda@gmail.com
State Accepted
Headers show
Series [FFmpeg-devel] avfilter/vf_ssim: improve precision
Related show

Checks

Context Check Description
andriy/ffmpeg-patchwork pending
andriy/ffmpeg-patchwork success Applied patch
andriy/ffmpeg-patchwork success Configure finished
andriy/ffmpeg-patchwork success Make finished
andriy/ffmpeg-patchwork fail Make fate failed

Commit Message

Paul B Mahol Jan. 31, 2020, 9:04 a.m. UTC
Use doubles for accumulating floats.

Signed-off-by: Paul B Mahol <onemda@gmail.com>
---
 libavfilter/ssim.h                    |  2 +-
 libavfilter/vf_ssim.c                 | 18 +++++------
 libavfilter/x86/vf_ssim.asm           | 36 ++++++++++++++--------
 libavfilter/x86/vf_ssim_init.c        |  2 +-
 tests/ref/fate/filter-refcmp-ssim-rgb | 44 +++++++++++++--------------
 tests/ref/fate/filter-refcmp-ssim-yuv | 26 ++++++++--------
 6 files changed, 70 insertions(+), 58 deletions(-)

Comments

Paul B Mahol Feb. 3, 2020, 12:29 p.m. UTC | #1
Will apply soon.

On 1/31/20, Paul B Mahol <onemda@gmail.com> wrote:
> Use doubles for accumulating floats.
>
> Signed-off-by: Paul B Mahol <onemda@gmail.com>
> ---
>  libavfilter/ssim.h                    |  2 +-
>  libavfilter/vf_ssim.c                 | 18 +++++------
>  libavfilter/x86/vf_ssim.asm           | 36 ++++++++++++++--------
>  libavfilter/x86/vf_ssim_init.c        |  2 +-
>  tests/ref/fate/filter-refcmp-ssim-rgb | 44 +++++++++++++--------------
>  tests/ref/fate/filter-refcmp-ssim-yuv | 26 ++++++++--------
>  6 files changed, 70 insertions(+), 58 deletions(-)
>
Carl Eugen Hoyos Feb. 3, 2020, 10:28 p.m. UTC | #2
Am Fr., 31. Jan. 2020 um 10:05 Uhr schrieb Paul B Mahol <onemda@gmail.com>:
>
> Use doubles for accumulating floats.

Consider mentioning the horrible ticket.

Thank you for looking into this, Carl Eugen
diff mbox series

Patch

diff --git a/libavfilter/ssim.h b/libavfilter/ssim.h
index ac0395a22a..a6a41aabe6 100644
--- a/libavfilter/ssim.h
+++ b/libavfilter/ssim.h
@@ -28,7 +28,7 @@  typedef struct SSIMDSPContext {
     void (*ssim_4x4_line)(const uint8_t *buf, ptrdiff_t buf_stride,
                           const uint8_t *ref, ptrdiff_t ref_stride,
                           int (*sums)[4], int w);
-    float (*ssim_end_line)(const int (*sum0)[4], const int (*sum1)[4], int w);
+    double (*ssim_end_line)(const int (*sum0)[4], const int (*sum1)[4], int w);
 } SSIMDSPContext;
 
 void ff_ssim_init_x86(SSIMDSPContext *dsp);
diff --git a/libavfilter/vf_ssim.c b/libavfilter/vf_ssim.c
index c08fbcdcc2..17dce8e8e8 100644
--- a/libavfilter/vf_ssim.c
+++ b/libavfilter/vf_ssim.c
@@ -55,13 +55,13 @@  typedef struct SSIMContext {
     uint64_t nb_frames;
     double ssim[4], ssim_total;
     char comps[4];
-    float coefs[4];
+    double coefs[4];
     uint8_t rgba_map[4];
     int planewidth[4];
     int planeheight[4];
     int *temp;
     int is_rgb;
-    float (*ssim_plane)(SSIMDSPContext *dsp,
+    double (*ssim_plane)(SSIMDSPContext *dsp,
                         uint8_t *main, int main_stride,
                         uint8_t *ref, int ref_stride,
                         int width, int height, void *temp,
@@ -206,9 +206,9 @@  static float ssim_endn_16bit(const int64_t (*sum0)[4], const int64_t (*sum1)[4],
     return ssim;
 }
 
-static float ssim_endn_8bit(const int (*sum0)[4], const int (*sum1)[4], int width)
+static double ssim_endn_8bit(const int (*sum0)[4], const int (*sum1)[4], int width)
 {
-    float ssim = 0.0;
+    double ssim = 0.0;
     int i;
 
     for (i = 0; i < width; i++)
@@ -221,14 +221,14 @@  static float ssim_endn_8bit(const int (*sum0)[4], const int (*sum1)[4], int widt
 
 #define SUM_LEN(w) (((w) >> 2) + 3)
 
-static float ssim_plane_16bit(SSIMDSPContext *dsp,
+static double ssim_plane_16bit(SSIMDSPContext *dsp,
                               uint8_t *main, int main_stride,
                               uint8_t *ref, int ref_stride,
                               int width, int height, void *temp,
                               int max)
 {
     int z = 0, y;
-    float ssim = 0.0;
+    double ssim = 0.0;
     int64_t (*sum0)[4] = temp;
     int64_t (*sum1)[4] = sum0 + SUM_LEN(width);
 
@@ -249,14 +249,14 @@  static float ssim_plane_16bit(SSIMDSPContext *dsp,
     return ssim / ((height - 1) * (width - 1));
 }
 
-static float ssim_plane(SSIMDSPContext *dsp,
+static double ssim_plane(SSIMDSPContext *dsp,
                         uint8_t *main, int main_stride,
                         uint8_t *ref, int ref_stride,
                         int width, int height, void *temp,
                         int max)
 {
     int z = 0, y;
-    float ssim = 0.0;
+    double ssim = 0.0;
     int (*sum0)[4] = temp;
     int (*sum1)[4] = sum0 + SUM_LEN(width);
 
@@ -288,7 +288,7 @@  static int do_ssim(FFFrameSync *fs)
     SSIMContext *s = ctx->priv;
     AVFrame *master, *ref;
     AVDictionary **metadata;
-    float c[4], ssimv = 0.0;
+    double c[4], ssimv = 0.0;
     int ret, i;
 
     ret = ff_framesync_dualinput_get(fs, &master, &ref);
diff --git a/libavfilter/x86/vf_ssim.asm b/libavfilter/x86/vf_ssim.asm
index 3293e66701..4cd6293b59 100644
--- a/libavfilter/x86/vf_ssim.asm
+++ b/libavfilter/x86/vf_ssim.asm
@@ -169,8 +169,9 @@  SSIM_4X4_LINE 8
 %endif
 
 INIT_XMM sse4
-cglobal ssim_end_line, 3, 3, 6, sum0, sum1, w
+cglobal ssim_end_line, 3, 3, 7, sum0, sum1, w
     pxor              m0, m0
+    pxor              m6, m6
 .loop:
     mova              m1, [sum0q+mmsize*0]
     mova              m2, [sum0q+mmsize*1]
@@ -214,34 +215,45 @@  cglobal ssim_end_line, 3, 3, 6, sum0, sum1, w
     mulps             m4, m5
     mulps             m3, m1
     divps             m4, m3                    ; ssim_endl
-    addps             m0, m4                    ; ssim
+    mova              m5, m4
+    cvtps2pd          m3, m5
+    movhlps           m5, m5
+    cvtps2pd          m5, m5
+    addpd             m0, m3                    ; ssim
+    addpd             m6, m5                    ; ssim
     add            sum0q, mmsize*4
     add            sum1q, mmsize*4
     sub               wd, 4
     jg .loop
 
-    ; subps the ones we added too much
+    ; subpd the ones we added too much
     test              wd, wd
     jz .end
     add               wd, 4
+    test              wd, 3
+    jz .skip3
     test              wd, 2
     jz .skip2
-    psrldq            m4, 8
-.skip2:
     test              wd, 1
     jz .skip1
-    psrldq            m4, 4
+.skip3:
+    psrldq            m5, 8
+    subpd             m6, m5
+    jmp .end
+.skip2:
+    psrldq            m3, 8
+    subpd             m0, m3
+    jmp .end
 .skip1:
-    subps             m0, m4
+    psrldq            m5, 8
+    subpd             m6, m5
 
 .end:
+    addpd             m0, m6
     movhlps           m4, m0
-    addps             m0, m4
-    movss             m4, m0
-    shufps            m0, m0, 1
-    addss             m0, m4
+    addpd             m0, m4
 %if ARCH_X86_32
-    movss            r0m, m0
+    movsd            r0m, m0
     fld             r0mp
 %endif
     RET
diff --git a/libavfilter/x86/vf_ssim_init.c b/libavfilter/x86/vf_ssim_init.c
index 599c928403..cbaa20ef16 100644
--- a/libavfilter/x86/vf_ssim_init.c
+++ b/libavfilter/x86/vf_ssim_init.c
@@ -28,7 +28,7 @@  void ff_ssim_4x4_line_ssse3(const uint8_t *buf, ptrdiff_t buf_stride,
 void ff_ssim_4x4_line_xop  (const uint8_t *buf, ptrdiff_t buf_stride,
                             const uint8_t *ref, ptrdiff_t ref_stride,
                             int (*sums)[4], int w);
-float ff_ssim_end_line_sse4(const int (*sum0)[4], const int (*sum1)[4], int w);
+double ff_ssim_end_line_sse4(const int (*sum0)[4], const int (*sum1)[4], int w);
 
 void ff_ssim_init_x86(SSIMDSPContext *dsp)
 {
diff --git a/tests/ref/fate/filter-refcmp-ssim-rgb b/tests/ref/fate/filter-refcmp-ssim-rgb
index 8c23c60b37..cb3b5f2e12 100644
--- a/tests/ref/fate/filter-refcmp-ssim-rgb
+++ b/tests/ref/fate/filter-refcmp-ssim-rgb
@@ -1,30 +1,30 @@ 
 frame:0    pts:0       pts_time:0
+lavfi.ssim.R=0.73
+lavfi.ssim.G=0.77
+lavfi.ssim.B=0.90
+lavfi.ssim.All=0.80
+lavfi.ssim.dB=7.01
+frame:1    pts:1       pts_time:1
 lavfi.ssim.R=0.72
 lavfi.ssim.G=0.76
-lavfi.ssim.B=0.89
-lavfi.ssim.All=0.79
-lavfi.ssim.dB=6.74
-frame:1    pts:1       pts_time:1
-lavfi.ssim.R=0.70
-lavfi.ssim.G=0.74
-lavfi.ssim.B=0.85
-lavfi.ssim.All=0.77
-lavfi.ssim.dB=6.31
+lavfi.ssim.B=0.86
+lavfi.ssim.All=0.78
+lavfi.ssim.dB=6.56
 frame:2    pts:2       pts_time:2
-lavfi.ssim.R=0.71
+lavfi.ssim.R=0.72
+lavfi.ssim.G=0.76
+lavfi.ssim.B=0.85
+lavfi.ssim.All=0.78
+lavfi.ssim.dB=6.53
+frame:3    pts:3       pts_time:3
+lavfi.ssim.R=0.72
 lavfi.ssim.G=0.75
 lavfi.ssim.B=0.84
+lavfi.ssim.All=0.77
+lavfi.ssim.dB=6.35
+frame:4    pts:4       pts_time:4
+lavfi.ssim.R=0.72
+lavfi.ssim.G=0.75
+lavfi.ssim.B=0.82
 lavfi.ssim.All=0.76
 lavfi.ssim.dB=6.29
-frame:3    pts:3       pts_time:3
-lavfi.ssim.R=0.70
-lavfi.ssim.G=0.73
-lavfi.ssim.B=0.83
-lavfi.ssim.All=0.76
-lavfi.ssim.dB=6.11
-frame:4    pts:4       pts_time:4
-lavfi.ssim.R=0.71
-lavfi.ssim.G=0.74
-lavfi.ssim.B=0.80
-lavfi.ssim.All=0.75
-lavfi.ssim.dB=6.05
diff --git a/tests/ref/fate/filter-refcmp-ssim-yuv b/tests/ref/fate/filter-refcmp-ssim-yuv
index 5c8ffb9483..209c8bd600 100644
--- a/tests/ref/fate/filter-refcmp-ssim-yuv
+++ b/tests/ref/fate/filter-refcmp-ssim-yuv
@@ -1,30 +1,30 @@ 
 frame:0    pts:0       pts_time:0
-lavfi.ssim.Y=0.80
+lavfi.ssim.Y=0.82
 lavfi.ssim.U=0.76
 lavfi.ssim.V=0.69
-lavfi.ssim.All=0.76
-lavfi.ssim.dB=6.25
+lavfi.ssim.All=0.77
+lavfi.ssim.dB=6.37
 frame:1    pts:1       pts_time:1
-lavfi.ssim.Y=0.80
+lavfi.ssim.Y=0.81
 lavfi.ssim.U=0.73
 lavfi.ssim.V=0.68
-lavfi.ssim.All=0.75
-lavfi.ssim.dB=6.08
+lavfi.ssim.All=0.76
+lavfi.ssim.dB=6.20
 frame:2    pts:2       pts_time:2
-lavfi.ssim.Y=0.80
+lavfi.ssim.Y=0.82
 lavfi.ssim.U=0.73
 lavfi.ssim.V=0.68
-lavfi.ssim.All=0.75
-lavfi.ssim.dB=6.10
+lavfi.ssim.All=0.76
+lavfi.ssim.dB=6.22
 frame:3    pts:3       pts_time:3
-lavfi.ssim.Y=0.79
+lavfi.ssim.Y=0.81
 lavfi.ssim.U=0.72
 lavfi.ssim.V=0.68
 lavfi.ssim.All=0.75
-lavfi.ssim.dB=5.94
+lavfi.ssim.dB=6.06
 frame:4    pts:4       pts_time:4
-lavfi.ssim.Y=0.80
+lavfi.ssim.Y=0.81
 lavfi.ssim.U=0.72
 lavfi.ssim.V=0.68
 lavfi.ssim.All=0.75
-lavfi.ssim.dB=5.97
+lavfi.ssim.dB=6.05