Message ID | 20180506114100.4223-3-u@pkh.me |
---|---|
State | Superseded |
Headers | show |
On Sun, May 06, 2018 at 01:40:53PM +0200, Clément Bœsch wrote: > SIMD code will not have to deal with padding itself. Overwriting in that > function may have been possible but involve large overreading of the > sources. Instead, we simply make sure the width to process is always a > multiple of 16. Additionally, there must be some actual area to process > so the SIMD code can have its boundary checks after processing the first > pixels. > --- > libavfilter/vf_nlmeans.c | 25 ++++++++++++++++++------- > 1 file changed, 18 insertions(+), 7 deletions(-) > > diff --git a/libavfilter/vf_nlmeans.c b/libavfilter/vf_nlmeans.c > index d222d3913e..21f981a605 100644 > --- a/libavfilter/vf_nlmeans.c > +++ b/libavfilter/vf_nlmeans.c > @@ -157,6 +157,9 @@ static void compute_safe_ssd_integral_image_c(uint32_t *dst, int dst_linesize_32 > { > int x, y; > > + /* SIMD-friendly assumptions allowed here */ > + av_assert2(!(w & 0xf) && w >= 16 && h >= 1); > + > for (y = 0; y < h; y++) { > uint32_t acc = dst[-1] - dst[-dst_linesize_32 - 1]; > > @@ -257,9 +260,16 @@ static void compute_ssd_integral_image(uint32_t *ii, int ii_linesize_32, > // to compare the 2 sources pixels > const int startx_safe = FFMAX(s1x, s2x); > const int starty_safe = FFMAX(s1y, s2y); > - const int endx_safe = FFMIN(s1x + w, s2x + w); > + const int u_endx_safe = FFMIN(s1x + w, s2x + w); // unaligned > const int endy_safe = FFMIN(s1y + h, s2y + h); > > + // deduce the safe area width and height > + const int safe_pw = (u_endx_safe - startx_safe) & ~0xf; > + const int safe_ph = endy_safe - starty_safe; > + > + // adjusted end x position of the safe area after width of the safe area gets aligned > + const int endx_safe = startx_safe + safe_pw; > + > // top part where only one of s1 and s2 is still readable, or none at all > compute_unsafe_ssd_integral_image(ii, ii_linesize_32, > 0, 0, > @@ -273,24 +283,25 @@ static void compute_ssd_integral_image(uint32_t *ii, int ii_linesize_32, > 0, starty_safe, > src, linesize, > offx, offy, e, w, h, > - startx_safe, endy_safe - starty_safe); > + startx_safe, safe_ph); > > // main and safe part of the integral > av_assert1(startx_safe - s1x >= 0); av_assert1(startx_safe - s1x < w); > av_assert1(starty_safe - s1y >= 0); av_assert1(starty_safe - s1y < h); > av_assert1(startx_safe - s2x >= 0); av_assert1(startx_safe - s2x < w); > av_assert1(starty_safe - s2y >= 0); av_assert1(starty_safe - s2y < h); > - compute_safe_ssd_integral_image_c(ii + starty_safe*ii_linesize_32 + startx_safe, ii_linesize_32, > - src + (starty_safe - s1y) * linesize + (startx_safe - s1x), linesize, > - src + (starty_safe - s2y) * linesize + (startx_safe - s2x), linesize, > - endx_safe - startx_safe, endy_safe - starty_safe); > + if (safe_pw && safe_ph) > + dsp->compute_safe_ssd_integral_image(ii + starty_safe*ii_linesize_32 + startx_safe, ii_linesize_32, > + src + (starty_safe - s1y) * linesize + (startx_safe - s1x), linesize, > + src + (starty_safe - s2y) * linesize + (startx_safe - s2x), linesize, > + safe_pw, safe_ph); i think this is or i am missing some change libavfilter/vf_nlmeans.c: In function ‘compute_ssd_integral_image’: libavfilter/vf_nlmeans.c:294:9: error: ‘dsp’ undeclared (first use in this function) dsp->compute_safe_ssd_integral_image(ii + starty_safe*ii_linesize_32 + startx_safe, ii_linesize_32, ^ libavfilter/vf_nlmeans.c:294:9: note: each undeclared identifier is reported only once for each function it appears in libavfilter/vf_nlmeans.c: At top level: libavfilter/vf_nlmeans.c:153:13: warning: ‘compute_safe_ssd_integral_image_c’ defined but not used [-Wunused-function] static void compute_safe_ssd_integral_image_c(uint32_t *dst, int dst_linesize_32, ^ make: *** [libavfilter/vf_nlmeans.o] Error 1 make: *** Waiting for unfinished jobs.... [...]
On Mon, May 07, 2018 at 12:14:37AM +0200, Michael Niedermayer wrote: > On Sun, May 06, 2018 at 01:40:53PM +0200, Clément Bœsch wrote: > > SIMD code will not have to deal with padding itself. Overwriting in that > > function may have been possible but involve large overreading of the > > sources. Instead, we simply make sure the width to process is always a > > multiple of 16. Additionally, there must be some actual area to process > > so the SIMD code can have its boundary checks after processing the first > > pixels. > > --- > > libavfilter/vf_nlmeans.c | 25 ++++++++++++++++++------- > > 1 file changed, 18 insertions(+), 7 deletions(-) > > > > diff --git a/libavfilter/vf_nlmeans.c b/libavfilter/vf_nlmeans.c > > index d222d3913e..21f981a605 100644 > > --- a/libavfilter/vf_nlmeans.c > > +++ b/libavfilter/vf_nlmeans.c > > @@ -157,6 +157,9 @@ static void compute_safe_ssd_integral_image_c(uint32_t *dst, int dst_linesize_32 > > { > > int x, y; > > > > + /* SIMD-friendly assumptions allowed here */ > > + av_assert2(!(w & 0xf) && w >= 16 && h >= 1); > > + > > for (y = 0; y < h; y++) { > > uint32_t acc = dst[-1] - dst[-dst_linesize_32 - 1]; > > > > @@ -257,9 +260,16 @@ static void compute_ssd_integral_image(uint32_t *ii, int ii_linesize_32, > > // to compare the 2 sources pixels > > const int startx_safe = FFMAX(s1x, s2x); > > const int starty_safe = FFMAX(s1y, s2y); > > - const int endx_safe = FFMIN(s1x + w, s2x + w); > > + const int u_endx_safe = FFMIN(s1x + w, s2x + w); // unaligned > > const int endy_safe = FFMIN(s1y + h, s2y + h); > > > > + // deduce the safe area width and height > > + const int safe_pw = (u_endx_safe - startx_safe) & ~0xf; > > + const int safe_ph = endy_safe - starty_safe; > > + > > + // adjusted end x position of the safe area after width of the safe area gets aligned > > + const int endx_safe = startx_safe + safe_pw; > > + > > // top part where only one of s1 and s2 is still readable, or none at all > > compute_unsafe_ssd_integral_image(ii, ii_linesize_32, > > 0, 0, > > @@ -273,24 +283,25 @@ static void compute_ssd_integral_image(uint32_t *ii, int ii_linesize_32, > > 0, starty_safe, > > src, linesize, > > offx, offy, e, w, h, > > - startx_safe, endy_safe - starty_safe); > > + startx_safe, safe_ph); > > > > // main and safe part of the integral > > av_assert1(startx_safe - s1x >= 0); av_assert1(startx_safe - s1x < w); > > av_assert1(starty_safe - s1y >= 0); av_assert1(starty_safe - s1y < h); > > av_assert1(startx_safe - s2x >= 0); av_assert1(startx_safe - s2x < w); > > av_assert1(starty_safe - s2y >= 0); av_assert1(starty_safe - s2y < h); > > - compute_safe_ssd_integral_image_c(ii + starty_safe*ii_linesize_32 + startx_safe, ii_linesize_32, > > - src + (starty_safe - s1y) * linesize + (startx_safe - s1x), linesize, > > - src + (starty_safe - s2y) * linesize + (startx_safe - s2x), linesize, > > - endx_safe - startx_safe, endy_safe - starty_safe); > > + if (safe_pw && safe_ph) > > + dsp->compute_safe_ssd_integral_image(ii + starty_safe*ii_linesize_32 + startx_safe, ii_linesize_32, > > + src + (starty_safe - s1y) * linesize + (startx_safe - s1x), linesize, > > + src + (starty_safe - s2y) * linesize + (startx_safe - s2x), linesize, > > + safe_pw, safe_ph); > > > i think this is or i am missing some change > > libavfilter/vf_nlmeans.c: In function ‘compute_ssd_integral_image’: > libavfilter/vf_nlmeans.c:294:9: error: ‘dsp’ undeclared (first use in this function) > dsp->compute_safe_ssd_integral_image(ii + starty_safe*ii_linesize_32 + startx_safe, ii_linesize_32, > ^ > libavfilter/vf_nlmeans.c:294:9: note: each undeclared identifier is reported only once for each function it appears in > libavfilter/vf_nlmeans.c: At top level: > libavfilter/vf_nlmeans.c:153:13: warning: ‘compute_safe_ssd_integral_image_c’ defined but not used [-Wunused-function] > static void compute_safe_ssd_integral_image_c(uint32_t *dst, int dst_linesize_32, > ^ > make: *** [libavfilter/vf_nlmeans.o] Error 1 > make: *** Waiting for unfinished jobs.... Yeah I made a mistake while splitting commit, this is fixed locally. At this step it's supposed to still be calling compute_safe_ssd_integral_image_c() directly (but its last 2 parameters changed).
diff --git a/libavfilter/vf_nlmeans.c b/libavfilter/vf_nlmeans.c index d222d3913e..21f981a605 100644 --- a/libavfilter/vf_nlmeans.c +++ b/libavfilter/vf_nlmeans.c @@ -157,6 +157,9 @@ static void compute_safe_ssd_integral_image_c(uint32_t *dst, int dst_linesize_32 { int x, y; + /* SIMD-friendly assumptions allowed here */ + av_assert2(!(w & 0xf) && w >= 16 && h >= 1); + for (y = 0; y < h; y++) { uint32_t acc = dst[-1] - dst[-dst_linesize_32 - 1]; @@ -257,9 +260,16 @@ static void compute_ssd_integral_image(uint32_t *ii, int ii_linesize_32, // to compare the 2 sources pixels const int startx_safe = FFMAX(s1x, s2x); const int starty_safe = FFMAX(s1y, s2y); - const int endx_safe = FFMIN(s1x + w, s2x + w); + const int u_endx_safe = FFMIN(s1x + w, s2x + w); // unaligned const int endy_safe = FFMIN(s1y + h, s2y + h); + // deduce the safe area width and height + const int safe_pw = (u_endx_safe - startx_safe) & ~0xf; + const int safe_ph = endy_safe - starty_safe; + + // adjusted end x position of the safe area after width of the safe area gets aligned + const int endx_safe = startx_safe + safe_pw; + // top part where only one of s1 and s2 is still readable, or none at all compute_unsafe_ssd_integral_image(ii, ii_linesize_32, 0, 0, @@ -273,24 +283,25 @@ static void compute_ssd_integral_image(uint32_t *ii, int ii_linesize_32, 0, starty_safe, src, linesize, offx, offy, e, w, h, - startx_safe, endy_safe - starty_safe); + startx_safe, safe_ph); // main and safe part of the integral av_assert1(startx_safe - s1x >= 0); av_assert1(startx_safe - s1x < w); av_assert1(starty_safe - s1y >= 0); av_assert1(starty_safe - s1y < h); av_assert1(startx_safe - s2x >= 0); av_assert1(startx_safe - s2x < w); av_assert1(starty_safe - s2y >= 0); av_assert1(starty_safe - s2y < h); - compute_safe_ssd_integral_image_c(ii + starty_safe*ii_linesize_32 + startx_safe, ii_linesize_32, - src + (starty_safe - s1y) * linesize + (startx_safe - s1x), linesize, - src + (starty_safe - s2y) * linesize + (startx_safe - s2x), linesize, - endx_safe - startx_safe, endy_safe - starty_safe); + if (safe_pw && safe_ph) + dsp->compute_safe_ssd_integral_image(ii + starty_safe*ii_linesize_32 + startx_safe, ii_linesize_32, + src + (starty_safe - s1y) * linesize + (startx_safe - s1x), linesize, + src + (starty_safe - s2y) * linesize + (startx_safe - s2x), linesize, + safe_pw, safe_ph); // right part of the integral compute_unsafe_ssd_integral_image(ii, ii_linesize_32, endx_safe, starty_safe, src, linesize, offx, offy, e, w, h, - ii_w - endx_safe, endy_safe - starty_safe); + ii_w - endx_safe, safe_ph); // bottom part where only one of s1 and s2 is still readable, or none at all compute_unsafe_ssd_integral_image(ii, ii_linesize_32,