diff mbox series

[FFmpeg-devel,v2,1/4] avcodec/aom_film_grain: add AOM film grain synthesis

Message ID 20240308132108.28337-1-ffmpeg@haasn.xyz
State New
Headers show
Series [FFmpeg-devel,v2,1/4] avcodec/aom_film_grain: add AOM film grain synthesis | expand

Checks

Context Check Description
yinshiyou/make_loongarch64 success Make finished
yinshiyou/make_fate_loongarch64 success Make fate finished
andriy/make_x86 success Make finished
andriy/make_fate_x86 success Make fate finished

Commit Message

Niklas Haas March 8, 2024, 1:21 p.m. UTC
From: Niklas Haas <git@haasn.dev>

Implementation copied wholesale from dav1d, sans SIMD, under permissive
license. This implementation was extensively verified to be bit-exact,
so it serves as a much better starting point than trying to re-engineer
this from scratch for no reason. (I also authored the original
implementation in dav1d, so any "clean room" implementation would end up
looking much the same, anyway)

The notable changes I had to make while adapting this from the dav1d
code-base to the FFmpeg codebase include:

- reordering variable declarations to avoid triggering warnings
- replacing several inline helpers by avutil equivalents
- changing code that accesses frame metadata
- replacing raw plane copying logic by av_image_copy_plane

Apart from this, the implementation is basically unmodified.
---
 libavcodec/aom_film_grain.c          | 310 ++++++++++++++
 libavcodec/aom_film_grain.h          |  38 ++
 libavcodec/aom_film_grain_template.c | 577 +++++++++++++++++++++++++++
 3 files changed, 925 insertions(+)
 create mode 100644 libavcodec/aom_film_grain.c
 create mode 100644 libavcodec/aom_film_grain.h
 create mode 100644 libavcodec/aom_film_grain_template.c

Comments

James Almer March 8, 2024, 1:31 p.m. UTC | #1
On 3/8/2024 10:21 AM, Niklas Haas wrote:
> From: Niklas Haas <git@haasn.dev>
> 
> Implementation copied wholesale from dav1d, sans SIMD, under permissive
> license. This implementation was extensively verified to be bit-exact,
> so it serves as a much better starting point than trying to re-engineer
> this from scratch for no reason. (I also authored the original
> implementation in dav1d, so any "clean room" implementation would end up
> looking much the same, anyway)
> 
> The notable changes I had to make while adapting this from the dav1d
> code-base to the FFmpeg codebase include:
> 
> - reordering variable declarations to avoid triggering warnings
> - replacing several inline helpers by avutil equivalents
> - changing code that accesses frame metadata
> - replacing raw plane copying logic by av_image_copy_plane
> 
> Apart from this, the implementation is basically unmodified.

Do we want this to be public, both the struct and the decoding functions?
It could then be used by libavfilter, or even outside our libraries. The hevc
decoder would export the relevant T.35 SEI in the new struct when told not
to apply film grain, like we already do in av1.

> ---
>   libavcodec/aom_film_grain.c          | 310 ++++++++++++++
>   libavcodec/aom_film_grain.h          |  38 ++
>   libavcodec/aom_film_grain_template.c | 577 +++++++++++++++++++++++++++
>   3 files changed, 925 insertions(+)
>   create mode 100644 libavcodec/aom_film_grain.c
>   create mode 100644 libavcodec/aom_film_grain.h
>   create mode 100644 libavcodec/aom_film_grain_template.c
> 
> diff --git a/libavcodec/aom_film_grain.c b/libavcodec/aom_film_grain.c
> new file mode 100644
> index 00000000000..ffcd71b584b
> --- /dev/null
> +++ b/libavcodec/aom_film_grain.c
> @@ -0,0 +1,310 @@
> +/*
> + * AOM film grain synthesis
> + * Copyright (c) 2023 Niklas Haas <ffmpeg@haasn.xyz>
> + *
> + * This file is part of FFmpeg.
> + *
> + * FFmpeg is free software; you can redistribute it and/or
> + * modify it under the terms of the GNU Lesser General Public
> + * License as published by the Free Software Foundation; either
> + * version 2.1 of the License, or (at your option) any later version.
> + *
> + * FFmpeg is distributed in the hope that it will be useful,
> + * but WITHOUT ANY WARRANTY; without even the implied warranty of
> + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> + * Lesser General Public License for more details.
> + *
> + * You should have received a copy of the GNU Lesser General Public
> + * License along with FFmpeg; if not, write to the Free Software
> + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
> + */
> +
> +/**
> + * @file
> + * AOM film grain synthesis.
> + * @author Niklas Haas <ffmpeg@haasn.xyz>
> + */
> +
> +#include "libavutil/avassert.h"
> +#include "libavutil/imgutils.h"
> +
> +#include "aom_film_grain.h"
> +
> +// Common/shared helpers (not dependent on BIT_DEPTH)
> +/*
> + * Advance the 16-bit shift register in *state by one step and return the
> + * top `bits` bits of the new register value.
> + */
> +static inline int get_random_number(const int bits, unsigned *const state) {
> +    /* Snapshot of the register, held as int exactly like the shifts expect */
> +    const int reg = *state;
> +    /* Feedback bit: XOR of register bits 0, 1, 3 and 12 */
> +    const unsigned feedback = (reg ^ (reg >> 1) ^ (reg >> 3) ^ (reg >> 12)) & 1;
> +    const unsigned next = (reg >> 1) | (feedback << 15);
> +    const int mask = (1 << bits) - 1;
> +
> +    *state = next;
> +    return (next >> (16 - bits)) & mask;
> +}
> +
> +/* Shift x right by `shift` bits after adding half of the divisor (0 when shift is 0). */
> +static inline int round2(const int x, const uint64_t shift) {
> +    const int half = (1 << shift) >> 1;
> +
> +    return (x + half) >> shift;
> +}
> +
> +enum {
> +    GRAIN_WIDTH      = 82, // columns of a grain template; also the row length of every grain buffer
> +    GRAIN_HEIGHT     = 73, // rows filled for a full-height grain template
> +    SUB_GRAIN_WIDTH  = 44, // columns filled when chroma is horizontally subsampled
> +    SUB_GRAIN_HEIGHT = 38, // rows filled when chroma is vertically subsampled
> +    FG_BLOCK_SIZE    = 32, // block stride used when sampling the grain LUT
> +};
> +
> +static const int16_t gaussian_sequence[2048]; // declared early for the template code; the values follow later in this file
> +// Instantiate the template once per BIT_DEPTH; BIT_DEPTH > 8 selects int16_t grain entries
> +#define BIT_DEPTH 16
> +#include "aom_film_grain_template.c"
> +#undef BIT_DEPTH
> +// Second instantiation: BIT_DEPTH 8 selects int8_t grain entries and a fixed bit depth of 8
> +#define BIT_DEPTH 8
> +#include "aom_film_grain_template.c"
> +#undef BIT_DEPTH
> +
> +
> +/**
> + * Apply AV1 film grain synthesis to `in` and store the result in `out`.
> + *
> + * Planes for which the parameters carry no scaling points are copied from
> + * `in` unchanged; the frame is then handed to the 8-bit or 16-bit template
> + * implementation selected by the pixel format.
> + *
> + * @param out    destination frame; must have the same format as `in` (asserted)
> + * @param in     source frame
> + * @param params film grain parameters; type must be AV_FILM_GRAIN_PARAMS_AV1
> + *               (asserted)
> + * @return the result of the bit-depth-specific implementation, or
> + *         AVERROR_INVALIDDATA if the pixel format is unknown or unsupported
> + */
> +int ff_aom_apply_film_grain(AVFrame *out, const AVFrame *in,
> +                            const AVFilmGrainParams *params)
> +{
> +    const AVFilmGrainAOMParams *const data = &params->codec.aom;
> +    const AVPixFmtDescriptor *const desc = av_pix_fmt_desc_get(out->format);
> +    int subx, suby, pxstep;
> +
> +    av_assert0(out->format == in->format);
> +    av_assert0(params->type == AV_FILM_GRAIN_PARAMS_AV1);
> +
> +    // An unknown pixel format has no descriptor; reject it instead of
> +    // dereferencing a NULL pointer below
> +    if (!desc)
> +        return AVERROR_INVALIDDATA;
> +
> +    subx   = desc->log2_chroma_w;
> +    suby   = desc->log2_chroma_h;
> +    pxstep = desc->comp[0].step;
> +
> +    // Copy over the non-modified planes
> +    if (!data->num_y_points) {
> +        av_image_copy_plane(out->data[0], out->linesize[0],
> +                            in->data[0], in->linesize[0],
> +                            out->width * pxstep, out->height);
> +    }
> +    for (int uv = 0; uv < 2; uv++) {
> +        if (!data->num_uv_points[uv]) {
> +            av_image_copy_plane(out->data[1+uv], out->linesize[1+uv],
> +                                in->data[1+uv], in->linesize[1+uv],
> +                                AV_CEIL_RSHIFT(out->width, subx) * pxstep,
> +                                AV_CEIL_RSHIFT(out->height, suby));
> +        }
> +    }
> +
> +    // Dispatch on bit depth; the 16-bit variant takes the depth explicitly
> +    switch (in->format) {
> +    case AV_PIX_FMT_GRAY8:
> +    case AV_PIX_FMT_YUV420P:
> +    case AV_PIX_FMT_YUV422P:
> +    case AV_PIX_FMT_YUV444P:
> +    case AV_PIX_FMT_YUVJ420P:
> +    case AV_PIX_FMT_YUVJ422P:
> +    case AV_PIX_FMT_YUVJ444P:
> +        return apply_film_grain_8(out, in, params);
> +    case AV_PIX_FMT_GRAY9:
> +    case AV_PIX_FMT_YUV420P9:
> +    case AV_PIX_FMT_YUV422P9:
> +    case AV_PIX_FMT_YUV444P9:
> +        return apply_film_grain_16(out, in, params, 9);
> +    case AV_PIX_FMT_GRAY10:
> +    case AV_PIX_FMT_YUV420P10:
> +    case AV_PIX_FMT_YUV422P10:
> +    case AV_PIX_FMT_YUV444P10:
> +        return apply_film_grain_16(out, in, params, 10);
> +    case AV_PIX_FMT_GRAY12:
> +    case AV_PIX_FMT_YUV420P12:
> +    case AV_PIX_FMT_YUV422P12:
> +    case AV_PIX_FMT_YUV444P12:
> +        return apply_film_grain_16(out, in, params, 12);
> +    }
> +
> +    /* The AV1 spec only defines film grain synthesis for these formats */
> +    return AVERROR_INVALIDDATA;
> +}
> +
> +// Taken from the AV1 spec. Range is [-2048, 2047], mean is 0 and stddev is 512
> +static const int16_t gaussian_sequence[2048] = {
> +    56,    568,   -180,  172,   124,   -84,   172,   -64,   -900,  24,   820,
> +    224,   1248,  996,   272,   -8,    -916,  -388,  -732,  -104,  -188, 800,
> +    112,   -652,  -320,  -376,  140,   -252,  492,   -168,  44,    -788, 588,
> +    -584,  500,   -228,  12,    680,   272,   -476,  972,   -100,  652,  368,
> +    432,   -196,  -720,  -192,  1000,  -332,  652,   -136,  -552,  -604, -4,
> +    192,   -220,  -136,  1000,  -52,   372,   -96,   -624,  124,   -24,  396,
> +    540,   -12,   -104,  640,   464,   244,   -208,  -84,   368,   -528, -740,
> +    248,   -968,  -848,  608,   376,   -60,   -292,  -40,   -156,  252,  -292,
> +    248,   224,   -280,  400,   -244,  244,   -60,   76,    -80,   212,  532,
> +    340,   128,   -36,   824,   -352,  -60,   -264,  -96,   -612,  416,  -704,
> +    220,   -204,  640,   -160,  1220,  -408,  900,   336,   20,    -336, -96,
> +    -792,  304,   48,    -28,   -1232, -1172, -448,  104,   -292,  -520, 244,
> +    60,    -948,  0,     -708,  268,   108,   356,   -548,  488,   -344, -136,
> +    488,   -196,  -224,  656,   -236,  -1128, 60,    4,     140,   276,  -676,
> +    -376,  168,   -108,  464,   8,     564,   64,    240,   308,   -300, -400,
> +    -456,  -136,  56,    120,   -408,  -116,  436,   504,   -232,  328,  844,
> +    -164,  -84,   784,   -168,  232,   -224,  348,   -376,  128,   568,  96,
> +    -1244, -288,  276,   848,   832,   -360,  656,   464,   -384,  -332, -356,
> +    728,   -388,  160,   -192,  468,   296,   224,   140,   -776,  -100, 280,
> +    4,     196,   44,    -36,   -648,  932,   16,    1428,  28,    528,  808,
> +    772,   20,    268,   88,    -332,  -284,  124,   -384,  -448,  208,  -228,
> +    -1044, -328,  660,   380,   -148,  -300,  588,   240,   540,   28,   136,
> +    -88,   -436,  256,   296,   -1000, 1400,  0,     -48,   1056,  -136, 264,
> +    -528,  -1108, 632,   -484,  -592,  -344,  796,   124,   -668,  -768, 388,
> +    1296,  -232,  -188,  -200,  -288,  -4,    308,   100,   -168,  256,  -500,
> +    204,   -508,  648,   -136,  372,   -272,  -120,  -1004, -552,  -548, -384,
> +    548,   -296,  428,   -108,  -8,    -912,  -324,  -224,  -88,   -112, -220,
> +    -100,  996,   -796,  548,   360,   -216,  180,   428,   -200,  -212, 148,
> +    96,    148,   284,   216,   -412,  -320,  120,   -300,  -384,  -604, -572,
> +    -332,  -8,    -180,  -176,  696,   116,   -88,   628,   76,    44,   -516,
> +    240,   -208,  -40,   100,   -592,  344,   -308,  -452,  -228,  20,   916,
> +    -1752, -136,  -340,  -804,  140,   40,    512,   340,   248,   184,  -492,
> +    896,   -156,  932,   -628,  328,   -688,  -448,  -616,  -752,  -100, 560,
> +    -1020, 180,   -800,  -64,   76,    576,   1068,  396,   660,   552,  -108,
> +    -28,   320,   -628,  312,   -92,   -92,   -472,  268,   16,    560,  516,
> +    -672,  -52,   492,   -100,  260,   384,   284,   292,   304,   -148, 88,
> +    -152,  1012,  1064,  -228,  164,   -376,  -684,  592,   -392,  156,  196,
> +    -524,  -64,   -884,  160,   -176,  636,   648,   404,   -396,  -436, 864,
> +    424,   -728,  988,   -604,  904,   -592,  296,   -224,  536,   -176, -920,
> +    436,   -48,   1176,  -884,  416,   -776,  -824,  -884,  524,   -548, -564,
> +    -68,   -164,  -96,   692,   364,   -692,  -1012, -68,   260,   -480, 876,
> +    -1116, 452,   -332,  -352,  892,   -1088, 1220,  -676,  12,    -292, 244,
> +    496,   372,   -32,   280,   200,   112,   -440,  -96,   24,    -644, -184,
> +    56,    -432,  224,   -980,  272,   -260,  144,   -436,  420,   356,  364,
> +    -528,  76,    172,   -744,  -368,  404,   -752,  -416,  684,   -688, 72,
> +    540,   416,   92,    444,   480,   -72,   -1416, 164,   -1172, -68,  24,
> +    424,   264,   1040,  128,   -912,  -524,  -356,  64,    876,   -12,  4,
> +    -88,   532,   272,   -524,  320,   276,   -508,  940,   24,    -400, -120,
> +    756,   60,    236,   -412,  100,   376,   -484,  400,   -100,  -740, -108,
> +    -260,  328,   -268,  224,   -200,  -416,  184,   -604,  -564,  -20,  296,
> +    60,    892,   -888,  60,    164,   68,    -760,  216,   -296,  904,  -336,
> +    -28,   404,   -356,  -568,  -208,  -1480, -512,  296,   328,   -360, -164,
> +    -1560, -776,  1156,  -428,  164,   -504,  -112,  120,   -216,  -148, -264,
> +    308,   32,    64,    -72,   72,    116,   176,   -64,   -272,  460,  -536,
> +    -784,  -280,  348,   108,   -752,  -132,  524,   -540,  -776,  116,  -296,
> +    -1196, -288,  -560,  1040,  -472,  116,   -848,  -1116, 116,   636,  696,
> +    284,   -176,  1016,  204,   -864,  -648,  -248,  356,   972,   -584, -204,
> +    264,   880,   528,   -24,   -184,  116,   448,   -144,  828,   524,  212,
> +    -212,  52,    12,    200,   268,   -488,  -404,  -880,  824,   -672, -40,
> +    908,   -248,  500,   716,   -576,  492,   -576,  16,    720,   -108, 384,
> +    124,   344,   280,   576,   -500,  252,   104,   -308,  196,   -188, -8,
> +    1268,  296,   1032,  -1196, 436,   316,   372,   -432,  -200,  -660, 704,
> +    -224,  596,   -132,  268,   32,    -452,  884,   104,   -1008, 424,  -1348,
> +    -280,  4,     -1168, 368,   476,   696,   300,   -8,    24,    180,  -592,
> +    -196,  388,   304,   500,   724,   -160,  244,   -84,   272,   -256, -420,
> +    320,   208,   -144,  -156,  156,   364,   452,   28,    540,   316,  220,
> +    -644,  -248,  464,   72,    360,   32,    -388,  496,   -680,  -48,  208,
> +    -116,  -408,  60,    -604,  -392,  548,   -840,  784,   -460,  656,  -544,
> +    -388,  -264,  908,   -800,  -628,  -612,  -568,  572,   -220,  164,  288,
> +    -16,   -308,  308,   -112,  -636,  -760,  280,   -668,  432,   364,  240,
> +    -196,  604,   340,   384,   196,   592,   -44,   -500,  432,   -580, -132,
> +    636,   -76,   392,   4,     -412,  540,   508,   328,   -356,  -36,  16,
> +    -220,  -64,   -248,  -60,   24,    -192,  368,   1040,  92,    -24,  -1044,
> +    -32,   40,    104,   148,   192,   -136,  -520,  56,    -816,  -224, 732,
> +    392,   356,   212,   -80,   -424,  -1008, -324,  588,   -1496, 576,  460,
> +    -816,  -848,  56,    -580,  -92,   -1372, -112,  -496,  200,   364,  52,
> +    -140,  48,    -48,   -60,   84,    72,    40,    132,   -356,  -268, -104,
> +    -284,  -404,  732,   -520,  164,   -304,  -540,  120,   328,   -76,  -460,
> +    756,   388,   588,   236,   -436,  -72,   -176,  -404,  -316,  -148, 716,
> +    -604,  404,   -72,   -88,   -888,  -68,   944,   88,    -220,  -344, 960,
> +    472,   460,   -232,  704,   120,   832,   -228,  692,   -508,  132,  -476,
> +    844,   -748,  -364,  -44,   1116,  -1104, -1056, 76,    428,   552,  -692,
> +    60,    356,   96,    -384,  -188,  -612,  -576,  736,   508,   892,  352,
> +    -1132, 504,   -24,   -352,  324,   332,   -600,  -312,  292,   508,  -144,
> +    -8,    484,   48,    284,   -260,  -240,  256,   -100,  -292,  -204, -44,
> +    472,   -204,  908,   -188,  -1000, -256,  92,    1164,  -392,  564,  356,
> +    652,   -28,   -884,  256,   484,   -192,  760,   -176,  376,   -524, -452,
> +    -436,  860,   -736,  212,   124,   504,   -476,  468,   76,    -472, 552,
> +    -692,  -944,  -620,  740,   -240,  400,   132,   20,    192,   -196, 264,
> +    -668,  -1012, -60,   296,   -316,  -828,  76,    -156,  284,   -768, -448,
> +    -832,  148,   248,   652,   616,   1236,  288,   -328,  -400,  -124, 588,
> +    220,   520,   -696,  1032,  768,   -740,  -92,   -272,  296,   448,  -464,
> +    412,   -200,  392,   440,   -200,  264,   -152,  -260,  320,   1032, 216,
> +    320,   -8,    -64,   156,   -1016, 1084,  1172,  536,   484,   -432, 132,
> +    372,   -52,   -256,  84,    116,   -352,  48,    116,   304,   -384, 412,
> +    924,   -300,  528,   628,   180,   648,   44,    -980,  -220,  1320, 48,
> +    332,   748,   524,   -268,  -720,  540,   -276,  564,   -344,  -208, -196,
> +    436,   896,   88,    -392,  132,   80,    -964,  -288,  568,   56,   -48,
> +    -456,  888,   8,     552,   -156,  -292,  948,   288,   128,   -716, -292,
> +    1192,  -152,  876,   352,   -600,  -260,  -812,  -468,  -28,   -120, -32,
> +    -44,   1284,  496,   192,   464,   312,   -76,   -516,  -380,  -456, -1012,
> +    -48,   308,   -156,  36,    492,   -156,  -808,  188,   1652,  68,   -120,
> +    -116,  316,   160,   -140,  352,   808,   -416,  592,   316,   -480, 56,
> +    528,   -204,  -568,  372,   -232,  752,   -344,  744,   -4,    324,  -416,
> +    -600,  768,   268,   -248,  -88,   -132,  -420,  -432,  80,    -288, 404,
> +    -316,  -1216, -588,  520,   -108,  92,    -320,  368,   -480,  -216, -92,
> +    1688,  -300,  180,   1020,  -176,  820,   -68,   -228,  -260,  436,  -904,
> +    20,    40,    -508,  440,   -736,  312,   332,   204,   760,   -372, 728,
> +    96,    -20,   -632,  -520,  -560,  336,   1076,  -64,   -532,  776,  584,
> +    192,   396,   -728,  -520,  276,   -188,  80,    -52,   -612,  -252, -48,
> +    648,   212,   -688,  228,   -52,   -260,  428,   -412,  -272,  -404, 180,
> +    816,   -796,  48,    152,   484,   -88,   -216,  988,   696,   188,  -528,
> +    648,   -116,  -180,  316,   476,   12,    -564,  96,    476,   -252, -364,
> +    -376,  -392,  556,   -256,  -576,  260,   -352,  120,   -16,   -136, -260,
> +    -492,  72,    556,   660,   580,   616,   772,   436,   424,   -32,  -324,
> +    -1268, 416,   -324,  -80,   920,   160,   228,   724,   32,    -516, 64,
> +    384,   68,    -128,  136,   240,   248,   -204,  -68,   252,   -932, -120,
> +    -480,  -628,  -84,   192,   852,   -404,  -288,  -132,  204,   100,  168,
> +    -68,   -196,  -868,  460,   1080,  380,   -80,   244,   0,     484,  -888,
> +    64,    184,   352,   600,   460,   164,   604,   -196,  320,   -64,  588,
> +    -184,  228,   12,    372,   48,    -848,  -344,  224,   208,   -200, 484,
> +    128,   -20,   272,   -468,  -840,  384,   256,   -720,  -520,  -464, -580,
> +    112,   -120,  644,   -356,  -208,  -608,  -528,  704,   560,   -424, 392,
> +    828,   40,    84,    200,   -152,  0,     -144,  584,   280,   -120, 80,
> +    -556,  -972,  -196,  -472,  724,   80,    168,   -32,   88,    160,  -688,
> +    0,     160,   356,   372,   -776,  740,   -128,  676,   -248,  -480, 4,
> +    -364,  96,    544,   232,   -1032, 956,   236,   356,   20,    -40,  300,
> +    24,    -676,  -596,  132,   1120,  -104,  532,   -1096, 568,   648,  444,
> +    508,   380,   188,   -376,  -604,  1488,  424,   24,    756,   -220, -192,
> +    716,   120,   920,   688,   168,   44,    -460,  568,   284,   1144, 1160,
> +    600,   424,   888,   656,   -356,  -320,  220,   316,   -176,  -724, -188,
> +    -816,  -628,  -348,  -228,  -380,  1012,  -452,  -660,  736,   928,  404,
> +    -696,  -72,   -268,  -892,  128,   184,   -344,  -780,  360,   336,  400,
> +    344,   428,   548,   -112,  136,   -228,  -216,  -820,  -516,  340,  92,
> +    -136,  116,   -300,  376,   -244,  100,   -316,  -520,  -284,  -12,  824,
> +    164,   -548,  -180,  -128,  116,   -924,  -828,  268,   -368,  -580, 620,
> +    192,   160,   0,     -1676, 1068,  424,   -56,   -360,  468,   -156, 720,
> +    288,   -528,  556,   -364,  548,   -148,  504,   316,   152,   -648, -620,
> +    -684,  -24,   -376,  -384,  -108,  -920,  -1032, 768,   180,   -264, -508,
> +    -1268, -260,  -60,   300,   -240,  988,   724,   -376,  -576,  -212, -736,
> +    556,   192,   1092,  -620,  -880,  376,   -56,   -4,    -216,  -32,  836,
> +    268,   396,   1332,  864,   -600,  100,   56,    -412,  -92,   356,  180,
> +    884,   -468,  -436,  292,   -388,  -804,  -704,  -840,  368,   -348, 140,
> +    -724,  1536,  940,   372,   112,   -372,  436,   -480,  1136,  296,  -32,
> +    -228,  132,   -48,   -220,  868,   -1016, -60,   -1044, -464,  328,  916,
> +    244,   12,    -736,  -296,  360,   468,   -376,  -108,  -92,   788,  368,
> +    -56,   544,   400,   -672,  -420,  728,   16,    320,   44,    -284, -380,
> +    -796,  488,   132,   204,   -596,  -372,  88,    -152,  -908,  -636, -572,
> +    -624,  -116,  -692,  -200,  -56,   276,   -88,   484,   -324,  948,  864,
> +    1000,  -456,  -184,  -276,  292,   -296,  156,   676,   320,   160,  908,
> +    -84,   -1236, -288,  -116,  260,   -372,  -644,  732,   -756,  -96,  84,
> +    344,   -520,  348,   -688,  240,   -84,   216,   -1044, -136,  -676, -396,
> +    -1500, 960,   -40,   176,   168,   1516,  420,   -504,  -344,  -364, -360,
> +    1216,  -940,  -380,  -212,  252,   -660,  -708,  484,   -444,  -152, 928,
> +    -120,  1112,  476,   -260,  560,   -148,  -344,  108,   -196,  228,  -288,
> +    504,   560,   -328,  -88,   288,   -1008, 460,   -228,  468,   -836, -196,
> +    76,    388,   232,   412,   -1168, -716,  -644,  756,   -172,  -356, -504,
> +    116,   432,   528,   48,    476,   -168,  -608,  448,   160,   -532, -272,
> +    28,    -676,  -12,   828,   980,   456,   520,   104,   -104,  256,  -344,
> +    -4,    -28,   -368,  -52,   -524,  -572,  -556,  -200,  768,   1124, -208,
> +    -512,  176,   232,   248,   -148,  -888,  604,   -600,  -304,  804,  -156,
> +    -212,  488,   -192,  -804,  -256,  368,   -360,  -916,  -328,  228,  -240,
> +    -448,  -472,  856,   -556,  -364,  572,   -12,   -156,  -368,  -340, 432,
> +    252,   -752,  -152,  288,   268,   -580,  -848,  -592,  108,   -76,  244,
> +    312,   -716,  592,   -80,   436,   360,   4,     -248,  160,   516,  584,
> +    732,   44,    -468,  -280,  -292,  -156,  -588,  28,    308,   912,  24,
> +    124,   156,   180,   -252,  944,   -924,  -772,  -520,  -428,  -624, 300,
> +    -212,  -1144, 32,    -724,  800,   -1128, -212,  -1288, -848,  180,  -416,
> +    440,   192,   -576,  -792,  -76,   -1080, 80,    -532,  -352,  -132, 380,
> +    -820,  148,   1112,  128,   164,   456,   700,   -924,  144,   -668, -384,
> +    648,   -832,  508,   552,   -52,   -100,  -656,  208,   -568,  748,  -88,
> +    680,   232,   300,   192,   -408,  -1012, -152,  -252,  -268,  272,  -876,
> +    -664,  -648,  -332,  -136,  16,    12,    1152,  -28,   332,   -536, 320,
> +    -672,  -460,  -316,  532,   -260,  228,   -40,   1052,  -816,  180,  88,
> +    -496,  -556,  -672,  -368,  428,   92,    356,   404,   -408,  252,  196,
> +    -176,  -556,  792,   268,   32,    372,   40,    96,    -332,  328,  120,
> +    372,   -900,  -40,   472,   -264,  -592,  952,   128,   656,   112,  664,
> +    -232,  420,   4,     -344,  -464,  556,   244,   -416,  -32,   252,  0,
> +    -412,  188,   -696,  508,   -476,  324,   -1096, 656,   -312,  560,  264,
> +    -136,  304,   160,   -64,   -580,  248,   336,   -720,  560,   -348, -288,
> +    -276,  -196,  -500,  852,   -544,  -236,  -1128, -992,  -776,  116,  56,
> +    52,    860,   884,   212,   -12,   168,   1020,  512,   -552,  924,  -148,
> +    716,   188,   164,   -340,  -520,  -184,  880,   -152,  -680,  -208, -1156,
> +    -300,  -528,  -472,  364,   100,   -744,  -1056, -32,   540,   280,  144,
> +    -676,  -32,   -232,  -280,  -224,  96,    568,   -76,   172,   148,  148,
> +    104,   32,    -296,  -32,   788,   -80,   32,    -16,   280,   288,  944,
> +    428,   -484
> +};
> diff --git a/libavcodec/aom_film_grain.h b/libavcodec/aom_film_grain.h
> new file mode 100644
> index 00000000000..5d772bd7d17
> --- /dev/null
> +++ b/libavcodec/aom_film_grain.h
> @@ -0,0 +1,38 @@
> +/*
> + * AOM film grain synthesis
> + * Copyright (c) 2021 Niklas Haas <ffmpeg@haasn.xyz>
> + *
> + * This file is part of FFmpeg.
> + *
> + * FFmpeg is free software; you can redistribute it and/or
> + * modify it under the terms of the GNU Lesser General Public
> + * License as published by the Free Software Foundation; either
> + * version 2.1 of the License, or (at your option) any later version.
> + *
> + * FFmpeg is distributed in the hope that it will be useful,
> + * but WITHOUT ANY WARRANTY; without even the implied warranty of
> + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> + * Lesser General Public License for more details.
> + *
> + * You should have received a copy of the GNU Lesser General Public
> + * License along with FFmpeg; if not, write to the Free Software
> + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
> + */
> +
> +/**
> + * @file
> + * AOM film grain synthesis.
> + * @author Niklas Haas <ffmpeg@haasn.xyz>
> + */
> +
> +#ifndef AVCODEC_AOM_FILM_GRAIN_H
> +#define AVCODEC_AOM_FILM_GRAIN_H
> +
> +#include "libavutil/film_grain_params.h"
> +
> +// Synthesizes film grain on top of `in` and stores the result to `out`. `out`
> +// must already have been allocated and set to the same size and format as `in`.
> +// `params` must be of type AV_FILM_GRAIN_PARAMS_AV1. Returns
> +// AVERROR_INVALIDDATA for pixel formats the implementation does not handle.
> +int ff_aom_apply_film_grain(AVFrame *out, const AVFrame *in,
> +                            const AVFilmGrainParams *params);
> +
> +#endif /* AVCODEC_AOM_FILM_GRAIN_H */
> diff --git a/libavcodec/aom_film_grain_template.c b/libavcodec/aom_film_grain_template.c
> new file mode 100644
> index 00000000000..5f9f29f1fab
> --- /dev/null
> +++ b/libavcodec/aom_film_grain_template.c
> @@ -0,0 +1,577 @@
> +/*
> + * AOM film grain synthesis
> + * Copyright (c) 2023 Niklas Haas <ffmpeg@haasn.xyz>
> + *
> + * This file is part of FFmpeg.
> + *
> + * FFmpeg is free software; you can redistribute it and/or
> + * modify it under the terms of the GNU Lesser General Public
> + * License as published by the Free Software Foundation; either
> + * version 2.1 of the License, or (at your option) any later version.
> + *
> + * FFmpeg is distributed in the hope that it will be useful,
> + * but WITHOUT ANY WARRANTY; without even the implied warranty of
> + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> + * Lesser General Public License for more details.
> + *
> + * You should have received a copy of the GNU Lesser General Public
> + * License along with FFmpeg; if not, write to the Free Software
> + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
> + */
> +
> +/*
> + * Copyright © 2018, Niklas Haas
> + * Copyright © 2018, VideoLAN and dav1d authors
> + * Copyright © 2018, Two Orioles, LLC
> + * All rights reserved.
> + *
> + * Redistribution and use in source and binary forms, with or without
> + * modification, are permitted provided that the following conditions are met:
> + *
> + * 1. Redistributions of source code must retain the above copyright notice, this
> + *    list of conditions and the following disclaimer.
> + *
> + * 2. Redistributions in binary form must reproduce the above copyright notice,
> + *    this list of conditions and the following disclaimer in the documentation
> + *    and/or other materials provided with the distribution.
> + *
> + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
> + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
> + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
> + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
> + * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
> + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
> + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
> + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
> + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
> + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
> + */
> +
> +#include "bit_depth_template.c"
> +
> +#undef entry
> +#undef bitdepth
> +#undef bitdepth_max
> +#undef HBD_DECL
> +#undef HBD_CALL
> +#undef SCALING_SIZE
> +// Per-instantiation settings; the #undefs above clear whatever a previous inclusion of this file defined
> +#if BIT_DEPTH > 8
> +# define entry int16_t // element type of the grain buffers
> +# define bitdepth_max ((1 << bitdepth) - 1)
> +# define HBD_DECL , const int bitdepth // appends a run-time bit depth parameter to function signatures
> +# define HBD_CALL , bitdepth // presumably the matching argument at call sites (not visible in this part of the file)
> +# define SCALING_SIZE 4096 // length of the scaling[] table parameter
> +#else
> +# define entry int8_t // element type of the grain buffers
> +# define bitdepth 8 // fixed, so HBD_DECL/HBD_CALL expand to nothing
> +# define bitdepth_max UINT8_MAX
> +# define HBD_DECL
> +# define HBD_CALL
> +# define SCALING_SIZE 256 // length of the scaling[] table parameter
> +#endif
> +
> +/*
> + * Fill the luma grain template in `buf`: first with scaled samples from the
> + * Gaussian table, then with the auto-regressive filter applied in place.
> + */
> +static void FUNC(generate_grain_y_c)(entry buf[][GRAIN_WIDTH],
> +                                     const AVFilmGrainParams *const params
> +                                     HBD_DECL)
> +{
> +    const AVFilmGrainAOMParams *const aom = &params->codec.aom;
> +    const int extra_bits = bitdepth - 8;
> +    const int noise_shift = 4 - extra_bits + aom->grain_scale_shift;
> +    const int half_range = 128 << extra_bits;
> +    const int lo = -half_range, hi = half_range - 1;
> +    const int border = 3;
> +    const int lag = aom->ar_coeff_lag;
> +    unsigned rng = params->seed;
> +
> +    // Pass 1: one table lookup per entry, indexed by an 11-bit random number
> +    for (int row = 0; row < GRAIN_HEIGHT; row++) {
> +        for (int col = 0; col < GRAIN_WIDTH; col++) {
> +            const int idx = get_random_number(11, &rng);
> +            buf[row][col] = round2(gaussian_sequence[idx], noise_shift);
> +        }
> +    }
> +
> +    // Pass 2: AR filter; the first `border` rows and the outer `border`
> +    // columns on either side are left as generated above
> +    for (int row = border; row < GRAIN_HEIGHT; row++) {
> +        for (int col = border; col < GRAIN_WIDTH - border; col++) {
> +            const int8_t *tap = aom->ar_coeffs_y;
> +            int acc = 0;
> +            for (int dy = -lag; dy <= 0; dy++) {
> +                for (int dx = -lag; dx <= lag; dx++) {
> +                    // Stop at the current sample; only earlier ones contribute
> +                    if (dy == 0 && dx == 0)
> +                        break;
> +                    acc += tap[0] * buf[row + dy][col + dx];
> +                    tap++;
> +                }
> +            }
> +
> +            buf[row][col] = av_clip(buf[row][col] + round2(acc, aom->ar_coeff_shift),
> +                                    lo, hi);
> +        }
> +    }
> +}
> +
> +/*
> + * Fill the chroma grain template `buf` for plane `uv`. Works like the luma
> + * generator, except that the last AR coefficient is applied to the co-located
> + * (and, when subsampled, averaged) luma grain from `buf_y`.
> + */
> +static void
> +FUNC(generate_grain_uv_c)(entry buf[][GRAIN_WIDTH],
> +                          const entry buf_y[][GRAIN_WIDTH],
> +                          const AVFilmGrainParams *const params, const intptr_t uv,
> +                          const int subx, const int suby HBD_DECL)
> +{
> +    const AVFilmGrainAOMParams *const aom = &params->codec.aom;
> +    const int extra_bits = bitdepth - 8;
> +    const int noise_shift = 4 - extra_bits + aom->grain_scale_shift;
> +    const int half_range = 128 << extra_bits;
> +    const int lo = -half_range, hi = half_range - 1;
> +    const int width  = subx ? SUB_GRAIN_WIDTH  : GRAIN_WIDTH;
> +    const int height = suby ? SUB_GRAIN_HEIGHT : GRAIN_HEIGHT;
> +    const int border = 3;
> +    const int lag = aom->ar_coeff_lag;
> +    // Each chroma plane perturbs the frame seed with its own constant
> +    unsigned rng = params->seed ^ (uv ? 0x49d8 : 0xb524);
> +
> +    for (int row = 0; row < height; row++) {
> +        for (int col = 0; col < width; col++) {
> +            const int idx = get_random_number(11, &rng);
> +            buf[row][col] = round2(gaussian_sequence[idx], noise_shift);
> +        }
> +    }
> +
> +    for (int row = border; row < height; row++) {
> +        for (int col = border; col < width - border; col++) {
> +            const int8_t *tap = aom->ar_coeffs_uv[uv];
> +            int acc = 0;
> +            for (int dy = -lag; dy <= 0; dy++) {
> +                for (int dx = -lag; dx <= lag; dx++) {
> +                    if (dx || dy) {
> +                        acc += tap[0] * buf[row + dy][col + dx];
> +                        tap++;
> +                        continue;
> +                    }
> +
> +                    // Current sample reached: the remaining coefficient
> +                    // weights the luma grain, if luma grain is present
> +                    if (aom->num_y_points) {
> +                        const int luma_x = ((col - border) << subx) + border;
> +                        const int luma_y = ((row - border) << suby) + border;
> +                        int luma_sum = 0;
> +                        for (int i = 0; i <= suby; i++) {
> +                            for (int j = 0; j <= subx; j++)
> +                                luma_sum += buf_y[luma_y + i][luma_x + j];
> +                        }
> +                        acc += round2(luma_sum, subx + suby) * tap[0];
> +                    }
> +                    break;
> +                }
> +            }
> +
> +            buf[row][col] = av_clip(buf[row][col] + round2(acc, aom->ar_coeff_shift),
> +                                    lo, hi);
> +        }
> +    }
> +}
> +
> +// Fetches the grain sample for pixel (x, y) of a block from a grain LUT.
> +// The block's origin inside the LUT is derived from the cached random value
> +// offsets[bx][by]: its upper bits (randval >> 4) select the column offset and
> +// its low nibble the row offset. Non-zero bx/by additionally step one
> +// (subsampled) FG_BLOCK_SIZE to the right/down.
> +static inline entry FUNC(sample_lut)(const entry grain_lut[][GRAIN_WIDTH],
> +                                     const int offsets[2][2],
> +                                     const int subx, const int suby,
> +                                     const int bx, const int by,
> +                                     const int x, const int y)
> +{
> +    const int rnd    = offsets[bx][by];
> +    const int step_x = 2 >> subx;
> +    const int step_y = 2 >> suby;
> +    const int blk_w  = FG_BLOCK_SIZE >> subx;
> +    const int blk_h  = FG_BLOCK_SIZE >> suby;
> +    const int col    = 3 + step_x * (3 + (rnd >> 4))  + x + blk_w * bx;
> +    const int row    = 3 + step_y * (3 + (rnd & 0xF)) + y + blk_h * by;
> +
> +    return grain_lut[row][col];
> +}
> +
> +// Applies luma grain to one block row: `bh` lines of `pw` pixels, processed in
> +// FG_BLOCK_SIZE-wide blocks. Pixels are read from `src_row` and written to
> +// `dst_row`; both are addressed with `stride` bytes per line. `row_num` is the
> +// index of the block row and is mixed into the seed that picks each block's
> +// offset into `grain_lut`. When data->overlap_flag is set, the first two
> +// columns/lines of a block are blended with the grain of the left/top block.
> +static void FUNC(fgy_32x32xn_c)(pixel *const dst_row, const pixel *const src_row,
> +                                const ptrdiff_t stride,
> +                                const AVFilmGrainParams *const params, const size_t pw,
> +                                const uint8_t scaling[SCALING_SIZE],
> +                                const entry grain_lut[][GRAIN_WIDTH],
> +                                const int bh, const int row_num HBD_DECL)
> +{
> +    const AVFilmGrainAOMParams *const data = &params->codec.aom;
> +    // the previous block row is only tracked when it can overlap this one
> +    const int rows = 1 + (data->overlap_flag && row_num > 0);
> +    const int bitdepth_min_8 = bitdepth - 8;
> +    const int grain_ctr = 128 << bitdepth_min_8;
> +    const int grain_min = -grain_ctr, grain_max = grain_ctr - 1;
> +    unsigned seed[2];
> +    int offsets[2 /* col offset */][2 /* row offset */];
> +
> +    int min_value, max_value;
> +    if (data->limit_output_range) {
> +        min_value = 16 << bitdepth_min_8;
> +        max_value = 235 << bitdepth_min_8;
> +    } else {
> +        min_value = 0;
> +        max_value = bitdepth_max;
> +    }
> +
> +    // seed[0] contains the current row, seed[1] contains the previous
> +    for (int i = 0; i < rows; i++) {
> +        seed[i] = params->seed;
> +        seed[i] ^= (((row_num - i) * 37  + 178) & 0xFF) << 8;
> +        seed[i] ^= (((row_num - i) * 173 + 105) & 0xFF);
> +    }
> +
> +    av_assert1(stride % (FG_BLOCK_SIZE * sizeof(pixel)) == 0);
> +
> +    // process this row in FG_BLOCK_SIZE^2 blocks
> +    for (unsigned bx = 0; bx < pw; bx += FG_BLOCK_SIZE) {
> +        // cast the difference (not just pw) so the comparison is done on ints,
> +        // matching the chroma variant
> +        const int bw = FFMIN(FG_BLOCK_SIZE, (int)(pw - bx));
> +        const pixel *src;
> +        pixel *dst;
> +        int noise;
> +
> +        // x/y block offsets to compensate for overlapped regions
> +        const int ystart = data->overlap_flag && row_num ? FFMIN(2, bh) : 0;
> +        const int xstart = data->overlap_flag && bx      ? FFMIN(2, bw) : 0;
> +
> +        // blend weights for the two overlapped samples: { old, new }
> +        static const int w[2][2] = { { 27, 17 }, { 17, 27 } };
> +
> +        if (data->overlap_flag && bx) {
> +            // shift previous offsets left
> +            for (int i = 0; i < rows; i++)
> +                offsets[1][i] = offsets[0][i];
> +        }
> +
> +        // update current offsets
> +        for (int i = 0; i < rows; i++)
> +            offsets[0][i] = get_random_number(8, &seed[i]);
> +
> +        // add_noise_y: scale `grain` by the LUT entry of the source pixel,
> +        // add it to the source and store the clipped result in dst
> +#define add_noise_y(x, y, grain)                                                \
> +        src = (const pixel*)((const char*)src_row + (y) * stride) + (x) + bx;   \
> +        dst = (pixel*)((char*)dst_row + (y) * stride) + (x) + bx;               \
> +        noise = round2(scaling[ *src ] * (grain), data->scaling_shift);         \
> +        *dst = av_clip(*src + noise, min_value, max_value);
> +
> +        for (int y = ystart; y < bh; y++) {
> +            // Non-overlapped image region (straightforward)
> +            for (int x = xstart; x < bw; x++) {
> +                int grain = FUNC(sample_lut)(grain_lut, offsets, 0, 0, 0, 0, x, y);
> +                add_noise_y(x, y, grain);
> +            }
> +
> +            // Special case for overlapped column
> +            for (int x = 0; x < xstart; x++) {
> +                int grain = FUNC(sample_lut)(grain_lut, offsets, 0, 0, 0, 0, x, y);
> +                int old   = FUNC(sample_lut)(grain_lut, offsets, 0, 0, 1, 0, x, y);
> +                grain = round2(old * w[x][0] + grain * w[x][1], 5);
> +                grain = av_clip(grain, grain_min, grain_max);
> +                add_noise_y(x, y, grain);
> +            }
> +        }
> +
> +        for (int y = 0; y < ystart; y++) {
> +            // Special case for overlapped row (sans corner)
> +            for (int x = xstart; x < bw; x++) {
> +                int grain = FUNC(sample_lut)(grain_lut, offsets, 0, 0, 0, 0, x, y);
> +                int old   = FUNC(sample_lut)(grain_lut, offsets, 0, 0, 0, 1, x, y);
> +                grain = round2(old * w[y][0] + grain * w[y][1], 5);
> +                grain = av_clip(grain, grain_min, grain_max);
> +                add_noise_y(x, y, grain);
> +            }
> +
> +            // Special case for doubly-overlapped corner
> +            for (int x = 0; x < xstart; x++) {
> +                int grain = FUNC(sample_lut)(grain_lut, offsets, 0, 0, 0, 0, x, y);
> +                int top = FUNC(sample_lut)(grain_lut, offsets, 0, 0, 0, 1, x, y);
> +                int old = FUNC(sample_lut)(grain_lut, offsets, 0, 0, 1, 1, x, y);
> +
> +                // Blend the top pixel with the top left block
> +                top = round2(old * w[x][0] + top * w[x][1], 5);
> +                top = av_clip(top, grain_min, grain_max);
> +
> +                // Blend the current pixel with the left block
> +                old = FUNC(sample_lut)(grain_lut, offsets, 0, 0, 1, 0, x, y);
> +                grain = round2(old * w[x][0] + grain * w[x][1], 5);
> +                grain = av_clip(grain, grain_min, grain_max);
> +
> +                // Mix the two rows together and apply grain
> +                grain = round2(top * w[y][0] + grain * w[y][1], 5);
> +                grain = av_clip(grain, grain_min, grain_max);
> +                add_noise_y(x, y, grain);
> +            }
> +        }
> +    }
> +}
> +
> +static void
> +FUNC(fguv_32x32xn_c)(pixel *const dst_row, const pixel *const src_row,
> +                     const ptrdiff_t stride, const AVFilmGrainParams *const params,
> +                     const size_t pw, const uint8_t scaling[SCALING_SIZE],
> +                     const entry grain_lut[][GRAIN_WIDTH], const int bh,
> +                     const int row_num, const pixel *const luma_row,
> +                     const ptrdiff_t luma_stride, const int uv, const int is_id,
> +                     const int sx, const int sy HBD_DECL)
> +{   // Adds grain to `bh` lines of chroma plane `uv`; the scaling index is derived from the co-located luma in `luma_row`
> +    const AVFilmGrainAOMParams *const data = &params->codec.aom;
> +    const int rows = 1 + (data->overlap_flag && row_num > 0); // also track the previous block row when it can overlap
> +    const int bitdepth_min_8 = bitdepth - 8;
> +    const int grain_ctr = 128 << bitdepth_min_8;
> +    const int grain_min = -grain_ctr, grain_max = grain_ctr - 1;
> +    unsigned seed[2];
> +    int offsets[2 /* col offset */][2 /* row offset */];
> +
> +    int min_value, max_value;
> +    if (data->limit_output_range) {
> +        min_value = 16 << bitdepth_min_8;
> +        max_value = (is_id ? 235 : 240) << bitdepth_min_8; // upper limit is 235 when is_id is set, 240 otherwise
> +    } else {
> +        min_value = 0;
> +        max_value = bitdepth_max;
> +    }
> +
> +    // seed[0] contains the current row, seed[1] contains the previous
> +    for (int i = 0; i < rows; i++) {
> +        seed[i] = params->seed;
> +        seed[i] ^= (((row_num - i) * 37  + 178) & 0xFF) << 8;
> +        seed[i] ^= (((row_num - i) * 173 + 105) & 0xFF);
> +    }
> +
> +    av_assert1(stride % (FG_BLOCK_SIZE * sizeof(pixel)) == 0);
> +
> +    // process this row in FG_BLOCK_SIZE^2 blocks (subsampled)
> +    for (unsigned bx = 0; bx < pw; bx += FG_BLOCK_SIZE >> sx) {
> +        const int bw = FFMIN(FG_BLOCK_SIZE >> sx, (int)(pw - bx));
> +        int val, lx, ly, noise;
> +        const pixel *src, *luma;
> +        pixel *dst, avg;
> +
> +        // x/y block offsets to compensate for overlapped regions
> +        const int ystart = data->overlap_flag && row_num ? FFMIN(2 >> sy, bh) : 0;
> +        const int xstart = data->overlap_flag && bx      ? FFMIN(2 >> sx, bw) : 0;
> +
> +        static const int w[2 /* sub */][2 /* off */][2] = {
> +            { { 27, 17 }, { 17, 27 } },
> +            { { 23, 22 } }, // subsampled direction: at most one overlapped sample (2 >> 1)
> +        };
> +
> +        if (data->overlap_flag && bx) {
> +            // shift previous offsets left
> +            for (int i = 0; i < rows; i++)
> +                offsets[1][i] = offsets[0][i];
> +        }
> +
> +        // update current offsets
> +        for (int i = 0; i < rows; i++)
> +            offsets[0][i] = get_random_number(8, &seed[i]);
> +        // add_noise_uv: scaling index is the luma sample (rounded average of two when sx), or a clipped mix of luma and chroma unless chroma_scaling_from_luma
> +#define add_noise_uv(x, y, grain)                                               \
> +            lx = (bx + x) << sx;                                                \
> +            ly = y << sy;                                                       \
> +            luma = (const pixel*)((const char*)luma_row + ly * luma_stride) + lx;\
> +            avg = luma[0];                                                      \
> +            if (sx)                                                             \
> +                avg = (avg + luma[1] + 1) >> 1;                                 \
> +            src = (const pixel*)((const char *)src_row + (y) * stride) + bx + (x);\
> +            dst = (pixel *) ((char *) dst_row + (y) * stride) + bx + (x);       \
> +            val = avg;                                                          \
> +            if (!data->chroma_scaling_from_luma) {                              \
> +                const int combined = avg * data->uv_mult_luma[uv] +             \
> +                                    *src * data->uv_mult[uv];                   \
> +                val = av_clip( (combined >> 6) +                                \
> +                               (data->uv_offset[uv] * (1 << bitdepth_min_8)),   \
> +                               0, bitdepth_max );                               \
> +            }                                                                   \
> +            noise = round2(scaling[ val ] * (grain), data->scaling_shift);      \
> +            *dst = av_clip(*src + noise, min_value, max_value);
> +
> +        for (int y = ystart; y < bh; y++) {
> +            // Non-overlapped image region (straightforward)
> +            for (int x = xstart; x < bw; x++) {
> +                int grain = FUNC(sample_lut)(grain_lut, offsets, sx, sy, 0, 0, x, y);
> +                add_noise_uv(x, y, grain);
> +            }
> +
> +            // Special case for overlapped column
> +            for (int x = 0; x < xstart; x++) {
> +                int grain = FUNC(sample_lut)(grain_lut, offsets, sx, sy, 0, 0, x, y);
> +                int old   = FUNC(sample_lut)(grain_lut, offsets, sx, sy, 1, 0, x, y);
> +                grain = round2(old * w[sx][x][0] + grain * w[sx][x][1], 5);
> +                grain = av_clip(grain, grain_min, grain_max);
> +                add_noise_uv(x, y, grain);
> +            }
> +        }
> +
> +        for (int y = 0; y < ystart; y++) {
> +            // Special case for overlapped row (sans corner)
> +            for (int x = xstart; x < bw; x++) {
> +                int grain = FUNC(sample_lut)(grain_lut, offsets, sx, sy, 0, 0, x, y);
> +                int old   = FUNC(sample_lut)(grain_lut, offsets, sx, sy, 0, 1, x, y);
> +                grain = round2(old * w[sy][y][0] + grain * w[sy][y][1], 5);
> +                grain = av_clip(grain, grain_min, grain_max);
> +                add_noise_uv(x, y, grain);
> +            }
> +
> +            // Special case for doubly-overlapped corner
> +            for (int x = 0; x < xstart; x++) {
> +                int top = FUNC(sample_lut)(grain_lut, offsets, sx, sy, 0, 1, x, y);
> +                int old = FUNC(sample_lut)(grain_lut, offsets, sx, sy, 1, 1, x, y);
> +                int grain = FUNC(sample_lut)(grain_lut, offsets, sx, sy, 0, 0, x, y);
> +
> +                // Blend the top pixel with the top left block
> +                top = round2(old * w[sx][x][0] + top * w[sx][x][1], 5);
> +                top = av_clip(top, grain_min, grain_max);
> +
> +                // Blend the current pixel with the left block
> +                old = FUNC(sample_lut)(grain_lut, offsets, sx, sy, 1, 0, x, y);
> +                grain = round2(old * w[sx][x][0] + grain * w[sx][x][1], 5);
> +                grain = av_clip(grain, grain_min, grain_max);
> +
> +                // Mix the two rows together and apply to image
> +                grain = round2(top * w[sy][y][0] + grain * w[sy][y][1], 5);
> +                grain = av_clip(grain, grain_min, grain_max);
> +                add_noise_uv(x, y, grain);
> +            }
> +        }
> +    }
> +}
> +
> +// Builds the scaling LUT for one plane from `num` control points, each given
> +// as { x, y } in 8-bit units. Entries below the first point take its y value,
> +// entries from the last point onwards take the last y value, and the range in
> +// between is linearly interpolated. With num == 0 the LUT is zero-filled.
> +// For bit depths above 8 the points are placed (1 << shift_x) entries apart
> +// and the gaps are filled by a second interpolation pass.
> +static void FUNC(generate_scaling)(const uint8_t points[][2], const int num,
> +                                   uint8_t scaling[SCALING_SIZE] HBD_DECL)
> +{
> +    const int shift_x = bitdepth - 8;
> +    const int scaling_size = 1 << bitdepth;
> +    // points[num - 1] only exists when there is at least one point; this
> +    // function is also reached with num == 0
> +    const int max_value = num > 0 ? points[num - 1][0] << shift_x : 0;
> +    av_assert0(scaling_size <= SCALING_SIZE);
> +
> +    if (num == 0) {
> +        memset(scaling, 0, scaling_size);
> +        return;
> +    }
> +
> +    // Fill up the preceding entries with the initial value
> +    memset(scaling, points[0][1], points[0][0] << shift_x);
> +
> +    // Linearly interpolate the values in the middle
> +    for (int i = 0; i < num - 1; i++) {
> +        const int bx = points[i][0];
> +        const int by = points[i][1];
> +        const int ex = points[i+1][0];
> +        const int ey = points[i+1][1];
> +        const int dx = ex - bx;
> +        const int dy = ey - by;
> +        int delta;
> +        // validate before dividing: x coordinates must be strictly increasing
> +        av_assert1(dx > 0);
> +        // per-step increment in 16.16 fixed point
> +        delta = dy * ((0x10000 + (dx >> 1)) / dx);
> +        for (int x = 0, d = 0x8000; x < dx; x++) {
> +            scaling[(bx + x) << shift_x] = by + (d >> 16);
> +            d += delta;
> +        }
> +    }
> +
> +    // Fill up the remaining entries with the final value
> +    memset(&scaling[max_value], points[num - 1][1], scaling_size - max_value);
> +
> +#if BIT_DEPTH != 8
> +    // Fill the (pad - 1) entries between consecutive 8-bit positions
> +    for (int i = 0; i < num - 1; i++) {
> +        const int pad = 1 << shift_x, rnd = pad >> 1;
> +        const int bx = points[i][0] << shift_x;
> +        const int ex = points[i+1][0] << shift_x;
> +        const int dx = ex - bx;
> +        for (int x = 0; x < dx; x += pad) {
> +            const int range = scaling[bx + x + pad] - scaling[bx + x];
> +            for (int n = 1, r = rnd; n < pad; n++) {
> +                r += range;
> +                scaling[bx + x + n] = scaling[bx + x] + (r >> shift_x);
> +            }
> +        }
> +    }
> +#endif
> +}
> +
> +// Applies grain to block row `row` (FG_BLOCK_SIZE luma lines) of every plane
> +// that has grain enabled: luma when num_y_points is set, each chroma plane
> +// when it has scaling points or when chroma_scaling_from_luma is set.
> +// Note that the padding step below writes one extra luma pixel per line into
> +// the *input* frame's luma plane.
> +static av_always_inline void
> +FUNC(apply_grain_row)(AVFrame *out, const AVFrame *in,
> +                      const int ss_x, const int ss_y,
> +                      const uint8_t scaling[3][SCALING_SIZE],
> +                      const entry grain_lut[3][GRAIN_HEIGHT+1][GRAIN_WIDTH],
> +                      const AVFilmGrainParams *params,
> +                      const int row HBD_DECL)
> +{
> +    const AVFilmGrainAOMParams *const data = &params->codec.aom;
> +    const int from_luma  = data->chroma_scaling_from_luma;
> +    const int first_line = row * FG_BLOCK_SIZE;
> +    const int luma_bh    = FFMIN(out->height - first_line, FG_BLOCK_SIZE);
> +    const int chroma_bh  = (luma_bh + ss_y) >> ss_y;
> +    const int chroma_w   = (out->width + ss_x) >> ss_x;
> +    const int is_id      = out->colorspace == AVCOL_SPC_RGB;
> +    // NOTE(review): derived from out->linesize[1] but applied to the chroma
> +    // pointers of both frames - presumably their linesizes match; confirm
> +    const ptrdiff_t uv_off = first_line * out->linesize[1] >> ss_y;
> +    pixel *const luma_src = (pixel *)
> +        ((char *) in->data[0] + first_line * in->linesize[0]);
> +
> +    // Luma plane
> +    if (data->num_y_points) {
> +        pixel *const luma_dst = (pixel *)
> +            ((char *) out->data[0] + first_line * out->linesize[0]);
> +        FUNC(fgy_32x32xn_c)(luma_dst, luma_src, out->linesize[0], params,
> +                            out->width, scaling[0], grain_lut[0],
> +                            luma_bh, row HBD_CALL);
> +    }
> +
> +    // Nothing to do for chroma
> +    if (!data->num_uv_points[0] && !data->num_uv_points[1] && !from_luma)
> +        return;
> +
> +    // Replicate the last luma column so that the horizontal luma average has
> +    // a right-hand neighbour to read
> +    if (out->width & ss_x) {
> +        const ptrdiff_t step = in->linesize[0] << ss_y;
> +        char *line = (char *) luma_src;
> +        for (int y = 0; y < chroma_bh; y++, line += step) {
> +            pixel *const px = (pixel *) line;
> +            px[out->width] = px[out->width - 1];
> +        }
> +    }
> +
> +    // Chroma planes: use the luma scaling LUT when scaling from luma,
> +    // otherwise the plane's own LUT (skipping planes without points)
> +    for (int pl = 0; pl < 2; pl++) {
> +        const uint8_t *lut;
> +
> +        if (from_luma)
> +            lut = scaling[0];
> +        else if (data->num_uv_points[pl])
> +            lut = scaling[1 + pl];
> +        else
> +            continue;
> +
> +        FUNC(fguv_32x32xn_c)((pixel *) ((char *) out->data[1 + pl] + uv_off),
> +                             (const pixel *) ((const char *) in->data[1 + pl] + uv_off),
> +                             in->linesize[1], params, chroma_w, lut,
> +                             grain_lut[1 + pl], chroma_bh, row, luma_src,
> +                             in->linesize[0], pl, is_id, ss_x, ss_y HBD_CALL);
> +    }
> +}
> +
> +// Frame-level driver: generates the grain and scaling LUTs that the
> +// parameters call for, then applies grain one FG_BLOCK_SIZE-high block row at
> +// a time. Always returns 0.
> +static int FUNC(apply_film_grain)(AVFrame *out_frame, const AVFrame *in_frame,
> +                                  const AVFilmGrainParams *params HBD_DECL)
> +{
> +    entry grain_lut[3][GRAIN_HEIGHT + 1][GRAIN_WIDTH];
> +    uint8_t scaling[3][SCALING_SIZE];
> +
> +    const AVFilmGrainAOMParams *const data = &params->codec.aom;
> +    const AVPixFmtDescriptor *desc = av_pix_fmt_desc_get(out_frame->format);
> +    const int subx = desc->log2_chroma_w;
> +    const int suby = desc->log2_chroma_h;
> +    const int from_luma = data->chroma_scaling_from_luma;
> +    const int num_block_rows = AV_CEIL_RSHIFT(out_frame->height, 5); /* log2(FG_BLOCK_SIZE) */
> +
> +    // Grain LUTs: luma unconditionally, chroma only for planes that get grain
> +    FUNC(generate_grain_y_c)(grain_lut[0], params HBD_CALL);
> +    for (int pl = 0; pl < 2; pl++) {
> +        if (data->num_uv_points[pl] || from_luma)
> +            FUNC(generate_grain_uv_c)(grain_lut[1 + pl], grain_lut[0], params,
> +                                      pl, subx, suby HBD_CALL);
> +    }
> +
> +    // Scaling LUTs: the luma one is also needed when chroma is scaled from luma
> +    if (data->num_y_points || from_luma)
> +        FUNC(generate_scaling)(data->y_points, data->num_y_points,
> +                               scaling[0] HBD_CALL);
> +    for (int pl = 0; pl < 2; pl++) {
> +        if (data->num_uv_points[pl])
> +            FUNC(generate_scaling)(data->uv_points[pl], data->num_uv_points[pl],
> +                                   scaling[1 + pl] HBD_CALL);
> +    }
> +
> +    for (int row = 0; row < num_block_rows; row++)
> +        FUNC(apply_grain_row)(out_frame, in_frame, subx, suby, scaling,
> +                              grain_lut, params, row HBD_CALL);
> +
> +    return 0;
> +}
Niklas Haas March 8, 2024, 1:44 p.m. UTC | #2
On Fri, 08 Mar 2024 10:31:28 -0300 James Almer <jamrial@gmail.com> wrote:
> On 3/8/2024 10:21 AM, Niklas Haas wrote:
> > From: Niklas Haas <git@haasn.dev>
> > 
> > Implementation copied wholesale from dav1d, sans SIMD, under permissive
> > license. This implementation was extensively verified to be bit-exact,
> > so it serves as a much better starting point than trying to re-engineer
> > this from scratch for no reason. (I also authored the original
> > implementation in dav1d, so any "clean room" implementation would end up
> > looking much the same, anyway)
> > 
> > The notable changes I had to make while adapting this from the dav1d
> > code-base to the FFmpeg codebase include:
> > 
> > - reordering variable declarations to avoid triggering warnings
> > - replacing several inline helpers by avutil equivalents
> > - changing code that accesses frame metadata
> > - replacing raw plane copying logic by av_image_copy_plane
> > 
> > Apart from this, the implementation is basically unmodified.
> 
> Do we want this to be public? Both as a struct and the decoding functions.
> It could be used by libavfilter or even outside our libraries. The hevc 
> decoder would export the relevant T.35 SEI in the new struct if told to 
> not apply fg, like we already do in av1.

I'm not sure if the AFGS1 struct itself needs to be public, since it is
largely just a codec-internal wrapper for multiple param sets (for
scalable codecs).

If we want to add a public film grain synthesis helper, IMHO it should
be a wrapper around both ff_aom_apply_film_grain *and*
ff_h274_apply_film_grain, which directly ingests an AVFilmGrainParams
and resolves to the correct implementation.

So we can merge this series as-is and still add a public helper on top.

Incidentally, there is a strong precedent here: dav1d.h exports
dav1d_apply_grain() precisely for the reason that VLC needs to
initialize the decoder *before* it knows whether to apply synthesis on
GPU or CPU, so it has to set the equivalent of `-export_side_data
film_grain` at init time and manually apply CPU film grain synthesis if
it turns out that GPU fgs is unavailable.
Mark Thompson March 11, 2024, 10:32 p.m. UTC | #3
On 08/03/2024 13:44, Niklas Haas wrote:
> On Fri, 08 Mar 2024 10:31:28 -0300 James Almer <jamrial@gmail.com> wrote:
>> On 3/8/2024 10:21 AM, Niklas Haas wrote:
>>> From: Niklas Haas <git@haasn.dev>
>>>
>>> Implementation copied wholesale from dav1d, sans SIMD, under permissive
>>> license. This implementation was extensively verified to be bit-exact,
>>> so it serves as a much better starting point than trying to re-engineer
>>> this from scratch for no reason. (I also authored the original
>>> implementation in dav1d, so any "clean room" implementation would end up
>>> looking much the same, anyway)
>>>
>>> The notable changes I had to make while adapting this from the dav1d
>>> code-base to the FFmpeg codebase include:
>>>
>>> - reordering variable declarations to avoid triggering warnings
>>> - replacing several inline helpers by avutil equivalents
>>> - changing code that accesses frame metadata
>>> - replacing raw plane copying logic by av_image_copy_plane
>>>
>>> Apart from this, the implementation is basically unmodified.
>>
>> Do we want this to be public? Both as a struct and the decoding functions.
>> It could be used by libavfilter or even outside our libraries. The hevc
>> decoder would export the relevant T.35 SEI in the new struct if told to
>> not apply fg, like we already do in av1.
> 
> I'm not sure if the AFGS1 struct itself needs to be public, since it is
> largely just a codec-internal wrapper for multiple param sets (for
> scalable codecs).

This is not correct.  Along with scalable cases, the multiple param sets are to support applying film grain at the display resolution after scaling, providing a better result than upscaling the grain applied at the decode resolution.

For example, you could have a scalable stream with operating points of 1920x1080, 1280x720 and 640x360.  The AFGS1 metadata associated with the stream would then have film grain parameters for those three resolutions, plus perhaps 2560x1440 and 3840x2160.

In the ideal case you then pick the operating point for decode based on your available bandwidth and decode capabilities, and the resolution for film grain application based on the display.  The decode happens without any film grain, the clean video is upscaled, and then the film grain is applied immediately before display.

A current conforming AV1 implementation which only supports applying film grain as part of the decode process can do so and produce a conforming result, but the quality may not be as good as the ideal case because the presence of noise will affect the upscale quality and also the grain itself will be scaled in a way which may not look correct.

I'm not sure what the best way to expose this is.  For a player application an option to select the intended display resolution and then export an AV1 film grain side data as it is now is sufficient, but that doesn't really work in an application like ffmpeg where the target resolution isn't directly known.

(Also note that a transcode can carry the AFGS1 message from the source to the output without ever touching it, as long as the target resolution satisfies the requirement on the coded resolution being available in the param sets.  It seems desirable to support this possibility.)

Thanks,

- Mark
Niklas Haas March 12, 2024, 9:40 p.m. UTC | #4
On Mon, 11 Mar 2024 22:32:01 +0000 Mark Thompson <sw@jkqxz.net> wrote:
> This is not correct.  Along with scalable cases, the multiple param sets are to support applying film grain at the display resolution after scaling, providing a better result than upscaling the grain applied at the decode resolution.
> 
> For example, you could have a scalable stream with operating points of 1920x1080, 1280x720 and 640x360.  The AFGS1 metadata associated with the stream would then have film grain parameters for those three resolutions, plus perhaps 2560x1440 and 3840x2160.
> 
> In the ideal case you then pick the operating point for decode based on your available bandwidth and decode capabilities, and the resolution for film grain application based on the display.  The decode happens without any film grain, the clean video is upscaled, and then the film grain is applied immediately before display.
> 
> A current conforming AV1 implementation which only supports applying film grain as part of the decode process can do so and produce a conforming result, but the quality may not be as good as the ideal case because the presence of noise will affect the upscale quality and also the grain itself will be scaled in a way which may not look correct.
> 
> I'm not sure what the best way to expose this is.  For a player application an option to select the intended display resolution and then export an AV1 film grain side data as it is now is sufficient, but that doesn't really work in an application like ffmpeg where the target resolution isn't directly known.

One way to get there would be to attach multiple AVFilmGrainAOMParams
structs as side data to the frame, plus adding the extra metadata for
the resolution (and colorimetry etc.) to that struct.

May be a bit awkward to deal with on the user side, though.

> (Also note that a transcode can carry the AFGS1 message from the source to the output without ever touching it, as long as the target resolution satisfies the requirement on the coded resolution being available in the param sets.  It seems desirable to support this possibility.)
> 
> Thanks,
> 
> - Mark
> _______________________________________________
> ffmpeg-devel mailing list
> ffmpeg-devel@ffmpeg.org
> https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
> 
> To unsubscribe, visit link above, or email
> ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".
Niklas Haas March 15, 2024, 10:17 a.m. UTC | #5
Tested and validated against official reference files.

Any objection to merging this as-is?

I would rather submit a separate series for adding a new public API, as
well as for exposing multiple film grain parameter sets, both of which
are incremental additions on top of the parsing and synthesis code this
series adds.
diff mbox series

Patch

diff --git a/libavcodec/aom_film_grain.c b/libavcodec/aom_film_grain.c
new file mode 100644
index 00000000000..ffcd71b584b
--- /dev/null
+++ b/libavcodec/aom_film_grain.c
@@ -0,0 +1,310 @@ 
+/*
+ * AOM film grain synthesis
+ * Copyright (c) 2023 Niklas Haas <ffmpeg@haasn.xyz>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+/**
+ * @file
+ * AOM film grain synthesis.
+ * @author Niklas Haas <ffmpeg@haasn.xyz>
+ */
+
+#include "libavutil/avassert.h"
+#include "libavutil/imgutils.h"
+
+#include "aom_film_grain.h"
+
+// Common/shared helpers (not dependent on BIT_DEPTH)
+static inline int get_random_number(const int bits, unsigned *const state) {
+    const int r = *state;
+    unsigned bit = ((r >> 0) ^ (r >> 1) ^ (r >> 3) ^ (r >> 12)) & 1;
+    *state = (r >> 1) | (bit << 15);
+
+    return (*state >> (16 - bits)) & ((1 << bits) - 1);
+}
+
+static inline int round2(const int x, const uint64_t shift) {
+    return (x + ((1 << shift) >> 1)) >> shift;
+}
+
+enum {
+    GRAIN_WIDTH      = 82,
+    GRAIN_HEIGHT     = 73,
+    SUB_GRAIN_WIDTH  = 44,
+    SUB_GRAIN_HEIGHT = 38,
+    FG_BLOCK_SIZE    = 32,
+};
+
+static const int16_t gaussian_sequence[2048];
+
+#define BIT_DEPTH 16
+#include "aom_film_grain_template.c"
+#undef BIT_DEPTH
+
+#define BIT_DEPTH 8
+#include "aom_film_grain_template.c"
+#undef BIT_DEPTH
+
+
+int ff_aom_apply_film_grain(AVFrame *out, const AVFrame *in,
+                            const AVFilmGrainParams *params)
+{
+    const AVFilmGrainAOMParams *const data = &params->codec.aom;
+    const AVPixFmtDescriptor *desc = av_pix_fmt_desc_get(out->format);
+    const int subx = desc->log2_chroma_w, suby = desc->log2_chroma_h;
+    const int pxstep = desc->comp[0].step;
+
+    av_assert0(out->format == in->format);
+    av_assert0(params->type == AV_FILM_GRAIN_PARAMS_AV1);
+
+    // Copy over the non-modified planes
+    if (!params->codec.aom.num_y_points) {
+        av_image_copy_plane(out->data[0], out->linesize[0],
+                            in->data[0], in->linesize[0],
+                            out->width * pxstep, out->height);
+    }
+    for (int uv = 0; uv < 2; uv++) {
+        if (!data->num_uv_points[uv]) {
+            av_image_copy_plane(out->data[1+uv], out->linesize[1+uv],
+                                in->data[1+uv], in->linesize[1+uv],
+                                AV_CEIL_RSHIFT(out->width, subx) * pxstep,
+                                AV_CEIL_RSHIFT(out->height, suby));
+        }
+    }
+
+    switch (in->format) {
+    case AV_PIX_FMT_GRAY8:
+    case AV_PIX_FMT_YUV420P:
+    case AV_PIX_FMT_YUV422P:
+    case AV_PIX_FMT_YUV444P:
+    case AV_PIX_FMT_YUVJ420P:
+    case AV_PIX_FMT_YUVJ422P:
+    case AV_PIX_FMT_YUVJ444P:
+        return apply_film_grain_8(out, in, params);
+    case AV_PIX_FMT_GRAY9:
+    case AV_PIX_FMT_YUV420P9:
+    case AV_PIX_FMT_YUV422P9:
+    case AV_PIX_FMT_YUV444P9:
+        return apply_film_grain_16(out, in, params, 9);
+    case AV_PIX_FMT_GRAY10:
+    case AV_PIX_FMT_YUV420P10:
+    case AV_PIX_FMT_YUV422P10:
+    case AV_PIX_FMT_YUV444P10:
+        return apply_film_grain_16(out, in, params, 10);
+    case AV_PIX_FMT_GRAY12:
+    case AV_PIX_FMT_YUV420P12:
+    case AV_PIX_FMT_YUV422P12:
+    case AV_PIX_FMT_YUV444P12:
+        return apply_film_grain_16(out, in, params, 12);
+    }
+
+    /* The AV1 spec only defines film grain synthesis for these formats */
+    return AVERROR_INVALIDDATA;
+}
+
+// Taken from the AV1 spec. Range is [-2048, 2047], mean is 0 and stddev is 512
+static const int16_t gaussian_sequence[2048] = {
+    56,    568,   -180,  172,   124,   -84,   172,   -64,   -900,  24,   820,
+    224,   1248,  996,   272,   -8,    -916,  -388,  -732,  -104,  -188, 800,
+    112,   -652,  -320,  -376,  140,   -252,  492,   -168,  44,    -788, 588,
+    -584,  500,   -228,  12,    680,   272,   -476,  972,   -100,  652,  368,
+    432,   -196,  -720,  -192,  1000,  -332,  652,   -136,  -552,  -604, -4,
+    192,   -220,  -136,  1000,  -52,   372,   -96,   -624,  124,   -24,  396,
+    540,   -12,   -104,  640,   464,   244,   -208,  -84,   368,   -528, -740,
+    248,   -968,  -848,  608,   376,   -60,   -292,  -40,   -156,  252,  -292,
+    248,   224,   -280,  400,   -244,  244,   -60,   76,    -80,   212,  532,
+    340,   128,   -36,   824,   -352,  -60,   -264,  -96,   -612,  416,  -704,
+    220,   -204,  640,   -160,  1220,  -408,  900,   336,   20,    -336, -96,
+    -792,  304,   48,    -28,   -1232, -1172, -448,  104,   -292,  -520, 244,
+    60,    -948,  0,     -708,  268,   108,   356,   -548,  488,   -344, -136,
+    488,   -196,  -224,  656,   -236,  -1128, 60,    4,     140,   276,  -676,
+    -376,  168,   -108,  464,   8,     564,   64,    240,   308,   -300, -400,
+    -456,  -136,  56,    120,   -408,  -116,  436,   504,   -232,  328,  844,
+    -164,  -84,   784,   -168,  232,   -224,  348,   -376,  128,   568,  96,
+    -1244, -288,  276,   848,   832,   -360,  656,   464,   -384,  -332, -356,
+    728,   -388,  160,   -192,  468,   296,   224,   140,   -776,  -100, 280,
+    4,     196,   44,    -36,   -648,  932,   16,    1428,  28,    528,  808,
+    772,   20,    268,   88,    -332,  -284,  124,   -384,  -448,  208,  -228,
+    -1044, -328,  660,   380,   -148,  -300,  588,   240,   540,   28,   136,
+    -88,   -436,  256,   296,   -1000, 1400,  0,     -48,   1056,  -136, 264,
+    -528,  -1108, 632,   -484,  -592,  -344,  796,   124,   -668,  -768, 388,
+    1296,  -232,  -188,  -200,  -288,  -4,    308,   100,   -168,  256,  -500,
+    204,   -508,  648,   -136,  372,   -272,  -120,  -1004, -552,  -548, -384,
+    548,   -296,  428,   -108,  -8,    -912,  -324,  -224,  -88,   -112, -220,
+    -100,  996,   -796,  548,   360,   -216,  180,   428,   -200,  -212, 148,
+    96,    148,   284,   216,   -412,  -320,  120,   -300,  -384,  -604, -572,
+    -332,  -8,    -180,  -176,  696,   116,   -88,   628,   76,    44,   -516,
+    240,   -208,  -40,   100,   -592,  344,   -308,  -452,  -228,  20,   916,
+    -1752, -136,  -340,  -804,  140,   40,    512,   340,   248,   184,  -492,
+    896,   -156,  932,   -628,  328,   -688,  -448,  -616,  -752,  -100, 560,
+    -1020, 180,   -800,  -64,   76,    576,   1068,  396,   660,   552,  -108,
+    -28,   320,   -628,  312,   -92,   -92,   -472,  268,   16,    560,  516,
+    -672,  -52,   492,   -100,  260,   384,   284,   292,   304,   -148, 88,
+    -152,  1012,  1064,  -228,  164,   -376,  -684,  592,   -392,  156,  196,
+    -524,  -64,   -884,  160,   -176,  636,   648,   404,   -396,  -436, 864,
+    424,   -728,  988,   -604,  904,   -592,  296,   -224,  536,   -176, -920,
+    436,   -48,   1176,  -884,  416,   -776,  -824,  -884,  524,   -548, -564,
+    -68,   -164,  -96,   692,   364,   -692,  -1012, -68,   260,   -480, 876,
+    -1116, 452,   -332,  -352,  892,   -1088, 1220,  -676,  12,    -292, 244,
+    496,   372,   -32,   280,   200,   112,   -440,  -96,   24,    -644, -184,
+    56,    -432,  224,   -980,  272,   -260,  144,   -436,  420,   356,  364,
+    -528,  76,    172,   -744,  -368,  404,   -752,  -416,  684,   -688, 72,
+    540,   416,   92,    444,   480,   -72,   -1416, 164,   -1172, -68,  24,
+    424,   264,   1040,  128,   -912,  -524,  -356,  64,    876,   -12,  4,
+    -88,   532,   272,   -524,  320,   276,   -508,  940,   24,    -400, -120,
+    756,   60,    236,   -412,  100,   376,   -484,  400,   -100,  -740, -108,
+    -260,  328,   -268,  224,   -200,  -416,  184,   -604,  -564,  -20,  296,
+    60,    892,   -888,  60,    164,   68,    -760,  216,   -296,  904,  -336,
+    -28,   404,   -356,  -568,  -208,  -1480, -512,  296,   328,   -360, -164,
+    -1560, -776,  1156,  -428,  164,   -504,  -112,  120,   -216,  -148, -264,
+    308,   32,    64,    -72,   72,    116,   176,   -64,   -272,  460,  -536,
+    -784,  -280,  348,   108,   -752,  -132,  524,   -540,  -776,  116,  -296,
+    -1196, -288,  -560,  1040,  -472,  116,   -848,  -1116, 116,   636,  696,
+    284,   -176,  1016,  204,   -864,  -648,  -248,  356,   972,   -584, -204,
+    264,   880,   528,   -24,   -184,  116,   448,   -144,  828,   524,  212,
+    -212,  52,    12,    200,   268,   -488,  -404,  -880,  824,   -672, -40,
+    908,   -248,  500,   716,   -576,  492,   -576,  16,    720,   -108, 384,
+    124,   344,   280,   576,   -500,  252,   104,   -308,  196,   -188, -8,
+    1268,  296,   1032,  -1196, 436,   316,   372,   -432,  -200,  -660, 704,
+    -224,  596,   -132,  268,   32,    -452,  884,   104,   -1008, 424,  -1348,
+    -280,  4,     -1168, 368,   476,   696,   300,   -8,    24,    180,  -592,
+    -196,  388,   304,   500,   724,   -160,  244,   -84,   272,   -256, -420,
+    320,   208,   -144,  -156,  156,   364,   452,   28,    540,   316,  220,
+    -644,  -248,  464,   72,    360,   32,    -388,  496,   -680,  -48,  208,
+    -116,  -408,  60,    -604,  -392,  548,   -840,  784,   -460,  656,  -544,
+    -388,  -264,  908,   -800,  -628,  -612,  -568,  572,   -220,  164,  288,
+    -16,   -308,  308,   -112,  -636,  -760,  280,   -668,  432,   364,  240,
+    -196,  604,   340,   384,   196,   592,   -44,   -500,  432,   -580, -132,
+    636,   -76,   392,   4,     -412,  540,   508,   328,   -356,  -36,  16,
+    -220,  -64,   -248,  -60,   24,    -192,  368,   1040,  92,    -24,  -1044,
+    -32,   40,    104,   148,   192,   -136,  -520,  56,    -816,  -224, 732,
+    392,   356,   212,   -80,   -424,  -1008, -324,  588,   -1496, 576,  460,
+    -816,  -848,  56,    -580,  -92,   -1372, -112,  -496,  200,   364,  52,
+    -140,  48,    -48,   -60,   84,    72,    40,    132,   -356,  -268, -104,
+    -284,  -404,  732,   -520,  164,   -304,  -540,  120,   328,   -76,  -460,
+    756,   388,   588,   236,   -436,  -72,   -176,  -404,  -316,  -148, 716,
+    -604,  404,   -72,   -88,   -888,  -68,   944,   88,    -220,  -344, 960,
+    472,   460,   -232,  704,   120,   832,   -228,  692,   -508,  132,  -476,
+    844,   -748,  -364,  -44,   1116,  -1104, -1056, 76,    428,   552,  -692,
+    60,    356,   96,    -384,  -188,  -612,  -576,  736,   508,   892,  352,
+    -1132, 504,   -24,   -352,  324,   332,   -600,  -312,  292,   508,  -144,
+    -8,    484,   48,    284,   -260,  -240,  256,   -100,  -292,  -204, -44,
+    472,   -204,  908,   -188,  -1000, -256,  92,    1164,  -392,  564,  356,
+    652,   -28,   -884,  256,   484,   -192,  760,   -176,  376,   -524, -452,
+    -436,  860,   -736,  212,   124,   504,   -476,  468,   76,    -472, 552,
+    -692,  -944,  -620,  740,   -240,  400,   132,   20,    192,   -196, 264,
+    -668,  -1012, -60,   296,   -316,  -828,  76,    -156,  284,   -768, -448,
+    -832,  148,   248,   652,   616,   1236,  288,   -328,  -400,  -124, 588,
+    220,   520,   -696,  1032,  768,   -740,  -92,   -272,  296,   448,  -464,
+    412,   -200,  392,   440,   -200,  264,   -152,  -260,  320,   1032, 216,
+    320,   -8,    -64,   156,   -1016, 1084,  1172,  536,   484,   -432, 132,
+    372,   -52,   -256,  84,    116,   -352,  48,    116,   304,   -384, 412,
+    924,   -300,  528,   628,   180,   648,   44,    -980,  -220,  1320, 48,
+    332,   748,   524,   -268,  -720,  540,   -276,  564,   -344,  -208, -196,
+    436,   896,   88,    -392,  132,   80,    -964,  -288,  568,   56,   -48,
+    -456,  888,   8,     552,   -156,  -292,  948,   288,   128,   -716, -292,
+    1192,  -152,  876,   352,   -600,  -260,  -812,  -468,  -28,   -120, -32,
+    -44,   1284,  496,   192,   464,   312,   -76,   -516,  -380,  -456, -1012,
+    -48,   308,   -156,  36,    492,   -156,  -808,  188,   1652,  68,   -120,
+    -116,  316,   160,   -140,  352,   808,   -416,  592,   316,   -480, 56,
+    528,   -204,  -568,  372,   -232,  752,   -344,  744,   -4,    324,  -416,
+    -600,  768,   268,   -248,  -88,   -132,  -420,  -432,  80,    -288, 404,
+    -316,  -1216, -588,  520,   -108,  92,    -320,  368,   -480,  -216, -92,
+    1688,  -300,  180,   1020,  -176,  820,   -68,   -228,  -260,  436,  -904,
+    20,    40,    -508,  440,   -736,  312,   332,   204,   760,   -372, 728,
+    96,    -20,   -632,  -520,  -560,  336,   1076,  -64,   -532,  776,  584,
+    192,   396,   -728,  -520,  276,   -188,  80,    -52,   -612,  -252, -48,
+    648,   212,   -688,  228,   -52,   -260,  428,   -412,  -272,  -404, 180,
+    816,   -796,  48,    152,   484,   -88,   -216,  988,   696,   188,  -528,
+    648,   -116,  -180,  316,   476,   12,    -564,  96,    476,   -252, -364,
+    -376,  -392,  556,   -256,  -576,  260,   -352,  120,   -16,   -136, -260,
+    -492,  72,    556,   660,   580,   616,   772,   436,   424,   -32,  -324,
+    -1268, 416,   -324,  -80,   920,   160,   228,   724,   32,    -516, 64,
+    384,   68,    -128,  136,   240,   248,   -204,  -68,   252,   -932, -120,
+    -480,  -628,  -84,   192,   852,   -404,  -288,  -132,  204,   100,  168,
+    -68,   -196,  -868,  460,   1080,  380,   -80,   244,   0,     484,  -888,
+    64,    184,   352,   600,   460,   164,   604,   -196,  320,   -64,  588,
+    -184,  228,   12,    372,   48,    -848,  -344,  224,   208,   -200, 484,
+    128,   -20,   272,   -468,  -840,  384,   256,   -720,  -520,  -464, -580,
+    112,   -120,  644,   -356,  -208,  -608,  -528,  704,   560,   -424, 392,
+    828,   40,    84,    200,   -152,  0,     -144,  584,   280,   -120, 80,
+    -556,  -972,  -196,  -472,  724,   80,    168,   -32,   88,    160,  -688,
+    0,     160,   356,   372,   -776,  740,   -128,  676,   -248,  -480, 4,
+    -364,  96,    544,   232,   -1032, 956,   236,   356,   20,    -40,  300,
+    24,    -676,  -596,  132,   1120,  -104,  532,   -1096, 568,   648,  444,
+    508,   380,   188,   -376,  -604,  1488,  424,   24,    756,   -220, -192,
+    716,   120,   920,   688,   168,   44,    -460,  568,   284,   1144, 1160,
+    600,   424,   888,   656,   -356,  -320,  220,   316,   -176,  -724, -188,
+    -816,  -628,  -348,  -228,  -380,  1012,  -452,  -660,  736,   928,  404,
+    -696,  -72,   -268,  -892,  128,   184,   -344,  -780,  360,   336,  400,
+    344,   428,   548,   -112,  136,   -228,  -216,  -820,  -516,  340,  92,
+    -136,  116,   -300,  376,   -244,  100,   -316,  -520,  -284,  -12,  824,
+    164,   -548,  -180,  -128,  116,   -924,  -828,  268,   -368,  -580, 620,
+    192,   160,   0,     -1676, 1068,  424,   -56,   -360,  468,   -156, 720,
+    288,   -528,  556,   -364,  548,   -148,  504,   316,   152,   -648, -620,
+    -684,  -24,   -376,  -384,  -108,  -920,  -1032, 768,   180,   -264, -508,
+    -1268, -260,  -60,   300,   -240,  988,   724,   -376,  -576,  -212, -736,
+    556,   192,   1092,  -620,  -880,  376,   -56,   -4,    -216,  -32,  836,
+    268,   396,   1332,  864,   -600,  100,   56,    -412,  -92,   356,  180,
+    884,   -468,  -436,  292,   -388,  -804,  -704,  -840,  368,   -348, 140,
+    -724,  1536,  940,   372,   112,   -372,  436,   -480,  1136,  296,  -32,
+    -228,  132,   -48,   -220,  868,   -1016, -60,   -1044, -464,  328,  916,
+    244,   12,    -736,  -296,  360,   468,   -376,  -108,  -92,   788,  368,
+    -56,   544,   400,   -672,  -420,  728,   16,    320,   44,    -284, -380,
+    -796,  488,   132,   204,   -596,  -372,  88,    -152,  -908,  -636, -572,
+    -624,  -116,  -692,  -200,  -56,   276,   -88,   484,   -324,  948,  864,
+    1000,  -456,  -184,  -276,  292,   -296,  156,   676,   320,   160,  908,
+    -84,   -1236, -288,  -116,  260,   -372,  -644,  732,   -756,  -96,  84,
+    344,   -520,  348,   -688,  240,   -84,   216,   -1044, -136,  -676, -396,
+    -1500, 960,   -40,   176,   168,   1516,  420,   -504,  -344,  -364, -360,
+    1216,  -940,  -380,  -212,  252,   -660,  -708,  484,   -444,  -152, 928,
+    -120,  1112,  476,   -260,  560,   -148,  -344,  108,   -196,  228,  -288,
+    504,   560,   -328,  -88,   288,   -1008, 460,   -228,  468,   -836, -196,
+    76,    388,   232,   412,   -1168, -716,  -644,  756,   -172,  -356, -504,
+    116,   432,   528,   48,    476,   -168,  -608,  448,   160,   -532, -272,
+    28,    -676,  -12,   828,   980,   456,   520,   104,   -104,  256,  -344,
+    -4,    -28,   -368,  -52,   -524,  -572,  -556,  -200,  768,   1124, -208,
+    -512,  176,   232,   248,   -148,  -888,  604,   -600,  -304,  804,  -156,
+    -212,  488,   -192,  -804,  -256,  368,   -360,  -916,  -328,  228,  -240,
+    -448,  -472,  856,   -556,  -364,  572,   -12,   -156,  -368,  -340, 432,
+    252,   -752,  -152,  288,   268,   -580,  -848,  -592,  108,   -76,  244,
+    312,   -716,  592,   -80,   436,   360,   4,     -248,  160,   516,  584,
+    732,   44,    -468,  -280,  -292,  -156,  -588,  28,    308,   912,  24,
+    124,   156,   180,   -252,  944,   -924,  -772,  -520,  -428,  -624, 300,
+    -212,  -1144, 32,    -724,  800,   -1128, -212,  -1288, -848,  180,  -416,
+    440,   192,   -576,  -792,  -76,   -1080, 80,    -532,  -352,  -132, 380,
+    -820,  148,   1112,  128,   164,   456,   700,   -924,  144,   -668, -384,
+    648,   -832,  508,   552,   -52,   -100,  -656,  208,   -568,  748,  -88,
+    680,   232,   300,   192,   -408,  -1012, -152,  -252,  -268,  272,  -876,
+    -664,  -648,  -332,  -136,  16,    12,    1152,  -28,   332,   -536, 320,
+    -672,  -460,  -316,  532,   -260,  228,   -40,   1052,  -816,  180,  88,
+    -496,  -556,  -672,  -368,  428,   92,    356,   404,   -408,  252,  196,
+    -176,  -556,  792,   268,   32,    372,   40,    96,    -332,  328,  120,
+    372,   -900,  -40,   472,   -264,  -592,  952,   128,   656,   112,  664,
+    -232,  420,   4,     -344,  -464,  556,   244,   -416,  -32,   252,  0,
+    -412,  188,   -696,  508,   -476,  324,   -1096, 656,   -312,  560,  264,
+    -136,  304,   160,   -64,   -580,  248,   336,   -720,  560,   -348, -288,
+    -276,  -196,  -500,  852,   -544,  -236,  -1128, -992,  -776,  116,  56,
+    52,    860,   884,   212,   -12,   168,   1020,  512,   -552,  924,  -148,
+    716,   188,   164,   -340,  -520,  -184,  880,   -152,  -680,  -208, -1156,
+    -300,  -528,  -472,  364,   100,   -744,  -1056, -32,   540,   280,  144,
+    -676,  -32,   -232,  -280,  -224,  96,    568,   -76,   172,   148,  148,
+    104,   32,    -296,  -32,   788,   -80,   32,    -16,   280,   288,  944,
+    428,   -484
+};
diff --git a/libavcodec/aom_film_grain.h b/libavcodec/aom_film_grain.h
new file mode 100644
index 00000000000..5d772bd7d17
--- /dev/null
+++ b/libavcodec/aom_film_grain.h
@@ -0,0 +1,38 @@ 
+/*
+ * AOM film grain synthesis
+ * Copyright (c) 2021 Niklas Haas <ffmpeg@haasn.xyz>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+/**
+ * @file
+ * AOM film grain synthesis.
+ * @author Niklas Haas <ffmpeg@haasn.xyz>
+ */
+
+#ifndef AVCODEC_AOM_FILM_GRAIN_H
+#define AVCODEC_AOM_FILM_GRAIN_H
+
+#include "libavutil/film_grain_params.h"
+
+// Synthesizes film grain on top of `in` and stores the result to `out`. `out`
+// must already have been allocated and set to the same size and format as `in`.
+int ff_aom_apply_film_grain(AVFrame *out, const AVFrame *in,
+                            const AVFilmGrainParams *params);
+
+#endif /* AVCODEC_AOM_FILM_GRAIN_H */
diff --git a/libavcodec/aom_film_grain_template.c b/libavcodec/aom_film_grain_template.c
new file mode 100644
index 00000000000..5f9f29f1fab
--- /dev/null
+++ b/libavcodec/aom_film_grain_template.c
@@ -0,0 +1,577 @@ 
+/*
+ * AOM film grain synthesis
+ * Copyright (c) 2023 Niklas Haas <ffmpeg@haasn.xyz>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+/*
+ * Copyright © 2018, Niklas Haas
+ * Copyright © 2018, VideoLAN and dav1d authors
+ * Copyright © 2018, Two Orioles, LLC
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ *    list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ *    this list of conditions and the following disclaimer in the documentation
+ *    and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+ * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "bit_depth_template.c"
+
+#undef entry
+#undef bitdepth
+#undef bitdepth_max
+#undef HBD_DECL
+#undef HBD_CALL
+#undef SCALING_SIZE
+
+#if BIT_DEPTH > 8
+# define entry int16_t
+# define bitdepth_max ((1 << bitdepth) - 1)
+# define HBD_DECL , const int bitdepth
+# define HBD_CALL , bitdepth
+# define SCALING_SIZE 4096
+#else
+# define entry int8_t
+# define bitdepth 8
+# define bitdepth_max UINT8_MAX
+# define HBD_DECL
+# define HBD_CALL
+# define SCALING_SIZE 256
+#endif
+
+static void FUNC(generate_grain_y_c)(entry buf[][GRAIN_WIDTH],
+                                     const AVFilmGrainParams *const params
+                                     HBD_DECL)
+{
+    const AVFilmGrainAOMParams *const data = &params->codec.aom;
+    const int bitdepth_min_8 = bitdepth - 8;
+    unsigned seed = params->seed;
+    const int shift = 4 - bitdepth_min_8 + data->grain_scale_shift;
+    const int grain_ctr = 128 << bitdepth_min_8;
+    const int grain_min = -grain_ctr, grain_max = grain_ctr - 1;
+
+    const int ar_pad = 3;
+    const int ar_lag = data->ar_coeff_lag;
+
+    for (int y = 0; y < GRAIN_HEIGHT; y++) {
+        for (int x = 0; x < GRAIN_WIDTH; x++) {
+            const int value = get_random_number(11, &seed);
+            buf[y][x] = round2(gaussian_sequence[ value ], shift);
+        }
+    }
+
+    for (int y = ar_pad; y < GRAIN_HEIGHT; y++) {
+        for (int x = ar_pad; x < GRAIN_WIDTH - ar_pad; x++) {
+            const int8_t *coeff = data->ar_coeffs_y;
+            int sum = 0, grain;
+            for (int dy = -ar_lag; dy <= 0; dy++) {
+                for (int dx = -ar_lag; dx <= ar_lag; dx++) {
+                    if (!dx && !dy)
+                        break;
+                    sum += *(coeff++) * buf[y + dy][x + dx];
+                }
+            }
+
+            grain = buf[y][x] + round2(sum, data->ar_coeff_shift);
+            buf[y][x] = av_clip(grain, grain_min, grain_max);
+        }
+    }
+}
+
+static void
+FUNC(generate_grain_uv_c)(entry buf[][GRAIN_WIDTH],
+                          const entry buf_y[][GRAIN_WIDTH],
+                          const AVFilmGrainParams *const params, const intptr_t uv,
+                          const int subx, const int suby HBD_DECL)
+{
+    const AVFilmGrainAOMParams *const data = &params->codec.aom;
+    const int bitdepth_min_8 = bitdepth - 8;
+    unsigned seed = params->seed ^ (uv ? 0x49d8 : 0xb524);
+    const int shift = 4 - bitdepth_min_8 + data->grain_scale_shift;
+    const int grain_ctr = 128 << bitdepth_min_8;
+    const int grain_min = -grain_ctr, grain_max = grain_ctr - 1;
+
+    const int chromaW = subx ? SUB_GRAIN_WIDTH  : GRAIN_WIDTH;
+    const int chromaH = suby ? SUB_GRAIN_HEIGHT : GRAIN_HEIGHT;
+
+    const int ar_pad = 3;
+    const int ar_lag = data->ar_coeff_lag;
+
+    for (int y = 0; y < chromaH; y++) {
+        for (int x = 0; x < chromaW; x++) {
+            const int value = get_random_number(11, &seed);
+            buf[y][x] = round2(gaussian_sequence[ value ], shift);
+        }
+    }
+
+    for (int y = ar_pad; y < chromaH; y++) {
+        for (int x = ar_pad; x < chromaW - ar_pad; x++) {
+            const int8_t *coeff = data->ar_coeffs_uv[uv];
+            int sum = 0, grain;
+            for (int dy = -ar_lag; dy <= 0; dy++) {
+                for (int dx = -ar_lag; dx <= ar_lag; dx++) {
+                    // For the final (current) pixel, we need to add in the
+                    // contribution from the luma grain texture
+                    if (!dx && !dy) {
+                        const int lumaX = ((x - ar_pad) << subx) + ar_pad;
+                        const int lumaY = ((y - ar_pad) << suby) + ar_pad;
+                        int luma = 0;
+                        if (!data->num_y_points)
+                            break;
+                        for (int i = 0; i <= suby; i++) {
+                            for (int j = 0; j <= subx; j++) {
+                                luma += buf_y[lumaY + i][lumaX + j];
+                            }
+                        }
+                        luma = round2(luma, subx + suby);
+                        sum += luma * (*coeff);
+                        break;
+                    }
+
+                    sum += *(coeff++) * buf[y + dy][x + dx];
+                }
+            }
+
+            grain = buf[y][x] + round2(sum, data->ar_coeff_shift);
+            buf[y][x] = av_clip(grain, grain_min, grain_max);
+        }
+    }
+}
+
+// samples from the correct block of a grain LUT, while taking into account the
+// offsets provided by the offsets cache
+static inline entry FUNC(sample_lut)(const entry grain_lut[][GRAIN_WIDTH],
+                                     const int offsets[2][2],
+                                     const int subx, const int suby,
+                                     const int bx, const int by,
+                                     const int x, const int y)
+{
+    const int randval = offsets[bx][by];
+    const int offx = 3 + (2 >> subx) * (3 + (randval >> 4));
+    const int offy = 3 + (2 >> suby) * (3 + (randval & 0xF));
+    return grain_lut[offy + y + (FG_BLOCK_SIZE >> suby) * by]
+                    [offx + x + (FG_BLOCK_SIZE >> subx) * bx];
+}
+
+static void FUNC(fgy_32x32xn_c)(pixel *const dst_row, const pixel *const src_row,
+                                const ptrdiff_t stride,
+                                const AVFilmGrainParams *const params, const size_t pw,
+                                const uint8_t scaling[SCALING_SIZE],
+                                const entry grain_lut[][GRAIN_WIDTH],
+                                const int bh, const int row_num HBD_DECL)
+{
+    const AVFilmGrainAOMParams *const data = &params->codec.aom;
+    const int rows = 1 + (data->overlap_flag && row_num > 0);
+    const int bitdepth_min_8 = bitdepth - 8;
+    const int grain_ctr = 128 << bitdepth_min_8;
+    const int grain_min = -grain_ctr, grain_max = grain_ctr - 1;
+    unsigned seed[2];
+    int offsets[2 /* col offset */][2 /* row offset */];
+
+    int min_value, max_value;
+    if (data->limit_output_range) {
+        min_value = 16 << bitdepth_min_8;
+        max_value = 235 << bitdepth_min_8;
+    } else {
+        min_value = 0;
+        max_value = bitdepth_max;
+    }
+
+    // seed[0] contains the current row, seed[1] contains the previous
+    for (int i = 0; i < rows; i++) {
+        seed[i] = params->seed;
+        seed[i] ^= (((row_num - i) * 37  + 178) & 0xFF) << 8;
+        seed[i] ^= (((row_num - i) * 173 + 105) & 0xFF);
+    }
+
+    av_assert1(stride % (FG_BLOCK_SIZE * sizeof(pixel)) == 0);
+
+    // process this row in FG_BLOCK_SIZE^2 blocks
+    for (unsigned bx = 0; bx < pw; bx += FG_BLOCK_SIZE) {
+        const int bw = FFMIN(FG_BLOCK_SIZE, (int) pw - bx);
+        const pixel *src;
+        pixel *dst;
+        int noise;
+
+        // x/y block offsets to compensate for overlapped regions
+        const int ystart = data->overlap_flag && row_num ? FFMIN(2, bh) : 0;
+        const int xstart = data->overlap_flag && bx      ? FFMIN(2, bw) : 0;
+
+        static const int w[2][2] = { { 27, 17 }, { 17, 27 } };
+
+        if (data->overlap_flag && bx) {
+            // shift previous offsets left
+            for (int i = 0; i < rows; i++)
+                offsets[1][i] = offsets[0][i];
+        }
+
+        // update current offsets
+        for (int i = 0; i < rows; i++)
+            offsets[0][i] = get_random_number(8, &seed[i]);
+
+#define add_noise_y(x, y, grain)                                                \
+        src = (const pixel*)((const char*)src_row + (y) * stride) + (x) + bx;   \
+        dst = (pixel*)((char*)dst_row + (y) * stride) + (x) + bx;               \
+        noise = round2(scaling[ *src ] * (grain), data->scaling_shift);         \
+        *dst = av_clip(*src + noise, min_value, max_value);
+
+        for (int y = ystart; y < bh; y++) {
+            // Non-overlapped image region (straightforward)
+            for (int x = xstart; x < bw; x++) {
+                int grain = FUNC(sample_lut)(grain_lut, offsets, 0, 0, 0, 0, x, y);
+                add_noise_y(x, y, grain);
+            }
+
+            // Special case for overlapped column
+            for (int x = 0; x < xstart; x++) {
+                int grain = FUNC(sample_lut)(grain_lut, offsets, 0, 0, 0, 0, x, y);
+                int old   = FUNC(sample_lut)(grain_lut, offsets, 0, 0, 1, 0, x, y);
+                grain = round2(old * w[x][0] + grain * w[x][1], 5);
+                grain = av_clip(grain, grain_min, grain_max);
+                add_noise_y(x, y, grain);
+            }
+        }
+
+        for (int y = 0; y < ystart; y++) {
+            // Special case for overlapped row (sans corner)
+            for (int x = xstart; x < bw; x++) {
+                int grain = FUNC(sample_lut)(grain_lut, offsets, 0, 0, 0, 0, x, y);
+                int old   = FUNC(sample_lut)(grain_lut, offsets, 0, 0, 0, 1, x, y);
+                grain = round2(old * w[y][0] + grain * w[y][1], 5);
+                grain = av_clip(grain, grain_min, grain_max);
+                add_noise_y(x, y, grain);
+            }
+
+            // Special case for doubly-overlapped corner
+            for (int x = 0; x < xstart; x++) {
+                int grain = FUNC(sample_lut)(grain_lut, offsets, 0, 0, 0, 0, x, y);
+                int top = FUNC(sample_lut)(grain_lut, offsets, 0, 0, 0, 1, x, y);
+                int old = FUNC(sample_lut)(grain_lut, offsets, 0, 0, 1, 1, x, y);
+
+                // Blend the top pixel with the top left block
+                top = round2(old * w[x][0] + top * w[x][1], 5);
+                top = av_clip(top, grain_min, grain_max);
+
+                // Blend the current pixel with the left block
+                old = FUNC(sample_lut)(grain_lut, offsets, 0, 0, 1, 0, x, y);
+                grain = round2(old * w[x][0] + grain * w[x][1], 5);
+                grain = av_clip(grain, grain_min, grain_max);
+
+                // Mix the row rows together and apply grain
+                grain = round2(top * w[y][0] + grain * w[y][1], 5);
+                grain = av_clip(grain, grain_min, grain_max);
+                add_noise_y(x, y, grain);
+            }
+        }
+    }
+}
+/**
+ * Add film grain to one row of chroma blocks.
+ *
+ * Reads bh lines of pw pixels from src_row, adds scaled grain taken from
+ * grain_lut and stores the clipped result to dst_row. `stride` is a byte
+ * stride shared by src_row and dst_row; luma_row is addressed with
+ * luma_stride (also in bytes).
+ *
+ * The scaling LUT is indexed by the co-located luma sample (averaged with its
+ * right neighbour when sx is set). Unless chroma_scaling_from_luma is set,
+ * that value is first combined with the chroma sample itself using the
+ * uv_mult_luma / uv_mult / uv_offset parameters selected by `uv`.
+ *
+ * With overlap_flag set, the first 2 >> sx columns of every block except the
+ * first one, and the first 2 >> sy lines when row_num is non-zero, blend the
+ * block's own grain with grain sampled for the neighbouring block.
+ *
+ * `is_id` only selects the upper clip limit (235 instead of 240, scaled to
+ * the bit depth) when limit_output_range is set.
+ *
+ * NOTE(review): bitdepth and bitdepth_max are not declared in this function;
+ * presumably they are provided through HBD_DECL or by the template - confirm.
+ */
+static void
+FUNC(fguv_32x32xn_c)(pixel *const dst_row, const pixel *const src_row,
+                     const ptrdiff_t stride, const AVFilmGrainParams *const params,
+                     const size_t pw, const uint8_t scaling[SCALING_SIZE],
+                     const entry grain_lut[][GRAIN_WIDTH], const int bh,
+                     const int row_num, const pixel *const luma_row,
+                     const ptrdiff_t luma_stride, const int uv, const int is_id,
+                     const int sx, const int sy HBD_DECL)
+{
+    const AVFilmGrainAOMParams *const data = &params->codec.aom;
+    const int rows = 1 + (data->overlap_flag && row_num > 0); // 2 if the row above is tracked too
+    const int bitdepth_min_8 = bitdepth - 8;
+    const int grain_ctr = 128 << bitdepth_min_8;
+    const int grain_min = -grain_ctr, grain_max = grain_ctr - 1; // clip range for blended grain
+    unsigned seed[2];
+    int offsets[2 /* col offset */][2 /* row offset */];
+
+    int min_value, max_value; // clip range for the output pixels
+    if (data->limit_output_range) {
+        min_value = 16 << bitdepth_min_8;
+        max_value = (is_id ? 235 : 240) << bitdepth_min_8;
+    } else {
+        min_value = 0;
+        max_value = bitdepth_max;
+    }
+
+    /* seed[0] contains the current row, seed[1] contains the previous.
+     * Each one starts from params->seed and gets two bytes derived from its
+     * row number XORed in. */
+    for (int i = 0; i < rows; i++) {
+        seed[i] = params->seed;
+        seed[i] ^= (((row_num - i) * 37  + 178) & 0xFF) << 8;
+        seed[i] ^= (((row_num - i) * 173 + 105) & 0xFF);
+    }
+
+    av_assert1(stride % (FG_BLOCK_SIZE * sizeof(pixel)) == 0);
+
+    /* process this row in FG_BLOCK_SIZE^2 blocks (subsampled): bx advances by
+     * FG_BLOCK_SIZE >> sx chroma pixels and bw is cut short at the right edge */
+    for (unsigned bx = 0; bx < pw; bx += FG_BLOCK_SIZE >> sx) {
+        const int bw = FFMIN(FG_BLOCK_SIZE >> sx, (int)(pw - bx));
+        int val, lx, ly, noise;  // scratch variables written by add_noise_uv()
+        const pixel *src, *luma; // scratch variables written by add_noise_uv()
+        pixel *dst, avg;         // scratch variables written by add_noise_uv()
+
+        /* x/y block offsets to compensate for overlapped regions: the number
+         * of leading lines / columns of this block that get blended with a
+         * neighbour (0 without overlap_flag, or for the first row / block) */
+        const int ystart = data->overlap_flag && row_num ? FFMIN(2 >> sy, bh) : 0;
+        const int xstart = data->overlap_flag && bx      ? FFMIN(2 >> sx, bw) : 0;
+
+        static const int w[2 /* sub */][2 /* off */][2] = { // { old weight, new weight }
+            { { 27, 17 }, { 17, 27 } }, // not subsampled: two overlapped positions
+            { { 23, 22 } },             // subsampled: a single overlapped position
+        };
+
+        if (data->overlap_flag && bx) {
+            /* shift previous offsets left: offsets[1] keeps the values that
+             * were used for the block to the left */
+            for (int i = 0; i < rows; i++)
+                offsets[1][i] = offsets[0][i];
+        }
+
+        /* update current offsets, one draw per tracked row (get_random_number
+         * is defined elsewhere; presumably it also advances seed[i]) */
+        for (int i = 0; i < rows; i++)
+            offsets[0][i] = get_random_number(8, &seed[i]);
+        /*
+         * add_noise_uv(x, y, grain): apply `grain` to the chroma sample at
+         * block-relative position (x, y).
+         *
+         * The LUT index starts out as the luma sample at
+         * ((bx + x) << sx, y << sy), averaged with its right neighbour when sx
+         * is set. Unless chroma_scaling_from_luma is set, it is replaced by
+         * ((avg * uv_mult_luma + src * uv_mult) >> 6) plus uv_offset scaled to
+         * the bit depth, clipped to [0, bitdepth_max]. The LUT entry times
+         * `grain` goes through round2() with scaling_shift, is added to the
+         * source sample and clipped to [min_value, max_value].
+         *
+         * The macro expands to several statements and writes the scratch
+         * variables declared above, so it must only be used inside braces.
+         */
+#define add_noise_uv(x, y, grain)                                               \
+            lx = (bx + x) << sx;                                                \
+            ly = y << sy;                                                       \
+            luma = (const pixel*)((const char*)luma_row + ly * luma_stride) + lx;\
+            avg = luma[0];                                                      \
+            if (sx)                                                             \
+                avg = (avg + luma[1] + 1) >> 1;                                 \
+            src = (const pixel*)((const char *)src_row + (y) * stride) + bx + (x);\
+            dst = (pixel *) ((char *) dst_row + (y) * stride) + bx + (x);       \
+            val = avg;                                                          \
+            if (!data->chroma_scaling_from_luma) {                              \
+                const int combined = avg * data->uv_mult_luma[uv] +             \
+                                    *src * data->uv_mult[uv];                   \
+                val = av_clip( (combined >> 6) +                                \
+                               (data->uv_offset[uv] * (1 << bitdepth_min_8)),   \
+                               0, bitdepth_max );                               \
+            }                                                                   \
+            noise = round2(scaling[ val ] * (grain), data->scaling_shift);      \
+            *dst = av_clip(*src + noise, min_value, max_value);
+
+        for (int y = ystart; y < bh; y++) { // lines below the vertical overlap
+            // Non-overlapped image region (straightforward)
+            for (int x = xstart; x < bw; x++) {
+                int grain = FUNC(sample_lut)(grain_lut, offsets, sx, sy, 0, 0, x, y);
+                add_noise_uv(x, y, grain);
+            }
+
+            /* Special case for overlapped column: weighted blend of the grain
+             * sampled for the left block (old) and for this block */
+            for (int x = 0; x < xstart; x++) {
+                int grain = FUNC(sample_lut)(grain_lut, offsets, sx, sy, 0, 0, x, y);
+                int old   = FUNC(sample_lut)(grain_lut, offsets, sx, sy, 1, 0, x, y);
+                grain = round2(old * w[sx][x][0] + grain * w[sx][x][1], 5);
+                grain = av_clip(grain, grain_min, grain_max);
+                add_noise_uv(x, y, grain);
+            }
+        }
+
+        for (int y = 0; y < ystart; y++) { // lines inside the vertical overlap
+            /* Special case for overlapped row (sans corner): same blend as
+             * above, but vertically, with weights picked by sy and y */
+            for (int x = xstart; x < bw; x++) {
+                int grain = FUNC(sample_lut)(grain_lut, offsets, sx, sy, 0, 0, x, y);
+                int old   = FUNC(sample_lut)(grain_lut, offsets, sx, sy, 0, 1, x, y);
+                grain = round2(old * w[sy][y][0] + grain * w[sy][y][1], 5);
+                grain = av_clip(grain, grain_min, grain_max);
+                add_noise_uv(x, y, grain);
+            }
+
+            /* Special case for doubly-overlapped corner: blend horizontally
+             * within the upper and within the current row first, then blend
+             * the two results vertically */
+            for (int x = 0; x < xstart; x++) {
+                int top = FUNC(sample_lut)(grain_lut, offsets, sx, sy, 0, 1, x, y);
+                int old = FUNC(sample_lut)(grain_lut, offsets, sx, sy, 1, 1, x, y);
+                int grain = FUNC(sample_lut)(grain_lut, offsets, sx, sy, 0, 0, x, y);
+
+                // Blend the top pixel with the top left block
+                top = round2(old * w[sx][x][0] + top * w[sx][x][1], 5);
+                top = av_clip(top, grain_min, grain_max);
+
+                // Blend the current pixel with the left block
+                old = FUNC(sample_lut)(grain_lut, offsets, sx, sy, 1, 0, x, y);
+                grain = round2(old * w[sx][x][0] + grain * w[sx][x][1], 5);
+                grain = av_clip(grain, grain_min, grain_max);
+
+                // Mix the two rows together and apply to image
+                grain = round2(top * w[sy][y][0] + grain * w[sy][y][1], 5);
+                grain = av_clip(grain, grain_min, grain_max);
+                add_noise_uv(x, y, grain);
+            }
+        }
+    }
+}
+
+/**
+ * Build a scaling lookup table from a piecewise-linear function.
+ *
+ * @param points  `num` control points as { x, y } pairs. The x coordinates
+ *                are shifted left by (bitdepth - 8) to obtain LUT indices and
+ *                must be strictly increasing.
+ * @param num     number of valid entries in `points`; with 0 the table is
+ *                simply zeroed and `points` is not read at all
+ * @param scaling output table; its first (1 << bitdepth) entries are written
+ *
+ * Entries before the first point take the first point's value, entries from
+ * the last point onwards take the last point's value, and everything in
+ * between is linearly interpolated.
+ *
+ * NOTE(review): bitdepth is not declared in this function; presumably it is
+ * provided through HBD_DECL or by the template - confirm.
+ */
+static void FUNC(generate_scaling)(const uint8_t points[][2], const int num,
+                                   uint8_t scaling[SCALING_SIZE] HBD_DECL)
+{
+    const int shift_x = bitdepth - 8;
+    const int scaling_size = 1 << bitdepth;
+    int max_value;
+    av_assert0(scaling_size <= SCALING_SIZE);
+
+    if (num == 0) {
+        memset(scaling, 0, scaling_size);
+        return;
+    }
+
+    // Only look at the last point once we know it exists: with num == 0,
+    // points[num - 1] would read in front of the array.
+    max_value = points[num - 1][0] << shift_x;
+
+    // Fill up the preceding entries with the initial value
+    memset(scaling, points[0][1], points[0][0] << shift_x);
+
+    // Linearly interpolate the values in the middle
+    for (int i = 0; i < num - 1; i++) {
+        const int bx = points[i][0];
+        const int by = points[i][1];
+        const int ex = points[i+1][0];
+        const int ey = points[i+1][1];
+        const int dx = ex - bx;
+        const int dy = ey - by;
+        int delta;
+        // Validate before dividing, so the assert can actually catch dx == 0
+        av_assert1(dx > 0);
+        // Per-step slope in 16.16 fixed point; d starts at 0.5 for rounding
+        delta = dy * ((0x10000 + (dx >> 1)) / dx);
+        for (int x = 0, d = 0x8000; x < dx; x++) {
+            scaling[(bx + x) << shift_x] = by + (d >> 16);
+            d += delta;
+        }
+    }
+
+    // Fill up the remaining entries with the final value
+    memset(&scaling[max_value], points[num - 1][1], scaling_size - max_value);
+
+#if BIT_DEPTH != 8
+    // The loop above only wrote every (1 << shift_x)-th entry; interpolate
+    // the entries in between from their two already-written neighbours
+    for (int i = 0; i < num - 1; i++) {
+        const int pad = 1 << shift_x, rnd = pad >> 1;
+        const int bx = points[i][0] << shift_x;
+        const int ex = points[i+1][0] << shift_x;
+        const int dx = ex - bx;
+        for (int x = 0; x < dx; x += pad) {
+            const int range = scaling[bx + x + pad] - scaling[bx + x];
+            for (int n = 1, r = rnd; n < pad; n++) {
+                r += range;
+                scaling[bx + x + n] = scaling[bx + x] + (r >> shift_x);
+            }
+        }
+    }
+#endif
+}
+
+/**
+ * Synthesize film grain for one stripe of FG_BLOCK_SIZE luma lines.
+ *
+ * Luma is processed when num_y_points is non-zero. A chroma plane is
+ * processed when it has scaling points of its own, or when
+ * chroma_scaling_from_luma is set; in the latter case both chroma planes use
+ * the luma table, scaling[0].
+ *
+ * Source pixels come from `in`, results go to `out`. Offsets are computed
+ * from out->linesize while in->linesize is used as the chroma stride, so the
+ * two frames are expected to share their line sizes (NOTE(review): not
+ * checked here - confirm against the caller).
+ *
+ * @param ss_x, ss_y  chroma subsampling shifts
+ * @param row         stripe index, in units of FG_BLOCK_SIZE luma lines
+ */
+static av_always_inline void
+FUNC(apply_grain_row)(AVFrame *out, const AVFrame *in,
+                      const int ss_x, const int ss_y,
+                      const uint8_t scaling[3][SCALING_SIZE],
+                      const entry grain_lut[3][GRAIN_HEIGHT+1][GRAIN_WIDTH],
+                      const AVFilmGrainParams *params,
+                      const int row HBD_DECL)
+{
+    const AVFilmGrainAOMParams *const data = &params->codec.aom;
+    const int from_luma = !!data->chroma_scaling_from_luma;
+    const int has_chroma = data->num_uv_points[0] || data->num_uv_points[1] ||
+                           from_luma;
+    const int is_id = out->colorspace == AVCOL_SPC_RGB;
+    // Stripe height in luma lines and in (rounded up) chroma lines
+    const int luma_bh = FFMIN(out->height - row * FG_BLOCK_SIZE, FG_BLOCK_SIZE);
+    const int chroma_bh = (luma_bh + ss_y) >> ss_y;
+    const int chroma_w = (out->width + ss_x) >> ss_x;
+    const ptrdiff_t uv_off = row * FG_BLOCK_SIZE * out->linesize[1] >> ss_y;
+    pixel *const luma_src = (pixel *)
+        ((char *) in->data[0] + row * FG_BLOCK_SIZE * in->linesize[0]);
+
+    if (data->num_y_points) {
+        const ptrdiff_t luma_off = row * FG_BLOCK_SIZE * out->linesize[0];
+        pixel *const luma_dst = (pixel *) ((char *) out->data[0] + luma_off);
+        FUNC(fgy_32x32xn_c)(luma_dst, luma_src, out->linesize[0], params,
+                            out->width, scaling[0], grain_lut[0], luma_bh,
+                            row HBD_CALL);
+    }
+
+    // Nothing left to do unless at least one chroma plane receives grain
+    if (!has_chroma)
+        return;
+
+    // The chroma code reads luma in pairs when subsampled horizontally, so
+    // replicate the last luma pixel of every line it looks at. Note that this
+    // stores one pixel past the visible width of the *input* luma plane.
+    if (out->width & ss_x) {
+        pixel *line = luma_src;
+        for (int y = 0; y < chroma_bh; y++,
+             line = (pixel *) ((char *) line + (in->linesize[0] << ss_y)))
+            line[out->width] = line[out->width - 1];
+    }
+
+    for (int pl = 0; pl < 2; pl++) {
+        // Without chroma_scaling_from_luma a plane needs its own points
+        if (!from_luma && !data->num_uv_points[pl])
+            continue;
+
+        FUNC(fguv_32x32xn_c)((pixel *) ((char *) out->data[1 + pl] + uv_off),
+                             (const pixel *) ((const char *) in->data[1 + pl] + uv_off),
+                             in->linesize[1], params, chroma_w,
+                             scaling[from_luma ? 0 : 1 + pl],
+                             grain_lut[1 + pl], chroma_bh, row, luma_src,
+                             in->linesize[0], pl, is_id, ss_x, ss_y HBD_CALL);
+    }
+}
+
+/**
+ * Apply AOM film grain to a whole frame.
+ *
+ * Builds the grain templates and the scaling tables that `params` calls for
+ * in local arrays, then processes the frame one block row at a time, reading
+ * from in_frame and writing to out_frame.
+ *
+ * @return 0; there is no failure path in this function
+ */
+static int FUNC(apply_film_grain)(AVFrame *out_frame, const AVFrame *in_frame,
+                                  const AVFilmGrainParams *params HBD_DECL)
+{
+    entry grain_lut[3][GRAIN_HEIGHT + 1][GRAIN_WIDTH];
+    uint8_t scaling[3][SCALING_SIZE];
+
+    const AVFilmGrainAOMParams *const aom = &params->codec.aom;
+    const AVPixFmtDescriptor *const pixdesc = av_pix_fmt_desc_get(out_frame->format);
+    const int num_rows = AV_CEIL_RSHIFT(out_frame->height, 5); /* log2(FG_BLOCK_SIZE) */
+    const int ss_x = pixdesc->log2_chroma_w;
+    const int ss_y = pixdesc->log2_chroma_h;
+    const int from_luma = !!aom->chroma_scaling_from_luma;
+
+    // Grain templates: luma is always generated, as it is also handed to the
+    // chroma generator; chroma only for the planes that will receive grain
+    FUNC(generate_grain_y_c)(grain_lut[0], params HBD_CALL);
+    for (int pl = 0; pl < 2; pl++) {
+        if (aom->num_uv_points[pl] || from_luma)
+            FUNC(generate_grain_uv_c)(grain_lut[1 + pl], grain_lut[0], params,
+                                      pl, ss_x, ss_y HBD_CALL);
+    }
+
+    // Scaling tables: the luma table is also required when the chroma planes
+    // are scaled from luma; per-plane chroma tables only if points exist
+    if (aom->num_y_points || from_luma)
+        FUNC(generate_scaling)(aom->y_points, aom->num_y_points,
+                               scaling[0] HBD_CALL);
+    for (int pl = 0; pl < 2; pl++) {
+        if (aom->num_uv_points[pl])
+            FUNC(generate_scaling)(aom->uv_points[pl], aom->num_uv_points[pl],
+                                   scaling[1 + pl] HBD_CALL);
+    }
+
+    for (int y = 0; y < num_rows; y++)
+        FUNC(apply_grain_row)(out_frame, in_frame, ss_x, ss_y, scaling,
+                              grain_lut, params, y HBD_CALL);
+
+    return 0;
+}