diff mbox series

[FFmpeg-devel,2/2] libavfilter/vf_dnn_detect: Add yolo support

Message ID 20231121022018.285533-2-wenbin.chen@intel.com
State Accepted
Commit 47b2328076f865a12dd0d50aed02fbef32f69490
Headers show
Series [FFmpeg-devel,1/2] libavfilter/vf_dnn_detect: Add model_type option. | expand

Checks

Context Check Description
yinshiyou/make_loongarch64 success Make finished
yinshiyou/make_fate_loongarch64 success Make fate finished
andriy/make_x86 success Make finished
andriy/make_fate_x86 success Make fate finished

Commit Message

Chen, Wenbin Nov. 21, 2023, 2:20 a.m. UTC
From: Wenbin Chen <wenbin.chen@intel.com>

Add yolo support. Yolo model doesn't output final result. It outputs
candidate boxes, so we need post-process to remove overlap boxes to
get final results. Also, the box's coordinators relate to cell and
anchors, so we need these information to calculate boxes as well.

Model detail please refer to: https://github.com/openvinotoolkit/open_model_zoo/tree/master/models/public/yolo-v2-tf

Signed-off-by: Wenbin Chen <wenbin.chen@intel.com>
---
 libavfilter/dnn/dnn_backend_openvino.c |   6 +-
 libavfilter/vf_dnn_detect.c            | 242 ++++++++++++++++++++++++-
 2 files changed, 244 insertions(+), 4 deletions(-)

Comments

Guo, Yejun Nov. 25, 2023, 4:52 a.m. UTC | #1
> -----Original Message-----
> From: ffmpeg-devel <ffmpeg-devel-bounces@ffmpeg.org> On Behalf Of
> wenbin.chen-at-intel.com@ffmpeg.org
> Sent: Tuesday, November 21, 2023 10:20 AM
> To: ffmpeg-devel@ffmpeg.org
> Subject: [FFmpeg-devel] [PATCH 2/2] libavfilter/vf_dnn_detect: Add yolo
> support
> 
> From: Wenbin Chen <wenbin.chen@intel.com>
> 
> Add yolo support. Yolo model doesn't output final result. It outputs candidate
> boxes, so we need post-process to remove overlap boxes to get final results.
> Also, the box's coordinators relate to cell and anchors, so we need these
> information to calculate boxes as well.
> 
> Model detail please refer to:
> https://github.com/openvinotoolkit/open_model_zoo/tree/master/models/
> public/yolo-v2-tf
> 
> Signed-off-by: Wenbin Chen <wenbin.chen@intel.com>
> ---
>  libavfilter/dnn/dnn_backend_openvino.c |   6 +-
>  libavfilter/vf_dnn_detect.c            | 242 ++++++++++++++++++++++++-
>  2 files changed, 244 insertions(+), 4 deletions(-)


Looks good to me, will push soon, thanks
diff mbox series

Patch

diff --git a/libavfilter/dnn/dnn_backend_openvino.c b/libavfilter/dnn/dnn_backend_openvino.c
index d3af8c34ce..6fe8b9c243 100644
--- a/libavfilter/dnn/dnn_backend_openvino.c
+++ b/libavfilter/dnn/dnn_backend_openvino.c
@@ -386,9 +386,9 @@  static void infer_completion_callback(void *args)
         ov_shape_free(&output_shape);
         return;
     }
-    output.channels = dims[1];
-    output.height   = dims[2];
-    output.width    = dims[3];
+    output.channels = output_shape.rank > 2 ? dims[output_shape.rank - 3] : 1;
+    output.height   = output_shape.rank > 1 ? dims[output_shape.rank - 2] : 1;
+    output.width    = output_shape.rank > 0 ? dims[output_shape.rank - 1] : 1;
     av_assert0(request->lltask_count <= dims[0]);
     ov_shape_free(&output_shape);
 #else
diff --git a/libavfilter/vf_dnn_detect.c b/libavfilter/vf_dnn_detect.c
index 9db90ee4cf..7ac3bb0b58 100644
--- a/libavfilter/vf_dnn_detect.c
+++ b/libavfilter/vf_dnn_detect.c
@@ -30,9 +30,11 @@ 
 #include "libavutil/time.h"
 #include "libavutil/avstring.h"
 #include "libavutil/detection_bbox.h"
+#include "libavutil/fifo.h"
 
 typedef enum {
-    DDMT_SSD
+    DDMT_SSD,
+    DDMT_YOLOV1V2,
 } DNNDetectionModelType;
 
 typedef struct DnnDetectContext {
@@ -43,6 +45,15 @@  typedef struct DnnDetectContext {
     char **labels;
     int label_count;
     DNNDetectionModelType model_type;
+    int cell_w;
+    int cell_h;
+    int nb_classes;
+    AVFifo *bboxes_fifo;
+    int scale_width;
+    int scale_height;
+    char *anchors_str;
+    float *anchors;
+    int nb_anchor;
 } DnnDetectContext;
 
 #define OFFSET(x) offsetof(DnnDetectContext, dnnctx.x)
@@ -61,11 +72,218 @@  static const AVOption dnn_detect_options[] = {
     { "labels",      "path to labels file",        OFFSET2(labels_filename), AV_OPT_TYPE_STRING,    { .str = NULL }, 0, 0, FLAGS },
     { "model_type",  "DNN detection model type",   OFFSET2(model_type),      AV_OPT_TYPE_INT,       { .i64 = DDMT_SSD },    INT_MIN, INT_MAX, FLAGS, "model_type" },
         { "ssd",     "output shape [1, 1, N, 7]",  0,                        AV_OPT_TYPE_CONST,       { .i64 = DDMT_SSD },    0, 0, FLAGS, "model_type" },
+        { "yolo",    "output shape [1, N*Cx*Cy*DetectionBox]",  0,           AV_OPT_TYPE_CONST,       { .i64 = DDMT_YOLOV1V2 },    0, 0, FLAGS, "model_type" },
+    { "cell_w",      "cell width",                 OFFSET2(cell_w),          AV_OPT_TYPE_INT,       { .i64 = 0 },    0, INTMAX_MAX, FLAGS },
+    { "cell_h",      "cell height",                OFFSET2(cell_h),          AV_OPT_TYPE_INT,       { .i64 = 0 },    0, INTMAX_MAX, FLAGS },
+    { "nb_classes",  "The number of class",        OFFSET2(nb_classes),      AV_OPT_TYPE_INT,       { .i64 = 0 },    0, INTMAX_MAX, FLAGS },
+    { "anchors",     "anchors, splited by '&'",    OFFSET2(anchors_str),         AV_OPT_TYPE_STRING,    { .str = NULL }, 0, 0, FLAGS },
     { NULL }
 };
 
 AVFILTER_DEFINE_CLASS(dnn_detect);
 
+static int dnn_detect_get_label_id(int nb_classes, int cell_size, float *label_data)
+{
+    float max_prob = 0;
+    int label_id = 0;
+    for (int i = 0; i < nb_classes; i++) {
+        if (label_data[i * cell_size] > max_prob) {
+            max_prob = label_data[i * cell_size];
+            label_id = i;
+        }
+    }
+    return label_id;
+}
+
+static int dnn_detect_parse_anchors(char *anchors_str, float **anchors)
+{
+    char *saveptr = NULL, *token;
+    float *anchors_buf;
+    int nb_anchor = 0, i = 0;
+    while(anchors_str[i] != '\0') {
+        if(anchors_str[i] == '&')
+            nb_anchor++;
+        i++;
+    }
+    nb_anchor++;
+    anchors_buf = av_mallocz(nb_anchor * sizeof(*anchors));
+    if (!anchors_buf) {
+        return 0;
+    }
+    for (int i = 0; i < nb_anchor; i++) {
+        token = av_strtok(anchors_str, "&", &saveptr);
+        anchors_buf[i] = strtof(token, NULL);
+        anchors_str = NULL;
+    }
+    *anchors = anchors_buf;
+    return nb_anchor;
+}
+
+/* Calculate Intersection Over Union */
+static float dnn_detect_IOU(AVDetectionBBox *bbox1, AVDetectionBBox *bbox2)
+{
+    float overlapping_width = FFMIN(bbox1->x + bbox1->w, bbox2->x + bbox2->w) - FFMAX(bbox1->x, bbox2->x);
+    float overlapping_height = FFMIN(bbox1->y + bbox1->h, bbox2->y + bbox2->h) - FFMAX(bbox1->y, bbox2->y);
+    float intersection_area =
+        (overlapping_width < 0 || overlapping_height < 0) ? 0 : overlapping_height * overlapping_width;
+    float union_area = bbox1->w * bbox1->h + bbox2->w * bbox2->h - intersection_area;
+    return intersection_area / union_area;
+}
+
+static int dnn_detect_parse_yolo_output(AVFrame *frame, DNNData *output, int output_index,
+                                      AVFilterContext *filter_ctx)
+{
+    DnnDetectContext *ctx = filter_ctx->priv;
+    float conf_threshold = ctx->confidence;
+    int detection_boxes, box_size, cell_w, cell_h, scale_w, scale_h;
+    int nb_classes = ctx->nb_classes;
+    float *output_data = output[output_index].data;
+    float *anchors = ctx->anchors;
+    AVDetectionBBox *bbox;
+
+    if (ctx->model_type == DDMT_YOLOV1V2) {
+        cell_w = ctx->cell_w;
+        cell_h = ctx->cell_h;
+        scale_w = cell_w;
+        scale_h = cell_h;
+    }
+    box_size = nb_classes + 5;
+
+    if (!cell_h || !cell_w) {
+        av_log(filter_ctx, AV_LOG_ERROR, "cell_w and cell_h are detected\n");
+        return AVERROR(EINVAL);
+    }
+
+    if (!nb_classes) {
+        av_log(filter_ctx, AV_LOG_ERROR, "nb_classes is not set\n");
+        return AVERROR(EINVAL);
+    }
+
+    if (!anchors) {
+        av_log(filter_ctx, AV_LOG_ERROR, "anchors is not set\n");
+        return AVERROR(EINVAL);
+    }
+
+    if (output[output_index].channels * output[output_index].width *
+            output[output_index].height % (box_size * cell_w * cell_h)) {
+        av_log(filter_ctx, AV_LOG_ERROR, "wrong cell_w, cell_h or nb_classes\n");
+        return AVERROR(EINVAL);
+    }
+    detection_boxes = output[output_index].channels *
+                      output[output_index].height *
+                      output[output_index].width / box_size / cell_w / cell_h;
+
+    /**
+     * find all candidate bbox
+     * yolo output can be reshaped to [B, N*D, Cx, Cy]
+     * Detection box 'D' has format [`x`, `y`, `h`, `w`, `box_score`, `class_no_1`, ...,]
+     **/
+    for (int box_id = 0; box_id < detection_boxes; box_id++) {
+        for (int cx = 0; cx < cell_w; cx++)
+            for (int cy = 0; cy < cell_h; cy++) {
+                float x, y, w, h, conf;
+                float *detection_boxes_data;
+                int label_id;
+
+                detection_boxes_data = output_data + box_id * box_size * cell_w * cell_h;
+                conf = detection_boxes_data[cy * cell_w + cx + 4 * cell_w * cell_h];
+                if (conf < conf_threshold) {
+                    continue;
+                }
+
+                x    = detection_boxes_data[cy * cell_w + cx];
+                y    = detection_boxes_data[cy * cell_w + cx + cell_w * cell_h];
+                w    = detection_boxes_data[cy * cell_w + cx + 2 * cell_w * cell_h];
+                h    = detection_boxes_data[cy * cell_w + cx + 3 * cell_w * cell_h];
+                label_id = dnn_detect_get_label_id(ctx->nb_classes, cell_w * cell_h,
+                                    detection_boxes_data + cy * cell_w + cx + 5 * cell_w * cell_h);
+                conf = conf * detection_boxes_data[cy * cell_w + cx + (label_id + 5) * cell_w * cell_h];
+
+                bbox = av_mallocz(sizeof(*bbox));
+                if (!bbox)
+                    return AVERROR(ENOMEM);
+
+                bbox->w = exp(w) * anchors[box_id * 2] * frame->width / scale_w;
+                bbox->h = exp(h) * anchors[box_id * 2 + 1] * frame->height / scale_h;
+                bbox->x = (cx + x) / cell_w * frame->width - bbox->w / 2;
+                bbox->y = (cy + y) / cell_h * frame->height - bbox->h / 2;
+                bbox->detect_confidence = av_make_q((int)(conf * 10000), 10000);
+                if (ctx->labels && label_id < ctx->label_count) {
+                    av_strlcpy(bbox->detect_label, ctx->labels[label_id], sizeof(bbox->detect_label));
+                } else {
+                    snprintf(bbox->detect_label, sizeof(bbox->detect_label), "%d", label_id);
+                }
+
+                if (av_fifo_write(ctx->bboxes_fifo, &bbox, 1) < 0) {
+                    av_freep(&bbox);
+                    return AVERROR(ENOMEM);
+                }
+            }
+    }
+    return 0;
+}
+
+static int dnn_detect_fill_side_data(AVFrame *frame, AVFilterContext *filter_ctx)
+{
+    DnnDetectContext *ctx = filter_ctx->priv;
+    float conf_threshold = ctx->confidence;
+    AVDetectionBBox *bbox;
+    int nb_bboxes = 0;
+    AVDetectionBBoxHeader *header;
+    if (av_fifo_can_read(ctx->bboxes_fifo) == 0) {
+        av_log(filter_ctx, AV_LOG_VERBOSE, "nothing detected in this frame.\n");
+        return 0;
+    }
+
+    /* remove overlap bboxes */
+    for (int i = 0; i < av_fifo_can_read(ctx->bboxes_fifo); i++){
+        av_fifo_peek(ctx->bboxes_fifo, &bbox, 1, i);
+        for (int j = 0; j < av_fifo_can_read(ctx->bboxes_fifo); j++) {
+            AVDetectionBBox *overlap_bbox;
+            av_fifo_peek(ctx->bboxes_fifo, &overlap_bbox, 1, j);
+            if (!strcmp(bbox->detect_label, overlap_bbox->detect_label) &&
+                av_cmp_q(bbox->detect_confidence, overlap_bbox->detect_confidence) < 0 &&
+                dnn_detect_IOU(bbox, overlap_bbox) >= conf_threshold) {
+                    bbox->classify_count = -1; // bad result
+                    nb_bboxes++;
+                    break;
+                }
+        }
+    }
+    nb_bboxes = av_fifo_can_read(ctx->bboxes_fifo) - nb_bboxes;
+    header = av_detection_bbox_create_side_data(frame, nb_bboxes);
+    if (!header) {
+        av_log(filter_ctx, AV_LOG_ERROR, "failed to create side data with %d bounding boxes\n", nb_bboxes);
+         return -1;
+     }
+    av_strlcpy(header->source, ctx->dnnctx.model_filename, sizeof(header->source));
+
+    while(av_fifo_can_read(ctx->bboxes_fifo)) {
+        AVDetectionBBox *candidate_bbox;
+        av_fifo_read(ctx->bboxes_fifo, &candidate_bbox, 1);
+
+        if (nb_bboxes > 0 && candidate_bbox->classify_count != -1) {
+            bbox = av_get_detection_bbox(header, header->nb_bboxes - nb_bboxes);
+            memcpy(bbox, candidate_bbox, sizeof(*bbox));
+            nb_bboxes--;
+        }
+        av_freep(&candidate_bbox);
+    }
+    return 0;
+}
+
+static int dnn_detect_post_proc_yolo(AVFrame *frame, DNNData *output, AVFilterContext *filter_ctx)
+{
+    int ret = 0;
+    ret = dnn_detect_parse_yolo_output(frame, output, 0, filter_ctx);
+    if (ret < 0)
+        return ret;
+    ret = dnn_detect_fill_side_data(frame, filter_ctx);
+    if (ret < 0)
+        return ret;
+    return 0;
+}
+
 static int dnn_detect_post_proc_ssd(AVFrame *frame, DNNData *output, AVFilterContext *filter_ctx)
 {
     DnnDetectContext *ctx = filter_ctx->priv;
@@ -158,6 +376,10 @@  static int dnn_detect_post_proc_ov(AVFrame *frame, DNNData *output, AVFilterCont
         if (ret < 0)
             return ret;
         break;
+    case DDMT_YOLOV1V2:
+        ret = dnn_detect_post_proc_yolo(frame, output, filter_ctx);
+        if (ret < 0)
+            return ret;
     }
 
     return 0;
@@ -356,11 +578,22 @@  static av_cold int dnn_detect_init(AVFilterContext *context)
     ret = check_output_nb(ctx, dnn_ctx->backend_type, dnn_ctx->nb_outputs);
     if (ret < 0)
         return ret;
+    ctx->bboxes_fifo = av_fifo_alloc2(1, sizeof(AVDetectionBBox *), AV_FIFO_FLAG_AUTO_GROW);
+    if (!ctx->bboxes_fifo)
+        return AVERROR(ENOMEM);
     ff_dnn_set_detect_post_proc(&ctx->dnnctx, dnn_detect_post_proc);
 
     if (ctx->labels_filename) {
         return read_detect_label_file(context);
     }
+    if (ctx->anchors_str) {
+        ret = dnn_detect_parse_anchors(ctx->anchors_str, &ctx->anchors);
+        if (!ctx->anchors) {
+            av_log(context, AV_LOG_ERROR, "failed to parse anchors_str\n");
+            return AVERROR(EINVAL);
+        }
+        ctx->nb_anchor = ret;
+    }
     return 0;
 }
 
@@ -460,7 +693,14 @@  static int dnn_detect_activate(AVFilterContext *filter_ctx)
 static av_cold void dnn_detect_uninit(AVFilterContext *context)
 {
     DnnDetectContext *ctx = context->priv;
+    AVDetectionBBox *bbox;
     ff_dnn_uninit(&ctx->dnnctx);
+    while(av_fifo_can_read(ctx->bboxes_fifo)) {
+        av_fifo_read(ctx->bboxes_fifo, &bbox, 1);
+        av_freep(&bbox);
+    }
+    av_fifo_freep2(&ctx->bboxes_fifo);
+    av_freep(&ctx->anchors);
     free_detect_labels(ctx);
 }