mirror of
https://github.com/FFmpeg/FFmpeg.git
synced 2024-12-28 20:53:54 +02:00
libavfilter/vf_dnn_detect: Add two outputs ssd support
For this kind of model, we can use its output directly as the final result, just like an ssd model. The difference is that the output is split into two tensors: [x_min, y_min, x_max, y_max, confidence] and [label_id]. For an example model, refer to: https://github.com/openvinotoolkit/open_model_zoo/tree/master/models/intel/person-detection-0106 Signed-off-by: Wenbin Chen <wenbin.chen@intel.com> Reviewed-by: Guo Yejun <yejun.guo@intel.com>
This commit is contained in:
parent
86435582a6
commit
56c5930ec3
@ -359,24 +359,48 @@ static int dnn_detect_post_proc_yolov3(AVFrame *frame, DNNData *output,
|
|||||||
return 0;
|
return 0;
|
||||||
}
|
}
|
||||||
|
|
||||||
static int dnn_detect_post_proc_ssd(AVFrame *frame, DNNData *output, AVFilterContext *filter_ctx)
|
static int dnn_detect_post_proc_ssd(AVFrame *frame, DNNData *output, int nb_outputs,
|
||||||
|
AVFilterContext *filter_ctx)
|
||||||
{
|
{
|
||||||
DnnDetectContext *ctx = filter_ctx->priv;
|
DnnDetectContext *ctx = filter_ctx->priv;
|
||||||
float conf_threshold = ctx->confidence;
|
float conf_threshold = ctx->confidence;
|
||||||
int proposal_count = output->height;
|
int proposal_count = 0;
|
||||||
int detect_size = output->width;
|
int detect_size = 0;
|
||||||
float *detections = output->data;
|
float *detections = NULL, *labels = NULL;
|
||||||
int nb_bboxes = 0;
|
int nb_bboxes = 0;
|
||||||
AVDetectionBBoxHeader *header;
|
AVDetectionBBoxHeader *header;
|
||||||
AVDetectionBBox *bbox;
|
AVDetectionBBox *bbox;
|
||||||
|
int scale_w = ctx->scale_width;
|
||||||
|
int scale_h = ctx->scale_height;
|
||||||
|
|
||||||
if (output->width != 7) {
|
if (nb_outputs == 1 && output->width == 7) {
|
||||||
|
proposal_count = output->height;
|
||||||
|
detect_size = output->width;
|
||||||
|
detections = output->data;
|
||||||
|
} else if (nb_outputs == 2 && output[0].width == 5) {
|
||||||
|
proposal_count = output[0].height;
|
||||||
|
detect_size = output[0].width;
|
||||||
|
detections = output[0].data;
|
||||||
|
labels = output[1].data;
|
||||||
|
} else if (nb_outputs == 2 && output[1].width == 5) {
|
||||||
|
proposal_count = output[1].height;
|
||||||
|
detect_size = output[1].width;
|
||||||
|
detections = output[1].data;
|
||||||
|
labels = output[0].data;
|
||||||
|
} else {
|
||||||
av_log(filter_ctx, AV_LOG_ERROR, "Model output shape doesn't match ssd requirement.\n");
|
av_log(filter_ctx, AV_LOG_ERROR, "Model output shape doesn't match ssd requirement.\n");
|
||||||
return AVERROR(EINVAL);
|
return AVERROR(EINVAL);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
if (proposal_count == 0)
|
||||||
|
return 0;
|
||||||
|
|
||||||
for (int i = 0; i < proposal_count; ++i) {
|
for (int i = 0; i < proposal_count; ++i) {
|
||||||
float conf = detections[i * detect_size + 2];
|
float conf;
|
||||||
|
if (nb_outputs == 1)
|
||||||
|
conf = detections[i * detect_size + 2];
|
||||||
|
else
|
||||||
|
conf = detections[i * detect_size + 4];
|
||||||
if (conf < conf_threshold) {
|
if (conf < conf_threshold) {
|
||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
@ -398,12 +422,24 @@ static int dnn_detect_post_proc_ssd(AVFrame *frame, DNNData *output, AVFilterCon
|
|||||||
|
|
||||||
for (int i = 0; i < proposal_count; ++i) {
|
for (int i = 0; i < proposal_count; ++i) {
|
||||||
int av_unused image_id = (int)detections[i * detect_size + 0];
|
int av_unused image_id = (int)detections[i * detect_size + 0];
|
||||||
int label_id = (int)detections[i * detect_size + 1];
|
int label_id;
|
||||||
float conf = detections[i * detect_size + 2];
|
float conf, x0, y0, x1, y1;
|
||||||
float x0 = detections[i * detect_size + 3];
|
|
||||||
float y0 = detections[i * detect_size + 4];
|
if (nb_outputs == 1) {
|
||||||
float x1 = detections[i * detect_size + 5];
|
label_id = (int)detections[i * detect_size + 1];
|
||||||
float y1 = detections[i * detect_size + 6];
|
conf = detections[i * detect_size + 2];
|
||||||
|
x0 = detections[i * detect_size + 3];
|
||||||
|
y0 = detections[i * detect_size + 4];
|
||||||
|
x1 = detections[i * detect_size + 5];
|
||||||
|
y1 = detections[i * detect_size + 6];
|
||||||
|
} else {
|
||||||
|
label_id = (int)labels[i];
|
||||||
|
x0 = detections[i * detect_size] / scale_w;
|
||||||
|
y0 = detections[i * detect_size + 1] / scale_h;
|
||||||
|
x1 = detections[i * detect_size + 2] / scale_w;
|
||||||
|
y1 = detections[i * detect_size + 3] / scale_h;
|
||||||
|
conf = detections[i * detect_size + 4];
|
||||||
|
}
|
||||||
|
|
||||||
if (conf < conf_threshold) {
|
if (conf < conf_threshold) {
|
||||||
continue;
|
continue;
|
||||||
@ -447,7 +483,7 @@ static int dnn_detect_post_proc_ov(AVFrame *frame, DNNData *output, int nb_outpu
|
|||||||
|
|
||||||
switch (ctx->model_type) {
|
switch (ctx->model_type) {
|
||||||
case DDMT_SSD:
|
case DDMT_SSD:
|
||||||
ret = dnn_detect_post_proc_ssd(frame, output, filter_ctx);
|
ret = dnn_detect_post_proc_ssd(frame, output, nb_outputs, filter_ctx);
|
||||||
if (ret < 0)
|
if (ret < 0)
|
||||||
return ret;
|
return ret;
|
||||||
break;
|
break;
|
||||||
|
Loading…
Reference in New Issue
Block a user