author     Eric Dao <eric@erickhangdao.com>  2025-03-10 17:54:31 -0400
committer  Eric Dao <eric@erickhangdao.com>  2025-03-10 17:54:31 -0400
commit     ab224e2e6ba65f5a369ec392f99cd8845ad06c98 (patch)
tree       a1e757e9341863ed52b8ad4c5a1c45933aab9da4 /python/openvino/runtime/common/models/src/detection_model_yolo.cpp
parent     40da1752f2c8639186b72f6838aa415e854d0b1d (diff)
completed thesis (HEAD, master)
Diffstat (limited to 'python/openvino/runtime/common/models/src/detection_model_yolo.cpp')
-rw-r--r--  python/openvino/runtime/common/models/src/detection_model_yolo.cpp  481
1 file changed, 481 insertions, 0 deletions
diff --git a/python/openvino/runtime/common/models/src/detection_model_yolo.cpp b/python/openvino/runtime/common/models/src/detection_model_yolo.cpp
new file mode 100644
index 0000000..2c4fb1d
--- /dev/null
+++ b/python/openvino/runtime/common/models/src/detection_model_yolo.cpp
@@ -0,0 +1,481 @@
+/*
+// Copyright (C) 2020-2022 Intel Corporation
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+*/
+
+#include "models/detection_model_yolo.h"
+
+#include <algorithm>
+#include <cmath>
+#include <cstdint>
+#include <stdexcept>
+#include <string>
+#include <utility>
+#include <vector>
+
+#include <openvino/openvino.hpp>
+
+#include <utils/common.hpp>
+#include <utils/slog.hpp>
+
+#include "models/internal_model_data.h"
+#include "models/results.h"
+
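+// Default anchor priors for each supported YOLO variant, stored as (width, height)
+// pairs. For YOLOv1/v2 the values are in grid-cell units; for the other variants
+// they are in pixels at the network input resolution. Used when the caller does
+// not pass explicit anchors.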
+std::vector<float> defaultAnchors[] = {
+ // YOLOv1v2
+ {0.57273f, 0.677385f, 1.87446f, 2.06253f, 3.33843f, 5.47434f, 7.88282f, 3.52778f, 9.77052f, 9.16828f},
+ // YOLOv3
+ {10.0f, 13.0f, 16.0f, 30.0f, 33.0f, 23.0f, 30.0f, 61.0f, 62.0f, 45.0f,
+  59.0f, 119.0f, 116.0f, 90.0f, 156.0f, 198.0f, 373.0f, 326.0f},
+ // YOLOv4
+ {12.0f, 16.0f, 19.0f, 36.0f, 40.0f, 28.0f, 36.0f, 75.0f, 76.0f, 55.0f,
+  72.0f, 146.0f, 142.0f, 110.0f, 192.0f, 243.0f, 459.0f, 401.0f},
+ // YOLOv4_Tiny
+ {10.0f, 14.0f, 23.0f, 27.0f, 37.0f, 58.0f, 81.0f, 82.0f, 135.0f, 169.0f, 344.0f, 319.0f},
+ // YOLOF
+ {16.0f, 16.0f, 32.0f, 32.0f, 64.0f, 64.0f, 128.0f, 128.0f, 256.0f, 256.0f, 512.0f, 512.0f}};
+
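+// Default masks: indices into the anchor list selecting which anchor pairs each
+// output scale uses. Empty for YOLOv1/v2/v3, whose RegionYolo ops carry their own masks.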
+const std::vector<int64_t> defaultMasks[] = {
+ // YOLOv1v2
+ {},
+ // YOLOv3
+ {},
+ // YOLOv4
+ {0, 1, 2, 3, 4, 5, 6, 7, 8},
+ // YOLOv4_Tiny
+ {1, 2, 3, 3, 4, 5},
+ // YOLOF
+ {0, 1, 2, 3, 4, 5}};
+
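+// Activation applied to raw outputs during decoding: YOLOv4, YOLOv4-tiny and YOLOF
+// emit raw logits, so scores and offsets go through a sigmoid, while YOLOv1/v2/v3
+// outputs are assumed to be already activated by the RegionYolo op, so the identity
+// function is used instead.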
+static inline float sigmoid(float x) {
+ return 1.f / (1.f + std::exp(-x));
+}
+
+static inline float linear(float x) {
+ return x;
+}
+
+ModelYolo::ModelYolo(const std::string& modelFileName,
+ float confidenceThreshold,
+ bool useAutoResize,
+ bool useAdvancedPostprocessing,
+ float boxIOUThreshold,
+ const std::vector<std::string>& labels,
+ const std::vector<float>& anchors,
+ const std::vector<int64_t>& masks,
+ const std::string& layout)
+ : DetectionModel(modelFileName, confidenceThreshold, useAutoResize, labels, layout),
+ boxIOUThreshold(boxIOUThreshold),
+ useAdvancedPostprocessing(useAdvancedPostprocessing),
+ yoloVersion(YOLO_V3),
+ presetAnchors(anchors),
+ presetMasks(masks) {}
+
+void ModelYolo::prepareInputsOutputs(std::shared_ptr<ov::Model>& model) {
+ // --------------------------- Configure input & output -------------------------------------------------
+ // --------------------------- Prepare input ------------------------------------------------------
+ if (model->inputs().size() != 1) {
+ throw std::logic_error("YOLO model wrapper accepts models that have only 1 input");
+ }
+
+ const auto& input = model->input();
+ const ov::Shape& inputShape = model->input().get_shape();
+ ov::Layout inputLayout = getInputLayout(input);
+
+ if (inputShape[ov::layout::channels_idx(inputLayout)] != 3) {
+ throw std::logic_error("Expected 3-channel input");
+ }
+
+ ov::preprocess::PrePostProcessor ppp(model);
+ ppp.input().tensor().set_element_type(ov::element::u8).set_layout({"NHWC"});
+
+ if (useAutoResize) {
+ ppp.input().tensor().set_spatial_dynamic_shape();
+
+ ppp.input()
+ .preprocess()
+ .convert_element_type(ov::element::f32)
+ .resize(ov::preprocess::ResizeAlgorithm::RESIZE_LINEAR);
+ }
+
+ ppp.input().model().set_layout(inputLayout);
+
+ //--- Reading image input parameters
+ inputsNames.push_back(model->input().get_any_name());
+ netInputWidth = inputShape[ov::layout::width_idx(inputLayout)];
+ netInputHeight = inputShape[ov::layout::height_idx(inputLayout)];
+
+ // --------------------------- Prepare output -----------------------------------------------------
+ const ov::OutputVector& outputs = model->outputs();
+ std::map<std::string, ov::Shape> outShapes;
+ for (auto& out : outputs) {
+ ppp.output(out.get_any_name()).tensor().set_element_type(ov::element::f32);
+ if (out.get_shape().size() == 4) {
+ if (out.get_shape()[ov::layout::height_idx("NCHW")] != out.get_shape()[ov::layout::width_idx("NCHW")] &&
+ out.get_shape()[ov::layout::height_idx("NHWC")] == out.get_shape()[ov::layout::width_idx("NHWC")]) {
+ ppp.output(out.get_any_name()).model().set_layout("NHWC");
+ // outShapes are saved before ppp.build(), so record yoloRegionLayout as it appears in the model before ppp.build()
+ yoloRegionLayout = "NHWC";
+ }
+ // yolo-v1-tiny-tf produces a 2D output of shape [1, 21125], so the layout is set only for 4-dimensional tensors
+ ppp.output(out.get_any_name()).tensor().set_layout("NCHW");
+ }
+ outputsNames.push_back(out.get_any_name());
+ outShapes[out.get_any_name()] = out.get_shape();
+ }
+ model = ppp.build();
+
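+ // Auto-detect the YOLO flavour: if the graph contains RegionYolo ops the model is
+ // YOLOv3 (or YOLOv1/v2 when the op has no mask); otherwise the version is inferred
+ // below from the number of output tensors.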
+ yoloVersion = YOLO_V3;
+ bool isRegionFound = false;
+ for (const auto& op : model->get_ordered_ops()) {
+ if (std::string("RegionYolo") == op->get_type_name()) {
+ auto regionYolo = std::dynamic_pointer_cast<ov::op::v0::RegionYolo>(op);
+
+ if (regionYolo) {
+ if (!regionYolo->get_mask().size()) {
+ yoloVersion = YOLO_V1V2;
+ }
+
+ const auto& opName = op->get_friendly_name();
+ for (const auto& out : outputs) {
+ if (out.get_node()->get_friendly_name() == opName ||
+ out.get_node()->get_input_node_ptr(0)->get_friendly_name() == opName) {
+ isRegionFound = true;
+ regions.emplace(out.get_any_name(), Region(regionYolo));
+ }
+ }
+ }
+ }
+ }
+
+ if (!isRegionFound) {
+ switch (outputsNames.size()) {
+ case 1:
+ yoloVersion = YOLOF;
+ break;
+ case 2:
+ yoloVersion = YOLO_V4_TINY;
+ break;
+ case 3:
+ yoloVersion = YOLO_V4;
+ break;
+ }
+
+ int num = yoloVersion == YOLOF ? 6 : 3;
+ isObjConf = yoloVersion == YOLOF ? 0 : 1;
+ int i = 0;
+
+ auto chosenMasks = presetMasks.size() ? presetMasks : defaultMasks[yoloVersion];
+ if (chosenMasks.size() != num * outputs.size()) {
+ throw std::runtime_error(std::string("Invalid size of masks array, got ") +
+ std::to_string(chosenMasks.size()) + ", should be " +
+ std::to_string(num * outputs.size()));
+ }
+
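+ // Sort outputs by grid height (largest first) so that each slice of the mask
+ // array lines up with the matching output scale.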
+ std::sort(outputsNames.begin(),
+ outputsNames.end(),
+ [&outShapes, this](const std::string& x, const std::string& y) {
+ return outShapes[x][ov::layout::height_idx(yoloRegionLayout)] >
+ outShapes[y][ov::layout::height_idx(yoloRegionLayout)];
+ });
+
+ for (const auto& name : outputsNames) {
+ const auto& shape = outShapes[name];
+ if (shape[ov::layout::channels_idx(yoloRegionLayout)] % num != 0) {
+ throw std::logic_error(std::string("Output tensor ") + name + " has wrong channel dimension");
+ }
+ regions.emplace(
+ name,
+ Region(shape[ov::layout::channels_idx(yoloRegionLayout)] / num - 4 - (isObjConf ? 1 : 0),
+ 4,
+ presetAnchors.size() ? presetAnchors : defaultAnchors[yoloVersion],
+ std::vector<int64_t>(chosenMasks.begin() + i * num, chosenMasks.begin() + (i + 1) * num),
+ shape[ov::layout::width_idx(yoloRegionLayout)],
+ shape[ov::layout::height_idx(yoloRegionLayout)]));
+ i++;
+ }
+ } else {
+ // Currently externally set anchors and masks are supported only for YoloV4
+ if (presetAnchors.size() || presetMasks.size()) {
+ slog::warn << "Preset anchors and mask can be set for YoloV4 model only. "
+ "This model is not YoloV4, so these options will be ignored."
+ << slog::endl;
+ }
+ }
+}
+
+std::unique_ptr<ResultBase> ModelYolo::postprocess(InferenceResult& infResult) {
+ DetectionResult* result = new DetectionResult(infResult.frameId, infResult.metaData);
+ std::vector<DetectedObject> objects;
+
+ // Parsing outputs
+ const auto& internalData = infResult.internalModelData->asRef<InternalImageModelData>();
+
+ for (auto& output : infResult.outputsData) {
+ this->parseYOLOOutput(output.first,
+ output.second,
+ netInputHeight,
+ netInputWidth,
+ internalData.inputImgHeight,
+ internalData.inputImgWidth,
+ objects);
+ }
+
+ if (useAdvancedPostprocessing) {
+ // Advanced postprocessing
+ // Checking IOU threshold conformance
+ // For every object, find all overlapping objects of the same class and compare confidences;
+ // an object is kept only if no such overlapping object has a higher confidence
+ for (const auto& obj1 : objects) {
+ bool isGoodResult = true;
+ for (const auto& obj2 : objects) {
+ if (obj1.labelID == obj2.labelID && obj1.confidence < obj2.confidence &&
+ intersectionOverUnion(obj1, obj2) >= boxIOUThreshold) { // if obj1 is the same as obj2, condition
+ // expression will evaluate to false anyway
+ isGoodResult = false;
+ break;
+ }
+ }
+ if (isGoodResult) {
+ result->objects.push_back(obj1);
+ }
+ }
+ } else {
+ // Classic postprocessing
+ std::sort(objects.begin(), objects.end(), [](const DetectedObject& x, const DetectedObject& y) {
+ return x.confidence > y.confidence;
+ });
+ for (size_t i = 0; i < objects.size(); ++i) {
+ if (objects[i].confidence == 0)
+ continue;
+ for (size_t j = i + 1; j < objects.size(); ++j)
+ if (intersectionOverUnion(objects[i], objects[j]) >= boxIOUThreshold)
+ objects[j].confidence = 0;
+ result->objects.push_back(objects[i]);
+ }
+ }
+
+ return std::unique_ptr<ResultBase>(result);
+}
+
+void ModelYolo::parseYOLOOutput(const std::string& output_name,
+ const ov::Tensor& tensor,
+ const unsigned long resized_im_h,
+ const unsigned long resized_im_w,
+ const unsigned long original_im_h,
+ const unsigned long original_im_w,
+ std::vector<DetectedObject>& objects) {
+ // --------------------------- Extracting layer parameters -------------------------------------
+ auto it = regions.find(output_name);
+ if (it == regions.end()) {
+ throw std::runtime_error(std::string("Can't find output layer with name ") + output_name);
+ }
+ auto& region = it->second;
+
+ int sideW = 0;
+ int sideH = 0;
+ unsigned long scaleH = 0;
+ unsigned long scaleW = 0;
+ switch (yoloVersion) {
+ case YOLO_V1V2:
+ sideH = region.outputHeight;
+ sideW = region.outputWidth;
+ scaleW = region.outputWidth;
+ scaleH = region.outputHeight;
+ break;
+ case YOLO_V3:
+ case YOLO_V4:
+ case YOLO_V4_TINY:
+ case YOLOF:
+ sideH = static_cast<int>(tensor.get_shape()[ov::layout::height_idx("NCHW")]);
+ sideW = static_cast<int>(tensor.get_shape()[ov::layout::width_idx("NCHW")]);
+ scaleW = resized_im_w;
+ scaleH = resized_im_h;
+ break;
+ }
+
+ auto entriesNum = sideW * sideH;
+ const float* outData = tensor.data<float>();
+
+ auto postprocessRawData =
+ (yoloVersion == YOLO_V4 || yoloVersion == YOLO_V4_TINY || yoloVersion == YOLOF) ? sigmoid : linear;
+
+ // --------------------------- Parsing YOLO Region output -------------------------------------
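+ // Each grid cell holds region.num anchor boxes; for every anchor the tensor stores
+ // [x, y, w, h, (objectness), class scores...] planes of sideW * sideH values each.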
+ for (int i = 0; i < entriesNum; ++i) {
+ int row = i / sideW;
+ int col = i % sideW;
+ for (int n = 0; n < region.num; ++n) {
+ //--- Getting region data
+ int obj_index = calculateEntryIndex(entriesNum,
+ region.coords,
+ region.classes + isObjConf,
+ n * entriesNum + i,
+ region.coords);
+ int box_index =
+ calculateEntryIndex(entriesNum, region.coords, region.classes + isObjConf, n * entriesNum + i, 0);
+ float scale = isObjConf ? postprocessRawData(outData[obj_index]) : 1;
+
+ //--- Preliminary check for confidence threshold conformance
+ if (scale >= confidenceThreshold) {
+ //--- Calculating scaled region's coordinates
+ float x, y;
+ if (yoloVersion == YOLOF) {
+ x = (static_cast<float>(col) / sideW +
+ outData[box_index + 0 * entriesNum] * region.anchors[2 * n] / scaleW) *
+ original_im_w;
+ y = (static_cast<float>(row) / sideH +
+ outData[box_index + 1 * entriesNum] * region.anchors[2 * n + 1] / scaleH) *
+ original_im_h;
+ } else {
+ x = static_cast<float>((col + postprocessRawData(outData[box_index + 0 * entriesNum])) / sideW *
+ original_im_w);
+ y = static_cast<float>((row + postprocessRawData(outData[box_index + 1 * entriesNum])) / sideH *
+ original_im_h);
+ }
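+ //--- Box size is decoded as exp(raw) * anchor, then mapped to the original image resolution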
+ float height = static_cast<float>(std::exp(outData[box_index + 3 * entriesNum]) *
+ region.anchors[2 * n + 1] * original_im_h / scaleH);
+ float width = static_cast<float>(std::exp(outData[box_index + 2 * entriesNum]) * region.anchors[2 * n] *
+ original_im_w / scaleW);
+
+ DetectedObject obj;
+ obj.x = clamp(x - width / 2, 0.f, static_cast<float>(original_im_w));
+ obj.y = clamp(y - height / 2, 0.f, static_cast<float>(original_im_h));
+ obj.width = clamp(width, 0.f, static_cast<float>(original_im_w - obj.x));
+ obj.height = clamp(height, 0.f, static_cast<float>(original_im_h - obj.y));
+
+ for (size_t j = 0; j < region.classes; ++j) {
+ int class_index = calculateEntryIndex(entriesNum,
+ region.coords,
+ region.classes + isObjConf,
+ n * entriesNum + i,
+ region.coords + isObjConf + j);
+ float prob = scale * postprocessRawData(outData[class_index]);
+
+ //--- Checking confidence threshold conformance and adding region to the list
+ if (prob >= confidenceThreshold) {
+ obj.confidence = prob;
+ obj.labelID = j;
+ obj.label = getLabelName(obj.labelID);
+ objects.push_back(obj);
+ }
+ }
+ }
+ }
+ }
+}
+
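+// Computes the flat offset of one entry in a planar (channels-first) region output:
+// the data is laid out as `num` blocks of (lcoords + lclasses) planes, each plane
+// holding totalCells = sideW * sideH values. `location` encodes the anchor and cell
+// indices, `entry` selects the plane within the block. For example, with a 13x13 grid
+// (totalCells = 169), 4 coords and 80 classes plus objectness, anchor n = 1 at cell 5
+// and entry 4 (objectness) yields (1 * 85 + 4) * 169 + 5.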
+int ModelYolo::calculateEntryIndex(int totalCells, int lcoords, size_t lclasses, int location, int entry) {
+ int n = location / totalCells;
+ int loc = location % totalCells;
+ return (n * (lcoords + lclasses) + entry) * totalCells + loc;
+}
+
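+// Standard IoU: intersection area of the two axis-aligned boxes divided by the area of their union.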
+double ModelYolo::intersectionOverUnion(const DetectedObject& o1, const DetectedObject& o2) {
+ double overlappingWidth = fmin(o1.x + o1.width, o2.x + o2.width) - fmax(o1.x, o2.x);
+ double overlappingHeight = fmin(o1.y + o1.height, o2.y + o2.height) - fmax(o1.y, o2.y);
+ double intersectionArea =
+ (overlappingWidth < 0 || overlappingHeight < 0) ? 0 : overlappingHeight * overlappingWidth;
+ double unionArea = o1.width * o1.height + o2.width * o2.height - intersectionArea;
+ return intersectionArea / unionArea;
+}
+
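+// Builds a Region description from a RegionYolo op found in the graph (YOLOv1/v2/v3):
+// a non-empty mask means YOLOv3-style per-scale anchors; otherwise the op's
+// num_regions and full anchor set (or the defaults) are used.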
+ModelYolo::Region::Region(const std::shared_ptr<ov::op::v0::RegionYolo>& regionYolo) {
+ coords = regionYolo->get_num_coords();
+ classes = regionYolo->get_num_classes();
+ auto mask = regionYolo->get_mask();
+ num = mask.size();
+
+ auto shape = regionYolo->get_input_shape(0);
+ outputWidth = shape[3];
+ outputHeight = shape[2];
+
+ if (num) {
+ // Parsing YoloV3 parameters
+ anchors.resize(num * 2);
+
+ for (int i = 0; i < num; ++i) {
+ anchors[i * 2] = regionYolo->get_anchors()[mask[i] * 2];
+ anchors[i * 2 + 1] = regionYolo->get_anchors()[mask[i] * 2 + 1];
+ }
+ } else {
+ // Parsing YoloV2 parameters
+ num = regionYolo->get_num_regions();
+ anchors = regionYolo->get_anchors();
+ if (anchors.empty()) {
+ anchors = defaultAnchors[YOLO_V1V2];
+ num = 5;
+ }
+ }
+}
+
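+// Builds a Region description for models without RegionYolo ops (YOLOv4, YOLOv4-tiny,
+// YOLOF), using anchors and masks supplied by the caller or the per-version defaults.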
+ModelYolo::Region::Region(size_t classes,
+ int coords,
+ const std::vector<float>& anchors,
+ const std::vector<int64_t>& masks,
+ size_t outputWidth,
+ size_t outputHeight)
+ : classes(classes),
+ coords(coords),
+ outputWidth(outputWidth),
+ outputHeight(outputHeight) {
+ num = masks.size();
+
+ if (anchors.size() == 0 || anchors.size() % 2 != 0) {
+ throw std::runtime_error("Explicitly initialized region should have a non-empty, even-sized anchors vector");
+ }
+
+ if (num) {
+ this->anchors.resize(num * 2);
+
+ for (int i = 0; i < num; ++i) {
+ this->anchors[i * 2] = anchors[masks[i] * 2];
+ this->anchors[i * 2 + 1] = anchors[masks[i] * 2 + 1];
+ }
+ } else {
+ this->anchors = anchors;
+ num = anchors.size() / 2;
+ }
+}