1 files changed, 481 insertions, 0 deletions
diff --git a/python/openvino/runtime/common/models/src/detection_model_yolo.cpp b/python/openvino/runtime/common/models/src/detection_model_yolo.cpp
new file mode 100644
index 0000000..2c4fb1d
--- /dev/null
+++ b/python/openvino/runtime/common/models/src/detection_model_yolo.cpp
@@ -0,0 +1,481 @@
+/*
+// Copyright (C) 2020-2022 Intel Corporation
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//      http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+*/
+
+#include "models/detection_model_yolo.h"
+
+#include <algorithm>
+#include <cmath>
+#include <cstdint>
+#include <stdexcept>
+#include <string>
+#include <utility>
+#include <vector>
+
+#include <openvino/openvino.hpp>
+
+#include <utils/common.hpp>
+#include <utils/slog.hpp>
+
+#include "models/internal_model_data.h"
+#include "models/results.h"
+
+std::vector<float> defaultAnchors[] = {
+    // YOLOv1v2
+    {0.57273f, 0.677385f, 1.87446f, 2.06253f, 3.33843f, 5.47434f, 7.88282f, 3.52778f, 9.77052f, 9.16828f},
+    // YOLOv3
+    {10.0f,
+     13.0f,
+     16.0f,
+     30.0f,
+     33.0f,
+     23.0f,
+     30.0f,
+     61.0f,
+     62.0f,
+     45.0f,
+     59.0f,
+     119.0f,
+     116.0f,
+     90.0f,
+     156.0f,
+     198.0f,
+     373.0f,
+     326.0f},
+    // YOLOv4
+    {12.0f,
+     16.0f,
+     19.0f,
+     36.0f,
+     40.0f,
+     28.0f,
+     36.0f,
+     75.0f,
+     76.0f,
+     55.0f,
+     72.0f,
+     146.0f,
+     142.0f,
+     110.0f,
+     192.0f,
+     243.0f,
+     459.0f,
+     401.0f},
+    // YOLOv4_Tiny
+    {10.0f, 14.0f, 23.0f, 27.0f, 37.0f, 58.0f, 81.0f, 82.0f, 135.0f, 169.0f, 344.0f, 319.0f},
+    // YOLOF
+    {16.0f, 16.0f, 32.0f, 32.0f, 64.0f, 64.0f, 128.0f, 128.0f, 256.0f, 256.0f, 512.0f, 512.0f}};
+
+const std::vector<int64_t> defaultMasks[] = {
+    // YOLOv1v2
+    {},
+    // YOLOv3
+    {},
+    // YOLOv4
+    {0, 1, 2, 3, 4, 5, 6, 7, 8},
+    // YOLOv4_Tiny
+    {1, 2, 3, 3, 4, 5},
+    // YOLOF
+    {0, 1, 2, 3, 4, 5}};
+
+static inline float sigmoid(float x) {
+    return 1.f / (1.f + exp(-x));
+}
+
+static inline float linear(float x) {
+    return x;
+}
+
+ModelYolo::ModelYolo(const std::string& modelFileName,
+                     float confidenceThreshold,
+                     bool useAutoResize,
+                     bool useAdvancedPostprocessing,
+                     float boxIOUThreshold,
+                     const std::vector<std::string>& labels,
+                     const std::vector<float>& anchors,
+                     const std::vector<int64_t>& masks,
+                     const std::string& layout)
+    : DetectionModel(modelFileName, confidenceThreshold, useAutoResize, labels, layout),
+      boxIOUThreshold(boxIOUThreshold),
+      useAdvancedPostprocessing(useAdvancedPostprocessing),
+      yoloVersion(YOLO_V3),
+      presetAnchors(anchors),
+      presetMasks(masks) {}
+
+void ModelYolo::prepareInputsOutputs(std::shared_ptr<ov::Model>& model) {
+    // --------------------------- Configure input & output -------------------------------------------------
+    // --------------------------- Prepare input  ------------------------------------------------------
+    if (model->inputs().size() != 1) {
+        throw std::logic_error("YOLO model wrapper accepts models that have only 1 input");
+    }
+
+    const auto& input = model->input();
+    const ov::Shape& inputShape = model->input().get_shape();
+    ov::Layout inputLayout = getInputLayout(input);
+
+    if (inputShape[ov::layout::channels_idx(inputLayout)] != 3) {
+        throw std::logic_error("Expected 3-channel input");
+    }
+
+    ov::preprocess::PrePostProcessor ppp(model);
+    ppp.input().tensor().set_element_type(ov::element::u8).set_layout({"NHWC"});
+
+    if (useAutoResize) {
+        ppp.input().tensor().set_spatial_dynamic_shape();
+
+        ppp.input()
+            .preprocess()
+            .convert_element_type(ov::element::f32)
+            .resize(ov::preprocess::ResizeAlgorithm::RESIZE_LINEAR);
+    }
+
+    ppp.input().model().set_layout(inputLayout);
+
+    //--- Reading image input parameters
+    inputsNames.push_back(model->input().get_any_name());
+    netInputWidth = inputShape[ov::layout::width_idx(inputLayout)];
+    netInputHeight = inputShape[ov::layout::height_idx(inputLayout)];
+
+    // --------------------------- Prepare output  -----------------------------------------------------
+    const ov::OutputVector& outputs = model->outputs();
+    std::map<std::string, ov::Shape> outShapes;
+    for (auto& out : outputs) {
+        ppp.output(out.get_any_name()).tensor().set_element_type(ov::element::f32);
+        if (out.get_shape().size() == 4) {
+            if (out.get_shape()[ov::layout::height_idx("NCHW")] != out.get_shape()[ov::layout::width_idx("NCHW")] &&
+                out.get_shape()[ov::layout::height_idx("NHWC")] == out.get_shape()[ov::layout::width_idx("NHWC")]) {
+                ppp.output(out.get_any_name()).model().set_layout("NHWC");
+                // outShapes are saved before ppp.build() thus set yoloRegionLayout as it is in model before ppp.build()
+                yoloRegionLayout = "NHWC";
+            }
+            // yolo-v1-tiny-tf out shape is [1, 21125] thus set layout only for 4 dim tensors
+            ppp.output(out.get_any_name()).tensor().set_layout("NCHW");
+        }
+        outputsNames.push_back(out.get_any_name());
+        outShapes[out.get_any_name()] = out.get_shape();
+    }
+    model = ppp.build();
+
+    yoloVersion = YOLO_V3;
+    bool isRegionFound = false;
+    for (const auto& op : model->get_ordered_ops()) {
+        if (std::string("RegionYolo") == op->get_type_name()) {
+            auto regionYolo = std::dynamic_pointer_cast<ov::op::v0::RegionYolo>(op);
+
+            if (regionYolo) {
+                if (!regionYolo->get_mask().size()) {
+                    yoloVersion = YOLO_V1V2;
+                }
+
+                const auto& opName = op->get_friendly_name();
+                for (const auto& out : outputs) {
+                    if (out.get_node()->get_friendly_name() == opName ||
+                        out.get_node()->get_input_node_ptr(0)->get_friendly_name() == opName) {
+                        isRegionFound = true;
+                        regions.emplace(out.get_any_name(), Region(regionYolo));
+                    }
+                }
+            }
+        }
+    }
+
+    if (!isRegionFound) {
+        switch (outputsNames.size()) {
+            case 1:
+                yoloVersion = YOLOF;
+                break;
+            case 2:
+                yoloVersion = YOLO_V4_TINY;
+                break;
+            case 3:
+                yoloVersion = YOLO_V4;
+                break;
+        }
+
+        int num = yoloVersion == YOLOF ? 6 : 3;
+        isObjConf = yoloVersion == YOLOF ? 0 : 1;
+        int i = 0;
+
+        auto chosenMasks = presetMasks.size() ? presetMasks : defaultMasks[yoloVersion];
+        if (chosenMasks.size() != num * outputs.size()) {
+            throw std::runtime_error(std::string("Invalid size of masks array, got ") +
+                                     std::to_string(presetMasks.size()) + ", should be " +
+                                     std::to_string(num * outputs.size()));
+        }
+
+        std::sort(outputsNames.begin(),
+                  outputsNames.end(),
+                  [&outShapes, this](const std::string& x, const std::string& y) {
+                      return outShapes[x][ov::layout::height_idx(yoloRegionLayout)] >
+                             outShapes[y][ov::layout::height_idx(yoloRegionLayout)];
+                  });
+
+        for (const auto& name : outputsNames) {
+            const auto& shape = outShapes[name];
+            if (shape[ov::layout::channels_idx(yoloRegionLayout)] % num != 0) {
+                throw std::logic_error(std::string("Output tensor ") + name + " has wrong channel dimension");
+            }
+            regions.emplace(
+                name,
+                Region(shape[ov::layout::channels_idx(yoloRegionLayout)] / num - 4 - (isObjConf ? 1 : 0),
+                       4,
+                       presetAnchors.size() ? presetAnchors : defaultAnchors[yoloVersion],
+                       std::vector<int64_t>(chosenMasks.begin() + i * num, chosenMasks.begin() + (i + 1) * num),
+                       shape[ov::layout::width_idx(yoloRegionLayout)],
+                       shape[ov::layout::height_idx(yoloRegionLayout)]));
+            i++;
+        }
+    } else {
+        // Currently externally set anchors and masks are supported only for YoloV4
+        if (presetAnchors.size() || presetMasks.size()) {
+            slog::warn << "Preset anchors and mask can be set for YoloV4 model only. "
+                          "This model is not YoloV4, so these options will be ignored."
+                       << slog::endl;
+        }
+    }
+}
+
+std::unique_ptr<ResultBase> ModelYolo::postprocess(InferenceResult& infResult) {
+    DetectionResult* result = new DetectionResult(infResult.frameId, infResult.metaData);
+    std::vector<DetectedObject> objects;
+
+    // Parsing outputs
+    const auto& internalData = infResult.internalModelData->asRef<InternalImageModelData>();
+
+    for (auto& output : infResult.outputsData) {
+        this->parseYOLOOutput(output.first,
+                              output.second,
+                              netInputHeight,
+                              netInputWidth,
+                              internalData.inputImgHeight,
+                              internalData.inputImgWidth,
+                              objects);
+    }
+
+    if (useAdvancedPostprocessing) {
+        // Advanced postprocessing
+        // Checking IOU threshold conformance
+        // For every i-th object we're finding all objects it intersects with, and comparing confidence
+        // If i-th object has greater confidence than all others, we include it into result
+        for (const auto& obj1 : objects) {
+            bool isGoodResult = true;
+            for (const auto& obj2 : objects) {
+                if (obj1.labelID == obj2.labelID && obj1.confidence < obj2.confidence &&
+                    intersectionOverUnion(obj1, obj2) >= boxIOUThreshold) {  // if obj1 is the same as obj2, condition
+                                                                             // expression will evaluate to false anyway
+                    isGoodResult = false;
+                    break;
+                }
+            }
+            if (isGoodResult) {
+                result->objects.push_back(obj1);
+            }
+        }
+    } else {
+        // Classic postprocessing
+        std::sort(objects.begin(), objects.end(), [](const DetectedObject& x, const DetectedObject& y) {
+            return x.confidence > y.confidence;
+        });
+        for (size_t i = 0; i < objects.size(); ++i) {
+            if (objects[i].confidence == 0)
+                continue;
+            for (size_t j = i + 1; j < objects.size(); ++j)
+                if (intersectionOverUnion(objects[i], objects[j]) >= boxIOUThreshold)
+                    objects[j].confidence = 0;
+            result->objects.push_back(objects[i]);
+        }
+    }
+
+    return std::unique_ptr<ResultBase>(result);
+}
+
+void ModelYolo::parseYOLOOutput(const std::string& output_name,
+                                const ov::Tensor& tensor,
+                                const unsigned long resized_im_h,
+                                const unsigned long resized_im_w,
+                                const unsigned long original_im_h,
+                                const unsigned long original_im_w,
+                                std::vector<DetectedObject>& objects) {
+    // --------------------------- Extracting layer parameters -------------------------------------
+    auto it = regions.find(output_name);
+    if (it == regions.end()) {
+        throw std::runtime_error(std::string("Can't find output layer with name ") + output_name);
+    }
+    auto& region = it->second;
+
+    int sideW = 0;
+    int sideH = 0;
+    unsigned long scaleH;
+    unsigned long scaleW;
+    switch (yoloVersion) {
+        case YOLO_V1V2:
+            sideH = region.outputHeight;
+            sideW = region.outputWidth;
+            scaleW = region.outputWidth;
+            scaleH = region.outputHeight;
+            break;
+        case YOLO_V3:
+        case YOLO_V4:
+        case YOLO_V4_TINY:
+        case YOLOF:
+            sideH = static_cast<int>(tensor.get_shape()[ov::layout::height_idx("NCHW")]);
+            sideW = static_cast<int>(tensor.get_shape()[ov::layout::width_idx("NCHW")]);
+            scaleW = resized_im_w;
+            scaleH = resized_im_h;
+            break;
+    }
+
+    auto entriesNum = sideW * sideH;
+    const float* outData = tensor.data<float>();
+
+    auto postprocessRawData =
+        (yoloVersion == YOLO_V4 || yoloVersion == YOLO_V4_TINY || yoloVersion == YOLOF) ? sigmoid : linear;
+
+    // --------------------------- Parsing YOLO Region output -------------------------------------
+    for (int i = 0; i < entriesNum; ++i) {
+        int row = i / sideW;
+        int col = i % sideW;
+        for (int n = 0; n < region.num; ++n) {
+            //--- Getting region data
+            int obj_index = calculateEntryIndex(entriesNum,
+                                                region.coords,
+                                                region.classes + isObjConf,
+                                                n * entriesNum + i,
+                                                region.coords);
+            int box_index =
+                calculateEntryIndex(entriesNum, region.coords, region.classes + isObjConf, n * entriesNum + i, 0);
+            float scale = isObjConf ? postprocessRawData(outData[obj_index]) : 1;
+
+            //--- Preliminary check for confidence threshold conformance
+            if (scale >= confidenceThreshold) {
+                //--- Calculating scaled region's coordinates
+                float x, y;
+                if (yoloVersion == YOLOF) {
+                    x = (static_cast<float>(col) / sideW +
+                         outData[box_index + 0 * entriesNum] * region.anchors[2 * n] / scaleW) *
+                        original_im_w;
+                    y = (static_cast<float>(row) / sideH +
+                         outData[box_index + 1 * entriesNum] * region.anchors[2 * n + 1] / scaleH) *
+                        original_im_h;
+                } else {
+                    x = static_cast<float>((col + postprocessRawData(outData[box_index + 0 * entriesNum])) / sideW *
+                                           original_im_w);
+                    y = static_cast<float>((row + postprocessRawData(outData[box_index + 1 * entriesNum])) / sideH *
+                                           original_im_h);
+                }
+                float height = static_cast<float>(std::exp(outData[box_index + 3 * entriesNum]) *
+                                                  region.anchors[2 * n + 1] * original_im_h / scaleH);
+                float width = static_cast<float>(std::exp(outData[box_index + 2 * entriesNum]) * region.anchors[2 * n] *
+                                                 original_im_w / scaleW);
+
+                DetectedObject obj;
+                obj.x = clamp(x - width / 2, 0.f, static_cast<float>(original_im_w));
+                obj.y = clamp(y - height / 2, 0.f, static_cast<float>(original_im_h));
+                obj.width = clamp(width, 0.f, static_cast<float>(original_im_w - obj.x));
+                obj.height = clamp(height, 0.f, static_cast<float>(original_im_h - obj.y));
+
+                for (size_t j = 0; j < region.classes; ++j) {
+                    int class_index = calculateEntryIndex(entriesNum,
+                                                          region.coords,
+                                                          region.classes + isObjConf,
+                                                          n * entriesNum + i,
+                                                          region.coords + isObjConf + j);
+                    float prob = scale * postprocessRawData(outData[class_index]);
+
+                    //--- Checking confidence threshold conformance and adding region to the list
+                    if (prob >= confidenceThreshold) {
+                        obj.confidence = prob;
+                        obj.labelID = j;
+                        obj.label = getLabelName(obj.labelID);
+                        objects.push_back(obj);
+                    }
+                }
+            }
+        }
+    }
+}
+
+int ModelYolo::calculateEntryIndex(int totalCells, int lcoords, size_t lclasses, int location, int entry) {
+    int n = location / totalCells;
+    int loc = location % totalCells;
+    return (n * (lcoords + lclasses) + entry) * totalCells + loc;
+}
+
+double ModelYolo::intersectionOverUnion(const DetectedObject& o1, const DetectedObject& o2) {
+    double overlappingWidth = fmin(o1.x + o1.width, o2.x + o2.width) - fmax(o1.x, o2.x);
+    double overlappingHeight = fmin(o1.y + o1.height, o2.y + o2.height) - fmax(o1.y, o2.y);
+    double intersectionArea =
+        (overlappingWidth < 0 || overlappingHeight < 0) ? 0 : overlappingHeight * overlappingWidth;
+    double unionArea = o1.width * o1.height + o2.width * o2.height - intersectionArea;
+    return intersectionArea / unionArea;
+}
+
+ModelYolo::Region::Region(const std::shared_ptr<ov::op::v0::RegionYolo>& regionYolo) {
+    coords = regionYolo->get_num_coords();
+    classes = regionYolo->get_num_classes();
+    auto mask = regionYolo->get_mask();
+    num = mask.size();
+
+    auto shape = regionYolo->get_input_shape(0);
+    outputWidth = shape[3];
+    outputHeight = shape[2];
+
+    if (num) {
+        // Parsing YoloV3 parameters
+        anchors.resize(num * 2);
+
+        for (int i = 0; i < num; ++i) {
+            anchors[i * 2] = regionYolo->get_anchors()[mask[i] * 2];
+            anchors[i * 2 + 1] = regionYolo->get_anchors()[mask[i] * 2 + 1];
+        }
+    } else {
+        // Parsing YoloV2 parameters
+        num = regionYolo->get_num_regions();
+        anchors = regionYolo->get_anchors();
+        if (anchors.empty()) {
+            anchors = defaultAnchors[YOLO_V1V2];
+            num = 5;
+        }
+    }
+}
+
+ModelYolo::Region::Region(size_t classes,
+                          int coords,
+                          const std::vector<float>& anchors,
+                          const std::vector<int64_t>& masks,
+                          size_t outputWidth,
+                          size_t outputHeight)
+    : classes(classes),
+      coords(coords),
+      outputWidth(outputWidth),
+      outputHeight(outputHeight) {
+    num = masks.size();
+
+    if (anchors.size() == 0 || anchors.size() % 2 != 0) {
+        throw std::runtime_error("Explicitly initialized region should have non-empty even-sized regions vector");
+    }
+
+    if (num) {
+        this->anchors.resize(num * 2);
+
+        for (int i = 0; i < num; ++i) {
+            this->anchors[i * 2] = anchors[masks[i] * 2];
+            this->anchors[i * 2 + 1] = anchors[masks[i] * 2 + 1];
+        }
+    } else {
+        this->anchors = anchors;
+        num = anchors.size() / 2;
+    }
+}