path: root/python/openvino/runtime/common/models
Diffstat (limited to 'python/openvino/runtime/common/models')
-rw-r--r--  python/openvino/runtime/common/models/CMakeLists.txt | 15
-rw-r--r--  python/openvino/runtime/common/models/include/models/associative_embedding_decoder.h | 94
-rw-r--r--  python/openvino/runtime/common/models/include/models/classification_model.h | 57
-rw-r--r--  python/openvino/runtime/common/models/include/models/deblurring_model.h | 52
-rw-r--r--  python/openvino/runtime/common/models/include/models/detection_model.h | 51
-rw-r--r--  python/openvino/runtime/common/models/include/models/detection_model_centernet.h | 59
-rw-r--r--  python/openvino/runtime/common/models/include/models/detection_model_faceboxes.h | 55
-rw-r--r--  python/openvino/runtime/common/models/include/models/detection_model_retinaface.h | 74
-rw-r--r--  python/openvino/runtime/common/models/include/models/detection_model_retinaface_pt.h | 81
-rw-r--r--  python/openvino/runtime/common/models/include/models/detection_model_ssd.h | 63
-rw-r--r--  python/openvino/runtime/common/models/include/models/detection_model_yolo.h | 107
-rw-r--r--  python/openvino/runtime/common/models/include/models/detection_model_yolov3_onnx.h | 50
-rw-r--r--  python/openvino/runtime/common/models/include/models/detection_model_yolox.h | 54
-rw-r--r--  python/openvino/runtime/common/models/include/models/hpe_model_associative_embedding.h | 89
-rw-r--r--  python/openvino/runtime/common/models/include/models/hpe_model_openpose.h | 78
-rw-r--r--  python/openvino/runtime/common/models/include/models/image_model.h | 49
-rw-r--r--  python/openvino/runtime/common/models/include/models/input_data.h | 41
-rw-r--r--  python/openvino/runtime/common/models/include/models/internal_model_data.h | 48
-rw-r--r--  python/openvino/runtime/common/models/include/models/jpeg_restoration_model.h | 55
-rw-r--r--  python/openvino/runtime/common/models/include/models/model_base.h | 77
-rw-r--r--  python/openvino/runtime/common/models/include/models/openpose_decoder.h | 62
-rw-r--r--  python/openvino/runtime/common/models/include/models/results.h | 122
-rw-r--r--  python/openvino/runtime/common/models/include/models/segmentation_model.h | 50
-rw-r--r--  python/openvino/runtime/common/models/include/models/style_transfer_model.h | 43
-rw-r--r--  python/openvino/runtime/common/models/include/models/super_resolution_model.h | 49
-rw-r--r--  python/openvino/runtime/common/models/src/associative_embedding_decoder.cpp | 201
-rw-r--r--  python/openvino/runtime/common/models/src/classification_model.cpp | 196
-rw-r--r--  python/openvino/runtime/common/models/src/deblurring_model.cpp | 158
-rw-r--r--  python/openvino/runtime/common/models/src/detection_model.cpp | 52
-rw-r--r--  python/openvino/runtime/common/models/src/detection_model_centernet.cpp | 302
-rw-r--r--  python/openvino/runtime/common/models/src/detection_model_faceboxes.cpp | 261
-rw-r--r--  python/openvino/runtime/common/models/src/detection_model_retinaface.cpp | 394
-rw-r--r--  python/openvino/runtime/common/models/src/detection_model_retinaface_pt.cpp | 277
-rw-r--r--  python/openvino/runtime/common/models/src/detection_model_ssd.cpp | 281
-rw-r--r--  python/openvino/runtime/common/models/src/detection_model_yolo.cpp | 481
-rw-r--r--  python/openvino/runtime/common/models/src/detection_model_yolov3_onnx.cpp | 188
-rw-r--r--  python/openvino/runtime/common/models/src/detection_model_yolox.cpp | 194
-rw-r--r--  python/openvino/runtime/common/models/src/hpe_model_associative_embedding.cpp | 264
-rw-r--r--  python/openvino/runtime/common/models/src/hpe_model_openpose.cpp | 256
-rw-r--r--  python/openvino/runtime/common/models/src/image_model.cpp | 57
-rw-r--r--  python/openvino/runtime/common/models/src/jpeg_restoration_model.cpp | 167
-rw-r--r--  python/openvino/runtime/common/models/src/model_base.cpp | 67
-rw-r--r--  python/openvino/runtime/common/models/src/openpose_decoder.cpp | 345
-rw-r--r--  python/openvino/runtime/common/models/src/segmentation_model.cpp | 157
-rw-r--r--  python/openvino/runtime/common/models/src/style_transfer_model.cpp | 107
-rw-r--r--  python/openvino/runtime/common/models/src/super_resolution_model.cpp | 207
46 files changed, 6187 insertions, 0 deletions
diff --git a/python/openvino/runtime/common/models/CMakeLists.txt b/python/openvino/runtime/common/models/CMakeLists.txt
new file mode 100644
index 0000000..07c8da3
--- /dev/null
+++ b/python/openvino/runtime/common/models/CMakeLists.txt
@@ -0,0 +1,15 @@
+# Copyright (C) 2018-2024 Intel Corporation
+# SPDX-License-Identifier: Apache-2.0
+
+file(GLOB SOURCES ./src/*.cpp)
+file(GLOB HEADERS ./include/models/*.h)
+
+# Create named folders for the sources within the .vcproj
+# Empty name lists them directly under the .vcproj
+source_group("src" FILES ${SOURCES})
+source_group("include" FILES ${HEADERS})
+
+add_library(models STATIC ${SOURCES} ${HEADERS})
+target_include_directories(models PUBLIC "${CMAKE_CURRENT_SOURCE_DIR}/include")
+
+target_link_libraries(models PRIVATE openvino::runtime utils opencv_core opencv_imgproc)
diff --git a/python/openvino/runtime/common/models/include/models/associative_embedding_decoder.h b/python/openvino/runtime/common/models/include/models/associative_embedding_decoder.h
new file mode 100644
index 0000000..94afbda
--- /dev/null
+++ b/python/openvino/runtime/common/models/include/models/associative_embedding_decoder.h
@@ -0,0 +1,94 @@
+/*
+// Copyright (C) 2021-2022 Intel Corporation
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+*/
+
+#pragma once
+#include <stddef.h>
+
+#include <memory>
+#include <vector>
+
+#include <opencv2/core.hpp>
+
+struct Peak {
+ explicit Peak(const cv::Point2f& keypoint = cv::Point2f(-1, -1), const float score = 0.0f, const float tag = 0.0f)
+ : keypoint(keypoint),
+ score(score),
+ tag(tag) {}
+
+ cv::Point2f keypoint;
+ float score;
+ float tag;
+};
+
+class Pose {
+public:
+ explicit Pose(size_t numJoints) : peaks(numJoints) {}
+
+ void add(size_t index, Peak peak) {
+ peaks[index] = peak;
+ sum += peak.score;
+ poseTag = poseTag * static_cast<float>(validPointsNum) + peak.tag;
+ poseCenter = poseCenter * static_cast<float>(validPointsNum) + peak.keypoint;
+ validPointsNum += 1;
+ poseTag = poseTag / static_cast<float>(validPointsNum);
+ poseCenter = poseCenter / static_cast<float>(validPointsNum);
+ }
+
+ float getPoseTag() const {
+ return poseTag;
+ }
+
+ float getMeanScore() const {
+ return sum / static_cast<float>(size());
+ }
+
+ Peak& getPeak(size_t index) {
+ return peaks[index];
+ }
+
+ cv::Point2f& getPoseCenter() {
+ return poseCenter;
+ }
+
+ size_t size() const {
+ return peaks.size();
+ }
+
+private:
+ std::vector<Peak> peaks;
+ cv::Point2f poseCenter = cv::Point2f(0.f, 0.f);
+ int validPointsNum = 0;
+ float poseTag = 0;
+ float sum = 0;
+};
+
+void findPeaks(const std::vector<cv::Mat>& nmsHeatMaps,
+ const std::vector<cv::Mat>& aembdsMaps,
+ std::vector<std::vector<Peak>>& allPeaks,
+ size_t jointId,
+ size_t maxNumPeople,
+ float detectionThreshold);
+
+std::vector<Pose> matchByTag(std::vector<std::vector<Peak>>& allPeaks,
+ size_t maxNumPeople,
+ size_t numJoints,
+ float tagThreshold);
+
+void adjustAndRefine(std::vector<Pose>& allPoses,
+ const std::vector<cv::Mat>& heatMaps,
+ const std::vector<cv::Mat>& aembdsMaps,
+ int poseId,
+ float delta);
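The Pose class above maintains running means of the peak tags and keypoint positions as joints are added, which is what matchByTag() relies on when grouping peaks by tag distance. A minimal usage sketch, not part of the patch (the joint count of 17 mirrors the constant used by HpeAssociativeEmbedding later in this series):

    #include <models/associative_embedding_decoder.h>

    void poseSketch() {
        Pose pose(17);  // one slot per joint; peaks default to score 0 and keypoint (-1, -1)
        pose.add(0, Peak(cv::Point2f(120.f, 64.f), 0.9f, 1.4f));
        pose.add(1, Peak(cv::Point2f(128.f, 96.f), 0.7f, 1.6f));
        float meanTag = pose.getPoseTag();      // running mean of the added tags: 1.5f
        float meanScore = pose.getMeanScore();  // score sum divided by the total joint count (17)
    }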
diff --git a/python/openvino/runtime/common/models/include/models/classification_model.h b/python/openvino/runtime/common/models/include/models/classification_model.h
new file mode 100644
index 0000000..6d32e44
--- /dev/null
+++ b/python/openvino/runtime/common/models/include/models/classification_model.h
@@ -0,0 +1,57 @@
+/*
+// Copyright (C) 2020-2022 Intel Corporation
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+*/
+
+#pragma once
+#include <stddef.h>
+
+#include <memory>
+#include <string>
+#include <vector>
+
+#include "models/image_model.h"
+
+namespace ov {
+class Model;
+} // namespace ov
+struct InferenceResult;
+struct ResultBase;
+
+class ClassificationModel : public ImageModel {
+public:
+ /// Constructor
+ /// @param modelFileName name of model to load.
+ /// @param nTop - number of top results.
+    /// Only the nTop best-scoring classes will be reported in the result.
+ /// @param useAutoResize - if true, image will be resized by openvino.
+ /// Otherwise, image will be preprocessed and resized using OpenCV routines.
+ /// @param labels - array of labels for every class.
+ /// @param layout - model input layout
+ ClassificationModel(const std::string& modelFileName,
+ size_t nTop,
+ bool useAutoResize,
+ const std::vector<std::string>& labels,
+ const std::string& layout = "");
+
+ std::unique_ptr<ResultBase> postprocess(InferenceResult& infResult) override;
+
+ static std::vector<std::string> loadLabels(const std::string& labelFilename);
+
+protected:
+ size_t nTop;
+ std::vector<std::string> labels;
+
+ void prepareInputsOutputs(std::shared_ptr<ov::Model>& model) override;
+};
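A hedged construction sketch for the class above; the model path, labels file and nTop value are placeholders rather than anything defined in this patch:

    #include <models/classification_model.h>

    void classificationSketch() {
        std::vector<std::string> labels = ClassificationModel::loadLabels("labels.txt");
        ClassificationModel model("resnet50.xml", /*nTop=*/5, /*useAutoResize=*/true, labels);
        // Compilation and inference then go through the ModelBase interface
        // (compileModel / preprocess / postprocess) declared later in this patch.
    }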
diff --git a/python/openvino/runtime/common/models/include/models/deblurring_model.h b/python/openvino/runtime/common/models/include/models/deblurring_model.h
new file mode 100644
index 0000000..33f5542
--- /dev/null
+++ b/python/openvino/runtime/common/models/include/models/deblurring_model.h
@@ -0,0 +1,52 @@
+/*
+// Copyright (C) 2021-2022 Intel Corporation
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+*/
+
+#pragma once
+#include <stddef.h>
+
+#include <memory>
+#include <string>
+
+#include <opencv2/core/types.hpp>
+
+#include "models/image_model.h"
+
+namespace ov {
+class InferRequest;
+class Model;
+} // namespace ov
+struct InferenceResult;
+struct InputData;
+struct InternalModelData;
+struct ResultBase;
+
+class DeblurringModel : public ImageModel {
+public:
+ /// Constructor
+ /// @param modelFileName name of model to load
+ /// @param inputImgSize size of image to set model input shape
+ /// @param layout - model input layout
+ DeblurringModel(const std::string& modelFileName, const cv::Size& inputImgSize, const std::string& layout = "");
+
+ std::shared_ptr<InternalModelData> preprocess(const InputData& inputData, ov::InferRequest& request) override;
+ std::unique_ptr<ResultBase> postprocess(InferenceResult& infResult) override;
+
+protected:
+ void prepareInputsOutputs(std::shared_ptr<ov::Model>& model) override;
+ void changeInputSize(std::shared_ptr<ov::Model>& model);
+
+ static const size_t stride = 32;
+};
diff --git a/python/openvino/runtime/common/models/include/models/detection_model.h b/python/openvino/runtime/common/models/include/models/detection_model.h
new file mode 100644
index 0000000..4d57a61
--- /dev/null
+++ b/python/openvino/runtime/common/models/include/models/detection_model.h
@@ -0,0 +1,51 @@
+/*
+// Copyright (C) 2020-2022 Intel Corporation
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+*/
+
+#pragma once
+#include <stddef.h>
+
+#include <string>
+#include <vector>
+
+#include "models/image_model.h"
+
+class DetectionModel : public ImageModel {
+public:
+ /// Constructor
+ /// @param modelFileName name of model to load
+ /// @param confidenceThreshold - threshold to eliminate low-confidence detections.
+ /// Any detected object with confidence lower than this threshold will be ignored.
+ /// @param useAutoResize - if true, image will be resized by openvino.
+ /// Otherwise, image will be preprocessed and resized using OpenCV routines.
+ /// @param labels - array of labels for every class. If this array is empty or contains less elements
+ /// than actual classes number, default "Label #N" will be shown for missing items.
+ /// @param layout - model input layout
+ DetectionModel(const std::string& modelFileName,
+ float confidenceThreshold,
+ bool useAutoResize,
+ const std::vector<std::string>& labels,
+ const std::string& layout = "");
+
+ static std::vector<std::string> loadLabels(const std::string& labelFilename);
+
+protected:
+ float confidenceThreshold;
+ std::vector<std::string> labels;
+
+ std::string getLabelName(int labelID) {
+ return (size_t)labelID < labels.size() ? labels[labelID] : std::string("Label #") + std::to_string(labelID);
+ }
+};
diff --git a/python/openvino/runtime/common/models/include/models/detection_model_centernet.h b/python/openvino/runtime/common/models/include/models/detection_model_centernet.h
new file mode 100644
index 0000000..db9ebdb
--- /dev/null
+++ b/python/openvino/runtime/common/models/include/models/detection_model_centernet.h
@@ -0,0 +1,59 @@
+/*
+// Copyright (C) 2020-2022 Intel Corporation
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+*/
+
+#pragma once
+#include <memory>
+#include <string>
+#include <vector>
+
+#include "models/detection_model.h"
+
+namespace ov {
+class InferRequest;
+class Model;
+} // namespace ov
+struct InferenceResult;
+struct InputData;
+struct InternalModelData;
+struct ResultBase;
+
+class ModelCenterNet : public DetectionModel {
+public:
+ struct BBox {
+ float left;
+ float top;
+ float right;
+ float bottom;
+
+ float getWidth() const {
+ return (right - left) + 1.0f;
+ }
+ float getHeight() const {
+ return (bottom - top) + 1.0f;
+ }
+ };
+ static const int INIT_VECTOR_SIZE = 200;
+
+ ModelCenterNet(const std::string& modelFileName,
+ float confidenceThreshold,
+ const std::vector<std::string>& labels = std::vector<std::string>(),
+ const std::string& layout = "");
+ std::shared_ptr<InternalModelData> preprocess(const InputData& inputData, ov::InferRequest& request) override;
+ std::unique_ptr<ResultBase> postprocess(InferenceResult& infResult) override;
+
+protected:
+ void prepareInputsOutputs(std::shared_ptr<ov::Model>& model) override;
+};
diff --git a/python/openvino/runtime/common/models/include/models/detection_model_faceboxes.h b/python/openvino/runtime/common/models/include/models/detection_model_faceboxes.h
new file mode 100644
index 0000000..8ec2b21
--- /dev/null
+++ b/python/openvino/runtime/common/models/include/models/detection_model_faceboxes.h
@@ -0,0 +1,55 @@
+/*
+// Copyright (C) 2020-2022 Intel Corporation
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+*/
+
+#pragma once
+#include <stddef.h>
+
+#include <memory>
+#include <string>
+#include <utility>
+#include <vector>
+
+#include <utils/nms.hpp>
+
+#include "models/detection_model.h"
+
+namespace ov {
+class Model;
+} // namespace ov
+struct InferenceResult;
+struct ResultBase;
+
+class ModelFaceBoxes : public DetectionModel {
+public:
+ static const int INIT_VECTOR_SIZE = 200;
+
+ ModelFaceBoxes(const std::string& modelFileName,
+ float confidenceThreshold,
+ bool useAutoResize,
+ float boxIOUThreshold,
+ const std::string& layout = "");
+ std::unique_ptr<ResultBase> postprocess(InferenceResult& infResult) override;
+
+protected:
+ size_t maxProposalsCount;
+ const float boxIOUThreshold;
+ const std::vector<float> variance;
+ const std::vector<int> steps;
+ const std::vector<std::vector<int>> minSizes;
+ std::vector<Anchor> anchors;
+ void prepareInputsOutputs(std::shared_ptr<ov::Model>& model) override;
+ void priorBoxes(const std::vector<std::pair<size_t, size_t>>& featureMaps);
+};
diff --git a/python/openvino/runtime/common/models/include/models/detection_model_retinaface.h b/python/openvino/runtime/common/models/include/models/detection_model_retinaface.h
new file mode 100644
index 0000000..ac2c235
--- /dev/null
+++ b/python/openvino/runtime/common/models/include/models/detection_model_retinaface.h
@@ -0,0 +1,74 @@
+/*
+// Copyright (C) 2020-2022 Intel Corporation
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+*/
+
+#pragma once
+#include <map>
+#include <memory>
+#include <string>
+#include <vector>
+
+#include <utils/nms.hpp>
+
+#include "models/detection_model.h"
+
+namespace ov {
+class Model;
+} // namespace ov
+struct InferenceResult;
+struct ResultBase;
+
+class ModelRetinaFace : public DetectionModel {
+public:
+ static const int LANDMARKS_NUM = 5;
+ static const int INIT_VECTOR_SIZE = 200;
+ /// Loads model and performs required initialization
+ /// @param model_name name of model to load
+ /// @param confidenceThreshold - threshold to eliminate low-confidence detections.
+ /// Any detected object with confidence lower than this threshold will be ignored.
+ /// @param useAutoResize - if true, image will be resized by openvino.
+ /// @param boxIOUThreshold - threshold for NMS boxes filtering, varies in [0.0, 1.0] range.
+ /// @param layout - model input layout
+ ModelRetinaFace(const std::string& model_name,
+ float confidenceThreshold,
+ bool useAutoResize,
+ float boxIOUThreshold,
+ const std::string& layout = "");
+ std::unique_ptr<ResultBase> postprocess(InferenceResult& infResult) override;
+
+protected:
+ struct AnchorCfgLine {
+ int stride;
+ std::vector<int> scales;
+ int baseSize;
+ std::vector<int> ratios;
+ };
+
+ bool shouldDetectMasks;
+ bool shouldDetectLandmarks;
+ const float boxIOUThreshold;
+ const float maskThreshold;
+ float landmarkStd;
+
+ enum OutputType { OUT_BOXES, OUT_SCORES, OUT_LANDMARKS, OUT_MASKSCORES, OUT_MAX };
+
+ std::vector<std::string> separateOutputsNames[OUT_MAX];
+ const std::vector<AnchorCfgLine> anchorCfg;
+ std::map<int, std::vector<Anchor>> anchorsFpn;
+ std::vector<std::vector<Anchor>> anchors;
+
+ void generateAnchorsFpn();
+ void prepareInputsOutputs(std::shared_ptr<ov::Model>& model) override;
+};
diff --git a/python/openvino/runtime/common/models/include/models/detection_model_retinaface_pt.h b/python/openvino/runtime/common/models/include/models/detection_model_retinaface_pt.h
new file mode 100644
index 0000000..68cc907
--- /dev/null
+++ b/python/openvino/runtime/common/models/include/models/detection_model_retinaface_pt.h
@@ -0,0 +1,81 @@
+/*
+// Copyright (C) 2021-2022 Intel Corporation
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+*/
+
+#pragma once
+#include <stddef.h>
+
+#include <memory>
+#include <string>
+#include <vector>
+
+#include <opencv2/core/types.hpp>
+#include <utils/nms.hpp>
+
+#include "models/detection_model.h"
+
+namespace ov {
+class Model;
+class Tensor;
+} // namespace ov
+struct InferenceResult;
+struct ResultBase;
+
+class ModelRetinaFacePT : public DetectionModel {
+public:
+ struct Box {
+ float cX;
+ float cY;
+ float width;
+ float height;
+ };
+
+ /// Loads model and performs required initialization
+ /// @param model_name name of model to load
+ /// @param confidenceThreshold - threshold to eliminate low-confidence detections.
+ /// Any detected object with confidence lower than this threshold will be ignored.
+ /// @param useAutoResize - if true, image will be resized by openvino.
+ /// @param boxIOUThreshold - threshold for NMS boxes filtering, varies in [0.0, 1.0] range.
+ /// @param layout - model input layout
+ ModelRetinaFacePT(const std::string& modelFileName,
+ float confidenceThreshold,
+ bool useAutoResize,
+ float boxIOUThreshold,
+ const std::string& layout = "");
+ std::unique_ptr<ResultBase> postprocess(InferenceResult& infResult) override;
+
+protected:
+ size_t landmarksNum;
+ const float boxIOUThreshold;
+ float variance[2] = {0.1f, 0.2f};
+
+ enum OutputType { OUT_BOXES, OUT_SCORES, OUT_LANDMARKS, OUT_MAX };
+
+ std::vector<ModelRetinaFacePT::Box> priors;
+
+ std::vector<size_t> filterByScore(const ov::Tensor& scoresTensor, const float confidenceThreshold);
+ std::vector<float> getFilteredScores(const ov::Tensor& scoresTensor, const std::vector<size_t>& indicies);
+ std::vector<cv::Point2f> getFilteredLandmarks(const ov::Tensor& landmarksTensor,
+ const std::vector<size_t>& indicies,
+ int imgWidth,
+ int imgHeight);
+ std::vector<ModelRetinaFacePT::Box> generatePriorData();
+ std::vector<Anchor> getFilteredProposals(const ov::Tensor& boxesTensor,
+ const std::vector<size_t>& indicies,
+ int imgWidth,
+ int imgHeight);
+
+ void prepareInputsOutputs(std::shared_ptr<ov::Model>& model) override;
+};
diff --git a/python/openvino/runtime/common/models/include/models/detection_model_ssd.h b/python/openvino/runtime/common/models/include/models/detection_model_ssd.h
new file mode 100644
index 0000000..646d7b0
--- /dev/null
+++ b/python/openvino/runtime/common/models/include/models/detection_model_ssd.h
@@ -0,0 +1,63 @@
+/*
+// Copyright (C) 2020-2022 Intel Corporation
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+*/
+
+#pragma once
+#include <stddef.h>
+
+#include <memory>
+#include <string>
+#include <vector>
+
+#include "models/detection_model.h"
+
+namespace ov {
+class InferRequest;
+class Model;
+} // namespace ov
+struct InferenceResult;
+struct InputData;
+struct InternalModelData;
+struct ResultBase;
+
+class ModelSSD : public DetectionModel {
+public:
+ /// Constructor
+ /// @param modelFileName name of model to load
+ /// @param confidenceThreshold - threshold to eliminate low-confidence detections.
+ /// Any detected object with confidence lower than this threshold will be ignored.
+ /// @param useAutoResize - if true, image will be resized by openvino.
+ /// Otherwise, image will be preprocessed and resized using OpenCV routines.
+ /// @param labels - array of labels for every class. If this array is empty or contains less elements
+ /// than actual classes number, default "Label #N" will be shown for missing items.
+ /// @param layout - model input layout
+ ModelSSD(const std::string& modelFileName,
+ float confidenceThreshold,
+ bool useAutoResize,
+ const std::vector<std::string>& labels = std::vector<std::string>(),
+ const std::string& layout = "");
+
+ std::shared_ptr<InternalModelData> preprocess(const InputData& inputData, ov::InferRequest& request) override;
+ std::unique_ptr<ResultBase> postprocess(InferenceResult& infResult) override;
+
+protected:
+ std::unique_ptr<ResultBase> postprocessSingleOutput(InferenceResult& infResult);
+ std::unique_ptr<ResultBase> postprocessMultipleOutputs(InferenceResult& infResult);
+ void prepareInputsOutputs(std::shared_ptr<ov::Model>& model) override;
+ void prepareSingleOutput(std::shared_ptr<ov::Model>& model);
+ void prepareMultipleOutputs(std::shared_ptr<ov::Model>& model);
+ size_t objectSize = 0;
+ size_t detectionsNumId = 0;
+};
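An assumed usage sketch for ModelSSD; the model path and threshold are illustrative. Passing no labels exercises the DetectionModel::getLabelName() fallback, which reports missing entries as "Label #N":

    #include <models/detection_model_ssd.h>

    void ssdSketch() {
        ModelSSD detector("person-detection.xml", /*confidenceThreshold=*/0.5f, /*useAutoResize=*/true);
    }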
diff --git a/python/openvino/runtime/common/models/include/models/detection_model_yolo.h b/python/openvino/runtime/common/models/include/models/detection_model_yolo.h
new file mode 100644
index 0000000..38b0b64
--- /dev/null
+++ b/python/openvino/runtime/common/models/include/models/detection_model_yolo.h
@@ -0,0 +1,107 @@
+/*
+// Copyright (C) 2020-2022 Intel Corporation
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+*/
+
+#pragma once
+#include <stddef.h>
+#include <stdint.h>
+
+#include <map>
+#include <memory>
+#include <string>
+#include <vector>
+
+#include <openvino/op/region_yolo.hpp>
+#include <openvino/openvino.hpp>
+
+#include "models/detection_model.h"
+
+struct DetectedObject;
+struct InferenceResult;
+struct ResultBase;
+
+class ModelYolo : public DetectionModel {
+protected:
+ class Region {
+ public:
+ int num = 0;
+ size_t classes = 0;
+ int coords = 0;
+ std::vector<float> anchors;
+ size_t outputWidth = 0;
+ size_t outputHeight = 0;
+
+ Region(const std::shared_ptr<ov::op::v0::RegionYolo>& regionYolo);
+ Region(size_t classes,
+ int coords,
+ const std::vector<float>& anchors,
+ const std::vector<int64_t>& masks,
+ size_t outputWidth,
+ size_t outputHeight);
+ };
+
+public:
+ enum YoloVersion { YOLO_V1V2, YOLO_V3, YOLO_V4, YOLO_V4_TINY, YOLOF };
+
+ /// Constructor.
+ /// @param modelFileName name of model to load
+ /// @param confidenceThreshold - threshold to eliminate low-confidence detections.
+ /// Any detected object with confidence lower than this threshold will be ignored.
+ /// @param useAutoResize - if true, image will be resized by openvino.
+ /// Otherwise, image will be preprocessed and resized using OpenCV routines.
+ /// @param useAdvancedPostprocessing - if true, an advanced algorithm for filtering/postprocessing will be used
+ /// (with better processing of multiple crossing objects). Otherwise, classic algorithm will be used.
+ /// @param boxIOUThreshold - threshold to treat separate output regions as one object for filtering
+ /// during postprocessing (only one of them should stay). The default value is 0.5
+ /// @param labels - array of labels for every class. If this array is empty or contains less elements
+ /// than actual classes number, default "Label #N" will be shown for missing items.
+ /// @param anchors - vector of anchors coordinates. Required for YOLOv4, for other versions it may be omitted.
+ /// @param masks - vector of masks values. Required for YOLOv4, for other versions it may be omitted.
+ /// @param layout - model input layout
+ ModelYolo(const std::string& modelFileName,
+ float confidenceThreshold,
+ bool useAutoResize,
+ bool useAdvancedPostprocessing = true,
+ float boxIOUThreshold = 0.5,
+ const std::vector<std::string>& labels = std::vector<std::string>(),
+ const std::vector<float>& anchors = std::vector<float>(),
+ const std::vector<int64_t>& masks = std::vector<int64_t>(),
+ const std::string& layout = "");
+
+ std::unique_ptr<ResultBase> postprocess(InferenceResult& infResult) override;
+
+protected:
+ void prepareInputsOutputs(std::shared_ptr<ov::Model>& model) override;
+
+ void parseYOLOOutput(const std::string& output_name,
+ const ov::Tensor& tensor,
+ const unsigned long resized_im_h,
+ const unsigned long resized_im_w,
+ const unsigned long original_im_h,
+ const unsigned long original_im_w,
+ std::vector<DetectedObject>& objects);
+
+ static int calculateEntryIndex(int entriesNum, int lcoords, size_t lclasses, int location, int entry);
+ static double intersectionOverUnion(const DetectedObject& o1, const DetectedObject& o2);
+
+ std::map<std::string, Region> regions;
+ double boxIOUThreshold;
+ bool useAdvancedPostprocessing;
+    bool isObjConf = true;
+ YoloVersion yoloVersion;
+ const std::vector<float> presetAnchors;
+ const std::vector<int64_t> presetMasks;
+ ov::Layout yoloRegionLayout = "NCHW";
+};
diff --git a/python/openvino/runtime/common/models/include/models/detection_model_yolov3_onnx.h b/python/openvino/runtime/common/models/include/models/detection_model_yolov3_onnx.h
new file mode 100644
index 0000000..66c4f03
--- /dev/null
+++ b/python/openvino/runtime/common/models/include/models/detection_model_yolov3_onnx.h
@@ -0,0 +1,50 @@
+/*
+// Copyright (C) 2022 Intel Corporation
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+*/
+
+#pragma once
+
+#include <string>
+#include <vector>
+
+#include <openvino/openvino.hpp>
+
+#include "models/detection_model.h"
+
+class ModelYoloV3ONNX: public DetectionModel {
+public:
+ /// Constructor.
+ /// @param modelFileName name of model to load
+ /// @param confidenceThreshold - threshold to eliminate low-confidence detections.
+ /// Any detected object with confidence lower than this threshold will be ignored.
+ /// @param labels - array of labels for every class. If this array is empty or contains less elements
+ /// than actual classes number, default "Label #N" will be shown for missing items.
+ /// @param layout - model input layout
+ ModelYoloV3ONNX(const std::string& modelFileName,
+ float confidenceThreshold,
+ const std::vector<std::string>& labels = std::vector<std::string>(),
+ const std::string& layout = "");
+
+ std::unique_ptr<ResultBase> postprocess(InferenceResult& infResult) override;
+ std::shared_ptr<InternalModelData> preprocess(const InputData& inputData, ov::InferRequest& request) override;
+
+protected:
+ void prepareInputsOutputs(std::shared_ptr<ov::Model>& model) override;
+
+ std::string boxesOutputName;
+ std::string scoresOutputName;
+ std::string indicesOutputName;
+ static const int numberOfClasses = 80;
+};
diff --git a/python/openvino/runtime/common/models/include/models/detection_model_yolox.h b/python/openvino/runtime/common/models/include/models/detection_model_yolox.h
new file mode 100644
index 0000000..d7e4ea3
--- /dev/null
+++ b/python/openvino/runtime/common/models/include/models/detection_model_yolox.h
@@ -0,0 +1,54 @@
+/*
+// Copyright (C) 2022 Intel Corporation
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+*/
+
+#pragma once
+#include <memory>
+#include <string>
+#include <vector>
+
+#include <openvino/openvino.hpp>
+
+#include "models/detection_model.h"
+
+class ModelYoloX: public DetectionModel {
+public:
+ /// Constructor.
+ /// @param modelFileName name of model to load
+ /// @param confidenceThreshold - threshold to eliminate low-confidence detections.
+ /// Any detected object with confidence lower than this threshold will be ignored.
+ /// @param boxIOUThreshold - threshold to treat separate output regions as one object for filtering
+ /// during postprocessing (only one of them should stay). The default value is 0.5
+ /// @param labels - array of labels for every class. If this array is empty or contains less elements
+ /// than actual classes number, default "Label #N" will be shown for missing items.
+ /// @param layout - model input layout
+ ModelYoloX(const std::string& modelFileName,
+ float confidenceThreshold,
+ float boxIOUThreshold = 0.5,
+ const std::vector<std::string>& labels = std::vector<std::string>(),
+ const std::string& layout = "");
+
+ std::unique_ptr<ResultBase> postprocess(InferenceResult& infResult) override;
+ std::shared_ptr<InternalModelData> preprocess(const InputData& inputData, ov::InferRequest& request) override;
+
+protected:
+ void prepareInputsOutputs(std::shared_ptr<ov::Model>& model) override;
+ void setStridesGrids();
+
+ double boxIOUThreshold;
+ std::vector<std::pair<size_t, size_t>> grids;
+ std::vector<size_t> expandedStrides;
+ static const size_t numberOfClasses = 80;
+};
diff --git a/python/openvino/runtime/common/models/include/models/hpe_model_associative_embedding.h b/python/openvino/runtime/common/models/include/models/hpe_model_associative_embedding.h
new file mode 100644
index 0000000..66e217e
--- /dev/null
+++ b/python/openvino/runtime/common/models/include/models/hpe_model_associative_embedding.h
@@ -0,0 +1,89 @@
+/*
+// Copyright (C) 2021-2022 Intel Corporation
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+*/
+
+#pragma once
+#include <memory>
+#include <string>
+#include <vector>
+
+#include <opencv2/core.hpp>
+
+#include <utils/image_utils.h>
+
+#include "models/image_model.h"
+
+namespace ov {
+class InferRequest;
+class Model;
+class Shape;
+} // namespace ov
+struct HumanPose;
+struct InferenceResult;
+struct InputData;
+struct InternalModelData;
+struct ResultBase;
+
+class HpeAssociativeEmbedding : public ImageModel {
+public:
+ /// Constructor
+ /// @param modelFileName name of model to load
+ /// @param aspectRatio - the ratio of input width to its height.
+ /// @param targetSize - the length of a short image side used for model reshaping.
+ /// @param confidenceThreshold - threshold to eliminate low-confidence poses.
+ /// Any pose with confidence lower than this threshold will be ignored.
+ /// @param layout - model input layout
+ HpeAssociativeEmbedding(const std::string& modelFileName,
+ double aspectRatio,
+ int targetSize,
+ float confidenceThreshold,
+ const std::string& layout = "",
+ float delta = 0.0,
+ RESIZE_MODE resizeMode = RESIZE_KEEP_ASPECT);
+
+ std::unique_ptr<ResultBase> postprocess(InferenceResult& infResult) override;
+
+ std::shared_ptr<InternalModelData> preprocess(const InputData& inputData, ov::InferRequest& request) override;
+
+protected:
+ void prepareInputsOutputs(std::shared_ptr<ov::Model>& model) override;
+
+ cv::Size inputLayerSize;
+ double aspectRatio;
+ int targetSize;
+ float confidenceThreshold;
+ float delta;
+
+ std::string embeddingsTensorName;
+ std::string heatmapsTensorName;
+ std::string nmsHeatmapsTensorName;
+
+ static const int numJoints = 17;
+ static const int stride = 32;
+ static const int maxNumPeople = 30;
+ static const cv::Vec3f meanPixel;
+ static const float detectionThreshold;
+ static const float tagThreshold;
+
+ void changeInputSize(std::shared_ptr<ov::Model>& model);
+
+ std::string findTensorByName(const std::string& tensorName, const std::vector<std::string>& outputsNames);
+
+ std::vector<cv::Mat> split(float* data, const ov::Shape& shape);
+
+ std::vector<HumanPose> extractPoses(std::vector<cv::Mat>& heatMaps,
+ const std::vector<cv::Mat>& aembdsMaps,
+ const std::vector<cv::Mat>& nmsHeatMaps) const;
+};
diff --git a/python/openvino/runtime/common/models/include/models/hpe_model_openpose.h b/python/openvino/runtime/common/models/include/models/hpe_model_openpose.h
new file mode 100644
index 0000000..d5e1ce7
--- /dev/null
+++ b/python/openvino/runtime/common/models/include/models/hpe_model_openpose.h
@@ -0,0 +1,78 @@
+/*
+// Copyright (C) 2020-2022 Intel Corporation
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+*/
+
+#pragma once
+#include <stddef.h>
+
+#include <memory>
+#include <string>
+#include <vector>
+
+#include <opencv2/core.hpp>
+
+#include "models/image_model.h"
+
+namespace ov {
+class InferRequest;
+class Model;
+} // namespace ov
+struct HumanPose;
+struct InferenceResult;
+struct InputData;
+struct InternalModelData;
+struct ResultBase;
+
+class HPEOpenPose : public ImageModel {
+public:
+ /// Constructor
+ /// @param modelFileName name of model to load
+ /// @param aspectRatio - the ratio of input width to its height.
+ /// @param targetSize - the height used for model reshaping.
+ /// @param confidenceThreshold - threshold to eliminate low-confidence keypoints.
+ /// @param layout - model input layout
+ HPEOpenPose(const std::string& modelFileName,
+ double aspectRatio,
+ int targetSize,
+ float confidenceThreshold,
+ const std::string& layout = "");
+
+ std::unique_ptr<ResultBase> postprocess(InferenceResult& infResult) override;
+
+ std::shared_ptr<InternalModelData> preprocess(const InputData& inputData, ov::InferRequest& request) override;
+
+ static const size_t keypointsNumber = 18;
+
+protected:
+ void prepareInputsOutputs(std::shared_ptr<ov::Model>& model) override;
+
+ static const int minJointsNumber = 3;
+ static const int stride = 8;
+ static const int upsampleRatio = 4;
+ static const cv::Vec3f meanPixel;
+ static const float minPeaksDistance;
+ static const float midPointsScoreThreshold;
+ static const float foundMidPointsRatioThreshold;
+ static const float minSubsetScore;
+ cv::Size inputLayerSize;
+ double aspectRatio;
+ int targetSize;
+ float confidenceThreshold;
+
+ std::vector<HumanPose> extractPoses(const std::vector<cv::Mat>& heatMaps, const std::vector<cv::Mat>& pafs) const;
+ void resizeFeatureMaps(std::vector<cv::Mat>& featureMaps) const;
+
+ void changeInputSize(std::shared_ptr<ov::Model>& model);
+};
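A hedged construction sketch for HPEOpenPose; the path and parameter values are assumptions. aspectRatio is the width-to-height ratio of the incoming frames, and targetSize is the network input height used when the model is reshaped:

    #include <models/hpe_model_openpose.h>

    void openPoseSketch() {
        HPEOpenPose hpe("human-pose-estimation.xml", 16.0 / 9.0, /*targetSize=*/256, /*confidenceThreshold=*/0.1f);
    }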
diff --git a/python/openvino/runtime/common/models/include/models/image_model.h b/python/openvino/runtime/common/models/include/models/image_model.h
new file mode 100644
index 0000000..b18daa1
--- /dev/null
+++ b/python/openvino/runtime/common/models/include/models/image_model.h
@@ -0,0 +1,49 @@
+/*
+// Copyright (C) 2021-2022 Intel Corporation
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+*/
+
+#pragma once
+#include <stddef.h>
+
+#include <memory>
+#include <string>
+
+#include "models/model_base.h"
+#include "utils/image_utils.h"
+
+namespace ov {
+class InferRequest;
+} // namespace ov
+struct InputData;
+struct InternalModelData;
+
+class ImageModel : public ModelBase {
+public:
+ /// Constructor
+ /// @param modelFileName name of model to load
+ /// @param useAutoResize - if true, image is resized by openvino
+ /// @param layout - model input layout
+ ImageModel(const std::string& modelFileName, bool useAutoResize, const std::string& layout = "");
+
+ std::shared_ptr<InternalModelData> preprocess(const InputData& inputData, ov::InferRequest& request) override;
+
+protected:
+ bool useAutoResize;
+
+ size_t netInputHeight = 0;
+ size_t netInputWidth = 0;
+ cv::InterpolationFlags interpolationMode = cv::INTER_LINEAR;
+ RESIZE_MODE resizeMode = RESIZE_FILL;
+};
diff --git a/python/openvino/runtime/common/models/include/models/input_data.h b/python/openvino/runtime/common/models/include/models/input_data.h
new file mode 100644
index 0000000..bff9fa5
--- /dev/null
+++ b/python/openvino/runtime/common/models/include/models/input_data.h
@@ -0,0 +1,41 @@
+/*
+// Copyright (C) 2020-2022 Intel Corporation
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+*/
+
+#pragma once
+#include <opencv2/opencv.hpp>
+
+struct InputData {
+ virtual ~InputData() {}
+
+ template <class T>
+ T& asRef() {
+ return dynamic_cast<T&>(*this);
+ }
+
+ template <class T>
+ const T& asRef() const {
+ return dynamic_cast<const T&>(*this);
+ }
+};
+
+struct ImageInputData : public InputData {
+ cv::Mat inputImage;
+
+ ImageInputData() {}
+ ImageInputData(const cv::Mat& img) {
+ inputImage = img;
+ }
+};
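The asRef() helpers above (mirrored in internal_model_data.h and results.h below) are a checked downcast built on dynamic_cast. A small sketch, not part of the patch, with an illustrative image path:

    #include <models/input_data.h>
    #include <opencv2/imgcodecs.hpp>

    void asRefSketch() {
        ImageInputData imageData(cv::imread("frame.png"));
        const InputData& generic = imageData;  // callers usually see only the base type
        const cv::Mat& img = generic.asRef<ImageInputData>().inputImage;  // throws std::bad_cast on mismatch
    }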
diff --git a/python/openvino/runtime/common/models/include/models/internal_model_data.h b/python/openvino/runtime/common/models/include/models/internal_model_data.h
new file mode 100644
index 0000000..61d7744
--- /dev/null
+++ b/python/openvino/runtime/common/models/include/models/internal_model_data.h
@@ -0,0 +1,48 @@
+/*
+// Copyright (C) 2020-2022 Intel Corporation
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+*/
+
+#pragma once
+
+struct InternalModelData {
+ virtual ~InternalModelData() {}
+
+ template <class T>
+ T& asRef() {
+ return dynamic_cast<T&>(*this);
+ }
+
+ template <class T>
+ const T& asRef() const {
+ return dynamic_cast<const T&>(*this);
+ }
+};
+
+struct InternalImageModelData : public InternalModelData {
+ InternalImageModelData(int width, int height) : inputImgWidth(width), inputImgHeight(height) {}
+
+ int inputImgWidth;
+ int inputImgHeight;
+};
+
+struct InternalScaleData : public InternalImageModelData {
+ InternalScaleData(int width, int height, float scaleX, float scaleY)
+ : InternalImageModelData(width, height),
+ scaleX(scaleX),
+ scaleY(scaleY) {}
+
+ float scaleX;
+ float scaleY;
+};
diff --git a/python/openvino/runtime/common/models/include/models/jpeg_restoration_model.h b/python/openvino/runtime/common/models/include/models/jpeg_restoration_model.h
new file mode 100644
index 0000000..8b22ac2
--- /dev/null
+++ b/python/openvino/runtime/common/models/include/models/jpeg_restoration_model.h
@@ -0,0 +1,55 @@
+/*
+// Copyright (C) 2021-2022 Intel Corporation
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+*/
+
+#pragma once
+
+#include <memory>
+#include <string>
+
+#include <opencv2/core/types.hpp>
+
+#include "models/image_model.h"
+
+namespace ov {
+class InferRequest;
+class Model;
+} // namespace ov
+struct InferenceResult;
+struct InputData;
+struct InternalModelData;
+struct ResultBase;
+class JPEGRestorationModel : public ImageModel {
+public:
+ /// Constructor
+ /// @param modelFileName name of model to load
+ /// @param inputImgSize size of image to set model input shape
+ /// @param jpegCompression flag allows to perform compression before the inference
+ /// @param layout - model input layout
+ JPEGRestorationModel(const std::string& modelFileName,
+ const cv::Size& inputImgSize,
+ bool jpegCompression,
+ const std::string& layout = "");
+
+ std::shared_ptr<InternalModelData> preprocess(const InputData& inputData, ov::InferRequest& request) override;
+ std::unique_ptr<ResultBase> postprocess(InferenceResult& infResult) override;
+
+protected:
+ void prepareInputsOutputs(std::shared_ptr<ov::Model>& model) override;
+ void changeInputSize(std::shared_ptr<ov::Model>& model);
+
+ static const size_t stride = 8;
+ bool jpegCompression = false;
+};
diff --git a/python/openvino/runtime/common/models/include/models/model_base.h b/python/openvino/runtime/common/models/include/models/model_base.h
new file mode 100644
index 0000000..c6d9cc1
--- /dev/null
+++ b/python/openvino/runtime/common/models/include/models/model_base.h
@@ -0,0 +1,77 @@
+/*
+// Copyright (C) 2020-2022 Intel Corporation
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+*/
+
+#pragma once
+#include <map>
+#include <memory>
+#include <string>
+#include <vector>
+
+#include <openvino/openvino.hpp>
+
+#include <utils/args_helper.hpp>
+#include <utils/config_factory.h>
+#include <utils/ocv_common.hpp>
+
+struct InferenceResult;
+struct InputData;
+struct InternalModelData;
+struct ResultBase;
+
+class ModelBase {
+public:
+ ModelBase(const std::string& modelFileName, const std::string& layout = "")
+ : modelFileName(modelFileName),
+ inputsLayouts(parseLayoutString(layout)) {}
+
+ virtual ~ModelBase() {}
+
+ virtual std::shared_ptr<InternalModelData> preprocess(const InputData& inputData, ov::InferRequest& request) = 0;
+ virtual ov::CompiledModel compileModel(const ModelConfig& config, ov::Core& core);
+ virtual void onLoadCompleted(const std::vector<ov::InferRequest>& requests) {}
+ virtual std::unique_ptr<ResultBase> postprocess(InferenceResult& infResult) = 0;
+
+ const std::vector<std::string>& getOutputsNames() const {
+ return outputsNames;
+ }
+ const std::vector<std::string>& getInputsNames() const {
+ return inputsNames;
+ }
+
+ std::string getModelFileName() {
+ return modelFileName;
+ }
+
+ void setInputsPreprocessing(bool reverseInputChannels,
+ const std::string& meanValues,
+ const std::string& scaleValues) {
+ this->inputTransform = InputTransform(reverseInputChannels, meanValues, scaleValues);
+ }
+
+protected:
+ virtual void prepareInputsOutputs(std::shared_ptr<ov::Model>& model) = 0;
+
+ std::shared_ptr<ov::Model> prepareModel(ov::Core& core);
+
+ InputTransform inputTransform = InputTransform();
+ std::vector<std::string> inputsNames;
+ std::vector<std::string> outputsNames;
+ ov::CompiledModel compiledModel;
+ std::string modelFileName;
+ ModelConfig config = {};
+ std::map<std::string, ov::Layout> inputsLayouts;
+ ov::Layout getInputLayout(const ov::Output<ov::Node>& input);
+};
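A hedged outline of the calling sequence the ModelBase interface above implies; the way output tensors are copied into InferenceResult is an assumption, since the corresponding .cpp files are not shown in this part of the diff:

    #include <models/model_base.h>
    #include <models/input_data.h>
    #include <models/results.h>

    void pipelineSketch(ModelBase& model, const ModelConfig& config, const cv::Mat& frame) {
        ov::Core core;
        ov::CompiledModel compiled = model.compileModel(config, core);
        ov::InferRequest request = compiled.create_infer_request();

        std::shared_ptr<InternalModelData> internalData = model.preprocess(ImageInputData(frame), request);
        request.infer();

        InferenceResult result;
        result.internalModelData = internalData;
        for (const std::string& name : model.getOutputsNames()) {
            result.outputsData[name] = request.get_tensor(name);
        }
        std::unique_ptr<ResultBase> typed = model.postprocess(result);
    }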
diff --git a/python/openvino/runtime/common/models/include/models/openpose_decoder.h b/python/openvino/runtime/common/models/include/models/openpose_decoder.h
new file mode 100644
index 0000000..d40e56e
--- /dev/null
+++ b/python/openvino/runtime/common/models/include/models/openpose_decoder.h
@@ -0,0 +1,62 @@
+/*
+// Copyright (C) 2020-2022 Intel Corporation
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+*/
+
+#pragma once
+#include <stddef.h>
+
+#include <vector>
+
+#include <opencv2/core.hpp>
+
+struct HumanPose;
+
+struct Peak {
+ Peak(const int id = -1, const cv::Point2f& pos = cv::Point2f(), const float score = 0.0f);
+
+ int id;
+ cv::Point2f pos;
+ float score;
+};
+
+struct HumanPoseByPeaksIndices {
+ explicit HumanPoseByPeaksIndices(const int keypointsNumber);
+
+ std::vector<int> peaksIndices;
+ int nJoints;
+ float score;
+};
+
+struct TwoJointsConnection {
+ TwoJointsConnection(const int firstJointIdx, const int secondJointIdx, const float score);
+
+ int firstJointIdx;
+ int secondJointIdx;
+ float score;
+};
+
+void findPeaks(const std::vector<cv::Mat>& heatMaps,
+ const float minPeaksDistance,
+ std::vector<std::vector<Peak>>& allPeaks,
+ int heatMapId,
+ float confidenceThreshold);
+
+std::vector<HumanPose> groupPeaksToPoses(const std::vector<std::vector<Peak>>& allPeaks,
+ const std::vector<cv::Mat>& pafs,
+ const size_t keypointsNumber,
+ const float midPointsScoreThreshold,
+ const float foundMidPointsRatioThreshold,
+ const int minJointsNumber,
+ const float minSubsetScore);
diff --git a/python/openvino/runtime/common/models/include/models/results.h b/python/openvino/runtime/common/models/include/models/results.h
new file mode 100644
index 0000000..6b3a89d
--- /dev/null
+++ b/python/openvino/runtime/common/models/include/models/results.h
@@ -0,0 +1,122 @@
+/*
+// Copyright (C) 2020-2022 Intel Corporation
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+*/
+
+#pragma once
+#include <map>
+#include <memory>
+#include <string>
+#include <vector>
+
+#include <opencv2/core.hpp>
+#include <openvino/openvino.hpp>
+
+#include "internal_model_data.h"
+
+struct MetaData;
+struct ResultBase {
+ ResultBase(int64_t frameId = -1, const std::shared_ptr<MetaData>& metaData = nullptr)
+ : frameId(frameId),
+ metaData(metaData) {}
+ virtual ~ResultBase() {}
+
+ int64_t frameId;
+
+ std::shared_ptr<MetaData> metaData;
+ bool IsEmpty() {
+ return frameId < 0;
+ }
+
+ template <class T>
+ T& asRef() {
+ return dynamic_cast<T&>(*this);
+ }
+
+ template <class T>
+ const T& asRef() const {
+ return dynamic_cast<const T&>(*this);
+ }
+};
+
+struct InferenceResult : public ResultBase {
+ std::shared_ptr<InternalModelData> internalModelData;
+ std::map<std::string, ov::Tensor> outputsData;
+
+ /// Returns the first output tensor
+ /// This function is a useful addition to direct access to outputs list as many models have only one output
+ /// @returns first output tensor
+ ov::Tensor getFirstOutputTensor() {
+ if (outputsData.empty()) {
+ throw std::out_of_range("Outputs map is empty.");
+ }
+ return outputsData.begin()->second;
+ }
+
+ /// Returns true if object contains no valid data
+ /// @returns true if object contains no valid data
+ bool IsEmpty() {
+ return outputsData.empty();
+ }
+};
+
+struct ClassificationResult : public ResultBase {
+ ClassificationResult(int64_t frameId = -1, const std::shared_ptr<MetaData>& metaData = nullptr)
+ : ResultBase(frameId, metaData) {}
+
+ struct Classification {
+ unsigned int id;
+ std::string label;
+ float score;
+
+ Classification(unsigned int id, const std::string& label, float score) : id(id), label(label), score(score) {}
+ };
+
+ std::vector<Classification> topLabels;
+};
+
+struct DetectedObject : public cv::Rect2f {
+ unsigned int labelID;
+ std::string label;
+ float confidence;
+};
+
+struct DetectionResult : public ResultBase {
+ DetectionResult(int64_t frameId = -1, const std::shared_ptr<MetaData>& metaData = nullptr)
+ : ResultBase(frameId, metaData) {}
+ std::vector<DetectedObject> objects;
+};
+
+struct RetinaFaceDetectionResult : public DetectionResult {
+ RetinaFaceDetectionResult(int64_t frameId = -1, const std::shared_ptr<MetaData>& metaData = nullptr)
+ : DetectionResult(frameId, metaData) {}
+ std::vector<cv::Point2f> landmarks;
+};
+
+struct ImageResult : public ResultBase {
+ ImageResult(int64_t frameId = -1, const std::shared_ptr<MetaData>& metaData = nullptr)
+ : ResultBase(frameId, metaData) {}
+ cv::Mat resultImage;
+};
+
+struct HumanPose {
+ std::vector<cv::Point2f> keypoints;
+ float score;
+};
+
+struct HumanPoseResult : public ResultBase {
+ HumanPoseResult(int64_t frameId = -1, const std::shared_ptr<MetaData>& metaData = nullptr)
+ : ResultBase(frameId, metaData) {}
+ std::vector<HumanPose> poses;
+};
diff --git a/python/openvino/runtime/common/models/include/models/segmentation_model.h b/python/openvino/runtime/common/models/include/models/segmentation_model.h
new file mode 100644
index 0000000..9d4d2cb
--- /dev/null
+++ b/python/openvino/runtime/common/models/include/models/segmentation_model.h
@@ -0,0 +1,50 @@
+/*
+// Copyright (C) 2020-2022 Intel Corporation
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+*/
+
+#pragma once
+#include <memory>
+#include <string>
+#include <vector>
+
+#include "models/image_model.h"
+
+namespace ov {
+class Model;
+} // namespace ov
+struct InferenceResult;
+struct ResultBase;
+
+class SegmentationModel : public ImageModel {
+public:
+ /// Constructor
+ /// @param modelFileName name of model to load
+    /// @param useAutoResize - if true, image will be resized by OpenVINO.
+ /// Otherwise, image will be preprocessed and resized using OpenCV routines.
+ /// @param layout - model input layout
+ SegmentationModel(const std::string& modelFileName, bool useAutoResize, const std::string& layout = "");
+
+ static std::vector<std::string> loadLabels(const std::string& labelFilename);
+
+ std::unique_ptr<ResultBase> postprocess(InferenceResult& infResult) override;
+
+protected:
+ void prepareInputsOutputs(std::shared_ptr<ov::Model>& model) override;
+
+ int outHeight = 0;
+ int outWidth = 0;
+ int outChannels = 0;
+};
diff --git a/python/openvino/runtime/common/models/include/models/style_transfer_model.h b/python/openvino/runtime/common/models/include/models/style_transfer_model.h
new file mode 100644
index 0000000..9bcc541
--- /dev/null
+++ b/python/openvino/runtime/common/models/include/models/style_transfer_model.h
@@ -0,0 +1,43 @@
+/*
+// Copyright (C) 2021-2022 Intel Corporation
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+*/
+
+#pragma once
+#include <memory>
+#include <string>
+
+#include "models/image_model.h"
+
+namespace ov {
+class InferRequest;
+class Model;
+} // namespace ov
+struct InferenceResult;
+struct InputData;
+struct InternalModelData;
+struct ResultBase;
+
+class StyleTransferModel : public ImageModel {
+public:
+ /// Constructor
+ /// @param modelFileName name of model to load
+ /// @param layout - model input layout
+ StyleTransferModel(const std::string& modelFileName, const std::string& layout = "");
+
+ std::unique_ptr<ResultBase> postprocess(InferenceResult& infResult) override;
+
+protected:
+ void prepareInputsOutputs(std::shared_ptr<ov::Model>& model) override;
+};
diff --git a/python/openvino/runtime/common/models/include/models/super_resolution_model.h b/python/openvino/runtime/common/models/include/models/super_resolution_model.h
new file mode 100644
index 0000000..773b5c3
--- /dev/null
+++ b/python/openvino/runtime/common/models/include/models/super_resolution_model.h
@@ -0,0 +1,49 @@
+/*
+// Copyright (C) 2021-2022 Intel Corporation
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+*/
+
+#pragma once
+#include <memory>
+#include <string>
+
+#include <opencv2/core/types.hpp>
+
+#include "models/image_model.h"
+
+namespace ov {
+class InferRequest;
+class Model;
+} // namespace ov
+struct InferenceResult;
+struct InputData;
+struct InternalModelData;
+struct ResultBase;
+
+class SuperResolutionModel : public ImageModel {
+public:
+ /// Constructor
+ /// @param modelFileName name of model to load
+ /// @param layout - model input layout
+ SuperResolutionModel(const std::string& modelFileName,
+ const cv::Size& inputImgSize,
+ const std::string& layout = "");
+
+ std::shared_ptr<InternalModelData> preprocess(const InputData& inputData, ov::InferRequest& request) override;
+ std::unique_ptr<ResultBase> postprocess(InferenceResult& infResult) override;
+
+protected:
+ void changeInputSize(std::shared_ptr<ov::Model>& model, int coeff);
+ void prepareInputsOutputs(std::shared_ptr<ov::Model>& model) override;
+};
diff --git a/python/openvino/runtime/common/models/src/associative_embedding_decoder.cpp b/python/openvino/runtime/common/models/src/associative_embedding_decoder.cpp
new file mode 100644
index 0000000..b1e8285
--- /dev/null
+++ b/python/openvino/runtime/common/models/src/associative_embedding_decoder.cpp
@@ -0,0 +1,201 @@
+/*
+// Copyright (C) 2021-2022 Intel Corporation
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+*/
+
+#include "models/associative_embedding_decoder.h"
+
+#include <algorithm>
+#include <iterator>
+#include <limits>
+#include <numeric>
+#include <vector>
+
+#include <utils/kuhn_munkres.hpp>
+
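+// Selects up to maxNumPeople candidate peaks for one joint: the NMS-suppressed heatmap is
+// partially sorted by response strength, and every candidate above detectionThreshold is
+// stored together with its score and the corresponding associative-embedding tag value.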
+void findPeaks(const std::vector<cv::Mat>& nmsHeatMaps,
+ const std::vector<cv::Mat>& aembdsMaps,
+ std::vector<std::vector<Peak>>& allPeaks,
+ size_t jointId,
+ size_t maxNumPeople,
+ float detectionThreshold) {
+ const cv::Mat& nmsHeatMap = nmsHeatMaps[jointId];
+ const float* heatMapData = nmsHeatMap.ptr<float>();
+ cv::Size outputSize = nmsHeatMap.size();
+
+ std::vector<int> indices(outputSize.area());
+ std::iota(std::begin(indices), std::end(indices), 0);
+ std::partial_sort(std::begin(indices),
+ std::begin(indices) + maxNumPeople,
+ std::end(indices),
+ [heatMapData](int l, int r) {
+ return heatMapData[l] > heatMapData[r];
+ });
+
+    allPeaks[jointId].reserve(maxNumPeople);
+    for (size_t personId = 0; personId < maxNumPeople; personId++) {
+        int index = indices[personId];
+        int x = index / outputSize.width;
+        int y = index % outputSize.width;
+        float tag = aembdsMaps[jointId].at<float>(x, y);
+        float score = heatMapData[index];
+        if (score > detectionThreshold) {
+            allPeaks[jointId].emplace_back(Peak{cv::Point2f(static_cast<float>(x), static_cast<float>(y)), score, tag});
+        }
+    }
+}
+
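+// Groups per-joint peaks into poses by their embedding tags. Joints are visited in a fixed
+// order (jointOrder); each peak is matched against the already grouped poses with a cost that
+// combines tag distance and spatial distance to the pose center, solved as an assignment
+// problem by the Kuhn-Munkres algorithm. Peaks that cannot be matched start new poses.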
+std::vector<Pose> matchByTag(std::vector<std::vector<Peak>>& allPeaks,
+ size_t maxNumPeople,
+ size_t numJoints,
+ float tagThreshold) {
+ size_t jointOrder[]{0, 1, 2, 3, 4, 5, 6, 11, 12, 7, 8, 9, 10, 13, 14, 15, 16};
+ std::vector<Pose> allPoses;
+ for (size_t jointId : jointOrder) {
+ std::vector<Peak>& jointPeaks = allPeaks[jointId];
+ std::vector<float> tags;
+ for (auto& peak : jointPeaks) {
+ tags.push_back(peak.tag);
+ }
+ if (allPoses.empty()) {
+ for (size_t personId = 0; personId < jointPeaks.size(); personId++) {
+ Peak peak = jointPeaks[personId];
+ Pose pose = Pose(numJoints);
+ pose.add(jointId, peak);
+ allPoses.push_back(pose);
+ }
+ continue;
+ }
+ if (jointPeaks.empty() || (allPoses.size() == maxNumPeople)) {
+ continue;
+ }
+ std::vector<float> posesTags;
+ std::vector<cv::Point2f> posesCenters;
+ for (auto& pose : allPoses) {
+ posesTags.push_back(pose.getPoseTag());
+ posesCenters.push_back(pose.getPoseCenter());
+ }
+ size_t numAdded = tags.size();
+ size_t numGrouped = posesTags.size();
+ cv::Mat tagsDiff(numAdded, numGrouped, CV_32F);
+ cv::Mat matchingCost(numAdded, numGrouped, CV_32F);
+ std::vector<float> dists(numAdded);
+ for (size_t j = 0; j < numGrouped; j++) {
+ float minDist = std::numeric_limits<float>::max();
+ // Compute euclidean distance (in spatial space) between the pose center and all joints.
+ const cv::Point2f center = posesCenters.at(j);
+ for (size_t i = 0; i < numAdded; i++) {
+ cv::Point2f v = jointPeaks.at(i).keypoint - center;
+ float dist = std::sqrt(v.x * v.x + v.y * v.y);
+ dists[i] = dist;
+ minDist = std::min(dist, minDist);
+ }
+ // Compute semantic distance (in embedding space) between the pose tag and all joints
+ // and corresponding matching costs.
+ auto poseTag = posesTags[j];
+ for (size_t i = 0; i < numAdded; i++) {
+ float diff = static_cast<float>(cv::norm(tags[i] - poseTag));
+ tagsDiff.at<float>(i, j) = diff;
+ if (diff < tagThreshold) {
+ diff *= dists[i] / (minDist + 1e-10f);
+ }
+ matchingCost.at<float>(i, j) = std::round(diff) * 100 - jointPeaks[i].score;
+ }
+ }
+
+ if (numAdded > numGrouped) {
+ cv::copyMakeBorder(matchingCost,
+ matchingCost,
+ 0,
+ 0,
+ 0,
+ numAdded - numGrouped,
+ cv::BORDER_CONSTANT,
+ 10000000);
+ }
+ // Get pairs
+ auto res = KuhnMunkres().Solve(matchingCost);
+ for (size_t row = 0; row < res.size(); row++) {
+ size_t col = res[row];
+ if (row < numAdded && col < numGrouped && tagsDiff.at<float>(row, col) < tagThreshold) {
+ allPoses[col].add(jointId, jointPeaks[row]);
+ } else {
+ Pose pose = Pose(numJoints);
+ pose.add(jointId, jointPeaks[row]);
+ allPoses.push_back(pose);
+ }
+ }
+ }
+ return allPoses;
+}
+
+namespace {
+cv::Point2f adjustLocation(const int x, const int y, const cv::Mat& heatMap) {
+ cv::Point2f delta(0.f, 0.f);
+ int width = heatMap.cols;
+ int height = heatMap.rows;
+ if ((1 < x) && (x < width - 1) && (1 < y) && (y < height - 1)) {
+ auto diffX = heatMap.at<float>(y, x + 1) - heatMap.at<float>(y, x - 1);
+ auto diffY = heatMap.at<float>(y + 1, x) - heatMap.at<float>(y - 1, x);
+ delta.x = diffX > 0 ? 0.25f : -0.25f;
+ delta.y = diffY > 0 ? 0.25f : -0.25f;
+ }
+ return delta;
+}
+} // namespace
+
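+// Post-processes one pose: joints that were detected (score > 0) are nudged a quarter pixel
+// towards the higher neighbouring heatmap response; missing joints are recovered by searching
+// for the location whose (rounded) embedding distance to the pose tag, minus the heatmap
+// response, is smallest, and are accepted only if the heatmap value there is positive.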
+void adjustAndRefine(std::vector<Pose>& allPoses,
+ const std::vector<cv::Mat>& heatMaps,
+ const std::vector<cv::Mat>& aembdsMaps,
+ int poseId,
+ const float delta) {
+ Pose& pose = allPoses[poseId];
+ float poseTag = pose.getPoseTag();
+ for (size_t jointId = 0; jointId < pose.size(); jointId++) {
+ Peak& peak = pose.getPeak(jointId);
+ const cv::Mat& heatMap = heatMaps[jointId];
+ const cv::Mat& aembds = aembdsMaps[jointId];
+
+ if (peak.score > 0) {
+ // Adjust
+ int x = static_cast<int>(peak.keypoint.x);
+ int y = static_cast<int>(peak.keypoint.y);
+ peak.keypoint += adjustLocation(x, y, heatMap);
+ if (delta) {
+ peak.keypoint.x += delta;
+ peak.keypoint.y += delta;
+ }
+ } else {
+ // Refine
+ // Get position with the closest tag value to the pose tag
+ cv::Mat diff = cv::abs(aembds - poseTag);
+ diff.convertTo(diff, CV_32S, 1.0, 0.0);
+ diff.convertTo(diff, CV_32F);
+ diff -= heatMap;
+ double min;
+ cv::Point2i minLoc;
+ cv::minMaxLoc(diff, &min, 0, &minLoc);
+ int x = minLoc.x;
+ int y = minLoc.y;
+ float val = heatMap.at<float>(y, x);
+ if (val > 0) {
+ peak.keypoint.x = static_cast<float>(x);
+ peak.keypoint.y = static_cast<float>(y);
+ peak.keypoint += adjustLocation(x, y, heatMap);
+ // Peak score is assigned directly, so it does not affect the pose score.
+ peak.score = val;
+ }
+ }
+ }
+}
diff --git a/python/openvino/runtime/common/models/src/classification_model.cpp b/python/openvino/runtime/common/models/src/classification_model.cpp
new file mode 100644
index 0000000..90bc0d5
--- /dev/null
+++ b/python/openvino/runtime/common/models/src/classification_model.cpp
@@ -0,0 +1,196 @@
+/*
+// Copyright (C) 2020-2022 Intel Corporation
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+*/
+
+#include "models/classification_model.h"
+
+#include <algorithm>
+#include <fstream>
+#include <iterator>
+#include <map>
+#include <stdexcept>
+#include <string>
+#include <utility>
+#include <vector>
+
+#include <openvino/op/softmax.hpp>
+#include <openvino/op/topk.hpp>
+#include <openvino/openvino.hpp>
+
+#include <utils/slog.hpp>
+
+#include "models/results.h"
+
+ClassificationModel::ClassificationModel(const std::string& modelFileName,
+ size_t nTop,
+ bool useAutoResize,
+ const std::vector<std::string>& labels,
+ const std::string& layout)
+ : ImageModel(modelFileName, useAutoResize, layout),
+ nTop(nTop),
+ labels(labels) {}
+
+std::unique_ptr<ResultBase> ClassificationModel::postprocess(InferenceResult& infResult) {
+ const ov::Tensor& indicesTensor = infResult.outputsData.find(outputsNames[0])->second;
+ const int* indicesPtr = indicesTensor.data<int>();
+ const ov::Tensor& scoresTensor = infResult.outputsData.find(outputsNames[1])->second;
+ const float* scoresPtr = scoresTensor.data<float>();
+
+ ClassificationResult* result = new ClassificationResult(infResult.frameId, infResult.metaData);
+ auto retVal = std::unique_ptr<ResultBase>(result);
+
+ result->topLabels.reserve(scoresTensor.get_size());
+ for (size_t i = 0; i < scoresTensor.get_size(); ++i) {
+ int ind = indicesPtr[i];
+ if (ind < 0 || ind >= static_cast<int>(labels.size())) {
+ throw std::runtime_error("Invalid index for the class label is found during postprocessing");
+ }
+ result->topLabels.emplace_back(ind, labels[ind], scoresPtr[i]);
+ }
+
+ return retVal;
+}
+
+std::vector<std::string> ClassificationModel::loadLabels(const std::string& labelFilename) {
+ std::vector<std::string> labels;
+
+ /* Read labels */
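+    // The parser below expects one class per line in the form "<id> <label>[, synonyms...]":
+    // the label is the text between the first space and the first comma (if any), so a line
+    // such as "n01440764 tench, Tinca tinca" yields the label "tench".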
+ std::ifstream inputFile(labelFilename);
+ if (!inputFile.is_open())
+ throw std::runtime_error("Can't open the labels file: " + labelFilename);
+ std::string labelsLine;
+ while (std::getline(inputFile, labelsLine)) {
+ size_t labelBeginIdx = labelsLine.find(' ');
+ size_t labelEndIdx = labelsLine.find(','); // can be npos when class has only one label
+ if (labelBeginIdx == std::string::npos) {
+ throw std::runtime_error("The labels file has incorrect format.");
+ }
+ labels.push_back(labelsLine.substr(labelBeginIdx + 1, labelEndIdx - (labelBeginIdx + 1)));
+ }
+ if (labels.empty())
+ throw std::logic_error("File is empty: " + labelFilename);
+
+ return labels;
+}
+
+void ClassificationModel::prepareInputsOutputs(std::shared_ptr<ov::Model>& model) {
+ // --------------------------- Configure input & output -------------------------------------------------
+ // --------------------------- Prepare input ------------------------------------------------------
+ if (model->inputs().size() != 1) {
+ throw std::logic_error("Classification model wrapper supports topologies with only 1 input");
+ }
+ const auto& input = model->input();
+ inputsNames.push_back(input.get_any_name());
+
+ const ov::Shape& inputShape = input.get_shape();
+ const ov::Layout& inputLayout = getInputLayout(input);
+
+ if (inputShape.size() != 4 || inputShape[ov::layout::channels_idx(inputLayout)] != 3) {
+ throw std::logic_error("3-channel 4-dimensional model's input is expected");
+ }
+
+ const auto width = inputShape[ov::layout::width_idx(inputLayout)];
+ const auto height = inputShape[ov::layout::height_idx(inputLayout)];
+ if (height != width) {
+ throw std::logic_error("Model input has incorrect image shape. Must be NxN square."
+ " Got " +
+ std::to_string(height) + "x" + std::to_string(width) + ".");
+ }
+
+ ov::preprocess::PrePostProcessor ppp(model);
+ ppp.input().tensor().set_element_type(ov::element::u8).set_layout({"NHWC"});
+
+ if (useAutoResize) {
+ ppp.input().tensor().set_spatial_dynamic_shape();
+
+ ppp.input()
+ .preprocess()
+ .convert_element_type(ov::element::f32)
+ .resize(ov::preprocess::ResizeAlgorithm::RESIZE_LINEAR);
+ }
+
+ ppp.input().model().set_layout(inputLayout);
+
+ // --------------------------- Prepare output -----------------------------------------------------
+ if (model->outputs().size() != 1) {
+ throw std::logic_error("Classification model wrapper supports topologies with only 1 output");
+ }
+
+ const ov::Shape& outputShape = model->output().get_shape();
+ if (outputShape.size() != 2 && outputShape.size() != 4) {
+ throw std::logic_error("Classification model wrapper supports topologies only with"
+ " 2-dimensional or 4-dimensional output");
+ }
+
+ const ov::Layout outputLayout("NCHW");
+ if (outputShape.size() == 4 && (outputShape[ov::layout::height_idx(outputLayout)] != 1 ||
+ outputShape[ov::layout::width_idx(outputLayout)] != 1)) {
+ throw std::logic_error("Classification model wrapper supports topologies only"
+ " with 4-dimensional output which has last two dimensions of size 1");
+ }
+
+ size_t classesNum = outputShape[ov::layout::channels_idx(outputLayout)];
+ if (nTop > classesNum) {
+ throw std::logic_error("The model provides " + std::to_string(classesNum) + " classes, but " +
+ std::to_string(nTop) + " labels are requested to be predicted");
+ }
+ if (classesNum == labels.size() + 1) {
+ labels.insert(labels.begin(), "other");
+ slog::warn << "Inserted 'other' label as first." << slog::endl;
+ } else if (classesNum != labels.size()) {
+ throw std::logic_error("Model's number of classes and parsed labels must match (" +
+ std::to_string(outputShape[1]) + " and " + std::to_string(labels.size()) + ')');
+ }
+
+ ppp.output().tensor().set_element_type(ov::element::f32);
+ model = ppp.build();
+
+ // --------------------------- Adding softmax and topK output ---------------------------
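+    // If the topology does not already end with a Softmax, one is appended to the logits, and a
+    // TopK node (k = nTop, axis 1, sorted by value) is added on top. The graph is then rebuilt
+    // with two result nodes that are exposed as the "indices" and "scores" output tensors used
+    // in postprocess().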
+ auto nodes = model->get_ops();
+ auto softmaxNodeIt = std::find_if(std::begin(nodes), std::end(nodes), [](const std::shared_ptr<ov::Node>& op) {
+ return std::string(op->get_type_name()) == "Softmax";
+ });
+
+ std::shared_ptr<ov::Node> softmaxNode;
+ if (softmaxNodeIt == nodes.end()) {
+ auto logitsNode = model->get_output_op(0)->input(0).get_source_output().get_node();
+ softmaxNode = std::make_shared<ov::op::v1::Softmax>(logitsNode->output(0), 1);
+ } else {
+ softmaxNode = *softmaxNodeIt;
+ }
+ const auto k = std::make_shared<ov::op::v0::Constant>(ov::element::i32, ov::Shape{}, std::vector<size_t>{nTop});
+ std::shared_ptr<ov::Node> topkNode = std::make_shared<ov::op::v3::TopK>(softmaxNode,
+ k,
+ 1,
+ ov::op::v3::TopK::Mode::MAX,
+ ov::op::v3::TopK::SortType::SORT_VALUES);
+
+ auto indices = std::make_shared<ov::op::v0::Result>(topkNode->output(0));
+ auto scores = std::make_shared<ov::op::v0::Result>(topkNode->output(1));
+ ov::ResultVector res({scores, indices});
+ model = std::make_shared<ov::Model>(res, model->get_parameters(), "classification");
+
+ // manually set output tensors name for created topK node
+ model->outputs()[0].set_names({"indices"});
+ outputsNames.push_back("indices");
+ model->outputs()[1].set_names({"scores"});
+ outputsNames.push_back("scores");
+
+ // set output precisions
+ ppp = ov::preprocess::PrePostProcessor(model);
+ ppp.output("indices").tensor().set_element_type(ov::element::i32);
+ ppp.output("scores").tensor().set_element_type(ov::element::f32);
+ model = ppp.build();
+}
diff --git a/python/openvino/runtime/common/models/src/deblurring_model.cpp b/python/openvino/runtime/common/models/src/deblurring_model.cpp
new file mode 100644
index 0000000..261efb3
--- /dev/null
+++ b/python/openvino/runtime/common/models/src/deblurring_model.cpp
@@ -0,0 +1,158 @@
+/*
+// Copyright (C) 2021-2022 Intel Corporation
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+*/
+
+#include "models/deblurring_model.h"
+
+#include <algorithm>
+#include <stdexcept>
+#include <string>
+#include <vector>
+
+#include <opencv2/imgproc.hpp>
+#include <openvino/openvino.hpp>
+
+#include <utils/ocv_common.hpp>
+#include <utils/slog.hpp>
+
+#include "models/input_data.h"
+#include "models/internal_model_data.h"
+#include "models/results.h"
+
+DeblurringModel::DeblurringModel(const std::string& modelFileName,
+ const cv::Size& inputImgSize,
+ const std::string& layout)
+ : ImageModel(modelFileName, false, layout) {
+ netInputHeight = inputImgSize.height;
+ netInputWidth = inputImgSize.width;
+}
+
+void DeblurringModel::prepareInputsOutputs(std::shared_ptr<ov::Model>& model) {
+ // --------------------------- Configure input & output -------------------------------------------------
+ // --------------------------- Prepare input ------------------------------------------------------
+ if (model->inputs().size() != 1) {
+ throw std::logic_error("Deblurring model wrapper supports topologies with only 1 input");
+ }
+
+ inputsNames.push_back(model->input().get_any_name());
+
+ const ov::Shape& inputShape = model->input().get_shape();
+ const ov::Layout& inputLayout = getInputLayout(model->input());
+
+ if (inputShape.size() != 4 || inputShape[ov::layout::batch_idx(inputLayout)] != 1 ||
+ inputShape[ov::layout::channels_idx(inputLayout)] != 3) {
+ throw std::logic_error("3-channel 4-dimensional model's input is expected");
+ }
+
+ ov::preprocess::PrePostProcessor ppp(model);
+ ppp.input().tensor().set_element_type(ov::element::u8).set_layout("NHWC");
+
+ ppp.input().model().set_layout(inputLayout);
+
+ // --------------------------- Prepare output -----------------------------------------------------
+ if (model->outputs().size() != 1) {
+ throw std::logic_error("Deblurring model wrapper supports topologies with only 1 output");
+ }
+
+ outputsNames.push_back(model->output().get_any_name());
+
+ const ov::Shape& outputShape = model->output().get_shape();
+ const ov::Layout outputLayout("NCHW");
+ if (outputShape.size() != 4 || outputShape[ov::layout::batch_idx(outputLayout)] != 1 ||
+ outputShape[ov::layout::channels_idx(outputLayout)] != 3) {
+ throw std::logic_error("3-channel 4-dimensional model's output is expected");
+ }
+
+ ppp.output().tensor().set_element_type(ov::element::f32);
+ model = ppp.build();
+
+ changeInputSize(model);
+}
+
+void DeblurringModel::changeInputSize(std::shared_ptr<ov::Model>& model) {
+ const ov::Layout& layout = ov::layout::get_layout(model->input());
+ ov::Shape inputShape = model->input().get_shape();
+
+ const auto batchId = ov::layout::batch_idx(layout);
+ const auto heightId = ov::layout::height_idx(layout);
+ const auto widthId = ov::layout::width_idx(layout);
+
+ if (inputShape[heightId] % stride || inputShape[widthId] % stride) {
+        throw std::logic_error("Model input shape HxW = " + std::to_string(inputShape[heightId]) + "x" +
+                               std::to_string(inputShape[widthId]) + " must be divisible by stride " +
+                               std::to_string(stride));
+ }
+
+ netInputHeight = static_cast<int>((netInputHeight + stride - 1) / stride) * stride;
+ netInputWidth = static_cast<int>((netInputWidth + stride - 1) / stride) * stride;
+
+ inputShape[batchId] = 1;
+ inputShape[heightId] = netInputHeight;
+ inputShape[widthId] = netInputWidth;
+
+ model->reshape(inputShape);
+}
+
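+// The network input was rounded up to a multiple of stride in changeInputSize(). If the image
+// is at most one stride smaller than the input in both dimensions, it is zero-padded on the
+// bottom/right to the exact input size; otherwise it is resized, which may distort the aspect
+// ratio (a warning is printed in that case).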
+std::shared_ptr<InternalModelData> DeblurringModel::preprocess(const InputData& inputData, ov::InferRequest& request) {
+ auto& image = inputData.asRef<ImageInputData>().inputImage;
+ size_t h = image.rows;
+ size_t w = image.cols;
+ cv::Mat resizedImage;
+
+ if (netInputHeight - stride < h && h <= netInputHeight && netInputWidth - stride < w && w <= netInputWidth) {
+ int bottom = netInputHeight - h;
+ int right = netInputWidth - w;
+ cv::copyMakeBorder(image, resizedImage, 0, bottom, 0, right, cv::BORDER_CONSTANT, 0);
+ } else {
+ slog::warn << "\tChosen model aspect ratio doesn't match image aspect ratio" << slog::endl;
+ cv::resize(image, resizedImage, cv::Size(netInputWidth, netInputHeight));
+ }
+ request.set_input_tensor(wrapMat2Tensor(resizedImage));
+
+ return std::make_shared<InternalImageModelData>(image.cols, image.rows);
+}
+
+std::unique_ptr<ResultBase> DeblurringModel::postprocess(InferenceResult& infResult) {
+ ImageResult* result = new ImageResult;
+ *static_cast<ResultBase*>(result) = static_cast<ResultBase&>(infResult);
+
+ const auto& inputImgSize = infResult.internalModelData->asRef<InternalImageModelData>();
+ const auto outputData = infResult.getFirstOutputTensor().data<float>();
+
+ std::vector<cv::Mat> imgPlanes;
+ const ov::Shape& outputShape = infResult.getFirstOutputTensor().get_shape();
+ const ov::Layout outputLayout("NCHW");
+ size_t outHeight = static_cast<int>((outputShape[ov::layout::height_idx(outputLayout)]));
+ size_t outWidth = static_cast<int>((outputShape[ov::layout::width_idx(outputLayout)]));
+ size_t numOfPixels = outWidth * outHeight;
+ imgPlanes = std::vector<cv::Mat>{cv::Mat(outHeight, outWidth, CV_32FC1, &(outputData[0])),
+ cv::Mat(outHeight, outWidth, CV_32FC1, &(outputData[numOfPixels])),
+ cv::Mat(outHeight, outWidth, CV_32FC1, &(outputData[numOfPixels * 2]))};
+ cv::Mat resultImg;
+ cv::merge(imgPlanes, resultImg);
+
+ if (netInputHeight - stride < static_cast<size_t>(inputImgSize.inputImgHeight) &&
+ static_cast<size_t>(inputImgSize.inputImgHeight) <= netInputHeight &&
+ netInputWidth - stride < static_cast<size_t>(inputImgSize.inputImgWidth) &&
+ static_cast<size_t>(inputImgSize.inputImgWidth) <= netInputWidth) {
+ result->resultImage = resultImg(cv::Rect(0, 0, inputImgSize.inputImgWidth, inputImgSize.inputImgHeight));
+ } else {
+ cv::resize(resultImg, result->resultImage, cv::Size(inputImgSize.inputImgWidth, inputImgSize.inputImgHeight));
+ }
+
+ result->resultImage.convertTo(result->resultImage, CV_8UC3, 255);
+
+ return std::unique_ptr<ResultBase>(result);
+}
diff --git a/python/openvino/runtime/common/models/src/detection_model.cpp b/python/openvino/runtime/common/models/src/detection_model.cpp
new file mode 100644
index 0000000..83e2d22
--- /dev/null
+++ b/python/openvino/runtime/common/models/src/detection_model.cpp
@@ -0,0 +1,52 @@
+/*
+// Copyright (C) 2020-2022 Intel Corporation
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+*/
+
+#include "models/detection_model.h"
+
+#include <fstream>
+#include <stdexcept>
+#include <string>
+#include <vector>
+
+#include "models/image_model.h"
+
+DetectionModel::DetectionModel(const std::string& modelFileName,
+ float confidenceThreshold,
+ bool useAutoResize,
+ const std::vector<std::string>& labels,
+ const std::string& layout)
+ : ImageModel(modelFileName, useAutoResize, layout),
+ confidenceThreshold(confidenceThreshold),
+ labels(labels) {}
+
+std::vector<std::string> DetectionModel::loadLabels(const std::string& labelFilename) {
+ std::vector<std::string> labelsList;
+
+ /* Read labels (if any) */
+ if (!labelFilename.empty()) {
+ std::ifstream inputFile(labelFilename);
+ if (!inputFile.is_open())
+ throw std::runtime_error("Can't open the labels file: " + labelFilename);
+ std::string label;
+ while (std::getline(inputFile, label)) {
+ labelsList.push_back(label);
+ }
+ if (labelsList.empty())
+ throw std::logic_error("File is empty: " + labelFilename);
+ }
+
+ return labelsList;
+}
diff --git a/python/openvino/runtime/common/models/src/detection_model_centernet.cpp b/python/openvino/runtime/common/models/src/detection_model_centernet.cpp
new file mode 100644
index 0000000..eac42a7
--- /dev/null
+++ b/python/openvino/runtime/common/models/src/detection_model_centernet.cpp
@@ -0,0 +1,302 @@
+/*
+// Copyright (C) 2020-2022 Intel Corporation
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+*/
+
+#include "models/detection_model_centernet.h"
+
+#include <stddef.h>
+
+#include <algorithm>
+#include <cmath>
+#include <map>
+#include <stdexcept>
+#include <utility>
+
+#include <opencv2/core.hpp>
+#include <opencv2/imgproc.hpp>
+#include <openvino/openvino.hpp>
+
+#include <utils/common.hpp>
+#include <utils/image_utils.h>
+#include <utils/ocv_common.hpp>
+
+#include "models/input_data.h"
+#include "models/internal_model_data.h"
+#include "models/results.h"
+
+ModelCenterNet::ModelCenterNet(const std::string& modelFileName,
+ float confidenceThreshold,
+ const std::vector<std::string>& labels,
+ const std::string& layout)
+ : DetectionModel(modelFileName, confidenceThreshold, false, labels, layout) {}
+
+void ModelCenterNet::prepareInputsOutputs(std::shared_ptr<ov::Model>& model) {
+ // --------------------------- Configure input & output -------------------------------------------------
+ // --------------------------- Prepare input ------------------------------------------------------
+ if (model->inputs().size() != 1) {
+ throw std::logic_error("CenterNet model wrapper expects models that have only 1 input");
+ }
+
+ const ov::Shape& inputShape = model->input().get_shape();
+ const ov::Layout& inputLayout = getInputLayout(model->input());
+
+ if (inputShape[ov::layout::channels_idx(inputLayout)] != 3) {
+ throw std::logic_error("Expected 3-channel input");
+ }
+
+ ov::preprocess::PrePostProcessor ppp(model);
+ inputTransform.setPrecision(ppp, model->input().get_any_name());
+ ppp.input().tensor().set_layout("NHWC");
+
+ ppp.input().model().set_layout(inputLayout);
+
+ // --------------------------- Reading image input parameters -------------------------------------------
+ inputsNames.push_back(model->input().get_any_name());
+ netInputWidth = inputShape[ov::layout::width_idx(inputLayout)];
+ netInputHeight = inputShape[ov::layout::height_idx(inputLayout)];
+
+ // --------------------------- Prepare output -----------------------------------------------------
+ if (model->outputs().size() != 3) {
+ throw std::logic_error("CenterNet model wrapper expects models that have 3 outputs");
+ }
+
+ const ov::Layout outLayout{"NCHW"};
+ for (const auto& output : model->outputs()) {
+ auto outTensorName = output.get_any_name();
+ outputsNames.push_back(outTensorName);
+ ppp.output(outTensorName).tensor().set_element_type(ov::element::f32).set_layout(outLayout);
+ }
+ std::sort(outputsNames.begin(), outputsNames.end());
+ model = ppp.build();
+}
+
+cv::Point2f getDir(const cv::Point2f& srcPoint, float rotRadius) {
+ float sn = sinf(rotRadius);
+ float cs = cosf(rotRadius);
+
+ cv::Point2f srcResult(0.0f, 0.0f);
+ srcResult.x = srcPoint.x * cs - srcPoint.y * sn;
+ srcResult.y = srcPoint.x * sn + srcPoint.y * cs;
+
+ return srcResult;
+}
+
+cv::Point2f get3rdPoint(const cv::Point2f& a, const cv::Point2f& b) {
+ cv::Point2f direct = a - b;
+ return b + cv::Point2f(-direct.y, direct.x);
+}
+
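+// Builds the 2x3 affine transform used in CenterNet pre/post-processing: it maps a region of
+// extent srcW centred at (centerX, centerY), optionally rotated by rot degrees, onto an
+// outputWidth x outputHeight map, or the inverse mapping when inv is true.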
+cv::Mat getAffineTransform(float centerX,
+ float centerY,
+ int srcW,
+ float rot,
+ size_t outputWidth,
+ size_t outputHeight,
+ bool inv = false) {
+ float rotRad = static_cast<float>(CV_PI) * rot / 180.0f;
+ auto srcDir = getDir({0.0f, -0.5f * srcW}, rotRad);
+ cv::Point2f dstDir(0.0f, -0.5f * outputWidth);
+ std::vector<cv::Point2f> src(3, {0.0f, 0.0f});
+ std::vector<cv::Point2f> dst(3, {0.0f, 0.0f});
+
+ src[0] = {centerX, centerY};
+ src[1] = srcDir + src[0];
+ src[2] = get3rdPoint(src[0], src[1]);
+
+ dst[0] = {outputWidth * 0.5f, outputHeight * 0.5f};
+ dst[1] = dst[0] + dstDir;
+ dst[2] = get3rdPoint(dst[0], dst[1]);
+
+ cv::Mat trans;
+ if (inv) {
+ trans = cv::getAffineTransform(dst, src);
+ } else {
+ trans = cv::getAffineTransform(src, dst);
+ }
+
+ return trans;
+}
+
+std::shared_ptr<InternalModelData> ModelCenterNet::preprocess(const InputData& inputData, ov::InferRequest& request) {
+ auto& img = inputData.asRef<ImageInputData>().inputImage;
+ const auto& resizedImg = resizeImageExt(img, netInputWidth, netInputHeight, RESIZE_KEEP_ASPECT_LETTERBOX);
+
+ request.set_input_tensor(wrapMat2Tensor(inputTransform(resizedImg)));
+ return std::make_shared<InternalImageModelData>(img.cols, img.rows);
+}
+
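+// Peak extraction on the class heatmaps: scores are passed through a sigmoid, thresholded, and
+// a candidate is kept only if it is the maximum within a kernel x kernel (default 3x3)
+// neighbourhood, emulating the max-pooling based NMS used by CenterNet.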
+std::vector<std::pair<size_t, float>> nms(float* scoresPtr, const ov::Shape& shape, float threshold, int kernel = 3) {
+ std::vector<std::pair<size_t, float>> scores;
+ scores.reserve(ModelCenterNet::INIT_VECTOR_SIZE);
+ auto chSize = shape[2] * shape[3];
+
+ for (size_t i = 0; i < shape[1] * shape[2] * shape[3]; ++i) {
+ scoresPtr[i] = expf(scoresPtr[i]) / (1 + expf(scoresPtr[i]));
+ }
+
+ for (size_t ch = 0; ch < shape[1]; ++ch) {
+ for (size_t w = 0; w < shape[2]; ++w) {
+ for (size_t h = 0; h < shape[3]; ++h) {
+ float max = scoresPtr[chSize * ch + shape[2] * w + h];
+
+ // --------------------- filter on threshold--------------------------------------
+ if (max < threshold) {
+ continue;
+ }
+
+ // --------------------- store index and score------------------------------------
+ scores.push_back({chSize * ch + shape[2] * w + h, max});
+
+ bool next = true;
+ // ---------------------- maxpool2d -----------------------------------------------
+ for (int i = -kernel / 2; i < kernel / 2 + 1 && next; ++i) {
+ for (int j = -kernel / 2; j < kernel / 2 + 1; ++j) {
+ if (w + i >= 0 && w + i < shape[2] && h + j >= 0 && h + j < shape[3]) {
+ if (scoresPtr[chSize * ch + shape[2] * (w + i) + h + j] > max) {
+ scores.pop_back();
+ next = false;
+ break;
+ }
+ } else {
+ if (max < 0) {
+ scores.pop_back();
+ next = false;
+ break;
+ }
+ }
+ }
+ }
+ }
+ }
+ }
+
+ return scores;
+}
+
+static std::vector<std::pair<size_t, float>> filterScores(const ov::Tensor& scoresTensor, float threshold) {
+ auto shape = scoresTensor.get_shape();
+ float* scoresPtr = scoresTensor.data<float>();
+
+ return nms(scoresPtr, shape, threshold);
+}
+
+std::vector<std::pair<float, float>> filterReg(const ov::Tensor& regressionTensor,
+ const std::vector<std::pair<size_t, float>>& scores,
+ size_t chSize) {
+ const float* regPtr = regressionTensor.data<float>();
+ std::vector<std::pair<float, float>> reg;
+
+ for (auto s : scores) {
+ reg.push_back({regPtr[s.first % chSize], regPtr[chSize + s.first % chSize]});
+ }
+
+ return reg;
+}
+
+std::vector<std::pair<float, float>> filterWH(const ov::Tensor& whTensor,
+ const std::vector<std::pair<size_t, float>>& scores,
+ size_t chSize) {
+ const float* whPtr = whTensor.data<float>();
+ std::vector<std::pair<float, float>> wh;
+
+ for (auto s : scores) {
+ wh.push_back({whPtr[s.first % chSize], whPtr[chSize + s.first % chSize]});
+ }
+
+ return wh;
+}
+
+std::vector<ModelCenterNet::BBox> calcBoxes(const std::vector<std::pair<size_t, float>>& scores,
+ const std::vector<std::pair<float, float>>& reg,
+ const std::vector<std::pair<float, float>>& wh,
+ const ov::Shape& shape) {
+ std::vector<ModelCenterNet::BBox> boxes(scores.size());
+
+ for (size_t i = 0; i < boxes.size(); ++i) {
+ size_t chIdx = scores[i].first % (shape[2] * shape[3]);
+ auto xCenter = chIdx % shape[3];
+ auto yCenter = chIdx / shape[3];
+
+ boxes[i].left = xCenter + reg[i].first - wh[i].first / 2.0f;
+ boxes[i].top = yCenter + reg[i].second - wh[i].second / 2.0f;
+ boxes[i].right = xCenter + reg[i].first + wh[i].first / 2.0f;
+ boxes[i].bottom = yCenter + reg[i].second + wh[i].second / 2.0f;
+ }
+
+ return boxes;
+}
+
+void transform(std::vector<ModelCenterNet::BBox>& boxes,
+ const ov::Shape& shape,
+ int scale,
+ float centerX,
+ float centerY) {
+ cv::Mat1f trans = getAffineTransform(centerX, centerY, scale, 0, shape[2], shape[3], true);
+
+ for (auto& b : boxes) {
+ ModelCenterNet::BBox newbb;
+
+ newbb.left = trans.at<float>(0, 0) * b.left + trans.at<float>(0, 1) * b.top + trans.at<float>(0, 2);
+ newbb.top = trans.at<float>(1, 0) * b.left + trans.at<float>(1, 1) * b.top + trans.at<float>(1, 2);
+ newbb.right = trans.at<float>(0, 0) * b.right + trans.at<float>(0, 1) * b.bottom + trans.at<float>(0, 2);
+ newbb.bottom = trans.at<float>(1, 0) * b.right + trans.at<float>(1, 1) * b.bottom + trans.at<float>(1, 2);
+
+ b = newbb;
+ }
+}
+
+std::unique_ptr<ResultBase> ModelCenterNet::postprocess(InferenceResult& infResult) {
+ // --------------------------- Filter data and get valid indices ---------------------------------
+ const auto& heatmapTensor = infResult.outputsData[outputsNames[0]];
+ const auto& heatmapTensorShape = heatmapTensor.get_shape();
+ const auto chSize = heatmapTensorShape[2] * heatmapTensorShape[3];
+ const auto scores = filterScores(heatmapTensor, confidenceThreshold);
+
+ const auto& regressionTensor = infResult.outputsData[outputsNames[1]];
+ const auto reg = filterReg(regressionTensor, scores, chSize);
+
+ const auto& whTensor = infResult.outputsData[outputsNames[2]];
+ const auto wh = filterWH(whTensor, scores, chSize);
+
+ // --------------------------- Calculate bounding boxes & apply inverse affine transform ----------
+ auto boxes = calcBoxes(scores, reg, wh, heatmapTensorShape);
+
+ const auto imgWidth = infResult.internalModelData->asRef<InternalImageModelData>().inputImgWidth;
+ const auto imgHeight = infResult.internalModelData->asRef<InternalImageModelData>().inputImgHeight;
+ const auto scale = std::max(imgWidth, imgHeight);
+ const float centerX = imgWidth / 2.0f;
+ const float centerY = imgHeight / 2.0f;
+
+ transform(boxes, heatmapTensorShape, scale, centerX, centerY);
+
+ // --------------------------- Create detection result objects ------------------------------------
+ DetectionResult* result = new DetectionResult(infResult.frameId, infResult.metaData);
+
+ result->objects.reserve(scores.size());
+ for (size_t i = 0; i < scores.size(); ++i) {
+ DetectedObject desc;
+ desc.confidence = scores[i].second;
+ desc.labelID = scores[i].first / chSize;
+ desc.label = getLabelName(desc.labelID);
+ desc.x = clamp(boxes[i].left, 0.f, static_cast<float>(imgWidth));
+ desc.y = clamp(boxes[i].top, 0.f, static_cast<float>(imgHeight));
+ desc.width = clamp(boxes[i].getWidth(), 0.f, static_cast<float>(imgWidth));
+ desc.height = clamp(boxes[i].getHeight(), 0.f, static_cast<float>(imgHeight));
+
+ result->objects.push_back(desc);
+ }
+
+ return std::unique_ptr<ResultBase>(result);
+}
diff --git a/python/openvino/runtime/common/models/src/detection_model_faceboxes.cpp b/python/openvino/runtime/common/models/src/detection_model_faceboxes.cpp
new file mode 100644
index 0000000..bb349a6
--- /dev/null
+++ b/python/openvino/runtime/common/models/src/detection_model_faceboxes.cpp
@@ -0,0 +1,261 @@
+/*
+// Copyright (C) 2020-2022 Intel Corporation
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+*/
+
+#include "models/detection_model_faceboxes.h"
+
+#include <algorithm>
+#include <cmath>
+#include <map>
+#include <stdexcept>
+
+#include <openvino/openvino.hpp>
+
+#include <utils/common.hpp>
+#include <utils/nms.hpp>
+#include <utils/ocv_common.hpp>
+
+#include "models/internal_model_data.h"
+#include "models/results.h"
+
+ModelFaceBoxes::ModelFaceBoxes(const std::string& modelFileName,
+ float confidenceThreshold,
+ bool useAutoResize,
+ float boxIOUThreshold,
+ const std::string& layout)
+ : DetectionModel(modelFileName, confidenceThreshold, useAutoResize, {"Face"}, layout),
+ maxProposalsCount(0),
+ boxIOUThreshold(boxIOUThreshold),
+ variance({0.1f, 0.2f}),
+ steps({32, 64, 128}),
+ minSizes({{32, 64, 128}, {256}, {512}}) {}
+
+void ModelFaceBoxes::prepareInputsOutputs(std::shared_ptr<ov::Model>& model) {
+ // --------------------------- Configure input & output -------------------------------------------------
+ // --------------------------- Prepare input ------------------------------------------------------
+ if (model->inputs().size() != 1) {
+ throw std::logic_error("FaceBoxes model wrapper expects models that have only 1 input");
+ }
+
+ const ov::Shape& inputShape = model->input().get_shape();
+ const ov::Layout& inputLayout = getInputLayout(model->input());
+
+ if (inputShape[ov::layout::channels_idx(inputLayout)] != 3) {
+ throw std::logic_error("Expected 3-channel input");
+ }
+
+ ov::preprocess::PrePostProcessor ppp(model);
+ inputTransform.setPrecision(ppp, model->input().get_any_name());
+ ppp.input().tensor().set_layout({"NHWC"});
+
+ if (useAutoResize) {
+ ppp.input().tensor().set_spatial_dynamic_shape();
+
+ ppp.input()
+ .preprocess()
+ .convert_element_type(ov::element::f32)
+ .resize(ov::preprocess::ResizeAlgorithm::RESIZE_LINEAR);
+ }
+
+ ppp.input().model().set_layout(inputLayout);
+
+ // --------------------------- Reading image input parameters -------------------------------------------
+ inputsNames.push_back(model->input().get_any_name());
+ netInputWidth = inputShape[ov::layout::width_idx(inputLayout)];
+ netInputHeight = inputShape[ov::layout::height_idx(inputLayout)];
+
+ // --------------------------- Prepare output -----------------------------------------------------
+ if (model->outputs().size() != 2) {
+ throw std::logic_error("FaceBoxes model wrapper expects models that have 2 outputs");
+ }
+
+ const ov::Layout outputLayout{"CHW"};
+ maxProposalsCount = model->outputs().front().get_shape()[ov::layout::height_idx(outputLayout)];
+ for (const auto& output : model->outputs()) {
+ const auto outTensorName = output.get_any_name();
+ outputsNames.push_back(outTensorName);
+ ppp.output(outTensorName).tensor().set_element_type(ov::element::f32).set_layout(outputLayout);
+ }
+ std::sort(outputsNames.begin(), outputsNames.end());
+ model = ppp.build();
+
+ // --------------------------- Calculating anchors ----------------------------------------------------
+ std::vector<std::pair<size_t, size_t>> featureMaps;
+ for (auto s : steps) {
+ featureMaps.push_back({netInputHeight / s, netInputWidth / s});
+ }
+
+ priorBoxes(featureMaps);
+}
+
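+// Anchor generation for FaceBoxes: for every feature-map cell a set of square anchors of size
+// minSize is placed at the sub-cell offsets supplied by the caller (denser sampling for the
+// smallest sizes, see calculateAnchorsZeroLevel), with coordinates scaled by the stride of the
+// corresponding feature map.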
+void calculateAnchors(std::vector<Anchor>& anchors,
+ const std::vector<float>& vx,
+ const std::vector<float>& vy,
+ const int minSize,
+ const int step) {
+ float skx = static_cast<float>(minSize);
+ float sky = static_cast<float>(minSize);
+
+ std::vector<float> dense_cx, dense_cy;
+
+ for (auto x : vx) {
+ dense_cx.push_back(x * step);
+ }
+
+ for (auto y : vy) {
+ dense_cy.push_back(y * step);
+ }
+
+ for (auto cy : dense_cy) {
+ for (auto cx : dense_cx) {
+ anchors.push_back(
+ {cx - 0.5f * skx, cy - 0.5f * sky, cx + 0.5f * skx, cy + 0.5f * sky}); // left top right bottom
+ }
+ }
+}
+
+void calculateAnchorsZeroLevel(std::vector<Anchor>& anchors,
+ const int fx,
+ const int fy,
+ const std::vector<int>& minSizes,
+ const int step) {
+ for (auto s : minSizes) {
+ std::vector<float> vx, vy;
+ if (s == 32) {
+ vx.push_back(static_cast<float>(fx));
+ vx.push_back(fx + 0.25f);
+ vx.push_back(fx + 0.5f);
+ vx.push_back(fx + 0.75f);
+
+ vy.push_back(static_cast<float>(fy));
+ vy.push_back(fy + 0.25f);
+ vy.push_back(fy + 0.5f);
+ vy.push_back(fy + 0.75f);
+ } else if (s == 64) {
+ vx.push_back(static_cast<float>(fx));
+ vx.push_back(fx + 0.5f);
+
+ vy.push_back(static_cast<float>(fy));
+ vy.push_back(fy + 0.5f);
+ } else {
+ vx.push_back(fx + 0.5f);
+ vy.push_back(fy + 0.5f);
+ }
+ calculateAnchors(anchors, vx, vy, s, step);
+ }
+}
+
+void ModelFaceBoxes::priorBoxes(const std::vector<std::pair<size_t, size_t>>& featureMaps) {
+ anchors.reserve(maxProposalsCount);
+
+ for (size_t k = 0; k < featureMaps.size(); ++k) {
+ std::vector<float> a;
+ for (size_t i = 0; i < featureMaps[k].first; ++i) {
+ for (size_t j = 0; j < featureMaps[k].second; ++j) {
+ if (k == 0) {
+ calculateAnchorsZeroLevel(anchors, j, i, minSizes[k], steps[k]);
+ } else {
+ calculateAnchors(anchors, {j + 0.5f}, {i + 0.5f}, minSizes[k][0], steps[k]);
+ }
+ }
+ }
+ }
+}
+
+std::pair<std::vector<size_t>, std::vector<float>> filterScores(const ov::Tensor& scoresTensor,
+ const float confidenceThreshold) {
+ auto shape = scoresTensor.get_shape();
+ const float* scoresPtr = scoresTensor.data<float>();
+
+ std::vector<size_t> indices;
+ std::vector<float> scores;
+ scores.reserve(ModelFaceBoxes::INIT_VECTOR_SIZE);
+ indices.reserve(ModelFaceBoxes::INIT_VECTOR_SIZE);
+ for (size_t i = 1; i < shape[1] * shape[2]; i = i + 2) {
+ if (scoresPtr[i] > confidenceThreshold) {
+ indices.push_back(i / 2);
+ scores.push_back(scoresPtr[i]);
+ }
+ }
+
+ return {indices, scores};
+}
+
+std::vector<Anchor> filterBoxes(const ov::Tensor& boxesTensor,
+ const std::vector<Anchor>& anchors,
+ const std::vector<size_t>& validIndices,
+ const std::vector<float>& variance) {
+ auto shape = boxesTensor.get_shape();
+ const float* boxesPtr = boxesTensor.data<float>();
+
+ std::vector<Anchor> boxes;
+ boxes.reserve(ModelFaceBoxes::INIT_VECTOR_SIZE);
+ for (auto i : validIndices) {
+ auto objStart = shape[2] * i;
+
+ auto dx = boxesPtr[objStart];
+ auto dy = boxesPtr[objStart + 1];
+ auto dw = boxesPtr[objStart + 2];
+ auto dh = boxesPtr[objStart + 3];
+
+ auto predCtrX = dx * variance[0] * anchors[i].getWidth() + anchors[i].getXCenter();
+ auto predCtrY = dy * variance[0] * anchors[i].getHeight() + anchors[i].getYCenter();
+ auto predW = exp(dw * variance[1]) * anchors[i].getWidth();
+ auto predH = exp(dh * variance[1]) * anchors[i].getHeight();
+
+ boxes.push_back({static_cast<float>(predCtrX - 0.5f * predW),
+ static_cast<float>(predCtrY - 0.5f * predH),
+ static_cast<float>(predCtrX + 0.5f * predW),
+ static_cast<float>(predCtrY + 0.5f * predH)});
+ }
+
+ return boxes;
+}
+
+std::unique_ptr<ResultBase> ModelFaceBoxes::postprocess(InferenceResult& infResult) {
+ // Filter scores and get valid indices for bounding boxes
+ const auto scoresTensor = infResult.outputsData[outputsNames[1]];
+ const auto scores = filterScores(scoresTensor, confidenceThreshold);
+
+ // Filter bounding boxes on indices
+ auto boxesTensor = infResult.outputsData[outputsNames[0]];
+ std::vector<Anchor> boxes = filterBoxes(boxesTensor, anchors, scores.first, variance);
+
+ // Apply Non-maximum Suppression
+ const std::vector<int> keep = nms(boxes, scores.second, boxIOUThreshold);
+
+ // Create detection result objects
+ DetectionResult* result = new DetectionResult(infResult.frameId, infResult.metaData);
+ const auto imgWidth = infResult.internalModelData->asRef<InternalImageModelData>().inputImgWidth;
+ const auto imgHeight = infResult.internalModelData->asRef<InternalImageModelData>().inputImgHeight;
+ const float scaleX = static_cast<float>(netInputWidth) / imgWidth;
+ const float scaleY = static_cast<float>(netInputHeight) / imgHeight;
+
+ result->objects.reserve(keep.size());
+ for (auto i : keep) {
+ DetectedObject desc;
+ desc.confidence = scores.second[i];
+ desc.x = clamp(boxes[i].left / scaleX, 0.f, static_cast<float>(imgWidth));
+ desc.y = clamp(boxes[i].top / scaleY, 0.f, static_cast<float>(imgHeight));
+ desc.width = clamp(boxes[i].getWidth() / scaleX, 0.f, static_cast<float>(imgWidth));
+ desc.height = clamp(boxes[i].getHeight() / scaleY, 0.f, static_cast<float>(imgHeight));
+ desc.labelID = 0;
+ desc.label = labels[0];
+
+ result->objects.push_back(desc);
+ }
+
+ return std::unique_ptr<ResultBase>(result);
+}
diff --git a/python/openvino/runtime/common/models/src/detection_model_retinaface.cpp b/python/openvino/runtime/common/models/src/detection_model_retinaface.cpp
new file mode 100644
index 0000000..8835725
--- /dev/null
+++ b/python/openvino/runtime/common/models/src/detection_model_retinaface.cpp
@@ -0,0 +1,394 @@
+/*
+// Copyright (C) 2020-2022 Intel Corporation
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+*/
+
+#include "models/detection_model_retinaface.h"
+
+#include <stddef.h>
+
+#include <algorithm>
+#include <cmath>
+#include <stdexcept>
+
+#include <opencv2/core.hpp>
+#include <openvino/openvino.hpp>
+
+#include <utils/common.hpp>
+#include <utils/nms.hpp>
+
+#include "models/internal_model_data.h"
+#include "models/results.h"
+
+ModelRetinaFace::ModelRetinaFace(const std::string& modelFileName,
+ float confidenceThreshold,
+ bool useAutoResize,
+ float boxIOUThreshold,
+ const std::string& layout)
+ : DetectionModel(modelFileName, confidenceThreshold, useAutoResize, {"Face"}, layout), // Default label is "Face"
+ shouldDetectMasks(false),
+ shouldDetectLandmarks(false),
+ boxIOUThreshold(boxIOUThreshold),
+ maskThreshold(0.8f),
+ landmarkStd(1.0f),
+ anchorCfg({{32, {32, 16}, 16, {1}}, {16, {8, 4}, 16, {1}}, {8, {2, 1}, 16, {1}}}) {
+ generateAnchorsFpn();
+}
+
+void ModelRetinaFace::prepareInputsOutputs(std::shared_ptr<ov::Model>& model) {
+ // --------------------------- Configure input & output -------------------------------------------------
+ // --------------------------- Prepare input ------------------------------------------------------
+ if (model->inputs().size() != 1) {
+ throw std::logic_error("RetinaFace model wrapper expects models that have only 1 input");
+ }
+ const ov::Shape& inputShape = model->input().get_shape();
+ const ov::Layout& inputLayout = getInputLayout(model->input());
+
+ if (inputShape[ov::layout::channels_idx(inputLayout)] != 3) {
+ throw std::logic_error("Expected 3-channel input");
+ }
+
+ ov::preprocess::PrePostProcessor ppp(model);
+ ppp.input().tensor().set_element_type(ov::element::u8).set_layout({"NHWC"});
+
+ if (useAutoResize) {
+ ppp.input().tensor().set_spatial_dynamic_shape();
+
+ ppp.input()
+ .preprocess()
+ .convert_element_type(ov::element::f32)
+ .resize(ov::preprocess::ResizeAlgorithm::RESIZE_LINEAR);
+ }
+
+ ppp.input().model().set_layout(inputLayout);
+
+ // --------------------------- Reading image input parameters -------------------------------------------
+ inputsNames.push_back(model->input().get_any_name());
+ netInputWidth = inputShape[ov::layout::width_idx(inputLayout)];
+ netInputHeight = inputShape[ov::layout::height_idx(inputLayout)];
+
+ // --------------------------- Prepare output -----------------------------------------------------
+
+ const ov::OutputVector& outputs = model->outputs();
+ if (outputs.size() != 6 && outputs.size() != 9 && outputs.size() != 12) {
+ throw std::logic_error("RetinaFace model wrapper expects models that have 6, 9 or 12 outputs");
+ }
+
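+    // Outputs are classified by name: "box" -> bounding-box deltas, "cls" -> scores,
+    // "landmark" -> facial landmarks, "type" -> mask/no-mask scores (when present). Within each
+    // group the tensor names are inserted in ascending order of their spatial size, so index 0
+    // matches the largest stride (32) configured in anchorCfg.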
+ const ov::Layout outputLayout{"NCHW"};
+ std::vector<size_t> outputsSizes[OUT_MAX];
+ for (const auto& output : model->outputs()) {
+ auto outTensorName = output.get_any_name();
+ outputsNames.push_back(outTensorName);
+ ppp.output(outTensorName).tensor().set_element_type(ov::element::f32).set_layout(outputLayout);
+
+ OutputType type = OUT_MAX;
+ if (outTensorName.find("box") != std::string::npos) {
+ type = OUT_BOXES;
+ } else if (outTensorName.find("cls") != std::string::npos) {
+ type = OUT_SCORES;
+ } else if (outTensorName.find("landmark") != std::string::npos) {
+ type = OUT_LANDMARKS;
+ shouldDetectLandmarks = true;
+ } else if (outTensorName.find("type") != std::string::npos) {
+ type = OUT_MASKSCORES;
+ labels.clear();
+ labels.push_back("No Mask");
+ labels.push_back("Mask");
+ shouldDetectMasks = true;
+ landmarkStd = 0.2f;
+ } else {
+ continue;
+ }
+
+ size_t num = output.get_shape()[ov::layout::height_idx(outputLayout)];
+ size_t i = 0;
+ for (; i < outputsSizes[type].size(); ++i) {
+ if (num < outputsSizes[type][i]) {
+ break;
+ }
+ }
+ separateOutputsNames[type].insert(separateOutputsNames[type].begin() + i, outTensorName);
+ outputsSizes[type].insert(outputsSizes[type].begin() + i, num);
+ }
+ model = ppp.build();
+
+ for (size_t idx = 0; idx < outputsSizes[OUT_BOXES].size(); ++idx) {
+ size_t width = outputsSizes[OUT_BOXES][idx];
+ size_t height = outputsSizes[OUT_BOXES][idx];
+ auto s = anchorCfg[idx].stride;
+ auto anchorNum = anchorsFpn[s].size();
+
+ anchors.push_back(std::vector<Anchor>(height * width * anchorNum));
+ for (size_t iw = 0; iw < width; ++iw) {
+ size_t sw = iw * s;
+ for (size_t ih = 0; ih < height; ++ih) {
+ size_t sh = ih * s;
+ for (size_t k = 0; k < anchorNum; ++k) {
+ Anchor& anc = anchors[idx][(ih * width + iw) * anchorNum + k];
+ anc.left = anchorsFpn[s][k].left + sw;
+ anc.top = anchorsFpn[s][k].top + sh;
+ anc.right = anchorsFpn[s][k].right + sw;
+ anc.bottom = anchorsFpn[s][k].bottom + sh;
+ }
+ }
+ }
+ }
+}
+
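+// Standard RPN-style anchor enumeration: ratioEnum varies the aspect ratio of a base anchor
+// while roughly preserving its area, scaleEnum scales its width and height, and
+// generateAnchors combines both to produce the per-stride anchor templates cached in
+// anchorsFpn.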
+std::vector<Anchor> ratioEnum(const Anchor& anchor, const std::vector<int>& ratios) {
+ std::vector<Anchor> retVal;
+ const auto w = anchor.getWidth();
+ const auto h = anchor.getHeight();
+ const auto xCtr = anchor.getXCenter();
+ const auto yCtr = anchor.getYCenter();
+
+ for (const auto ratio : ratios) {
+ const auto size = w * h;
+ const auto sizeRatio = static_cast<float>(size) / ratio;
+ const auto ws = sqrt(sizeRatio);
+ const auto hs = ws * ratio;
+ retVal.push_back({static_cast<float>(xCtr - 0.5f * (ws - 1.0f)),
+ static_cast<float>(yCtr - 0.5f * (hs - 1.0f)),
+ static_cast<float>(xCtr + 0.5f * (ws - 1.0f)),
+ static_cast<float>(yCtr + 0.5f * (hs - 1.0f))});
+ }
+ return retVal;
+}
+
+std::vector<Anchor> scaleEnum(const Anchor& anchor, const std::vector<int>& scales) {
+ std::vector<Anchor> retVal;
+ const auto w = anchor.getWidth();
+ const auto h = anchor.getHeight();
+ const auto xCtr = anchor.getXCenter();
+ const auto yCtr = anchor.getYCenter();
+
+ for (auto scale : scales) {
+ const auto ws = w * scale;
+ const auto hs = h * scale;
+ retVal.push_back({static_cast<float>(xCtr - 0.5f * (ws - 1.0f)),
+ static_cast<float>(yCtr - 0.5f * (hs - 1.0f)),
+ static_cast<float>(xCtr + 0.5f * (ws - 1.0f)),
+ static_cast<float>(yCtr + 0.5f * (hs - 1.0f))});
+ }
+ return retVal;
+}
+
+std::vector<Anchor> generateAnchors(const int baseSize,
+ const std::vector<int>& ratios,
+ const std::vector<int>& scales) {
+ Anchor baseAnchor{0.0f, 0.0f, baseSize - 1.0f, baseSize - 1.0f};
+ auto ratioAnchors = ratioEnum(baseAnchor, ratios);
+ std::vector<Anchor> retVal;
+
+ for (const auto& ra : ratioAnchors) {
+ auto addon = scaleEnum(ra, scales);
+ retVal.insert(retVal.end(), addon.begin(), addon.end());
+ }
+ return retVal;
+}
+
+void ModelRetinaFace::generateAnchorsFpn() {
+ auto cfg = anchorCfg;
+ std::sort(cfg.begin(), cfg.end(), [](const AnchorCfgLine& x, const AnchorCfgLine& y) {
+ return x.stride > y.stride;
+ });
+
+ for (const auto& cfgLine : cfg) {
+ anchorsFpn.emplace(cfgLine.stride, generateAnchors(cfgLine.baseSize, cfgLine.ratios, cfgLine.scales));
+ }
+}
+
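+// Scan the scores tensor, skipping the first anchorNum channels, and return flattened indices of candidate anchors
+// whose score reaches the confidence threshold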
+std::vector<size_t> thresholding(const ov::Tensor& scoresTensor, const int anchorNum, const float confidenceThreshold) {
+ std::vector<size_t> indices;
+ indices.reserve(ModelRetinaFace::INIT_VECTOR_SIZE);
+ auto shape = scoresTensor.get_shape();
+ size_t restAnchors = shape[1] - anchorNum;
+ const float* scoresPtr = scoresTensor.data<float>();
+
+ for (size_t x = anchorNum; x < shape[1]; ++x) {
+ for (size_t y = 0; y < shape[2]; ++y) {
+ for (size_t z = 0; z < shape[3]; ++z) {
+ auto idx = (x * shape[2] + y) * shape[3] + z;
+ auto score = scoresPtr[idx];
+ if (score >= confidenceThreshold) {
+ indices.push_back((y * shape[3] + z) * restAnchors + (x - anchorNum));
+ }
+ }
+ }
+ }
+
+ return indices;
+}
+
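+// Gather the scores of the selected candidates from the scores tensor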
+void filterScores(std::vector<float>& scores,
+ const std::vector<size_t>& indices,
+ const ov::Tensor& scoresTensor,
+ const int anchorNum) {
+ const auto& shape = scoresTensor.get_shape();
+ const float* scoresPtr = scoresTensor.data<float>();
+ const auto start = shape[2] * shape[3] * anchorNum;
+
+ for (auto i : indices) {
+ auto offset = (i % anchorNum) * shape[2] * shape[3] + i / anchorNum;
+ scores.push_back(scoresPtr[start + offset]);
+ }
+}
+
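+// Decode box regression deltas (dx, dy, dw, dh) of the selected candidates relative to their anchors into absolute
+// box coordinates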
+void filterBoxes(std::vector<Anchor>& boxes,
+ const std::vector<size_t>& indices,
+ const ov::Tensor& boxesTensor,
+ int anchorNum,
+ const std::vector<Anchor>& anchors) {
+ const auto& shape = boxesTensor.get_shape();
+ const float* boxesPtr = boxesTensor.data<float>();
+ const auto boxPredLen = shape[1] / anchorNum;
+ const auto blockWidth = shape[2] * shape[3];
+
+ for (auto i : indices) {
+ auto offset = blockWidth * boxPredLen * (i % anchorNum) + (i / anchorNum);
+
+ const auto dx = boxesPtr[offset];
+ const auto dy = boxesPtr[offset + blockWidth];
+ const auto dw = boxesPtr[offset + blockWidth * 2];
+ const auto dh = boxesPtr[offset + blockWidth * 3];
+
+ const auto predCtrX = dx * anchors[i].getWidth() + anchors[i].getXCenter();
+ const auto predCtrY = dy * anchors[i].getHeight() + anchors[i].getYCenter();
+ const auto predW = exp(dw) * anchors[i].getWidth();
+ const auto predH = exp(dh) * anchors[i].getHeight();
+
+ boxes.push_back({static_cast<float>(predCtrX - 0.5f * (predW - 1.0f)),
+ static_cast<float>(predCtrY - 0.5f * (predH - 1.0f)),
+ static_cast<float>(predCtrX + 0.5f * (predW - 1.0f)),
+ static_cast<float>(predCtrY + 0.5f * (predH - 1.0f))});
+ }
+}
+
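+// Decode landmark offsets of the selected candidates (scaled by landmarkStd) relative to their anchor centers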
+void filterLandmarks(std::vector<cv::Point2f>& landmarks,
+ const std::vector<size_t>& indices,
+ const ov::Tensor& landmarksTensor,
+ int anchorNum,
+ const std::vector<Anchor>& anchors,
+ const float landmarkStd) {
+ const auto& shape = landmarksTensor.get_shape();
+ const float* landmarksPtr = landmarksTensor.data<float>();
+ const auto landmarkPredLen = shape[1] / anchorNum;
+ const auto blockWidth = shape[2] * shape[3];
+
+ for (auto i : indices) {
+ for (int j = 0; j < ModelRetinaFace::LANDMARKS_NUM; ++j) {
+ auto offset = (i % anchorNum) * landmarkPredLen * shape[2] * shape[3] + i / anchorNum;
+ auto deltaX = landmarksPtr[offset + j * 2 * blockWidth] * landmarkStd;
+ auto deltaY = landmarksPtr[offset + (j * 2 + 1) * blockWidth] * landmarkStd;
+ landmarks.push_back({deltaX * anchors[i].getWidth() + anchors[i].getXCenter(),
+ deltaY * anchors[i].getHeight() + anchors[i].getYCenter()});
+ }
+ }
+}
+
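+// Gather mask presence scores of the selected candidates; they are stored after the first 2 * anchorNum channels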
+void filterMasksScores(std::vector<float>& masks,
+ const std::vector<size_t>& indices,
+ const ov::Tensor& maskScoresTensor,
+ const int anchorNum) {
+ auto shape = maskScoresTensor.get_shape();
+ const float* maskScoresPtr = maskScoresTensor.data<float>();
+ auto start = shape[2] * shape[3] * anchorNum * 2;
+
+ for (auto i : indices) {
+ auto offset = (i % anchorNum) * shape[2] * shape[3] + i / anchorNum;
+ masks.push_back(maskScoresPtr[start + offset]);
+ }
+}
+
+std::unique_ptr<ResultBase> ModelRetinaFace::postprocess(InferenceResult& infResult) {
+ std::vector<float> scores;
+ scores.reserve(INIT_VECTOR_SIZE);
+ std::vector<Anchor> boxes;
+ boxes.reserve(INIT_VECTOR_SIZE);
+ std::vector<cv::Point2f> landmarks;
+ std::vector<float> masks;
+
+ if (shouldDetectLandmarks) {
+ landmarks.reserve(INIT_VECTOR_SIZE);
+ }
+ if (shouldDetectMasks) {
+ masks.reserve(INIT_VECTOR_SIZE);
+ }
+
+ // --------------------------- Gather & Filter output from all levels
+ // ----------------------------------------------------------
+ for (size_t idx = 0; idx < anchorCfg.size(); ++idx) {
+ const auto boxRaw = infResult.outputsData[separateOutputsNames[OUT_BOXES][idx]];
+ const auto scoresRaw = infResult.outputsData[separateOutputsNames[OUT_SCORES][idx]];
+ auto s = anchorCfg[idx].stride;
+ auto anchorNum = anchorsFpn[s].size();
+
+ auto validIndices = thresholding(scoresRaw, anchorNum, confidenceThreshold);
+ filterScores(scores, validIndices, scoresRaw, anchorNum);
+ filterBoxes(boxes, validIndices, boxRaw, anchorNum, anchors[idx]);
+ if (shouldDetectLandmarks) {
+ const auto landmarksRaw = infResult.outputsData[separateOutputsNames[OUT_LANDMARKS][idx]];
+ filterLandmarks(landmarks, validIndices, landmarksRaw, anchorNum, anchors[idx], landmarkStd);
+ }
+ if (shouldDetectMasks) {
+ const auto masksRaw = infResult.outputsData[separateOutputsNames[OUT_MASKSCORES][idx]];
+ filterMasksScores(masks, validIndices, masksRaw, anchorNum);
+ }
+ }
+ // --------------------------- Apply Non-maximum Suppression
+    // ----------------------------------------------------------
+    // !shouldDetectLandmarks determines NMS behavior: if true, box boundaries are included in the area calculation
+ const auto keep = nms(boxes, scores, boxIOUThreshold, !shouldDetectLandmarks);
+
+ // --------------------------- Create detection result objects
+ // --------------------------------------------------------
+ RetinaFaceDetectionResult* result = new RetinaFaceDetectionResult(infResult.frameId, infResult.metaData);
+
+ const auto imgWidth = infResult.internalModelData->asRef<InternalImageModelData>().inputImgWidth;
+ const auto imgHeight = infResult.internalModelData->asRef<InternalImageModelData>().inputImgHeight;
+ const auto scaleX = static_cast<float>(netInputWidth) / imgWidth;
+ const auto scaleY = static_cast<float>(netInputHeight) / imgHeight;
+
+ result->objects.reserve(keep.size());
+ result->landmarks.reserve(keep.size() * ModelRetinaFace::LANDMARKS_NUM);
+ for (auto i : keep) {
+ DetectedObject desc;
+ desc.confidence = scores[i];
+ //--- Scaling coordinates
+ boxes[i].left /= scaleX;
+ boxes[i].top /= scaleY;
+ boxes[i].right /= scaleX;
+ boxes[i].bottom /= scaleY;
+
+ desc.x = clamp(boxes[i].left, 0.f, static_cast<float>(imgWidth));
+ desc.y = clamp(boxes[i].top, 0.f, static_cast<float>(imgHeight));
+ desc.width = clamp(boxes[i].getWidth(), 0.f, static_cast<float>(imgWidth));
+ desc.height = clamp(boxes[i].getHeight(), 0.f, static_cast<float>(imgHeight));
+        //--- Default label is 0 (Face). When mask detection is enabled, labels are 0 (No Mask) and 1 (Mask)
+ desc.labelID = shouldDetectMasks ? (masks[i] > maskThreshold) : 0;
+ desc.label = labels[desc.labelID];
+ result->objects.push_back(desc);
+
+ //--- Scaling landmarks coordinates
+ for (size_t l = 0; l < ModelRetinaFace::LANDMARKS_NUM && shouldDetectLandmarks; ++l) {
+ landmarks[i * ModelRetinaFace::LANDMARKS_NUM + l].x =
+ clamp(landmarks[i * ModelRetinaFace::LANDMARKS_NUM + l].x / scaleX, 0.f, static_cast<float>(imgWidth));
+ landmarks[i * ModelRetinaFace::LANDMARKS_NUM + l].y =
+ clamp(landmarks[i * ModelRetinaFace::LANDMARKS_NUM + l].y / scaleY, 0.f, static_cast<float>(imgHeight));
+ result->landmarks.push_back(landmarks[i * ModelRetinaFace::LANDMARKS_NUM + l]);
+ }
+ }
+
+ return std::unique_ptr<ResultBase>(result);
+}
diff --git a/python/openvino/runtime/common/models/src/detection_model_retinaface_pt.cpp b/python/openvino/runtime/common/models/src/detection_model_retinaface_pt.cpp
new file mode 100644
index 0000000..8322c3c
--- /dev/null
+++ b/python/openvino/runtime/common/models/src/detection_model_retinaface_pt.cpp
@@ -0,0 +1,277 @@
+/*
+// Copyright (C) 2021-2022 Intel Corporation
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+*/
+
+#include "models/detection_model_retinaface_pt.h"
+
+#include <stdint.h>
+
+#include <algorithm>
+#include <cmath>
+#include <map>
+#include <stdexcept>
+#include <string>
+#include <vector>
+
+#include <openvino/openvino.hpp>
+
+#include <utils/common.hpp>
+#include <utils/nms.hpp>
+#include <utils/ocv_common.hpp>
+
+#include "models/internal_model_data.h"
+#include "models/results.h"
+
+ModelRetinaFacePT::ModelRetinaFacePT(const std::string& modelFileName,
+ float confidenceThreshold,
+ bool useAutoResize,
+ float boxIOUThreshold,
+ const std::string& layout)
+ : DetectionModel(modelFileName, confidenceThreshold, useAutoResize, {"Face"}, layout), // Default label is "Face"
+ landmarksNum(0),
+ boxIOUThreshold(boxIOUThreshold) {}
+
+void ModelRetinaFacePT::prepareInputsOutputs(std::shared_ptr<ov::Model>& model) {
+ // --------------------------- Configure input & output -------------------------------------------------
+ // --------------------------- Prepare input ------------------------------------------------------
+ if (model->inputs().size() != 1) {
+ throw std::logic_error("RetinaFacePT model wrapper expects models that have only 1 input");
+ }
+
+ const ov::Shape& inputShape = model->input().get_shape();
+ const ov::Layout& inputLayout = getInputLayout(model->input());
+
+ if (inputShape[ov::layout::channels_idx(inputLayout)] != 3) {
+ throw std::logic_error("Expected 3-channel input");
+ }
+
+ ov::preprocess::PrePostProcessor ppp(model);
+ inputTransform.setPrecision(ppp, model->input().get_any_name());
+ ppp.input().tensor().set_layout({"NHWC"});
+
+ if (useAutoResize) {
+ ppp.input().tensor().set_spatial_dynamic_shape();
+
+ ppp.input()
+ .preprocess()
+ .convert_element_type(ov::element::f32)
+ .resize(ov::preprocess::ResizeAlgorithm::RESIZE_LINEAR);
+ }
+
+ ppp.input().model().set_layout(inputLayout);
+
+ // --------------------------- Reading image input parameters -------------------------------------------
+ inputsNames.push_back(model->input().get_any_name());
+ netInputWidth = inputShape[ov::layout::width_idx(inputLayout)];
+ netInputHeight = inputShape[ov::layout::height_idx(inputLayout)];
+
+ // --------------------------- Prepare output -----------------------------------------------------
+ if (model->outputs().size() != 3) {
+        throw std::logic_error("RetinaFacePT model wrapper expects models that have 3 outputs");
+ }
+
+ landmarksNum = 0;
+
+ outputsNames.resize(2);
+ std::vector<uint32_t> outputsSizes[OUT_MAX];
+ const ov::Layout chw("CHW");
+ const ov::Layout nchw("NCHW");
+ for (auto& output : model->outputs()) {
+ auto outTensorName = output.get_any_name();
+ outputsNames.push_back(outTensorName);
+ ppp.output(outTensorName)
+ .tensor()
+ .set_element_type(ov::element::f32)
+ .set_layout(output.get_shape().size() == 4 ? nchw : chw);
+
+ if (outTensorName.find("bbox") != std::string::npos) {
+ outputsNames[OUT_BOXES] = outTensorName;
+ } else if (outTensorName.find("cls") != std::string::npos) {
+ outputsNames[OUT_SCORES] = outTensorName;
+ } else if (outTensorName.find("landmark") != std::string::npos) {
+            // The landmarks output is optional. If it is present, resize the names array so the landmarks output
+            // name fits into the last slot, assuming the other output names are already filled in or will be later
+ outputsNames.resize(std::max(outputsNames.size(), (size_t)OUT_LANDMARKS + 1));
+ outputsNames[OUT_LANDMARKS] = outTensorName;
+ landmarksNum =
+                output.get_shape()[ov::layout::width_idx(chw)] / 2;  // Each landmark consists of 2 values (x and y)
+ } else {
+ continue;
+ }
+ }
+
+ if (outputsNames[OUT_BOXES] == "" || outputsNames[OUT_SCORES] == "") {
+ throw std::logic_error("Bbox or cls layers are not found");
+ }
+
+ model = ppp.build();
+ priors = generatePriorData();
+}
+
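+// The scores tensor is laid out as [1, numPriors, 2]; the face confidence is taken from the second channel of each
+// prior, and priors whose confidence reaches the threshold are kept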
+std::vector<size_t> ModelRetinaFacePT::filterByScore(const ov::Tensor& scoresTensor, const float confidenceThreshold) {
+ std::vector<size_t> indicies;
+ const auto& shape = scoresTensor.get_shape();
+ const float* scoresPtr = scoresTensor.data<float>();
+
+ for (size_t x = 0; x < shape[1]; ++x) {
+ const auto idx = (x * shape[2] + 1);
+ const auto score = scoresPtr[idx];
+ if (score >= confidenceThreshold) {
+ indicies.push_back(x);
+ }
+ }
+
+ return indicies;
+}
+
+std::vector<float> ModelRetinaFacePT::getFilteredScores(const ov::Tensor& scoresTensor,
+ const std::vector<size_t>& indicies) {
+ const auto& shape = scoresTensor.get_shape();
+ const float* scoresPtr = scoresTensor.data<float>();
+
+ std::vector<float> scores;
+ scores.reserve(indicies.size());
+
+ for (auto i : indicies) {
+ scores.push_back(scoresPtr[i * shape[2] + 1]);
+ }
+ return scores;
+}
+
+std::vector<cv::Point2f> ModelRetinaFacePT::getFilteredLandmarks(const ov::Tensor& landmarksTensor,
+ const std::vector<size_t>& indicies,
+ int imgWidth,
+ int imgHeight) {
+ const auto& shape = landmarksTensor.get_shape();
+ const float* landmarksPtr = landmarksTensor.data<float>();
+
+ std::vector<cv::Point2f> landmarks(landmarksNum * indicies.size());
+
+ for (size_t i = 0; i < indicies.size(); i++) {
+ const size_t idx = indicies[i];
+ const auto& prior = priors[idx];
+ for (size_t j = 0; j < landmarksNum; j++) {
+ landmarks[i * landmarksNum + j].x =
+ clamp(prior.cX + landmarksPtr[idx * shape[2] + j * 2] * variance[0] * prior.width, 0.f, 1.f) * imgWidth;
+ landmarks[i * landmarksNum + j].y =
+ clamp(prior.cY + landmarksPtr[idx * shape[2] + j * 2 + 1] * variance[0] * prior.height, 0.f, 1.f) *
+ imgHeight;
+ }
+ }
+ return landmarks;
+}
+
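+// Build the prior boxes (cX, cY, width, height in normalized coordinates) for every feature-map cell and min size at
+// each stride level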
+std::vector<ModelRetinaFacePT::Box> ModelRetinaFacePT::generatePriorData() {
+ const float globalMinSizes[][2] = {{16, 32}, {64, 128}, {256, 512}};
+ const float steps[] = {8., 16., 32.};
+ std::vector<ModelRetinaFacePT::Box> anchors;
+ for (size_t stepNum = 0; stepNum < arraySize(steps); stepNum++) {
+ const int featureW = static_cast<int>(std::round(netInputWidth / steps[stepNum]));
+ const int featureH = static_cast<int>(std::round(netInputHeight / steps[stepNum]));
+
+ const auto& minSizes = globalMinSizes[stepNum];
+ for (int i = 0; i < featureH; i++) {
+ for (int j = 0; j < featureW; j++) {
+ for (auto minSize : minSizes) {
+ const float sKX = minSize / netInputWidth;
+ const float sKY = minSize / netInputHeight;
+ const float denseCY = (i + 0.5f) * steps[stepNum] / netInputHeight;
+ const float denseCX = (j + 0.5f) * steps[stepNum] / netInputWidth;
+ anchors.push_back(ModelRetinaFacePT::Box{denseCX, denseCY, sKX, sKY});
+ }
+ }
+ }
+ }
+ return anchors;
+}
+
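+// Decode raw box predictions of the selected candidates against the priors using the variances and convert them into
+// absolute, clamped image coordinates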
+std::vector<Anchor> ModelRetinaFacePT::getFilteredProposals(const ov::Tensor& boxesTensor,
+ const std::vector<size_t>& indicies,
+ int imgWidth,
+ int imgHeight) {
+ std::vector<Anchor> rects;
+ rects.reserve(indicies.size());
+
+ const auto& shape = boxesTensor.get_shape();
+ const float* boxesPtr = boxesTensor.data<float>();
+
+ if (shape[1] != priors.size()) {
+ throw std::logic_error("rawBoxes size is not equal to priors size");
+ }
+
+ for (auto i : indicies) {
+ const auto pRawBox = reinterpret_cast<const Box*>(boxesPtr + i * shape[2]);
+ const auto& prior = priors[i];
+ const float cX = priors[i].cX + pRawBox->cX * variance[0] * prior.width;
+ const float cY = priors[i].cY + pRawBox->cY * variance[0] * prior.height;
+ const float width = prior.width * exp(pRawBox->width * variance[1]);
+ const float height = prior.height * exp(pRawBox->height * variance[1]);
+ rects.push_back(Anchor{clamp(cX - width / 2, 0.f, 1.f) * imgWidth,
+ clamp(cY - height / 2, 0.f, 1.f) * imgHeight,
+ clamp(cX + width / 2, 0.f, 1.f) * imgWidth,
+ clamp(cY + height / 2, 0.f, 1.f) * imgHeight});
+ }
+
+ return rects;
+}
+
+std::unique_ptr<ResultBase> ModelRetinaFacePT::postprocess(InferenceResult& infResult) {
+ // (raw_output, scale_x, scale_y, face_prob_threshold, image_size):
+ const auto boxesTensor = infResult.outputsData[outputsNames[OUT_BOXES]];
+ const auto scoresTensor = infResult.outputsData[outputsNames[OUT_SCORES]];
+
+ const auto& validIndicies = filterByScore(scoresTensor, confidenceThreshold);
+ const auto& scores = getFilteredScores(scoresTensor, validIndicies);
+
+ const auto& internalData = infResult.internalModelData->asRef<InternalImageModelData>();
+ const auto& landmarks = landmarksNum ? getFilteredLandmarks(infResult.outputsData[outputsNames[OUT_LANDMARKS]],
+ validIndicies,
+ internalData.inputImgWidth,
+ internalData.inputImgHeight)
+ : std::vector<cv::Point2f>();
+
+ const auto& proposals =
+ getFilteredProposals(boxesTensor, validIndicies, internalData.inputImgWidth, internalData.inputImgHeight);
+
+ const auto& keptIndicies = nms(proposals, scores, boxIOUThreshold, !landmarksNum);
+
+ // --------------------------- Create detection result objects
+ // --------------------------------------------------------
+ RetinaFaceDetectionResult* result = new RetinaFaceDetectionResult(infResult.frameId, infResult.metaData);
+
+ result->objects.reserve(keptIndicies.size());
+ result->landmarks.reserve(keptIndicies.size() * landmarksNum);
+ for (auto i : keptIndicies) {
+ DetectedObject desc;
+ desc.confidence = scores[i];
+
+ //--- Scaling coordinates
+ desc.x = proposals[i].left;
+ desc.y = proposals[i].top;
+ desc.width = proposals[i].getWidth();
+ desc.height = proposals[i].getHeight();
+
+ desc.labelID = 0;
+ desc.label = labels[desc.labelID];
+ result->objects.push_back(desc);
+
+ //--- Filtering landmarks coordinates
+ for (uint32_t l = 0; l < landmarksNum; ++l) {
+ result->landmarks.emplace_back(landmarks[i * landmarksNum + l].x, landmarks[i * landmarksNum + l].y);
+ }
+ }
+
+ return std::unique_ptr<ResultBase>(result);
+}
diff --git a/python/openvino/runtime/common/models/src/detection_model_ssd.cpp b/python/openvino/runtime/common/models/src/detection_model_ssd.cpp
new file mode 100644
index 0000000..ef741ee
--- /dev/null
+++ b/python/openvino/runtime/common/models/src/detection_model_ssd.cpp
@@ -0,0 +1,281 @@
+/*
+// Copyright (C) 2020-2022 Intel Corporation
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+*/
+
+#include "models/detection_model_ssd.h"
+
+#include <algorithm>
+#include <map>
+#include <stdexcept>
+#include <string>
+#include <unordered_set>
+#include <vector>
+
+#include <openvino/openvino.hpp>
+
+#include <utils/common.hpp>
+#include <utils/ocv_common.hpp>
+
+#include "models/internal_model_data.h"
+#include "models/results.h"
+
+struct InputData;
+
+ModelSSD::ModelSSD(const std::string& modelFileName,
+ float confidenceThreshold,
+ bool useAutoResize,
+ const std::vector<std::string>& labels,
+ const std::string& layout)
+ : DetectionModel(modelFileName, confidenceThreshold, useAutoResize, labels, layout) {}
+
+std::shared_ptr<InternalModelData> ModelSSD::preprocess(const InputData& inputData, ov::InferRequest& request) {
+ if (inputsNames.size() > 1) {
+ const auto& imageInfoTensor = request.get_tensor(inputsNames[1]);
+ const auto info = imageInfoTensor.data<float>();
+ info[0] = static_cast<float>(netInputHeight);
+ info[1] = static_cast<float>(netInputWidth);
+ info[2] = 1;
+ request.set_tensor(inputsNames[1], imageInfoTensor);
+ }
+
+ return DetectionModel::preprocess(inputData, request);
+}
+
+std::unique_ptr<ResultBase> ModelSSD::postprocess(InferenceResult& infResult) {
+ return outputsNames.size() > 1 ? postprocessMultipleOutputs(infResult) : postprocessSingleOutput(infResult);
+}
+
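+// Single-output SSD postprocessing: each output row is [image_id, label, confidence, x_min, y_min, x_max, y_max]
+// with coordinates normalized to [0, 1]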
+std::unique_ptr<ResultBase> ModelSSD::postprocessSingleOutput(InferenceResult& infResult) {
+ const ov::Tensor& detectionsTensor = infResult.getFirstOutputTensor();
+ size_t detectionsNum = detectionsTensor.get_shape()[detectionsNumId];
+ const float* detections = detectionsTensor.data<float>();
+
+ DetectionResult* result = new DetectionResult(infResult.frameId, infResult.metaData);
+ auto retVal = std::unique_ptr<ResultBase>(result);
+
+ const auto& internalData = infResult.internalModelData->asRef<InternalImageModelData>();
+
+ for (size_t i = 0; i < detectionsNum; i++) {
+ float image_id = detections[i * objectSize + 0];
+ if (image_id < 0) {
+ break;
+ }
+
+ float confidence = detections[i * objectSize + 2];
+
+ /** Filtering out objects with confidence < confidence_threshold probability **/
+ if (confidence > confidenceThreshold) {
+ DetectedObject desc;
+
+ desc.confidence = confidence;
+ desc.labelID = static_cast<int>(detections[i * objectSize + 1]);
+ desc.label = getLabelName(desc.labelID);
+
+ desc.x = clamp(detections[i * objectSize + 3] * internalData.inputImgWidth,
+ 0.f,
+ static_cast<float>(internalData.inputImgWidth));
+ desc.y = clamp(detections[i * objectSize + 4] * internalData.inputImgHeight,
+ 0.f,
+ static_cast<float>(internalData.inputImgHeight));
+ desc.width = clamp(detections[i * objectSize + 5] * internalData.inputImgWidth,
+ 0.f,
+ static_cast<float>(internalData.inputImgWidth)) -
+ desc.x;
+ desc.height = clamp(detections[i * objectSize + 6] * internalData.inputImgHeight,
+ 0.f,
+ static_cast<float>(internalData.inputImgHeight)) -
+ desc.y;
+
+ result->objects.push_back(desc);
+ }
+ }
+
+ return retVal;
+}
+
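+// Multiple-output SSD postprocessing: separate 'boxes' and 'labels' outputs, optionally with a third 'scores' output;
+// when 'scores' is absent, the confidence is taken from the fifth element of each box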
+std::unique_ptr<ResultBase> ModelSSD::postprocessMultipleOutputs(InferenceResult& infResult) {
+ const float* boxes = infResult.outputsData[outputsNames[0]].data<float>();
+ size_t detectionsNum = infResult.outputsData[outputsNames[0]].get_shape()[detectionsNumId];
+ const float* labels = infResult.outputsData[outputsNames[1]].data<float>();
+ const float* scores = outputsNames.size() > 2 ? infResult.outputsData[outputsNames[2]].data<float>() : nullptr;
+
+ DetectionResult* result = new DetectionResult(infResult.frameId, infResult.metaData);
+ auto retVal = std::unique_ptr<ResultBase>(result);
+
+ const auto& internalData = infResult.internalModelData->asRef<InternalImageModelData>();
+
+    // In models where scores are stored in a separate output, coordinates are normalized to [0, 1]
+    // In other multiple-output models, coordinates are normalized to [0, netInputWidth] and [0, netInputHeight]
+ float widthScale = static_cast<float>(internalData.inputImgWidth) / (scores ? 1 : netInputWidth);
+ float heightScale = static_cast<float>(internalData.inputImgHeight) / (scores ? 1 : netInputHeight);
+
+ for (size_t i = 0; i < detectionsNum; i++) {
+ float confidence = scores ? scores[i] : boxes[i * objectSize + 4];
+
+ /** Filtering out objects with confidence < confidence_threshold probability **/
+ if (confidence > confidenceThreshold) {
+ DetectedObject desc;
+
+ desc.confidence = confidence;
+ desc.labelID = static_cast<int>(labels[i]);
+ desc.label = getLabelName(desc.labelID);
+
+ desc.x = clamp(boxes[i * objectSize] * widthScale, 0.f, static_cast<float>(internalData.inputImgWidth));
+ desc.y =
+ clamp(boxes[i * objectSize + 1] * heightScale, 0.f, static_cast<float>(internalData.inputImgHeight));
+ desc.width =
+ clamp(boxes[i * objectSize + 2] * widthScale, 0.f, static_cast<float>(internalData.inputImgWidth)) -
+ desc.x;
+ desc.height =
+ clamp(boxes[i * objectSize + 3] * heightScale, 0.f, static_cast<float>(internalData.inputImgHeight)) -
+ desc.y;
+
+ result->objects.push_back(desc);
+ }
+ }
+
+ return retVal;
+}
+
+void ModelSSD::prepareInputsOutputs(std::shared_ptr<ov::Model>& model) {
+ // --------------------------- Configure input & output -------------------------------------------------
+ // --------------------------- Prepare input ------------------------------------------------------
+ ov::preprocess::PrePostProcessor ppp(model);
+ for (const auto& input : model->inputs()) {
+ auto inputTensorName = input.get_any_name();
+ const ov::Shape& shape = input.get_shape();
+ ov::Layout inputLayout = getInputLayout(input);
+
+ if (shape.size() == 4) { // 1st input contains images
+ if (inputsNames.empty()) {
+ inputsNames.push_back(inputTensorName);
+ } else {
+ inputsNames[0] = inputTensorName;
+ }
+
+ inputTransform.setPrecision(ppp, inputTensorName);
+ ppp.input(inputTensorName).tensor().set_layout({"NHWC"});
+
+ if (useAutoResize) {
+ ppp.input(inputTensorName).tensor().set_spatial_dynamic_shape();
+
+ ppp.input(inputTensorName)
+ .preprocess()
+ .convert_element_type(ov::element::f32)
+ .resize(ov::preprocess::ResizeAlgorithm::RESIZE_LINEAR);
+ }
+
+ ppp.input(inputTensorName).model().set_layout(inputLayout);
+
+ netInputWidth = shape[ov::layout::width_idx(inputLayout)];
+ netInputHeight = shape[ov::layout::height_idx(inputLayout)];
+ } else if (shape.size() == 2) { // 2nd input contains image info
+ inputsNames.resize(2);
+ inputsNames[1] = inputTensorName;
+ ppp.input(inputTensorName).tensor().set_element_type(ov::element::f32);
+ } else {
+ throw std::logic_error("Unsupported " + std::to_string(input.get_shape().size()) +
+ "D "
+ "input layer '" +
+ input.get_any_name() +
+ "'. "
+ "Only 2D and 4D input layers are supported");
+ }
+ }
+ model = ppp.build();
+
+ // --------------------------- Prepare output -----------------------------------------------------
+ if (model->outputs().size() == 1) {
+ prepareSingleOutput(model);
+ } else {
+ prepareMultipleOutputs(model);
+ }
+}
+
+void ModelSSD::prepareSingleOutput(std::shared_ptr<ov::Model>& model) {
+ const auto& output = model->output();
+ outputsNames.push_back(output.get_any_name());
+
+ const ov::Shape& shape = output.get_shape();
+ const ov::Layout& layout("NCHW");
+ if (shape.size() != 4) {
+ throw std::logic_error("SSD single output must have 4 dimensions, but had " + std::to_string(shape.size()));
+ }
+ detectionsNumId = ov::layout::height_idx(layout);
+ objectSize = shape[ov::layout::width_idx(layout)];
+ if (objectSize != 7) {
+ throw std::logic_error("SSD single output must have 7 as a last dimension, but had " +
+ std::to_string(objectSize));
+ }
+ ov::preprocess::PrePostProcessor ppp(model);
+ ppp.output().tensor().set_element_type(ov::element::f32).set_layout(layout);
+ model = ppp.build();
+}
+
+void ModelSSD::prepareMultipleOutputs(std::shared_ptr<ov::Model>& model) {
+ const ov::OutputVector& outputs = model->outputs();
+ for (auto& output : outputs) {
+ const auto& tensorNames = output.get_names();
+ for (const auto& name : tensorNames) {
+ if (name.find("boxes") != std::string::npos) {
+ outputsNames.push_back(name);
+ break;
+ } else if (name.find("labels") != std::string::npos) {
+ outputsNames.push_back(name);
+ break;
+ } else if (name.find("scores") != std::string::npos) {
+ outputsNames.push_back(name);
+ break;
+ }
+ }
+ }
+ if (outputsNames.size() != 2 && outputsNames.size() != 3) {
+ throw std::logic_error("SSD model wrapper must have 2 or 3 outputs, but had " +
+ std::to_string(outputsNames.size()));
+ }
+ std::sort(outputsNames.begin(), outputsNames.end());
+
+ ov::preprocess::PrePostProcessor ppp(model);
+ const auto& boxesShape = model->output(outputsNames[0]).get_partial_shape().get_max_shape();
+
+ ov::Layout boxesLayout;
+ if (boxesShape.size() == 2) {
+ boxesLayout = "NC";
+ detectionsNumId = ov::layout::batch_idx(boxesLayout);
+ objectSize = boxesShape[ov::layout::channels_idx(boxesLayout)];
+
+ if (objectSize != 5) {
+ throw std::logic_error("Incorrect 'boxes' output shape, [n][5] shape is required");
+ }
+ } else if (boxesShape.size() == 3) {
+ boxesLayout = "CHW";
+ detectionsNumId = ov::layout::height_idx(boxesLayout);
+ objectSize = boxesShape[ov::layout::width_idx(boxesLayout)];
+
+ if (objectSize != 4) {
+ throw std::logic_error("Incorrect 'boxes' output shape, [b][n][4] shape is required");
+ }
+ } else {
+ throw std::logic_error("Incorrect number of 'boxes' output dimensions, expected 2 or 3, but had " +
+ std::to_string(boxesShape.size()));
+ }
+
+ ppp.output(outputsNames[0]).tensor().set_layout(boxesLayout);
+
+ for (const auto& outName : outputsNames) {
+ ppp.output(outName).tensor().set_element_type(ov::element::f32);
+ }
+ model = ppp.build();
+}
diff --git a/python/openvino/runtime/common/models/src/detection_model_yolo.cpp b/python/openvino/runtime/common/models/src/detection_model_yolo.cpp
new file mode 100644
index 0000000..2c4fb1d
--- /dev/null
+++ b/python/openvino/runtime/common/models/src/detection_model_yolo.cpp
@@ -0,0 +1,481 @@
+/*
+// Copyright (C) 2020-2022 Intel Corporation
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+*/
+
+#include "models/detection_model_yolo.h"
+
+#include <algorithm>
+#include <cmath>
+#include <cstdint>
+#include <stdexcept>
+#include <string>
+#include <utility>
+#include <vector>
+
+#include <openvino/openvino.hpp>
+
+#include <utils/common.hpp>
+#include <utils/slog.hpp>
+
+#include "models/internal_model_data.h"
+#include "models/results.h"
+
+std::vector<float> defaultAnchors[] = {
+ // YOLOv1v2
+ {0.57273f, 0.677385f, 1.87446f, 2.06253f, 3.33843f, 5.47434f, 7.88282f, 3.52778f, 9.77052f, 9.16828f},
+ // YOLOv3
+ {10.0f,
+ 13.0f,
+ 16.0f,
+ 30.0f,
+ 33.0f,
+ 23.0f,
+ 30.0f,
+ 61.0f,
+ 62.0f,
+ 45.0f,
+ 59.0f,
+ 119.0f,
+ 116.0f,
+ 90.0f,
+ 156.0f,
+ 198.0f,
+ 373.0f,
+ 326.0f},
+ // YOLOv4
+ {12.0f,
+ 16.0f,
+ 19.0f,
+ 36.0f,
+ 40.0f,
+ 28.0f,
+ 36.0f,
+ 75.0f,
+ 76.0f,
+ 55.0f,
+ 72.0f,
+ 146.0f,
+ 142.0f,
+ 110.0f,
+ 192.0f,
+ 243.0f,
+ 459.0f,
+ 401.0f},
+ // YOLOv4_Tiny
+ {10.0f, 14.0f, 23.0f, 27.0f, 37.0f, 58.0f, 81.0f, 82.0f, 135.0f, 169.0f, 344.0f, 319.0f},
+ // YOLOF
+ {16.0f, 16.0f, 32.0f, 32.0f, 64.0f, 64.0f, 128.0f, 128.0f, 256.0f, 256.0f, 512.0f, 512.0f}};
+
+const std::vector<int64_t> defaultMasks[] = {
+ // YOLOv1v2
+ {},
+ // YOLOv3
+ {},
+ // YOLOv4
+ {0, 1, 2, 3, 4, 5, 6, 7, 8},
+ // YOLOv4_Tiny
+ {1, 2, 3, 3, 4, 5},
+ // YOLOF
+ {0, 1, 2, 3, 4, 5}};
+
+static inline float sigmoid(float x) {
+ return 1.f / (1.f + exp(-x));
+}
+
+static inline float linear(float x) {
+ return x;
+}
+
+ModelYolo::ModelYolo(const std::string& modelFileName,
+ float confidenceThreshold,
+ bool useAutoResize,
+ bool useAdvancedPostprocessing,
+ float boxIOUThreshold,
+ const std::vector<std::string>& labels,
+ const std::vector<float>& anchors,
+ const std::vector<int64_t>& masks,
+ const std::string& layout)
+ : DetectionModel(modelFileName, confidenceThreshold, useAutoResize, labels, layout),
+ boxIOUThreshold(boxIOUThreshold),
+ useAdvancedPostprocessing(useAdvancedPostprocessing),
+ yoloVersion(YOLO_V3),
+ presetAnchors(anchors),
+ presetMasks(masks) {}
+
+void ModelYolo::prepareInputsOutputs(std::shared_ptr<ov::Model>& model) {
+ // --------------------------- Configure input & output -------------------------------------------------
+ // --------------------------- Prepare input ------------------------------------------------------
+ if (model->inputs().size() != 1) {
+ throw std::logic_error("YOLO model wrapper accepts models that have only 1 input");
+ }
+
+ const auto& input = model->input();
+ const ov::Shape& inputShape = model->input().get_shape();
+ ov::Layout inputLayout = getInputLayout(input);
+
+ if (inputShape[ov::layout::channels_idx(inputLayout)] != 3) {
+ throw std::logic_error("Expected 3-channel input");
+ }
+
+ ov::preprocess::PrePostProcessor ppp(model);
+ ppp.input().tensor().set_element_type(ov::element::u8).set_layout({"NHWC"});
+
+ if (useAutoResize) {
+ ppp.input().tensor().set_spatial_dynamic_shape();
+
+ ppp.input()
+ .preprocess()
+ .convert_element_type(ov::element::f32)
+ .resize(ov::preprocess::ResizeAlgorithm::RESIZE_LINEAR);
+ }
+
+ ppp.input().model().set_layout(inputLayout);
+
+ //--- Reading image input parameters
+ inputsNames.push_back(model->input().get_any_name());
+ netInputWidth = inputShape[ov::layout::width_idx(inputLayout)];
+ netInputHeight = inputShape[ov::layout::height_idx(inputLayout)];
+
+ // --------------------------- Prepare output -----------------------------------------------------
+ const ov::OutputVector& outputs = model->outputs();
+ std::map<std::string, ov::Shape> outShapes;
+ for (auto& out : outputs) {
+ ppp.output(out.get_any_name()).tensor().set_element_type(ov::element::f32);
+ if (out.get_shape().size() == 4) {
+ if (out.get_shape()[ov::layout::height_idx("NCHW")] != out.get_shape()[ov::layout::width_idx("NCHW")] &&
+ out.get_shape()[ov::layout::height_idx("NHWC")] == out.get_shape()[ov::layout::width_idx("NHWC")]) {
+ ppp.output(out.get_any_name()).model().set_layout("NHWC");
+                // outShapes are captured before ppp.build(), so set yoloRegionLayout to the layout the model has before ppp.build()
+ yoloRegionLayout = "NHWC";
+ }
+            // yolo-v1-tiny-tf output shape is [1, 21125], so set the layout only for 4-dimensional tensors
+ ppp.output(out.get_any_name()).tensor().set_layout("NCHW");
+ }
+ outputsNames.push_back(out.get_any_name());
+ outShapes[out.get_any_name()] = out.get_shape();
+ }
+ model = ppp.build();
+
+ yoloVersion = YOLO_V3;
+ bool isRegionFound = false;
+ for (const auto& op : model->get_ordered_ops()) {
+ if (std::string("RegionYolo") == op->get_type_name()) {
+ auto regionYolo = std::dynamic_pointer_cast<ov::op::v0::RegionYolo>(op);
+
+ if (regionYolo) {
+ if (!regionYolo->get_mask().size()) {
+ yoloVersion = YOLO_V1V2;
+ }
+
+ const auto& opName = op->get_friendly_name();
+ for (const auto& out : outputs) {
+ if (out.get_node()->get_friendly_name() == opName ||
+ out.get_node()->get_input_node_ptr(0)->get_friendly_name() == opName) {
+ isRegionFound = true;
+ regions.emplace(out.get_any_name(), Region(regionYolo));
+ }
+ }
+ }
+ }
+ }
+
+ if (!isRegionFound) {
+ switch (outputsNames.size()) {
+ case 1:
+ yoloVersion = YOLOF;
+ break;
+ case 2:
+ yoloVersion = YOLO_V4_TINY;
+ break;
+ case 3:
+ yoloVersion = YOLO_V4;
+ break;
+ }
+
+ int num = yoloVersion == YOLOF ? 6 : 3;
+ isObjConf = yoloVersion == YOLOF ? 0 : 1;
+ int i = 0;
+
+ auto chosenMasks = presetMasks.size() ? presetMasks : defaultMasks[yoloVersion];
+ if (chosenMasks.size() != num * outputs.size()) {
+ throw std::runtime_error(std::string("Invalid size of masks array, got ") +
+ std::to_string(presetMasks.size()) + ", should be " +
+ std::to_string(num * outputs.size()));
+ }
+
+ std::sort(outputsNames.begin(),
+ outputsNames.end(),
+ [&outShapes, this](const std::string& x, const std::string& y) {
+ return outShapes[x][ov::layout::height_idx(yoloRegionLayout)] >
+ outShapes[y][ov::layout::height_idx(yoloRegionLayout)];
+ });
+
+ for (const auto& name : outputsNames) {
+ const auto& shape = outShapes[name];
+ if (shape[ov::layout::channels_idx(yoloRegionLayout)] % num != 0) {
+ throw std::logic_error(std::string("Output tensor ") + name + " has wrong channel dimension");
+ }
+ regions.emplace(
+ name,
+ Region(shape[ov::layout::channels_idx(yoloRegionLayout)] / num - 4 - (isObjConf ? 1 : 0),
+ 4,
+ presetAnchors.size() ? presetAnchors : defaultAnchors[yoloVersion],
+ std::vector<int64_t>(chosenMasks.begin() + i * num, chosenMasks.begin() + (i + 1) * num),
+ shape[ov::layout::width_idx(yoloRegionLayout)],
+ shape[ov::layout::height_idx(yoloRegionLayout)]));
+ i++;
+ }
+ } else {
+ // Currently externally set anchors and masks are supported only for YoloV4
+ if (presetAnchors.size() || presetMasks.size()) {
+ slog::warn << "Preset anchors and mask can be set for YoloV4 model only. "
+ "This model is not YoloV4, so these options will be ignored."
+ << slog::endl;
+ }
+ }
+}
+
+std::unique_ptr<ResultBase> ModelYolo::postprocess(InferenceResult& infResult) {
+ DetectionResult* result = new DetectionResult(infResult.frameId, infResult.metaData);
+ std::vector<DetectedObject> objects;
+
+ // Parsing outputs
+ const auto& internalData = infResult.internalModelData->asRef<InternalImageModelData>();
+
+ for (auto& output : infResult.outputsData) {
+ this->parseYOLOOutput(output.first,
+ output.second,
+ netInputHeight,
+ netInputWidth,
+ internalData.inputImgHeight,
+ internalData.inputImgWidth,
+ objects);
+ }
+
+ if (useAdvancedPostprocessing) {
+ // Advanced postprocessing
+ // Checking IOU threshold conformance
+ // For every i-th object we're finding all objects it intersects with, and comparing confidence
+ // If i-th object has greater confidence than all others, we include it into result
+ for (const auto& obj1 : objects) {
+ bool isGoodResult = true;
+ for (const auto& obj2 : objects) {
+ if (obj1.labelID == obj2.labelID && obj1.confidence < obj2.confidence &&
+ intersectionOverUnion(obj1, obj2) >= boxIOUThreshold) { // if obj1 is the same as obj2, condition
+ // expression will evaluate to false anyway
+ isGoodResult = false;
+ break;
+ }
+ }
+ if (isGoodResult) {
+ result->objects.push_back(obj1);
+ }
+ }
+ } else {
+ // Classic postprocessing
+ std::sort(objects.begin(), objects.end(), [](const DetectedObject& x, const DetectedObject& y) {
+ return x.confidence > y.confidence;
+ });
+ for (size_t i = 0; i < objects.size(); ++i) {
+ if (objects[i].confidence == 0)
+ continue;
+ for (size_t j = i + 1; j < objects.size(); ++j)
+ if (intersectionOverUnion(objects[i], objects[j]) >= boxIOUThreshold)
+ objects[j].confidence = 0;
+ result->objects.push_back(objects[i]);
+ }
+ }
+
+ return std::unique_ptr<ResultBase>(result);
+}
+
+void ModelYolo::parseYOLOOutput(const std::string& output_name,
+ const ov::Tensor& tensor,
+ const unsigned long resized_im_h,
+ const unsigned long resized_im_w,
+ const unsigned long original_im_h,
+ const unsigned long original_im_w,
+ std::vector<DetectedObject>& objects) {
+ // --------------------------- Extracting layer parameters -------------------------------------
+ auto it = regions.find(output_name);
+ if (it == regions.end()) {
+ throw std::runtime_error(std::string("Can't find output layer with name ") + output_name);
+ }
+ auto& region = it->second;
+
+ int sideW = 0;
+ int sideH = 0;
+ unsigned long scaleH;
+ unsigned long scaleW;
+ switch (yoloVersion) {
+ case YOLO_V1V2:
+ sideH = region.outputHeight;
+ sideW = region.outputWidth;
+ scaleW = region.outputWidth;
+ scaleH = region.outputHeight;
+ break;
+ case YOLO_V3:
+ case YOLO_V4:
+ case YOLO_V4_TINY:
+ case YOLOF:
+ sideH = static_cast<int>(tensor.get_shape()[ov::layout::height_idx("NCHW")]);
+ sideW = static_cast<int>(tensor.get_shape()[ov::layout::width_idx("NCHW")]);
+ scaleW = resized_im_w;
+ scaleH = resized_im_h;
+ break;
+ }
+
+ auto entriesNum = sideW * sideH;
+ const float* outData = tensor.data<float>();
+
+ auto postprocessRawData =
+ (yoloVersion == YOLO_V4 || yoloVersion == YOLO_V4_TINY || yoloVersion == YOLOF) ? sigmoid : linear;
+
+ // --------------------------- Parsing YOLO Region output -------------------------------------
+ for (int i = 0; i < entriesNum; ++i) {
+ int row = i / sideW;
+ int col = i % sideW;
+ for (int n = 0; n < region.num; ++n) {
+ //--- Getting region data
+ int obj_index = calculateEntryIndex(entriesNum,
+ region.coords,
+ region.classes + isObjConf,
+ n * entriesNum + i,
+ region.coords);
+ int box_index =
+ calculateEntryIndex(entriesNum, region.coords, region.classes + isObjConf, n * entriesNum + i, 0);
+ float scale = isObjConf ? postprocessRawData(outData[obj_index]) : 1;
+
+ //--- Preliminary check for confidence threshold conformance
+ if (scale >= confidenceThreshold) {
+ //--- Calculating scaled region's coordinates
+ float x, y;
+ if (yoloVersion == YOLOF) {
+ x = (static_cast<float>(col) / sideW +
+ outData[box_index + 0 * entriesNum] * region.anchors[2 * n] / scaleW) *
+ original_im_w;
+ y = (static_cast<float>(row) / sideH +
+ outData[box_index + 1 * entriesNum] * region.anchors[2 * n + 1] / scaleH) *
+ original_im_h;
+ } else {
+ x = static_cast<float>((col + postprocessRawData(outData[box_index + 0 * entriesNum])) / sideW *
+ original_im_w);
+ y = static_cast<float>((row + postprocessRawData(outData[box_index + 1 * entriesNum])) / sideH *
+ original_im_h);
+ }
+ float height = static_cast<float>(std::exp(outData[box_index + 3 * entriesNum]) *
+ region.anchors[2 * n + 1] * original_im_h / scaleH);
+ float width = static_cast<float>(std::exp(outData[box_index + 2 * entriesNum]) * region.anchors[2 * n] *
+ original_im_w / scaleW);
+
+ DetectedObject obj;
+ obj.x = clamp(x - width / 2, 0.f, static_cast<float>(original_im_w));
+ obj.y = clamp(y - height / 2, 0.f, static_cast<float>(original_im_h));
+ obj.width = clamp(width, 0.f, static_cast<float>(original_im_w - obj.x));
+ obj.height = clamp(height, 0.f, static_cast<float>(original_im_h - obj.y));
+
+ for (size_t j = 0; j < region.classes; ++j) {
+ int class_index = calculateEntryIndex(entriesNum,
+ region.coords,
+ region.classes + isObjConf,
+ n * entriesNum + i,
+ region.coords + isObjConf + j);
+ float prob = scale * postprocessRawData(outData[class_index]);
+
+ //--- Checking confidence threshold conformance and adding region to the list
+ if (prob >= confidenceThreshold) {
+ obj.confidence = prob;
+ obj.labelID = j;
+ obj.label = getLabelName(obj.labelID);
+ objects.push_back(obj);
+ }
+ }
+ }
+ }
+ }
+}
+
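+// Compute the flat index of one entry (coordinate, objectness or class score) for a given anchor and cell in the
+// channel-major region output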
+int ModelYolo::calculateEntryIndex(int totalCells, int lcoords, size_t lclasses, int location, int entry) {
+ int n = location / totalCells;
+ int loc = location % totalCells;
+ return (n * (lcoords + lclasses) + entry) * totalCells + loc;
+}
+
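+// Intersection over union of two detections given as (x, y, width, height) boxes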
+double ModelYolo::intersectionOverUnion(const DetectedObject& o1, const DetectedObject& o2) {
+ double overlappingWidth = fmin(o1.x + o1.width, o2.x + o2.width) - fmax(o1.x, o2.x);
+ double overlappingHeight = fmin(o1.y + o1.height, o2.y + o2.height) - fmax(o1.y, o2.y);
+ double intersectionArea =
+ (overlappingWidth < 0 || overlappingHeight < 0) ? 0 : overlappingHeight * overlappingWidth;
+ double unionArea = o1.width * o1.height + o2.width * o2.height - intersectionArea;
+ return intersectionArea / unionArea;
+}
+
+ModelYolo::Region::Region(const std::shared_ptr<ov::op::v0::RegionYolo>& regionYolo) {
+ coords = regionYolo->get_num_coords();
+ classes = regionYolo->get_num_classes();
+ auto mask = regionYolo->get_mask();
+ num = mask.size();
+
+ auto shape = regionYolo->get_input_shape(0);
+ outputWidth = shape[3];
+ outputHeight = shape[2];
+
+ if (num) {
+ // Parsing YoloV3 parameters
+ anchors.resize(num * 2);
+
+ for (int i = 0; i < num; ++i) {
+ anchors[i * 2] = regionYolo->get_anchors()[mask[i] * 2];
+ anchors[i * 2 + 1] = regionYolo->get_anchors()[mask[i] * 2 + 1];
+ }
+ } else {
+ // Parsing YoloV2 parameters
+ num = regionYolo->get_num_regions();
+ anchors = regionYolo->get_anchors();
+ if (anchors.empty()) {
+ anchors = defaultAnchors[YOLO_V1V2];
+ num = 5;
+ }
+ }
+}
+
+ModelYolo::Region::Region(size_t classes,
+ int coords,
+ const std::vector<float>& anchors,
+ const std::vector<int64_t>& masks,
+ size_t outputWidth,
+ size_t outputHeight)
+ : classes(classes),
+ coords(coords),
+ outputWidth(outputWidth),
+ outputHeight(outputHeight) {
+ num = masks.size();
+
+ if (anchors.size() == 0 || anchors.size() % 2 != 0) {
+        throw std::runtime_error("Explicitly initialized region should have a non-empty, even-sized anchors vector");
+ }
+
+ if (num) {
+ this->anchors.resize(num * 2);
+
+ for (int i = 0; i < num; ++i) {
+ this->anchors[i * 2] = anchors[masks[i] * 2];
+ this->anchors[i * 2 + 1] = anchors[masks[i] * 2 + 1];
+ }
+ } else {
+ this->anchors = anchors;
+ num = anchors.size() / 2;
+ }
+}
diff --git a/python/openvino/runtime/common/models/src/detection_model_yolov3_onnx.cpp b/python/openvino/runtime/common/models/src/detection_model_yolov3_onnx.cpp
new file mode 100644
index 0000000..132eb9e
--- /dev/null
+++ b/python/openvino/runtime/common/models/src/detection_model_yolov3_onnx.cpp
@@ -0,0 +1,188 @@
+/*
+// Copyright (C) 2022 Intel Corporation
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+*/
+
+#include "models/detection_model_yolov3_onnx.h"
+
+#include <algorithm>
+#include <cmath>
+#include <cstdint>
+#include <stdexcept>
+#include <string>
+#include <utility>
+#include <vector>
+
+#include <openvino/openvino.hpp>
+
+#include <utils/common.hpp>
+#include <utils/slog.hpp>
+
+#include "models/input_data.h"
+#include "models/internal_model_data.h"
+#include "models/results.h"
+#include "utils/image_utils.h"
+
+ModelYoloV3ONNX::ModelYoloV3ONNX(const std::string& modelFileName,
+ float confidenceThreshold,
+ const std::vector<std::string>& labels,
+ const std::string& layout)
+ : DetectionModel(modelFileName, confidenceThreshold, false, labels, layout) {
+ interpolationMode = cv::INTER_CUBIC;
+ resizeMode = RESIZE_KEEP_ASPECT_LETTERBOX;
+}
+
+
+void ModelYoloV3ONNX::prepareInputsOutputs(std::shared_ptr<ov::Model>& model) {
+ // --------------------------- Configure input & output -------------------------------------------------
+ // --------------------------- Prepare inputs ------------------------------------------------------
+ const ov::OutputVector& inputs = model->inputs();
+ if (inputs.size() != 2) {
+ throw std::logic_error("YoloV3ONNX model wrapper expects models that have 2 inputs");
+ }
+
+ ov::preprocess::PrePostProcessor ppp(model);
+    inputsNames.resize(inputs.size());
+ for (auto& input : inputs) {
+ const ov::Shape& currentShape = input.get_shape();
+ std::string currentName = input.get_any_name();
+ const ov::Layout& currentLayout = getInputLayout(input);
+
+ if (currentShape.size() == 4) {
+ if (currentShape[ov::layout::channels_idx(currentLayout)] != 3) {
+ throw std::logic_error("Expected 4D image input with 3 channels");
+ }
+ inputsNames[0] = currentName;
+ netInputWidth = currentShape[ov::layout::width_idx(currentLayout)];
+ netInputHeight = currentShape[ov::layout::height_idx(currentLayout)];
+ ppp.input(currentName).tensor().set_element_type(ov::element::u8).set_layout({"NHWC"});
+ } else if (currentShape.size() == 2) {
+ if (currentShape[ov::layout::channels_idx(currentLayout)] != 2) {
+ throw std::logic_error("Expected 2D image info input with 2 channels");
+ }
+ inputsNames[1] = currentName;
+ ppp.input(currentName).tensor().set_element_type(ov::element::i32);
+ }
+ ppp.input(currentName).model().set_layout(currentLayout);
+ }
+
+ // --------------------------- Prepare outputs -----------------------------------------------------
+ const ov::OutputVector& outputs = model->outputs();
+ if (outputs.size() != 3) {
+ throw std::logic_error("YoloV3ONNX model wrapper expects models that have 3 outputs");
+ }
+
+ for (auto& output : outputs) {
+ const ov::Shape& currentShape = output.get_partial_shape().get_max_shape();
+ std::string currentName = output.get_any_name();
+ if (currentShape.back() == 3) {
+ indicesOutputName = currentName;
+ ppp.output(currentName).tensor().set_element_type(ov::element::i32);
+ } else if (currentShape[2] == 4) {
+ boxesOutputName = currentName;
+ ppp.output(currentName).tensor().set_element_type(ov::element::f32);
+ } else if (currentShape[1] == numberOfClasses) {
+ scoresOutputName = currentName;
+ ppp.output(currentName).tensor().set_element_type(ov::element::f32);
+ } else {
+ throw std::logic_error("Expected shapes [:,:,4], [:,"
+ + std::to_string(numberOfClasses) + ",:] and [:,3] for outputs");
+ }
+ outputsNames.push_back(currentName);
+ }
+ model = ppp.build();
+}
+
+std::shared_ptr<InternalModelData> ModelYoloV3ONNX::preprocess(const InputData& inputData,
+ ov::InferRequest& request) {
+ const auto& origImg = inputData.asRef<ImageInputData>().inputImage;
+
+ cv::Mat info(cv::Size(1, 2), CV_32SC1);
+ info.at<int>(0, 0) = origImg.rows;
+ info.at<int>(0, 1) = origImg.cols;
+ auto allocator = std::make_shared<SharedTensorAllocator>(info);
+ ov::Tensor infoInput = ov::Tensor(ov::element::i32, ov::Shape({1, 2}), ov::Allocator(allocator));
+
+ request.set_tensor(inputsNames[1], infoInput);
+
+ return ImageModel::preprocess(inputData, request);
+}
+
+namespace {
+float getScore(const ov::Tensor& scoresTensor, size_t classInd, size_t boxInd) {
+ const float* scoresPtr = scoresTensor.data<float>();
+ const auto shape = scoresTensor.get_shape();
+ int N = shape[2];
+
+ return scoresPtr[classInd * N + boxInd];
+}
+}
+
+std::unique_ptr<ResultBase> ModelYoloV3ONNX::postprocess(InferenceResult& infResult) {
+ // Get info about input image
+ const auto imgWidth = infResult.internalModelData->asRef<InternalImageModelData>().inputImgWidth;
+ const auto imgHeight = infResult.internalModelData->asRef<InternalImageModelData>().inputImgHeight;
+
+ // Get outputs tensors
+ const ov::Tensor& boxes = infResult.outputsData[boxesOutputName];
+ const float* boxesPtr = boxes.data<float>();
+
+ const ov::Tensor& scores = infResult.outputsData[scoresOutputName];
+ const ov::Tensor& indices = infResult.outputsData[indicesOutputName];
+
+ const int* indicesData = indices.data<int>();
+ const auto indicesShape = indices.get_shape();
+ const auto boxShape = boxes.get_shape();
+
+ // Generate detection results
+ DetectionResult* result = new DetectionResult(infResult.frameId, infResult.metaData);
+ size_t numberOfBoxes = indicesShape.size() == 3 ? indicesShape[1] : indicesShape[0];
+ int indicesStride = indicesShape.size() == 3 ? indicesShape[2] : indicesShape[1];
+
+ for (size_t i = 0; i < numberOfBoxes; ++i) {
+ int batchInd = indicesData[i * indicesStride];
+ int classInd = indicesData[i * indicesStride + 1];
+ int boxInd = indicesData[i * indicesStride + 2];
+
+ if (batchInd == -1) {
+ break;
+ }
+
+ float score = getScore(scores, classInd, boxInd);
+
+ if (score > confidenceThreshold) {
+ DetectedObject obj;
+ size_t startPos = boxShape[2] * boxInd;
+
+ auto x = boxesPtr[startPos + 1];
+ auto y = boxesPtr[startPos];
+ auto width = boxesPtr[startPos + 3] - x;
+ auto height = boxesPtr[startPos + 2] - y;
+
+ // Create new detected box
+ obj.x = clamp(x, 0.f, static_cast<float>(imgWidth));
+ obj.y = clamp(y, 0.f, static_cast<float>(imgHeight));
+ obj.height = clamp(height, 0.f, static_cast<float>(imgHeight));
+ obj.width = clamp(width, 0.f, static_cast<float>(imgWidth));
+ obj.confidence = score;
+ obj.labelID = classInd;
+ obj.label = getLabelName(classInd);
+
+ result->objects.push_back(obj);
+
+ }
+ }
+
+ return std::unique_ptr<ResultBase>(result);
+}
diff --git a/python/openvino/runtime/common/models/src/detection_model_yolox.cpp b/python/openvino/runtime/common/models/src/detection_model_yolox.cpp
new file mode 100644
index 0000000..1e434ff
--- /dev/null
+++ b/python/openvino/runtime/common/models/src/detection_model_yolox.cpp
@@ -0,0 +1,194 @@
+/*
+// Copyright (C) 2022 Intel Corporation
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+*/
+
+#include "models/detection_model_yolox.h"
+
+#include <algorithm>
+#include <cmath>
+#include <cstdint>
+#include <stdexcept>
+#include <string>
+#include <utility>
+#include <vector>
+
+#include <openvino/openvino.hpp>
+
+#include <utils/common.hpp>
+#include <utils/slog.hpp>
+
+#include "models/input_data.h"
+#include "models/internal_model_data.h"
+#include "models/results.h"
+#include "utils/image_utils.h"
+#include "utils/nms.hpp"
+
+ModelYoloX::ModelYoloX(const std::string& modelFileName,
+ float confidenceThreshold,
+ float boxIOUThreshold,
+ const std::vector<std::string>& labels,
+ const std::string& layout)
+ : DetectionModel(modelFileName, confidenceThreshold, false, labels, layout),
+ boxIOUThreshold(boxIOUThreshold) {
+ resizeMode = RESIZE_KEEP_ASPECT;
+}
+
+void ModelYoloX::prepareInputsOutputs(std::shared_ptr<ov::Model>& model) {
+ // --------------------------- Configure input & output -------------------------------------------------
+ // --------------------------- Prepare input ------------------------------------------------------
+ const ov::OutputVector& inputs = model->inputs();
+ if (inputs.size() != 1) {
+ throw std::logic_error("YOLOX model wrapper accepts models that have only 1 input");
+ }
+
+ //--- Check image input
+ const auto& input = model->input();
+ const ov::Shape& inputShape = model->input().get_shape();
+ ov::Layout inputLayout = getInputLayout(input);
+
+    if (inputShape.size() != 4 || inputShape[ov::layout::channels_idx(inputLayout)] != 3) {
+ throw std::logic_error("Expected 4D image input with 3 channels");
+ }
+
+ ov::preprocess::PrePostProcessor ppp(model);
+ ppp.input().tensor().set_element_type(ov::element::u8).set_layout({"NHWC"});
+
+ ppp.input().model().set_layout(inputLayout);
+
+ //--- Reading image input parameters
+ inputsNames.push_back(input.get_any_name());
+ netInputWidth = inputShape[ov::layout::width_idx(inputLayout)];
+ netInputHeight = inputShape[ov::layout::height_idx(inputLayout)];
+ setStridesGrids();
+
+ // --------------------------- Prepare output -----------------------------------------------------
+ if (model->outputs().size() != 1) {
+ throw std::logic_error("YoloX model wrapper expects models that have only 1 output");
+ }
+ const auto& output = model->output();
+ outputsNames.push_back(output.get_any_name());
+ const ov::Shape& shape = output.get_shape();
+
+ if (shape.size() != 3) {
+ throw std::logic_error("YOLOX single output must have 3 dimensions, but had " + std::to_string(shape.size()));
+ }
+ ppp.output().tensor().set_element_type(ov::element::f32);
+
+ model = ppp.build();
+}
+
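+// Precompute grid cell coordinates and the matching stride for every prediction of the three YOLOX output scales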
+void ModelYoloX::setStridesGrids() {
+ std::vector<size_t> strides = {8, 16, 32};
+ std::vector<size_t> hsizes(3);
+ std::vector<size_t> wsizes(3);
+
+ for (size_t i = 0; i < strides.size(); ++i) {
+ hsizes[i] = netInputHeight / strides[i];
+ wsizes[i] = netInputWidth / strides[i];
+ }
+
+ for (size_t size_index = 0; size_index < hsizes.size(); ++size_index) {
+ for (size_t h_index = 0; h_index < hsizes[size_index]; ++h_index) {
+ for (size_t w_index = 0; w_index < wsizes[size_index]; ++w_index) {
+ grids.emplace_back(w_index, h_index);
+ expandedStrides.push_back(strides[size_index]);
+ }
+ }
+ }
+}
+
+std::shared_ptr<InternalModelData> ModelYoloX::preprocess(const InputData& inputData,
+ ov::InferRequest& request) {
+ const auto& origImg = inputData.asRef<ImageInputData>().inputImage;
+ double scale = std::min(static_cast<double>(netInputWidth) / origImg.cols,
+ static_cast<double>(netInputHeight) / origImg.rows);
+
+ cv::Mat resizedImage = resizeImageExt(origImg, netInputWidth, netInputHeight, resizeMode,
+ interpolationMode, nullptr, cv::Scalar(114, 114, 114));
+
+ request.set_input_tensor(wrapMat2Tensor(resizedImage));
+ return std::make_shared<InternalScaleData>(origImg.cols, origImg.rows, scale, scale);
+}
+
+std::unique_ptr<ResultBase> ModelYoloX::postprocess(InferenceResult& infResult) {
+ // Get metadata about input image shape and scale
+ const auto& scale = infResult.internalModelData->asRef<InternalScaleData>();
+
+ // Get output tensor
+ const ov::Tensor& output = infResult.outputsData[outputsNames[0]];
+ const auto& outputShape = output.get_shape();
+ float* outputPtr = output.data<float>();
+
+ // Generate detection results
+ DetectionResult* result = new DetectionResult(infResult.frameId, infResult.metaData);
+
+ // Update coordinates according to strides
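+ // Each raw prediction is relative to its grid cell: centre offsets are added to the cell
+ // index and multiplied by the stride, while width and height are exp()-decoded and scaled.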
+ for (size_t box_index = 0; box_index < expandedStrides.size(); ++box_index) {
+ size_t startPos = outputShape[2] * box_index;
+ outputPtr[startPos] = (outputPtr[startPos] + grids[box_index].first) * expandedStrides[box_index];
+ outputPtr[startPos + 1] = (outputPtr[startPos + 1] + grids[box_index].second) * expandedStrides[box_index];
+ outputPtr[startPos + 2] = std::exp(outputPtr[startPos + 2]) * expandedStrides[box_index];
+ outputPtr[startPos + 3] = std::exp(outputPtr[startPos + 3]) * expandedStrides[box_index];
+ }
+
+ // Filter predictions
+ std::vector<Anchor> validBoxes;
+ std::vector<float> scores;
+ std::vector<size_t> classes;
+ for (size_t box_index = 0; box_index < expandedStrides.size(); ++box_index) {
+ size_t startPos = outputShape[2] * box_index;
+ float score = outputPtr[startPos + 4];
+ if (score < confidenceThreshold)
+ continue;
+ float maxClassScore = -1;
+ size_t mainClass = 0;
+ for (size_t class_index = 0; class_index < numberOfClasses; ++class_index) {
+ if (outputPtr[startPos + 5 + class_index] > maxClassScore) {
+ maxClassScore = outputPtr[startPos + 5 + class_index];
+ mainClass = class_index;
+ }
+ }
+
+ // Filter by score
+ score *= maxClassScore;
+ if (score < confidenceThreshold)
+ continue;
+
+ // Add successful boxes
+ scores.push_back(score);
+ classes.push_back(mainClass);
+ Anchor trueBox = {outputPtr[startPos + 0] - outputPtr[startPos + 2] / 2, outputPtr[startPos + 1] - outputPtr[startPos + 3] / 2,
+ outputPtr[startPos + 0] + outputPtr[startPos + 2] / 2, outputPtr[startPos + 1] + outputPtr[startPos + 3] / 2};
+ validBoxes.push_back(Anchor({trueBox.left / scale.scaleX, trueBox.top / scale.scaleY,
+ trueBox.right / scale.scaleX, trueBox.bottom / scale.scaleY}));
+ }
+
+ // NMS for valid boxes
+ std::vector<int> keep = nms(validBoxes, scores, boxIOUThreshold, true);
+ for (auto& index: keep) {
+ // Create new detected box
+ DetectedObject obj;
+ obj.x = clamp(validBoxes[index].left, 0.f, static_cast<float>(scale.inputImgWidth));
+ obj.y = clamp(validBoxes[index].top, 0.f, static_cast<float>(scale.inputImgHeight));
+ obj.height = clamp(validBoxes[index].bottom - validBoxes[index].top, 0.f, static_cast<float>(scale.inputImgHeight));
+ obj.width = clamp(validBoxes[index].right - validBoxes[index].left, 0.f, static_cast<float>(scale.inputImgWidth));
+ obj.confidence = scores[index];
+ obj.labelID = classes[index];
+ obj.label = getLabelName(classes[index]);
+ result->objects.push_back(obj);
+ }
+
+ return std::unique_ptr<ResultBase>(result);
+}
diff --git a/python/openvino/runtime/common/models/src/hpe_model_associative_embedding.cpp b/python/openvino/runtime/common/models/src/hpe_model_associative_embedding.cpp
new file mode 100644
index 0000000..33a3604
--- /dev/null
+++ b/python/openvino/runtime/common/models/src/hpe_model_associative_embedding.cpp
@@ -0,0 +1,264 @@
+/*
+// Copyright (C) 2021-2022 Intel Corporation
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+*/
+
+#include "models/hpe_model_associative_embedding.h"
+
+#include <stddef.h>
+
+#include <algorithm>
+#include <cmath>
+#include <map>
+#include <stdexcept>
+#include <string>
+#include <unordered_set>
+#include <utility>
+#include <vector>
+
+#include <openvino/openvino.hpp>
+
+#include <utils/image_utils.h>
+#include <utils/ocv_common.hpp>
+#include <utils/slog.hpp>
+
+#include "models/associative_embedding_decoder.h"
+#include "models/input_data.h"
+#include "models/internal_model_data.h"
+#include "models/results.h"
+
+const cv::Vec3f HpeAssociativeEmbedding::meanPixel = cv::Vec3f::all(128);
+const float HpeAssociativeEmbedding::detectionThreshold = 0.1f;
+const float HpeAssociativeEmbedding::tagThreshold = 1.0f;
+
+HpeAssociativeEmbedding::HpeAssociativeEmbedding(const std::string& modelFileName,
+ double aspectRatio,
+ int targetSize,
+ float confidenceThreshold,
+ const std::string& layout,
+ float delta,
+ RESIZE_MODE resizeMode)
+ : ImageModel(modelFileName, false, layout),
+ aspectRatio(aspectRatio),
+ targetSize(targetSize),
+ confidenceThreshold(confidenceThreshold),
+ delta(delta) {
+ this->resizeMode = resizeMode;
+ interpolationMode = cv::INTER_CUBIC;
+ }
+
+void HpeAssociativeEmbedding::prepareInputsOutputs(std::shared_ptr<ov::Model>& model) {
+ // --------------------------- Configure input & output -------------------------------------------------
+ // --------------------------- Prepare input Tensors ------------------------------------------------------
+ if (model->inputs().size() != 1) {
+ throw std::logic_error("HPE AE model wrapper supports topologies with only 1 input.");
+ }
+ inputsNames.push_back(model->input().get_any_name());
+
+ const ov::Shape& inputShape = model->input().get_shape();
+ const ov::Layout& inputLayout = getInputLayout(model->input());
+
+ if (inputShape.size() != 4 || inputShape[ov::layout::batch_idx(inputLayout)] != 1 ||
+ inputShape[ov::layout::channels_idx(inputLayout)] != 3) {
+ throw std::logic_error("3-channel 4-dimensional model's input is expected");
+ }
+
+ ov::preprocess::PrePostProcessor ppp(model);
+ ppp.input().tensor().set_element_type(ov::element::u8).set_layout({"NHWC"});
+
+ ppp.input().model().set_layout(inputLayout);
+
+ // --------------------------- Prepare output Tensors -----------------------------------------------------
+ const ov::OutputVector& outputs = model->outputs();
+ if (outputs.size() != 2 && outputs.size() != 3) {
+ throw std::logic_error("HPE AE model model wrapper supports topologies only with 2 or 3 outputs");
+ }
+
+ for (const auto& output : model->outputs()) {
+ const auto& outTensorName = output.get_any_name();
+ ppp.output(outTensorName).tensor().set_element_type(ov::element::f32);
+
+ for (const auto& name : output.get_names()) {
+ outputsNames.push_back(name);
+ }
+
+ const ov::Shape& outputShape = output.get_shape();
+ if (outputShape.size() != 4 && outputShape.size() != 5) {
+ throw std::logic_error("output tensors are expected to be 4-dimensional or 5-dimensional");
+ }
+ if (outputShape[ov::layout::batch_idx("NC...")] != 1 || outputShape[ov::layout::channels_idx("NC...")] != 17) {
+ throw std::logic_error("output tensors are expected to have 1 batch size and 17 channels");
+ }
+ }
+ model = ppp.build();
+
+ embeddingsTensorName = findTensorByName("embeddings", outputsNames);
+ heatmapsTensorName = findTensorByName("heatmaps", outputsNames);
+ try {
+ nmsHeatmapsTensorName = findTensorByName("nms_heatmaps", outputsNames);
+ } catch (const std::runtime_error&) { nmsHeatmapsTensorName = heatmapsTensorName; }
+
+ changeInputSize(model);
+}
+
+void HpeAssociativeEmbedding::changeInputSize(std::shared_ptr<ov::Model>& model) {
+ ov::Shape inputShape = model->input().get_shape();
+ const ov::Layout& layout = ov::layout::get_layout(model->input());
+ const auto batchId = ov::layout::batch_idx(layout);
+ const auto heightId = ov::layout::height_idx(layout);
+ const auto widthId = ov::layout::width_idx(layout);
+
+ if (!targetSize) {
+ targetSize = static_cast<int>(std::min(inputShape[heightId], inputShape[widthId]));
+ }
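+ // Derive the network input size from the target size and aspect ratio, then round both
+ // dimensions up to the nearest multiple of the model stride.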
+ int inputHeight = aspectRatio >= 1.0 ? targetSize : static_cast<int>(std::round(targetSize / aspectRatio));
+ int inputWidth = aspectRatio >= 1.0 ? static_cast<int>(std::round(targetSize * aspectRatio)) : targetSize;
+ int height = static_cast<int>((inputHeight + stride - 1) / stride) * stride;
+ int width = static_cast<int>((inputWidth + stride - 1) / stride) * stride;
+ inputShape[batchId] = 1;
+ inputShape[heightId] = height;
+ inputShape[widthId] = width;
+ inputLayerSize = cv::Size(width, height);
+
+ model->reshape(inputShape);
+}
+
+std::shared_ptr<InternalModelData> HpeAssociativeEmbedding::preprocess(const InputData& inputData,
+ ov::InferRequest& request) {
+ auto& image = inputData.asRef<ImageInputData>().inputImage;
+ cv::Rect roi;
+ auto paddedImage = resizeImageExt(image, inputLayerSize.width, inputLayerSize.height, resizeMode, interpolationMode, &roi);
+ if (inputLayerSize.height - stride >= roi.height || inputLayerSize.width - stride >= roi.width) {
+ slog::warn << "\tChosen model aspect ratio doesn't match image aspect ratio" << slog::endl;
+ }
+ request.set_input_tensor(wrapMat2Tensor(paddedImage));
+
+ return std::make_shared<InternalScaleData>(paddedImage.cols,
+ paddedImage.rows,
+ image.size().width / static_cast<float>(roi.width),
+ image.size().height / static_cast<float>(roi.height));
+}
+
+std::unique_ptr<ResultBase> HpeAssociativeEmbedding::postprocess(InferenceResult& infResult) {
+ HumanPoseResult* result = new HumanPoseResult(infResult.frameId, infResult.metaData);
+
+ const auto& aembds = infResult.outputsData[embeddingsTensorName];
+ const ov::Shape& aembdsShape = aembds.get_shape();
+ float* const aembdsMapped = aembds.data<float>();
+ std::vector<cv::Mat> aembdsMaps = split(aembdsMapped, aembdsShape);
+
+ const auto& heats = infResult.outputsData[heatmapsTensorName];
+ const ov::Shape& heatMapsShape = heats.get_shape();
+ float* const heatMapsMapped = heats.data<float>();
+ std::vector<cv::Mat> heatMaps = split(heatMapsMapped, heatMapsShape);
+
+ std::vector<cv::Mat> nmsHeatMaps = heatMaps;
+ if (nmsHeatmapsTensorName != heatmapsTensorName) {
+ const auto& nmsHeats = infResult.outputsData[nmsHeatmapsTensorName];
+ const ov::Shape& nmsHeatMapsShape = nmsHeats.get_shape();
+ float* const nmsHeatMapsMapped = nmsHeats.data<float>();
+ nmsHeatMaps = split(nmsHeatMapsMapped, nmsHeatMapsShape);
+ }
+ std::vector<HumanPose> poses = extractPoses(heatMaps, aembdsMaps, nmsHeatMaps);
+
+ // Rescale poses to the original image
+ const auto& scale = infResult.internalModelData->asRef<InternalScaleData>();
+ const float outputScale = inputLayerSize.width / static_cast<float>(heatMapsShape[3]);
+ float shiftX = 0.0, shiftY = 0.0;
+ float scaleX = 1.0, scaleY = 1.0;
+
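+ // For letterbox resizing the padding is symmetric, so keypoints are shifted by half of the
+ // padded margin before being scaled back to the source image resolution.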
+ if (resizeMode == RESIZE_KEEP_ASPECT_LETTERBOX) {
+ scaleX = scaleY = std::min(scale.scaleX, scale.scaleY);
+ if (aspectRatio >= 1.0)
+ shiftX = static_cast<float>((targetSize * scaleX * aspectRatio - scale.inputImgWidth * scaleX) / 2);
+ else
+ shiftY = static_cast<float>((targetSize * scaleY / aspectRatio - scale.inputImgHeight * scaleY) / 2);
+ scaleX = scaleY *= outputScale;
+ } else {
+ scaleX = scale.scaleX * outputScale;
+ scaleY = scale.scaleY * outputScale;
+ }
+
+ for (auto& pose : poses) {
+ for (auto& keypoint : pose.keypoints) {
+ if (keypoint != cv::Point2f(-1, -1)) {
+ keypoint.x = keypoint.x * scaleX + shiftX;
+ keypoint.y = keypoint.y * scaleY + shiftY;
+ }
+ }
+ result->poses.push_back(pose);
+ }
+
+ return std::unique_ptr<ResultBase>(result);
+}
+
+std::string HpeAssociativeEmbedding::findTensorByName(const std::string& tensorName,
+ const std::vector<std::string>& outputsNames) {
+ std::vector<std::string> suitableLayers;
+ for (auto& outputName : outputsNames) {
+ if (outputName.rfind(tensorName, 0) == 0) {
+ suitableLayers.push_back(outputName);
+ }
+ }
+ if (suitableLayers.empty()) {
+ throw std::runtime_error("Suitable tensor for " + tensorName + " output is not found");
+ } else if (suitableLayers.size() > 1) {
+ throw std::runtime_error("More than 1 tensor matched to " + tensorName + " output");
+ }
+ return suitableLayers[0];
+}
+
+std::vector<cv::Mat> HpeAssociativeEmbedding::split(float* data, const ov::Shape& shape) {
+ std::vector<cv::Mat> flattenData(shape[1]);
+ for (size_t i = 0; i < flattenData.size(); i++) {
+ flattenData[i] = cv::Mat(shape[2], shape[3], CV_32FC1, data + i * shape[2] * shape[3]);
+ }
+ return flattenData;
+}
+
+std::vector<HumanPose> HpeAssociativeEmbedding::extractPoses(std::vector<cv::Mat>& heatMaps,
+ const std::vector<cv::Mat>& aembdsMaps,
+ const std::vector<cv::Mat>& nmsHeatMaps) const {
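+ // Associative-embedding decoding: collect peaks per joint, group them into poses by tag
+ // similarity, then refine keypoint positions using the absolute-valued heatmaps.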
+ std::vector<std::vector<Peak>> allPeaks(numJoints);
+ for (int i = 0; i < numJoints; i++) {
+ findPeaks(nmsHeatMaps, aembdsMaps, allPeaks, i, maxNumPeople, detectionThreshold);
+ }
+ std::vector<Pose> allPoses = matchByTag(allPeaks, maxNumPeople, numJoints, tagThreshold);
+ // swap for all poses
+ for (auto& pose : allPoses) {
+ for (size_t j = 0; j < numJoints; j++) {
+ Peak& peak = pose.getPeak(j);
+ std::swap(peak.keypoint.x, peak.keypoint.y);
+ }
+ }
+ std::vector<HumanPose> poses;
+ for (size_t i = 0; i < allPoses.size(); i++) {
+ Pose& pose = allPoses[i];
+ // Filtering poses with low mean scores
+ if (pose.getMeanScore() <= confidenceThreshold) {
+ continue;
+ }
+ for (size_t j = 0; j < heatMaps.size(); j++) {
+ heatMaps[j] = cv::abs(heatMaps[j]);
+ }
+ adjustAndRefine(allPoses, heatMaps, aembdsMaps, i, delta);
+ std::vector<cv::Point2f> keypoints;
+ for (size_t j = 0; j < numJoints; j++) {
+ Peak& peak = pose.getPeak(j);
+ keypoints.push_back(peak.keypoint);
+ }
+ poses.push_back({keypoints, pose.getMeanScore()});
+ }
+ return poses;
+}
diff --git a/python/openvino/runtime/common/models/src/hpe_model_openpose.cpp b/python/openvino/runtime/common/models/src/hpe_model_openpose.cpp
new file mode 100644
index 0000000..d8b4cb6
--- /dev/null
+++ b/python/openvino/runtime/common/models/src/hpe_model_openpose.cpp
@@ -0,0 +1,256 @@
+/*
+// Copyright (C) 2020-2022 Intel Corporation
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+*/
+
+#include "models/hpe_model_openpose.h"
+
+#include <algorithm>
+#include <cmath>
+#include <map>
+#include <stdexcept>
+#include <string>
+#include <utility>
+#include <vector>
+
+#include <opencv2/imgproc.hpp>
+#include <openvino/openvino.hpp>
+
+#include <utils/image_utils.h>
+#include <utils/ocv_common.hpp>
+#include <utils/slog.hpp>
+
+#include "models/input_data.h"
+#include "models/internal_model_data.h"
+#include "models/openpose_decoder.h"
+#include "models/results.h"
+
+const cv::Vec3f HPEOpenPose::meanPixel = cv::Vec3f::all(128);
+const float HPEOpenPose::minPeaksDistance = 3.0f;
+const float HPEOpenPose::midPointsScoreThreshold = 0.05f;
+const float HPEOpenPose::foundMidPointsRatioThreshold = 0.8f;
+const float HPEOpenPose::minSubsetScore = 0.2f;
+
+HPEOpenPose::HPEOpenPose(const std::string& modelFileName,
+ double aspectRatio,
+ int targetSize,
+ float confidenceThreshold,
+ const std::string& layout)
+ : ImageModel(modelFileName, false, layout),
+ aspectRatio(aspectRatio),
+ targetSize(targetSize),
+ confidenceThreshold(confidenceThreshold) {
+ resizeMode = RESIZE_KEEP_ASPECT;
+ interpolationMode = cv::INTER_CUBIC;
+ }
+
+void HPEOpenPose::prepareInputsOutputs(std::shared_ptr<ov::Model>& model) {
+ // --------------------------- Configure input & output -------------------------------------------------
+ // --------------------------- Prepare input ------------------------------------------------------
+ if (model->inputs().size() != 1) {
+ throw std::logic_error("HPE OpenPose model wrapper supports topologies with only 1 input");
+ }
+ inputsNames.push_back(model->input().get_any_name());
+ const ov::Shape& inputShape = model->input().get_shape();
+ const ov::Layout& inputLayout = getInputLayout(model->input());
+
+ if (inputShape.size() != 4 || inputShape[ov::layout::batch_idx(inputLayout)] != 1 ||
+ inputShape[ov::layout::channels_idx(inputLayout)] != 3)
+ throw std::logic_error("3-channel 4-dimensional model's input is expected");
+
+ ov::preprocess::PrePostProcessor ppp(model);
+ ppp.input().tensor().set_element_type(ov::element::u8).set_layout({"NHWC"});
+
+ ppp.input().model().set_layout(inputLayout);
+
+ // --------------------------- Prepare output -----------------------------------------------------
+ const ov::OutputVector& outputs = model->outputs();
+ if (outputs.size() != 2) {
+ throw std::runtime_error("HPE OpenPose supports topologies with only 2 outputs");
+ }
+
+ const ov::Layout outputLayout("NCHW");
+ for (const auto& output : model->outputs()) {
+ const auto& outTensorName = output.get_any_name();
+ ppp.output(outTensorName).tensor().set_element_type(ov::element::f32).set_layout(outputLayout);
+ outputsNames.push_back(outTensorName);
+ }
+ model = ppp.build();
+
+ const size_t batchId = ov::layout::batch_idx(outputLayout);
+ const size_t channelsId = ov::layout::channels_idx(outputLayout);
+ const size_t widthId = ov::layout::width_idx(outputLayout);
+ const size_t heightId = ov::layout::height_idx(outputLayout);
+
+ ov::Shape heatmapsOutputShape = model->outputs().front().get_shape();
+ ov::Shape pafsOutputShape = model->outputs().back().get_shape();
+ if (heatmapsOutputShape[channelsId] > pafsOutputShape[channelsId]) {
+ std::swap(heatmapsOutputShape, pafsOutputShape);
+ std::swap(outputsNames[0], outputsNames[1]);
+ }
+
+ if (heatmapsOutputShape.size() != 4 || heatmapsOutputShape[batchId] != 1 ||
+ heatmapsOutputShape[ov::layout::channels_idx(outputLayout)] != keypointsNumber + 1) {
+ throw std::logic_error("1x" + std::to_string(keypointsNumber + 1) +
+ "xHFMxWFM dimension of model's heatmap is expected");
+ }
+ if (pafsOutputShape.size() != 4 || pafsOutputShape[batchId] != 1 ||
+ pafsOutputShape[channelsId] != 2 * (keypointsNumber + 1)) {
+ throw std::logic_error("1x" + std::to_string(2 * (keypointsNumber + 1)) +
+ "xHFMxWFM dimension of model's output is expected");
+ }
+ if (pafsOutputShape[heightId] != heatmapsOutputShape[heightId] ||
+ pafsOutputShape[widthId] != heatmapsOutputShape[widthId]) {
+ throw std::logic_error("output and heatmap are expected to have matching last two dimensions");
+ }
+
+ changeInputSize(model);
+}
+
+void HPEOpenPose::changeInputSize(std::shared_ptr<ov::Model>& model) {
+ ov::Shape inputShape = model->input().get_shape();
+ const ov::Layout& layout = ov::layout::get_layout(model->inputs().front());
+ const auto batchId = ov::layout::batch_idx(layout);
+ const auto heightId = ov::layout::height_idx(layout);
+ const auto widthId = ov::layout::width_idx(layout);
+
+ if (!targetSize) {
+ targetSize = inputShape[heightId];
+ }
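+ // Round the target height up to a stride multiple and derive the width from the aspect
+ // ratio, rounded up the same way.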
+ int height = static_cast<int>((targetSize + stride - 1) / stride) * stride;
+ int inputWidth = static_cast<int>(std::round(targetSize * aspectRatio));
+ int width = static_cast<int>((inputWidth + stride - 1) / stride) * stride;
+ inputShape[batchId] = 1;
+ inputShape[heightId] = height;
+ inputShape[widthId] = width;
+ inputLayerSize = cv::Size(width, height);
+ model->reshape(inputShape);
+}
+
+std::shared_ptr<InternalModelData> HPEOpenPose::preprocess(const InputData& inputData, ov::InferRequest& request) {
+ auto& image = inputData.asRef<ImageInputData>().inputImage;
+ cv::Rect roi;
+ auto paddedImage =
+ resizeImageExt(image, inputLayerSize.width, inputLayerSize.height, resizeMode, interpolationMode, &roi);
+ if (inputLayerSize.width < roi.width)
+ throw std::runtime_error("The image aspect ratio doesn't fit current model shape");
+
+ if (inputLayerSize.width - stride >= roi.width) {
+ slog::warn << "\tChosen model aspect ratio doesn't match image aspect ratio" << slog::endl;
+ }
+
+ request.set_input_tensor(wrapMat2Tensor(paddedImage));
+ return std::make_shared<InternalScaleData>(paddedImage.cols,
+ paddedImage.rows,
+ image.cols / static_cast<float>(roi.width),
+ image.rows / static_cast<float>(roi.height));
+}
+
+std::unique_ptr<ResultBase> HPEOpenPose::postprocess(InferenceResult& infResult) {
+ HumanPoseResult* result = new HumanPoseResult(infResult.frameId, infResult.metaData);
+
+ const auto& heatMapsMapped = infResult.outputsData[outputsNames[0]];
+ const auto& outputMapped = infResult.outputsData[outputsNames[1]];
+
+ const ov::Shape& outputShape = outputMapped.get_shape();
+ const ov::Shape& heatMapShape = heatMapsMapped.get_shape();
+
+ float* const predictions = outputMapped.data<float>();
+ float* const heats = heatMapsMapped.data<float>();
+
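+ // Wrap the flat output buffers as per-channel cv::Mat views (keypoint heatmaps and PAFs)
+ // without copying, then upsample them before decoding.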
+ std::vector<cv::Mat> heatMaps(keypointsNumber);
+ for (size_t i = 0; i < heatMaps.size(); i++) {
+ heatMaps[i] =
+ cv::Mat(heatMapShape[2], heatMapShape[3], CV_32FC1, heats + i * heatMapShape[2] * heatMapShape[3]);
+ }
+ resizeFeatureMaps(heatMaps);
+
+ std::vector<cv::Mat> pafs(outputShape[1]);
+ for (size_t i = 0; i < pafs.size(); i++) {
+ pafs[i] =
+ cv::Mat(heatMapShape[2], heatMapShape[3], CV_32FC1, predictions + i * heatMapShape[2] * heatMapShape[3]);
+ }
+ resizeFeatureMaps(pafs);
+
+ std::vector<HumanPose> poses = extractPoses(heatMaps, pafs);
+
+ const auto& scale = infResult.internalModelData->asRef<InternalScaleData>();
+ float scaleX = stride / upsampleRatio * scale.scaleX;
+ float scaleY = stride / upsampleRatio * scale.scaleY;
+ for (auto& pose : poses) {
+ for (auto& keypoint : pose.keypoints) {
+ if (keypoint != cv::Point2f(-1, -1)) {
+ keypoint.x *= scaleX;
+ keypoint.y *= scaleY;
+ }
+ }
+ }
+ for (size_t i = 0; i < poses.size(); ++i) {
+ result->poses.push_back(poses[i]);
+ }
+
+ return std::unique_ptr<ResultBase>(result);
+}
+
+void HPEOpenPose::resizeFeatureMaps(std::vector<cv::Mat>& featureMaps) const {
+ for (auto& featureMap : featureMaps) {
+ cv::resize(featureMap, featureMap, cv::Size(), upsampleRatio, upsampleRatio, cv::INTER_CUBIC);
+ }
+}
+
+class FindPeaksBody : public cv::ParallelLoopBody {
+public:
+ FindPeaksBody(const std::vector<cv::Mat>& heatMaps,
+ float minPeaksDistance,
+ std::vector<std::vector<Peak>>& peaksFromHeatMap,
+ float confidenceThreshold)
+ : heatMaps(heatMaps),
+ minPeaksDistance(minPeaksDistance),
+ peaksFromHeatMap(peaksFromHeatMap),
+ confidenceThreshold(confidenceThreshold) {}
+
+ void operator()(const cv::Range& range) const override {
+ for (int i = range.start; i < range.end; i++) {
+ findPeaks(heatMaps, minPeaksDistance, peaksFromHeatMap, i, confidenceThreshold);
+ }
+ }
+
+private:
+ const std::vector<cv::Mat>& heatMaps;
+ float minPeaksDistance;
+ std::vector<std::vector<Peak>>& peaksFromHeatMap;
+ float confidenceThreshold;
+};
+
+std::vector<HumanPose> HPEOpenPose::extractPoses(const std::vector<cv::Mat>& heatMaps,
+ const std::vector<cv::Mat>& pafs) const {
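+ // Peaks are searched per heatmap in parallel; their ids are then made globally unique so
+ // the grouping step can index candidates across all joints.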
+ std::vector<std::vector<Peak>> peaksFromHeatMap(heatMaps.size());
+ FindPeaksBody findPeaksBody(heatMaps, minPeaksDistance, peaksFromHeatMap, confidenceThreshold);
+ cv::parallel_for_(cv::Range(0, static_cast<int>(heatMaps.size())), findPeaksBody);
+ int peaksBefore = 0;
+ for (size_t heatmapId = 1; heatmapId < heatMaps.size(); heatmapId++) {
+ peaksBefore += static_cast<int>(peaksFromHeatMap[heatmapId - 1].size());
+ for (auto& peak : peaksFromHeatMap[heatmapId]) {
+ peak.id += peaksBefore;
+ }
+ }
+ std::vector<HumanPose> poses = groupPeaksToPoses(peaksFromHeatMap,
+ pafs,
+ keypointsNumber,
+ midPointsScoreThreshold,
+ foundMidPointsRatioThreshold,
+ minJointsNumber,
+ minSubsetScore);
+ return poses;
+}
diff --git a/python/openvino/runtime/common/models/src/image_model.cpp b/python/openvino/runtime/common/models/src/image_model.cpp
new file mode 100644
index 0000000..511faf3
--- /dev/null
+++ b/python/openvino/runtime/common/models/src/image_model.cpp
@@ -0,0 +1,57 @@
+/*
+// Copyright (C) 2021-2022 Intel Corporation
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+*/
+
+#include "models/image_model.h"
+
+#include <stdexcept>
+#include <vector>
+
+#include <opencv2/core.hpp>
+#include <openvino/openvino.hpp>
+
+#include <utils/image_utils.h>
+#include <utils/ocv_common.hpp>
+
+#include "models/input_data.h"
+#include "models/internal_model_data.h"
+
+ImageModel::ImageModel(const std::string& modelFileName, bool useAutoResize, const std::string& layout)
+ : ModelBase(modelFileName, layout),
+ useAutoResize(useAutoResize) {}
+
+std::shared_ptr<InternalModelData> ImageModel::preprocess(const InputData& inputData, ov::InferRequest& request) {
+ const auto& origImg = inputData.asRef<ImageInputData>().inputImage;
+ auto img = inputTransform(origImg);
+
+ if (!useAutoResize) {
+ /* Resize and copy data from the image to the input tensor */
+ const ov::Tensor& frameTensor = request.get_tensor(inputsNames[0]); // first input should be image
+ const ov::Shape& tensorShape = frameTensor.get_shape();
+ const ov::Layout layout("NHWC");
+ const size_t width = tensorShape[ov::layout::width_idx(layout)];
+ const size_t height = tensorShape[ov::layout::height_idx(layout)];
+ const size_t channels = tensorShape[ov::layout::channels_idx(layout)];
+ if (static_cast<size_t>(img.channels()) != channels) {
+ throw std::runtime_error("The number of channels for model input and image must match");
+ }
+ if (channels != 1 && channels != 3) {
+ throw std::runtime_error("Unsupported number of channels");
+ }
+ img = resizeImageExt(img, width, height, resizeMode, interpolationMode);
+ }
+ request.set_tensor(inputsNames[0], wrapMat2Tensor(img));
+ return std::make_shared<InternalImageModelData>(origImg.cols, origImg.rows);
+}
diff --git a/python/openvino/runtime/common/models/src/jpeg_restoration_model.cpp b/python/openvino/runtime/common/models/src/jpeg_restoration_model.cpp
new file mode 100644
index 0000000..8eb3ae1
--- /dev/null
+++ b/python/openvino/runtime/common/models/src/jpeg_restoration_model.cpp
@@ -0,0 +1,167 @@
+/*
+// Copyright (C) 2021-2022 Intel Corporation
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+*/
+
+#include <stddef.h>
+
+#include <algorithm>
+#include <memory>
+#include <stdexcept>
+#include <string>
+#include <vector>
+
+#include <opencv2/core.hpp>
+#include <opencv2/imgcodecs.hpp>
+#include <opencv2/imgproc.hpp>
+#include <openvino/openvino.hpp>
+
+#include <utils/ocv_common.hpp>
+#include <utils/slog.hpp>
+
+#include "models/image_model.h"
+#include "models/input_data.h"
+#include "models/internal_model_data.h"
+#include "models/jpeg_restoration_model.h"
+#include "models/results.h"
+
+JPEGRestorationModel::JPEGRestorationModel(const std::string& modelFileName,
+ const cv::Size& inputImgSize,
+ bool _jpegCompression,
+ const std::string& layout)
+ : ImageModel(modelFileName, false, layout) {
+ netInputHeight = inputImgSize.height;
+ netInputWidth = inputImgSize.width;
+ jpegCompression = _jpegCompression;
+}
+
+void JPEGRestorationModel::prepareInputsOutputs(std::shared_ptr<ov::Model>& model) {
+ // --------------------------- Configure input & output -------------------------------------------------
+ // --------------------------- Prepare input ------------------------------------------------------
+ if (model->inputs().size() != 1) {
+ throw std::logic_error("The JPEG Restoration model wrapper supports topologies with only 1 input");
+ }
+ inputsNames.push_back(model->input().get_any_name());
+
+ const ov::Shape& inputShape = model->input().get_shape();
+ const ov::Layout& inputLayout = getInputLayout(model->input());
+
+ if (inputShape.size() != 4 || inputShape[ov::layout::batch_idx(inputLayout)] != 1 ||
+ inputShape[ov::layout::channels_idx(inputLayout)] != 3) {
+ throw std::logic_error("3-channel 4-dimensional model's input is expected");
+ }
+
+ ov::preprocess::PrePostProcessor ppp(model);
+ ppp.input().tensor().set_element_type(ov::element::u8).set_layout("NHWC");
+
+ ppp.input().model().set_layout(inputLayout);
+
+ // --------------------------- Prepare output -----------------------------------------------------
+ const ov::OutputVector& outputs = model->outputs();
+ if (outputs.size() != 1) {
+ throw std::logic_error("The JPEG Restoration model wrapper supports topologies with only 1 output");
+ }
+ const ov::Shape& outputShape = model->output().get_shape();
+ const ov::Layout outputLayout{"NCHW"};
+ if (outputShape.size() != 4 || outputShape[ov::layout::batch_idx(outputLayout)] != 1 ||
+ outputShape[ov::layout::channels_idx(outputLayout)] != 3) {
+ throw std::logic_error("3-channel 4-dimensional model's output is expected");
+ }
+
+ outputsNames.push_back(model->output().get_any_name());
+ ppp.output().tensor().set_element_type(ov::element::f32);
+ model = ppp.build();
+
+ changeInputSize(model);
+}
+
+void JPEGRestorationModel::changeInputSize(std::shared_ptr<ov::Model>& model) {
+ ov::Shape inputShape = model->input().get_shape();
+ const ov::Layout& layout = ov::layout::get_layout(model->input());
+
+ const auto batchId = ov::layout::batch_idx(layout);
+ const auto heightId = ov::layout::height_idx(layout);
+ const auto widthId = ov::layout::width_idx(layout);
+
+ if (inputShape[heightId] % stride || inputShape[widthId] % stride) {
+ throw std::logic_error("The shape of the model input must be divisible by stride");
+ }
+
+ netInputHeight = static_cast<int>((netInputHeight + stride - 1) / stride) * stride;
+ netInputWidth = static_cast<int>((netInputWidth + stride - 1) / stride) * stride;
+
+ inputShape[batchId] = 1;
+ inputShape[heightId] = netInputHeight;
+ inputShape[widthId] = netInputWidth;
+
+ model->reshape(inputShape);
+}
+
+std::shared_ptr<InternalModelData> JPEGRestorationModel::preprocess(const InputData& inputData,
+ ov::InferRequest& request) {
+ cv::Mat image = inputData.asRef<ImageInputData>().inputImage;
+ const size_t h = image.rows;
+ const size_t w = image.cols;
+ cv::Mat resizedImage;
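+ // Optionally re-encode the frame as a low-quality JPEG so the network receives an input
+ // with real compression artifacts.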
+ if (jpegCompression) {
+ std::vector<uchar> encimg;
+ std::vector<int> params{cv::IMWRITE_JPEG_QUALITY, 40};
+ cv::imencode(".jpg", image, encimg, params);
+ image = cv::imdecode(cv::Mat(encimg), 3);
+ }
+
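+ // If the frame already fits the network input (within one stride), pad it with a constant
+ // border; otherwise fall back to a plain resize and warn about the aspect-ratio mismatch.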
+ if (netInputHeight - stride < h && h <= netInputHeight && netInputWidth - stride < w && w <= netInputWidth) {
+ int bottom = netInputHeight - h;
+ int right = netInputWidth - w;
+ cv::copyMakeBorder(image, resizedImage, 0, bottom, 0, right, cv::BORDER_CONSTANT, 0);
+ } else {
+ slog::warn << "\tChosen model aspect ratio doesn't match image aspect ratio" << slog::endl;
+ cv::resize(image, resizedImage, cv::Size(netInputWidth, netInputHeight));
+ }
+ request.set_input_tensor(wrapMat2Tensor(resizedImage));
+
+ return std::make_shared<InternalImageModelData>(image.cols, image.rows);
+}
+
+std::unique_ptr<ResultBase> JPEGRestorationModel::postprocess(InferenceResult& infResult) {
+ ImageResult* result = new ImageResult;
+ *static_cast<ResultBase*>(result) = static_cast<ResultBase&>(infResult);
+
+ const auto& inputImgSize = infResult.internalModelData->asRef<InternalImageModelData>();
+ const auto outputData = infResult.getFirstOutputTensor().data<float>();
+
+ std::vector<cv::Mat> imgPlanes;
+ const ov::Shape& outputShape = infResult.getFirstOutputTensor().get_shape();
+ const size_t outHeight = static_cast<size_t>(outputShape[2]);
+ const size_t outWidth = static_cast<size_t>(outputShape[3]);
+ const size_t numOfPixels = outWidth * outHeight;
+ imgPlanes = std::vector<cv::Mat>{cv::Mat(outHeight, outWidth, CV_32FC1, &(outputData[0])),
+ cv::Mat(outHeight, outWidth, CV_32FC1, &(outputData[numOfPixels])),
+ cv::Mat(outHeight, outWidth, CV_32FC1, &(outputData[numOfPixels * 2]))};
+ cv::Mat resultImg;
+ cv::merge(imgPlanes, resultImg);
+
+ if (netInputHeight - stride < static_cast<size_t>(inputImgSize.inputImgHeight) &&
+ static_cast<size_t>(inputImgSize.inputImgHeight) <= netInputHeight &&
+ netInputWidth - stride < static_cast<size_t>(inputImgSize.inputImgWidth) &&
+ static_cast<size_t>(inputImgSize.inputImgWidth) <= netInputWidth) {
+ result->resultImage = resultImg(cv::Rect(0, 0, inputImgSize.inputImgWidth, inputImgSize.inputImgHeight));
+ } else {
+ cv::resize(resultImg, result->resultImage, cv::Size(inputImgSize.inputImgWidth, inputImgSize.inputImgHeight));
+ }
+
+ result->resultImage.convertTo(result->resultImage, CV_8UC3, 255);
+
+ return std::unique_ptr<ResultBase>(result);
+}
diff --git a/python/openvino/runtime/common/models/src/model_base.cpp b/python/openvino/runtime/common/models/src/model_base.cpp
new file mode 100644
index 0000000..c2ebd1b
--- /dev/null
+++ b/python/openvino/runtime/common/models/src/model_base.cpp
@@ -0,0 +1,67 @@
+/*
+// Copyright (C) 2021-2022 Intel Corporation
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+*/
+
+#include "models/model_base.h"
+
+#include <utility>
+
+#include <openvino/openvino.hpp>
+
+#include <utils/common.hpp>
+#include <utils/config_factory.h>
+#include <utils/ocv_common.hpp>
+#include <utils/slog.hpp>
+
+std::shared_ptr<ov::Model> ModelBase::prepareModel(ov::Core& core) {
+ // --------------------------- Read IR Generated by ModelOptimizer (.xml and .bin files) ------------
+ /** Read model **/
+ slog::info << "Reading model " << modelFileName << slog::endl;
+ std::shared_ptr<ov::Model> model = core.read_model(modelFileName);
+ logBasicModelInfo(model);
+ // -------------------------- Reading all outputs names and customizing I/O tensors (in inherited classes)
+ prepareInputsOutputs(model);
+
+ /** Set batch size to 1 **/
+ ov::set_batch(model, 1);
+
+ return model;
+}
+
+ov::CompiledModel ModelBase::compileModel(const ModelConfig& config, ov::Core& core) {
+ this->config = config;
+ auto model = prepareModel(core);
+ compiledModel = core.compile_model(model, config.deviceName, config.compiledModelConfig);
+ logCompiledModelInfo(compiledModel, modelFileName, config.deviceName);
+ return compiledModel;
+}
+
+ov::Layout ModelBase::getInputLayout(const ov::Output<ov::Node>& input) {
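+ // Prefer the layout stored in the model itself; if it is missing, use the user-provided
+ // layouts, or guess the layout from the input shape when none were given.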
+ const ov::Shape& inputShape = input.get_shape();
+ ov::Layout layout = ov::layout::get_layout(input);
+ if (layout.empty()) {
+ if (inputsLayouts.empty()) {
+ layout = getLayoutFromShape(inputShape);
+ slog::warn << "Automatically detected layout '" << layout.to_string() << "' for input '"
+ << input.get_any_name() << "' will be used." << slog::endl;
+ } else if (inputsLayouts.size() == 1) {
+ layout = inputsLayouts.begin()->second;
+ } else {
+ layout = inputsLayouts[input.get_any_name()];
+ }
+ }
+
+ return layout;
+}
diff --git a/python/openvino/runtime/common/models/src/openpose_decoder.cpp b/python/openvino/runtime/common/models/src/openpose_decoder.cpp
new file mode 100644
index 0000000..6d51607
--- /dev/null
+++ b/python/openvino/runtime/common/models/src/openpose_decoder.cpp
@@ -0,0 +1,345 @@
+/*
+// Copyright (C) 2020-2022 Intel Corporation
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+*/
+
+#include "models/openpose_decoder.h"
+
+#include <algorithm>
+#include <cmath>
+#include <memory>
+#include <utility>
+#include <vector>
+
+#include <utils/common.hpp>
+
+#include "models/results.h"
+
+Peak::Peak(const int id, const cv::Point2f& pos, const float score) : id(id), pos(pos), score(score) {}
+
+HumanPoseByPeaksIndices::HumanPoseByPeaksIndices(const int keypointsNumber)
+ : peaksIndices(std::vector<int>(keypointsNumber, -1)),
+ nJoints(0),
+ score(0.0f) {}
+
+TwoJointsConnection::TwoJointsConnection(const int firstJointIdx, const int secondJointIdx, const float score)
+ : firstJointIdx(firstJointIdx),
+ secondJointIdx(secondJointIdx),
+ score(score) {}
+
+void findPeaks(const std::vector<cv::Mat>& heatMaps,
+ const float minPeaksDistance,
+ std::vector<std::vector<Peak>>& allPeaks,
+ int heatMapId,
+ float confidenceThreshold) {
+ std::vector<cv::Point> peaks;
+ const cv::Mat& heatMap = heatMaps[heatMapId];
+ const float* heatMapData = heatMap.ptr<float>();
+ size_t heatMapStep = heatMap.step1();
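+ // Scan the heatmap for local maxima: a pixel is a candidate peak if it passes the confidence
+ // threshold and is strictly greater than its four direct neighbours; nearby candidates are
+ // suppressed afterwards using minPeaksDistance.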
+ for (int y = -1; y < heatMap.rows + 1; y++) {
+ for (int x = -1; x < heatMap.cols + 1; x++) {
+ float val = 0;
+ if (x >= 0 && y >= 0 && x < heatMap.cols && y < heatMap.rows) {
+ val = heatMapData[y * heatMapStep + x];
+ val = val >= confidenceThreshold ? val : 0;
+ }
+
+ float left_val = 0;
+ if (y >= 0 && x < (heatMap.cols - 1) && y < heatMap.rows) {
+ left_val = heatMapData[y * heatMapStep + x + 1];
+ left_val = left_val >= confidenceThreshold ? left_val : 0;
+ }
+
+ float right_val = 0;
+ if (x > 0 && y >= 0 && y < heatMap.rows) {
+ right_val = heatMapData[y * heatMapStep + x - 1];
+ right_val = right_val >= confidenceThreshold ? right_val : 0;
+ }
+
+ float top_val = 0;
+ if (x >= 0 && x < heatMap.cols && y < (heatMap.rows - 1)) {
+ top_val = heatMapData[(y + 1) * heatMapStep + x];
+ top_val = top_val >= confidenceThreshold ? top_val : 0;
+ }
+
+ float bottom_val = 0;
+ if (x >= 0 && y > 0 && x < heatMap.cols) {
+ bottom_val = heatMapData[(y - 1) * heatMapStep + x];
+ bottom_val = bottom_val >= confidenceThreshold ? bottom_val : 0;
+ }
+
+ if ((val > left_val) && (val > right_val) && (val > top_val) && (val > bottom_val)) {
+ peaks.push_back(cv::Point(x, y));
+ }
+ }
+ }
+ std::sort(peaks.begin(), peaks.end(), [](const cv::Point& a, const cv::Point& b) {
+ return a.x < b.x;
+ });
+ std::vector<bool> isActualPeak(peaks.size(), true);
+ int peakCounter = 0;
+ std::vector<Peak>& peaksWithScoreAndID = allPeaks[heatMapId];
+ for (size_t i = 0; i < peaks.size(); i++) {
+ if (isActualPeak[i]) {
+ for (size_t j = i + 1; j < peaks.size(); j++) {
+ if (sqrt((peaks[i].x - peaks[j].x) * (peaks[i].x - peaks[j].x) +
+ (peaks[i].y - peaks[j].y) * (peaks[i].y - peaks[j].y)) < minPeaksDistance) {
+ isActualPeak[j] = false;
+ }
+ }
+ peaksWithScoreAndID.push_back(Peak(peakCounter++, peaks[i], heatMap.at<float>(peaks[i])));
+ }
+ }
+}
+
+std::vector<HumanPose> groupPeaksToPoses(const std::vector<std::vector<Peak>>& allPeaks,
+ const std::vector<cv::Mat>& pafs,
+ const size_t keypointsNumber,
+ const float midPointsScoreThreshold,
+ const float foundMidPointsRatioThreshold,
+ const int minJointsNumber,
+ const float minSubsetScore) {
+ static const std::pair<int, int> limbIdsHeatmap[] = {{2, 3},
+ {2, 6},
+ {3, 4},
+ {4, 5},
+ {6, 7},
+ {7, 8},
+ {2, 9},
+ {9, 10},
+ {10, 11},
+ {2, 12},
+ {12, 13},
+ {13, 14},
+ {2, 1},
+ {1, 15},
+ {15, 17},
+ {1, 16},
+ {16, 18},
+ {3, 17},
+ {6, 18}};
+ static const std::pair<int, int> limbIdsPaf[] = {{31, 32},
+ {39, 40},
+ {33, 34},
+ {35, 36},
+ {41, 42},
+ {43, 44},
+ {19, 20},
+ {21, 22},
+ {23, 24},
+ {25, 26},
+ {27, 28},
+ {29, 30},
+ {47, 48},
+ {49, 50},
+ {53, 54},
+ {51, 52},
+ {55, 56},
+ {37, 38},
+ {45, 46}};
+
+ std::vector<Peak> candidates;
+ for (const auto& peaks : allPeaks) {
+ candidates.insert(candidates.end(), peaks.begin(), peaks.end());
+ }
+ std::vector<HumanPoseByPeaksIndices> subset(0, HumanPoseByPeaksIndices(keypointsNumber));
+ for (size_t k = 0; k < arraySize(limbIdsPaf); k++) {
+ std::vector<TwoJointsConnection> connections;
+ const int mapIdxOffset = keypointsNumber + 1;
+ std::pair<cv::Mat, cv::Mat> scoreMid = {pafs[limbIdsPaf[k].first - mapIdxOffset],
+ pafs[limbIdsPaf[k].second - mapIdxOffset]};
+ const int idxJointA = limbIdsHeatmap[k].first - 1;
+ const int idxJointB = limbIdsHeatmap[k].second - 1;
+ const std::vector<Peak>& candA = allPeaks[idxJointA];
+ const std::vector<Peak>& candB = allPeaks[idxJointB];
+ const size_t nJointsA = candA.size();
+ const size_t nJointsB = candB.size();
+ if (nJointsA == 0 && nJointsB == 0) {
+ continue;
+ } else if (nJointsA == 0) {
+ for (size_t i = 0; i < nJointsB; i++) {
+ int num = 0;
+ for (size_t j = 0; j < subset.size(); j++) {
+ if (subset[j].peaksIndices[idxJointB] == candB[i].id) {
+ num++;
+ continue;
+ }
+ }
+ if (num == 0) {
+ HumanPoseByPeaksIndices personKeypoints(keypointsNumber);
+ personKeypoints.peaksIndices[idxJointB] = candB[i].id;
+ personKeypoints.nJoints = 1;
+ personKeypoints.score = candB[i].score;
+ subset.push_back(personKeypoints);
+ }
+ }
+ continue;
+ } else if (nJointsB == 0) {
+ for (size_t i = 0; i < nJointsA; i++) {
+ int num = 0;
+ for (size_t j = 0; j < subset.size(); j++) {
+ if (subset[j].peaksIndices[idxJointA] == candA[i].id) {
+ num++;
+ continue;
+ }
+ }
+ if (num == 0) {
+ HumanPoseByPeaksIndices personKeypoints(keypointsNumber);
+ personKeypoints.peaksIndices[idxJointA] = candA[i].id;
+ personKeypoints.nJoints = 1;
+ personKeypoints.score = candA[i].score;
+ subset.push_back(personKeypoints);
+ }
+ }
+ continue;
+ }
+
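+ // Score every candidate A-B pair by sampling the part affinity field at equally spaced
+ // points along the segment between the two peaks; a pair is kept only if enough of those
+ // midpoints are well aligned with the limb direction.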
+ std::vector<TwoJointsConnection> tempJointConnections;
+ for (size_t i = 0; i < nJointsA; i++) {
+ for (size_t j = 0; j < nJointsB; j++) {
+ cv::Point2f pt = candA[i].pos * 0.5 + candB[j].pos * 0.5;
+ cv::Point mid = cv::Point(cvRound(pt.x), cvRound(pt.y));
+ cv::Point2f vec = candB[j].pos - candA[i].pos;
+ double norm_vec = cv::norm(vec);
+ if (norm_vec == 0) {
+ continue;
+ }
+ vec /= norm_vec;
+ float score = vec.x * scoreMid.first.at<float>(mid) + vec.y * scoreMid.second.at<float>(mid);
+ int height_n = pafs[0].rows / 2;
+ float suc_ratio = 0.0f;
+ float mid_score = 0.0f;
+ const int mid_num = 10;
+ const float scoreThreshold = -100.0f;
+ if (score > scoreThreshold) {
+ float p_sum = 0;
+ int p_count = 0;
+ cv::Size2f step((candB[j].pos.x - candA[i].pos.x) / (mid_num - 1),
+ (candB[j].pos.y - candA[i].pos.y) / (mid_num - 1));
+ for (int n = 0; n < mid_num; n++) {
+ cv::Point midPoint(cvRound(candA[i].pos.x + n * step.width),
+ cvRound(candA[i].pos.y + n * step.height));
+ cv::Point2f pred(scoreMid.first.at<float>(midPoint), scoreMid.second.at<float>(midPoint));
+ score = vec.x * pred.x + vec.y * pred.y;
+ if (score > midPointsScoreThreshold) {
+ p_sum += score;
+ p_count++;
+ }
+ }
+ suc_ratio = static_cast<float>(p_count / mid_num);
+ float ratio = p_count > 0 ? p_sum / p_count : 0.0f;
+ mid_score = ratio + static_cast<float>(std::min(height_n / norm_vec - 1, 0.0));
+ }
+ if (mid_score > 0 && suc_ratio > foundMidPointsRatioThreshold) {
+ tempJointConnections.push_back(TwoJointsConnection(i, j, mid_score));
+ }
+ }
+ }
+ if (!tempJointConnections.empty()) {
+ std::sort(tempJointConnections.begin(),
+ tempJointConnections.end(),
+ [](const TwoJointsConnection& a, const TwoJointsConnection& b) {
+ return (a.score > b.score);
+ });
+ }
+ size_t num_limbs = std::min(nJointsA, nJointsB);
+ size_t cnt = 0;
+ std::vector<int> occurA(nJointsA, 0);
+ std::vector<int> occurB(nJointsB, 0);
+ for (size_t row = 0; row < tempJointConnections.size(); row++) {
+ if (cnt == num_limbs) {
+ break;
+ }
+ const int& indexA = tempJointConnections[row].firstJointIdx;
+ const int& indexB = tempJointConnections[row].secondJointIdx;
+ const float& score = tempJointConnections[row].score;
+ if (occurA[indexA] == 0 && occurB[indexB] == 0) {
+ connections.push_back(TwoJointsConnection(candA[indexA].id, candB[indexB].id, score));
+ cnt++;
+ occurA[indexA] = 1;
+ occurB[indexB] = 1;
+ }
+ }
+ if (connections.empty()) {
+ continue;
+ }
+
+ bool extraJointConnections = (k == 17 || k == 18);
+ if (k == 0) {
+ subset = std::vector<HumanPoseByPeaksIndices>(connections.size(), HumanPoseByPeaksIndices(keypointsNumber));
+ for (size_t i = 0; i < connections.size(); i++) {
+ const int& indexA = connections[i].firstJointIdx;
+ const int& indexB = connections[i].secondJointIdx;
+ subset[i].peaksIndices[idxJointA] = indexA;
+ subset[i].peaksIndices[idxJointB] = indexB;
+ subset[i].nJoints = 2;
+ subset[i].score = candidates[indexA].score + candidates[indexB].score + connections[i].score;
+ }
+ } else if (extraJointConnections) {
+ for (size_t i = 0; i < connections.size(); i++) {
+ const int& indexA = connections[i].firstJointIdx;
+ const int& indexB = connections[i].secondJointIdx;
+ for (size_t j = 0; j < subset.size(); j++) {
+ if (subset[j].peaksIndices[idxJointA] == indexA && subset[j].peaksIndices[idxJointB] == -1) {
+ subset[j].peaksIndices[idxJointB] = indexB;
+ } else if (subset[j].peaksIndices[idxJointB] == indexB && subset[j].peaksIndices[idxJointA] == -1) {
+ subset[j].peaksIndices[idxJointA] = indexA;
+ }
+ }
+ }
+ continue;
+ } else {
+ for (size_t i = 0; i < connections.size(); i++) {
+ const int& indexA = connections[i].firstJointIdx;
+ const int& indexB = connections[i].secondJointIdx;
+ bool num = false;
+ for (size_t j = 0; j < subset.size(); j++) {
+ if (subset[j].peaksIndices[idxJointA] == indexA) {
+ subset[j].peaksIndices[idxJointB] = indexB;
+ subset[j].nJoints++;
+ subset[j].score += candidates[indexB].score + connections[i].score;
+ num = true;
+ }
+ }
+ if (!num) {
+ HumanPoseByPeaksIndices hpWithScore(keypointsNumber);
+ hpWithScore.peaksIndices[idxJointA] = indexA;
+ hpWithScore.peaksIndices[idxJointB] = indexB;
+ hpWithScore.nJoints = 2;
+ hpWithScore.score = candidates[indexA].score + candidates[indexB].score + connections[i].score;
+ subset.push_back(hpWithScore);
+ }
+ }
+ }
+ }
+ std::vector<HumanPose> poses;
+ for (const auto& subsetI : subset) {
+ if (subsetI.nJoints < minJointsNumber || subsetI.score / subsetI.nJoints < minSubsetScore) {
+ continue;
+ }
+ int position = -1;
+ HumanPose pose{std::vector<cv::Point2f>(keypointsNumber, cv::Point2f(-1.0f, -1.0f)),
+ subsetI.score * std::max(0, subsetI.nJoints - 1)};
+ for (const auto& peakIdx : subsetI.peaksIndices) {
+ position++;
+ if (peakIdx >= 0) {
+ pose.keypoints[position] = candidates[peakIdx].pos;
+ pose.keypoints[position].x += 0.5;
+ pose.keypoints[position].y += 0.5;
+ }
+ }
+ poses.push_back(pose);
+ }
+ return poses;
+}
diff --git a/python/openvino/runtime/common/models/src/segmentation_model.cpp b/python/openvino/runtime/common/models/src/segmentation_model.cpp
new file mode 100644
index 0000000..82a153b
--- /dev/null
+++ b/python/openvino/runtime/common/models/src/segmentation_model.cpp
@@ -0,0 +1,157 @@
+/*
+// Copyright (C) 2020-2022 Intel Corporation
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+*/
+
+#include "models/segmentation_model.h"
+
+#include <stddef.h>
+#include <stdint.h>
+
+#include <fstream>
+#include <stdexcept>
+#include <string>
+#include <vector>
+
+#include <opencv2/core.hpp>
+#include <opencv2/imgproc.hpp>
+#include <openvino/openvino.hpp>
+
+#include "models/internal_model_data.h"
+#include "models/results.h"
+
+SegmentationModel::SegmentationModel(const std::string& modelFileName, bool useAutoResize, const std::string& layout)
+ : ImageModel(modelFileName, useAutoResize, layout) {}
+
+std::vector<std::string> SegmentationModel::loadLabels(const std::string& labelFilename) {
+ std::vector<std::string> labelsList;
+
+ /* Read labels (if any) */
+ if (!labelFilename.empty()) {
+ std::ifstream inputFile(labelFilename);
+ if (!inputFile.is_open())
+ throw std::runtime_error("Can't open the labels file: " + labelFilename);
+ std::string label;
+ while (std::getline(inputFile, label)) {
+ labelsList.push_back(label);
+ }
+ if (labelsList.empty())
+ throw std::logic_error("File is empty: " + labelFilename);
+ }
+
+ return labelsList;
+}
+
+void SegmentationModel::prepareInputsOutputs(std::shared_ptr<ov::Model>& model) {
+ // --------------------------- Configure input & output ---------------------------------------------
+ // --------------------------- Prepare input -----------------------------------------------------
+ if (model->inputs().size() != 1) {
+ throw std::logic_error("Segmentation model wrapper supports topologies with only 1 input");
+ }
+ const auto& input = model->input();
+ inputsNames.push_back(input.get_any_name());
+
+ const ov::Layout& inputLayout = getInputLayout(input);
+ const ov::Shape& inputShape = input.get_shape();
+ if (inputShape.size() != 4 || inputShape[ov::layout::channels_idx(inputLayout)] != 3) {
+ throw std::logic_error("3-channel 4-dimensional model's input is expected");
+ }
+
+ ov::preprocess::PrePostProcessor ppp(model);
+ ppp.input().tensor().set_element_type(ov::element::u8).set_layout({"NHWC"});
+
+ if (useAutoResize) {
+ ppp.input().tensor().set_spatial_dynamic_shape();
+
+ ppp.input()
+ .preprocess()
+ .convert_element_type(ov::element::f32)
+ .resize(ov::preprocess::ResizeAlgorithm::RESIZE_LINEAR);
+ }
+
+ ppp.input().model().set_layout(inputLayout);
+ model = ppp.build();
+ // --------------------------- Prepare output -----------------------------------------------------
+ if (model->outputs().size() != 1) {
+ throw std::logic_error("Segmentation model wrapper supports topologies with only 1 output");
+ }
+
+ const auto& output = model->output();
+ outputsNames.push_back(output.get_any_name());
+
+ const ov::Shape& outputShape = output.get_shape();
+ ov::Layout outputLayout("");
+ switch (outputShape.size()) {
+ case 3:
+ outputLayout = "CHW";
+ outChannels = 1;
+ outHeight = static_cast<int>(outputShape[ov::layout::height_idx(outputLayout)]);
+ outWidth = static_cast<int>(outputShape[ov::layout::width_idx(outputLayout)]);
+ break;
+ case 4:
+ outputLayout = "NCHW";
+ outChannels = static_cast<int>(outputShape[ov::layout::channels_idx(outputLayout)]);
+ outHeight = static_cast<int>(outputShape[ov::layout::height_idx(outputLayout)]);
+ outWidth = static_cast<int>(outputShape[ov::layout::width_idx(outputLayout)]);
+ break;
+ default:
+ throw std::logic_error("Unexpected output tensor shape. Only 4D and 3D outputs are supported.");
+ }
+}
+
+std::unique_ptr<ResultBase> SegmentationModel::postprocess(InferenceResult& infResult) {
+ ImageResult* result = new ImageResult(infResult.frameId, infResult.metaData);
+ const auto& inputImgSize = infResult.internalModelData->asRef<InternalImageModelData>();
+ const auto& outTensor = infResult.getFirstOutputTensor();
+
+ result->resultImage = cv::Mat(outHeight, outWidth, CV_8UC1);
+
+ if (outChannels == 1 && outTensor.get_element_type() == ov::element::i32) {
+ cv::Mat predictions(outHeight, outWidth, CV_32SC1, outTensor.data<int32_t>());
+ predictions.convertTo(result->resultImage, CV_8UC1);
+ } else if (outChannels == 1 && outTensor.get_element_type() == ov::element::i64) {
+ cv::Mat predictions(outHeight, outWidth, CV_32SC1);
+ const auto data = outTensor.data<int64_t>();
+ for (size_t i = 0; i < predictions.total(); ++i) {
+ reinterpret_cast<int32_t*>(predictions.data)[i] = int32_t(data[i]);
+ }
+ predictions.convertTo(result->resultImage, CV_8UC1);
+ } else if (outTensor.get_element_type() == ov::element::f32) {
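+ // For floating-point outputs take, for every pixel, the argmax over the class channels.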
+ const float* data = outTensor.data<float>();
+ for (int rowId = 0; rowId < outHeight; ++rowId) {
+ for (int colId = 0; colId < outWidth; ++colId) {
+ int classId = 0;
+ float maxProb = -1.0f;
+ for (int chId = 0; chId < outChannels; ++chId) {
+ float prob = data[chId * outHeight * outWidth + rowId * outWidth + colId];
+ if (prob > maxProb) {
+ classId = chId;
+ maxProb = prob;
+ }
+ } // nChannels
+
+ result->resultImage.at<uint8_t>(rowId, colId) = classId;
+ } // width
+ } // height
+ }
+
+ cv::resize(result->resultImage,
+ result->resultImage,
+ cv::Size(inputImgSize.inputImgWidth, inputImgSize.inputImgHeight),
+ 0,
+ 0,
+ cv::INTER_NEAREST);
+
+ return std::unique_ptr<ResultBase>(result);
+}
diff --git a/python/openvino/runtime/common/models/src/style_transfer_model.cpp b/python/openvino/runtime/common/models/src/style_transfer_model.cpp
new file mode 100644
index 0000000..53e8561
--- /dev/null
+++ b/python/openvino/runtime/common/models/src/style_transfer_model.cpp
@@ -0,0 +1,107 @@
+/*
+// Copyright (C) 2021-2022 Intel Corporation
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+*/
+
+#include "models/style_transfer_model.h"
+
+#include <stddef.h>
+
+#include <memory>
+#include <stdexcept>
+#include <string>
+#include <vector>
+
+#include <opencv2/core.hpp>
+#include <opencv2/imgproc.hpp>
+#include <openvino/openvino.hpp>
+
+#include <utils/image_utils.h>
+#include <utils/ocv_common.hpp>
+
+#include "models/input_data.h"
+#include "models/internal_model_data.h"
+#include "models/results.h"
+
+StyleTransferModel::StyleTransferModel(const std::string& modelFileName, const std::string& layout)
+ : ImageModel(modelFileName, false, layout) {}
+
+void StyleTransferModel::prepareInputsOutputs(std::shared_ptr<ov::Model>& model) {
+ // --------------------------- Configure input & output ---------------------------------------------
+ // --------------------------- Prepare input --------------------------------------------------
+ if (model->inputs().size() != 1) {
+ throw std::logic_error("Style transfer model wrapper supports topologies with only 1 input");
+ }
+
+ inputsNames.push_back(model->input().get_any_name());
+
+ const ov::Shape& inputShape = model->input().get_shape();
+ ov::Layout inputLayout = getInputLayout(model->input());
+
+ if (inputShape.size() != 4 || inputShape[ov::layout::batch_idx(inputLayout)] != 1 ||
+ inputShape[ov::layout::channels_idx(inputLayout)] != 3) {
+ throw std::logic_error("3-channel 4-dimensional model's input is expected");
+ }
+
+ netInputWidth = inputShape[ov::layout::width_idx(inputLayout)];
+ netInputHeight = inputShape[ov::layout::height_idx(inputLayout)];
+
+ ov::preprocess::PrePostProcessor ppp(model);
+ ppp.input().preprocess().convert_element_type(ov::element::f32);
+ ppp.input().tensor().set_element_type(ov::element::u8).set_layout("NHWC");
+
+ ppp.input().model().set_layout(inputLayout);
+
+ // --------------------------- Prepare output -----------------------------------------------------
+ const ov::OutputVector& outputs = model->outputs();
+ if (outputs.size() != 1) {
+ throw std::logic_error("Style transfer model wrapper supports topologies with only 1 output");
+ }
+ outputsNames.push_back(model->output().get_any_name());
+
+ const ov::Shape& outputShape = model->output().get_shape();
+ ov::Layout outputLayout{"NCHW"};
+ if (outputShape.size() != 4 || outputShape[ov::layout::batch_idx(outputLayout)] != 1 ||
+ outputShape[ov::layout::channels_idx(outputLayout)] != 3) {
+ throw std::logic_error("3-channel 4-dimensional model's output is expected");
+ }
+
+ ppp.output().tensor().set_element_type(ov::element::f32);
+ model = ppp.build();
+}
+
+std::unique_ptr<ResultBase> StyleTransferModel::postprocess(InferenceResult& infResult) {
+ ImageResult* result = new ImageResult;
+ *static_cast<ResultBase*>(result) = static_cast<ResultBase&>(infResult);
+
+ const auto& inputImgSize = infResult.internalModelData->asRef<InternalImageModelData>();
+ const auto outputData = infResult.getFirstOutputTensor().data<float>();
+
+ const ov::Shape& outputShape = infResult.getFirstOutputTensor().get_shape();
+    // The output is treated as NCHW (see prepareInputsOutputs), so dims 2 and 3 are height and width
+    const int outHeight = static_cast<int>(outputShape[2]);
+    const int outWidth = static_cast<int>(outputShape[3]);
+    const size_t numOfPixels = static_cast<size_t>(outWidth) * outHeight;
+
+    // Wrap the output planes without copying, in reverse channel order so that cv::merge produces a BGR image
+    std::vector<cv::Mat> imgPlanes{cv::Mat(outHeight, outWidth, CV_32FC1, &(outputData[numOfPixels * 2])),
+                                   cv::Mat(outHeight, outWidth, CV_32FC1, &(outputData[numOfPixels])),
+                                   cv::Mat(outHeight, outWidth, CV_32FC1, &(outputData[0]))};
+ cv::Mat resultImg;
+ cv::merge(imgPlanes, resultImg);
+ cv::resize(resultImg, result->resultImage, cv::Size(inputImgSize.inputImgWidth, inputImgSize.inputImgHeight));
+
+ result->resultImage.convertTo(result->resultImage, CV_8UC3);
+
+ return std::unique_ptr<ResultBase>(result);
+}
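A minimal, self-contained sketch of the plane handling performed in StyleTransferModel::postprocess above, assuming a hypothetical NCHW float buffer that holds one 3-channel image: each plane is wrapped without copying, the planes are merged in reverse order to obtain a BGR image for OpenCV, and the result is converted to 8-bit. The function name and parameters are illustrative, not part of the wrapper's API.

#include <opencv2/core.hpp>
#include <vector>

// 'chw' points to 3 * outHeight * outWidth floats laid out plane by plane (NCHW, batch of 1).
cv::Mat chwPlanesToBgr8U(const float* chw, int outHeight, int outWidth) {
    const size_t numOfPixels = static_cast<size_t>(outWidth) * outHeight;
    // Wrap the three planes without copying; listing them last-to-first makes cv::merge produce BGR.
    std::vector<cv::Mat> planes{
        cv::Mat(outHeight, outWidth, CV_32FC1, const_cast<float*>(chw + numOfPixels * 2)),
        cv::Mat(outHeight, outWidth, CV_32FC1, const_cast<float*>(chw + numOfPixels)),
        cv::Mat(outHeight, outWidth, CV_32FC1, const_cast<float*>(chw))};
    cv::Mat merged;
    cv::merge(planes, merged);
    cv::Mat bgr8u;
    merged.convertTo(bgr8u, CV_8UC3);  // values are assumed to already be in the 0..255 range
    return bgr8u;
}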
diff --git a/python/openvino/runtime/common/models/src/super_resolution_model.cpp b/python/openvino/runtime/common/models/src/super_resolution_model.cpp
new file mode 100644
index 0000000..164991a
--- /dev/null
+++ b/python/openvino/runtime/common/models/src/super_resolution_model.cpp
@@ -0,0 +1,207 @@
+/*
+// Copyright (C) 2021-2022 Intel Corporation
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+*/
+
+#include "models/super_resolution_model.h"
+
+#include <stddef.h>
+
+#include <map>
+#include <stdexcept>
+#include <string>
+#include <utility>
+#include <vector>
+
+#include <opencv2/imgproc.hpp>
+#include <openvino/openvino.hpp>
+
+#include <utils/image_utils.h>
+#include <utils/ocv_common.hpp>
+#include <utils/slog.hpp>
+
+#include "models/input_data.h"
+#include "models/internal_model_data.h"
+#include "models/results.h"
+
+SuperResolutionModel::SuperResolutionModel(const std::string& modelFileName,
+ const cv::Size& inputImgSize,
+ const std::string& layout)
+ : ImageModel(modelFileName, false, layout) {
+ netInputHeight = inputImgSize.height;
+ netInputWidth = inputImgSize.width;
+}
+
+void SuperResolutionModel::prepareInputsOutputs(std::shared_ptr<ov::Model>& model) {
+ // --------------------------- Configure input & output ---------------------------------------------
+ // --------------------------- Prepare input --------------------------------------------------
+ const ov::OutputVector& inputs = model->inputs();
+ if (inputs.size() != 1 && inputs.size() != 2) {
+ throw std::logic_error("Super resolution model wrapper supports topologies with 1 or 2 inputs only");
+ }
+ std::string lrInputTensorName = inputs.begin()->get_any_name();
+ inputsNames.push_back(lrInputTensorName);
+ ov::Shape lrShape = inputs.begin()->get_shape();
+ if (lrShape.size() != 4) {
+ throw std::logic_error("Number of dimensions for an input must be 4");
+ }
+ // in case of 2 inputs they have the same layouts
+ ov::Layout inputLayout = getInputLayout(model->inputs().front());
+
+ auto channelsId = ov::layout::channels_idx(inputLayout);
+ auto heightId = ov::layout::height_idx(inputLayout);
+ auto widthId = ov::layout::width_idx(inputLayout);
+
+ if (lrShape[channelsId] != 1 && lrShape[channelsId] != 3) {
+ throw std::logic_error("Input layer is expected to have 1 or 3 channels");
+ }
+
+ // A model like single-image-super-resolution-???? may take bicubic interpolation of the input image as the
+ // second input
+ std::string bicInputTensorName;
+ if (inputs.size() == 2) {
+ bicInputTensorName = (++inputs.begin())->get_any_name();
+ inputsNames.push_back(bicInputTensorName);
+ ov::Shape bicShape = (++inputs.begin())->get_shape();
+ if (bicShape.size() != 4) {
+ throw std::logic_error("Number of dimensions for both inputs must be 4");
+ }
+ if (lrShape[widthId] >= bicShape[widthId] && lrShape[heightId] >= bicShape[heightId]) {
+ std::swap(bicShape, lrShape);
+ inputsNames[0].swap(inputsNames[1]);
+ } else if (!(lrShape[widthId] <= bicShape[widthId] && lrShape[heightId] <= bicShape[heightId])) {
+ throw std::logic_error("Each spatial dimension of one input must surpass or be equal to a spatial"
+ "dimension of another input");
+ }
+ }
+
+ ov::preprocess::PrePostProcessor ppp(model);
+ for (const auto& input : inputs) {
+ ppp.input(input.get_any_name()).tensor().set_element_type(ov::element::u8).set_layout("NHWC");
+
+ ppp.input(input.get_any_name()).model().set_layout(inputLayout);
+ }
+
+ // --------------------------- Prepare output -----------------------------------------------------
+ const ov::OutputVector& outputs = model->outputs();
+ if (outputs.size() != 1) {
+ throw std::logic_error("Super resolution model wrapper supports topologies with only 1 output");
+ }
+
+ outputsNames.push_back(outputs.begin()->get_any_name());
+ ppp.output().tensor().set_element_type(ov::element::f32);
+ model = ppp.build();
+
+ const ov::Shape& outShape = model->output().get_shape();
+
+ const ov::Layout outputLayout("NCHW");
+ const auto outWidth = outShape[ov::layout::width_idx(outputLayout)];
+    // lrShape was read from the (possibly swapped) LR input above, so index it with the input layout
+    const auto inWidth = lrShape[widthId];
+ changeInputSize(model, static_cast<int>(outWidth / inWidth));
+}
+
+void SuperResolutionModel::changeInputSize(std::shared_ptr<ov::Model>& model, int coeff) {
+ std::map<std::string, ov::PartialShape> shapes;
+ const ov::Layout& layout = ov::layout::get_layout(model->inputs().front());
+ const auto batchId = ov::layout::batch_idx(layout);
+ const auto heightId = ov::layout::height_idx(layout);
+ const auto widthId = ov::layout::width_idx(layout);
+
+ const ov::OutputVector& inputs = model->inputs();
+ std::string lrInputTensorName = inputs.begin()->get_any_name();
+ ov::Shape lrShape = inputs.begin()->get_shape();
+
+ if (inputs.size() == 2) {
+ std::string bicInputTensorName = (++inputs.begin())->get_any_name();
+ ov::Shape bicShape = (++inputs.begin())->get_shape();
+ if (lrShape[heightId] >= bicShape[heightId] && lrShape[widthId] >= bicShape[widthId]) {
+ std::swap(bicShape, lrShape);
+ std::swap(bicInputTensorName, lrInputTensorName);
+ }
+ bicShape[batchId] = 1;
+ bicShape[heightId] = coeff * netInputHeight;
+ bicShape[widthId] = coeff * netInputWidth;
+ shapes[bicInputTensorName] = ov::PartialShape(bicShape);
+ }
+
+ lrShape[batchId] = 1;
+ lrShape[heightId] = netInputHeight;
+ lrShape[widthId] = netInputWidth;
+ shapes[lrInputTensorName] = ov::PartialShape(lrShape);
+
+ model->reshape(shapes);
+}
+
+std::shared_ptr<InternalModelData> SuperResolutionModel::preprocess(const InputData& inputData,
+ ov::InferRequest& request) {
+ auto imgData = inputData.asRef<ImageInputData>();
+ auto& img = imgData.inputImage;
+
+ const ov::Tensor lrInputTensor = request.get_tensor(inputsNames[0]);
+ const ov::Layout layout("NHWC");
+
+ if (img.channels() != static_cast<int>(lrInputTensor.get_shape()[ov::layout::channels_idx(layout)])) {
+ cv::cvtColor(img, img, cv::COLOR_BGR2GRAY);
+ }
+
+ if (static_cast<size_t>(img.cols) != netInputWidth || static_cast<size_t>(img.rows) != netInputHeight) {
+        slog::warn << "\tInput image size doesn't match the model's input size; the image will be resized" << slog::endl;
+ }
+ const size_t height = lrInputTensor.get_shape()[ov::layout::height_idx(layout)];
+ const size_t width = lrInputTensor.get_shape()[ov::layout::width_idx(layout)];
+ img = resizeImageExt(img, width, height);
+ request.set_tensor(inputsNames[0], wrapMat2Tensor(img));
+
+ if (inputsNames.size() == 2) {
+ const ov::Tensor bicInputTensor = request.get_tensor(inputsNames[1]);
+ const int h = static_cast<int>(bicInputTensor.get_shape()[ov::layout::height_idx(layout)]);
+ const int w = static_cast<int>(bicInputTensor.get_shape()[ov::layout::width_idx(layout)]);
+ cv::Mat resized;
+ cv::resize(img, resized, cv::Size(w, h), 0, 0, cv::INTER_CUBIC);
+ request.set_tensor(inputsNames[1], wrapMat2Tensor(resized));
+ }
+
+ return std::make_shared<InternalImageModelData>(img.cols, img.rows);
+}
+
+std::unique_ptr<ResultBase> SuperResolutionModel::postprocess(InferenceResult& infResult) {
+ ImageResult* result = new ImageResult;
+ *static_cast<ResultBase*>(result) = static_cast<ResultBase&>(infResult);
+ const auto outputData = infResult.getFirstOutputTensor().data<float>();
+
+ std::vector<cv::Mat> imgPlanes;
+ const ov::Shape& outShape = infResult.getFirstOutputTensor().get_shape();
+    // The output is treated as NCHW: dims 1, 2 and 3 are channels, height and width
+    const size_t outChannels = outShape[1];
+    const int outHeight = static_cast<int>(outShape[2]);
+    const int outWidth = static_cast<int>(outShape[3]);
+    const size_t numOfPixels = static_cast<size_t>(outWidth) * outHeight;
+ if (outChannels == 3) {
+ imgPlanes = std::vector<cv::Mat>{cv::Mat(outHeight, outWidth, CV_32FC1, &(outputData[0])),
+ cv::Mat(outHeight, outWidth, CV_32FC1, &(outputData[numOfPixels])),
+ cv::Mat(outHeight, outWidth, CV_32FC1, &(outputData[numOfPixels * 2]))};
+ } else {
+ imgPlanes = std::vector<cv::Mat>{cv::Mat(outHeight, outWidth, CV_32FC1, &(outputData[0]))};
+ // Post-processing for text-image-super-resolution models
+ cv::threshold(imgPlanes[0], imgPlanes[0], 0.5f, 1.0f, cv::THRESH_BINARY);
+ }
+
+ for (auto& img : imgPlanes) {
+ img.convertTo(img, CV_8UC1, 255);
+ }
+ cv::Mat resultImg;
+ cv::merge(imgPlanes, resultImg);
+ result->resultImage = resultImg;
+
+ return std::unique_ptr<ResultBase>(result);
+}
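A minimal sketch, independent of the classes above, of how the optional second input of a two-input super-resolution model relates to the first, mirroring SuperResolutionModel::preprocess: the source image is resized to the low-resolution input size, and a bicubically upscaled copy is produced for the second input. The function name, parameter names, and the default coefficient are illustrative assumptions.

#include <opencv2/core.hpp>
#include <opencv2/imgproc.hpp>
#include <utility>

// Returns {low-resolution input, bicubic input}; the second Mat is only needed for two-input models.
std::pair<cv::Mat, cv::Mat> prepareSrInputs(const cv::Mat& img,
                                            int netInputWidth,
                                            int netInputHeight,
                                            int coeff = 4) {
    cv::Mat lr;
    cv::resize(img, lr, cv::Size(netInputWidth, netInputHeight));
    cv::Mat bicubic;
    // The bicubic input has the same spatial size as the model output: coeff times the LR size.
    cv::resize(lr, bicubic, cv::Size(coeff * netInputWidth, coeff * netInputHeight), 0, 0, cv::INTER_CUBIC);
    return {lr, bicubic};
}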