/*
// Copyright (C) 2020-2022 Intel Corporation
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//      http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
*/

// NOTE(review): the angle-bracket include targets and all template arguments
// were missing from the reviewed text of this file and have been restored from
// usage - confirm them against the class header and the build.

#include "models/detection_model_retinaface.h"

#include <stddef.h>

#include <algorithm>
#include <cmath>
#include <stdexcept>

#include <opencv2/core.hpp>
#include <openvino/openvino.hpp>

#include <utils/common.hpp>
#include <utils/nms.hpp>

#include "models/internal_model_data.h"
#include "models/results.h"

/// Builds the wrapper with the default single label "Face" and the three
/// anchor configuration lines (strides 32, 16 and 8), then precomputes the
/// per-stride base anchors.
ModelRetinaFace::ModelRetinaFace(const std::string& modelFileName,
                                 float confidenceThreshold,
                                 bool useAutoResize,
                                 float boxIOUThreshold,
                                 const std::string& layout)
    : DetectionModel(modelFileName, confidenceThreshold, useAutoResize, {"Face"}, layout),  // Default label is "Face"
      shouldDetectMasks(false),
      shouldDetectLandmarks(false),
      boxIOUThreshold(boxIOUThreshold),
      maskThreshold(0.8f),
      landmarkStd(1.0f),
      anchorCfg({{32, {32, 16}, 16, {1}}, {16, {8, 4}, 16, {1}}, {8, {2, 1}, 16, {1}}}) {
    generateAnchorsFpn();
}

/// Validates the model's input/outputs, attaches pre/post-processing, sorts the
/// outputs of every kind by feature-map size and generates the anchors for
/// every pyramid level.
///
/// @param model  model to configure; replaced by the result of ppp.build().
/// @throws std::logic_error if the model does not have exactly one 3-channel
///         input, does not have 6, 9 or 12 outputs, or does not provide one
///         output of each detected kind per anchor configuration line.
void ModelRetinaFace::prepareInputsOutputs(std::shared_ptr<ov::Model>& model) {
    // --------------------------- Configure input & output -------------------------------------------------
    // --------------------------- Prepare input  ------------------------------------------------------
    if (model->inputs().size() != 1) {
        throw std::logic_error("RetinaFace model wrapper expects models that have only 1 input");
    }
    const ov::Shape& inputShape = model->input().get_shape();
    const ov::Layout& inputLayout = getInputLayout(model->input());

    if (inputShape[ov::layout::channels_idx(inputLayout)] != 3) {
        throw std::logic_error("Expected 3-channel input");
    }

    ov::preprocess::PrePostProcessor ppp(model);
    ppp.input().tensor().set_element_type(ov::element::u8).set_layout({"NHWC"});

    if (useAutoResize) {
        ppp.input().tensor().set_spatial_dynamic_shape();

        ppp.input()
            .preprocess()
            .convert_element_type(ov::element::f32)
            .resize(ov::preprocess::ResizeAlgorithm::RESIZE_LINEAR);
    }

    ppp.input().model().set_layout(inputLayout);

    // --------------------------- Reading image input parameters -------------------------------------------
    inputsNames.push_back(model->input().get_any_name());
    netInputWidth = inputShape[ov::layout::width_idx(inputLayout)];
    netInputHeight = inputShape[ov::layout::height_idx(inputLayout)];

    // --------------------------- Prepare output  -----------------------------------------------------
    const ov::OutputVector& outputs = model->outputs();
    if (outputs.size() != 6 && outputs.size() != 9 && outputs.size() != 12) {
        throw std::logic_error("RetinaFace model wrapper expects models that have 6, 9 or 12 outputs");
    }

    const ov::Layout outputLayout{"NCHW"};
    // Feature-map heights of the outputs of every kind, kept in ascending order.
    std::vector<size_t> outputsSizes[OUT_MAX];
    // Feature-map widths of the box outputs, kept parallel to outputsSizes[OUT_BOXES].
    std::vector<size_t> boxesWidths;
    for (const auto& output : outputs) {
        auto outTensorName = output.get_any_name();
        outputsNames.push_back(outTensorName);
        ppp.output(outTensorName).tensor().set_element_type(ov::element::f32).set_layout(outputLayout);

        // The kind of an output is recognised by a substring of its tensor name.
        OutputType type = OUT_MAX;
        if (outTensorName.find("box") != std::string::npos) {
            type = OUT_BOXES;
        } else if (outTensorName.find("cls") != std::string::npos) {
            type = OUT_SCORES;
        } else if (outTensorName.find("landmark") != std::string::npos) {
            type = OUT_LANDMARKS;
            shouldDetectLandmarks = true;
        } else if (outTensorName.find("type") != std::string::npos) {
            type = OUT_MASKSCORES;
            labels.clear();
            labels.push_back("No Mask");
            labels.push_back("Mask");
            shouldDetectMasks = true;
            landmarkStd = 0.2f;
        } else {
            continue;
        }

        const ov::Shape outShape = output.get_shape();
        const size_t num = outShape[ov::layout::height_idx(outputLayout)];

        // Insertion position that keeps the outputs of this kind sorted by height.
        size_t i = 0;
        for (; i < outputsSizes[type].size(); ++i) {
            if (num < outputsSizes[type][i]) {
                break;
            }
        }
        separateOutputsNames[type].insert(separateOutputsNames[type].begin() + i, outTensorName);
        outputsSizes[type].insert(outputsSizes[type].begin() + i, num);
        if (type == OUT_BOXES) {
            boxesWidths.insert(boxesWidths.begin() + i, outShape[ov::layout::width_idx(outputLayout)]);
        }
    }

    // Anchor generation below and postprocess() index the per-kind outputs by
    // anchor configuration line, so every detected kind must provide exactly
    // one output per line; otherwise those accesses would be out of bounds.
    const size_t levels = anchorCfg.size();
    if (outputsSizes[OUT_BOXES].size() != levels || outputsSizes[OUT_SCORES].size() != levels ||
        (shouldDetectLandmarks && outputsSizes[OUT_LANDMARKS].size() != levels) ||
        (shouldDetectMasks && outputsSizes[OUT_MASKSCORES].size() != levels)) {
        throw std::logic_error("RetinaFace model wrapper expects one output of each kind per anchor level");
    }

    model = ppp.build();

    for (size_t idx = 0; idx < outputsSizes[OUT_BOXES].size(); ++idx) {
        const size_t height = outputsSizes[OUT_BOXES][idx];
        // Taken from the width axis so that non-square feature maps get the
        // right number of anchors in the right order.
        const size_t width = boxesWidths[idx];
        const auto s = anchorCfg[idx].stride;
        const std::vector<Anchor>& baseAnchors = anchorsFpn[s];
        const auto anchorNum = baseAnchors.size();

        anchors.emplace_back(height * width * anchorNum);
        std::vector<Anchor>& levelAnchors = anchors.back();
        for (size_t iw = 0; iw < width; ++iw) {
            const size_t sw = iw * s;
            for (size_t ih = 0; ih < height; ++ih) {
                const size_t sh = ih * s;
                for (size_t k = 0; k < anchorNum; ++k) {
                    // Same (row, column, anchor) flattening as thresholding() produces.
                    Anchor& anc = levelAnchors[(ih * width + iw) * anchorNum + k];
                    anc.left = baseAnchors[k].left + sw;
                    anc.top = baseAnchors[k].top + sh;
                    anc.right = baseAnchors[k].right + sw;
                    anc.bottom = baseAnchors[k].bottom + sh;
                }
            }
        }
    }
}

/// Returns one anchor per ratio, each centred on `anchor`, with width
/// sqrt(w * h / ratio) and height equal to that width multiplied by the ratio.
std::vector<Anchor> ratioEnum(const Anchor& anchor, const std::vector<int>& ratios) {
    std::vector<Anchor> retVal;
    const auto w = anchor.getWidth();
    const auto h = anchor.getHeight();
    const auto xCtr = anchor.getXCenter();
    const auto yCtr = anchor.getYCenter();

    for (const auto ratio : ratios) {
        const auto size = w * h;
        const auto sizeRatio = static_cast<float>(size) / ratio;
        const auto ws = sqrt(sizeRatio);
        const auto hs = ws * ratio;
        retVal.push_back({static_cast<float>(xCtr - 0.5f * (ws - 1.0f)),
                          static_cast<float>(yCtr - 0.5f * (hs - 1.0f)),
                          static_cast<float>(xCtr + 0.5f * (ws - 1.0f)),
                          static_cast<float>(yCtr + 0.5f * (hs - 1.0f))});
    }
    return retVal;
}

/// Returns one anchor per scale, each centred on `anchor`, with the width and
/// height of `anchor` multiplied by the scale.
std::vector<Anchor> scaleEnum(const Anchor& anchor, const std::vector<int>& scales) {
    std::vector<Anchor> retVal;
    const auto w = anchor.getWidth();
    const auto h = anchor.getHeight();
    const auto xCtr = anchor.getXCenter();
    const auto yCtr = anchor.getYCenter();

    for (auto scale : scales) {
        const auto ws = w * scale;
        const auto hs = h * scale;
        retVal.push_back({static_cast<float>(xCtr - 0.5f * (ws - 1.0f)),
                          static_cast<float>(yCtr - 0.5f * (hs - 1.0f)),
                          static_cast<float>(xCtr + 0.5f * (ws - 1.0f)),
                          static_cast<float>(yCtr + 0.5f * (hs - 1.0f))});
    }
    return retVal;
}

/// Expands the base anchor [0, 0, baseSize - 1, baseSize - 1] over every ratio
/// and, for each ratio anchor, over every scale (ratio-major order).
std::vector<Anchor> generateAnchors(const int baseSize,
                                    const std::vector<int>& ratios,
                                    const std::vector<int>& scales) {
    Anchor baseAnchor{0.0f, 0.0f, baseSize - 1.0f, baseSize - 1.0f};
    auto ratioAnchors = ratioEnum(baseAnchor, ratios);
    std::vector<Anchor> retVal;

    for (const auto& ra : ratioAnchors) {
        auto addon = scaleEnum(ra, scales);
        retVal.insert(retVal.end(), addon.begin(), addon.end());
    }
    return retVal;
}

/// Fills anchorsFpn with the base anchors of every configuration line, keyed
/// by the line's stride.
void ModelRetinaFace::generateAnchorsFpn() {
    auto cfg = anchorCfg;
    std::sort(cfg.begin(), cfg.end(), [](const AnchorCfgLine& x, const AnchorCfgLine& y) {
        return x.stride > y.stride;
    });

    for (const auto& cfgLine : cfg) {
        anchorsFpn.emplace(cfgLine.stride, generateAnchors(cfgLine.baseSize, cfgLine.ratios, cfgLine.scales));
    }
}

/// Scans channels [anchorNum, shape[1]) of the scores tensor and returns, for
/// every value >= confidenceThreshold, the flattened index
/// (y * shape[3] + z) * (shape[1] - anchorNum) + (channel - anchorNum).
/// The tensor is indexed as a 4-D array of floats.
std::vector<size_t> thresholding(const ov::Tensor& scoresTensor, const int anchorNum, const float confidenceThreshold) {
    std::vector<size_t> indices;
    indices.reserve(ModelRetinaFace::INIT_VECTOR_SIZE);
    auto shape = scoresTensor.get_shape();
    size_t restAnchors = shape[1] - anchorNum;
    const float* scoresPtr = scoresTensor.data<float>();

    for (size_t x = anchorNum; x < shape[1]; ++x) {
        for (size_t y = 0; y < shape[2]; ++y) {
            for (size_t z = 0; z < shape[3]; ++z) {
                auto idx = (x * shape[2] + y) * shape[3] + z;
                auto score = scoresPtr[idx];
                if (score >= confidenceThreshold) {
                    indices.push_back((y * shape[3] + z) * restAnchors + (x - anchorNum));
                }
            }
        }
    }

    return indices;
}

/// Appends to `scores` the score of every index in `indices`, read from the
/// part of the tensor that starts after the first anchorNum channels.
void filterScores(std::vector<float>& scores,
                  const std::vector<size_t>& indices,
                  const ov::Tensor& scoresTensor,
                  const int anchorNum) {
    const auto& shape = scoresTensor.get_shape();
    const float* scoresPtr = scoresTensor.data<float>();
    const auto start = shape[2] * shape[3] * anchorNum;

    for (auto i : indices) {
        auto offset = (i % anchorNum) * shape[2] * shape[3] + i / anchorNum;
        scores.push_back(scoresPtr[start + offset]);
    }
}

/// Appends to `boxes` the decoded box of every index in `indices`: the four
/// predicted deltas (dx, dy, dw, dh) are applied to the anchor with the same
/// index.
void filterBoxes(std::vector<Anchor>& boxes,
                 const std::vector<size_t>& indices,
                 const ov::Tensor& boxesTensor,
                 int anchorNum,
                 const std::vector<Anchor>& anchors) {
    const auto& shape = boxesTensor.get_shape();
    const float* boxesPtr = boxesTensor.data<float>();
    const auto boxPredLen = shape[1] / anchorNum;
    const auto blockWidth = shape[2] * shape[3];

    for (auto i : indices) {
        auto offset = blockWidth * boxPredLen * (i % anchorNum) + (i / anchorNum);

        const auto dx = boxesPtr[offset];
        const auto dy = boxesPtr[offset + blockWidth];
        const auto dw = boxesPtr[offset + blockWidth * 2];
        const auto dh = boxesPtr[offset + blockWidth * 3];

        const auto predCtrX = dx * anchors[i].getWidth() + anchors[i].getXCenter();
        const auto predCtrY = dy * anchors[i].getHeight() + anchors[i].getYCenter();
        const auto predW = exp(dw) * anchors[i].getWidth();
        const auto predH = exp(dh) * anchors[i].getHeight();

        boxes.push_back({static_cast<float>(predCtrX - 0.5f * (predW - 1.0f)),
                         static_cast<float>(predCtrY - 0.5f * (predH - 1.0f)),
                         static_cast<float>(predCtrX + 0.5f * (predW - 1.0f)),
                         static_cast<float>(predCtrY + 0.5f * (predH - 1.0f))});
    }
}

/// Appends to `landmarks` LANDMARKS_NUM points for every index in `indices`:
/// each predicted (x, y) delta is multiplied by landmarkStd and applied to the
/// anchor with the same index.
void filterLandmarks(std::vector<cv::Point2f>& landmarks,
                     const std::vector<size_t>& indices,
                     const ov::Tensor& landmarksTensor,
                     int anchorNum,
                     const std::vector<Anchor>& anchors,
                     const float landmarkStd) {
    const auto& shape = landmarksTensor.get_shape();
    const float* landmarksPtr = landmarksTensor.data<float>();
    const auto landmarkPredLen = shape[1] / anchorNum;
    const auto blockWidth = shape[2] * shape[3];

    for (auto i : indices) {
        // Does not depend on the landmark number, so it is computed once per index.
        const auto offset = (i % anchorNum) * landmarkPredLen * shape[2] * shape[3] + i / anchorNum;
        for (int j = 0; j < ModelRetinaFace::LANDMARKS_NUM; ++j) {
            auto deltaX = landmarksPtr[offset + j * 2 * blockWidth] * landmarkStd;
            auto deltaY = landmarksPtr[offset + (j * 2 + 1) * blockWidth] * landmarkStd;
            landmarks.push_back({deltaX * anchors[i].getWidth() + anchors[i].getXCenter(),
                                 deltaY * anchors[i].getHeight() + anchors[i].getYCenter()});
        }
    }
}

/// Appends to `masks` the mask score of every index in `indices`, read from
/// the part of the tensor that starts after the first 2 * anchorNum channels.
void filterMasksScores(std::vector<float>& masks,
                       const std::vector<size_t>& indices,
                       const ov::Tensor& maskScoresTensor,
                       const int anchorNum) {
    auto shape = maskScoresTensor.get_shape();
    const float* maskScoresPtr = maskScoresTensor.data<float>();
    auto start = shape[2] * shape[3] * anchorNum * 2;

    for (auto i : indices) {
        auto offset = (i % anchorNum) * shape[2] * shape[3] + i / anchorNum;
        masks.push_back(maskScoresPtr[start + offset]);
    }
}

/// Turns the raw outputs of all pyramid levels into a RetinaFaceDetectionResult:
/// filters candidates by confidence, decodes boxes (and landmarks / mask scores
/// when the model provides them), applies NMS and rescales the kept detections
/// from network-input coordinates to source-image coordinates.
///
/// @param infResult  inference result holding the output tensors, the frame id,
///                   the metadata and the source image size.
/// @return the detection result, owned by the caller.
std::unique_ptr<ResultBase> ModelRetinaFace::postprocess(InferenceResult& infResult) {
    std::vector<float> scores;
    scores.reserve(INIT_VECTOR_SIZE);
    std::vector<Anchor> boxes;
    boxes.reserve(INIT_VECTOR_SIZE);
    std::vector<cv::Point2f> landmarks;
    std::vector<float> masks;

    if (shouldDetectLandmarks) {
        landmarks.reserve(INIT_VECTOR_SIZE);
    }
    if (shouldDetectMasks) {
        masks.reserve(INIT_VECTOR_SIZE);
    }

    // --------------------------- Gather & Filter output from all levels
    // ----------------------------------------------------------
    for (size_t idx = 0; idx < anchorCfg.size(); ++idx) {
        const auto boxRaw = infResult.outputsData[separateOutputsNames[OUT_BOXES][idx]];
        const auto scoresRaw = infResult.outputsData[separateOutputsNames[OUT_SCORES][idx]];
        const auto s = anchorCfg[idx].stride;
        const int anchorNum = static_cast<int>(anchorsFpn[s].size());

        const auto validIndices = thresholding(scoresRaw, anchorNum, confidenceThreshold);
        filterScores(scores, validIndices, scoresRaw, anchorNum);
        filterBoxes(boxes, validIndices, boxRaw, anchorNum, anchors[idx]);
        if (shouldDetectLandmarks) {
            const auto landmarksRaw = infResult.outputsData[separateOutputsNames[OUT_LANDMARKS][idx]];
            filterLandmarks(landmarks, validIndices, landmarksRaw, anchorNum, anchors[idx], landmarkStd);
        }
        if (shouldDetectMasks) {
            const auto masksRaw = infResult.outputsData[separateOutputsNames[OUT_MASKSCORES][idx]];
            filterMasksScores(masks, validIndices, masksRaw, anchorNum);
        }
    }

    // --------------------------- Apply Non-maximum Suppression
    // ---------------------------------------------------------- !shouldDetectLandmarks determines nms behavior, if
    // true - boundaries are included in areas calculation
    const auto keep = nms(boxes, scores, boxIOUThreshold, !shouldDetectLandmarks);

    // --------------------------- Create detection result objects
    // --------------------------------------------------------
    // Owned by a smart pointer from the start so that nothing leaks if filling
    // the result throws.
    std::unique_ptr<RetinaFaceDetectionResult> result(
        new RetinaFaceDetectionResult(infResult.frameId, infResult.metaData));

    const auto imgWidth = infResult.internalModelData->asRef<InternalImageModelData>().inputImgWidth;
    const auto imgHeight = infResult.internalModelData->asRef<InternalImageModelData>().inputImgHeight;
    const auto scaleX = static_cast<float>(netInputWidth) / imgWidth;
    const auto scaleY = static_cast<float>(netInputHeight) / imgHeight;

    result->objects.reserve(keep.size());
    result->landmarks.reserve(keep.size() * ModelRetinaFace::LANDMARKS_NUM);
    for (auto i : keep) {
        DetectedObject desc;
        desc.confidence = scores[i];
        //--- Scaling coordinates
        boxes[i].left /= scaleX;
        boxes[i].top /= scaleY;
        boxes[i].right /= scaleX;
        boxes[i].bottom /= scaleY;

        desc.x = clamp(boxes[i].left, 0.f, static_cast<float>(imgWidth));
        desc.y = clamp(boxes[i].top, 0.f, static_cast<float>(imgHeight));
        desc.width = clamp(boxes[i].getWidth(), 0.f, static_cast<float>(imgWidth));
        desc.height = clamp(boxes[i].getHeight(), 0.f, static_cast<float>(imgHeight));
        //--- Default label 0 - Face. If detecting masks then labels would be 0 - No Mask, 1 - Mask
        desc.labelID = shouldDetectMasks ? (masks[i] > maskThreshold) : 0;
        desc.label = labels[desc.labelID];
        result->objects.push_back(desc);

        //--- Scaling landmarks coordinates
        if (shouldDetectLandmarks) {
            for (size_t l = 0; l < ModelRetinaFace::LANDMARKS_NUM; ++l) {
                cv::Point2f& point = landmarks[i * ModelRetinaFace::LANDMARKS_NUM + l];
                point.x = clamp(point.x / scaleX, 0.f, static_cast<float>(imgWidth));
                point.y = clamp(point.y / scaleY, 0.f, static_cast<float>(imgHeight));
                result->landmarks.push_back(point);
            }
        }
    }

    return std::unique_ptr<ResultBase>(result.release());
}