Diffstat (limited to 'python/openvino/runtime/dla_benchmark/main.cpp')
-rw-r--r--  python/openvino/runtime/dla_benchmark/main.cpp  1575
1 file changed, 1575 insertions(+), 0 deletions(-)
diff --git a/python/openvino/runtime/dla_benchmark/main.cpp b/python/openvino/runtime/dla_benchmark/main.cpp
new file mode 100644
index 0000000..9d9055d
--- /dev/null
+++ b/python/openvino/runtime/dla_benchmark/main.cpp
@@ -0,0 +1,1575 @@
+// Copyright (C) 2018-2023 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+// Description: Main file of the DLA benchmark and the entry point of DLA for just-in-time (JIT) execution,
+// ahead-of-time (AOT) execution, and any other use case where DLA performs inference. This file is responsible
+// for the end-to-end flow of DLA: reading user input arguments, creating input tensors, compiling models,
+// running inference, and dumping results. The DLA benchmark is loosely based on OpenVINO's sample benchmark app,
+// so that app is a good place to start for future OpenVINO uplifts.
+// Ref: [openvinotoolkit/openvino › samples/cpp/benchmark_app/main.cpp]
+
+#include <algorithm>
+#include <chrono>
+#include <cstddef>
+#include <map>
+#include <memory>
+#include <stdexcept>
+#include <string>
+#include <utility>
+#include <vector>
+#if defined(_WIN32) || defined(_WIN64)
+#include <io.h>
+#define NOMINMAX
+#include <Windows.h>
+#else
+#include <dirent.h>
+#include <unistd.h>
+#endif
+#include <stdio.h>
+#include <sys/stat.h>
+#include <fstream>
+#include <regex>
+
+#include <samples/args_helper.hpp>
+#include <samples/common.hpp>
+#include <samples/slog.hpp>
+
+// DLA utils
+#include "dla_stl_utils.h"
+#include "dla_defines.h"
+
+// DLA benchmark
+#include "average_precision.hpp"
+#include "dla_benchmark.hpp"
+#include "dla_plugin_config.hpp"
+#include "infer_request_wrap.hpp"
+#include "inputs_filling.hpp"
+#include "progress_bar.hpp"
+#include "statistics_report.hpp"
+#include "top1_top5.hpp"
+#include "utils.hpp"
+
+using DebugNetworkData = std::map<std::string, uint64_t>;
+using LSUCounterData = std::map<std::string, uint64_t>;
+
+static const size_t progressBarDefaultTotalCount = 1000;
+
+// Get value from env variable named 'name', if it exists.
+// If not, returns provided default value.
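+// e.g. GetEnvOrDefault("PERF_EST_COREDLA_FMAX", 500) returns 500 unless the variable is set and parses as a number.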
+template <class T>
+T GetEnvOrDefault(const char* name, T default_value) {
+ char* str_val = std::getenv(name);
+ T result = default_value;
+ if (str_val != NULL) {
+ std::stringstream ss;
+ ss << str_val;
+ ss >> result;
+ }
+ return result;
+}
+
+bool ExistsTest(const std::string& name) {
+ struct stat buffer;
+ return (stat(name.c_str(), &buffer) == 0);
+}
+
+bool isFile(const std::string& path) {
+#if defined(_WIN32) || defined(_WIN64)
+ std::cout << "Windows-specific implementation for checking if something is a file" << std::endl;
+ // Windows-specific implementation
+ DWORD fileAttr = GetFileAttributesA(path.c_str());
+ if (fileAttr == INVALID_FILE_ATTRIBUTES) {
+ // The path does not exist or an error occurred.
+ return false;
+ }
+ // Check if it's not a directory.
+ return !(fileAttr & FILE_ATTRIBUTE_DIRECTORY);
+#else
+ // UNIX-specific implementation
+ struct stat buffer;
+ if (stat(path.c_str(), &buffer) == 0) {
+ return S_ISREG(buffer.st_mode);
+ }
+ return false;
+#endif
+}
+
+// This function appears in dla_aot_splitter/src/main.cpp too
+bool DirOpenTest(const std::string& name) {
+#if (!defined(_WIN32) && !defined(_WIN64))
+ // If we can open the directory then return true
+ DIR* dp = opendir(name.c_str());
+ if (dp != nullptr) {
+ closedir(dp);
+ return true;
+ }
+#endif // !_WIN32 && !_WIN64
+ struct stat sb;
+ if (stat(name.c_str(), &sb) == 0) {
+ if ((sb.st_mode & S_IFMT) != S_IFREG) {
+ slog::err << "File " << name << " cannot be opened!" << slog::endl;
+ throw std::logic_error("File cannot be opened!");
+ }
+ }
+ return true;
+}
+
+// Define a custom comparison function to sort based on ASCII names
+bool CompareOutputNodeNames(const ov::Output<const ov::Node>& node1, const ov::Output<const ov::Node>& node2) {
+ return node1.get_any_name() < node2.get_any_name();
+}
+
+// Copy the arguments into a new array, splitting '-i=<arg>' into two
+// arguments (i.e. '-i' and '<arg>') to work around a bug in the
+// parseInputFilesArguments function, where it doesn't recognize
+// the -i=<arg> format.
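+// e.g. {"dla_benchmark", "-i=/path/to/images"} becomes {"dla_benchmark", "-i", "/path/to/images"}.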
+void ParseCommandLine(int argc, char** argv) {
+ int num_args = argc;
+  // allocate enough memory in case we need to split the -i argument into two
+ char** arguments = new char*[num_args + 1];
+ for (int i = 0, j = 0; j < argc; ++i, ++j) {
+ if (strstr(argv[j], "-i=")) {
+ // number of arguments will increase by one after splitting
+ num_args++;
+ arguments[i] = new char[3];
+ strcpy(arguments[i++], "-i");
+      // copy the rest of the argument (i.e. post "-i=")
+ arguments[i] = new char[strlen(argv[j]) - 2];
+ strcpy(arguments[i], argv[j] + 3);
+ continue;
+ }
+ arguments[i] = new char[strlen(argv[j]) + 1];
+ strcpy(arguments[i], argv[j]);
+ }
+  // the parse function modifies the arguments pointer, so we need to keep
+  // a copy of the original pointer value to delete it properly
+ char** orig_arg_ptr = arguments;
+ gflags::ParseCommandLineNonHelpFlags(&num_args, &arguments, true);
+ // delete the allocated memory
+ for (int i = 0; i < num_args; ++i) {
+ delete[] orig_arg_ptr[i];
+ }
+ delete[] orig_arg_ptr;
+}
+
+bool CheckAndSetPluginsPath(const char* coredla_root) {
+ // plugins_xml_file should probably be removed in the future
+ if (!FLAGS_plugins_xml_file.empty()) {
+ FLAGS_plugins = FLAGS_plugins_xml_file;
+ slog::warn << "====================================================================" << slog::endl;
+ slog::warn << "Warning: -plugins_xml_file option is deprecated, please use -plugins." << slog::endl;
+ slog::warn << "====================================================================" << slog::endl;
+ }
+
+ const char* coredla_work = std::getenv("COREDLA_WORK");
+ std::string coredla_root_str = coredla_root;
+ if (FLAGS_plugins.empty()) {
+ if (coredla_work == nullptr) {
+ FLAGS_plugins = coredla_root_str + "/runtime/plugins.xml";
+ } else {
+ std::string coredla_work_str = coredla_work;
+ FLAGS_plugins = coredla_work_str + "/runtime/plugins.xml";
+ }
+
+ if (ExistsTest(FLAGS_plugins)) {
+ slog::info << "Using default plugins xml file - " << FLAGS_plugins << slog::endl;
+ return true;
+ }
+ }
+
+ if (ExistsTest(FLAGS_plugins) && isFile(FLAGS_plugins)) {
+ slog::info << "Using custom plugins xml file - " << FLAGS_plugins << slog::endl;
+ return true;
+ }
+ // Check if user wants a shortcut to software emulation xml file if a path does not exist
+ if (FLAGS_plugins.find("emulation") != std::string::npos) {
+ // Potential paths for the plugins_emulation.xml file
+ std::string deployed_loc_plugins = coredla_root_str + "/bin/plugins_emulation.xml";
+ std::string developer_loc_plugins = coredla_root_str + "/build/coredla/dla/bin/plugins_emulation.xml";
+
+ if (ExistsTest(deployed_loc_plugins))
+ FLAGS_plugins = deployed_loc_plugins;
+ else if (ExistsTest(developer_loc_plugins))
+ FLAGS_plugins = developer_loc_plugins;
+ } else {
+    // the -plugins value is neither an existing xml file nor the 'emulation' shortcut, so raise an error
+ throw std::invalid_argument("Invalid argument for -plugins. Use 'emulation' or a path to custom xml file");
+ }
+
+ if (ExistsTest(FLAGS_plugins)) {
+ slog::info << "Using custom emulation xml file - " << FLAGS_plugins << slog::endl;
+ return true;
+ }
+
+ return false;
+}
+
+bool ParseAndCheckCommandLine(int argc, char* argv[], size_t& net_size) {
+ // ---------------------------Parsing and validating input arguments--------------------------------------
+ slog::info << "Parsing input parameters" << slog::endl;
+
+ // Check for any flags that are missing their preceding dashes
+ // GFlags quietly ignores any flags missing their dashes, which can cause
+ // dla_benchmark to run with settings other than what the user intended
+
+ // GFlags supports two different styles of flag:
+ // 1. --<flag>
+ // 2. -<flag>
+ // It also supports two different ways of specifying values for flags which
+ // take values:
+ // 1. --<flag>=<value>
+ // 2. --<flag> <value>
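+  // e.g. "-niter 8", "-niter=8", "--niter 8", and "--niter=8" are all accepted.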
+
+ // If we are not expecting a flag, we are expecting a value for the
+ // preceding flag
+ bool expecting_flag = true;
+ // Start at 1 to skip the command itself
+ for (int i = 1; i < argc; i++) {
+ if (expecting_flag) {
+ // A flag is always denoted by the first char being '-'
+ if (argv[i][0] != '-') {
+ slog::err << "Argument " << argv[i] << " is invalid. You"
+ << " may have forgotten a preceding '-'." << slog::endl;
+ throw std::logic_error("One or more invalid arguments");
+ }
+
+ char* flag_name_start = (argv[i][1] == '-') ? &argv[i][2] : &argv[i][1];
+ std::string flag_name;
+
+ gflags::CommandLineFlagInfo flag_info;
+ if (strstr(flag_name_start, "=")) {
+ flag_name = std::string(flag_name_start, size_t(strstr(flag_name_start, "=") - flag_name_start));
+ } else {
+ flag_name = std::string(flag_name_start);
+ }
+
+ // We expect a flag in the next argv if the current flag is a bool,
+ // because bool flags do not take a value.
+ // If GetCommandLineFlagInfo returns false, we assume the current
+ // flag is a boolean because boolean flags can be specified as
+ // -no<flag>, which is equivalent to -<flag>=false, or the flag
+ // simply being omitted. However, "no<flag>" is not recognized by
+ // GetCommandLineFlagInfo.
+ // Therefore, if the name is not recognized either the flag is a
+ // boolean flag or doesn't exist. In the latter case, gflags errors
+ // when we call ParseCommandLine so we can assume here it's a bool.
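+      // e.g. for the boolean flag -bgr, "-bgr" and "-nobgr" are both legal spellings, but "nobgr" is not a
+      // registered flag name.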
+ if (!GetCommandLineFlagInfo(flag_name.c_str(), &flag_info) || strstr(argv[i], "=") || flag_info.type == "bool") {
+ expecting_flag = true;
+ } else {
+ expecting_flag = false;
+ }
+ } else {
+ // If we were expecting a value, doesn't matter what it is
+ // gflags will check all values are the correct type, and
+ // dla_benchmark checks if the values received are sane
+ expecting_flag = true;
+ }
+ }
+
+ ParseCommandLine(argc, argv);
+
+ if (FLAGS_help || FLAGS_h) {
+ ShowUsage();
+ // CoreDLA: Version 2020.3 of OpenVINO assumes that the PAC board with OPAE on it
+ // is an OpenCL/DLAv1 device. Since it is not, it then errors-out when the device
+    // does not respond as expected to the OpenCL query.
+ // showAvailableDevices();
+ std::cout << "\n";
+ return false;
+ }
+
+ if (FLAGS_hidden_help) {
+ PrintHiddenHelp();
+ return false;
+ }
+
+ if (FLAGS_cm.empty()) {
+ std::string network_file_flag;
+ if (!FLAGS_m.empty()) {
+ if (!FLAGS_network_file.empty()) {
+ throw std::invalid_argument(
+ "Both --network-file and -m are specified. Please only use one of the two arguments.");
+ }
+ network_file_flag = FLAGS_m;
+ } else if (!FLAGS_network_file.empty()) {
+ network_file_flag = FLAGS_network_file;
+ } else {
+ throw std::logic_error("Model is required but not set. Please set -m option.");
+ }
+
+ std::vector<std::string> m_paths = split(network_file_flag, MULTIGRAPH_SEP);
+ net_size = m_paths.size();
+ slog::info << "Found " << net_size << " graph" << (net_size == 1 ? "" : "s") << slog::endl;
+ for (auto& m_path : m_paths) {
+ if (!ExistsTest(m_path)) {
+ slog::err << "network file: " << m_path << " doesn't exist. Please provide a valid path with -m." << slog::endl;
+ throw std::logic_error("Model file path does not exist.");
+ }
+ }
+ } else {
+ std::vector<std::string> m_paths = split(FLAGS_cm, MULTIGRAPH_SEP);
+ net_size = m_paths.size();
+ slog::info << "Found " << net_size << " compiled graph" << (net_size == 1 ? "" : "s") << slog::endl;
+ for (auto& m_path : m_paths) {
+ if (!ExistsTest(m_path)) {
+        slog::err << "compiled model file: " << m_path << " doesn't exist. Please provide a valid path with -cm."
+ << slog::endl;
+ throw std::logic_error("Compiled model file path does not exist.");
+ }
+ }
+ }
+
+ if (FLAGS_api != "async" && FLAGS_api != "sync") {
+ throw std::logic_error("Incorrect API. Please set -api option to `sync` or `async` value.");
+ }
+
+ if (FLAGS_niter <= 0) {
+ throw std::logic_error("-niter is a required flag and its value must be positive");
+ }
+
+ const char* coredla_root = std::getenv("COREDLA_ROOT");
+ if (coredla_root == nullptr) {
+ slog::err << "ERROR: COREDLA_ROOT environment variable is not set." << slog::endl;
+ throw std::logic_error("Please set up correct environment variables first");
+ }
+
+ if (!CheckAndSetPluginsPath(coredla_root)) {
+    slog::err << "plugins file: " << FLAGS_plugins << " doesn't exist. Please provide a valid path."
+              << slog::endl;
+    throw std::logic_error("plugins file path does not exist.");
+ }
+
+ // Checks required arguments for the mAP calculation subroutine.
+ if (FLAGS_enable_object_detection_ap) {
+ if (!FLAGS_yolo_version.size() || !is_yolo_supported(FLAGS_yolo_version)) {
+ slog::err << "Please specify the version of your YOLO graph by setting the -yolo_version option to "
+ "`yolo-v3-tiny-tf` or `yolo-v3-tf` value."
+ << slog::endl;
+ throw std::logic_error("Incorrect YOLO version.");
+ }
+ }
+
+ // Checks if output directory exists and can be opened
+ if (!FLAGS_output_dir.empty()) {
+ if (!ExistsTest(FLAGS_output_dir)) {
+ slog::err << "Specified output directory: " << FLAGS_output_dir << " does not exist" << slog::endl;
+ throw std::logic_error("Output directory does not exist");
+ }
+ // Test whether the path can be opened if it's a directory
+ DirOpenTest(FLAGS_output_dir);
+ }
+
+ return true;
+}
+
+static void next_step(const std::string additional_info = "") {
+ static size_t step_id = 0;
+ static const std::map<size_t, std::string> step_names = {{1, "Parsing and validating input arguments"},
+ {2, "Loading OpenVINO Runtime"},
+ {3, "Setting device configuration"},
+ {4, "Reading the Intermediate Representation network"},
+ {5, "Resizing network to match image sizes and given batch"},
+ {6, "Configuring input of the model"},
+ {7, "Loading the model to the device"},
+ {8, "Setting optimal runtime parameters"},
+ {9, "Creating infer requests and preparing input tensors"},
+ {10, "Measuring performance"},
+ {11, "Dumping statistics report"},
+ {12, "Dumping the output values"}};
+
+ step_id++;
+ if (step_names.count(step_id) == 0)
+    THROW_IE_EXCEPTION << "Step ID " << step_id << " exceeds the total number of steps " << step_names.size();
+
+ std::cout << "[Step " << step_id << "/" << step_names.size() << "] " << step_names.at(step_id)
+ << (additional_info.empty() ? "" : " (" + additional_info + ")") << std::endl;
+}
+
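+// Returns the median of a vector, e.g. {3.0, 1.0, 2.0} -> 2.0; for an even count the two middle values are averaged.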
+template <typename T>
+T GetMedianValue(const std::vector<T>& vec) {
+ std::vector<T> sorted_vec(vec);
+ std::sort(sorted_vec.begin(), sorted_vec.end());
+ return (sorted_vec.size() % 2 != 0)
+ ? sorted_vec[sorted_vec.size() / 2ULL]
+ : (sorted_vec[sorted_vec.size() / 2ULL] + sorted_vec[sorted_vec.size() / 2ULL - 1ULL]) /
+ static_cast<T>(2.0);
+}
+
+void ReadDebugNetworkInfo(ov::Core core) {
+ if (FLAGS_debug_network) {
+ // On hardware timeout exception, fetch Debug CSR values from all modules attached to the Debug Network
+ std::vector<DebugNetworkData> debug_csr_return =
+ core.get_property("FPGA", "COREDLA_DEBUG_NETWORK_INFO").as<std::vector<DebugNetworkData>>();
+ slog::info << "Dumping Debug Network profiling counters" << slog::endl;
+ for (auto i = 0U; i < debug_csr_return.size(); i++) {
+ std::cout << "---------- CoreDLA instance " << i << " ----------" << std::endl;
+ // Print debug info for all instances
+ for (auto& instance_csr_return : debug_csr_return[i]) {
+ std::cout << instance_csr_return.first << ": " << instance_csr_return.second << std::endl;
+ }
+ }
+ }
+}
+
+void PrintLSUCounterInfo(ov::Core core) {
+ std::vector<LSUCounterData> lsu_counter_vec =
+ core.get_property("FPGA", "COREDLA_LSU_ACCESS_COUNT").as<std::vector<LSUCounterData>>();
+ slog::info << "Dumping LSU memory access counters" << slog::endl;
+ for (auto i = 0U; i < lsu_counter_vec.size(); i++) {
+ std::cout << "---------- CoreDLA instance " << i << " ----------" << std::endl;
+ for (const auto& entry : lsu_counter_vec.at(i)) {
+      std::cout << entry.first << ": " << entry.second << std::endl;
+ }
+ }
+}
+
+// Returns true if last char of csv is a comma
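+// (ignoring trailing whitespace), e.g. a file ending in "0.95,\n" returns true while "0.95\n" returns false.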
+bool is_last_char_comma(FILE* file) {
+  if (file == nullptr) return false;
+
+ int i = -1;
+ std::vector<char> white_space_chars = {'\n', ' ', '\t', '\r', '\f', '\v'};
+ char last_char[1];
+ do {
+ if (std::fseek(file, i, SEEK_END) != 0) {
+      return false;
+ }
+ if (std::fread(last_char, 1, 1, file) == 0) {
+      return false;
+ }
+ i--;
+ } while (std::count(white_space_chars.begin(), white_space_chars.end(), last_char[0]) != 0);
+
+ return last_char[0] == ',';
+}
+
+bool fileExists(std::string& path) {
+ struct stat buffer;
+ return (stat(path.c_str(), &buffer) == 0);
+}
+
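+// Appends 'value' to the CSV at 'path' only when the file's last non-whitespace character is a comma,
+// i.e. a previous writer left the final row unfinished; otherwise the file is left untouched.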
+void append_value_if_incomplete_to_csv(std::string path, double value) {
+ try {
+ if (!fileExists(path)) {
+ return;
+ }
+
+ FILE* data_file = fopen(path.c_str(), "rb");
+ if (data_file == nullptr) {
+ return;
+ }
+ bool is_comma = is_last_char_comma(data_file);
+ fclose(data_file);
+
+ if (is_comma) {
+ FILE* append_file = fopen(path.c_str(), "a");
+ if (append_file == nullptr) {
+ return;
+ }
+ fprintf(append_file, "%f\n", value);
+ fclose(append_file);
+ }
+ } catch (...) {
+ return;
+ }
+}
+
+/**
+ * @brief The entry point of the dla benchmark
+ */
+int main(int argc, char* argv[]) {
+ std::shared_ptr<StatisticsReport> statistics;
+ try {
+    // Declaring the CompiledModel objects as pointers to work around the segfault
+    // that occurs when destructing the objects. Because they are declared as pointers,
+    // the compiler won't automatically call the destructor at the end of this scope,
+    // and we don't delete the allocated memory either
+ std::vector<ov::CompiledModel*> compiled_models;
+ size_t net_size = 0; // parse the size of networks for arguments check
+
+ size_t return_code = 0; // universal return code, return this value after dumping out Debug info
+
+ // ----------------- 1. Parsing and validating input arguments -------------------------------------------------
+ next_step();
+
+ if (!ParseAndCheckCommandLine(argc, argv, net_size)) {
+ return 0;
+ }
+
+ bool is_model_compiled = !FLAGS_cm.empty();
+ if (is_model_compiled) {
+ slog::info << "Model is compiled" << slog::endl;
+ }
+
+ std::string arch_file_flag;
+ if (!FLAGS_arch_file.empty()) {
+ if (!FLAGS_arch.empty()) {
+ throw std::invalid_argument(
+ "Both --arch and -arch_file are specified. Please only use one of the two arguments.");
+ }
+ arch_file_flag = FLAGS_arch_file;
+ } else if (!FLAGS_arch.empty()) {
+ arch_file_flag = FLAGS_arch;
+ }
+
+ bool flag_b_default = gflags::GetCommandLineFlagInfoOrDie("b").is_default;
+ bool flag_batch_size_default = gflags::GetCommandLineFlagInfoOrDie("batch_size").is_default;
+
+ size_t batch_size_flag;
+ if (!flag_b_default) {
+ if (!flag_batch_size_default) {
+ throw std::invalid_argument(
+ "Both --batch-size and -b are specified. Please only use one of the two arguments.");
+ }
+ batch_size_flag = FLAGS_b;
+ } else {
+ batch_size_flag = FLAGS_batch_size;
+ }
+
+ if (batch_size_flag > 10000 || batch_size_flag <= 0) {
+ throw std::invalid_argument(
+          "Batch size is too big (>10000) or not a positive number (<=0). Specify a batch size within the "
+          "supported range.");
+ }
+
+ std::string network_file_flag;
+ if (!FLAGS_m.empty()) {
+ if (!FLAGS_network_file.empty()) {
+ throw std::invalid_argument(
+ "Both --network-file and -m are specified. Please only use one of the two arguments.");
+ }
+ network_file_flag = FLAGS_m;
+ } else if (!FLAGS_network_file.empty()) {
+ network_file_flag = FLAGS_network_file;
+ }
+
+ // langsu: ideally use boost to create a sub-folder for ddrfree files
+ // but ed4 toolchain doesn't have boost yet.
+ std::string output_dir;
+ std::string parameter_rom_output_dir;
+ std::string separator = dla::util::path_separator;
+ if (!FLAGS_output_dir.empty()) {
+ output_dir = FLAGS_output_dir + separator;
+ parameter_rom_output_dir = output_dir;
+ } else {
+ output_dir = "." + separator;
+ parameter_rom_output_dir = output_dir;
+ }
+
+ // The set of arguments printed is meant to be a useful summary to the
+ // user, rather than all of the arguments to dla_benchmark
+ slog::info << "Printing summary of arguments being used by dla_benchmark" << slog::endl
+ << "API (-api) ........................... " << FLAGS_api << slog::endl
+ << "Device (-d) .......................... " << FLAGS_d << slog::endl
+ << "Batch size (-b) ...................... " << batch_size_flag << slog::endl
+ << (!FLAGS_cm.empty() ? "Compiled model (-cm) ................. "
+ : "Model (-m) ........................... ")
+ << (!FLAGS_cm.empty() ? FLAGS_cm : network_file_flag) << slog::endl
+ << "Num iterations (-niter) .............. "
+ << (FLAGS_niter > 0 ? std::to_string(FLAGS_niter) : "Not specified") << slog::endl
+ << "Input images directory (-i) .......... "
+ << (!FLAGS_i.empty() ? FLAGS_i : "Not specified, will use randomly-generated images") << slog::endl
+ << "Num CPU threads (-nthreads) .......... "
+ << (FLAGS_nthreads > 0 ? std::to_string(FLAGS_nthreads) : "Not specified") << slog::endl
+ << "Architecture file (-arch_file) ....... " << arch_file_flag << slog::endl
+ << "Num inference requests (-nireq) ...... "
+ << (FLAGS_nireq > 0 ? std::to_string(FLAGS_nireq) : "Not specified") << slog::endl
+             << "Plugins file (-plugins) .............. " << FLAGS_plugins << slog::endl
+ << "Groundtruth file (-groundtruth_loc) .. "
+ << (!FLAGS_groundtruth_loc.empty() ? FLAGS_groundtruth_loc : "Not specified") << slog::endl
+ << "Reverse input image channels (-bgr) .. " << (FLAGS_bgr ? "True" : "False") << slog::endl
+ << "EA features " << (FLAGS_enable_early_access ? "enabled." : "disabled.") << slog::endl;
+
+ if (FLAGS_save_run_summary) {
+ std::vector<gflags::CommandLineFlagInfo> flags;
+ StatisticsReport::Parameters command_line_arguments;
+ gflags::GetAllFlags(&flags);
+
+ for (auto& flag : flags) {
+ if (!flag.is_default) {
+ command_line_arguments.push_back({flag.name, flag.current_value});
+ }
+ }
+
+ if (!FLAGS_pcsort.empty() &&
+ (FLAGS_pcsort != "simple_sort" && FLAGS_pcsort != "sort" && FLAGS_pcsort != "no_sort")) {
+ slog::err << "Invalid -pcsort option: " << FLAGS_pcsort << ". Please use one of sort, simple_sort, no_sort."
+ << slog::endl;
+ return 1;
+ }
+
+ statistics =
+ std::make_shared<StatisticsReport>(StatisticsReport::Config{FLAGS_save_run_summary, FLAGS_report_folder});
+ statistics->addParameters(StatisticsReport::Category::COMMAND_LINE_PARAMETERS, command_line_arguments);
+ }
+
+ /** This vector stores paths to the processed images **/
+ auto multi_input_files = VectorMap<std::vector<std::string>>(
+ SplitMultiInputFilesArguments(net_size), // get input directory list
+ [&](const std::vector<std::string>& input_args) mutable {
+ std::vector<std::string> files;
+ for (auto& input_arg : input_args) {
+ // Test if the path exists
+ if (!ExistsTest(input_arg)) {
+ slog::err << "Specified image path: " << input_arg << " does not exist" << slog::endl;
+ throw std::logic_error("Image path does not exist");
+ }
+ // Test whether the path can be opened if it's a directory
+ DirOpenTest(input_arg);
+ readInputFilesArguments(files, input_arg);
+ }
+ return files;
+ });
+
+ if (multi_input_files.size() == 0) {
+ // failed to read input files
+ slog::err << "Failed to read input files" << slog::endl;
+ return 1;
+ }
+
+ if (FLAGS_nstreams.empty()) {
+      slog::warn << "-nstreams default value is determined automatically for a device." << slog::endl;
+      std::cout << "\tAlthough the automatic selection usually provides reasonable performance,\n"
+                << "\tit may still be non-optimal for some cases; see the README for more information."
+                << std::endl;
+ }
+
+#ifdef DISABLE_JIT
+ if (!network_file_flag.empty()) {
+ slog::err << "Runtime compiled without support for Just-in-Time (JIT) execution!" << slog::endl
+ << "Either specify a compiled model using -cm <compiled_model.bin> "
+ << "or recompile the runtime without the -disable_jit flag." << slog::endl;
+ return 1;
+ }
+#endif
+
+ uint32_t num_batches = 1;
+
+ // ----------------- 2. Loading OpenVINO Runtime/Inference Engine
+ // -----------------------------------------------------------
+ next_step();
+
+ // Get optimal runtime parameters for device
+ std::string device_name = FLAGS_d;
+ if (is_model_compiled) {
+ auto compiled_graph_paths = split(FLAGS_cm, MULTIGRAPH_SEP); // separate each AOT file path
+ for (auto& compiled_graph : compiled_graph_paths) {
+ std::filebuf obj_file_buf;
+ // There does not seem to be a way to get the device from the OpenVINO executable network
+ // Instead we manually read through the xml header in the AOT graph to get the device name (an ugly hack
+ // unfortunately)
+ obj_file_buf.open(compiled_graph.c_str(), std::ios::in | std::ios::binary);
+ std::istream obj_istream(&obj_file_buf);
+ std::string xml_header, current_device;
+ getline(obj_istream, xml_header); // retrieve xml header from AOT bin file
+ if (xml_header.find("TARGET_FALLBACK") != std::string::npos) { // uses hetero plugin
+ int start_index = xml_header.find("TARGET_FALLBACK") + 24;
+ int end_index = xml_header.find("</hetero_config>") - 3;
+ current_device =
+ "HETERO:" + xml_header.substr(start_index, end_index - start_index); // get device from xml header
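+        // Note: the offsets 24 and -3 are tied to the serialized hetero_config header layout around the
+        // TARGET_FALLBACK value (an assumption about the export format; adjust them if that header ever changes).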
+ } else {
+ current_device = "FPGA";
+ }
+ if (device_name == "") { // device flag not specified in AOT flow
+ device_name = current_device;
+ } else {
+ if (current_device != device_name) { // print error for non-matching devices
+ throw std::logic_error(
+ "The AOT file does not target the expected device. "
+ "The device specified to dla_benchmark using the -d flag must be the same as the "
+ "device specified to dla_compiler using the --fplugin flag.");
+ }
+ }
+ }
+ } else {
+ if (device_name == "") device_name = "CPU"; // default device for JIT flow is CPU
+ }
+ ov::Core core(FLAGS_plugins);
+
+ if (device_name.find("CPU") != std::string::npos) {
+ core.set_property("FPGA", {{DLIAPlugin::properties::cpu_used.name(), true}});
+ }
+
+ if (arch_file_flag != "" && device_name.find("FPGA") != std::string::npos) {
+ core.set_property("FPGA", {{DLIAPlugin::properties::arch_path.name(), arch_file_flag}});
+ if (!ExistsTest(arch_file_flag)) {
+ slog::err << "architecture file: " << arch_file_flag << " doesn't exist. Please provide a valid path."
+ << slog::endl;
+ throw std::logic_error("architecture file path does not exist.");
+ }
+ if (FLAGS_encryption_key != "") {
+ core.set_property("FPGA", {{DLIAPlugin::properties::encryption_key.name(), FLAGS_encryption_key}});
+ }
+ if (FLAGS_encryption_iv != "") {
+ core.set_property("FPGA", {{DLIAPlugin::properties::encryption_iv.name(), FLAGS_encryption_iv}});
+ }
+ // If emulator is used, do not perform decryption of compiled results in the import step
+ if (FLAGS_emulator_decryption) {
+ core.set_property("FPGA", {{DLIAPlugin::properties::emulator_decryption.name(), CONFIG_VALUE(YES)}});
+ }
+ if (FLAGS_min_subgraph_layers < 1) {
+ slog::err << "-min-subgraph-layers must be >= 1" << slog::endl;
+ return 1;
+ }
+ core.set_property("FPGA", {{DLIAPlugin::properties::min_subgraph_layers.name(), FLAGS_min_subgraph_layers}});
+ }
+
+ if (device_name.find("CPU") != std::string::npos && !FLAGS_l.empty()) {
+ // CPU extensions is loaded as a shared library and passed as a pointer to base extension
+ core.add_extension(FLAGS_l);
+ slog::info << "CPU extensions is loaded " << FLAGS_l << slog::endl;
+ }
+
+ slog::info << "OpenVINO: " << ov::get_openvino_version() << slog::endl;
+ slog::info << "Device info: " << core.get_versions(device_name) << slog::endl;
+
+ // ----------------- 3. Setting device configuration -----------------------------------------------------------
+ next_step();
+
+ auto devices = ParseDevices(device_name);
+ std::map<std::string, uint32_t> device_nstreams = ParseNStreamsValuePerDevice(devices, FLAGS_nstreams);
+ for (auto& pair : device_nstreams) {
+ auto key = std::string(pair.first + "_THROUGHPUT_STREAMS");
+ std::vector<std::string> supported_config_keys =
+ core.get_property(pair.first, METRIC_KEY(SUPPORTED_CONFIG_KEYS)).as<std::vector<std::string>>();
+ if (std::find(supported_config_keys.begin(), supported_config_keys.end(), key) == supported_config_keys.end()) {
+ throw std::logic_error(
+ "Device " + pair.first + " doesn't support config key '" + key + "'! " +
+ "Please specify -nstreams for correct devices in format <dev1>:<nstreams1>,<dev2>:<nstreams2>");
+ }
+ }
+
+ // pc is for CPU only at the moment
+ bool perf_count = FLAGS_pc;
+ std::string perf_count_sort = FLAGS_pcsort;
+ for (auto& device : devices) {
+ if (device == "CPU") { // CPU supports few special performance-oriented keys
+ if (perf_count || !perf_count_sort.empty()) {
+ core.set_property("CPU", {{CONFIG_KEY(PERF_COUNT), CONFIG_VALUE(YES)}});
+ }
+ // limit threading for CPU portion of inference
+ if (FLAGS_nthreads != 0)
+ core.set_property(device, {{CONFIG_KEY(CPU_THREADS_NUM), std::to_string(FLAGS_nthreads)}});
+ core.set_property(device, {{CONFIG_KEY(CPU_BIND_THREAD), FLAGS_pin}});
+ // Set CPU to optimize throughput
+ core.set_property(device, ov::hint::performance_mode(ov::hint::PerformanceMode::THROUGHPUT));
+ // for CPU execution, more throughput-oriented execution via streams
+ if (FLAGS_api == "async") {
+ core.set_property(
+ device,
+ ov::streams::num(device_nstreams.count(device) > 0 ? ov::streams::Num(device_nstreams.at(device))
+ : ov::streams::AUTO));
+ }
+ device_nstreams[device] = core.get_property(device, ov::streams::num);
+ } else if (device == ("GPU")) {
+ if (FLAGS_api == "async") {
+ core.set_property(
+ device,
+ ov::streams::num(device_nstreams.count(device) > 0 ? ov::streams::Num(device_nstreams.at(device))
+ : ov::streams::AUTO));
+ }
+ device_nstreams[device] = core.get_property(device, ov::streams::num);
+ }
+ }
+
+ auto double_to_string = [](const double number) {
+ std::stringstream ss;
+ ss << std::fixed << std::setprecision(4) << number;
+ return ss.str();
+ };
+ auto get_total_ms_time = [](Time::time_point& start_time) {
+ return std::chrono::duration_cast<ns>(Time::now() - start_time).count() * 0.000001;
+ };
+
+ size_t batch_size = batch_size_flag;
+ std::vector<std::string> topology_names;
+ ov::element::Type precision = ov::element::undefined;
+    // Vector stores which model (multigraph); InputsInfo is a map of input names to their respective
+    // input information
+ std::vector<dla_benchmark::InputsInfo> input_infos;
+ if (!is_model_compiled) {
+#ifndef DISABLE_JIT
+ // We choose to ifdef out this block of code because it's more readable than
+ // pulling the block in the "else" out using ifdefs
+ // ----------------- 4. Reading the Intermediate Representation network ----------------------------------------
+ next_step();
+
+ LOG_AND_PRINT(Logger::INFO, "Loading network files\n");
+
+ auto start_time_read = Time::now();
+ // get list of graphs
+ std::vector<std::shared_ptr<ov::Model>> models =
+ VectorMap<std::shared_ptr<ov::Model>>(split(network_file_flag, MULTIGRAPH_SEP), [&](const std::string& m) {
+ std::shared_ptr<ov::Model> model = core.read_model(m);
+ // Assign rt info IMMEDIATELY when DLA benchmark reads the model.
+ // Applying transformations or reshaping may change node names.
+ // Mixed Precision is an EA only feature for 2024.2
+ if (FLAGS_enable_early_access) {
+ for (auto&& node : model->get_ops()) {
+ if (dla::util::NodeTypeUsesPE(node->get_type_name())) {
+ node->get_rt_info()[DLA_PE_PRECISION_MODE] =
+ dla::util::ParseNodeForRTInfo(node->get_friendly_name(), DLA_PE_PRECISION_MODE);
+ }
+ }
+ }
+ printInputAndOutputsInfoShort(*model);
+ return model;
+ });
+
+ auto duration_ms = double_to_string(get_total_ms_time(start_time_read));
+ slog::info << "Read network(s) took " << duration_ms << " ms" << slog::endl;
+ if (statistics)
+ statistics->addParameters(StatisticsReport::Category::EXECUTION_RESULTS,
+ {{"read network time (ms)", duration_ms}});
+
+ // ----------------- 5. Resizing network to match image sizes and given batch ----------------------------------
+ next_step();
+
+ for (size_t i = 0; i < models.size(); i++) {
+ const auto& model_inputs = std::const_pointer_cast<const ov::Model>(models[i])->inputs();
+ bool reshape = false;
+ input_infos.push_back(
+ GetInputsInfo(batch_size, model_inputs, reshape, FLAGS_bin_data, FLAGS_mean_values, FLAGS_scale_values));
+ if (reshape) {
+ dla_benchmark::PartialShapes shapes = {};
+ for (auto& item : input_infos.back()) shapes[item.first] = item.second.partial_shape;
+ slog::info << "Reshaping model to batch: " << batch_size << slog::endl;
+ models[i]->reshape(shapes);
+ }
+ topology_names.push_back(models[i]->get_friendly_name());
+ }
+
+ // ----------------- 6. Configuring input and output
+ // ----------------------------------------------------------------------
+ next_step();
+ // Set input layouts for all models and their inputs
+ size_t input_info_idx = 0;
+ for (std::shared_ptr<ov::Model> model : models) {
+ auto preproc = ov::preprocess::PrePostProcessor(model);
+ const auto& inputs = model->inputs();
+ for (size_t i = 0; i < inputs.size(); i++) {
+ ov::preprocess::InputInfo& input_info = preproc.input(i);
+ const size_t input_rank = inputs[i].get_partial_shape().size();
+ const ov::Layout& layout = ov::Layout(dla::util::getTensorLayout(input_rank));
+ const ov::element::Type_t type = input_infos[input_info_idx].at(inputs[i].get_any_name()).type;
+ input_info.tensor().set_element_type(type).set_layout(layout);
+ }
+
+ const auto& outputs = model->outputs();
+ for (size_t i = 0; i < outputs.size(); i++) {
+ const size_t output_rank = outputs[i].get_partial_shape().size();
+ const ov::Layout& layout = ov::Layout(dla::util::getTensorLayout(output_rank));
+ preproc.output(i).tensor().set_element_type(ov::element::f32).set_layout(layout);
+ }
+ // Once the build() method is called, the pre(post)processing steps
+ // for layout and precision conversions are inserted automatically
+ model = preproc.build();
+ input_info_idx++;
+ }
+ // ----------------- 7. Loading the model to the device --------------------------------------------------------
+ next_step();
+
+ // Get the value from the command line arguments (if the command line argument wasn't
+ // used by the user the default value set in dla_benchmark.hpp will be used)
+ int folding_option = FLAGS_folding_option;
+ bool fold_preprocessing = FLAGS_fold_preprocessing;
+ bool estimate_per_layer = FLAGS_estimate_per_layer_latencies;
+ bool enable_early_access = FLAGS_enable_early_access;
+ // TODO(arooney): Remove this once LT hang is fixed.
+ bool multi_infer_req = false;
+ if (FLAGS_nireq > 1 && FLAGS_api == "async") {
+ multi_infer_req = true;
+ }
+
+ core.set_property("FPGA", {{DLIAPlugin::properties::folding_option.name(), std::to_string(folding_option)}});
+ core.set_property("FPGA",
+ {{DLIAPlugin::properties::fold_preprocessing.name(), fold_preprocessing}});
+ core.set_property("FPGA",
+ {{DLIAPlugin::properties::per_layer_estimation.name(), estimate_per_layer}});
+ core.set_property("FPGA",
+ {{DLIAPlugin::properties::enable_early_access.name(), enable_early_access}});
+ core.set_property("FPGA",
+ {{DLIAPlugin::properties::multiple_inferences.name(), multi_infer_req}});
+ core.set_property("FPGA", {{DLIAPlugin::properties::streaming_input_pipe.name(), FLAGS_streaming_input_pipe}});
+
+ auto start_time = Time::now();
+ auto individual_start_time = Time::now(); // timer for each individual graph loading
+ compiled_models = VectorMap<ov::CompiledModel*>(models, [&](std::shared_ptr<ov::Model> model) {
+ // Apply Low Precision transformations to handle quantized graphs
+ // Mohamed_I: currently, this only works if the entire graph fits on the FPGA
+ // because the CPU plugin calls common_optimizations again which has some transformations
+ // that cause the graph to fail (I suspect it's the ConvolutionMultiplyFusion, but I
+ // cannot disable it from the CPU)
+
+ bool FPGA_used = device_name.find("FPGA") != std::string::npos;
+ bool CPU_used = device_name.find("CPU") != std::string::npos;
+
+ ov::AnyMap config;
+ config.emplace(DLIAPlugin::properties::cpu_used.name(), CPU_used);
+ config.emplace(DLIAPlugin::properties::export_dir.name(), output_dir);
+ config.emplace(DLIAPlugin::properties::parameter_rom_export_dir.name(), parameter_rom_output_dir);
+
+ for (auto&& node : model->get_ops()) {
+ if (std::string("FakeQuantize") == node->get_type_name()) {
+ config.emplace(DLIAPlugin::properties::apply_low_precision_transforms.name(), true);
+ if (CPU_used && FPGA_used) {
+ std::cerr << "ERROR: Quantized graphs only supported through HETERO:FPGA or CPU." << std::endl;
+ throw std::logic_error("HETERO:FPGA,CPU plugin is not supported for quantization.");
+ }
+ }
+ }
+
+ auto compiled_model = new ov::CompiledModel();
+ *compiled_model = core.compile_model(model, device_name, config);
+ duration_ms = double_to_string(get_total_ms_time(individual_start_time));
+ individual_start_time = Time::now();
+ slog::info << "Compile model ( " << model->get_friendly_name() << " ) took " << duration_ms << " ms"
+ << slog::endl;
+ return compiled_model;
+ });
+ duration_ms = double_to_string(get_total_ms_time(start_time));
+ slog::info << "Load network(s) took " << duration_ms << " ms" << slog::endl;
+ if (statistics)
+ statistics->addParameters(StatisticsReport::Category::EXECUTION_RESULTS,
+ {{"load network time (ms)", duration_ms}});
+#endif
+ } else {
+ next_step();
+ slog::info << "Skipping the step for compiled network" << slog::endl;
+ next_step();
+ slog::info << "Skipping the step for compiled network" << slog::endl;
+ next_step();
+ slog::info << "Skipping the step for compiled network" << slog::endl;
+ // ----------------- 7. Loading the model to the device --------------------------------------------------------
+ next_step();
+ auto compiled_graph_paths = split(FLAGS_cm, MULTIGRAPH_SEP);
+ compiled_models = vectorMapWithIndex<ov::CompiledModel*>(
+ split(FLAGS_cm, MULTIGRAPH_SEP), // get a list of compiled graphs
+ [&](const std::string& compiled_graph_path, size_t index) {
+ std::stringstream generated_name;
+ generated_name << "Graph_" << index;
+ slog::info << "Importing model from " << compiled_graph_paths[index] << " to " << device_name << " as "
+ << generated_name.str() << slog::endl;
+ auto start_time = Time::now();
+ std::ifstream model_stream(compiled_graph_paths[index].c_str(), std::ios_base::in | std::ios_base::binary);
+ if (!model_stream.is_open()) {
+ throw std::runtime_error("Cannot open compiled model file: " + compiled_graph_paths[index]);
+ }
+ auto compiled_model = new ov::CompiledModel();
+ core.set_property("FPGA",
+ {{DLIAPlugin::properties::streaming_input_pipe.name(), FLAGS_streaming_input_pipe}});
+ // Import specific configs
+ ov::AnyMap config;
+ config.emplace(DLIAPlugin::properties::export_dir.name(), output_dir);
+ config.emplace(DLIAPlugin::properties::parameter_rom_export_dir.name(), parameter_rom_output_dir);
+ *compiled_model = core.import_model(model_stream, device_name, config);
+ topology_names.push_back(generated_name.str());
+ model_stream.close();
+ printInputAndOutputsInfoShort(*compiled_model);
+ auto duration_ms = double_to_string(get_total_ms_time(start_time));
+ slog::info << "Import model took " << duration_ms << " ms" << slog::endl;
+ if (statistics)
+ statistics->addParameters(StatisticsReport::Category::EXECUTION_RESULTS,
+ {{"import model time (ms)", duration_ms}});
+ if (batch_size == 0) {
+ batch_size = 1;
+ }
+ const auto& inputs = compiled_model->inputs();
+ for (const auto& item : inputs) {
+ const auto& shape = item.get_shape();
+ if (shape[0] != batch_size) {
+ slog::err << "Batch size of the compiled model is " << shape[0] << " and batch size provided is "
+ << batch_size << slog::endl;
+ std::cout << "Set the same batch size = " << shape[0] << " when running the app" << std::endl;
+ std::cout << "Or recompile model with batch size = " << batch_size << std::endl;
+ exit(5);
+ }
+ }
+ bool reshape_required = false;
+ input_infos.push_back(GetInputsInfo(batch_size,
+ compiled_model->inputs(),
+ reshape_required,
+ FLAGS_bin_data,
+ FLAGS_mean_values,
+ FLAGS_scale_values));
+ return compiled_model;
+ });
+ }
+ // ----------------- 8. Setting optimal runtime parameters -----------------------------------------------------
+ next_step();
+
+ // Number of requests
+ uint32_t nireq = FLAGS_nireq;
+#if defined(__arm__) || defined(__aarch64__)
+    // In the OpenVINO 2022.3 Arm plugin, when an AOT graph is compiled on CPU and dla_benchmark has -nireq > 1,
+    // the program will be killed. We force nireq = 1 for the HETERO:CPU graph only.
+ // Note: -d CPU doesn't need to be checked for AOT because dlac does not support -fplugin CPU.
+ if (device_name == "HETERO:CPU" && nireq > 1) {
+ slog::warn << "-nireq > 1 is not supported for HETERO:CPU graph. Forcing -nireq = 1" << slog::endl;
+ nireq = 1;
+ }
+
+#endif
+
+ if (nireq == 0) {
+ if (FLAGS_api == "sync") {
+ nireq = 1;
+ } else {
+ try {
+ nireq = 0;
+ for (auto& compiled_model : compiled_models) {
+ auto req = compiled_model->get_property(ov::optimal_number_of_infer_requests);
+ if (nireq == 0 || nireq > req) nireq = req;
+ }
+ } catch (const std::exception& ex) {
+ throw ov::Exception("Every device used with the dla_benchmark should support " +
+ std::string(ov::optimal_number_of_infer_requests.name()) +
+                              ". Failed to query the metric for the " + device_name + " with error: " + ex.what());
+ }
+ }
+ }
+#ifdef MAX_NUM_INFERENCE_REQUEST
+ if (nireq > MAX_NUM_INFERENCE_REQUEST) {
+    slog::warn << "-nireq > " << MAX_NUM_INFERENCE_REQUEST
+               << " is not supported for the underlying device. Forcing -nireq = 1" << slog::endl;
+ nireq = 1;
+ }
+#endif
+
+ // Iteration limit
+ uint32_t niter = FLAGS_niter;
+ if (niter > 0) {
+ // Round up niter to a multiple of nireq
+ niter = ((niter + nireq - 1) / nireq) * nireq;
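+      // e.g. -niter=10 with nireq=4 is rounded up to 12 so that every request runs the same number of iterations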
+      // We previously checked that FLAGS_niter > 0, so it is okay to cast to uint.
+ if (static_cast<uint32_t>(FLAGS_niter) != niter) {
+ slog::warn << "Number of iterations was aligned by request number from " << FLAGS_niter << " to " << niter
+ << " using number of requests " << nireq << slog::endl;
+ }
+ num_batches = niter;
+    }
+
+ // Graph-request limit on device
+ if (device_name.find("FPGA") != std::string::npos) {
+ int ip_num_instances = core.get_property("FPGA", "COREDLA_NUM_INSTANCES").as<int>();
+ int numOutstandingInferRequest = nireq * net_size / ip_num_instances;
+ int maxOutstandingInferRequest = core.get_property("FPGA", "COREDLA_DMA_CSR_DESCRIPTOR_QUEUE_SIZE").as<int>();
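+      // e.g. nireq=4 with 2 graphs on a single IP instance means 8 outstanding requests must fit in the queue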
+ if (maxOutstandingInferRequest > 0 && numOutstandingInferRequest > maxOutstandingInferRequest) {
+ slog::err << "Possible number of outstanding inference requests per instance (" << numOutstandingInferRequest
+ << ") "
+ << "exceeds the CSR descriptor queue limit (" << maxOutstandingInferRequest << ")" << slog::endl;
+ return 1;
+ }
+ }
+
+ if (statistics) {
+ for (auto& topology_name : topology_names) {
+ statistics->addParameters(StatisticsReport::Category::RUNTIME_CONFIG,
+ {
+ {"topology", topology_name},
+ {"target device", device_name},
+ {"API", FLAGS_api},
+ {"precision", std::string(precision.get_type_name())},
+ {"batch size", std::to_string(batch_size)},
+ {"number of iterations", std::to_string(niter)},
+ {"number of parallel infer requests", std::to_string(nireq)},
+ });
+ }
+ for (auto& nstreams : device_nstreams) {
+ std::stringstream ss;
+ ss << "number of " << nstreams.first << " streams";
+ statistics->addParameters(StatisticsReport::Category::RUNTIME_CONFIG,
+ {
+ {ss.str(), std::to_string(nstreams.second)},
+ });
+ }
+ }
+
+ // ----------------- 9. Creating infer requests and filling input blobs ----------------------------------------
+ next_step();
+
+ // Data structure hierarchy
+ // Outermost vec: which model it corresponds to (multigraph)
+ // Map: input/output name and its corresponding TensorVector
+    // TensorVector: an alias for vector<ov::Tensor> where each element corresponds to one batch
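+    // e.g. input_data_tensors[net][name][b] is the tensor fed to input 'name' of graph 'net' for batch index 'b'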
+ std::vector<std::map<std::string, ov::TensorVector>> input_data_tensors;
+ std::vector<std::map<std::string, ov::TensorVector>> output_tensors(compiled_models.size());
+
+ std::vector<std::unique_ptr<InferRequestsQueue>> infer_request_queues;
+ const std::string resize_type = FLAGS_resize_type.empty() ? "resize" : FLAGS_resize_type;
+ for (size_t net_idx = 0; net_idx < compiled_models.size(); net_idx++) {
+      // Handle the case where the same inputs are used for all networks
+ const auto& inputFiles =
+ net_idx >= multi_input_files.size() ? multi_input_files.back() : multi_input_files[net_idx];
+ input_data_tensors.push_back(GetStaticTensors(inputFiles.empty() ? std::vector<std::string>{} : inputFiles,
+ batch_size,
+ input_infos[net_idx],
+ num_batches,
+ resize_type,
+ FLAGS_bgr,
+ FLAGS_bin_data,
+ FLAGS_verbose));
+ // Use unique_ptr to create InferRequestsQueue objects and avoid copying mutex and cv
+ infer_request_queues.push_back(
+ std::move(std::unique_ptr<InferRequestsQueue>(new InferRequestsQueue(*(compiled_models[net_idx]), nireq))));
+ }
+
+ // ----------------- 10. Measuring performance ------------------------------------------------------------------
+ size_t progress_bar_total_count = progressBarDefaultTotalCount;
+
+ std::stringstream ss;
+    ss << "Start inference " << FLAGS_api << "hronously";
+ if (FLAGS_api == "async") {
+ if (!ss.str().empty()) {
+ ss << ", ";
+ }
+ ss << infer_request_queues.size() * infer_request_queues.at(0)->requests.size() << " inference requests";
+ std::stringstream device_ss;
+ for (auto& nstreams : device_nstreams) {
+ if (!device_ss.str().empty()) {
+ device_ss << ", ";
+ }
+ device_ss << nstreams.second << " streams for " << nstreams.first;
+ }
+ if (!device_ss.str().empty()) {
+ ss << " using " << device_ss.str();
+ }
+ }
+ ss << ", limits: " << niter << " iterations with each graph, " << compiled_models.size() << " graph(s)";
+ progress_bar_total_count = niter;
+ next_step(ss.str());
+
+ /** Start inference & calculate performance **/
+    /** align the number of iterations to guarantee that the last infer requests are executed under the same conditions **/
+ ProgressBar progress_bar(progress_bar_total_count, FLAGS_stream_output, FLAGS_progress);
+ std::vector<size_t> iterations(compiled_models.size(), 0);
+ try {
+ while ((niter != 0LL && iterations.back() < niter) || (FLAGS_api == "async" && iterations.back() % nireq != 0)) {
+ // set up all infer request and prep all i/o Blobs
+ for (size_t net_id = 0; net_id < compiled_models.size(); net_id++) {
+ for (size_t iireq = 0; iireq < nireq; iireq++) {
+ auto infer_request = infer_request_queues.at(net_id)->get_idle_request();
+ if (!infer_request) {
+ THROW_IE_EXCEPTION << "No idle Infer Requests!";
+ }
+
+ if (niter != 0LL) {
+ const auto& outputs = compiled_models[net_id]->outputs();
+ for (const auto& output : outputs) {
+ const std::string& name = output.get_any_name();
+ output_tensors.at(net_id)[name].emplace_back(output.get_element_type(), output.get_shape());
+ infer_request->set_tensor(output, output_tensors.at(net_id).at(name).at(iterations.at(net_id)));
+ }
+ const auto& inputs = compiled_models[net_id]->inputs();
+ for (auto& input : inputs) {
+ const std::string& name = input.get_any_name();
+ const auto& data = input_data_tensors.at(net_id).at(name)[iterations.at(net_id)];
+ infer_request->set_tensor(input, data);
+ }
+ }
+
+ // Execute one request/batch
+ if (FLAGS_api == "sync") {
+ infer_request->infer();
+ } else {
+              // As the inference request is currently idle, the wait() adds no additional overhead (and should return
+              // immediately). The primary reason for calling the method is exception checking/re-throwing. The callback
+              // that governs the actual execution can handle errors as well, but since it uses only error codes it has
+              // no details like the 'what()' method of std::exception. So, we recheck for any exceptions here.
+ infer_request->wait();
+ infer_request->start_async();
+ }
+ iterations.at(net_id)++;
+ if (net_id == compiled_models.size() - 1) {
+ progress_bar.addProgress(1);
+ }
+ }
+ }
+ }
+
+      // wait for the latest inference executions
+ for (auto& infer_request_queue : infer_request_queues) {
+ infer_request_queue->wait_all();
+ }
+ } catch (const std::exception& ex) {
+ slog::err << "Inference failed:" << slog::endl;
+ slog::err << ex.what() << slog::endl;
+ ReadDebugNetworkInfo(core);
+ PrintLSUCounterInfo(core);
+ // Instead of setting return_code = 1 and continuing, exit immediately.
+ // High risk of segfaulting / weird behavior when inference fails.
+ return 1;
+ }
+
+ size_t iteration = iterations.back();
+
+ std::vector<double> all_latencies;
+ auto start_time = infer_request_queues.at(0)->get_start_time();
+ auto end_time = infer_request_queues.at(0)->get_end_time();
+ for (auto& infer_request_queue : infer_request_queues) {
+ auto& latencies = infer_request_queue->get_latencies();
+ all_latencies.insert(all_latencies.end(), latencies.begin(), latencies.end());
+ start_time = std::min(start_time, infer_request_queue->get_start_time());
+ end_time = std::max(end_time, infer_request_queue->get_end_time());
+ }
+ double latency = GetMedianValue<double>(all_latencies);
+ double total_duration = std::chrono::duration_cast<ns>(end_time - start_time).count() * 0.000001;
+ double total_fps = (FLAGS_api == "sync")
+ ? compiled_models.size() * batch_size * 1000.0 / latency
+ : compiled_models.size() * batch_size * 1000.0 * iteration / total_duration;
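+    // sync mode reports graphs * batch_size / median latency; async mode divides the total number of frames
+    // (graphs * batch_size * iterations) by the wall-clock duration across all request queues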
+
+ int ip_num_instances = 0;
+ double ip_duration = 0.0;
+ double ip_fps = 0.0;
+ double ip_fps_per_fmax = 0.0;
+ double estimated_ipFps = 0.0;
+ double estimated_ipFpsPerFmax = 0.0;
+ double fmax_core = -1.0;
+ double estimated_ipFps_assumed_fmax = 0.0;
+ if (device_name.find("FPGA") != std::string::npos) {
+ ip_num_instances = core.get_property("FPGA", "COREDLA_NUM_INSTANCES").as<int>();
+ // even if hardware has 2 instances, only 1 instance actually gets used if only 1 inference is performed
+ size_t ip_num_instances_used = std::min((size_t)ip_num_instances, iteration);
+ ip_duration = core.get_property("FPGA", "IP_ACTIVE_TIME").as<double>();
+    if (ip_duration != 0.0) {
+      ip_fps = (FLAGS_api == "sync")
+                   ? compiled_models.size() * batch_size * 1000.0 / latency / ip_num_instances_used
+                   : compiled_models.size() * batch_size * 1000.0 * iteration / ip_duration / ip_num_instances_used;
+ fmax_core = core.get_property("FPGA", "COREDLA_CLOCK_FREQUENCY").as<double>();
+ if (fmax_core > 0.0) {
+ ip_fps_per_fmax = ip_fps / fmax_core;
+ } else {
+ slog::warn << "Warning: could not estimate clk_dla frequency on the FPGA" << slog::endl;
+ }
+ }
+
+ if (FLAGS_perf_est && (device_name.find("FPGA") != std::string::npos)) {
+ if (is_model_compiled) {
+ // Ahead of Time Flow: getting the imported, precalculated performance estimate
+ estimated_ipFps = core.get_property("FPGA", "IMPORT_PERFORMANCE_EST").as<double>();
+ if (estimated_ipFps < 0)
+ slog::warn << "Missing performance estimation from at least one of the compiled graphs" << slog::endl;
+ estimated_ipFps_assumed_fmax = core.get_property("FPGA", "IMPORT_PERFORMANCE_EST_ASSUMED_FMAX").as<double>();
+ } else {
+#ifndef DISABLE_JIT
+ // Just In Time Flow: running the performance estimate
+ if (fmax_core > 0.0) {
+#if defined(_WIN32) || defined(_WIN64)
+ _putenv_s("PERF_EST_COREDLA_FMAX", double_to_string(fmax_core).c_str());
+ _putenv_s("PERF_EST_PE_FMAX", double_to_string(fmax_core).c_str());
+#else
+ setenv("PERF_EST_COREDLA_FMAX", double_to_string(fmax_core).c_str(), true);
+ setenv("PERF_EST_PE_FMAX", double_to_string(fmax_core).c_str(), true);
+#endif
+ estimated_ipFps_assumed_fmax = fmax_core;
+ } else {
+// In case the fmax_core variable is not set, we use the estimated fmax values for AGX7 and A10.
+// This if statement is just defensive programming for a condition that should not happen.
+#ifdef DE10_AGILEX
+ estimated_ipFps_assumed_fmax = GetEnvOrDefault("PERF_EST_COREDLA_FMAX", 500); // AGX7 fMAX estimate
+#else
+ estimated_ipFps_assumed_fmax = GetEnvOrDefault("PERF_EST_COREDLA_FMAX", 265); // A10 fMAX estimate
+#endif
+ slog::warn
+ << "Warning: could not estimate clk_dla frequency on the FPGA, setting the fmax to default value."
+ << slog::endl;
+#if defined(_WIN32) || defined(_WIN64)
+ _putenv_s("PERF_EST_COREDLA_FMAX", double_to_string(estimated_ipFps_assumed_fmax).c_str());
+ _putenv_s("PERF_EST_PE_FMAX", double_to_string(estimated_ipFps_assumed_fmax).c_str());
+#else
+ setenv("PERF_EST_COREDLA_FMAX", double_to_string(estimated_ipFps_assumed_fmax).c_str(), true);
+ setenv("PERF_EST_PE_FMAX", double_to_string(estimated_ipFps_assumed_fmax).c_str(), true);
+#endif
+ }
+ estimated_ipFps = core.get_property("FPGA", "PLUGIN_PERFORMANCE_EST").as<double>();
+#endif
+ }
+ estimated_ipFpsPerFmax = estimated_ipFps / estimated_ipFps_assumed_fmax;
+ }
+ }
+
+ if (statistics) {
+ statistics->addParameters(StatisticsReport::Category::EXECUTION_RESULTS,
+ {
+ {"total execution time (ms)", double_to_string(total_duration)},
+ {"IP active time (ms)", double_to_string(ip_duration)},
+ {"total number of iterations", std::to_string(iteration)},
+ });
+ if (device_name.find("MULTI") == std::string::npos) {
+ statistics->addParameters(StatisticsReport::Category::EXECUTION_RESULTS,
+ {
+ {"latency (ms)", double_to_string(latency)},
+ });
+ }
+ statistics->addParameters(
+ StatisticsReport::Category::EXECUTION_RESULTS,
+ {{"throughput", double_to_string(total_fps)}, {"IP throughput", double_to_string(ip_fps)}});
+ }
+
+ progress_bar.finish();
+
+ // ----------------- 11. Dumping statistics report -------------------------------------------------------------
+ next_step();
+
+ if (perf_count || !perf_count_sort.empty()) {
+ std::vector<std::vector<ov::ProfilingInfo>> perfCounts;
+ for (size_t ireq = 0; ireq < nireq; ireq++) {
+ auto reqPerfCounts = infer_request_queues.at(0)->requests[ireq]->get_performance_counts();
+ perfCounts.push_back(reqPerfCounts);
+ }
+ if (statistics) {
+ if (perf_count_sort == "sort") {
+ statistics->printPerfCountersSort(perfCounts, "sort");
+ } else if (perf_count_sort == "simple_sort") {
+ statistics->printPerfCountersSort(perfCounts, "simple_sort");
+ } else {
+ statistics->printPerfCountersSort(perfCounts, "no_sort");
+ }
+ }
+ }
+
+ // dla_benchmark originally also implemented more detailed performance
+    // statistics via InferRequest's getPerformanceCounts function.
+    // We did not support it, so we removed it. If we want to re-implement it,
+ // looking at the latest version of OpenVINO's benchmark_app or our git
+ // history would be a good starting point
+ if (statistics) {
+ statistics->dump();
+ }
+
+ std::cout << "count: " << iteration << " iterations" << std::endl;
+ std::cout << "system duration: " << double_to_string(total_duration) << " ms" << std::endl;
+ if (ip_duration != 0.0) std::cout << "IP duration: " << double_to_string(ip_duration) << " ms" << std::endl;
+ if (device_name.find("MULTI") == std::string::npos)
+ std::cout << "latency: " << double_to_string(latency) << " ms" << std::endl;
+ std::cout << "system throughput: " << double_to_string(total_fps) << " FPS" << std::endl;
+ if (ip_num_instances != 0) std::cout << "number of hardware instances: " << ip_num_instances << std::endl;
+ if (compiled_models.size() != 0)
+ std::cout << "number of network instances: " << compiled_models.size() << std::endl;
+ if (ip_fps != 0.0) std::cout << "IP throughput per instance: " << double_to_string(ip_fps) << " FPS" << std::endl;
+ if (ip_fps_per_fmax != 0.0)
+ std::cout << "IP throughput per fmax per instance: " << double_to_string(ip_fps_per_fmax) << " FPS/MHz"
+ << std::endl;
+ if (fmax_core > 0.0) std::cout << "IP clock frequency: " << double_to_string(fmax_core) << " MHz" << std::endl;
+ if (estimated_ipFps != 0.0)
+ std::cout << "estimated IP throughput per instance: " << double_to_string(estimated_ipFps) << " FPS ("
+ << (int)estimated_ipFps_assumed_fmax << " MHz assumed)" << std::endl;
+ if (estimated_ipFpsPerFmax != 0.0)
+ std::cout << "estimated IP throughput per fmax per instance: " << double_to_string(estimated_ipFpsPerFmax)
+ << " FPS/MHz" << std::endl;
+
+ // ----------------- 12. Dumping output values -------------------------------------------------------------
+ next_step();
+
+ if (FLAGS_dump_output) {
+ for (size_t i = 0; i < compiled_models.size(); i++) {
+ std::vector<ov::Output<const ov::Node>> output_info = compiled_models[i]->outputs();
+ // For multi-outputs: Sort to ensure the order of each tensor dump aligns with the ground truth files
+ std::sort(output_info.begin(), output_info.end(), CompareOutputNodeNames);
+ const auto& output_tensors_map = output_tensors[i];
+      // Flags for whether we can dump the output tensor to a text file, e.g. due to an unsupported
+      // layout or an oversized tensor; they are set during the first dump attempt.
+ bool can_dump_txt = true;
+ bool can_dump_layout_info_in_txt = true;
+ // dump output tensor as bin, which can be loaded using Python Numpy
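+      // (e.g. np.fromfile("result_0.bin", dtype=np.float32), assuming the dump is raw float32 data)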
+ std::regex pattern("\\{batch\\}");
+ std::string results_bin_file_name = output_dir + "result_{batch}.bin";
+ // dump output tensor as text
+ // backward compatibility support for old regtests that used only one graph
+ std::string results_txt_file_name = output_dir + "result.txt";
+ std::string results_boundaries_file_name = output_dir + "result_tensor_boundaries.txt";
+ // dump inference arguments and metadata as JSON
+ std::string results_meta_file_name = output_dir + "result_meta.json";
+
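+      // With multiple graphs, prefix each result file with the topology name so the dumps do not collide.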
+ if (compiled_models.size() > 1) {
+ results_bin_file_name = output_dir + topology_names[i] + "_result_{batch}.bin";
+ results_txt_file_name = output_dir + topology_names[i] + "_result.txt";
+ results_boundaries_file_name = output_dir + topology_names[i] + "_result_tensor_boundaries.txt";
+ results_meta_file_name = output_dir + topology_names[i] + "_result_meta.json";
+ }
+
+ slog::info << "Dumping result of " << topology_names[i]
+ << " to " << results_txt_file_name << slog::endl;
+ slog::info << "Dumping per-batch result (raw output) of " << topology_names[i]
+ << " to " << results_bin_file_name << slog::endl;
+ slog::info << "Dumping inference meta data of " << topology_names[i]
+ << " to " << results_meta_file_name << slog::endl;
+
+ std::ofstream result_txt_file(results_txt_file_name);
+ std::ofstream results_boundaries(results_boundaries_file_name);
+ std::ofstream result_meta_file(results_meta_file_name);
+
+ dla_benchmark::InferenceMetaData result_metadata;
+ result_metadata.input_files = multi_input_files.at(i); // all input files in -i
+ result_metadata.groundtruth_loc = FLAGS_groundtruth_loc;
+ result_metadata.batch_size = FLAGS_batch_size;
+ result_metadata.niter = niter;
+ result_metadata.nireq = nireq;
+ result_metadata.model_input_info = input_infos[i];
+ dla_benchmark::OutputsInfoVec model_output_info;
+
+ uint32_t current_lines = 1;
+ size_t max_allowed_megabytes_to_dump = FLAGS_max_output_file_size;
+
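+      // Each batch gets its own raw .bin dump; the text and boundaries files accumulate
+      // across batches, and the metadata JSON is written once after the loop.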
+ for (uint32_t batch = 0; batch < num_batches; batch++) {
+ std::string per_batch_results_bin_file_name = std::regex_replace(results_bin_file_name,
+ pattern,
+ std::to_string(batch));
+ std::ofstream per_batch_results_bin_file(per_batch_results_bin_file_name, std::ios::binary);
+
+ for (const auto& item : output_info) {
+ auto tensor = output_tensors_map.at(item.get_any_name()).at(batch);
+ unsigned int output_size = tensor.get_size() / batch_size;
+
+ const ov::Layout& layout = ov::layout::get_layout(item);
+ const auto& shape = tensor.get_shape();
+ const std::string& name = item.get_any_name();
+ size_t total_bytes_to_dump = tensor.get_size() * niter * sizeof(float);
+
+ if (can_dump_txt) {
+              // If the tensor is too large to dump as text, set can_dump_txt to false and write a one-time message
+ if (total_bytes_to_dump > max_allowed_megabytes_to_dump * BYTE_TO_MEGABYTE) {
+ can_dump_txt = false;
+ std::string msg = "Output tensor (" + std::to_string(total_bytes_to_dump / BYTE_TO_MEGABYTE) +
+ " MB) "
+                                  "is too large to dump. Change the environment variable MAX_DUMP_OUTPUT_TXT (default " +
+ std::to_string(FLAGS_max_output_file_size) + " MB) to allow dumping larger tensors";
+ slog::warn << msg << slog::endl;
+ result_txt_file << msg;
+ } else {
+ if (can_dump_layout_info_in_txt && shape.size() != 2 && shape.size() != 4 && shape.size() != 5) {
+ can_dump_layout_info_in_txt = false;
+                  slog::warn << "Output data tensor has a rank that is not 2, 4 or 5; layout info will not be dumped in "
+                             << "result.txt." << slog::endl;
+ }
+                // Otherwise, dump the text and record in result_tensor_boundaries.txt which lines of
+                // result.txt this tensor occupies, along with its layout and dimensions
+ results_boundaries << name << ": Line " << current_lines << " to "
+ << "line " << current_lines + output_size - 1 << std::endl;
+ results_boundaries << name << " output layout: " << layout.to_string() << std::endl;
+ results_boundaries << name << " output dimension:";
+ for (unsigned int dim = 0; dim < shape.size(); dim++) {
+ results_boundaries << " " << shape[dim];
+ }
+ results_boundaries << std::endl;
+ current_lines = current_lines + output_size;
+ DumpResultTxtFile(tensor, item, output_size, result_txt_file);
+ }
+ }
+ DumpResultBinFile(tensor, per_batch_results_bin_file);
+
+ if (batch == 0) {
+ // all batches should have the same output info
+            dla_benchmark::OutputInfo out_info;  // named to avoid shadowing the output_info vector above
+            out_info.name = name;
+            out_info.shape = shape;
+            model_output_info.push_back(out_info);
+ }
+ }
+ per_batch_results_bin_file.close();
+ }
+
+ result_metadata.model_output_info = model_output_info;
+ DumpResultMetaJSONFile(result_metadata, result_meta_file);
+ result_txt_file.close();
+ results_boundaries.close();
+ result_meta_file.close();
+ }
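+    // Write a short human-readable throughput report alongside the tensor dumps.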
+ const std::string throughput_file_name = output_dir + "throughput_report.txt";
+ std::ofstream throughput_file;
+ throughput_file.open(throughput_file_name);
+ throughput_file << "Throughput : " << total_fps << " fps" << std::endl;
+ throughput_file << "Batch Size : " << batch_size << std::endl;
+ throughput_file << "Graph number : " << compiled_models.size() << std::endl;
+ throughput_file << "Num Batches : " << num_batches << std::endl;
+ throughput_file.close();
+
+    // Append the IP throughput to the dataset CSV if it is incomplete.
+    // Check both the gzipped and non-gzipped versions.
+ std::string dataset_gz_file_name = "data.csv.gz";
+ append_value_if_incomplete_to_csv(dataset_gz_file_name, ip_fps);
+ std::string dataset_file_name = "data.csv";
+ append_value_if_incomplete_to_csv(dataset_file_name, ip_fps);
+ }
+
+  // Accuracy checking: when a ground truth location is provided, compute top-1/top-5 results
+  // by default, or object detection AP metrics (mAP / COCO AP) when FLAGS_enable_object_detection_ap is set
+ if (FLAGS_groundtruth_loc != "") {
+ auto groundtruth_files = split(FLAGS_groundtruth_loc, MULTIGRAPH_SEP);
+ for (size_t i = 0; i < compiled_models.size(); i++) {
+ // This flag `FLAGS_enable_object_detection_ap` enables accuracy checking subroutine that
+ // gives the mAP and COCO AP scores. These scores are two of the main detection evaluation
+ // metrics used in the Common Objects in Context contest, https://cocodataset.org/#detection-eval.
+
+ std::vector<ov::Output<const ov::Node>> output_info = compiled_models[i]->outputs();
+ // For multi-outputs: Sort to ensure the order of each tensor dump aligns with the ground truth files
+ std::sort(output_info.begin(), output_info.end(), CompareOutputNodeNames);
+ // Run the default top-1, top-5 evaluation routine if AP scores are not required.
+ if (!FLAGS_enable_object_detection_ap) {
+ if (groundtruth_files.size() <= i) {
+ slog::warn << "Missing ground truth file for " << topology_names[i] << "! SKIPPED" << slog::endl;
+          continue;  // keep going so a warning is printed for every graph that is missing a ground truth file
+ }
+ slog::info << "Comparing ground truth file " << groundtruth_files[i] << " with network " << topology_names[i]
+ << slog::endl;
+ // captures the results in higher precision for accuracy analysis
+ std::vector<float> results;
+ const auto& output_tensors_map = output_tensors[i];
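+        // Flatten all output tensors into a single float vector ordered as
+        // [batch][image][sorted output tensor] so it lines up with the ground truth file.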
+ for (uint32_t batch = 0; batch < num_batches; batch++) {
+ for (unsigned int img = 0; img < batch_size; img++) {
+ for (const auto& item : output_info) {
+ auto tensor = output_tensors_map.at(item.get_any_name()).at(batch);
+ auto tensor_data = tensor.data<float>();
+ unsigned int output_size = tensor.get_size() / batch_size;
+ size_t offset = img * output_size;
+ for (unsigned int j = 0; j < output_size; j++) {
+ results.push_back(tensor_data[j + offset]);
+ }
+ }
+ }
+ }
+ bool passed = TopResultsAnalyser::get_top_results(groundtruth_files[i], results, batch_size * num_batches);
+ if (passed) {
+ slog::info << "Get top results for \"" << topology_names[i] << "\" graph passed" << slog::endl;
+ } else {
+ // return 4 indicates that the accuracy of the result was below the threshold
+ return_code = 4;
+ }
+ } else {
+ // Runs the accuracy checking routine if AP scores are required.
+ set_runtime(FLAGS_yolo_version, FLAGS_niter, batch_size_flag, FLAGS_i, FLAGS_groundtruth_loc);
+ std::pair<double, double> res =
+ validate_yolo_wrapper(output_tensors[i], output_info, multi_input_files.at(0));
+ std::cout << std::endl;
+ slog::info << "Batch metrics results:" << slog::endl;
+ std::cout << "Detection - mAP@0.5: " << std::setprecision(6) << res.first * 100 << "%" << std::endl;
+ std::cout << "Detection - mAP@0.5:0.95: " << std::setprecision(6) << res.second * 100 << "%" << std::endl;
+ }
+ }
+ }
+ // Output Debug Network Info if COREDLA_TEST_DEBUG_NETWORK is set
+ ReadDebugNetworkInfo(core);
+ if (FLAGS_report_lsu_counters) {
+ PrintLSUCounterInfo(core);
+ }
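+  // Propagate a non-zero return code (e.g. 4 for an accuracy failure) to the caller.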
+ if (return_code) return return_code;
+ } catch (const std::exception& ex) {
+ slog::err << ex.what() << slog::endl;
+
+ if (statistics) {
+ statistics->addParameters(StatisticsReport::Category::EXECUTION_RESULTS,
+ {
+ {"Error during dla_benchmark: ", ex.what()},
+ });
+ statistics->dump();
+ }
+
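+    // return 3 indicates that an exception was thrown during dla_benchmark execution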
+ return 3;
+ }
+
+ return 0;
+ // Bypass long function lint check
+ // NOLINTNEXTLINE(readability/fn_size)
+}