Diffstat (limited to 'python/openvino/runtime/dla_benchmark/main.cpp')
 python/openvino/runtime/dla_benchmark/main.cpp | 1575 +++++++++++++++++++++++
 1 file changed, 1575 insertions(+), 0 deletions(-)
diff --git a/python/openvino/runtime/dla_benchmark/main.cpp b/python/openvino/runtime/dla_benchmark/main.cpp
new file mode 100644
index 0000000..9d9055d
--- /dev/null
+++ b/python/openvino/runtime/dla_benchmark/main.cpp
@@ -0,0 +1,1575 @@
+// Copyright (C) 2018-2023 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+// Description: Main file of the DLA benchmark: the entry point for just-in-time and ahead-of-time
+// execution, and for any use case in which DLA performs inference. This file is responsible for the
+// end-to-end flow of DLA: reading user input arguments, creating input tensors, compiling models,
+// running inference, and dumping results. The DLA benchmark is loosely based on OpenVINO's sample
+// benchmark app; for future OpenVINO uplifts, their sample app is a good place to start.
+// Ref: [openvinotoolkit/openvino › samples/cpp/benchmark_app/main.cpp]
+
+#include <algorithm>
+#include <chrono>
+#include <cstddef>
+#include <cstring>
+#include <map>
+#include <memory>
+#include <sstream>
+#include <stdexcept>
+#include <string>
+#include <utility>
+#include <vector>
+#if defined(_WIN32) || defined(_WIN64)
+#include <io.h>
+#define NOMINMAX
+#include <Windows.h>
+#else
+#include <dirent.h>
+#include <unistd.h>
+#endif
+#include <stdio.h>
+#include <sys/stat.h>
+#include <fstream>
+#include <regex>
+
+#include <samples/args_helper.hpp>
+#include <samples/common.hpp>
+#include <samples/slog.hpp>
+
+// DLA utils
+#include "dla_stl_utils.h"
+#include "dla_defines.h"
+
+// DLA benchmark
+#include "average_precision.hpp"
+#include "dla_benchmark.hpp"
+#include "dla_plugin_config.hpp"
+#include "infer_request_wrap.hpp"
+#include "inputs_filling.hpp"
+#include "progress_bar.hpp"
+#include "statistics_report.hpp"
+#include "top1_top5.hpp"
+#include "utils.hpp"
+
+using DebugNetworkData = std::map<std::string, uint64_t>;
+using LSUCounterData = std::map<std::string, uint64_t>;
+
+static const size_t progressBarDefaultTotalCount = 1000;
+
+// Get the value of the env variable named 'name', if it exists.
+// If not, return the provided default value.
+template <class T>
+T GetEnvOrDefault(const char* name, T default_value) {
+  char* str_val = std::getenv(name);
+  T result = default_value;
+  if (str_val != NULL) {
+    std::stringstream ss;
+    ss << str_val;
+    ss >> result;
+  }
+  return result;
+}
+
+bool ExistsTest(const std::string& name) {
+  struct stat buffer;
+  return (stat(name.c_str(), &buffer) == 0);
+}
+
+bool isFile(const std::string& path) {
+#if defined(_WIN32) || defined(_WIN64)
+  // Windows-specific implementation
+  DWORD fileAttr = GetFileAttributesA(path.c_str());
+  if (fileAttr == INVALID_FILE_ATTRIBUTES) {
+    // The path does not exist or an error occurred.
+    return false;
+  }
+  // Check if it's not a directory.
+  return !(fileAttr & FILE_ATTRIBUTE_DIRECTORY);
+#else
+  // UNIX-specific implementation
+  struct stat buffer;
+  if (stat(path.c_str(), &buffer) == 0) {
+    return S_ISREG(buffer.st_mode);
+  }
+  return false;
+#endif
+}
+
+// This function appears in dla_aot_splitter/src/main.cpp too
+bool DirOpenTest(const std::string& name) {
+#if (!defined(_WIN32) && !defined(_WIN64))
+  // If we can open the directory then return true
+  DIR* dp = opendir(name.c_str());
+  if (dp != nullptr) {
+    closedir(dp);
+    return true;
+  }
+#endif  // !_WIN32 && !_WIN64
+  struct stat sb;
+  if (stat(name.c_str(), &sb) == 0) {
+    if ((sb.st_mode & S_IFMT) != S_IFREG) {
+      slog::err << "File " << name << " cannot be opened!" << slog::endl;
+      throw std::logic_error("File cannot be opened!");
+    }
+  }
+  return true;
+}
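// Usage sketch for the GetEnvOrDefault helper above (an illustration, not part
// of the commit; PERF_EST_COREDLA_FMAX is an override this file reads later,
// DLA_VERBOSITY is a hypothetical variable name):
//   double fmax = GetEnvOrDefault("PERF_EST_COREDLA_FMAX", 500.0);  // 500.0 unless overridden
//   int verbosity = GetEnvOrDefault("DLA_VERBOSITY", 0);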
+// Define a custom comparison function to sort based on ASCII names
+bool CompareOutputNodeNames(const ov::Output<const ov::Node>& node1, const ov::Output<const ov::Node>& node2) {
+  return node1.get_any_name() < node2.get_any_name();
+}
+
+// Copy arguments into a new array to split '-i=<arg>' into
+// two arguments (i.e. '-i' and '<arg>') to work around a bug in the
+// parseInputFilesArguments function where it doesn't recognize
+// the -i=<arg> format
+void ParseCommandLine(int argc, char** argv) {
+  int num_args = argc;
+  // allocate enough memory in case we need to split the -i argument into two
+  char** arguments = new char*[num_args + 1];
+  for (int i = 0, j = 0; j < argc; ++i, ++j) {
+    if (strstr(argv[j], "-i=")) {
+      // number of arguments will increase by one after splitting
+      num_args++;
+      arguments[i] = new char[3];
+      strcpy(arguments[i++], "-i");
+      // copy the rest of the argument (i.e. post "-i=")
+      arguments[i] = new char[strlen(argv[j]) - 2];
+      strcpy(arguments[i], argv[j] + 3);
+      continue;
+    }
+    arguments[i] = new char[strlen(argv[j]) + 1];
+    strcpy(arguments[i], argv[j]);
+  }
+  // the parse function modifies the arguments pointer, so we keep
+  // a copy of the original pointer value to delete it properly
+  char** orig_arg_ptr = arguments;
+  gflags::ParseCommandLineNonHelpFlags(&num_args, &arguments, true);
+  // delete the allocated memory
+  for (int i = 0; i < num_args; ++i) {
+    delete[] orig_arg_ptr[i];
+  }
+  delete[] orig_arg_ptr;
+}
+
+bool CheckAndSetPluginsPath(const char* coredla_root) {
+  // plugins_xml_file should probably be removed in the future
+  if (!FLAGS_plugins_xml_file.empty()) {
+    FLAGS_plugins = FLAGS_plugins_xml_file;
+    slog::warn << "====================================================================" << slog::endl;
+    slog::warn << "Warning: -plugins_xml_file option is deprecated, please use -plugins." << slog::endl;
+    slog::warn << "====================================================================" << slog::endl;
+  }
+
+  const char* coredla_work = std::getenv("COREDLA_WORK");
+  std::string coredla_root_str = coredla_root;
+  if (FLAGS_plugins.empty()) {
+    if (coredla_work == nullptr) {
+      FLAGS_plugins = coredla_root_str + "/runtime/plugins.xml";
+    } else {
+      std::string coredla_work_str = coredla_work;
+      FLAGS_plugins = coredla_work_str + "/runtime/plugins.xml";
+    }
+
+    if (ExistsTest(FLAGS_plugins)) {
+      slog::info << "Using default plugins xml file - " << FLAGS_plugins << slog::endl;
+      return true;
+    }
+  }
+
+  if (ExistsTest(FLAGS_plugins) && isFile(FLAGS_plugins)) {
+    slog::info << "Using custom plugins xml file - " << FLAGS_plugins << slog::endl;
+    return true;
+  }
+  // Check if the user wants a shortcut to the software emulation xml file if the path does not exist
+  if (FLAGS_plugins.find("emulation") != std::string::npos) {
+    // Potential paths for the plugins_emulation.xml file
+    std::string deployed_loc_plugins = coredla_root_str + "/bin/plugins_emulation.xml";
+    std::string developer_loc_plugins = coredla_root_str + "/build/coredla/dla/bin/plugins_emulation.xml";
+
+    if (ExistsTest(deployed_loc_plugins))
+      FLAGS_plugins = deployed_loc_plugins;
+    else if (ExistsTest(developer_loc_plugins))
+      FLAGS_plugins = developer_loc_plugins;
+  } else {
+    // if the user didn't specify emulation and did not pass a valid xml file, raise an error
+    throw std::invalid_argument("Invalid argument for -plugins. Use 'emulation' or a path to a custom xml file");
+  }
+
+  if (ExistsTest(FLAGS_plugins)) {
+    slog::info << "Using custom emulation xml file - " << FLAGS_plugins << slog::endl;
+    return true;
+  }
+
+  return false;
+}
+
+bool ParseAndCheckCommandLine(int argc, char* argv[], size_t& net_size) {
+  // ---------------------------Parsing and validating input arguments--------------------------------------
+  slog::info << "Parsing input parameters" << slog::endl;
+
+  // Check for any flags that are missing their preceding dashes.
+  // GFlags quietly ignores any flags missing their dashes, which can cause
+  // dla_benchmark to run with settings other than what the user intended.
+
+  // GFlags supports two different styles of flag:
+  //   1. --<flag>
+  //   2. -<flag>
+  // It also supports two different ways of specifying values for flags which
+  // take values:
+  //   1. --<flag>=<value>
+  //   2. --<flag> <value>
+
+  // If we are not expecting a flag, we are expecting a value for the
+  // preceding flag
+  bool expecting_flag = true;
+  // Start at 1 to skip the command itself
+  for (int i = 1; i < argc; i++) {
+    if (expecting_flag) {
+      // A flag is always denoted by the first char being '-'
+      if (argv[i][0] != '-') {
+        slog::err << "Argument " << argv[i] << " is invalid. You"
+                  << " may have forgotten a preceding '-'." << slog::endl;
+        throw std::logic_error("One or more invalid arguments");
+      }
+
+      char* flag_name_start = (argv[i][1] == '-') ? &argv[i][2] : &argv[i][1];
+      std::string flag_name;
+
+      gflags::CommandLineFlagInfo flag_info;
+      if (strstr(flag_name_start, "=")) {
+        flag_name = std::string(flag_name_start, size_t(strstr(flag_name_start, "=") - flag_name_start));
+      } else {
+        flag_name = std::string(flag_name_start);
+      }
+
+      // We expect a flag in the next argv if the current flag is a bool,
+      // because bool flags do not take a value.
+      // If GetCommandLineFlagInfo returns false, we assume the current
+      // flag is a boolean because boolean flags can be specified as
+      // -no<flag>, which is equivalent to -<flag>=false, or the flag
+      // simply being omitted. However, "no<flag>" is not recognized by
+      // GetCommandLineFlagInfo.
+      // Therefore, if the name is not recognized, either the flag is a
+      // boolean flag or it doesn't exist. In the latter case, gflags errors
+      // out when we call ParseCommandLine, so we can assume here it's a bool.
+      if (!GetCommandLineFlagInfo(flag_name.c_str(), &flag_info) || strstr(argv[i], "=") || flag_info.type == "bool") {
+        expecting_flag = true;
+      } else {
+        expecting_flag = false;
+      }
+    } else {
+      // If we were expecting a value, it doesn't matter what it is:
+      // gflags checks that all values are the correct type, and
+      // dla_benchmark checks that the values received are sane
+      expecting_flag = true;
+    }
+  }
+
+  ParseCommandLine(argc, argv);
+
+  if (FLAGS_help || FLAGS_h) {
+    ShowUsage();
+    // CoreDLA: Version 2020.3 of OpenVINO assumes that the PAC board with OPAE on it
+    // is an OpenCL/DLAv1 device. Since it is not, it then errors-out when the device
+    // does not respond as expected to the OpenCL query.
+    // showAvailableDevices();
+    std::cout << "\n";
+    return false;
+  }
+
+  if (FLAGS_hidden_help) {
+    PrintHiddenHelp();
+    return false;
+  }
+
+  if (FLAGS_cm.empty()) {
+    std::string network_file_flag;
+    if (!FLAGS_m.empty()) {
+      if (!FLAGS_network_file.empty()) {
+        throw std::invalid_argument(
+            "Both --network-file and -m are specified. Please only use one of the two arguments.");
+      }
+      network_file_flag = FLAGS_m;
+    } else if (!FLAGS_network_file.empty()) {
+      network_file_flag = FLAGS_network_file;
+    } else {
+      throw std::logic_error("Model is required but not set. Please set -m option.");
+    }
+
+    std::vector<std::string> m_paths = split(network_file_flag, MULTIGRAPH_SEP);
+    net_size = m_paths.size();
+    slog::info << "Found " << net_size << " graph" << (net_size == 1 ? "" : "s") << slog::endl;
+    for (auto& m_path : m_paths) {
+      if (!ExistsTest(m_path)) {
+        slog::err << "network file: " << m_path << " doesn't exist. Please provide a valid path with -m." << slog::endl;
+        throw std::logic_error("Model file path does not exist.");
+      }
+    }
+  } else {
+    std::vector<std::string> m_paths = split(FLAGS_cm, MULTIGRAPH_SEP);
+    net_size = m_paths.size();
+    slog::info << "Found " << net_size << " compiled graph" << (net_size == 1 ? "" : "s") << slog::endl;
+    for (auto& m_path : m_paths) {
+      if (!ExistsTest(m_path)) {
+        slog::err << "compiled model file: " << m_path << " doesn't exist. Please provide a valid path with -cm."
+                  << slog::endl;
+        throw std::logic_error("Compiled model file path does not exist.");
+      }
+    }
+  }
+
+  if (FLAGS_api != "async" && FLAGS_api != "sync") {
+    throw std::logic_error("Incorrect API. Please set -api option to `sync` or `async` value.");
+  }
+
+  if (FLAGS_niter <= 0) {
+    throw std::logic_error("-niter is a required flag and its value must be positive");
+  }
+
+  const char* coredla_root = std::getenv("COREDLA_ROOT");
+  if (coredla_root == nullptr) {
+    slog::err << "ERROR: COREDLA_ROOT environment variable is not set." << slog::endl;
+    throw std::logic_error("Please set up correct environment variables first");
+  }
+
+  if (!CheckAndSetPluginsPath(coredla_root)) {
+    slog::err << "plugins xml file: " << FLAGS_plugins << " doesn't exist. Please provide a valid path."
+              << slog::endl;
+    throw std::logic_error("plugins xml file path does not exist.");
+  }
+
+  // Checks required arguments for the mAP calculation subroutine.
+  if (FLAGS_enable_object_detection_ap) {
+    if (!FLAGS_yolo_version.size() || !is_yolo_supported(FLAGS_yolo_version)) {
+      slog::err << "Please specify the version of your YOLO graph by setting the -yolo_version option to "
+                   "`yolo-v3-tiny-tf` or `yolo-v3-tf` value."
+ << slog::endl; + throw std::logic_error("Incorrect YOLO version."); + } + } + + // Checks if output directory exists and can be opened + if (!FLAGS_output_dir.empty()) { + if (!ExistsTest(FLAGS_output_dir)) { + slog::err << "Specified output directory: " << FLAGS_output_dir << " does not exist" << slog::endl; + throw std::logic_error("Output directory does not exist"); + } + // Test whether the path can be opened if it's a directory + DirOpenTest(FLAGS_output_dir); + } + + return true; +} + +static void next_step(const std::string additional_info = "") { + static size_t step_id = 0; + static const std::map<size_t, std::string> step_names = {{1, "Parsing and validating input arguments"}, + {2, "Loading OpenVINO Runtime"}, + {3, "Setting device configuration"}, + {4, "Reading the Intermediate Representation network"}, + {5, "Resizing network to match image sizes and given batch"}, + {6, "Configuring input of the model"}, + {7, "Loading the model to the device"}, + {8, "Setting optimal runtime parameters"}, + {9, "Creating infer requests and preparing input tensors"}, + {10, "Measuring performance"}, + {11, "Dumping statistics report"}, + {12, "Dumping the output values"}}; + + step_id++; + if (step_names.count(step_id) == 0) + THROW_IE_EXCEPTION << "Step ID " << step_id << " is out of total steps number " << step_names.size(); + + std::cout << "[Step " << step_id << "/" << step_names.size() << "] " << step_names.at(step_id) + << (additional_info.empty() ? "" : " (" + additional_info + ")") << std::endl; +} + +template <typename T> +T GetMedianValue(const std::vector<T>& vec) { + std::vector<T> sorted_vec(vec); + std::sort(sorted_vec.begin(), sorted_vec.end()); + return (sorted_vec.size() % 2 != 0) + ? sorted_vec[sorted_vec.size() / 2ULL] + : (sorted_vec[sorted_vec.size() / 2ULL] + sorted_vec[sorted_vec.size() / 2ULL - 1ULL]) / + static_cast<T>(2.0); +} + +void ReadDebugNetworkInfo(ov::Core core) { + if (FLAGS_debug_network) { + // On hardware timeout exception, fetch Debug CSR values from all modules attached to the Debug Network + std::vector<DebugNetworkData> debug_csr_return = + core.get_property("FPGA", "COREDLA_DEBUG_NETWORK_INFO").as<std::vector<DebugNetworkData>>(); + slog::info << "Dumping Debug Network profiling counters" << slog::endl; + for (auto i = 0U; i < debug_csr_return.size(); i++) { + std::cout << "---------- CoreDLA instance " << i << " ----------" << std::endl; + // Print debug info for all instances + for (auto& instance_csr_return : debug_csr_return[i]) { + std::cout << instance_csr_return.first << ": " << instance_csr_return.second << std::endl; + } + } + } +} + +void PrintLSUCounterInfo(ov::Core core) { + std::vector<LSUCounterData> lsu_counter_vec = + core.get_property("FPGA", "COREDLA_LSU_ACCESS_COUNT").as<std::vector<LSUCounterData>>(); + slog::info << "Dumping LSU memory access counters" << slog::endl; + for (auto i = 0U; i < lsu_counter_vec.size(); i++) { + std::cout << "---------- CoreDLA instance " << i << " ----------" << std::endl; + for (const auto& entry : lsu_counter_vec.at(i)) { + std::cout << entry.first <<": " << entry.second << std::endl; + } + } +} + +// Returns true if last char of csv is a comma +bool is_last_char_comma(FILE* file) { + if (file == nullptr) return 0; + + int i = -1; + std::vector<char> white_space_chars = {'\n', ' ', '\t', '\r', '\f', '\v'}; + char last_char[1]; + do { + if (std::fseek(file, i, SEEK_END) != 0) { + return 0; + } + if (std::fread(last_char, 1, 1, file) == 0) { + return 0; + } + i--; + } while 
(std::count(white_space_chars.begin(), white_space_chars.end(), last_char[0]) != 0); + + return last_char[0] == ','; +} + +bool fileExists(std::string& path) { + struct stat buffer; + return (stat(path.c_str(), &buffer) == 0); +} + +void append_value_if_incomplete_to_csv(std::string path, double value) { + try { + if (!fileExists(path)) { + return; + } + + FILE* data_file = fopen(path.c_str(), "rb"); + if (data_file == nullptr) { + return; + } + bool is_comma = is_last_char_comma(data_file); + fclose(data_file); + + if (is_comma) { + FILE* append_file = fopen(path.c_str(), "a"); + if (append_file == nullptr) { + return; + } + fprintf(append_file, "%f\n", value); + fclose(append_file); + } + } catch (...) { + return; + } +} + +/** + * @brief The entry point of the dla benchmark + */ +int main(int argc, char* argv[]) { + std::shared_ptr<StatisticsReport> statistics; + try { + // Declaring the CompiledModel object as a pointer to workaround the segfault + // that occurs when destructing the object. Now that it's declared as a pointer + // the complier won't automatically call the destructor of the object at the end + // of this scope and we won't delete the allocated memory either + std::vector<ov::CompiledModel*> compiled_models; + size_t net_size = 0; // parse the size of networks for arguments check + + size_t return_code = 0; // universal return code, return this value after dumping out Debug info + + // ----------------- 1. Parsing and validating input arguments ------------------------------------------------- + next_step(); + + if (!ParseAndCheckCommandLine(argc, argv, net_size)) { + return 0; + } + + bool is_model_compiled = !FLAGS_cm.empty(); + if (is_model_compiled) { + slog::info << "Model is compiled" << slog::endl; + } + + std::string arch_file_flag; + if (!FLAGS_arch_file.empty()) { + if (!FLAGS_arch.empty()) { + throw std::invalid_argument( + "Both --arch and -arch_file are specified. Please only use one of the two arguments."); + } + arch_file_flag = FLAGS_arch_file; + } else if (!FLAGS_arch.empty()) { + arch_file_flag = FLAGS_arch; + } + + bool flag_b_default = gflags::GetCommandLineFlagInfoOrDie("b").is_default; + bool flag_batch_size_default = gflags::GetCommandLineFlagInfoOrDie("batch_size").is_default; + + size_t batch_size_flag; + if (!flag_b_default) { + if (!flag_batch_size_default) { + throw std::invalid_argument( + "Both --batch-size and -b are specified. Please only use one of the two arguments."); + } + batch_size_flag = FLAGS_b; + } else { + batch_size_flag = FLAGS_batch_size; + } + + if (batch_size_flag > 10000 || batch_size_flag <= 0) { + throw std::invalid_argument( + "Batch size is too big (>10000) or not a postive number (<=0). Specify the batch size within the specified " + "range."); + } + + std::string network_file_flag; + if (!FLAGS_m.empty()) { + if (!FLAGS_network_file.empty()) { + throw std::invalid_argument( + "Both --network-file and -m are specified. Please only use one of the two arguments."); + } + network_file_flag = FLAGS_m; + } else if (!FLAGS_network_file.empty()) { + network_file_flag = FLAGS_network_file; + } + + // langsu: ideally use boost to create a sub-folder for ddrfree files + // but ed4 toolchain doesn't have boost yet. + std::string output_dir; + std::string parameter_rom_output_dir; + std::string separator = dla::util::path_separator; + if (!FLAGS_output_dir.empty()) { + output_dir = FLAGS_output_dir + separator; + parameter_rom_output_dir = output_dir; + } else { + output_dir = "." 
+ separator;
+    parameter_rom_output_dir = output_dir;
+  }
+
+  // The set of arguments printed is meant to be a useful summary for the
+  // user, rather than all of the arguments to dla_benchmark
+  slog::info << "Printing summary of arguments being used by dla_benchmark" << slog::endl
+             << "API (-api) ........................... " << FLAGS_api << slog::endl
+             << "Device (-d) .......................... " << FLAGS_d << slog::endl
+             << "Batch size (-b) ...................... " << batch_size_flag << slog::endl
+             << (!FLAGS_cm.empty() ? "Compiled model (-cm) ................. "
+                                   : "Model (-m) ........................... ")
+             << (!FLAGS_cm.empty() ? FLAGS_cm : network_file_flag) << slog::endl
+             << "Num iterations (-niter) .............. "
+             << (FLAGS_niter > 0 ? std::to_string(FLAGS_niter) : "Not specified") << slog::endl
+             << "Input images directory (-i) .......... "
+             << (!FLAGS_i.empty() ? FLAGS_i : "Not specified, will use randomly-generated images") << slog::endl
+             << "Num CPU threads (-nthreads) .......... "
+             << (FLAGS_nthreads > 0 ? std::to_string(FLAGS_nthreads) : "Not specified") << slog::endl
+             << "Architecture file (-arch_file) ....... " << arch_file_flag << slog::endl
+             << "Num inference requests (-nireq) ...... "
+             << (FLAGS_nireq > 0 ? std::to_string(FLAGS_nireq) : "Not specified") << slog::endl
+             << "Plugins file (-plugins) .............. " << FLAGS_plugins << slog::endl
+             << "Groundtruth file (-groundtruth_loc) .. "
+             << (!FLAGS_groundtruth_loc.empty() ? FLAGS_groundtruth_loc : "Not specified") << slog::endl
+             << "Reverse input image channels (-bgr) .. " << (FLAGS_bgr ? "True" : "False") << slog::endl
+             << "EA features " << (FLAGS_enable_early_access ? "enabled." : "disabled.") << slog::endl;
+
+  if (FLAGS_save_run_summary) {
+    std::vector<gflags::CommandLineFlagInfo> flags;
+    StatisticsReport::Parameters command_line_arguments;
+    gflags::GetAllFlags(&flags);
+
+    for (auto& flag : flags) {
+      if (!flag.is_default) {
+        command_line_arguments.push_back({flag.name, flag.current_value});
+      }
+    }
+
+    if (!FLAGS_pcsort.empty() &&
+        (FLAGS_pcsort != "simple_sort" && FLAGS_pcsort != "sort" && FLAGS_pcsort != "no_sort")) {
+      slog::err << "Invalid -pcsort option: " << FLAGS_pcsort << ". Please use one of sort, simple_sort, no_sort."
+                << slog::endl;
+      return 1;
+    }
+
+    statistics =
+        std::make_shared<StatisticsReport>(StatisticsReport::Config{FLAGS_save_run_summary, FLAGS_report_folder});
+    statistics->addParameters(StatisticsReport::Category::COMMAND_LINE_PARAMETERS, command_line_arguments);
+  }
+
+  /** This vector stores paths to the processed images **/
+  auto multi_input_files = VectorMap<std::vector<std::string>>(
+      SplitMultiInputFilesArguments(net_size),  // get input directory list
+      [&](const std::vector<std::string>& input_args) mutable {
+        std::vector<std::string> files;
+        for (auto& input_arg : input_args) {
+          // Test if the path exists
+          if (!ExistsTest(input_arg)) {
+            slog::err << "Specified image path: " << input_arg << " does not exist" << slog::endl;
+            throw std::logic_error("Image path does not exist");
+          }
+          // Test whether the path can be opened if it's a directory
+          DirOpenTest(input_arg);
+          readInputFilesArguments(files, input_arg);
+        }
+        return files;
+      });
+
+  if (multi_input_files.size() == 0) {
+    // failed to read input files
+    slog::err << "Failed to read input files" << slog::endl;
+    return 1;
+  }
+
+  if (FLAGS_nstreams.empty()) {
+    slog::warn << "-nstreams default value is determined automatically for a device." << slog::endl;
+    std::cout << "\tAlthough the automatic selection usually provides reasonable performance,\n"
+              << "\tit may still be non-optimal for some cases; for more information see the README."
+              << std::endl;
+  }
+
+#ifdef DISABLE_JIT
+  if (!network_file_flag.empty()) {
+    slog::err << "Runtime compiled without support for Just-in-Time (JIT) execution!" << slog::endl
+              << "Either specify a compiled model using -cm <compiled_model.bin> "
+              << "or recompile the runtime without the -disable_jit flag." << slog::endl;
+    return 1;
+  }
+#endif
+
+  uint32_t num_batches = 1;
+
+  // ----------------- 2. Loading OpenVINO Runtime/Inference Engine ----------------------------------------------
+  next_step();
+
+  // Get optimal runtime parameters for device
+  std::string device_name = FLAGS_d;
+  if (is_model_compiled) {
+    auto compiled_graph_paths = split(FLAGS_cm, MULTIGRAPH_SEP);  // separate each AOT file path
+    for (auto& compiled_graph : compiled_graph_paths) {
+      std::filebuf obj_file_buf;
+      // There does not seem to be a way to get the device from the OpenVINO executable network.
+      // Instead we manually read through the xml header in the AOT graph to get the device name
+      // (an ugly hack, unfortunately)
+      obj_file_buf.open(compiled_graph.c_str(), std::ios::in | std::ios::binary);
+      std::istream obj_istream(&obj_file_buf);
+      std::string xml_header, current_device;
+      getline(obj_istream, xml_header);  // retrieve xml header from AOT bin file
+      if (xml_header.find("TARGET_FALLBACK") != std::string::npos) {  // uses hetero plugin
+        int start_index = xml_header.find("TARGET_FALLBACK") + 24;
+        int end_index = xml_header.find("</hetero_config>") - 3;
+        current_device =
+            "HETERO:" + xml_header.substr(start_index, end_index - start_index);  // get device from xml header
+      } else {
+        current_device = "FPGA";
+      }
+      if (device_name == "") {  // device flag not specified in AOT flow
+        device_name = current_device;
+      } else {
+        if (current_device != device_name) {  // print error for non-matching devices
+          throw std::logic_error(
+              "The AOT file does not target the expected device. "
+              "The device specified to dla_benchmark using the -d flag must be the same as the "
+              "device specified to dla_compiler using the --fplugin flag.");
+        }
+      }
+    }
+  } else {
+    if (device_name == "") device_name = "CPU";  // default device for JIT flow is CPU
+  }
+  ov::Core core(FLAGS_plugins);
+
+  if (device_name.find("CPU") != std::string::npos) {
+    core.set_property("FPGA", {{DLIAPlugin::properties::cpu_used.name(), true}});
+  }
+
+  if (arch_file_flag != "" && device_name.find("FPGA") != std::string::npos) {
+    core.set_property("FPGA", {{DLIAPlugin::properties::arch_path.name(), arch_file_flag}});
+    if (!ExistsTest(arch_file_flag)) {
+      slog::err << "architecture file: " << arch_file_flag << " doesn't exist. Please provide a valid path."
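// Usage sketch for multi-graph runs (hypothetical paths, not part of the commit;
// ',' is assumed to be MULTIGRAPH_SEP): graphs and input directories are paired
// index-wise, and the last -i entry is reused when fewer entries than graphs are given:
//   dla_benchmark -m=resnet50.xml,mobilenet.xml -i=imgs_a,imgs_b -b=1 -niter=8 -api=async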
+ << slog::endl; + throw std::logic_error("architecture file path does not exist."); + } + if (FLAGS_encryption_key != "") { + core.set_property("FPGA", {{DLIAPlugin::properties::encryption_key.name(), FLAGS_encryption_key}}); + } + if (FLAGS_encryption_iv != "") { + core.set_property("FPGA", {{DLIAPlugin::properties::encryption_iv.name(), FLAGS_encryption_iv}}); + } + // If emulator is used, do not perform decryption of compiled results in the import step + if (FLAGS_emulator_decryption) { + core.set_property("FPGA", {{DLIAPlugin::properties::emulator_decryption.name(), CONFIG_VALUE(YES)}}); + } + if (FLAGS_min_subgraph_layers < 1) { + slog::err << "-min-subgraph-layers must be >= 1" << slog::endl; + return 1; + } + core.set_property("FPGA", {{DLIAPlugin::properties::min_subgraph_layers.name(), FLAGS_min_subgraph_layers}}); + } + + if (device_name.find("CPU") != std::string::npos && !FLAGS_l.empty()) { + // CPU extensions is loaded as a shared library and passed as a pointer to base extension + core.add_extension(FLAGS_l); + slog::info << "CPU extensions is loaded " << FLAGS_l << slog::endl; + } + + slog::info << "OpenVINO: " << ov::get_openvino_version() << slog::endl; + slog::info << "Device info: " << core.get_versions(device_name) << slog::endl; + + // ----------------- 3. Setting device configuration ----------------------------------------------------------- + next_step(); + + auto devices = ParseDevices(device_name); + std::map<std::string, uint32_t> device_nstreams = ParseNStreamsValuePerDevice(devices, FLAGS_nstreams); + for (auto& pair : device_nstreams) { + auto key = std::string(pair.first + "_THROUGHPUT_STREAMS"); + std::vector<std::string> supported_config_keys = + core.get_property(pair.first, METRIC_KEY(SUPPORTED_CONFIG_KEYS)).as<std::vector<std::string>>(); + if (std::find(supported_config_keys.begin(), supported_config_keys.end(), key) == supported_config_keys.end()) { + throw std::logic_error( + "Device " + pair.first + " doesn't support config key '" + key + "'! " + + "Please specify -nstreams for correct devices in format <dev1>:<nstreams1>,<dev2>:<nstreams2>"); + } + } + + // pc is for CPU only at the moment + bool perf_count = FLAGS_pc; + std::string perf_count_sort = FLAGS_pcsort; + for (auto& device : devices) { + if (device == "CPU") { // CPU supports few special performance-oriented keys + if (perf_count || !perf_count_sort.empty()) { + core.set_property("CPU", {{CONFIG_KEY(PERF_COUNT), CONFIG_VALUE(YES)}}); + } + // limit threading for CPU portion of inference + if (FLAGS_nthreads != 0) + core.set_property(device, {{CONFIG_KEY(CPU_THREADS_NUM), std::to_string(FLAGS_nthreads)}}); + core.set_property(device, {{CONFIG_KEY(CPU_BIND_THREAD), FLAGS_pin}}); + // Set CPU to optimize throughput + core.set_property(device, ov::hint::performance_mode(ov::hint::PerformanceMode::THROUGHPUT)); + // for CPU execution, more throughput-oriented execution via streams + if (FLAGS_api == "async") { + core.set_property( + device, + ov::streams::num(device_nstreams.count(device) > 0 ? ov::streams::Num(device_nstreams.at(device)) + : ov::streams::AUTO)); + } + device_nstreams[device] = core.get_property(device, ov::streams::num); + } else if (device == ("GPU")) { + if (FLAGS_api == "async") { + core.set_property( + device, + ov::streams::num(device_nstreams.count(device) > 0 ? 
ov::streams::Num(device_nstreams.at(device)) + : ov::streams::AUTO)); + } + device_nstreams[device] = core.get_property(device, ov::streams::num); + } + } + + auto double_to_string = [](const double number) { + std::stringstream ss; + ss << std::fixed << std::setprecision(4) << number; + return ss.str(); + }; + auto get_total_ms_time = [](Time::time_point& start_time) { + return std::chrono::duration_cast<ns>(Time::now() - start_time).count() * 0.000001; + }; + + size_t batch_size = batch_size_flag; + std::vector<std::string> topology_names; + ov::element::Type precision = ov::element::undefined; + // Vector stores which model (multigraph), InputsInfo is a map of input names and its respctive + // input information + std::vector<dla_benchmark::InputsInfo> input_infos; + if (!is_model_compiled) { +#ifndef DISABLE_JIT + // We choose to ifdef out this block of code because it's more readable than + // pulling the block in the "else" out using ifdefs + // ----------------- 4. Reading the Intermediate Representation network ---------------------------------------- + next_step(); + + LOG_AND_PRINT(Logger::INFO, "Loading network files\n"); + + auto start_time_read = Time::now(); + // get list of graphs + std::vector<std::shared_ptr<ov::Model>> models = + VectorMap<std::shared_ptr<ov::Model>>(split(network_file_flag, MULTIGRAPH_SEP), [&](const std::string& m) { + std::shared_ptr<ov::Model> model = core.read_model(m); + // Assign rt info IMMEDIATELY when DLA benchmark reads the model. + // Applying transformations or reshaping may change node names. + // Mixed Precision is an EA only feature for 2024.2 + if (FLAGS_enable_early_access) { + for (auto&& node : model->get_ops()) { + if (dla::util::NodeTypeUsesPE(node->get_type_name())) { + node->get_rt_info()[DLA_PE_PRECISION_MODE] = + dla::util::ParseNodeForRTInfo(node->get_friendly_name(), DLA_PE_PRECISION_MODE); + } + } + } + printInputAndOutputsInfoShort(*model); + return model; + }); + + auto duration_ms = double_to_string(get_total_ms_time(start_time_read)); + slog::info << "Read network(s) took " << duration_ms << " ms" << slog::endl; + if (statistics) + statistics->addParameters(StatisticsReport::Category::EXECUTION_RESULTS, + {{"read network time (ms)", duration_ms}}); + + // ----------------- 5. Resizing network to match image sizes and given batch ---------------------------------- + next_step(); + + for (size_t i = 0; i < models.size(); i++) { + const auto& model_inputs = std::const_pointer_cast<const ov::Model>(models[i])->inputs(); + bool reshape = false; + input_infos.push_back( + GetInputsInfo(batch_size, model_inputs, reshape, FLAGS_bin_data, FLAGS_mean_values, FLAGS_scale_values)); + if (reshape) { + dla_benchmark::PartialShapes shapes = {}; + for (auto& item : input_infos.back()) shapes[item.first] = item.second.partial_shape; + slog::info << "Reshaping model to batch: " << batch_size << slog::endl; + models[i]->reshape(shapes); + } + topology_names.push_back(models[i]->get_friendly_name()); + } + + // ----------------- 6. 
Configuring input and output + // ---------------------------------------------------------------------- + next_step(); + // Set input layouts for all models and their inputs + size_t input_info_idx = 0; + for (std::shared_ptr<ov::Model> model : models) { + auto preproc = ov::preprocess::PrePostProcessor(model); + const auto& inputs = model->inputs(); + for (size_t i = 0; i < inputs.size(); i++) { + ov::preprocess::InputInfo& input_info = preproc.input(i); + const size_t input_rank = inputs[i].get_partial_shape().size(); + const ov::Layout& layout = ov::Layout(dla::util::getTensorLayout(input_rank)); + const ov::element::Type_t type = input_infos[input_info_idx].at(inputs[i].get_any_name()).type; + input_info.tensor().set_element_type(type).set_layout(layout); + } + + const auto& outputs = model->outputs(); + for (size_t i = 0; i < outputs.size(); i++) { + const size_t output_rank = outputs[i].get_partial_shape().size(); + const ov::Layout& layout = ov::Layout(dla::util::getTensorLayout(output_rank)); + preproc.output(i).tensor().set_element_type(ov::element::f32).set_layout(layout); + } + // Once the build() method is called, the pre(post)processing steps + // for layout and precision conversions are inserted automatically + model = preproc.build(); + input_info_idx++; + } + // ----------------- 7. Loading the model to the device -------------------------------------------------------- + next_step(); + + // Get the value from the command line arguments (if the command line argument wasn't + // used by the user the default value set in dla_benchmark.hpp will be used) + int folding_option = FLAGS_folding_option; + bool fold_preprocessing = FLAGS_fold_preprocessing; + bool estimate_per_layer = FLAGS_estimate_per_layer_latencies; + bool enable_early_access = FLAGS_enable_early_access; + // TODO(arooney): Remove this once LT hang is fixed. 
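// The workaround below amounts to the following single predicate (a sketch,
// not part of the commit), kept as an explicit flag while the LT hang noted
// in the TODO above is investigated:
//   bool multi_infer_req = (FLAGS_nireq > 1 && FLAGS_api == "async");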
+ bool multi_infer_req = false; + if (FLAGS_nireq > 1 && FLAGS_api == "async") { + multi_infer_req = true; + } + + core.set_property("FPGA", {{DLIAPlugin::properties::folding_option.name(), std::to_string(folding_option)}}); + core.set_property("FPGA", + {{DLIAPlugin::properties::fold_preprocessing.name(), fold_preprocessing}}); + core.set_property("FPGA", + {{DLIAPlugin::properties::per_layer_estimation.name(), estimate_per_layer}}); + core.set_property("FPGA", + {{DLIAPlugin::properties::enable_early_access.name(), enable_early_access}}); + core.set_property("FPGA", + {{DLIAPlugin::properties::multiple_inferences.name(), multi_infer_req}}); + core.set_property("FPGA", {{DLIAPlugin::properties::streaming_input_pipe.name(), FLAGS_streaming_input_pipe}}); + + auto start_time = Time::now(); + auto individual_start_time = Time::now(); // timer for each individual graph loading + compiled_models = VectorMap<ov::CompiledModel*>(models, [&](std::shared_ptr<ov::Model> model) { + // Apply Low Precision transformations to handle quantized graphs + // Mohamed_I: currently, this only works if the entire graph fits on the FPGA + // because the CPU plugin calls common_optimizations again which has some transformations + // that cause the graph to fail (I suspect it's the ConvolutionMultiplyFusion, but I + // cannot disable it from the CPU) + + bool FPGA_used = device_name.find("FPGA") != std::string::npos; + bool CPU_used = device_name.find("CPU") != std::string::npos; + + ov::AnyMap config; + config.emplace(DLIAPlugin::properties::cpu_used.name(), CPU_used); + config.emplace(DLIAPlugin::properties::export_dir.name(), output_dir); + config.emplace(DLIAPlugin::properties::parameter_rom_export_dir.name(), parameter_rom_output_dir); + + for (auto&& node : model->get_ops()) { + if (std::string("FakeQuantize") == node->get_type_name()) { + config.emplace(DLIAPlugin::properties::apply_low_precision_transforms.name(), true); + if (CPU_used && FPGA_used) { + std::cerr << "ERROR: Quantized graphs only supported through HETERO:FPGA or CPU." << std::endl; + throw std::logic_error("HETERO:FPGA,CPU plugin is not supported for quantization."); + } + } + } + + auto compiled_model = new ov::CompiledModel(); + *compiled_model = core.compile_model(model, device_name, config); + duration_ms = double_to_string(get_total_ms_time(individual_start_time)); + individual_start_time = Time::now(); + slog::info << "Compile model ( " << model->get_friendly_name() << " ) took " << duration_ms << " ms" + << slog::endl; + return compiled_model; + }); + duration_ms = double_to_string(get_total_ms_time(start_time)); + slog::info << "Load network(s) took " << duration_ms << " ms" << slog::endl; + if (statistics) + statistics->addParameters(StatisticsReport::Category::EXECUTION_RESULTS, + {{"load network time (ms)", duration_ms}}); +#endif + } else { + next_step(); + slog::info << "Skipping the step for compiled network" << slog::endl; + next_step(); + slog::info << "Skipping the step for compiled network" << slog::endl; + next_step(); + slog::info << "Skipping the step for compiled network" << slog::endl; + // ----------------- 7. 
Loading the model to the device --------------------------------------------------------
+    next_step();
+    auto compiled_graph_paths = split(FLAGS_cm, MULTIGRAPH_SEP);
+    compiled_models = vectorMapWithIndex<ov::CompiledModel*>(
+        split(FLAGS_cm, MULTIGRAPH_SEP),  // get a list of compiled graphs
+        [&](const std::string& compiled_graph_path, size_t index) {
+          std::stringstream generated_name;
+          generated_name << "Graph_" << index;
+          slog::info << "Importing model from " << compiled_graph_paths[index] << " to " << device_name << " as "
+                     << generated_name.str() << slog::endl;
+          auto start_time = Time::now();
+          std::ifstream model_stream(compiled_graph_paths[index].c_str(), std::ios_base::in | std::ios_base::binary);
+          if (!model_stream.is_open()) {
+            throw std::runtime_error("Cannot open compiled model file: " + compiled_graph_paths[index]);
+          }
+          auto compiled_model = new ov::CompiledModel();
+          core.set_property("FPGA",
+                            {{DLIAPlugin::properties::streaming_input_pipe.name(), FLAGS_streaming_input_pipe}});
+          // Import specific configs
+          ov::AnyMap config;
+          config.emplace(DLIAPlugin::properties::export_dir.name(), output_dir);
+          config.emplace(DLIAPlugin::properties::parameter_rom_export_dir.name(), parameter_rom_output_dir);
+          *compiled_model = core.import_model(model_stream, device_name, config);
+          topology_names.push_back(generated_name.str());
+          model_stream.close();
+          printInputAndOutputsInfoShort(*compiled_model);
+          auto duration_ms = double_to_string(get_total_ms_time(start_time));
+          slog::info << "Import model took " << duration_ms << " ms" << slog::endl;
+          if (statistics)
+            statistics->addParameters(StatisticsReport::Category::EXECUTION_RESULTS,
+                                      {{"import model time (ms)", duration_ms}});
+          if (batch_size == 0) {
+            batch_size = 1;
+          }
+          const auto& inputs = compiled_model->inputs();
+          for (const auto& item : inputs) {
+            const auto& shape = item.get_shape();
+            if (shape[0] != batch_size) {
+              slog::err << "Batch size of the compiled model is " << shape[0] << " and batch size provided is "
+                        << batch_size << slog::endl;
+              std::cout << "Set the same batch size = " << shape[0] << " when running the app" << std::endl;
+              std::cout << "Or recompile model with batch size = " << batch_size << std::endl;
+              exit(5);
+            }
+          }
+          bool reshape_required = false;
+          input_infos.push_back(GetInputsInfo(batch_size,
+                                              compiled_model->inputs(),
+                                              reshape_required,
+                                              FLAGS_bin_data,
+                                              FLAGS_mean_values,
+                                              FLAGS_scale_values));
+          return compiled_model;
+        });
+  }
+  // ----------------- 8. Setting optimal runtime parameters -----------------------------------------------------
+  next_step();
+
+  // Number of requests
+  uint32_t nireq = FLAGS_nireq;
+#if defined(__arm__) || defined(__aarch64__)
+  // In OpenVINO 2022.3 Arm plugin, when an AOT graph is compiled on CPU and dla_benchmark has -nireq > 1,
+  // the program will be killed. We force nireq = 1 for the HETERO:CPU graph only.
+  // Note: -d CPU doesn't need to be checked for AOT because dlac does not support -fplugin CPU.
+  if (device_name == "HETERO:CPU" && nireq > 1) {
+    slog::warn << "-nireq > 1 is not supported for HETERO:CPU graph. Forcing -nireq = 1" << slog::endl;
+    nireq = 1;
+  }
+#endif
+
+  if (nireq == 0) {
+    if (FLAGS_api == "sync") {
+      nireq = 1;
+    } else {
+      try {
+        nireq = 0;
+        for (auto& compiled_model : compiled_models) {
+          auto req = compiled_model->get_property(ov::optimal_number_of_infer_requests);
+          if (nireq == 0 || nireq > req) nireq = req;
+        }
+      } catch (const std::exception& ex) {
+        throw ov::Exception("Every device used with the dla_benchmark should support " +
+                            std::string(ov::optimal_number_of_infer_requests.name()) +
+                            ". Failed to query the metric for the " + device_name + " with error: " + ex.what());
+      }
+    }
+  }
+#ifdef MAX_NUM_INFERENCE_REQUEST
+  if (nireq > MAX_NUM_INFERENCE_REQUEST) {
+    slog::warn << "-nireq > " << MAX_NUM_INFERENCE_REQUEST
+               << " is not supported for the underlying device. Forcing -nireq = 1" << slog::endl;
+    nireq = 1;
+  }
+#endif
+
+  // Iteration limit
+  uint32_t niter = FLAGS_niter;
+  if (niter > 0) {
+    // Round up niter to a multiple of nireq
+    niter = ((niter + nireq - 1) / nireq) * nireq;
+    // We previously checked that FLAGS_niter >= 0, so okay to cast to uint.
+    if (static_cast<uint32_t>(FLAGS_niter) != niter) {
+      slog::warn << "Number of iterations was aligned by request number from " << FLAGS_niter << " to " << niter
+                 << " using number of requests " << nireq << slog::endl;
+    }
+    num_batches = niter;
+  }
+
+  // Graph-request limit on device
+  if (device_name.find("FPGA") != std::string::npos) {
+    int ip_num_instances = core.get_property("FPGA", "COREDLA_NUM_INSTANCES").as<int>();
+    int numOutstandingInferRequest = nireq * net_size / ip_num_instances;
+    int maxOutstandingInferRequest = core.get_property("FPGA", "COREDLA_DMA_CSR_DESCRIPTOR_QUEUE_SIZE").as<int>();
+    if (maxOutstandingInferRequest > 0 && numOutstandingInferRequest > maxOutstandingInferRequest) {
+      slog::err << "Possible number of outstanding inference requests per instance (" << numOutstandingInferRequest
+                << ") "
+                << "exceeds the CSR descriptor queue limit (" << maxOutstandingInferRequest << ")" << slog::endl;
+      return 1;
+    }
+  }
+
+  if (statistics) {
+    for (auto& topology_name : topology_names) {
+      statistics->addParameters(StatisticsReport::Category::RUNTIME_CONFIG,
+                                {
+                                    {"topology", topology_name},
+                                    {"target device", device_name},
+                                    {"API", FLAGS_api},
+                                    {"precision", std::string(precision.get_type_name())},
+                                    {"batch size", std::to_string(batch_size)},
+                                    {"number of iterations", std::to_string(niter)},
+                                    {"number of parallel infer requests", std::to_string(nireq)},
+                                });
+    }
+    for (auto& nstreams : device_nstreams) {
+      std::stringstream ss;
+      ss << "number of " << nstreams.first << " streams";
+      statistics->addParameters(StatisticsReport::Category::RUNTIME_CONFIG,
+                                {
+                                    {ss.str(), std::to_string(nstreams.second)},
+                                });
+    }
+  }
+
+  // ----------------- 9. Creating infer requests and filling input blobs ----------------------------------------
+  next_step();
+
+  // Data structure hierarchy
+  // Outermost vec: which model it corresponds to (multigraph)
+  // Map: input/output name and its corresponding TensorVector
+  // TensorVector: an alias for vector<ov::Tensor> where each vector element corresponds to a batch
+  std::vector<std::map<std::string, ov::TensorVector>> input_data_tensors;
+  std::vector<std::map<std::string, ov::TensorVector>> output_tensors(compiled_models.size());
+
+  std::vector<std::unique_ptr<InferRequestsQueue>> infer_request_queues;
+  const std::string resize_type = FLAGS_resize_type.empty() ?
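// How nireq is derived when -nireq is left at 0 with the async API (a sketch of
// the query performed below, not part of the commit): each compiled model is
// asked for its optimal request count and the minimum across models wins.
//   uint32_t opt = compiled_model->get_property(ov::optimal_number_of_infer_requests);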
"resize" : FLAGS_resize_type; + for (size_t net_idx = 0; net_idx < compiled_models.size(); net_idx++) { + // Handle the case that use same inputs for all networks + const auto& inputFiles = + net_idx >= multi_input_files.size() ? multi_input_files.back() : multi_input_files[net_idx]; + input_data_tensors.push_back(GetStaticTensors(inputFiles.empty() ? std::vector<std::string>{} : inputFiles, + batch_size, + input_infos[net_idx], + num_batches, + resize_type, + FLAGS_bgr, + FLAGS_bin_data, + FLAGS_verbose)); + // Use unique_ptr to create InferRequestsQueue objects and avoid copying mutex and cv + infer_request_queues.push_back( + std::move(std::unique_ptr<InferRequestsQueue>(new InferRequestsQueue(*(compiled_models[net_idx]), nireq)))); + } + + // ----------------- 10. Measuring performance ------------------------------------------------------------------ + size_t progress_bar_total_count = progressBarDefaultTotalCount; + + std::stringstream ss; + ss << "Start inference " << FLAGS_api << "ronously"; + if (FLAGS_api == "async") { + if (!ss.str().empty()) { + ss << ", "; + } + ss << infer_request_queues.size() * infer_request_queues.at(0)->requests.size() << " inference requests"; + std::stringstream device_ss; + for (auto& nstreams : device_nstreams) { + if (!device_ss.str().empty()) { + device_ss << ", "; + } + device_ss << nstreams.second << " streams for " << nstreams.first; + } + if (!device_ss.str().empty()) { + ss << " using " << device_ss.str(); + } + } + ss << ", limits: " << niter << " iterations with each graph, " << compiled_models.size() << " graph(s)"; + progress_bar_total_count = niter; + next_step(ss.str()); + + /** Start inference & calculate performance **/ + /** to align number if iterations to guarantee that last infer requests are executed in the same conditions **/ + ProgressBar progress_bar(progress_bar_total_count, FLAGS_stream_output, FLAGS_progress); + std::vector<size_t> iterations(compiled_models.size(), 0); + try { + while ((niter != 0LL && iterations.back() < niter) || (FLAGS_api == "async" && iterations.back() % nireq != 0)) { + // set up all infer request and prep all i/o Blobs + for (size_t net_id = 0; net_id < compiled_models.size(); net_id++) { + for (size_t iireq = 0; iireq < nireq; iireq++) { + auto infer_request = infer_request_queues.at(net_id)->get_idle_request(); + if (!infer_request) { + THROW_IE_EXCEPTION << "No idle Infer Requests!"; + } + + if (niter != 0LL) { + const auto& outputs = compiled_models[net_id]->outputs(); + for (const auto& output : outputs) { + const std::string& name = output.get_any_name(); + output_tensors.at(net_id)[name].emplace_back(output.get_element_type(), output.get_shape()); + infer_request->set_tensor(output, output_tensors.at(net_id).at(name).at(iterations.at(net_id))); + } + const auto& inputs = compiled_models[net_id]->inputs(); + for (auto& input : inputs) { + const std::string& name = input.get_any_name(); + const auto& data = input_data_tensors.at(net_id).at(name)[iterations.at(net_id)]; + infer_request->set_tensor(input, data); + } + } + + // Execute one request/batch + if (FLAGS_api == "sync") { + infer_request->infer(); + } else { + // As the inference request is currently idle, the wait() adds no additional overhead (and should return + // immediately). The primary reason for calling the method is exception checking/re-throwing. 
Callback, + // that governs the actual execution can handle errors as well, but as it uses just error codes it has no + // details like ‘what()’ method of `std::exception` So, rechecking for any exceptions here. + infer_request->wait(); + infer_request->start_async(); + } + iterations.at(net_id)++; + if (net_id == compiled_models.size() - 1) { + progress_bar.addProgress(1); + } + } + } + } + + // wait the latest inference executions + for (auto& infer_request_queue : infer_request_queues) { + infer_request_queue->wait_all(); + } + } catch (const std::exception& ex) { + slog::err << "Inference failed:" << slog::endl; + slog::err << ex.what() << slog::endl; + ReadDebugNetworkInfo(core); + PrintLSUCounterInfo(core); + // Instead of setting return_code = 1 and continuing, exit immediately. + // High risk of segfaulting / weird behavior when inference fails. + return 1; + } + + size_t iteration = iterations.back(); + + std::vector<double> all_latencies; + auto start_time = infer_request_queues.at(0)->get_start_time(); + auto end_time = infer_request_queues.at(0)->get_end_time(); + for (auto& infer_request_queue : infer_request_queues) { + auto& latencies = infer_request_queue->get_latencies(); + all_latencies.insert(all_latencies.end(), latencies.begin(), latencies.end()); + start_time = std::min(start_time, infer_request_queue->get_start_time()); + end_time = std::max(end_time, infer_request_queue->get_end_time()); + } + double latency = GetMedianValue<double>(all_latencies); + double total_duration = std::chrono::duration_cast<ns>(end_time - start_time).count() * 0.000001; + double total_fps = (FLAGS_api == "sync") + ? compiled_models.size() * batch_size * 1000.0 / latency + : compiled_models.size() * batch_size * 1000.0 * iteration / total_duration; + + int ip_num_instances = 0; + double ip_duration = 0.0; + double ip_fps = 0.0; + double ip_fps_per_fmax = 0.0; + double estimated_ipFps = 0.0; + double estimated_ipFpsPerFmax = 0.0; + double fmax_core = -1.0; + double estimated_ipFps_assumed_fmax = 0.0; + if (device_name.find("FPGA") != std::string::npos) { + ip_num_instances = core.get_property("FPGA", "COREDLA_NUM_INSTANCES").as<int>(); + // even if hardware has 2 instances, only 1 instance actually gets used if only 1 inference is performed + size_t ip_num_instances_used = std::min((size_t)ip_num_instances, iteration); + ip_duration = core.get_property("FPGA", "IP_ACTIVE_TIME").as<double>(); + if (ip_duration) { + if (ip_duration != 0.0) { + ip_fps = (FLAGS_api == "sync") + ? 
compiled_models.size() * batch_size * 1000.0 / latency / ip_num_instances_used + : compiled_models.size() * batch_size * 1000.0 * iteration / ip_duration / ip_num_instances_used; + } + fmax_core = core.get_property("FPGA", "COREDLA_CLOCK_FREQUENCY").as<double>(); + if (fmax_core > 0.0) { + ip_fps_per_fmax = ip_fps / fmax_core; + } else { + slog::warn << "Warning: could not estimate clk_dla frequency on the FPGA" << slog::endl; + } + } + + if (FLAGS_perf_est && (device_name.find("FPGA") != std::string::npos)) { + if (is_model_compiled) { + // Ahead of Time Flow: getting the imported, precalculated performance estimate + estimated_ipFps = core.get_property("FPGA", "IMPORT_PERFORMANCE_EST").as<double>(); + if (estimated_ipFps < 0) + slog::warn << "Missing performance estimation from at least one of the compiled graphs" << slog::endl; + estimated_ipFps_assumed_fmax = core.get_property("FPGA", "IMPORT_PERFORMANCE_EST_ASSUMED_FMAX").as<double>(); + } else { +#ifndef DISABLE_JIT + // Just In Time Flow: running the performance estimate + if (fmax_core > 0.0) { +#if defined(_WIN32) || defined(_WIN64) + _putenv_s("PERF_EST_COREDLA_FMAX", double_to_string(fmax_core).c_str()); + _putenv_s("PERF_EST_PE_FMAX", double_to_string(fmax_core).c_str()); +#else + setenv("PERF_EST_COREDLA_FMAX", double_to_string(fmax_core).c_str(), true); + setenv("PERF_EST_PE_FMAX", double_to_string(fmax_core).c_str(), true); +#endif + estimated_ipFps_assumed_fmax = fmax_core; + } else { +// In case the fmax_core variable is not set, we use the estimated fmax values for AGX7 and A10. +// This if statement is just defensive programming for a condition that should not happen. +#ifdef DE10_AGILEX + estimated_ipFps_assumed_fmax = GetEnvOrDefault("PERF_EST_COREDLA_FMAX", 500); // AGX7 fMAX estimate +#else + estimated_ipFps_assumed_fmax = GetEnvOrDefault("PERF_EST_COREDLA_FMAX", 265); // A10 fMAX estimate +#endif + slog::warn + << "Warning: could not estimate clk_dla frequency on the FPGA, setting the fmax to default value." + << slog::endl; +#if defined(_WIN32) || defined(_WIN64) + _putenv_s("PERF_EST_COREDLA_FMAX", double_to_string(estimated_ipFps_assumed_fmax).c_str()); + _putenv_s("PERF_EST_PE_FMAX", double_to_string(estimated_ipFps_assumed_fmax).c_str()); +#else + setenv("PERF_EST_COREDLA_FMAX", double_to_string(estimated_ipFps_assumed_fmax).c_str(), true); + setenv("PERF_EST_PE_FMAX", double_to_string(estimated_ipFps_assumed_fmax).c_str(), true); +#endif + } + estimated_ipFps = core.get_property("FPGA", "PLUGIN_PERFORMANCE_EST").as<double>(); +#endif + } + estimated_ipFpsPerFmax = estimated_ipFps / estimated_ipFps_assumed_fmax; + } + } + + if (statistics) { + statistics->addParameters(StatisticsReport::Category::EXECUTION_RESULTS, + { + {"total execution time (ms)", double_to_string(total_duration)}, + {"IP active time (ms)", double_to_string(ip_duration)}, + {"total number of iterations", std::to_string(iteration)}, + }); + if (device_name.find("MULTI") == std::string::npos) { + statistics->addParameters(StatisticsReport::Category::EXECUTION_RESULTS, + { + {"latency (ms)", double_to_string(latency)}, + }); + } + statistics->addParameters( + StatisticsReport::Category::EXECUTION_RESULTS, + {{"throughput", double_to_string(total_fps)}, {"IP throughput", double_to_string(ip_fps)}}); + } + + progress_bar.finish(); + + // ----------------- 11. 
Dumping statistics report ------------------------------------------------------------- + next_step(); + + if (perf_count || !perf_count_sort.empty()) { + std::vector<std::vector<ov::ProfilingInfo>> perfCounts; + for (size_t ireq = 0; ireq < nireq; ireq++) { + auto reqPerfCounts = infer_request_queues.at(0)->requests[ireq]->get_performance_counts(); + perfCounts.push_back(reqPerfCounts); + } + if (statistics) { + if (perf_count_sort == "sort") { + statistics->printPerfCountersSort(perfCounts, "sort"); + } else if (perf_count_sort == "simple_sort") { + statistics->printPerfCountersSort(perfCounts, "simple_sort"); + } else { + statistics->printPerfCountersSort(perfCounts, "no_sort"); + } + } + } + + // dla_benchmark originally also implemented more detailed performance + // statistics via InferRequest's getPerformanceCounts function + // We did not support it, and removed it. If we want to re-implement it + // looking at the latest version of OpenVINO's benchmark_app or our git + // history would be a good starting point + if (statistics) { + statistics->dump(); + } + + std::cout << "count: " << iteration << " iterations" << std::endl; + std::cout << "system duration: " << double_to_string(total_duration) << " ms" << std::endl; + if (ip_duration != 0.0) std::cout << "IP duration: " << double_to_string(ip_duration) << " ms" << std::endl; + if (device_name.find("MULTI") == std::string::npos) + std::cout << "latency: " << double_to_string(latency) << " ms" << std::endl; + std::cout << "system throughput: " << double_to_string(total_fps) << " FPS" << std::endl; + if (ip_num_instances != 0) std::cout << "number of hardware instances: " << ip_num_instances << std::endl; + if (compiled_models.size() != 0) + std::cout << "number of network instances: " << compiled_models.size() << std::endl; + if (ip_fps != 0.0) std::cout << "IP throughput per instance: " << double_to_string(ip_fps) << " FPS" << std::endl; + if (ip_fps_per_fmax != 0.0) + std::cout << "IP throughput per fmax per instance: " << double_to_string(ip_fps_per_fmax) << " FPS/MHz" + << std::endl; + if (fmax_core > 0.0) std::cout << "IP clock frequency: " << double_to_string(fmax_core) << " MHz" << std::endl; + if (estimated_ipFps != 0.0) + std::cout << "estimated IP throughput per instance: " << double_to_string(estimated_ipFps) << " FPS (" + << (int)estimated_ipFps_assumed_fmax << " MHz assumed)" << std::endl; + if (estimated_ipFpsPerFmax != 0.0) + std::cout << "estimated IP throughput per fmax per instance: " << double_to_string(estimated_ipFpsPerFmax) + << " FPS/MHz" << std::endl; + + // ----------------- 12. Dumping output values ------------------------------------------------------------- + next_step(); + + if (FLAGS_dump_output) { + for (size_t i = 0; i < compiled_models.size(); i++) { + std::vector<ov::Output<const ov::Node>> output_info = compiled_models[i]->outputs(); + // For multi-outputs: Sort to ensure the order of each tensor dump aligns with the ground truth files + std::sort(output_info.begin(), output_info.end(), CompareOutputNodeNames); + const auto& output_tensors_map = output_tensors[i]; + // A flag regarding whether we can dump output tensor in a text file due to unsupported layout. + // This flag is set at first during dumping. 
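// Size guard applied below (a sketch, not part of the commit), assuming float32
// output elements: text dumping is skipped once the projected size exceeds the
// -max_output_file_size limit in MB.
//   size_t bytes = tensor.get_size() * niter * sizeof(float);
//   bool fits = bytes <= FLAGS_max_output_file_size * BYTE_TO_MEGABYTE;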
+        // Flags tracking whether the output tensor can still be dumped to a text file.
+        // They start out true and are cleared during dumping, e.g. for an unsupported
+        // layout or an oversized tensor.
+        bool can_dump_txt = true;
+        bool can_dump_layout_info_in_txt = true;
+        // Dump the output tensor as a .bin file, which can be loaded with Python NumPy.
+        std::regex pattern("\\{batch\\}");
+        std::string results_bin_file_name = output_dir + "result_{batch}.bin";
+        // Dump the output tensor as text.
+        // Backward-compatibility support for old regtests that used only one graph.
+        std::string results_txt_file_name = output_dir + "result.txt";
+        std::string results_boundaries_file_name = output_dir + "result_tensor_boundaries.txt";
+        // Dump inference arguments and metadata as JSON.
+        std::string results_meta_file_name = output_dir + "result_meta.json";
+
+        if (compiled_models.size() > 1) {
+          results_bin_file_name = output_dir + topology_names[i] + "_result_{batch}.bin";
+          results_txt_file_name = output_dir + topology_names[i] + "_result.txt";
+          results_boundaries_file_name = output_dir + topology_names[i] + "_result_tensor_boundaries.txt";
+          results_meta_file_name = output_dir + topology_names[i] + "_result_meta.json";
+        }
+
+        slog::info << "Dumping result of " << topology_names[i]
+                   << " to " << results_txt_file_name << slog::endl;
+        slog::info << "Dumping per-batch result (raw output) of " << topology_names[i]
+                   << " to " << results_bin_file_name << slog::endl;
+        slog::info << "Dumping inference metadata of " << topology_names[i]
+                   << " to " << results_meta_file_name << slog::endl;
+
+        std::ofstream result_txt_file(results_txt_file_name);
+        std::ofstream results_boundaries(results_boundaries_file_name);
+        std::ofstream result_meta_file(results_meta_file_name);
+
+        dla_benchmark::InferenceMetaData result_metadata;
+        result_metadata.input_files = multi_input_files.at(i);  // all input files passed via -i
+        result_metadata.groundtruth_loc = FLAGS_groundtruth_loc;
+        result_metadata.batch_size = FLAGS_batch_size;
+        result_metadata.niter = niter;
+        result_metadata.nireq = nireq;
+        result_metadata.model_input_info = input_infos[i];
+        dla_benchmark::OutputsInfoVec model_output_info;
+
+        uint32_t current_lines = 1;
+        size_t max_allowed_megabytes_to_dump = FLAGS_max_output_file_size;
+
+        for (uint32_t batch = 0; batch < num_batches; batch++) {
+          std::string per_batch_results_bin_file_name =
+              std::regex_replace(results_bin_file_name, pattern, std::to_string(batch));
+          std::ofstream per_batch_results_bin_file(per_batch_results_bin_file_name, std::ios::binary);
+
+          for (const auto& item : output_info) {
+            auto tensor = output_tensors_map.at(item.get_any_name()).at(batch);
+            unsigned int output_size = tensor.get_size() / batch_size;
+
+            const ov::Layout& layout = ov::layout::get_layout(item);
+            const auto& shape = tensor.get_shape();
+            const std::string& name = item.get_any_name();
+            size_t total_bytes_to_dump = tensor.get_size() * niter * sizeof(float);
+
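+            // Illustrative sizing (hypothetical numbers): for a classifier whose output
+            // tensor holds batch_size * 1000 floats with batch_size = 8 and niter = 100,
+            //   total_bytes_to_dump = 8000 * 100 * 4 bytes = 3,200,000 bytes,
+            // roughly 3 MB after dividing by BYTE_TO_MEGABYTE, so the guard below allows
+            // the text dump only if FLAGS_max_output_file_size permits at least that much.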
+            if (can_dump_txt) {
+              // If we cannot dump as a text file, clear can_dump_txt and write a one-time message.
+              if (total_bytes_to_dump > max_allowed_megabytes_to_dump * BYTE_TO_MEGABYTE) {
+                can_dump_txt = false;
+                std::string msg = "Output tensor (" + std::to_string(total_bytes_to_dump / BYTE_TO_MEGABYTE) +
+                                  " MB) is too large to dump. Change environment variable MAX_DUMP_OUTPUT_TXT (default " +
+                                  std::to_string(FLAGS_max_output_file_size) +
+                                  " MB) to allow dumping larger tensors";
+                slog::warn << msg << slog::endl;
+                result_txt_file << msg;
+              } else {
+                if (can_dump_layout_info_in_txt && shape.size() != 2 && shape.size() != 4 && shape.size() != 5) {
+                  can_dump_layout_info_in_txt = false;
+                  slog::warn << "Output tensor rank is not 2, 4 or 5; layout info will not be dumped in "
+                             << "result.txt." << slog::endl;
+                }
+                // Otherwise, dump the text and record in result_tensor_boundaries.txt additional
+                // information about the layout of result.txt.
+                results_boundaries << name << ": Line " << current_lines << " to "
+                                   << "line " << current_lines + output_size - 1 << std::endl;
+                results_boundaries << name << " output layout: " << layout.to_string() << std::endl;
+                results_boundaries << name << " output dimension:";
+                for (unsigned int dim = 0; dim < shape.size(); dim++) {
+                  results_boundaries << " " << shape[dim];
+                }
+                results_boundaries << std::endl;
+                current_lines = current_lines + output_size;
+                DumpResultTxtFile(tensor, item, output_size, result_txt_file);
+              }
+            }
+            DumpResultBinFile(tensor, per_batch_results_bin_file);
+
+            if (batch == 0) {
+              // All batches should have the same output info. Use a distinct name to avoid
+              // shadowing the output_info vector iterated above.
+              dla_benchmark::OutputInfo out_info;
+              out_info.name = name;
+              out_info.shape = shape;
+              model_output_info.push_back(out_info);
+            }
+          }
+          per_batch_results_bin_file.close();
+        }
+
+        result_metadata.model_output_info = model_output_info;
+        DumpResultMetaJSONFile(result_metadata, result_meta_file);
+        result_txt_file.close();
+        results_boundaries.close();
+        result_meta_file.close();
+      }
+      const std::string throughput_file_name = output_dir + "throughput_report.txt";
+      std::ofstream throughput_file;
+      throughput_file.open(throughput_file_name);
+      throughput_file << "Throughput : " << total_fps << " fps" << std::endl;
+      throughput_file << "Batch Size : " << batch_size << std::endl;
+      throughput_file << "Graph number : " << compiled_models.size() << std::endl;
+      throughput_file << "Num Batches : " << num_batches << std::endl;
+      throughput_file.close();
+
+      // Append throughput to the dataset.
+      // Check both the gzipped and plain versions.
+      std::string dataset_gz_file_name = "data.csv.gz";
+      append_value_if_incomplete_to_csv(dataset_gz_file_name, ip_fps);
+      std::string dataset_file_name = "data.csv";
+      append_value_if_incomplete_to_csv(dataset_file_name, ip_fps);
+    }
+
+    // Calculate top-1 and top-5 results.
+    if (FLAGS_groundtruth_loc != "") {
+      auto groundtruth_files = split(FLAGS_groundtruth_loc, MULTIGRAPH_SEP);
+      for (size_t i = 0; i < compiled_models.size(); i++) {
+        // The FLAGS_enable_object_detection_ap flag enables an accuracy-checking subroutine
+        // that computes the mAP and COCO AP scores. These are two of the main detection
+        // evaluation metrics used in the Common Objects in Context (COCO) challenge,
+        // https://cocodataset.org/#detection-eval.
+
+        std::vector<ov::Output<const ov::Node>> output_info = compiled_models[i]->outputs();
+        // For multi-output models: sort so that the order of each tensor dump aligns with the ground truth files.
+        std::sort(output_info.begin(), output_info.end(), CompareOutputNodeNames);
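+        // Ground truth files are matched to graphs by position after splitting
+        // -groundtruth_loc on MULTIGRAPH_SEP. For example, assuming ',' were the
+        // separator (illustrative only), -groundtruth_loc gt_a.txt,gt_b.txt would pair
+        // gt_a.txt with the first loaded graph and gt_b.txt with the second.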
SKIPPED" << slog::endl; + continue; // Print warnings for all missing ground truth graphs; + } + slog::info << "Comparing ground truth file " << groundtruth_files[i] << " with network " << topology_names[i] + << slog::endl; + // captures the results in higher precision for accuracy analysis + std::vector<float> results; + const auto& output_tensors_map = output_tensors[i]; + for (uint32_t batch = 0; batch < num_batches; batch++) { + for (unsigned int img = 0; img < batch_size; img++) { + for (const auto& item : output_info) { + auto tensor = output_tensors_map.at(item.get_any_name()).at(batch); + auto tensor_data = tensor.data<float>(); + unsigned int output_size = tensor.get_size() / batch_size; + size_t offset = img * output_size; + for (unsigned int j = 0; j < output_size; j++) { + results.push_back(tensor_data[j + offset]); + } + } + } + } + bool passed = TopResultsAnalyser::get_top_results(groundtruth_files[i], results, batch_size * num_batches); + if (passed) { + slog::info << "Get top results for \"" << topology_names[i] << "\" graph passed" << slog::endl; + } else { + // return 4 indicates that the accuracy of the result was below the threshold + return_code = 4; + } + } else { + // Runs the accuracy checking routine if AP scores are required. + set_runtime(FLAGS_yolo_version, FLAGS_niter, batch_size_flag, FLAGS_i, FLAGS_groundtruth_loc); + std::pair<double, double> res = + validate_yolo_wrapper(output_tensors[i], output_info, multi_input_files.at(0)); + std::cout << std::endl; + slog::info << "Batch metrics results:" << slog::endl; + std::cout << "Detection - mAP@0.5: " << std::setprecision(6) << res.first * 100 << "%" << std::endl; + std::cout << "Detection - mAP@0.5:0.95: " << std::setprecision(6) << res.second * 100 << "%" << std::endl; + } + } + } + // Output Debug Network Info if COREDLA_TEST_DEBUG_NETWORK is set + ReadDebugNetworkInfo(core); + if (FLAGS_report_lsu_counters) { + PrintLSUCounterInfo(core); + } + if (return_code) return return_code; + } catch (const std::exception& ex) { + slog::err << ex.what() << slog::endl; + + if (statistics) { + statistics->addParameters(StatisticsReport::Category::EXECUTION_RESULTS, + { + {"Error during dla_benchmark: ", ex.what()}, + }); + statistics->dump(); + } + + return 3; + } + + return 0; + // Bypass long function lint check + // NOLINTNEXTLINE(readability/fn_size) +} |
