// Copyright (C) 2018-2023 Intel Corporation // SPDX-License-Identifier: Apache-2.0 // // Description: Main file of DLA benchmark. Entry point of DLA for just in time, ahead of time execution // and any use case of DLA performing inference. This file is responsible for the end to end flow of DLA, // from reading user input arguments, creating input tensors, compiling models, running inference // dumping results. DLA benchmark is loosely based off of OpenVINO's sample benchmark app. // For future OpenVINO uplifts viewing their sample app is a good place to start. // Ref: [openvinotoolkit/openvino › samples/cpp/benchmark_app/main.cpp] #include #include #include #include #include #include #include #include #include #if defined(_WIN32) || defined(_WIN64) #include #define NOMINMAX #include #else #include #include #endif #include #include #include #include #include #include #include // DLA utils #include "dla_stl_utils.h" #include "dla_defines.h" // DLA benchmark #include "average_precision.hpp" #include "dla_benchmark.hpp" #include "dla_plugin_config.hpp" #include "infer_request_wrap.hpp" #include "inputs_filling.hpp" #include "progress_bar.hpp" #include "statistics_report.hpp" #include "top1_top5.hpp" #include "utils.hpp" using DebugNetworkData = std::map; using LSUCounterData = std::map; static const size_t progressBarDefaultTotalCount = 1000; // Get value from env variable named 'name', if it exists. // If not, returns provided default value. template T GetEnvOrDefault(const char* name, T default_value) { char* str_val = std::getenv(name); T result = default_value; if (str_val != NULL) { std::stringstream ss; ss << str_val; ss >> result; } return result; } bool ExistsTest(const std::string& name) { struct stat buffer; return (stat(name.c_str(), &buffer) == 0); } bool isFile(const std::string& path) { #if defined(_WIN32) || defined(_WIN64) std::cout << "Windows-specific implementation for checking if something is a file" << std::endl; // Windows-specific implementation DWORD fileAttr = GetFileAttributesA(path.c_str()); if (fileAttr == INVALID_FILE_ATTRIBUTES) { // The path does not exist or an error occurred. return false; } // Check if it's not a directory. return !(fileAttr & FILE_ATTRIBUTE_DIRECTORY); #else // UNIX-specific implementation struct stat buffer; if (stat(path.c_str(), &buffer) == 0) { return S_ISREG(buffer.st_mode); } return false; #endif } // This function appears in dla_aot_splitter/src/main.cpp too bool DirOpenTest(const std::string& name) { #if (!defined(_WIN32) && !defined(_WIN64)) // If we can open the directory then return true DIR* dp = opendir(name.c_str()); if (dp != nullptr) { closedir(dp); return true; } #endif // !_WIN32 && !_WIN64 struct stat sb; if (stat(name.c_str(), &sb) == 0) { if ((sb.st_mode & S_IFMT) != S_IFREG) { slog::err << "File " << name << " cannot be opened!" << slog::endl; throw std::logic_error("File cannot be opened!"); } } return true; } // Define a custom comparison function to sort based on ASCII names bool CompareOutputNodeNames(const ov::Output& node1, const ov::Output& node2) { return node1.get_any_name() < node2.get_any_name(); } // copy arguments into a new array to split the '-i=' into // two arguments (i.e. '-i' and '') to overcome a bug // parseInputFilesArguments function where is doesn't recognize // the -i= format void ParseCommandLine(int argc, char** argv) { int num_args = argc; // allocated enough memory in case we needed to split the -i argument into two char** arguments = new char*[num_args + 1]; for (int i = 0, j = 0; j < argc; ++i, ++j) { if (strstr(argv[j], "-i=")) { // number of arguments will increase by one after splitting num_args++; arguments[i] = new char[3]; strcpy(arguments[i++], "-i"); // copy the reset of the argument (i.e. post "-i=") arguments[i] = new char[strlen(argv[j]) - 2]; strcpy(arguments[i], argv[j] + 3); continue; } arguments[i] = new char[strlen(argv[j]) + 1]; strcpy(arguments[i], argv[j]); } // the parse function is modifying the arguments point so we need to keep // a copy of the original pointer value to delete it properly char** orig_arg_ptr = arguments; gflags::ParseCommandLineNonHelpFlags(&num_args, &arguments, true); // delete the allocated memory for (int i = 0; i < num_args; ++i) { delete[] orig_arg_ptr[i]; } delete[] orig_arg_ptr; } bool CheckAndSetPluginsPath(const char* coredla_root) { // plugins_xml_file should probably be removed in the future if (!FLAGS_plugins_xml_file.empty()) { FLAGS_plugins = FLAGS_plugins_xml_file; slog::warn << "====================================================================" << slog::endl; slog::warn << "Warning: -plugins_xml_file option is deprecated, please use -plugins." << slog::endl; slog::warn << "====================================================================" << slog::endl; } const char* coredla_work = std::getenv("COREDLA_WORK"); std::string coredla_root_str = coredla_root; if (FLAGS_plugins.empty()) { if (coredla_work == nullptr) { FLAGS_plugins = coredla_root_str + "/runtime/plugins.xml"; } else { std::string coredla_work_str = coredla_work; FLAGS_plugins = coredla_work_str + "/runtime/plugins.xml"; } if (ExistsTest(FLAGS_plugins)) { slog::info << "Using default plugins xml file - " << FLAGS_plugins << slog::endl; return true; } } if (ExistsTest(FLAGS_plugins) && isFile(FLAGS_plugins)) { slog::info << "Using custom plugins xml file - " << FLAGS_plugins << slog::endl; return true; } // Check if user wants a shortcut to software emulation xml file if a path does not exist if (FLAGS_plugins.find("emulation") != std::string::npos) { // Potential paths for the plugins_emulation.xml file std::string deployed_loc_plugins = coredla_root_str + "/bin/plugins_emulation.xml"; std::string developer_loc_plugins = coredla_root_str + "/build/coredla/dla/bin/plugins_emulation.xml"; if (ExistsTest(deployed_loc_plugins)) FLAGS_plugins = deployed_loc_plugins; else if (ExistsTest(developer_loc_plugins)) FLAGS_plugins = developer_loc_plugins; } else { // if user didn't specify emulation and user did not pass any xml file, raise an error throw std::invalid_argument("Invalid argument for -plugins. Use 'emulation' or a path to custom xml file"); } if (ExistsTest(FLAGS_plugins)) { slog::info << "Using custom emulation xml file - " << FLAGS_plugins << slog::endl; return true; } return false; } bool ParseAndCheckCommandLine(int argc, char* argv[], size_t& net_size) { // ---------------------------Parsing and validating input arguments-------------------------------------- slog::info << "Parsing input parameters" << slog::endl; // Check for any flags that are missing their preceding dashes // GFlags quietly ignores any flags missing their dashes, which can cause // dla_benchmark to run with settings other than what the user intended // GFlags supports two different styles of flag: // 1. -- // 2. - // It also supports two different ways of specifying values for flags which // take values: // 1. --= // 2. -- // If we are not expecting a flag, we are expecting a value for the // preceding flag bool expecting_flag = true; // Start at 1 to skip the command itself for (int i = 1; i < argc; i++) { if (expecting_flag) { // A flag is always denoted by the first char being '-' if (argv[i][0] != '-') { slog::err << "Argument " << argv[i] << " is invalid. You" << " may have forgotten a preceding '-'." << slog::endl; throw std::logic_error("One or more invalid arguments"); } char* flag_name_start = (argv[i][1] == '-') ? &argv[i][2] : &argv[i][1]; std::string flag_name; gflags::CommandLineFlagInfo flag_info; if (strstr(flag_name_start, "=")) { flag_name = std::string(flag_name_start, size_t(strstr(flag_name_start, "=") - flag_name_start)); } else { flag_name = std::string(flag_name_start); } // We expect a flag in the next argv if the current flag is a bool, // because bool flags do not take a value. // If GetCommandLineFlagInfo returns false, we assume the current // flag is a boolean because boolean flags can be specified as // -no, which is equivalent to -=false, or the flag // simply being omitted. However, "no" is not recognized by // GetCommandLineFlagInfo. // Therefore, if the name is not recognized either the flag is a // boolean flag or doesn't exist. In the latter case, gflags errors // when we call ParseCommandLine so we can assume here it's a bool. if (!GetCommandLineFlagInfo(flag_name.c_str(), &flag_info) || strstr(argv[i], "=") || flag_info.type == "bool") { expecting_flag = true; } else { expecting_flag = false; } } else { // If we were expecting a value, doesn't matter what it is // gflags will check all values are the correct type, and // dla_benchmark checks if the values received are sane expecting_flag = true; } } ParseCommandLine(argc, argv); if (FLAGS_help || FLAGS_h) { ShowUsage(); // CoreDLA: Version 2020.3 of OpenVINO assumes that the PAC board with OPAE on it // is an OpenCL/DLAv1 device. Since it is not, it then errors-out when the device // does not response as expected to the OpenCL query. // showAvailableDevices(); std::cout << "\n"; return false; } if (FLAGS_hidden_help) { PrintHiddenHelp(); return false; } if (FLAGS_cm.empty()) { std::string network_file_flag; if (!FLAGS_m.empty()) { if (!FLAGS_network_file.empty()) { throw std::invalid_argument( "Both --network-file and -m are specified. Please only use one of the two arguments."); } network_file_flag = FLAGS_m; } else if (!FLAGS_network_file.empty()) { network_file_flag = FLAGS_network_file; } else { throw std::logic_error("Model is required but not set. Please set -m option."); } std::vector m_paths = split(network_file_flag, MULTIGRAPH_SEP); net_size = m_paths.size(); slog::info << "Found " << net_size << " graph" << (net_size == 1 ? "" : "s") << slog::endl; for (auto& m_path : m_paths) { if (!ExistsTest(m_path)) { slog::err << "network file: " << m_path << " doesn't exist. Please provide a valid path with -m." << slog::endl; throw std::logic_error("Model file path does not exist."); } } } else { std::vector m_paths = split(FLAGS_cm, MULTIGRAPH_SEP); net_size = m_paths.size(); slog::info << "Found " << net_size << " compiled graph" << (net_size == 1 ? "" : "s") << slog::endl; for (auto& m_path : m_paths) { if (!ExistsTest(m_path)) { slog::err << "compiled model file: " << FLAGS_cm << " doesn't exist. Please provide a valid path with -cm." << slog::endl; throw std::logic_error("Compiled model file path does not exist."); } } } if (FLAGS_api != "async" && FLAGS_api != "sync") { throw std::logic_error("Incorrect API. Please set -api option to `sync` or `async` value."); } if (FLAGS_niter <= 0) { throw std::logic_error("-niter is a required flag and its value must be positive"); } const char* coredla_root = std::getenv("COREDLA_ROOT"); if (coredla_root == nullptr) { slog::err << "ERROR: COREDLA_ROOT environment variable is not set." << slog::endl; throw std::logic_error("Please set up correct environment variables first"); } if (!CheckAndSetPluginsPath(coredla_root)) { slog::err << "plugins_xml file: " << FLAGS_plugins_xml_file << " doesn't exist. Please provide a valid path." << slog::endl; throw std::logic_error("plugins_xml file path does not exist."); } // Checks required arguments for the mAP calculation subroutine. if (FLAGS_enable_object_detection_ap) { if (!FLAGS_yolo_version.size() || !is_yolo_supported(FLAGS_yolo_version)) { slog::err << "Please specify the version of your YOLO graph by setting the -yolo_version option to " "`yolo-v3-tiny-tf` or `yolo-v3-tf` value." << slog::endl; throw std::logic_error("Incorrect YOLO version."); } } // Checks if output directory exists and can be opened if (!FLAGS_output_dir.empty()) { if (!ExistsTest(FLAGS_output_dir)) { slog::err << "Specified output directory: " << FLAGS_output_dir << " does not exist" << slog::endl; throw std::logic_error("Output directory does not exist"); } // Test whether the path can be opened if it's a directory DirOpenTest(FLAGS_output_dir); } return true; } static void next_step(const std::string additional_info = "") { static size_t step_id = 0; static const std::map step_names = {{1, "Parsing and validating input arguments"}, {2, "Loading OpenVINO Runtime"}, {3, "Setting device configuration"}, {4, "Reading the Intermediate Representation network"}, {5, "Resizing network to match image sizes and given batch"}, {6, "Configuring input of the model"}, {7, "Loading the model to the device"}, {8, "Setting optimal runtime parameters"}, {9, "Creating infer requests and preparing input tensors"}, {10, "Measuring performance"}, {11, "Dumping statistics report"}, {12, "Dumping the output values"}}; step_id++; if (step_names.count(step_id) == 0) THROW_IE_EXCEPTION << "Step ID " << step_id << " is out of total steps number " << step_names.size(); std::cout << "[Step " << step_id << "/" << step_names.size() << "] " << step_names.at(step_id) << (additional_info.empty() ? "" : " (" + additional_info + ")") << std::endl; } template T GetMedianValue(const std::vector& vec) { std::vector sorted_vec(vec); std::sort(sorted_vec.begin(), sorted_vec.end()); return (sorted_vec.size() % 2 != 0) ? sorted_vec[sorted_vec.size() / 2ULL] : (sorted_vec[sorted_vec.size() / 2ULL] + sorted_vec[sorted_vec.size() / 2ULL - 1ULL]) / static_cast(2.0); } void ReadDebugNetworkInfo(ov::Core core) { if (FLAGS_debug_network) { // On hardware timeout exception, fetch Debug CSR values from all modules attached to the Debug Network std::vector debug_csr_return = core.get_property("FPGA", "COREDLA_DEBUG_NETWORK_INFO").as>(); slog::info << "Dumping Debug Network profiling counters" << slog::endl; for (auto i = 0U; i < debug_csr_return.size(); i++) { std::cout << "---------- CoreDLA instance " << i << " ----------" << std::endl; // Print debug info for all instances for (auto& instance_csr_return : debug_csr_return[i]) { std::cout << instance_csr_return.first << ": " << instance_csr_return.second << std::endl; } } } } void PrintLSUCounterInfo(ov::Core core) { std::vector lsu_counter_vec = core.get_property("FPGA", "COREDLA_LSU_ACCESS_COUNT").as>(); slog::info << "Dumping LSU memory access counters" << slog::endl; for (auto i = 0U; i < lsu_counter_vec.size(); i++) { std::cout << "---------- CoreDLA instance " << i << " ----------" << std::endl; for (const auto& entry : lsu_counter_vec.at(i)) { std::cout << entry.first <<": " << entry.second << std::endl; } } } // Returns true if last char of csv is a comma bool is_last_char_comma(FILE* file) { if (file == nullptr) return 0; int i = -1; std::vector white_space_chars = {'\n', ' ', '\t', '\r', '\f', '\v'}; char last_char[1]; do { if (std::fseek(file, i, SEEK_END) != 0) { return 0; } if (std::fread(last_char, 1, 1, file) == 0) { return 0; } i--; } while (std::count(white_space_chars.begin(), white_space_chars.end(), last_char[0]) != 0); return last_char[0] == ','; } bool fileExists(std::string& path) { struct stat buffer; return (stat(path.c_str(), &buffer) == 0); } void append_value_if_incomplete_to_csv(std::string path, double value) { try { if (!fileExists(path)) { return; } FILE* data_file = fopen(path.c_str(), "rb"); if (data_file == nullptr) { return; } bool is_comma = is_last_char_comma(data_file); fclose(data_file); if (is_comma) { FILE* append_file = fopen(path.c_str(), "a"); if (append_file == nullptr) { return; } fprintf(append_file, "%f\n", value); fclose(append_file); } } catch (...) { return; } } /** * @brief The entry point of the dla benchmark */ int main(int argc, char* argv[]) { std::shared_ptr statistics; try { // Declaring the CompiledModel object as a pointer to workaround the segfault // that occurs when destructing the object. Now that it's declared as a pointer // the complier won't automatically call the destructor of the object at the end // of this scope and we won't delete the allocated memory either std::vector compiled_models; size_t net_size = 0; // parse the size of networks for arguments check size_t return_code = 0; // universal return code, return this value after dumping out Debug info // ----------------- 1. Parsing and validating input arguments ------------------------------------------------- next_step(); if (!ParseAndCheckCommandLine(argc, argv, net_size)) { return 0; } bool is_model_compiled = !FLAGS_cm.empty(); if (is_model_compiled) { slog::info << "Model is compiled" << slog::endl; } std::string arch_file_flag; if (!FLAGS_arch_file.empty()) { if (!FLAGS_arch.empty()) { throw std::invalid_argument( "Both --arch and -arch_file are specified. Please only use one of the two arguments."); } arch_file_flag = FLAGS_arch_file; } else if (!FLAGS_arch.empty()) { arch_file_flag = FLAGS_arch; } bool flag_b_default = gflags::GetCommandLineFlagInfoOrDie("b").is_default; bool flag_batch_size_default = gflags::GetCommandLineFlagInfoOrDie("batch_size").is_default; size_t batch_size_flag; if (!flag_b_default) { if (!flag_batch_size_default) { throw std::invalid_argument( "Both --batch-size and -b are specified. Please only use one of the two arguments."); } batch_size_flag = FLAGS_b; } else { batch_size_flag = FLAGS_batch_size; } if (batch_size_flag > 10000 || batch_size_flag <= 0) { throw std::invalid_argument( "Batch size is too big (>10000) or not a postive number (<=0). Specify the batch size within the specified " "range."); } std::string network_file_flag; if (!FLAGS_m.empty()) { if (!FLAGS_network_file.empty()) { throw std::invalid_argument( "Both --network-file and -m are specified. Please only use one of the two arguments."); } network_file_flag = FLAGS_m; } else if (!FLAGS_network_file.empty()) { network_file_flag = FLAGS_network_file; } // langsu: ideally use boost to create a sub-folder for ddrfree files // but ed4 toolchain doesn't have boost yet. std::string output_dir; std::string parameter_rom_output_dir; std::string separator = dla::util::path_separator; if (!FLAGS_output_dir.empty()) { output_dir = FLAGS_output_dir + separator; parameter_rom_output_dir = output_dir; } else { output_dir = "." + separator; parameter_rom_output_dir = output_dir; } // The set of arguments printed is meant to be a useful summary to the // user, rather than all of the arguments to dla_benchmark slog::info << "Printing summary of arguments being used by dla_benchmark" << slog::endl << "API (-api) ........................... " << FLAGS_api << slog::endl << "Device (-d) .......................... " << FLAGS_d << slog::endl << "Batch size (-b) ...................... " << batch_size_flag << slog::endl << (!FLAGS_cm.empty() ? "Compiled model (-cm) ................. " : "Model (-m) ........................... ") << (!FLAGS_cm.empty() ? FLAGS_cm : network_file_flag) << slog::endl << "Num iterations (-niter) .............. " << (FLAGS_niter > 0 ? std::to_string(FLAGS_niter) : "Not specified") << slog::endl << "Input images directory (-i) .......... " << (!FLAGS_i.empty() ? FLAGS_i : "Not specified, will use randomly-generated images") << slog::endl << "Num CPU threads (-nthreads) .......... " << (FLAGS_nthreads > 0 ? std::to_string(FLAGS_nthreads) : "Not specified") << slog::endl << "Architecture file (-arch_file) ....... " << arch_file_flag << slog::endl << "Num inference requests (-nireq) ...... " << (FLAGS_nireq > 0 ? std::to_string(FLAGS_nireq) : "Not specified") << slog::endl << "Plugins file (-plugins) ..... " << FLAGS_plugins << slog::endl << "Groundtruth file (-groundtruth_loc) .. " << (!FLAGS_groundtruth_loc.empty() ? FLAGS_groundtruth_loc : "Not specified") << slog::endl << "Reverse input image channels (-bgr) .. " << (FLAGS_bgr ? "True" : "False") << slog::endl << "EA features " << (FLAGS_enable_early_access ? "enabled." : "disabled.") << slog::endl; if (FLAGS_save_run_summary) { std::vector flags; StatisticsReport::Parameters command_line_arguments; gflags::GetAllFlags(&flags); for (auto& flag : flags) { if (!flag.is_default) { command_line_arguments.push_back({flag.name, flag.current_value}); } } if (!FLAGS_pcsort.empty() && (FLAGS_pcsort != "simple_sort" && FLAGS_pcsort != "sort" && FLAGS_pcsort != "no_sort")) { slog::err << "Invalid -pcsort option: " << FLAGS_pcsort << ". Please use one of sort, simple_sort, no_sort." << slog::endl; return 1; } statistics = std::make_shared(StatisticsReport::Config{FLAGS_save_run_summary, FLAGS_report_folder}); statistics->addParameters(StatisticsReport::Category::COMMAND_LINE_PARAMETERS, command_line_arguments); } /** This vector stores paths to the processed images **/ auto multi_input_files = VectorMap>( SplitMultiInputFilesArguments(net_size), // get input directory list [&](const std::vector& input_args) mutable { std::vector files; for (auto& input_arg : input_args) { // Test if the path exists if (!ExistsTest(input_arg)) { slog::err << "Specified image path: " << input_arg << " does not exist" << slog::endl; throw std::logic_error("Image path does not exist"); } // Test whether the path can be opened if it's a directory DirOpenTest(input_arg); readInputFilesArguments(files, input_arg); } return files; }); if (multi_input_files.size() == 0) { // failed to read input files slog::err << "Failed to read input files" << slog::endl; return 1; } if (FLAGS_nstreams.empty()) { slog::warn << "-nstreams default value is determined automatically for a device. " << slog::endl; std::cout << "\tAlthough the automatic selection usually provides a reasonable performance, \n" << "\tbut it still may be non-optimal for some cases, for more information look at README." << std::endl; } #ifdef DISABLE_JIT if (!network_file_flag.empty()) { slog::err << "Runtime compiled without support for Just-in-Time (JIT) execution!" << slog::endl << "Either specify a compiled model using -cm " << "or recompile the runtime without the -disable_jit flag." << slog::endl; return 1; } #endif uint32_t num_batches = 1; // ----------------- 2. Loading OpenVINO Runtime/Inference Engine // ----------------------------------------------------------- next_step(); // Get optimal runtime parameters for device std::string device_name = FLAGS_d; if (is_model_compiled) { auto compiled_graph_paths = split(FLAGS_cm, MULTIGRAPH_SEP); // separate each AOT file path for (auto& compiled_graph : compiled_graph_paths) { std::filebuf obj_file_buf; // There does not seem to be a way to get the device from the OpenVINO executable network // Instead we manually read through the xml header in the AOT graph to get the device name (an ugly hack // unfortunately) obj_file_buf.open(compiled_graph.c_str(), std::ios::in | std::ios::binary); std::istream obj_istream(&obj_file_buf); std::string xml_header, current_device; getline(obj_istream, xml_header); // retrieve xml header from AOT bin file if (xml_header.find("TARGET_FALLBACK") != std::string::npos) { // uses hetero plugin int start_index = xml_header.find("TARGET_FALLBACK") + 24; int end_index = xml_header.find("") - 3; current_device = "HETERO:" + xml_header.substr(start_index, end_index - start_index); // get device from xml header } else { current_device = "FPGA"; } if (device_name == "") { // device flag not specified in AOT flow device_name = current_device; } else { if (current_device != device_name) { // print error for non-matching devices throw std::logic_error( "The AOT file does not target the expected device. " "The device specified to dla_benchmark using the -d flag must be the same as the " "device specified to dla_compiler using the --fplugin flag."); } } } } else { if (device_name == "") device_name = "CPU"; // default device for JIT flow is CPU } ov::Core core(FLAGS_plugins); if (device_name.find("CPU") != std::string::npos) { core.set_property("FPGA", {{DLIAPlugin::properties::cpu_used.name(), true}}); } if (arch_file_flag != "" && device_name.find("FPGA") != std::string::npos) { core.set_property("FPGA", {{DLIAPlugin::properties::arch_path.name(), arch_file_flag}}); if (!ExistsTest(arch_file_flag)) { slog::err << "architecture file: " << arch_file_flag << " doesn't exist. Please provide a valid path." << slog::endl; throw std::logic_error("architecture file path does not exist."); } if (FLAGS_encryption_key != "") { core.set_property("FPGA", {{DLIAPlugin::properties::encryption_key.name(), FLAGS_encryption_key}}); } if (FLAGS_encryption_iv != "") { core.set_property("FPGA", {{DLIAPlugin::properties::encryption_iv.name(), FLAGS_encryption_iv}}); } // If emulator is used, do not perform decryption of compiled results in the import step if (FLAGS_emulator_decryption) { core.set_property("FPGA", {{DLIAPlugin::properties::emulator_decryption.name(), CONFIG_VALUE(YES)}}); } if (FLAGS_min_subgraph_layers < 1) { slog::err << "-min-subgraph-layers must be >= 1" << slog::endl; return 1; } core.set_property("FPGA", {{DLIAPlugin::properties::min_subgraph_layers.name(), FLAGS_min_subgraph_layers}}); } if (device_name.find("CPU") != std::string::npos && !FLAGS_l.empty()) { // CPU extensions is loaded as a shared library and passed as a pointer to base extension core.add_extension(FLAGS_l); slog::info << "CPU extensions is loaded " << FLAGS_l << slog::endl; } slog::info << "OpenVINO: " << ov::get_openvino_version() << slog::endl; slog::info << "Device info: " << core.get_versions(device_name) << slog::endl; // ----------------- 3. Setting device configuration ----------------------------------------------------------- next_step(); auto devices = ParseDevices(device_name); std::map device_nstreams = ParseNStreamsValuePerDevice(devices, FLAGS_nstreams); for (auto& pair : device_nstreams) { auto key = std::string(pair.first + "_THROUGHPUT_STREAMS"); std::vector supported_config_keys = core.get_property(pair.first, METRIC_KEY(SUPPORTED_CONFIG_KEYS)).as>(); if (std::find(supported_config_keys.begin(), supported_config_keys.end(), key) == supported_config_keys.end()) { throw std::logic_error( "Device " + pair.first + " doesn't support config key '" + key + "'! " + "Please specify -nstreams for correct devices in format :,:"); } } // pc is for CPU only at the moment bool perf_count = FLAGS_pc; std::string perf_count_sort = FLAGS_pcsort; for (auto& device : devices) { if (device == "CPU") { // CPU supports few special performance-oriented keys if (perf_count || !perf_count_sort.empty()) { core.set_property("CPU", {{CONFIG_KEY(PERF_COUNT), CONFIG_VALUE(YES)}}); } // limit threading for CPU portion of inference if (FLAGS_nthreads != 0) core.set_property(device, {{CONFIG_KEY(CPU_THREADS_NUM), std::to_string(FLAGS_nthreads)}}); core.set_property(device, {{CONFIG_KEY(CPU_BIND_THREAD), FLAGS_pin}}); // Set CPU to optimize throughput core.set_property(device, ov::hint::performance_mode(ov::hint::PerformanceMode::THROUGHPUT)); // for CPU execution, more throughput-oriented execution via streams if (FLAGS_api == "async") { core.set_property( device, ov::streams::num(device_nstreams.count(device) > 0 ? ov::streams::Num(device_nstreams.at(device)) : ov::streams::AUTO)); } device_nstreams[device] = core.get_property(device, ov::streams::num); } else if (device == ("GPU")) { if (FLAGS_api == "async") { core.set_property( device, ov::streams::num(device_nstreams.count(device) > 0 ? ov::streams::Num(device_nstreams.at(device)) : ov::streams::AUTO)); } device_nstreams[device] = core.get_property(device, ov::streams::num); } } auto double_to_string = [](const double number) { std::stringstream ss; ss << std::fixed << std::setprecision(4) << number; return ss.str(); }; auto get_total_ms_time = [](Time::time_point& start_time) { return std::chrono::duration_cast(Time::now() - start_time).count() * 0.000001; }; size_t batch_size = batch_size_flag; std::vector topology_names; ov::element::Type precision = ov::element::undefined; // Vector stores which model (multigraph), InputsInfo is a map of input names and its respctive // input information std::vector input_infos; if (!is_model_compiled) { #ifndef DISABLE_JIT // We choose to ifdef out this block of code because it's more readable than // pulling the block in the "else" out using ifdefs // ----------------- 4. Reading the Intermediate Representation network ---------------------------------------- next_step(); LOG_AND_PRINT(Logger::INFO, "Loading network files\n"); auto start_time_read = Time::now(); // get list of graphs std::vector> models = VectorMap>(split(network_file_flag, MULTIGRAPH_SEP), [&](const std::string& m) { std::shared_ptr model = core.read_model(m); // Assign rt info IMMEDIATELY when DLA benchmark reads the model. // Applying transformations or reshaping may change node names. // Mixed Precision is an EA only feature for 2024.2 if (FLAGS_enable_early_access) { for (auto&& node : model->get_ops()) { if (dla::util::NodeTypeUsesPE(node->get_type_name())) { node->get_rt_info()[DLA_PE_PRECISION_MODE] = dla::util::ParseNodeForRTInfo(node->get_friendly_name(), DLA_PE_PRECISION_MODE); } } } printInputAndOutputsInfoShort(*model); return model; }); auto duration_ms = double_to_string(get_total_ms_time(start_time_read)); slog::info << "Read network(s) took " << duration_ms << " ms" << slog::endl; if (statistics) statistics->addParameters(StatisticsReport::Category::EXECUTION_RESULTS, {{"read network time (ms)", duration_ms}}); // ----------------- 5. Resizing network to match image sizes and given batch ---------------------------------- next_step(); for (size_t i = 0; i < models.size(); i++) { const auto& model_inputs = std::const_pointer_cast(models[i])->inputs(); bool reshape = false; input_infos.push_back( GetInputsInfo(batch_size, model_inputs, reshape, FLAGS_bin_data, FLAGS_mean_values, FLAGS_scale_values)); if (reshape) { dla_benchmark::PartialShapes shapes = {}; for (auto& item : input_infos.back()) shapes[item.first] = item.second.partial_shape; slog::info << "Reshaping model to batch: " << batch_size << slog::endl; models[i]->reshape(shapes); } topology_names.push_back(models[i]->get_friendly_name()); } // ----------------- 6. Configuring input and output // ---------------------------------------------------------------------- next_step(); // Set input layouts for all models and their inputs size_t input_info_idx = 0; for (std::shared_ptr model : models) { auto preproc = ov::preprocess::PrePostProcessor(model); const auto& inputs = model->inputs(); for (size_t i = 0; i < inputs.size(); i++) { ov::preprocess::InputInfo& input_info = preproc.input(i); const size_t input_rank = inputs[i].get_partial_shape().size(); const ov::Layout& layout = ov::Layout(dla::util::getTensorLayout(input_rank)); const ov::element::Type_t type = input_infos[input_info_idx].at(inputs[i].get_any_name()).type; input_info.tensor().set_element_type(type).set_layout(layout); } const auto& outputs = model->outputs(); for (size_t i = 0; i < outputs.size(); i++) { const size_t output_rank = outputs[i].get_partial_shape().size(); const ov::Layout& layout = ov::Layout(dla::util::getTensorLayout(output_rank)); preproc.output(i).tensor().set_element_type(ov::element::f32).set_layout(layout); } // Once the build() method is called, the pre(post)processing steps // for layout and precision conversions are inserted automatically model = preproc.build(); input_info_idx++; } // ----------------- 7. Loading the model to the device -------------------------------------------------------- next_step(); // Get the value from the command line arguments (if the command line argument wasn't // used by the user the default value set in dla_benchmark.hpp will be used) int folding_option = FLAGS_folding_option; bool fold_preprocessing = FLAGS_fold_preprocessing; bool estimate_per_layer = FLAGS_estimate_per_layer_latencies; bool enable_early_access = FLAGS_enable_early_access; // TODO(arooney): Remove this once LT hang is fixed. bool multi_infer_req = false; if (FLAGS_nireq > 1 && FLAGS_api == "async") { multi_infer_req = true; } core.set_property("FPGA", {{DLIAPlugin::properties::folding_option.name(), std::to_string(folding_option)}}); core.set_property("FPGA", {{DLIAPlugin::properties::fold_preprocessing.name(), fold_preprocessing}}); core.set_property("FPGA", {{DLIAPlugin::properties::per_layer_estimation.name(), estimate_per_layer}}); core.set_property("FPGA", {{DLIAPlugin::properties::enable_early_access.name(), enable_early_access}}); core.set_property("FPGA", {{DLIAPlugin::properties::multiple_inferences.name(), multi_infer_req}}); core.set_property("FPGA", {{DLIAPlugin::properties::streaming_input_pipe.name(), FLAGS_streaming_input_pipe}}); auto start_time = Time::now(); auto individual_start_time = Time::now(); // timer for each individual graph loading compiled_models = VectorMap(models, [&](std::shared_ptr model) { // Apply Low Precision transformations to handle quantized graphs // Mohamed_I: currently, this only works if the entire graph fits on the FPGA // because the CPU plugin calls common_optimizations again which has some transformations // that cause the graph to fail (I suspect it's the ConvolutionMultiplyFusion, but I // cannot disable it from the CPU) bool FPGA_used = device_name.find("FPGA") != std::string::npos; bool CPU_used = device_name.find("CPU") != std::string::npos; ov::AnyMap config; config.emplace(DLIAPlugin::properties::cpu_used.name(), CPU_used); config.emplace(DLIAPlugin::properties::export_dir.name(), output_dir); config.emplace(DLIAPlugin::properties::parameter_rom_export_dir.name(), parameter_rom_output_dir); for (auto&& node : model->get_ops()) { if (std::string("FakeQuantize") == node->get_type_name()) { config.emplace(DLIAPlugin::properties::apply_low_precision_transforms.name(), true); if (CPU_used && FPGA_used) { std::cerr << "ERROR: Quantized graphs only supported through HETERO:FPGA or CPU." << std::endl; throw std::logic_error("HETERO:FPGA,CPU plugin is not supported for quantization."); } } } auto compiled_model = new ov::CompiledModel(); *compiled_model = core.compile_model(model, device_name, config); duration_ms = double_to_string(get_total_ms_time(individual_start_time)); individual_start_time = Time::now(); slog::info << "Compile model ( " << model->get_friendly_name() << " ) took " << duration_ms << " ms" << slog::endl; return compiled_model; }); duration_ms = double_to_string(get_total_ms_time(start_time)); slog::info << "Load network(s) took " << duration_ms << " ms" << slog::endl; if (statistics) statistics->addParameters(StatisticsReport::Category::EXECUTION_RESULTS, {{"load network time (ms)", duration_ms}}); #endif } else { next_step(); slog::info << "Skipping the step for compiled network" << slog::endl; next_step(); slog::info << "Skipping the step for compiled network" << slog::endl; next_step(); slog::info << "Skipping the step for compiled network" << slog::endl; // ----------------- 7. Loading the model to the device -------------------------------------------------------- next_step(); auto compiled_graph_paths = split(FLAGS_cm, MULTIGRAPH_SEP); compiled_models = vectorMapWithIndex( split(FLAGS_cm, MULTIGRAPH_SEP), // get a list of compiled graphs [&](const std::string& compiled_graph_path, size_t index) { std::stringstream generated_name; generated_name << "Graph_" << index; slog::info << "Importing model from " << compiled_graph_paths[index] << " to " << device_name << " as " << generated_name.str() << slog::endl; auto start_time = Time::now(); std::ifstream model_stream(compiled_graph_paths[index].c_str(), std::ios_base::in | std::ios_base::binary); if (!model_stream.is_open()) { throw std::runtime_error("Cannot open compiled model file: " + compiled_graph_paths[index]); } auto compiled_model = new ov::CompiledModel(); core.set_property("FPGA", {{DLIAPlugin::properties::streaming_input_pipe.name(), FLAGS_streaming_input_pipe}}); // Import specific configs ov::AnyMap config; config.emplace(DLIAPlugin::properties::export_dir.name(), output_dir); config.emplace(DLIAPlugin::properties::parameter_rom_export_dir.name(), parameter_rom_output_dir); *compiled_model = core.import_model(model_stream, device_name, config); topology_names.push_back(generated_name.str()); model_stream.close(); printInputAndOutputsInfoShort(*compiled_model); auto duration_ms = double_to_string(get_total_ms_time(start_time)); slog::info << "Import model took " << duration_ms << " ms" << slog::endl; if (statistics) statistics->addParameters(StatisticsReport::Category::EXECUTION_RESULTS, {{"import model time (ms)", duration_ms}}); if (batch_size == 0) { batch_size = 1; } const auto& inputs = compiled_model->inputs(); for (const auto& item : inputs) { const auto& shape = item.get_shape(); if (shape[0] != batch_size) { slog::err << "Batch size of the compiled model is " << shape[0] << " and batch size provided is " << batch_size << slog::endl; std::cout << "Set the same batch size = " << shape[0] << " when running the app" << std::endl; std::cout << "Or recompile model with batch size = " << batch_size << std::endl; exit(5); } } bool reshape_required = false; input_infos.push_back(GetInputsInfo(batch_size, compiled_model->inputs(), reshape_required, FLAGS_bin_data, FLAGS_mean_values, FLAGS_scale_values)); return compiled_model; }); } // ----------------- 8. Setting optimal runtime parameters ----------------------------------------------------- next_step(); // Number of requests uint32_t nireq = FLAGS_nireq; #if defined(__arm__) | defined(__aarch64__) // In OpenVINO 2022.3 Arm plugin, when a AOT graph is compiled on CPU and dla_benchmark has -nireq > 1 // the program will be killed. We force nireq = 1 for HETERO:CPU graph only. // Note: -d CPU doesn't need to be checked for AOT because dlac does not support -fplugin CPU. if (device_name == "HETERO:CPU" && nireq > 1) { slog::warn << "-nireq > 1 is not supported for HETERO:CPU graph. Forcing -nireq = 1" << slog::endl; nireq = 1; } #endif if (nireq == 0) { if (FLAGS_api == "sync") { nireq = 1; } else { try { nireq = 0; for (auto& compiled_model : compiled_models) { auto req = compiled_model->get_property(ov::optimal_number_of_infer_requests); if (nireq == 0 || nireq > req) nireq = req; } } catch (const std::exception& ex) { throw ov::Exception("Every device used with the dla_benchmark should support " + std::string(ov::optimal_number_of_infer_requests.name()) + " Failed to query the metric for the " + device_name + " with error: " + ex.what()); } } } #ifdef MAX_NUM_INFERENCE_REQUEST if (nireq > MAX_NUM_INFERENCE_REQUEST) { slog::warn << "-nireq > "<< MAX_NUM_INFERENCE_REQUEST << " is not supported for the underlying device. Forcing -nireq = 1" << slog::endl; nireq = 1; } #endif // Iteration limit uint32_t niter = FLAGS_niter; if (niter > 0) { // Round up niter to a multiple of nireq niter = ((niter + nireq - 1) / nireq) * nireq; // We previously checked that FLAGS_niter >= 0, so okay to cast to uint. if (static_cast(FLAGS_niter) != niter) { slog::warn << "Number of iterations was aligned by request number from " << FLAGS_niter << " to " << niter << " using number of requests " << nireq << slog::endl; } num_batches = niter; } else if (niter > 0) { num_batches = niter; } // Graph-request limit on device if (device_name.find("FPGA") != std::string::npos) { int ip_num_instances = core.get_property("FPGA", "COREDLA_NUM_INSTANCES").as(); int numOutstandingInferRequest = nireq * net_size / ip_num_instances; int maxOutstandingInferRequest = core.get_property("FPGA", "COREDLA_DMA_CSR_DESCRIPTOR_QUEUE_SIZE").as(); if (maxOutstandingInferRequest > 0 && numOutstandingInferRequest > maxOutstandingInferRequest) { slog::err << "Possible number of outstanding inference requests per instance (" << numOutstandingInferRequest << ") " << "exceeds the CSR descriptor queue limit (" << maxOutstandingInferRequest << ")" << slog::endl; return 1; } } if (statistics) { for (auto& topology_name : topology_names) { statistics->addParameters(StatisticsReport::Category::RUNTIME_CONFIG, { {"topology", topology_name}, {"target device", device_name}, {"API", FLAGS_api}, {"precision", std::string(precision.get_type_name())}, {"batch size", std::to_string(batch_size)}, {"number of iterations", std::to_string(niter)}, {"number of parallel infer requests", std::to_string(nireq)}, }); } for (auto& nstreams : device_nstreams) { std::stringstream ss; ss << "number of " << nstreams.first << " streams"; statistics->addParameters(StatisticsReport::Category::RUNTIME_CONFIG, { {ss.str(), std::to_string(nstreams.second)}, }); } } // ----------------- 9. Creating infer requests and filling input blobs ---------------------------------------- next_step(); // Data structure hierarchy // Outermost vec: which model it corresponds to (multigraph) // Map: input/output name and its corresponding TensorVector // TensorVector: An alias for vector where each vector element correspond to the batch std::vector> input_data_tensors; std::vector> output_tensors(compiled_models.size()); std::vector> infer_request_queues; const std::string resize_type = FLAGS_resize_type.empty() ? "resize" : FLAGS_resize_type; for (size_t net_idx = 0; net_idx < compiled_models.size(); net_idx++) { // Handle the case that use same inputs for all networks const auto& inputFiles = net_idx >= multi_input_files.size() ? multi_input_files.back() : multi_input_files[net_idx]; input_data_tensors.push_back(GetStaticTensors(inputFiles.empty() ? std::vector{} : inputFiles, batch_size, input_infos[net_idx], num_batches, resize_type, FLAGS_bgr, FLAGS_bin_data, FLAGS_verbose)); // Use unique_ptr to create InferRequestsQueue objects and avoid copying mutex and cv infer_request_queues.push_back( std::move(std::unique_ptr(new InferRequestsQueue(*(compiled_models[net_idx]), nireq)))); } // ----------------- 10. Measuring performance ------------------------------------------------------------------ size_t progress_bar_total_count = progressBarDefaultTotalCount; std::stringstream ss; ss << "Start inference " << FLAGS_api << "ronously"; if (FLAGS_api == "async") { if (!ss.str().empty()) { ss << ", "; } ss << infer_request_queues.size() * infer_request_queues.at(0)->requests.size() << " inference requests"; std::stringstream device_ss; for (auto& nstreams : device_nstreams) { if (!device_ss.str().empty()) { device_ss << ", "; } device_ss << nstreams.second << " streams for " << nstreams.first; } if (!device_ss.str().empty()) { ss << " using " << device_ss.str(); } } ss << ", limits: " << niter << " iterations with each graph, " << compiled_models.size() << " graph(s)"; progress_bar_total_count = niter; next_step(ss.str()); /** Start inference & calculate performance **/ /** to align number if iterations to guarantee that last infer requests are executed in the same conditions **/ ProgressBar progress_bar(progress_bar_total_count, FLAGS_stream_output, FLAGS_progress); std::vector iterations(compiled_models.size(), 0); try { while ((niter != 0LL && iterations.back() < niter) || (FLAGS_api == "async" && iterations.back() % nireq != 0)) { // set up all infer request and prep all i/o Blobs for (size_t net_id = 0; net_id < compiled_models.size(); net_id++) { for (size_t iireq = 0; iireq < nireq; iireq++) { auto infer_request = infer_request_queues.at(net_id)->get_idle_request(); if (!infer_request) { THROW_IE_EXCEPTION << "No idle Infer Requests!"; } if (niter != 0LL) { const auto& outputs = compiled_models[net_id]->outputs(); for (const auto& output : outputs) { const std::string& name = output.get_any_name(); output_tensors.at(net_id)[name].emplace_back(output.get_element_type(), output.get_shape()); infer_request->set_tensor(output, output_tensors.at(net_id).at(name).at(iterations.at(net_id))); } const auto& inputs = compiled_models[net_id]->inputs(); for (auto& input : inputs) { const std::string& name = input.get_any_name(); const auto& data = input_data_tensors.at(net_id).at(name)[iterations.at(net_id)]; infer_request->set_tensor(input, data); } } // Execute one request/batch if (FLAGS_api == "sync") { infer_request->infer(); } else { // As the inference request is currently idle, the wait() adds no additional overhead (and should return // immediately). The primary reason for calling the method is exception checking/re-throwing. Callback, // that governs the actual execution can handle errors as well, but as it uses just error codes it has no // details like ‘what()’ method of `std::exception` So, rechecking for any exceptions here. infer_request->wait(); infer_request->start_async(); } iterations.at(net_id)++; if (net_id == compiled_models.size() - 1) { progress_bar.addProgress(1); } } } } // wait the latest inference executions for (auto& infer_request_queue : infer_request_queues) { infer_request_queue->wait_all(); } } catch (const std::exception& ex) { slog::err << "Inference failed:" << slog::endl; slog::err << ex.what() << slog::endl; ReadDebugNetworkInfo(core); PrintLSUCounterInfo(core); // Instead of setting return_code = 1 and continuing, exit immediately. // High risk of segfaulting / weird behavior when inference fails. return 1; } size_t iteration = iterations.back(); std::vector all_latencies; auto start_time = infer_request_queues.at(0)->get_start_time(); auto end_time = infer_request_queues.at(0)->get_end_time(); for (auto& infer_request_queue : infer_request_queues) { auto& latencies = infer_request_queue->get_latencies(); all_latencies.insert(all_latencies.end(), latencies.begin(), latencies.end()); start_time = std::min(start_time, infer_request_queue->get_start_time()); end_time = std::max(end_time, infer_request_queue->get_end_time()); } double latency = GetMedianValue(all_latencies); double total_duration = std::chrono::duration_cast(end_time - start_time).count() * 0.000001; double total_fps = (FLAGS_api == "sync") ? compiled_models.size() * batch_size * 1000.0 / latency : compiled_models.size() * batch_size * 1000.0 * iteration / total_duration; int ip_num_instances = 0; double ip_duration = 0.0; double ip_fps = 0.0; double ip_fps_per_fmax = 0.0; double estimated_ipFps = 0.0; double estimated_ipFpsPerFmax = 0.0; double fmax_core = -1.0; double estimated_ipFps_assumed_fmax = 0.0; if (device_name.find("FPGA") != std::string::npos) { ip_num_instances = core.get_property("FPGA", "COREDLA_NUM_INSTANCES").as(); // even if hardware has 2 instances, only 1 instance actually gets used if only 1 inference is performed size_t ip_num_instances_used = std::min((size_t)ip_num_instances, iteration); ip_duration = core.get_property("FPGA", "IP_ACTIVE_TIME").as(); if (ip_duration) { if (ip_duration != 0.0) { ip_fps = (FLAGS_api == "sync") ? compiled_models.size() * batch_size * 1000.0 / latency / ip_num_instances_used : compiled_models.size() * batch_size * 1000.0 * iteration / ip_duration / ip_num_instances_used; } fmax_core = core.get_property("FPGA", "COREDLA_CLOCK_FREQUENCY").as(); if (fmax_core > 0.0) { ip_fps_per_fmax = ip_fps / fmax_core; } else { slog::warn << "Warning: could not estimate clk_dla frequency on the FPGA" << slog::endl; } } if (FLAGS_perf_est && (device_name.find("FPGA") != std::string::npos)) { if (is_model_compiled) { // Ahead of Time Flow: getting the imported, precalculated performance estimate estimated_ipFps = core.get_property("FPGA", "IMPORT_PERFORMANCE_EST").as(); if (estimated_ipFps < 0) slog::warn << "Missing performance estimation from at least one of the compiled graphs" << slog::endl; estimated_ipFps_assumed_fmax = core.get_property("FPGA", "IMPORT_PERFORMANCE_EST_ASSUMED_FMAX").as(); } else { #ifndef DISABLE_JIT // Just In Time Flow: running the performance estimate if (fmax_core > 0.0) { #if defined(_WIN32) || defined(_WIN64) _putenv_s("PERF_EST_COREDLA_FMAX", double_to_string(fmax_core).c_str()); _putenv_s("PERF_EST_PE_FMAX", double_to_string(fmax_core).c_str()); #else setenv("PERF_EST_COREDLA_FMAX", double_to_string(fmax_core).c_str(), true); setenv("PERF_EST_PE_FMAX", double_to_string(fmax_core).c_str(), true); #endif estimated_ipFps_assumed_fmax = fmax_core; } else { // In case the fmax_core variable is not set, we use the estimated fmax values for AGX7 and A10. // This if statement is just defensive programming for a condition that should not happen. #ifdef DE10_AGILEX estimated_ipFps_assumed_fmax = GetEnvOrDefault("PERF_EST_COREDLA_FMAX", 500); // AGX7 fMAX estimate #else estimated_ipFps_assumed_fmax = GetEnvOrDefault("PERF_EST_COREDLA_FMAX", 265); // A10 fMAX estimate #endif slog::warn << "Warning: could not estimate clk_dla frequency on the FPGA, setting the fmax to default value." << slog::endl; #if defined(_WIN32) || defined(_WIN64) _putenv_s("PERF_EST_COREDLA_FMAX", double_to_string(estimated_ipFps_assumed_fmax).c_str()); _putenv_s("PERF_EST_PE_FMAX", double_to_string(estimated_ipFps_assumed_fmax).c_str()); #else setenv("PERF_EST_COREDLA_FMAX", double_to_string(estimated_ipFps_assumed_fmax).c_str(), true); setenv("PERF_EST_PE_FMAX", double_to_string(estimated_ipFps_assumed_fmax).c_str(), true); #endif } estimated_ipFps = core.get_property("FPGA", "PLUGIN_PERFORMANCE_EST").as(); #endif } estimated_ipFpsPerFmax = estimated_ipFps / estimated_ipFps_assumed_fmax; } } if (statistics) { statistics->addParameters(StatisticsReport::Category::EXECUTION_RESULTS, { {"total execution time (ms)", double_to_string(total_duration)}, {"IP active time (ms)", double_to_string(ip_duration)}, {"total number of iterations", std::to_string(iteration)}, }); if (device_name.find("MULTI") == std::string::npos) { statistics->addParameters(StatisticsReport::Category::EXECUTION_RESULTS, { {"latency (ms)", double_to_string(latency)}, }); } statistics->addParameters( StatisticsReport::Category::EXECUTION_RESULTS, {{"throughput", double_to_string(total_fps)}, {"IP throughput", double_to_string(ip_fps)}}); } progress_bar.finish(); // ----------------- 11. Dumping statistics report ------------------------------------------------------------- next_step(); if (perf_count || !perf_count_sort.empty()) { std::vector> perfCounts; for (size_t ireq = 0; ireq < nireq; ireq++) { auto reqPerfCounts = infer_request_queues.at(0)->requests[ireq]->get_performance_counts(); perfCounts.push_back(reqPerfCounts); } if (statistics) { if (perf_count_sort == "sort") { statistics->printPerfCountersSort(perfCounts, "sort"); } else if (perf_count_sort == "simple_sort") { statistics->printPerfCountersSort(perfCounts, "simple_sort"); } else { statistics->printPerfCountersSort(perfCounts, "no_sort"); } } } // dla_benchmark originally also implemented more detailed performance // statistics via InferRequest's getPerformanceCounts function // We did not support it, and removed it. If we want to re-implement it // looking at the latest version of OpenVINO's benchmark_app or our git // history would be a good starting point if (statistics) { statistics->dump(); } std::cout << "count: " << iteration << " iterations" << std::endl; std::cout << "system duration: " << double_to_string(total_duration) << " ms" << std::endl; if (ip_duration != 0.0) std::cout << "IP duration: " << double_to_string(ip_duration) << " ms" << std::endl; if (device_name.find("MULTI") == std::string::npos) std::cout << "latency: " << double_to_string(latency) << " ms" << std::endl; std::cout << "system throughput: " << double_to_string(total_fps) << " FPS" << std::endl; if (ip_num_instances != 0) std::cout << "number of hardware instances: " << ip_num_instances << std::endl; if (compiled_models.size() != 0) std::cout << "number of network instances: " << compiled_models.size() << std::endl; if (ip_fps != 0.0) std::cout << "IP throughput per instance: " << double_to_string(ip_fps) << " FPS" << std::endl; if (ip_fps_per_fmax != 0.0) std::cout << "IP throughput per fmax per instance: " << double_to_string(ip_fps_per_fmax) << " FPS/MHz" << std::endl; if (fmax_core > 0.0) std::cout << "IP clock frequency: " << double_to_string(fmax_core) << " MHz" << std::endl; if (estimated_ipFps != 0.0) std::cout << "estimated IP throughput per instance: " << double_to_string(estimated_ipFps) << " FPS (" << (int)estimated_ipFps_assumed_fmax << " MHz assumed)" << std::endl; if (estimated_ipFpsPerFmax != 0.0) std::cout << "estimated IP throughput per fmax per instance: " << double_to_string(estimated_ipFpsPerFmax) << " FPS/MHz" << std::endl; // ----------------- 12. Dumping output values ------------------------------------------------------------- next_step(); if (FLAGS_dump_output) { for (size_t i = 0; i < compiled_models.size(); i++) { std::vector> output_info = compiled_models[i]->outputs(); // For multi-outputs: Sort to ensure the order of each tensor dump aligns with the ground truth files std::sort(output_info.begin(), output_info.end(), CompareOutputNodeNames); const auto& output_tensors_map = output_tensors[i]; // A flag regarding whether we can dump output tensor in a text file due to unsupported layout. // This flag is set at first during dumping. bool can_dump_txt = true; bool can_dump_layout_info_in_txt = true; // dump output tensor as bin, which can be loaded using Python Numpy std::regex pattern("\\{batch\\}"); std::string results_bin_file_name = output_dir + "result_{batch}.bin"; // dump output tensor as text // backward compatibility support for old regtests that used only one graph std::string results_txt_file_name = output_dir + "result.txt"; std::string results_boundaries_file_name = output_dir + "result_tensor_boundaries.txt"; // dump inference arguments and metadata as JSON std::string results_meta_file_name = output_dir + "result_meta.json"; if (compiled_models.size() > 1) { results_bin_file_name = output_dir + topology_names[i] + "_result_{batch}.bin"; results_txt_file_name = output_dir + topology_names[i] + "_result.txt"; results_boundaries_file_name = output_dir + topology_names[i] + "_result_tensor_boundaries.txt"; results_meta_file_name = output_dir + topology_names[i] + "_result_meta.json"; } slog::info << "Dumping result of " << topology_names[i] << " to " << results_txt_file_name << slog::endl; slog::info << "Dumping per-batch result (raw output) of " << topology_names[i] << " to " << results_bin_file_name << slog::endl; slog::info << "Dumping inference meta data of " << topology_names[i] << " to " << results_meta_file_name << slog::endl; std::ofstream result_txt_file(results_txt_file_name); std::ofstream results_boundaries(results_boundaries_file_name); std::ofstream result_meta_file(results_meta_file_name); dla_benchmark::InferenceMetaData result_metadata; result_metadata.input_files = multi_input_files.at(i); // all input files in -i result_metadata.groundtruth_loc = FLAGS_groundtruth_loc; result_metadata.batch_size = FLAGS_batch_size; result_metadata.niter = niter; result_metadata.nireq = nireq; result_metadata.model_input_info = input_infos[i]; dla_benchmark::OutputsInfoVec model_output_info; uint32_t current_lines = 1; size_t max_allowed_megabytes_to_dump = FLAGS_max_output_file_size; for (uint32_t batch = 0; batch < num_batches; batch++) { std::string per_batch_results_bin_file_name = std::regex_replace(results_bin_file_name, pattern, std::to_string(batch)); std::ofstream per_batch_results_bin_file(per_batch_results_bin_file_name, std::ios::binary); for (const auto& item : output_info) { auto tensor = output_tensors_map.at(item.get_any_name()).at(batch); unsigned int output_size = tensor.get_size() / batch_size; const ov::Layout& layout = ov::layout::get_layout(item); const auto& shape = tensor.get_shape(); const std::string& name = item.get_any_name(); size_t total_bytes_to_dump = tensor.get_size() * niter * sizeof(float); if (can_dump_txt) { // if we cannot dump as a text file, we set can_dump_txt flag to false and write the one-time message if (total_bytes_to_dump > max_allowed_megabytes_to_dump * BYTE_TO_MEGABYTE) { can_dump_txt = false; std::string msg = "Output tensor (" + std::to_string(total_bytes_to_dump / BYTE_TO_MEGABYTE) + " MB) " "is too large to dump. Change environmental variable MAX_DUMP_OUTPUT_TXT (default " + std::to_string(FLAGS_max_output_file_size) + " MB) to allow dumping larger tensors"; slog::warn << msg << slog::endl; result_txt_file << msg; } else { if (can_dump_layout_info_in_txt && shape.size() != 2 && shape.size() != 4 && shape.size() != 5) { can_dump_layout_info_in_txt = false; slog::warn << "Output data tensor of rank that is not 2, 4 or 5. layout info will not be dumped in " << "result.txt." << slog::endl; } // Otherwise, dump text and write to the result_tensor_boundaries.txt with additional information // about the result.txt file results_boundaries << name << ": Line " << current_lines << " to " << "line " << current_lines + output_size - 1 << std::endl; results_boundaries << name << " output layout: " << layout.to_string() << std::endl; results_boundaries << name << " output dimension:"; for (unsigned int dim = 0; dim < shape.size(); dim++) { results_boundaries << " " << shape[dim]; } results_boundaries << std::endl; current_lines = current_lines + output_size; DumpResultTxtFile(tensor, item, output_size, result_txt_file); } } DumpResultBinFile(tensor, per_batch_results_bin_file); if (batch == 0) { // all batches should have the same output info dla_benchmark::OutputInfo output_info; output_info.name = name; output_info.shape = shape; model_output_info.push_back(output_info); } } per_batch_results_bin_file.close(); } result_metadata.model_output_info = model_output_info; DumpResultMetaJSONFile(result_metadata, result_meta_file); result_txt_file.close(); results_boundaries.close(); result_meta_file.close(); } const std::string throughput_file_name = output_dir + "throughput_report.txt"; std::ofstream throughput_file; throughput_file.open(throughput_file_name); throughput_file << "Throughput : " << total_fps << " fps" << std::endl; throughput_file << "Batch Size : " << batch_size << std::endl; throughput_file << "Graph number : " << compiled_models.size() << std::endl; throughput_file << "Num Batches : " << num_batches << std::endl; throughput_file.close(); // Append throughput to dataset // Check both gz and non gz versions std::string dataset_gz_file_name = "data.csv.gz"; append_value_if_incomplete_to_csv(dataset_gz_file_name, ip_fps); std::string dataset_file_name = "data.csv"; append_value_if_incomplete_to_csv(dataset_file_name, ip_fps); } // Calculate top 1, top 5 results if (FLAGS_groundtruth_loc != "") { auto groundtruth_files = split(FLAGS_groundtruth_loc, MULTIGRAPH_SEP); for (size_t i = 0; i < compiled_models.size(); i++) { // This flag `FLAGS_enable_object_detection_ap` enables accuracy checking subroutine that // gives the mAP and COCO AP scores. These scores are two of the main detection evaluation // metrics used in the Common Objects in Context contest, https://cocodataset.org/#detection-eval. std::vector> output_info = compiled_models[i]->outputs(); // For multi-outputs: Sort to ensure the order of each tensor dump aligns with the ground truth files std::sort(output_info.begin(), output_info.end(), CompareOutputNodeNames); // Run the default top-1, top-5 evaluation routine if AP scores are not required. if (!FLAGS_enable_object_detection_ap) { if (groundtruth_files.size() <= i) { slog::warn << "Missing ground truth file for " << topology_names[i] << "! SKIPPED" << slog::endl; continue; // Print warnings for all missing ground truth graphs; } slog::info << "Comparing ground truth file " << groundtruth_files[i] << " with network " << topology_names[i] << slog::endl; // captures the results in higher precision for accuracy analysis std::vector results; const auto& output_tensors_map = output_tensors[i]; for (uint32_t batch = 0; batch < num_batches; batch++) { for (unsigned int img = 0; img < batch_size; img++) { for (const auto& item : output_info) { auto tensor = output_tensors_map.at(item.get_any_name()).at(batch); auto tensor_data = tensor.data(); unsigned int output_size = tensor.get_size() / batch_size; size_t offset = img * output_size; for (unsigned int j = 0; j < output_size; j++) { results.push_back(tensor_data[j + offset]); } } } } bool passed = TopResultsAnalyser::get_top_results(groundtruth_files[i], results, batch_size * num_batches); if (passed) { slog::info << "Get top results for \"" << topology_names[i] << "\" graph passed" << slog::endl; } else { // return 4 indicates that the accuracy of the result was below the threshold return_code = 4; } } else { // Runs the accuracy checking routine if AP scores are required. set_runtime(FLAGS_yolo_version, FLAGS_niter, batch_size_flag, FLAGS_i, FLAGS_groundtruth_loc); std::pair res = validate_yolo_wrapper(output_tensors[i], output_info, multi_input_files.at(0)); std::cout << std::endl; slog::info << "Batch metrics results:" << slog::endl; std::cout << "Detection - mAP@0.5: " << std::setprecision(6) << res.first * 100 << "%" << std::endl; std::cout << "Detection - mAP@0.5:0.95: " << std::setprecision(6) << res.second * 100 << "%" << std::endl; } } } // Output Debug Network Info if COREDLA_TEST_DEBUG_NETWORK is set ReadDebugNetworkInfo(core); if (FLAGS_report_lsu_counters) { PrintLSUCounterInfo(core); } if (return_code) return return_code; } catch (const std::exception& ex) { slog::err << ex.what() << slog::endl; if (statistics) { statistics->addParameters(StatisticsReport::Category::EXECUTION_RESULTS, { {"Error during dla_benchmark: ", ex.what()}, }); statistics->dump(); } return 3; } return 0; // Bypass long function lint check // NOLINTNEXTLINE(readability/fn_size) }