// Copyright (C) 2018-2023 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//
// Description: Main file of the DLA benchmark. Entry point of DLA for just-in-time and ahead-of-time
//              execution, and for any other use case where DLA performs inference. This file is responsible
//              for the end-to-end flow of DLA: reading user input arguments, creating input tensors,
//              compiling models, running inference, and dumping results. The DLA benchmark is loosely based
//              on OpenVINO's sample benchmark app. For future OpenVINO uplifts, their sample app is a good
//              place to start.
//              Ref: [openvinotoolkit/openvino › samples/cpp/benchmark_app/main.cpp]

#include <algorithm>
#include <chrono>
#include <cstddef>
#include <map>
#include <memory>
#include <stdexcept>
#include <string>
#include <utility>
#include <vector>
#if defined(_WIN32) || defined(_WIN64)
#include <io.h>
#define NOMINMAX
#include <Windows.h>
#else
#include <dirent.h>
#include <unistd.h>
#endif
#include <stdio.h>
#include <sys/stat.h>
#include <fstream>
#include <regex>

#include <samples/args_helper.hpp>
#include <samples/common.hpp>
#include <samples/slog.hpp>

// DLA utils
#include "dla_stl_utils.h"
#include "dla_defines.h"

// DLA benchmark
#include "average_precision.hpp"
#include "dla_benchmark.hpp"
#include "dla_plugin_config.hpp"
#include "infer_request_wrap.hpp"
#include "inputs_filling.hpp"
#include "progress_bar.hpp"
#include "statistics_report.hpp"
#include "top1_top5.hpp"
#include "utils.hpp"

using DebugNetworkData = std::map<std::string, uint64_t>;
using LSUCounterData   = std::map<std::string, uint64_t>;

static const size_t progressBarDefaultTotalCount = 1000;

// Get value from env variable named 'name', if it exists.
// If not, returns provided default value.
template <class T>
T GetEnvOrDefault(const char* name, T default_value) {
  char* str_val = std::getenv(name);
  T result = default_value;
  if (str_val != NULL) {
    std::stringstream ss;
    ss << str_val;
    ss >> result;
  }
  return result;
}
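// Illustrative usage (the variable and environment-variable names below are
// hypothetical, not ones the benchmark defines):
//   size_t nthreads = GetEnvOrDefault<size_t>("DLA_BENCHMARK_NTHREADS", 4);
// returns the parsed value of the environment variable when it is set,
// otherwise the provided default (4 here).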

bool ExistsTest(const std::string& name) {
  struct stat buffer;
  return (stat(name.c_str(), &buffer) == 0);
}

bool isFile(const std::string& path) {
#if defined(_WIN32) || defined(_WIN64)
  std::cout << "Windows-specific implementation for checking if something is a file" << std::endl;
  // Windows-specific implementation
  DWORD fileAttr = GetFileAttributesA(path.c_str());
  if (fileAttr == INVALID_FILE_ATTRIBUTES) {
    // The path does not exist or an error occurred.
    return false;
  }
  // Check if it's not a directory.
  return !(fileAttr & FILE_ATTRIBUTE_DIRECTORY);
#else
  // UNIX-specific implementation
  struct stat buffer;
  if (stat(path.c_str(), &buffer) == 0) {
    return S_ISREG(buffer.st_mode);
  }
  return false;
#endif
}

// This function appears in dla_aot_splitter/src/main.cpp too
bool DirOpenTest(const std::string& name) {
#if (!defined(_WIN32) && !defined(_WIN64))
  // If we can open the directory then return true
  DIR* dp = opendir(name.c_str());
  if (dp != nullptr) {
    closedir(dp);
    return true;
  }
#endif  // !_WIN32 && !_WIN64
  struct stat sb;
  if (stat(name.c_str(), &sb) == 0) {
    if ((sb.st_mode & S_IFMT) != S_IFREG) {
      slog::err << "File " << name << " cannot be opened!" << slog::endl;
      throw std::logic_error("File cannot be opened!");
    }
  }
  return true;
}

// Define a custom comparison function to sort based on ASCII names
bool CompareOutputNodeNames(const ov::Output<const ov::Node>& node1, const ov::Output<const ov::Node>& node2) {
  return node1.get_any_name() < node2.get_any_name();
}

// copy arguments into a new array to split the '-i=<arg>' into
// two arguments (i.e. '-i' and '<arg>') to work around a bug in the
// parseInputFilesArguments function where it doesn't recognize
// the -i=<arg> format
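// e.g. {"dla_benchmark", "-i=/path/to/images"} is rewritten to
// {"dla_benchmark", "-i", "/path/to/images"} before the arguments are handed to gflags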
void ParseCommandLine(int argc, char** argv) {
  int num_args = argc;
  // allocate enough memory in case we need to split the -i argument into two
  char** arguments = new char*[num_args + 1];
  for (int i = 0, j = 0; j < argc; ++i, ++j) {
    if (strstr(argv[j], "-i=")) {
      // number of arguments will increase by one after splitting
      num_args++;
      arguments[i] = new char[3];
      strcpy(arguments[i++], "-i");
      // copy the rest of the argument (i.e. post "-i=")
      arguments[i] = new char[strlen(argv[j]) - 2];
      strcpy(arguments[i], argv[j] + 3);
      continue;
    }
    arguments[i] = new char[strlen(argv[j]) + 1];
    strcpy(arguments[i], argv[j]);
  }
  // the parse function modifies the arguments pointer, so we need to keep
  // a copy of the original pointer value to delete it properly
  char** orig_arg_ptr = arguments;
  gflags::ParseCommandLineNonHelpFlags(&num_args, &arguments, true);
  // delete the allocated memory
  for (int i = 0; i < num_args; ++i) {
    delete[] orig_arg_ptr[i];
  }
  delete[] orig_arg_ptr;
}

bool CheckAndSetPluginsPath(const char* coredla_root) {
  // plugins_xml_file should probably be removed in the future
  if (!FLAGS_plugins_xml_file.empty()) {
    FLAGS_plugins = FLAGS_plugins_xml_file;
    slog::warn << "====================================================================" << slog::endl;
    slog::warn << "Warning: -plugins_xml_file option is deprecated, please use -plugins." << slog::endl;
    slog::warn << "====================================================================" << slog::endl;
  }

  const char* coredla_work = std::getenv("COREDLA_WORK");
  std::string coredla_root_str = coredla_root;
  if (FLAGS_plugins.empty()) {
    if (coredla_work == nullptr) {
      FLAGS_plugins = coredla_root_str + "/runtime/plugins.xml";
    } else {
      std::string coredla_work_str = coredla_work;
      FLAGS_plugins = coredla_work_str + "/runtime/plugins.xml";
    }

    if (ExistsTest(FLAGS_plugins)) {
      slog::info << "Using default plugins xml file - " << FLAGS_plugins << slog::endl;
      return true;
    }
  }

  if (ExistsTest(FLAGS_plugins) && isFile(FLAGS_plugins)) {
    slog::info << "Using custom plugins xml file - " << FLAGS_plugins << slog::endl;
    return true;
  }
  // Check whether the user wants a shortcut to the software emulation xml file when the given path does not exist
  if (FLAGS_plugins.find("emulation") != std::string::npos) {
    // Potential paths for the plugins_emulation.xml file
    std::string deployed_loc_plugins = coredla_root_str + "/bin/plugins_emulation.xml";
    std::string developer_loc_plugins = coredla_root_str + "/build/coredla/dla/bin/plugins_emulation.xml";

    if (ExistsTest(deployed_loc_plugins))
      FLAGS_plugins = deployed_loc_plugins;
    else if (ExistsTest(developer_loc_plugins))
      FLAGS_plugins = developer_loc_plugins;
  } else {
    // if user didn't specify emulation and user did not pass any xml file, raise an error
    throw std::invalid_argument("Invalid argument for -plugins. Use 'emulation' or a path to custom xml file");
  }

  if (ExistsTest(FLAGS_plugins)) {
    slog::info << "Using custom emulation xml file - " << FLAGS_plugins << slog::endl;
    return true;
  }

  return false;
}

bool ParseAndCheckCommandLine(int argc, char* argv[], size_t& net_size) {
  // ---------------------------Parsing and validating input arguments--------------------------------------
  slog::info << "Parsing input parameters" << slog::endl;

  // Check for any flags that are missing their preceding dashes
  // GFlags quietly ignores any flags missing their dashes, which can cause
  // dla_benchmark to run with settings other than what the user intended

  // GFlags supports two different styles of flag:
  // 1. --<flag>
  // 2. -<flag>
  // It also supports two different ways of specifying values for flags which
  // take values:
  // 1. --<flag>=<value>
  // 2. --<flag> <value>
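  // For example, gflags accepts all of the following spellings as equivalent
  // for a value-taking flag such as -niter:
  //   --niter=8   --niter 8   -niter=8   -niter 8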

  // If we are not expecting a flag, we are expecting a value for the
  // preceding flag
  bool expecting_flag = true;
  // Start at 1 to skip the command itself
  for (int i = 1; i < argc; i++) {
    if (expecting_flag) {
      // A flag is always denoted by the first char being '-'
      if (argv[i][0] != '-') {
        slog::err << "Argument " << argv[i] << " is invalid. You"
                  << " may have forgotten a preceding '-'." << slog::endl;
        throw std::logic_error("One or more invalid arguments");
      }

      char* flag_name_start = (argv[i][1] == '-') ? &argv[i][2] : &argv[i][1];
      std::string flag_name;

      gflags::CommandLineFlagInfo flag_info;
      if (strstr(flag_name_start, "=")) {
        flag_name = std::string(flag_name_start, size_t(strstr(flag_name_start, "=") - flag_name_start));
      } else {
        flag_name = std::string(flag_name_start);
      }

      // We expect a flag in the next argv if the current flag is a bool,
      // because bool flags do not take a value.
      // If GetCommandLineFlagInfo returns false, we assume the current
      // flag is a boolean because boolean flags can be specified as
      // -no<flag>, which is equivalent to -<flag>=false, or the flag
      // simply being omitted. However, "no<flag>" is not recognized by
      // GetCommandLineFlagInfo.
      // Therefore, if the name is not recognized either the flag is a
      // boolean flag or doesn't exist. In the latter case, gflags errors
      // when we call ParseCommandLine so we can assume here it's a bool.
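      // For example, the boolean -bgr flag may be given as "-bgr" or "-nobgr",
      // and neither form consumes a separate value argument.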
      if (!GetCommandLineFlagInfo(flag_name.c_str(), &flag_info) || strstr(argv[i], "=") || flag_info.type == "bool") {
        expecting_flag = true;
      } else {
        expecting_flag = false;
      }
    } else {
      // If we were expecting a value, it doesn't matter what it is:
      // gflags will check that all values are of the correct type, and
      // dla_benchmark checks whether the values received are sane
      expecting_flag = true;
    }
  }

  ParseCommandLine(argc, argv);

  if (FLAGS_help || FLAGS_h) {
    ShowUsage();
    // CoreDLA: Version 2020.3 of OpenVINO assumes that the PAC board with OPAE on it
    // is an OpenCL/DLAv1 device.  Since it is not, it then errors out when the device
    // does not respond as expected to the OpenCL query.
    // showAvailableDevices();
    std::cout << "\n";
    return false;
  }

  if (FLAGS_hidden_help) {
    PrintHiddenHelp();
    return false;
  }

  if (FLAGS_cm.empty()) {
    std::string network_file_flag;
    if (!FLAGS_m.empty()) {
      if (!FLAGS_network_file.empty()) {
        throw std::invalid_argument(
            "Both --network-file and -m are specified. Please only use one of the two arguments.");
      }
      network_file_flag = FLAGS_m;
    } else if (!FLAGS_network_file.empty()) {
      network_file_flag = FLAGS_network_file;
    } else {
      throw std::logic_error("Model is required but not set. Please set -m option.");
    }

    std::vector<std::string> m_paths = split(network_file_flag, MULTIGRAPH_SEP);
    net_size = m_paths.size();
    slog::info << "Found " << net_size << " graph" << (net_size == 1 ? "" : "s") << slog::endl;
    for (auto& m_path : m_paths) {
      if (!ExistsTest(m_path)) {
        slog::err << "network file: " << m_path << " doesn't exist. Please provide a valid path with -m." << slog::endl;
        throw std::logic_error("Model file path does not exist.");
      }
    }
  } else {
    std::vector<std::string> m_paths = split(FLAGS_cm, MULTIGRAPH_SEP);
    net_size = m_paths.size();
    slog::info << "Found " << net_size << " compiled graph" << (net_size == 1 ? "" : "s") << slog::endl;
    for (auto& m_path : m_paths) {
      if (!ExistsTest(m_path)) {
        slog::err << "compiled model file: " << FLAGS_cm << " doesn't exist. Please provide a valid path with -cm."
                  << slog::endl;
        throw std::logic_error("Compiled model file path does not exist.");
      }
    }
  }

  if (FLAGS_api != "async" && FLAGS_api != "sync") {
    throw std::logic_error("Incorrect API. Please set -api option to `sync` or `async` value.");
  }

  if (FLAGS_niter <= 0) {
    throw std::logic_error("-niter is a required flag and its value must be positive");
  }

  const char* coredla_root = std::getenv("COREDLA_ROOT");
  if (coredla_root == nullptr) {
    slog::err << "ERROR: COREDLA_ROOT environment variable is not set." << slog::endl;
    throw std::logic_error("Please set up correct environment variables first");
  }

  if (!CheckAndSetPluginsPath(coredla_root)) {
    slog::err << "plugins_xml file: " << FLAGS_plugins_xml_file << " doesn't exist. Please provide a valid path."
              << slog::endl;
    throw std::logic_error("plugins_xml file path does not exist.");
  }

  // Checks required arguments for the mAP calculation subroutine.
  if (FLAGS_enable_object_detection_ap) {
    if (!FLAGS_yolo_version.size() || !is_yolo_supported(FLAGS_yolo_version)) {
      slog::err << "Please specify the version of your YOLO graph by setting the -yolo_version option to "
                   "`yolo-v3-tiny-tf` or `yolo-v3-tf` value."
                << slog::endl;
      throw std::logic_error("Incorrect YOLO version.");
    }
  }

  // Checks if output directory exists and can be opened
  if (!FLAGS_output_dir.empty()) {
    if (!ExistsTest(FLAGS_output_dir)) {
      slog::err << "Specified output directory: " << FLAGS_output_dir << " does not exist" << slog::endl;
      throw std::logic_error("Output directory does not exist");
    }
    // Test whether the path can be opened if it's a directory
    DirOpenTest(FLAGS_output_dir);
  }

  return true;
}

static void next_step(const std::string additional_info = "") {
  static size_t step_id = 0;
  static const std::map<size_t, std::string> step_names = {{1, "Parsing and validating input arguments"},
                                                           {2, "Loading OpenVINO Runtime"},
                                                           {3, "Setting device configuration"},
                                                           {4, "Reading the Intermediate Representation network"},
                                                           {5, "Resizing network to match image sizes and given batch"},
                                                           {6, "Configuring input of the model"},
                                                           {7, "Loading the model to the device"},
                                                           {8, "Setting optimal runtime parameters"},
                                                           {9, "Creating infer requests and preparing input tensors"},
                                                           {10, "Measuring performance"},
                                                           {11, "Dumping statistics report"},
                                                           {12, "Dumping the output values"}};

  step_id++;
  if (step_names.count(step_id) == 0)
    THROW_IE_EXCEPTION << "Step ID " << step_id << " is out of total steps number " << step_names.size();

  std::cout << "[Step " << step_id << "/" << step_names.size() << "] " << step_names.at(step_id)
            << (additional_info.empty() ? "" : " (" + additional_info + ")") << std::endl;
}

template <typename T>
T GetMedianValue(const std::vector<T>& vec) {
  std::vector<T> sorted_vec(vec);
  std::sort(sorted_vec.begin(), sorted_vec.end());
  return (sorted_vec.size() % 2 != 0)
             ? sorted_vec[sorted_vec.size() / 2ULL]
             : (sorted_vec[sorted_vec.size() / 2ULL] + sorted_vec[sorted_vec.size() / 2ULL - 1ULL]) /
                   static_cast<T>(2.0);
}
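// For example, GetMedianValue<double>({3.0, 1.0, 2.0}) returns 2.0, and
// GetMedianValue<double>({4.0, 1.0, 3.0, 2.0}) returns 2.5 (the average of the
// two middle elements after sorting).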

void ReadDebugNetworkInfo(ov::Core core) {
  if (FLAGS_debug_network) {
    // On hardware timeout exception, fetch Debug CSR values from all modules attached to the Debug Network
    std::vector<DebugNetworkData> debug_csr_return =
        core.get_property("FPGA", "COREDLA_DEBUG_NETWORK_INFO").as<std::vector<DebugNetworkData>>();
    slog::info << "Dumping Debug Network profiling counters" << slog::endl;
    for (auto i = 0U; i < debug_csr_return.size(); i++) {
      std::cout << "---------- CoreDLA instance " << i << " ----------" << std::endl;
      // Print debug info for all instances
      for (auto& instance_csr_return : debug_csr_return[i]) {
        std::cout << instance_csr_return.first << ": " << instance_csr_return.second << std::endl;
      }
    }
  }
}

void PrintLSUCounterInfo(ov::Core core) {
  std::vector<LSUCounterData> lsu_counter_vec =
      core.get_property("FPGA", "COREDLA_LSU_ACCESS_COUNT").as<std::vector<LSUCounterData>>();
  slog::info << "Dumping LSU memory access counters" << slog::endl;
  for (auto i = 0U; i < lsu_counter_vec.size(); i++) {
    std::cout << "---------- CoreDLA instance " << i << " ----------" << std::endl;
    for (const auto& entry : lsu_counter_vec.at(i)) {
      std::cout << entry.first << ": " << entry.second << std::endl;
    }
  }
}

// Returns true if last char of csv is a comma
bool is_last_char_comma(FILE* file) {
  if (file == nullptr) return false;

  int i = -1;
  std::vector<char> white_space_chars = {'\n', ' ', '\t', '\r', '\f', '\v'};
  char last_char[1];
  do {
    if (std::fseek(file, i, SEEK_END) != 0) {
      return false;
    }
    if (std::fread(last_char, 1, 1, file) == 0) {
      return false;
    }
    i--;
  } while (std::count(white_space_chars.begin(), white_space_chars.end(), last_char[0]) != 0);

  return last_char[0] == ',';
}
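// For example, a csv whose contents end in "latency," (possibly followed by
// trailing whitespace or newlines) yields true, while one ending in
// "latency,0.95\n" yields false.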

bool fileExists(std::string& path) {
  struct stat buffer;
  return (stat(path.c_str(), &buffer) == 0);
}

void append_value_if_incomplete_to_csv(std::string path, double value) {
  try {
    if (!fileExists(path)) {
      return;
    }

    FILE* data_file = fopen(path.c_str(), "rb");
    if (data_file == nullptr) {
      return;
    }
    bool is_comma = is_last_char_comma(data_file);
    fclose(data_file);

    if (is_comma) {
      FILE* append_file = fopen(path.c_str(), "a");
      if (append_file == nullptr) {
        return;
      }
      fprintf(append_file, "%f\n", value);
      fclose(append_file);
    }
  } catch (...) {
    return;
  }
}
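// Illustrative behaviour (the column value below is made up): a csv whose last
// row ends in "throughput (fps)," is completed to "throughput (fps),123.456789"
// by append_value_if_incomplete_to_csv(path, 123.456789); a file that already
// ends with a complete row is left untouched.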

/**
 * @brief The entry point of the dla benchmark
 */
int main(int argc, char* argv[]) {
  std::shared_ptr<StatisticsReport> statistics;
  try {
    // Declaring the CompiledModel object as a pointer to work around the segfault
    // that occurs when destructing the object. Now that it's declared as a pointer
    // the compiler won't automatically call the destructor of the object at the end
    // of this scope and we won't delete the allocated memory either
    std::vector<ov::CompiledModel*> compiled_models;
    size_t net_size = 0;  // parse the size of networks for arguments check

    size_t return_code = 0;  // universal return code, return this value after dumping out Debug info

    // ----------------- 1. Parsing and validating input arguments -------------------------------------------------
    next_step();

    if (!ParseAndCheckCommandLine(argc, argv, net_size)) {
      return 0;
    }

    bool is_model_compiled = !FLAGS_cm.empty();
    if (is_model_compiled) {
      slog::info << "Model is compiled" << slog::endl;
    }

    std::string arch_file_flag;
    if (!FLAGS_arch_file.empty()) {
      if (!FLAGS_arch.empty()) {
        throw std::invalid_argument(
            "Both --arch and -arch_file are specified. Please only use one of the two arguments.");
      }
      arch_file_flag = FLAGS_arch_file;
    } else if (!FLAGS_arch.empty()) {
      arch_file_flag = FLAGS_arch;
    }

    bool flag_b_default = gflags::GetCommandLineFlagInfoOrDie("b").is_default;
    bool flag_batch_size_default = gflags::GetCommandLineFlagInfoOrDie("batch_size").is_default;

    size_t batch_size_flag;
    if (!flag_b_default) {
      if (!flag_batch_size_default) {
        throw std::invalid_argument(
            "Both --batch-size and -b are specified. Please only use one of the two arguments.");
      }
      batch_size_flag = FLAGS_b;
    } else {
      batch_size_flag = FLAGS_batch_size;
    }

    if (batch_size_flag > 10000 || batch_size_flag <= 0) {
      throw std::invalid_argument(
          "Batch size is too big (>10000) or not a postive number (<=0). Specify the batch size within the specified "
          "range.");
    }

    std::string network_file_flag;
    if (!FLAGS_m.empty()) {
      if (!FLAGS_network_file.empty()) {
        throw std::invalid_argument(
            "Both --network-file and -m are specified. Please only use one of the two arguments.");
      }
      network_file_flag = FLAGS_m;
    } else if (!FLAGS_network_file.empty()) {
      network_file_flag = FLAGS_network_file;
    }

    // langsu: ideally use boost to create a sub-folder for ddrfree files
    // but ed4 toolchain doesn't have boost yet.
    std::string output_dir;
    std::string parameter_rom_output_dir;
    std::string separator = dla::util::path_separator;
    if (!FLAGS_output_dir.empty()) {
      output_dir = FLAGS_output_dir + separator;
      parameter_rom_output_dir = output_dir;
    } else {
      output_dir = "." + separator;
      parameter_rom_output_dir = output_dir;
    }

    // The set of arguments printed is meant to be a useful summary to the
    // user, rather than all of the arguments to dla_benchmark
    slog::info << "Printing summary of arguments being used by dla_benchmark" << slog::endl
               << "API (-api) ........................... " << FLAGS_api << slog::endl
               << "Device (-d) .......................... " << FLAGS_d << slog::endl
               << "Batch size (-b) ...................... " << batch_size_flag << slog::endl
               << (!FLAGS_cm.empty() ? "Compiled model (-cm) ................. "
                                     : "Model (-m) ........................... ")
               << (!FLAGS_cm.empty() ? FLAGS_cm : network_file_flag) << slog::endl
               << "Num iterations (-niter) .............. "
               << (FLAGS_niter > 0 ? std::to_string(FLAGS_niter) : "Not specified") << slog::endl
               << "Input images directory (-i) .......... "
               << (!FLAGS_i.empty() ? FLAGS_i : "Not specified, will use randomly-generated images") << slog::endl
               << "Num CPU threads (-nthreads) .......... "
               << (FLAGS_nthreads > 0 ? std::to_string(FLAGS_nthreads) : "Not specified") << slog::endl
               << "Architecture file (-arch_file) ....... " << arch_file_flag << slog::endl
               << "Num inference requests (-nireq) ...... "
               << (FLAGS_nireq > 0 ? std::to_string(FLAGS_nireq) : "Not specified") << slog::endl
               << "Plugins file (-plugins) ..... " << FLAGS_plugins << slog::endl
               << "Groundtruth file (-groundtruth_loc) .. "
               << (!FLAGS_groundtruth_loc.empty() ? FLAGS_groundtruth_loc : "Not specified") << slog::endl
               << "Reverse input image channels (-bgr) .. " << (FLAGS_bgr ? "True" : "False") << slog::endl
               << "EA features " << (FLAGS_enable_early_access ? "enabled." : "disabled.") << slog::endl;

    if (FLAGS_save_run_summary) {
      std::vector<gflags::CommandLineFlagInfo> flags;
      StatisticsReport::Parameters command_line_arguments;
      gflags::GetAllFlags(&flags);

      for (auto& flag : flags) {
        if (!flag.is_default) {
          command_line_arguments.push_back({flag.name, flag.current_value});
        }
      }

      if (!FLAGS_pcsort.empty() &&
          (FLAGS_pcsort != "simple_sort" && FLAGS_pcsort != "sort" && FLAGS_pcsort != "no_sort")) {
        slog::err << "Invalid -pcsort option: " << FLAGS_pcsort << ". Please use one of sort, simple_sort, no_sort."
                  << slog::endl;
        return 1;
      }

      statistics =
          std::make_shared<StatisticsReport>(StatisticsReport::Config{FLAGS_save_run_summary, FLAGS_report_folder});
      statistics->addParameters(StatisticsReport::Category::COMMAND_LINE_PARAMETERS, command_line_arguments);
    }

    /** This vector stores paths to the processed images **/
    auto multi_input_files = VectorMap<std::vector<std::string>>(
        SplitMultiInputFilesArguments(net_size),  // get input directory list
        [&](const std::vector<std::string>& input_args) mutable {
          std::vector<std::string> files;
          for (auto& input_arg : input_args) {
            // Test if the path exists
            if (!ExistsTest(input_arg)) {
              slog::err << "Specified image path: " << input_arg << " does not exist" << slog::endl;
              throw std::logic_error("Image path does not exist");
            }
            // Test whether the path can be opened if it's a directory
            DirOpenTest(input_arg);
            readInputFilesArguments(files, input_arg);
          }
          return files;
        });

    if (multi_input_files.size() == 0) {
      // failed to read input files
      slog::err << "Failed to read input files" << slog::endl;
      return 1;
    }

    if (FLAGS_nstreams.empty()) {
      slog::warn << "-nstreams default value is determined automatically for a device. " << slog::endl;
      std::cout << "\tAlthough the automatic selection usually provides a reasonable performance, \n"
                << "\tbut it still may be non-optimal for some cases, for more information look at README."
                << std::endl;
    }

#ifdef DISABLE_JIT
    if (!network_file_flag.empty()) {
      slog::err << "Runtime compiled without support for Just-in-Time (JIT) execution!" << slog::endl
                << "Either specify a compiled model using -cm <compiled_model.bin> "
                << "or recompile the runtime without the -disable_jit flag." << slog::endl;
      return 1;
    }
#endif

    uint32_t num_batches = 1;

    // ----------------- 2. Loading OpenVINO Runtime/Inference Engine
    // -----------------------------------------------------------
    next_step();

    // Get optimal runtime parameters for device
    std::string device_name = FLAGS_d;
    if (is_model_compiled) {
      auto compiled_graph_paths = split(FLAGS_cm, MULTIGRAPH_SEP);  // separate each AOT file path
      for (auto& compiled_graph : compiled_graph_paths) {
        std::filebuf obj_file_buf;
        // There does not seem to be a way to get the device from the OpenVINO executable network
        // Instead we manually read through the xml header in the AOT graph to get the device name (an ugly hack
        // unfortunately)
        obj_file_buf.open(compiled_graph.c_str(), std::ios::in | std::ios::binary);
        std::istream obj_istream(&obj_file_buf);
        std::string xml_header, current_device;
        getline(obj_istream, xml_header);                               // retrieve xml header from AOT bin file
        if (xml_header.find("TARGET_FALLBACK") != std::string::npos) {  // uses hetero plugin
          int start_index = xml_header.find("TARGET_FALLBACK") + 24;
          int end_index = xml_header.find("</hetero_config>") - 3;
          current_device =
              "HETERO:" + xml_header.substr(start_index, end_index - start_index);  // get device from xml header
        } else {
          current_device = "FPGA";
        }
        if (device_name == "") {  // device flag not specified in AOT flow
          device_name = current_device;
        } else {
          if (current_device != device_name) {  // print error for non-matching devices
            throw std::logic_error(
                "The AOT file does not target the expected device.  "
                "The device specified to dla_benchmark using the -d flag must be the same as the "
                "device specified to dla_compiler using the --fplugin flag.");
          }
        }
      }
    } else {
      if (device_name == "") device_name = "CPU";  // default device for JIT flow is CPU
    }
    ov::Core core(FLAGS_plugins);

    if (device_name.find("CPU") != std::string::npos) {
      core.set_property("FPGA", {{DLIAPlugin::properties::cpu_used.name(), true}});
    }

    if (arch_file_flag != "" && device_name.find("FPGA") != std::string::npos) {
      core.set_property("FPGA", {{DLIAPlugin::properties::arch_path.name(), arch_file_flag}});
      if (!ExistsTest(arch_file_flag)) {
        slog::err << "architecture file: " << arch_file_flag << " doesn't exist. Please provide a valid path."
                  << slog::endl;
        throw std::logic_error("architecture file path does not exist.");
      }
      if (FLAGS_encryption_key != "") {
        core.set_property("FPGA", {{DLIAPlugin::properties::encryption_key.name(), FLAGS_encryption_key}});
      }
      if (FLAGS_encryption_iv != "") {
        core.set_property("FPGA", {{DLIAPlugin::properties::encryption_iv.name(), FLAGS_encryption_iv}});
      }
      // If the emulator is used, do not perform decryption of compiled results in the import step
      if (FLAGS_emulator_decryption) {
        core.set_property("FPGA", {{DLIAPlugin::properties::emulator_decryption.name(), CONFIG_VALUE(YES)}});
      }
      if (FLAGS_min_subgraph_layers < 1) {
        slog::err << "-min-subgraph-layers must be >= 1" << slog::endl;
        return 1;
      }
      core.set_property("FPGA", {{DLIAPlugin::properties::min_subgraph_layers.name(), FLAGS_min_subgraph_layers}});
    }

    if (device_name.find("CPU") != std::string::npos && !FLAGS_l.empty()) {
      // CPU extensions are loaded as a shared library and passed as a pointer to the base extension
      core.add_extension(FLAGS_l);
      slog::info << "CPU extensions loaded: " << FLAGS_l << slog::endl;
    }

    slog::info << "OpenVINO: " << ov::get_openvino_version() << slog::endl;
    slog::info << "Device info: " << core.get_versions(device_name) << slog::endl;

    // ----------------- 3. Setting device configuration -----------------------------------------------------------
    next_step();

    auto devices = ParseDevices(device_name);
    std::map<std::string, uint32_t> device_nstreams = ParseNStreamsValuePerDevice(devices, FLAGS_nstreams);
    for (auto& pair : device_nstreams) {
      auto key = std::string(pair.first + "_THROUGHPUT_STREAMS");
      std::vector<std::string> supported_config_keys =
          core.get_property(pair.first, METRIC_KEY(SUPPORTED_CONFIG_KEYS)).as<std::vector<std::string>>();
      if (std::find(supported_config_keys.begin(), supported_config_keys.end(), key) == supported_config_keys.end()) {
        throw std::logic_error(
            "Device " + pair.first + " doesn't support config key '" + key + "'! " +
            "Please specify -nstreams for correct devices in format  <dev1>:<nstreams1>,<dev2>:<nstreams2>");
      }
    }

    // pc is for CPU only at the moment
    bool perf_count = FLAGS_pc;
    std::string perf_count_sort = FLAGS_pcsort;
    for (auto& device : devices) {
      if (device == "CPU") {  // CPU supports few special performance-oriented keys
        if (perf_count || !perf_count_sort.empty()) {
          core.set_property("CPU", {{CONFIG_KEY(PERF_COUNT), CONFIG_VALUE(YES)}});
        }
        // limit threading for CPU portion of inference
        if (FLAGS_nthreads != 0)
          core.set_property(device, {{CONFIG_KEY(CPU_THREADS_NUM), std::to_string(FLAGS_nthreads)}});
        core.set_property(device, {{CONFIG_KEY(CPU_BIND_THREAD), FLAGS_pin}});
        // Set CPU to optimize throughput
        core.set_property(device, ov::hint::performance_mode(ov::hint::PerformanceMode::THROUGHPUT));
        // for CPU execution, more throughput-oriented execution via streams
        if (FLAGS_api == "async") {
          core.set_property(
              device,
              ov::streams::num(device_nstreams.count(device) > 0 ? ov::streams::Num(device_nstreams.at(device))
                                                                 : ov::streams::AUTO));
        }
        device_nstreams[device] = core.get_property(device, ov::streams::num);
      } else if (device == ("GPU")) {
        if (FLAGS_api == "async") {
          core.set_property(
              device,
              ov::streams::num(device_nstreams.count(device) > 0 ? ov::streams::Num(device_nstreams.at(device))
                                                                 : ov::streams::AUTO));
        }
        device_nstreams[device] = core.get_property(device, ov::streams::num);
      }
    }

    auto double_to_string = [](const double number) {
      std::stringstream ss;
      ss << std::fixed << std::setprecision(4) << number;
      return ss.str();
    };
    auto get_total_ms_time = [](Time::time_point& start_time) {
      return std::chrono::duration_cast<ns>(Time::now() - start_time).count() * 0.000001;
    };
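    // e.g. an elapsed time of 2,500,000 ns is reported by get_total_ms_time as 2.5 (ms),
    // which double_to_string renders as "2.5000"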

    size_t batch_size = batch_size_flag;
    std::vector<std::string> topology_names;
    ov::element::Type precision = ov::element::undefined;
    // Vector stores which model (multigraph); InputsInfo is a map of input names and their respective
    // input information
    std::vector<dla_benchmark::InputsInfo> input_infos;
    if (!is_model_compiled) {
#ifndef DISABLE_JIT
      // We choose to ifdef out this block of code because it's more readable than
      // pulling the block in the "else" out using ifdefs
      // ----------------- 4. Reading the Intermediate Representation network ----------------------------------------
      next_step();

      LOG_AND_PRINT(Logger::INFO, "Loading network files\n");

      auto start_time_read = Time::now();
      // get list of graphs
      std::vector<std::shared_ptr<ov::Model>> models =
          VectorMap<std::shared_ptr<ov::Model>>(split(network_file_flag, MULTIGRAPH_SEP), [&](const std::string& m) {
            std::shared_ptr<ov::Model> model = core.read_model(m);
            // Assign rt info IMMEDIATELY when DLA benchmark reads the model.
            // Applying transformations or reshaping may change node names.
            // Mixed Precision is an EA only feature for 2024.2
            if (FLAGS_enable_early_access) {
              for (auto&& node : model->get_ops()) {
                if (dla::util::NodeTypeUsesPE(node->get_type_name())) {
                  node->get_rt_info()[DLA_PE_PRECISION_MODE] =
                      dla::util::ParseNodeForRTInfo(node->get_friendly_name(), DLA_PE_PRECISION_MODE);
                }
              }
            }
            printInputAndOutputsInfoShort(*model);
            return model;
          });

      auto duration_ms = double_to_string(get_total_ms_time(start_time_read));
      slog::info << "Read network(s) took " << duration_ms << " ms" << slog::endl;
      if (statistics)
        statistics->addParameters(StatisticsReport::Category::EXECUTION_RESULTS,
                                  {{"read network time (ms)", duration_ms}});

      // ----------------- 5. Resizing network to match image sizes and given batch ----------------------------------
      next_step();

      for (size_t i = 0; i < models.size(); i++) {
        const auto& model_inputs = std::const_pointer_cast<const ov::Model>(models[i])->inputs();
        bool reshape = false;
        input_infos.push_back(
            GetInputsInfo(batch_size, model_inputs, reshape, FLAGS_bin_data, FLAGS_mean_values, FLAGS_scale_values));
        if (reshape) {
          dla_benchmark::PartialShapes shapes = {};
          for (auto& item : input_infos.back()) shapes[item.first] = item.second.partial_shape;
          slog::info << "Reshaping model to batch: " << batch_size << slog::endl;
          models[i]->reshape(shapes);
        }
        topology_names.push_back(models[i]->get_friendly_name());
      }

      // ----------------- 6. Configuring input and output
      // ----------------------------------------------------------------------
      next_step();
      // Set input layouts for all models and their inputs
      size_t input_info_idx = 0;
      for (std::shared_ptr<ov::Model> model : models) {
        auto preproc = ov::preprocess::PrePostProcessor(model);
        const auto& inputs = model->inputs();
        for (size_t i = 0; i < inputs.size(); i++) {
          ov::preprocess::InputInfo& input_info = preproc.input(i);
          const size_t input_rank = inputs[i].get_partial_shape().size();
          const ov::Layout& layout = ov::Layout(dla::util::getTensorLayout(input_rank));
          const ov::element::Type_t type = input_infos[input_info_idx].at(inputs[i].get_any_name()).type;
          input_info.tensor().set_element_type(type).set_layout(layout);
        }

        const auto& outputs = model->outputs();
        for (size_t i = 0; i < outputs.size(); i++) {
          const size_t output_rank = outputs[i].get_partial_shape().size();
          const ov::Layout& layout = ov::Layout(dla::util::getTensorLayout(output_rank));
          preproc.output(i).tensor().set_element_type(ov::element::f32).set_layout(layout);
        }
        // Once the build() method is called, the pre(post)processing steps
        // for layout and precision conversions are inserted automatically
        model = preproc.build();
        input_info_idx++;
      }
      // ----------------- 7. Loading the model to the device --------------------------------------------------------
      next_step();

      // Get the value from the command line arguments (if the command line argument wasn't
      // used by the user the default value set in dla_benchmark.hpp will be used)
      int folding_option = FLAGS_folding_option;
      bool fold_preprocessing = FLAGS_fold_preprocessing;
      bool estimate_per_layer = FLAGS_estimate_per_layer_latencies;
      bool enable_early_access = FLAGS_enable_early_access;
      // TODO(arooney): Remove this once LT hang is fixed.
      bool multi_infer_req = false;
      if (FLAGS_nireq > 1 && FLAGS_api == "async") {
        multi_infer_req = true;
      }

      core.set_property("FPGA", {{DLIAPlugin::properties::folding_option.name(), std::to_string(folding_option)}});
      core.set_property("FPGA",
                        {{DLIAPlugin::properties::fold_preprocessing.name(), fold_preprocessing}});
      core.set_property("FPGA",
                        {{DLIAPlugin::properties::per_layer_estimation.name(), estimate_per_layer}});
      core.set_property("FPGA",
                        {{DLIAPlugin::properties::enable_early_access.name(), enable_early_access}});
      core.set_property("FPGA",
                        {{DLIAPlugin::properties::multiple_inferences.name(), multi_infer_req}});
      core.set_property("FPGA", {{DLIAPlugin::properties::streaming_input_pipe.name(), FLAGS_streaming_input_pipe}});

      auto start_time = Time::now();
      auto individual_start_time = Time::now();  // timer for each individual graph loading
      compiled_models = VectorMap<ov::CompiledModel*>(models, [&](std::shared_ptr<ov::Model> model) {
        // Apply Low Precision transformations to handle quantized graphs
        // Mohamed_I: currently, this only works if the entire graph fits on the FPGA
        // because the CPU plugin calls common_optimizations again which has some transformations
        // that cause the graph to fail (I suspect it's the ConvolutionMultiplyFusion, but I
        // cannot disable it from the CPU)

        bool FPGA_used = device_name.find("FPGA") != std::string::npos;
        bool CPU_used = device_name.find("CPU") != std::string::npos;

        ov::AnyMap config;
        config.emplace(DLIAPlugin::properties::cpu_used.name(), CPU_used);
        config.emplace(DLIAPlugin::properties::export_dir.name(), output_dir);
        config.emplace(DLIAPlugin::properties::parameter_rom_export_dir.name(), parameter_rom_output_dir);

        for (auto&& node : model->get_ops()) {
          if (std::string("FakeQuantize") == node->get_type_name()) {
            config.emplace(DLIAPlugin::properties::apply_low_precision_transforms.name(), true);
            if (CPU_used && FPGA_used) {
              std::cerr << "ERROR: Quantized graphs only supported through HETERO:FPGA or CPU." << std::endl;
              throw std::logic_error("HETERO:FPGA,CPU plugin is not supported for quantization.");
            }
          }
        }

        auto compiled_model = new ov::CompiledModel();
        *compiled_model = core.compile_model(model, device_name, config);
        duration_ms = double_to_string(get_total_ms_time(individual_start_time));
        individual_start_time = Time::now();
        slog::info << "Compile model ( " << model->get_friendly_name() << " ) took " << duration_ms << " ms"
                   << slog::endl;
        return compiled_model;
      });
      duration_ms = double_to_string(get_total_ms_time(start_time));
      slog::info << "Load network(s) took " << duration_ms << " ms" << slog::endl;
      if (statistics)
        statistics->addParameters(StatisticsReport::Category::EXECUTION_RESULTS,
                                  {{"load network time (ms)", duration_ms}});
#endif
    } else {
      next_step();
      slog::info << "Skipping the step for compiled network" << slog::endl;
      next_step();
      slog::info << "Skipping the step for compiled network" << slog::endl;
      next_step();
      slog::info << "Skipping the step for compiled network" << slog::endl;
      // ----------------- 7. Loading the model to the device --------------------------------------------------------
      next_step();
      auto compiled_graph_paths = split(FLAGS_cm, MULTIGRAPH_SEP);
      compiled_models = vectorMapWithIndex<ov::CompiledModel*>(
          split(FLAGS_cm, MULTIGRAPH_SEP),  // get a list of compiled graphs
          [&](const std::string& compiled_graph_path, size_t index) {
            std::stringstream generated_name;
            generated_name << "Graph_" << index;
            slog::info << "Importing model from " << compiled_graph_paths[index] << " to " << device_name << " as "
                       << generated_name.str() << slog::endl;
            auto start_time = Time::now();
            std::ifstream model_stream(compiled_graph_paths[index].c_str(), std::ios_base::in | std::ios_base::binary);
            if (!model_stream.is_open()) {
              throw std::runtime_error("Cannot open compiled model file: " + compiled_graph_paths[index]);
            }
            auto compiled_model = new ov::CompiledModel();
            core.set_property("FPGA",
                              {{DLIAPlugin::properties::streaming_input_pipe.name(), FLAGS_streaming_input_pipe}});
            // Import specific configs
            ov::AnyMap config;
            config.emplace(DLIAPlugin::properties::export_dir.name(), output_dir);
            config.emplace(DLIAPlugin::properties::parameter_rom_export_dir.name(), parameter_rom_output_dir);
            *compiled_model = core.import_model(model_stream, device_name, config);
            topology_names.push_back(generated_name.str());
            model_stream.close();
            printInputAndOutputsInfoShort(*compiled_model);
            auto duration_ms = double_to_string(get_total_ms_time(start_time));
            slog::info << "Import model took " << duration_ms << " ms" << slog::endl;
            if (statistics)
              statistics->addParameters(StatisticsReport::Category::EXECUTION_RESULTS,
                                        {{"import model time (ms)", duration_ms}});
            if (batch_size == 0) {
              batch_size = 1;
            }
            const auto& inputs = compiled_model->inputs();
            for (const auto& item : inputs) {
              const auto& shape = item.get_shape();
              if (shape[0] != batch_size) {
                slog::err << "Batch size of the compiled model is " << shape[0] << " and batch size provided is "
                          << batch_size << slog::endl;
                std::cout << "Set the same batch size = " << shape[0] << " when running the app" << std::endl;
                std::cout << "Or recompile model with batch size = " << batch_size << std::endl;
                exit(5);
              }
            }
            bool reshape_required = false;
            input_infos.push_back(GetInputsInfo(batch_size,
                                                compiled_model->inputs(),
                                                reshape_required,
                                                FLAGS_bin_data,
                                                FLAGS_mean_values,
                                                FLAGS_scale_values));
            return compiled_model;
          });
    }
    // ----------------- 8. Setting optimal runtime parameters -----------------------------------------------------
    next_step();

    // Number of requests
    uint32_t nireq = FLAGS_nireq;
#if defined(__arm__) | defined(__aarch64__)
    // In the OpenVINO 2022.3 Arm plugin, when an AOT graph is compiled on CPU and dla_benchmark has -nireq > 1
    // the program will be killed. We force nireq = 1 for HETERO:CPU graph only.
    // Note: -d CPU doesn't need to be checked for AOT because dlac does not support -fplugin CPU.
    if (device_name == "HETERO:CPU" && nireq > 1) {
      slog::warn << "-nireq > 1 is not supported for HETERO:CPU graph. Forcing -nireq = 1" << slog::endl;
      nireq = 1;
    }

#endif

    if (nireq == 0) {
      if (FLAGS_api == "sync") {
        nireq = 1;
      } else {
        try {
          nireq = 0;
          for (auto& compiled_model : compiled_models) {
            auto req = compiled_model->get_property(ov::optimal_number_of_infer_requests);
            if (nireq == 0 || nireq > req) nireq = req;
          }
        } catch (const std::exception& ex) {
          throw ov::Exception("Every device used with the dla_benchmark should support " +
                              std::string(ov::optimal_number_of_infer_requests.name()) +
                              " Failed to query the metric for the " + device_name + " with error: " + ex.what());
        }
      }
    }
#ifdef MAX_NUM_INFERENCE_REQUEST
    if (nireq > MAX_NUM_INFERENCE_REQUEST) {
      slog::warn << "-nireq > "<< MAX_NUM_INFERENCE_REQUEST << " is not supported for the underlying device. Forcing -nireq = 1" << slog::endl;
      nireq = 1;
    }
#endif

    // Iteration limit
    uint32_t niter = FLAGS_niter;
    if (niter > 0) {
      // Round up niter to a multiple of nireq
      niter = ((niter + nireq - 1) / nireq) * nireq;
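      // e.g. niter = 10 with nireq = 4 is rounded up to ((10 + 3) / 4) * 4 = 12 iterations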
      // We previously checked that FLAGS_niter > 0, so it is okay to cast to uint.
      if (static_cast<uint32_t>(FLAGS_niter) != niter) {
        slog::warn << "Number of iterations was aligned by request number from " << FLAGS_niter << " to " << niter
                   << " using number of requests " << nireq << slog::endl;
      }
      num_batches = niter;
    }

    // Graph-request limit on device
    if (device_name.find("FPGA") != std::string::npos) {
      int ip_num_instances = core.get_property("FPGA", "COREDLA_NUM_INSTANCES").as<int>();
      int numOutstandingInferRequest = nireq * net_size / ip_num_instances;
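      // e.g. nireq = 4 and net_size = 2 on a single-instance IP (ip_num_instances = 1) gives
      // 8 outstanding requests per instance, which must fit in the CSR descriptor queue checked below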
      int maxOutstandingInferRequest = core.get_property("FPGA", "COREDLA_DMA_CSR_DESCRIPTOR_QUEUE_SIZE").as<int>();
      if (maxOutstandingInferRequest > 0 && numOutstandingInferRequest > maxOutstandingInferRequest) {
        slog::err << "Possible number of outstanding inference requests per instance (" << numOutstandingInferRequest
                  << ") "
                  << "exceeds the CSR descriptor queue limit (" << maxOutstandingInferRequest << ")" << slog::endl;
        return 1;
      }
    }

    if (statistics) {
      for (auto& topology_name : topology_names) {
        statistics->addParameters(StatisticsReport::Category::RUNTIME_CONFIG,
                                  {
                                      {"topology", topology_name},
                                      {"target device", device_name},
                                      {"API", FLAGS_api},
                                      {"precision", std::string(precision.get_type_name())},
                                      {"batch size", std::to_string(batch_size)},
                                      {"number of iterations", std::to_string(niter)},
                                      {"number of parallel infer requests", std::to_string(nireq)},
                                  });
      }
      for (auto& nstreams : device_nstreams) {
        std::stringstream ss;
        ss << "number of " << nstreams.first << " streams";
        statistics->addParameters(StatisticsReport::Category::RUNTIME_CONFIG,
                                  {
                                      {ss.str(), std::to_string(nstreams.second)},
                                  });
      }
    }

    // ----------------- 9. Creating infer requests and filling input blobs ----------------------------------------
    next_step();

    // Data structure hierarchy
    // Outermost vec: which model it corresponds to (multigraph)
    // Map: input/output name and its corresponding TensorVector
    // TensorVector: an alias for vector<ov::Tensor> where each vector element corresponds to a batch
    std::vector<std::map<std::string, ov::TensorVector>> input_data_tensors;
    std::vector<std::map<std::string, ov::TensorVector>> output_tensors(compiled_models.size());

    std::vector<std::unique_ptr<InferRequestsQueue>> infer_request_queues;
    const std::string resize_type = FLAGS_resize_type.empty() ? "resize" : FLAGS_resize_type;
    for (size_t net_idx = 0; net_idx < compiled_models.size(); net_idx++) {
      // Handle the case that use same inputs for all networks
      const auto& inputFiles =
          net_idx >= multi_input_files.size() ? multi_input_files.back() : multi_input_files[net_idx];
      input_data_tensors.push_back(GetStaticTensors(inputFiles.empty() ? std::vector<std::string>{} : inputFiles,
                                                    batch_size,
                                                    input_infos[net_idx],
                                                    num_batches,
                                                    resize_type,
                                                    FLAGS_bgr,
                                                    FLAGS_bin_data,
                                                    FLAGS_verbose));
      // Use unique_ptr to create InferRequestsQueue objects and avoid copying the mutex and condition variable
      infer_request_queues.push_back(
          std::unique_ptr<InferRequestsQueue>(new InferRequestsQueue(*(compiled_models[net_idx]), nireq)));
    }

    // ----------------- 10. Measuring performance ------------------------------------------------------------------
    size_t progress_bar_total_count = progressBarDefaultTotalCount;

    std::stringstream ss;
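    // FLAGS_api is either "sync" or "async", so appending "hronously" yields "synchronously" or "asynchronously"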
    ss << "Start inference " << FLAGS_api << "ronously";
    if (FLAGS_api == "async") {
      ss << ", ";
      ss << infer_request_queues.size() * infer_request_queues.at(0)->requests.size() << " inference requests";
      std::stringstream device_ss;
      for (auto& nstreams : device_nstreams) {
        if (!device_ss.str().empty()) {
          device_ss << ", ";
        }
        device_ss << nstreams.second << " streams for " << nstreams.first;
      }
      if (!device_ss.str().empty()) {
        ss << " using " << device_ss.str();
      }
    }
    ss << ", limits: " << niter << " iterations with each graph, " << compiled_models.size() << " graph(s)";
    progress_bar_total_count = niter;
    next_step(ss.str());

    /** Start inference & calculate performance **/
    /** Align the number of iterations to guarantee that the last infer requests are executed under the same conditions **/
    ProgressBar progress_bar(progress_bar_total_count, FLAGS_stream_output, FLAGS_progress);
    std::vector<size_t> iterations(compiled_models.size(), 0);
    try {
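      // Keep iterating until each graph has run niter iterations; in async mode also continue until the iteration
      // count is a multiple of nireq so the last requests execute under the same conditions as the earlier ones.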
      while ((niter != 0LL && iterations.back() < niter) || (FLAGS_api == "async" && iterations.back() % nireq != 0)) {
        // set up all infer requests and prepare all input/output tensors
        for (size_t net_id = 0; net_id < compiled_models.size(); net_id++) {
          for (size_t iireq = 0; iireq < nireq; iireq++) {
            auto infer_request = infer_request_queues.at(net_id)->get_idle_request();
            if (!infer_request) {
              THROW_IE_EXCEPTION << "No idle Infer Requests!";
            }

            if (niter != 0LL) {
              const auto& outputs = compiled_models[net_id]->outputs();
              for (const auto& output : outputs) {
                const std::string& name = output.get_any_name();
                output_tensors.at(net_id)[name].emplace_back(output.get_element_type(), output.get_shape());
                infer_request->set_tensor(output, output_tensors.at(net_id).at(name).at(iterations.at(net_id)));
              }
              const auto& inputs = compiled_models[net_id]->inputs();
              for (auto& input : inputs) {
                const std::string& name = input.get_any_name();
                const auto& data = input_data_tensors.at(net_id).at(name)[iterations.at(net_id)];
                infer_request->set_tensor(input, data);
              }
            }

            // Execute one request/batch
            if (FLAGS_api == "sync") {
              infer_request->infer();
            } else {
              // As the inference request is currently idle, the wait() adds no additional overhead (and should return
              // immediately). The primary reason for calling the method is exception checking/re-throwing. The callback
              // that governs the actual execution can handle errors as well, but since it only uses error codes it has
              // no details like the what() method of std::exception, so we re-check for exceptions here.
              infer_request->wait();
              infer_request->start_async();
            }
            iterations.at(net_id)++;
            if (net_id == compiled_models.size() - 1) {
              progress_bar.addProgress(1);
            }
          }
        }
      }

      // wait for the last inference executions to complete
      for (auto& infer_request_queue : infer_request_queues) {
        infer_request_queue->wait_all();
      }
    } catch (const std::exception& ex) {
      slog::err << "Inference failed:" << slog::endl;
      slog::err << ex.what() << slog::endl;
      ReadDebugNetworkInfo(core);
      PrintLSUCounterInfo(core);
      // Instead of setting return_code = 1 and continuing, exit immediately.
      // High risk of segfaulting / weird behavior when inference fails.
      return 1;
    }

    size_t iteration = iterations.back();

    std::vector<double> all_latencies;
    auto start_time = infer_request_queues.at(0)->get_start_time();
    auto end_time = infer_request_queues.at(0)->get_end_time();
    for (auto& infer_request_queue : infer_request_queues) {
      auto& latencies = infer_request_queue->get_latencies();
      all_latencies.insert(all_latencies.end(), latencies.begin(), latencies.end());
      start_time = std::min(start_time, infer_request_queue->get_start_time());
      end_time = std::max(end_time, infer_request_queue->get_end_time());
    }
    double latency = GetMedianValue<double>(all_latencies);
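    // end_time - start_time is in nanoseconds; multiplying the count by 0.000001 converts it to milliseconds.
    // Sync mode derives throughput from the median latency; async mode from the total wall-clock duration.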
    double total_duration = std::chrono::duration_cast<ns>(end_time - start_time).count() * 0.000001;
    double total_fps = (FLAGS_api == "sync")
                           ? compiled_models.size() * batch_size * 1000.0 / latency
                           : compiled_models.size() * batch_size * 1000.0 * iteration / total_duration;

    int ip_num_instances = 0;
    double ip_duration = 0.0;
    double ip_fps = 0.0;
    double ip_fps_per_fmax = 0.0;
    double estimated_ipFps = 0.0;
    double estimated_ipFpsPerFmax = 0.0;
    double fmax_core = -1.0;
    double estimated_ipFps_assumed_fmax = 0.0;
    if (device_name.find("FPGA") != std::string::npos) {
      ip_num_instances = core.get_property("FPGA", "COREDLA_NUM_INSTANCES").as<int>();
      // even if hardware has 2 instances, only 1 instance actually gets used if only 1 inference is performed
      size_t ip_num_instances_used = std::min((size_t)ip_num_instances, iteration);
      ip_duration = core.get_property("FPGA", "IP_ACTIVE_TIME").as<double>();
      if (ip_duration != 0.0) {
        ip_fps = (FLAGS_api == "sync")
                     ? compiled_models.size() * batch_size * 1000.0 / latency / ip_num_instances_used
                     : compiled_models.size() * batch_size * 1000.0 * iteration / ip_duration / ip_num_instances_used;
        fmax_core = core.get_property("FPGA", "COREDLA_CLOCK_FREQUENCY").as<double>();
        if (fmax_core > 0.0) {
          ip_fps_per_fmax = ip_fps / fmax_core;
        } else {
          slog::warn << "Warning: could not estimate clk_dla frequency on the FPGA" << slog::endl;
        }
      }

      // The enclosing block already guarantees an FPGA device, so only the perf_est flag needs to be checked
      if (FLAGS_perf_est) {
        if (is_model_compiled) {
          // Ahead of Time Flow: getting the imported, precalculated performance estimate
          estimated_ipFps = core.get_property("FPGA", "IMPORT_PERFORMANCE_EST").as<double>();
          if (estimated_ipFps < 0)
            slog::warn << "Missing performance estimation from at least one of the compiled graphs" << slog::endl;
          estimated_ipFps_assumed_fmax = core.get_property("FPGA", "IMPORT_PERFORMANCE_EST_ASSUMED_FMAX").as<double>();
        } else {
#ifndef DISABLE_JIT
          // Just In Time Flow: running the performance estimate
          if (fmax_core > 0.0) {
#if defined(_WIN32) || defined(_WIN64)
            _putenv_s("PERF_EST_COREDLA_FMAX", double_to_string(fmax_core).c_str());
            _putenv_s("PERF_EST_PE_FMAX", double_to_string(fmax_core).c_str());
#else
            setenv("PERF_EST_COREDLA_FMAX", double_to_string(fmax_core).c_str(), true);
            setenv("PERF_EST_PE_FMAX", double_to_string(fmax_core).c_str(), true);
#endif
            estimated_ipFps_assumed_fmax = fmax_core;
          } else {
            // If fmax_core could not be queried, fall back to the estimated fmax values for AGX7 and A10.
            // This branch is just defensive programming for a condition that should not happen.
#ifdef DE10_AGILEX
            estimated_ipFps_assumed_fmax = GetEnvOrDefault("PERF_EST_COREDLA_FMAX", 500);  // AGX7 fMAX estimate
#else
            estimated_ipFps_assumed_fmax = GetEnvOrDefault("PERF_EST_COREDLA_FMAX", 265);  // A10 fMAX estimate
#endif
            slog::warn
                << "Warning: could not estimate clk_dla frequency on the FPGA, setting the fmax to default value."
                << slog::endl;
#if defined(_WIN32) || defined(_WIN64)
            _putenv_s("PERF_EST_COREDLA_FMAX", double_to_string(estimated_ipFps_assumed_fmax).c_str());
            _putenv_s("PERF_EST_PE_FMAX", double_to_string(estimated_ipFps_assumed_fmax).c_str());
#else
            setenv("PERF_EST_COREDLA_FMAX", double_to_string(estimated_ipFps_assumed_fmax).c_str(), true);
            setenv("PERF_EST_PE_FMAX", double_to_string(estimated_ipFps_assumed_fmax).c_str(), true);
#endif
          }
          estimated_ipFps = core.get_property("FPGA", "PLUGIN_PERFORMANCE_EST").as<double>();
#endif
        }
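        // Normalize the estimate by the assumed fmax so the result is reported in FPS per MHz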
        estimated_ipFpsPerFmax = estimated_ipFps / estimated_ipFps_assumed_fmax;
      }
    }

    if (statistics) {
      statistics->addParameters(StatisticsReport::Category::EXECUTION_RESULTS,
                                {
                                    {"total execution time (ms)", double_to_string(total_duration)},
                                    {"IP active time (ms)", double_to_string(ip_duration)},
                                    {"total number of iterations", std::to_string(iteration)},
                                });
      if (device_name.find("MULTI") == std::string::npos) {
        statistics->addParameters(StatisticsReport::Category::EXECUTION_RESULTS,
                                  {
                                      {"latency (ms)", double_to_string(latency)},
                                  });
      }
      statistics->addParameters(
          StatisticsReport::Category::EXECUTION_RESULTS,
          {{"throughput", double_to_string(total_fps)}, {"IP throughput", double_to_string(ip_fps)}});
    }

    progress_bar.finish();

    // ----------------- 11. Dumping statistics report -------------------------------------------------------------
    next_step();

    if (perf_count || !perf_count_sort.empty()) {
      std::vector<std::vector<ov::ProfilingInfo>> perfCounts;
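      // Performance counters are collected from the first graph's request queue only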
      for (size_t ireq = 0; ireq < nireq; ireq++) {
        auto reqPerfCounts = infer_request_queues.at(0)->requests[ireq]->get_performance_counts();
        perfCounts.push_back(reqPerfCounts);
      }
      if (statistics) {
        if (perf_count_sort == "sort") {
          statistics->printPerfCountersSort(perfCounts, "sort");
        } else if (perf_count_sort == "simple_sort") {
          statistics->printPerfCountersSort(perfCounts, "simple_sort");
        } else {
          statistics->printPerfCountersSort(perfCounts, "no_sort");
        }
      }
    }

    // dla_benchmark originally also implemented more detailed performance
    // statistics via InferRequest's getPerformanceCounts function.
    // We did not support it, so it was removed. If we want to re-implement it,
    // the latest version of OpenVINO's benchmark_app or our git history
    // would be a good starting point.
    if (statistics) {
      statistics->dump();
    }

    std::cout << "count:             " << iteration << " iterations" << std::endl;
    std::cout << "system duration:   " << double_to_string(total_duration) << " ms" << std::endl;
    if (ip_duration != 0.0) std::cout << "IP duration:       " << double_to_string(ip_duration) << " ms" << std::endl;
    if (device_name.find("MULTI") == std::string::npos)
      std::cout << "latency:           " << double_to_string(latency) << " ms" << std::endl;
    std::cout << "system throughput: " << double_to_string(total_fps) << " FPS" << std::endl;
    if (ip_num_instances != 0) std::cout << "number of hardware instances: " << ip_num_instances << std::endl;
    if (compiled_models.size() != 0)
      std::cout << "number of network instances: " << compiled_models.size() << std::endl;
    if (ip_fps != 0.0) std::cout << "IP throughput per instance: " << double_to_string(ip_fps) << " FPS" << std::endl;
    if (ip_fps_per_fmax != 0.0)
      std::cout << "IP throughput per fmax per instance: " << double_to_string(ip_fps_per_fmax) << " FPS/MHz"
                << std::endl;
    if (fmax_core > 0.0) std::cout << "IP clock frequency: " << double_to_string(fmax_core) << " MHz" << std::endl;
    if (estimated_ipFps != 0.0)
      std::cout << "estimated IP throughput per instance: " << double_to_string(estimated_ipFps) << " FPS ("
                << (int)estimated_ipFps_assumed_fmax << " MHz assumed)" << std::endl;
    if (estimated_ipFpsPerFmax != 0.0)
      std::cout << "estimated IP throughput per fmax per instance: " << double_to_string(estimated_ipFpsPerFmax)
                << " FPS/MHz" << std::endl;

    // ----------------- 12. Dumping output values -------------------------------------------------------------
    next_step();

    if (FLAGS_dump_output) {
      for (size_t i = 0; i < compiled_models.size(); i++) {
        std::vector<ov::Output<const ov::Node>> output_info = compiled_models[i]->outputs();
        // For multi-outputs: Sort to ensure the order of each tensor dump aligns with the ground truth files
        std::sort(output_info.begin(), output_info.end(), CompareOutputNodeNames);
        const auto& output_tensors_map = output_tensors[i];
        // Flags tracking whether the output tensor can be dumped to a text file and whether layout info can be
        // included in it (e.g. the tensor is too large or has an unsupported rank). They are cleared the first time a
        // dump is skipped.
        bool can_dump_txt = true;
        bool can_dump_layout_info_in_txt = true;
        // dump output tensor as bin, which can be loaded using Python Numpy
        std::regex pattern("\\{batch\\}");
        std::string results_bin_file_name = output_dir + "result_{batch}.bin";
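        // the {batch} placeholder is replaced with the batch index when each per-batch bin file is written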
        // dump output tensor as text
        // backward compatibility support for old regtests that used only one graph
        std::string results_txt_file_name = output_dir + "result.txt";
        std::string results_boundaries_file_name = output_dir + "result_tensor_boundaries.txt";
        // dump inference arguments and metadata as JSON
        std::string results_meta_file_name = output_dir + "result_meta.json";

        if (compiled_models.size() > 1) {
          results_bin_file_name = output_dir + topology_names[i] + "_result_{batch}.bin";
          results_txt_file_name = output_dir + topology_names[i] + "_result.txt";
          results_boundaries_file_name = output_dir + topology_names[i] + "_result_tensor_boundaries.txt";
          results_meta_file_name = output_dir + topology_names[i] + "_result_meta.json";
        }

        slog::info << "Dumping result of " << topology_names[i]
                   << " to " << results_txt_file_name << slog::endl;
        slog::info << "Dumping per-batch result (raw output) of " << topology_names[i]
                   << " to " << results_bin_file_name << slog::endl;
        slog::info << "Dumping inference meta data of " << topology_names[i]
                   << " to " << results_meta_file_name << slog::endl;

        std::ofstream result_txt_file(results_txt_file_name);
        std::ofstream results_boundaries(results_boundaries_file_name);
        std::ofstream result_meta_file(results_meta_file_name);

        dla_benchmark::InferenceMetaData result_metadata;
        result_metadata.input_files = multi_input_files.at(i);  // all input files in -i
        result_metadata.groundtruth_loc = FLAGS_groundtruth_loc;
        result_metadata.batch_size = FLAGS_batch_size;
        result_metadata.niter = niter;
        result_metadata.nireq = nireq;
        result_metadata.model_input_info = input_infos[i];
        dla_benchmark::OutputsInfoVec model_output_info;

        uint32_t current_lines = 1;
        size_t max_allowed_megabytes_to_dump = FLAGS_max_output_file_size;

        for (uint32_t batch = 0; batch < num_batches; batch++) {
          std::string per_batch_results_bin_file_name = std::regex_replace(results_bin_file_name,
                                                                           pattern,
                                                                           std::to_string(batch));
          std::ofstream per_batch_results_bin_file(per_batch_results_bin_file_name, std::ios::binary);

          for (const auto& item : output_info) {
            auto tensor = output_tensors_map.at(item.get_any_name()).at(batch);
            unsigned int output_size = tensor.get_size() / batch_size;
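            // number of output elements per image; the tensor packs batch_size images along the batch dimension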

            const ov::Layout& layout = ov::layout::get_layout(item);
            const auto& shape = tensor.get_shape();
            const std::string& name = item.get_any_name();
            size_t total_bytes_to_dump = tensor.get_size() * niter * sizeof(float);

            if (can_dump_txt) {
              // if we cannot dump as a text file, we set can_dump_txt flag to false and write the one-time message
              if (total_bytes_to_dump > max_allowed_megabytes_to_dump * BYTE_TO_MEGABYTE) {
                can_dump_txt = false;
                std::string msg = "Output tensor (" + std::to_string(total_bytes_to_dump / BYTE_TO_MEGABYTE) +
                                  " MB) "
                                  "is too large to dump. Change environmental variable MAX_DUMP_OUTPUT_TXT (default " +
                                  std::to_string(FLAGS_max_output_file_size) + " MB) to allow dumping larger tensors";
                slog::warn << msg << slog::endl;
                result_txt_file << msg;
              } else {
                if (can_dump_layout_info_in_txt && shape.size() != 2 && shape.size() != 4 && shape.size() != 5) {
                  can_dump_layout_info_in_txt = false;
                  slog::warn << "Output data tensor of rank that is not 2, 4 or 5. layout info will not be dumped in "
                             << "result.txt." << slog::endl;
                }
                // Otherwise, dump text and write to the result_tensor_boundaries.txt with additional information
                // about the result.txt file
                results_boundaries << name << ": Line " << current_lines << " to "
                                   << "line " << current_lines + output_size - 1 << std::endl;
                results_boundaries << name << " output layout: " << layout.to_string() << std::endl;
                results_boundaries << name << " output dimension:";
                for (unsigned int dim = 0; dim < shape.size(); dim++) {
                  results_boundaries << " " << shape[dim];
                }
                results_boundaries << std::endl;
                current_lines = current_lines + output_size;
                DumpResultTxtFile(tensor, item, output_size, result_txt_file);
              }
            }
            DumpResultBinFile(tensor, per_batch_results_bin_file);

            if (batch == 0) {
              // all batches should have the same output info
              dla_benchmark::OutputInfo output_info;
              output_info.name = name;
              output_info.shape = shape;
              model_output_info.push_back(output_info);
            }
          }
          per_batch_results_bin_file.close();
        }

        result_metadata.model_output_info = model_output_info;
        DumpResultMetaJSONFile(result_metadata, result_meta_file);
        result_txt_file.close();
        results_boundaries.close();
        result_meta_file.close();
      }
      const std::string throughput_file_name = output_dir + "throughput_report.txt";
      std::ofstream throughput_file;
      throughput_file.open(throughput_file_name);
      throughput_file << "Throughput : " << total_fps << " fps" << std::endl;
      throughput_file << "Batch Size : " << batch_size << std::endl;
      throughput_file << "Graph number : " << compiled_models.size() << std::endl;
      throughput_file << "Num Batches : " << num_batches << std::endl;
      throughput_file.close();

      // Append throughput to dataset
      // Check both gz and non gz versions
      std::string dataset_gz_file_name = "data.csv.gz";
      append_value_if_incomplete_to_csv(dataset_gz_file_name, ip_fps);
      std::string dataset_file_name = "data.csv";
      append_value_if_incomplete_to_csv(dataset_file_name, ip_fps);
    }

    // Calculate top 1, top 5 results
    if (FLAGS_groundtruth_loc != "") {
      auto groundtruth_files = split(FLAGS_groundtruth_loc, MULTIGRAPH_SEP);
      for (size_t i = 0; i < compiled_models.size(); i++) {
        // This flag `FLAGS_enable_object_detection_ap` enables accuracy checking subroutine that
        // gives the mAP and COCO AP scores. These scores are two of the main detection evaluation
        // metrics used in the Common Objects in Context contest, https://cocodataset.org/#detection-eval.

        std::vector<ov::Output<const ov::Node>> output_info = compiled_models[i]->outputs();
        // For multi-outputs: Sort to ensure the order of each tensor dump aligns with the ground truth files
        std::sort(output_info.begin(), output_info.end(), CompareOutputNodeNames);
        // Run the default top-1, top-5 evaluation routine if AP scores are not required.
        if (!FLAGS_enable_object_detection_ap) {
          if (groundtruth_files.size() <= i) {
            slog::warn << "Missing ground truth file for " << topology_names[i] << "! SKIPPED" << slog::endl;
            continue;  // Print warnings for all missing ground truth graphs;
          }
          slog::info << "Comparing ground truth file " << groundtruth_files[i] << " with network " << topology_names[i]
                     << slog::endl;
          // captures the results in higher precision for accuracy analysis
          std::vector<float> results;
          const auto& output_tensors_map = output_tensors[i];
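          // Flatten the outputs image by image, in sorted output order, so the layout matches the ground truth file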
          for (uint32_t batch = 0; batch < num_batches; batch++) {
            for (unsigned int img = 0; img < batch_size; img++) {
              for (const auto& item : output_info) {
                auto tensor = output_tensors_map.at(item.get_any_name()).at(batch);
                auto tensor_data = tensor.data<float>();
                unsigned int output_size = tensor.get_size() / batch_size;
                size_t offset = img * output_size;
                for (unsigned int j = 0; j < output_size; j++) {
                  results.push_back(tensor_data[j + offset]);
                }
              }
            }
          }
          bool passed = TopResultsAnalyser::get_top_results(groundtruth_files[i], results, batch_size * num_batches);
          if (passed) {
            slog::info << "Get top results for \"" << topology_names[i] << "\" graph passed" << slog::endl;
          } else {
            // a return code of 4 indicates that the accuracy of the result was below the threshold
            return_code = 4;
          }
        } else {
          // Runs the accuracy checking routine if AP scores are required.
          set_runtime(FLAGS_yolo_version, FLAGS_niter, batch_size_flag, FLAGS_i, FLAGS_groundtruth_loc);
          std::pair<double, double> res =
              validate_yolo_wrapper(output_tensors[i], output_info, multi_input_files.at(0));
          std::cout << std::endl;
          slog::info << "Batch metrics results:" << slog::endl;
          std::cout << "Detection - mAP@0.5: " << std::setprecision(6) << res.first * 100 << "%" << std::endl;
          std::cout << "Detection - mAP@0.5:0.95: " << std::setprecision(6) << res.second * 100 << "%" << std::endl;
        }
      }
    }
    // Output Debug Network Info if COREDLA_TEST_DEBUG_NETWORK is set
    ReadDebugNetworkInfo(core);
    if (FLAGS_report_lsu_counters) {
      PrintLSUCounterInfo(core);
    }
    if (return_code) return return_code;
  } catch (const std::exception& ex) {
    slog::err << ex.what() << slog::endl;

    if (statistics) {
      statistics->addParameters(StatisticsReport::Category::EXECUTION_RESULTS,
                                {
                                    {"Error during dla_benchmark: ", ex.what()},
                                });
      statistics->dump();
    }

    return 3;
  }

  return 0;
  // Bypass long function lint check
  // NOLINTNEXTLINE(readability/fn_size)
}