diff options
| author | Eric Dao <eric@erickhangdao.com> | 2025-03-10 17:54:31 -0400 |
|---|---|---|
| committer | Eric Dao <eric@erickhangdao.com> | 2025-03-10 17:54:31 -0400 |
| commit | ab224e2e6ba65f5a369ec392f99cd8845ad06c98 (patch) | |
| tree | a1e757e9341863ed52b8ad4c5a1c45933aab9da4 /python/openvino/runtime/dla_aot_splitter | |
| parent | 40da1752f2c8639186b72f6838aa415e854d0b1d (diff) | |
| download | thesis-master.tar.gz thesis-master.tar.bz2 thesis-master.zip | |
Diffstat (limited to 'python/openvino/runtime/dla_aot_splitter')
21 files changed, 2046 insertions, 0 deletions
diff --git a/python/openvino/runtime/dla_aot_splitter/CMakeLists.txt b/python/openvino/runtime/dla_aot_splitter/CMakeLists.txt new file mode 100644 index 0000000..0e1e4f8 --- /dev/null +++ b/python/openvino/runtime/dla_aot_splitter/CMakeLists.txt @@ -0,0 +1,71 @@ +cmake_minimum_required(VERSION 3.10) + +# Use <package>_ROOT variables to help find_package locate packages +if (POLICY CMP0074) + cmake_policy(SET CMP0074 NEW) +endif() + +find_package(OpenCV COMPONENTS core highgui imgcodecs imgproc videoio REQUIRED) +find_package(gflags COMPONENTS shared REQUIRED) + +add_subdirectory(dla_aot_splitter_plugin) +add_subdirectory(dla_aot_splitter_example) + +if (DE10_AGILEX) + add_library(de10_agilex ALIAS de10_agilex_mmd) +elseif (SYSTEM_CONSOLE_PLATFORM) + # DO NOTHING +elseif (PAC_A10) + add_library(dcp_a10_pac ALIAS intel_opae_mmd) +elseif(AGX7_IDK) + add_library(agx7_i_dk ALIAS intel_opae_mmd) +elseif(AGX7_N6001) + add_library(agx7_n6001 ALIAS intel_opae_mmd) +endif() + +add_executable(dla_aot_splitter ${CMAKE_CURRENT_SOURCE_DIR}/src/main.cpp) + +target_compile_features(dla_aot_splitter PUBLIC cxx_std_11) + +target_sources(dla_aot_splitter PRIVATE + ${CMAKE_CURRENT_SOURCE_DIR}/src/main.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/inc/dla_aot_splitter.hpp + $ENV{COREDLA_ROOT}/runtime/dla_benchmark/inputs_filling.cpp #TODO REMOVE and replace with link library + $ENV{COREDLA_ROOT}/runtime/dla_benchmark/utils.cpp #TODO REMOVE and replace with link library + $ENV{COREDLA_ROOT}/runtime/common/utils/src/slog.cpp + $ENV{COREDLA_ROOT}/runtime/common/utils/src/args_helper.cpp + $ENV{COREDLA_ROOT}/runtime/common/utils/src/common.cpp + $ENV{COREDLA_ROOT}/runtime/common/utils/src/latency_metrics.cpp +) + +target_include_directories(dla_aot_splitter PRIVATE + ${CMAKE_CURRENT_SOURCE_DIR}/inc + $ENV{COREDLA_ROOT}/util/inc + $ENV{COREDLA_ROOT}/dla_plugin/inc + $ENV{COREDLA_ROOT}/dla_plugin/inc/dlia + $ENV{COREDLA_ROOT}/runtime/dla_benchmark #TODO REMOVE and replace with link library +) + +if 
(WIN32) + target_include_directories(dla_aot_splitter PRIVATE + $ENV{COREDLA_ROOT}/compiler/inc # dla_performance_estimator.h + ) +endif() + + +target_link_libraries(dla_aot_splitter PRIVATE + openvino::runtime + openvino_dev_api + format_reader + ie_samples_utils + ${OpenCV_LIBRARIES} # Needed for the directly compiled inputs_filling + dla_aot_splitter_plugin + gflags +) + +if (NOT WIN32) + target_link_libraries(dla_aot_splitter PRIVATE + ${LIB_DL} + pthread + ) +endif() diff --git a/python/openvino/runtime/dla_aot_splitter/CPPLINT.cfg b/python/openvino/runtime/dla_aot_splitter/CPPLINT.cfg new file mode 100644 index 0000000..4bdae97 --- /dev/null +++ b/python/openvino/runtime/dla_aot_splitter/CPPLINT.cfg @@ -0,0 +1,8 @@ +set noparent +filter=-build/header_guard,-runtime/explicit,-build/include_subdir,-runtime/references,-build/c++11,-runtime/int,-runtime/string,-runtime/printf,-build/namespaces,-readability/todo,-readability/casting + +# Exclude Example code +exclude_files=dla_aot_splitter_example + +linelength=160 +headers=h,hpp diff --git a/python/openvino/runtime/dla_aot_splitter/README.md b/python/openvino/runtime/dla_aot_splitter/README.md new file mode 100644 index 0000000..ffefe0d --- /dev/null +++ b/python/openvino/runtime/dla_aot_splitter/README.md @@ -0,0 +1,52 @@ +# Intel AI Suite Core DLA 'AoT Splitter' + +This tool is intended to split a compiled HETERO:FPGA OpenVINO model into Input memory, Config memory, and Filter memory data blobs that would normally exist in the DDR memory of a runtime CoreDLA IP. These blobs can be used to directly run an inference on the IP without using OpenVINO InferenceEngine. + +# How to Build the Splitter, Plugin, and Example + +First, follow all instructions to install CoreDLA compiler development environment + +Change directory to the dla runtime folder + +``` +sh build_runtime.sh -target_de10_agilex +``` + +# How to Run the Splitter Executable + +The executable outputs the memory blobs to the current working directory. 
Change directory to the location where you want the outputs to be generated + +``` +cd directory_where_you_want_output + +runtime/build_Release/dla_aot_splitter/dla_aot_splitter -cm compiled_hetero_fpga_model.bin -i path/to/image.bmp -bgr -plugins runtime/dla_aot_splitter/dla_aot_splitter_plugin/plugins_aot_splitter.xml +``` + +Ensure that the libdla_aot_splitter.so, libcoreDLAHeteroPlugin.so and other shared libraries are available to the utility. + +The tool outputs the following artifacts: + - arch_build.mem / arch_build.bin + - config.mem / config.bin + - filter.mem /filter.bin + - input.mem / input.bin + - inter_size.mem + - output_size.mem + +# Building the Example Inference Program + +The example inference program with static input,config,filter data is compiled with the following environment variables +and option to build_runtime.sh + +## DE10 Agilex +``` +export AOT_SPLITTER_EXAMPLE_MODEL=<path/to/model.xml> +export AOT_SPLITTER_EXAMPLE_INPUT=<path/to/image.bmp> +sh build_runtime.sh -aot_splitter_example -target_de10_agilex +``` + +This program directly embeds the input, config and filter data into the resulting executable file for direct use. + +## PCIE + +The emulation inference program uses the PCIE MMD driver from the example design to connect to and provision the IP. 
+Your system may require a different driver to provision the IP diff --git a/python/openvino/runtime/dla_aot_splitter/dla_aot_splitter_example/CMakeLists.txt b/python/openvino/runtime/dla_aot_splitter/dla_aot_splitter_example/CMakeLists.txt new file mode 100644 index 0000000..a6f2ce8 --- /dev/null +++ b/python/openvino/runtime/dla_aot_splitter/dla_aot_splitter_example/CMakeLists.txt @@ -0,0 +1,209 @@ +# Copyright (C) 2018-2020 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 +# + +add_executable(dla_aot_splitter_example EXCLUDE_FROM_ALL src/main.cpp) + +target_compile_features(dla_aot_splitter_example PUBLIC cxx_std_11) + +target_compile_definitions(dla_aot_splitter_example PRIVATE DLA_MMD) + +file(GLOB SOURCES + # coredla_device + $ENV{COREDLA_ROOT}/runtime/coredla_device/inc/device_memory_allocator.h + $ENV{COREDLA_ROOT}/runtime/coredla_device/inc/dla_dma_constants.h + $ENV{COREDLA_ROOT}/runtime/coredla_device/inc/mmd_wrapper.h + $ENV{COREDLA_ROOT}/runtime/coredla_device/src/device_memory_allocator.cpp + # + src/main.cpp +) +if (SYSTEM_CONSOLE_PLATFORM) + list(APPEND SOURCES ${CMAKE_SOURCE_DIR}/coredla_device/mmd/system_console/mmd_wrapper.cpp) +else () + list(APPEND SOURCES $ENV{COREDLA_ROOT}/runtime/coredla_device/src/mmd_wrapper.cpp) +endif () + +target_sources (dla_aot_splitter_example PRIVATE ${SOURCES}) + +if (DISABLE_JIT) +# for dla_dma_constants.svh + if (EXISTS $ENV{COREDLA_ROOT}/inc) + target_include_directories(dla_aot_splitter_example PRIVATE $ENV{COREDLA_ROOT}/inc) + else() + target_include_directories(dla_aot_splitter_example PRIVATE $ENV{COREDLA_ROOT}/build/coredla/dla/inc) + endif() +endif() + +target_link_libraries(dla_aot_splitter_example PRIVATE + pthread +) + +if (DISABLE_JIT) + target_include_directories(dla_aot_splitter_example PRIVATE + $ENV{COREDLA_ROOT}/util/inc + $ENV{COREDLA_XUTIL_DIR}/compiled_result/inc + ) + target_sources(dla_aot_splitter_example PRIVATE 
$ENV{COREDLA_XUTIL_DIR}/compiled_result/src/compiled_result_reader_writer.cpp) +else() + target_link_libraries(dla_aot_splitter_example + PRIVATE + dla_compiled_result + ) +endif() + +if (DE10_AGILEX) + target_link_libraries(dla_aot_splitter_example PRIVATE de10_agilex) +elseif(PAC_A10) + target_link_libraries(dla_aot_splitter_example PRIVATE dcp_a10_pac) +elseif(AGX7_IDK) + target_link_libraries(dla_aot_splitter_example PRIVATE agx7_i_dk) + find_library(libjson-c_LIBRARIES + NAMES json-c + PATHS ${LIBOPAE-C_ROOT}/lib + ${LIBOPAE-C_ROOT}/lib64 + /usr/local/lib + /usr/lib + /lib + /usr/lib/x86_64-linux-gnu + ${CMAKE_EXTRA_LIBS}) + target_link_libraries(dla_aot_splitter_example PRIVATE ${libjson-c_LIBRARIES}) +elseif(AGX7_N6001) + target_link_libraries(dla_aot_splitter_example PRIVATE agx7_n6001) + find_library(libjson-c_LIBRARIES + NAMES json-c + PATHS ${LIBOPAE-C_ROOT}/lib + ${LIBOPAE-C_ROOT}/lib64 + /usr/local/lib + /usr/lib + /lib + /usr/lib/x86_64-linux-gnu + ${CMAKE_EXTRA_LIBS}) + target_link_libraries(dla_aot_splitter_example PRIVATE ${libjson-c_LIBRARIES}) +elseif(SYSTEM_CONSOLE_PLATFORM) + # Agilex 5 JTAG ED: do nothing +elseif(REFERENCE) + # Reference: do nothing +else() + message(FATAL_ERROR "Building DLA AOT Aplitter Example with unsupported platform") +endif() + +target_include_directories(dla_aot_splitter_example PRIVATE + $ENV{COREDLA_ROOT}/runtime/coredla_device/inc + if(PAC_A10) + $ENV{COREDLA_ROOT}/runtime/coredla_device/mmd/dcp_a10_pac/host + endif() +) + +file(MAKE_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}/include) + +target_sources (dla_aot_splitter_example PRIVATE + ${CMAKE_CURRENT_BINARY_DIR}/include/arch_build.mem + ${CMAKE_CURRENT_BINARY_DIR}/include/config.mem + ${CMAKE_CURRENT_BINARY_DIR}/include/filter.mem + ${CMAKE_CURRENT_BINARY_DIR}/include/input.mem + ${CMAKE_CURRENT_BINARY_DIR}/include/inter_size.mem + ${CMAKE_CURRENT_BINARY_DIR}/include/output_size.mem +) +target_include_directories(dla_aot_splitter_example PRIVATE + 
${CMAKE_CURRENT_BINARY_DIR}/include +) + +if (DEFINED ENV{AOT_SPLITTER_EXAMPLE_MODEL}) + set (AOT_SPLITTER_EXAMPLE_MODEL $ENV{AOT_SPLITTER_EXAMPLE_MODEL}) +else() + if (EXISTS $ENV{COREDLA_WORK}/demo/models/public/resnet-50-tf/FP32/resnet-50-tf.xml) + set (AOT_SPLITTER_EXAMPLE_MODEL $ENV{COREDLA_WORK}/demo/models/public/resnet-50-tf/FP32/resnet-50-tf.xml) + else() + # The path below is for Intel internal use only + if (EXISTS /p/psg/swip/dla/caffe/caffe_reference/ngraph_ir/coredla/ModelZoo/2021_4_1/resnet_50_tf/FP32/resnet-50-tf.xml) + set (AOT_SPLITTER_EXAMPLE_MODEL /p/psg/swip/dla/caffe/caffe_reference/ngraph_ir/coredla/ModelZoo/2021_4_1/resnet_50_tf/FP32/resnet-50-tf.xml) + endif() + endif() +endif() + +if (DEFINED ENV{AOT_SPLITTER_EXAMPLE_INPUT}) + set (AOT_SPLITTER_EXAMPLE_INPUT $ENV{AOT_SPLITTER_EXAMPLE_INPUT}) +else() + if (EXISTS $ENV{COREDLA_ROOT}/demo/sample_images/val_00000000.bmp) + set (AOT_SPLITTER_EXAMPLE_INPUT $ENV{COREDLA_ROOT}/demo/sample_images/val_00000000.bmp) + else() + # The path below is for Intel internal use only + if (EXISTS /p/psg/swip/dla/images/imagenet/ILSVRC2012_224x224/BMP/BMP/ILSVRC2012_val_00000000.bmp) + set (AOT_SPLITTER_EXAMPLE_INPUT /p/psg/swip/dla/images/imagenet/ILSVRC2012_224x224/BMP/BMP/ILSVRC2012_val_00000000.bmp) + endif() + endif() +endif() + +if (EXISTS ${CoreDLA_DIR}/../bin) + set(COREDLA_BIN ${CoreDLA_DIR}/../bin) + set(COREDLA_LIB ${CoreDLA_DIR}/../lib) + set(COREDLA_EXARCH ${CoreDLA_DIR}/../example_architectures) + if(DE10_AGILEX OR AGX7_IDK OR AGX7_N6001) + set (AOT_SPLITTER_EXAMPLE_ARCH AGX7_Performance.arch) + elseif(SYSTEM_CONSOLE_PLATFORM) + set (AOT_SPLITTER_EXAMPLE_ARCH AGX5_Small_Softmax.arch) + else() + set (AOT_SPLITTER_EXAMPLE_ARCH A10_Performance.arch) + endif() +else() + set(COREDLA_BIN $ENV{COREDLA_ROOT}/build/coredla/dla/bin) + set(COREDLA_LIB $ENV{COREDLA_ROOT}/build/coredla/dla/lib) + set(COREDLA_EXARCH $ENV{COREDLA_ROOT}/example_architectures) + + # The paths below are for Intel internal use only 
+ if(DE10_AGILEX) + set (AOT_SPLITTER_EXAMPLE_ARCH arch/descriptions/AGX7/64x32_i5x1_fp13agx_sb31744_xbark32_clamp_preluk32_poolk4_softmax_1inst.arch) + elseif(AGX7_IDK OR AGX7_N6001) + set (AOT_SPLITTER_EXAMPLE_ARCH arch/descriptions/AGX7/32x64_i5x1_fp13agx_sb32768_poolk4_actk32_prelu_rclamp_sig_softmaxk1.arch) + elseif(SYSTEM_CONSOLE_PLATFORM) + set (AOT_SPLITTER_EXAMPLE_ARCH 16x16_i12x1_fp12agx_sb8192_poolk4_actk16_clamp_softmaxk1.arch) + else() + set (AOT_SPLITTER_EXAMPLE_ARCH arch/descriptions/A10/64x32_i4x1_fp11_sb31744_xbark32_clamp_preluk32_poolk4_softmax.arch) + endif() +endif() + +if (NOT DEFINED AOT_SPLITTER_INPUT_ARGUMENTS) + set (AOT_SPLITTER_INPUT_ARGUMENTS ) + if (DEFINED AOT_SPLITTER_EXAMPLE_INPUT) + set (AOT_SPLITTER_INPUT_ARGUMENTS -i ${AOT_SPLITTER_EXAMPLE_INPUT} -bgr) + endif() +endif() + +# Need to copy the system console script for Agilex 5E JTAG ED +# Also link against Boost +if (SYSTEM_CONSOLE_PLATFORM) + find_package(Boost REQUIRED COMPONENTS filesystem) + target_link_libraries(dla_aot_splitter_example PRIVATE Boost::filesystem) + add_custom_command( + TARGET dla_aot_splitter_example POST_BUILD + COMMAND ${CMAKE_COMMAND} -E copy + ${CMAKE_SOURCE_DIR}/coredla_device/mmd/system_console/system_console_script.tcl + ${CMAKE_CURRENT_BINARY_DIR}/system_console_script.tcl + ) + target_compile_definitions(dla_aot_splitter_example PRIVATE DLA_SYSCON_SOURCE_ROOT=${CMAKE_CURRENT_BINARY_DIR}) +endif() + +add_custom_command( + OUTPUT + ${CMAKE_CURRENT_BINARY_DIR}/include/arch_build.mem + ${CMAKE_CURRENT_BINARY_DIR}/include/config.mem + ${CMAKE_CURRENT_BINARY_DIR}/include/filter.mem + ${CMAKE_CURRENT_BINARY_DIR}/include/input.mem + ${CMAKE_CURRENT_BINARY_DIR}/include/inter_size.mem + ${CMAKE_CURRENT_BINARY_DIR}/include/output_size.mem + COMMAND + LD_LIBRARY_PATH=$ENV{LD_LIBRARY_PATH}:${COREDLA_LIB} ${COREDLA_BIN}/dlac --network-file ${AOT_SPLITTER_EXAMPLE_MODEL} --march ${COREDLA_EXARCH}/${AOT_SPLITTER_EXAMPLE_ARCH} --foutput-format open_vino_hetero --o 
${CMAKE_CURRENT_BINARY_DIR}/resnet.bin + COMMAND + LD_LIBRARY_PATH=$ENV{LD_LIBRARY_PATH}:${COREDLA_LIB} $<TARGET_FILE:dla_aot_splitter> ${AOT_SPLITTER_INPUT_ARGUMENTS} -cm ${CMAKE_CURRENT_BINARY_DIR}/resnet.bin -plugins $<TARGET_FILE_DIR:dla_aot_splitter_plugin>/plugins_aot_splitter.xml + DEPENDS + ${COREDLA_BIN}/dlac + dla_benchmark + dla_aot_splitter + dla_aot_splitter_plugin + ${AOT_SPLITTER_EXAMPLE_MODEL} + ${COREDLA_EXARCH}/${AOT_SPLITTER_EXAMPLE_ARCH} + ${AOT_SPLITTER_EXAMPLE_INPUT} + $<TARGET_FILE_DIR:dla_aot_splitter_plugin>/plugins_aot_splitter.xml + WORKING_DIRECTORY + ${CMAKE_CURRENT_BINARY_DIR}/include +) diff --git a/python/openvino/runtime/dla_aot_splitter/dla_aot_splitter_example/src/main.cpp b/python/openvino/runtime/dla_aot_splitter/dla_aot_splitter_example/src/main.cpp new file mode 100644 index 0000000..b90ccd5 --- /dev/null +++ b/python/openvino/runtime/dla_aot_splitter/dla_aot_splitter_example/src/main.cpp @@ -0,0 +1,180 @@ +// Copyright 2022 Intel Corporation. +// +// This software and the related documents are Intel copyrighted materials, +// and your use of them is governed by the express license under which they +// were provided to you ("License"). Unless the License provides otherwise, +// you may not use, modify, copy, publish, distribute, disclose or transmit +// this software or the related documents without Intel's prior written +// permission. +// +// This software and the related documents are provided as is, with no express +// or implied warranties, other than those that are expressly stated in the +// License. + +// +// This small tool demonstrates the minimum number of steps necessary to run an +// inference on the FPGA while using the output files from the AoT splitter. 
+// + +#include <iostream> +#include <iomanip> +#include <fstream> +#include <stdint.h> +#include <array> +#include <cstring> //memcpy + +uint32_t arch_build_mem_32[] = +{ + #include "arch_build.mem" +}; +uint8_t* const arch_build_mem = (uint8_t*)&arch_build_mem_32[0]; +const uint32_t arch_build_mem_size = sizeof(arch_build_mem_32); + +uint32_t input_mem_32[] = +{ + #include "input.mem" +}; +uint8_t* const input_mem = sizeof(input_mem_32) ? (uint8_t*)&input_mem_32[0] : nullptr; +const uint32_t input_mem_size = sizeof(input_mem_32); + +uint32_t config_mem_32[] = +{ + #include "config.mem" +}; +uint8_t* const config_mem = (uint8_t*)&config_mem_32[0]; +const uint32_t config_mem_size = sizeof(config_mem_32); + +uint32_t filter_mem_32[] = +{ + #include "filter.mem" +}; +uint8_t* const filter_mem = (uint8_t*)&filter_mem_32[0]; +const uint32_t filter_mem_size = sizeof(filter_mem_32); + +constexpr uint32_t output_mem_size = + #include "output_size.mem" +; + +constexpr uint32_t inter_mem_size = + #include "inter_size.mem" +; + +#include "mmd_wrapper.h" +#include "device_memory_allocator.h" +#include "dla_dma_constants.h" //DLA_DMA_CSR_OFFSET_*** + +int main(int argc, char *argv[]) { + std::array<uint8_t, output_mem_size> actual_output_mem; + for (uint64_t i=0u; i < actual_output_mem.size();i++) + { + actual_output_mem[i] = (0xDEADBEEF) >> ((3-(i%4)) * 8); + } + + std::cout << "AOT Splitter Example" << std::endl; + + constexpr int instance = 0; + + constexpr int _maxNumPipelines = 5; + constexpr int numPipelines = _maxNumPipelines; + + // TODO: retrieve this from the arch file + constexpr uint64_t featureWordSize = 32; + constexpr uint64_t filterWordSize = 64; + + + constexpr int ARCH_HASH_SIZE = 16; + constexpr int BUILD_VERSION_SIZE = 32; + + MmdWrapper mmdWrapper{}; + DeviceMemoryAllocator ddrAllocator{}; + + for (size_t i = 0; i < ARCH_HASH_SIZE; i+=4) { + uint32_t arch_build_word_from_device = mmdWrapper.ReadFromCsr(instance, i); + if (arch_build_mem_32[i/4] != 
arch_build_word_from_device) + { + std::cout << "Arch hash mismatch at word " << i << " : expected " << + std::setfill('0') << std::setw(8) << std::uppercase << std::hex << (uint32_t)arch_build_mem_32[i/4] << + " != " << + std::setfill('0') << std::setw(8) << std::uppercase << std::hex << (uint32_t)arch_build_word_from_device << std::endl; + return 1; + } + } + char expected_build_version[BUILD_VERSION_SIZE + 1]; + expected_build_version[BUILD_VERSION_SIZE] = '\0'; + std::memcpy(expected_build_version, (uint8_t*)&arch_build_mem_32[ARCH_HASH_SIZE/sizeof(uint32_t)], BUILD_VERSION_SIZE); + + char actual_build_version[BUILD_VERSION_SIZE + 1]; + actual_build_version[BUILD_VERSION_SIZE] = '\0'; + + for (uint32_t i=0;i < BUILD_VERSION_SIZE; i+=4) + { + uint32_t chunk = mmdWrapper.ReadFromCsr(instance, ARCH_HASH_SIZE + i); + for (uint8_t j=0;j < 4; j++) + { + actual_build_version[i+j] = chunk & 0xFF; + chunk >>= 8; + } + } + if (0 != std::strncmp(expected_build_version, actual_build_version, BUILD_VERSION_SIZE)) + { + std::cout << "Build version mismath. 
Expected " << expected_build_version << " actual " << actual_build_version << std::endl; + return 1; + } + + ddrAllocator.Initialize(mmdWrapper.GetDDRSizePerInstance(), &mmdWrapper); + + ddrAllocator.AllocateSharedBuffer(inter_mem_size, instance); + //mmdWrapper.WriteToCsr(instance, DLA_DMA_CSR_OFFSET_INTERMEDIATE_BASE_ADDR, 0); + + + uint64_t inputOutputBufferSize = numPipelines * (input_mem_size + output_mem_size); // how much space to allocate + uint64_t inputOutputBufferAlignment = featureWordSize; // starting address must be aligned to this + uint64_t inputOutputBufferAddr; // where did the allocator place this buffer + ddrAllocator.AllocatePrivateBuffer(inputOutputBufferSize, inputOutputBufferAlignment, inputOutputBufferAddr); + + uint64_t configFilterBufferSize = config_mem_size + filter_mem_size; + uint64_t configFilterBufferAlignment = filterWordSize; + uint64_t configFilterBufferAddr; + ddrAllocator.AllocatePrivateBuffer(configFilterBufferSize, configFilterBufferAlignment, configFilterBufferAddr); + + mmdWrapper.WriteToCsr(instance, DLA_DMA_CSR_OFFSET_INTERRUPT_MASK, 0); + mmdWrapper.WriteToCsr(instance, DLA_DMA_CSR_OFFSET_INTERRUPT_CONTROL, 3); + uint32_t completionCount = mmdWrapper.ReadFromCsr(instance, DLA_DMA_CSR_OFFSET_COMPLETION_COUNT); + std::cout << "Initial completion count " << completionCount << std::endl; + + mmdWrapper.WriteToDDR(instance, inputOutputBufferAddr, input_mem_size, input_mem); + + mmdWrapper.WriteToDDR(instance, configFilterBufferAddr, config_mem_size, config_mem); + mmdWrapper.WriteToDDR(instance, configFilterBufferAddr + config_mem_size, filter_mem_size, filter_mem); + + mmdWrapper.WriteToCsr(instance, DLA_DMA_CSR_OFFSET_CONFIG_BASE_ADDR, configFilterBufferAddr); + constexpr int CONFIG_READER_DATA_BYTES = 8; // May want to move to a header in production code + mmdWrapper.WriteToCsr(instance, DLA_DMA_CSR_OFFSET_CONFIG_RANGE_MINUS_TWO, ((config_mem_size) / CONFIG_READER_DATA_BYTES) - 2); + + + // base address for feature reader 
-- this will trigger one run of DLA + mmdWrapper.WriteToCsr(instance, DLA_DMA_CSR_OFFSET_INPUT_OUTPUT_BASE_ADDR, inputOutputBufferAddr); + + int i=0; + while(mmdWrapper.ReadFromCsr(instance, DLA_DMA_CSR_OFFSET_COMPLETION_COUNT) == completionCount) + { + i++; + if (i % 100000 == 0) { + std::cout << "Timeout" << std::endl; + return 1; + } + } + + std::cout << "Completed infered in " << i << " polling intervals" << std::endl; + + //Reading from pipeline zero + mmdWrapper.ReadFromDDR(instance, inputOutputBufferAddr + input_mem_size, actual_output_mem.size(), actual_output_mem.data()); + + std::ofstream of ("actual_output.mem", std::ios_base::out | std::ios_base::binary); + if (of) { + of.write((const char*)actual_output_mem.data(), actual_output_mem.size()); + } + of.close(); + + return 0; +} diff --git a/python/openvino/runtime/dla_aot_splitter/dla_aot_splitter_plugin/CMakeLists.txt b/python/openvino/runtime/dla_aot_splitter/dla_aot_splitter_plugin/CMakeLists.txt new file mode 100644 index 0000000..6f5e916 --- /dev/null +++ b/python/openvino/runtime/dla_aot_splitter/dla_aot_splitter_plugin/CMakeLists.txt @@ -0,0 +1,113 @@ +cmake_minimum_required(VERSION 3.10) + +add_library(dla_aot_splitter_plugin SHARED) + +target_compile_features(dla_aot_splitter_plugin PUBLIC cxx_std_11) + +target_compile_definitions(dla_aot_splitter_plugin PUBLIC DISABLE_JIT) + +set_target_properties(dla_aot_splitter_plugin PROPERTIES POSITION_INDEPENDENT_CODE ON) + +if (WIN32) + # Fix warning C4273: inconsistent dll linkage + target_compile_definitions(dla_aot_splitter_plugin PRIVATE XBYAK_NO_OP_NAMES + IMPLEMENT_INFERENCE_ENGINE_PLUGIN + $<TARGET_PROPERTY:openvino::runtime,INTERFACE_COMPILE_DEFINITIONS>) +endif() + +target_include_directories(dla_aot_splitter_plugin PRIVATE + $ENV{COREDLA_ROOT}/dla_plugin + $ENV{COREDLA_ROOT}/dla_plugin/inc + $ENV{COREDLA_ROOT}/dla_plugin/inc/dlia + $ENV{COREDLA_ROOT}/util/inc # dla_error.h + $ENV{COREDLA_ROOT}/inc # dla_dma_constants.svh + 
$ENV{COREDLA_ROOT}/runtime/coredla_device/inc # For abstract classes (BatchJob, Device etc.) + # + ${CMAKE_CURRENT_SOURCE_DIR}/inc +) + +target_sources(dla_aot_splitter_plugin PRIVATE +## + $ENV{COREDLA_ROOT}/dla_plugin/inc/dla_async_infer_request.h + $ENV{COREDLA_ROOT}/dla_plugin/inc/dla_config.hpp + $ENV{COREDLA_ROOT}/dla_plugin/inc/dla_compiled_model.h + $ENV{COREDLA_ROOT}/dla_plugin/inc/dla_runtime_log.h + $ENV{COREDLA_ROOT}/dla_plugin/inc/dlia_infer_request.h + $ENV{COREDLA_ROOT}/dla_plugin/inc/dlia_plugin.h + $ENV{COREDLA_ROOT}/dla_plugin/inc/dlia_utils.h + $ENV{COREDLA_ROOT}/dla_plugin/inc/dla_plugin_config.hpp +## + $ENV{COREDLA_ROOT}/dla_plugin/src/dla_async_infer_request.cpp + $ENV{COREDLA_ROOT}/dla_plugin/src/dla_config.cpp + $ENV{COREDLA_ROOT}/dla_plugin/src/dla_compiled_model.cpp + $ENV{COREDLA_ROOT}/dla_plugin/src/dlia_infer_request.cpp + $ENV{COREDLA_ROOT}/dla_plugin/src/dlia_plugin.cpp + $ENV{COREDLA_ROOT}/dla_plugin/src/dla_plugin_jit_functions.cpp + $ENV{COREDLA_ROOT}/dla_plugin/src/dlia_utils.cpp + $ENV{COREDLA_ROOT}/util/src/dla_numeric_utils.cpp +## + $ENV{COREDLA_ROOT}/runtime/coredla_device/inc/graph_job.h + $ENV{COREDLA_ROOT}/runtime/coredla_device/inc/batch_job.h + $ENV{COREDLA_ROOT}/runtime/coredla_device/inc/device.h +## + ${CMAKE_CURRENT_SOURCE_DIR}/src/raw_graph_job.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/src/raw_device.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/src/raw_batch_job.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/src/dla_aot_utils.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/inc/raw_graph_job.h + ${CMAKE_CURRENT_SOURCE_DIR}/inc/raw_device.h + ${CMAKE_CURRENT_SOURCE_DIR}/inc/raw_batch_job.h + ${CMAKE_CURRENT_SOURCE_DIR}/inc/dla_aot_utils.h + ${CMAKE_CURRENT_SOURCE_DIR}/inc/dla_aot_structs.h +) + +if (WIN32) + target_link_libraries(dla_aot_splitter_plugin + PRIVATE +## + dla_op_transformation + dliaPluginIOTransformations + openvino::runtime + openvino_dev_api + ${TBB_IMPORTED_TARGETS} +) +else() + target_link_libraries(dla_aot_splitter_plugin + PRIVATE +## + 
pthread + dla_op_transformation + dliaPluginIOTransformations + openvino::runtime + openvino_dev_api + ${TBB_IMPORTED_TARGETS} +) +endif() + +if (DISABLE_JIT) + target_include_directories(dla_aot_splitter_plugin PRIVATE + $ENV{COREDLA_ROOT}/util/inc + $ENV{COREDLA_XUTIL_DIR}/compiled_result/inc + ) + target_sources(dla_aot_splitter_plugin PRIVATE $ENV{COREDLA_XUTIL_DIR}/compiled_result/src/compiled_result_reader_writer.cpp) + + if (EXISTS $ENV{COREDLA_ROOT}/inc) + target_include_directories(dla_aot_splitter_plugin PUBLIC $ENV{COREDLA_ROOT}/inc) + else() + target_include_directories(dla_aot_splitter_plugin PUBLIC $ENV{COREDLA_ROOT}/build/coredla/dla/inc) + endif() +else() + target_link_libraries(dla_aot_splitter_plugin + PRIVATE + dla_compiled_result + archparam + ) +endif() + +set_target_properties(dliaPluginIOTransformations PROPERTIES POSITION_INDEPENDENT_CODE ON) + +if (WIN32) + configure_file(${CMAKE_CURRENT_SOURCE_DIR}/plugins_aot_splitter_win.xml ${CMAKE_CURRENT_BINARY_DIR}/plugins_aot_splitter.xml COPYONLY) +else() + configure_file(${CMAKE_CURRENT_SOURCE_DIR}/plugins_aot_splitter.xml ${CMAKE_CURRENT_BINARY_DIR}/ COPYONLY) +endif() diff --git a/python/openvino/runtime/dla_aot_splitter/dla_aot_splitter_plugin/inc/dla_aot_structs.h b/python/openvino/runtime/dla_aot_splitter/dla_aot_splitter_plugin/inc/dla_aot_structs.h new file mode 100644 index 0000000..697b5d2 --- /dev/null +++ b/python/openvino/runtime/dla_aot_splitter/dla_aot_splitter_plugin/inc/dla_aot_structs.h @@ -0,0 +1,38 @@ +// Copyright 2020 Intel Corporation. +// +// This software and the related documents are Intel copyrighted materials, +// and your use of them is governed by the express license under which they +// were provided to you ("License"). Unless the License provides otherwise, +// you may not use, modify, copy, publish, distribute, disclose or transmit +// this software or the related documents without Intel's prior written +// permission. 
+// +// This software and the related documents are provided as is, with no express +// or implied warranties, other than those that are expressly stated in the +// License. + +#ifndef _DLA_AOT_STRUCTS_H_ +#define _DLA_AOT_STRUCTS_H_ + +#include "compiled_result.h" + +// Custom type +typedef unsigned char uint8_t; + +// All size and offset fields are in bytes. +typedef struct { + const dla::CompiledResult* compiled_result; + uint32_t config_buffer_size; + uint32_t filter_bias_scale_buffer_size; + uint8_t *input_feature_buffer; + uint32_t input_feature_buffer_size; + uint32_t output_feature_buffer_size; + uint32_t intermediate_feature_buffer_size; +} DLAInput; + +typedef struct { + // Its size is output_feature_buffer_size in DLAInput. + uint8_t *output_feature_buffer; +} DLAOutput; + +#endif // _DLA_REF_H_ diff --git a/python/openvino/runtime/dla_aot_splitter/dla_aot_splitter_plugin/inc/dla_aot_utils.h b/python/openvino/runtime/dla_aot_splitter/dla_aot_splitter_plugin/inc/dla_aot_utils.h new file mode 100644 index 0000000..7fa23e8 --- /dev/null +++ b/python/openvino/runtime/dla_aot_splitter/dla_aot_splitter_plugin/inc/dla_aot_utils.h @@ -0,0 +1,49 @@ +// Copyright 2020-2023 Intel Corporation. +// +// This software and the related documents are Intel copyrighted materials, +// and your use of them is governed by the express license under which they +// were provided to you ("License"). Unless the License provides otherwise, +// you may not use, modify, copy, publish, distribute, disclose or transmit +// this software or the related documents without Intel's prior written +// permission. +// +// This software and the related documents are provided as is, with no express +// or implied warranties, other than those that are expressly stated in the +// License. 
+ +#ifndef _DLA_AOT_UTILS_H_ +#define _DLA_AOT_UTILS_H_ + +#include <fcntl.h> +#include <google/protobuf/io/zero_copy_stream_impl.h> +#include <google/protobuf/text_format.h> +#include <sys/stat.h> + +#include <iostream> +#include <string> +#include <vector> + +#include "dla_aot_structs.h" + +using google::protobuf::io::FileInputStream; + +// fp16 feature element (in bytes) +// TODO: extract it from arch / compiled result +const uint32_t feature_elem_size = 2; + +////////////////////////////////////////////////////////////////////////////// +// Dump DLA input and output to the following files: +// - config_filter.mem: config + filter buffer +// - input_feature.mem: input feature buffer +// - output_feature.mem: output feature buffer (emulation results) +// +// Each .mem file is a text file, with one byte (in hex) per line. +////////////////////////////////////////////////////////////////////////////// + +void writeInputOutputToFiles(const std::vector<int>& arch_hash, + const std::string& build_version, + const std::string& arch_name, + const DLAInput& input, + const DLAOutput& output); + +#endif // _DLA_AOT_UTILS_H_ diff --git a/python/openvino/runtime/dla_aot_splitter/dla_aot_splitter_plugin/inc/raw_batch_job.h b/python/openvino/runtime/dla_aot_splitter/dla_aot_splitter_plugin/inc/raw_batch_job.h new file mode 100644 index 0000000..dd8e5fa --- /dev/null +++ b/python/openvino/runtime/dla_aot_splitter/dla_aot_splitter_plugin/inc/raw_batch_job.h @@ -0,0 +1,79 @@ +// Copyright 2020-2023 Intel Corporation. +// +// This software and the related documents are Intel copyrighted materials, +// and your use of them is governed by the express license under which they +// were provided to you ("License"). Unless the License provides otherwise, +// you may not use, modify, copy, publish, distribute, disclose or transmit +// this software or the related documents without Intel's prior written +// permission. 
+// +// This software and the related documents are provided as is, with no express +// or implied warranties, other than those that are expressly stated in the +// License. +#ifndef RAW_BATCH_JOB_H +#define RAW_BATCH_JOB_H + +#include <assert.h> +#include <cstdio> +#if defined(_WIN32) || defined(_WIN64) +#ifndef NOMINMAX +#define NOMINMAX +#endif +#include <windows.h> +#else +#include <dlfcn.h> +#endif +#include <cstring> +#include <iostream> +#include <string> +#include <thread> +#include <memory> + +#include "batch_job.h" +#include "dla_aot_structs.h" +#include "raw_device.h" + +// RawBatchJob represents one batch execution +// Contains functions to start DLA +class RawBatchJob : public BatchJob { + private: + const CompiledResult* compiledResult; + DLAInput* dlaBuffers_; + DLAOutput output_; + int instance_; + uint32_t debugLevel_; + std::string AES_key_; + std::string IV_key_; + bool encryption_enabled_; + RawBatchJob(const CompiledResult* compiledResult, + DLAInput* dlaBuffers, + int instance, + uint32_t debugLevel, + std::string AES_key, + std::string IV_key, + bool encryption_enabled); + + public: + RawBatchJob(const RawBatchJob&) = delete; + RawBatchJob(RawBatchJob&) = delete; + RawBatchJob& operator=(const RawBatchJob&) = delete; + static unique_ptr<BatchJob> MakeUnique(const CompiledResult* compiledResult, + DLAInput* dlaBuffers, + int instance, + uint32_t debugLevel, + std::string AES_key, + std::string IV_key, + bool encryption_enabled); + // @param inputArray - ptr to CPU array containing input data tp be copied to DDR + // blocking function + void LoadInputFeatureToDDR(void* inputArray); + // Starts DLA by writing to CSR in DLA DMA; the DDR addresses of graph config and input data + void StartDla() override; + // @param outputArray - ptr to CPU array where the output data in DDR is copied into + // outputArray must be allocated by the caller (size >= output_size_ddr) + // blocking function + void ReadOutputFeatureFromDDR(void* outputArray) const; + 
void ScheduleInputFeature() const {} +}; + +#endif diff --git a/python/openvino/runtime/dla_aot_splitter/dla_aot_splitter_plugin/inc/raw_device.h b/python/openvino/runtime/dla_aot_splitter/dla_aot_splitter_plugin/inc/raw_device.h new file mode 100644 index 0000000..168707e --- /dev/null +++ b/python/openvino/runtime/dla_aot_splitter/dla_aot_splitter_plugin/inc/raw_device.h @@ -0,0 +1,81 @@ +// Copyright 2020-2023 Intel Corporation. +// +// This software and the related documents are Intel copyrighted materials, +// and your use of them is governed by the express license under which they +// were provided to you ("License"). Unless the License provides otherwise, +// you may not use, modify, copy, publish, distribute, disclose or transmit +// this software or the related documents without Intel's prior written +// permission. +// +// This software and the related documents are provided as is, with no express +// or implied warranties, other than those that are expressly stated in the +// License. 
+#ifndef RAW_DEVICE_H +#define RAW_DEVICE_H + +#include <assert.h> +#include <chrono> +#include <cstdio> +#include <cstring> +#include <iostream> +#include <memory> +#include <string> +#include <thread> +#include <vector> +#include <map> +#include "arch_params.h" +#include "compiled_result.h" +#include "device.h" +using namespace std; +using namespace dla; +class GraphJob; + +class RawDevice : public Device { + public: + GraphJob* CreateGraphJob(const CompiledResult* compiledResult, + size_t numPipelines, + int instance, + std::string AES_key, + std::string IV_key, + bool encryption_enabled, + const std::string export_dir, + const std::string parameter_rom_export_dir); + // Return number of DLA jobs completed till now + // Used for debugging + int GetNumInferencesCompleted(int instance) const override; + // Must be called when there are no active jobs on DLA + // Returns the total time taken by DLA jobs on hardware (in milliseconds) + double GetActiveHWTimeMs(int instance) const override; + // Must be called when there are no active jobs on DLA + // Returns the average of time taken per job (in milliseconds) + // Avg Time per job < Active Time + double GetAvgHWTimePerJobMs(size_t num_jobs, int instance) const override; + RawDevice(const arch_params* archParams); + void WaitForDla(int instance, + size_t threadId = 0, + std::function<bool()> isCancelled = nullptr) override; // threadId is for debugging purpose only + std::string SchedulerGetStatus() const override { return ""; } + bool InitializeScheduler(uint32_t sourceBufferSize, + uint32_t dropSourceBuffers, + uint32_t numInferenceRequests, + const std::string source_fifo_file = "") override { + return true; + } + int GetNumInstances() const override { return numInstances_; } + int GetSizeCsrDescriptorQueue() const override { return -1; } // meaningless here + double GetCoreDlaClockFreq() const override { return -1.0; } // meaningless here + std::map<std::string, uint64_t> ReadDebugNetwork(int instance) const 
override { + return std::map<std::string, uint64_t>(); + }; + uint64_t GetNumInputFeatureMemoryReads(int instance) const override { return 0; }; + uint64_t GetNumFilterMemoryReads(int instance) const override {return 0; }; + uint64_t GetNumOutputFeatureMemoryWrites(int instance) const override {return 0; }; + + private: + RawDevice() = delete; + vector<unique_ptr<GraphJob>> allGraphJobs_; + int numInstances_; + const arch_params* archParams_; +}; + +#endif // RAW_DEVICE_H diff --git a/python/openvino/runtime/dla_aot_splitter/dla_aot_splitter_plugin/inc/raw_graph_job.h b/python/openvino/runtime/dla_aot_splitter/dla_aot_splitter_plugin/inc/raw_graph_job.h new file mode 100644 index 0000000..38ad075 --- /dev/null +++ b/python/openvino/runtime/dla_aot_splitter/dla_aot_splitter_plugin/inc/raw_graph_job.h @@ -0,0 +1,80 @@ +// Copyright 2020-2023 Intel Corporation. +// +// This software and the related documents are Intel copyrighted materials, +// and your use of them is governed by the express license under which they +// were provided to you ("License"). Unless the License provides otherwise, +// you may not use, modify, copy, publish, distribute, disclose or transmit +// this software or the related documents without Intel's prior written +// permission. +// +// This software and the related documents are provided as is, with no express +// or implied warranties, other than those that are expressly stated in the +// License. +#ifndef RAW_GRAPH_JOB_H +#define RAW_GRAPH_JOB_H + +#include <assert.h> +#include <cstdio> +#include <memory> +#include <vector> +#if defined(_WIN32) || defined(_WIN64) +#ifndef NOMINMAX +#define NOMINMAX +#endif +#include <windows.h> +#else +#include <dlfcn.h> +#endif +#include <cstring> +#include <iostream> +#include <string> +#include <thread> +#include "compiled_result.h" + +#include "dla_aot_structs.h" +#include "graph_job.h" +#include "raw_batch_job.h" +#include "raw_device.h" +using namespace dla; +/*! 
RawGraphJob is a DLA compiled graph loaded onto a emulation device + * Initialized with Emulator Device object + * RawGraphJob stores arrays filter, bias, config, inputs and outputs + * It provides handle to "batch job" objects that are used to load input and start DLA for one batch + */ +class RawGraphJob : public GraphJob { + public: + static unique_ptr<GraphJob> MakeUnique(const arch_params* archParams, + const CompiledResult* compiled_result, + size_t numPipelines, + int instance, + uint32_t debugLevel, + std::string AES_key, + std::string IV_key, + bool encryption_enabled); + // Returns an unused batch job object + // If all batch jobs are used, returns null + // Increments batchJobsRequested_ + // Thread safe + BatchJob* GetBatchJob(); + RawGraphJob(const GraphJob&) = delete; + RawGraphJob(RawGraphJob&) = delete; + RawGraphJob& operator=(const RawGraphJob&) = delete; + + private: + DLAInput dlaBuffers_; + vector<unique_ptr<BatchJob>> batchJobs_; + int instance_; + uint32_t debugLevel_; + unsigned int batchJobsRequested_; + std::mutex graphJobMutex; + RawGraphJob(const arch_params* archParams, + const CompiledResult* compiledResult, + size_t numPipelines, + int instance, + uint32_t debugLevel, + std::string AES_key, + std::string IV_key, + bool encryption_enabled); +}; + +#endif diff --git a/python/openvino/runtime/dla_aot_splitter/dla_aot_splitter_plugin/plugins_aot_splitter.xml b/python/openvino/runtime/dla_aot_splitter/dla_aot_splitter_plugin/plugins_aot_splitter.xml new file mode 100644 index 0000000..2f2d24e --- /dev/null +++ b/python/openvino/runtime/dla_aot_splitter/dla_aot_splitter_plugin/plugins_aot_splitter.xml @@ -0,0 +1,18 @@ +<ie> + <plugins> + <plugin name="GNA" location="libopenvino_intel_gna_plugin.so"> + </plugin> + <plugin name="HETERO" location="libcoreDLAHeteroPlugin.so"> + </plugin> + <plugin name="CPU" location="libopenvino_intel_cpu_plugin.so"> + </plugin> + <plugin name="MULTI" location="libopenvino_auto_plugin.so"> + </plugin> + 
<plugin name="GPU" location="libopenvino_intel_gpu_plugin.so"> + </plugin> + <plugin name="MYRIAD" location="libopenvino_intel_myriad_plugin.so"> + </plugin> + <plugin name="FPGA" location="libdla_aot_splitter_plugin.so"> + </plugin> + </plugins> +</ie> diff --git a/python/openvino/runtime/dla_aot_splitter/dla_aot_splitter_plugin/plugins_aot_splitter_win.xml b/python/openvino/runtime/dla_aot_splitter/dla_aot_splitter_plugin/plugins_aot_splitter_win.xml new file mode 100755 index 0000000..aeeedde --- /dev/null +++ b/python/openvino/runtime/dla_aot_splitter/dla_aot_splitter_plugin/plugins_aot_splitter_win.xml @@ -0,0 +1,22 @@ +<ie>
+ <plugins>
+ <plugin name="AUTO" location="openvino_auto_plugin.dll">
+ </plugin>
+ <plugin name="BATCH" location="openvino_auto_batch_plugin.dll">
+ </plugin>
+ <plugin name="CPU" location="openvino_intel_cpu_plugin.dll">
+ </plugin>
+ <plugin name="GNA" location="openvino_intel_gna_plugin.dll">
+ </plugin>
+ <plugin name="GPU" location="openvino_intel_gpu_plugin.dll">
+ </plugin>
+ <plugin name="HETERO" location="coreDLAHeteroPlugin.dll">
+ </plugin>
+ <plugin name="MULTI" location="openvino_auto_plugin.dll">
+ </plugin>
+ <plugin name="MYRIAD" location="openvino_intel_myriad_plugin.dll">
+ </plugin>
+ <plugin name="FPGA" location="dla_aot_splitter_plugin.dll">
+ </plugin>
+ </plugins>
+</ie>
diff --git a/python/openvino/runtime/dla_aot_splitter/dla_aot_splitter_plugin/src/CPPLINT.cfg b/python/openvino/runtime/dla_aot_splitter/dla_aot_splitter_plugin/src/CPPLINT.cfg new file mode 100644 index 0000000..3288819 --- /dev/null +++ b/python/openvino/runtime/dla_aot_splitter/dla_aot_splitter_plugin/src/CPPLINT.cfg @@ -0,0 +1,4 @@ +filter=-build/header_guard,-runtime/explicit,-build/include_subdir,-runtime/references,-build/c++11,-runtime/int +exclude_files=^(?!pe_array_sim.cpp).*\.cpp +linelength=160 +headers=h,hpp diff --git a/python/openvino/runtime/dla_aot_splitter/dla_aot_splitter_plugin/src/dla_aot_utils.cpp b/python/openvino/runtime/dla_aot_splitter/dla_aot_splitter_plugin/src/dla_aot_utils.cpp new file mode 100644 index 0000000..4317201 --- /dev/null +++ b/python/openvino/runtime/dla_aot_splitter/dla_aot_splitter_plugin/src/dla_aot_utils.cpp @@ -0,0 +1,117 @@ +// Copyright 2020 Intel Corporation. +// +// This software and the related documents are Intel copyrighted materials, +// and your use of them is governed by the express license under which they +// were provided to you ("License"). Unless the License provides otherwise, +// you may not use, modify, copy, publish, distribute, disclose or transmit +// this software or the related documents without Intel's prior written +// permission. +// +// This software and the related documents are provided as is, with no express +// or implied warranties, other than those that are expressly stated in the +// License. + +/* + This file contains some helper utilities to output coredla data blobs to files + in the current working directory +*/ + +#include "dla_aot_utils.h" + +// The resulting file is expected to be consumed by RTL testbench or hardware. 
+static void writeBufferToBinFile(const uint8_t *buffer, uint32_t buffer_size, + const char *file_path) { + FILE *fp = fopen(file_path, "wb"); + assert(nullptr != fp); + + if (buffer_size && !fwrite(buffer, buffer_size, 1, fp)) + { + std::cout << "ERROR writing to output file " << file_path << std::endl; + } + + fclose(fp); +} + +// The resulting file is expected to be consumed by RTL testbench or hardware. +static void writeBufferToFile(const uint8_t *buffer, uint32_t buffer_size, + const char *file_path) { + FILE *fp = fopen(file_path, "w"); + assert(nullptr != fp); + + // Write the buffer as comma-separated 32-bit hex words, 32 words (128 bytes) per line + for (uint32_t b = 0; b < buffer_size; b+=4) { + if (b && ((b % 128) == 0)) + { + fprintf(fp, "\n"); + } + fprintf(fp, "0x%08x", *((uint32_t*)&buffer[b])); + if(b + 4 < buffer_size) + { + fprintf(fp, ","); + } + } + + fclose(fp); +} + +// Create all files that the splitter is responsible for +void writeInputOutputToFiles ( + const std::vector<int>& arch_hash, + const std::string& build_version, + const std::string& arch_name, + const DLAInput &input, + const DLAOutput &output +) { + uint8_t arch_build[ARCH_HASH_SIZE + BUILD_VERSION_SIZE + ARCH_NAME_SIZE]; + + memset(&arch_build[0], 0, ARCH_HASH_SIZE + BUILD_VERSION_SIZE); // NOTE(review): the ARCH_NAME_SIZE tail is not zeroed; if arch_name is shorter than ARCH_NAME_SIZE, uninitialized bytes reach the output — confirm or widen the memset + memcpy(&arch_build[0], arch_hash.data(), ARCH_HASH_SIZE); + memcpy(&arch_build[ARCH_HASH_SIZE], build_version.c_str(), std::min(build_version.length(),static_cast<size_t>(BUILD_VERSION_SIZE))); + memcpy(&arch_build[ARCH_HASH_SIZE + BUILD_VERSION_SIZE], arch_name.c_str(), std::min(arch_name.length(),static_cast<size_t>(ARCH_NAME_SIZE))); + writeBufferToFile(arch_build, + sizeof(arch_build), + "arch_build.mem"); + writeBufferToFile(arch_build, + sizeof(arch_build), + "arch_build.bin"); + const auto &config_fbs_buffer = + input.compiled_result->get_config_filter_bias_scale_array(); + + // Only dump filters and config memory file when they are saved in DDR + if (!input.compiled_result->get_ddrfree_header().enable_parameter_rom) { + 
writeBufferToFile(&(config_fbs_buffer[0][0]), + input.config_buffer_size, + "config.mem"); + writeBufferToBinFile(&(config_fbs_buffer[0][0]), + input.config_buffer_size, + "config.bin"); + writeBufferToFile(&(config_fbs_buffer[0][0]) + input.config_buffer_size, + input.filter_bias_scale_buffer_size, + "filter.mem"); + writeBufferToBinFile(&(config_fbs_buffer[0][0]) + input.config_buffer_size, + input.filter_bias_scale_buffer_size, + "filter.bin"); + } else { + std::cout << "Graph filters and DLA configs are not dumped because parameter ROM is enabled in the AOT file." << std::endl; + } + uint8_t* input_buffer = nullptr; + size_t input_size = 0; + if (input.input_feature_buffer) { + input_buffer = input.input_feature_buffer; + input_size = input.input_feature_buffer_size; + } + writeBufferToFile(input_buffer, + input_size, + "input.mem"); + writeBufferToBinFile(input_buffer, + input_size, + "input.bin"); + uint32_t inter_size = input.intermediate_feature_buffer_size; + writeBufferToFile((const uint8_t*)&inter_size, + sizeof(inter_size), + "inter_size.mem"); + uint32_t output_size = input.output_feature_buffer_size; + writeBufferToFile((const uint8_t*)&output_size, + sizeof(output_size), + "output_size.mem"); +} diff --git a/python/openvino/runtime/dla_aot_splitter/dla_aot_splitter_plugin/src/raw_batch_job.cpp b/python/openvino/runtime/dla_aot_splitter/dla_aot_splitter_plugin/src/raw_batch_job.cpp new file mode 100644 index 0000000..23247d5 --- /dev/null +++ b/python/openvino/runtime/dla_aot_splitter/dla_aot_splitter_plugin/src/raw_batch_job.cpp @@ -0,0 +1,68 @@ +// Copyright 2022 Intel Corporation. +// +// This software and the related documents are Intel copyrighted materials, +// and your use of them is governed by the express license under which they +// were provided to you ("License"). 
Unless the License provides otherwise, +// you may not use, modify, copy, publish, distribute, disclose or transmit +// this software or the related documents without Intel's prior written +// permission. +// +// This software and the related documents are provided as is, with no express +// or implied warranties, other than those that are expressly stated in the +// License. + +/* + The raw_batch_job, raw_graph_job, and raw_device implement the interfaces + used by dliaPlugin to mimic a inference flow without actually providing a + inference. It is used to get the transformed input performed by the dliaPlugin + upper layers +*/ + +#include "raw_batch_job.h" +#include "dla_aot_utils.h" + +unique_ptr<BatchJob> RawBatchJob::MakeUnique(const CompiledResult * compiledResult, + DLAInput* dlaBuffers, + int instance, + uint32_t debugLevel, + std::string AES_key, + std::string IV_key, + bool encryption_enabled) { + return unique_ptr<BatchJob>(new RawBatchJob(compiledResult, dlaBuffers, instance, debugLevel, AES_key, IV_key, encryption_enabled)); +} + +RawBatchJob::RawBatchJob(const CompiledResult * compiledResult, + DLAInput* dlaBuffers, + int instance, + uint32_t debugLevel, + std::string AES_key, + std::string IV_key, + bool encryption_enabled) : compiledResult(compiledResult) { + dlaBuffers_ = dlaBuffers; + instance_ = instance; + debugLevel_= debugLevel; + AES_key_ = AES_key; + IV_key_ = IV_key; + encryption_enabled_ = encryption_enabled; + output_.output_feature_buffer = new uint8_t[dlaBuffers_->output_feature_buffer_size]; + memset(output_.output_feature_buffer, 0, dlaBuffers_->output_feature_buffer_size); + assert(nullptr != output_.output_feature_buffer); +} + +// Emulation device has no DDR. This function is just storing a pointer to the array +// Note: inputAray should not be deleted until the end of the Emulation runs +// i.e. 
StartDla completes +void RawBatchJob::LoadInputFeatureToDDR(void* inputArray) { + dlaBuffers_->input_feature_buffer = (uint8_t*) inputArray; + StartDla(); +} + +void RawBatchJob::StartDla() { + // Write input / output buffers to files + writeInputOutputToFiles(compiledResult->get_arch_hash(), compiledResult->get_build_version_string(), compiledResult->get_arch_name(), *dlaBuffers_, output_); +} + +// Emulation device has no DDR. Output is copied into the outputArray. +void RawBatchJob::ReadOutputFeatureFromDDR(void* outputArray) const { + memcpy(outputArray, output_.output_feature_buffer, dlaBuffers_->output_feature_buffer_size); +} diff --git a/python/openvino/runtime/dla_aot_splitter/dla_aot_splitter_plugin/src/raw_device.cpp b/python/openvino/runtime/dla_aot_splitter/dla_aot_splitter_plugin/src/raw_device.cpp new file mode 100644 index 0000000..0b8e838 --- /dev/null +++ b/python/openvino/runtime/dla_aot_splitter/dla_aot_splitter_plugin/src/raw_device.cpp @@ -0,0 +1,67 @@ +// Copyright 2022 Intel Corporation. +// +// This software and the related documents are Intel copyrighted materials, +// and your use of them is governed by the express license under which they +// were provided to you ("License"). Unless the License provides otherwise, +// you may not use, modify, copy, publish, distribute, disclose or transmit +// this software or the related documents without Intel's prior written +// permission. +// +// This software and the related documents are provided as is, with no express +// or implied warranties, other than those that are expressly stated in the +// License. + +/* + The raw_batch_job, raw_graph_job, and raw_device implement the interfaces + used by dliaPlugin to mimic an inference flow without actually providing an + inference. 
It is used to get the transformed input performed by the dliaPlugin + upper layers +*/ + +#include "raw_device.h" +#include "raw_graph_job.h" +unique_ptr<Device> Device::MakeUnique(const arch_params* archParams, + uint32_t waitForDlaTimeoutSeconds) { + return unique_ptr<Device>(new RawDevice(archParams)); +} + +RawDevice::RawDevice(const arch_params* archParams) { + numInstances_ = 1; + archParams_ = archParams; +} + +GraphJob* RawDevice::CreateGraphJob(const CompiledResult * compiledResult, + size_t numPipelines, + int instance, + std::string AES_key, + std::string IV_key, + bool encryption_enabled, + const std::string export_dir, + const std::string parameter_rom_export_dir) +{ + (void) export_dir; // unused in HW runtime. CoreDLA utilizes base pointers, which the SW reference utilizes this variable. We void it here. + (void) parameter_rom_export_dir; + assert(instance < numInstances_); + allGraphJobs_.push_back(move(RawGraphJob::MakeUnique(archParams_, compiledResult, numPipelines, instance, 0, + AES_key, IV_key, encryption_enabled))); + return (allGraphJobs_.back()).get(); +} + +void RawDevice::WaitForDla(int instance, size_t threadId/* = 0 */, std::function<bool()> isCancelled) { + //RawDevice does not do any real work. 
No need to wait +} + +int RawDevice::GetNumInferencesCompleted(int instance) const { + std::cout << "This function, GetNumInferencesCompleted, is not implemented for raw device" << std::endl; + return 0; +} + +double RawDevice::GetActiveHWTimeMs(int instance) const { + std::cout << "This function, GetActiveHWTimeMs, is not implemented for raw device" << std::endl; + return 0; +} + +double RawDevice::GetAvgHWTimePerJobMs(size_t num_jobs, int instance) const { + std::cout << "This function, GetAvgHWTimePerJobMs, is not implemented for raw device" << std::endl; + return 0; +} diff --git a/python/openvino/runtime/dla_aot_splitter/dla_aot_splitter_plugin/src/raw_graph_job.cpp b/python/openvino/runtime/dla_aot_splitter/dla_aot_splitter_plugin/src/raw_graph_job.cpp new file mode 100644 index 0000000..c698110 --- /dev/null +++ b/python/openvino/runtime/dla_aot_splitter/dla_aot_splitter_plugin/src/raw_graph_job.cpp @@ -0,0 +1,89 @@ +// Copyright 2022 Intel Corporation. +// +// This software and the related documents are Intel copyrighted materials, +// and your use of them is governed by the express license under which they +// were provided to you ("License"). Unless the License provides otherwise, +// you may not use, modify, copy, publish, distribute, disclose or transmit +// this software or the related documents without Intel's prior written +// permission. +// +// This software and the related documents are provided as is, with no express +// or implied warranties, other than those that are expressly stated in the +// License. + +/* + The raw_batch_job, raw_graph_job, and raw_device implement the interfaces + used by dliaPlugin to mimic a inference flow without actually providing a + inference. 
It is used to get the transformed input performed by the dliaPlugin + upper layers +*/ + +#include "raw_graph_job.h" +#include "dla_aot_utils.h" +#include <fstream> +#include "dla_defines.h" + +unique_ptr<GraphJob> RawGraphJob::MakeUnique(const arch_params* archParams, + const CompiledResult * compiledResult, + size_t numPipelines, + int instance, + uint32_t debugLevel = 0, + std::string AES_key = "", + std::string IV_key = "", + bool encryption_enabled = false) +{ + return unique_ptr<GraphJob>(new RawGraphJob(archParams, compiledResult, numPipelines, instance, debugLevel, AES_key, IV_key, encryption_enabled)); +} + +RawGraphJob::RawGraphJob(const arch_params* archParams, + const CompiledResult * compiledResult, + size_t numPipelines, + int instance, + uint32_t debugLevel, + std::string AES_key, + std::string IV_key, + bool encryption_enabled) +{ + assert(numPipelines); + instance_ = instance; + debugLevel_ = debugLevel; + batchJobsRequested_ = 0; + // input feature buffer size + // TODO: support multi-input graph + dlaBuffers_.input_feature_buffer_size = + compiledResult->get_conv_input_size_in_bytes(); + // input feature buffer to be allocated outside this routine + + // output buffer size + dlaBuffers_.output_feature_buffer_size = + compiledResult->get_conv_output_size_in_bytes(); + + // intermediate buffer size + dlaBuffers_.intermediate_feature_buffer_size = + compiledResult->get_conv_intermediate_size_in_bytes(); + + // config and filter buffer size + size_t num_config_words = compiledResult->get_num_config_words(); + dlaBuffers_.config_buffer_size = num_config_words * CONFIG_WORD_SIZE; + dlaBuffers_.filter_bias_scale_buffer_size = + compiledResult->get_total_filter_bias_scale_buffer_size(); + // store a pointer to CompiledResult to use config and filter buffer directly without copying + dlaBuffers_.compiled_result = compiledResult; + for(size_t i = 0; i < numPipelines; i++) { + batchJobs_.push_back(move(RawBatchJob::MakeUnique(compiledResult, &dlaBuffers_, 
instance_, debugLevel_, AES_key, IV_key, encryption_enabled))); + } + + dlaBuffers_.input_feature_buffer = NULL; +} + +BatchJob* RawGraphJob::GetBatchJob() { + graphJobMutex.lock(); + if(batchJobsRequested_ >= batchJobs_.size()) { + graphJobMutex.unlock(); + return nullptr; + } + auto * batchJob = batchJobs_[batchJobsRequested_].get(); + batchJobsRequested_++; + graphJobMutex.unlock(); + return batchJob; +} diff --git a/python/openvino/runtime/dla_aot_splitter/inc/dla_aot_splitter.hpp b/python/openvino/runtime/dla_aot_splitter/inc/dla_aot_splitter.hpp new file mode 100644 index 0000000..44448e8 --- /dev/null +++ b/python/openvino/runtime/dla_aot_splitter/inc/dla_aot_splitter.hpp @@ -0,0 +1,130 @@ +// Copyright 2022-2023 Intel Corporation. +// +// This software and the related documents are Intel copyrighted materials, +// and your use of them is governed by the express license under which they +// were provided to you ("License"). Unless the License provides otherwise, +// you may not use, modify, copy, publish, distribute, disclose or transmit +// this software or the related documents without Intel's prior written +// permission. +// +// This software and the related documents are provided as is, with no express +// or implied warranties, other than those that are expressly stated in the +// License. + +#pragma once + +#include <gflags/gflags.h> +#include <iostream> +#include <string> +#include <vector> + +/// @brief message for help argument +static const char help_message[] = "Print a usage message"; + +/// @brief message for images argument +static const char input_message[] = + "Optional. Path to a folder with images and/or binaries or to specific image or binary file."; + +/// @brief message for compiled model argument +static const char compiled_model_message[] = "Optional. Path to a .bin file with a trained compiled model"; + +// @brief message for the custom plugins.xml file option +static const char plugins_message[] = "Optional. 
Select a custom plugins to use."; + +// @brief message folding_option flag +static const char folding_option_message[] = "Optional. Set the folding options for dla compiler: options 0-3."; + +// @brief message fold_preprocessing flag +static const char fold_preprocessing_message[] = "Optional. Enable fold preprocessing option for dla compiler."; + +// @brief message bgr flag +static const char bgr_message[] = "Optional. Indicate images are in bgr format."; + +// @brief message encryption_key flag +static const char encryption_key_message[] = + "Optional. Encryption key (using hexidecimal characters, 16 bytes- 32 hexidecimal char)."; + +// @brief message encryption_iv flag +static const char encryption_iv_message[] = + "Optional. Initialization vector for encryption. (8 bytes - 16 hexidecimal char)"; + +// @brief message binary flag +static const char bin_data_message[] = + "Optional. Specify that the input should be read as binary data (otherwise, if input tensor has depth 1, or 3 it " + "will default to U8 image processing)."; + +/// @brief message resize flag +static const char input_image_resize_message[] = + "Optional. Input image resizing methods when the input image width and height do not match the desired " + "input width and height of the model. resize: Resizing the input image to the model input size; " + "pad_resize: Pad the input image with black pixels (i.e., 0) into a squared image and " + "resize the padded image to model input size."; + +/// @brief message enable early-access features flag +static const char enable_early_access_message[] = + "Optional. Enables early access (EA) features of FPGA AI Suite. These are features that are actively being " + "developed and have not yet met production quality standards. These features may have flaws. 
" + "Consult the FPGA AI Suite documentation for details."; + +/// @brief Define flag for showing help message <br> +DEFINE_bool(h, false, help_message); + +/// @brief Declare flag for showing help message <br> +DECLARE_bool(help); + +/// @brief Define parameter for set image file <br> +/// i or mif is a required parameter +DEFINE_string(i, "", input_message); + +/// @brief Define parameter for compiled model file <br> +/// It is not a required parameter +DEFINE_string(cm, "", compiled_model_message); + +/// @brief Path to a plugins_xml file +DEFINE_string(plugins, "", plugins_message); + +/// @brief Define flag whether the image is in bgr format +DEFINE_bool(bgr, false, bgr_message); + +/// Select folding options; 0,1,2,3 +DEFINE_int32(folding_option, 1, folding_option_message); + +/// @brief Define flag for enabling folding preprocessing +DEFINE_bool(fold_preprocessing, false, fold_preprocessing_message); + +/// @brief encryption key +DEFINE_string(encryption_key, "", encryption_key_message); + +/// @brief initialization vector +DEFINE_string(encryption_iv, "", encryption_iv_message); + +/// @brief Specify that the inputs should be read as binary. 
+DEFINE_bool(bin_data, false, bin_data_message); + +/// @brief Define flag for using input image resize <br> +DEFINE_string(resize_type, "", input_image_resize_message); + +/// @brief Enables early-access (EA) features of CoreDLA <br> +DEFINE_bool(enable_early_access, false, enable_early_access_message); + +/** + * @brief This function show a help message + */ +static void showUsage() { + std::cout << std::endl; + std::cout << "aot_splitter [OPTION]" << std::endl; + std::cout << "Options:" << std::endl; + std::cout << std::endl; + std::cout << " -h, --help " << help_message << std::endl; + std::cout << " -i \"<path>\" " << input_message << std::endl; + std::cout << " -cm \"<path>\" " << compiled_model_message << std::endl; + std::cout << " -plugins " << plugins_message << std::endl; + std::cout << " -bgr " << bgr_message << std::endl; + std::cout << " -bin_data " << bin_data_message << std::endl; + std::cout << " -resize_type \"resize/pad_resize\" " << input_image_resize_message << std::endl; + std::cout << " -folding_option " << folding_option_message << std::endl; + std::cout << " -fold_preprocessing " << fold_preprocessing_message << std::endl; + std::cout << " -encryption_key " << encryption_key_message << std::endl; + std::cout << " -encryption_iv " << encryption_iv_message << std::endl; + std::cout << " -enable_early_access " << enable_early_access_message << std::endl; +} diff --git a/python/openvino/runtime/dla_aot_splitter/sdl.cmake b/python/openvino/runtime/dla_aot_splitter/sdl.cmake new file mode 100644 index 0000000..3f8af7a --- /dev/null +++ b/python/openvino/runtime/dla_aot_splitter/sdl.cmake @@ -0,0 +1,96 @@ + +#################################################################### +## SDL required compiler flags +#################################################################### +# Needed for all builds +set (CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -Wformat -Wformat-security") +set (CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -fPIC") + +set (CMAKE_CXX_FLAGS 
"${CMAKE_CXX_FLAGS} -Wformat -Wformat-security") +set (CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fPIC") + +set (CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wno-deprecated-declarations") + +set (CMAKE_SHARED_LINKER_FLAGS "${CMAKE_SHARED_LINKER_FLAGS} -fPIE") +set (CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} -fPIE") + +# Release build only +set (CMAKE_C_FLAGS_RELEASE "${CMAKE_C_FLAGS_RELEASE} -D_FORTIFY_SOURCE=2") +if (GCC_VERSION VERSION_GREATER 4.9 OR GCC_VERSION VERSION_EQUAL 4.9) + set (CMAKE_C_FLAGS_RELEASE "${CMAKE_C_FLAGS_RELEASE} -fstack-protector-strong") + set (CMAKE_C_FLAGS_RELEASE "${CMAKE_C_FLAGS_RELEASE} -z noexecstack -z relro -z now") + + # These are for 8478-CT158 in the SDL process + # ( https://sdp-prod.intel.com/bunits/intel/coredla/coredla-ip-20212/tasks/phase/development/8478-CT158/ ) +else() + set (CMAKE_C_FLAGS_RELEASE "${CMAKE_C_FLAGS_RELEASE} -fstack-protector-all") +endif() + +set (CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS_RELEASE} -fPIC -D_FORTIFY_SOURCE=2") +if (GCC_VERSION VERSION_GREATER 4.9 OR GCC_VERSION VERSION_EQUAL 4.9) + set (CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS_RELEASE} -fstack-protector-strong") + set (CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS_RELEASE} -z noexecstack -z relro -z now") +else() + set (CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS_RELEASE} -fstack-protector-all") +endif() + +# These are for 8478-CT158 in the SDL process +# ( https://sdp-prod.intel.com/bunits/intel/coredla/coredla-ip-20212/tasks/phase/development/8478-CT158/ ) +set (CMAKE_C_FLAGS_RELEASE "${CMAKE_CXX_FLAGS} -fno-strict-overflow -fno-delete-null-pointer-checks -fwrapv") +set (CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS} -fno-strict-overflow -fno-delete-null-pointer-checks -fwrapv") +set (CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS} -fno-strict-overflow -fno-delete-null-pointer-checks -fwrapv") + +#################################################################### + +set(CMAKE_C_FLAGS_RELEASE "${CMAKE_C_FLAGS_RELEASE} -O3") +set(CMAKE_CXX_FLAGS_RELEASE 
"${CMAKE_CXX_FLAGS_RELEASE} -O3") + +set(CMAKE_C_FLAGS_DEBUG "${CMAKE_C_FLAGS_DEBUG} -O0 -ggdb3") +set(CMAKE_CXX_FLAGS_DEBUG "${CMAKE_CXX_FLAGS_DEBUG} -O0 -ggdb3") + +#### Sanitizer settings #### +# Address +set(CMAKE_C_FLAGS_ASAN "-O1 -g -fsanitize=address -fno-omit-frame-pointer -fno-optimize-sibling-calls") +set(CMAKE_CXX_FLAGS_ASAN "-O1 -g -fsanitize=address -fno-omit-frame-pointer -fno-optimize-sibling-calls") + +# Memory +set(CMAKE_C_FLAGS_MSAN "-O1 -g -fsanitize=memory -fno-omit-frame-pointer -fno-optimize-sibling-calls") +set(CMAKE_CXX_FLAGS_MSAN "-O1 -g -fsanitize=memory -fno-omit-frame-pointer -fno-optimize-sibling-calls") + +# Thread +set(CMAKE_C_FLAGS_TSAN "-O1 -g -fsanitize=thread -fno-omit-frame-pointer -fno-optimize-sibling-calls") +set(CMAKE_CXX_FLAGS_TSAN "-O1 -g -fsanitize=thread -fno-omit-frame-pointer -fno-optimize-sibling-calls") + + +set (CMAKE_CXX_STANDARD 11) +set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++11") +# Enable all warnings except unknown-pragmas. Wunknown-pragmas must be excluded because +# it is triggered by header file included from OpenCL runtime +set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wall -Wno-unknown-pragmas") +set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -Wall -Wno-unknown-pragmas") + +# Make warnings errors to avoid having them in SDL report +#set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Werror") +#set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -Werror") + +# Should cleanup the signed and unsigned compares then remove this exception +set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wno-error=sign-compare -Wno-error=unused-function -Wno-error=switch -Wno-error=unused-variable -Wno-error=unused-value -Wno-error=unused-but-set-variable -Wno-error=undef -Wno-error=return-type -Wno-error=reorder") + +# This is required on Ubuntu 18; the new linker behaviour transforms +# RPATH into RUNPATH (which can be seen in the output of 'readelf -d'). 
+# However, RUNPATH does not work recursively, so when OpenVINO reads +# the plugins.xml file and searches for the specified libcoreDlaRuntimePlugin.so +# library, it fails. The --disable-new-dtags option causes the linker +# to keep RPATH as RPATH (rather than morphing to RUNPATH). +# +# References: +# https://stackoverflow.com/questions/52018092/how-to-set-rpath-and-runpath-with-gcc-ld +# https://stackoverflow.com/questions/59248421/c-secondary-dependency-resolution-with-runpath +# +# The solution below seems preferable to setting LD_LIBRARY_PATH, if only barely. +# For additional motivation, go ahead and throw away part of your day reading either +# of the screeds: +# http://xahlee.info/UnixResource_dir/_/ldpath.html +# https://gms.tf/ld_library_path-considered-harmful.html +# You may find that neither is fully convincing, of course. +set (CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wl,--disable-new-dtags") diff --git a/python/openvino/runtime/dla_aot_splitter/src/main.cpp b/python/openvino/runtime/dla_aot_splitter/src/main.cpp new file mode 100644 index 0000000..ffc098e --- /dev/null +++ b/python/openvino/runtime/dla_aot_splitter/src/main.cpp @@ -0,0 +1,475 @@ +// Copyright 2022-2023 Intel Corporation. +// +// This software and the related documents are Intel copyrighted materials, +// and your use of them is governed by the express license under which they +// were provided to you ("License"). Unless the License provides otherwise, +// you may not use, modify, copy, publish, distribute, disclose or transmit +// this software or the related documents without Intel's prior written +// permission. +// +// This software and the related documents are provided as is, with no express +// or implied warranties, other than those that are expressly stated in the +// License. 
+ +#include <stdio.h> +#include <sys/stat.h> +#include <algorithm> +#include <map> +#include <memory> +#include <string> +#include <utility> +#include <vector> +#if defined(_WIN32) || defined(_WIN64) +#else +#include <dirent.h> +#include <unistd.h> +#endif + +#include <openvino/openvino.hpp> +#include "samples/args_helper.hpp" +#include "samples/common.hpp" +#include "samples/slog.hpp" + +// #include "average_precision.hpp" +#include "dla_aot_splitter.hpp" +// #include "infer_request_wrap.hpp" +#include "dla_plugin_config.hpp" +#include "inputs_filling.hpp" +#include "utils.hpp" + +using DebugNetworkData = std::map<std::string, uint64_t>; + +bool exists_test(const std::string& name) { + struct stat buffer; + return (stat(name.c_str(), &buffer) == 0); +} + +// This function appears in dla_benchmark/main.cpp too. +bool dir_open_test(const std::string& name) { +#if (!defined(_WIN32) && !defined(_WIN64)) + // If we can open the directory then return true + DIR* dp = opendir(name.c_str()); + if (dp != nullptr) { + closedir(dp); + return true; + } +#endif // !_WIN32 && !_WIN64 + struct stat sb; + if (stat(name.c_str(), &sb) == 0) { + if ((sb.st_mode & S_IFMT) != S_IFREG) { + slog::err << "File " << name << " cannot be opened!" << slog::endl; + throw std::logic_error("File cannot be opened!"); + } + } + return true; +} + +// copy arguments into a new array to split the '-i=<arg>' into +// two arguments (i.e. 
'-i' and '<arg>') to overcome a bug +// parseInputFilesArguments function where is doesn't recognize +// the -i=<arg> format +void parseCommandLine(int argc, char** argv) { + int num_args = argc; + // allocated enough memory in case we needed to split the -i argument into two + char** arguments = new char*[num_args + 1]; + for (int i = 0, j = 0; j < argc; ++i, ++j) { + if (strstr(argv[j], "-i=")) { + // number of arguments will increase by one after splitting + num_args++; + arguments[i] = new char[3]; + strcpy(arguments[i++], "-i"); + // copy the reset of the argument (i.e. post "-i=") + arguments[i] = new char[strlen(argv[j]) - 2]; + strcpy(arguments[i], argv[j] + 3); + continue; + } + arguments[i] = new char[strlen(argv[j]) + 1]; + strcpy(arguments[i], argv[j]); + } + // the parse function is modifying the arguments point so we need to keep + // a copy of the original pointer value to delete it properly + char** orig_arg_ptr = arguments; + gflags::ParseCommandLineNonHelpFlags(&num_args, &arguments, true); + // delete the allocated memory + for (int i = 0; i < num_args; ++i) { + delete[] orig_arg_ptr[i]; + } + delete[] orig_arg_ptr; +} + +bool ParseAndCheckCommandLine(int argc, char* argv[], size_t& netSize) { + // ---------------------------Parsing and validating input arguments-------------------------------------- + slog::info << "Parsing input parameters" << slog::endl; + + // Check for any flags that are missing their preceding dashes + // GFlags quietly ignores any flags missing their dashes, which can cause + // aot_splitter to run with settings other than what the user intended + + // GFlags supports two different styles of flag: + // 1. --<flag> + // 2. -<flag> + // It also supports two different ways of specifying values for flags which + // take values: + // 1. --<flag>=<value> + // 2. 
--<flag> <value> + + // If we are not expecting a flag, we are expecting a value for the + // preceding flag + bool expectingFlag = true; + // Start at 1 to skip the command itself + for (int i = 1; i < argc; i++) { + if (expectingFlag) { + // A flag is always denoted by the first char being '-' + if (argv[i][0] != '-') { + slog::err << "Argument " << argv[i] << " is invalid. You" + << " may have forgotten a preceding '-'." << slog::endl; + throw std::logic_error("One or more invalid arguments"); + } + + char* flagNameStart = (argv[i][1] == '-') ? &argv[i][2] : &argv[i][1]; + std::string flagName; + + gflags::CommandLineFlagInfo flagInfo; + if (strstr(flagNameStart, "=")) { + flagName = std::string(flagNameStart, size_t(strstr(flagNameStart, "=") - flagNameStart)); + } else { + flagName = std::string(flagNameStart); + } + + // We expect a flag in the next argv if the current flag is a bool, + // because bool flags do not take a value. + // If GetCommandLineFlagInfo returns false, we assume the current + // flag is a boolean because boolean flags can be specified as + // -no<flag>, which is equivalent to -<flag>=false, or the flag + // simply being omitted. However, "no<flag>" is not recognized by + // GetCommandLineFlagInfo. + // Therefore, if the name is not recognized either the flag is a + // boolean flag or doesn't exist. In the latter case, gflags errors + // when we call parseCommandLine so we can assume here it's a bool. 
+ if (!GetCommandLineFlagInfo(flagName.c_str(), &flagInfo) || strstr(argv[i], "=") || flagInfo.type == "bool") { + expectingFlag = true; + } else { + expectingFlag = false; + } + } else { + // If we were expecting a value, doesn't matter what it is + // gflags will check all values are the correct type, and + // aot_splitter checks if the values received are sane + expectingFlag = true; + } + } + + parseCommandLine(argc, argv); + + if (FLAGS_help || FLAGS_h) { + showUsage(); + // CoreDLA: Version 2020.3 of OpenVINO assumes that the PAC board with OPAE on it + // is an OpenCL/DLAv1 device. Since it is not, it then errors-out when the device + // does not response as expected to the OpenCL query. + // showAvailableDevices(); + std::cout << "\n"; + return false; + } + + if (FLAGS_cm.empty()) { + throw std::logic_error("Model is required but not set. Please set -cm option."); + } else { + std::vector<std::string> m_paths = split(FLAGS_cm, MULTIGRAPH_SEP); + netSize = m_paths.size(); + slog::info << "Found " << netSize << " compiled graph" << (netSize == 1 ? "" : "s") << slog::endl; + for (auto& m_path : m_paths) { + if (!exists_test(m_path)) { + slog::err << "compiled model file: " << FLAGS_cm << " doesn't exist. Please provide a valid path with -cm." + << slog::endl; + throw std::logic_error("Compiled model file path does not exist."); + } + } + } + + if (!FLAGS_plugins.empty()) { + slog::info << "Using custom plugins xml file - " << FLAGS_plugins << slog::endl; + } + + if (!exists_test(FLAGS_plugins)) { + slog::err << "plugins_xml file: " << FLAGS_plugins << " doesn't exist. Please provide a valid path." 
<< slog::endl; + throw std::logic_error("plugins_xml file path does not exist."); + } + + return true; +} + +static void next_step(const std::string additional_info = "") { + static size_t step_id = 0; + static const std::map<size_t, std::string> step_names = { + {1, "Parsing and validating input arguments"}, + {2, "Loading Inference Engine"}, + {3, "Setting device configuration"}, + {4, "Reading the Intermediate Representation network"}, + {5, "Resizing network to match image sizes and given batch"}, + {6, "Configuring input of the model"}, + {7, "Loading the model to the device"}, + {8, "Setting optimal runtime parameters"}, + {9, "Creating infer requests and filling input blobs with images"}, + {10, "Measuring performance"}, + {11, "Dumping statistics report"}, + {12, "Dumping the output values"}}; + + step_id++; + if (step_names.count(step_id) == 0) { + THROW_IE_EXCEPTION << "Step ID " << step_id << " is out of total steps number " << step_names.size(); + } + + std::cout << "[Step " << step_id << "/" << step_names.size() << "] " << step_names.at(step_id) + << (additional_info.empty() ? "" : " (" + additional_info + ")") << std::endl; +} + +template <typename T> +T getMedianValue(const std::vector<T>& vec) { + std::vector<T> sortedVec(vec); + std::sort(sortedVec.begin(), sortedVec.end()); + return (sortedVec.size() % 2 != 0) + ? sortedVec[sortedVec.size() / 2ULL] + : (sortedVec[sortedVec.size() / 2ULL] + sortedVec[sortedVec.size() / 2ULL - 1ULL]) / static_cast<T>(2.0); +} + +/** + * @brief The entry point of the dla benchmark + */ +int main(int argc, char* argv[]) { + try { + // Declaring the ExecutableNetwork object as a pointer to workaround the segfault + // that occurs when destructing the object. 
Now that it's declared as a pointer + // the complier won't automatically call the destructor of the object at the end + // of this scope and we won't delete the allocated memory either + std::vector<ov::CompiledModel*> exeNetworks; + size_t netSize = 0; // parse the size of networks for arguments check + + size_t return_code = 0; // universal return code, return this value after dumping out Debug info + + // ----------------- 1. Parsing and validating input arguments ------------------------------------------------- + next_step(); + + if (!ParseAndCheckCommandLine(argc, argv, netSize)) { + return 0; + } + + bool isNetworkCompiled = !FLAGS_cm.empty(); + if (isNetworkCompiled) { + slog::info << "Network is compiled" << slog::endl; + } + + // The set of arguments printed is meant to be a useful summary to the + // user, rather than all of the arguments to aot_splitter + slog::info << "Printing summary of arguments being used by aot_splitter" << slog::endl + << "Device (-d) .......................... " + << "HETERO:FPGA" << slog::endl + << "Compiled model (-cm) ................. " << FLAGS_cm << slog::endl + << "Input images directory (-i) .......... " + << (!FLAGS_i.empty() ? FLAGS_i : "Not specified, will use randomly-generated images") << slog::endl + << "Plugins file (-plugins) ..... " << FLAGS_plugins << slog::endl + << "Reverse input image channels (-bgr) .. " << (FLAGS_bgr ? 
"True" : "False") << slog::endl; + + /** This vector stores paths to the processed images **/ + auto multiInputFiles = VectorMap<std::vector<std::string>>( + SplitMultiInputFilesArguments(netSize), // get input directory list + [&](const std::vector<std::string>& inputArgs) mutable { + std::vector<std::string> files; + for (auto& inputArg : inputArgs) { + // Test if the path exists + if (!exists_test(inputArg)) { + slog::err << "Specified image path: " << inputArg << " does not exist" << slog::endl; + throw std::logic_error("Image path does not exist"); + } + // Test whether the path can be opened if it's a directory + dir_open_test(inputArg); + readInputFilesArguments(files, inputArg); + } + + return files; + }); + if (multiInputFiles.size() == 0) { + // failed to read input files + slog::err << "Failed to read input files" << slog::endl; + return 1; + } + + uint32_t num_batches = 1; + + // ----------------- 2. Loading the Inference Engine ----------------------------------------------------------- + next_step(); + + // Get optimal runtime parameters for device + std::string device_name = "HETERO:FPGA"; + ov::Core core(FLAGS_plugins); + + if (device_name.find("FPGA") != std::string::npos) { + if (FLAGS_encryption_key != "") { + core.set_property("FPGA", {{DLIAPlugin::properties::encryption_key.name(), FLAGS_encryption_key}}); + } + if (FLAGS_encryption_iv != "") { + core.set_property("FPGA", {{DLIAPlugin::properties::encryption_iv.name(), FLAGS_encryption_iv}}); + } + } + + slog::info << "OpenVINO: " << ov::get_openvino_version() << slog::endl; + + // ----------------- 3. 
Setting device configuration ----------------------------------------------------------- + next_step(); + + size_t batchSize = 1; + std::vector<std::string> topology_names; + if (!isNetworkCompiled) { + } else { + next_step(); + slog::info << "Skipping the step for compiled network" << slog::endl; + next_step(); + slog::info << "Skipping the step for compiled network" << slog::endl; + next_step(); + slog::info << "Skipping the step for compiled network" << slog::endl; + // ----------------- 7. Loading the model to the device -------------------------------------------------------- + next_step(); + + int folding_option = 1; + bool fold_preprocessing = false; + bool enable_early_access = false; + if (FLAGS_folding_option) { + folding_option = FLAGS_folding_option; + } + if (FLAGS_fold_preprocessing) { + fold_preprocessing = FLAGS_fold_preprocessing; + } + if (FLAGS_enable_early_access) { + enable_early_access = FLAGS_enable_early_access; + } + core.set_property("FPGA", {{DLIAPlugin::properties::folding_option.name(), std::to_string(folding_option)}}); + core.set_property("FPGA", + {{DLIAPlugin::properties::fold_preprocessing.name(), fold_preprocessing}}); + core.set_property("FPGA", + {{DLIAPlugin::properties::enable_early_access.name(), enable_early_access}}); + + auto compiled_graph_paths = split(FLAGS_cm, MULTIGRAPH_SEP); + exeNetworks = vectorMapWithIndex<ov::CompiledModel*>( + split(FLAGS_cm, MULTIGRAPH_SEP), // get a list of compiled graphs + [&](const std::string& compiled_graph_path, size_t index) { + std::stringstream generated_name; + generated_name << "Graph_" << index; + slog::info << "Importing model from " << compiled_graph_paths[index] << " to " << device_name << " as " + << generated_name.str() << slog::endl; + std::filebuf objFileBuf; + objFileBuf.open(compiled_graph_paths[index].c_str(), std::ios::in | std::ios::binary); + std::istream objIstream(&objFileBuf); + auto exeNetwork = new ov::CompiledModel(); + *exeNetwork = core.import_model(objIstream, 
device_name, {}); + topology_names.push_back(generated_name.str()); + objFileBuf.close(); + printInputAndOutputsInfoShort(*exeNetwork); + if (batchSize == 0) { + batchSize = 1; + } + const auto& inputs = exeNetwork->inputs(); + for (const auto& item : inputs) { + auto& dims = item.get_shape(); + if (dims[0] != batchSize) { + slog::err << "Batch size of the compiled model is " << dims[0] << " and batch size provided is " + << batchSize << slog::endl; + std::cout << "Set the same batch size = " << dims[0] << " when running the app" << std::endl; + std::cout << "Or recompile model with batch size = " << batchSize << std::endl; + exit(5); + } + } + return exeNetwork; + }); + } + // ----------------- 8. Setting optimal runtime parameters ----------------------------------------------------- + next_step(); + + // Number of requests + uint32_t nireq = 1; + if (nireq == 0) { + nireq = 1; + } + int niter = 1; + + if (niter > 0) { + num_batches = niter; + } + + // ----------------- 9. Creating infer requests and filling input blobs ---------------------------------------- + next_step(); + std::vector<dla_benchmark::InputsInfo> inputInfos; + // Data structure hierarchy + // Outermost vec: which model it corresponds to (multigraph) + // Map: input/output name and its corresponding TensorVector + // TensorVector: An alias for vector<ov::tensor> where each vector element correspond to the batch + std::vector<std::map<std::string, ov::TensorVector>> inputsData; + std::vector<std::map<std::string, ov::TensorVector>> outputTensors(exeNetworks.size()); + + std::vector<std::unique_ptr<InferRequestsQueue>> inferRequestsQueues; + const std::string resize_type = FLAGS_resize_type.empty() ? "resize" : FLAGS_resize_type; + for (size_t netIdx = 0; netIdx < exeNetworks.size(); netIdx++) { + // Handle the case that use same inputs for all networks + const auto& inputFiles = netIdx >= multiInputFiles.size() ? 
multiInputFiles.back() : multiInputFiles[netIdx]; + inputInfos.push_back(GetInputsInfo(batchSize, exeNetworks[netIdx]->inputs(), FLAGS_bin_data)); + inputsData.push_back(GetStaticTensors(inputFiles.empty() ? std::vector<std::string>{} : inputFiles, + batchSize, + inputInfos[netIdx], + num_batches, + resize_type, + FLAGS_bgr, + FLAGS_bin_data, + false /* verbose outputs not supported for aot splitter */)); + // Use unique_ptr to create InferRequestsQueue objects and avoid copying mutex and cv + inferRequestsQueues.push_back( + std::move(std::unique_ptr<InferRequestsQueue>(new InferRequestsQueue(*(exeNetworks[netIdx]), nireq)))); + } + + /** Start inference & calculate performance **/ + /** to align number if iterations to guarantee that last infer requests are executed in the same conditions **/ + std::vector<size_t> iterations(exeNetworks.size(), 0); + + try { + { + // set up all infer request and prep all i/o Blobs + for (size_t net_id = 0; net_id < exeNetworks.size(); net_id++) { + for (size_t iireq = 0; iireq < nireq; iireq++) { + auto inferRequest = inferRequestsQueues.at(net_id)->get_idle_request(); + if (!inferRequest) { + THROW_IE_EXCEPTION << "No idle Infer Requests!"; + } + + if (niter != 0LL) { + const auto& outputs = exeNetworks[net_id]->outputs(); + for (const auto& output : outputs) { + const std::string& name = output.get_any_name(); + outputTensors.at(net_id)[name].emplace_back(output.get_element_type(), output.get_shape()); + inferRequest->set_tensor(name, outputTensors.at(net_id).at(name).at(iterations.at(net_id))); + } + const auto& inputs = exeNetworks[net_id]->inputs(); + for (auto& input : inputs) { + const std::string& inputName = input.get_any_name(); + const auto& data = inputsData.at(net_id).at(inputName)[iterations.at(net_id)]; + inferRequest->set_tensor(inputName, data); + } + } + + { + std::cout << "Generating Artifacts" << std::endl; + inferRequest->infer(); + } + } + } + } + } catch (const std::exception& ex) { + std::cerr << ex.what() 
<< std::endl; + slog::err << "Generation failed" << slog::endl; + return_code = 1; + } + + if (return_code) return return_code; + } catch (const std::exception& ex) { + slog::err << ex.what() << slog::endl; + return 3; + } + + return 0; +} |
