diff options
| author | Eric Dao <eric@erickhangdao.com> | 2025-03-10 17:54:31 -0400 |
|---|---|---|
| committer | Eric Dao <eric@erickhangdao.com> | 2025-03-10 17:54:31 -0400 |
| commit | ab224e2e6ba65f5a369ec392f99cd8845ad06c98 (patch) | |
| tree | a1e757e9341863ed52b8ad4c5a1c45933aab9da4 /python/openvino/runtime/dla_aot_splitter/dla_aot_splitter_example | |
| parent | 40da1752f2c8639186b72f6838aa415e854d0b1d (diff) | |
| download | thesis-master.tar.gz thesis-master.tar.bz2 thesis-master.zip | |
Diffstat (limited to 'python/openvino/runtime/dla_aot_splitter/dla_aot_splitter_example')
| -rw-r--r-- | python/openvino/runtime/dla_aot_splitter/dla_aot_splitter_example/CMakeLists.txt | 209 | ||||
| -rw-r--r-- | python/openvino/runtime/dla_aot_splitter/dla_aot_splitter_example/src/main.cpp | 180 |
2 files changed, 389 insertions, 0 deletions
diff --git a/python/openvino/runtime/dla_aot_splitter/dla_aot_splitter_example/CMakeLists.txt b/python/openvino/runtime/dla_aot_splitter/dla_aot_splitter_example/CMakeLists.txt
new file mode 100644
index 0000000..a6f2ce8
--- /dev/null
+++ b/python/openvino/runtime/dla_aot_splitter/dla_aot_splitter_example/CMakeLists.txt
@@ -0,0 +1,209 @@
# Copyright (C) 2018-2020 Intel Corporation
# SPDX-License-Identifier: Apache-2.0
#

# Build script for dla_aot_splitter_example.
#
# The example executable embeds six generated *.mem files (arch_build, config,
# filter, input, inter_size, output_size). They are produced at build time by
# the custom command at the bottom of this file, which runs dlac followed by
# the dla_aot_splitter executable.

add_executable(dla_aot_splitter_example EXCLUDE_FROM_ALL src/main.cpp)

target_compile_features(dla_aot_splitter_example PUBLIC cxx_std_11)

target_compile_definitions(dla_aot_splitter_example PRIVATE DLA_MMD)

file(GLOB SOURCES
  # coredla_device
  $ENV{COREDLA_ROOT}/runtime/coredla_device/inc/device_memory_allocator.h
  $ENV{COREDLA_ROOT}/runtime/coredla_device/inc/dla_dma_constants.h
  $ENV{COREDLA_ROOT}/runtime/coredla_device/inc/mmd_wrapper.h
  $ENV{COREDLA_ROOT}/runtime/coredla_device/src/device_memory_allocator.cpp
  #
  src/main.cpp
)

# The system console platform has its own mmd_wrapper implementation.
if (SYSTEM_CONSOLE_PLATFORM)
  list(APPEND SOURCES ${CMAKE_SOURCE_DIR}/coredla_device/mmd/system_console/mmd_wrapper.cpp)
else ()
  list(APPEND SOURCES $ENV{COREDLA_ROOT}/runtime/coredla_device/src/mmd_wrapper.cpp)
endif ()

target_sources (dla_aot_splitter_example PRIVATE ${SOURCES})

if (DISABLE_JIT)
  # for dla_dma_constants.svh
  if (EXISTS $ENV{COREDLA_ROOT}/inc)
    target_include_directories(dla_aot_splitter_example PRIVATE $ENV{COREDLA_ROOT}/inc)
  else()
    target_include_directories(dla_aot_splitter_example PRIVATE $ENV{COREDLA_ROOT}/build/coredla/dla/inc)
  endif()
endif()

target_link_libraries(dla_aot_splitter_example PRIVATE
  pthread
)

# Without the JIT, the compiled-result reader/writer is compiled directly into
# the example; otherwise it comes from the dla_compiled_result target.
if (DISABLE_JIT)
  target_include_directories(dla_aot_splitter_example PRIVATE
    $ENV{COREDLA_ROOT}/util/inc
    $ENV{COREDLA_XUTIL_DIR}/compiled_result/inc
  )
  target_sources(dla_aot_splitter_example PRIVATE $ENV{COREDLA_XUTIL_DIR}/compiled_result/src/compiled_result_reader_writer.cpp)
else()
  target_link_libraries(dla_aot_splitter_example
    PRIVATE
    dla_compiled_result
  )
endif()

# Locates json-c (searching LIBOPAE-C_ROOT and the usual system library
# directories) and links it into the example. Shared by the AGX7 platforms.
function(dla_aot_splitter_example_link_json_c)
  find_library(libjson-c_LIBRARIES
    NAMES json-c
    PATHS ${LIBOPAE-C_ROOT}/lib
          ${LIBOPAE-C_ROOT}/lib64
          /usr/local/lib
          /usr/lib
          /lib
          /usr/lib/x86_64-linux-gnu
          ${CMAKE_EXTRA_LIBS})
  target_link_libraries(dla_aot_splitter_example PRIVATE ${libjson-c_LIBRARIES})
endfunction()

# Platform-specific libraries. Exactly one branch is taken; the order of the
# chain decides which platform wins if several options are set.
if (DE10_AGILEX)
  target_link_libraries(dla_aot_splitter_example PRIVATE de10_agilex)
elseif(PAC_A10)
  target_link_libraries(dla_aot_splitter_example PRIVATE dcp_a10_pac)
elseif(AGX7_IDK)
  target_link_libraries(dla_aot_splitter_example PRIVATE agx7_i_dk)
  dla_aot_splitter_example_link_json_c()
elseif(AGX7_N6001)
  target_link_libraries(dla_aot_splitter_example PRIVATE agx7_n6001)
  dla_aot_splitter_example_link_json_c()
elseif(SYSTEM_CONSOLE_PLATFORM)
  # Agilex 5 JTAG ED: do nothing
elseif(REFERENCE)
  # Reference: do nothing
else()
  message(FATAL_ERROR "Building DLA AOT Aplitter Example with unsupported platform")
endif()

target_include_directories(dla_aot_splitter_example PRIVATE
  $ENV{COREDLA_ROOT}/runtime/coredla_device/inc
)
# NOTE: this conditional used to be written *inside* the argument list of the
# target_include_directories() call above. CMake does not evaluate control flow
# inside a command invocation, so "if", "(", "PAC_A10", ")", "endif", "(" and
# ")" were all passed as literal include directories and the PAC host path was
# added on every platform. It must be a separate statement.
if (PAC_A10)
  target_include_directories(dla_aot_splitter_example PRIVATE
    $ENV{COREDLA_ROOT}/runtime/coredla_device/mmd/dcp_a10_pac/host
  )
endif()

# Directory that receives the generated *.mem files; it is also the working
# directory of the generator command below.
file(MAKE_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}/include)

# Listing the generated files as sources makes the executable depend on the
# custom command that produces them.
target_sources (dla_aot_splitter_example PRIVATE
  ${CMAKE_CURRENT_BINARY_DIR}/include/arch_build.mem
  ${CMAKE_CURRENT_BINARY_DIR}/include/config.mem
  ${CMAKE_CURRENT_BINARY_DIR}/include/filter.mem
  ${CMAKE_CURRENT_BINARY_DIR}/include/input.mem
  ${CMAKE_CURRENT_BINARY_DIR}/include/inter_size.mem
  ${CMAKE_CURRENT_BINARY_DIR}/include/output_size.mem
)
target_include_directories(dla_aot_splitter_example PRIVATE
  ${CMAKE_CURRENT_BINARY_DIR}/include
)

# Model selection: the AOT_SPLITTER_EXAMPLE_MODEL environment variable wins,
# then the demo model under COREDLA_WORK, then the Intel-internal location.
# AOT_SPLITTER_EXAMPLE_MODEL is left undefined if none of them is available.
if (DEFINED ENV{AOT_SPLITTER_EXAMPLE_MODEL})
  set (AOT_SPLITTER_EXAMPLE_MODEL $ENV{AOT_SPLITTER_EXAMPLE_MODEL})
else()
  if (EXISTS $ENV{COREDLA_WORK}/demo/models/public/resnet-50-tf/FP32/resnet-50-tf.xml)
    set (AOT_SPLITTER_EXAMPLE_MODEL $ENV{COREDLA_WORK}/demo/models/public/resnet-50-tf/FP32/resnet-50-tf.xml)
  else()
    # The path below is for Intel internal use only
    if (EXISTS /p/psg/swip/dla/caffe/caffe_reference/ngraph_ir/coredla/ModelZoo/2021_4_1/resnet_50_tf/FP32/resnet-50-tf.xml)
      set (AOT_SPLITTER_EXAMPLE_MODEL /p/psg/swip/dla/caffe/caffe_reference/ngraph_ir/coredla/ModelZoo/2021_4_1/resnet_50_tf/FP32/resnet-50-tf.xml)
    endif()
  endif()
endif()

# Input image selection, same precedence scheme as the model above.
if (DEFINED ENV{AOT_SPLITTER_EXAMPLE_INPUT})
  set (AOT_SPLITTER_EXAMPLE_INPUT $ENV{AOT_SPLITTER_EXAMPLE_INPUT})
else()
  if (EXISTS $ENV{COREDLA_ROOT}/demo/sample_images/val_00000000.bmp)
    set (AOT_SPLITTER_EXAMPLE_INPUT $ENV{COREDLA_ROOT}/demo/sample_images/val_00000000.bmp)
  else()
    # The path below is for Intel internal use only
    if (EXISTS /p/psg/swip/dla/images/imagenet/ILSVRC2012_224x224/BMP/BMP/ILSVRC2012_val_00000000.bmp)
      set (AOT_SPLITTER_EXAMPLE_INPUT /p/psg/swip/dla/images/imagenet/ILSVRC2012_224x224/BMP/BMP/ILSVRC2012_val_00000000.bmp)
    endif()
  endif()
endif()

# Tool locations and the architecture file: taken from next to CoreDLA_DIR
# when a bin directory exists there, otherwise from the COREDLA_ROOT build tree.
if (EXISTS ${CoreDLA_DIR}/../bin)
  set(COREDLA_BIN ${CoreDLA_DIR}/../bin)
  set(COREDLA_LIB ${CoreDLA_DIR}/../lib)
  set(COREDLA_EXARCH ${CoreDLA_DIR}/../example_architectures)
  if(DE10_AGILEX OR AGX7_IDK OR AGX7_N6001)
    set (AOT_SPLITTER_EXAMPLE_ARCH AGX7_Performance.arch)
  elseif(SYSTEM_CONSOLE_PLATFORM)
    set (AOT_SPLITTER_EXAMPLE_ARCH AGX5_Small_Softmax.arch)
  else()
    set (AOT_SPLITTER_EXAMPLE_ARCH A10_Performance.arch)
  endif()
else()
  set(COREDLA_BIN $ENV{COREDLA_ROOT}/build/coredla/dla/bin)
  set(COREDLA_LIB $ENV{COREDLA_ROOT}/build/coredla/dla/lib)
  set(COREDLA_EXARCH $ENV{COREDLA_ROOT}/example_architectures)

  # The paths below are for Intel internal use only
  if(DE10_AGILEX)
    set (AOT_SPLITTER_EXAMPLE_ARCH arch/descriptions/AGX7/64x32_i5x1_fp13agx_sb31744_xbark32_clamp_preluk32_poolk4_softmax_1inst.arch)
  elseif(AGX7_IDK OR AGX7_N6001)
    set (AOT_SPLITTER_EXAMPLE_ARCH arch/descriptions/AGX7/32x64_i5x1_fp13agx_sb32768_poolk4_actk32_prelu_rclamp_sig_softmaxk1.arch)
  elseif(SYSTEM_CONSOLE_PLATFORM)
    set (AOT_SPLITTER_EXAMPLE_ARCH 16x16_i12x1_fp12agx_sb8192_poolk4_actk16_clamp_softmaxk1.arch)
  else()
    set (AOT_SPLITTER_EXAMPLE_ARCH arch/descriptions/A10/64x32_i4x1_fp11_sb31744_xbark32_clamp_preluk32_poolk4_softmax.arch)
  endif()
endif()

# Arguments forwarded to dla_aot_splitter; callers may predefine
# AOT_SPLITTER_INPUT_ARGUMENTS to override them. Empty when no input image
# was found.
if (NOT DEFINED AOT_SPLITTER_INPUT_ARGUMENTS)
  set (AOT_SPLITTER_INPUT_ARGUMENTS )
  if (DEFINED AOT_SPLITTER_EXAMPLE_INPUT)
    set (AOT_SPLITTER_INPUT_ARGUMENTS -i ${AOT_SPLITTER_EXAMPLE_INPUT} -bgr)
  endif()
endif()

# Need to copy the system console script for Agilex 5E JTAG ED
# Also link against Boost
if (SYSTEM_CONSOLE_PLATFORM)
  find_package(Boost REQUIRED COMPONENTS filesystem)
  target_link_libraries(dla_aot_splitter_example PRIVATE Boost::filesystem)
  add_custom_command(
    TARGET dla_aot_splitter_example POST_BUILD
    COMMAND ${CMAKE_COMMAND} -E copy
      ${CMAKE_SOURCE_DIR}/coredla_device/mmd/system_console/system_console_script.tcl
      ${CMAKE_CURRENT_BINARY_DIR}/system_console_script.tcl
  )
  target_compile_definitions(dla_aot_splitter_example PRIVATE DLA_SYSCON_SOURCE_ROOT=${CMAKE_CURRENT_BINARY_DIR})
endif()

# Generates the *.mem files in two steps, both run from the include directory:
#   1. dlac compiles the model for the selected architecture into resnet.bin.
#   2. dla_aot_splitter consumes resnet.bin (plus the optional input image).
add_custom_command(
  OUTPUT
    ${CMAKE_CURRENT_BINARY_DIR}/include/arch_build.mem
    ${CMAKE_CURRENT_BINARY_DIR}/include/config.mem
    ${CMAKE_CURRENT_BINARY_DIR}/include/filter.mem
    ${CMAKE_CURRENT_BINARY_DIR}/include/input.mem
    ${CMAKE_CURRENT_BINARY_DIR}/include/inter_size.mem
    ${CMAKE_CURRENT_BINARY_DIR}/include/output_size.mem
  COMMAND
    LD_LIBRARY_PATH=$ENV{LD_LIBRARY_PATH}:${COREDLA_LIB} ${COREDLA_BIN}/dlac --network-file ${AOT_SPLITTER_EXAMPLE_MODEL} --march ${COREDLA_EXARCH}/${AOT_SPLITTER_EXAMPLE_ARCH} --foutput-format open_vino_hetero --o ${CMAKE_CURRENT_BINARY_DIR}/resnet.bin
  COMMAND
    LD_LIBRARY_PATH=$ENV{LD_LIBRARY_PATH}:${COREDLA_LIB} $<TARGET_FILE:dla_aot_splitter> ${AOT_SPLITTER_INPUT_ARGUMENTS} -cm ${CMAKE_CURRENT_BINARY_DIR}/resnet.bin -plugins $<TARGET_FILE_DIR:dla_aot_splitter_plugin>/plugins_aot_splitter.xml
  DEPENDS
    ${COREDLA_BIN}/dlac
    dla_benchmark
    dla_aot_splitter
    dla_aot_splitter_plugin
    ${AOT_SPLITTER_EXAMPLE_MODEL}
    ${COREDLA_EXARCH}/${AOT_SPLITTER_EXAMPLE_ARCH}
    ${AOT_SPLITTER_EXAMPLE_INPUT}
    $<TARGET_FILE_DIR:dla_aot_splitter_plugin>/plugins_aot_splitter.xml
  WORKING_DIRECTORY
    ${CMAKE_CURRENT_BINARY_DIR}/include
)
diff --git a/python/openvino/runtime/dla_aot_splitter/dla_aot_splitter_example/src/main.cpp b/python/openvino/runtime/dla_aot_splitter/dla_aot_splitter_example/src/main.cpp
new file mode 100644
index 0000000..b90ccd5
--- /dev/null
+++ b/python/openvino/runtime/dla_aot_splitter/dla_aot_splitter_example/src/main.cpp
@@ -0,0 +1,180 @@
// Copyright 2022 Intel Corporation.
//
// This software and the related documents are Intel copyrighted materials,
// and your use of them is governed by the express license under which they
// were provided to you ("License"). Unless the License provides otherwise,
// you may not use, modify, copy, publish, distribute, disclose or transmit
// this software or the related documents without Intel's prior written
// permission.
//
// This software and the related documents are provided as is, with no express
// or implied warranties, other than those that are expressly stated in the
// License.

//
// This small tool demonstrates the minimum number of steps necessary to run an
// inference on the FPGA while using the output files from the AoT splitter.
+// + +#include <iostream> +#include <iomanip> +#include <fstream> +#include <stdint.h> +#include <array> +#include <cstring> //memcpy + +uint32_t arch_build_mem_32[] = +{ + #include "arch_build.mem" +}; +uint8_t* const arch_build_mem = (uint8_t*)&arch_build_mem_32[0]; +const uint32_t arch_build_mem_size = sizeof(arch_build_mem_32); + +uint32_t input_mem_32[] = +{ + #include "input.mem" +}; +uint8_t* const input_mem = sizeof(input_mem_32) ? (uint8_t*)&input_mem_32[0] : nullptr; +const uint32_t input_mem_size = sizeof(input_mem_32); + +uint32_t config_mem_32[] = +{ + #include "config.mem" +}; +uint8_t* const config_mem = (uint8_t*)&config_mem_32[0]; +const uint32_t config_mem_size = sizeof(config_mem_32); + +uint32_t filter_mem_32[] = +{ + #include "filter.mem" +}; +uint8_t* const filter_mem = (uint8_t*)&filter_mem_32[0]; +const uint32_t filter_mem_size = sizeof(filter_mem_32); + +constexpr uint32_t output_mem_size = + #include "output_size.mem" +; + +constexpr uint32_t inter_mem_size = + #include "inter_size.mem" +; + +#include "mmd_wrapper.h" +#include "device_memory_allocator.h" +#include "dla_dma_constants.h" //DLA_DMA_CSR_OFFSET_*** + +int main(int argc, char *argv[]) { + std::array<uint8_t, output_mem_size> actual_output_mem; + for (uint64_t i=0u; i < actual_output_mem.size();i++) + { + actual_output_mem[i] = (0xDEADBEEF) >> ((3-(i%4)) * 8); + } + + std::cout << "AOT Splitter Example" << std::endl; + + constexpr int instance = 0; + + constexpr int _maxNumPipelines = 5; + constexpr int numPipelines = _maxNumPipelines; + + // TODO: retrieve this from the arch file + constexpr uint64_t featureWordSize = 32; + constexpr uint64_t filterWordSize = 64; + + + constexpr int ARCH_HASH_SIZE = 16; + constexpr int BUILD_VERSION_SIZE = 32; + + MmdWrapper mmdWrapper{}; + DeviceMemoryAllocator ddrAllocator{}; + + for (size_t i = 0; i < ARCH_HASH_SIZE; i+=4) { + uint32_t arch_build_word_from_device = mmdWrapper.ReadFromCsr(instance, i); + if (arch_build_mem_32[i/4] != 
arch_build_word_from_device) + { + std::cout << "Arch hash mismatch at word " << i << " : expected " << + std::setfill('0') << std::setw(8) << std::uppercase << std::hex << (uint32_t)arch_build_mem_32[i/4] << + " != " << + std::setfill('0') << std::setw(8) << std::uppercase << std::hex << (uint32_t)arch_build_word_from_device << std::endl; + return 1; + } + } + char expected_build_version[BUILD_VERSION_SIZE + 1]; + expected_build_version[BUILD_VERSION_SIZE] = '\0'; + std::memcpy(expected_build_version, (uint8_t*)&arch_build_mem_32[ARCH_HASH_SIZE/sizeof(uint32_t)], BUILD_VERSION_SIZE); + + char actual_build_version[BUILD_VERSION_SIZE + 1]; + actual_build_version[BUILD_VERSION_SIZE] = '\0'; + + for (uint32_t i=0;i < BUILD_VERSION_SIZE; i+=4) + { + uint32_t chunk = mmdWrapper.ReadFromCsr(instance, ARCH_HASH_SIZE + i); + for (uint8_t j=0;j < 4; j++) + { + actual_build_version[i+j] = chunk & 0xFF; + chunk >>= 8; + } + } + if (0 != std::strncmp(expected_build_version, actual_build_version, BUILD_VERSION_SIZE)) + { + std::cout << "Build version mismath. 
Expected " << expected_build_version << " actual " << actual_build_version << std::endl; + return 1; + } + + ddrAllocator.Initialize(mmdWrapper.GetDDRSizePerInstance(), &mmdWrapper); + + ddrAllocator.AllocateSharedBuffer(inter_mem_size, instance); + //mmdWrapper.WriteToCsr(instance, DLA_DMA_CSR_OFFSET_INTERMEDIATE_BASE_ADDR, 0); + + + uint64_t inputOutputBufferSize = numPipelines * (input_mem_size + output_mem_size); // how much space to allocate + uint64_t inputOutputBufferAlignment = featureWordSize; // starting address must be aligned to this + uint64_t inputOutputBufferAddr; // where did the allocator place this buffer + ddrAllocator.AllocatePrivateBuffer(inputOutputBufferSize, inputOutputBufferAlignment, inputOutputBufferAddr); + + uint64_t configFilterBufferSize = config_mem_size + filter_mem_size; + uint64_t configFilterBufferAlignment = filterWordSize; + uint64_t configFilterBufferAddr; + ddrAllocator.AllocatePrivateBuffer(configFilterBufferSize, configFilterBufferAlignment, configFilterBufferAddr); + + mmdWrapper.WriteToCsr(instance, DLA_DMA_CSR_OFFSET_INTERRUPT_MASK, 0); + mmdWrapper.WriteToCsr(instance, DLA_DMA_CSR_OFFSET_INTERRUPT_CONTROL, 3); + uint32_t completionCount = mmdWrapper.ReadFromCsr(instance, DLA_DMA_CSR_OFFSET_COMPLETION_COUNT); + std::cout << "Initial completion count " << completionCount << std::endl; + + mmdWrapper.WriteToDDR(instance, inputOutputBufferAddr, input_mem_size, input_mem); + + mmdWrapper.WriteToDDR(instance, configFilterBufferAddr, config_mem_size, config_mem); + mmdWrapper.WriteToDDR(instance, configFilterBufferAddr + config_mem_size, filter_mem_size, filter_mem); + + mmdWrapper.WriteToCsr(instance, DLA_DMA_CSR_OFFSET_CONFIG_BASE_ADDR, configFilterBufferAddr); + constexpr int CONFIG_READER_DATA_BYTES = 8; // May want to move to a header in production code + mmdWrapper.WriteToCsr(instance, DLA_DMA_CSR_OFFSET_CONFIG_RANGE_MINUS_TWO, ((config_mem_size) / CONFIG_READER_DATA_BYTES) - 2); + + + // base address for feature reader 
-- this will trigger one run of DLA + mmdWrapper.WriteToCsr(instance, DLA_DMA_CSR_OFFSET_INPUT_OUTPUT_BASE_ADDR, inputOutputBufferAddr); + + int i=0; + while(mmdWrapper.ReadFromCsr(instance, DLA_DMA_CSR_OFFSET_COMPLETION_COUNT) == completionCount) + { + i++; + if (i % 100000 == 0) { + std::cout << "Timeout" << std::endl; + return 1; + } + } + + std::cout << "Completed infered in " << i << " polling intervals" << std::endl; + + //Reading from pipeline zero + mmdWrapper.ReadFromDDR(instance, inputOutputBufferAddr + input_mem_size, actual_output_mem.size(), actual_output_mem.data()); + + std::ofstream of ("actual_output.mem", std::ios_base::out | std::ios_base::binary); + if (of) { + of.write((const char*)actual_output_mem.data(), actual_output_mem.size()); + } + of.close(); + + return 0; +} |
