summaryrefslogtreecommitdiff
path: root/python/openvino/runtime/dla_aot_splitter/dla_aot_splitter_example
diff options
context:
space:
mode:
author Eric Dao <eric@erickhangdao.com> 2025-03-10 17:54:31 -0400
committer Eric Dao <eric@erickhangdao.com> 2025-03-10 17:54:31 -0400
commit ab224e2e6ba65f5a369ec392f99cd8845ad06c98 (patch)
tree a1e757e9341863ed52b8ad4c5a1c45933aab9da4 /python/openvino/runtime/dla_aot_splitter/dla_aot_splitter_example
parent 40da1752f2c8639186b72f6838aa415e854d0b1d (diff)
downloadthesis-master.tar.gz
thesis-master.tar.bz2
thesis-master.zip
completed thesisHEADmaster
Diffstat (limited to 'python/openvino/runtime/dla_aot_splitter/dla_aot_splitter_example')
-rw-r--r-- python/openvino/runtime/dla_aot_splitter/dla_aot_splitter_example/CMakeLists.txt 209
-rw-r--r-- python/openvino/runtime/dla_aot_splitter/dla_aot_splitter_example/src/main.cpp 180
2 files changed, 389 insertions, 0 deletions
diff --git a/python/openvino/runtime/dla_aot_splitter/dla_aot_splitter_example/CMakeLists.txt b/python/openvino/runtime/dla_aot_splitter/dla_aot_splitter_example/CMakeLists.txt
new file mode 100644
index 0000000..a6f2ce8
--- /dev/null
+++ b/python/openvino/runtime/dla_aot_splitter/dla_aot_splitter_example/CMakeLists.txt
@@ -0,0 +1,209 @@
+# Copyright (C) 2018-2020 Intel Corporation
+# SPDX-License-Identifier: Apache-2.0
+#
+
+add_executable(dla_aot_splitter_example EXCLUDE_FROM_ALL src/main.cpp)  # EXCLUDE_FROM_ALL: not part of the default build, must be requested explicitly
+
+target_compile_features(dla_aot_splitter_example PUBLIC cxx_std_11)
+
+target_compile_definitions(dla_aot_splitter_example PRIVATE DLA_MMD)
+
+file(GLOB SOURCES
+ # coredla_device runtime headers and allocator. NOTE: file(GLOB) only returns paths that exist at configure time, so a missing file is dropped silently instead of being reported.
+ $ENV{COREDLA_ROOT}/runtime/coredla_device/inc/device_memory_allocator.h
+ $ENV{COREDLA_ROOT}/runtime/coredla_device/inc/dla_dma_constants.h
+ $ENV{COREDLA_ROOT}/runtime/coredla_device/inc/mmd_wrapper.h
+ $ENV{COREDLA_ROOT}/runtime/coredla_device/src/device_memory_allocator.cpp
+ # this example's own entry point (also passed to add_executable above)
+ src/main.cpp
+)
+if (SYSTEM_CONSOLE_PLATFORM)  # pick the mmd_wrapper.cpp implementation that matches the platform
+ list(APPEND SOURCES ${CMAKE_SOURCE_DIR}/coredla_device/mmd/system_console/mmd_wrapper.cpp)
+else ()
+ list(APPEND SOURCES $ENV{COREDLA_ROOT}/runtime/coredla_device/src/mmd_wrapper.cpp)
+endif ()
+
+target_sources (dla_aot_splitter_example PRIVATE ${SOURCES})  # attach everything collected above to the executable
+
+# Without the JIT the DLA constants header comes from either an installed
+# $COREDLA_ROOT/inc or, failing that, the in-tree build output.
+if (DISABLE_JIT)
+  # for dla_dma_constants.svh
+  if (EXISTS $ENV{COREDLA_ROOT}/inc)
+    target_include_directories(dla_aot_splitter_example PRIVATE $ENV{COREDLA_ROOT}/inc)
+  else()
+    target_include_directories(dla_aot_splitter_example PRIVATE $ENV{COREDLA_ROOT}/build/coredla/dla/inc)
+  endif()
+endif()
+
+# Link the threading library through CMake's imported target rather than a
+# raw "pthread" name, so the correct flags/library are chosen per toolchain.
+find_package(Threads REQUIRED)
+target_link_libraries(dla_aot_splitter_example PRIVATE
+  Threads::Threads
+)
+
+# The compiled-result reader/writer is either compiled straight into this
+# executable (no JIT) or taken from the dla_compiled_result library target.
+if (DISABLE_JIT)
+  target_include_directories(dla_aot_splitter_example PRIVATE
+    $ENV{COREDLA_ROOT}/util/inc
+    $ENV{COREDLA_XUTIL_DIR}/compiled_result/inc
+  )
+  target_sources(dla_aot_splitter_example PRIVATE $ENV{COREDLA_XUTIL_DIR}/compiled_result/src/compiled_result_reader_writer.cpp)
+else()
+  target_link_libraries(dla_aot_splitter_example
+    PRIVATE
+      dla_compiled_result
+  )
+endif()
+
+# Link the MMD library that matches the selected platform. Exactly one of the
+# platform switches is expected to be set; anything else is a hard error.
+if (DE10_AGILEX)
+  target_link_libraries(dla_aot_splitter_example PRIVATE de10_agilex)
+elseif(PAC_A10)
+  target_link_libraries(dla_aot_splitter_example PRIVATE dcp_a10_pac)
+elseif(AGX7_IDK OR AGX7_N6001)
+  # AGX7_IDK takes precedence over AGX7_N6001 when both are set.
+  if (AGX7_IDK)
+    target_link_libraries(dla_aot_splitter_example PRIVATE agx7_i_dk)
+  else()
+    target_link_libraries(dla_aot_splitter_example PRIVATE agx7_n6001)
+  endif()
+  # Both Agilex 7 platforms additionally link json-c, located once here.
+  find_library(libjson-c_LIBRARIES
+    NAMES json-c
+    PATHS ${LIBOPAE-C_ROOT}/lib
+          ${LIBOPAE-C_ROOT}/lib64
+          /usr/local/lib
+          /usr/lib
+          /lib
+          /usr/lib/x86_64-linux-gnu
+          ${CMAKE_EXTRA_LIBS})
+  target_link_libraries(dla_aot_splitter_example PRIVATE ${libjson-c_LIBRARIES})
+elseif(SYSTEM_CONSOLE_PLATFORM)
+  # Agilex 5 JTAG ED: do nothing
+elseif(REFERENCE)
+  # Reference: do nothing
+else()
+  message(FATAL_ERROR "Building DLA AOT Splitter Example with unsupported platform")
+endif()
+
+target_include_directories(dla_aot_splitter_example PRIVATE
+  $ENV{COREDLA_ROOT}/runtime/coredla_device/inc
+  # if()/endif() are not evaluated inside an argument list (they would be
+  # passed as literal include directories), so the PAC A10 MMD host headers
+  # are gated with a generator expression instead.
+  "$<$<BOOL:${PAC_A10}>:$ENV{COREDLA_ROOT}/runtime/coredla_device/mmd/dcp_a10_pac/host>"
+)
+
+# The .mem files #included by src/main.cpp are generated into
+# <current binary dir>/include, so make sure that directory exists up front.
+file(MAKE_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}/include)
+
+# Listing the generated files as sources of the executable ties the
+# add_custom_command(OUTPUT ...) rule for them into this target's build.
+foreach(mem_name IN ITEMS arch_build config filter input inter_size output_size)
+  target_sources(dla_aot_splitter_example PRIVATE
+    ${CMAKE_CURRENT_BINARY_DIR}/include/${mem_name}.mem
+  )
+endforeach()
+
+# Lets main.cpp find the generated files with a plain #include "name.mem".
+target_include_directories(dla_aot_splitter_example PRIVATE
+  ${CMAKE_CURRENT_BINARY_DIR}/include
+)
+
+# Network to compile: an AOT_SPLITTER_EXAMPLE_MODEL environment variable wins;
+# otherwise use the first candidate ResNet-50 IR that exists on disk. When no
+# candidate exists the variable is left untouched.
+if(DEFINED ENV{AOT_SPLITTER_EXAMPLE_MODEL})
+  set(AOT_SPLITTER_EXAMPLE_MODEL $ENV{AOT_SPLITTER_EXAMPLE_MODEL})
+elseif(EXISTS $ENV{COREDLA_WORK}/demo/models/public/resnet-50-tf/FP32/resnet-50-tf.xml)
+  set(AOT_SPLITTER_EXAMPLE_MODEL $ENV{COREDLA_WORK}/demo/models/public/resnet-50-tf/FP32/resnet-50-tf.xml)
+elseif(EXISTS /p/psg/swip/dla/caffe/caffe_reference/ngraph_ir/coredla/ModelZoo/2021_4_1/resnet_50_tf/FP32/resnet-50-tf.xml)
+  # The path in this branch is for Intel internal use only
+  set(AOT_SPLITTER_EXAMPLE_MODEL /p/psg/swip/dla/caffe/caffe_reference/ngraph_ir/coredla/ModelZoo/2021_4_1/resnet_50_tf/FP32/resnet-50-tf.xml)
+endif()
+
+# Input image: same precedence scheme as for the model above.
+if(DEFINED ENV{AOT_SPLITTER_EXAMPLE_INPUT})
+  set(AOT_SPLITTER_EXAMPLE_INPUT $ENV{AOT_SPLITTER_EXAMPLE_INPUT})
+elseif(EXISTS $ENV{COREDLA_ROOT}/demo/sample_images/val_00000000.bmp)
+  set(AOT_SPLITTER_EXAMPLE_INPUT $ENV{COREDLA_ROOT}/demo/sample_images/val_00000000.bmp)
+elseif(EXISTS /p/psg/swip/dla/images/imagenet/ILSVRC2012_224x224/BMP/BMP/ILSVRC2012_val_00000000.bmp)
+  # The path in this branch is for Intel internal use only
+  set(AOT_SPLITTER_EXAMPLE_INPUT /p/psg/swip/dla/images/imagenet/ILSVRC2012_224x224/BMP/BMP/ILSVRC2012_val_00000000.bmp)
+endif()
+
+# Locate the CoreDLA compiler (bin), its libraries (lib) and the directory of
+# architecture files, then choose the architecture for the selected platform.
+if(NOT EXISTS ${CoreDLA_DIR}/../bin)
+  # No bin directory next to CoreDLA_DIR: use the in-tree build under COREDLA_ROOT.
+  set(COREDLA_BIN    $ENV{COREDLA_ROOT}/build/coredla/dla/bin)
+  set(COREDLA_LIB    $ENV{COREDLA_ROOT}/build/coredla/dla/lib)
+  set(COREDLA_EXARCH $ENV{COREDLA_ROOT}/example_architectures)
+
+  # The architecture paths in this branch are for Intel internal use only
+  if(DE10_AGILEX)
+    set(AOT_SPLITTER_EXAMPLE_ARCH arch/descriptions/AGX7/64x32_i5x1_fp13agx_sb31744_xbark32_clamp_preluk32_poolk4_softmax_1inst.arch)
+  elseif(AGX7_IDK OR AGX7_N6001)
+    set(AOT_SPLITTER_EXAMPLE_ARCH arch/descriptions/AGX7/32x64_i5x1_fp13agx_sb32768_poolk4_actk32_prelu_rclamp_sig_softmaxk1.arch)
+  elseif(SYSTEM_CONSOLE_PLATFORM)
+    set(AOT_SPLITTER_EXAMPLE_ARCH 16x16_i12x1_fp12agx_sb8192_poolk4_actk16_clamp_softmaxk1.arch)
+  else()
+    set(AOT_SPLITTER_EXAMPLE_ARCH arch/descriptions/A10/64x32_i4x1_fp11_sb31744_xbark32_clamp_preluk32_poolk4_softmax.arch)
+  endif()
+else()
+  # A bin directory exists next to CoreDLA_DIR: use the directories beside it.
+  set(COREDLA_BIN    ${CoreDLA_DIR}/../bin)
+  set(COREDLA_LIB    ${CoreDLA_DIR}/../lib)
+  set(COREDLA_EXARCH ${CoreDLA_DIR}/../example_architectures)
+
+  if(DE10_AGILEX OR AGX7_IDK OR AGX7_N6001)
+    set(AOT_SPLITTER_EXAMPLE_ARCH AGX7_Performance.arch)
+  elseif(SYSTEM_CONSOLE_PLATFORM)
+    set(AOT_SPLITTER_EXAMPLE_ARCH AGX5_Small_Softmax.arch)
+  else()
+    set(AOT_SPLITTER_EXAMPLE_ARCH A10_Performance.arch)
+  endif()
+endif()
+
+# Unless the caller already supplied AOT_SPLITTER_INPUT_ARGUMENTS, pass the
+# example image (when one was found) to the splitter with -i ... -bgr.
+# With no image the variable stays undefined and expands to nothing.
+if(NOT DEFINED AOT_SPLITTER_INPUT_ARGUMENTS AND DEFINED AOT_SPLITTER_EXAMPLE_INPUT)
+  set(AOT_SPLITTER_INPUT_ARGUMENTS -i ${AOT_SPLITTER_EXAMPLE_INPUT} -bgr)
+endif()
+
+# Need to copy the system console script for Agilex 5E JTAG ED
+# Also link against Boost
+if (SYSTEM_CONSOLE_PLATFORM)
+  find_package(Boost REQUIRED COMPONENTS filesystem)
+  target_link_libraries(dla_aot_splitter_example PRIVATE Boost::filesystem)
+  # Stage the Tcl script in this directory's build tree after every build.
+  # VERBATIM keeps the paths correctly escaped regardless of generator/shell.
+  add_custom_command(
+    TARGET dla_aot_splitter_example POST_BUILD
+    COMMAND ${CMAKE_COMMAND} -E copy
+      ${CMAKE_SOURCE_DIR}/coredla_device/mmd/system_console/system_console_script.tcl
+      ${CMAKE_CURRENT_BINARY_DIR}/system_console_script.tcl
+    COMMENT "Copying system_console_script.tcl to ${CMAKE_CURRENT_BINARY_DIR}"
+    VERBATIM
+  )
+  # Tells the sources which directory the script was staged into.
+  target_compile_definitions(dla_aot_splitter_example PRIVATE DLA_SYSCON_SOURCE_ROOT=${CMAKE_CURRENT_BINARY_DIR})
+endif()
+
+# Build rule for the .mem files that src/main.cpp #includes:
+#   1. dlac compiles the example network for the chosen architecture into
+#      resnet.bin in the current binary directory;
+#   2. dla_aot_splitter is run on resnet.bin from inside include/, the
+#      directory where the OUTPUT files listed below are expected.
+# The environment is set with "cmake -E env" instead of a shell-style
+# VAR=value prefix so the rule can use VERBATIM argument escaping.
+# LD_LIBRARY_PATH is captured at configure time and extended with COREDLA_LIB.
+add_custom_command(
+  OUTPUT
+    ${CMAKE_CURRENT_BINARY_DIR}/include/arch_build.mem
+    ${CMAKE_CURRENT_BINARY_DIR}/include/config.mem
+    ${CMAKE_CURRENT_BINARY_DIR}/include/filter.mem
+    ${CMAKE_CURRENT_BINARY_DIR}/include/input.mem
+    ${CMAKE_CURRENT_BINARY_DIR}/include/inter_size.mem
+    ${CMAKE_CURRENT_BINARY_DIR}/include/output_size.mem
+  COMMAND
+    ${CMAKE_COMMAND} -E env "LD_LIBRARY_PATH=$ENV{LD_LIBRARY_PATH}:${COREDLA_LIB}"
+    ${COREDLA_BIN}/dlac
+      --network-file ${AOT_SPLITTER_EXAMPLE_MODEL}
+      --march ${COREDLA_EXARCH}/${AOT_SPLITTER_EXAMPLE_ARCH}
+      --foutput-format open_vino_hetero
+      --o ${CMAKE_CURRENT_BINARY_DIR}/resnet.bin
+  COMMAND
+    ${CMAKE_COMMAND} -E env "LD_LIBRARY_PATH=$ENV{LD_LIBRARY_PATH}:${COREDLA_LIB}"
+    $<TARGET_FILE:dla_aot_splitter>
+      ${AOT_SPLITTER_INPUT_ARGUMENTS}
+      -cm ${CMAKE_CURRENT_BINARY_DIR}/resnet.bin
+      -plugins $<TARGET_FILE_DIR:dla_aot_splitter_plugin>/plugins_aot_splitter.xml
+  DEPENDS
+    ${COREDLA_BIN}/dlac
+    dla_benchmark
+    dla_aot_splitter
+    dla_aot_splitter_plugin
+    ${AOT_SPLITTER_EXAMPLE_MODEL}
+    ${COREDLA_EXARCH}/${AOT_SPLITTER_EXAMPLE_ARCH}
+    ${AOT_SPLITTER_EXAMPLE_INPUT}
+    $<TARGET_FILE_DIR:dla_aot_splitter_plugin>/plugins_aot_splitter.xml
+  WORKING_DIRECTORY
+    ${CMAKE_CURRENT_BINARY_DIR}/include
+  VERBATIM
+)
diff --git a/python/openvino/runtime/dla_aot_splitter/dla_aot_splitter_example/src/main.cpp b/python/openvino/runtime/dla_aot_splitter/dla_aot_splitter_example/src/main.cpp
new file mode 100644
index 0000000..b90ccd5
--- /dev/null
+++ b/python/openvino/runtime/dla_aot_splitter/dla_aot_splitter_example/src/main.cpp
@@ -0,0 +1,180 @@
+// Copyright 2022 Intel Corporation.
+//
+// This software and the related documents are Intel copyrighted materials,
+// and your use of them is governed by the express license under which they
+// were provided to you ("License"). Unless the License provides otherwise,
+// you may not use, modify, copy, publish, distribute, disclose or transmit
+// this software or the related documents without Intel's prior written
+// permission.
+//
+// This software and the related documents are provided as is, with no express
+// or implied warranties, other than those that are expressly stated in the
+// License.
+
+//
+// This small tool demonstrates the minimum number of steps necessary to run an
+// inference on the FPGA while using the output files from the AoT splitter.
+//
+
+#include <iostream>
+#include <iomanip>
+#include <fstream>
+#include <stdint.h>
+#include <array>
+#include <cstring> //memcpy
+
+// Each *.mem file is produced by the build and pulled in as the initializer
+// list (or, for the two sizes, the initializer expression) of the object it
+// is #included into. Every image is exposed as 32-bit words, as a byte
+// pointer onto the same storage, and as a size in bytes.
+
+// Architecture hash followed by the build version string (see main()).
+uint32_t arch_build_mem_32[] = {
+  #include "arch_build.mem"
+};
+uint8_t* const arch_build_mem = reinterpret_cast<uint8_t*>(arch_build_mem_32);
+const uint32_t arch_build_mem_size = sizeof(arch_build_mem_32);
+
+// Input image; the byte pointer is null when the image has no bytes.
+uint32_t input_mem_32[] = {
+  #include "input.mem"
+};
+uint8_t* const input_mem =
+    (sizeof(input_mem_32) != 0) ? reinterpret_cast<uint8_t*>(input_mem_32) : nullptr;
+const uint32_t input_mem_size = sizeof(input_mem_32);
+
+// Configuration image.
+uint32_t config_mem_32[] = {
+  #include "config.mem"
+};
+uint8_t* const config_mem = reinterpret_cast<uint8_t*>(config_mem_32);
+const uint32_t config_mem_size = sizeof(config_mem_32);
+
+// Filter image.
+uint32_t filter_mem_32[] = {
+  #include "filter.mem"
+};
+uint8_t* const filter_mem = reinterpret_cast<uint8_t*>(filter_mem_32);
+const uint32_t filter_mem_size = sizeof(filter_mem_32);
+
+// Size in bytes of the output buffer read back from the device.
+constexpr uint32_t output_mem_size =
+  #include "output_size.mem"
+;
+
+// Size in bytes of the shared intermediate buffer.
+constexpr uint32_t inter_mem_size =
+  #include "inter_size.mem"
+;
+
+#include "mmd_wrapper.h"
+#include "device_memory_allocator.h"
+#include "dla_dma_constants.h" //DLA_DMA_CSR_OFFSET_***
+
+/**
+ * Runs one inference on DLA instance 0 from the images embedded at build time.
+ *
+ * Steps: check that the device's arch hash and build version match the
+ * embedded arch_build image, allocate the DDR buffers, upload input, config
+ * and filter data, start the run by writing the input/output base address,
+ * poll the completion counter, then dump the output bytes of pipeline zero
+ * to "actual_output.mem" in the current working directory.
+ *
+ * @return 0 on success; 1 on an arch hash or build version mismatch, on a
+ *         polling timeout, or when the output file cannot be written.
+ */
+int main(int argc, char *argv[]) {
+  // Pre-fill the host output buffer with the repeating bytes DE AD BE EF so
+  // that bytes the device never overwrote stand out in the dump.
+  std::array<uint8_t, output_mem_size> actual_output_mem;
+  for (uint64_t i = 0u; i < actual_output_mem.size(); i++)
+  {
+    actual_output_mem[i] = static_cast<uint8_t>((0xDEADBEEF) >> ((3 - (i % 4)) * 8));
+  }
+
+  std::cout << "AOT Splitter Example" << std::endl;
+
+  constexpr int instance = 0;
+
+  constexpr int _maxNumPipelines = 5;
+  constexpr int numPipelines = _maxNumPipelines;
+
+  // TODO: retrieve this from the arch file
+  constexpr uint64_t featureWordSize = 32;
+  constexpr uint64_t filterWordSize = 64;
+
+  // Layout of arch_build.mem and of the CSR space starting at offset 0:
+  // ARCH_HASH_SIZE bytes of hash followed by BUILD_VERSION_SIZE bytes of text.
+  constexpr int ARCH_HASH_SIZE = 16;
+  constexpr int BUILD_VERSION_SIZE = 32;
+
+  MmdWrapper mmdWrapper{};
+  DeviceMemoryAllocator ddrAllocator{};
+
+  // Compare the arch hash one 32-bit word at a time; i is the CSR byte offset.
+  for (size_t i = 0; i < ARCH_HASH_SIZE; i += 4) {
+    uint32_t arch_build_word_from_device = mmdWrapper.ReadFromCsr(instance, i);
+    if (arch_build_mem_32[i/4] != arch_build_word_from_device)
+    {
+      std::cout << "Arch hash mismatch at word " << i << " : expected " <<
+        std::setfill('0') << std::setw(8) << std::uppercase << std::hex << (uint32_t)arch_build_mem_32[i/4] <<
+        " != " <<
+        std::setfill('0') << std::setw(8) << std::uppercase << std::hex << (uint32_t)arch_build_word_from_device << std::endl;
+      return 1;
+    }
+  }
+
+  // Expected build version: the bytes right after the hash, NUL-terminated.
+  char expected_build_version[BUILD_VERSION_SIZE + 1];
+  expected_build_version[BUILD_VERSION_SIZE] = '\0';
+  std::memcpy(expected_build_version, (uint8_t*)&arch_build_mem_32[ARCH_HASH_SIZE/sizeof(uint32_t)], BUILD_VERSION_SIZE);
+
+  // Actual build version: read word by word from the CSRs and unpacked
+  // least-significant byte first.
+  char actual_build_version[BUILD_VERSION_SIZE + 1];
+  actual_build_version[BUILD_VERSION_SIZE] = '\0';
+
+  for (uint32_t i = 0; i < BUILD_VERSION_SIZE; i += 4)
+  {
+    uint32_t chunk = mmdWrapper.ReadFromCsr(instance, ARCH_HASH_SIZE + i);
+    for (uint8_t j = 0; j < 4; j++)
+    {
+      actual_build_version[i+j] = chunk & 0xFF;
+      chunk >>= 8;
+    }
+  }
+  if (0 != std::strncmp(expected_build_version, actual_build_version, BUILD_VERSION_SIZE))
+  {
+    std::cout << "Build version mismatch. Expected " << expected_build_version << " actual " << actual_build_version << std::endl;
+    return 1;
+  }
+
+  ddrAllocator.Initialize(mmdWrapper.GetDDRSizePerInstance(), &mmdWrapper);
+
+  ddrAllocator.AllocateSharedBuffer(inter_mem_size, instance);
+  //mmdWrapper.WriteToCsr(instance, DLA_DMA_CSR_OFFSET_INTERMEDIATE_BASE_ADDR, 0);
+
+  // Sizes are widened to 64 bits before the arithmetic so that the sum and
+  // product cannot wrap around in 32-bit unsigned arithmetic.
+  uint64_t inputOutputBufferSize =
+      static_cast<uint64_t>(numPipelines) *
+      (static_cast<uint64_t>(input_mem_size) + output_mem_size);  // how much space to allocate
+  uint64_t inputOutputBufferAlignment = featureWordSize;          // starting address must be aligned to this
+  uint64_t inputOutputBufferAddr;                                 // where did the allocator place this buffer
+  ddrAllocator.AllocatePrivateBuffer(inputOutputBufferSize, inputOutputBufferAlignment, inputOutputBufferAddr);
+
+  uint64_t configFilterBufferSize = static_cast<uint64_t>(config_mem_size) + filter_mem_size;
+  uint64_t configFilterBufferAlignment = filterWordSize;
+  uint64_t configFilterBufferAddr;
+  ddrAllocator.AllocatePrivateBuffer(configFilterBufferSize, configFilterBufferAlignment, configFilterBufferAddr);
+
+  mmdWrapper.WriteToCsr(instance, DLA_DMA_CSR_OFFSET_INTERRUPT_MASK, 0);
+  mmdWrapper.WriteToCsr(instance, DLA_DMA_CSR_OFFSET_INTERRUPT_CONTROL, 3);
+  // Remember the counter value before the run; a change signals completion.
+  uint32_t completionCount = mmdWrapper.ReadFromCsr(instance, DLA_DMA_CSR_OFFSET_COMPLETION_COUNT);
+  std::cout << "Initial completion count " << completionCount << std::endl;
+
+  // The input goes at the start of the input/output buffer.
+  mmdWrapper.WriteToDDR(instance, inputOutputBufferAddr, input_mem_size, input_mem);
+
+  // Config and filter data share one buffer: config first, filter right after.
+  mmdWrapper.WriteToDDR(instance, configFilterBufferAddr, config_mem_size, config_mem);
+  mmdWrapper.WriteToDDR(instance, configFilterBufferAddr + config_mem_size, filter_mem_size, filter_mem);
+
+  mmdWrapper.WriteToCsr(instance, DLA_DMA_CSR_OFFSET_CONFIG_BASE_ADDR, configFilterBufferAddr);
+  constexpr int CONFIG_READER_DATA_BYTES = 8; // May want to move to a header in production code
+  mmdWrapper.WriteToCsr(instance, DLA_DMA_CSR_OFFSET_CONFIG_RANGE_MINUS_TWO, ((config_mem_size) / CONFIG_READER_DATA_BYTES) - 2);
+
+  // base address for feature reader -- this will trigger one run of DLA
+  mmdWrapper.WriteToCsr(instance, DLA_DMA_CSR_OFFSET_INPUT_OUTPUT_BASE_ADDR, inputOutputBufferAddr);
+
+  // Busy-poll the completion counter, giving up after a fixed number of reads.
+  constexpr int MAX_POLLING_INTERVALS = 100000;
+  int i = 0;
+  while (mmdWrapper.ReadFromCsr(instance, DLA_DMA_CSR_OFFSET_COMPLETION_COUNT) == completionCount)
+  {
+    i++;
+    if (i >= MAX_POLLING_INTERVALS) {
+      std::cout << "Timeout" << std::endl;
+      return 1;
+    }
+  }
+
+  std::cout << "Completed inference in " << i << " polling intervals" << std::endl;
+
+  //Reading from pipeline zero
+  mmdWrapper.ReadFromDDR(instance, inputOutputBufferAddr + input_mem_size, actual_output_mem.size(), actual_output_mem.data());
+
+  // Dump the raw output bytes; failing to write them is reported as an error
+  // instead of silently returning success.
+  std::ofstream of ("actual_output.mem", std::ios_base::out | std::ios_base::binary);
+  if (!of) {
+    std::cout << "Failed to open actual_output.mem for writing" << std::endl;
+    return 1;
+  }
+  of.write(reinterpret_cast<const char*>(actual_output_mem.data()), actual_output_mem.size());
+  of.close();
+  if (!of) {
+    std::cout << "Failed to write actual_output.mem" << std::endl;
+    return 1;
+  }
+
+  return 0;
+}