completed thesisHEAD master

author: Eric Dao <eric@erickhangdao.com> 2025-03-10 17:54:31 -0400
committer: Eric Dao <eric@erickhangdao.com> 2025-03-10 17:54:31 -0400
commit: ab224e2e6ba65f5a369ec392f99cd8845ad06c98 (patch)
tree: a1e757e9341863ed52b8ad4c5a1c45933aab9da4 /python/openvino/runtime/coredla_device/inc
parent: 40da1752f2c8639186b72f6838aa415e854d0b1d (diff)
download: thesis-master.tar.gz
thesis-master.tar.bz2
thesis-master.zip
10 files changed, 675 insertions, 0 deletions
diff --git a/python/openvino/runtime/coredla_device/inc/batch_job.h b/python/openvino/runtime/coredla_device/inc/batch_job.h
new file mode 100644
index 0000000..76fd968
--- /dev/null
+++ b/python/openvino/runtime/coredla_device/inc/batch_job.h
@@ -0,0 +1,31 @@
+// Copyright 2020-2023 Intel Corporation.
+//
+// This software and the related documents are Intel copyrighted materials,
+// and your use of them is governed by the express license under which they
+// were provided to you ("License"). Unless the License provides otherwise,
+// you may not use, modify, copy, publish, distribute, disclose or transmit
+// this software or the related documents without Intel's prior written
+// permission.
+//
+// This software and the related documents are provided as is, with no express
+// or implied warranties, other than those that are expressly stated in the
+// License.
+
+#ifndef BATCH_JOB_H
+#define BATCH_JOB_H
+
+class BatchJob {
+ public:
+  // @param inputArray - ptr to CPU array containing input data to be copied to DDR
+  // blocking function
+  virtual void LoadInputFeatureToDDR(void* inputArray) = 0;
+  // @param outputArray - ptr to CPU array where the output data in DDR is copied into
+  // outputArray must be allocated by the caller (size >= output_size_ddr)
+  // blocking function
+  virtual void ReadOutputFeatureFromDDR(void* outputArray) const = 0;
+  virtual void ScheduleInputFeature() const = 0;
+  virtual void StartDla() = 0;
+  virtual ~BatchJob() {}
+};
+
+#endif
diff --git a/python/openvino/runtime/coredla_device/inc/coredla_batch_job.h b/python/openvino/runtime/coredla_device/inc/coredla_batch_job.h
new file mode 100644
index 0000000..7d91f0e
--- /dev/null
+++ b/python/openvino/runtime/coredla_device/inc/coredla_batch_job.h
@@ -0,0 +1,88 @@
+// Copyright 2020-2023 Intel Corporation.
+//
+// This software and the related documents are Intel copyrighted materials,
+// and your use of them is governed by the express license under which they
+// were provided to you ("License"). Unless the License provides otherwise,
+// you may not use, modify, copy, publish, distribute, disclose or transmit
+// this software or the related documents without Intel's prior written
+// permission.
+//
+// This software and the related documents are provided as is, with no express
+// or implied warranties, other than those that are expressly stated in the
+// License.
+
+#pragma once
+
+#include "batch_job.h"    // BatchJob
+#include "mmd_wrapper.h"  // MmdWrapper
+
+// TODO:integrate with dla compiler later
+// #include "dla_types.h"
+// #include "compiled_result_runtime_required_elements.h"
+
+#include <cstdint>  // uint64_t
+#include <memory>   // std::unique_ptr
+
+class StreamControllerComms;
+
+// BatchJob represents one batch execution
+// Contains input/output address and size in DDR for one batch
+// Contains functions to write feature data to DDR, start DLA and read output data from DDR
+class CoreDlaBatchJob : public BatchJob {
+ private:
+  // MMD access is required to handshake with CSR and transfer data between host/device memory
+  MmdWrapper* mmdWrapper_;
+  int instance_;
+  // size and address of graph config data allocated in DDR
+  uint64_t totalConfigWords_;
+  uint64_t configBaseAddrDDR_;
+  // size and address of input and output data allocated in DDR for 1 batch
+  uint64_t inputAddrDDR_;
+  uint64_t outputAddrDDR_;
+  uint64_t inputSizeDDR_;
+  uint64_t outputSizeDDR_;
+  const bool enableIstream_;
+  const bool enableOstream_;
+  uint64_t lastJobQueueNumber_;
+
+  std::shared_ptr<StreamControllerComms> spStreamControllerComms_;
+
+  CoreDlaBatchJob(MmdWrapper* mmdWrapper,
+                  uint64_t totalConfigWords,
+                  uint64_t configBaseAddrDDR,
+                  uint64_t inputAddrDDR,
+                  uint64_t outputAddrDDR,
+                  uint64_t inputSizeDDR,
+                  uint64_t outputSizeDDR,
+                  const bool enableIstream,
+                  const bool enableOstream,
+                  int instance,
+                  std::shared_ptr<StreamControllerComms> spStreamControllerComms);
+
+ public:
+  CoreDlaBatchJob(const CoreDlaBatchJob&) = delete;
+  CoreDlaBatchJob(CoreDlaBatchJob&) = delete;
+  CoreDlaBatchJob& operator=(const CoreDlaBatchJob&) = delete;
+  static std::unique_ptr<BatchJob> MakeUnique(MmdWrapper* mmdWrapper,
+                                              uint64_t totalConfigWords,
+                                              uint64_t configBaseAddrDDR,
+                                              uint64_t inputAddrDDR,
+                                              uint64_t outputAddrDDR,
+                                              uint64_t inputSizeDDR,
+                                              uint64_t outputSizeDDR,
+                                              const bool enableIstream,
+                                              const bool enableOstream,
+                                              int instance,
+                                              std::shared_ptr<StreamControllerComms> spStreamControllerComms);
+  // @param inputArray - ptr to CPU array containing input data to be copied to DDR
+  // blocking function
+  void LoadInputFeatureToDDR(void* inputArray) override;
+  void ScheduleInputFeature() const override;
+
+  // Starts DLA by writing the DDR addresses of graph config and input data to the CSR in DLA DMA
+  void StartDla() override;
+  // @param outputArray - ptr to CPU array where the output data in DDR is copied into
+  // outputArray must be allocated by the caller (size >= output_size_ddr)
+  // blocking function
+  void ReadOutputFeatureFromDDR(void* outputArray) const override;
+};
diff --git a/python/openvino/runtime/coredla_device/inc/coredla_device.h b/python/openvino/runtime/coredla_device/inc/coredla_device.h
new file mode 100644
index 0000000..2a04fa8
--- /dev/null
+++ b/python/openvino/runtime/coredla_device/inc/coredla_device.h
@@ -0,0 +1,144 @@
+// Copyright 2020-2023 Intel Corporation.
+//
+// This software and the related documents are Intel copyrighted materials,
+// and your use of them is governed by the express license under which they
+// were provided to you ("License"). Unless the License provides otherwise,
+// you may not use, modify, copy, publish, distribute, disclose or transmit
+// this software or the related documents without Intel's prior written
+// permission.
+//
+// This software and the related documents are provided as is, with no express
+// or implied warranties, other than those that are expressly stated in the
+// License.
+
+#pragma once
+
+#include "compiled_result.h"          //dla::CompiledResult
+#include "device.h"                   //Device
+#include "device_memory_allocator.h"  //DeviceMemoryAllocator
+#include "graph_job.h"                //GraphJob
+#include "mmd_wrapper.h"              //MmdWrapper
+
+#include <condition_variable>  //std::condition_variable
+#include <cstdint>             //uint64_t
+#include <map>                 //std::map
+#include <memory>              //std::unique_ptr
+#include <mutex>               //std::mutex
+#include <vector>              //std::vector
+
+class StreamControllerComms;
+
+// The interface of the interrupt service routine dictates that all the data the ISR needs must be passed in through
+// one pointer of type void *. Package it up here. WaitForDla() uses jobsWaited and jobsFinished to determine if a job
+// has already finished or still needs to wait. The ISR only updates jobsFinished, so jobsWaited is only a member of
+// CoreDlaDevice. The mutex and condition variable are used to synchronize between InterruptServiceRoutine() and
+// WaitForDla(). All of these are replicated per CoreDLA IP instance, hence the use of vector.
+// base_multiplier and prevCount are used to handle the jobsFinished wrap-around that could happen in the hardware CSR
+// as the CSR is only 32-bit wide but the jobsFinished is 64-bit wide
+struct InterruptServiceRoutineData {
+  MmdWrapper* mmdWrapper;
+  std::vector<uint64_t> jobsFinished;
+  std::vector<uint32_t> base_multiplier;
+  std::vector<uint32_t> prevCount;
+  std::vector<uint32_t> desc_queue_diag;
+  std::vector<std::mutex> isrMutex;
+  std::vector<std::condition_variable> isrCondVar;
+};
+
+/*! CoreDlaDevice class represents a DLA device mapped using the MMD + OPAE SW stack
+ * On construction, dynamically loads the MMD library at runtime and initializes the state of MMD
+ * Implements functions that wrap various MMD calls to read/write to DDR/CSR and process HW interrupts
+ */
+class CoreDlaDevice : public Device {
+ public:
+  GraphJob* CreateGraphJob(const dla::CompiledResult* compiledResult,
+#ifndef USE_OLD_COREDLA_DEVICE
+                           size_t numPipelines,
+#else
+                           uint64_t numPipelines,
+#endif
+                           int instance,
+                           std::string AES_key,
+                           std::string IV_key,
+                           bool encryption_enabled,
+                           // This param is unused for HW runtime! So why include it? CoreDLA utilizes base pointers
+                           // for both HW and SW emulator runtime. The software emulator has an output file whereas currently the
+                           // HW runtime does not.
+                           const std::string export_dir,
+                           const std::string parameter_rom_export_dir);
+  // Return number of DLA jobs completed till now
+  // Used for debugging
+  int GetNumInferencesCompleted(int instance) const override { return isrData_.jobsFinished.at(instance); }
+  // Must be called when there are no active jobs on DLA
+  // Returns the total time taken by DLA jobs on hardware (in milliseconds)
+  double GetActiveHWTimeMs(int instance) const override;
+  // Must be called when there are no active jobs on DLA
+  // Returns the average of time taken per job (in milliseconds)
+  // Avg Time per job < Active Time
+  double GetAvgHWTimePerJobMs(uint64_t num_jobs, int instance) const override;
+  // Must be called when there are no active jobs on DLA
+  // Returns the number of memory reads made by the input feature reader
+  uint64_t GetNumInputFeatureMemoryReads(int instance) const override;
+  // Must be called when there are no active jobs on DLA
+  // Returns the number of memory reads made by the filter reader
+  uint64_t GetNumFilterMemoryReads(int instance) const override;
+  // Must be called when there are no active jobs on DLA
+  // Returns the number of memory writes made by the output feature writer
+  uint64_t GetNumOutputFeatureMemoryWrites(int instance) const override;
+
+ private:
+  // Read one 32-bit value from the debug network, return value indicates whether read was successful. A read can fail
+  // if the module number and address have not been implemented. The debug network is fault tolerant to both read
+  // requests never being accepted as well as read responses never being produced.
+  bool ReadDebugCsr(uint32_t moduleNum, uint32_t address, int instance, uint32_t& readData, bool verbose = false) const;
+
+#ifndef USE_OLD_COREDLA_DEVICE
+  // Must be called when there are no active jobs on DLA
+  // Returns total number of clocks by DLA jobs on hardware.
+  uint64_t GetClocksActive(int instance) const;
+
+  // Must be called when there are no active jobs on DLA
+  // Returns the clocks of all jobs
+  uint64_t GetClocksAllJobs(int instance) const;
+#endif
+
+  uint64_t GetNumInputFeatureMemoryReadsTotal(int instance) const;
+
+  uint64_t GetNumFilterMemoryReadsTotal(int instance) const;
+
+  uint64_t GetNumOutputFeatureMemoryWritesTotal(int instance) const;
+
+ public:
+  // Modules attached to the debug network have a ROM to specify the offset and description of the registers. Traverse
+  // this ROM, then return a map of key/value pairs, where the key is a human readable string describing what kind of
+  // information the debug register contains, and the value is the data of the debug register.
+  DebugNetworkData ReadDebugNetwork(int instance) const override;
+
+  CoreDlaDevice(uint32_t waitForDlaTimeoutSeconds);
+  ~CoreDlaDevice();
+  int GetSizeCsrDescriptorQueue() const override;
+  double GetCoreDlaClockFreq() const override;
+  int GetNumInstances() const override { return numInstances_; }
+  void WaitForDla(int instance, size_t threadId = 0, std::function<bool()> isCancelled = nullptr) override;  // threadId is optional and for debugging purpose only
+  std::string SchedulerGetStatus() const override;
+  bool InitializeScheduler(uint32_t sourceBufferSize, uint32_t dropSourceBuffers, uint32_t numInferenceRequests,
+                           const std::string source_fifo_file="") override;
+
+ private:
+  std::unique_ptr<DeviceMemoryAllocator[]> ddrAllocator_;
+  std::vector<std::unique_ptr<GraphJob>> allGraphJobs_;
+  int numInstances_;
+  MmdWrapper mmdWrapper_;
+  InterruptServiceRoutineData isrData_;
+  std::vector<uint64_t> jobsWaited_;
+#ifndef USE_OLD_COREDLA_DEVICE
+  std::vector<uint64_t> startClocksActive;
+  std::vector<uint64_t> startClockAllJobs;
+#endif
+  std::vector<uint64_t> startNumInputFeatureMemoryReads;
+  std::vector<uint64_t> startNumFilterMemoryReads;
+  std::vector<uint64_t> startNumOutputFeatureMemoryWrites;
+  std::shared_ptr<StreamControllerComms> spStreamControllerComms_;
+  bool runtimePolling_;
+  uint32_t waitForDlaTimeoutSeconds_;
+};
diff --git a/python/openvino/runtime/coredla_device/inc/coredla_graph_job.h b/python/openvino/runtime/coredla_device/inc/coredla_graph_job.h
new file mode 100644
index 0000000..3dc91bc
--- /dev/null
+++ b/python/openvino/runtime/coredla_device/inc/coredla_graph_job.h
@@ -0,0 +1,83 @@
+// Copyright 2020-2023 Intel Corporation.
+//
+// This software and the related documents are Intel copyrighted materials,
+// and your use of them is governed by the express license under which they
+// were provided to you ("License"). Unless the License provides otherwise,
+// you may not use, modify, copy, publish, distribute, disclose or transmit
+// this software or the related documents without Intel's prior written
+// permission.
+//
+// This software and the related documents are provided as is, with no express
+// or implied warranties, other than those that are expressly stated in the
+// License.
+
+#pragma once
+
+#include "compiled_result.h"          //dla::CompiledResult
+#include "coredla_batch_job.h"        //BatchJob
+#include "device.h"                   //DLA_LOG
+#include "device_memory_allocator.h"  //DeviceMemoryAllocator
+#include "graph_job.h"                //GraphJob
+#include "mmd_wrapper.h"              //MmdWrapper
+
+// TODO:integrate with dla compiler later
+//#include "dla_types.h"
+//#include "compiled_result_runtime_required_elements.h"
+
+#include <cstdint>  //uint64_t
+#include <memory>   //std::unique_ptr
+#include <mutex>    //std::mutex
+#include <vector>   //std::vector
+
+/*! GraphJob is a DLA compiled graph loaded onto a device
+ * Initialized with DlaDevice object
+ * GraphJob allocates space in DDR for filter, bias, config, inputs and outputs
+ * It provides handle to "batch job" objects that are used to load input and start DLA for one batch
+ */
+
+class CoreDlaGraphJob : public GraphJob {
+ public:
+  // Function to construct and return a unique pointer GraphJob object to the runtime user
+  // TODO: Provide DLA compiled result object which will contain all the necessary runtime elements as below
+  // @param configFilterBiasBufferSizeDDR - total size of the constants - config, filter and bias
+  // @param configFilterBiasBuffer - ptr to one contiguous CPU array for config, filter and bias (obtained from DLA
+  // compiler's output)
+  // @param totalConfigWords - size of config data in words (size of 1 config word is defined in dla_device.h
+  // "CONFIG_READER_DATA_BYTES")
+  // @param intermediateBufferSizeDDR - size of the buffer space required in DDR for feature data of intermediate layers
+  // @param inputSizeDDR - size of one batch input data in DDR. Multiple images in one batch should be contiguously
+  // placed
+  // @param outputSizeDDR - size of one batch output data in DDR
+  // @param numPipelines - number of I/O buffer pairs created for CPU-FPGA pipelining of multiple batch runs
+  // @param spStreamControllerComms - optional interface to stream controller
+  static std::unique_ptr<GraphJob> MakeUnique(DeviceMemoryAllocator* ddrBufferAllocator,
+                                              MmdWrapper* mmdWrapper,
+                                              const dla::CompiledResult* compiled_result,
+                                              uint64_t numPipelines,
+                                              int instance,
+                                              std::shared_ptr<StreamControllerComms> spStreamControllerComms);
+  // Returns an unused batch job object
+  // If all batch jobs are used, returns null
+  // Increments batchJobsRequested_
+  // Thread safe
+  BatchJob* GetBatchJob();
+  CoreDlaGraphJob(const GraphJob&) = delete;
+  CoreDlaGraphJob(CoreDlaGraphJob&) = delete;
+  CoreDlaGraphJob& operator=(const CoreDlaGraphJob&) = delete;
+
+ private:
+  uint64_t configFilterBiasBufferSizeDDR_;
+  uint64_t intermediateBufferSizeDDR_;
+  DeviceMemoryAllocator* ddrBufferAllocator_;
+  MmdWrapper* mmdWrapper_;
+  std::vector<std::unique_ptr<BatchJob>> batchJobs_;
+  unsigned int batchJobsRequested_;
+  unsigned int instance_;
+  std::mutex graphJobMutex;
+  CoreDlaGraphJob(DeviceMemoryAllocator* ddrBufferAllocator,
+                  MmdWrapper* mmdWrapper,
+                  const dla::CompiledResult* compiledResult,
+                  uint64_t numPipelines,
+                  int instance,
+                  std::shared_ptr<StreamControllerComms> spStreamControllerComms);
+};
diff --git a/python/openvino/runtime/coredla_device/inc/device.h b/python/openvino/runtime/coredla_device/inc/device.h
new file mode 100644
index 0000000..e506578
--- /dev/null
+++ b/python/openvino/runtime/coredla_device/inc/device.h
@@ -0,0 +1,81 @@
+// Copyright 2020-2023 Intel Corporation.
+//
+// This software and the related documents are Intel copyrighted materials,
+// and your use of them is governed by the express license under which they
+// were provided to you ("License"). Unless the License provides otherwise,
+// you may not use, modify, copy, publish, distribute, disclose or transmit
+// this software or the related documents without Intel's prior written
+// permission.
+//
+// This software and the related documents are provided as is, with no express
+// or implied warranties, other than those that are expressly stated in the
+// License.
+
+#ifndef DEVICE_H
+#define DEVICE_H
+
+#include <functional>
+#include <map>
+#include <memory>
+#include <string>
+#include <vector>
+
+#include "dla_runtime_log.h"
+
+using namespace std;
+using DebugNetworkData = std::map<std::string, uint64_t>;
+
+// dla log macro
+#define DLA_LOG(fmt, ...) printf(fmt, ##__VA_ARGS__);
+#define DLA_ERROR(fmt, ...) printf(fmt, ##__VA_ARGS__);
+
+class GraphJob;
+class arch_params;
+namespace dla {
+class CompiledResult;
+}
+class Device {
+ public:
+  static unique_ptr<Device> MakeUnique(const arch_params* archParams, uint32_t waitForDlaTimeoutSeconds);
+  virtual GraphJob* CreateGraphJob(const dla::CompiledResult* compiledResult,
+                                   size_t numPipelines,
+                                   int instance,
+                                   std::string AES_key,
+                                   std::string IV_key,
+                                   bool encryption_enabled,
+                                   const std::string export_dir,
+                                   const std::string parameter_rom_export_dir) = 0;
+  // Return number of DLA jobs completed till now
+  // Used for debugging
+  virtual int GetNumInferencesCompleted(int instance) const = 0;
+  // Must be called when there are no active jobs on DLA
+  // Returns the total time taken by DLA jobs on hardware (in milliseconds)
+  virtual double GetActiveHWTimeMs(int instance) const = 0;
+  // Must be called when there are no active jobs on DLA
+  // Returns the average of time taken per job (in milliseconds)
+  // Avg Time per job < Active Time
+  virtual double GetAvgHWTimePerJobMs(uint64_t num_jobs, int instance) const = 0;
+  // Must be called when there are no active jobs on DLA
+  // Returns the number of memory reads made by the input feature reader
+  virtual uint64_t GetNumInputFeatureMemoryReads(int instance) const = 0;
+  // Must be called when there are no active jobs on DLA
+  // Returns the number of memory reads made by the filter reader
+  virtual uint64_t GetNumFilterMemoryReads(int instance) const = 0;
+  // Must be called when there are no active jobs on DLA
+  // Returns the number of memory writes made by the output feature writer
+  virtual uint64_t GetNumOutputFeatureMemoryWrites(int instance) const = 0;
+  // Waits for a job to finish on specified instance
+  virtual void WaitForDla(int instance, size_t threadId = 0, std::function<bool()> isCancelled = nullptr) = 0;
+  virtual int GetNumInstances() const = 0;
+  virtual double GetCoreDlaClockFreq() const = 0;
+  virtual int GetSizeCsrDescriptorQueue() const = 0;
+  virtual std::string SchedulerGetStatus() const = 0;
+  virtual bool InitializeScheduler(uint32_t sourceBufferSize,
+                                   uint32_t dropSourceBuffers,
+                                   uint32_t numInferenceRequests,
+                                   const std::string source_fifo_file="") = 0;
+  virtual DebugNetworkData ReadDebugNetwork(int instance) const = 0;
+  virtual ~Device(){}
+};
+
+#endif  // DEVICE_H
diff --git a/python/openvino/runtime/coredla_device/inc/device_memory_allocator.h b/python/openvino/runtime/coredla_device/inc/device_memory_allocator.h
new file mode 100644
index 0000000..adc0a71
--- /dev/null
+++ b/python/openvino/runtime/coredla_device/inc/device_memory_allocator.h
@@ -0,0 +1,61 @@
+// Copyright 2020 Intel Corporation.
+//
+// This software and the related documents are Intel copyrighted materials,
+// and your use of them is governed by the express license under which they
+// were provided to you ("License"). Unless the License provides otherwise,
+// you may not use, modify, copy, publish, distribute, disclose or transmit
+// this software or the related documents without Intel's prior written
+// permission.
+//
+// This software and the related documents are provided as is, with no express
+// or implied warranties, other than those that are expressly stated in the
+// License.
+
+#pragma once
+
+#include "mmd_wrapper.h"  //MmdWrapper
+
+#include <cstdint>  //uint64_t
+
+/*! DeviceMemoryAllocator class allocates multiple DLA graph buffers in DDR
+ * Each graph is expected to have one contiguous buffer containing all data (config, filter, bias, I/O)
+ * A graph buffer is allocated in DDR from right to left
+ * A scratchpad space is allocated in DDR to be shared across all graphs for intermediate feature data
+ * This intermediate buffer space is allocated from left to right (starting address is 0)
+ * and is expanded based on graph's requirement
+ */
+class DeviceMemoryAllocator {
+ public:
+  void Initialize(uint64_t totalSize, MmdWrapper *mmdWrapper);
+  ~DeviceMemoryAllocator();
+
+  // Buffers that can be shared across multiple graphs may grow in size after they are allocated. The intermediate
+  // buffer is an example of this. We have decided to allocate this at the lowest address and let it grow upwards.
+  // @param bufferSize - the size of the buffer in bytes
+  // @param instance - there can be multiple instances of DLA on FPGA, specify which DLA instance is this buffer for
+  void AllocateSharedBuffer(uint64_t bufferSize, int instance);
+
+  // Buffers that are private to one graph will not change in size after allocation. The config/filter buffer is
+  // an example of this. We have decided to allocate this at the upper address and allocate downwards from there.
+  // Hardware requires the starting address of each buffer to have some alignment, and the allocator will add
+  // as much padding as needed to ensure this. Each contiguous section in device memory should have its own call
+  // to the allocator.
+  // @param bufferSize - the size of the buffer in bytes
+  // @param bufferAlignment - specify how much address alignment is needed for this buffer, must be a power of 2
+  // @param bufferAddr - the allocator indicates where it placed this buffer
+  void AllocatePrivateBuffer(uint64_t bufferSize, uint64_t bufferAlignment, uint64_t &bufferAddr);
+
+  // Clears whole DDR space including the intermediate buffer
+  void Clear();
+
+ private:
+  // total DDR size (BSP parameter)
+  uint64_t totalGlobalMemSize_;
+  // For access to MMD
+  MmdWrapper *mmdWrapper_;
+  // current starting address of allocated graph buffer region
+  // graph buffers are allocated right to left
+  uint64_t currentStartAddressGraphBufferSpace_;
+  // current maximum allocated size for intermediate data
+  uint64_t currentIntermediateMaxBufferSizeAllocated_;
+};
diff --git a/python/openvino/runtime/coredla_device/inc/dla_dma_constants.h b/python/openvino/runtime/coredla_device/inc/dla_dma_constants.h
new file mode 100644
index 0000000..13fb56b
--- /dev/null
+++ b/python/openvino/runtime/coredla_device/inc/dla_dma_constants.h
@@ -0,0 +1,27 @@
+// Copyright 2020 Intel Corporation.
+//
+// This software and the related documents are Intel copyrighted materials,
+// and your use of them is governed by the express license under which they
+// were provided to you ("License"). Unless the License provides otherwise,
+// you may not use, modify, copy, publish, distribute, disclose or transmit
+// this software or the related documents without Intel's prior written
+// permission.
+//
+// This software and the related documents are provided as is, with no express
+// or implied warranties, other than those that are expressly stated in the
+// License.
+
+#pragma once
+
+// save a copy
+#pragma push_macro("localparam")
+
+// convert the syntax of verilog into C++, replace "localparam int MY_VAR = 123;" with "constexpr int MY_VAR = 123;"
+#undef localparam
+#define localparam constexpr
+
+// include the verilog header
+#include "dla_dma_constants.svh"
+
+// undo the syntax change
+#pragma pop_macro("localparam")
diff --git a/python/openvino/runtime/coredla_device/inc/graph_job.h b/python/openvino/runtime/coredla_device/inc/graph_job.h
new file mode 100644
index 0000000..b04dde1
--- /dev/null
+++ b/python/openvino/runtime/coredla_device/inc/graph_job.h
@@ -0,0 +1,28 @@
+// Copyright 2020-2023 Intel Corporation.
+//
+// This software and the related documents are Intel copyrighted materials,
+// and your use of them is governed by the express license under which they
+// were provided to you ("License"). Unless the License provides otherwise,
+// you may not use, modify, copy, publish, distribute, disclose or transmit
+// this software or the related documents without Intel's prior written
+// permission.
+//
+// This software and the related documents are provided as is, with no express
+// or implied warranties, other than those that are expressly stated in the
+// License.
+
+#ifndef GRAPH_JOB_H
+#define GRAPH_JOB_H
+
+#include "batch_job.h"
+using namespace std;
+class GraphJob {
+ public:
+  // Returns an unused batch job object
+  // If all batch jobs are used, returns null
+  virtual BatchJob* GetBatchJob() = 0;
+
+  virtual ~GraphJob(){}
+};
+
+#endif
diff --git a/python/openvino/runtime/coredla_device/inc/mmd_wrapper.h b/python/openvino/runtime/coredla_device/inc/mmd_wrapper.h
new file mode 100644
index 0000000..4014454
--- /dev/null
+++ b/python/openvino/runtime/coredla_device/inc/mmd_wrapper.h
@@ -0,0 +1,63 @@
+// Copyright 2020 Intel Corporation.
+//
+// This software and the related documents are Intel copyrighted materials,
+// and your use of them is governed by the express license under which they
+// were provided to you ("License"). Unless the License provides otherwise,
+// you may not use, modify, copy, publish, distribute, disclose or transmit
+// this software or the related documents without Intel's prior written
+// permission.
+//
+// This software and the related documents are provided as is, with no express
+// or implied warranties, other than those that are expressly stated in the
+// License.
+
+#pragma once
+
+#include <cstdint>  //uint32_t
+
+using interrupt_service_routine_signature = void (*)(int handle, void *data);
+
+class MmdWrapper {
+ public:
+  MmdWrapper();
+  // Note that ~MmdWrapper() can call std::exit(1) if aocl_mmd_close()
+  // fails.  Ideally we would find some way to re-order the code so that it
+  // can throw an exception (before calling the destructor) if aocl_mmd_close()
+  // fails.
+  ~MmdWrapper();
+
+  // class cannot be copied
+  MmdWrapper(const MmdWrapper &) = delete;
+  MmdWrapper &operator=(const MmdWrapper &) = delete;
+
+  // Register a function to run as the interrupt service routine
+  void RegisterISR(interrupt_service_routine_signature func, void *data) const;
+
+  // 32-bit handshake with each CSR
+  void WriteToCsr(int instance, uint32_t addr, uint32_t data) const;
+  uint32_t ReadFromCsr(int instance, uint32_t addr) const;
+
+  // Copy data between host and device memory
+  void WriteToDDR(int instance, uint64_t addr, uint64_t length, const void *data) const;
+  void ReadFromDDR(int instance, uint64_t addr, uint64_t length, void *data) const;
+
+  // If the mmd layer supports accesses to the STREAM CONTROLLER
+  bool bIsStreamControllerValid(int instance) const;
+
+  // 32-bit handshake with each Stream Controller CSR
+  void WriteToStreamController(int instance, uint32_t addr, uint64_t length, const void *data) const;
+  void ReadFromStreamController(int instance, uint32_t addr, uint64_t length, void *data) const;
+
+  // Provide read-only access to board-specific constants
+  int GetMaxInstances() const { return maxInstances_; }
+  uint64_t GetDDRSizePerInstance() const { return ddrSizePerInstance_; }
+  double GetCoreDlaClockFreq() const { return coreDlaClockFreq_; }
+  double GetDDRClockFreq() const { return ddrClockFreq_; }
+
+ private:
+  int handle_;
+  int maxInstances_;
+  uint64_t ddrSizePerInstance_;
+  double coreDlaClockFreq_;
+  double ddrClockFreq_;
+};
diff --git a/python/openvino/runtime/coredla_device/inc/stream_controller_comms.h b/python/openvino/runtime/coredla_device/inc/stream_controller_comms.h
new file mode 100644
index 0000000..e2fcdfc
--- /dev/null
+++ b/python/openvino/runtime/coredla_device/inc/stream_controller_comms.h
@@ -0,0 +1,69 @@
+// Copyright 2023 Intel Corporation.
+//
+// This software and the related documents are Intel copyrighted materials,
+// and your use of them is governed by the express license under which they
+// were provided to you ("License"). Unless the License provides otherwise,
+// you may not use, modify, copy, publish, distribute, disclose or transmit
+// this software or the related documents without Intel's prior written
+// permission.
+//
+// This software and the related documents are provided as is, with no express
+// or implied warranties, other than those that are expressly stated in the
+// License.
+
+#pragma once
+#include <mutex>
+#include <string>
+#include <vector>
+#include "mmd_wrapper.h"
+#include "stream_controller_messages.h"
+
+template <class T>
+struct Payload : public T {
+  void* GetPayload() { return this; }
+  size_t GetSize() { return sizeof(*this); }
+};
+
+class BusyFlag {
+ public:
+  bool Lock();
+  void Release();
+
+ private:
+  std::recursive_mutex _mutex;
+  bool _busy = false;
+};
+
+class BusyCheck {
+ public:
+  BusyCheck(BusyFlag& busyFlag);
+  ~BusyCheck();
+  operator bool();
+
+ private:
+  BusyFlag& _busyFlag;
+  bool _haveLocked;
+};
+
+class StreamControllerComms {
+ public:
+  StreamControllerComms();
+  bool IsPresent();
+  Payload<StatusMessagePayload> GetStatus();
+  std::string GetStatusString(Payload<StatusMessagePayload>& statusPayload);
+  bool ScheduleItems(std::vector<Payload<CoreDlaJobPayload>> items);
+  bool Ping();
+  bool Initialize(uint32_t sourceBufferSize, uint32_t dropSourceBuffers, uint32_t numInferenceRequests);
+
+ private:
+  bool StatusMessageHandler(uint32_t payloadOffset);
+  MessageType ReceiveMessage();
+  bool SendMessage(MessageType, void* pPayload = nullptr, size_t size = 0);
+  MmdWrapper _mmdWrapper;
+  uint32_t _lastReceiveSequenceID = 0;
+  uint32_t _sendSequenceID = 0;
+  uint32_t _numBadMessages = 0;
+  const int _streamControllerInstance = 0;
+  Payload<StatusMessagePayload> _receivedStatusMessage;
+  BusyFlag _busyFlag;
+};
author	Eric Dao <eric@erickhangdao.com>	2025-03-10 17:54:31 -0400
committer	Eric Dao <eric@erickhangdao.com>	2025-03-10 17:54:31 -0400
commit	ab224e2e6ba65f5a369ec392f99cd8845ad06c98 (patch)
tree	a1e757e9341863ed52b8ad4c5a1c45933aab9da4 /python/openvino/runtime/coredla_device/inc
parent	40da1752f2c8639186b72f6838aa415e854d0b1d (diff)
download	thesis-master.tar.gz thesis-master.tar.bz2 thesis-master.zip