1 files changed, 144 insertions, 0 deletions
diff --git a/python/openvino/runtime/coredla_device/inc/coredla_device.h b/python/openvino/runtime/coredla_device/inc/coredla_device.h
new file mode 100644
index 0000000..2a04fa8
--- /dev/null
+++ b/python/openvino/runtime/coredla_device/inc/coredla_device.h
@@ -0,0 +1,144 @@
+// Copyright 2020-2023 Intel Corporation.
+//
+// This software and the related documents are Intel copyrighted materials,
+// and your use of them is governed by the express license under which they
+// were provided to you ("License"). Unless the License provides otherwise,
+// you may not use, modify, copy, publish, distribute, disclose or transmit
+// this software or the related documents without Intel's prior written
+// permission.
+//
+// This software and the related documents are provided as is, with no express
+// or implied warranties, other than those that are expressly stated in the
+// License.
+
+#pragma once
+
+#include "compiled_result.h"          //dla::CompiledResult
+#include "device.h"                   //Device
+#include "device_memory_allocator.h"  //DeviceMemoryAllocator
+#include "graph_job.h"                //GraphJob
+#include "mmd_wrapper.h"              //MmdWrapper
+
+#include <condition_variable>  //std::condition_variable
+#include <cstdint>             //uint64_t
+#include <map>                 //std::map
+#include <memory>              //std::unique_ptr
+#include <mutex>               //std::mutex
+#include <vector>              //std::vector
+
+class StreamControllerComms;
+
+// The interface of the interrupt service routine dictates that all the data the ISR needs must be passed in through
+// one pointer of type void *. Package it up here. WaitForDla() uses jobsWaited and jobsFinished to determine if a job
+// has already finished or it still needs wait. The ISR only updates jobsFinished, so jobsWaited is only a member of
+// CoreDlaDevice. The mutex and condition variable are used to synchronize between InterruptServiceRoutine() and
+// WaitForDla(). All of these are replicated per CoreDLA IP instance, hence the use of vector.
+// base_multiplier and prevCount are used to handle the jobsFinished wrap-around that could happen in the hardware CSR
+// as the CSR is only 32-bit wide but the jobsFinished is 64-bit wide
+struct InterruptServiceRoutineData {
+  MmdWrapper* mmdWrapper;
+  std::vector<uint64_t> jobsFinished;
+  std::vector<uint32_t> base_multiplier;
+  std::vector<uint32_t> prevCount;
+  std::vector<uint32_t> desc_queue_diag;
+  std::vector<std::mutex> isrMutex;
+  std::vector<std::condition_variable> isrCondVar;
+};
+
+/*! DlaDevice class represents a DLA device mapped using the MMD + OPAE SW stack
+ * On construction, dynamically loads MMD library at runtime and initialized the state of MMD
+ * Implememts functions that wrap various MMD calls to read/write to DDR/CSR and process HW interrupts
+ */
+class CoreDlaDevice : public Device {
+ public:
+  GraphJob* CreateGraphJob(const dla::CompiledResult* compiledResult,
+#ifndef USE_OLD_COREDLA_DEVICE
+                           size_t numPipelines,
+#else
+                           uint64_t numPipelines,
+#endif
+                           int instance,
+                           std::string AES_key,
+                           std::string IV_key,
+                           bool encryption_enabled,
+                           // This param is unused for HW runtime! So why inlcude it? CoreDLA utilizes base pointers
+                           // for both HW and SW emulator runtime. The software emulator has output file where as currently the
+                           // HW runtime does not.
+                           const std::string export_dir,
+                           const std::string parameter_rom_export_dir);
+  // Return number of DLA jobs completed till now
+  // Used for debugging
+  int GetNumInferencesCompleted(int instance) const override { return isrData_.jobsFinished.at(instance); }
+  // Must be called when there are no active jobs on DLA
+  // Returns the total time taken by DLA jobs on hardware (in milliseconds)
+  double GetActiveHWTimeMs(int instance) const override;
+  // Must be called when there are no active jobs on DLA
+  // Returns the average of time taken per job (in milliseconds)
+  // Avg Time per job < Active Time
+  double GetAvgHWTimePerJobMs(uint64_t num_jobs, int instance) const override;
+  // Must be called when there are no active jobs on DLA
+  // Returns the number of memory read made by the input feature reader
+  uint64_t GetNumInputFeatureMemoryReads(int instance) const override;
+  // Must be called when there are no active jobs on DLA
+  // Returns the number of memory read made by the filter reader
+  uint64_t GetNumFilterMemoryReads(int instance) const override;
+  // Must be called when there are no active jobs on DLA
+  // Returns the number of memory writes made by the output feature writer
+  uint64_t GetNumOutputFeatureMemoryWrites(int instance) const override;
+
+ private:
+  // Read one 32-bit value from the debug network, return value indicates whether read was successful. A read can fail
+  // if the module number and address have not been implemented. The debug network is fault tolerant to both read
+  // requests never being accepted as well as read responses never being produced.
+  bool ReadDebugCsr(uint32_t moduleNum, uint32_t address, int instance, uint32_t& readData, bool verbose = false) const;
+
+#ifndef USE_OLD_COREDLA_DEVICE
+  // Must be called when there are no active jobs on DLA
+  // Returns total number of clocks by DLA jobs on hardware.
+  uint64_t GetClocksActive(int instance) const;
+
+  // Must be called when there are no active jobs on DLA
+  // Returns the clocks of all jobs
+  uint64_t GetClocksAllJobs(int instance) const;
+#endif
+
+  uint64_t GetNumInputFeatureMemoryReadsTotal(int instance) const;
+
+  uint64_t GetNumFilterMemoryReadsTotal(int instance) const;
+
+  uint64_t GetNumOutputFeatureMemoryWritesTotal(int instance) const;
+
+ public:
+  // Modules attached to the debug network have a ROM to specify the offset and description of the registers. Traverse
+  // this ROM, then return a map of key/value pairs, where the key is a human readable string describing what kind of
+  // information the debug register contains, and the value is the data of the debug register.
+  DebugNetworkData ReadDebugNetwork(int instance) const override;
+
+  CoreDlaDevice(uint32_t waitForDlaTimeoutSeconds);
+  ~CoreDlaDevice();
+  int GetSizeCsrDescriptorQueue() const override;
+  double GetCoreDlaClockFreq() const override;
+  int GetNumInstances() const override { return numInstances_; }
+  void WaitForDla(int instance, size_t threadId = 0, std::function<bool()> isCancelled = nullptr) override;  // threadId is optional and for debugging purpose only
+  std::string SchedulerGetStatus() const override;
+  bool InitializeScheduler(uint32_t sourceBufferSize, uint32_t dropSourceBuffers, uint32_t numInferenceRequests,
+                           const std::string source_fifo_file="") override;
+
+ private:
+  std::unique_ptr<DeviceMemoryAllocator[]> ddrAllocator_;
+  std::vector<std::unique_ptr<GraphJob>> allGraphJobs_;
+  int numInstances_;
+  MmdWrapper mmdWrapper_;
+  InterruptServiceRoutineData isrData_;
+  std::vector<uint64_t> jobsWaited_;
+#ifndef USE_OLD_COREDLA_DEVICE
+  std::vector<uint64_t> startClocksActive;
+  std::vector<uint64_t> startClockAllJobs;
+#endif
+  std::vector<uint64_t> startNumInputFeatureMemoryReads;
+  std::vector<uint64_t> startNumFilterMemoryReads;
+  std::vector<uint64_t> startNumOutputFeatureMemoryWrites;
+  std::shared_ptr<StreamControllerComms> spStreamControllerComms_;
+  bool runtimePolling_;
+  uint32_t waitForDlaTimeoutSeconds_;
+};