python/openvino/runtime/coredla_device/inc/coredla_device.h


1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144

// Copyright 2020-2023 Intel Corporation.
//
// This software and the related documents are Intel copyrighted materials,
// and your use of them is governed by the express license under which they
// were provided to you ("License"). Unless the License provides otherwise,
// you may not use, modify, copy, publish, distribute, disclose or transmit
// this software or the related documents without Intel's prior written
// permission.
//
// This software and the related documents are provided as is, with no express
// or implied warranties, other than those that are expressly stated in the
// License.

#pragma once

#include "compiled_result.h"          //dla::CompiledResult
#include "device.h"                   //Device
#include "device_memory_allocator.h"  //DeviceMemoryAllocator
#include "graph_job.h"                //GraphJob
#include "mmd_wrapper.h"              //MmdWrapper

#include <condition_variable>  //std::condition_variable
#include <cstdint>             //uint64_t
#include <map>                 //std::map
#include <memory>              //std::unique_ptr
#include <mutex>               //std::mutex
#include <vector>              //std::vector

class StreamControllerComms;

// The interface of the interrupt service routine dictates that all the data the ISR needs must be passed in through
// one pointer of type void *. Package it up here. WaitForDla() uses jobsWaited and jobsFinished to determine if a job
// has already finished or it still needs wait. The ISR only updates jobsFinished, so jobsWaited is only a member of
// CoreDlaDevice. The mutex and condition variable are used to synchronize between InterruptServiceRoutine() and
// WaitForDla(). All of these are replicated per CoreDLA IP instance, hence the use of vector.
// base_multiplier and prevCount are used to handle the jobsFinished wrap-around that could happen in the hardware CSR
// as the CSR is only 32-bit wide but the jobsFinished is 64-bit wide
struct InterruptServiceRoutineData {
  MmdWrapper* mmdWrapper;
  std::vector<uint64_t> jobsFinished;
  std::vector<uint32_t> base_multiplier;
  std::vector<uint32_t> prevCount;
  std::vector<uint32_t> desc_queue_diag;
  std::vector<std::mutex> isrMutex;
  std::vector<std::condition_variable> isrCondVar;
};

/*! DlaDevice class represents a DLA device mapped using the MMD + OPAE SW stack
 * On construction, dynamically loads MMD library at runtime and initialized the state of MMD
 * Implememts functions that wrap various MMD calls to read/write to DDR/CSR and process HW interrupts
 */
class CoreDlaDevice : public Device {
 public:
  GraphJob* CreateGraphJob(const dla::CompiledResult* compiledResult,
#ifndef USE_OLD_COREDLA_DEVICE
                           size_t numPipelines,
#else
                           uint64_t numPipelines,
#endif
                           int instance,
                           std::string AES_key,
                           std::string IV_key,
                           bool encryption_enabled,
                           // This param is unused for HW runtime! So why inlcude it? CoreDLA utilizes base pointers
                           // for both HW and SW emulator runtime. The software emulator has output file where as currently the
                           // HW runtime does not.
                           const std::string export_dir,
                           const std::string parameter_rom_export_dir);
  // Return number of DLA jobs completed till now
  // Used for debugging
  int GetNumInferencesCompleted(int instance) const override { return isrData_.jobsFinished.at(instance); }
  // Must be called when there are no active jobs on DLA
  // Returns the total time taken by DLA jobs on hardware (in milliseconds)
  double GetActiveHWTimeMs(int instance) const override;
  // Must be called when there are no active jobs on DLA
  // Returns the average of time taken per job (in milliseconds)
  // Avg Time per job < Active Time
  double GetAvgHWTimePerJobMs(uint64_t num_jobs, int instance) const override;
  // Must be called when there are no active jobs on DLA
  // Returns the number of memory read made by the input feature reader
  uint64_t GetNumInputFeatureMemoryReads(int instance) const override;
  // Must be called when there are no active jobs on DLA
  // Returns the number of memory read made by the filter reader
  uint64_t GetNumFilterMemoryReads(int instance) const override;
  // Must be called when there are no active jobs on DLA
  // Returns the number of memory writes made by the output feature writer
  uint64_t GetNumOutputFeatureMemoryWrites(int instance) const override;

 private:
  // Read one 32-bit value from the debug network, return value indicates whether read was successful. A read can fail
  // if the module number and address have not been implemented. The debug network is fault tolerant to both read
  // requests never being accepted as well as read responses never being produced.
  bool ReadDebugCsr(uint32_t moduleNum, uint32_t address, int instance, uint32_t& readData, bool verbose = false) const;

#ifndef USE_OLD_COREDLA_DEVICE
  // Must be called when there are no active jobs on DLA
  // Returns total number of clocks by DLA jobs on hardware.
  uint64_t GetClocksActive(int instance) const;

  // Must be called when there are no active jobs on DLA
  // Returns the clocks of all jobs
  uint64_t GetClocksAllJobs(int instance) const;
#endif

  uint64_t GetNumInputFeatureMemoryReadsTotal(int instance) const;

  uint64_t GetNumFilterMemoryReadsTotal(int instance) const;

  uint64_t GetNumOutputFeatureMemoryWritesTotal(int instance) const;

 public:
  // Modules attached to the debug network have a ROM to specify the offset and description of the registers. Traverse
  // this ROM, then return a map of key/value pairs, where the key is a human readable string describing what kind of
  // information the debug register contains, and the value is the data of the debug register.
  DebugNetworkData ReadDebugNetwork(int instance) const override;

  CoreDlaDevice(uint32_t waitForDlaTimeoutSeconds);
  ~CoreDlaDevice();
  int GetSizeCsrDescriptorQueue() const override;
  double GetCoreDlaClockFreq() const override;
  int GetNumInstances() const override { return numInstances_; }
  void WaitForDla(int instance, size_t threadId = 0, std::function<bool()> isCancelled = nullptr) override;  // threadId is optional and for debugging purpose only
  std::string SchedulerGetStatus() const override;
  bool InitializeScheduler(uint32_t sourceBufferSize, uint32_t dropSourceBuffers, uint32_t numInferenceRequests,
                           const std::string source_fifo_file="") override;

 private:
  std::unique_ptr<DeviceMemoryAllocator[]> ddrAllocator_;
  std::vector<std::unique_ptr<GraphJob>> allGraphJobs_;
  int numInstances_;
  MmdWrapper mmdWrapper_;
  InterruptServiceRoutineData isrData_;
  std::vector<uint64_t> jobsWaited_;
#ifndef USE_OLD_COREDLA_DEVICE
  std::vector<uint64_t> startClocksActive;
  std::vector<uint64_t> startClockAllJobs;
#endif
  std::vector<uint64_t> startNumInputFeatureMemoryReads;
  std::vector<uint64_t> startNumFilterMemoryReads;
  std::vector<uint64_t> startNumOutputFeatureMemoryWrites;
  std::shared_ptr<StreamControllerComms> spStreamControllerComms_;
  bool runtimePolling_;
  uint32_t waitForDlaTimeoutSeconds_;
};