python/openvino/runtime/coredla_device/src/coredla_device.cpp


1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574

// Copyright 2020-2023 Intel Corporation.
//
// This software and the related documents are Intel copyrighted materials,
// and your use of them is governed by the express license under which they
// were provided to you ("License"). Unless the License provides otherwise,
// you may not use, modify, copy, publish, distribute, disclose or transmit
// this software or the related documents without Intel's prior written
// permission.
//
// This software and the related documents are provided as is, with no express
// or implied warranties, other than those that are expressly stated in the
// License.

#include "coredla_device.h"     //CoreDlaDevice
#include "coredla_batch_job.h"  //CoreDlaBatchJob
#include "coredla_graph_job.h"  //CoreDlaBatchJob
#include "dla_dma_constants.h"  //DLA_DMA_CSR_OFFSET_***
#include "stream_controller_comms.h"

#include <algorithm>  //std::count
#include <cassert>    //assert
#include <chrono>     //std::chrono::seconds
#include <cstddef>    //size_t
#include <cstdlib>    //std::getenv
#ifndef USE_OLD_COREDLA_DEVICE
#include <cinttypes>  //printf formatters
#endif
#include <mutex>      //std::mutex
#include <stdexcept>  //std::runtime_error
#include <string>     //std::string
#include <iostream>   //std::cerr
#include <stdint.h>   //
#include <thread>
#include <cinttypes>

// Factory for the hardware-backed Device implementation.
//   archParams               - not referenced by this implementation.
//   waitForDlaTimeoutSeconds - forwarded to the CoreDlaDevice constructor.
// Returns an owning pointer to a freshly constructed CoreDlaDevice, typed as its Device base.
std::unique_ptr<Device> Device::MakeUnique(const arch_params* archParams,
                                           uint32_t waitForDlaTimeoutSeconds) {
  std::unique_ptr<Device> device(new CoreDlaDevice(waitForDlaTimeoutSeconds));
  return device;
}

// Interrupt handler handed to the MMD wrapper. `data` must point at an InterruptServiceRoutineData;
// `handle` is not referenced. The handler first acknowledges the interrupt on every instance, then,
// per instance, refreshes the descriptor-queue diagnostics and the 64-bit finished-job count and
// notifies every thread waiting on that instance's condition variable.
void InterruptServiceRoutine(int handle, void* data) {
  auto* const ctx = static_cast<InterruptServiceRoutineData*>(data);
  // Status bits are write-1-to-clear; the value 3 sets bits 0 and 1.
  constexpr int kClearInterruptStatus = 3;
  const int instanceCount = static_cast<int>(ctx->jobsFinished.size());

  // Pass 1: acknowledge the interrupt on all instances before touching any counters.
  for (int inst = 0; inst < instanceCount; ++inst) {
    ctx->mmdWrapper->WriteToCsr(inst, DLA_DMA_CSR_OFFSET_INTERRUPT_CONTROL, kClearInterruptStatus);
  }

  // Pass 2: snapshot diagnostics and completion counters, then publish them to waiters.
  for (int inst = 0; inst < instanceCount; ++inst) {
    ctx->desc_queue_diag[inst] = ctx->mmdWrapper->ReadFromCsr(inst, DLA_DMA_CSR_OFFSET_DESC_DIAGNOSTICS);

    // The hardware completion counter is 32 bits wide; a reading smaller than the previous one
    // is taken to mean that it wrapped around once.
    const uint32_t hwCount = ctx->mmdWrapper->ReadFromCsr(inst, DLA_DMA_CSR_OFFSET_COMPLETION_COUNT);
    if (hwCount < ctx->prevCount[inst]) {
      ++ctx->base_multiplier[inst];
    }
    ctx->prevCount[inst] = hwCount;

    // Each wrap is worth UINT32_MAX + 1 jobs, hence the extra "+ wraps" term.
    std::unique_lock<std::mutex> guard(ctx->isrMutex[inst]);
    const uint64_t wraps = ctx->base_multiplier[inst];
    ctx->jobsFinished[inst] = wraps * UINT32_MAX + hwCount + wraps;
    ctx->isrCondVar[inst].notify_all();
  }
}

// Constructs the device wrapper and brings the hardware into a known state:
//   1. selects polling vs. interrupt mode at compile time (COREDLA_RUNTIME_POLLING),
//   2. probes how many hardware instances respond to CSR accesses,
//   3. checks the license flag of instance 0 (throws std::runtime_error on an unknown value),
//   4. prepares the data shared with the interrupt service routine and either registers the ISR
//      or, in polling mode, disables the interrupt mask again,
//   5. records the current hardware counters as baselines,
//   6. creates one DDR allocator per instance,
//   7. optionally runs a DDR write/read-back test when the environment variable
//      COREDLA_RUNTIME_MEMORY_TEST is set.
//
// waitForDlaTimeoutSeconds is stored and used later by WaitForDla as its timeout.
CoreDlaDevice::CoreDlaDevice(uint32_t waitForDlaTimeoutSeconds)
: waitForDlaTimeoutSeconds_(waitForDlaTimeoutSeconds) {
#ifdef COREDLA_RUNTIME_POLLING
  runtimePolling_ = true;
#else
  runtimePolling_ = false;
#endif
  // mmdWrapper_ ctor runs first, which will open a handle to the MMD. Now determine the number of hardware instances
  // by writing a nonzero value to some offset and then reading it back. While trying to enable the interrupt
  // mask, test for this.
  numInstances_ = 0;
  for (int i = 0; i < mmdWrapper_.GetMaxInstances(); i++) {
    constexpr uint32_t allInterruptsMask = (1<<DLA_DMA_CSR_INTERRUPT_ERROR_BIT) | (1<<DLA_DMA_CSR_INTERRUPT_DONE_BIT);
    // clear any pending interrupts (there may be pending interrupts from last run), then enable mask for instance count
    mmdWrapper_.WriteToCsr(i, DLA_DMA_CSR_OFFSET_INTERRUPT_CONTROL, allInterruptsMask);
    mmdWrapper_.WriteToCsr(i, DLA_DMA_CSR_OFFSET_INTERRUPT_MASK, allInterruptsMask);
    uint32_t readData = mmdWrapper_.ReadFromCsr(i, DLA_DMA_CSR_OFFSET_INTERRUPT_MASK);
    // The highest index whose mask register reads back correctly determines the instance count.
    if (allInterruptsMask == readData) numInstances_ = i + 1;
  }
  LOG_AND_PRINT(Logger::INFO, "numInstances_: %d\n", numInstances_);
  assert(numInstances_ >= 1);
  jobsWaited_.resize(numInstances_, 0);

  uint32_t license = mmdWrapper_.ReadFromCsr(0, DLA_DMA_CSR_OFFSET_LICENSE_FLAG);
  if (license == 0) {
    DLA_LOG("Using unlicensed IP\n");
  }
  else if (license == 1) {
    DLA_LOG("Using licensed IP\n");
  }
  else {
    throw std::runtime_error("Unrecongnized license flag");
  }
#ifndef USE_OLD_COREDLA_DEVICE
  startClocksActive.resize(numInstances_, 0);
  startClockAllJobs.resize(numInstances_, 0);
#endif
  startNumInputFeatureMemoryReads.resize(numInstances_, 0);
  startNumFilterMemoryReads.resize(numInstances_, 0);
  startNumOutputFeatureMemoryWrites.resize(numInstances_, 0);

  // Package up the data that interrupt service routine needs
  isrData_.mmdWrapper = &mmdWrapper_;
  isrData_.jobsFinished = std::vector<uint64_t>(numInstances_, 0);
  isrData_.base_multiplier = std::vector<uint32_t>(numInstances_, 0);
  isrData_.prevCount = std::vector<uint32_t>(numInstances_, 0);
  isrData_.desc_queue_diag = std::vector<uint32_t>(numInstances_, 0);
  isrData_.isrMutex = std::vector<std::mutex>(numInstances_);
  isrData_.isrCondVar = std::vector<std::condition_variable>(numInstances_);

  if (runtimePolling_) {
    // disable the interrupt mask -- it was originally enabled to determine how many instances were present
    for (int i = 0; i < mmdWrapper_.GetMaxInstances(); i++) {
      constexpr uint32_t disableInterruptMaskValue = 0;
      mmdWrapper_.WriteToCsr(i, DLA_DMA_CSR_OFFSET_INTERRUPT_MASK, disableInterruptMaskValue);
    }
  }
  else {
    // register an interrupt handler
    mmdWrapper_.RegisterISR(&InterruptServiceRoutine, &isrData_);
  }

  // Record the current counters so that later queries can report values relative to construction time
  for(int i=0; i < numInstances_; i++) {
#ifndef USE_OLD_COREDLA_DEVICE
    jobsWaited_[i] = mmdWrapper_.ReadFromCsr(i, DLA_DMA_CSR_OFFSET_COMPLETION_COUNT);
    isrData_.jobsFinished[i] = jobsWaited_[i];

    startClocksActive[i] = GetClocksActive(i);
    startClockAllJobs[i] = GetClocksAllJobs(i);
#endif
    startNumInputFeatureMemoryReads.at(i) = GetNumInputFeatureMemoryReadsTotal(i);
    startNumFilterMemoryReads.at(i) = GetNumFilterMemoryReadsTotal(i);
    startNumOutputFeatureMemoryWrites.at(i) = GetNumOutputFeatureMemoryWritesTotal(i);
  }

  // Allocator needs access to mmd to write to CSR the start address of the shared intermediate buffer allocated in DDR
  ddrAllocator_ = std::unique_ptr<DeviceMemoryAllocator[]>(new DeviceMemoryAllocator[numInstances_]);
  for (int i = 0; i < numInstances_; i++) {
    ddrAllocator_[i].Initialize(mmdWrapper_.GetDDRSizePerInstance(), &mmdWrapper_);
  }

// Choose which data pattern you want, all zeros or all ones can also be useful for IP debug purposes
#define DEBUG_RUNTIME_MEMORY_TEST_PATTERN(ADDR, INDEX) ((ADDR * 12345) + (INDEX * 6789))
  //#define DEBUG_RUNTIME_MEMORY_TEST_PATTERN(ADDR,INDEX) (0)
  //#define DEBUG_RUNTIME_MEMORY_TEST_PATTERN(ADDR,INDEX) (0xffffffffffffffffULL)
  bool run_memory_test = getenv("COREDLA_RUNTIME_MEMORY_TEST") != nullptr;
  if (run_memory_test) {
    // Ensure host can access all of the device memory that is accessible by all CoreDLA instances
    // This is not necessarily the total device memory e.g. only 1 CoreDLA instance but 2 DDR banks
    DLA_LOG("starting memory test with %d instances\n", numInstances_);
    constexpr uint64_t CHUNK_SIZE = 1ULL << 20;  // one address check is 1 MB
    constexpr uint64_t WORDS_PER_CHUNK = CHUNK_SIZE / sizeof(uint64_t);
    const uint64_t ADDR_LIMIT = mmdWrapper_.GetDDRSizePerInstance();
    int mismatch = 0;
    // Owned by a vector so the staging buffer is released even if a DDR transfer throws.
    std::vector<uint64_t> data(WORDS_PER_CHUNK);

    for (int inst = 0; inst < numInstances_; ++inst) {
      // write to entire fpga ddr
      for (uint64_t addr = 0; addr < ADDR_LIMIT; addr += CHUNK_SIZE) {
        for (uint64_t index = 0; index < WORDS_PER_CHUNK; index++)
          data[index] = DEBUG_RUNTIME_MEMORY_TEST_PATTERN(addr, index);
        mmdWrapper_.WriteToDDR(inst, addr, CHUNK_SIZE, static_cast<const void*>(data.data()));
      }
      // read back entire fpga ddr and compare to expected
      for (uint64_t addr = 0; addr < ADDR_LIMIT; addr += CHUNK_SIZE) {
        mmdWrapper_.ReadFromDDR(inst, addr, CHUNK_SIZE, data.data());
        for (uint64_t index = 0; index < WORDS_PER_CHUNK; index++) {
          const uint64_t expected = DEBUG_RUNTIME_MEMORY_TEST_PATTERN(addr, index);
          if (data[index] != expected) {
            // Only the first 10 mismatches are printed; all of them are counted.
            if (mismatch < 10) {
#if (!defined(USE_OLD_COREDLA_DEVICE) || defined(_WIN32))
              DLA_LOG("memory test mismatch, addr %" PRIu64 ", index %" PRIu64 ", got %" PRIu64 ", expected %" PRIu64
                      "\n",
                      addr,
                      index,
                      data[index],
                      expected);
#else
              DLA_LOG("memory test mismatch, addr %lu, index %lu, got %lu, expected %lu\n",
                      addr,
                      index,
                      data[index],
                      expected);
#endif
            }
            mismatch++;
          }
        }
      }
    }
    DLA_LOG("finished memory test ");
    if (mismatch == 0) {
      DLA_LOG("SUCCESS\n");
    } else {
      DLA_LOG("FAILURE (%d mismatches)\n", mismatch);
    }
  }
}

// Masks the interrupts of every detected instance on teardown.
CoreDlaDevice::~CoreDlaDevice() {
  // A CoreDLA job may still be in flight when the runtime goes away (for example after an
  // assertion failure or an uncaught exception). The hardware would raise an interrupt on
  // completion although nothing is left to service it, so interrupts are shut off here.
  int idx = 0;
  while (idx < numInstances_) {
    // WriteToCsr may throw; a destructor must not let an exception escape, so the failure is
    // reported on stderr and the remaining instances are still processed.
    try {
      mmdWrapper_.WriteToCsr(idx, DLA_DMA_CSR_OFFSET_INTERRUPT_MASK, 0);
    } catch (const std::exception& err) {
      std::cerr << "Failed to shut off the DMA CSR interrupt mask due to " << err.what() << std::endl;
    }
    ++idx;
  }
}

// Creates a graph job for `compiledResult` on hardware instance `instance` and stores it in
// allGraphJobs_, which owns it.
//   compiledResult, numPipelines, instance - forwarded to CoreDlaGraphJob::MakeUnique.
//   AES_key, IV_key, encryption_enabled, parameter_rom_export_dir - not referenced here.
//   export_dir - unused in HW runtime. CoreDLA utilizes base pointers, which the SW emulator
//                utilizes this variable.
// Returns a non-owning pointer to the newly stored job.
GraphJob* CoreDlaDevice::CreateGraphJob(const dla::CompiledResult* compiledResult,
#ifndef USE_OLD_COREDLA_DEVICE
                                        size_t numPipelines,
#else
                                        uint64_t numPipelines,
#endif
                                        int instance,
                                        std::string AES_key,
                                        std::string IV_key,
                                        bool encryption_enabled,
                                        const std::string export_dir,
                                        const std::string parameter_rom_export_dir) {
  // `instance` indexes ddrAllocator_, so both bounds matter.
  assert(instance >= 0 && instance < numInstances_);
  (void) AES_key;
  (void) IV_key;
  (void) encryption_enabled;
  (void) export_dir;
  (void) parameter_rom_export_dir;
  // The factory result is already a temporary, so it binds to push_back's rvalue overload
  // without an explicit move.
  allGraphJobs_.push_back(CoreDlaGraphJob::MakeUnique(
      &ddrAllocator_[instance], &mmdWrapper_, compiledResult, numPipelines, instance, spStreamControllerComms_));
  return allGraphJobs_.back().get();
}

// Blocks until one more job than previously waited for has finished on `instance`, then advances
// jobsWaited_[instance] by one. This function must be called by a single thread.
//
//   instance             - hardware instance to wait on.
//   threadId             - only used in the timeout message.
//   isCancelledPredicate - optional; consulted in polling mode only. When it returns true the
//                          polling loop stops early (jobsWaited_ is still advanced afterwards).
//
// Throws std::runtime_error if the wait exceeds waitForDlaTimeoutSeconds_ without progress, or if
// the descriptor diagnostics report that the IP ran out of free inferences.
void CoreDlaDevice::WaitForDla(int instance, size_t threadId, std::function<bool()> isCancelledPredicate) {
  // ISR updates jobsFinished, if not enough jobs have finished then sleep until ISR runs again
  // it is possible that several hardware jobs could finish around the same time
  // by the time software handles the first interrupt, hardware could report that 2 jobs have
  // finished, for example the second time that waitForInterrupt runs, software already tracks
  // that the second job has finished and therefore don't need to sleep waiting for ISR
  std::unique_lock<std::mutex> isrMutexLock(isrData_.isrMutex[instance]);
  bool timedOut = false;
  const auto timeoutDuration = std::chrono::seconds(waitForDlaTimeoutSeconds_);

  if (runtimePolling_) {
    // A monotonic clock is used so that wall-clock adjustments cannot shorten or extend the timeout.
    const auto pollingDeadline = std::chrono::steady_clock::now() + timeoutDuration;

    while (isrData_.jobsFinished[instance] == jobsWaited_[instance]) {
      if (isCancelledPredicate && isCancelledPredicate()) {
        break;
      }

      // Update isrData_.jobsFinished[instance] here (polling)
      const uint32_t completionCount = mmdWrapper_.ReadFromCsr(instance, DLA_DMA_CSR_OFFSET_COMPLETION_COUNT);
      isrData_.jobsFinished[instance] = completionCount;
      // Only declare a timeout if the read above did not observe progress.
      if (isrData_.jobsFinished[instance] == jobsWaited_[instance] &&
          std::chrono::steady_clock::now() > pollingDeadline) {
        timedOut = true;
        break;
      }
    }
  } else {
    while (isrData_.jobsFinished[instance] == jobsWaited_[instance]) {
      // isrData_.jobsFinished[instance] is updated in the ISR
      if (std::cv_status::timeout == isrData_.isrCondVar[instance].wait_for(isrMutexLock, timeoutDuration)) {
        // The ISR may have published progress right as the wait expired; re-check under the lock.
        timedOut = (isrData_.jobsFinished[instance] == jobsWaited_[instance]);
        break;
      }
    }
  }

  if (timedOut) {
    std::string str_poll_vs_int = "interrupt";
    if (runtimePolling_) {
      str_poll_vs_int = "polling";
    }
    std::string timeoutMsg = "WaitForDla " + str_poll_vs_int + " timeout with threadId_" + std::to_string(threadId) + "\n";

    // Timeout has happened if we get here
    timeoutMsg += "If inference on one batch is expected to take more than " +
                  std::to_string(waitForDlaTimeoutSeconds_) +
                  " seconds, then increase WAIT_FOR_DLA_TIMEOUT in dlia_plugin.cpp and "
                  "recompile the runtime.\n";
    DLA_LOG("%s", timeoutMsg.c_str());  // this should always print, even if logging
                                        // verbosity is too low
    LOG(Logger::WARNING, "%s", timeoutMsg.c_str());
    std::string exceptionMsg = "FATAL ERROR: inference on FPGA did not complete";
    exceptionMsg += ", jobs finished " + std::to_string(isrData_.jobsFinished[instance]);
    exceptionMsg += ", jobs waited " + std::to_string(jobsWaited_[instance]);
    throw std::runtime_error(exceptionMsg);
  }

  // desc_queue_diag is refreshed by the ISR; a set OUT_OF_INFERENCES bit is treated as fatal.
  if ((isrData_.desc_queue_diag[instance] >> DLA_DMA_CSR_DESC_DIAGNOSTICS_OUT_OF_INFERENCES_BIT) & 0x01) {
    std::cerr << "ERROR: Out of free inferences on this IP. " <<
                 "The Intel FPGA AI suite cannot continue without a license!" << std::endl;
    std::string exceptionMsg = "Inference on FPGA exited with a license error";
    exceptionMsg += ", jobs finished " + std::to_string(isrData_.jobsFinished[instance]);
    exceptionMsg += ", jobs waited " + std::to_string(jobsWaited_[instance]);
    exceptionMsg += "\nPlease check your license. The Intel FPGA AI suite cannot continue without a license!";
    throw std::runtime_error(exceptionMsg);
  }

  jobsWaited_[instance]++;
}

#ifndef USE_OLD_COREDLA_DEVICE
// Returns the raw 64-bit "clocks active" counter of `instance`, assembled from two 32-bit CSR reads.
uint64_t CoreDlaDevice::GetClocksActive(int instance) const {
  // Read order matters: the anti-rollover feature of the 64-bit counters in the DMA CSR requires
  // the host to read the lower 32 bits first and the upper 32 bits immediately afterwards.
  const uint32_t lowWord = mmdWrapper_.ReadFromCsr(instance, DLA_DMA_CSR_OFFSET_CLOCKS_ACTIVE_LO);
  const uint32_t highWord = mmdWrapper_.ReadFromCsr(instance, DLA_DMA_CSR_OFFSET_CLOCKS_ACTIVE_HI);
  uint64_t combined = highWord;
  combined <<= 32;
  combined |= lowWord;
  return combined;
}

double CoreDlaDevice::GetActiveHWTimeMs(int instance) const {
  uint64_t clocksActive = GetClocksActive(instance) - startClocksActive[instance];
  // DDR clock freq is in MHz, so dividing by that would give microseconds, multiply by 1000 to get milliseconds
  return clocksActive / (1000.0 * mmdWrapper_.GetDDRClockFreq());
}

// Returns the raw 64-bit "clocks all jobs" counter of `instance`, assembled from two 32-bit CSR reads.
uint64_t CoreDlaDevice::GetClocksAllJobs(int instance) const {
  // Read order matters: the anti-rollover feature of the 64-bit counters in the DMA CSR requires
  // the host to read the lower 32 bits first and the upper 32 bits immediately afterwards.
  const uint32_t lowWord = mmdWrapper_.ReadFromCsr(instance, DLA_DMA_CSR_OFFSET_CLOCKS_ALL_JOBS_LO);
  const uint32_t highWord = mmdWrapper_.ReadFromCsr(instance, DLA_DMA_CSR_OFFSET_CLOCKS_ALL_JOBS_HI);
  uint64_t combined = highWord;
  combined <<= 32;
  combined |= lowWord;
  return combined;
}

double CoreDlaDevice::GetAvgHWTimePerJobMs(uint64_t num_jobs, int instance) const {
  uint64_t clocksAllJobs = GetClocksAllJobs(instance) - startClockAllJobs[instance];
  // DDR clock freq is in MHz, so dividing by that would give microseconds, multiply by 1000 to get milliseconds
  return clocksAllJobs / (1000.0 * mmdWrapper_.GetDDRClockFreq() * num_jobs);
}
#else
// Returns the "clocks active" counter of `instance` converted to milliseconds. This variant
// (USE_OLD_COREDLA_DEVICE) reports the raw counter without subtracting a baseline.
double CoreDlaDevice::GetActiveHWTimeMs(int instance) const {
  // Read order matters: the anti-rollover feature of the 64-bit counters in the DMA CSR requires
  // the host to read the lower 32 bits first and the upper 32 bits immediately afterwards.
  const uint32_t lowWord = mmdWrapper_.ReadFromCsr(instance, DLA_DMA_CSR_OFFSET_CLOCKS_ACTIVE_LO);
  const uint32_t highWord = mmdWrapper_.ReadFromCsr(instance, DLA_DMA_CSR_OFFSET_CLOCKS_ACTIVE_HI);
  uint64_t totalClocks = highWord;
  totalClocks <<= 32;
  totalClocks |= lowWord;
  // The DDR clock frequency is in MHz, i.e. clocks per microsecond; scaling by 1000 yields clocks
  // per millisecond.
  const auto clocksPerMs = 1000.0 * mmdWrapper_.GetDDRClockFreq();
  return totalClocks / clocksPerMs;
}

// Returns the "clocks all jobs" counter of `instance` converted to milliseconds and divided by
// `num_jobs`. This variant (USE_OLD_COREDLA_DEVICE) uses the raw counter without a baseline.
double CoreDlaDevice::GetAvgHWTimePerJobMs(uint64_t num_jobs, int instance) const {
  // Read order matters: the anti-rollover feature of the 64-bit counters in the DMA CSR requires
  // the host to read the lower 32 bits first and the upper 32 bits immediately afterwards.
  const uint32_t lowWord = mmdWrapper_.ReadFromCsr(instance, DLA_DMA_CSR_OFFSET_CLOCKS_ALL_JOBS_LO);
  const uint32_t highWord = mmdWrapper_.ReadFromCsr(instance, DLA_DMA_CSR_OFFSET_CLOCKS_ALL_JOBS_HI);
  uint64_t totalClocks = highWord;
  totalClocks <<= 32;
  totalClocks |= lowWord;
  // The DDR clock frequency is in MHz, i.e. clocks per microsecond; scaling by 1000 yields clocks
  // per millisecond. Multiplying by the job count turns the quotient into a per-job average.
  const auto divisor = 1000.0 * mmdWrapper_.GetDDRClockFreq() * num_jobs;
  return totalClocks / divisor;
}
#endif

// Returns the number of input-feature memory reads on `instance` since the baseline captured in
// startNumInputFeatureMemoryReads. `.at()` throws std::out_of_range for an invalid instance.
uint64_t CoreDlaDevice::GetNumInputFeatureMemoryReads(int instance) const {
  const uint64_t totalReads = GetNumInputFeatureMemoryReadsTotal(instance);
  const auto baseline = startNumInputFeatureMemoryReads.at(instance);
  return totalReads - baseline;
}

// Returns the number of filter memory reads on `instance` since the baseline captured in
// startNumFilterMemoryReads. `.at()` throws std::out_of_range for an invalid instance.
uint64_t CoreDlaDevice::GetNumFilterMemoryReads(int instance) const {
  const uint64_t totalReads = GetNumFilterMemoryReadsTotal(instance);
  const auto baseline = startNumFilterMemoryReads.at(instance);
  return totalReads - baseline;
}

// Returns the number of output-feature memory writes on `instance` since the baseline captured in
// startNumOutputFeatureMemoryWrites. `.at()` throws std::out_of_range for an invalid instance.
uint64_t CoreDlaDevice::GetNumOutputFeatureMemoryWrites(int instance) const {
  const uint64_t totalWrites = GetNumOutputFeatureMemoryWritesTotal(instance);
  const auto baseline = startNumOutputFeatureMemoryWrites.at(instance);
  return totalWrites - baseline;
}

// Returns the raw 64-bit input-feature read counter of `instance`, assembled from two 32-bit CSR reads.
uint64_t CoreDlaDevice::GetNumInputFeatureMemoryReadsTotal(int instance) const {
  // Read order matters: the anti-rollover feature of the 64-bit counters in the DMA CSR requires
  // the host to read the lower 32 bits first and the upper 32 bits immediately afterwards.
  const uint32_t lowWord = mmdWrapper_.ReadFromCsr(instance, DLA_DMA_CSR_OFFSET_INPUT_FEATURE_READ_COUNT_LO);
  const uint32_t highWord = mmdWrapper_.ReadFromCsr(instance, DLA_DMA_CSR_OFFSET_INPUT_FEATURE_READ_COUNT_HI);
  const uint64_t upperHalf = static_cast<uint64_t>(highWord) << 32;
  return upperHalf | static_cast<uint64_t>(lowWord);
}

// Returns the raw 64-bit filter read counter of `instance`, assembled from two 32-bit CSR reads.
uint64_t CoreDlaDevice::GetNumFilterMemoryReadsTotal(int instance) const {
  // Read order matters: the anti-rollover feature of the 64-bit counters in the DMA CSR requires
  // the host to read the lower 32 bits first and the upper 32 bits immediately afterwards.
  const uint32_t lowWord = mmdWrapper_.ReadFromCsr(instance, DLA_DMA_CSR_OFFSET_INPUT_FILTER_READ_COUNT_LO);
  const uint32_t highWord = mmdWrapper_.ReadFromCsr(instance, DLA_DMA_CSR_OFFSET_INPUT_FILTER_READ_COUNT_HI);
  const uint64_t upperHalf = static_cast<uint64_t>(highWord) << 32;
  return upperHalf | static_cast<uint64_t>(lowWord);
}

// Returns the raw 64-bit output-feature write counter of `instance`, assembled from two 32-bit CSR reads.
uint64_t CoreDlaDevice::GetNumOutputFeatureMemoryWritesTotal(int instance) const {
  // Read order matters: the anti-rollover feature of the 64-bit counters in the DMA CSR requires
  // the host to read the lower 32 bits first and the upper 32 bits immediately afterwards.
  const uint32_t lowWord = mmdWrapper_.ReadFromCsr(instance, DLA_DMA_CSR_OFFSET_OUTPUT_FEATURE_WRITE_COUNT_LO);
  const uint32_t highWord = mmdWrapper_.ReadFromCsr(instance, DLA_DMA_CSR_OFFSET_OUTPUT_FEATURE_WRITE_COUNT_HI);
  const uint64_t upperHalf = static_cast<uint64_t>(highWord) << 32;
  return upperHalf | static_cast<uint64_t>(lowWord);
}

// Reads a single 32-bit value from the debug network.
//   moduleNum - module selector, must fit in 8 bits.
//   address   - address within the module, must fit in 24 bits.
//   instance  - hardware instance whose CSR is used.
//   readData  - receives the value on success; left untouched on failure.
//   verbose   - when true, progress and failures are logged through DLA_LOG.
// Returns true if the read succeeded. A read may legitimately fail when the module number or
// address is not implemented; the debug network tolerates read requests that are never accepted
// as well as read responses that are never produced, so failure is reported by return value only.
bool CoreDlaDevice::ReadDebugCsr(
    uint32_t moduleNum, uint32_t address, int instance, uint32_t& readData, bool verbose) const {
  assert(moduleNum <= 0xff);
  assert(address <= 0xffffff);
  // Request word layout: module number in the top 8 bits, address in the low 24 bits.
  const uint32_t moduleField = (moduleNum & 0xff) << 24;
  const uint32_t addressField = address & 0xffffff;
  const uint32_t request = moduleField | addressField;

  // Step 1: hand the request to the CSR. Each write to this offset makes the debug network issue
  // exactly one read request.
  mmdWrapper_.WriteToCsr(instance, DLA_DMA_CSR_OFFSET_DEBUG_NETWORK_ADDR, request);

  // Optional sanity check: read the request back. This is answered by the CSR itself, the debug
  // network is not consulted for it.
  const uint32_t echoed = mmdWrapper_.ReadFromCsr(instance, DLA_DMA_CSR_OFFSET_DEBUG_NETWORK_ADDR);
  if (echoed != request) {
    if (verbose) {
      DLA_LOG("ReadDebugCsr addr read back check failed, expected %u, got %u\n", request, echoed);
    }
    return false;
  }

  // Step 2: the CSR caches the debug network's read response and exposes a valid flag for it.
  // Poll that flag once, plus up to five more times, before giving up. No exception is thrown on
  // failure because callers use failed reads to discover which external debug-capable modules are
  // attached; only once a module is known to be attached should a failure be escalated.
  uint32_t responseValid = mmdWrapper_.ReadFromCsr(instance, DLA_DMA_CSR_OFFSET_DEBUG_NETWORK_VALID);
  for (int retriesLeft = 5; !responseValid && retriesLeft > 0; --retriesLeft) {
    responseValid = mmdWrapper_.ReadFromCsr(instance, DLA_DMA_CSR_OFFSET_DEBUG_NETWORK_VALID);
  }
  if (!responseValid) {
    if (verbose) {
      DLA_LOG("ReadDebugCsr failed to read at addr %u\n", request);
    }
    return false;
  }

  // Step 3: the response is cached in the CSR, fetch it.
  readData = mmdWrapper_.ReadFromCsr(instance, DLA_DMA_CSR_OFFSET_DEBUG_NETWORK_DATA);
  if (verbose) {
    DLA_LOG("ReadDebugCsr, addr %u, data %u\n", request, readData);
  }
  return true;
}

// Helper that always throws std::runtime_error describing a failed debug network read. It is
// meant for the case where the runtime has already confirmed that a module is attached at this
// module number (an earlier read succeeded), so a further failure is unexpected.
void ReadDebugNetworkError(int moduleNum, int address, int instance) {
  std::string what("ReadDebugNetwork failure, instance ");
  what += std::to_string(instance);
  what += ", failed to read at module number ";
  what += std::to_string(moduleNum);
  what += " address ";
  what += std::to_string(address);
  throw std::runtime_error(what);
}

// Modules attached to the debug network have a ROM to specify the offset and description of the registers. Traverse
// this ROM, then return a map of key/value pairs, where the key is a human readable string describing what kind of
// information the debug register contains, and the value is the data of the debug register. Note that the runtime must
// completely traverse the ROM before reading any of the debug register values, and the runtime must read the debug
// register values in the order that they occur inside the ROM. Usually profiling counters are 64-bit values, and since
// there is only a 32-bit read available, it takes more than one read to get all the data. The counters could still be
// updating when the runtime wants to read them, so typically there is a freeze register which can be activated by
// reading from a special address (hardware will see an incoming read request to this address, that is how it knows to
// freeze the counters). The offset for the freeze register will typically go first in the ROM, even if it is not the
// first offset in the address space.
//
// All 256 module numbers are probed. A module whose very first ROM read fails is treated as absent; any later read
// failure throws std::runtime_error (via ReadDebugNetworkError). std::runtime_error is also thrown when the ROM
// contents are inconsistent (repeated descriptions at non-consecutive addresses, duplicate addresses). An empty
// description makes `.at(0)` throw std::out_of_range.
DebugNetworkData CoreDlaDevice::ReadDebugNetwork(int instance) const {
  DebugNetworkData result;
  for (uint32_t moduleNum = 0; moduleNum < 256; moduleNum++) {
    // Read the ROM to get the offsets and descriptions
    std::vector<uint32_t> offset;
    std::vector<std::string> description;
    uint32_t address = 0, readData = 0;
    bool first = true, success = false;
    while (1) {
      // Parse the offset
      success = ReadDebugCsr(moduleNum, address, instance, readData);
      if (!success) {
        // Failure to read is allowed on the very first time, it is assumed that no external debug-capable module is
        // attached to the debug network at this moduleNum
        if (first)
          break;
        else
          ReadDebugNetworkError(moduleNum, address, instance);
      }
      if (!readData) break;  // end of list is indicated with offset = 0
      first = false;
      address += 4;
      offset.push_back(readData);

      // Parse the description string: four characters per 32-bit word, least significant byte first,
      // terminated by the first zero byte.
      std::string str;
      bool endOfStringSeen = false;
      while (!endOfStringSeen) {
        success = ReadDebugCsr(moduleNum, address, instance, readData);
        if (!success) ReadDebugNetworkError(moduleNum, address, instance);
        address += 4;
        for (int i = 0; i < 4; i++) {
          if (readData & 0xff) {
            str += static_cast<char>(readData & 0xff);
            readData >>= 8;
          } else {
            endOfStringSeen = true;
            break;
          }
        }
      }
      description.push_back(str);
    }

    assert(offset.size() == description.size());

    // Read the profiling counters
    for (size_t i = 0; i < offset.size(); i++) {
      address = offset[i];
      success = ReadDebugCsr(moduleNum, address, instance, readData);
      if (!success) ReadDebugNetworkError(moduleNum, address, instance);

      int descriptionOccurenceCnt = result.count(description[i]);
      // Same description name should show up 2 times in maximum
      if (descriptionOccurenceCnt == 2) {
        throw std::runtime_error("More than 2 profiling counter descriptions are the same.");
      } else if (descriptionOccurenceCnt && (i == 0 || address - offset[i - 1] != 4)) {
        // same description existed before
        // check if the two addresses associated with the same description are consecutive (offset by 4).
        // With i == 0 there is no previous offset in this module to be consecutive with, so that case is
        // rejected as well instead of indexing before the start of `offset`.
        throw std::runtime_error("Profiling counter addresses with name: " + description[i] + " are not consecutive");
      } else if (std::count(offset.begin(), offset.end(), address) > 1) {
        // same address shows up more than once
        throw std::runtime_error("Duplicate profiling counter address: " + std::to_string(address));
      }

      // Avoid printing special stuff like _Freeze and _Unfreeze
      if (description[i].at(0) != '_') {
        if (descriptionOccurenceCnt) {
          // This key has existed before, concatenate 2 uint32_t into uint64_t
          result[description[i]] |= (static_cast<uint64_t>(readData) << 32);
        } else {
          result[description[i]] = readData;
        }
      }
    }
  }
  return result;
}

// Returns the compile-time constant DLA_DMA_CSR_DESCRIPTOR_QUEUE_LOGICAL_SIZE.
int CoreDlaDevice::GetSizeCsrDescriptorQueue() const {
  return DLA_DMA_CSR_DESCRIPTOR_QUEUE_LOGICAL_SIZE;
}

// Returns the CoreDLA clock frequency as reported by the MMD wrapper.
double CoreDlaDevice::GetCoreDlaClockFreq() const {
  return mmdWrapper_.GetCoreDlaClockFreq();
}

std::string CoreDlaDevice::SchedulerGetStatus() const {
  if (!spStreamControllerComms_) return "";

  Payload<StatusMessagePayload> statusPayload = spStreamControllerComms_->GetStatus();
  return spStreamControllerComms_->GetStatusString(statusPayload);
}

// Creates the stream controller communication object and, if a stream controller is present,
// initializes it with the given settings.
//   sourceBufferSize, dropSourceBuffers, numInferenceRequests - forwarded to Initialize().
//   source_fifo_file - not referenced by this implementation.
// Returns the result of Initialize() when a stream controller is present. Otherwise the
// communication object is released again and false is returned.
bool CoreDlaDevice::InitializeScheduler(uint32_t sourceBufferSize,
                                        uint32_t dropSourceBuffers,
                                        uint32_t numInferenceRequests,
                                        const std::string source_fifo_file) {
  spStreamControllerComms_ = std::make_shared<StreamControllerComms>();

  // Without a stream controller there is nothing to initialize; drop the object so that other
  // members see a null pointer.
  if (!spStreamControllerComms_->IsPresent()) {
    spStreamControllerComms_.reset();
    return false;
  }

  return spStreamControllerComms_->Initialize(sourceBufferSize, dropSourceBuffers, numInferenceRequests);
}