summaryrefslogtreecommitdiff
path: root/python/openvino/runtime/coredla_device/src/coredla_device.cpp
diff options
context:
space:
mode:
Diffstat (limited to 'python/openvino/runtime/coredla_device/src/coredla_device.cpp')
-rw-r--r--python/openvino/runtime/coredla_device/src/coredla_device.cpp574
1 files changed, 574 insertions, 0 deletions
diff --git a/python/openvino/runtime/coredla_device/src/coredla_device.cpp b/python/openvino/runtime/coredla_device/src/coredla_device.cpp
new file mode 100644
index 0000000..b28d8a2
--- /dev/null
+++ b/python/openvino/runtime/coredla_device/src/coredla_device.cpp
@@ -0,0 +1,574 @@
+// Copyright 2020-2023 Intel Corporation.
+//
+// This software and the related documents are Intel copyrighted materials,
+// and your use of them is governed by the express license under which they
+// were provided to you ("License"). Unless the License provides otherwise,
+// you may not use, modify, copy, publish, distribute, disclose or transmit
+// this software or the related documents without Intel's prior written
+// permission.
+//
+// This software and the related documents are provided as is, with no express
+// or implied warranties, other than those that are expressly stated in the
+// License.
+
+#include "coredla_device.h" //CoreDlaDevice
+#include "coredla_batch_job.h" //CoreDlaBatchJob
+#include "coredla_graph_job.h" //CoreDlaBatchJob
+#include "dla_dma_constants.h" //DLA_DMA_CSR_OFFSET_***
+#include "stream_controller_comms.h"
+
+#include <algorithm> //std::count
+#include <cassert> //assert
+#include <chrono> //std::chrono::seconds
+#include <cstddef> //size_t
+#include <cstdlib> //std::getenv
+#ifndef USE_OLD_COREDLA_DEVICE
+#include <cinttypes> //printf formatters
+#endif
+#include <mutex> //std::mutex
+#include <stdexcept> //std::runtime_error
+#include <string> //std::string
+#include <iostream> //std::cerr
+#include <stdint.h> //
+#include <thread>
+#include <cinttypes>
+
+std::unique_ptr<Device> Device::MakeUnique(const arch_params* archParams,
+ uint32_t waitForDlaTimeoutSeconds) {
+ return std::unique_ptr<Device>(new CoreDlaDevice(waitForDlaTimeoutSeconds));
+}
+
+void InterruptServiceRoutine(int handle, void* data) {
+ InterruptServiceRoutineData* isrData = static_cast<InterruptServiceRoutineData*>(data);
+ // clear interrupt status -- write 1 to clear that bit
+ constexpr int writeDataToClearInterruptStatus = 3;
+ const int numInstances = static_cast<int>(isrData->jobsFinished.size());
+ for (int i = 0; i < numInstances; i++) {
+ isrData->mmdWrapper->WriteToCsr(i, DLA_DMA_CSR_OFFSET_INTERRUPT_CONTROL, writeDataToClearInterruptStatus);
+ }
+ for (int i = 0; i < numInstances; i++) {
+ isrData->desc_queue_diag[i] = isrData->mmdWrapper->ReadFromCsr(i, DLA_DMA_CSR_OFFSET_DESC_DIAGNOSTICS);
+ // ask the csr how many jobs have finished
+ uint32_t completionCount = isrData->mmdWrapper->ReadFromCsr(i, DLA_DMA_CSR_OFFSET_COMPLETION_COUNT);
+ // check if the completionCount wraps around (overflow detection) and save this information
+ if (isrData->prevCount[i] > completionCount)
+ isrData->base_multiplier[i] ++;
+ isrData->prevCount[i] = completionCount;
+ // we add base_multiplier to account for the fact that a wrap around is actually an increment of 1
+ std::unique_lock<std::mutex> isrMutexLock(isrData->isrMutex[i]);
+ isrData->jobsFinished[i] = (uint64_t) isrData->base_multiplier[i] * UINT32_MAX + completionCount + isrData->base_multiplier[i];
+ isrData->isrCondVar[i].notify_all();
+ }
+}
+
+CoreDlaDevice::CoreDlaDevice(uint32_t waitForDlaTimeoutSeconds)
+: waitForDlaTimeoutSeconds_(waitForDlaTimeoutSeconds) {
+#ifdef COREDLA_RUNTIME_POLLING
+ runtimePolling_ = true;
+#else
+ runtimePolling_ = false;
+#endif
+ // mmdWrapper_ ctor runs first, which will open a handle to the MMD. Now determine the number of hardware instances
+ // by writing a nonzero value to some offset and then reading it back. While trying to enable the interrupt
+ // mask, test for this.
+ numInstances_ = 0;
+ for (int i = 0; i < mmdWrapper_.GetMaxInstances(); i++) {
+ constexpr uint32_t allInterruptsMask = (1<<DLA_DMA_CSR_INTERRUPT_ERROR_BIT) | (1<<DLA_DMA_CSR_INTERRUPT_DONE_BIT);
+ // clear any pending interrupts (there may be pending interrupts from last run), then enable mask for instance count
+ mmdWrapper_.WriteToCsr(i, DLA_DMA_CSR_OFFSET_INTERRUPT_CONTROL, allInterruptsMask);
+ mmdWrapper_.WriteToCsr(i, DLA_DMA_CSR_OFFSET_INTERRUPT_MASK, allInterruptsMask);
+ uint32_t readData = mmdWrapper_.ReadFromCsr(i, DLA_DMA_CSR_OFFSET_INTERRUPT_MASK);
+ if (allInterruptsMask == readData) numInstances_ = i + 1;
+ }
+ LOG_AND_PRINT(Logger::INFO, "numInstances_: %d\n", numInstances_);
+ assert(numInstances_ >= 1);
+ jobsWaited_.resize(numInstances_, 0);
+
+ uint32_t license = mmdWrapper_.ReadFromCsr(0, DLA_DMA_CSR_OFFSET_LICENSE_FLAG);
+ if (license == 0) {
+ DLA_LOG("Using unlicensed IP\n");
+ }
+ else if (license == 1) {
+ DLA_LOG("Using licensed IP\n");
+ }
+ else {
+ throw std::runtime_error("Unrecongnized license flag");
+ }
+#ifndef USE_OLD_COREDLA_DEVICE
+ startClocksActive.resize(numInstances_, 0);
+ startClockAllJobs.resize(numInstances_, 0);
+#endif
+ startNumInputFeatureMemoryReads.resize(numInstances_, 0);
+ startNumFilterMemoryReads.resize(numInstances_, 0);
+ startNumOutputFeatureMemoryWrites.resize(numInstances_, 0);
+
+ // Package up the data that interrupt service routine needs
+ isrData_.mmdWrapper = &mmdWrapper_;
+ isrData_.jobsFinished = std::vector<uint64_t>(numInstances_, 0);
+ isrData_.base_multiplier = std::vector<uint32_t>(numInstances_, 0);
+ isrData_.prevCount = std::vector<uint32_t>(numInstances_, 0);
+ isrData_.desc_queue_diag = std::vector<uint32_t>(numInstances_, 0);
+ isrData_.isrMutex = std::vector<std::mutex>(numInstances_);
+ isrData_.isrCondVar = std::vector<std::condition_variable>(numInstances_);
+
+ if (runtimePolling_) {
+ // disable the interrupt mask -- it was originally enabled to determine how many instances were present
+ for (int i = 0; i < mmdWrapper_.GetMaxInstances(); i++) {
+ constexpr uint32_t disableInterruptMaskValue = 0;
+ mmdWrapper_.WriteToCsr(i, DLA_DMA_CSR_OFFSET_INTERRUPT_MASK, disableInterruptMaskValue);
+ }
+ }
+ else {
+ // register an interrupt handler
+ mmdWrapper_.RegisterISR(&InterruptServiceRoutine, &isrData_);
+ }
+
+ // Record the current counters
+ for(int i=0; i < numInstances_; i++) {
+#ifndef USE_OLD_COREDLA_DEVICE
+ jobsWaited_[i] = mmdWrapper_.ReadFromCsr(i, DLA_DMA_CSR_OFFSET_COMPLETION_COUNT);
+ isrData_.jobsFinished[i] = jobsWaited_[i];
+
+ startClocksActive[i] = GetClocksActive(i);
+ startClockAllJobs[i] = GetClocksAllJobs(i);
+#endif
+ startNumInputFeatureMemoryReads.at(i) = GetNumInputFeatureMemoryReadsTotal(i);
+ startNumFilterMemoryReads.at(i) = GetNumFilterMemoryReadsTotal(i);
+ startNumOutputFeatureMemoryWrites.at(i) = GetNumOutputFeatureMemoryWritesTotal(i);
+ }
+
+ // Allocator needs access to mmd to write to CSR the start address of the shared intermediate buffer allocated in DDR
+ ddrAllocator_ = std::unique_ptr<DeviceMemoryAllocator[]>(new DeviceMemoryAllocator[numInstances_]);
+ for (int i = 0; i < numInstances_; i++) {
+ ddrAllocator_[i].Initialize(mmdWrapper_.GetDDRSizePerInstance(), &mmdWrapper_);
+ }
+
+// Choose which data pattern you want, all zeros or all ones can also be useful for IP debug purposes
+#define DEBUG_RUNTIME_MEMORY_TEST_PATTERN(ADDR, INDEX) ((ADDR * 12345) + (INDEX * 6789))
+ //#define DEBUG_RUNTIME_MEMORY_TEST_PATTERN(ADDR,INDEX) (0)
+ //#define DEBUG_RUNTIME_MEMORY_TEST_PATTERN(ADDR,INDEX) (0xffffffffffffffffULL)
+ bool run_memory_test = getenv("COREDLA_RUNTIME_MEMORY_TEST") != nullptr;
+ if (run_memory_test) {
+ // Ensure host can access all of the device memory that is accessible by all CoreDLA instances
+ // This is not necessarily the total device memory e.g. only 1 CoreDLA instance but 2 DDR banks
+ DLA_LOG("starting memory test with %d instances\n", numInstances_);
+ constexpr uint64_t CHUNK_SIZE = 1ULL << 20; // one address check is 1 MB
+ const uint64_t ADDR_LIMIT = mmdWrapper_.GetDDRSizePerInstance();
+ int mismatch = 0;
+ uint64_t expected;
+ uint64_t* data = new uint64_t[CHUNK_SIZE / sizeof(uint64_t)];
+
+ for (int inst = 0; inst < numInstances_; ++inst) {
+ // write to entire fpga ddr
+ for (uint64_t addr = 0; addr < ADDR_LIMIT; addr += CHUNK_SIZE) {
+ for (uint64_t index = 0; index < CHUNK_SIZE / sizeof(uint64_t); index++)
+ data[index] = DEBUG_RUNTIME_MEMORY_TEST_PATTERN(addr, index);
+ mmdWrapper_.WriteToDDR(inst, addr, CHUNK_SIZE, static_cast<const void*>(data));
+ }
+ // read back entire fpga ddr and compare to expected
+ for (uint64_t addr = 0; addr < ADDR_LIMIT; addr += CHUNK_SIZE) {
+ mmdWrapper_.ReadFromDDR(inst, addr, CHUNK_SIZE, data);
+ for (uint64_t index = 0; index < CHUNK_SIZE / sizeof(uint64_t); index++) {
+ expected = DEBUG_RUNTIME_MEMORY_TEST_PATTERN(addr, index);
+ if (data[index] != expected) {
+ if (mismatch < 10) {
+#if (!defined(USE_OLD_COREDLA_DEVICE) || defined(_WIN32))
+ DLA_LOG("memory test mismatch, addr %" PRIu64 ", index %" PRIu64 ", got %" PRIu64 ", expected %" PRIu64
+ "\n",
+ addr,
+ index,
+ data[index],
+ expected);
+#else
+ DLA_LOG("memory test mismatch, addr %lu, index %lu, got %lu, expected %lu\n",
+ addr,
+ index,
+ data[index],
+ expected);
+#endif
+ }
+ mismatch++;
+ }
+ }
+ }
+ }
+ delete[] data;
+ DLA_LOG("finished memory test ");
+ if (mismatch == 0) {
+ DLA_LOG("SUCCESS\n");
+ } else {
+ DLA_LOG("FAILURE (%d mismatches)\n", mismatch);
+ }
+ }
+}
+
+CoreDlaDevice::~CoreDlaDevice() {
+ // Avoid the scenario where some CoreDLA job has been started but something goes wrong
+ // in the runtime which causes it to exit, e.g. assertion failure or uncaught exception.
+ // CoreDLA will still raise an interrupt when the job finishes, yet the runtime will no
+ // longer be able to deal with it. Better to shut off interurpts.
+ for (int instance = 0; instance < numInstances_; instance++) {
+ // MmDWrapper.WriteToCSR might throw exception, and the destructor should not have
+ // unhandled exception, so we need to handle exceptions internally
+ try {
+ mmdWrapper_.WriteToCsr(instance, DLA_DMA_CSR_OFFSET_INTERRUPT_MASK, 0);
+ } catch (const std::exception& e) {
+ std::cerr << "Failed to shut off the DMA CSR interrupt mask due to " << e.what() << std::endl;
+ }
+ }
+}
+
+GraphJob* CoreDlaDevice::CreateGraphJob(const dla::CompiledResult* compiledResult,
+#ifndef USE_OLD_COREDLA_DEVICE
+ size_t numPipelines,
+#else
+ uint64_t numPipelines,
+#endif
+ int instance,
+ std::string AES_key,
+ std::string IV_key,
+ bool encryption_enabled,
+ const std::string export_dir,
+ const std::string parameter_rom_export_dir) {
+ assert(instance < numInstances_);
+ (void) export_dir; // unused in HW runtime. CoreDLA utilizes base pointers, which the SW emulator utilizes this variable. We void it here.
+ allGraphJobs_.push_back(move(
+ CoreDlaGraphJob::MakeUnique(&ddrAllocator_[instance], &mmdWrapper_, compiledResult, numPipelines, instance, spStreamControllerComms_)));
+ return (allGraphJobs_.back()).get();
+}
+
+// This function must be called by a single thread
+void CoreDlaDevice::WaitForDla(int instance, size_t threadId, std::function<bool()> isCancelledPredicate) {
+ // ISR updates jobsFinished, if not enough jobs have finished then sleep until ISR runs again
+ // it is possible that several hardware jobs could finish around the same time
+ // by the time software handles the first interrupt, hardware could report that 2 jobs have
+ // finished, for example the second time that waitForInterrupt runs, software already tracks
+ // that the second job has finished and therefore don't need to sleep waiting for ISR
+ std::unique_lock<std::mutex> isrMutexLock(isrData_.isrMutex[instance]);
+ uint32_t completionCount = 0;
+ bool timedOut = false;
+ auto timeoutDuration = std::chrono::seconds(waitForDlaTimeoutSeconds_);
+
+ if (runtimePolling_) {
+ std::chrono::time_point<std::chrono::system_clock> pollingEndingTime =
+ std::chrono::system_clock::now() + timeoutDuration;
+
+ while (isrData_.jobsFinished[instance] == jobsWaited_[instance]) {
+ // Update isrData_.jobsFinished[instance] here (polling)
+ if (isCancelledPredicate and isCancelledPredicate()) {
+ break;
+ }
+
+ completionCount = mmdWrapper_.ReadFromCsr(instance, DLA_DMA_CSR_OFFSET_COMPLETION_COUNT);
+ isrData_.jobsFinished[instance] = completionCount;
+ if (std::chrono::system_clock::now() > pollingEndingTime) {
+ timedOut = true;
+ break;
+ }
+ }
+ } else {
+ while (isrData_.jobsFinished[instance] == jobsWaited_[instance]) {
+ // isrData_.jobsFinished[instance] is updated in the ISR
+ if (std::cv_status::timeout == isrData_.isrCondVar[instance].wait_for(isrMutexLock, timeoutDuration)) {
+ timedOut = true;
+ break;
+ }
+ }
+ }
+
+ if (timedOut) {
+ std::string str_poll_vs_int = "interrupt";
+ if (runtimePolling_) {
+ str_poll_vs_int = "polling";
+ }
+ std::string timeoutMsg = "WaitForDla " + str_poll_vs_int + " timeout with threadId_" + std::to_string(threadId) + "\n";
+
+ // Timeout has happened if we get here
+ timeoutMsg += "If inference on one batch is expected to take more than " +
+ std::to_string(waitForDlaTimeoutSeconds_) +
+ " seconds, then increase WAIT_FOR_DLA_TIMEOUT in dlia_plugin.cpp and "
+ "recompile the runtime.\n";
+ DLA_LOG("%s", timeoutMsg.c_str()); // this should always print, even if logging
+ // verbosity is too low
+ LOG(Logger::WARNING, "%s", timeoutMsg.c_str());
+ std::string exceptionMsg = "FATAL ERROR: inference on FPGA did not complete";
+ exceptionMsg += ", jobs finished " + std::to_string(isrData_.jobsFinished[instance]);
+ exceptionMsg += ", jobs waited " + std::to_string(jobsWaited_[instance]);
+ throw std::runtime_error(exceptionMsg);
+ }
+
+ if ((isrData_.desc_queue_diag[instance] >> DLA_DMA_CSR_DESC_DIAGNOSTICS_OUT_OF_INFERENCES_BIT) & 0x01) {
+ std::cerr << "ERROR: Out of free inferences on this IP. " <<
+ "The Intel FPGA AI suite cannot continue without a license!" << std::endl;
+ std::string exceptionMsg = "Inference on FPGA exited with a license error";
+ exceptionMsg += ", jobs finished " + std::to_string(isrData_.jobsFinished[instance]);
+ exceptionMsg += ", jobs waited " + std::to_string(jobsWaited_[instance]);
+ exceptionMsg += "\nPlease check your license. The Intel FPGA AI suite cannot continue without a license!";
+ throw std::runtime_error(exceptionMsg);
+ }
+
+ jobsWaited_[instance]++;
+}
+
+#ifndef USE_OLD_COREDLA_DEVICE
+uint64_t CoreDlaDevice::GetClocksActive(int instance) const {
+ //Important: To satisfy the anti-rollover feature of the 64-bit counters in the DMA CSR
+ //the host must first read the lower 32-bit of the counter,
+ //then immediately read the higher 32-bit of the counter
+ uint32_t clocksActiveLo = mmdWrapper_.ReadFromCsr(instance, DLA_DMA_CSR_OFFSET_CLOCKS_ACTIVE_LO);
+ uint32_t clocksActiveHi = mmdWrapper_.ReadFromCsr(instance, DLA_DMA_CSR_OFFSET_CLOCKS_ACTIVE_HI);
+ return (((uint64_t)clocksActiveHi) << 32) | clocksActiveLo;
+}
+
+double CoreDlaDevice::GetActiveHWTimeMs(int instance) const {
+ uint64_t clocksActive = GetClocksActive(instance) - startClocksActive[instance];
+ // DDR clock freq is in MHz, so dividing by that would give microseconds, multiply by 1000 to get milliseconds
+ return clocksActive / (1000.0 * mmdWrapper_.GetDDRClockFreq());
+}
+
+uint64_t CoreDlaDevice::GetClocksAllJobs(int instance) const {
+ //Important: To satisfy the anti-rollover feature of the 64-bit counters in the DMA CSR
+ //the host must first read the lower 32-bit of the counter,
+ //then immediately read the higher 32-bit of the counter
+ uint32_t clocksAllJobsLo = mmdWrapper_.ReadFromCsr(instance, DLA_DMA_CSR_OFFSET_CLOCKS_ALL_JOBS_LO);
+ uint32_t clocksAllJobsHi = mmdWrapper_.ReadFromCsr(instance, DLA_DMA_CSR_OFFSET_CLOCKS_ALL_JOBS_HI);
+ return (((uint64_t)clocksAllJobsHi) << 32) | clocksAllJobsLo;
+}
+
+double CoreDlaDevice::GetAvgHWTimePerJobMs(uint64_t num_jobs, int instance) const {
+ uint64_t clocksAllJobs = GetClocksAllJobs(instance) - startClockAllJobs[instance];
+ // DDR clock freq is in MHz, so dividing by that would give microseconds, multiply by 1000 to get milliseconds
+ return clocksAllJobs / (1000.0 * mmdWrapper_.GetDDRClockFreq() * num_jobs);
+}
+#else
+double CoreDlaDevice::GetActiveHWTimeMs(int instance) const {
+ //Important: To satisfy the anti-rollover feature of the 64-bit counters in the DMA CSR
+ //the host must first read the lower 32-bit of the counter,
+ //then immediately read the higher 32-bit of the counter
+ uint32_t clocksActiveLo = mmdWrapper_.ReadFromCsr(instance, DLA_DMA_CSR_OFFSET_CLOCKS_ACTIVE_LO);
+ uint32_t clocksActiveHi = mmdWrapper_.ReadFromCsr(instance, DLA_DMA_CSR_OFFSET_CLOCKS_ACTIVE_HI);
+ uint64_t clocksActive = (((uint64_t)clocksActiveHi) << 32) | clocksActiveLo;
+ // DDR clock freq is in MHz, so dividing by that would give microseconds, multiply by 1000 to get milliseconds
+ return clocksActive / (1000.0 * mmdWrapper_.GetDDRClockFreq());
+}
+
+double CoreDlaDevice::GetAvgHWTimePerJobMs(uint64_t num_jobs, int instance) const {
+ //Important: To satisfy the anti-rollover feature of the 64-bit counters in the DMA CSR
+ //the host must first read the lower 32-bit of the counter,
+ //then immediately read the higher 32-bit of the counter
+ uint32_t clocksAllJobsLo = mmdWrapper_.ReadFromCsr(instance, DLA_DMA_CSR_OFFSET_CLOCKS_ALL_JOBS_LO);
+ uint32_t clocksAllJobsHi = mmdWrapper_.ReadFromCsr(instance, DLA_DMA_CSR_OFFSET_CLOCKS_ALL_JOBS_HI);
+ uint64_t clocksAllJobs = (((uint64_t)clocksAllJobsHi) << 32) | clocksAllJobsLo;
+ // DDR clock freq is in MHz, so dividing by that would give microseconds, multiply by 1000 to get milliseconds
+ return clocksAllJobs / (1000.0 * mmdWrapper_.GetDDRClockFreq() * num_jobs);
+}
+#endif
+
+uint64_t CoreDlaDevice::GetNumInputFeatureMemoryReads(int instance) const {
+ return GetNumInputFeatureMemoryReadsTotal(instance) - startNumInputFeatureMemoryReads.at(instance);
+}
+
+uint64_t CoreDlaDevice::GetNumFilterMemoryReads(int instance) const {
+ return GetNumFilterMemoryReadsTotal(instance) - startNumFilterMemoryReads.at(instance);
+}
+
+uint64_t CoreDlaDevice::GetNumOutputFeatureMemoryWrites(int instance) const {
+ return GetNumOutputFeatureMemoryWritesTotal(instance) - startNumOutputFeatureMemoryWrites.at(instance);
+}
+
+uint64_t CoreDlaDevice::GetNumInputFeatureMemoryReadsTotal(int instance) const {
+ //Important: To satisfy the anti-rollover feature of the 64-bit counters in the DMA CSR
+ //the host must first read the lower 32-bit of the counter,
+ //then immediately read the higher 32-bit of the counter
+ uint32_t numIFReadsLo = mmdWrapper_.ReadFromCsr(instance, DLA_DMA_CSR_OFFSET_INPUT_FEATURE_READ_COUNT_LO);
+ uint32_t numIFReadsHi = mmdWrapper_.ReadFromCsr(instance, DLA_DMA_CSR_OFFSET_INPUT_FEATURE_READ_COUNT_HI);
+ uint64_t numIFReads = (((uint64_t) numIFReadsHi) << 32) | ((uint64_t) numIFReadsLo);
+ return numIFReads;
+}
+
+uint64_t CoreDlaDevice::GetNumFilterMemoryReadsTotal(int instance) const {
+ //Important: To satisfy the anti-rollover feature of the 64-bit counters in the DMA CSR
+ //the host must first read the lower 32-bit of the counter,
+ //then immediately read the higher 32-bit of the counter
+ uint32_t numWeightReadsLo = mmdWrapper_.ReadFromCsr(instance, DLA_DMA_CSR_OFFSET_INPUT_FILTER_READ_COUNT_LO);
+ uint32_t numWeightReadsHi = mmdWrapper_.ReadFromCsr(instance, DLA_DMA_CSR_OFFSET_INPUT_FILTER_READ_COUNT_HI);
+ uint64_t numWeightReads = (((uint64_t) numWeightReadsHi) << 32) | ((uint64_t) numWeightReadsLo);
+ return numWeightReads;
+}
+
+uint64_t CoreDlaDevice::GetNumOutputFeatureMemoryWritesTotal(int instance) const {
+ //Important: To satisfy the anti-rollover feature of the 64-bit counters in the DMA CSR
+ //the host must first read the lower 32-bit of the counter,
+ //then immediately read the higher 32-bit of the counter
+ uint32_t numOFReadsLo = mmdWrapper_.ReadFromCsr(instance, DLA_DMA_CSR_OFFSET_OUTPUT_FEATURE_WRITE_COUNT_LO);
+ uint32_t numOFReadsHi = mmdWrapper_.ReadFromCsr(instance, DLA_DMA_CSR_OFFSET_OUTPUT_FEATURE_WRITE_COUNT_HI);
+ uint64_t numOFReads = (((uint64_t) numOFReadsHi) << 32) | ((uint64_t) numOFReadsLo);
+ return numOFReads;
+}
+
+// Read one 32-bit value from the debug network, return value indicates whether read was successful. A read can fail if
+// the module number and address have not been implemented. The debug network is fault tolerant to both read requests
+// never being accepted as well as read responses never being produced.
+bool CoreDlaDevice::ReadDebugCsr(
+ uint32_t moduleNum, uint32_t address, int instance, uint32_t& readData, bool verbose) const {
+ assert(moduleNum <= 0xff);
+ assert(address <= 0xffffff);
+ uint32_t addr = ((moduleNum & 0xff) << 24) | (address & 0xffffff);
+
+ // Step 1: send the address that the debug network will use to issue a read request. Writing once to this CSR offset
+ // will cause the debug network to issue one read request.
+ mmdWrapper_.WriteToCsr(instance, DLA_DMA_CSR_OFFSET_DEBUG_NETWORK_ADDR, addr);
+
+ // Optional step: read back the value sent to CSR, sanity check that it is correct. Note this is all handled
+ // internally to the CSR, e.g. the CSR does not go ask the debug network what address it sent.
+ uint32_t addrCheck = mmdWrapper_.ReadFromCsr(instance, DLA_DMA_CSR_OFFSET_DEBUG_NETWORK_ADDR);
+ if (addr != addrCheck) {
+ if (verbose) DLA_LOG("ReadDebugCsr addr read back check failed, expected %u, got %u\n", addr, addrCheck);
+ return false;
+ }
+
+ // Step 2: the debug network should produce a read response which is cached by the CSR. Poll the corresponding status
+ // register inside the CSR until this happens, or until the runtime decides to give up and declare the read a failure.
+ // Do not throw an exception if the read fails, it is allowed to fail if the runtime is trying to figure out which
+ // external debug-capable modules are attached to the debug network. Once the runtime has determined that a module is
+ // attached, only then should read failures should cause an exception.
+ uint32_t isValid = mmdWrapper_.ReadFromCsr(instance, DLA_DMA_CSR_OFFSET_DEBUG_NETWORK_VALID);
+ int retry = 5;
+ while (!isValid && retry) {
+ --retry;
+ isValid = mmdWrapper_.ReadFromCsr(instance, DLA_DMA_CSR_OFFSET_DEBUG_NETWORK_VALID);
+ }
+ if (!isValid) {
+ if (verbose) DLA_LOG("ReadDebugCsr failed to read at addr %u\n", addr);
+ return false;
+ }
+
+ // Step 3: runtime has confirmed the CSR has a cached the read response from debug network, now go and get the value.
+ readData = mmdWrapper_.ReadFromCsr(instance, DLA_DMA_CSR_OFFSET_DEBUG_NETWORK_DATA);
+ if (verbose) DLA_LOG("ReadDebugCsr, addr %u, data %u\n", addr, readData);
+ return true;
+}
+
+// This is a helper function that throws an exception if runtime fails to read from the debug network. This should only
+// be called if the runtime has already confirmed that a module is attached to the debug network i.e. a previous read to
+// this module number had succeeded.
+void ReadDebugNetworkError(int moduleNum, int address, int instance) {
+ std::string msg = "ReadDebugNetwork failure, instance " + std::to_string(instance) +
+ ", failed to read at module number " + std::to_string(moduleNum) + " address " +
+ std::to_string(address);
+ throw std::runtime_error(msg);
+}
+
+// Modules attached to the debug network have a ROM to specify the offset and description of the registers. Traverse
+// this ROM, then return a map of key/value pairs, where the key is a human readable string describing what kind of
+// information the debug register contains, and the value is the data of the debug register. Note that the runtime must
+// completely tranverse the ROM before reading any of the debug register values, and the runtime must read the debug
+// register values in the order that they occur inside the ROM. Usually profiling counters are 64-bit values, and since
+// there is only a 32-bit read available, it takes more than one read to get all the data. The counters could still be
+// updating when the runtime wants to read them, so typically there is a freeze register which can be activated by
+// reading from a special address (hardware will see an incoming read request to this address, that is how it knows to
+// freeze the counters). The offset for the freeze register will typically go first in the ROM, even if it is not the
+// first offset in the address space.
+DebugNetworkData CoreDlaDevice::ReadDebugNetwork(int instance) const {
+ DebugNetworkData result;
+ for (uint32_t moduleNum = 0; moduleNum < 256; moduleNum++) {
+ // Read the ROM to get the offsets and descriptions
+ std::vector<uint32_t> offset;
+ std::vector<std::string> description;
+ uint32_t address = 0, readData = 0;
+ bool first = true, success = false;
+ while (1) {
+ // Parse the offset
+ success = ReadDebugCsr(moduleNum, address, instance, readData);
+ if (!success) {
+ // Failure to read is allowed on the very first time, it is assumed that no external debug-capable module is
+ // attached to the debug network at this moduleNum
+ if (first)
+ break;
+ else
+ ReadDebugNetworkError(moduleNum, address, instance);
+ }
+ if (!readData) break; // end of list is indicated with offset = 0
+ first = false;
+ address += 4;
+ offset.push_back(readData);
+
+ // Parse the description string
+ std::string str;
+ bool endOfStringSeen = false;
+ while (!endOfStringSeen) {
+ success = ReadDebugCsr(moduleNum, address, instance, readData);
+ if (!success) ReadDebugNetworkError(moduleNum, address, instance);
+ address += 4;
+ for (int i = 0; i < 4; i++) {
+ if (readData & 0xff) {
+ str += ((char)(readData & 0xff));
+ readData >>= 8;
+ } else {
+ endOfStringSeen = true;
+ break;
+ }
+ }
+ }
+ description.push_back(str);
+ }
+
+ assert(offset.size() == description.size());
+
+ // Read the profiling counters
+ for (size_t i = 0; i < offset.size(); i++) {
+ address = offset[i];
+ success = ReadDebugCsr(moduleNum, address, instance, readData);
+ if (!success) ReadDebugNetworkError(moduleNum, address, instance);
+
+ int descriptionOccurenceCnt = result.count(description[i]);
+ // Same description name should show up 2 times in maximum
+ if (descriptionOccurenceCnt == 2) {
+ throw std::runtime_error("More than 2 profiling counter descriptions are the same.");
+ } else if (descriptionOccurenceCnt && (address - offset[i - 1] != 4)) {
+ // same description existed before
+ // check if the two addresses associatede with the same decription are consecutive (offset by 4)
+ throw std::runtime_error("Profiling counter addresses with name: " + description[i] + " are not consecutive");
+ } else if (std::count(offset.begin(), offset.end(), address) > 1) {
+ // same address shows up more than once
+ throw std::runtime_error("Duplicate profiling counter address: " + address);
+ }
+
+ // Avoid printing special stuff like _Freeze and _Unfreeze
+ if (description[i].at(0) != '_') {
+ if (descriptionOccurenceCnt) {
+ // This key has existed before, concatenate 2 uint32_t into uint64_t
+ result[description[i]] |= (((uint64_t)readData) << 32);
+ } else {
+ result[description[i]] = readData;
+ }
+ }
+ }
+ }
+ return result;
+}
+
+int CoreDlaDevice::GetSizeCsrDescriptorQueue() const { return DLA_DMA_CSR_DESCRIPTOR_QUEUE_LOGICAL_SIZE; }
+
+double CoreDlaDevice::GetCoreDlaClockFreq() const { return mmdWrapper_.GetCoreDlaClockFreq(); }
+
+std::string CoreDlaDevice::SchedulerGetStatus() const {
+ if (!spStreamControllerComms_) return "";
+
+ Payload<StatusMessagePayload> statusPayload = spStreamControllerComms_->GetStatus();
+ return spStreamControllerComms_->GetStatusString(statusPayload);
+}
+
+bool CoreDlaDevice::InitializeScheduler(uint32_t sourceBufferSize,
+ uint32_t dropSourceBuffers,
+ uint32_t numInferenceRequests,
+ const std::string source_fifo_file) {
+ spStreamControllerComms_ = std::make_shared<StreamControllerComms>();
+ if (spStreamControllerComms_->IsPresent()) {
+ bool initOK = spStreamControllerComms_->Initialize(sourceBufferSize, dropSourceBuffers, numInferenceRequests);
+ return initOK;
+ } else {
+ spStreamControllerComms_.reset();
+ return false;
+ }
+}