Diffstat (limited to 'python/openvino/demo/ip/intel_ai_ip/verilog/dla_dma_csr.sv')
-rw-r--r--	python/openvino/demo/ip/intel_ai_ip/verilog/dla_dma_csr.sv	841
1 file changed, 841 insertions, 0 deletions
diff --git a/python/openvino/demo/ip/intel_ai_ip/verilog/dla_dma_csr.sv b/python/openvino/demo/ip/intel_ai_ip/verilog/dla_dma_csr.sv
new file mode 100644
index 0000000..ae7006c
--- /dev/null
+++ b/python/openvino/demo/ip/intel_ai_ip/verilog/dla_dma_csr.sv
@@ -0,0 +1,841 @@
+// Copyright 2020-2020 Intel Corporation.
+//
+// This software and the related documents are Intel copyrighted materials,
+// and your use of them is governed by the express license under which they
+// were provided to you ("License"). Unless the License provides otherwise,
+// you may not use, modify, copy, publish, distribute, disclose or transmit
+// this software or the related documents without Intel's prior written
+// permission.
+//
+// This software and the related documents are provided as is, with no express
+// or implied warranties, other than those that are expressly stated in the
+// License.
+
+
+// This module implements the CSR for DMA. It also includes the descriptor queue
+// and interrupt request generator. The CSR is implemented with a RAM. Certain
+// values are kept live in registers, such as the interrupt control and mask. This
+// makes it easier to detect when a change has happened (instead of trying to do a
+// read-modify-write with the RAM).
+//
+// The AXI4-lite slave interface is usually going to backpressure PCIe. There is
+// a state machine which allows one outstanding read request or one outstanding
+// write request at a time (a write request can remain outstanding if the write
+// acknowledgement is backpressured, which AXI allows). There is a register which
+// tracks whether the last request was a read or a write; this enables round-robin
+// arbitration. Each request takes a few clock cycles to process, as the address
+// needs to be decoded to determine if a write is allowed to commit to the RAM, or
+// if we need to use read data from one of the registers instead of the RAM.
+//
+// Special offsets are defined in dla_dma_constants.svh (included below). Writing to
+// the DLA_DMA_CSR_OFFSET_INPUT_OUTPUT_BASE_ADDR offset will cause one unit of work
+// to be enqueued in the descriptor queue. Currently this involves writing 8 values
+// to a fifo, which are then consumed by the config reader. Internal to the config
+// reader, 4 values go to the config reader address generator, and the other 4 go to
+// the config reader intercept.
+//
+// Beware the following assumptions about how the host issues requests to this CSR:
+// - no bursts (required by AXI4 lite)
+// - byte enables are assumed to be all 1 (no partial word access)
+// - all addresses must be word aligned (e.g. if CSR_DATA_BYTES=4 then the bottom 2 bits of address must be 0)
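+//
+// As a concrete illustration, a host driver might enqueue one unit of work with a
+// sequence like the following (hypothetical pseudo-code; axi_lite_write32() and the
+// value names are illustrative, not part of this module). The final write, to
+// DLA_DMA_CSR_OFFSET_INPUT_OUTPUT_BASE_ADDR, is the one that triggers the enqueue:
+//
+//   axi_lite_write32(DLA_DMA_CSR_OFFSET_CONFIG_BASE_ADDR,       config_base);
+//   axi_lite_write32(DLA_DMA_CSR_OFFSET_CONFIG_RANGE_MINUS_TWO, config_words - 2);
+//   axi_lite_write32(DLA_DMA_CSR_OFFSET_INTERMEDIATE_BASE_ADDR, intermediate_base);
+//   axi_lite_write32(DLA_DMA_CSR_OFFSET_INPUT_OUTPUT_BASE_ADDR, io_base); //enqueues the job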
+
+`resetall
+`undefineall
+`default_nettype none
+`include "dla_acl_parameter_assert.svh"
+
+module dla_dma_csr #(
+ parameter int CSR_ADDR_WIDTH, //width of the byte address signal, determines CSR address space size, e.g. 11 bit address = 2048 bytes, the largest size that uses only 1 M20K
+ parameter int CSR_DATA_BYTES, //width of the CSR data path, typically 4 bytes
+ parameter int CONFIG_DATA_BYTES, //data width of the config network output port, typically 4 bytes, the descriptor queue matches this so that config decode can be reused
+ parameter int CONFIG_READER_DATA_BYTES, //data width of the config network input port, needed by config reader address generator for loop update
+
+  parameter int ENABLE_INPUT_STREAMING,    //nonzero to enable the input streaming interface
+  parameter int ENABLE_OUTPUT_STREAMING,   //nonzero to enable the output streaming interface
+  parameter int ENABLE_ON_CHIP_PARAMETERS  //nonzero if config parameters are kept on chip and read continually
+
+ ) (
+ input wire clk_ddr,
+ input wire clk_pcie,
+ input wire clk_dla,
+ input wire i_sclrn_ddr, //active low reset that has already been synchronized to clk_ddr
+ input wire i_resetn_async, //active low reset that has NOT been synchronized to any clock, only to be consumed by dcfifo
+
+ //updates for interrupt, runs on ddr clock
+ input wire i_token_done, //feature writer reports it is done
+ input wire i_token_stream_started, //input streamer is reading the first word
+  input  wire           i_stream_received_first_word, //core has received the first word of a streamed job
+  input  wire           i_stream_sent_last_word,      //core has sent the last word of a streamed job
+ input wire i_token_error, //dla has encountered some error, assert high for one clock cycle to report it to host (assuming mask bit is 1)
+ input wire i_license_flag,
+ input wire i_token_out_of_inferences,
+
+  //snoop signals for the input feature, output feature, and filter LSUs' core <--> fabric traffic
+ //run on clk_ddr
+ input wire i_input_feature_rvalid,
+ input wire i_input_feature_rready,
+ input wire i_input_filter_rvalid,
+ input wire i_input_filter_rready,
+ input wire i_output_feature_wvalid,
+ input wire i_output_feature_wready,
+
+ //interrupt request to pcie, runs on pcie clock
+ output logic o_interrupt_level, //level sensitive interrupt
+
+ //read side of descriptor queue goes to config reader, runs on ddr clock
+ output logic o_config_valid,
+ output logic [8*CONFIG_DATA_BYTES-1:0] o_config_data,
+ output logic o_config_for_intercept, //0 = goes to config reader addr gen, 1 = goes to config reader intercept
+ input wire i_config_ready,
+
+  //debug network AXI-4 lite interface, read request and read response channels, runs on clk_dla
+ output logic o_debug_network_arvalid,
+ output logic [8*CSR_DATA_BYTES-1:0] o_debug_network_araddr,
+ input wire i_debug_network_arready,
+ input wire i_debug_network_rvalid,
+ input wire [8*CSR_DATA_BYTES-1:0] i_debug_network_rdata,
+ output logic o_debug_network_rready,
+
+ //AXI4-lite slave interface for host control, runs on ddr clock
+ //no bursts, byte enables are assumed to be all 1, all addresses must be word aligned (e.g. if CSR_DATA_BYTES=4 then the bottom 2 bits of address must be 0)
+ input wire i_csr_arvalid,
+ input wire [CSR_ADDR_WIDTH-1:0] i_csr_araddr,
+ output logic o_csr_arready,
+ output logic o_csr_rvalid,
+ output logic [8*CSR_DATA_BYTES-1:0] o_csr_rdata,
+ input wire i_csr_rready,
+ input wire i_csr_awvalid,
+ input wire [CSR_ADDR_WIDTH-1:0] i_csr_awaddr,
+ output logic o_csr_awready,
+ input wire i_csr_wvalid,
+ input wire [8*CSR_DATA_BYTES-1:0] i_csr_wdata,
+ output logic o_csr_wready,
+ output logic o_csr_bvalid,
+ input wire i_csr_bready,
+
+ //reset request for the whole ip, runs on ddr clock
+ output logic o_request_ip_reset,
+
+ //output bit to start/stop streaming interface
+ output logic o_streaming_active
+);
+
+
+ /////////////////////////////////
+ // Parameter legality checks //
+ /////////////////////////////////
+
+ //signal widths cannot be trivial
+ `DLA_ACL_PARAMETER_ASSERT(CSR_DATA_BYTES >= 1)
+ `DLA_ACL_PARAMETER_ASSERT(CONFIG_DATA_BYTES >= 1)
+
+ //csr address space cannot be trivial
+ `DLA_ACL_PARAMETER_ASSERT(2**CSR_ADDR_WIDTH > CONFIG_DATA_BYTES)
+
+ //offsets must be within address space
+ localparam int CSR_LO_ADDR = $clog2(CSR_DATA_BYTES); //number of LSBs that must be 0 in order for byte address to be word aligned
+ localparam int CSR_WORD_ADDR_WIDTH = CSR_ADDR_WIDTH - CSR_LO_ADDR;
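+  //worked example, assuming the typical CSR_DATA_BYTES=4: CSR_LO_ADDR=$clog2(4)=2, so
+  //byte address 0x18 maps to word address 0x18>>2=0x6, and only byte addresses whose
+  //bottom 2 bits are 0 are legal (word aligned, as required by the header comment)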
+
+
+
+ /////////////////
+ // Constants //
+ /////////////////
+ `include "dla_dma_constants.svh"
+ //special offsets -- these values are defined in one place and shared between hardware and software
+ //the constants from the dla_dma_constants.svh header file that CSR cares about are named DLA_DMA_CSR_OFFSET_**** and DLA_DMA_CSR_INTERRUPT_****
+
+ //state machine
+ enum {
+ STATE_IDLE_BIT,
+ STATE_READ_ACCEPT_BIT,
+ STATE_READ_ADDR_BIT,
+ STATE_READ_DATA_BIT,
+ STATE_WRITE_ACCEPT_BIT,
+ STATE_WRITE_COMMIT_BIT,
+ STATE_DESCRIPTOR_BIT,
+ STATE_AWAIT_RESET_BIT
+ } index;
+
+ enum logic [index.num()-1:0] {
+ //1-hot encodings
+ STATE_IDLE = 1 << STATE_IDLE_BIT,
+ STATE_READ_ACCEPT = 1 << STATE_READ_ACCEPT_BIT,
+ STATE_READ_ADDR = 1 << STATE_READ_ADDR_BIT,
+ STATE_READ_DATA = 1 << STATE_READ_DATA_BIT,
+ STATE_WRITE_ACCEPT = 1 << STATE_WRITE_ACCEPT_BIT,
+ STATE_WRITE_COMMIT = 1 << STATE_WRITE_COMMIT_BIT,
+ STATE_DESCRIPTOR = 1 << STATE_DESCRIPTOR_BIT,
+ STATE_AWAIT_RESET = 1 << STATE_AWAIT_RESET_BIT,
+ XXX = 'x
+ } state;
+
+ localparam int MAX_JOBS_ACTIVE = 64; //upper bounded by how many descriptors the queue can hold
+ localparam int JOBS_ACTIVE_WIDTH = $clog2(MAX_JOBS_ACTIVE+1);
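+  //worked example: MAX_JOBS_ACTIVE=64 gives JOBS_ACTIVE_WIDTH=$clog2(64+1)=7, i.e. 7
+  //bits, enough to represent every job count in the range 0..64 inclusive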
+
+
+
+ ///////////////
+ // Signals //
+ ///////////////
+
+ //ram
+ logic ram_wr_en;
+ logic [CSR_WORD_ADDR_WIDTH-1:0] ram_wr_addr, ram_rd_addr;
+ logic [8*CSR_DATA_BYTES-1:0] ram_wr_data, ram_rd_data;
+
+ //descriptor queue
+ logic descriptor_queue_forced_write, descriptor_queue_full, descriptor_diagnostics_almost_full;
+ logic [8*CONFIG_DATA_BYTES:0] descriptor_queue_data;
+ logic [2:0] descriptor_words_read;
+ logic first_word_of_descriptor_being_read, jobs_active_is_nonzero, core_jobs_active_is_nonzero;
+ logic [JOBS_ACTIVE_WIDTH-1:0] jobs_active, core_jobs_active;
+
+  //Performance counter connections
+ logic [31:0] total_clocks_active_lo, total_clocks_active_hi;
+ logic [31:0] total_core_clocks_active_lo, total_core_clocks_active_hi;
+ logic [31:0] total_clocks_for_all_jobs_lo, total_clocks_for_all_jobs_hi;
+ logic [31:0] number_of_input_feature_reads_lo, number_of_input_feature_reads_hi;
+ logic [31:0] number_of_input_filter_reads_lo, number_of_input_filter_reads_hi;
+ logic [31:0] number_of_output_feature_writes_lo, number_of_output_feature_writes_hi;
+
+ //state machine
+ logic previous_was_write;
+ logic [3:0] descriptor_count;
+
+ //specific offsets are implemented in registers instead of RAM
+ logic interrupt_control_error, interrupt_control_done, interrupt_mask_error, interrupt_mask_done;
+ logic [8*CSR_DATA_BYTES-1:0] completion_count;
+ logic descriptor_diagnostics_overflow;
+
+ //address decode for specific offsets that are implemented in registers or require some action to be taken
+ logic write_to_interrupt_control, read_from_interrupt_control, write_to_interrupt_mask, read_from_interrupt_mask;
+ logic write_to_ram, read_from_desc_diagnostics, read_from_completion_count, enqueue_descriptor;
+ logic read_from_clocks_active_lo, read_from_clocks_active_hi, read_from_clocks_all_jobs_lo, read_from_clocks_all_jobs_hi;
+ logic read_from_core_clocks_active_lo, read_from_core_clocks_active_hi;
+ logic read_from_input_feature_reads_lo, read_from_input_feature_reads_hi;
+ logic read_from_input_filter_reads_lo, read_from_input_filter_reads_hi;
+ logic read_from_output_feature_writes_lo, read_from_output_feature_writes_hi;
+ logic write_to_debug_network_addr, read_from_debug_network_valid, read_from_debug_network_data;
+ logic read_from_license_flag;
+ logic read_from_ip_reset, write_to_ip_reset;
+
+ //clock crosser for interrupt
+ logic ddr_interrupt_level;
+
+ //debug network read request address
+ logic debug_network_arvalid, not_o_debug_network_arvalid;
+ logic [8*CSR_DATA_BYTES-1:0] debug_network_araddr;
+
+ //debug network read response data
+ logic not_o_debug_network_rready, debug_network_dcfifo_empty, debug_network_rvalid, debug_network_rready;
+ logic [8*CSR_DATA_BYTES-1:0] debug_network_dcfifo_data, debug_network_rdata;
+
+ //streaming states
+ logic write_ready_streaming_interface;
+ logic read_ready_streaming_interface;
+
+ logic dla_sclrn;
+
+ //reset parameterization
+ localparam int RESET_USE_SYNCHRONIZER = 1;
+ localparam int RESET_PIPE_DEPTH = 3;
+ localparam int RESET_NUM_COPIES = 1;
+ dla_reset_handler_simple #(
+ .USE_SYNCHRONIZER (RESET_USE_SYNCHRONIZER),
+ .PIPE_DEPTH (RESET_PIPE_DEPTH),
+ .NUM_COPIES (RESET_NUM_COPIES)
+ )
+  dla_reset_synchronizer //synchronizes reset into the dla clock domain (consumes clk_dla)
+ (
+ .clk (clk_dla),
+ .i_resetn (i_resetn_async),
+ .o_sclrn (dla_sclrn)
+ );
+
+ ///////////
+ // RAM //
+ ///////////
+
+ //could use hld_ram, but this simple ram doesn't need the depth stitching or clock enable magic that hld_ram provides
+
+ altera_syncram
+ #(
+ .address_aclr_b ("NONE"),
+ .address_reg_b ("CLOCK0"),
+ .clock_enable_input_a ("BYPASS"),
+ .clock_enable_input_b ("BYPASS"),
+ .clock_enable_output_b ("BYPASS"),
+ .enable_ecc ("FALSE"),
+ .init_file ("dla_dma_csr_discovery_rom.mif"),
+ .intended_device_family ("Arria 10"), //Quartus will fix this automatically
+ .lpm_type ("altera_syncram"),
+ .numwords_a (2**CSR_WORD_ADDR_WIDTH),
+ .numwords_b (2**CSR_WORD_ADDR_WIDTH),
+ .operation_mode ("DUAL_PORT"),
+ .outdata_aclr_b ("NONE"),
+ .outdata_sclr_b ("NONE"),
+ .outdata_reg_b ("CLOCK0"),
+ .power_up_uninitialized ("FALSE"),
+ .ram_block_type ("M20K"),
+ .read_during_write_mode_mixed_ports ("DONT_CARE"),
+ .widthad_a (CSR_WORD_ADDR_WIDTH),
+ .widthad_b (CSR_WORD_ADDR_WIDTH),
+ .width_a (8*CSR_DATA_BYTES),
+ .width_b (8*CSR_DATA_BYTES),
+ .width_byteena_a (1)
+ )
+ csr_ram
+ (
+ .address_a (ram_wr_addr),
+ .address_b (ram_rd_addr),
+ .clock0 (clk_ddr),
+ .data_a (ram_wr_data),
+ .wren_a (ram_wr_en),
+ .q_b (ram_rd_data),
+ .address2_a (1'b1),
+ .address2_b (1'b1),
+ .addressstall_a (1'b0),
+ .addressstall_b (1'b0),
+ .byteena_a (1'b1),
+ .byteena_b (1'b1),
+ .clock1 (1'b1),
+ .clocken0 (1'b1),
+ .clocken1 (1'b1),
+ .clocken2 (1'b1),
+ .clocken3 (1'b1),
+ .data_b ({(8*CSR_DATA_BYTES){1'b1}}),
+ .eccencbypass (1'b0),
+ .eccencparity (8'b0),
+ .eccstatus (),
+ .q_a (),
+ .rden_a (1'b1),
+ .rden_b (1'b1),
+ .wren_b (1'b0)
+ );
+
+
+
+ ////////////////////////
+ // Descriptor Queue //
+ ////////////////////////
+
+ //runtime knows how many jobs it has enqueued and how many jobs have finished
+  //runtime is responsible for not overflowing the descriptor queue; it must limit the number of outstanding jobs queued in hardware
+
+ localparam int DESCRIPTOR_QUEUE_ALMOST_FULL_CUTOFF = DLA_DMA_CSR_DESCRIPTOR_QUEUE_WORDS_PER_JOB; //almost full asserts when queue only has space for 1 more job
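+  //worked example: each job occupies 8 descriptor words (see the header comment), so
+  //assuming the usual hld_fifo semantics (o_almost_full asserts once free space drops
+  //to the cutoff or below), almost full means 8 or fewer of the 512 fifo slots remain,
+  //i.e. guaranteed space for at most one more job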
+
+ dla_hld_fifo #(
+ .WIDTH (8*CONFIG_DATA_BYTES + 1),
+ .DEPTH (DLA_DMA_CSR_DESCRIPTOR_QUEUE_PHYSICAL_SIZE), //this is set to 512 in dla_dma_constants.svh, may as well use up full depth of M20K
+ .ALMOST_FULL_CUTOFF (DESCRIPTOR_QUEUE_ALMOST_FULL_CUTOFF),
+ .ASYNC_RESET (0), //consume reset synchronously
+ .SYNCHRONIZE_RESET (0), //reset is already synchronized
+ .STYLE ("ms")
+ )
+ descriptor_queue
+ (
+ .clock (clk_ddr),
+ .resetn (i_sclrn_ddr),
+
+ .i_valid (descriptor_queue_forced_write),
+ .i_data (descriptor_queue_data),
+ .o_stall (descriptor_queue_full), //software is responsible for not overflowing this fifo
+ .o_almost_full (descriptor_diagnostics_almost_full),
+
+ .o_valid (o_config_valid),
+ .o_data ({o_config_for_intercept, o_config_data}),
+ .i_stall (~i_config_ready | i_token_out_of_inferences)
+ );
+
+
+
+ ////////////////////////////
+ // Performance counters //
+ ////////////////////////////
+
+  //Auxiliary logic that controls the jobs active counters
+ assign first_word_of_descriptor_being_read = o_config_valid & i_config_ready & (descriptor_words_read==3'h0); //desc words read was 0, going to be 1
+ always_ff @(posedge clk_ddr) begin
+ if (o_config_valid & i_config_ready) descriptor_words_read <= descriptor_words_read + 1'b1;
+
+ if (ENABLE_INPUT_STREAMING & ENABLE_OUTPUT_STREAMING & ENABLE_ON_CHIP_PARAMETERS) begin
+ // In this case, we should only track the cycles between the feature data being read, and
+ // results being streamed out, since we continually read the on-chip config params
+ if (i_token_stream_started & ~i_token_done) jobs_active <= jobs_active + 1'b1;
+ if (~i_token_stream_started & i_token_done) jobs_active <= jobs_active - 1'b1;
+ end else begin
+ if (first_word_of_descriptor_being_read & ~i_token_done) jobs_active <= jobs_active + 1'b1;
+ if (~first_word_of_descriptor_being_read & i_token_done) jobs_active <= jobs_active - 1'b1;
+ end
+
+ if (~i_sclrn_ddr) begin
+ descriptor_words_read <= 3'h0;
+ jobs_active <= '0;
+ jobs_active_is_nonzero <= 1'b0;
+ end
+ end
+
+ logic core_jobs_active_is_nonzero_ddr_clk;
+
+ always_ff @(posedge clk_dla) begin
+ if (ENABLE_INPUT_STREAMING & ENABLE_OUTPUT_STREAMING & ENABLE_ON_CHIP_PARAMETERS) begin
+ // In this case, we should only track the cycles between the feature data being read, and
+ // results being streamed out, since we continually read the on-chip config params
+ if (i_stream_received_first_word & ~i_stream_sent_last_word) core_jobs_active <= core_jobs_active + 1'b1;
+ if (~i_stream_received_first_word & i_stream_sent_last_word) core_jobs_active <= core_jobs_active - 1'b1;
+ core_jobs_active_is_nonzero <= core_jobs_active != 0;
+ end
+ if (~dla_sclrn) begin
+ core_jobs_active <= '0;
+ core_jobs_active_is_nonzero <= 1'b0;
+ end
+ end
+
+  // cross core_jobs_active_is_nonzero from the dla clock domain to the ddr clock domain
+ dla_clock_cross_full_sync dla_to_ddr_clock_cross_sync
+ (
+ .clk_src (clk_dla),
+ .i_src_async_resetn (1'b1),
+ .i_src_data (core_jobs_active_is_nonzero),
+ .o_src_data (),
+
+ .clk_dst (clk_ddr),
+ .i_dst_async_resetn (1'b1),
+ .o_dst_data (core_jobs_active_is_nonzero_ddr_clk)
+ );
+
+
+  //count the clock cycles during which at least one core job is active
+ dla_dma_counter_64 count_total_core_clocks_active (
+ .i_clk (clk_ddr),
+ .i_sclrn (i_sclrn_ddr),
+ .i_increment_en (core_jobs_active_is_nonzero_ddr_clk),
+ .i_increment_val (32'b1),
+ .i_read_counter_low_bits (read_from_core_clocks_active_lo),
+ .o_counter_low_bits (total_core_clocks_active_lo),
+ .o_counter_high_bits_latch (total_core_clocks_active_hi)
+ );
+ //a job is active once the first word of its descriptor is read from the queue
+ //a job is finished once the feature writer sends a done token
+ dla_dma_counter_64 count_total_clocks_active (
+ .i_clk (clk_ddr),
+ .i_sclrn (i_sclrn_ddr),
+ .i_increment_en (jobs_active != 0),
+ .i_increment_val (32'b1),
+ .i_read_counter_low_bits (read_from_clocks_active_lo),
+ .o_counter_low_bits (total_clocks_active_lo),
+ .o_counter_high_bits_latch (total_clocks_active_hi)
+ );
+
+ dla_dma_counter_64 count_total_clocks_for_all_jobs (
+ .i_clk (clk_ddr),
+ .i_sclrn (i_sclrn_ddr),
+ .i_increment_en (1'b1),
+ .i_increment_val (jobs_active),
+ .i_read_counter_low_bits (read_from_clocks_all_jobs_lo),
+ .o_counter_low_bits (total_clocks_for_all_jobs_lo),
+ .o_counter_high_bits_latch (total_clocks_for_all_jobs_hi)
+ );
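+
+  //host software can derive summary statistics from these counters; for example
+  //(illustrative host-side arithmetic, not computed in hardware):
+  //  average clocks per job = total_clocks_for_all_jobs / completion_count
+  //  average jobs in flight = total_clocks_for_all_jobs / total_clocks_active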
+
+  //tracks the number of input feature reads in terms of memory word transfers.
+ dla_dma_counter_64 count_input_feature_reads (
+ .i_clk (clk_ddr),
+ .i_sclrn (i_sclrn_ddr),
+ .i_increment_en (i_input_feature_rready & i_input_feature_rvalid),
+ .i_increment_val (32'b1),
+ .i_read_counter_low_bits (read_from_input_feature_reads_lo),
+ .o_counter_low_bits (number_of_input_feature_reads_lo),
+ .o_counter_high_bits_latch (number_of_input_feature_reads_hi)
+ );
+
+  //tracks the number of output feature writes in terms of memory word transfers.
+ dla_dma_counter_64 count_output_feature_writes (
+ .i_clk (clk_ddr),
+ .i_sclrn (i_sclrn_ddr),
+ .i_increment_en (i_output_feature_wready & i_output_feature_wvalid),
+ .i_increment_val (32'b1),
+ .i_read_counter_low_bits (read_from_output_feature_writes_lo),
+ .o_counter_low_bits (number_of_output_feature_writes_lo),
+ .o_counter_high_bits_latch (number_of_output_feature_writes_hi)
+ );
+
+  //tracks the number of input filter reads in terms of memory word transfers.
+ dla_dma_counter_64 count_input_filter_reads (
+ .i_clk (clk_ddr),
+ .i_sclrn (i_sclrn_ddr),
+ .i_increment_en (i_input_filter_rready & i_input_filter_rvalid),
+ .i_increment_val (32'b1),
+ .i_read_counter_low_bits (read_from_input_filter_reads_lo),
+ .o_counter_low_bits (number_of_input_filter_reads_lo),
+ .o_counter_high_bits_latch (number_of_input_filter_reads_hi)
+ );
+
+
+ //////////////////////
+ // Address decode //
+ //////////////////////
+
+ always_ff @(posedge clk_ddr) begin
+ //the csr address space is mostly read only, except for a few specific offsets listed below
+ write_to_ram <= 1'b0;
+ if (ram_wr_addr == DLA_DMA_CSR_OFFSET_CONFIG_BASE_ADDR/4) write_to_ram <= 1'b1;
+ if (ram_wr_addr == DLA_DMA_CSR_OFFSET_CONFIG_RANGE_MINUS_TWO/4) write_to_ram <= 1'b1;
+ if (ram_wr_addr == DLA_DMA_CSR_OFFSET_INPUT_OUTPUT_BASE_ADDR/4) write_to_ram <= 1'b1;
+ if (ram_wr_addr == DLA_DMA_CSR_OFFSET_INTERMEDIATE_BASE_ADDR/4) write_to_ram <= 1'b1;
+ if (ram_wr_addr == DLA_DMA_CSR_OFFSET_DEBUG_NETWORK_ADDR/4) write_to_ram <= 1'b1;
+ if (ram_wr_addr == DLA_CSR_OFFSET_READY_STREAMING_IFACE/4) write_to_ram <= 1'b1;
+
+ //decode specific addresses in which the storage lives in registers
+ write_to_interrupt_control <= (ram_wr_addr == DLA_DMA_CSR_OFFSET_INTERRUPT_CONTROL/4);
+ read_from_interrupt_control <= (ram_rd_addr == DLA_DMA_CSR_OFFSET_INTERRUPT_CONTROL/4);
+ write_to_interrupt_mask <= (ram_wr_addr == DLA_DMA_CSR_OFFSET_INTERRUPT_MASK/4);
+ read_from_interrupt_mask <= (ram_rd_addr == DLA_DMA_CSR_OFFSET_INTERRUPT_MASK/4);
+ read_from_desc_diagnostics <= (ram_rd_addr == DLA_DMA_CSR_OFFSET_DESC_DIAGNOSTICS/4);
+ read_from_completion_count <= (ram_rd_addr == DLA_DMA_CSR_OFFSET_COMPLETION_COUNT/4);
+ read_from_clocks_active_lo <= (ram_rd_addr == DLA_DMA_CSR_OFFSET_CLOCKS_ACTIVE_LO/4);
+ read_from_clocks_active_hi <= (ram_rd_addr == DLA_DMA_CSR_OFFSET_CLOCKS_ACTIVE_HI/4);
+ read_from_core_clocks_active_lo <= (ram_rd_addr == DLA_DMA_CSR_OFFSET_CORE_CLOCKS_ACTIVE_LO/4);
+ read_from_core_clocks_active_hi <= (ram_rd_addr == DLA_DMA_CSR_OFFSET_CORE_CLOCKS_ACTIVE_HI/4);
+ read_from_clocks_all_jobs_lo <= (ram_rd_addr == DLA_DMA_CSR_OFFSET_CLOCKS_ALL_JOBS_LO/4);
+ read_from_clocks_all_jobs_hi <= (ram_rd_addr == DLA_DMA_CSR_OFFSET_CLOCKS_ALL_JOBS_HI/4);
+ write_to_debug_network_addr <= (ram_wr_addr == DLA_DMA_CSR_OFFSET_DEBUG_NETWORK_ADDR/4);
+ read_from_debug_network_valid <= (ram_rd_addr == DLA_DMA_CSR_OFFSET_DEBUG_NETWORK_VALID/4);
+ read_from_debug_network_data <= (ram_rd_addr == DLA_DMA_CSR_OFFSET_DEBUG_NETWORK_DATA/4);
+ read_from_license_flag <= (ram_rd_addr == DLA_DMA_CSR_OFFSET_LICENSE_FLAG /4);
+ read_from_ip_reset <= (ram_rd_addr == DLA_DMA_CSR_OFFSET_IP_RESET/4);
+ read_from_input_filter_reads_lo <= (ram_rd_addr == DLA_DMA_CSR_OFFSET_INPUT_FILTER_READ_COUNT_LO/4);
+ read_from_input_filter_reads_hi <= (ram_rd_addr == DLA_DMA_CSR_OFFSET_INPUT_FILTER_READ_COUNT_HI/4);
+ read_from_input_feature_reads_lo <= (ram_rd_addr == DLA_DMA_CSR_OFFSET_INPUT_FEATURE_READ_COUNT_LO/4);
+ read_from_input_feature_reads_hi <= (ram_rd_addr == DLA_DMA_CSR_OFFSET_INPUT_FEATURE_READ_COUNT_HI/4);
+ read_from_output_feature_writes_lo <= (ram_rd_addr == DLA_DMA_CSR_OFFSET_OUTPUT_FEATURE_WRITE_COUNT_LO/4);
+ read_from_output_feature_writes_hi <= (ram_rd_addr == DLA_DMA_CSR_OFFSET_OUTPUT_FEATURE_WRITE_COUNT_HI/4);
+    read_ready_streaming_interface <= (ram_rd_addr == DLA_CSR_OFFSET_READY_STREAMING_IFACE/4);
+
+ //decode specific addresses in which an action must be taken
+ enqueue_descriptor <= (ram_wr_addr == DLA_DMA_CSR_OFFSET_INPUT_OUTPUT_BASE_ADDR/4);
+ write_to_ip_reset <= (ram_wr_addr == DLA_DMA_CSR_OFFSET_IP_RESET/4);
+ if (ENABLE_INPUT_STREAMING) begin
+ write_ready_streaming_interface <= (ram_wr_addr == DLA_CSR_OFFSET_READY_STREAMING_IFACE/4);
+ end
+ end
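+
+  //note on the comparisons above: ram_wr_addr/ram_rd_addr are word addresses while the
+  //DLA_DMA_CSR_OFFSET_* constants are byte offsets, hence the divide by 4 (which assumes
+  //the typical CSR_DATA_BYTES=4); the decode results are registered, so the state machine
+  //below passes through an intermediate state (STATE_WRITE_ACCEPT / STATE_READ_ADDR)
+  //before consuming them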
+
+
+
+ /////////////////////
+ // State machine //
+ /////////////////////
+
+ always_ff @(posedge clk_ddr) begin
+ //default behavior
+ o_csr_arready <= 1'b0;
+ o_csr_rvalid <= 1'b0;
+ o_csr_awready <= 1'b0;
+ o_csr_wready <= 1'b0;
+ o_csr_bvalid <= 1'b0;
+ ram_wr_en <= 1'b0;
+ descriptor_queue_forced_write <= 1'b0;
+ descriptor_queue_data <= 'x;
+ debug_network_arvalid <= 1'b0;
+ debug_network_rready <= 1'b0;
+ o_request_ip_reset <= 1'b0;
+ o_streaming_active <= o_streaming_active;
+
+ unique case (1'b1)
+ state[STATE_IDLE_BIT]: begin
+ if (i_csr_arvalid && (previous_was_write || ~(i_csr_awvalid && i_csr_wvalid))) begin
+ o_csr_arready <= 1'b1;
+ state <= STATE_READ_ACCEPT;
+ ram_rd_addr <= i_csr_araddr[CSR_ADDR_WIDTH-1:CSR_LO_ADDR];
+ end
+ if (i_csr_awvalid && i_csr_wvalid && (~previous_was_write || ~i_csr_arvalid)) begin
+ o_csr_awready <= 1'b1;
+ o_csr_wready <= 1'b1;
+ state <= STATE_WRITE_ACCEPT;
+ ram_wr_addr <= i_csr_awaddr[CSR_ADDR_WIDTH-1:CSR_LO_ADDR];
+ ram_wr_data <= i_csr_wdata;
+ end
+ end
+
+ state[STATE_READ_ACCEPT_BIT]: begin
+ //o_csr_arready is asserted now, indicates csr has accepted a read
+ //ram_rd_addr valid now
+ state <= STATE_READ_ADDR;
+ previous_was_write <= 1'b0;
+ end
+ state[STATE_READ_ADDR_BIT]: begin
+ //hardened input register inside m20k valid now
+ state <= STATE_READ_DATA;
+ end
+ state[STATE_READ_DATA_BIT]: begin
+ //hardened output register inside m20k valid now
+ o_csr_rvalid <= 1'b1;
+ o_csr_rdata <= ram_rd_data;
+ if (read_from_interrupt_control) begin
+ o_csr_rdata <= '0;
+ o_csr_rdata[DLA_DMA_CSR_INTERRUPT_ERROR_BIT] <= interrupt_control_error;
+ o_csr_rdata[DLA_DMA_CSR_INTERRUPT_DONE_BIT] <= interrupt_control_done;
+ end
+ if (read_from_interrupt_mask) begin
+ o_csr_rdata <= '0;
+ o_csr_rdata[DLA_DMA_CSR_INTERRUPT_ERROR_BIT] <= interrupt_mask_error;
+ o_csr_rdata[DLA_DMA_CSR_INTERRUPT_DONE_BIT] <= interrupt_mask_done;
+ end
+ if (read_from_desc_diagnostics) begin
+ o_csr_rdata <= '0;
+ o_csr_rdata[DLA_DMA_CSR_DESC_DIAGNOSTICS_OVERFLOW_BIT] <= descriptor_diagnostics_overflow;
+ o_csr_rdata[DLA_DMA_CSR_DESC_DIAGNOSTICS_ALMOST_FULL_BIT] <= descriptor_diagnostics_almost_full;
+ o_csr_rdata[DLA_DMA_CSR_DESC_DIAGNOSTICS_OUT_OF_INFERENCES_BIT] <= i_token_out_of_inferences;
+ end
+ if (read_from_completion_count) o_csr_rdata <= completion_count;
+ if (read_from_clocks_active_lo) o_csr_rdata <= total_clocks_active_lo;
+ if (read_from_clocks_active_hi) o_csr_rdata <= total_clocks_active_hi;
+ if (read_from_core_clocks_active_lo) o_csr_rdata <= total_core_clocks_active_lo;
+ if (read_from_core_clocks_active_hi) o_csr_rdata <= total_core_clocks_active_hi;
+ if (read_from_clocks_all_jobs_lo) o_csr_rdata <= total_clocks_for_all_jobs_lo;
+ if (read_from_clocks_all_jobs_hi) o_csr_rdata <= total_clocks_for_all_jobs_hi;
+ if (read_from_input_feature_reads_lo) o_csr_rdata <= number_of_input_feature_reads_lo;
+ if (read_from_input_feature_reads_hi) o_csr_rdata <= number_of_input_feature_reads_hi;
+ if (read_from_input_filter_reads_lo) o_csr_rdata <= number_of_input_filter_reads_lo;
+ if (read_from_input_filter_reads_hi) o_csr_rdata <= number_of_input_filter_reads_hi;
+ if (read_from_output_feature_writes_lo) o_csr_rdata <= number_of_output_feature_writes_lo;
+ if (read_from_output_feature_writes_hi) o_csr_rdata <= number_of_output_feature_writes_hi;
+ if (read_from_debug_network_valid) o_csr_rdata <= debug_network_rvalid; //read prefetch after dcfifo has valid data
+ if (read_from_debug_network_data) begin
+ o_csr_rdata <= debug_network_rdata; //read prefetch after dcfifo
+ debug_network_rready <= 1'b1; //rdack the read prefetch
+ end
+ if (read_from_license_flag) o_csr_rdata <= i_license_flag;
+ if (read_from_ip_reset) o_csr_rdata <= '0; //this read will always return 0
+ if (read_ready_streaming_interface) o_csr_rdata <= o_streaming_active;
+
+ if (o_csr_rvalid && i_csr_rready) begin
+ o_csr_rvalid <= 1'b0;
+ state <= STATE_IDLE;
+ end
+ end
+
+ state[STATE_WRITE_ACCEPT_BIT]: begin
+ //o_csr_awready and o_csr_wready are asserted now, indicates csr has accepted a write
+ //ram_wr_addr valid now
+ previous_was_write <= 1'b1;
+ state <= STATE_WRITE_COMMIT;
+ end
+ state[STATE_WRITE_COMMIT_BIT]: begin
+ //write_to_ram valid now
+ ram_wr_en <= write_to_ram;
+ if (write_to_interrupt_control) begin //write 1 to clear
+ if (ram_wr_data[DLA_DMA_CSR_INTERRUPT_ERROR_BIT]) interrupt_control_error <= 1'b0;
+ if (ram_wr_data[DLA_DMA_CSR_INTERRUPT_DONE_BIT]) interrupt_control_done <= 1'b0;
+ end
+ if (write_to_interrupt_mask) begin
+ interrupt_mask_error <= ram_wr_data[DLA_DMA_CSR_INTERRUPT_ERROR_BIT];
+ interrupt_mask_done <= ram_wr_data[DLA_DMA_CSR_INTERRUPT_DONE_BIT];
+ end
+ if (write_to_debug_network_addr) begin
+ //don't care if dcfifo is full, handshaking scheme is already tolerant to debug network not responding to requests
+ debug_network_arvalid <= 1'b1;
+ debug_network_araddr <= ram_wr_data;
+ end
+ o_csr_bvalid <= 1'b1;
+ if (o_csr_bvalid && i_csr_bready) begin
+ o_csr_bvalid <= 1'b0;
+ if (enqueue_descriptor) state <= STATE_DESCRIPTOR;
+ else if (write_to_ip_reset) state <= (ram_wr_data != '0) ? STATE_AWAIT_RESET : STATE_IDLE;
+ else if (write_ready_streaming_interface) begin
+ if (ram_wr_data == 1) begin
+ state <= STATE_IDLE;
+ if (~ENABLE_ON_CHIP_PARAMETERS) state <= STATE_DESCRIPTOR;
+ o_streaming_active <= 1'b1;
+ end else begin
+ state <= STATE_IDLE;
+ o_streaming_active <= 1'b0;
+ end
+ end
+ else state <= STATE_IDLE;
+ end
+ descriptor_count <= 0;
+ end
+
+ state[STATE_DESCRIPTOR_BIT]: begin
+ descriptor_count <= descriptor_count + 1'b1;
+ case (descriptor_count)
+ 4'h0: ram_rd_addr <= DLA_DMA_CSR_OFFSET_CONFIG_BASE_ADDR/4; //addr gen 0: config reader base addr
+ 4'h1: ram_rd_addr <= 'x; //addr gen 1: token
+ 4'h2: ram_rd_addr <= DLA_DMA_CSR_OFFSET_CONFIG_RANGE_MINUS_TWO/4; //addr gen 2: config reader num words minus two
+ 4'h3: ram_rd_addr <= 'x; //addr gen 3: addr update
+ 4'h4: ram_rd_addr <= DLA_DMA_CSR_OFFSET_CONFIG_RANGE_MINUS_TWO/4; //intercept 0: config reader num words minus two
+ 4'h5: ram_rd_addr <= DLA_DMA_CSR_OFFSET_CONFIG_BASE_ADDR/4; //intercept 1: filter reader offset correction
+ 4'h6: ram_rd_addr <= DLA_DMA_CSR_OFFSET_INPUT_OUTPUT_BASE_ADDR/4; //intercept 2: feature input/output offset
+ 4'h7: ram_rd_addr <= DLA_DMA_CSR_OFFSET_INTERMEDIATE_BASE_ADDR/4; //intercept 3: feature intermediate offset
+ default: ram_rd_addr <= 'x;
+ endcase
+
+ //there are 3 clocks of latency from the time ram_rd_addr is set until ram_rd_data is valid
+ //This is why the config_reader struct in the dma/dual_inc folder has to be laid out in that order
+ case (descriptor_count)
+ 4'h3: descriptor_queue_data <= {1'b0, ram_rd_data}; //addr gen 0: config reader base addr
+ 4'h4: descriptor_queue_data <= '0; //addr gen 1: token
+ 4'h5: descriptor_queue_data <= {1'b0, ram_rd_data}; //addr gen 2: config reader num words minus two
+ 4'h6: descriptor_queue_data <= CONFIG_READER_DATA_BYTES; //addr gen 3: addr update
+ 4'h7: descriptor_queue_data <= {1'b1, ram_rd_data}; //intercept 0: config reader num words minus two
+ 4'h8: descriptor_queue_data <= {1'b1, ram_rd_data}; //intercept 1: filter reader offset correction
+ 4'h9: descriptor_queue_data <= {1'b1, ram_rd_data}; //intercept 2: feature input/output offset
+ 4'ha: descriptor_queue_data <= {1'b1, ram_rd_data}; //intercept 3: feature intermediate offset
+ default: descriptor_queue_data <= 'x;
+ endcase
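+
+        //worked timeline tying the two case statements together: the address issued at
+        //descriptor_count k is returned by the ram at count k+3, so the 8 addresses
+        //issued at counts 0..7 are captured into descriptor_queue_data at counts 3..4'ha,
+        //and the forced write below asserts for exactly those 8 beats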
+
+ descriptor_queue_forced_write <= (descriptor_count >= 4'h3);
+ if (descriptor_count == 4'ha) state <= STATE_IDLE;
+ end
+
+ state[STATE_AWAIT_RESET_BIT]: begin
+ //reset request was triggered by a CSR write
+ // -we completed the axi4-lite write response handshake in STATE_WRITE_COMMIT
+ // -we don't want to return to STATE_IDLE, since a new transaction might get initiated and then interrupted when reset hits
+ // -we should assert o_request_ip_reset for multiple cycles to ensure the async signal is synchronized into all clock domains
+ //so, just hang out here and wait for reset
+ o_request_ip_reset <= 1'b1;
+ state <= STATE_AWAIT_RESET;
+ end
+
+ default: begin
+ state <= STATE_IDLE;
+ end
+ endcase
+
+ //completion tracking
+ completion_count <= completion_count + i_token_done;
+
+ //interrupt tracking
+ if (i_token_error) interrupt_control_error <= 1'b1;
+ if (i_token_done) interrupt_control_done <= 1'b1;
+
+ //sticky bit for detecting if descriptor queue has overflowed
+ if (descriptor_queue_forced_write & descriptor_queue_full) descriptor_diagnostics_overflow <= 1'b1;
+
+ if (~i_sclrn_ddr) begin
+ //state
+ state <= STATE_IDLE;
+ previous_was_write <= 1'b0;
+
+ //AXI4-lite outputs to host control
+ o_csr_arready <= 1'b0;
+ o_csr_rvalid <= 1'b0;
+ o_csr_awready <= 1'b0;
+ o_csr_wready <= 1'b0;
+ o_csr_bvalid <= 1'b0;
+
+ //ram
+ ram_wr_en <= 1'b0;
+
+ //specific offsets implemented in registers
+ interrupt_control_error <= 1'b0;
+ interrupt_control_done <= 1'b0;
+ interrupt_mask_error <= 1'b0;
+ interrupt_mask_done <= 1'b0;
+ completion_count <= '0;
+ descriptor_diagnostics_overflow <= 1'b0;
+
+ //descriptor queue
+ descriptor_queue_forced_write <= 1'b0;
+
+ //debug network
+ debug_network_arvalid <= 1'b0;
+ debug_network_rready <= 1'b0;
+
+ // stops streaming reload
+ o_streaming_active <= 1'b0;
+ end
+ end
+
+
+
+ //////////////////////////////////////////////////////////
+ // Bring the level interrupt to the host clock domain //
+ //////////////////////////////////////////////////////////
+
+ always_ff @(posedge clk_ddr) begin
+ ddr_interrupt_level <= 1'b0;
+ if (interrupt_mask_error & interrupt_control_error) ddr_interrupt_level <= 1'b1;
+ if (interrupt_mask_done & interrupt_control_done ) ddr_interrupt_level <= 1'b1;
+ end
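+
+  //equivalently, the interrupt level is |(mask & status); the host services it by
+  //reading DLA_DMA_CSR_OFFSET_INTERRUPT_CONTROL and then writing a 1 to each bit it
+  //wants to clear (write-1-to-clear, handled in STATE_WRITE_COMMIT above)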
+
+  //this is a 3-stage register-based synchronizer
+ dla_clock_cross_full_sync dla_clock_cross_sync
+ (
+ .clk_src (clk_ddr),
+ .i_src_async_resetn (1'b1),
+ .i_src_data (ddr_interrupt_level),
+ .o_src_data (),
+
+ .clk_dst (clk_pcie),
+ .i_dst_async_resetn (1'b1),
+ .o_dst_data (o_interrupt_level)
+ );
+
+
+
+ ///////////////////////////
+ // Clock crossing FIFOS //
+ ///////////////////////////
+
+ localparam int DCFIFO_DEPTH = 32; //dcfifo is RAM-based, may as well use an entire MLAB
+
+ dla_acl_dcfifo #(
+ .WIDTH (8*CSR_DATA_BYTES),
+ .DEPTH (DCFIFO_DEPTH)
+ )
+ clock_cross_debug_network_request
+ (
+ .async_resetn (i_resetn_async), //reset synchronization is handled internally
+
+ //write side -- write is ignored if fifo is full, this is okay since debug network handshaking is fault tolerant
+ .wr_clock (clk_ddr),
+ .wr_req (debug_network_arvalid),
+ .wr_data (debug_network_araddr),
+
+ //read side
+ .rd_clock (clk_dla),
+ .rd_empty (not_o_debug_network_arvalid),
+ .rd_data (o_debug_network_araddr),
+ .rd_ack (i_debug_network_arready)
+ );
+ assign o_debug_network_arvalid = ~not_o_debug_network_arvalid;
+
+ dla_acl_dcfifo #(
+ .WIDTH (8*CSR_DATA_BYTES),
+ .DEPTH (DCFIFO_DEPTH)
+ )
+ clock_cross_debug_network_response
+ (
+ .async_resetn (i_resetn_async), //reset synchronization is handled internally
+
+ //write side
+ .wr_clock (clk_dla),
+ .wr_req (i_debug_network_rvalid),
+ .wr_data (i_debug_network_rdata),
+ .wr_full (not_o_debug_network_rready),
+
+ //read side
+ .rd_clock (clk_ddr),
+ .rd_empty (debug_network_dcfifo_empty),
+ .rd_data (debug_network_dcfifo_data),
+ .rd_ack (~debug_network_dcfifo_empty) //consume read data immediately, cached in a read prefetch
+ );
+ assign o_debug_network_rready = ~not_o_debug_network_rready;
+
+ //cache the most recent value returned from the debug network
+ always_ff @(posedge clk_ddr) begin
+ if (~debug_network_dcfifo_empty) begin
+ debug_network_rdata <= debug_network_dcfifo_data;
+ debug_network_rvalid <= 1'b1;
+ end
+ if (debug_network_rready) begin
+ debug_network_rvalid <= 1'b0;
+ end
+ if (~i_sclrn_ddr) begin
+ debug_network_rvalid <= 1'b0;
+ end
+ end
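+
+  //the host-side read flow for the debug network, as implied by the decode and read
+  //prefetch logic above (a sketch, not a normative driver specification):
+  //  1. write the target address to DLA_DMA_CSR_OFFSET_DEBUG_NETWORK_ADDR
+  //  2. poll DLA_DMA_CSR_OFFSET_DEBUG_NETWORK_VALID until it reads nonzero
+  //  3. read DLA_DMA_CSR_OFFSET_DEBUG_NETWORK_DATA, which also pops the prefetch register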
+
+endmodule