Diffstat (limited to 'python/openvino/demo/ip/intel_ai_ip/verilog/dla_dma_csr.sv')
| -rw-r--r-- | python/openvino/demo/ip/intel_ai_ip/verilog/dla_dma_csr.sv | 841 |
1 file changed, 841 insertions, 0 deletions
diff --git a/python/openvino/demo/ip/intel_ai_ip/verilog/dla_dma_csr.sv b/python/openvino/demo/ip/intel_ai_ip/verilog/dla_dma_csr.sv
new file mode 100644
index 0000000..ae7006c
--- /dev/null
+++ b/python/openvino/demo/ip/intel_ai_ip/verilog/dla_dma_csr.sv
@@ -0,0 +1,841 @@
+// Copyright 2020-2020 Intel Corporation.
+//
+// This software and the related documents are Intel copyrighted materials,
+// and your use of them is governed by the express license under which they
+// were provided to you ("License"). Unless the License provides otherwise,
+// you may not use, modify, copy, publish, distribute, disclose or transmit
+// this software or the related documents without Intel's prior written
+// permission.
+//
+// This software and the related documents are provided as is, with no express
+// or implied warranties, other than those that are expressly stated in the
+// License.
+
+
+// This module implements the CSR for DMA. It also includes the descriptor queue
+// and interrupt request generator. The CSR is implemented with a RAM. Certain
+// values are kept live in registers, such as the interrupt control and mask. This
+// makes it easier to detect when a change has happened (instead of trying to do a
+// read-modify-write with the RAM).
+//
+// The AXI4 lite slave interface is usually going to backpressure PCIe. There is
+// a state machine which allows one outstanding read request, or one outstanding
+// write request at a time (write requests can be outstanding if the writeack is
+// backpressured, which AXI allows). There is a register which tracks whether the
+// last request was a read or a write; this enables round robin arbitration. Each
+// request takes a few clock cycles to process, as the address needs to be decoded
+// to determine if a write is allowed to commit to the RAM, or if we need to use
+// read data from one of the registers instead of the RAM.
+//
+// Special offsets are defined as localparams below. Writing to DLA_DMA_CSR_OFFSET_INPUT_OUTPUT_BASE_ADDR/4
+// will cause one unit of work to be enqueued in the descriptor queue. Currently
+// this involves writing 8 values to a fifo, which are then consumed by the config
+// reader. Internal to the config reader, 4 values go to the config reader address
+// generator, the other 4 go to the config reader intercept.
+//
+// Beware the following assumptions about how the host issues requests to this CSR:
+// - no bursts (required by AXI4 lite)
+// - byte enables are assumed to be all 1 (no partial word access)
+// - all addresses must be word aligned (e.g. if CSR_DATA_BYTES=4 then the bottom 2 bits of address must be 0)
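+//
+// For illustration only -- a sketch of the host sequence for enqueueing one job,
+// assuming CSR_DATA_BYTES=4 (offset names come from dla_dma_constants.svh, and
+// the ordering is inferred from the snapshot done in STATE_DESCRIPTOR below):
+//
+//   write(DLA_DMA_CSR_OFFSET_CONFIG_BASE_ADDR,       config_base);
+//   write(DLA_DMA_CSR_OFFSET_CONFIG_RANGE_MINUS_TWO, num_config_words - 2);
+//   write(DLA_DMA_CSR_OFFSET_INTERMEDIATE_BASE_ADDR, intermediate_base);
+//   write(DLA_DMA_CSR_OFFSET_INPUT_OUTPUT_BASE_ADDR, io_base); //this write triggers the enqueue
+//
+// The write to DLA_DMA_CSR_OFFSET_INPUT_OUTPUT_BASE_ADDR must come last: it is
+// the one that snapshots the other offsets from the RAM into the descriptor queue.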
+
+`resetall
+`undefineall
+`default_nettype none
+`include "dla_acl_parameter_assert.svh"
+
+module dla_dma_csr #(
+  parameter int CSR_ADDR_WIDTH,           //width of the byte address signal, determines CSR address space size, e.g. 11 bit address = 2048 bytes, the largest size that uses only 1 M20K
+  parameter int CSR_DATA_BYTES,           //width of the CSR data path, typically 4 bytes
+  parameter int CONFIG_DATA_BYTES,        //data width of the config network output port, typically 4 bytes, the descriptor queue matches this so that config decode can be reused
+  parameter int CONFIG_READER_DATA_BYTES, //data width of the config network input port, needed by config reader address generator for loop update
+
+  parameter int ENABLE_INPUT_STREAMING,
+  parameter int ENABLE_OUTPUT_STREAMING,
+  parameter int ENABLE_ON_CHIP_PARAMETERS
+) (
+  input  wire clk_ddr,
+  input  wire clk_pcie,
+  input  wire clk_dla,
+  input  wire i_sclrn_ddr,    //active low reset that has already been synchronized to clk_ddr
+  input  wire i_resetn_async, //active low reset that has NOT been synchronized to any clock, only to be consumed by dcfifo
+
+  //updates for interrupt, runs on ddr clock
+  input  wire i_token_done,           //feature writer reports it is done
+  input  wire i_token_stream_started, //input streamer is reading the first word
+  input  wire i_stream_received_first_word,
+  input  wire i_stream_sent_last_word,
+  input  wire i_token_error,          //dla has encountered some error, assert high for one clock cycle to report it to host (assuming mask bit is 1)
+  input  wire i_license_flag,
+  input  wire i_token_out_of_inferences,
+
+  //snoop signals for the input feature, output feature, and filter LSUs' core <--> fabric traffic
+  //run on clk_ddr
+  input  wire i_input_feature_rvalid,
+  input  wire i_input_feature_rready,
+  input  wire i_input_filter_rvalid,
+  input  wire i_input_filter_rready,
+  input  wire i_output_feature_wvalid,
+  input  wire i_output_feature_wready,
+
+  //interrupt request to pcie, runs on pcie clock
+  output logic o_interrupt_level, //level sensitive interrupt
+
+  //read side of descriptor queue goes to config reader, runs on ddr clock
+  output logic o_config_valid,
+  output logic [8*CONFIG_DATA_BYTES-1:0] o_config_data,
+  output logic o_config_for_intercept, //0 = goes to config reader addr gen, 1 = goes to config reader intercept
+  input  wire  i_config_ready,
+
+  //debug network AXI4-lite interface, read request and read response channels, runs on clk_dla
+  output logic o_debug_network_arvalid,
+  output logic [8*CSR_DATA_BYTES-1:0] o_debug_network_araddr,
+  input  wire  i_debug_network_arready,
+  input  wire  i_debug_network_rvalid,
+  input  wire  [8*CSR_DATA_BYTES-1:0] i_debug_network_rdata,
+  output logic o_debug_network_rready,
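+
+  //NOTE (illustrative, inferred from the clock crossing FIFOs and read prefetch
+  //logic at the bottom of this file): the host drives this debug interface
+  //indirectly through three CSR offsets --
+  //  1. write DLA_DMA_CSR_OFFSET_DEBUG_NETWORK_ADDR with the debug address (issues a read request)
+  //  2. poll DLA_DMA_CSR_OFFSET_DEBUG_NETWORK_VALID until it reads nonzero
+  //  3. read DLA_DMA_CSR_OFFSET_DEBUG_NETWORK_DATA (this also acks the prefetched data)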
+
+  //AXI4-lite slave interface for host control, runs on ddr clock
+  //no bursts, byte enables are assumed to be all 1, all addresses must be word aligned (e.g. if CSR_DATA_BYTES=4 then the bottom 2 bits of address must be 0)
+  input  wire  i_csr_arvalid,
+  input  wire  [CSR_ADDR_WIDTH-1:0] i_csr_araddr,
+  output logic o_csr_arready,
+  output logic o_csr_rvalid,
+  output logic [8*CSR_DATA_BYTES-1:0] o_csr_rdata,
+  input  wire  i_csr_rready,
+  input  wire  i_csr_awvalid,
+  input  wire  [CSR_ADDR_WIDTH-1:0] i_csr_awaddr,
+  output logic o_csr_awready,
+  input  wire  i_csr_wvalid,
+  input  wire  [8*CSR_DATA_BYTES-1:0] i_csr_wdata,
+  output logic o_csr_wready,
+  output logic o_csr_bvalid,
+  input  wire  i_csr_bready,
+
+  //reset request for the whole ip, runs on ddr clock
+  output logic o_request_ip_reset,
+
+  //output bit to start/stop streaming interface
+  output logic o_streaming_active
+);
+
+
+  /////////////////////////////////
+  //  Parameter legality checks  //
+  /////////////////////////////////
+
+  //signal widths cannot be trivial
+  `DLA_ACL_PARAMETER_ASSERT(CSR_DATA_BYTES >= 1)
+  `DLA_ACL_PARAMETER_ASSERT(CONFIG_DATA_BYTES >= 1)
+
+  //csr address space cannot be trivial
+  `DLA_ACL_PARAMETER_ASSERT(2**CSR_ADDR_WIDTH > CONFIG_DATA_BYTES)
+
+  //offsets must be within address space
+  localparam int CSR_LO_ADDR = $clog2(CSR_DATA_BYTES); //number of LSBs that must be 0 in order for byte address to be word aligned
+  localparam int CSR_WORD_ADDR_WIDTH = CSR_ADDR_WIDTH - CSR_LO_ADDR;
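+  //worked example, assuming the typical CSR_DATA_BYTES=4 and CSR_ADDR_WIDTH=11
+  //mentioned above: CSR_LO_ADDR = $clog2(4) = 2, so word address = byte address >> 2,
+  //and CSR_WORD_ADDR_WIDTH = 11 - 2 = 9, i.e. 512 words of 32 bits (one M20K)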
+
+
+
+  /////////////////
+  //  Constants  //
+  /////////////////
+  `include "dla_dma_constants.svh"
+  //special offsets -- these values are defined in one place and shared between hardware and software
+  //the constants from the dla_dma_constants.svh header file that CSR cares about are named DLA_DMA_CSR_OFFSET_**** and DLA_DMA_CSR_INTERRUPT_****
+
+  //state machine
+  enum {
+    STATE_IDLE_BIT,
+    STATE_READ_ACCEPT_BIT,
+    STATE_READ_ADDR_BIT,
+    STATE_READ_DATA_BIT,
+    STATE_WRITE_ACCEPT_BIT,
+    STATE_WRITE_COMMIT_BIT,
+    STATE_DESCRIPTOR_BIT,
+    STATE_AWAIT_RESET_BIT
+  } index;
+
+  enum logic [index.num()-1:0] {
+    //1-hot encodings
+    STATE_IDLE         = 1 << STATE_IDLE_BIT,
+    STATE_READ_ACCEPT  = 1 << STATE_READ_ACCEPT_BIT,
+    STATE_READ_ADDR    = 1 << STATE_READ_ADDR_BIT,
+    STATE_READ_DATA    = 1 << STATE_READ_DATA_BIT,
+    STATE_WRITE_ACCEPT = 1 << STATE_WRITE_ACCEPT_BIT,
+    STATE_WRITE_COMMIT = 1 << STATE_WRITE_COMMIT_BIT,
+    STATE_DESCRIPTOR   = 1 << STATE_DESCRIPTOR_BIT,
+    STATE_AWAIT_RESET  = 1 << STATE_AWAIT_RESET_BIT,
+    XXX = 'x
+  } state;
+
+  localparam int MAX_JOBS_ACTIVE = 64; //upper bounded by how many descriptors the queue can hold
+  localparam int JOBS_ACTIVE_WIDTH = $clog2(MAX_JOBS_ACTIVE+1);
+
+
+
+  ///////////////
+  //  Signals  //
+  ///////////////
+
+  //ram
+  logic ram_wr_en;
+  logic [CSR_WORD_ADDR_WIDTH-1:0] ram_wr_addr, ram_rd_addr;
+  logic [8*CSR_DATA_BYTES-1:0] ram_wr_data, ram_rd_data;
+
+  //descriptor queue
+  logic descriptor_queue_forced_write, descriptor_queue_full, descriptor_diagnostics_almost_full;
+  logic [8*CONFIG_DATA_BYTES:0] descriptor_queue_data;
+  logic [2:0] descriptor_words_read;
+  logic first_word_of_descriptor_being_read, jobs_active_is_nonzero, core_jobs_active_is_nonzero;
+  logic [JOBS_ACTIVE_WIDTH-1:0] jobs_active, core_jobs_active;
+
+  //performance counter connections
+  logic [31:0] total_clocks_active_lo, total_clocks_active_hi;
+  logic [31:0] total_core_clocks_active_lo, total_core_clocks_active_hi;
+  logic [31:0] total_clocks_for_all_jobs_lo, total_clocks_for_all_jobs_hi;
+  logic [31:0] number_of_input_feature_reads_lo, number_of_input_feature_reads_hi;
+  logic [31:0] number_of_input_filter_reads_lo, number_of_input_filter_reads_hi;
+  logic [31:0] number_of_output_feature_writes_lo, number_of_output_feature_writes_hi;
+
+  //state machine
+  logic previous_was_write;
+  logic [3:0] descriptor_count;
+
+  //specific offsets are implemented in registers instead of RAM
+  logic interrupt_control_error, interrupt_control_done, interrupt_mask_error, interrupt_mask_done;
+  logic [8*CSR_DATA_BYTES-1:0] completion_count;
+  logic descriptor_diagnostics_overflow;
+
+  //address decode for specific offsets that are implemented in registers or require some action to be taken
+  logic write_to_interrupt_control, read_from_interrupt_control, write_to_interrupt_mask, read_from_interrupt_mask;
+  logic write_to_ram, read_from_desc_diagnostics, read_from_completion_count, enqueue_descriptor;
+  logic read_from_clocks_active_lo, read_from_clocks_active_hi, read_from_clocks_all_jobs_lo, read_from_clocks_all_jobs_hi;
+  logic read_from_core_clocks_active_lo, read_from_core_clocks_active_hi;
+  logic read_from_input_feature_reads_lo, read_from_input_feature_reads_hi;
+  logic read_from_input_filter_reads_lo, read_from_input_filter_reads_hi;
+  logic read_from_output_feature_writes_lo, read_from_output_feature_writes_hi;
+  logic write_to_debug_network_addr, read_from_debug_network_valid, read_from_debug_network_data;
+  logic read_from_license_flag;
+  logic read_from_ip_reset, write_to_ip_reset;
+
+  //clock crosser for interrupt
+  logic ddr_interrupt_level;
+
+  //debug network read request address
+  logic debug_network_arvalid, not_o_debug_network_arvalid;
+  logic [8*CSR_DATA_BYTES-1:0] debug_network_araddr;
+
+  //debug network read response data
+  logic not_o_debug_network_rready, debug_network_dcfifo_empty, debug_network_rvalid, debug_network_rready;
+  logic [8*CSR_DATA_BYTES-1:0] debug_network_dcfifo_data, debug_network_rdata;
+
+  //streaming states
+  logic write_ready_streaming_interface;
+  logic read_ready_streaming_interface;
+
+  logic dla_sclrn;
+
+  //reset parameterization
+  localparam int RESET_USE_SYNCHRONIZER = 1;
+  localparam int RESET_PIPE_DEPTH = 3;
+  localparam int RESET_NUM_COPIES = 1;
+  dla_reset_handler_simple #(
+    .USE_SYNCHRONIZER (RESET_USE_SYNCHRONIZER),
+    .PIPE_DEPTH       (RESET_PIPE_DEPTH),
+    .NUM_COPIES       (RESET_NUM_COPIES)
+  )
+  ddr_reset_synchronizer
+  (
+    .clk      (clk_dla),
+    .i_resetn (i_resetn_async),
+    .o_sclrn  (dla_sclrn)
+  );
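+  //NOTE: despite the instance name, this synchronizer runs on clk_dla -- it
+  //produces dla_sclrn, the clk_dla-domain reset consumed by the core jobs
+  //active logic in the performance counter section below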
+
+
+  ///////////
+  //  RAM  //
+  ///////////
+
+  //could use hld_ram, but this simple ram doesn't need the depth stitching or clock enable magic that hld_ram provides
+
+  altera_syncram #(
+    .address_aclr_b                     ("NONE"),
+    .address_reg_b                      ("CLOCK0"),
+    .clock_enable_input_a               ("BYPASS"),
+    .clock_enable_input_b               ("BYPASS"),
+    .clock_enable_output_b              ("BYPASS"),
+    .enable_ecc                         ("FALSE"),
+    .init_file                          ("dla_dma_csr_discovery_rom.mif"),
+    .intended_device_family             ("Arria 10"), //Quartus will fix this automatically
+    .lpm_type                           ("altera_syncram"),
+    .numwords_a                         (2**CSR_WORD_ADDR_WIDTH),
+    .numwords_b                         (2**CSR_WORD_ADDR_WIDTH),
+    .operation_mode                     ("DUAL_PORT"),
+    .outdata_aclr_b                     ("NONE"),
+    .outdata_sclr_b                     ("NONE"),
+    .outdata_reg_b                      ("CLOCK0"),
+    .power_up_uninitialized             ("FALSE"),
+    .ram_block_type                     ("M20K"),
+    .read_during_write_mode_mixed_ports ("DONT_CARE"),
+    .widthad_a                          (CSR_WORD_ADDR_WIDTH),
+    .widthad_b                          (CSR_WORD_ADDR_WIDTH),
+    .width_a                            (8*CSR_DATA_BYTES),
+    .width_b                            (8*CSR_DATA_BYTES),
+    .width_byteena_a                    (1)
+  )
+  csr_ram
+  (
+    .address_a      (ram_wr_addr),
+    .address_b      (ram_rd_addr),
+    .clock0         (clk_ddr),
+    .data_a         (ram_wr_data),
+    .wren_a         (ram_wr_en),
+    .q_b            (ram_rd_data),
+    .address2_a     (1'b1),
+    .address2_b     (1'b1),
+    .addressstall_a (1'b0),
+    .addressstall_b (1'b0),
+    .byteena_a      (1'b1),
+    .byteena_b      (1'b1),
+    .clock1         (1'b1),
+    .clocken0       (1'b1),
+    .clocken1       (1'b1),
+    .clocken2       (1'b1),
+    .clocken3       (1'b1),
+    .data_b         ({(8*CSR_DATA_BYTES){1'b1}}),
+    .eccencbypass   (1'b0),
+    .eccencparity   (8'b0),
+    .eccstatus      (),
+    .q_a            (),
+    .rden_a         (1'b1),
+    .rden_b         (1'b1),
+    .wren_b         (1'b0)
+  );
+
+
+
+  ////////////////////////
+  //  Descriptor Queue  //
+  ////////////////////////
+
+  //runtime knows how many jobs it has enqueued and how many jobs have finished
+  //runtime is responsible for not overflowing the descriptor queue, it must limit the number of outstanding jobs queued in hardware
+
+  localparam int DESCRIPTOR_QUEUE_ALMOST_FULL_CUTOFF = DLA_DMA_CSR_DESCRIPTOR_QUEUE_WORDS_PER_JOB; //almost full asserts when queue only has space for 1 more job
+
+  dla_hld_fifo #(
+    .WIDTH (8*CONFIG_DATA_BYTES + 1),
+    .DEPTH (DLA_DMA_CSR_DESCRIPTOR_QUEUE_PHYSICAL_SIZE), //this is set to 512 in dla_dma_constants.svh, may as well use up full depth of M20K
+    .ALMOST_FULL_CUTOFF (DESCRIPTOR_QUEUE_ALMOST_FULL_CUTOFF),
+    .ASYNC_RESET (0),       //consume reset synchronously
+    .SYNCHRONIZE_RESET (0), //reset is already synchronized
+    .STYLE ("ms")
+  )
+  descriptor_queue
+  (
+    .clock  (clk_ddr),
+    .resetn (i_sclrn_ddr),
+
+    .i_valid (descriptor_queue_forced_write),
+    .i_data  (descriptor_queue_data),
+    .o_stall (descriptor_queue_full), //software is responsible for not overflowing this fifo
+    .o_almost_full (descriptor_diagnostics_almost_full),
+
+    .o_valid (o_config_valid),
+    .o_data  ({o_config_for_intercept, o_config_data}),
+    .i_stall (~i_config_ready | i_token_out_of_inferences)
+  );
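+  //capacity check (a sketch, assuming DLA_DMA_CSR_DESCRIPTOR_QUEUE_WORDS_PER_JOB
+  //is 8, matching the 8-word descriptor built in STATE_DESCRIPTOR below): with a
+  //physical depth of 512 words, the queue holds 512/8 = 64 jobs, which is where
+  //MAX_JOBS_ACTIVE = 64 above comes from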
+
+
+
+  ////////////////////////////
+  //  Performance counters  //
+  ////////////////////////////
+
+  //auxiliary logic that controls the jobs active counters
+  assign first_word_of_descriptor_being_read = o_config_valid & i_config_ready & (descriptor_words_read==3'h0); //desc words read was 0, going to be 1
+  always_ff @(posedge clk_ddr) begin
+    if (o_config_valid & i_config_ready) descriptor_words_read <= descriptor_words_read + 1'b1;
+
+    if (ENABLE_INPUT_STREAMING & ENABLE_OUTPUT_STREAMING & ENABLE_ON_CHIP_PARAMETERS) begin
+      // In this case, we should only track the cycles between the feature data being read, and
+      // results being streamed out, since we continually read the on-chip config params
+      if (i_token_stream_started & ~i_token_done) jobs_active <= jobs_active + 1'b1;
+      if (~i_token_stream_started & i_token_done) jobs_active <= jobs_active - 1'b1;
+    end else begin
+      if (first_word_of_descriptor_being_read & ~i_token_done) jobs_active <= jobs_active + 1'b1;
+      if (~first_word_of_descriptor_being_read & i_token_done) jobs_active <= jobs_active - 1'b1;
+    end
+
+    if (~i_sclrn_ddr) begin
+      descriptor_words_read <= 3'h0;
+      jobs_active <= '0;
+      jobs_active_is_nonzero <= 1'b0;
+    end
+  end
+
+  logic core_jobs_active_is_nonzero_ddr_clk;
+
+  always_ff @(posedge clk_dla) begin
+    if (ENABLE_INPUT_STREAMING & ENABLE_OUTPUT_STREAMING & ENABLE_ON_CHIP_PARAMETERS) begin
+      // In this case, we should only track the cycles between the feature data being read, and
+      // results being streamed out, since we continually read the on-chip config params
+      if (i_stream_received_first_word & ~i_stream_sent_last_word) core_jobs_active <= core_jobs_active + 1'b1;
+      if (~i_stream_received_first_word & i_stream_sent_last_word) core_jobs_active <= core_jobs_active - 1'b1;
+      core_jobs_active_is_nonzero <= core_jobs_active != 0;
+    end
+    if (~dla_sclrn) begin
+      core_jobs_active <= '0;
+      core_jobs_active_is_nonzero <= 1'b0;
+    end
+  end
+
+  // crossover core_jobs_active_is_nonzero from dla to ddr clk
+  dla_clock_cross_full_sync dla_to_ddr_clock_cross_sync
+  (
+    .clk_src            (clk_dla),
+    .i_src_async_resetn (1'b1),
+    .i_src_data         (core_jobs_active_is_nonzero),
+    .o_src_data         (),
+
+    .clk_dst            (clk_ddr),
+    .i_dst_async_resetn (1'b1),
+    .o_dst_data         (core_jobs_active_is_nonzero_ddr_clk)
+  );
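+
+  //NOTE on the read protocol of the 64-bit counters below (inferred from the
+  //dla_dma_counter_64 port names): a read of the _lo word latches the upper 32
+  //bits into _hi, so the host should read _lo first and then _hi to get a
+  //consistent 64-bit sample, e.g.
+  //  lo = read(DLA_DMA_CSR_OFFSET_CLOCKS_ACTIVE_LO); //also latches the high half
+  //  hi = read(DLA_DMA_CSR_OFFSET_CLOCKS_ACTIVE_HI);
+  //  total = {hi, lo};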
+
+  //count clocks on which the core has at least one job active
+  dla_dma_counter_64 count_total_core_clocks_active (
+    .i_clk                     (clk_ddr),
+    .i_sclrn                   (i_sclrn_ddr),
+    .i_increment_en            (core_jobs_active_is_nonzero_ddr_clk),
+    .i_increment_val           (32'b1),
+    .i_read_counter_low_bits   (read_from_core_clocks_active_lo),
+    .o_counter_low_bits        (total_core_clocks_active_lo),
+    .o_counter_high_bits_latch (total_core_clocks_active_hi)
+  );
+
+  //a job is active once the first word of its descriptor is read from the queue
+  //a job is finished once the feature writer sends a done token
+  dla_dma_counter_64 count_total_clocks_active (
+    .i_clk                     (clk_ddr),
+    .i_sclrn                   (i_sclrn_ddr),
+    .i_increment_en            (jobs_active != 0),
+    .i_increment_val           (32'b1),
+    .i_read_counter_low_bits   (read_from_clocks_active_lo),
+    .o_counter_low_bits        (total_clocks_active_lo),
+    .o_counter_high_bits_latch (total_clocks_active_hi)
+  );
+
+  dla_dma_counter_64 count_total_clocks_for_all_jobs (
+    .i_clk                     (clk_ddr),
+    .i_sclrn                   (i_sclrn_ddr),
+    .i_increment_en            (1'b1),
+    .i_increment_val           (jobs_active),
+    .i_read_counter_low_bits   (read_from_clocks_all_jobs_lo),
+    .o_counter_low_bits        (total_clocks_for_all_jobs_lo),
+    .o_counter_high_bits_latch (total_clocks_for_all_jobs_hi)
+  );
+
+  //tracks the number of input feature reads in terms of memory word transfers
+  dla_dma_counter_64 count_input_feature_reads (
+    .i_clk                     (clk_ddr),
+    .i_sclrn                   (i_sclrn_ddr),
+    .i_increment_en            (i_input_feature_rready & i_input_feature_rvalid),
+    .i_increment_val           (32'b1),
+    .i_read_counter_low_bits   (read_from_input_feature_reads_lo),
+    .o_counter_low_bits        (number_of_input_feature_reads_lo),
+    .o_counter_high_bits_latch (number_of_input_feature_reads_hi)
+  );
+
+  //tracks the number of output feature writes in terms of memory word transfers
+  dla_dma_counter_64 count_output_feature_writes (
+    .i_clk                     (clk_ddr),
+    .i_sclrn                   (i_sclrn_ddr),
+    .i_increment_en            (i_output_feature_wready & i_output_feature_wvalid),
+    .i_increment_val           (32'b1),
+    .i_read_counter_low_bits   (read_from_output_feature_writes_lo),
+    .o_counter_low_bits        (number_of_output_feature_writes_lo),
+    .o_counter_high_bits_latch (number_of_output_feature_writes_hi)
+  );
+
+  //tracks the number of input filter reads in terms of memory word transfers
+  dla_dma_counter_64 count_input_filter_reads (
+    .i_clk                     (clk_ddr),
+    .i_sclrn                   (i_sclrn_ddr),
+    .i_increment_en            (i_input_filter_rready & i_input_filter_rvalid),
+    .i_increment_val           (32'b1),
+    .i_read_counter_low_bits   (read_from_input_filter_reads_lo),
+    .o_counter_low_bits        (number_of_input_filter_reads_lo),
+    .o_counter_high_bits_latch (number_of_input_filter_reads_hi)
+  );
+
+
+  //////////////////////
+  //  Address decode  //
+  //////////////////////
+
+  always_ff @(posedge clk_ddr) begin
+    //the csr address space is mostly read only, except for a few specific offsets listed below
+    write_to_ram <= 1'b0;
+    if (ram_wr_addr == DLA_DMA_CSR_OFFSET_CONFIG_BASE_ADDR/4)       write_to_ram <= 1'b1;
+    if (ram_wr_addr == DLA_DMA_CSR_OFFSET_CONFIG_RANGE_MINUS_TWO/4) write_to_ram <= 1'b1;
+    if (ram_wr_addr == DLA_DMA_CSR_OFFSET_INPUT_OUTPUT_BASE_ADDR/4) write_to_ram <= 1'b1;
+    if (ram_wr_addr == DLA_DMA_CSR_OFFSET_INTERMEDIATE_BASE_ADDR/4) write_to_ram <= 1'b1;
+    if (ram_wr_addr == DLA_DMA_CSR_OFFSET_DEBUG_NETWORK_ADDR/4)     write_to_ram <= 1'b1;
+    if (ram_wr_addr == DLA_CSR_OFFSET_READY_STREAMING_IFACE/4)      write_to_ram <= 1'b1;
+
+    //decode specific addresses in which the storage lives in registers
+    write_to_interrupt_control         <= (ram_wr_addr == DLA_DMA_CSR_OFFSET_INTERRUPT_CONTROL/4);
+    read_from_interrupt_control        <= (ram_rd_addr == DLA_DMA_CSR_OFFSET_INTERRUPT_CONTROL/4);
+    write_to_interrupt_mask            <= (ram_wr_addr == DLA_DMA_CSR_OFFSET_INTERRUPT_MASK/4);
+    read_from_interrupt_mask           <= (ram_rd_addr == DLA_DMA_CSR_OFFSET_INTERRUPT_MASK/4);
+    read_from_desc_diagnostics         <= (ram_rd_addr == DLA_DMA_CSR_OFFSET_DESC_DIAGNOSTICS/4);
+    read_from_completion_count         <= (ram_rd_addr == DLA_DMA_CSR_OFFSET_COMPLETION_COUNT/4);
+    read_from_clocks_active_lo         <= (ram_rd_addr == DLA_DMA_CSR_OFFSET_CLOCKS_ACTIVE_LO/4);
+    read_from_clocks_active_hi         <= (ram_rd_addr == DLA_DMA_CSR_OFFSET_CLOCKS_ACTIVE_HI/4);
+    read_from_core_clocks_active_lo    <= (ram_rd_addr == DLA_DMA_CSR_OFFSET_CORE_CLOCKS_ACTIVE_LO/4);
+    read_from_core_clocks_active_hi    <= (ram_rd_addr == DLA_DMA_CSR_OFFSET_CORE_CLOCKS_ACTIVE_HI/4);
+    read_from_clocks_all_jobs_lo       <= (ram_rd_addr == DLA_DMA_CSR_OFFSET_CLOCKS_ALL_JOBS_LO/4);
+    read_from_clocks_all_jobs_hi       <= (ram_rd_addr == DLA_DMA_CSR_OFFSET_CLOCKS_ALL_JOBS_HI/4);
+    write_to_debug_network_addr        <= (ram_wr_addr == DLA_DMA_CSR_OFFSET_DEBUG_NETWORK_ADDR/4);
+    read_from_debug_network_valid      <= (ram_rd_addr == DLA_DMA_CSR_OFFSET_DEBUG_NETWORK_VALID/4);
+    read_from_debug_network_data       <= (ram_rd_addr == DLA_DMA_CSR_OFFSET_DEBUG_NETWORK_DATA/4);
+    read_from_license_flag             <= (ram_rd_addr == DLA_DMA_CSR_OFFSET_LICENSE_FLAG/4);
+    read_from_ip_reset                 <= (ram_rd_addr == DLA_DMA_CSR_OFFSET_IP_RESET/4);
+    read_from_input_filter_reads_lo    <= (ram_rd_addr == DLA_DMA_CSR_OFFSET_INPUT_FILTER_READ_COUNT_LO/4);
+    read_from_input_filter_reads_hi    <= (ram_rd_addr == DLA_DMA_CSR_OFFSET_INPUT_FILTER_READ_COUNT_HI/4);
+    read_from_input_feature_reads_lo   <= (ram_rd_addr == DLA_DMA_CSR_OFFSET_INPUT_FEATURE_READ_COUNT_LO/4);
+    read_from_input_feature_reads_hi   <= (ram_rd_addr == DLA_DMA_CSR_OFFSET_INPUT_FEATURE_READ_COUNT_HI/4);
+    read_from_output_feature_writes_lo <= (ram_rd_addr == DLA_DMA_CSR_OFFSET_OUTPUT_FEATURE_WRITE_COUNT_LO/4);
+    read_from_output_feature_writes_hi <= (ram_rd_addr == DLA_DMA_CSR_OFFSET_OUTPUT_FEATURE_WRITE_COUNT_HI/4);
+    read_ready_streaming_interface     <= (ram_rd_addr == DLA_CSR_OFFSET_READY_STREAMING_IFACE/4);
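+
+    //note: any write whose address does not match one of the offsets above is
+    //silently dropped -- write_to_ram stays 0, so STATE_WRITE_COMMIT still acks
+    //the write on AXI but the RAM (whose init image is the discovery ROM) is
+    //left untouched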
+
+    //decode specific addresses in which an action must be taken
+    enqueue_descriptor <= (ram_wr_addr == DLA_DMA_CSR_OFFSET_INPUT_OUTPUT_BASE_ADDR/4);
+    write_to_ip_reset  <= (ram_wr_addr == DLA_DMA_CSR_OFFSET_IP_RESET/4);
+    if (ENABLE_INPUT_STREAMING) begin
+      write_ready_streaming_interface <= (ram_wr_addr == DLA_CSR_OFFSET_READY_STREAMING_IFACE/4);
+    end
+  end
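+
+  //Arbitration in STATE_IDLE below is round robin via previous_was_write; a
+  //worked decision table (derived from the if conditions in STATE_IDLE):
+  //  arvalid=1, awvalid&wvalid=1, previous_was_write=1 -> read accepted
+  //  arvalid=1, awvalid&wvalid=1, previous_was_write=0 -> write accepted
+  //  only one side valid                               -> that side accepted
+  //(a write is only accepted once BOTH awvalid and wvalid are high, so the two
+  //write channels are accepted together)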
+
+
+
+  /////////////////////
+  //  State machine  //
+  /////////////////////
+
+  always_ff @(posedge clk_ddr) begin
+    //default behavior
+    o_csr_arready <= 1'b0;
+    o_csr_rvalid  <= 1'b0;
+    o_csr_awready <= 1'b0;
+    o_csr_wready  <= 1'b0;
+    o_csr_bvalid  <= 1'b0;
+    ram_wr_en     <= 1'b0;
+    descriptor_queue_forced_write <= 1'b0;
+    descriptor_queue_data <= 'x;
+    debug_network_arvalid <= 1'b0;
+    debug_network_rready  <= 1'b0;
+    o_request_ip_reset    <= 1'b0;
+    o_streaming_active    <= o_streaming_active;
+
+    unique case (1'b1)
+      state[STATE_IDLE_BIT]: begin
+        if (i_csr_arvalid && (previous_was_write || ~(i_csr_awvalid && i_csr_wvalid))) begin
+          o_csr_arready <= 1'b1;
+          state <= STATE_READ_ACCEPT;
+          ram_rd_addr <= i_csr_araddr[CSR_ADDR_WIDTH-1:CSR_LO_ADDR];
+        end
+        if (i_csr_awvalid && i_csr_wvalid && (~previous_was_write || ~i_csr_arvalid)) begin
+          o_csr_awready <= 1'b1;
+          o_csr_wready  <= 1'b1;
+          state <= STATE_WRITE_ACCEPT;
+          ram_wr_addr <= i_csr_awaddr[CSR_ADDR_WIDTH-1:CSR_LO_ADDR];
+          ram_wr_data <= i_csr_wdata;
+        end
+      end
+
+      state[STATE_READ_ACCEPT_BIT]: begin
+        //o_csr_arready is asserted now, indicates csr has accepted a read
+        //ram_rd_addr valid now
+        state <= STATE_READ_ADDR;
+        previous_was_write <= 1'b0;
+      end
+      state[STATE_READ_ADDR_BIT]: begin
+        //hardened input register inside m20k valid now
+        state <= STATE_READ_DATA;
+      end
+      state[STATE_READ_DATA_BIT]: begin
+        //hardened output register inside m20k valid now
+        o_csr_rvalid <= 1'b1;
+        o_csr_rdata <= ram_rd_data;
+        if (read_from_interrupt_control) begin
+          o_csr_rdata <= '0;
+          o_csr_rdata[DLA_DMA_CSR_INTERRUPT_ERROR_BIT] <= interrupt_control_error;
+          o_csr_rdata[DLA_DMA_CSR_INTERRUPT_DONE_BIT]  <= interrupt_control_done;
+        end
+        if (read_from_interrupt_mask) begin
+          o_csr_rdata <= '0;
+          o_csr_rdata[DLA_DMA_CSR_INTERRUPT_ERROR_BIT] <= interrupt_mask_error;
+          o_csr_rdata[DLA_DMA_CSR_INTERRUPT_DONE_BIT]  <= interrupt_mask_done;
+        end
+        if (read_from_desc_diagnostics) begin
+          o_csr_rdata <= '0;
+          o_csr_rdata[DLA_DMA_CSR_DESC_DIAGNOSTICS_OVERFLOW_BIT]          <= descriptor_diagnostics_overflow;
+          o_csr_rdata[DLA_DMA_CSR_DESC_DIAGNOSTICS_ALMOST_FULL_BIT]       <= descriptor_diagnostics_almost_full;
+          o_csr_rdata[DLA_DMA_CSR_DESC_DIAGNOSTICS_OUT_OF_INFERENCES_BIT] <= i_token_out_of_inferences;
+        end
+        if (read_from_completion_count)         o_csr_rdata <= completion_count;
+        if (read_from_clocks_active_lo)         o_csr_rdata <= total_clocks_active_lo;
+        if (read_from_clocks_active_hi)         o_csr_rdata <= total_clocks_active_hi;
+        if (read_from_core_clocks_active_lo)    o_csr_rdata <= total_core_clocks_active_lo;
+        if (read_from_core_clocks_active_hi)    o_csr_rdata <= total_core_clocks_active_hi;
+        if (read_from_clocks_all_jobs_lo)       o_csr_rdata <= total_clocks_for_all_jobs_lo;
+        if (read_from_clocks_all_jobs_hi)       o_csr_rdata <= total_clocks_for_all_jobs_hi;
+        if (read_from_input_feature_reads_lo)   o_csr_rdata <= number_of_input_feature_reads_lo;
+        if (read_from_input_feature_reads_hi)   o_csr_rdata <= number_of_input_feature_reads_hi;
+        if (read_from_input_filter_reads_lo)    o_csr_rdata <= number_of_input_filter_reads_lo;
+        if (read_from_input_filter_reads_hi)    o_csr_rdata <= number_of_input_filter_reads_hi;
+        if (read_from_output_feature_writes_lo) o_csr_rdata <= number_of_output_feature_writes_lo;
+        if (read_from_output_feature_writes_hi) o_csr_rdata <= number_of_output_feature_writes_hi;
+        if (read_from_debug_network_valid)      o_csr_rdata <= debug_network_rvalid; //read prefetch after dcfifo has valid data
+        if (read_from_debug_network_data) begin
+          o_csr_rdata <= debug_network_rdata; //read prefetch after dcfifo
+          debug_network_rready <= 1'b1;       //rdack the read prefetch
+        end
+        if (read_from_license_flag) o_csr_rdata <= i_license_flag;
+        if (read_from_ip_reset) o_csr_rdata <= '0; //this read will always return 0
+        if (read_ready_streaming_interface) o_csr_rdata <= o_streaming_active;
+
+        if (o_csr_rvalid && i_csr_rready) begin
+          o_csr_rvalid <= 1'b0;
+          state <= STATE_IDLE;
+        end
+      end
+
+      state[STATE_WRITE_ACCEPT_BIT]: begin
+        //o_csr_awready and o_csr_wready are asserted now, indicates csr has accepted a write
+        //ram_wr_addr valid now
+        previous_was_write <= 1'b1;
+        state <= STATE_WRITE_COMMIT;
+      end
+      state[STATE_WRITE_COMMIT_BIT]: begin
+        //write_to_ram valid now
+        ram_wr_en <= write_to_ram;
+        if (write_to_interrupt_control) begin //write 1 to clear
+          if (ram_wr_data[DLA_DMA_CSR_INTERRUPT_ERROR_BIT]) interrupt_control_error <= 1'b0;
+          if (ram_wr_data[DLA_DMA_CSR_INTERRUPT_DONE_BIT])  interrupt_control_done  <= 1'b0;
+        end
+        if (write_to_interrupt_mask) begin
+          interrupt_mask_error <= ram_wr_data[DLA_DMA_CSR_INTERRUPT_ERROR_BIT];
+          interrupt_mask_done  <= ram_wr_data[DLA_DMA_CSR_INTERRUPT_DONE_BIT];
+        end
+        if (write_to_debug_network_addr) begin
+          //don't care if dcfifo is full, handshaking scheme is already tolerant to debug network not responding to requests
+          debug_network_arvalid <= 1'b1;
+          debug_network_araddr  <= ram_wr_data;
+        end
+        o_csr_bvalid <= 1'b1;
+        if (o_csr_bvalid && i_csr_bready) begin
+          o_csr_bvalid <= 1'b0;
+          if (enqueue_descriptor) state <= STATE_DESCRIPTOR;
+          else if (write_to_ip_reset) state <= (ram_wr_data != '0) ? STATE_AWAIT_RESET : STATE_IDLE;
+          else if (write_ready_streaming_interface) begin
+            if (ram_wr_data == 1) begin
+              state <= STATE_IDLE;
+              if (~ENABLE_ON_CHIP_PARAMETERS) state <= STATE_DESCRIPTOR;
+              o_streaming_active <= 1'b1;
+            end else begin
+              state <= STATE_IDLE;
+              o_streaming_active <= 1'b0;
+            end
+          end
+          else state <= STATE_IDLE;
+        end
+        descriptor_count <= 0;
+      end
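+
+      //The next state walks descriptor_count from 0 to 4'ha: read addresses are
+      //issued on counts 0..7 and, because of the 3 clock RAM read latency noted
+      //below, the corresponding data is captured into descriptor_queue_data on
+      //counts 3..4'ha -- the two case statements below are the same schedule
+      //offset by 3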
+      state[STATE_DESCRIPTOR_BIT]: begin
+        descriptor_count <= descriptor_count + 1'b1;
+        case (descriptor_count)
+          4'h0: ram_rd_addr <= DLA_DMA_CSR_OFFSET_CONFIG_BASE_ADDR/4;       //addr gen 0: config reader base addr
+          4'h1: ram_rd_addr <= 'x;                                          //addr gen 1: token
+          4'h2: ram_rd_addr <= DLA_DMA_CSR_OFFSET_CONFIG_RANGE_MINUS_TWO/4; //addr gen 2: config reader num words minus two
+          4'h3: ram_rd_addr <= 'x;                                          //addr gen 3: addr update
+          4'h4: ram_rd_addr <= DLA_DMA_CSR_OFFSET_CONFIG_RANGE_MINUS_TWO/4; //intercept 0: config reader num words minus two
+          4'h5: ram_rd_addr <= DLA_DMA_CSR_OFFSET_CONFIG_BASE_ADDR/4;       //intercept 1: filter reader offset correction
+          4'h6: ram_rd_addr <= DLA_DMA_CSR_OFFSET_INPUT_OUTPUT_BASE_ADDR/4; //intercept 2: feature input/output offset
+          4'h7: ram_rd_addr <= DLA_DMA_CSR_OFFSET_INTERMEDIATE_BASE_ADDR/4; //intercept 3: feature intermediate offset
+          default: ram_rd_addr <= 'x;
+        endcase
+
+        //there are 3 clocks of latency from the time ram_rd_addr is set until ram_rd_data is valid
+        //this is why the config_reader struct in the dma/dual_inc folder has to be laid out in that order
+        case (descriptor_count)
+          4'h3: descriptor_queue_data <= {1'b0, ram_rd_data};      //addr gen 0: config reader base addr
+          4'h4: descriptor_queue_data <= '0;                       //addr gen 1: token
+          4'h5: descriptor_queue_data <= {1'b0, ram_rd_data};      //addr gen 2: config reader num words minus two
+          4'h6: descriptor_queue_data <= CONFIG_READER_DATA_BYTES; //addr gen 3: addr update
+          4'h7: descriptor_queue_data <= {1'b1, ram_rd_data};      //intercept 0: config reader num words minus two
+          4'h8: descriptor_queue_data <= {1'b1, ram_rd_data};      //intercept 1: filter reader offset correction
+          4'h9: descriptor_queue_data <= {1'b1, ram_rd_data};      //intercept 2: feature input/output offset
+          4'ha: descriptor_queue_data <= {1'b1, ram_rd_data};      //intercept 3: feature intermediate offset
+          default: descriptor_queue_data <= 'x;
+        endcase
+
+        descriptor_queue_forced_write <= (descriptor_count >= 4'h3);
+        if (descriptor_count == 4'ha) state <= STATE_IDLE;
+      end
+
+      state[STATE_AWAIT_RESET_BIT]: begin
+        //reset request was triggered by a CSR write
+        // -we completed the axi4-lite write response handshake in STATE_WRITE_COMMIT
+        // -we don't want to return to STATE_IDLE, since a new transaction might get initiated and then interrupted when reset hits
+        // -we should assert o_request_ip_reset for multiple cycles to ensure the async signal is synchronized into all clock domains
+        //so, just hang out here and wait for reset
+        o_request_ip_reset <= 1'b1;
+        state <= STATE_AWAIT_RESET;
+      end
+
+      default: begin
+        state <= STATE_IDLE;
+      end
+    endcase
+
+    //completion tracking
+    completion_count <= completion_count + i_token_done;
+
+    //interrupt tracking
+    if (i_token_error) interrupt_control_error <= 1'b1;
+    if (i_token_done)  interrupt_control_done  <= 1'b1;
+
+    //sticky bit for detecting if descriptor queue has overflowed
+    if (descriptor_queue_forced_write & descriptor_queue_full) descriptor_diagnostics_overflow <= 1'b1;
+
+    if (~i_sclrn_ddr) begin
+      //state
+      state <= STATE_IDLE;
+      previous_was_write <= 1'b0;
+
+      //AXI4-lite outputs to host control
+      o_csr_arready <= 1'b0;
+      o_csr_rvalid  <= 1'b0;
+      o_csr_awready <= 1'b0;
+      o_csr_wready  <= 1'b0;
+      o_csr_bvalid  <= 1'b0;
+
+      //ram
+      ram_wr_en <= 1'b0;
+
+      //specific offsets implemented in registers
+      interrupt_control_error <= 1'b0;
+      interrupt_control_done  <= 1'b0;
+      interrupt_mask_error    <= 1'b0;
+      interrupt_mask_done     <= 1'b0;
+      completion_count        <= '0;
+      descriptor_diagnostics_overflow <= 1'b0;
+
+      //descriptor queue
+      descriptor_queue_forced_write <= 1'b0;
+
+      //debug network
+      debug_network_arvalid <= 1'b0;
+      debug_network_rready  <= 1'b0;
+
+      //stop streaming reload
+      o_streaming_active <= 1'b0;
+    end
+  end
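+
+  //Host-side interrupt flow, for illustration (offset and bit names from
+  //dla_dma_constants.svh; the write-1-to-clear behavior is implemented in
+  //STATE_WRITE_COMMIT above):
+  //  1. write DLA_DMA_CSR_OFFSET_INTERRUPT_MASK with the DONE/ERROR bits to enable
+  //  2. on interrupt, read DLA_DMA_CSR_OFFSET_INTERRUPT_CONTROL to see which of
+  //     done/error fired
+  //  3. write 1s back to DLA_DMA_CSR_OFFSET_INTERRUPT_CONTROL to clear those
+  //     bits, which deasserts the level interrupt generated below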
+
+
+
+  //////////////////////////////////////////////////////////
+  //  Bring the level interrupt to the host clock domain  //
+  //////////////////////////////////////////////////////////
+
+  always_ff @(posedge clk_ddr) begin
+    ddr_interrupt_level <= 1'b0;
+    if (interrupt_mask_error & interrupt_control_error) ddr_interrupt_level <= 1'b1;
+    if (interrupt_mask_done  & interrupt_control_done ) ddr_interrupt_level <= 1'b1;
+  end
+
+  //this is a 3-stage register-based synchronizer
+  dla_clock_cross_full_sync dla_clock_cross_sync
+  (
+    .clk_src            (clk_ddr),
+    .i_src_async_resetn (1'b1),
+    .i_src_data         (ddr_interrupt_level),
+    .o_src_data         (),
+
+    .clk_dst            (clk_pcie),
+    .i_dst_async_resetn (1'b1),
+    .o_dst_data         (o_interrupt_level)
+  );
+
+
+
+  ///////////////////////////
+  //  Clock crossing FIFOs //
+  ///////////////////////////
+
+  localparam int DCFIFO_DEPTH = 32; //dcfifo is RAM-based, may as well use an entire MLAB
+
+  dla_acl_dcfifo #(
+    .WIDTH (8*CSR_DATA_BYTES),
+    .DEPTH (DCFIFO_DEPTH)
+  )
+  clock_cross_debug_network_request
+  (
+    .async_resetn (i_resetn_async), //reset synchronization is handled internally
+
+    //write side -- write is ignored if fifo is full, this is okay since debug network handshaking is fault tolerant
+    .wr_clock (clk_ddr),
+    .wr_req   (debug_network_arvalid),
+    .wr_data  (debug_network_araddr),
+
+    //read side
+    .rd_clock (clk_dla),
+    .rd_empty (not_o_debug_network_arvalid),
+    .rd_data  (o_debug_network_araddr),
+    .rd_ack   (i_debug_network_arready)
+  );
+  assign o_debug_network_arvalid = ~not_o_debug_network_arvalid;
+
+  dla_acl_dcfifo #(
+    .WIDTH (8*CSR_DATA_BYTES),
+    .DEPTH (DCFIFO_DEPTH)
+  )
+  clock_cross_debug_network_response
+  (
+    .async_resetn (i_resetn_async), //reset synchronization is handled internally
+
+    //write side
+    .wr_clock (clk_dla),
+    .wr_req   (i_debug_network_rvalid),
+    .wr_data  (i_debug_network_rdata),
+    .wr_full  (not_o_debug_network_rready),
+
+    //read side
+    .rd_clock (clk_ddr),
+    .rd_empty (debug_network_dcfifo_empty),
+    .rd_data  (debug_network_dcfifo_data),
+    .rd_ack   (~debug_network_dcfifo_empty) //consume read data immediately, cached in a read prefetch
+  );
+  assign o_debug_network_rready = ~not_o_debug_network_rready;
+
+  //cache the most recent value returned from the debug network
+  always_ff @(posedge clk_ddr) begin
+    if (~debug_network_dcfifo_empty) begin
+      debug_network_rdata  <= debug_network_dcfifo_data;
+      debug_network_rvalid <= 1'b1;
+    end
+    if (debug_network_rready) begin
+      debug_network_rvalid <= 1'b0;
+    end
+    if (~i_sclrn_ddr) begin
+      debug_network_rvalid <= 1'b0;
+    end
+  end
+
+endmodule
