summaryrefslogtreecommitdiff
path: root/python/openvino/demo/ip/intel_ai_ip/verilog/dla_output_streamer.sv
diff options
context:
space:
mode:
Diffstat (limited to 'python/openvino/demo/ip/intel_ai_ip/verilog/dla_output_streamer.sv')
-rw-r--r--python/openvino/demo/ip/intel_ai_ip/verilog/dla_output_streamer.sv432
1 files changed, 432 insertions, 0 deletions
diff --git a/python/openvino/demo/ip/intel_ai_ip/verilog/dla_output_streamer.sv b/python/openvino/demo/ip/intel_ai_ip/verilog/dla_output_streamer.sv
new file mode 100644
index 0000000..90b7fb8
--- /dev/null
+++ b/python/openvino/demo/ip/intel_ai_ip/verilog/dla_output_streamer.sv
@@ -0,0 +1,432 @@
+// Copyright 2020-2024 Intel Corporation.
+//
+// This software and the related documents are Intel copyrighted materials,
+// and your use of them is governed by the express license under which they
+// were provided to you ("License"). Unless the License provides otherwise,
+// you may not use, modify, copy, publish, distribute, disclose or transmit
+// this software or the related documents without Intel's prior written
+// permission.
+//
+// This software and the related documents are provided as is, with no express
+// or implied warranties, other than those that are expressly stated in the
+// License.
+
+// Description of functionality:
+// This module receives DLA data from the cross bar (essentially coming from the PE array)
+// in HWC format and dispatches it to an AXI stream interface.
+// A width adapter converts between the xbar_k_vec width and the AXI bus width (specified in the arch file
+// through output_stream_interface:bus_width).
+// The data also passes through a dual-clock FIFO, which acts as the clock crosser between clk_dla and
+// clk_axi. It also acts as a conversion from the simple
+// ready-valid protocol in DLA to an AXI stream protocol (most of the AXI signals won't be used).
+// Some control logic specifies which bytes are valid (through the tstrb signal). This is needed
+// when the number of output channels is not a multiple of k_vec. For example, if k_vec = c_vec = 8 and
+// output_channels (O_C) = 12, there are two transfers of size 8: the first transfer has 8 valid outputs,
+// but the second transfer has 4 valid outputs and 4 invalid outputs (zeros). t_strb indicates
+// which bytes (essentially which FP16 elements) are valid, and it is expected to be consumed by the
+// downstream blocks (the receiver of the AXI signals).
+
+`resetall
+`undefineall
+`default_nettype none
+`include "dla_acl_parameter_assert.svh"
+
+module dla_output_streamer import dla_common_pkg::*, dla_output_streamer_pkg::*; #(
+ // DLA (input data) side parameters
+ parameter int CONFIG_WIDTH = 32, // width in bits of i_config_data and i_config_flush_data
+ // AXI side parameters
+ parameter int TDATA_WIDTH = 128, // an integer number of bits (typically a power of 2 from 8 - 1024)
+ parameter int TID_WIDTH = 8, // recommended to be no more than 8.
+ parameter int TDEST_WIDTH = 8, // recommended to be no more than 8.
+
+ // Depth of the dual-clock data FIFO
+ parameter int FIFO_DEPTH = 1024,
+
+
+ parameter int INPUT_WIDTH_ELEMENTS = 1, // number of elements carried by one i_data word
+ parameter int INPUT_ELEMENT_WIDTH = 1, // width in bits of one element
+
+ // Decide whether width adaptation resides before or after the data CDC FIFO
+ localparam int INPUT_DATA_BITS = INPUT_WIDTH_ELEMENTS * INPUT_ELEMENT_WIDTH,
+ localparam int OUTPUT_WIDTH_ELEMENTS = TDATA_WIDTH / INPUT_ELEMENT_WIDTH, // elements per AXI transfer
+ localparam int WA_BEFORE_CDC = INPUT_WIDTH_ELEMENTS < OUTPUT_WIDTH_ELEMENTS, // 1: adapter precedes the FIFO, 0: adapter follows it
+
+ // DLA (input data) side derived parameters (none at present)
+
+ // AXI side derived parameters
+ localparam int TSTRB_WIDTH = TDATA_WIDTH / 8, // one strobe bit per TDATA byte
+ localparam int TUSER_WIDTH = TDATA_WIDTH / 8
+) (
+ // Master/driver (DLA) signals
+ input wire clk_dla,
+ input wire i_aresetn, // active-low reset; this module synchronizes it onto both clk_dla and clk_axi
+
+ // config input for output streaming
+ input wire [CONFIG_WIDTH-1:0] i_config_data,
+ input wire i_config_valid,
+ output logic o_config_ready, // low while the config FIFO reports almost-full
+
+ // input data
+ output logic o_ready, // backpressure to xbar
+ input wire i_valid, // valid from xbar
+ input wire [INPUT_DATA_BITS-1:0] i_data, // data from xbar, INPUT_WIDTH_ELEMENTS elements per word
+ input wire i_data_done, // data from xbar sent was the last one (the actual last data comes after WA)
+ output logic o_last_data_received, // single-cycle pulse: first input beat accepted after i_data_done
+
+ // config input for flush handling
+ input wire [CONFIG_WIDTH-1:0] i_config_flush_data,
+ input wire i_config_flush_valid,
+ output logic o_config_flush_ready,
+
+ // input signals for flush generation
+ output wire o_input_done, // xbar input for a layer is done and received
+
+ // Receiver (AXI) signals
+ input wire clk_axi,
+ input wire i_axi_aresetn, // NOTE(review): not referenced anywhere in this module; the clk_axi reset is derived from i_aresetn
+ output logic o_axi_t_valid, // indicates the Transmitter is driving a valid transfer
+ input wire i_axi_t_ready, // indicates that a Receiver can accept a transfer.
+ output wire o_axi_t_last, // indicates the boundary of a packet; driven only when the config marks the last stream
+ output wire [TDATA_WIDTH-1:0] o_axi_t_data, // the primary payload used to provide the data that is passing across the interface
+ output wire [TSTRB_WIDTH-1:0] o_axi_t_strb, // the byte qualifier that indicates whether the content of the associated byte of TDATA is valid
+ output wire [TSTRB_WIDTH-1:0] o_axi_t_keep, // Unused - no driver in this module
+ output wire [TID_WIDTH-1:0] o_axi_t_id, // Unused - data stream identifier; no driver in this module
+ output wire [TDEST_WIDTH-1:0] o_axi_t_dest, // Unused - provides routing information for the data stream; no driver in this module
+ output wire [TUSER_WIDTH-1:0] o_axi_t_user, // Unused - user-defined sideband information; no driver in this module
+ output wire o_axi_t_wakeup // Unused - identifies any activity associated with AXI-Stream interface; no driver in this module
+);
+ // Reset parameterization shared by both reset synchronizers below
+ localparam int RESET_USE_SYNCHRONIZER = 1;
+ localparam int RESET_PIPE_DEPTH = 3;
+ localparam int RESET_NUM_COPIES = 1;
+
+ // Per-clock-domain copies of the active-low reset; both are derived from i_aresetn
+ logic [RESET_NUM_COPIES-1:0] sync_dla_resetn; // clk_dla domain
+ logic [RESET_NUM_COPIES-1:0] sync_axi_resetn; // clk_axi domain
+
+ // ---- i_aresetn -> clk_dla ----
+ dla_reset_handler_simple #(
+   .NUM_COPIES       (RESET_NUM_COPIES),
+   .PIPE_DEPTH       (RESET_PIPE_DEPTH),
+   .USE_SYNCHRONIZER (RESET_USE_SYNCHRONIZER)
+ ) dla_resetn_synchronizer (
+   .i_resetn (i_aresetn),
+   .clk      (clk_dla),
+   .o_sclrn  (sync_dla_resetn)
+ );
+
+ // ---- i_aresetn -> clk_axi ----
+ dla_reset_handler_simple #(
+   .NUM_COPIES       (RESET_NUM_COPIES),
+   .PIPE_DEPTH       (RESET_PIPE_DEPTH),
+   .USE_SYNCHRONIZER (RESET_USE_SYNCHRONIZER)
+ ) axi_resetn_synchronizer (
+   .i_resetn (i_aresetn),
+   .clk      (clk_axi),
+   .o_sclrn  (sync_axi_resetn)
+ );
+
+ // Last-data tracking (clk_dla domain).
+ // i_data_done arms the tracker (xbar_sent_last_data). The next input beat that is accepted
+ // (i_valid & o_ready) while armed produces a single-cycle pulse on o_last_data_received and
+ // disarms the tracker. Priority, highest first: reset, accepted beat, i_data_done.
+ logic received_last_data;
+ logic xbar_sent_last_data;
+
+ always_ff @ (posedge clk_dla) begin
+   if (~sync_dla_resetn) begin
+     received_last_data  <= 1'b0;
+     xbar_sent_last_data <= 1'b0;
+   end else if (xbar_sent_last_data & i_valid & o_ready) begin
+     // armed and a beat was accepted: pulse and disarm (wins over a simultaneous i_data_done)
+     received_last_data  <= 1'b1;
+     xbar_sent_last_data <= 1'b0;
+   end else begin
+     received_last_data <= 1'b0;
+     if (i_data_done) begin
+       xbar_sent_last_data <= 1'b1;
+     end
+   end
+ end
+
+ assign o_last_data_received = received_last_data;
+
+ // Flush generation block (clk_dla domain). It receives its own config stream and watches the
+ // input handshake (i_valid / o_ready); its outputs are w_flush and o_input_done.
+ logic w_flush;
+
+ dla_output_streamer_flush_handler #(
+   .CONFIG_WIDTH (CONFIG_WIDTH)
+ ) flush_generator (
+   // clock / reset (already synchronized to clk_dla)
+   .clk_dla        (clk_dla),
+   .i_aresetn      (sync_dla_resetn[0]),
+   // flush configuration stream
+   .i_config_valid (i_config_flush_valid),
+   .i_config_data  (i_config_flush_data),
+   .o_config_ready (o_config_flush_ready),
+   // observed input handshake
+   .i_valid        (i_valid),
+   .i_ready        (o_ready),
+   // results
+   .o_flush        (w_flush),
+   .o_input_done   (o_input_done)
+ );
+
+ // Config path: a small dual-clock FIFO carries config words from clk_dla (write side)
+ // to clk_axi (read side), where they are assembled and used for strb generation.
+ logic config_is_loaded;
+ logic cfg_rd_empty, cfg_rd_ack, cfg_rd_amost_empty, cfg_wr_almost_full;
+ logic [CONFIG_WIDTH-1:0] cfg_rd_data;
+
+ dla_acl_dcfifo #(
+   .WIDTH              (CONFIG_WIDTH),
+   .DEPTH              (32),
+   .ALMOST_FULL_CUTOFF (2)
+ ) dla_acl_fifo_inst_cfg (
+   .async_resetn    (i_aresetn),           // dcfifo will synchronize the reset internally
+
+   // write side, clk_dla
+   .wr_clock        (clk_dla),
+   .wr_req          (i_config_valid),
+   .wr_data         (i_config_data),
+   .wr_full         (),                    // not used; upstream is throttled through almost-full instead
+   .wr_almost_full  (cfg_wr_almost_full),  // early warning, threshold set by ALMOST_FULL_CUTOFF
+
+   // read side, clk_axi
+   .rd_clock        (clk_axi),
+   .rd_ack          (~config_is_loaded),   // keep reading words until a full config is loaded; ignored when fifo is empty
+   .rd_data         (cfg_rd_data),
+   .rd_empty        (cfg_rd_empty),        // a read only occurs when ~rd_empty & rd_ack
+   .rd_almost_empty (cfg_rd_amost_empty)   // early warning, threshold set by ALMOST_EMPTY_CUTOFF
+ );
+
+ // Upstream may send config words as long as the FIFO is not close to full
+ assign o_config_ready = ~cfg_wr_almost_full;
+
+ // Reading side AXI clock: assemble cfg from the config FIFO, then count accepted output transfers
+ logic [CONFIG_WIDTH-1:0] config_offset; // index of the config word currently being loaded
+ output_streamer_config_t cfg; // assembled configuration, filled one CONFIG_WIDTH word at a time
+ localparam int NUM_CONFIG_OFFSETS = divCeil($bits(cfg), CONFIG_WIDTH); // number of config words that make up one cfg
+
+ // For now, ensure size of config is exact multiple of CONFIG_WIDTH
+ `DLA_ACL_PARAMETER_ASSERT($bits(cfg) == NUM_CONFIG_OFFSETS * CONFIG_WIDTH);
+
+ logic [CONFIG_WIDTH-1:0] config_total_transfers; // total number of axi_data transfers for a layer
+ logic [CONFIG_WIDTH-1:0] config_total_transfers_adjusted; // total number of axi_data transfers for a layer minus any invalid last transactions
+ logic [CONFIG_WIDTH-1:0] config_transfers_per_hw_pixel; // Decides the total number of transfers needed to send a full set of output channels
+ // for a single width/height pixel given a specific data_width for the axi interface.
+ logic [CONFIG_WIDTH-1:0] config_valid_bytes_stream_width; // Determines how many bytes of the last valid transfer of a pixel are valid (used as the t_strb bit count).
+ logic [CONFIG_WIDTH-1:0] config_last_index; // Determines index of last valid transaction per height/width.
+ logic [CONFIG_WIDTH-1:0] config_last_stream; // Determines if this stream is the last stream, to generate tlast
+ logic [CONFIG_WIDTH-1:0] channel_chunks_counter; // Counter for the config_transfers_per_hw_pixel
+ logic [CONFIG_WIDTH-1:0] total_counter_out; // Number of output transfers accepted so far for the current config
+ logic ostreamer_downstream_ready;
+
+ assign config_total_transfers = cfg.total_transfers;
+ assign config_total_transfers_adjusted = cfg.total_transfers_adjusted;
+ assign config_transfers_per_hw_pixel = cfg.transfers_per_hw_pixel;
+ assign config_valid_bytes_stream_width = cfg.valid_bytes_stream_width;
+ assign config_last_index = cfg.last_index;
+ assign config_last_stream = cfg.last_stream;
+ logic output_valid; // a transfer is available on the clk_axi side
+ logic output_tx_received; // the available transfer is being consumed this cycle
+ always_ff @(posedge clk_axi) begin
+ // config state machine: while no config is loaded, consume one config word per cycle when available
+ if (~config_is_loaded & ~cfg_rd_empty) begin
+ // update progress in accepting NUM_CONFIG_OFFSETS transactions
+ if (config_offset == NUM_CONFIG_OFFSETS-1) begin
+ config_offset <= '0;
+ config_is_loaded <= 1'b1;
+ end
+ else begin
+ config_offset <= config_offset + 1'b1;
+ end
+ cfg <= (cfg_rd_data[CONFIG_WIDTH-1:0] << ($bits(cfg) - CONFIG_WIDTH)) | (cfg >> CONFIG_WIDTH); // new word enters at the MSB end, earlier words shift toward the LSBs
+ end else begin
+ // keep track of how many transactions are read by AXI to drive t_strb
+ if (config_is_loaded & output_tx_received) begin
+ total_counter_out <= total_counter_out + 1;
+ if (total_counter_out == (config_total_transfers - 1)) begin // last transfer of this config: unload it and restart the count
+ config_is_loaded <= 1'b0;
+ total_counter_out <= '0;
+ end
+ channel_chunks_counter <= channel_chunks_counter + 1; // increment counter
+ if (channel_chunks_counter == (config_transfers_per_hw_pixel - 1)) begin
+ channel_chunks_counter <= '0; // wrap after the last transfer of a pixel
+ end
+ end
+ end
+ // resetn (synchronous, placed last so that it overrides the assignments above)
+ if (~sync_axi_resetn[0]) begin
+ config_is_loaded <= 1'b0;
+ channel_chunks_counter <= '0;
+ total_counter_out <= '0;
+ config_offset <= '0;
+ end
+ end
+
+ logic [TDATA_WIDTH-1:0] ostreamer_output_data; // TDATA-wide word presented to the AXI side by either generate branch
+ localparam int FIFO_CUTOFF = 0; // No need for slack cycles as the full goes back and gets handled in the same cycle in the width adapter
+
+ if (!WA_BEFORE_CDC) begin : GEN_WA_AFTER_DC_FIFO
+ // Here one input word holds at least as many elements as one AXI transfer, so the raw xbar words
+ // are written straight into the DC FIFO and the width adapter runs on clk_axi, on the FIFO read side
+ localparam int XBAR_WIDTH_BITS = INPUT_WIDTH_ELEMENTS * INPUT_ELEMENT_WIDTH;
+ logic [XBAR_WIDTH_BITS-1:0] fifo_data;
+ logic fifo_rd_empty, fifo_downstream_ready;
+ logic wr_full;
+
+ dla_acl_dcfifo #(
+ .DEPTH (FIFO_DEPTH),
+ .WIDTH (XBAR_WIDTH_BITS),
+ .ALMOST_FULL_CUTOFF (FIFO_CUTOFF)
+ ) dla_acl_fifo_inst (
+ .async_resetn (i_aresetn), // dcfifo will synchronize the reset internally
+ .wr_clock (clk_dla),
+ .wr_req (i_valid && o_ready),
+ .wr_data (i_data),
+ .wr_full (wr_full), // inform upstream that we cannot accept data
+
+ .rd_clock (clk_axi),
+ .rd_empty (fifo_rd_empty), // advertise to downstream that fifo is empty, a read only occurs when ~rd_empty & rd_ack
+ .rd_ack (fifo_downstream_ready), // read acknowledge from downstream, ignored when fifo is empty -- this is like an active low backpressure from downstream
+ .rd_data (fifo_data)
+ );
+
+ logic adapted_valid;
+ // Instantiate a width adapter to convert from xbar_k_vec width to AXI width
+ dla_width_adapter #(
+ .GROUP_NUM ( 1 ), // hardcoded
+ .GROUP_DELAY ( 0 ),
+ .INPUT_DATA_WIDTH_IN_ELEMENTS ( INPUT_WIDTH_ELEMENTS ),
+ .OUTPUT_DATA_WIDTH_IN_ELEMENTS ( OUTPUT_WIDTH_ELEMENTS),
+ .ELEMENT_WIDTH ( INPUT_ELEMENT_WIDTH ),
+ .FLUSH_ENABLE ( 0 )
+ ) wa_output_stream_inst (
+ .clock ( clk_axi ),
+ .i_aresetn ( i_aresetn ),
+ .i_flush ( 1'b0 ),
+ .o_din_ready ( fifo_downstream_ready ),
+ .i_din_valid ( ~fifo_rd_empty ),
+ .i_din_data ( fifo_data ),
+ .i_dout_ready ( ostreamer_downstream_ready ), // to be received from output streamer
+ .o_dout_valid ( adapted_valid ),
+ .o_dout_data ( ostreamer_output_data )
+ );
+ assign output_tx_received = adapted_valid & ostreamer_downstream_ready;
+ assign output_valid = adapted_valid;
+ // Upstream is backpressured only while the DC FIFO is full. Leftover invalid transactions are
+ // discarded on the clk_axi side instead: flush_exit_fifo forces ostreamer_downstream_ready high
+ assign o_ready = ~wr_full;
+
+ end else begin: GEN_WA_BEFORE_DC_FIFO
+ logic adapted_valid;
+ logic [TDATA_WIDTH-1:0] adapted_data;
+ logic wr_full; // dc fifo for data after width adaptation
+
+ // Instantiate a width adapter to convert from xbar_k_vec width to AXI width (runs on clk_dla, ahead of the FIFO)
+ dla_width_adapter #(
+ .GROUP_NUM ( 1 ), // hardcoded
+ .GROUP_DELAY ( 0 ),
+ .INPUT_DATA_WIDTH_IN_ELEMENTS ( INPUT_WIDTH_ELEMENTS ),
+ .OUTPUT_DATA_WIDTH_IN_ELEMENTS ( OUTPUT_WIDTH_ELEMENTS ),
+ .ELEMENT_WIDTH ( INPUT_ELEMENT_WIDTH ),
+ .FLUSH_ENABLE ( 1 )
+ ) wa_output_stream_inst (
+ .clock ( clk_dla ),
+ .i_aresetn ( i_aresetn ),
+ .i_flush ( w_flush ), // flush only activated with an incoming valid transaction
+ .o_din_ready ( o_ready ),
+ .i_din_valid ( i_valid ),
+ .i_din_data ( i_data ),
+ .i_dout_ready ( ~wr_full ), // adapter output is accepted whenever the DC FIFO is not full
+ .o_dout_valid ( adapted_valid ),
+ .o_dout_data ( adapted_data )
+ );
+
+ logic rd_empty;
+ // Instantiate the output FIFO to perform clock domain crossing
+ dla_acl_dcfifo #(
+ .DEPTH (FIFO_DEPTH),
+ .WIDTH (TDATA_WIDTH),
+ .ALMOST_FULL_CUTOFF (FIFO_CUTOFF)
+ ) dla_acl_fifo_inst (
+ .async_resetn (i_aresetn), // dcfifo will synchronize the reset internally
+ .wr_clock (clk_dla),
+ .wr_req (adapted_valid && !wr_full),
+ .wr_data (adapted_data),
+ .wr_almost_full (), // early indication to upstream that soon fifo may no longer be able to accept data, threshold controlled by ALMOST_FULL_CUTOFF
+ .wr_full (wr_full), // inform upstream that we cannot accept data
+
+ .rd_clock (clk_axi),
+ .rd_empty (rd_empty), // advertise to downstream that fifo is empty, a read only occurs when ~rd_empty & rd_ack
+ .rd_ack (ostreamer_downstream_ready), // read acknowledge from downstream, ignored when fifo is empty -- this is like an active low backpressure from downstream
+ .rd_data (ostreamer_output_data)
+ );
+ assign output_tx_received = ~rd_empty & ostreamer_downstream_ready;
+ assign output_valid = ~rd_empty;
+ end
+
+ // Convert FIFO interface to AXI signals
+ //
+ // output_valid / ostreamer_output_data are produced by the generate block above;
+ // ostreamer_downstream_ready is the accept/pop signal that goes back to it.
+ logic flush_exit_fifo; // signal used to flush the fifo out of any leftover invalid transactions
+ logic [TSTRB_WIDTH-1:0] strb_signal_normal, strb_signal_last;
+ logic null_beat; // the current beat carries no valid byte (t_strb is all zeros)
+
+ assign null_beat = (o_axi_t_strb == '0);
+
+ // A beat is only presented on the AXI stream when it carries at least one valid byte
+ // and the exit fifo is not being flushed.
+ assign o_axi_t_valid = output_valid & config_is_loaded & ~null_beat & ~flush_exit_fifo;
+
+ // The current beat is popped when
+ //   - it is a real beat and the receiver accepts it (i_axi_t_ready), or
+ //   - it is a null beat. Null beats are never presented (o_axi_t_valid stays low), so they
+ //     must not wait for i_axi_t_ready: an AXI-Stream receiver is allowed to hold TREADY low
+ //     until it sees TVALID, which would otherwise stall the stream forever, or
+ //   - the exit fifo is being flushed.
+ assign ostreamer_downstream_ready = (config_is_loaded & (i_axi_t_ready | null_beat)) | flush_exit_fifo;
+
+ assign o_axi_t_data = ostreamer_output_data;
+
+ // tlast accompanies the accepted transfer number config_total_transfers_adjusted of the last
+ // stream. config_last_stream is CONFIG_WIDTH bits wide; the result is truncated to one bit,
+ // so only its bit 0 takes effect.
+ assign o_axi_t_last = (total_counter_out == (config_total_transfers_adjusted - 1)) &
+                       output_valid & config_is_loaded & ostreamer_downstream_ready & config_last_stream;
+
+
+ // Control logic to produce the o_axi_t_strb signal
+ //   before config_last_index : every byte valid
+ //   at     config_last_index : the low config_valid_bytes_stream_width bytes valid
+ //   after  config_last_index : no byte valid (null beat)
+ //   while flushing           : no byte valid
+ assign strb_signal_normal = '1;
+ assign strb_signal_last = ((1 << config_valid_bytes_stream_width) - 1);
+ assign o_axi_t_strb = flush_exit_fifo ? '0 :
+                       channel_chunks_counter > config_last_index ? '0 :
+                       channel_chunks_counter == config_last_index ? strb_signal_last :
+                       strb_signal_normal;
+ //
+ // State machine that decides whether the output streamer is producing output from the
+ // exit fifo (ACTIVE) or flushing (emptying) the exit fifo (FLUSH).
+ // Flushing is needed when the last transactions coming from the xbar do not carry actual
+ // data (all zeros). In that case tlast must accompany the last transaction that holds valid
+ // data, and the invalid transactions behind it must be removed from the exit fifo.
+ //
+ // For example, if cvec=32 elements, axi=8 elements (128 bits), and channels=6, each cvec
+ // produces 4 axi transactions. In the last cvec transaction (4 axi transactions), the first
+ // one holds valid data and the remaining three hold zeros.
+ // Up until the 2024.3 release, tlast was produced on the last axi transaction (#4), which is
+ // not efficient since the last valid transaction happens three transactions earlier. With
+ // this state machine, tlast is produced together with the first of those four transactions;
+ // the state machine then enters FLUSH and empties the exit fifo of the three invalid ones.
+ //
+ // Both transitions are qualified with output_tx_received: the counter reaching its terminal
+ // value only means that the terminal beat is next, not that it has been consumed. Without the
+ // qualifier, a stalled receiver or a momentarily empty exit fifo would make the state machine
+ // enter FLUSH one cycle early and silently discard the last valid beat (the one carrying tlast).
+ //
+ typedef enum logic {
+   ACTIVE = 1'b0,
+   FLUSH = 1'b1
+ } state_t;
+ state_t state, state_next;
+
+ always_ff @(posedge clk_axi) begin
+   if (~sync_axi_resetn[0]) begin
+     state <= ACTIVE;
+   end else begin
+     state <= state_next;
+   end
+ end
+
+ // No flush is required when every transfer of the layer is a valid one
+ logic no_exit_fifo_flush_needed;
+ assign no_exit_fifo_flush_needed = config_is_loaded && (config_total_transfers == config_total_transfers_adjusted);
+
+ // The flush request depends on the current state only
+ always_comb begin
+   case (state)
+     FLUSH:   flush_exit_fifo = 1;
+     default: flush_exit_fifo = 0;
+   endcase
+ end
+
+ // Next-state logic
+ always_comb begin
+   state_next = state;
+   case (state)
+     ACTIVE: begin
+       if (no_exit_fifo_flush_needed) begin
+         state_next = ACTIVE;
+       end else if ((total_counter_out == config_total_transfers_adjusted - 1) && config_is_loaded && output_tx_received) begin
+         // the last valid beat has just been consumed; everything behind it is padding
+         state_next = FLUSH;
+       end
+     end
+     FLUSH: begin
+       if ((total_counter_out == config_total_transfers - 1) && config_is_loaded && output_tx_received) begin
+         // the last padding beat has just been popped
+         state_next = ACTIVE;
+       end
+     end
+     default: state_next = ACTIVE; // Default state
+   endcase
+ end
+
+endmodule