summaryrefslogtreecommitdiff
path: root/python/openvino/demo/ip/intel_ai_ip/verilog/dla_output_streamer.sv
blob: 90b7fb8dfb7a294e71c5ae48c1f28206fa3c4f00 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
// Copyright 2020-2024 Intel Corporation.
//
// This software and the related documents are Intel copyrighted materials,
// and your use of them is governed by the express license under which they
// were provided to you ("License"). Unless the License provides otherwise,
// you may not use, modify, copy, publish, distribute, disclose or transmit
// this software or the related documents without Intel's prior written
// permission.
//
// This software and the related documents are provided as is, with no express
// or implied warranties, other than those that are expressly stated in the
// License.

// Description of functionality:
// This module receives DLA data from the crossbar (essentially coming from the PE array)
// in HWC format and dispatches it to an AXI stream interface.
// A width adapter converts between the xbar_k_vec width and the AXI bus width (specified in the arch file
// through output_stream_interface:bus_width), and a dual-clock FIFO acts as the clock crosser between
// clk_dla and clk_axi. Depending on the relative widths, the width adapter sits either before or after
// the FIFO. The FIFO also acts as a conversion from the simple ready-valid protocol in DLA to an
// AXI stream protocol (most of the AXI stream signals won't be used).
// Some control logic specifies which bytes are valid (through the tstrb signal), which is needed
// when the number of output channels is not a multiple of k_vec. For example, if k_vec = c_vec = 8 and
// output_channels (O_C) = 12, there are two transfers of size 8: the first transfer has 8 valid outputs,
// but the second transfer has 4 valid outputs and 4 invalid outputs (zeros). tstrb indicates
// which bytes (essentially which FP16 elements) are valid, and it is expected to be consumed by the
// downstream blocks (the receiver of the AXI signals).

`resetall
`undefineall
`default_nettype none
`include "dla_acl_parameter_assert.svh"

// dla_output_streamer
//
// Takes ready/valid data words from the crossbar on clk_dla and presents them as AXI-Stream
// transfers on clk_axi. Per-layer configuration words arrive on clk_dla through the
// i_config_* port and are carried to the clk_axi side through a small dual-clock FIFO.
//
// Notes on ports, as visible in this module body:
//   - i_axi_aresetn is declared but never read; both reset synchronizers are fed from i_aresetn.
//   - o_axi_t_keep, o_axi_t_id, o_axi_t_dest, o_axi_t_user and o_axi_t_wakeup are declared but
//     never driven inside this module.
module dla_output_streamer import dla_common_pkg::*, dla_output_streamer_pkg::*; #(
  // DLA (input data) side parameters
  parameter   int CONFIG_WIDTH                  = 32,   // width of one configuration word
  // AXI side parameters
  parameter   int TDATA_WIDTH                   = 128,  // an integer number of bits (typically a power of 2 from 8 - 1024)
  parameter   int TID_WIDTH                     = 8,    // recommended to be no more than 8.
  parameter   int TDEST_WIDTH                   = 8,    // recommended to be no more than 8.

  // Data DC FIFO Depth
  parameter   int FIFO_DEPTH                    = 1024,


  parameter   int INPUT_WIDTH_ELEMENTS          = 1,    // number of elements in one i_data word
  parameter   int INPUT_ELEMENT_WIDTH           = 1,    // width in bits of one element

  // Decide if width adaptation resides before or after the data CDC FIFO:
  // it is placed before the FIFO only when the input word holds fewer elements than one AXI transfer.
  localparam  int INPUT_DATA_BITS               = INPUT_WIDTH_ELEMENTS * INPUT_ELEMENT_WIDTH,
  localparam  int OUTPUT_WIDTH_ELEMENTS         = TDATA_WIDTH / INPUT_ELEMENT_WIDTH,
  localparam  int WA_BEFORE_CDC                 = INPUT_WIDTH_ELEMENTS < OUTPUT_WIDTH_ELEMENTS,

  // DLA (input data) side derived parameters

  // AXI side derived  parameters
  localparam  int TSTRB_WIDTH                   = TDATA_WIDTH / 8,  // one strobe bit per TDATA byte
  localparam  int TUSER_WIDTH                   = TDATA_WIDTH / 8
) (
  // Master/driver (DLA) signals
  input  wire                                 clk_dla,
  input  wire                                 i_aresetn,    // active-low reset; synchronized internally onto clk_dla and clk_axi

  // config input for output streaming
  input  wire  [CONFIG_WIDTH-1:0]             i_config_data,
  input  wire                                 i_config_valid,
  output logic                                o_config_ready, // low while the config FIFO reports almost-full

  // input data
  output logic                                o_ready,      // backpressure to xbar
  input  wire                                 i_valid,      // valid from xbar
  input  wire  [INPUT_DATA_BITS-1:0]          i_data,       // data from xbar (INPUT_WIDTH_ELEMENTS * INPUT_ELEMENT_WIDTH bits)
  input wire                                  i_data_done,  // data from xbar sent was the last one (the actual last data comes after WA)
  output logic                                o_last_data_received, // one-cycle pulse: an input word was accepted after i_data_done was seen

  // config input for flush handling
  input  wire  [CONFIG_WIDTH-1:0]             i_config_flush_data,
  input  wire                                 i_config_flush_valid,
  output logic                                o_config_flush_ready,

  // input signals for flush generation
  output wire                                 o_input_done,       // xbar input for a layer is done and received

  // Receiver (AXI) signals
  input wire                                  clk_axi,
  input wire                                  i_axi_aresetn,      // NOTE(review): not referenced anywhere in this module
  output logic                                o_axi_t_valid,      // indicates the Transmitter is driving a valid transfer
  input wire                                  i_axi_t_ready,      // indicates that a Receiver can accept a transfer.
  output wire                                 o_axi_t_last,       // marks the last valid transfer when the loaded config flags this stream as the last one
  output wire [TDATA_WIDTH-1:0]               o_axi_t_data,       // the primary payload used to provide the data that is passing across the interface
  output wire [TSTRB_WIDTH-1:0]               o_axi_t_strb,       // the byte qualifier that indicates whether the content of the associated byte of TDATA is valid
  output wire [TSTRB_WIDTH-1:0]               o_axi_t_keep,       // Unused (not driven in this module)
  output wire [TID_WIDTH-1:0]                 o_axi_t_id,         // Unused (not driven in this module) - data stream identifier
  output wire [TDEST_WIDTH-1:0]               o_axi_t_dest,       // Unused (not driven in this module) - provides routing information for the data stream
  output wire [TUSER_WIDTH-1:0]               o_axi_t_user,       // Unused (not driven in this module) - user-defined sideband information that can be transmitted along the data stream.
  output wire                                 o_axi_t_wakeup      // Unused (not driven in this module) - identifies any activity associated with AXI-Stream interface
);
    // Reset parameterization shared by both synchronizer instances below.
    // RESET_NUM_COPIES sets the width of the synchronized reset vectors; with a value of 1
    // each vector has a single bit.
    localparam int RESET_USE_SYNCHRONIZER = 1;
    localparam int RESET_PIPE_DEPTH       = 3;
    localparam int RESET_NUM_COPIES       = 1;

    //////////////////////////////////////////
    //  Reset Synchronization onto DLA clk  //
    /////////////////////////////////////////

    // Active-low reset used by the clk_dla logic in this module.
    logic [RESET_NUM_COPIES-1:0] sync_dla_resetn;

    dla_reset_handler_simple #(
        .USE_SYNCHRONIZER   (RESET_USE_SYNCHRONIZER),
        .PIPE_DEPTH         (RESET_PIPE_DEPTH),
        .NUM_COPIES         (RESET_NUM_COPIES)
    ) dla_resetn_synchronizer (
        .clk                (clk_dla),
        .i_resetn           (i_aresetn),
        .o_sclrn            (sync_dla_resetn)
    );

    //////////////////////////////////////////
    //  Reset Synchronization onto AXI clk  //
    /////////////////////////////////////////

    // Active-low reset used by the clk_axi logic in this module.
    logic [RESET_NUM_COPIES-1:0] sync_axi_resetn;

    // NOTE(review): the source reset here is i_aresetn, the same one used for the clk_dla side;
    // the i_axi_aresetn port is not used. Confirm this single-reset scheme is intended.
    dla_reset_handler_simple #(
        .USE_SYNCHRONIZER   (RESET_USE_SYNCHRONIZER),
        .PIPE_DEPTH         (RESET_PIPE_DEPTH),
        .NUM_COPIES         (RESET_NUM_COPIES)
    ) axi_resetn_synchronizer (
        .clk                (clk_axi),
        .i_resetn           (i_aresetn),
        .o_sclrn            (sync_axi_resetn)
    );

    // Last-data tracking (clk_dla domain).
    //   xbar_sent_last_data : sticky flag, set when i_data_done is seen and cleared once the
    //                         next input word is accepted (i_valid & o_ready).
    //   received_last_data  : one-cycle pulse raised on that acceptance; drives
    //                         o_last_data_received.
    // If i_data_done and the acceptance of a flagged word coincide, the clear wins (same
    // priority as the original last-assignment-wins ordering).
    logic received_last_data;
    logic xbar_sent_last_data;
    always_ff @ (posedge clk_dla) begin
      // default: the pulse lasts a single cycle
      received_last_data <= 1'b0;
      if (xbar_sent_last_data & i_valid & o_ready) begin
        received_last_data  <= 1'b1;
        xbar_sent_last_data <= 1'b0;
      end else if (i_data_done) begin
        xbar_sent_last_data <= 1'b1;
      end
      // synchronous reset, highest priority. Index the reset vector explicitly, as the other
      // users of the synchronized resets in this module do.
      if (~sync_dla_resetn[0]) begin
        received_last_data  <= 1'b0;
        xbar_sent_last_data <= 1'b0;
      end
    end
    assign o_last_data_received = received_last_data;

    // Flush request produced by the flush handler; within this module it is only consumed by
    // the width adapter of the GEN_WA_BEFORE_DC_FIFO branch.
    logic w_flush;
    // Instantiate the flush generation block.
    // It runs on clk_dla with the synchronized DLA reset, receives its own configuration
    // stream (i_config_flush_*), observes the input handshake (i_valid / o_ready) and drives
    // both w_flush and the o_input_done output.
    dla_output_streamer_flush_handler # (
        .CONFIG_WIDTH(CONFIG_WIDTH)
    ) flush_generator (
        .clk_dla(clk_dla),
        .i_aresetn(sync_dla_resetn[0]),
        .i_config_data(i_config_flush_data),
        .i_config_valid(i_config_flush_valid),
        .o_config_ready(o_config_flush_ready),
        .i_ready(o_ready),
        .i_valid(i_valid),
        .o_flush(w_flush),
        .o_input_done(o_input_done)
    );

    // Handle Config data and strb generation
    // Configuration words are written on clk_dla and read on clk_axi through a small
    // dual-clock FIFO. A word is popped whenever the FIFO is not empty and the current
    // configuration has not been fully loaded yet (rd_ack = ~config_is_loaded).
    logic config_is_loaded;   // set on clk_axi once a complete configuration has been shifted in
    logic cfg_rd_empty, cfg_rd_almost_empty, cfg_wr_almost_full;
    logic [CONFIG_WIDTH-1:0] cfg_rd_data;
    dla_acl_dcfifo #(
        .DEPTH                  (32),
        .WIDTH                  (CONFIG_WIDTH),
        .ALMOST_FULL_CUTOFF     (2)
    ) dla_acl_fifo_inst_cfg (
        .async_resetn           (i_aresetn),  // dcfifo will synchronize the reset internally
        // Writing side dla_clk
        .wr_clock               (clk_dla),
        .wr_req                 (i_config_valid),
        .wr_data                (i_config_data),
        .wr_almost_full         (cfg_wr_almost_full), // early indication to upstream that soon fifo may no longer be able to accept data, threshold controlled by ALMOST_FULL_CUTOFF
        .wr_full                (), // not used: backpressure is derived from wr_almost_full instead

        // Reading side AXI clock
        .rd_clock               (clk_axi),
        .rd_empty               (cfg_rd_empty),  // advertise to downstream that fifo is empty, a read only occurs when ~rd_empty & rd_ack
        .rd_ack                 (~config_is_loaded), // read acknowledge from downstream, ignored when fifo is empty -- this is like an active low backpressure from downstream
        .rd_data                (cfg_rd_data),
        .rd_almost_empty        (cfg_rd_almost_empty) // early indication to downstream that soon fifo may no longer be able to supply data; not used by this module
    );

    // Upstream is told to stop sending config words as soon as the FIFO reports almost-full.
    assign o_config_ready = ~cfg_wr_almost_full;

    // Reading side AXI clock
    // The configuration struct is assembled on clk_axi from NUM_CONFIG_OFFSETS words of
    // CONFIG_WIDTH bits each; config_offset counts the words received so far.
    logic   [CONFIG_WIDTH-1:0] config_offset;
    output_streamer_config_t cfg;
    // divCeil is not defined in this file (presumably provided by one of the imported packages).
    localparam int NUM_CONFIG_OFFSETS = divCeil($bits(cfg), CONFIG_WIDTH);

    // For now, ensure size of config is exact multiple of CONFIG_WIDTH
    `DLA_ACL_PARAMETER_ASSERT($bits(cfg) == NUM_CONFIG_OFFSETS * CONFIG_WIDTH);

    logic [CONFIG_WIDTH-1:0]     config_total_transfers;                // total number of axi_data transfers for a layer
    logic [CONFIG_WIDTH-1:0]     config_total_transfers_adjusted;       // total number of axi_data transfers for a layer minus any invalid last transactions
    logic [CONFIG_WIDTH-1:0]     config_transfers_per_hw_pixel;         // Decides the total number of transfers needed to send a full set of output channels
                                                                        // for a single width/height pixel given a specific data_width for the axi interface.
    logic [CONFIG_WIDTH-1:0]     config_valid_bytes_stream_width;       // Determines how many elements of the last transfer are valid.
    logic [CONFIG_WIDTH-1:0]     config_last_index;                     // Determines index of last valid transaction per height/width.
    logic [CONFIG_WIDTH-1:0]     config_last_stream;                    // Determines if this stream is the last stream, to generate tlast
    logic [CONFIG_WIDTH-1:0]     channel_chunks_counter;                   // Counts accepted transfers within one pixel; wraps at config_transfers_per_hw_pixel
    logic [CONFIG_WIDTH-1:0]     total_counter_out;                     // Counts accepted transfers within one layer; wraps at config_total_transfers
    logic                        ostreamer_downstream_ready;            // read-acknowledge towards the data path (AXI ready or exit-fifo flush)

    // Unpack the loaded configuration struct into individually named fields.
    assign config_total_transfers = cfg.total_transfers;
    assign config_total_transfers_adjusted = cfg.total_transfers_adjusted;
    assign config_transfers_per_hw_pixel = cfg.transfers_per_hw_pixel;
    assign config_valid_bytes_stream_width = cfg.valid_bytes_stream_width;
    assign config_last_index = cfg.last_index;
    assign config_last_stream = cfg.last_stream;
    // Both are driven inside the generate region that instantiates the data path:
    logic output_valid;        // the data path has a word available for AXI
    logic output_tx_received;  // a word is being taken from the data path this cycle
    // Configuration loader and transfer counters (clk_axi domain).
    //
    // Loading: while no configuration is loaded and the config FIFO has data, every cycle
    // shifts one CONFIG_WIDTH word into the top of cfg and moves the words received earlier
    // down by CONFIG_WIDTH bits. After NUM_CONFIG_OFFSETS words config_is_loaded is raised.
    //
    // Counting: once loaded, each accepted transfer (output_tx_received) advances
    //   total_counter_out      - wraps after config_total_transfers transfers and, on that
    //                            wrap, drops config_is_loaded so the next config can load;
    //   channel_chunks_counter - wraps after config_transfers_per_hw_pixel transfers.
    //
    // The synchronous reset is applied last so it overrides the assignments above; cfg itself
    // is not reset.
    always_ff @(posedge clk_axi) begin
        if (~config_is_loaded & ~cfg_rd_empty) begin
            // take one more configuration word
            cfg <= (cfg_rd_data[CONFIG_WIDTH-1:0] << ($bits(cfg) - CONFIG_WIDTH)) | (cfg >> CONFIG_WIDTH);
            if (config_offset == NUM_CONFIG_OFFSETS-1) begin
                // that was the final word of this configuration
                config_is_loaded <= 1'b1;
                config_offset    <= '0;
            end else begin
                config_offset    <= config_offset + 1'b1;
            end
        end else if (config_is_loaded & output_tx_received) begin
            // per-layer transfer counter
            if (total_counter_out == (config_total_transfers - 1)) begin
                total_counter_out <= '0;
                config_is_loaded  <= 1'b0;
            end else begin
                total_counter_out <= total_counter_out + 1;
            end
            // per-pixel transfer counter, used to select the strobe pattern
            if (channel_chunks_counter == (config_transfers_per_hw_pixel - 1)) begin
                channel_chunks_counter <= '0;
            end else begin
                channel_chunks_counter <= channel_chunks_counter + 1;
            end
        end

        // synchronous reset (highest priority)
        if (~sync_axi_resetn[0]) begin
            config_offset          <= '0;
            total_counter_out      <= '0;
            channel_chunks_counter <= '0;
            config_is_loaded       <= 1'b0;
        end
    end

    // Data word presented to the AXI side; driven by whichever generate branch is elaborated.
    logic [TDATA_WIDTH-1:0] ostreamer_output_data;
    localparam  int FIFO_CUTOFF = 0; // No need for slack cycles as the full goes back and gets handled in the same cycle in the width adapter

    // Data path: a width adapter plus a dual-clock FIFO, in one of two orders selected by
    // WA_BEFORE_CDC. Each branch drives ostreamer_output_data, output_valid,
    // output_tx_received and o_ready.
    if (!WA_BEFORE_CDC) begin : GEN_WA_AFTER_DC_FIFO
        // Selected when the input word holds at least as many elements as one AXI transfer.
        // The FIFO stores the raw crossbar words and the width adapter runs on clk_axi,
        // so that the upstream IP can continue producing data that goes straight into the fifo.
        // (Original note: the width adaptation is meant to happen in the slow clock domain -
        // presumably clk_axi in this configuration; confirm.)
        localparam int XBAR_WIDTH_BITS = INPUT_WIDTH_ELEMENTS * INPUT_ELEMENT_WIDTH;
        logic [XBAR_WIDTH_BITS-1:0] fifo_data;
        logic                       fifo_rd_empty, fifo_downstream_ready;
        logic                       wr_full;

        // Clock-crossing FIFO, written with crossbar words on clk_dla and read on clk_axi.
        dla_acl_dcfifo #(
            .DEPTH                  (FIFO_DEPTH),
            .WIDTH                  (XBAR_WIDTH_BITS),
            .ALMOST_FULL_CUTOFF     (FIFO_CUTOFF)
        ) dla_acl_fifo_inst (
            .async_resetn           (i_aresetn),  // dcfifo will synchronize the reset internally
            .wr_clock               (clk_dla),
            .wr_req                 (i_valid && o_ready),
            .wr_data                (i_data),
            .wr_full                (wr_full), // inform upstream that we cannot accept data

            .rd_clock               (clk_axi),
            .rd_empty               (fifo_rd_empty),  // advertise to downstream that fifo is empty, a read only occurs when ~rd_empty & rd_ack
            .rd_ack                 (fifo_downstream_ready), // read acknowledge from downstream, ignored when fifo is empty -- this is like an active low backpressure from downstream
            .rd_data                (fifo_data)
        );

        logic                                adapted_valid;
        // Instantiate a width adapter to convert from xbar_k_vec width to AXI width.
        // Flushing is disabled in this branch (FLUSH_ENABLE = 0, i_flush tied low), so w_flush
        // is not used here.
        dla_width_adapter #(
            .GROUP_NUM                     ( 1                    ), // hardcoded
            .GROUP_DELAY                   ( 0                    ),
            .INPUT_DATA_WIDTH_IN_ELEMENTS  ( INPUT_WIDTH_ELEMENTS ),
            .OUTPUT_DATA_WIDTH_IN_ELEMENTS ( OUTPUT_WIDTH_ELEMENTS),
            .ELEMENT_WIDTH                 ( INPUT_ELEMENT_WIDTH  ),
            .FLUSH_ENABLE                  ( 0                    )
        ) wa_output_stream_inst (
            .clock        ( clk_axi                    ),
            .i_aresetn    ( i_aresetn                  ),
            .i_flush      ( 1'b0                       ),
            .o_din_ready  ( fifo_downstream_ready      ),
            .i_din_valid  ( ~fifo_rd_empty             ),
            .i_din_data   ( fifo_data                  ),
            .i_dout_ready ( ostreamer_downstream_ready ), // to be received from output streamer
            .o_dout_valid ( adapted_valid              ),
            .o_dout_data  ( ostreamer_output_data      )
        );
        // A transfer is counted when the adapter has a word and the downstream side takes it.
        assign output_tx_received = adapted_valid & ostreamer_downstream_ready;
        assign output_valid = adapted_valid;
        // We backpressure the upstream only while the fifo is full. Leftover invalid
        // transactions (that might come out of cvec != axi) are not handled here.
        assign o_ready = ~wr_full;

    end else begin: GEN_WA_BEFORE_DC_FIFO
        // Selected when the input word holds fewer elements than one AXI transfer.
        // The width adapter runs on clk_dla and the FIFO stores AXI-width words.
        logic                     adapted_valid;
        logic [TDATA_WIDTH-1:0]   adapted_data;
        logic wr_full; // dc fifo for data after width adaptation

        // Instantiate a width adapter to convert from xbar_k_vec width to AXI width.
        // In this branch the adapter drives o_ready directly and takes the flush request
        // from the flush handler.
        dla_width_adapter #(
            .GROUP_NUM                     ( 1                     ), // hardcoded
            .GROUP_DELAY                   ( 0                     ),
            .INPUT_DATA_WIDTH_IN_ELEMENTS  ( INPUT_WIDTH_ELEMENTS  ),
            .OUTPUT_DATA_WIDTH_IN_ELEMENTS ( OUTPUT_WIDTH_ELEMENTS ),
            .ELEMENT_WIDTH                 ( INPUT_ELEMENT_WIDTH   ),
            .FLUSH_ENABLE                  ( 1                     )
        ) wa_output_stream_inst (
            .clock        ( clk_dla        ),
            .i_aresetn    ( i_aresetn      ),
            .i_flush      ( w_flush        ),  // flush only activated with an incoming valid transaction
            .o_din_ready  ( o_ready        ),
            .i_din_valid  ( i_valid        ),
            .i_din_data   ( i_data         ),
            .i_dout_ready ( ~wr_full       ), // adapter output is accepted whenever the fifo is not full
            .o_dout_valid ( adapted_valid  ),
            .o_dout_data  ( adapted_data   )
        );

        logic rd_empty;
        // Instantiate the output FIFO to perform clock domain crossing
        dla_acl_dcfifo #(
            .DEPTH                  (FIFO_DEPTH),
            .WIDTH                  (TDATA_WIDTH),
            .ALMOST_FULL_CUTOFF     (FIFO_CUTOFF)
        ) dla_acl_fifo_inst (
            .async_resetn           (i_aresetn),  // dcfifo will synchronize the reset internally
            .wr_clock               (clk_dla),
            .wr_req                 (adapted_valid && !wr_full),
            .wr_data                (adapted_data),
            .wr_almost_full         (), // early indication to upstream that soon fifo may no longer be able to accept data, threshold controlled by ALMOST_FULL_CUTOFF
            .wr_full                (wr_full), // inform upstream that we cannot accept data

            .rd_clock               (clk_axi),
            .rd_empty               (rd_empty),  // advertise to downstream that fifo is empty, a read only occurs when ~rd_empty & rd_ack
            .rd_ack                 (ostreamer_downstream_ready), // read acknowledge from downstream, ignored when fifo is empty -- this is like an active low backpressure from downstream
            .rd_data                (ostreamer_output_data)
        );
        // A transfer is counted when the fifo has a word and the downstream side takes it.
        assign output_tx_received = ~rd_empty & ostreamer_downstream_ready;
        assign output_valid = ~rd_empty;
    end

    // Convert the FIFO / width-adapter interface to AXI-Stream signals (clk_axi domain).
    logic flush_exit_fifo; // signal used to flush the fifo out of any leftover invalid transactions
    logic [TSTRB_WIDTH-1:0] strb_signal_normal, strb_signal_last;

    // Named conditions shared by the expressions below.
    logic at_last_valid_transfer;  // the transfer currently offered is the last valid one of the layer
    logic chunk_past_last_index;   // this chunk of the pixel carries no valid bytes at all
    logic chunk_at_last_index;     // this chunk of the pixel is the partially valid one

    assign at_last_valid_transfer = (total_counter_out == (config_total_transfers_adjusted - 1));
    assign chunk_past_last_index  = (channel_chunks_counter > config_last_index);
    assign chunk_at_last_index    = (channel_chunks_counter == config_last_index);

    // Control logic to produce the o_axi_t_strb signal:
    //   - all zeros while flushing, or for chunks beyond the last valid chunk of a pixel;
    //   - the low config_valid_bytes_stream_width bits set for the last valid chunk;
    //   - all ones otherwise.
    assign strb_signal_normal = '1;
    assign strb_signal_last = ((1 << config_valid_bytes_stream_width) - 1);
    assign o_axi_t_strb = flush_exit_fifo       ? '0 :
                          chunk_past_last_index ? '0 :
                          chunk_at_last_index   ? strb_signal_last :
                                                  strb_signal_normal;

    assign o_axi_t_data = ostreamer_output_data;

    // A transfer is only offered on AXI when it carries at least one valid byte and the
    // exit fifo is not being flushed.
    assign o_axi_t_valid = output_valid & config_is_loaded & (o_axi_t_strb != '0) & ~flush_exit_fifo;

    // Words are pulled from the data path either by the AXI receiver (once a config is loaded)
    // or unconditionally while flushing.
    assign ostreamer_downstream_ready = flush_exit_fifo || (i_axi_t_ready & config_is_loaded);

    // tlast accompanies the last valid transfer of the layer. The 1-bit result of the original
    // bitwise AND with the CONFIG_WIDTH-wide config_last_stream only ever depended on bit 0 of
    // that field; the bit select makes this explicit.
    assign o_axi_t_last = at_last_valid_transfer & output_valid & config_is_loaded &
                          ostreamer_downstream_ready & config_last_stream[0];
    //
    // State machine that decides whether the output streamer is producing output from the
    // exit fifo (ACTIVE) or flushing (emptying) it (FLUSH).
    //
    // The flush is needed when the last transactions coming from the xbar do not carry actual
    // data (all zeros). In that case tlast must accompany the last transaction that holds
    // valid data, and the trailing invalid transactions must be removed from the exit fifo.
    //
    // For example, if cvec=32 elements, axi=8 elements (128 bits) and channels=6, each cvec
    // word produces 4 axi transactions. In the last cvec word the first axi transaction holds
    // valid data and the remaining three hold zeros.
    // Up until the 2024.3 release tlast was produced on the last axi transaction (#4), which is
    // not efficient since the last valid transaction happens three transactions earlier.
    // With the state machine, tlast is produced on the first of those last four transactions;
    // the state machine then enters FLUSH and drains the three invalid transactions.
    //
    // Both transitions are qualified with output_tx_received (a word is actually being taken
    // from the data path in this cycle):
    //   - ACTIVE -> FLUSH only once the last valid transfer has been accepted. Without this,
    //     back-pressure (i_axi_t_ready low) or an empty fifo in the first cycle where
    //     total_counter_out == adjusted-1 would move to FLUSH, drop o_axi_t_valid before the
    //     handshake and discard the last valid transfer together with its tlast.
    //   - FLUSH -> ACTIVE only once the final invalid transfer has been drained, so draining
    //     never ends up depending on i_axi_t_ready after tlast has been sent.
    typedef enum logic  {
            ACTIVE = 1'b0,
            FLUSH = 1'b1
        } state_t;
    state_t state, state_next;

    // State register with synchronous reset on the AXI clock.
    always_ff @(posedge clk_axi) begin
        if (~sync_axi_resetn[0]) begin
            state <= ACTIVE;
        end else begin
            state <= state_next;
        end
    end

    // When the adjusted and the total transfer counts match there are no trailing invalid
    // transactions, so the FLUSH state is never entered for this configuration.
    logic no_exit_fifo_flush_needed;

    assign no_exit_fifo_flush_needed = config_is_loaded && (config_total_transfers  ==  config_total_transfers_adjusted);
    always_comb begin
      state_next = state;
      flush_exit_fifo = 0;
      case(state)
        ACTIVE: begin
          flush_exit_fifo = 0;
          if (no_exit_fifo_flush_needed) begin
            state_next = ACTIVE;
          end else if ((total_counter_out == config_total_transfers_adjusted - 1) &&
                       config_is_loaded && output_tx_received) begin
            // the last valid transfer is being accepted in this cycle
            state_next = FLUSH;
          end
        end
        FLUSH: begin
          flush_exit_fifo = 1;
          if ((total_counter_out == config_total_transfers - 1) &&
              config_is_loaded && output_tx_received) begin
            // the last invalid transfer is being drained in this cycle
            state_next = ACTIVE;
          end
        end
        default: state_next = ACTIVE; // Default state
      endcase
    end

endmodule