summaryrefslogtreecommitdiff
path: root/python/openvino/demo/ip/intel_ai_ip/verilog/dla_lt_output_logic.sv
blob: 3721522debfaf6fe6af7f636c6b0e3bfe3b05421 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
// Copyright 2020-2024 Intel Corporation.
//
// This software and the related documents are Intel copyrighted materials,
// and your use of them is governed by the express license under which they
// were provided to you ("License"). Unless the License provides otherwise,
// you may not use, modify, copy, publish, distribute, disclose or transmit
// this software or the related documents without Intel's prior written
// permission.
//
// This software and the related documents are provided as is, with no express
// or implied warranties, other than those that are expressly stated in the
// License.

/**
 * dla_lt_output_logic.sv
 *
 * Keeps track of finished CVEC lines, and writes the finished lines to the output.
 * Also keeps track of the dimensions of the output, and outputs padding lines when
 * required.
 *
 */

`resetall
`undefineall
`default_nettype none

// Output stage: each cycle it either emits an all-zero padding line or requests/forwards one
// CVEC-wide line from the buffer pool selected by an internal cursor, while counting how many
// lines have been reported finished, written, and output in total.
//
// Parameters (as used in this module):
//   NUM_BUFFER_POOLS   - number of entries in the per-pool port arrays and bits in o_read_req.
//   CVEC_PER_BUFFER    - sets the line-index width ($clog2) and the line-index mask.
//   ELEMENTS_PER_CYCLE - width of i_completed_vol_tally.
//   N_POOL_BITS        - number of low cursor bits used as the pool index.
//   MAX_OUTPUT_W/H/D/C - only used to size MAX_OUTPUT_VOLUME (MAX_OUTPUT_C also sizes the input data).
//   CVEC               - number of 16-bit elements in o_data.
//   CNT_BITS           - not referenced anywhere in this module.
//   MAX_DIM_BITS       - width of the padding thresholds and padding counters.
//
// Reset is synchronous and active-low (i_rstn is sampled inside the clocked block).
module dla_lt_output_logic import dla_lt_pkg::*; #(
  parameter int NUM_BUFFER_POOLS,
  parameter int CVEC_PER_BUFFER,
  parameter int ELEMENTS_PER_CYCLE,
  parameter int N_POOL_BITS,
  parameter int MAX_OUTPUT_W,
  parameter int MAX_OUTPUT_H,
  parameter int MAX_OUTPUT_D,
  parameter int MAX_OUTPUT_C,
  parameter int CVEC,
  parameter int CNT_BITS,
  parameter int MAX_DIM_BITS,
  // Upper bound on the number of CVEC lines; sizes lines_written / o_lines_written.
  localparam int MAX_OUTPUT_VOLUME = MAX_OUTPUT_W * MAX_OUTPUT_H * MAX_OUTPUT_D * (MAX_OUTPUT_C/CVEC)
) (
  input wire clk,
  input wire i_rstn,
  // Per-pool read data: MAX_OUTPUT_C elements of 16 bits each.
  input wire [MAX_OUTPUT_C-1:0][16-1:0] i_output_line_data [NUM_BUFFER_POOLS-1:0],
  // Per-pool line index; compared with the requested line to qualify o_valid.
  // Presumably the line currently presented on i_output_line_data -- TODO confirm against the buffer module.
  input wire [($clog2(CVEC_PER_BUFFER))-1:0] i_curr_out_line [NUM_BUFFER_POOLS-1:0],
  // One bit per element slot; the set bits are summed each cycle into the finished-line counter.
  input wire [ELEMENTS_PER_CYCLE-1:0] i_completed_vol_tally,
  // Configuration interface. Fields read here: valid, data.top_full_padding, data.left_full_padding,
  // data.w_overflow, data.h_overflow, data.output_volume, data.output_face_area, data.output_w, data.output_h.
  layout_transform_config_if if_lt_config,
  input wire i_ready,

  // Line index requested from each pool (only the currently selected pool's entry is updated per cycle).
  output logic [($clog2(CVEC_PER_BUFFER))-1:0] o_line_num [NUM_BUFFER_POOLS-1:0],
  // Read request, one bit per pool; cleared by default every cycle.
  output logic [NUM_BUFFER_POOLS-1:0] o_read_req,
  output logic [CVEC-1:0][16-1:0] o_data,
  output logic o_valid,
  output logic o_last,
  // When high, o_valid and o_data hold their values and no counter advances.
  input wire i_stall,
  output logic [$clog2((MAX_OUTPUT_VOLUME))-1:0] o_lines_written,
  output int o_finished_lines
);

  // Local copies of the padding thresholds from the configuration interface.
  logic [MAX_DIM_BITS-1:0] top_full_padding, left_full_padding;
  assign top_full_padding = if_lt_config.data.top_full_padding;
  assign left_full_padding = if_lt_config.data.left_full_padding;

  // lines_written: count of non-padding lines accepted from the buffers.
  // finished_lines_reg: running sum of the tally bits (16-bit; widened to int on o_finished_lines).
  logic [$clog2((MAX_OUTPUT_VOLUME))-1:0] lines_written;
  logic [16-1:0] finished_lines_comb, finished_lines_reg;
  assign o_finished_lines = finished_lines_reg;
  assign o_lines_written = lines_written;

  // Add up all the lines in the current cycle that are complete CVEC lines, represented in the tally vector.
  always_comb
  begin
    finished_lines_comb = 0;
    for (int i =0 ; i < ELEMENTS_PER_CYCLE; i++)
    begin
      finished_lines_comb = finished_lines_comb + i_completed_vol_tally[i];
    end
  end

  // output_cursor is split into a pool index (low N_POOL_BITS bits) and a line index (bits above).
  // pool_out registers the pool index but is not read anywhere else in this module.
  logic [N_POOL_BITS-1:0] pool_out_comb;
  logic [N_POOL_BITS-1:0] pool_out;
  logic [$clog2(CVEC_PER_BUFFER):0] line_out_comb;
  logic [$clog2((NUM_BUFFER_POOLS) * (CVEC_PER_BUFFER)):0] output_cursor;

  logic inside_padding;
  logic write_line;
  logic extra_cvec;

  // 16-bit unsigned position/progress counters.
  // NOTE(review): the config fields they are compared against have widths not visible here -- confirm 16 bits suffices.
  shortint unsigned output_w_cnt;
  shortint unsigned output_h_cnt;
  shortint unsigned total_output;
  shortint unsigned cvec_rollover;
  shortint unsigned cvec_count;
  logic [MAX_DIM_BITS-1:0] left_pad, top_pad;

  assign pool_out_comb = output_cursor[N_POOL_BITS-1:0];
  // The mask behaves as a modulo only if CVEC_PER_BUFFER is a power of two -- NOTE(review): confirm.
  assign line_out_comb = ((output_cursor >> N_POOL_BITS) & (CVEC_PER_BUFFER-1));


  // overflow: the w or h counter has reached its configured overflow threshold while fewer than
  // output_volume lines have been output in total.
  logic overflow;
  logic front_pad;
  assign overflow = (output_w_cnt >= if_lt_config.data.w_overflow
    || output_h_cnt >= if_lt_config.data.h_overflow) && total_output < (if_lt_config.data.output_volume);
  // front_pad: the top or left pad counter is still <= its threshold, and that threshold's MSB is clear.
  // Presumably the MSB is a sign bit marking "no padding" when set -- TODO confirm with the config producer.
  assign front_pad = (~top_full_padding[MAX_DIM_BITS-1] && top_pad <= top_full_padding) || (~left_full_padding[MAX_DIM_BITS-1] && left_pad <= left_full_padding);


  // A padding (all-zero) line is emitted this cycle: not stalled, ready, and in either padding region.
  assign inside_padding = i_stall == 0 && i_ready && (overflow || front_pad);

  // True once at least output_face_area lines have been output and the volume is not yet complete.
  assign extra_cvec = (total_output >= if_lt_config.data.output_face_area) && total_output < if_lt_config.data.output_volume;

  // A buffered line may be written: not stalled, ready, and either unwritten finished lines remain or extra_cvec holds.
  assign write_line = (i_stall == 0 && i_ready && (( lines_written < finished_lines_reg) || (extra_cvec)));


  // Single clocked process. Later nonblocking assignments to the same signal override earlier ones,
  // so the statement order below defines the priority: defaults < reset / config-valid body < overrides.
  always_ff @( posedge clk)
  begin
    // Defaults applied every cycle (also during reset, which is why o_valid and o_read_req
    // have no explicit entry in the reset branch).
    o_valid <= 1'b0;
    // The finished-line counter accumulates regardless of if_lt_config.valid; only reset overrides it.
    finished_lines_reg <= finished_lines_reg + finished_lines_comb;
    o_read_req <= '{default: '0};
    if (!i_rstn)
    begin
      finished_lines_reg <= '0;
      lines_written <= '0;
      output_cursor <= '0;
      o_data <= '0;
      o_line_num <= '{default: '0};
      pool_out <= '0;

      output_h_cnt <= 0;
      output_w_cnt <= 0;

      total_output <= 0;
      cvec_rollover <= 0;
      o_last <= 0;
      cvec_count <= '0;
      left_pad <= '0;
      top_pad <= '0;
    end
    else if (if_lt_config.valid)
    begin
      // Present the current cursor position to the selected pool and capture its data.
      o_line_num[pool_out_comb] <= line_out_comb;
      pool_out <= pool_out_comb;
      // Select CVEC elements starting at element CVEC*cvec_count of the selected pool's line.
      o_data <= i_output_line_data[pool_out_comb][CVEC*(cvec_count)+:CVEC];
      // Registered from the pre-increment counters: set when either counter equals output_volume - 1.
      o_last <= lines_written == (if_lt_config.data.output_volume - 1) || total_output == (if_lt_config.data.output_volume - 1);
      // Explicit hold defaults; overridden further down when a line is accepted.
      output_cursor <= output_cursor;
      lines_written <= lines_written;
      cvec_rollover <= cvec_rollover;
      total_output <= total_output;

      // Could do the following with a state machine..?
      o_valid <= '0;
      // The selected pool's read request is asserted whenever the configuration is valid.
      o_read_req[pool_out_comb] <= 1'b1;
      if (inside_padding) begin
        // Padding line: drive zeros and mark them valid.
        o_data <= '0;
        o_valid <= '1;
      end
      else if (write_line) begin
        // NOTE(review): redundant as written -- the same bit is already set unconditionally above.
        o_read_req[pool_out_comb] <= 1'b1;
      end
      if (i_stall) begin
        // Downstream stalled: hold the output registers; no counters advance in this branch.
        o_valid <= o_valid;
        o_data <= o_data;
      end else begin
        // NOTE(review): extra_cvec is OR-ed in here without the i_ready qualifier that write_line
        // applies, so a line can be accepted while i_ready is low -- confirm this is intended.
        if ((write_line == 1 || extra_cvec) && inside_padding == 0 ) begin
          // NOTE(review): also redundant with the unconditional read request above.
          o_read_req[pool_out_comb] <= 1'b1;

          o_valid <= i_curr_out_line[pool_out_comb] == line_out_comb; // This makes it so that output stalls when all RAM ports are busy...
          if (i_curr_out_line[pool_out_comb] == line_out_comb) begin
            // Advance the cursor with wrap-around; the mask is a modulo only if
            // NUM_BUFFER_POOLS * CVEC_PER_BUFFER is a power of two -- NOTE(review): confirm.
            output_cursor <= ((output_cursor + 1) & ((NUM_BUFFER_POOLS) * (CVEC_PER_BUFFER) - 1));

            lines_written <= lines_written + 1;
          end
        end

        // A line was output this cycle: either padding, or a written line whose pool line index matched.
        // (&& binds tighter than ||, so the match check applies only to the write_line term.)
        if (inside_padding || write_line == 1 && i_curr_out_line[pool_out_comb] == line_out_comb) begin
          total_output <= total_output + 1;
          cvec_rollover <= cvec_rollover + 1;

          // Keep track of dimensionality of output:
          output_w_cnt <= output_w_cnt + 1;
          left_pad <= left_pad+1;

          // End of a row: wrap the width counters and step the height counters.
          if (output_w_cnt >= if_lt_config.data.output_w-1)
          begin
            output_w_cnt <= 0;
            left_pad <= 0;
            top_pad <= top_pad + 1;
            output_h_cnt <= output_h_cnt + 1;
            if (output_h_cnt >= if_lt_config.data.output_h-1)
            begin
              output_h_cnt <= 0;
            end
          end
          // One full face (output_face_area lines) has been output: restart the cursor and the
          // position counters, and step cvec_count so o_data selects the next CVEC slice.
          // These assignments override the cursor/counter updates made earlier in this cycle.
          if ((cvec_rollover) == if_lt_config.data.output_face_area - 1) // start going to overflow...
          begin
            output_cursor <= 0;
            cvec_rollover <= 0;
            cvec_count <= cvec_count + 1;
            left_pad<= 0;
            top_pad <= 0;
            output_w_cnt <= 0;
            output_h_cnt <= 0;
          end
        end
      end
    end
  end
endmodule