// Copyright 2020-2024 Intel Corporation.
//
// This software and the related documents are Intel copyrighted materials,
// and your use of them is governed by the express license under which they
// were provided to you ("License"). Unless the License provides otherwise,
// you may not use, modify, copy, publish, distribute, disclose or transmit
// this software or the related documents without Intel's prior written
// permission.
//
// This software and the related documents are provided as is, with no express
// or implied warranties, other than those that are expressly stated in the
// License.

/**
 * dla_lt_output_logic.sv
 *
 * Keeps track of finished CVEC lines, and writes the finished lines to the output.
 * Also keeps track of the dimensions of the output, and outputs padding lines when
 * required.
 *
 * Summary of what the logic below does:
 *   - Every cycle the set bits of i_completed_vol_tally are summed and added to a
 *     running count of finished lines (finished_lines_reg).
 *   - While if_lt_config.valid is high, a read request is issued to the buffer pool
 *     selected by the low N_POOL_BITS bits of output_cursor; the remaining cursor
 *     bits select the line inside that pool.
 *   - A line is emitted (o_valid) when the pool reports the requested line on
 *     i_curr_out_line; an all-zero line is emitted while inside a padding region.
 *   - Width / height / face-area counters track the position inside the output.
 *
 * NOTE(review): the cursor and line masks assume NUM_BUFFER_POOLS * CVEC_PER_BUFFER
 * and CVEC_PER_BUFFER are powers of two - confirm against the instantiating module.
 */

`resetall
`undefineall
`default_nettype none

module dla_lt_output_logic import dla_lt_pkg::*; #(
  parameter int NUM_BUFFER_POOLS,
  parameter int CVEC_PER_BUFFER,
  parameter int ELEMENTS_PER_CYCLE,
  parameter int N_POOL_BITS,
  parameter int MAX_OUTPUT_W,
  parameter int MAX_OUTPUT_H,
  parameter int MAX_OUTPUT_D,
  parameter int MAX_OUTPUT_C,
  parameter int CVEC,
  parameter int CNT_BITS,       // not referenced inside this module
  parameter int MAX_DIM_BITS,
  localparam int MAX_OUTPUT_VOLUME = MAX_OUTPUT_W * MAX_OUTPUT_H * MAX_OUTPUT_D * (MAX_OUTPUT_C/CVEC)
) (
  input wire clk,
  input wire i_rstn,            // synchronous, active-low reset

  // Per-pool read data and the line number each pool currently presents.
  input wire [MAX_OUTPUT_C-1:0][16-1:0] i_output_line_data [NUM_BUFFER_POOLS-1:0],
  input wire [($clog2(CVEC_PER_BUFFER))-1:0] i_curr_out_line [NUM_BUFFER_POOLS-1:0],
  // One bit per element; the number of set bits is added to the finished-line count.
  input wire [ELEMENTS_PER_CYCLE-1:0] i_completed_vol_tally,

  layout_transform_config_if if_lt_config,

  input wire i_ready,

  // Read request towards the buffer pools.
  output logic [($clog2(CVEC_PER_BUFFER))-1:0] o_line_num [NUM_BUFFER_POOLS-1:0],
  output logic [NUM_BUFFER_POOLS-1:0] o_read_req,

  // Output stream.
  output logic [CVEC-1:0][16-1:0] o_data,
  output logic o_valid,
  output logic o_last,
  input wire i_stall,

  // Progress counters exposed to the parent.
  output logic [$clog2((MAX_OUTPUT_VOLUME))-1:0] o_lines_written,
  output int o_finished_lines
);

  // ---------------------------------------------------------------------------
  // Constants
  // ---------------------------------------------------------------------------
  // Wrap mask for the read cursor and mask for the line index within one pool.
  localparam int CURSOR_WRAP_MASK = (NUM_BUFFER_POOLS) * (CVEC_PER_BUFFER) - 1;
  localparam int LINE_INDEX_MASK  = CVEC_PER_BUFFER - 1;

  // ---------------------------------------------------------------------------
  // Declarations
  // ---------------------------------------------------------------------------
  // Padding configuration (copied from the config interface) and pad counters.
  logic [MAX_DIM_BITS-1:0] top_full_padding;
  logic [MAX_DIM_BITS-1:0] left_full_padding;
  logic [MAX_DIM_BITS-1:0] left_pad;
  logic [MAX_DIM_BITS-1:0] top_pad;

  // Lines read out of the buffers so far / lines reported finished so far.
  logic [$clog2((MAX_OUTPUT_VOLUME))-1:0] lines_written;
  logic [15:0] finished_lines_comb;
  logic [15:0] finished_lines_reg;

  // Read cursor and its decomposition into {line index, pool index}.
  logic [$clog2((NUM_BUFFER_POOLS) * (CVEC_PER_BUFFER)):0] output_cursor;
  logic [N_POOL_BITS-1:0]           pool_out_comb;
  logic [$clog2(CVEC_PER_BUFFER):0] line_out_comb;
  // Registered copy of the pool index; written below but not read in this module.
  logic [N_POOL_BITS-1:0]           pool_out;

  // Position counters.
  shortint unsigned output_w_cnt;   // column within the current row
  shortint unsigned output_h_cnt;   // row within the current face
  shortint unsigned total_output;   // lines emitted so far, padding included
  shortint unsigned cvec_rollover;  // lines emitted within the current face
  shortint unsigned cvec_count;     // selects the CVEC-wide slice of the read data

  // Decision nets.
  logic emit_enable;
  logic below_volume;
  logic overflow;
  logic front_pad;
  logic inside_padding;
  logic extra_cvec;
  logic write_line;
  logic line_is_ready;
  logic read_attempt;
  logic cvec_emitted;

  // ---------------------------------------------------------------------------
  // Combinational logic
  // ---------------------------------------------------------------------------
  assign top_full_padding  = if_lt_config.data.top_full_padding;
  assign left_full_padding = if_lt_config.data.left_full_padding;

  assign o_lines_written  = lines_written;
  assign o_finished_lines = finished_lines_reg;

  // Add up all the lines in the current cycle that are complete CVEC lines,
  // represented in the tally vector.
  always_comb begin
    finished_lines_comb = 0;
    for (int tally_idx = 0; tally_idx < ELEMENTS_PER_CYCLE; tally_idx++) begin
      finished_lines_comb += i_completed_vol_tally[tally_idx];
    end
  end

  // Low cursor bits pick the pool, the bits above them pick the line in the pool.
  assign pool_out_comb = output_cursor[N_POOL_BITS-1:0];
  assign line_out_comb = ((output_cursor >> N_POOL_BITS) & LINE_INDEX_MASK);

  // Output may only advance when downstream is not stalling and i_ready is set.
  assign emit_enable  = !i_stall && i_ready;
  assign below_volume = total_output < (if_lt_config.data.output_volume);

  // Trailing padding: the position counters have reached the configured overflow
  // thresholds while the output volume is not yet complete.
  assign overflow = (output_w_cnt >= if_lt_config.data.w_overflow ||
                     output_h_cnt >= if_lt_config.data.h_overflow) && below_volume;

  // Leading padding: only considered while the MSB of the configured value is 0.
  // NOTE(review): the MSB presumably marks a negative / "no padding" value - confirm.
  assign front_pad = (!top_full_padding[MAX_DIM_BITS-1]  && top_pad  <= top_full_padding) ||
                     (!left_full_padding[MAX_DIM_BITS-1] && left_pad <= left_full_padding);

  assign inside_padding = emit_enable && (overflow || front_pad);

  // At least one full face has been emitted but the volume is not complete yet.
  assign extra_cvec = (total_output >= if_lt_config.data.output_face_area) && below_volume;

  assign write_line = emit_enable && ((lines_written < finished_lines_reg) || extra_cvec);

  // The selected pool is presenting exactly the line the cursor asks for.
  assign line_is_ready = (i_curr_out_line[pool_out_comb] == line_out_comb);

  // A buffered line is wanted this cycle (as opposed to a padding line).
  assign read_attempt = (write_line || extra_cvec) && !inside_padding;

  // One output line (padding or real data) is produced this cycle.
  assign cvec_emitted = inside_padding || (write_line && line_is_ready);

  // ---------------------------------------------------------------------------
  // Sequential logic
  // ---------------------------------------------------------------------------
  always_ff @(posedge clk) begin
    // Defaults; later assignments in this block take precedence.
    o_valid            <= 1'b0;
    o_read_req         <= '{default: '0};
    // Accumulates every cycle outside reset, regardless of if_lt_config.valid.
    finished_lines_reg <= finished_lines_reg + finished_lines_comb;

    if (!i_rstn) begin
      // Output registers.
      o_data     <= '0;
      o_last     <= 0;
      o_line_num <= '{default: '0};

      // Line bookkeeping.
      finished_lines_reg <= '0;
      lines_written      <= '0;
      output_cursor      <= '0;
      pool_out           <= '0;

      // Position counters.
      output_w_cnt  <= 0;
      output_h_cnt  <= 0;
      total_output  <= 0;
      cvec_rollover <= 0;
      cvec_count    <= '0;
      left_pad      <= '0;
      top_pad       <= '0;
    end else if (if_lt_config.valid) begin
      // Always request the line under the cursor from the selected pool.
      o_line_num[pool_out_comb] <= line_out_comb;
      o_read_req[pool_out_comb] <= 1'b1;
      pool_out                  <= pool_out_comb;

      // Candidate output data: the CVEC-wide slice chosen by cvec_count.
      o_data <= i_output_line_data[pool_out_comb][CVEC*(cvec_count)+:CVEC];

      // o_last is derived from the counters alone; it is not qualified by o_valid here.
      o_last <= lines_written == (if_lt_config.data.output_volume - 1) ||
                total_output  == (if_lt_config.data.output_volume - 1);

      // Could do the following with a state machine..?
      // Padding lines are emitted as all-zero data.
      if (inside_padding) begin
        o_data  <= '0;
        o_valid <= 1'b1;
      end

      if (i_stall) begin
        // Hold the output stage while downstream stalls.
        o_valid <= o_valid;
        o_data  <= o_data;
      end else begin
        if (read_attempt) begin
          // This makes it so that output stalls when all RAM ports are busy...
          o_valid <= line_is_ready;
          if (line_is_ready) begin
            output_cursor <= ((output_cursor + 1) & CURSOR_WRAP_MASK);
            lines_written <= lines_written + 1;
          end
        end

        if (cvec_emitted) begin
          total_output <= total_output + 1;

          if (cvec_rollover == if_lt_config.data.output_face_area - 1) begin
            // Face complete: restart the cursor and position counters and move
            // on to the next CVEC slice. (start going to overflow...)
            output_cursor <= 0;
            cvec_rollover <= 0;
            cvec_count    <= cvec_count + 1;
            left_pad      <= 0;
            top_pad       <= 0;
            output_w_cnt  <= 0;
            output_h_cnt  <= 0;
          end else begin
            cvec_rollover <= cvec_rollover + 1;

            // Keep track of dimensionality of output:
            if (output_w_cnt >= if_lt_config.data.output_w-1) begin
              // End of row: wrap the column counters and step to the next row.
              output_w_cnt <= 0;
              left_pad     <= 0;
              top_pad      <= top_pad + 1;
              if (output_h_cnt >= if_lt_config.data.output_h-1) begin
                output_h_cnt <= 0;
              end else begin
                output_h_cnt <= output_h_cnt + 1;
              end
            end else begin
              output_w_cnt <= output_w_cnt + 1;
              left_pad     <= left_pad + 1;
            end
          end
        end
      end
    end
  end

endmodule