// Copyright 2020-2024 Intel Corporation.
//
// This software and the related documents are Intel copyrighted materials,
// and your use of them is governed by the express license under which they
// were provided to you ("License"). Unless the License provides otherwise,
// you may not use, modify, copy, publish, distribute, disclose or transmit
// this software or the related documents without Intel's prior written
// permission.
//
// This software and the related documents are provided as is, with no express
// or implied warranties, other than those that are expressly stated in the
// License.

/**
 * dla_layout_transform.sv
 *
 * Top level of the DLA layout transform (LT) module. The transform can convert u8 data to FP16 (can be disabled)
 * and converts DHWC tensors to CDHWCvec (which is the format required by the PE array), it will also
 * fold the data into the CVEC dimension whenever the stride dimensions of the first convolution are non-1.
 * The folding feature cannot be turned off here - it can be turned off by ensuring that the transform node
 * in the compiler has strides of 1. See `dla_pass_folding.cpp` to see how this is done in the compiler.
 *
 * The main feature of this layout transform is that it can fold input tensor dimensions into the channel dimension
 * which improves the efficiency of the PE array. The parameters of the first convolution in the graph are
 * required as input to this module. The input tensor is partitioned into volumes equal to the
 * STRIDE_HEIGHTxSTRIDE_WIDTHxSTRIDE_DEPTHxCHANNELS of the input convolution.
 * The partitioned volume is then copied into one "CVEC" line and output once the CVEC line is complete.
 *
 * To achieve the folding transform, the DLA layout transform module instantiates the following modules:
 * > dla_layout_transform.sv - This module, serves as the interface for users, and instantiates top-level
 * signals and submodules.
 *
 * > dla_lt_conversion.sv - If enabled, converts input data from U8 to FP16 data types.
*
 * > dla_lt_dimension_counter.sv - Generates tensor indexes for all tokens in incoming data packet.
 *
 * > dla_lt_gen_index_info.sv - Uses tensor indexes from the lt_dimension_counter to calculate the mapping
 * target output position; this includes which RAM module, RAM line, and position within the RAM line
 * (each line holds a CVEC line of output data) each output is mapped to. The memory manager uses this
 * data to emplace the incoming data into its position in the RAM.
 *
 * > dla_lt_memory_manager.sv - Uses the addressing information from the lt_gen_index_info module to emplace
 * incoming data into the RAM. The RAM is used to store intermediate results because often, when we fold
 * data, we have to buffer an output CVEC for many cycles before all the data becomes available.
 * Has these submodules,
 * > dla_lt_ram_arb.sv - Arbitrates write requests and read requests from two sources.
 * > dla_lt_funnel.sv - Maps data from incoming data packet to the correct position within CVEC in
 * a single cycle for all data in the input packet. Uses the indexing info from the lt_gen_index_info
 * module.
 *
 * > dla_lt_output_logic.sv - Keeps track of the number of completed CVEC lines, and writes them to output.
 * This module is also responsible for keeping track of the output dimensions and writing padding lines when
 * required.
*
 */

`resetall
`undefineall
`default_nettype none
`include "dla_acl_parameter_assert.svh"

// Returns the folded channel count, rounded up to a whole number of CVEC lines:
//   ceil((channels * stride_width * stride_height * stride_depth) / cvec) * cvec
function int calc_output_channels(
  input int cvec, channels, stride_height, stride_width, stride_depth
);
  integer div_result;
  // Integer ceiling division by cvec.
  div_result = ((channels * stride_width * stride_height * stride_depth) + cvec - 1) / cvec;
  calc_output_channels = div_result * cvec;
endfunction

// Returns the value used to size one output dimension. As currently written
// this evaluates to (feature_dim + pad_dim + 1) + filter_dim - 1; dilation_dim
// and stride_dim are accepted but not used (the exact convolution formula is
// kept below as a comment).
function int calc_output_dim_max(
  input int feature_dim, filter_dim, dilation_dim, pad_dim, stride_dim
);
  integer conv_dim;
  // conv_dim = (feature_dim - ((filter_dim - 1) * dilation_dim + 1) + pad_dim) / stride_dim + 1;
  conv_dim = (feature_dim + pad_dim) + 1;
  // ceil_value = (filter_dim + stride_dim - 1) / stride_dim;//((filter_dim % stride_dim) == 0) ? 0 : 1;
  calc_output_dim_max = conv_dim + filter_dim - 1;
endfunction

// Top level of the layout transform: deserializes the configuration, tracks
// how many input transfers belong to the current frame, and wires together the
// dimension counter, optional data conversion, index generation, memory
// manager and output logic submodules.
module dla_layout_transform import dla_common_pkg::*,dla_lt_pkg::*; #(
  // Convolution parameters:
  parameter int MAX_CHANNELS =0,
  parameter int MAX_FEATURE_HEIGHT=0,
  parameter int MAX_FEATURE_WIDTH=0,
  parameter int MAX_FEATURE_DEPTH=0,
  parameter int MAX_STRIDE_HEIGHT=0,
  parameter int MAX_STRIDE_WIDTH=0,
  parameter int MAX_STRIDE_DEPTH=0,
  parameter int MAX_PAD_FRONT=0,
  parameter int MAX_PAD_LEFT=0,
  parameter int MAX_PAD_TOP=0,
  parameter int MAX_FILTER_WIDTH=4,
  parameter int MAX_FILTER_HEIGHT=4,
  parameter int MAX_FILTER_DEPTH=4,
  parameter int MAX_DILATION_WIDTH,
  parameter int MAX_DILATION_HEIGHT,
  parameter int MAX_DILATION_DEPTH,
  // Exact parameters
  parameter int CVEC=0,
  parameter bit DO_U8_CONV=1,            // 1: instantiate the data conversion submodule, 0: pass i_data through a register
  parameter int DATA_ELEMENT_WIDTH = 32, // bits per input element; must be 8 (DO_U8_CONV=1) or 16 (DO_U8_CONV=0), see assert below
  parameter int CNT_BITS = 32,
  parameter int DDR_BYTES = 4,           // i_data is 8*DDR_BYTES bits wide
  parameter int CONFIG_DATA_BYTES,       // i_config_data is 8*CONFIG_DATA_BYTES bits wide
  device_family_t DEVICE,
  // Derived Params
  localparam int MAX_DIM_BITS = $clog2((MAX_FEATURE_DEPTH + MAX_PAD_FRONT) * (MAX_FEATURE_HEIGHT + MAX_PAD_TOP) * (MAX_FEATURE_WIDTH + MAX_PAD_LEFT) * CVEC),
  localparam int unsigned MAX_INPUT_VOLUME = MAX_CHANNELS * MAX_FEATURE_WIDTH * MAX_FEATURE_HEIGHT * MAX_FEATURE_DEPTH, //todo: Capitalize constants...
  localparam int unsigned ELEM_PER_DDR = (DDR_BYTES*8)/DATA_ELEMENT_WIDTH, // input elements per i_data word
  localparam int unsigned OUTPUT_DATA_WIDTH = 16,
  localparam int MAX_TRANSFERS = (MAX_INPUT_VOLUME + ELEM_PER_DDR -1) / ELEM_PER_DDR // ceil(MAX_INPUT_VOLUME / ELEM_PER_DDR)
) (
  // Module connections
  input wire clk,
  input wire i_rstn,                                     // active-low reset (ANDed with the internal end-of-frame reset)
  input wire [CONFIG_DATA_BYTES*8-1:0] i_config_data,
  input wire i_config_valid,
  output logic o_config_ready,
  input wire [8*DDR_BYTES-1:0] i_data,
  input wire i_valid,
  input wire i_stall,                                    // downstream stall, forwarded to the output logic
  output logic o_ready,
  output logic o_stall,                                  // logical complement of o_ready
  output logic [CVEC-1:0][OUTPUT_DATA_WIDTH-1:0] o_data,
  output logic o_valid,
  output logic o_last,
  output logic o_param_error                             // tied low: nothing in this module flags parameter errors
);

  `DLA_ACL_PARAMETER_ASSERT((DO_U8_CONV == 0 && DATA_ELEMENT_WIDTH == 16) || (DO_U8_CONV == 1 && DATA_ELEMENT_WIDTH == 8)); // only supported combinations currently.

  localparam int unsigned MAX_OUTPUT_C = calc_output_channels(CVEC, MAX_CHANNELS, MAX_STRIDE_HEIGHT, MAX_STRIDE_WIDTH, MAX_STRIDE_DEPTH);
  localparam int unsigned MAX_OUTPUT_W = calc_output_dim_max(MAX_FEATURE_WIDTH, MAX_FILTER_WIDTH, MAX_DILATION_WIDTH, MAX_PAD_LEFT, MAX_STRIDE_WIDTH);
  localparam int unsigned MAX_OUTPUT_H = calc_output_dim_max(MAX_FEATURE_HEIGHT, MAX_FILTER_HEIGHT, MAX_DILATION_HEIGHT, MAX_PAD_TOP, MAX_STRIDE_HEIGHT);
  localparam int unsigned MAX_OUTPUT_D = calc_output_dim_max(MAX_FEATURE_DEPTH, MAX_FILTER_DEPTH, MAX_DILATION_DEPTH, MAX_PAD_FRONT, MAX_STRIDE_DEPTH);
  localparam int unsigned MAX_INNER_ROWS = MAX_OUTPUT_C > CVEC ? MAX_OUTPUT_C/CVEC-1 : 0;
  localparam int unsigned MAX_OUTPUT_VOLUME = MAX_OUTPUT_W * MAX_OUTPUT_H * MAX_OUTPUT_C;

  // if CHANNELS > ELEM_PER_DDR then only 2 CVECs are modified per cycle in the worst case. Bump up to 4 to reduce congestion; this value heavily affects area!
  localparam shortint unsigned max_num_partitions = MAX_CHANNELS > ELEM_PER_DDR ? 4 : calc_max_partitions( MAX_FEATURE_HEIGHT, MAX_FEATURE_WIDTH, MAX_CHANNELS, MAX_STRIDE_HEIGHT, MAX_STRIDE_WIDTH, ELEM_PER_DDR ) + 20; // TODO: The max_partition calculation does a pretty good job, but there are some edge conditions that are not accounted for. Review. For now, +N works.
  localparam integer num_buffers = (max_num_partitions)+(MAX_FEATURE_WIDTH/MAX_STRIDE_WIDTH) + MAX_INNER_ROWS*(MAX_FEATURE_WIDTH/MAX_STRIDE_WIDTH)*(MAX_FEATURE_HEIGHT/MAX_STRIDE_HEIGHT)*(MAX_FEATURE_DEPTH/MAX_STRIDE_DEPTH);// minimum number of buffers!!!
  localparam int n_pool_bits = $clog2((max_num_partitions));
  localparam int n_buffer_pools = $rtoi($pow(2, n_pool_bits)); // max_num_partitions rounded up to a power of 2
  localparam int cvec_per_buffer = $rtoi($pow(2, $clog2(($rtoi($ceil((num_buffers*1.0)/(max_num_partitions)))+1) + 30))); // round up to a power of 2 since we mod by the value in a few places.
  localparam int total_buffers = n_buffer_pools * cvec_per_buffer;
  localparam int buffers_in_progress = max_num_partitions*7;
  // Occupancy threshold: input is back-pressured once buffer_usage reaches this value.
  localparam int available = total_buffers - buffers_in_progress;

  logic [$clog2((MAX_OUTPUT_W * MAX_OUTPUT_H * MAX_OUTPUT_D * (MAX_OUTPUT_C/CVEC)))-1:0] lines_written; // from output logic
  // [0]: gen_index_info -> memory_manager, [1]: memory_manager -> output_logic
  logic [ELEM_PER_DDR-1:0] completed_vol_tally [1:0];
  logic [address_info_e.num()-1:0][CNT_BITS-1:0] addr_queue [n_buffer_pools-1:0][ELEM_PER_DDR-1:0];
  logic ready_for_config;
  logic ready_for_transfer;
  logic input_data_valid;
  logic cnt_ready;
  logic next_transfer_overflow;
  logic internal_reset;
  logic resetn_condition, start_rx, done_frame;
  logic config_ready;
  // One bit wider than $clog2(MAX_TRANSFERS) so the MSB can flag underflow of the down-counter.
  logic [$clog2(MAX_TRANSFERS):0] frame_finished, transfer_count;
  layout_transform_config_if layout_transform_config();
  shortint unsigned finished_lines_reg; // from output logic
  int buffer_usage;

  // counters used by dimension counter logic:
  logic [max($clog2(MAX_CHANNELS), 1)-1:0] C_d [ELEM_PER_DDR-1:0];
  logic [max($clog2(MAX_FEATURE_WIDTH), 1)-1:0] W_d [ELEM_PER_DDR-1:0];
  logic [max($clog2(MAX_STRIDE_WIDTH), 1)-1:0] IN_W_d [ELEM_PER_DDR-1:0];
  logic [max($clog2(MAX_FEATURE_WIDTH), 1)-1:0] S_W_d [ELEM_PER_DDR-1:0];
  logic [max($clog2(MAX_FEATURE_HEIGHT), 1)-1:0] H_d [ELEM_PER_DDR-1:0];
  logic [max($clog2(MAX_STRIDE_HEIGHT), 1)-1:0] IN_H_d [ELEM_PER_DDR-1:0];
  logic [max($clog2(MAX_FEATURE_HEIGHT), 1)-1:0] S_H_d [ELEM_PER_DDR-1:0];
  logic [max($clog2(MAX_FEATURE_DEPTH), 1)-1:0] D_d [ELEM_PER_DDR-1:0];
  logic [max($clog2(MAX_STRIDE_DEPTH), 1)-1:0] IN_D_d [ELEM_PER_DDR-1:0];
  logic [max($clog2(MAX_FEATURE_DEPTH), 1)-1:0] S_D_d [ELEM_PER_DDR-1:0];
  logic [MAX_DIM_BITS-1:0] Index_d [ELEM_PER_DDR-1:0];

  // Everything downstream is reset either externally or when the last output
  // line of a frame has been accepted (see internal_reset below).
  assign resetn_condition = i_rstn & !internal_reset;
  assign o_config_ready = config_ready & ready_for_config;

  // Previously this output had no driver at all (X in simulation, an
  // undriven-port warning in synthesis). No parameter checking is implemented
  // in this module, so report "no error".
  assign o_param_error = 1'b0;

  dla_config_deserialize #(
    .CONFIG_WIDTH(CONFIG_DATA_BYTES*8)
  ) lt_config_deserialize (
    .clk(clk),
    .i_resetn(resetn_condition),
    .i_valid(i_config_valid),
    .i_config(i_config_data),
    .o_ready(config_ready),
    .if_config(layout_transform_config)
  );

  // Frame bookkeeping. frame_finished is a down-counter loaded from the
  // configured feature volume and decremented on every accepted input
  // transfer. done_frame samples the counter's MSB (its value *before* the
  // decrement of the same cycle), so with a load value of N-2 it is set by the
  // Nth accepted transfer. The reset branch comes last so it overrides the
  // assignments above it.
  always_ff @( posedge clk ) begin : latch_last_frame
    frame_finished <= frame_finished;
    if (i_valid & o_ready & ~done_frame) begin
      // We've started to accept output for this inference. This
      // signals to the output logic that we can start writing outputs.
      start_rx <= 1;
      frame_finished <= frame_finished - 1;
      done_frame <= frame_finished[$clog2(MAX_TRANSFERS)];
    end else if (layout_transform_config.valid & ~start_rx) begin
      // Don't accept a new config until we're finished with this transform.
      ready_for_config <= 1'b0;
      frame_finished <= ((layout_transform_config.data.feature_volume + ELEM_PER_DDR - 1) / ELEM_PER_DDR) - 2;
    end
    if (~resetn_condition) begin
      start_rx <= 0;
      frame_finished <= MAX_TRANSFERS-2;
      done_frame <= 0;
      ready_for_config <= 1'b1;
    end
  end

  // Lines completed but not yet written out.
  assign buffer_usage = (finished_lines_reg - lines_written);
  assign next_transfer_overflow = available <= buffer_usage; // may need to pipeline this. Maybe change to 'almost full signal'.
  // Input is accepted only while index generation is ready, the buffers are
  // not close to full, and the current frame still expects data.
  assign o_ready = ready_for_transfer == 1'b1 & next_transfer_overflow == 1'b0 & !done_frame;
  assign o_stall = ready_for_transfer == 1'b0 | next_transfer_overflow == 1'b1 | done_frame;
  // Self-reset once the last output line has been presented and not stalled.
  assign internal_reset = o_last & o_valid & !i_stall;

  // Dimension counter: Keeps track of position within tensor of incoming data.
  dla_lt_dimension_counter #(
    .ELEMENTS_PER_CYCLE(ELEM_PER_DDR),
    .MAX_CHANNELS(MAX_CHANNELS),
    .MAX_FEATURE_WIDTH(MAX_FEATURE_WIDTH),
    .MAX_FEATURE_HEIGHT(MAX_FEATURE_HEIGHT),
    .MAX_FEATURE_DEPTH(MAX_FEATURE_DEPTH),
    .MAX_STRIDE_WIDTH(MAX_STRIDE_WIDTH),
    .MAX_STRIDE_HEIGHT(MAX_STRIDE_HEIGHT),
    .MAX_STRIDE_DEPTH(MAX_STRIDE_DEPTH),
    .MAX_INPUT_VOLUME(MAX_INPUT_VOLUME),
    .MAX_DIM_BITS(MAX_DIM_BITS)
  ) dim_counter (
    .clk(clk),
    .i_rstn(resetn_condition),
    .i_increment(input_data_valid | ~ready_for_transfer),
    .if_lt_config(layout_transform_config),
    .o_ready(cnt_ready),
    .o_c_dim(C_d),
    .o_w_dim(W_d),
    .o_h_dim(H_d),
    .o_d_dim(D_d),
    .o_w_inner(IN_W_d),
    .o_h_inner(IN_H_d),
    .o_d_inner(IN_D_d),
    .o_w_stride(S_W_d),
    .o_h_stride(S_H_d),
    .o_d_stride(S_D_d),
    .o_index(Index_d)
  );

  // TODO(arooney): add more conversions
  logic [ELEM_PER_DDR-1:0][15:0] fp16_val;
  if (DO_U8_CONV) begin
    dla_lt_data_conversion #(
      .DDR_BYTES(DDR_BYTES),
      .DATA_ELEMENT_WIDTH(DATA_ELEMENT_WIDTH),
      .ELEMENTS_PER_CYCLE(ELEM_PER_DDR)
    ) data_conversion (
      .clk(clk),
      .i_valid(i_valid & o_ready),
      .i_data(i_data),
      .o_fp16_val(fp16_val),
      .o_valid(input_data_valid)
    );
  end else begin
    // No conversion: the input word is forwarded as-is.
    // NOTE(review): the data is registered here while input_data_valid is
    // combinational; presumably this matches the latency expected by the
    // downstream index/memory pipeline - confirm.
    assign input_data_valid = i_valid & o_ready;
    always_ff @(posedge clk) begin
      fp16_val <= i_data;
    end
  end

  // Index generation: turns the dimension counters into per-element addressing info.
  dla_lt_gen_index_info #(
    .ELEMENTS_PER_CYCLE(ELEM_PER_DDR),
    .N_BUFFER_POOLS(n_buffer_pools),
    .CVEC_PER_BUFFER(cvec_per_buffer),
    .N_POOL_BITS(n_pool_bits),
    .CNT_BITS(CNT_BITS),
    .MAX_CHANNELS(MAX_CHANNELS),
    .CVEC(CVEC),
    .MAX_FEATURE_WIDTH(MAX_FEATURE_WIDTH),
    .MAX_FEATURE_HEIGHT(MAX_FEATURE_HEIGHT),
    .MAX_FEATURE_DEPTH(MAX_FEATURE_DEPTH),
    .MAX_STRIDE_WIDTH(MAX_STRIDE_WIDTH),
    .MAX_STRIDE_HEIGHT(MAX_STRIDE_HEIGHT),
    .MAX_STRIDE_DEPTH(MAX_STRIDE_DEPTH),
    .MAX_DIM_BITS(MAX_DIM_BITS),
    .MAX_INPUT_VOLUME(MAX_INPUT_VOLUME)
  ) gen_index_info (
    .clk(clk),
    .i_rstn(resetn_condition),
    .i_next_overflow(next_transfer_overflow),
    .i_valid(input_data_valid),
    .i_ready(cnt_ready),
    .i_c_dim(C_d),
    .i_w_inner(IN_W_d),
    .i_h_inner(IN_H_d),
    .i_d_inner(IN_D_d),
    .i_w_stride(S_W_d),
    .i_h_stride(S_H_d),
    .i_d_stride(S_D_d),
    .i_index(Index_d),
    .if_lt_config(layout_transform_config),
    .o_addr_queue(addr_queue),
    .o_completed_vol_tally(completed_vol_tally[0]),
    .o_ready_for_transfer(ready_for_transfer)
  );

  // Read-side handshake between the output logic and the memory manager, one entry per buffer pool.
  logic [($clog2(cvec_per_buffer))-1:0] output_line_num [n_buffer_pools-1:0];
  logic [($clog2(cvec_per_buffer))-1:0] curr_out_line [n_buffer_pools-1:0];
  logic [n_buffer_pools-1:0] actively_reading;
  logic [MAX_OUTPUT_C-1:0][16-1:0] output_line_data [n_buffer_pools-1:0];

  dla_lt_memory_manager #(
    .NUM_BUFFER_POOLS(n_buffer_pools),
    .CVEC_PER_BUFFER(cvec_per_buffer),
    .ELEMENTS_PER_CYCLE(ELEM_PER_DDR),
    .CNT_BITS(CNT_BITS),
    .CVEC(CVEC),
    .MAX_OUTPUT_C(MAX_OUTPUT_C),
    .DEVICE(DEVICE)
  ) memory_manager (
    .clk(clk),
    .i_rstn(resetn_condition),
    .i_addr_queue(addr_queue),
    .i_output_line_num(output_line_num),
    .i_actively_reading(actively_reading),
    .i_fp16_data(fp16_val),
    .i_completed_vol_tally(completed_vol_tally[0]),
    .o_completed_vol_tally(completed_vol_tally[1]),
    .o_output_line_data(output_line_data),
    .o_curr_out_line(curr_out_line)
  );

  dla_lt_output_logic #(
    .NUM_BUFFER_POOLS(n_buffer_pools),
    .CVEC_PER_BUFFER(cvec_per_buffer),
    .ELEMENTS_PER_CYCLE(ELEM_PER_DDR),
    .N_POOL_BITS(n_pool_bits),
    .MAX_OUTPUT_W(MAX_OUTPUT_W),
    .MAX_OUTPUT_H(MAX_OUTPUT_H),
    .MAX_OUTPUT_D(MAX_OUTPUT_D),
    .MAX_OUTPUT_C(MAX_OUTPUT_C),
    .CVEC(CVEC),
    .CNT_BITS(CNT_BITS),
    .MAX_DIM_BITS(MAX_DIM_BITS)
  ) output_logic (
    .clk(clk),
    .i_rstn(resetn_condition),
    .i_output_line_data(output_line_data),
    .i_curr_out_line(curr_out_line),
    .i_completed_vol_tally(completed_vol_tally[1]),
    .i_stall(i_stall),
    .if_lt_config(layout_transform_config),
    .i_ready(start_rx),
    .o_line_num(output_line_num),
    .o_read_req(actively_reading),
    .o_data(o_data),
    .o_valid(o_valid),
    .o_last(o_last),
    .o_lines_written(lines_written),
    .o_finished_lines(finished_lines_reg)
  );

endmodule