diff options
Diffstat (limited to 'python/openvino/demo/ip/intel_ai_ip/verilog/dla_layout_transform.sv')
| -rw-r--r-- | python/openvino/demo/ip/intel_ai_ip/verilog/dla_layout_transform.sv | 384 |
1 files changed, 384 insertions, 0 deletions
diff --git a/python/openvino/demo/ip/intel_ai_ip/verilog/dla_layout_transform.sv b/python/openvino/demo/ip/intel_ai_ip/verilog/dla_layout_transform.sv new file mode 100644 index 0000000..e52fe53 --- /dev/null +++ b/python/openvino/demo/ip/intel_ai_ip/verilog/dla_layout_transform.sv @@ -0,0 +1,384 @@ +// Copyright 2020-2024 Intel Corporation. +// +// This software and the related documents are Intel copyrighted materials, +// and your use of them is governed by the express license under which they +// were provided to you ("License"). Unless the License provides otherwise, +// you may not use, modify, copy, publish, distribute, disclose or transmit +// this software or the related documents without Intel's prior written +// permission. +// +// This software and the related documents are provided as is, with no express +// or implied warranties, other than those that are expressly stated in the +// License. + +/** + * dla_layout_transform.sv + * + * Top level of the DLA layout transform (LT) module. The transform can convert u8 data to FP16 (can be disabled) + * and converts DHWC tensors to CDHWCvec (which is the format required by the PE array), it will also + * fold the data into the CVEC dimension whenever the stride dimensions of the first convolution are non-1. + * The folding feature cannot be turned off here - it can be turned off by ensuring that the transform node + * in the compiler has strides of 1. See `dla_pass_folding.cpp` to see how this is done in the compiler. + * + * The main feature of this layout transform is that it can fold input tensor dimensions into the channel dimension + * which improves the efficiency of the PE array. The parameters of the first convolution in the graph are + * required as input to this module. The input tensor is partitioned into volumes equal to the + * STRIDE_HEIGHTxSTRIDE_WIDTHxSTRIDE_DEPTHxCHANNELS of the input convolution. 
+ * The partitioned volume is then copied into one "CVEC" line and output once the CVEC line is complete. + * + * To achieve the folding transform, the DLA layout transform module instantiates the following modules: + * > dla_layout_transform.sv - This module, serves as the interface for users, and instantiates top-level + * signals and submodules. + * + * > dla_lt_conversion.sv - If enabled, converts input data from U8 to FP16 data types. + * + * > dla_lt_dimension_counter.sv - Generates tensor indexes for all tokens in incoming data packet. + * + * > dla_lt_gen_index_info.sv - Uses tensor indexes from the lt_dimension_counter to calculate the mapping + * target output position; this includes which RAM module, RAM line, and position within the RAM line + * (each line holds a CVEC line of output data) each output is mapped to. The memory manager uses this + * data to emplace the incoming data into its position in the RAM. + * + * > dla_lt_memory_manager.sv - Uses the addressing information from the lt_gen_index_info module to emplace + * incoming data into the RAM. The RAM is used to store intermediate results because often, when we fold + * data, we have to buffer an output CVEC for many cycles before all the data becomes available. + * Has these submodules, + * > dla_lt_ram_arb.sv - Arbitrates write requests and read requests from two sources. + * > dla_lt_funnel.sv - Maps data from incoming data packet to the correct position within CVEC in + * a single cycle for all data in the input packet. Uses the indexing info from the lt_gen_index_info + * module. + * + * > dla_lt_output_logic.sv - Keeps track of the number of completed CVEC lines, and writes them to output. + * This module is also responsible for keeping track of the output dimensions and writing padding lines when + * required. 
+ * + */

`resetall
`undefineall
`default_nettype none

`include "dla_acl_parameter_assert.svh"

/**
 * Rounds the folded channel count up to a whole multiple of `cvec`.
 *
 * The folded channel count is channels * stride_width * stride_height * stride_depth.
 *
 * @param cvec           Line width to round up to (must be non-zero; it is used as a divisor).
 * @param channels       Number of input channels.
 * @param stride_height  Stride in the height dimension.
 * @param stride_width   Stride in the width dimension.
 * @param stride_depth   Stride in the depth dimension.
 * @return               Smallest multiple of `cvec` that is >= the folded channel count.
 */
function int calc_output_channels(
  input int cvec, channels, stride_height, stride_width, stride_depth
);
  integer div_result;
  // Ceiling division: number of cvec-wide lines needed for the folded channels.
  div_result = ((channels * stride_width * stride_height * stride_depth) + cvec - 1) / cvec;
  calc_output_channels = div_result * cvec;
endfunction

/**
 * Upper bound used to size the output dimension counters.
 *
 * As written this evaluates to feature_dim + pad_dim + filter_dim. `dilation_dim` and
 * `stride_dim` are accepted for interface compatibility but are not used by the current
 * expression (the exact convolution formula is kept below as a comment).
 *
 * @param feature_dim   Maximum input feature size in this dimension.
 * @param filter_dim    Maximum filter size in this dimension.
 * @param dilation_dim  Maximum dilation in this dimension (currently unused).
 * @param pad_dim       Maximum padding in this dimension.
 * @param stride_dim    Maximum stride in this dimension (currently unused).
 * @return              feature_dim + pad_dim + filter_dim.
 */
function int calc_output_dim_max(
  input int feature_dim, filter_dim, dilation_dim, pad_dim, stride_dim
);
  integer conv_dim;
  // conv_dim = (feature_dim - ((filter_dim - 1) * dilation_dim + 1) + pad_dim) / stride_dim + 1;
  conv_dim = (feature_dim + pad_dim) + 1;
  // ceil_value = (filter_dim + stride_dim - 1) / stride_dim;//((filter_dim % stride_dim) == 0) ? 0 : 1;
  calc_output_dim_max = conv_dim + filter_dim - 1;
endfunction

/**
 * Top level of the layout transform. Instantiates the config deserializer, the dimension
 * counter, the optional U8 data conversion, the index generator, the memory manager and the
 * output logic, and implements the per-frame transfer counting / back-pressure glue.
 */
module dla_layout_transform
  import dla_common_pkg::*,dla_lt_pkg::*;
#(
  // Convolution parameters:
  parameter int MAX_CHANNELS =0,
  parameter int MAX_FEATURE_HEIGHT=0,
  parameter int MAX_FEATURE_WIDTH=0,
  parameter int MAX_FEATURE_DEPTH=0,
  parameter int MAX_STRIDE_HEIGHT=0,
  parameter int MAX_STRIDE_WIDTH=0,
  parameter int MAX_STRIDE_DEPTH=0,
  parameter int MAX_PAD_FRONT=0,
  parameter int MAX_PAD_LEFT=0,
  parameter int MAX_PAD_TOP=0,
  parameter int MAX_FILTER_WIDTH=4,
  parameter int MAX_FILTER_HEIGHT=4,
  parameter int MAX_FILTER_DEPTH=4,
  parameter int MAX_DILATION_WIDTH,
  parameter int MAX_DILATION_HEIGHT,
  parameter int MAX_DILATION_DEPTH,

  // Exact parameters
  parameter int CVEC=0,
  parameter bit DO_U8_CONV=1,
  parameter int DATA_ELEMENT_WIDTH = 32,
  parameter int CNT_BITS = 32,
  parameter int DDR_BYTES = 4,
  parameter int CONFIG_DATA_BYTES,

  device_family_t DEVICE,
  // Derived Params
  localparam int MAX_DIM_BITS = $clog2((MAX_FEATURE_DEPTH + MAX_PAD_FRONT) * (MAX_FEATURE_HEIGHT + MAX_PAD_TOP) * (MAX_FEATURE_WIDTH + MAX_PAD_LEFT) * CVEC),
  localparam int unsigned MAX_INPUT_VOLUME = MAX_CHANNELS * MAX_FEATURE_WIDTH * MAX_FEATURE_HEIGHT * MAX_FEATURE_DEPTH,
  //todo: Capitalize constants...
  // Number of input elements carried by one DDR word.
  localparam int unsigned ELEM_PER_DDR = (DDR_BYTES*8)/DATA_ELEMENT_WIDTH,
  localparam int unsigned OUTPUT_DATA_WIDTH = 16,
  // Number of DDR words needed for the largest input tensor (ceiling division).
  localparam int MAX_TRANSFERS = (MAX_INPUT_VOLUME + ELEM_PER_DDR -1) / ELEM_PER_DDR
) (
  // Module connections
  input  wire                                    clk,
  input  wire                                    i_rstn,
  input  wire [CONFIG_DATA_BYTES*8-1:0]          i_config_data,
  input  wire                                    i_config_valid,
  output logic                                   o_config_ready,
  input  wire [8*DDR_BYTES-1:0]                  i_data,
  input  wire                                    i_valid,
  input  wire                                    i_stall,
  output logic                                   o_ready,
  output logic                                   o_stall,
  output logic [CVEC-1:0][OUTPUT_DATA_WIDTH-1:0] o_data,
  output logic                                   o_valid,
  output logic                                   o_last,
  output logic                                   o_param_error
);

  `DLA_ACL_PARAMETER_ASSERT((DO_U8_CONV == 0 && DATA_ELEMENT_WIDTH == 16) || (DO_U8_CONV == 1 && DATA_ELEMENT_WIDTH == 8)); // only supported combinations currently.

  localparam int unsigned MAX_OUTPUT_C = calc_output_channels(CVEC, MAX_CHANNELS, MAX_STRIDE_HEIGHT, MAX_STRIDE_WIDTH, MAX_STRIDE_DEPTH);
  localparam int unsigned MAX_OUTPUT_W = calc_output_dim_max(MAX_FEATURE_WIDTH, MAX_FILTER_WIDTH, MAX_DILATION_WIDTH, MAX_PAD_LEFT, MAX_STRIDE_WIDTH);
  localparam int unsigned MAX_OUTPUT_H = calc_output_dim_max(MAX_FEATURE_HEIGHT, MAX_FILTER_HEIGHT, MAX_DILATION_HEIGHT, MAX_PAD_TOP, MAX_STRIDE_HEIGHT);
  localparam int unsigned MAX_OUTPUT_D = calc_output_dim_max(MAX_FEATURE_DEPTH, MAX_FILTER_DEPTH, MAX_DILATION_DEPTH, MAX_PAD_FRONT, MAX_STRIDE_DEPTH);
  // Number of CVEC lines per output position beyond the first one.
  localparam int unsigned MAX_INNER_ROWS = MAX_OUTPUT_C > CVEC ? MAX_OUTPUT_C/CVEC-1 : 0;
  // NOTE(review): not referenced anywhere in this module, and it omits MAX_OUTPUT_D - confirm
  // whether it is still needed before relying on it.
  localparam int unsigned MAX_OUTPUT_VOLUME = MAX_OUTPUT_W * MAX_OUTPUT_H * MAX_OUTPUT_C;
  // if CHANNELS > ELEM_PER_DDR then only 2 CVECs are modified per cycle in the worst case. Bump up to 4 to reduce congestion; this value heavily affects area!
  // Note: the "+ 20" binds to the calc_max_partitions() branch of the ternary only.
  localparam shortint unsigned max_num_partitions = MAX_CHANNELS > ELEM_PER_DDR ? 4 : calc_max_partitions(
    MAX_FEATURE_HEIGHT, MAX_FEATURE_WIDTH, MAX_CHANNELS, MAX_STRIDE_HEIGHT, MAX_STRIDE_WIDTH, ELEM_PER_DDR
  ) + 20; // TODO: The max_partition calculation does a pretty good job, but there are some edge conditions that are not accounted for. Review. For now, +N works.
  localparam integer num_buffers = (max_num_partitions)+(MAX_FEATURE_WIDTH/MAX_STRIDE_WIDTH) + MAX_INNER_ROWS*(MAX_FEATURE_WIDTH/MAX_STRIDE_WIDTH)*(MAX_FEATURE_HEIGHT/MAX_STRIDE_HEIGHT)*(MAX_FEATURE_DEPTH/MAX_STRIDE_DEPTH);// minimum number of buffers!!!
  localparam int n_pool_bits = $clog2((max_num_partitions));
  localparam int n_buffer_pools = $rtoi($pow(2, n_pool_bits));
  localparam int cvec_per_buffer = $rtoi($pow(2, $clog2(($rtoi($ceil((num_buffers*1.0)/(max_num_partitions)))+1) + 30))); // round to nearest power of 2 since we mod by the value in a few places.
  localparam int total_buffers = n_buffer_pools * cvec_per_buffer;
  localparam int buffers_in_progress = max_num_partitions*7;

  // Buffer budget compared against buffer_usage to generate back-pressure.
  localparam int available = total_buffers - buffers_in_progress;

  logic [$clog2((MAX_OUTPUT_W * MAX_OUTPUT_H * MAX_OUTPUT_D * (MAX_OUTPUT_C/CVEC)))-1:0] lines_written;
  // [0]: tally from gen_index_info into the memory manager; [1]: tally from the memory manager into the output logic.
  logic [ELEM_PER_DDR-1:0] completed_vol_tally [1:0];
  logic [address_info_e.num()-1:0][CNT_BITS-1:0] addr_queue [n_buffer_pools-1:0][ELEM_PER_DDR-1:0];

  logic ready_for_config;
  logic ready_for_transfer;
  logic input_data_valid;
  logic cnt_ready;
  logic next_transfer_overflow;
  logic internal_reset;
  logic resetn_condition, start_rx, done_frame;
  logic config_ready;
  // One bit wider than needed to count MAX_TRANSFERS; the extra MSB is used as the underflow flag.
  logic [$clog2(MAX_TRANSFERS):0] frame_finished, transfer_count;

  layout_transform_config_if layout_transform_config();

  shortint unsigned finished_lines_reg;
  int buffer_usage;

  // counters used by dimension counter logic:
  logic [max($clog2(MAX_CHANNELS), 1)-1:0]       C_d    [ELEM_PER_DDR-1:0];
  logic [max($clog2(MAX_FEATURE_WIDTH), 1)-1:0]  W_d    [ELEM_PER_DDR-1:0];
  logic [max($clog2(MAX_STRIDE_WIDTH), 1)-1:0]   IN_W_d [ELEM_PER_DDR-1:0];
  logic [max($clog2(MAX_FEATURE_WIDTH), 1)-1:0]  S_W_d  [ELEM_PER_DDR-1:0];

  logic [max($clog2(MAX_FEATURE_HEIGHT), 1)-1:0] H_d    [ELEM_PER_DDR-1:0];
  logic [max($clog2(MAX_STRIDE_HEIGHT), 1)-1:0]  IN_H_d [ELEM_PER_DDR-1:0];
  logic [max($clog2(MAX_FEATURE_HEIGHT), 1)-1:0] S_H_d  [ELEM_PER_DDR-1:0];

  logic [max($clog2(MAX_FEATURE_DEPTH), 1)-1:0]  D_d    [ELEM_PER_DDR-1:0];
  logic [max($clog2(MAX_STRIDE_DEPTH), 1)-1:0]   IN_D_d [ELEM_PER_DDR-1:0];
  logic [max($clog2(MAX_FEATURE_DEPTH), 1)-1:0]  S_D_d  [ELEM_PER_DDR-1:0];
  logic [MAX_DIM_BITS-1:0]                       Index_d [ELEM_PER_DDR-1:0];

  // Active-low reset for all submodules: external reset OR the self-reset raised when the
  // last output word is accepted (see internal_reset below).
  assign resetn_condition = i_rstn & !internal_reset;
  assign o_config_ready = config_ready & ready_for_config;

  // o_param_error was declared as an output but had no driver anywhere in this module.
  // No parameter/config checking is implemented here, so tie it low to give the port a
  // defined value instead of leaving it floating.
  assign o_param_error = 1'b0;

  dla_config_deserialize #(
    .CONFIG_WIDTH(CONFIG_DATA_BYTES*8)
  ) lt_config_deserialize (
    .clk(clk),
    .i_resetn(resetn_condition),
    .i_valid(i_config_valid),
    .i_config(i_config_data),
    .o_ready(config_ready),

    .if_config(layout_transform_config)
  );

  // Per-frame transfer bookkeeping.
  // frame_finished is a down-counter loaded with (number of input words - 2) and decremented
  // on every accepted input word; done_frame samples its MSB (pre-decrement value), so
  // done_frame goes high once the counter has wrapped below zero.
  // The synchronous reset is placed last so that it overrides the assignments above it.
  always_ff @( posedge clk ) begin : latch_last_frame
    if (i_valid & o_ready & ~done_frame) begin
      // We've started to accept output for this inference. This
      // signals to the output logic that we can start writing outputs.
      start_rx <= 1;
      frame_finished <= frame_finished - 1;
      done_frame <= frame_finished[$clog2(MAX_TRANSFERS)];
    end else if (layout_transform_config.valid & ~start_rx) begin
      // Don't accept a new config until we're finished with this transform.
      ready_for_config <= 1'b0;
      frame_finished <= ((layout_transform_config.data.feature_volume + ELEM_PER_DDR - 1) / ELEM_PER_DDR) - 2;
    end

    if (~resetn_condition) begin
      start_rx <= 0;
      frame_finished <= MAX_TRANSFERS-2;
      done_frame <= 0;
      ready_for_config <= 1'b1;
    end
  end

  // Lines completed but not yet written out.
  assign buffer_usage = (finished_lines_reg - lines_written);
  assign next_transfer_overflow = available <= buffer_usage; // may need to pipeline this. Maybe change to 'almost full signal'.
  // o_stall is the exact complement of o_ready.
  assign o_ready = ready_for_transfer == 1'b1 & next_transfer_overflow == 1'b0 & !done_frame;
  assign o_stall = ready_for_transfer == 1'b0 | next_transfer_overflow == 1'b1 | done_frame;
  // Self-reset once the last output word is valid and not stalled by the consumer.
  assign internal_reset = o_last & o_valid & !i_stall;

  // Dimension counter: Keeps track of position within tensor of incoming data.
  dla_lt_dimension_counter #(
    .ELEMENTS_PER_CYCLE(ELEM_PER_DDR),
    .MAX_CHANNELS(MAX_CHANNELS),
    .MAX_FEATURE_WIDTH(MAX_FEATURE_WIDTH),
    .MAX_FEATURE_HEIGHT(MAX_FEATURE_HEIGHT),
    .MAX_FEATURE_DEPTH(MAX_FEATURE_DEPTH),
    .MAX_STRIDE_WIDTH(MAX_STRIDE_WIDTH),
    .MAX_STRIDE_HEIGHT(MAX_STRIDE_HEIGHT),
    .MAX_STRIDE_DEPTH(MAX_STRIDE_DEPTH),
    .MAX_INPUT_VOLUME(MAX_INPUT_VOLUME),
    .MAX_DIM_BITS(MAX_DIM_BITS)
  ) dim_counter (
    .clk(clk),
    .i_rstn(resetn_condition),
    .i_increment(input_data_valid | ~ready_for_transfer),
    .if_lt_config(layout_transform_config),

    .o_ready(cnt_ready),
    .o_c_dim(C_d),
    .o_w_dim(W_d),
    .o_h_dim(H_d),
    .o_d_dim(D_d),
    .o_w_inner(IN_W_d),
    .o_h_inner(IN_H_d),
    .o_d_inner(IN_D_d),
    .o_w_stride(S_W_d),
    .o_h_stride(S_H_d),
    .o_d_stride(S_D_d),
    .o_index(Index_d)
  );

  // TODO(arooney): add more conversions
  logic [ELEM_PER_DDR-1:0][15:0] fp16_val;
  if (DO_U8_CONV) begin
    dla_lt_data_conversion #(
      .DDR_BYTES(DDR_BYTES),
      .DATA_ELEMENT_WIDTH(DATA_ELEMENT_WIDTH),
      .ELEMENTS_PER_CYCLE(ELEM_PER_DDR)
    ) data_conversion (
      .clk(clk),
      .i_valid(i_valid & o_ready),
      .i_data(i_data),

      .o_fp16_val(fp16_val),
      .o_valid(input_data_valid)
    );
  end
  else begin
    // Pass-through path: input is used as-is (the parameter assert above requires
    // DATA_ELEMENT_WIDTH == 16 when DO_U8_CONV == 0).
    // NOTE(review): fp16_val is registered here while input_data_valid is combinational;
    // confirm this matches the valid/data alignment the downstream modules expect.
    assign input_data_valid = i_valid & o_ready;
    always_ff @(posedge clk) begin
      fp16_val <= i_data;
    end
  end

  dla_lt_gen_index_info #(
    .ELEMENTS_PER_CYCLE(ELEM_PER_DDR),
    .N_BUFFER_POOLS(n_buffer_pools),
    .CVEC_PER_BUFFER(cvec_per_buffer),
    .N_POOL_BITS(n_pool_bits),
    .CNT_BITS(CNT_BITS),
    .MAX_CHANNELS(MAX_CHANNELS),
    .CVEC(CVEC),
    .MAX_FEATURE_WIDTH(MAX_FEATURE_WIDTH),
    .MAX_FEATURE_HEIGHT(MAX_FEATURE_HEIGHT),
    .MAX_FEATURE_DEPTH(MAX_FEATURE_DEPTH),

    .MAX_STRIDE_WIDTH(MAX_STRIDE_WIDTH),
    .MAX_STRIDE_HEIGHT(MAX_STRIDE_HEIGHT),
    .MAX_STRIDE_DEPTH(MAX_STRIDE_DEPTH),
    .MAX_DIM_BITS(MAX_DIM_BITS),
    .MAX_INPUT_VOLUME(MAX_INPUT_VOLUME)
  ) gen_index_info (
    .clk(clk),
    .i_rstn(resetn_condition),
    .i_next_overflow(next_transfer_overflow),
    .i_valid(input_data_valid),
    .i_ready(cnt_ready),
    .i_c_dim(C_d),
    .i_w_inner(IN_W_d),
    .i_h_inner(IN_H_d),
    .i_d_inner(IN_D_d),
    .i_w_stride(S_W_d),
    .i_h_stride(S_H_d),
    .i_d_stride(S_D_d),
    .i_index(Index_d),
    .if_lt_config(layout_transform_config),

    .o_addr_queue(addr_queue),
    .o_completed_vol_tally(completed_vol_tally[0]),
    .o_ready_for_transfer(ready_for_transfer)
  );

  logic [($clog2(cvec_per_buffer))-1:0] output_line_num [n_buffer_pools-1:0];
  logic [($clog2(cvec_per_buffer))-1:0] curr_out_line [n_buffer_pools-1:0];
  logic [n_buffer_pools-1:0] actively_reading;
  logic [MAX_OUTPUT_C-1:0][16-1:0] output_line_data [n_buffer_pools-1:0];
  dla_lt_memory_manager #(
    .NUM_BUFFER_POOLS(n_buffer_pools),
    .CVEC_PER_BUFFER(cvec_per_buffer),
    .ELEMENTS_PER_CYCLE(ELEM_PER_DDR),
    .CNT_BITS(CNT_BITS),
    .CVEC(CVEC),
    .MAX_OUTPUT_C(MAX_OUTPUT_C),
    .DEVICE(DEVICE)
  ) memory_manager (
    .clk(clk),
    .i_rstn(resetn_condition),
    .i_addr_queue(addr_queue),
    .i_output_line_num(output_line_num),
    .i_actively_reading(actively_reading),
    .i_fp16_data(fp16_val),
    .i_completed_vol_tally(completed_vol_tally[0]),

    .o_completed_vol_tally(completed_vol_tally[1]),
    .o_output_line_data(output_line_data),
    .o_curr_out_line(curr_out_line)
  );

  dla_lt_output_logic #(
    .NUM_BUFFER_POOLS(n_buffer_pools),
    .CVEC_PER_BUFFER(cvec_per_buffer),
    .ELEMENTS_PER_CYCLE(ELEM_PER_DDR),
    .N_POOL_BITS(n_pool_bits),
    .MAX_OUTPUT_W(MAX_OUTPUT_W),
    .MAX_OUTPUT_H(MAX_OUTPUT_H),
    .MAX_OUTPUT_D(MAX_OUTPUT_D),
    .MAX_OUTPUT_C(MAX_OUTPUT_C),
    .CVEC(CVEC),
    .CNT_BITS(CNT_BITS),
    .MAX_DIM_BITS(MAX_DIM_BITS)
  ) output_logic (
    .clk(clk),
    .i_rstn(resetn_condition),
    .i_output_line_data(output_line_data),
    .i_curr_out_line(curr_out_line),
    .i_completed_vol_tally(completed_vol_tally[1]),
    .i_stall(i_stall),
    .if_lt_config(layout_transform_config),
    .i_ready(start_rx),

    .o_line_num(output_line_num),
    .o_read_req(actively_reading),
    .o_data(o_data),
    .o_valid(o_valid),
    .o_last(o_last),
    .o_lines_written(lines_written),
    .o_finished_lines(finished_lines_reg)
  );

endmodule
