summaryrefslogtreecommitdiff
path: root/python/openvino/demo/ip/intel_ai_ip/verilog/dla_layout_transform.sv
diff options
context:
space:
mode:
Diffstat (limited to 'python/openvino/demo/ip/intel_ai_ip/verilog/dla_layout_transform.sv')
-rw-r--r--python/openvino/demo/ip/intel_ai_ip/verilog/dla_layout_transform.sv384
1 files changed, 384 insertions, 0 deletions
diff --git a/python/openvino/demo/ip/intel_ai_ip/verilog/dla_layout_transform.sv b/python/openvino/demo/ip/intel_ai_ip/verilog/dla_layout_transform.sv
new file mode 100644
index 0000000..e52fe53
--- /dev/null
+++ b/python/openvino/demo/ip/intel_ai_ip/verilog/dla_layout_transform.sv
@@ -0,0 +1,384 @@
+// Copyright 2020-2024 Intel Corporation.
+//
+// This software and the related documents are Intel copyrighted materials,
+// and your use of them is governed by the express license under which they
+// were provided to you ("License"). Unless the License provides otherwise,
+// you may not use, modify, copy, publish, distribute, disclose or transmit
+// this software or the related documents without Intel's prior written
+// permission.
+//
+// This software and the related documents are provided as is, with no express
+// or implied warranties, other than those that are expressly stated in the
+// License.
+
+/**
+ * dla_layout_transform.sv
+ *
+ * Top level of the DLA layout transform (LT) module. The transform can convert u8 data to FP16 (this can be
+ * disabled) and converts DHWC tensors to CDHWCvec (which is the format required by the PE array). It will also
+ * fold the data into the CVEC dimension whenever the stride dimensions of the first convolution are non-1.
+ * The folding feature cannot be turned off here - it can be turned off by ensuring that the transform node
+ * in the compiler has strides of 1. See `dla_pass_folding.cpp` to see how this is done in the compiler.
+ *
+ * The main feature of this layout transform is that it can fold input tensor dimensions into the channel dimension
+ * which improves the efficiency of the PE array. The parameters of the first convolution in the graph are
+ * required as input to this module. The input tensor is partitioned into volumes equal to the
+ * STRIDE_HEIGHTxSTRIDE_WIDTHxSTRIDE_DEPTHxCHANNELS of the input convolution.
+ * The partitioned volume is then copied into one "CVEC" line and output once the CVEC line is complete.
+ *
+ * To achieve the folding transform, the DLA layout transform module instantiates the following modules:
+ * > dla_layout_transform.sv - This module, serves as the interface for users, and instantiates top-level
+ * signals and submodules.
+ *
+ * > dla_lt_conversion.sv - If enabled, converts input data from U8 to FP16 data types.
+ *
+ * > dla_lt_dimension_counter.sv - Generates tensor indexes for all tokens in incoming data packet.
+ *
+ * > dla_lt_gen_index_info.sv - Uses tensor indexes from the lt_dimension_counter to calculate the mapping
+ * target output position; this includes which RAM module, RAM line, and position within the RAM line
+ * (each line holds a CVEC line of output data) each output is mapped to. The memory manager uses this
+ * data to emplace the incoming data into its position in the RAM.
+ *
+ * > dla_lt_memory_manager.sv - Uses the addressing information from the lt_gen_index_info module to emplace
+ * incoming data into the RAM. The RAM is used to store intermediate results because often, when we fold
+ * data, we have to buffer an output CVEC for many cycles before all the data becomes available.
+ * Has these submodules,
+ * > dla_lt_ram_arb.sv - Arbitrates write requests and read requests from two sources.
+ * > dla_lt_funnel.sv - Maps data from incoming data packet to the correct position within CVEC in
+ * a single cycle for all data in the input packet. Uses the indexing info from the lt_gen_index_info
+ * module.
+ *
+ * > dla_lt_output_logic.sv - Keeps track of the number of completed CVEC lines, and writes them to output.
+ * This module is also responsible for keeping track of the output dimensions and writing padding lines when
+ * required.
+ *
+ */
+
+`resetall
+`undefineall
+`default_nettype none
+
+`include "dla_acl_parameter_assert.svh"
+
+// Rounds the folded channel count (channels * stride_width * stride_height * stride_depth) up to a multiple of cvec.
+function int calc_output_channels(input int cvec, channels, stride_height, stride_width, stride_depth);
+ integer cvec_lines;
+ // Ceiling division: number of cvec-wide lines needed to hold every folded channel.
+ cvec_lines = ((channels * stride_width * stride_height * stride_depth) + cvec - 1) / cvec;
+ return cvec_lines * cvec;
+endfunction
+
+// Upper bound for one output dimension; evaluates to feature_dim + pad_dim + filter_dim.
+// dilation_dim and stride_dim are part of the interface but are not used by this bound.
+function int calc_output_dim_max(input int feature_dim, filter_dim, dilation_dim, pad_dim, stride_dim);
+ integer padded_extent;
+ // Exact convolution size, kept for reference only (not evaluated):
+ //   (feature_dim - ((filter_dim - 1) * dilation_dim + 1) + pad_dim) / stride_dim + 1
+ padded_extent = (feature_dim + pad_dim) + 1;
+ return padded_extent + filter_dim - 1;
+endfunction
+
+ module dla_layout_transform // Top level of the layout transform: wires up the config deserializer, dimension counter, optional U8 conversion, index-info generation, memory manager and output logic instantiated below.
+ import dla_common_pkg::*,dla_lt_pkg::*;
+ #(
+ // Convolution parameters (upper bounds, used below to size counters and buffers). NOTE: the 0 defaults are placeholders only - CVEC and MAX_STRIDE_* are used as divisors in the localparams below.
+ parameter int MAX_CHANNELS =0,
+ parameter int MAX_FEATURE_HEIGHT=0,
+ parameter int MAX_FEATURE_WIDTH=0,
+ parameter int MAX_FEATURE_DEPTH=0,
+ parameter int MAX_STRIDE_HEIGHT=0,
+ parameter int MAX_STRIDE_WIDTH=0,
+ parameter int MAX_STRIDE_DEPTH=0,
+ parameter int MAX_PAD_FRONT=0,
+ parameter int MAX_PAD_LEFT=0,
+ parameter int MAX_PAD_TOP=0,
+ parameter int MAX_FILTER_WIDTH=4,
+ parameter int MAX_FILTER_HEIGHT=4,
+ parameter int MAX_FILTER_DEPTH=4,
+ parameter int MAX_DILATION_WIDTH, // no default: must be set by the instantiating module
+ parameter int MAX_DILATION_HEIGHT, // no default: must be set by the instantiating module
+ parameter int MAX_DILATION_DEPTH, // no default: must be set by the instantiating module
+
+ // Exact parameters
+ parameter int CVEC=0,
+ parameter bit DO_U8_CONV=1, // 1: instantiate dla_lt_data_conversion; 0: register i_data straight into fp16_val
+ parameter int DATA_ELEMENT_WIDTH = 32, // must be 8 (DO_U8_CONV=1) or 16 (DO_U8_CONV=0), see the parameter assert below
+ parameter int CNT_BITS = 32,
+ parameter int DDR_BYTES = 4, // i_data is 8*DDR_BYTES bits wide
+ parameter int CONFIG_DATA_BYTES, // no default: must be set by the instantiating module
+
+ device_family_t DEVICE, // forwarded to dla_lt_memory_manager
+ // Derived Params
+ localparam int MAX_DIM_BITS = $clog2((MAX_FEATURE_DEPTH + MAX_PAD_FRONT) * (MAX_FEATURE_HEIGHT + MAX_PAD_TOP) * (MAX_FEATURE_WIDTH + MAX_PAD_LEFT) * CVEC), // bit width of each Index_d entry
+ localparam int unsigned MAX_INPUT_VOLUME = MAX_CHANNELS * MAX_FEATURE_WIDTH * MAX_FEATURE_HEIGHT * MAX_FEATURE_DEPTH,
+ //todo: Capitalize constants...
+ localparam int unsigned ELEM_PER_DDR = (DDR_BYTES*8)/DATA_ELEMENT_WIDTH, // number of input elements carried by one i_data word
+ localparam int unsigned OUTPUT_DATA_WIDTH = 16, // bits per o_data element
+ localparam int MAX_TRANSFERS = (MAX_INPUT_VOLUME + ELEM_PER_DDR -1) / ELEM_PER_DDR // ceil(MAX_INPUT_VOLUME / ELEM_PER_DDR)
+ ) (
+ // Module connections
+ input wire clk,
+ input wire i_rstn, // active-low; combined with internal_reset into resetn_condition
+ input wire [CONFIG_DATA_BYTES*8-1:0] i_config_data,
+ input wire i_config_valid,
+ output logic o_config_ready,
+ input wire [8*DDR_BYTES-1:0] i_data,
+ input wire i_valid,
+ input wire i_stall, // forwarded to dla_lt_output_logic and used to qualify internal_reset
+ output logic o_ready,
+ output logic o_stall, // logical complement of o_ready (see the assigns below)
+ output logic [CVEC-1:0][OUTPUT_DATA_WIDTH-1:0] o_data,
+ output logic o_valid,
+ output logic o_last,
+ output logic o_param_error // NOTE(review): never driven anywhere in this module - confirm intent
+ );
+
+ `DLA_ACL_PARAMETER_ASSERT((DO_U8_CONV == 0 && DATA_ELEMENT_WIDTH == 16) || (DO_U8_CONV == 1 && DATA_ELEMENT_WIDTH == 8)); // only supported combinations currently.
+
+ localparam int unsigned MAX_OUTPUT_C = calc_output_channels(CVEC, MAX_CHANNELS, MAX_STRIDE_HEIGHT, MAX_STRIDE_WIDTH, MAX_STRIDE_DEPTH);
+ localparam int unsigned MAX_OUTPUT_W = calc_output_dim_max(MAX_FEATURE_WIDTH, MAX_FILTER_WIDTH, MAX_DILATION_WIDTH, MAX_PAD_LEFT, MAX_STRIDE_WIDTH);
+ localparam int unsigned MAX_OUTPUT_H = calc_output_dim_max(MAX_FEATURE_HEIGHT, MAX_FILTER_HEIGHT, MAX_DILATION_HEIGHT, MAX_PAD_TOP, MAX_STRIDE_HEIGHT);
+ localparam int unsigned MAX_OUTPUT_D = calc_output_dim_max(MAX_FEATURE_DEPTH, MAX_FILTER_DEPTH, MAX_DILATION_DEPTH, MAX_PAD_FRONT, MAX_STRIDE_DEPTH);
+ localparam int unsigned MAX_INNER_ROWS = MAX_OUTPUT_C > CVEC ? MAX_OUTPUT_C/CVEC-1 : 0; // CVEC lines per output position beyond the first
+ localparam int unsigned MAX_OUTPUT_VOLUME = MAX_OUTPUT_W * MAX_OUTPUT_H * MAX_OUTPUT_C; // NOTE(review): omits MAX_OUTPUT_D and is not referenced in this module
+ // If MAX_CHANNELS > ELEM_PER_DDR then only 2 CVECs are modified per cycle in the worst case. Bump up to 4 to reduce congestion; this value heavily affects area! Note the "+ 20" below binds only to the calc_max_partitions() branch of the ternary.
+ localparam shortint unsigned max_num_partitions = MAX_CHANNELS > ELEM_PER_DDR ? 4 : calc_max_partitions(
+ MAX_FEATURE_HEIGHT, MAX_FEATURE_WIDTH, MAX_CHANNELS, MAX_STRIDE_HEIGHT, MAX_STRIDE_WIDTH, ELEM_PER_DDR
+ ) + 20; // TODO: The max_partition calculation does a pretty good job, but there are some edge conditions that are not accounted for. Review. For now, +N works.
+ localparam integer num_buffers = (max_num_partitions)+(MAX_FEATURE_WIDTH/MAX_STRIDE_WIDTH) + MAX_INNER_ROWS*(MAX_FEATURE_WIDTH/MAX_STRIDE_WIDTH)*(MAX_FEATURE_HEIGHT/MAX_STRIDE_HEIGHT)*(MAX_FEATURE_DEPTH/MAX_STRIDE_DEPTH);// minimum number of buffers!!!
+ localparam int n_pool_bits = $clog2((max_num_partitions));
+ localparam int n_buffer_pools = $rtoi($pow(2, n_pool_bits)); // max_num_partitions rounded up to a power of 2
+ localparam int cvec_per_buffer = $rtoi($pow(2, $clog2(($rtoi($ceil((num_buffers*1.0)/(max_num_partitions)))+1) + 30))); // round up to a power of 2 since we mod by the value in a few places.
+ localparam int total_buffers = n_buffer_pools * cvec_per_buffer;
+ localparam int buffers_in_progress = max_num_partitions*7; // NOTE(review): the factor of 7 is not explained in this file
+
+ localparam int available = total_buffers - buffers_in_progress; // threshold compared against buffer_usage to raise next_transfer_overflow
+
+ logic [$clog2((MAX_OUTPUT_W * MAX_OUTPUT_H * MAX_OUTPUT_D * (MAX_OUTPUT_C/CVEC)))-1:0] lines_written; // driven by output_logic.o_lines_written
+ logic [ELEM_PER_DDR-1:0] completed_vol_tally [1:0]; // [0]: gen_index_info -> memory_manager, [1]: memory_manager -> output_logic
+ logic [address_info_e.num()-1:0][CNT_BITS-1:0] addr_queue [n_buffer_pools-1:0][ELEM_PER_DDR-1:0]; // gen_index_info -> memory_manager
+
+ logic ready_for_config;
+ logic ready_for_transfer;
+ logic input_data_valid;
+ logic cnt_ready;
+ logic next_transfer_overflow;
+ logic internal_reset;
+ logic resetn_condition, start_rx, done_frame;
+ logic config_ready;
+ logic [$clog2(MAX_TRANSFERS):0] frame_finished, transfer_count; // one extra MSB so frame_finished can signal underflow; transfer_count is not referenced in this module
+
+ layout_transform_config_if layout_transform_config(); // written by lt_config_deserialize, read by the sub-blocks below
+
+ shortint unsigned finished_lines_reg; // driven by output_logic.o_finished_lines
+ int buffer_usage;
+
+ // Per-element position counters produced by dim_counter (one array entry per element of the input word):
+ logic [max($clog2(MAX_CHANNELS), 1)-1:0] C_d [ELEM_PER_DDR-1:0];
+ logic [max($clog2(MAX_FEATURE_WIDTH), 1)-1:0] W_d [ELEM_PER_DDR-1:0];
+ logic [max($clog2(MAX_STRIDE_WIDTH), 1)-1:0] IN_W_d [ELEM_PER_DDR-1:0];
+ logic [max($clog2(MAX_FEATURE_WIDTH), 1)-1:0] S_W_d [ELEM_PER_DDR-1:0];
+
+ logic [max($clog2(MAX_FEATURE_HEIGHT), 1)-1:0] H_d [ELEM_PER_DDR-1:0];
+ logic [max($clog2(MAX_STRIDE_HEIGHT), 1)-1:0] IN_H_d [ELEM_PER_DDR-1:0];
+ logic [max($clog2(MAX_FEATURE_HEIGHT), 1)-1:0] S_H_d [ELEM_PER_DDR-1:0];
+
+ logic [max($clog2(MAX_FEATURE_DEPTH), 1)-1:0] D_d [ELEM_PER_DDR-1:0];
+ logic [max($clog2(MAX_STRIDE_DEPTH), 1)-1:0] IN_D_d [ELEM_PER_DDR-1:0];
+ logic [max($clog2(MAX_FEATURE_DEPTH), 1)-1:0] S_D_d [ELEM_PER_DDR-1:0];
+ logic [MAX_DIM_BITS-1:0] Index_d [ELEM_PER_DDR-1:0];
+
+ assign resetn_condition = i_rstn & !internal_reset; // active-low: asserted by the external reset or by the end-of-frame internal reset
+ assign o_config_ready = config_ready & ready_for_config; // deserializer ready AND no transform currently holding a config
+
+ dla_config_deserialize #(
+ .CONFIG_WIDTH(CONFIG_DATA_BYTES*8)
+ ) lt_config_deserialize (
+ .clk(clk),
+ .i_resetn(resetn_condition),
+ .i_valid(i_config_valid),
+ .i_config(i_config_data),
+ .o_ready(config_ready),
+
+ .if_config(layout_transform_config)
+ );
+
+ always_ff @( posedge clk ) begin : latch_last_frame
+ frame_finished <= frame_finished; // default: hold
+
+ if (i_valid & o_ready & ~done_frame) begin
+ // We've started to accept input for this inference. This
+ // signals to the output logic that we can start writing outputs.
+ start_rx <= 1;
+ frame_finished <= frame_finished - 1;
+ done_frame <= frame_finished[$clog2(MAX_TRANSFERS)]; // MSB = counter underflowed; with the "- 2" preload it is already set when the final transfer of the frame is accepted
+ end else if (layout_transform_config.valid & ~start_rx) begin
+ // Don't accept a new config until we're finished with this transform.
+ ready_for_config <= 1'b0;
+ frame_finished <= ((layout_transform_config.data.feature_volume + ELEM_PER_DDR - 1) / ELEM_PER_DDR) - 2; // ceil(feature_volume / ELEM_PER_DDR) - 2
+ end
+
+ if (~resetn_condition) begin // synchronous reset; placed last so it overrides the assignments above
+ start_rx <= 0;
+ frame_finished <= MAX_TRANSFERS-2;
+ done_frame <= 0;
+ ready_for_config <= 1'b1;
+ end
+ end
+
+ assign buffer_usage = (finished_lines_reg - lines_written); // finished lines that output_logic has not yet written out
+ assign next_transfer_overflow = available <= buffer_usage; // may need to pipeline this. Maybe change to 'almost full signal'.
+ assign o_ready = ready_for_transfer == 1'b1 & next_transfer_overflow == 1'b0 & !done_frame;
+ assign o_stall = ready_for_transfer == 1'b0 | next_transfer_overflow == 1'b1 | done_frame;
+ assign internal_reset = o_last & o_valid & !i_stall; // last output line accepted downstream: reset this module and its sub-blocks via resetn_condition
+ // Dimension counter: Keeps track of position within tensor of incoming data.
+ dla_lt_dimension_counter #(
+ .ELEMENTS_PER_CYCLE(ELEM_PER_DDR),
+ .MAX_CHANNELS(MAX_CHANNELS),
+ .MAX_FEATURE_WIDTH(MAX_FEATURE_WIDTH),
+ .MAX_FEATURE_HEIGHT(MAX_FEATURE_HEIGHT),
+ .MAX_FEATURE_DEPTH(MAX_FEATURE_DEPTH),
+ .MAX_STRIDE_WIDTH(MAX_STRIDE_WIDTH),
+ .MAX_STRIDE_HEIGHT(MAX_STRIDE_HEIGHT),
+ .MAX_STRIDE_DEPTH(MAX_STRIDE_DEPTH),
+ .MAX_INPUT_VOLUME(MAX_INPUT_VOLUME),
+ .MAX_DIM_BITS(MAX_DIM_BITS)
+ ) dim_counter (
+ .clk(clk),
+ .i_rstn(resetn_condition),
+ .i_increment(input_data_valid | ~ready_for_transfer), // asserted on valid input data, and also whenever gen_index_info is not ready for a transfer
+ .if_lt_config(layout_transform_config),
+
+ .o_ready(cnt_ready),
+ .o_c_dim(C_d),
+ .o_w_dim(W_d),
+ .o_h_dim(H_d),
+ .o_d_dim(D_d),
+ .o_w_inner(IN_W_d),
+ .o_h_inner(IN_H_d),
+ .o_d_inner(IN_D_d),
+ .o_w_stride(S_W_d),
+ .o_h_stride(S_H_d),
+ .o_d_stride(S_D_d),
+ .o_index(Index_d)
+ );
+
+ // TODO(arooney): add more conversions
+ logic [ELEM_PER_DDR-1:0][15:0] fp16_val; // 16-bit data elements handed to memory_manager
+ if (DO_U8_CONV) begin // generate-if: conversion block produces fp16_val and input_data_valid
+ dla_lt_data_conversion #(
+ .DDR_BYTES(DDR_BYTES),
+ .DATA_ELEMENT_WIDTH(DATA_ELEMENT_WIDTH),
+ .ELEMENTS_PER_CYCLE(ELEM_PER_DDR)
+ ) data_conversion (
+ .clk(clk),
+ .i_valid(i_valid & o_ready),
+ .i_data(i_data),
+
+ .o_fp16_val(fp16_val),
+ .o_valid(input_data_valid)
+ );
+ end
+ else begin // no conversion: i_data is registered directly into fp16_val
+ assign input_data_valid = i_valid & o_ready; // NOTE(review): combinational, while fp16_val below is registered - confirm downstream alignment
+ always_ff @(posedge clk) begin
+ fp16_val <= i_data;
+ end
+ end
+
+ dla_lt_gen_index_info #(
+ .ELEMENTS_PER_CYCLE(ELEM_PER_DDR),
+ .N_BUFFER_POOLS(n_buffer_pools),
+ .CVEC_PER_BUFFER(cvec_per_buffer),
+ .N_POOL_BITS(n_pool_bits),
+ .CNT_BITS(CNT_BITS),
+ .MAX_CHANNELS(MAX_CHANNELS),
+ .CVEC(CVEC),
+ .MAX_FEATURE_WIDTH(MAX_FEATURE_WIDTH),
+ .MAX_FEATURE_HEIGHT(MAX_FEATURE_HEIGHT),
+ .MAX_FEATURE_DEPTH(MAX_FEATURE_DEPTH),
+
+ .MAX_STRIDE_WIDTH(MAX_STRIDE_WIDTH),
+ .MAX_STRIDE_HEIGHT(MAX_STRIDE_HEIGHT),
+ .MAX_STRIDE_DEPTH(MAX_STRIDE_DEPTH),
+ .MAX_DIM_BITS(MAX_DIM_BITS),
+ .MAX_INPUT_VOLUME(MAX_INPUT_VOLUME)
+ ) gen_index_info (
+ .clk(clk),
+ .i_rstn(resetn_condition),
+ .i_next_overflow(next_transfer_overflow),
+ .i_valid(input_data_valid),
+ .i_ready(cnt_ready),
+ .i_c_dim(C_d),
+ .i_w_inner(IN_W_d),
+ .i_h_inner(IN_H_d),
+ .i_d_inner(IN_D_d),
+ .i_w_stride(S_W_d),
+ .i_h_stride(S_H_d),
+ .i_d_stride(S_D_d),
+ .i_index(Index_d),
+ .if_lt_config(layout_transform_config),
+
+ .o_addr_queue(addr_queue),
+ .o_completed_vol_tally(completed_vol_tally[0]),
+ .o_ready_for_transfer(ready_for_transfer) // gates o_ready / o_stall above
+ );
+
+ logic [($clog2(cvec_per_buffer))-1:0] output_line_num [n_buffer_pools-1:0]; // output_logic -> memory_manager
+ logic [($clog2(cvec_per_buffer))-1:0] curr_out_line [n_buffer_pools-1:0]; // memory_manager -> output_logic
+ logic [n_buffer_pools-1:0] actively_reading; // output_logic.o_read_req -> memory_manager
+ logic [MAX_OUTPUT_C-1:0][16-1:0] output_line_data [n_buffer_pools-1:0]; // memory_manager -> output_logic
+ dla_lt_memory_manager #(
+ .NUM_BUFFER_POOLS(n_buffer_pools),
+ .CVEC_PER_BUFFER(cvec_per_buffer),
+ .ELEMENTS_PER_CYCLE(ELEM_PER_DDR),
+ .CNT_BITS(CNT_BITS),
+ .CVEC(CVEC),
+ .MAX_OUTPUT_C(MAX_OUTPUT_C),
+ .DEVICE(DEVICE)
+ ) memory_manager (
+ .clk(clk),
+ .i_rstn(resetn_condition),
+ .i_addr_queue(addr_queue),
+ .i_output_line_num(output_line_num),
+ .i_actively_reading(actively_reading),
+ .i_fp16_data(fp16_val),
+ .i_completed_vol_tally(completed_vol_tally[0]),
+
+ .o_completed_vol_tally(completed_vol_tally[1]),
+ .o_output_line_data(output_line_data),
+ .o_curr_out_line(curr_out_line)
+ );
+
+ dla_lt_output_logic #(
+ .NUM_BUFFER_POOLS(n_buffer_pools),
+ .CVEC_PER_BUFFER(cvec_per_buffer),
+ .ELEMENTS_PER_CYCLE(ELEM_PER_DDR),
+ .N_POOL_BITS(n_pool_bits),
+ .MAX_OUTPUT_W(MAX_OUTPUT_W),
+ .MAX_OUTPUT_H(MAX_OUTPUT_H),
+ .MAX_OUTPUT_D(MAX_OUTPUT_D),
+ .MAX_OUTPUT_C(MAX_OUTPUT_C),
+ .CVEC(CVEC),
+ .CNT_BITS(CNT_BITS),
+ .MAX_DIM_BITS(MAX_DIM_BITS)
+ ) output_logic (
+ .clk(clk),
+ .i_rstn(resetn_condition),
+ .i_output_line_data(output_line_data),
+ .i_curr_out_line(curr_out_line),
+ .i_completed_vol_tally(completed_vol_tally[1]),
+ .i_stall(i_stall),
+ .if_lt_config(layout_transform_config),
+ .i_ready(start_rx), // set once the first input word of the frame has been accepted
+
+ .o_line_num(output_line_num),
+ .o_read_req(actively_reading),
+ .o_data(o_data),
+ .o_valid(o_valid),
+ .o_last(o_last),
+ .o_lines_written(lines_written),
+ .o_finished_lines(finished_lines_reg)
+ );
+
+endmodule