diff options
Diffstat (limited to 'python/openvino/demo/ip/intel_ai_ip/verilog/dla_lt_dimension_counter.sv')
| -rw-r--r-- | python/openvino/demo/ip/intel_ai_ip/verilog/dla_lt_dimension_counter.sv | 401 |
1 files changed, 401 insertions, 0 deletions
// Copyright 2020-2024 Intel Corporation.
//
// This software and the related documents are Intel copyrighted materials,
// and your use of them is governed by the express license under which they
// were provided to you ("License"). Unless the License provides otherwise,
// you may not use, modify, copy, publish, distribute, disclose or transmit
// this software or the related documents without Intel's prior written
// permission.
//
// This software and the related documents are provided as is, with no express
// or implied warranties, other than those that are expressly stated in the
// License.

/**
 * dla_lt_dimension_counter.sv
 *
 * This module generates tensor indexing information for N elements each cycle
 * (N = ELEMENTS_PER_CYCLE). The layout transformation needs to know the context
 * of each incoming token, and thus we need to calculate the dimension
 * information only from the tensor index. The transform also needs to know which
 * "stride partition" the input belongs to, and where within the partition the
 * input is from.
 *
 * For example, consider a 4x4 RGB tensor with 2x2 partitions. Each value below
 * is the element's position in CWH ordering (index = C + 3*W + 12*H for this
 * example); W increases left to right, and the (0,0) partition is bracketed:
 *
 *        C=0               C=1               C=2
 *    [ 0  3]  6  9     [ 1  4]  7 10     [ 2  5]  8 11      H=0
 *    [12 15] 18 21     [13 16] 19 22     [14 17] 20 23      H=1
 *     24 27  30 33      25 28  31 34      26 29  32 35      H=2
 *     36 39  42 45      37 40  43 46      38 41  44 47      H=3
 *
 * In this case, for instance, the element 16 is in position C=1, W=1, H=1, and
 * stride (S_W=0,S_H=0), and is in position (inner_W=1,inner_H=1) within the
 * stride.
 *
 * Structure:
 *   - dla_lt_step_counter supplies the coordinates of the first token of every
 *     group of ELEMENTS_PER_CYCLE tokens.
 *   - Token k (k > 0) is derived from token k-1 by a registered "+1 with carry"
 *     step, so token k is available k cycles after token 0. Per-token delay
 *     lines re-align all tokens before they are driven onto the outputs.
 *   - o_ready rises once the pipeline has been filled, i.e. after
 *     ELEMENTS_PER_CYCLE + 1 enabled cycles following reset.
 */

`resetall
`undefineall
`default_nettype none

import dla_lt_pkg::*;

module dla_lt_dimension_counter import dla_common_pkg::*; #(
  parameter int ELEMENTS_PER_CYCLE,   // tokens indexed per clock cycle
  parameter int MAX_CHANNELS,         // sizes o_c_dim

  parameter int MAX_FEATURE_WIDTH,    // sizes o_w_dim / o_w_stride
  parameter int MAX_FEATURE_HEIGHT,   // sizes o_h_dim / o_h_stride
  parameter int MAX_FEATURE_DEPTH,    // sizes o_d_dim / o_d_stride

  parameter int MAX_STRIDE_WIDTH,     // sizes o_w_inner
  parameter int MAX_STRIDE_HEIGHT,    // sizes o_h_inner
  parameter int MAX_STRIDE_DEPTH,     // sizes o_d_inner
  parameter int MAX_INPUT_VOLUME,     // not referenced inside this module

  parameter int MAX_DIM_BITS          // width of o_index
) (
  input  wire clk,
  input  wire i_rstn,                 // active-low, sampled synchronously on clk
  input  wire i_increment,            // advance by one group of tokens (once primed)

  layout_transform_config_if if_lt_config,

  // ready signal is asserted when the startup sequence is over
  output wire o_ready,

  // Tensor dimension output:
  output wire [max($clog2(MAX_CHANNELS), 1)-1:0]       o_c_dim    [ELEMENTS_PER_CYCLE-1:0],
  output wire [max($clog2(MAX_FEATURE_WIDTH), 1)-1:0]  o_w_dim    [ELEMENTS_PER_CYCLE-1:0],
  output wire [max($clog2(MAX_FEATURE_HEIGHT), 1)-1:0] o_h_dim    [ELEMENTS_PER_CYCLE-1:0],
  output wire [max($clog2(MAX_FEATURE_DEPTH), 1)-1:0]  o_d_dim    [ELEMENTS_PER_CYCLE-1:0],

  // Location within the stride partition:
  output wire [max($clog2(MAX_STRIDE_WIDTH), 1)-1:0]   o_w_inner  [ELEMENTS_PER_CYCLE-1:0],
  output wire [max($clog2(MAX_STRIDE_HEIGHT), 1)-1:0]  o_h_inner  [ELEMENTS_PER_CYCLE-1:0],
  output wire [max($clog2(MAX_STRIDE_DEPTH), 1)-1:0]   o_d_inner  [ELEMENTS_PER_CYCLE-1:0],

  // Which stride partition:
  output wire [max($clog2(MAX_FEATURE_WIDTH), 1)-1:0]  o_w_stride [ELEMENTS_PER_CYCLE-1:0],
  output wire [max($clog2(MAX_FEATURE_HEIGHT), 1)-1:0] o_h_stride [ELEMENTS_PER_CYCLE-1:0],
  output wire [max($clog2(MAX_FEATURE_DEPTH), 1)-1:0]  o_d_stride [ELEMENTS_PER_CYCLE-1:0],

  // Input index in original ordering:
  output wire [MAX_DIM_BITS-1:0]                       o_index    [ELEMENTS_PER_CYCLE-1:0]
);

  // ---------------------------------------------------------------------------
  // Derived constants
  // ---------------------------------------------------------------------------

  // Field widths; same expressions as the port declarations above.
  localparam int C_BITS    = max($clog2(MAX_CHANNELS), 1);
  localparam int W_BITS    = max($clog2(MAX_FEATURE_WIDTH), 1);
  localparam int H_BITS    = max($clog2(MAX_FEATURE_HEIGHT), 1);
  localparam int D_BITS    = max($clog2(MAX_FEATURE_DEPTH), 1);
  localparam int IN_W_BITS = max($clog2(MAX_STRIDE_WIDTH), 1);
  localparam int IN_H_BITS = max($clog2(MAX_STRIDE_HEIGHT), 1);
  localparam int IN_D_BITS = max($clog2(MAX_STRIDE_DEPTH), 1);

  // Width of the coordinates exchanged with dla_lt_step_counter. Values are
  // truncated to the field widths above when captured into the register banks.
  localparam int STEP_DIM_BITS = 16;

  // MSB position of the startup counter. The counter starts at
  // ELEMENTS_PER_CYCLE (which always fits below this bit) and counts down; when
  // it underflows past zero the MSB becomes 1 and stays there.
  localparam int STATE_MSB = $clog2(ELEMENTS_PER_CYCLE + 1);

  // A token captured at index I reaches the output registers ELEMENTS_PER_CYCLE
  // enabled cycles later, during which next_index has advanced by
  // ELEMENTS_PER_CYCLE each cycle. This is the amount to subtract to recover I.
  localparam int INDEX_PIPELINE_LAG = ELEMENTS_PER_CYCLE * (ELEMENTS_PER_CYCLE);

  // ---------------------------------------------------------------------------
  // Startup / enable control
  // ---------------------------------------------------------------------------

  logic [STATE_MSB:0]     state_cnt;        // startup down-counter
  logic [MAX_DIM_BITS-1:0] next_index;      // index of the group currently being captured
  logic                   pipeline_primed;  // 1 once the startup sequence is over
  logic                   step_enable;      // advance the step counter
  logic                   capture_enable;   // advance every register in this module

  assign pipeline_primed = state_cnt[STATE_MSB];
  assign o_ready         = pipeline_primed;

  // Until the pipeline is primed it free-runs (whenever the configuration is
  // valid) so that it fills itself; afterwards it only moves on i_increment.
  assign step_enable    = (~pipeline_primed | i_increment) & if_lt_config.valid;

  // ---------------------------------------------------------------------------
  // Output registers
  // ---------------------------------------------------------------------------

  logic [C_BITS-1:0]       C_d     [ELEMENTS_PER_CYCLE-1:0];
  logic [W_BITS-1:0]       W_d     [ELEMENTS_PER_CYCLE-1:0];
  logic [IN_W_BITS-1:0]    IN_W_d  [ELEMENTS_PER_CYCLE-1:0];
  logic [W_BITS-1:0]       S_W_d   [ELEMENTS_PER_CYCLE-1:0];

  logic [H_BITS-1:0]       H_d     [ELEMENTS_PER_CYCLE-1:0];
  logic [IN_H_BITS-1:0]    IN_H_d  [ELEMENTS_PER_CYCLE-1:0];
  logic [H_BITS-1:0]       S_H_d   [ELEMENTS_PER_CYCLE-1:0];

  logic [D_BITS-1:0]       D_d     [ELEMENTS_PER_CYCLE-1:0];
  logic [IN_D_BITS-1:0]    IN_D_d  [ELEMENTS_PER_CYCLE-1:0];
  logic [D_BITS-1:0]       S_D_d   [ELEMENTS_PER_CYCLE-1:0];

  logic [MAX_DIM_BITS-1:0] Index_d [ELEMENTS_PER_CYCLE-1:0];

  assign o_c_dim    = C_d;
  assign o_w_dim    = W_d;
  assign o_h_dim    = H_d;
  assign o_d_dim    = D_d;
  assign o_w_inner  = IN_W_d;
  assign o_h_inner  = IN_H_d;
  assign o_d_inner  = IN_D_d;
  assign o_w_stride = S_W_d;
  assign o_h_stride = S_H_d;
  assign o_d_stride = S_D_d;
  assign o_index    = Index_d;

  // ---------------------------------------------------------------------------
  // Per-token delay lines
  // ---------------------------------------------------------------------------
  // x_regbank[t][0] holds the freshly computed coordinate of token t; entries
  // [t][1 .. ELEMENTS_PER_CYCLE-t-1] delay it so that every token leaves the
  // bank in the same cycle. Only that triangular part is ever written outside
  // reset; the arrays are declared square because the triangular form could not
  // be indexed conveniently from a generate block.
  logic [C_BITS-1:0]    C_regbank    [ELEMENTS_PER_CYCLE-1:0] [ELEMENTS_PER_CYCLE-1:0];
  logic [W_BITS-1:0]    W_regbank    [ELEMENTS_PER_CYCLE-1:0] [ELEMENTS_PER_CYCLE-1:0];
  logic [IN_W_BITS-1:0] IN_W_regbank [ELEMENTS_PER_CYCLE-1:0] [ELEMENTS_PER_CYCLE-1:0];
  logic [W_BITS-1:0]    S_W_regbank  [ELEMENTS_PER_CYCLE-1:0] [ELEMENTS_PER_CYCLE-1:0];
  logic [H_BITS-1:0]    H_regbank    [ELEMENTS_PER_CYCLE-1:0] [ELEMENTS_PER_CYCLE-1:0];
  logic [IN_H_BITS-1:0] IN_H_regbank [ELEMENTS_PER_CYCLE-1:0] [ELEMENTS_PER_CYCLE-1:0];
  logic [H_BITS-1:0]    S_H_regbank  [ELEMENTS_PER_CYCLE-1:0] [ELEMENTS_PER_CYCLE-1:0];
  logic [D_BITS-1:0]    D_regbank    [ELEMENTS_PER_CYCLE-1:0] [ELEMENTS_PER_CYCLE-1:0];
  logic [IN_D_BITS-1:0] IN_D_regbank [ELEMENTS_PER_CYCLE-1:0] [ELEMENTS_PER_CYCLE-1:0];
  logic [D_BITS-1:0]    S_D_regbank  [ELEMENTS_PER_CYCLE-1:0] [ELEMENTS_PER_CYCLE-1:0];

  // ---------------------------------------------------------------------------
  // Coordinates of the first token of each group
  // ---------------------------------------------------------------------------

  logic [STEP_DIM_BITS-1:0] channel_, width_, width_stride_, width_inner_,
                            height_, height_stride_, height_inner_, depth_;
  logic                     counters_valid_;

  // Registers in this module only move when the step counter output is valid.
  assign capture_enable = step_enable & counters_valid_;

  dla_lt_step_counter #(
    .ELEMENTS_PER_CYCLE(ELEMENTS_PER_CYCLE),
    .DIM_BITS(STEP_DIM_BITS),
    .DEPTH_TENSOR(0) // TODO(arooney): Enable 3D inputs
  ) step_counter (
    .clk(clk),
    .i_resetn(i_rstn),
    .i_increment(step_enable),
    .i_channel_dim(if_lt_config.data.input_channels),
    .i_width_dim(if_lt_config.data.input_width),
    .i_width_overhang(if_lt_config.data.w_padding_per_stride),
    .i_height_overhang(if_lt_config.data.h_padding_per_stride),
    .i_height_dim(if_lt_config.data.input_height),
    .i_depth_dim(if_lt_config.data.input_depth),
    .i_channel_step(if_lt_config.data.c_step),
    .i_width_stride(if_lt_config.data.stride_width),
    .i_width_step(if_lt_config.data.w_step),
    .i_stride_w_count(if_lt_config.data.output_w_range),
    .i_width_stride_step(if_lt_config.data.w_stride_step),
    .i_width_inner_step(if_lt_config.data.w_inner_step),
    .i_height_stride(if_lt_config.data.stride_height),
    .i_height_step(if_lt_config.data.h_step),
    .i_stride_h_count(if_lt_config.data.output_h_range),
    .i_height_stride_step(if_lt_config.data.h_stride_step),
    .i_height_inner_step(if_lt_config.data.h_inner_step),
    .i_depth_step(if_lt_config.data.d_step),
    .i_pad_w(if_lt_config.data.left_pad),
    .i_pad_h(if_lt_config.data.high_pad),
    .i_continue_count_cond(if_lt_config.data.continue_count_cond),
    .i_overhang_end_w(if_lt_config.data.w_end_overhang),
    .i_w_nstrides(if_lt_config.data.w_nstrides),
    .i_h_nstrides(if_lt_config.data.h_nstrides),
    .o_channel(channel_),
    .o_width(width_),
    .o_width_stride(width_stride_),
    .o_width_inner(width_inner_),
    .o_height(height_),
    .o_height_stride(height_stride_),
    .o_height_inner(height_inner_),
    .o_depth(depth_),
    .o_valid(counters_valid_)
  );

  /**
   * Process: ff_sequence_state
   * Description: Startup counter and running group index. Kept in a single
   *              process of its own (rather than inside the per-token generate
   *              loop) so that each register has exactly one always_ff writer.
   */
  always_ff @(posedge clk)
  begin : ff_sequence_state
    if (!i_rstn)
    begin
      state_cnt  <= ELEMENTS_PER_CYCLE;
      next_index <= '0;
    end
    else if (capture_enable)
    begin
      // Count down until the MSB flips (underflow), then hold.
      if (!pipeline_primed)
      begin
        state_cnt <= state_cnt - 1;
      end

      next_index <= next_index + ELEMENTS_PER_CYCLE;
    end
  end //ff_sequence_state

  for (genvar in_token = 0; in_token < ELEMENTS_PER_CYCLE; in_token++)
  begin : gen_calculate_tensor_indexes
    always_ff @(posedge clk)
    /**
     * Process: calculate_tensor_indexes
     * Description: Given `ELEMENTS_PER_CYCLE` tokens, we must compute as many tensor indexes per-cycle in order
     *              to calculate the transformed address of each input token. Token 0 takes its coordinates
     *              directly from dla_lt_step_counter; every other token is computed from the value its
     *              predecessor registered in the previous pipeline stage by stepping the channel coordinate
     *              and rippling the carry through width, height and depth.
     *              There are `ELEMENTS_PER_CYCLE` stages before `ELEMENTS_PER_CYCLE` index values become ready,
     *              at which point, `ELEMENTS_PER_CYCLE` values become ready every cycle.
     */
    begin : ff_calculate_tensor_indexes
      if (!i_rstn)
      begin
        C_regbank[in_token]    <= '{default: '0};
        W_regbank[in_token]    <= '{default: '0};
        H_regbank[in_token]    <= '{default: '0};
        D_regbank[in_token]    <= '{default: '0};
        IN_W_regbank[in_token] <= '{default: '0};
        S_W_regbank[in_token]  <= '{default: '0};
        IN_H_regbank[in_token] <= '{default: '0};
        S_H_regbank[in_token]  <= '{default: '0};
        IN_D_regbank[in_token] <= '{default: '0};
        S_D_regbank[in_token]  <= '{default: '0};

        C_d[in_token]     <= '0;
        W_d[in_token]     <= '0;
        H_d[in_token]     <= '0;
        D_d[in_token]     <= '0;
        // The inner/stride outputs are reset together with the dimension
        // outputs so that every output port has a defined value after reset.
        IN_W_d[in_token]  <= '0;
        IN_H_d[in_token]  <= '0;
        IN_D_d[in_token]  <= '0;
        S_W_d[in_token]   <= '0;
        S_H_d[in_token]   <= '0;
        S_D_d[in_token]   <= '0;
        Index_d[in_token] <= '0;
      end
      else if (capture_enable)
      begin
        // ---------------------------------------------------------------------
        // Stage 0: compute this token's coordinates.
        // `in_token` is a genvar, so this condition is an elaboration-time
        // constant and only one of the two branches exists per instance.
        // ---------------------------------------------------------------------
        if (in_token == 0)
        begin
          // First token of the group: taken straight from the step counter.
          C_regbank[in_token][0]    <= channel_;
          W_regbank[in_token][0]    <= width_;
          H_regbank[in_token][0]    <= height_;
          D_regbank[in_token][0]    <= depth_;

          S_W_regbank[in_token][0]  <= width_stride_;
          S_H_regbank[in_token][0]  <= height_stride_;
          S_D_regbank[in_token][0]  <= 0;   // step counter provides no depth stride

          IN_W_regbank[in_token][0] <= width_inner_;
          IN_H_regbank[in_token][0] <= height_inner_;
          IN_D_regbank[in_token][0] <= 0;   // step counter provides no depth inner
        end
        else
        begin
          // Successor of token in_token-1: channel is the fastest dimension,
          // then width, height, depth.
          if (C_regbank[in_token-1][0] == if_lt_config.data.input_channels-1)
          begin
            // Channel wraps -> carry into width.
            C_regbank[in_token][0] <= 0;

            if (W_regbank[in_token-1][0] == if_lt_config.data.input_width-1)
            begin
              // Width wraps -> carry into height.
              W_regbank[in_token][0]    <= 0;
              // NOTE(review): width restarts its inner position at
              // w_padding_per_stride whereas height (below) restarts at 0 and
              // never uses h_padding_per_stride -- confirm this asymmetry is
              // intended.
              IN_W_regbank[in_token][0] <= if_lt_config.data.w_padding_per_stride;
              S_W_regbank[in_token][0]  <= 0;

              if (H_regbank[in_token-1][0] == if_lt_config.data.input_height-1)
              begin
                // Height wraps -> carry into depth.
                H_regbank[in_token][0]    <= 0;
                IN_H_regbank[in_token][0] <= 0;
                S_H_regbank[in_token][0]  <= 0;

                if (D_regbank[in_token-1][0] == if_lt_config.data.input_depth-1)
                begin
                  // Depth wraps: back to the start of the tensor.
                  D_regbank[in_token][0]    <= 0;
                  IN_D_regbank[in_token][0] <= 0;
                  S_D_regbank[in_token][0]  <= 0;
                end
                else
                begin
                  D_regbank[in_token][0]    <= D_regbank[in_token-1][0] + 1;
                  IN_D_regbank[in_token][0] <= IN_D_regbank[in_token-1][0] + 1;
                  if (IN_D_regbank[in_token-1][0] + 1 >= if_lt_config.data.stride_depth)
                  begin
                    // Crossed into the next depth partition.
                    // NOTE(review): unlike width/height there is no
                    // stride-limit wrap for depth -- presumably because 3D
                    // inputs are not enabled yet; confirm when they are.
                    IN_D_regbank[in_token][0] <= 0;
                    S_D_regbank[in_token][0]  <= S_D_regbank[in_token-1][0] + 1;
                  end
                  else
                  begin
                    S_D_regbank[in_token][0]  <= S_D_regbank[in_token-1][0];
                  end
                end
              end
              else
              begin
                H_regbank[in_token][0]    <= H_regbank[in_token-1][0] + 1;
                IN_H_regbank[in_token][0] <= IN_H_regbank[in_token-1][0] + 1;
                if (IN_H_regbank[in_token-1][0] + 1 >= if_lt_config.data.stride_height)
                begin
                  // Crossed into the next height partition.
                  IN_H_regbank[in_token][0] <= 0;
                  if (S_H_regbank[in_token-1][0] + 1 >= if_lt_config.data.stride_h_limit) begin
                    S_H_regbank[in_token][0] <= 0;
                  end else begin
                    S_H_regbank[in_token][0] <= S_H_regbank[in_token-1][0] + 1;
                  end
                end
                else
                begin
                  S_H_regbank[in_token][0] <= S_H_regbank[in_token-1][0];
                end

                // Depth is unchanged.
                D_regbank[in_token][0]    <= D_regbank[in_token-1][0];
                IN_D_regbank[in_token][0] <= IN_D_regbank[in_token-1][0];
                S_D_regbank[in_token][0]  <= S_D_regbank[in_token-1][0];
              end
            end
            else
            begin
              W_regbank[in_token][0]    <= W_regbank[in_token-1][0] + 1;
              IN_W_regbank[in_token][0] <= IN_W_regbank[in_token-1][0] + 1;
              if (IN_W_regbank[in_token-1][0] + 1 >= if_lt_config.data.stride_width)
              begin
                // Crossed into the next width partition.
                IN_W_regbank[in_token][0] <= 0;
                if (S_W_regbank[in_token-1][0] + 1 >= if_lt_config.data.stride_w_limit) begin
                  S_W_regbank[in_token][0] <= 0;
                end else begin
                  S_W_regbank[in_token][0] <= S_W_regbank[in_token-1][0] + 1;
                end
              end
              else
              begin
                S_W_regbank[in_token][0] <= S_W_regbank[in_token-1][0];
              end

              // Height and depth are unchanged.
              H_regbank[in_token][0]    <= H_regbank[in_token-1][0];
              D_regbank[in_token][0]    <= D_regbank[in_token-1][0];

              IN_H_regbank[in_token][0] <= IN_H_regbank[in_token-1][0];
              IN_D_regbank[in_token][0] <= IN_D_regbank[in_token-1][0];

              S_H_regbank[in_token][0]  <= S_H_regbank[in_token-1][0];
              S_D_regbank[in_token][0]  <= S_D_regbank[in_token-1][0];
            end
          end
          else
          begin
            // No carry: only the channel moves.
            C_regbank[in_token][0]    <= C_regbank[in_token-1][0] + 1;

            W_regbank[in_token][0]    <= W_regbank[in_token-1][0];
            H_regbank[in_token][0]    <= H_regbank[in_token-1][0];
            D_regbank[in_token][0]    <= D_regbank[in_token-1][0];

            IN_W_regbank[in_token][0] <= IN_W_regbank[in_token-1][0];
            IN_H_regbank[in_token][0] <= IN_H_regbank[in_token-1][0];
            IN_D_regbank[in_token][0] <= IN_D_regbank[in_token-1][0];

            S_W_regbank[in_token][0]  <= S_W_regbank[in_token-1][0];
            S_H_regbank[in_token][0]  <= S_H_regbank[in_token-1][0];
            S_D_regbank[in_token][0]  <= S_D_regbank[in_token-1][0];
          end

          // Normalisation: if the predecessor's inner position already lies at
          // or beyond the stride size, restart the inner position and move to
          // the next partition. These assignments come last on purpose: they
          // override whatever the carry chain above scheduled for the same
          // registers.
          if (IN_W_regbank[in_token-1][0] >= if_lt_config.data.stride_width)
          begin
            IN_W_regbank[in_token][0] <= 0;
            if (S_W_regbank[in_token-1][0] + 1 >= if_lt_config.data.stride_w_limit) begin
              S_W_regbank[in_token][0] <= 0;
            end else begin
              S_W_regbank[in_token][0] <= S_W_regbank[in_token-1][0] + 1;
            end
          end
          if (IN_H_regbank[in_token-1][0] >= if_lt_config.data.stride_height)
          begin
            IN_H_regbank[in_token][0] <= 0;
            if (S_H_regbank[in_token-1][0] + 1 >= if_lt_config.data.stride_h_limit) begin
              S_H_regbank[in_token][0] <= 0;
            end else begin
              S_H_regbank[in_token][0] <= S_H_regbank[in_token-1][0] + 1;
            end
          end
          if (IN_D_regbank[in_token-1][0] >= if_lt_config.data.stride_depth)
          begin
            IN_D_regbank[in_token][0] <= 0;
            S_D_regbank[in_token][0]  <= S_D_regbank[in_token-1][0] + 1;
          end
        end

        // ---------------------------------------------------------------------
        // Alignment delay line: token t is computed t cycles after token 0, so
        // it needs (ELEMENTS_PER_CYCLE - t - 1) further register stages.
        // ---------------------------------------------------------------------
        for (int stage = 1; stage < (ELEMENTS_PER_CYCLE - in_token); stage++)
        begin
          C_regbank[in_token][stage]    <= C_regbank[in_token][stage-1];
          W_regbank[in_token][stage]    <= W_regbank[in_token][stage-1];
          H_regbank[in_token][stage]    <= H_regbank[in_token][stage-1];
          D_regbank[in_token][stage]    <= D_regbank[in_token][stage-1];

          IN_W_regbank[in_token][stage] <= IN_W_regbank[in_token][stage-1];
          IN_H_regbank[in_token][stage] <= IN_H_regbank[in_token][stage-1];
          IN_D_regbank[in_token][stage] <= IN_D_regbank[in_token][stage-1];

          S_W_regbank[in_token][stage]  <= S_W_regbank[in_token][stage-1];
          S_H_regbank[in_token][stage]  <= S_H_regbank[in_token][stage-1];
          S_D_regbank[in_token][stage]  <= S_D_regbank[in_token][stage-1];
        end

        // ---------------------------------------------------------------------
        // Output registers: tap the last stage of this token's delay line.
        // ---------------------------------------------------------------------
        C_d[in_token]    <= C_regbank[in_token][(ELEMENTS_PER_CYCLE - in_token)-1];
        W_d[in_token]    <= W_regbank[in_token][(ELEMENTS_PER_CYCLE - in_token)-1];
        H_d[in_token]    <= H_regbank[in_token][(ELEMENTS_PER_CYCLE - in_token)-1];
        D_d[in_token]    <= D_regbank[in_token][(ELEMENTS_PER_CYCLE - in_token)-1];

        IN_W_d[in_token] <= IN_W_regbank[in_token][(ELEMENTS_PER_CYCLE - in_token)-1];
        IN_H_d[in_token] <= IN_H_regbank[in_token][(ELEMENTS_PER_CYCLE - in_token)-1];
        IN_D_d[in_token] <= IN_D_regbank[in_token][(ELEMENTS_PER_CYCLE - in_token)-1];

        S_W_d[in_token]  <= S_W_regbank[in_token][(ELEMENTS_PER_CYCLE - in_token)-1];
        S_H_d[in_token]  <= S_H_regbank[in_token][(ELEMENTS_PER_CYCLE - in_token)-1];
        S_D_d[in_token]  <= S_D_regbank[in_token][(ELEMENTS_PER_CYCLE - in_token)-1];

        // Linear index of this token; next_index runs ahead of the data by the
        // pipeline depth, so that lead is subtracted here.
        Index_d[in_token] <= next_index + in_token - INDEX_PIPELINE_LAG;
      end
    end //ff_calculate_tensor_indexes
  end //gen_calculate_tensor_indexes
endmodule //lt_dimension_counter
