// Copyright 2020-2024 Intel Corporation.
//
// This software and the related documents are Intel copyrighted materials,
// and your use of them is governed by the express license under which they
// were provided to you ("License"). Unless the License provides otherwise,
// you may not use, modify, copy, publish, distribute, disclose or transmit
// this software or the related documents without Intel's prior written
// permission.
//
// This software and the related documents are provided as is, with no express
// or implied warranties, other than those that are expressly stated in the
// License.

/**
 * dla_lt_dimension_counter.sv
 *
 * This module generates tensor indexing information for N elements each cycle. The layout
 * transformation needs to know the context of each incoming token, and thus we need to
 * calculate the dimension information only from the tensor index. The transform also needs to
 * know about which "stride partition" the input belongs to, and where within the partition the
 * input is from.
 *
 * For example, consider a 4x4 RGB tensor with 2x2 partitions. The tensor with the (0,0) partition
 * outlined is shown below (where each value corresponds to the CWH ordering):
 *
 *        [2  5 ] 8  11
 *      [1  4 ] 7  10    [14 17]20 23
 *    [0  3 ] 6  9     [13 16]19 22    26 29 32 35
 *                   [12 15]18 21    25 28 31 34    38 41 44 47
 *                                 24 27 30 33    37 40 43 46
 *                                              36 39 42 45
 *
 * in this case, for instance, the element 16 is in position C=1, W=1, H=1, and stride (S_W=0,S_H=0),
 * and is in position (inner_W=1,inner_H=1) within the stride.
 *
 * Structure (as implemented below):
 *   - Lane 0 registers the coordinates produced by dla_lt_step_counter.
 *   - Lane k (k > 0) derives its coordinates by advancing lane k-1's stage-0 registers by one
 *     element, so lane k's stage 0 lags lane 0 by k cycles.
 *   - Lane k then passes through a delay line of (ELEMENTS_PER_CYCLE - k) stages in total before
 *     being registered on the outputs, so all lanes reach the outputs on the same cycle.
 */

`resetall
`undefineall
`default_nettype none

import dla_lt_pkg::*;

module dla_lt_dimension_counter
  import dla_common_pkg::*;
#(
  parameter int ELEMENTS_PER_CYCLE,
  parameter int MAX_CHANNELS,
  parameter int MAX_FEATURE_WIDTH,
  parameter int MAX_FEATURE_HEIGHT,
  parameter int MAX_FEATURE_DEPTH,
  parameter int MAX_STRIDE_WIDTH,
  parameter int MAX_STRIDE_HEIGHT,
  parameter int MAX_STRIDE_DEPTH,
  parameter int MAX_INPUT_VOLUME,
  parameter int MAX_DIM_BITS
) (
  input  wire clk,
  input  wire i_rstn,       // active-low reset, sampled on posedge clk
  input  wire i_increment,  // advance the counters (only consulted once o_ready is high)

  layout_transform_config_if if_lt_config,

  // ready signal is asserted when the startup sequence is over
  output wire o_ready,

  // Tensor dimension output:
  output wire [max($clog2(MAX_CHANNELS), 1)-1:0]       o_c_dim    [ELEMENTS_PER_CYCLE-1:0],
  output wire [max($clog2(MAX_FEATURE_WIDTH), 1)-1:0]  o_w_dim    [ELEMENTS_PER_CYCLE-1:0],
  output wire [max($clog2(MAX_FEATURE_HEIGHT), 1)-1:0] o_h_dim    [ELEMENTS_PER_CYCLE-1:0],
  output wire [max($clog2(MAX_FEATURE_DEPTH), 1)-1:0]  o_d_dim    [ELEMENTS_PER_CYCLE-1:0],

  // Location within the stride partition:
  output wire [max($clog2(MAX_STRIDE_WIDTH), 1)-1:0]   o_w_inner  [ELEMENTS_PER_CYCLE-1:0],
  output wire [max($clog2(MAX_STRIDE_HEIGHT), 1)-1:0]  o_h_inner  [ELEMENTS_PER_CYCLE-1:0],
  output wire [max($clog2(MAX_STRIDE_DEPTH), 1)-1:0]   o_d_inner  [ELEMENTS_PER_CYCLE-1:0],

  // Which stride partition:
  output wire [max($clog2(MAX_FEATURE_WIDTH), 1)-1:0]  o_w_stride [ELEMENTS_PER_CYCLE-1:0],
  output wire [max($clog2(MAX_FEATURE_HEIGHT), 1)-1:0] o_h_stride [ELEMENTS_PER_CYCLE-1:0],
  output wire [max($clog2(MAX_FEATURE_DEPTH), 1)-1:0]  o_d_stride [ELEMENTS_PER_CYCLE-1:0],

  // Input index in original ordering:
  output wire [MAX_DIM_BITS-1:0]                       o_index    [ELEMENTS_PER_CYCLE-1:0]
);

  // Index calculation registers:
  // state_cnt is loaded with ELEMENTS_PER_CYCLE on reset and decremented on every enabled cycle
  // until it underflows; the underflow sets its MSB, which is used as the "startup done" flag.
  logic [$clog2(ELEMENTS_PER_CYCLE + 1):0] state_cnt;
  // NOTE: despite its name, startup is HIGH once the startup sequence is over (it is the same
  // bit as o_ready); startup == 0 means the pipeline is still being primed.
  logic startup;

  assign o_ready = state_cnt[$clog2(ELEMENTS_PER_CYCLE + 1)];

  // Registered per-lane outputs.
  logic [max($clog2(MAX_CHANNELS), 1)-1:0]       C_d     [ELEMENTS_PER_CYCLE-1:0];
  logic [max($clog2(MAX_FEATURE_WIDTH), 1)-1:0]  W_d     [ELEMENTS_PER_CYCLE-1:0];
  logic [max($clog2(MAX_STRIDE_WIDTH), 1)-1:0]   IN_W_d  [ELEMENTS_PER_CYCLE-1:0];
  logic [max($clog2(MAX_FEATURE_WIDTH), 1)-1:0]  S_W_d   [ELEMENTS_PER_CYCLE-1:0];
  logic [max($clog2(MAX_FEATURE_HEIGHT), 1)-1:0] H_d     [ELEMENTS_PER_CYCLE-1:0];
  logic [max($clog2(MAX_STRIDE_HEIGHT), 1)-1:0]  IN_H_d  [ELEMENTS_PER_CYCLE-1:0];
  logic [max($clog2(MAX_FEATURE_HEIGHT), 1)-1:0] S_H_d   [ELEMENTS_PER_CYCLE-1:0];
  logic [max($clog2(MAX_FEATURE_DEPTH), 1)-1:0]  D_d     [ELEMENTS_PER_CYCLE-1:0];
  logic [max($clog2(MAX_STRIDE_DEPTH), 1)-1:0]   IN_D_d  [ELEMENTS_PER_CYCLE-1:0];
  logic [max($clog2(MAX_FEATURE_DEPTH), 1)-1:0]  S_D_d   [ELEMENTS_PER_CYCLE-1:0];
  logic [MAX_DIM_BITS-1:0]                       Index_d [ELEMENTS_PER_CYCLE-1:0];

  assign o_c_dim    = C_d;
  assign o_w_dim    = W_d;
  assign o_h_dim    = H_d;
  assign o_d_dim    = D_d;
  assign o_w_inner  = IN_W_d;
  assign o_h_inner  = IN_H_d;
  assign o_d_inner  = IN_D_d;
  assign o_w_stride = S_W_d;
  assign o_h_stride = S_H_d;
  assign o_d_stride = S_D_d;
  assign o_index    = Index_d;

  // Running element index; advanced by ELEMENTS_PER_CYCLE on every enabled cycle.
  logic [MAX_DIM_BITS-1:0] next_index;

  // Per-lane delay lines, indexed [lane][stage]. Only stages 0..(ELEMENTS_PER_CYCLE-lane-1) of
  // each lane are used.
  // tried to avoid the waste here by creating in generate (Should be triangular), but couldn't index upwards...
  logic [max($clog2(MAX_CHANNELS), 1)-1:0]       C_regbank    [ELEMENTS_PER_CYCLE-1:0] [ELEMENTS_PER_CYCLE-1:0];
  logic [max($clog2(MAX_FEATURE_WIDTH), 1)-1:0]  W_regbank    [ELEMENTS_PER_CYCLE-1:0] [ELEMENTS_PER_CYCLE-1:0];
  logic [max($clog2(MAX_STRIDE_WIDTH), 1)-1:0]   IN_W_regbank [ELEMENTS_PER_CYCLE-1:0] [ELEMENTS_PER_CYCLE-1:0];
  logic [max($clog2(MAX_FEATURE_WIDTH), 1)-1:0]  S_W_regbank  [ELEMENTS_PER_CYCLE-1:0] [ELEMENTS_PER_CYCLE-1:0];
  logic [max($clog2(MAX_FEATURE_HEIGHT), 1)-1:0] H_regbank    [ELEMENTS_PER_CYCLE-1:0] [ELEMENTS_PER_CYCLE-1:0];
  logic [max($clog2(MAX_STRIDE_HEIGHT), 1)-1:0]  IN_H_regbank [ELEMENTS_PER_CYCLE-1:0] [ELEMENTS_PER_CYCLE-1:0];
  logic [max($clog2(MAX_FEATURE_HEIGHT), 1)-1:0] S_H_regbank  [ELEMENTS_PER_CYCLE-1:0] [ELEMENTS_PER_CYCLE-1:0];
  logic [max($clog2(MAX_FEATURE_DEPTH), 1)-1:0]  D_regbank    [ELEMENTS_PER_CYCLE-1:0] [ELEMENTS_PER_CYCLE-1:0];
  logic [max($clog2(MAX_STRIDE_DEPTH), 1)-1:0]   IN_D_regbank [ELEMENTS_PER_CYCLE-1:0] [ELEMENTS_PER_CYCLE-1:0];
  logic [max($clog2(MAX_FEATURE_DEPTH), 1)-1:0]  S_D_regbank  [ELEMENTS_PER_CYCLE-1:0] [ELEMENTS_PER_CYCLE-1:0];

  assign startup = state_cnt[$clog2(ELEMENTS_PER_CYCLE + 1)];

  // Outputs of the step counter. They are 16 bits wide and are truncated to the regbank
  // element widths when captured by lane 0 below.
  logic [16-1:0] channel_, width_, width_stride_, width_inner_, height_, height_stride_, height_inner_, depth_;
  logic counters_valid_;

  // The step counter runs freely (subject to config valid) while the pipeline is being primed
  // (startup == 0); afterwards it only advances when i_increment is asserted.
  dla_lt_step_counter #(
    .ELEMENTS_PER_CYCLE(ELEMENTS_PER_CYCLE),
    .DIM_BITS(16),
    .DEPTH_TENSOR(0) // TODO(arooney): Enable 3D inputs
  ) step_counter (
    .clk(clk),
    .i_resetn(i_rstn),
    .i_increment((startup == 1'b0 | i_increment == 1'b1) & if_lt_config.valid),
    .i_channel_dim(if_lt_config.data.input_channels),
    .i_width_dim(if_lt_config.data.input_width),
    .i_width_overhang(if_lt_config.data.w_padding_per_stride),
    .i_height_overhang(if_lt_config.data.h_padding_per_stride),
    .i_height_dim(if_lt_config.data.input_height),
    .i_depth_dim(if_lt_config.data.input_depth),
    .i_channel_step(if_lt_config.data.c_step),
    .i_width_stride(if_lt_config.data.stride_width),
    .i_width_step(if_lt_config.data.w_step),
    .i_stride_w_count(if_lt_config.data.output_w_range),
    .i_width_stride_step(if_lt_config.data.w_stride_step),
    .i_width_inner_step(if_lt_config.data.w_inner_step),
    .i_height_stride(if_lt_config.data.stride_height),
    .i_height_step(if_lt_config.data.h_step),
    .i_stride_h_count(if_lt_config.data.output_h_range),
    .i_height_stride_step(if_lt_config.data.h_stride_step),
    .i_height_inner_step(if_lt_config.data.h_inner_step),
    .i_depth_step(if_lt_config.data.d_step),
    .i_pad_w(if_lt_config.data.left_pad),
    .i_pad_h(if_lt_config.data.high_pad),
    .i_continue_count_cond(if_lt_config.data.continue_count_cond),
    .i_overhang_end_w(if_lt_config.data.w_end_overhang),
    .i_w_nstrides(if_lt_config.data.w_nstrides),
    .i_h_nstrides(if_lt_config.data.h_nstrides),
    .o_channel(channel_),
    .o_width(width_),
    .o_width_stride(width_stride_),
    .o_width_inner(width_inner_),
    .o_height(height_),
    .o_height_stride(height_stride_),
    .o_height_inner(height_inner_),
    .o_depth(depth_),
    .o_valid(counters_valid_)
  );

  for (genvar in_token = 0; in_token < ELEMENTS_PER_CYCLE; in_token++) begin : gen_calculate_tensor_indexes
    /**
     * Process: calculate_tensor_indexes
     * Description: Given `ELEMENTS_PER_CYCLE` tokens, we must compute as many tensor indexes per-cycle in order
     *              to calculate the transformed address of each input token. Lane 0 captures the coordinates
     *              produced by the step counter; the remainder of the `ELEMENTS_PER_CYCLE` values for each
     *              index are computed from the value held by the previous lane in the previous pipeline
     *              stage. There are `ELEMENTS_PER_CYCLE` stages before `ELEMENTS_PER_CYCLE` index values
     *              become ready, at which point, `ELEMENTS_PER_CYCLE` values become ready every cycle.
     *
     * Note: `in_token` is a genvar, so the `in_token == 0` tests are elaboration-time constants; only the
     *       lane-0 instance of this process ever writes `state_cnt` and `next_index`.
     */
    always_ff @(posedge clk)
    begin : ff_calculate_tensor_indexes
      if (!i_rstn) begin
        C_regbank[in_token]    <= '{default: '0};
        W_regbank[in_token]    <= '{default: '0};
        H_regbank[in_token]    <= '{default: '0};
        D_regbank[in_token]    <= '{default: '0};
        IN_W_regbank[in_token] <= '{default: '0};
        S_W_regbank[in_token]  <= '{default: '0};
        IN_H_regbank[in_token] <= '{default: '0};
        S_H_regbank[in_token]  <= '{default: '0};
        IN_D_regbank[in_token] <= '{default: '0};
        S_D_regbank[in_token]  <= '{default: '0};

        C_d[in_token]     <= '0;
        W_d[in_token]     <= '0;
        H_d[in_token]     <= '0;
        D_d[in_token]     <= '0;
        // The inner/stride output registers are reset together with the other outputs so that
        // every o_* port is defined after reset and no register in this process is left with
        // reset acting as an implicit hold enable.
        IN_W_d[in_token]  <= '0;
        IN_H_d[in_token]  <= '0;
        IN_D_d[in_token]  <= '0;
        S_W_d[in_token]   <= '0;
        S_H_d[in_token]   <= '0;
        S_D_d[in_token]   <= '0;
        Index_d[in_token] <= '0;

        if (in_token == 0) begin
          state_cnt  <= ELEMENTS_PER_CYCLE;
          next_index <= '0;
        end
      end else begin
        // Advance freely while priming (startup == 0), otherwise only on i_increment; in both
        // cases the configuration and the step-counter outputs must be valid.
        if ((startup == 1'b0 || i_increment == 1'b1) & if_lt_config.valid & counters_valid_) // separate state and math logic...
        begin
          if (in_token == 0) begin
            // Lane 0: count down the startup sequence and capture the step-counter outputs.
            state_cnt  <= ~o_ready ? state_cnt - 1 : state_cnt;
            next_index <= next_index + ELEMENTS_PER_CYCLE;

            C_regbank[in_token][0]    <= channel_;
            W_regbank[in_token][0]    <= width_;
            H_regbank[in_token][0]    <= height_;
            D_regbank[in_token][0]    <= depth_;
            S_W_regbank[in_token][0]  <= width_stride_;
            S_H_regbank[in_token][0]  <= height_stride_;
            S_D_regbank[in_token][0]  <= 0;  // step counter provides no depth stride/inner outputs
            IN_W_regbank[in_token][0] <= width_inner_;
            IN_H_regbank[in_token][0] <= height_inner_;
            IN_D_regbank[in_token][0] <= 0;
          end else begin
            // Lane k > 0: advance the previous lane's coordinates by one element, carrying
            // C -> W -> H -> D as each dimension reaches its configured size.
            if (C_regbank[in_token-1][0] == if_lt_config.data.input_channels-1) begin
              C_regbank[in_token][0] <= 0;
              if (W_regbank[in_token-1][0] == if_lt_config.data.input_width-1) begin
                // End of a row: restart the width position.
                W_regbank[in_token][0]    <= 0;
                IN_W_regbank[in_token][0] <= if_lt_config.data.w_padding_per_stride;
                S_W_regbank[in_token][0]  <= 0;
                if (H_regbank[in_token-1][0] == if_lt_config.data.input_height-1) begin
                  // End of a plane: restart the height position.
                  // NOTE(review): IN_H restarts at 0 here, whereas IN_W restarts at
                  // w_padding_per_stride above - confirm this asymmetry is intended.
                  H_regbank[in_token][0]    <= 0;
                  IN_H_regbank[in_token][0] <= 0;
                  S_H_regbank[in_token][0]  <= 0;
                  if (D_regbank[in_token-1][0] == if_lt_config.data.input_depth-1) begin
                    D_regbank[in_token][0]    <= 0;
                    IN_D_regbank[in_token][0] <= 0;
                    S_D_regbank[in_token][0]  <= 0;
                  end else begin
                    D_regbank[in_token][0]    <= D_regbank[in_token-1][0] + 1;
                    IN_D_regbank[in_token][0] <= IN_D_regbank[in_token-1][0] + 1;
                    if (IN_D_regbank[in_token-1][0] + 1 >= if_lt_config.data.stride_depth) begin
                      IN_D_regbank[in_token][0] <= 0;
                      S_D_regbank[in_token][0]  <= S_D_regbank[in_token-1][0] + 1;
                    end else begin
                      S_D_regbank[in_token][0]  <= S_D_regbank[in_token-1][0];
                    end
                  end
                end else begin
                  H_regbank[in_token][0]    <= H_regbank[in_token-1][0] + 1;
                  IN_H_regbank[in_token][0] <= IN_H_regbank[in_token-1][0] + 1;
                  if (IN_H_regbank[in_token-1][0] + 1 >= if_lt_config.data.stride_height) begin
                    IN_H_regbank[in_token][0] <= 0;
                    if (S_H_regbank[in_token-1][0] + 1 >= if_lt_config.data.stride_h_limit) begin
                      S_H_regbank[in_token][0] <= 0;
                    end else begin
                      S_H_regbank[in_token][0] <= S_H_regbank[in_token-1][0] + 1;
                    end
                  end else begin
                    S_H_regbank[in_token][0] <= S_H_regbank[in_token-1][0];
                  end
                  D_regbank[in_token][0]    <= D_regbank[in_token-1][0];
                  IN_D_regbank[in_token][0] <= IN_D_regbank[in_token-1][0];
                  S_D_regbank[in_token][0]  <= S_D_regbank[in_token-1][0];
                end
              end else begin
                W_regbank[in_token][0]    <= W_regbank[in_token-1][0] + 1;
                IN_W_regbank[in_token][0] <= IN_W_regbank[in_token-1][0] + 1;
                if (IN_W_regbank[in_token-1][0] + 1 >= if_lt_config.data.stride_width) begin
                  IN_W_regbank[in_token][0] <= 0;
                  if (S_W_regbank[in_token-1][0] + 1 >= if_lt_config.data.stride_w_limit) begin
                    S_W_regbank[in_token][0] <= 0;
                  end else begin
                    S_W_regbank[in_token][0] <= S_W_regbank[in_token-1][0] + 1;
                  end
                end else begin
                  S_W_regbank[in_token][0] <= S_W_regbank[in_token-1][0];
                end
                H_regbank[in_token][0]    <= H_regbank[in_token-1][0];
                D_regbank[in_token][0]    <= D_regbank[in_token-1][0];
                IN_H_regbank[in_token][0] <= IN_H_regbank[in_token-1][0];
                IN_D_regbank[in_token][0] <= IN_D_regbank[in_token-1][0];
                S_H_regbank[in_token][0]  <= S_H_regbank[in_token-1][0];
                S_D_regbank[in_token][0]  <= S_D_regbank[in_token-1][0];
              end
            end else begin
              // Only the channel advances; everything else is copied from the previous lane.
              C_regbank[in_token][0]    <= C_regbank[in_token-1][0] + 1;
              W_regbank[in_token][0]    <= W_regbank[in_token-1][0];
              H_regbank[in_token][0]    <= H_regbank[in_token-1][0];
              D_regbank[in_token][0]    <= D_regbank[in_token-1][0];
              IN_W_regbank[in_token][0] <= IN_W_regbank[in_token-1][0];
              IN_H_regbank[in_token][0] <= IN_H_regbank[in_token-1][0];
              IN_D_regbank[in_token][0] <= IN_D_regbank[in_token-1][0];
              S_W_regbank[in_token][0]  <= S_W_regbank[in_token-1][0];
              S_H_regbank[in_token][0]  <= S_H_regbank[in_token-1][0];
              S_D_regbank[in_token][0]  <= S_D_regbank[in_token-1][0];
            end

            // Final overrides: if the previous lane's inner position is already at or beyond the
            // stride size, restart the inner position and move to the next stride partition.
            // These must stay AFTER the carry chain above: as the last nonblocking assignments
            // to the same registers they take precedence.
            if (IN_W_regbank[in_token-1][0] >= if_lt_config.data.stride_width) begin
              IN_W_regbank[in_token][0] <= 0;
              if (S_W_regbank[in_token-1][0] + 1 >= if_lt_config.data.stride_w_limit) begin
                S_W_regbank[in_token][0] <= 0;
              end else begin
                S_W_regbank[in_token][0] <= S_W_regbank[in_token-1][0] + 1;
              end
            end
            if (IN_H_regbank[in_token-1][0] >= if_lt_config.data.stride_height) begin
              IN_H_regbank[in_token][0] <= 0;
              if (S_H_regbank[in_token-1][0] + 1 >= if_lt_config.data.stride_h_limit) begin
                S_H_regbank[in_token][0] <= 0;
              end else begin
                S_H_regbank[in_token][0] <= S_H_regbank[in_token-1][0] + 1;
              end
            end
            if (IN_D_regbank[in_token-1][0] >= if_lt_config.data.stride_depth) begin
              IN_D_regbank[in_token][0] <= 0;
              S_D_regbank[in_token][0]  <= S_D_regbank[in_token-1][0] + 1;
            end
          end

          // Delay line: lane k shifts through stages 1..(ELEMENTS_PER_CYCLE-k-1).
          for (integer i = 1; i < (ELEMENTS_PER_CYCLE - in_token); i++) begin
            C_regbank[in_token][i]    <= C_regbank[in_token][i-1];
            W_regbank[in_token][i]    <= W_regbank[in_token][i-1];
            H_regbank[in_token][i]    <= H_regbank[in_token][i-1];
            D_regbank[in_token][i]    <= D_regbank[in_token][i-1];
            IN_W_regbank[in_token][i] <= IN_W_regbank[in_token][i-1];
            IN_H_regbank[in_token][i] <= IN_H_regbank[in_token][i-1];
            IN_D_regbank[in_token][i] <= IN_D_regbank[in_token][i-1];
            S_W_regbank[in_token][i]  <= S_W_regbank[in_token][i-1];
            S_H_regbank[in_token][i]  <= S_H_regbank[in_token][i-1];
            S_D_regbank[in_token][i]  <= S_D_regbank[in_token][i-1];
          end

          // Output registers: tap the last used stage of this lane's delay line.
          C_d[in_token]    <= C_regbank[in_token][(ELEMENTS_PER_CYCLE - in_token)-1];
          W_d[in_token]    <= W_regbank[in_token][(ELEMENTS_PER_CYCLE - in_token)-1];
          H_d[in_token]    <= H_regbank[in_token][(ELEMENTS_PER_CYCLE - in_token)-1];
          D_d[in_token]    <= D_regbank[in_token][(ELEMENTS_PER_CYCLE - in_token)-1];
          IN_W_d[in_token] <= IN_W_regbank[in_token][(ELEMENTS_PER_CYCLE - in_token)-1];
          IN_H_d[in_token] <= IN_H_regbank[in_token][(ELEMENTS_PER_CYCLE - in_token)-1];
          IN_D_d[in_token] <= IN_D_regbank[in_token][(ELEMENTS_PER_CYCLE - in_token)-1];
          S_W_d[in_token]  <= S_W_regbank[in_token][(ELEMENTS_PER_CYCLE - in_token)-1];
          S_H_d[in_token]  <= S_H_regbank[in_token][(ELEMENTS_PER_CYCLE - in_token)-1];
          S_D_d[in_token]  <= S_D_regbank[in_token][(ELEMENTS_PER_CYCLE - in_token)-1];
          // NOTE(review): the ELEMENTS_PER_CYCLE^2 offset presumably compensates for the
          // ELEMENTS_PER_CYCLE enabled cycles of pipeline latency - confirm.
          Index_d[in_token] <= next_index + in_token - (ELEMENTS_PER_CYCLE * (ELEMENTS_PER_CYCLE));
        end
      end
    end // ff_calculate_tensor_indexes
  end // gen_calculate_tensor_indexes

endmodule // dla_lt_dimension_counter