summaryrefslogtreecommitdiff
path: root/python/openvino/demo/ip/intel_ai_ip/verilog/dla_lt_dimension_counter.sv
diff options
context:
space:
mode:
Diffstat (limited to 'python/openvino/demo/ip/intel_ai_ip/verilog/dla_lt_dimension_counter.sv')
-rw-r--r--python/openvino/demo/ip/intel_ai_ip/verilog/dla_lt_dimension_counter.sv401
1 files changed, 401 insertions, 0 deletions
diff --git a/python/openvino/demo/ip/intel_ai_ip/verilog/dla_lt_dimension_counter.sv b/python/openvino/demo/ip/intel_ai_ip/verilog/dla_lt_dimension_counter.sv
new file mode 100644
index 0000000..153869f
--- /dev/null
+++ b/python/openvino/demo/ip/intel_ai_ip/verilog/dla_lt_dimension_counter.sv
@@ -0,0 +1,401 @@
+// Copyright 2020-2024 Intel Corporation.
+//
+// This software and the related documents are Intel copyrighted materials,
+// and your use of them is governed by the express license under which they
+// were provided to you ("License"). Unless the License provides otherwise,
+// you may not use, modify, copy, publish, distribute, disclose or transmit
+// this software or the related documents without Intel's prior written
+// permission.
+//
+// This software and the related documents are provided as is, with no express
+// or implied warranties, other than those that are expressly stated in the
+// License.
+
+/**
+ * dla_lt_dimension_counter.sv
+ *
+ * This module generates tensor indexing information for N elements each cycle. The layout
+ * transformation needs to know the context of each incoming token, and thus we need to
+ * calculate the dimension information only from the tensor index. The transform also needs to
+ * know about which "stride partition" the input belongs to, and where within the partition the
+ * input is from.
+ *
+ * For example, consider a 4x4 RGB tensor with 2x2 partitions. The tensor with the (0,0) partition
+ * outlined is shown below (where each value corresponds to the CWH ordering):
+
+ [2 5 ] 8 11
+ [1 4 ] 7 10 [14 17]20 23
+ [0 3 ] 6 9 [13 16]19 22 26 29 32 35
+ [12 15]18 21 25 28 31 34 38 41 44 47
+ 24 27 30 33 37 40 43 46
+ 36 39 42 45
+
+ * In this case, for instance, element 16 is at position C=1, W=1, H=1, belongs to stride partition
+ * (S_W=0, S_H=0), and sits at position (inner_W=1, inner_H=1) within that partition.
+ *
+ */
+
+`resetall
+`undefineall
+`default_nettype none
+
+import dla_lt_pkg::*;
+
+module dla_lt_dimension_counter import dla_common_pkg::*; #( // emits ELEMENTS_PER_CYCLE sets of tensor coordinates per clock
+ parameter int ELEMENTS_PER_CYCLE, // number of lanes (tokens per cycle); sizes every output array
+ parameter int MAX_CHANNELS, // sizes the channel registers: max($clog2(MAX_CHANNELS), 1) bits
+
+ parameter int MAX_FEATURE_WIDTH, // sizes the W coordinate and W stride-partition registers
+ parameter int MAX_FEATURE_HEIGHT, // sizes the H coordinate and H stride-partition registers
+ parameter int MAX_FEATURE_DEPTH, // sizes the D coordinate and D stride-partition registers
+
+ parameter int MAX_STRIDE_WIDTH, // sizes the inner-W position registers
+ parameter int MAX_STRIDE_HEIGHT, // sizes the inner-H position registers
+ parameter int MAX_STRIDE_DEPTH, // sizes the inner-D position registers
+ parameter int MAX_INPUT_VOLUME, // NOTE(review): not referenced anywhere inside this module
+
+ parameter int MAX_DIM_BITS // width of next_index and o_index
+) (
+ input wire clk,
+ input wire i_rstn, // active-low reset, sampled synchronously inside always_ff @(posedge clk)
+ input wire i_increment, // advance request; only consulted once startup has finished (see enable terms below)
+
+ layout_transform_config_if if_lt_config, // .valid gates all counting; .data supplies the dimension/stride settings read below
+
+ // ready signal is asserted when the startup sequence is over (it is the MSB of state_cnt)
+ output wire o_ready,
+
+ // Tensor dimension output (one entry per lane):
+ output wire [max($clog2(MAX_CHANNELS), 1)-1:0] o_c_dim [ELEMENTS_PER_CYCLE-1:0],
+ output wire [max($clog2(MAX_FEATURE_WIDTH), 1)-1:0] o_w_dim [ELEMENTS_PER_CYCLE-1:0],
+ output wire [max($clog2(MAX_FEATURE_HEIGHT), 1)-1:0] o_h_dim [ELEMENTS_PER_CYCLE-1:0],
+ output wire [max($clog2(MAX_FEATURE_DEPTH), 1)-1:0] o_d_dim [ELEMENTS_PER_CYCLE-1:0],
+
+ // Location within the stride partition (one entry per lane):
+ output wire [max($clog2(MAX_STRIDE_WIDTH), 1)-1:0] o_w_inner [ELEMENTS_PER_CYCLE-1:0],
+ output wire [max($clog2(MAX_STRIDE_HEIGHT), 1)-1:0] o_h_inner [ELEMENTS_PER_CYCLE-1:0],
+ output wire [max($clog2(MAX_STRIDE_DEPTH), 1)-1:0] o_d_inner [ELEMENTS_PER_CYCLE-1:0],
+
+ // Which stride partition (one entry per lane):
+ output wire [max($clog2(MAX_FEATURE_WIDTH), 1)-1:0] o_w_stride [ELEMENTS_PER_CYCLE-1:0],
+ output wire [max($clog2(MAX_FEATURE_HEIGHT), 1)-1:0] o_h_stride [ELEMENTS_PER_CYCLE-1:0],
+ output wire [max($clog2(MAX_FEATURE_DEPTH), 1)-1:0] o_d_stride [ELEMENTS_PER_CYCLE-1:0],
+
+ // Input index in original ordering (one entry per lane):
+ output wire [MAX_DIM_BITS-1:0] o_index [ELEMENTS_PER_CYCLE-1:0]
+);
+
+ // Index calculation registers:
+ logic [$clog2(ELEMENTS_PER_CYCLE + 1):0] state_cnt; // startup countdown: reset to ELEMENTS_PER_CYCLE, decremented until it underflows and sets its MSB
+ logic startup; // NOTE: despite the name this is 1 once startup has FINISHED (same bit as o_ready)
+
+ assign o_ready = state_cnt[$clog2(ELEMENTS_PER_CYCLE + 1)];
+
+ logic [max($clog2(MAX_CHANNELS), 1)-1:0] C_d [ELEMENTS_PER_CYCLE-1:0]; // *_d: per-lane output registers driving the o_* ports
+ logic [max($clog2(MAX_FEATURE_WIDTH), 1)-1:0] W_d [ELEMENTS_PER_CYCLE-1:0];
+ logic [max($clog2(MAX_STRIDE_WIDTH), 1)-1:0] IN_W_d [ELEMENTS_PER_CYCLE-1:0];
+ logic [max($clog2(MAX_FEATURE_WIDTH), 1)-1:0] S_W_d [ELEMENTS_PER_CYCLE-1:0];
+
+ logic [max($clog2(MAX_FEATURE_HEIGHT), 1)-1:0] H_d [ELEMENTS_PER_CYCLE-1:0];
+ logic [max($clog2(MAX_STRIDE_HEIGHT), 1)-1:0] IN_H_d [ELEMENTS_PER_CYCLE-1:0];
+ logic [max($clog2(MAX_FEATURE_HEIGHT), 1)-1:0] S_H_d [ELEMENTS_PER_CYCLE-1:0];
+
+ logic [max($clog2(MAX_FEATURE_DEPTH), 1)-1:0] D_d [ELEMENTS_PER_CYCLE-1:0];
+ logic [max($clog2(MAX_STRIDE_DEPTH), 1)-1:0] IN_D_d [ELEMENTS_PER_CYCLE-1:0];
+ logic [max($clog2(MAX_FEATURE_DEPTH), 1)-1:0] S_D_d [ELEMENTS_PER_CYCLE-1:0];
+ logic [MAX_DIM_BITS-1:0] Index_d [ELEMENTS_PER_CYCLE-1:0];
+
+ assign o_c_dim = C_d;
+ assign o_w_dim = W_d;
+ assign o_h_dim = H_d;
+ assign o_d_dim = D_d;
+ assign o_w_inner = IN_W_d;
+ assign o_h_inner = IN_H_d;
+ assign o_d_inner = IN_D_d;
+ assign o_w_stride = S_W_d;
+ assign o_h_stride = S_H_d;
+ assign o_d_stride = S_D_d;
+ assign o_index = Index_d;
+
+ logic[MAX_DIM_BITS-1:0] next_index; // running token index; advances by ELEMENTS_PER_CYCLE on every enabled cycle
+
+ // tried to avoid the waste here by creating in generate (Should be triangular), but couldn't index upwards...
+ logic [max($clog2(MAX_CHANNELS), 1)-1:0] C_regbank [ELEMENTS_PER_CYCLE-1:0] [ELEMENTS_PER_CYCLE-1:0]; // *_regbank[lane][stage]; lane k only shifts through stages 0..ELEMENTS_PER_CYCLE-k-1
+ logic [max($clog2(MAX_FEATURE_WIDTH), 1)-1:0] W_regbank [ELEMENTS_PER_CYCLE-1:0] [ELEMENTS_PER_CYCLE-1:0];
+ logic [max($clog2(MAX_STRIDE_WIDTH), 1)-1:0] IN_W_regbank [ELEMENTS_PER_CYCLE-1:0] [ELEMENTS_PER_CYCLE-1:0];
+ logic [max($clog2(MAX_FEATURE_WIDTH), 1)-1:0] S_W_regbank [ELEMENTS_PER_CYCLE-1:0] [ELEMENTS_PER_CYCLE-1:0];
+ logic [max($clog2(MAX_FEATURE_HEIGHT), 1)-1:0] H_regbank [ELEMENTS_PER_CYCLE-1:0] [ELEMENTS_PER_CYCLE-1:0];
+ logic [max($clog2(MAX_STRIDE_HEIGHT), 1)-1:0] IN_H_regbank [ELEMENTS_PER_CYCLE-1:0] [ELEMENTS_PER_CYCLE-1:0];
+ logic [max($clog2(MAX_FEATURE_HEIGHT), 1)-1:0] S_H_regbank [ELEMENTS_PER_CYCLE-1:0] [ELEMENTS_PER_CYCLE-1:0];
+ logic [max($clog2(MAX_FEATURE_DEPTH), 1)-1:0] D_regbank [ELEMENTS_PER_CYCLE-1:0] [ELEMENTS_PER_CYCLE-1:0];
+ logic [max($clog2(MAX_STRIDE_DEPTH), 1)-1:0] IN_D_regbank [ELEMENTS_PER_CYCLE-1:0] [ELEMENTS_PER_CYCLE-1:0];
+ logic [max($clog2(MAX_FEATURE_DEPTH), 1)-1:0] S_D_regbank [ELEMENTS_PER_CYCLE-1:0] [ELEMENTS_PER_CYCLE-1:0];
+
+ assign startup = state_cnt[$clog2(ELEMENTS_PER_CYCLE + 1)];
+
+ logic [16-1:0] channel_, width_, width_stride_, width_inner_, height_, height_stride_, height_inner_, depth_; // 16-bit step-counter outputs; resized to the regbank widths when stored into lane 0
+ logic counters_valid_; // step-counter o_valid; part of the update enable below
+
+ dla_lt_step_counter #( // supplies the lane-0 coordinates; presumably steps ELEMENTS_PER_CYCLE tokens per increment - confirm in dla_lt_step_counter
+ .ELEMENTS_PER_CYCLE(ELEMENTS_PER_CYCLE),
+ .DIM_BITS(16),
+ .DEPTH_TENSOR(0) // TODO(arooney): Enable 3D inputs
+ ) step_counter (
+ .clk(clk),
+ .i_resetn(i_rstn),
+ .i_increment((startup == 1'b0 | i_increment == 1'b1) & if_lt_config.valid), // free-runs until state_cnt's MSB sets, afterwards follows i_increment
+ .i_channel_dim(if_lt_config.data.input_channels),
+ .i_width_dim(if_lt_config.data.input_width),
+ .i_width_overhang(if_lt_config.data.w_padding_per_stride),
+ .i_height_overhang(if_lt_config.data.h_padding_per_stride),
+ .i_height_dim(if_lt_config.data.input_height),
+ .i_depth_dim(if_lt_config.data.input_depth),
+ .i_channel_step(if_lt_config.data.c_step),
+ .i_width_stride(if_lt_config.data.stride_width),
+ .i_width_step(if_lt_config.data.w_step),
+ .i_stride_w_count(if_lt_config.data.output_w_range),
+ .i_width_stride_step(if_lt_config.data.w_stride_step),
+ .i_width_inner_step(if_lt_config.data.w_inner_step),
+ .i_height_stride(if_lt_config.data.stride_height),
+ .i_height_step(if_lt_config.data.h_step),
+ .i_stride_h_count(if_lt_config.data.output_h_range),
+ .i_height_stride_step(if_lt_config.data.h_stride_step),
+ .i_height_inner_step(if_lt_config.data.h_inner_step),
+ .i_depth_step(if_lt_config.data.d_step),
+ .i_pad_w(if_lt_config.data.left_pad),
+ .i_pad_h(if_lt_config.data.high_pad),
+ .i_continue_count_cond(if_lt_config.data.continue_count_cond),
+ .i_overhang_end_w(if_lt_config.data.w_end_overhang),
+ .i_w_nstrides(if_lt_config.data.w_nstrides),
+ .i_h_nstrides(if_lt_config.data.h_nstrides),
+ .o_channel(channel_),
+ .o_width(width_),
+ .o_width_stride(width_stride_),
+ .o_width_inner(width_inner_),
+ .o_height(height_),
+ .o_height_stride(height_stride_),
+ .o_height_inner(height_inner_),
+ .o_depth(depth_),
+ .o_valid(counters_valid_)
+ );
+
+ for (genvar in_token = 0; in_token < ELEMENTS_PER_CYCLE; in_token++) // one always_ff process per lane; in_token is the lane number
+ begin : gen_calculate_tensor_indexes
+ always_ff @(posedge clk)
+ /**
+ * Process: calculate_tensor_indexes
+ * Description: Given `ELEMENTS_PER_CYCLE` tokens, we must compute as many tensor indexes per cycle in order
+ * to calculate the transformed address of each input token. This process obtains the first
+ * value of each tensor dimension (lane 0) from dla_lt_step_counter; the remainder of the
+ * `ELEMENTS_PER_CYCLE` values for each index are computed using the last index computed in the previous
+ * pipeline stages. There are `ELEMENTS_PER_CYCLE` stages before `ELEMENTS_PER_CYCLE` index values become ready,
+ * at which point, `ELEMENTS_PER_CYCLE` values become ready every cycle.
+ */
+ begin : ff_calculate_tensor_indexes
+ if (!i_rstn) // synchronous, active-low reset
+ begin
+ C_regbank[in_token] <= '{default: '0};
+ W_regbank[in_token] <= '{default: '0};
+ H_regbank[in_token] <= '{default: '0};
+ D_regbank[in_token] <= '{default: '0};
+ IN_W_regbank[in_token] <= '{default: '0};
+ S_W_regbank[in_token] <= '{default: '0};
+ IN_H_regbank[in_token] <= '{default: '0};
+ S_H_regbank[in_token] <= '{default: '0};
+ IN_D_regbank[in_token] <= '{default: '0};
+ S_D_regbank[in_token] <= '{default: '0};
+ C_d[in_token] <= '0;
+ W_d[in_token] <= '0;
+ H_d[in_token] <= '0;
+ D_d[in_token] <= '0;
+ Index_d[in_token] <= '0; // NOTE(review): IN_*_d and S_*_d are not cleared in this reset branch - confirm this is intended
+
+ if (in_token == 0) // the shared registers are driven by lane 0's process only
+ begin
+ state_cnt <= ELEMENTS_PER_CYCLE;
+ next_index <= '0;
+ end
+ end
+ else
+ begin
+ if ((startup == 1'b0 || i_increment == 1'b1) & if_lt_config.valid & counters_valid_) // advance only when enabled; everything holds otherwise. separate state and math logic...
+ begin
+ if (in_token == 0) // lane 0: load the coordinates straight from the step counter
+ begin
+ state_cnt <= ~o_ready ? state_cnt - 1 : state_cnt; // count down until the MSB (o_ready) sets, then hold
+
+ next_index <= next_index + ELEMENTS_PER_CYCLE;
+
+ C_regbank[in_token][0] <= channel_;
+ W_regbank[in_token][0] <= width_;
+ H_regbank[in_token][0] <= height_;
+ D_regbank[in_token][0] <= depth_;
+
+ S_W_regbank[in_token][0] <= width_stride_;
+ S_H_regbank[in_token][0] <= height_stride_;
+ S_D_regbank[in_token][0] <= 0; // depth stride partition is forced to 0 for lane 0
+
+ IN_W_regbank[in_token][0] <= width_inner_;
+ IN_H_regbank[in_token][0] <= height_inner_;
+ IN_D_regbank[in_token][0] <= 0; // inner depth position is forced to 0 for lane 0
+ end
+ else // lanes 1..N-1: previous lane's registered stage-0 value advanced by one token, carrying C -> W -> H -> D
+ begin
+ if (C_regbank[in_token-1][0] == if_lt_config.data.input_channels-1) // channel is at its last value: wrap C and carry into W
+ begin
+ C_regbank[in_token][0] <= 0;
+ if (W_regbank[in_token-1][0] == if_lt_config.data.input_width-1) // width is at its last value: wrap W and carry into H
+ begin
+ W_regbank[in_token][0] <= 0;
+ IN_W_regbank[in_token][0] <= if_lt_config.data.w_padding_per_stride; // inner W restarts from w_padding_per_stride, not 0
+ S_W_regbank[in_token][0] <= 0;
+ if (H_regbank[in_token-1][0] == if_lt_config.data.input_height-1) // height is at its last value: wrap H and carry into D
+ begin
+ H_regbank[in_token][0] <= 0;
+ IN_H_regbank[in_token][0] <= 0; // NOTE(review): restarts at 0, whereas inner W restarts at w_padding_per_stride - confirm h_padding_per_stride is not needed here
+ S_H_regbank[in_token][0] <= 0;
+ if (D_regbank[in_token-1][0] == if_lt_config.data.input_depth-1) // depth is at its last value: wrap everything to 0
+ begin
+ D_regbank[in_token][0] <= 0;
+ IN_D_regbank[in_token][0] <= 0;
+ S_D_regbank[in_token][0] <= 0;
+ end
+ else
+ begin
+ D_regbank[in_token][0] <= D_regbank[in_token-1][0] + 1;
+ IN_D_regbank[in_token][0] <= IN_D_regbank[in_token-1][0] + 1; // overridden below when the inner position reaches stride_depth
+ if (IN_D_regbank[in_token-1][0] + 1 >= if_lt_config.data.stride_depth) // inner D leaves the partition: restart it and move to the next partition
+ begin
+ IN_D_regbank[in_token][0] <= 0;
+ S_D_regbank[in_token][0] <= S_D_regbank[in_token-1][0] + 1;
+ end
+ else
+ begin
+ S_D_regbank[in_token][0] <= S_D_regbank[in_token-1][0];
+ end
+ end
+ end
+ else
+ begin
+ H_regbank[in_token][0] <= H_regbank[in_token-1][0] + 1;
+ IN_H_regbank[in_token][0] <= IN_H_regbank[in_token-1][0] + 1; // overridden below when the inner position reaches stride_height
+ if (IN_H_regbank[in_token-1][0] + 1 >= if_lt_config.data.stride_height) // inner H leaves the partition
+ begin
+ IN_H_regbank[in_token][0] <= 0;
+ if (S_H_regbank[in_token-1][0] + 1 >= if_lt_config.data.stride_h_limit) begin // partition index wraps at stride_h_limit
+ S_H_regbank[in_token][0] <= 0;
+ end else begin
+ S_H_regbank[in_token][0] <= S_H_regbank[in_token-1][0] + 1;
+ end
+ end
+ else
+ begin
+ S_H_regbank[in_token][0] <= S_H_regbank[in_token-1][0];
+ end
+ D_regbank[in_token][0] <= D_regbank[in_token-1][0]; // no carry into D: pass the depth fields through
+ IN_D_regbank[in_token][0] <= IN_D_regbank[in_token-1][0];
+ S_D_regbank[in_token][0] <= S_D_regbank[in_token-1][0];
+
+ end
+ end
+ else
+ begin
+ W_regbank[in_token][0] <= W_regbank[in_token-1][0] + 1;
+ IN_W_regbank[in_token][0] <= IN_W_regbank[in_token-1][0] + 1; // overridden below when the inner position reaches stride_width
+ if (IN_W_regbank[in_token-1][0] + 1 >= if_lt_config.data.stride_width) // inner W leaves the partition
+ begin
+ IN_W_regbank[in_token][0] <= 0;
+ S_W_regbank[in_token][0] <= S_W_regbank[in_token-1][0] + 1; // redundant: both branches of the if/else below reassign this (last non-blocking assignment wins)
+ if (S_W_regbank[in_token-1][0] + 1 >= if_lt_config.data.stride_w_limit) begin // partition index wraps at stride_w_limit
+ S_W_regbank[in_token][0] <= 0;
+ end else begin
+ S_W_regbank[in_token][0] <= S_W_regbank[in_token-1][0] + 1;
+ end
+ end
+ else
+ begin
+ S_W_regbank[in_token][0] <= S_W_regbank[in_token-1][0];
+ end
+ H_regbank[in_token][0] <= H_regbank[in_token-1][0]; // no carry into H/D: pass the height and depth fields through
+ D_regbank[in_token][0] <= D_regbank[in_token-1][0];
+
+ IN_H_regbank[in_token][0] <= IN_H_regbank[in_token-1][0];
+ IN_D_regbank[in_token][0] <= IN_D_regbank[in_token-1][0];
+
+ S_H_regbank[in_token][0] <= S_H_regbank[in_token-1][0];
+ S_D_regbank[in_token][0] <= S_D_regbank[in_token-1][0];
+ end
+ end
+ else // no channel wrap: only C advances, every other field is copied from the previous lane
+ begin
+ C_regbank[in_token][0] <= C_regbank[in_token-1][0] + 1;
+
+ W_regbank[in_token][0] <= W_regbank[in_token-1][0];
+ H_regbank[in_token][0] <= H_regbank[in_token-1][0];
+ D_regbank[in_token][0] <= D_regbank[in_token-1][0];
+
+ IN_W_regbank[in_token][0] <= IN_W_regbank[in_token-1][0];
+ IN_H_regbank[in_token][0] <= IN_H_regbank[in_token-1][0];
+ IN_D_regbank[in_token][0] <= IN_D_regbank[in_token-1][0];
+
+ S_W_regbank[in_token][0] <= S_W_regbank[in_token-1][0];
+ S_H_regbank[in_token][0] <= S_H_regbank[in_token-1][0];
+ S_D_regbank[in_token][0] <= S_D_regbank[in_token-1][0];
+ end
+ if (IN_W_regbank[in_token-1][0] >= if_lt_config.data.stride_width) // fix-up: previous lane's inner W is already at/over the stride; these later assignments override the carry chain above
+ begin
+ IN_W_regbank[in_token][0] <= 0;
+ if (S_W_regbank[in_token-1][0] + 1 >= if_lt_config.data.stride_w_limit) begin
+ S_W_regbank[in_token][0] <= 0;
+ end else begin
+ S_W_regbank[in_token][0] <= S_W_regbank[in_token-1][0] + 1;
+ end
+ end
+ if (IN_H_regbank[in_token-1][0] >= if_lt_config.data.stride_height) // same fix-up for inner H
+ begin
+ IN_H_regbank[in_token][0] <= 0;
+ if (S_H_regbank[in_token-1][0] + 1 >= if_lt_config.data.stride_h_limit) begin
+ S_H_regbank[in_token][0] <= 0;
+ end else begin
+ S_H_regbank[in_token][0] <= S_H_regbank[in_token-1][0] + 1;
+ end
+ end
+ if (IN_D_regbank[in_token-1][0] >= if_lt_config.data.stride_depth) // same fix-up for inner D; the depth partition index has no limit wrap here
+ begin
+ IN_D_regbank[in_token][0] <= 0;
+ S_D_regbank[in_token][0] <= S_D_regbank[in_token-1][0] + 1;
+ end
+ end
+ for (integer i = 1; i < (ELEMENTS_PER_CYCLE - in_token); i++) // delay line: lane k lags lane 0 by k cycles, so it gets k fewer stages and all lanes exit together
+ begin
+ C_regbank[in_token][i] <= C_regbank[in_token][i-1];
+ W_regbank[in_token][i] <= W_regbank[in_token][i-1];
+ H_regbank[in_token][i] <= H_regbank[in_token][i-1];
+ D_regbank[in_token][i] <= D_regbank[in_token][i-1];
+
+ IN_W_regbank[in_token][i] <= IN_W_regbank[in_token][i-1];
+ IN_H_regbank[in_token][i] <= IN_H_regbank[in_token][i-1];
+ IN_D_regbank[in_token][i] <= IN_D_regbank[in_token][i-1];
+
+ S_W_regbank[in_token][i] <= S_W_regbank[in_token][i-1];
+ S_H_regbank[in_token][i] <= S_H_regbank[in_token][i-1];
+ S_D_regbank[in_token][i] <= S_D_regbank[in_token][i-1];
+ end
+ C_d[in_token] <= C_regbank[in_token][(ELEMENTS_PER_CYCLE - in_token)-1]; // output registers tap the last used stage of this lane's delay line
+ W_d[in_token] <= W_regbank[in_token][(ELEMENTS_PER_CYCLE - in_token)-1];
+ H_d[in_token] <= H_regbank[in_token][(ELEMENTS_PER_CYCLE - in_token)-1];
+ D_d[in_token] <= D_regbank[in_token][(ELEMENTS_PER_CYCLE - in_token)-1];
+
+ IN_W_d[in_token] <= IN_W_regbank[in_token][(ELEMENTS_PER_CYCLE - in_token)-1];
+ IN_H_d[in_token] <= IN_H_regbank[in_token][(ELEMENTS_PER_CYCLE - in_token)-1];
+ IN_D_d[in_token] <= IN_D_regbank[in_token][(ELEMENTS_PER_CYCLE - in_token)-1];
+
+ S_W_d[in_token] <= S_W_regbank[in_token][(ELEMENTS_PER_CYCLE - in_token)-1];
+ S_H_d[in_token] <= S_H_regbank[in_token][(ELEMENTS_PER_CYCLE - in_token)-1];
+ S_D_d[in_token] <= S_D_regbank[in_token][(ELEMENTS_PER_CYCLE - in_token)-1];
+ Index_d[in_token] <= next_index + in_token - (ELEMENTS_PER_CYCLE * (ELEMENTS_PER_CYCLE)); // next_index has advanced ELEMENTS_PER_CYCLE groups (of ELEMENTS_PER_CYCLE tokens) past the data leaving the delay line
+ end
+ end
+ end //ff_calculate_tensor_indexes
+ end //gen_calculate_tensor_indexes
+endmodule //dla_lt_dimension_counter