diff options
Diffstat (limited to 'python/openvino/demo/ip/intel_ai_ip/verilog/dla_aux_depthwise_control.sv')
| -rw-r--r-- | python/openvino/demo/ip/intel_ai_ip/verilog/dla_aux_depthwise_control.sv | 484 |
1 files changed, 484 insertions, 0 deletions
// Copyright 2020-2023 Intel Corporation.
//
// This software and the related documents are Intel copyrighted materials,
// and your use of them is governed by the express license under which they
// were provided to you ("License"). Unless the License provides otherwise,
// you may not use, modify, copy, publish, distribute, disclose or transmit
// this software or the related documents without Intel's prior written
// permission.
//
// This software and the related documents are provided as is, with no express
// or implied warranties, other than those that are expressly stated in the
// License.

/*
 * Module `dla_aux_depthwise_control`
 *
 * Control of the core functionality of the auxiliary block.
 *
 * The module contains:
 *   - a three-stage registered computation of the effective (dilated) kernel size,
 *   - cascaded input counters (vector, width, height, channels),
 *   - a state machine that drives the feature `ready` signal towards the lane,
 *   - line-buffer, padding and stride control registers,
 *   - cascaded output counters that generate the per-group and per-tensor `done` pulses.
 *
 * WARNING! ONLY EDIT THE PARTS MARKED IN BETWEEN
 *          "START EDITING" AND "END EDITING"
 *
 * See README.md of the Example Aux block for more details.
 */

`undefineall
`resetall
`default_nettype none

`include "dla_acl_parameter_assert.svh"

module dla_aux_depthwise_control
  import dla_aux_depthwise_pkg::*;
#(
  parameter aux_depthwise_arch_params_t ARCH // Architecture parameters
) (
  input  wire                             clk                , // Clock
  input  wire                             i_resetn           , // active low reset
  //
  depthwise_config_to_control_if.receiver i_config_to_control, // Config to control connection
  output control_to_config_t              o_control_to_config, // Control to config connection
  depthwise_control_to_lane_if.sender     o_control_to_lane  , // Control to lane connection
  input  lane_to_control_t                i_lane_to_control  , // Lane to control connection
  //
  // NOTE(review): o_debug is not driven anywhere inside this module - confirm this is intended.
  output debug_control_t                  o_debug              // Debug output
);

/* synthesis translate_off */
`DLA_ACL_PARAMETER_ASSERT_MESSAGE(aux_data_pack_params_t'(i_config_to_control.data_pack_params) == ARCH.AUX_DATA_PACK_PARAMS,
  "i_config_to_control if parameters don't match data pack params")
`DLA_ACL_PARAMETER_ASSERT_MESSAGE(aux_special_params_t'(i_config_to_control.special_params) == ARCH.AUX_SPECIAL_PARAMS,
  "i_config_to_control if parameters don't match special params")
`DLA_ACL_PARAMETER_ASSERT_MESSAGE(aux_data_pack_params_t'(o_control_to_lane.data_pack_params) == ARCH.AUX_DATA_PACK_PARAMS,
  "o_control_to_lane if parameters don't match data pack params")
`DLA_ACL_PARAMETER_ASSERT_MESSAGE(aux_special_params_t'(o_control_to_lane.special_params) == ARCH.AUX_SPECIAL_PARAMS,
  "o_control_to_lane if parameters don't match special params")
/* synthesis translate_on */

//
// ------------------------------ START EDITING ------------------------------
//
  // Shorthand versions of parameters
  localparam NATIVE_VECTOR_SIZE       = ARCH.AUX_DATA_PACK_PARAMS.NATIVE_VECTOR_SIZE;
  localparam VECTOR_SIZE              = ARCH.AUX_DATA_PACK_PARAMS.VECTOR_SIZE;
  localparam MAX_WINDOW_HEIGHT        = ARCH.AUX_SPECIAL_PARAMS.MAX_WINDOW_HEIGHT;
  localparam MAX_WINDOW_WIDTH         = ARCH.AUX_SPECIAL_PARAMS.MAX_WINDOW_WIDTH;
  localparam MAX_STRIDE_HORIZONTAL    = ARCH.AUX_SPECIAL_PARAMS.MAX_STRIDE_HORIZONTAL;
  localparam MAX_STRIDE_VERTICAL      = ARCH.AUX_SPECIAL_PARAMS.MAX_STRIDE_VERTICAL;
  localparam MAX_DILATION_VERTICAL    = ARCH.AUX_SPECIAL_PARAMS.MAX_DILATION_VERTICAL;
  localparam MAX_DILATION_HORIZONTAL  = ARCH.AUX_SPECIAL_PARAMS.MAX_DILATION_HORIZONTAL;
  // Number of lines spanned by the tallest window at the largest vertical dilation
  localparam VERTICAL_LINES           = MAX_WINDOW_HEIGHT + ((MAX_WINDOW_HEIGHT-1) * (MAX_DILATION_VERTICAL-1));
  localparam WINDOW_BITS_VERTICAL     = $clog2(MAX_WINDOW_HEIGHT + 1);
  localparam WINDOW_BITS_HORIZONTAL   = $clog2(MAX_WINDOW_WIDTH + 1);
  localparam DILATION_BITS_VERTICAL   = $clog2(MAX_DILATION_VERTICAL + 1);
  localparam DILATION_BITS_HORIZONTAL = $clog2(MAX_DILATION_HORIZONTAL + 1);
  localparam VECTOR_RATIO             = NATIVE_VECTOR_SIZE / VECTOR_SIZE;
  localparam TILE_COUNT               = ARCH.AUX_DATA_PACK_PARAMS.GROUP_SIZE *
                                        ARCH.AUX_DATA_PACK_PARAMS.GROUP_NUM;

  // input valid counters
  logic [$clog2(VECTOR_RATIO + 1)-1:0]                                               count_in_vector;
  logic [$clog2(ARCH.AUX_SPECIAL_PARAMS.MAX_TILE_WIDTH + 1)-1:0]                     count_in_width;
  logic [$clog2(ARCH.AUX_SPECIAL_PARAMS.MAX_TILE_HEIGHT + 1)-1:0]                    count_in_height;
  logic [$clog2(ARCH.AUX_SPECIAL_PARAMS.MAX_TILE_CHANNELS + NATIVE_VECTOR_SIZE)-1:0] count_in_channels;

  // output valid counters
  logic [$clog2(VECTOR_RATIO + 1)-1:0]                                               count_out_vector;
  logic [$clog2(ARCH.AUX_SPECIAL_PARAMS.MAX_TILE_WIDTH + 1)-1:0]                     count_out_width;
  logic [$clog2(ARCH.AUX_SPECIAL_PARAMS.MAX_TILE_HEIGHT + 1)-1:0]                    count_out_height;
  logic [$clog2(ARCH.AUX_SPECIAL_PARAMS.MAX_TILE_CHANNELS + NATIVE_VECTOR_SIZE)-1:0] count_out_channels;

  //
  // Registered computation of the effective (dilated) kernel sizes:
  //
  //   eff_kernel = window + (window - 1) * (dilation - 1)
  //
  // The computation is split over three register stages:
  //   stage 1: (window - 1) and (dilation - 1)
  //   stage 2: their product
  //   stage 3: window + product
  //
  logic [WINDOW_BITS_VERTICAL-1:0]                            kernel_vert_minus_one;
  logic [WINDOW_BITS_HORIZONTAL-1:0]                          kernel_horiz_minus_one;

  logic [DILATION_BITS_VERTICAL-1:0]                          dilation_vert_minus_one;
  logic [DILATION_BITS_HORIZONTAL-1:0]                        dilation_horiz_minus_one;

  logic [WINDOW_BITS_VERTICAL+DILATION_BITS_VERTICAL-1:0]     kernel_x_dilation_vert;
  logic [WINDOW_BITS_HORIZONTAL+DILATION_BITS_HORIZONTAL-1:0] kernel_x_dilation_horiz;

  logic [WINDOW_BITS_VERTICAL+DILATION_BITS_VERTICAL:0]       eff_kernel_vert;
  logic [WINDOW_BITS_HORIZONTAL+DILATION_BITS_HORIZONTAL:0]   eff_kernel_horiz;

  always_ff @(posedge clk) begin : proc_eff_kernel
    if (~i_resetn) begin
      kernel_vert_minus_one    <= '0;
      kernel_horiz_minus_one   <= '0;
      dilation_vert_minus_one  <= '0;
      dilation_horiz_minus_one <= '0;
      kernel_x_dilation_vert   <= '0;
      kernel_x_dilation_horiz  <= '0;
      eff_kernel_vert          <= '0;
      eff_kernel_horiz         <= '0;
    end else begin
      // stage 1
      kernel_vert_minus_one    <= (i_config_to_control.data[0][0].window_height - 1);
      kernel_horiz_minus_one   <= (i_config_to_control.data[0][0].window_width - 1);
      dilation_vert_minus_one  <= (i_config_to_control.data[0][0].dilation_vertical - 1);
      dilation_horiz_minus_one <= (i_config_to_control.data[0][0].dilation_horizontal - 1);
      // stage 2
      kernel_x_dilation_vert   <= kernel_vert_minus_one * dilation_vert_minus_one;
      kernel_x_dilation_horiz  <= kernel_horiz_minus_one * dilation_horiz_minus_one;
      // stage 3
      eff_kernel_vert          <= i_config_to_control.data[0][0].window_height + kernel_x_dilation_vert;
      eff_kernel_horiz         <= i_config_to_control.data[0][0].window_width + kernel_x_dilation_horiz;
    end
  end : proc_eff_kernel

  //
  // Input valid counter comprises cascaded counters of vector, width, height and channels.
  //
  // This process also generates:
  //   * input_group_done     - one-cycle pulse when the height counter wraps (one channel group
  //                            has been received),
  //   * feature_almost_ready - hint for the state machine below that enough features for a
  //                            first output are about to be available,
  //   * configured_starting / configured_ending - toggle flags towards the lane.
  //
  // The input backpressure signal itself (`ready`) is driven by the state machine below.
  //
  logic input_group_done;
  logic feature_ready;
  logic feature_almost_ready;
  logic configured_delayed;

  assign o_control_to_lane.data[0][0].ready = feature_ready;

  always_ff @(posedge clk) begin : proc_input_counters
    // Defaults; the pulses below are re-asserted only in the cycle their condition holds
    configured_delayed   <= i_config_to_control.data[0][0].configured;
    input_group_done     <= 1'b0;
    feature_almost_ready <= 1'b0;

    // Toggle on every rising edge of 'configured'
    if (i_config_to_control.data[0][0].configured & ~configured_delayed)
      o_control_to_lane.data[0][0].configured_starting <= ~o_control_to_lane.data[0][0].configured_starting;
    // Toggle every time the tensor 'done' pulse is sent to the config decoder
    if (o_control_to_config.done)
      o_control_to_lane.data[0][0].configured_ending <= ~o_control_to_lane.data[0][0].configured_ending;

    // Nested counters for channels, line and column, which operate only when the core's input is valid
    if (i_lane_to_control.core_input_valid) begin
      // shallow channels counter
      count_in_vector <= count_in_vector + 1'b1;
      if (count_in_vector >= VECTOR_RATIO-1) begin
        count_in_vector <= '0;
        // column counter
        count_in_width <= count_in_width + 1'b1;
        // We want to stop reading features if filters are not ready and if we are close to getting
        // enough features to produce output. Enough features equal a number of rows = window_height
        // and a number of columns = window_width.
        //
        // The column check is written as `count + 2 >= window_width` rather than
        // `count >= window_width - 2`: the comparison is unsigned, so the subtraction would wrap
        // for window_width < 2 and the flag could never be raised for 1-wide windows.
        if ((count_in_height >= kernel_vert_minus_one) &&
            (count_in_width + 2 >= i_config_to_control.data[0][0].window_width)) begin
          feature_almost_ready <= 1'b1;
        end
        if (count_in_width >= i_config_to_control.data[0][0].tile_width-1) begin
          count_in_width <= '0;
          // line counter
          count_in_height <= count_in_height + 1'b1;
          if (count_in_height >= i_config_to_control.data[0][0].tile_height-1) begin
            count_in_height <= '0;
            // one channel group has been fully received
            input_group_done <= 1'b1;
            // channels counter
            count_in_channels <= $bits(count_in_channels)'(count_in_channels + NATIVE_VECTOR_SIZE);
            if (count_in_channels >= i_config_to_control.data[0][0].tile_channels - NATIVE_VECTOR_SIZE) begin
              // input tensor is finished
              count_in_channels <= '0;
            end
          end
        end
      end
    end

    // reset counters if in reset or not configured
    if (~i_resetn || ~i_config_to_control.data[0][0].configured) begin
      count_in_vector      <= '0;
      count_in_width       <= '0;
      count_in_height      <= '0;
      count_in_channels    <= '0;
      input_group_done     <= 1'b0;
      feature_almost_ready <= 1'b0;
      configured_delayed   <= '0;
    end
    if (~i_resetn) begin
      o_control_to_lane.data[0][0].configured_starting <= 1'b0;
      o_control_to_lane.data[0][0].configured_ending   <= 1'b0;
    end
  end : proc_input_counters

  //
  // state machine to handle when features and filters should be ready to be received
  // right now, we receive filters first then features, then we process
  //
  //   IDLE           : not configured, features are backpressured
  //   FILTER_FEATURE : features are accepted while waiting for the filters
  //   FILTER         : features are backpressured until the filters are ready
  //   FEATURE        : filters are ready, features are accepted until the group is complete
  //   PROCESSING     : features are backpressured until the lane / tensor is done
  //
  typedef enum logic [2:0] {
    IDLE           = 3'b000,
    FILTER_FEATURE = 3'b001,
    FILTER         = 3'b010,
    FEATURE        = 3'b011,
    PROCESSING     = 3'b100
  } state_t;

  state_t state, state_next;

  always_ff @(posedge clk) begin : proc_state
    if (~i_resetn) begin
      state <= IDLE;
    end else begin
      state <= state_next;
    end
  end : proc_state

  always_comb begin : proc_state_next
    // Defaults: hold the state and backpressure the features
    state_next    = state;
    feature_ready = 1'b0;
    case (state)
      IDLE: begin
        if (i_config_to_control.data[0][0].configured) begin
          state_next    = FILTER_FEATURE;
          feature_ready = 1'b1;
        end
      end
      FILTER_FEATURE: begin
        feature_ready = 1'b1;
        if (feature_almost_ready) begin
          feature_ready = 1'b0;
          state_next    = FILTER;
        end
        // Filter readiness takes priority over the transition above
        if (i_lane_to_control.depthwise_filter_ready)
          state_next = FEATURE;
      end
      FILTER: begin
        if (i_lane_to_control.depthwise_filter_ready)
          state_next = FEATURE;
      end
      FEATURE: begin
        feature_ready = 1'b1;
        if (input_group_done) begin
          state_next    = PROCESSING;
          feature_ready = 1'b0;
        end else if (o_control_to_lane.data[0][0].done & i_config_to_control.data[0][0].configured) begin
          state_next    = FILTER;
          feature_ready = 1'b0;
        end else if (o_control_to_lane.data[0][0].done & ~i_config_to_control.data[0][0].configured) begin
          state_next    = IDLE;
          feature_ready = 1'b0;
        end
      end
      PROCESSING: begin
        if (o_control_to_config.done)
          state_next = IDLE;
        else if (o_control_to_lane.data[0][0].done & i_config_to_control.data[0][0].configured)
          state_next = FILTER_FEATURE;
        else if (o_control_to_lane.data[0][0].done & ~i_config_to_control.data[0][0].configured)
          state_next = IDLE;
      end
      default: state_next = IDLE; // Default state
    endcase
  end : proc_state_next

  // Pass dilation from config to lane
  assign o_control_to_lane.data[0][0].dilation_vertical   = i_config_to_control.data[0][0].dilation_vertical;
  assign o_control_to_lane.data[0][0].dilation_horizontal = i_config_to_control.data[0][0].dilation_horizontal;

  //
  // Line-buffers inside the core are implemented as FIFOs. FIFO synchronization and handover
  // between consecutive tensors are achieved by the following steps:
  //   * Line buffers are filled with tensor-width amount of data at the beginning of each tensor.
  //   * The fill level is kept constant throughout the tensor.
  //   * At the end of each tensor all FIFOs are drained to prepare them for the next tensor.
  //
  always_ff @(posedge clk) begin : proc_line_buff_control
    o_control_to_lane.data[0][0].line_buff_wait_fill <= count_in_height == 0;
    o_control_to_lane.data[0][0].line_buff_flush     <= count_in_height == i_config_to_control.data[0][0].tile_height-1;
    // Flush the FIFO fill level when window height is configured to be 1
    if (i_config_to_control.data[0][0].window_height == 1 && i_config_to_control.data[0][0].configured) begin
      o_control_to_lane.data[0][0].line_buff_wait_fill <= 1'b0;
      o_control_to_lane.data[0][0].line_buff_flush     <= 1'b1;
    end
    if (~i_resetn) begin
      o_control_to_lane.data[0][0].line_buff_wait_fill <= 1'b0;
      o_control_to_lane.data[0][0].line_buff_flush     <= 1'b0;
    end
  end : proc_line_buff_control

  //
  // Padding generator control consists of multiple enable flags. Each flag enables a set/reset
  // mode of a register or act like select bits of a multiplexer.
  //
  // If max window size is larger than the configured window size, then the generator is used to
  // load the identity element of the operation into the out of bound registers.
  //
  // NOTE(review): en_pad_zero_vert / en_pad_zero_horiz are not written for entries outside the
  // active window (and not at reset), so they keep their previous value there - confirm the lane
  // gives en_pad_nan_* priority for those entries.
  //
  always_ff @(posedge clk) begin : proc_pad_control
    o_control_to_lane.data[0][0].window_height <= i_config_to_control.data[0][0].window_height;
    o_control_to_lane.data[0][0].window_width  <= i_config_to_control.data[0][0].window_width;
    // Vertical padding control
    for (int i = 0; i < TILE_COUNT; i++) begin : proc_pad_control_vert
      for (int j = 0; j < VERTICAL_LINES; j++) begin
        // For the height of the active (dilated) window, determine if, when and which padding
        // mode is enabled per-tile and per-line
        if (j < eff_kernel_vert) begin
          o_control_to_lane.data[0][0].en_pad_zero_vert[i][j] <= (
            count_in_height < j + i_config_to_control.data[0][0].tile_vertical_start[i] ||
            count_in_height > j + i_config_to_control.data[0][0].tile_vertical_end  [i]) &&
            i_config_to_control.data[0][0].padding_mode == 2'b00;
          //
          // TODO: Implement constant and reflection boundary conditions
          //
          o_control_to_lane.data[0][0].en_pad_nan_vert[i][j] <= (
            count_in_height < j + i_config_to_control.data[0][0].tile_vertical_start[i] ||
            count_in_height > j + i_config_to_control.data[0][0].tile_vertical_end  [i]) &&
            i_config_to_control.data[0][0].padding_ignore;
        end else begin
          // For the lines outside the active window, pad everything to NaN, which is defined to be
          // the identity element
          o_control_to_lane.data[0][0].en_pad_nan_vert[i][j] <= 1'b1;
        end
      end
    end : proc_pad_control_vert
    // Horizontal padding control
    for (int i = 0; i < TILE_COUNT; i++) begin : proc_pad_control_horiz
      for (int j = 0; j < MAX_WINDOW_WIDTH; j++) begin
        // For the width of the active window, determine if, when and which padding mode is enabled
        // per-tile and per-column
        if (j < i_config_to_control.data[0][0].window_width) begin
          o_control_to_lane.data[0][0].en_pad_zero_horiz[i][j] <= (
            count_in_width < j + i_config_to_control.data[0][0].tile_horizontal_start[i] ||
            count_in_width > j + i_config_to_control.data[0][0].tile_horizontal_end  [i]) &&
            i_config_to_control.data[0][0].padding_mode == 2'b00;
          //
          // TODO: Implement constant and reflection boundary conditions
          //
          o_control_to_lane.data[0][0].en_pad_nan_horiz[i][j] <= (
            count_in_width < j + i_config_to_control.data[0][0].tile_horizontal_start[i] ||
            count_in_width > j + i_config_to_control.data[0][0].tile_horizontal_end  [i]) &&
            i_config_to_control.data[0][0].padding_ignore;
        end else begin
          // For the columns outside the active window pad everything to NaN, which is defined to be the
          // identity element
          o_control_to_lane.data[0][0].en_pad_nan_horiz[i][j] <= 1'b1;
        end
      end
    end : proc_pad_control_horiz
    // These flags mark the area of padding
    o_control_to_lane.data[0][0].is_padding_zone_vert  <= i_config_to_control.data[0][0].window_height > 1 &&
                                                          count_in_height < (eff_kernel_vert-1);
    o_control_to_lane.data[0][0].is_padding_zone_horiz <= i_config_to_control.data[0][0].window_width > 1 &&
                                                          count_in_width < (eff_kernel_horiz-1);
    if (~i_resetn) begin
      o_control_to_lane.data[0][0].is_padding_zone_vert  <= 1'b1;
      o_control_to_lane.data[0][0].is_padding_zone_horiz <= 1'b1;
      o_control_to_lane.data[0][0].window_height         <= MAX_WINDOW_HEIGHT;
      o_control_to_lane.data[0][0].window_width          <= MAX_WINDOW_WIDTH;
    end
  end : proc_pad_control

  //
  // Stride counters and stride valid signal generator.
  //
  // Other input counters are used in conjunction
  //
  always_ff @(posedge clk) begin : proc_stride
    // stride counters (one bit wider than needed for the stride value, so that the all-ones
    // value loaded during reset is distinct from the values used while counting)
    logic [$clog2(MAX_STRIDE_VERTICAL  +1):0] count_stride_vert ;
    logic [$clog2(MAX_STRIDE_HORIZONTAL+1):0] count_stride_horiz;
    // Count only when input is valid and shallow channels counter is about to overflow (which means
    // we are moving on to the next face coordinates)
    if (i_lane_to_control.core_input_valid) begin
      if (count_in_vector >= VECTOR_RATIO-1) begin
        // By default, increment the horizontal stride counter, as long as the width-counter has
        // counted minimum window-width number of elements (so core has a full window to operate on).
        if (count_in_width >= kernel_horiz_minus_one) begin
          count_stride_horiz <= count_stride_horiz + 1'b1;
        end
        // Reset horizontal stride counter when it overflows
        if (count_stride_horiz >= i_config_to_control.data[0][0].stride_horizontal - 1) begin
          count_stride_horiz <= '0;
        end
        // Vertical stride counter is manipulated only when the input width-counter is about to
        // overflow
        if (count_in_width >= i_config_to_control.data[0][0].tile_width-1) begin
          // Reset the horizontal counter
          count_stride_horiz <= '0;
          // By default, increment the vertical stride counter, as long as the height-counter has
          // counted minimum window-height number of elements (so core has a full window to operate
          // on).
          if (count_in_height >= kernel_vert_minus_one) begin
            count_stride_vert <= count_stride_vert + 1'b1;
          end
          // Reset vertical stride counter when it overflows or when input height-counter is about
          // to overflow
          if (
            count_stride_vert >= i_config_to_control.data[0][0].stride_vertical - 1 ||
            count_in_height   >= i_config_to_control.data[0][0].tile_height - 1
          ) begin
            count_stride_vert <= '0;
          end
        end
      end
    end
    // Stride counters must be reset when window height is configured to 1
    if (
      i_config_to_control.data[0][0].window_height == 1 && i_config_to_control.data[0][0].configured &&
      count_stride_horiz == '1 && count_stride_vert == '1
    ) begin
      count_stride_vert  <= '0;
      count_stride_horiz <= '0;
    end
    // During reset both counters are set to give one extra cycle to the counters
    if (~i_resetn || ~i_config_to_control.data[0][0].configured) begin
      count_stride_horiz <= '1;
      count_stride_vert  <= '1;
    end
    //
    // Stride is valid when both horizontal and vertical counters are zero
    //
    o_control_to_lane.data[0][0].stride_valid <= count_stride_vert == '0 && count_stride_horiz == '0;
  end : proc_stride

  //
  // Output valid counter comprises cascaded counters of vector, width, height and channels.
  //
  // The width and height counters advance by the configured stride and wrap once they reach
  // (tile size - window size - (window - 1) * (dilation - 1)).
  //
  // A lane 'done' pulse is sent every time a channel group is finished, and a 'done' pulse is sent
  // to the config decoder when the last tensor element is processed.
  //
  always_ff @(posedge clk) begin : proc_output_counters
    // clear the done signals by default
    o_control_to_config.done          <= 1'b0;
    o_control_to_lane.data[0][0].done <= 1'b0;
    // Nested counters for channels, line and column, which operate only when core has a valid result.
    if (i_lane_to_control.core_output_valid) begin
      // shallow channels counter
      count_out_vector <= count_out_vector + 1'b1;
      if (count_out_vector >= VECTOR_RATIO-1) begin
        count_out_vector <= '0;
        // column counter
        count_out_width <= $bits(count_out_width)'(count_out_width + i_config_to_control.data[0][0].stride_horizontal);
        if (count_out_width >= (i_config_to_control.data[0][0].tile_width -
                                i_config_to_control.data[0][0].window_width) -
                               kernel_x_dilation_horiz) begin
          count_out_width <= '0;
          // line counter
          count_out_height <= $bits(count_out_height)'(count_out_height + i_config_to_control.data[0][0].stride_vertical);
          if (count_out_height >= i_config_to_control.data[0][0].tile_height -
                                  i_config_to_control.data[0][0].window_height -
                                  kernel_x_dilation_vert) begin
            count_out_height <= '0;
            // send a 1 clock cycle long 'done' every time channels counter increment
            // to indicate a new set of filters
            o_control_to_lane.data[0][0].done <= 1'b1;
            // channels counter
            count_out_channels <= $bits(count_out_channels)'(count_out_channels + NATIVE_VECTOR_SIZE);
            if (count_out_channels >= i_config_to_control.data[0][0].tile_channels - NATIVE_VECTOR_SIZE) begin
              count_out_channels <= '0;
              // send a 1 clock cycle long 'done' pulse after all counters reset to 0
              o_control_to_config.done <= 1'b1;
            end
          end
        end
      end
    end
    // Reset counters if the module is in reset or not configured
    if (~i_resetn || ~i_config_to_control.data[0][0].configured) begin
      count_out_vector   <= '0;
      count_out_width    <= '0;
      count_out_height   <= '0;
      count_out_channels <= '0;
    end
  end : proc_output_counters
//
// ------------------------------ END EDITING ------------------------------
//

endmodule
