// Copyright 2020-2023 Intel Corporation.
//
// This software and the related documents are Intel copyrighted materials,
// and your use of them is governed by the express license under which they
// were provided to you ("License"). Unless the License provides otherwise,
// you may not use, modify, copy, publish, distribute, disclose or transmit
// this software or the related documents without Intel's prior written
// permission.
//
// This software and the related documents are provided as is, with no express
// or implied warranties, other than those that are expressly stated in the
// License.

/*
 * Module `dla_aux_depthwise_control`
 *
 * Control of the core functionality of the auxiliary block.
 *
 * WARNING! ONLY EDIT THE PARTS MARKED IN BETWEEN
 *          "START EDITING" AND "END EDITING"
 *
 * See README.md of the Example Aux block for more details.
 *
 * Notes on the ports as they are used in this file:
 *   - `i_resetn` is only ever sampled inside `always_ff @(posedge clk)` processes,
 *     i.e. it acts as a synchronous, active-low reset.
 *   - Only element `data[0][0]` of `i_config_to_control` / `o_control_to_lane` is accessed.
 *   - Only the `done` member of `o_control_to_config` is driven.
 *   - NOTE(review): `o_debug` is not driven anywhere in this module - confirm this is intended.
 */

`undefineall
`resetall
`default_nettype none

`include "dla_acl_parameter_assert.svh"

module dla_aux_depthwise_control
  import dla_aux_depthwise_pkg::*;
#(
  parameter aux_depthwise_arch_params_t ARCH // Architecture parameters
) (
  input  wire                             clk                , // Clock
  input  wire                             i_resetn           , // active low reset
  //
  depthwise_config_to_control_if.receiver i_config_to_control, // Config to control connection
  output control_to_config_t              o_control_to_config, // Control to config connection
  depthwise_control_to_lane_if.sender     o_control_to_lane  , // Control to lane connection
  input  lane_to_control_t                i_lane_to_control  , // Lane to control connection
  //
  output debug_control_t                  o_debug              // Debug output
);

// Elaboration-time checks: the parameters carried by the config interface must match the
// architecture parameters this module was instantiated with.
/* synthesis translate_off */
`DLA_ACL_PARAMETER_ASSERT_MESSAGE(aux_data_pack_params_t'(i_config_to_control.data_pack_params) == ARCH.AUX_DATA_PACK_PARAMS, "i_config_to_control if parameters don't match data pack params")
`DLA_ACL_PARAMETER_ASSERT_MESSAGE(aux_special_params_t'(i_config_to_control.special_params) == ARCH.AUX_SPECIAL_PARAMS, "i_config_to_control if parameters don't match special params")
`DLA_ACL_PARAMETER_ASSERT_MESSAGE(aux_data_pack_params_t'(o_control_to_lane.data_pack_params) == ARCH.AUX_DATA_PACK_PARAMS, "o_control_to_lane if parameters don't match data pack params")
`DLA_ACL_PARAMETER_ASSERT_MESSAGE(aux_special_params_t'(o_control_to_lane.special_params) == ARCH.AUX_SPECIAL_PARAMS, "o_control_to_lane if parameters don't match special params")
/* synthesis translate_on */

//
// ------------------------------ START EDITING ------------------------------
//

// Shorthand versions of parameters
localparam NATIVE_VECTOR_SIZE      = ARCH.AUX_DATA_PACK_PARAMS.NATIVE_VECTOR_SIZE    ;
localparam VECTOR_SIZE             = ARCH.AUX_DATA_PACK_PARAMS.VECTOR_SIZE           ;
localparam MAX_WINDOW_HEIGHT       = ARCH.AUX_SPECIAL_PARAMS.MAX_WINDOW_HEIGHT       ;
localparam MAX_WINDOW_WIDTH        = ARCH.AUX_SPECIAL_PARAMS.MAX_WINDOW_WIDTH        ;
localparam MAX_STRIDE_HORIZONTAL   = ARCH.AUX_SPECIAL_PARAMS.MAX_STRIDE_HORIZONTAL   ;
localparam MAX_STRIDE_VERTICAL     = ARCH.AUX_SPECIAL_PARAMS.MAX_STRIDE_VERTICAL     ;
localparam MAX_DILATION_VERTICAL   = ARCH.AUX_SPECIAL_PARAMS.MAX_DILATION_VERTICAL   ;
localparam MAX_DILATION_HORIZONTAL = ARCH.AUX_SPECIAL_PARAMS.MAX_DILATION_HORIZONTAL ;

// Number of lines spanned by the tallest window at the largest vertical dilation:
// MAX_WINDOW_HEIGHT taps plus (MAX_DILATION_VERTICAL-1) skipped lines between adjacent taps.
localparam VERTICAL_LINES           = MAX_WINDOW_HEIGHT + ((MAX_WINDOW_HEIGHT-1) * (MAX_DILATION_VERTICAL-1));

// Bit widths able to hold the values 0..MAX inclusive
localparam WINDOW_BITS_VERTICAL     = $clog2(MAX_WINDOW_HEIGHT + 1);
localparam WINDOW_BITS_HORIZONTAL   = $clog2(MAX_WINDOW_WIDTH + 1);
localparam DILATION_BITS_VERTICAL   = $clog2(MAX_DILATION_VERTICAL + 1);
localparam DILATION_BITS_HORIZONTAL = $clog2(MAX_DILATION_HORIZONTAL + 1);

// Number of input beats (VECTOR_SIZE wide) that make up one native vector
localparam VECTOR_RATIO             = NATIVE_VECTOR_SIZE / VECTOR_SIZE;
localparam TILE_COUNT               = ARCH.AUX_DATA_PACK_PARAMS.GROUP_SIZE * ARCH.AUX_DATA_PACK_PARAMS.GROUP_NUM;

// input valid counters
logic [$clog2(                        VECTOR_RATIO    + 1 )-1:0]                count_in_vector  ;
logic [$clog2(ARCH.AUX_SPECIAL_PARAMS.MAX_TILE_WIDTH  + 1 )-1:0]                count_in_width   ;
logic [$clog2(ARCH.AUX_SPECIAL_PARAMS.MAX_TILE_HEIGHT + 1 )-1:0]                count_in_height  ;
logic [$clog2(ARCH.AUX_SPECIAL_PARAMS.MAX_TILE_CHANNELS + NATIVE_VECTOR_SIZE)-1:0] count_in_channels;

// output valid counters
logic [$clog2(                        VECTOR_RATIO    + 1 )-1:0]                count_out_vector  ;
logic [$clog2(ARCH.AUX_SPECIAL_PARAMS.MAX_TILE_WIDTH  + 1 )-1:0]                count_out_width   ;
logic [$clog2(ARCH.AUX_SPECIAL_PARAMS.MAX_TILE_HEIGHT + 1 )-1:0]                count_out_height  ;
logic [$clog2(ARCH.AUX_SPECIAL_PARAMS.MAX_TILE_CHANNELS + NATIVE_VECTOR_SIZE)-1:0] count_out_channels;

//
// Register the computation of the effective filter sizes to be used.
//
// The effective (dilated) kernel extent is
//     eff_kernel = window + (window - 1) * (dilation - 1)
// and is computed over three register stages:
//     stage 1: window - 1, dilation - 1
//     stage 2: (window - 1) * (dilation - 1)
//     stage 3: window + stage-2 product
// so a change of the configuration takes three clock cycles to reach eff_kernel_*.
//
logic [WINDOW_BITS_VERTICAL-1:0]                              kernel_vert_minus_one;
logic [WINDOW_BITS_HORIZONTAL-1:0]                            kernel_horiz_minus_one;
logic [DILATION_BITS_VERTICAL-1:0]                            dilation_vert_minus_one;
logic [DILATION_BITS_HORIZONTAL-1:0]                          dilation_horiz_minus_one;
logic [WINDOW_BITS_VERTICAL+DILATION_BITS_VERTICAL-1:0]       kernel_x_dilation_vert;
logic [WINDOW_BITS_HORIZONTAL+DILATION_BITS_HORIZONTAL-1:0]   kernel_x_dilation_horiz;
logic [WINDOW_BITS_VERTICAL+DILATION_BITS_VERTICAL:0]         eff_kernel_vert;
logic [WINDOW_BITS_HORIZONTAL+DILATION_BITS_HORIZONTAL:0]     eff_kernel_horiz;

always_ff @(posedge clk) begin
  if (~i_resetn) begin
    // All of these are plain packed vectors, so the fill literal is sufficient.
    kernel_vert_minus_one    <= '0;
    kernel_horiz_minus_one   <= '0;
    dilation_vert_minus_one  <= '0;
    dilation_horiz_minus_one <= '0;
    kernel_x_dilation_vert   <= '0;
    kernel_x_dilation_horiz  <= '0;
    eff_kernel_vert          <= '0;
    eff_kernel_horiz         <= '0;
  end else begin
    // stage 1
    kernel_vert_minus_one    <= (i_config_to_control.data[0][0].window_height       - 1);
    kernel_horiz_minus_one   <= (i_config_to_control.data[0][0].window_width        - 1);
    dilation_vert_minus_one  <= (i_config_to_control.data[0][0].dilation_vertical   - 1);
    dilation_horiz_minus_one <= (i_config_to_control.data[0][0].dilation_horizontal - 1);
    // stage 2
    kernel_x_dilation_vert   <= kernel_vert_minus_one  * dilation_vert_minus_one;
    kernel_x_dilation_horiz  <= kernel_horiz_minus_one * dilation_horiz_minus_one;
    // stage 3
    eff_kernel_vert          <= i_config_to_control.data[0][0].window_height + kernel_x_dilation_vert;
    eff_kernel_horiz         <= i_config_to_control.data[0][0].window_width  + kernel_x_dilation_horiz;
  end
end

//
// Input valid counter comprises cascaded counters of vector, width, height and channels.
//
// The input backpressure signal is also generated in this process.
//
logic input_group_done;      // 1-cycle pulse: one full (width x height) plane of a channel group was received
logic feature_ready;         // combinational, driven by the state machine below
logic feature_almost_ready;  // 1-cycle pulse: almost a full window of features has been received
logic configured_delayed;    // 'configured' delayed by one cycle, used for rising-edge detection

assign o_control_to_lane.data[0][0].ready = feature_ready;

always_ff @(posedge clk) begin : proc_input_counters
  configured_delayed   <= i_config_to_control.data[0][0].configured;

  // Pulses are low unless raised further down in this process
  input_group_done     <= 0;
  feature_almost_ready <= 0;

  // Toggle (rather than pulse) on every rising edge of 'configured' ...
  if (i_config_to_control.data[0][0].configured & ~configured_delayed)
    o_control_to_lane.data[0][0].configured_starting <= ~o_control_to_lane.data[0][0].configured_starting;

  // ... and on every 'done' reported to the config decoder
  if (o_control_to_config.done)
    o_control_to_lane.data[0][0].configured_ending <= ~o_control_to_lane.data[0][0].configured_ending;

  // Nested counters for channels, line and column, which operate only when the core's input is valid
  if (i_lane_to_control.core_input_valid) begin
    // shallow channels counter
    count_in_vector <= count_in_vector + 1'b1;

    if (count_in_vector >= VECTOR_RATIO-1) begin
      count_in_vector <= '0;

      // column counter
      count_in_width <= count_in_width + 1'b1;

      // We want to stop reading features if filters are not ready and if we are close to getting
      // enough features to produce output. Enough features equal a number of rows = window_height
      // and a number of columns = window_width.
      // NOTE(review): `window_width-2` is evaluated before the comparison; if window_width can be
      // configured below 2 and the field is unsigned, the subtraction wraps and this flag never
      // asserts - confirm against the config struct definition.
      if ((count_in_height >= kernel_vert_minus_one) &&
          (count_in_width  >= i_config_to_control.data[0][0].window_width-2)) begin
        feature_almost_ready <= 1;
      end

      if (count_in_width >= i_config_to_control.data[0][0].tile_width-1) begin
        count_in_width <= '0;

        // line counter
        count_in_height <= count_in_height + 1'b1;

        if (count_in_height >= i_config_to_control.data[0][0].tile_height-1) begin
          count_in_height <= '0;

          // One plane of the current channel group is complete. The state machine below reacts
          // to this pulse by deasserting feature_ready, which backpressures the input pipeline.
          input_group_done <= 1;

          // channels counter
          count_in_channels <= $bits(count_in_channels)'(count_in_channels + NATIVE_VECTOR_SIZE);

          if (count_in_channels >= i_config_to_control.data[0][0].tile_channels - NATIVE_VECTOR_SIZE) begin
            // input tensor is finished
            count_in_channels <= '0;
          end
        end
      end
    end
  end

  // reset counters if in reset or not configured (must stay last: it overrides everything above)
  if (~i_resetn || ~i_config_to_control.data[0][0].configured) begin
    count_in_vector      <= '0;
    count_in_width       <= '0;
    count_in_height      <= '0;
    count_in_channels    <= '0;
    input_group_done     <= 1'b0;
    feature_almost_ready <= 0;
    configured_delayed   <= '0;
  end

  // The toggles are only cleared by reset, not by 'configured' going low
  if (~i_resetn) begin
    o_control_to_lane.data[0][0].configured_starting <= 0;
    o_control_to_lane.data[0][0].configured_ending   <= 0;
  end
end : proc_input_counters

//
// state machine to handle when features and filters should be ready to be received
// right now, we receive filters first then features, then we process
//
//   IDLE            waiting for 'configured'
//   FILTER_FEATURE  accepting features while waiting for the filters
//   FILTER          features stalled, still waiting for the filters
//   FEATURE         filters are ready, accepting features
//   PROCESSING      a full input group was received, features stalled until 'done'
//
typedef enum logic [2:0] {
  IDLE           = 3'b000,
  FILTER_FEATURE = 3'b001,
  FILTER         = 3'b010,
  FEATURE        = 3'b011,
  PROCESSING     = 3'b100
} state_t;

state_t state, state_next;

always_ff @(posedge clk) begin
  if (~i_resetn) begin
    state <= IDLE;
  end else begin
    state <= state_next;
  end
end

always_comb begin
  // Defaults: hold the state, backpressure the features
  state_next    = state;
  feature_ready = 0;

  case(state)
    IDLE: begin
      feature_ready = 0;
      if (i_config_to_control.data[0][0].configured) begin
        state_next    = FILTER_FEATURE;
        feature_ready = 1;
      end
    end

    FILTER_FEATURE: begin
      feature_ready = 1;
      if (feature_almost_ready) begin
        feature_ready = 0;
        state_next    = FILTER;
      end
      // Filters becoming ready takes priority over the stall above (later assignment wins)
      if (i_lane_to_control.depthwise_filter_ready)
        state_next = FEATURE;
    end

    FILTER: begin
      if (i_lane_to_control.depthwise_filter_ready)
        state_next = FEATURE;
    end

    FEATURE: begin
      feature_ready = 1;
      if (input_group_done) begin
        state_next    = PROCESSING;
        feature_ready = 0;
      end else if (o_control_to_lane.data[0][0].done & i_config_to_control.data[0][0].configured) begin
        state_next    = FILTER;
        feature_ready = 0;
      end else if (o_control_to_lane.data[0][0].done & ~i_config_to_control.data[0][0].configured) begin
        state_next    = IDLE;
        feature_ready = 0;
      end
    end

    PROCESSING: begin
      if (o_control_to_config.done) begin
        state_next    = IDLE;
        feature_ready = 0;
      end else if (o_control_to_lane.data[0][0].done & i_config_to_control.data[0][0].configured)
        state_next = FILTER_FEATURE;
      else if (o_control_to_lane.data[0][0].done & ~i_config_to_control.data[0][0].configured)
        state_next = IDLE;
    end

    default: state_next = IDLE; // Default state
  endcase
end

// Pass dilation from config to lane
assign o_control_to_lane.data[0][0].dilation_vertical   = i_config_to_control.data[0][0].dilation_vertical;
assign o_control_to_lane.data[0][0].dilation_horizontal = i_config_to_control.data[0][0].dilation_horizontal;

//
// Line-buffers inside the core are implemented as FIFOs. FIFO synchronization and handover
// between consecutive tensors are achieved by the following steps:
//   * Line buffers are filled with tensor-width amount of data at the beginning of each tensor.
//   * The fill level is kept constant throughout the tensor.
//   * At the end of each tensor all FIFOs are drained to prepare them for the next tensor.
//
always_ff @(posedge clk) begin : proc_line_buff_control
  // Priority, highest first: reset, window height of 1, regular operation.
  if (~i_resetn) begin
    o_control_to_lane.data[0][0].line_buff_wait_fill <= 1'b0;
    o_control_to_lane.data[0][0].line_buff_flush     <= 1'b0;
  end else if (i_config_to_control.data[0][0].window_height == 1 &&
               i_config_to_control.data[0][0].configured) begin
    // A window that is one line tall is configured: never wait for a fill, always flush.
    o_control_to_lane.data[0][0].line_buff_wait_fill <= 1'b0;
    o_control_to_lane.data[0][0].line_buff_flush     <= 1'b1;
  end else begin
    // Wait-for-fill is flagged on the first input line, flush on the last input line.
    o_control_to_lane.data[0][0].line_buff_wait_fill <= count_in_height == 0;
    o_control_to_lane.data[0][0].line_buff_flush     <= count_in_height == i_config_to_control.data[0][0].tile_height-1;
  end
end : proc_line_buff_control

//
// Padding generator control consists of multiple enable flags. Each flag enables a set/reset
// mode of a register or acts like the select bits of a multiplexer.
//
// If max window size is larger than the configured window size, then the generator is used to
// load the identity element of the operation into the out of bound registers.
//
// proc_pad_control: registers the per-tile / per-line padding enables and the padding-zone flags
// that are handed to the lanes. All outputs are registered, so they lag the counters and the
// configuration by one clock cycle.
always_ff @(posedge clk) begin : proc_pad_control
  // Forward the configured window size (overridden with the maximum size while in reset, below)
  o_control_to_lane.data[0][0].window_height <= i_config_to_control.data[0][0].window_height;
  o_control_to_lane.data[0][0].window_width <= i_config_to_control.data[0][0].window_width;

  // Vertical padding control
  for (int i = 0; i < TILE_COUNT; i++) begin : proc_pad_control_vert
    for (int j = 0; j < VERTICAL_LINES; j++) begin
      // For the height of the active window, determine if, when and which padding mode is enabled
      // per-tile and per-line
      // ((MAX_WINDOW_HEIGHT-1) * (MAX_DILATION_VERTICAL-1))
      if (j < eff_kernel_vert) begin
        // Line j is padded when the input line counter lies outside
        // [j + tile_vertical_start, j + tile_vertical_end] of tile i. Zero padding additionally
        // requires padding_mode == 2'b00.
        o_control_to_lane.data[0][0].en_pad_zero_vert[i][j] <=
          ( count_in_height < j + i_config_to_control.data[0][0].tile_vertical_start[i]
         || count_in_height > j + i_config_to_control.data[0][0].tile_vertical_end [i])
         && i_config_to_control.data[0][0].padding_mode == 2'b00;
        //
        // TODO: Implement constant and reflection boundary conditions
        //
        // Same out-of-range test, but gated by padding_ignore instead of the padding mode
        o_control_to_lane.data[0][0].en_pad_nan_vert[i][j] <=
          ( count_in_height < j + i_config_to_control.data[0][0].tile_vertical_start[i]
         || count_in_height > j + i_config_to_control.data[0][0].tile_vertical_end [i])
         && i_config_to_control.data[0][0].padding_ignore;
      end else begin
        // For the lines outside the active window, pad everything to NaN, which is defined to be
        // the identity element
        // NOTE(review): en_pad_zero_vert[i][j] is not assigned on this path and has no reset, so
        // it keeps whatever value it had when line j was last inside the active window - confirm
        // the lane gives en_pad_nan priority over en_pad_zero.
        o_control_to_lane.data[0][0].en_pad_nan_vert[i][j] <= 1'b1;
      end
    end
  end : proc_pad_control_vert

  // Horizontal padding control
  // NOTE(review): unlike the vertical loop, this one iterates over MAX_WINDOW_WIDTH and compares
  // against the un-dilated window_width (not eff_kernel_horiz) - confirm horizontal dilation is
  // meant to be handled differently.
  for (int i = 0; i < TILE_COUNT; i++) begin : proc_pad_control_horiz
    for (int j = 0; j < MAX_WINDOW_WIDTH; j++) begin
      // For the width of the active window, determine if, when and which padding mode is enabled
      // per-tile and per-line
      if (j < i_config_to_control.data[0][0].window_width) begin
        // Column j is padded when the input column counter lies outside
        // [j + tile_horizontal_start, j + tile_horizontal_end] of tile i.
        o_control_to_lane.data[0][0].en_pad_zero_horiz[i][j] <=
          ( count_in_width < j + i_config_to_control.data[0][0].tile_horizontal_start[i]
         || count_in_width > j + i_config_to_control.data[0][0].tile_horizontal_end [i])
         && i_config_to_control.data[0][0].padding_mode == 2'b00;
        //
        // TODO: Implement constant and reflection boundary conditions
        //
        o_control_to_lane.data[0][0].en_pad_nan_horiz[i][j] <=
          ( count_in_width < j + i_config_to_control.data[0][0].tile_horizontal_start[i]
         || count_in_width > j + i_config_to_control.data[0][0].tile_horizontal_end [i])
         && i_config_to_control.data[0][0].padding_ignore;
      end else begin
        // For the columns outside the active window pad everything to NaN, which is defined to be the
        // identity element
        // NOTE(review): en_pad_zero_horiz[i][j] holds its previous value here (see vertical loop).
        o_control_to_lane.data[0][0].en_pad_nan_horiz[i][j] <= 1'b1;
      end
    end
  end : proc_pad_control_horiz

  // These flags mark the area of padding: set while fewer than (effective kernel size - 1)
  // lines / columns have been counted, and only for windows larger than 1
  o_control_to_lane.data[0][0].is_padding_zone_vert <= i_config_to_control.data[0][0].window_height > 1 && count_in_height < (eff_kernel_vert-1);
  o_control_to_lane.data[0][0].is_padding_zone_horiz <= i_config_to_control.data[0][0].window_width > 1 && count_in_width < (eff_kernel_horiz-1);

  // Reset override (last assignment wins). The en_pad_* enables are intentionally or
  // unintentionally left without a reset value.
  if (~i_resetn) begin
    o_control_to_lane.data[0][0].is_padding_zone_vert <= 1'b1;
    o_control_to_lane.data[0][0].is_padding_zone_horiz <= 1'b1;
    o_control_to_lane.data[0][0].window_height <= MAX_WINDOW_HEIGHT;
    o_control_to_lane.data[0][0].window_width <= MAX_WINDOW_WIDTH;
  end
end : proc_pad_control

//
// Stride counters and stride valid signal generator.
//
// Other input counters are used in conjunction
//
// The statements below rely on non-blocking "last assignment wins" ordering: within one clock
// edge a later assignment to the same counter overrides an earlier one. Do not reorder them.
//
always_ff @(posedge clk) begin : proc_stride
  // stride counters (local to this process). The all-ones value ('1) is used as the
  // "in reset / not configured" marker, see the end of the process.
  logic [$clog2(MAX_STRIDE_VERTICAL +1):0] count_stride_vert ;
  logic [$clog2(MAX_STRIDE_HORIZONTAL+1):0] count_stride_horiz;

  // Count only when input is valid and shallow channels counter is about to overflow (which means
  // we are moving on to the next face coordinates)
  if (i_lane_to_control.core_input_valid) begin
    if (count_in_vector >= VECTOR_RATIO-1) begin
      // By default, increment the horizontal stride counter, as long as the width-counter has
      // counted minimum window-width number of elements (so core has a full window to operate on).
      if (count_in_width >= kernel_horiz_minus_one) begin
        count_stride_horiz <= count_stride_horiz + 1'b1;
      end

      // Reset horizontal stride counter when it overflows (overrides the increment above)
      if (count_stride_horiz >= i_config_to_control.data[0][0].stride_horizontal - 1) begin
        count_stride_horiz <= '0;
      end

      // Vertical stride counter is manipulated only when the input width-counter is about to
      // overflow
      if (count_in_width >= i_config_to_control.data[0][0].tile_width-1) begin
        // Reset the horizontal counter
        count_stride_horiz <= '0;

        // By default, increment the vertical stride counter, as long as the height-counter has
        // counted minimum window-height number of elements (so core has a full window to operate
        // on).
        if (count_in_height >= kernel_vert_minus_one) begin
          count_stride_vert <= count_stride_vert + 1'b1;
        end

        // Reset vertical stride counter when it overflows or when input height-counter is about
        // to overflow
        if ( count_stride_vert >= i_config_to_control.data[0][0].stride_vertical - 1
          || count_in_height >= i_config_to_control.data[0][0].tile_height - 1 ) begin
          count_stride_vert <= '0;
        end
      end
    end
  end

  // Stride counters must be reset when window height is configured to 1.
  // This only fires while both counters still hold the all-ones marker loaded below.
  if ( i_config_to_control.data[0][0].window_height == 1
    && i_config_to_control.data[0][0].configured
    && count_stride_horiz == '1
    && count_stride_vert == '1 ) begin
    count_stride_vert <= '0;
    count_stride_horiz <= '0;
  end

  // During reset both counters are set to give one extra cycle to the counters
  if (~i_resetn || ~i_config_to_control.data[0][0].configured) begin
    count_stride_horiz <= '1;
    count_stride_vert <= '1;
  end

  //
  // Stride is valid when both horizontal and vertical counters are zero
  // (registered, so it reflects the counter values of the previous cycle)
  //
  o_control_to_lane.data[0][0].stride_valid <= count_stride_vert == '0 && count_stride_horiz == '0;
end : proc_stride

//
// Output valid counter comprises cascaded counters of vector, width, height and channels.
//
// A 'done' pulse is sent to the config decoder when the last tensor element is processed.
//
always_ff @(posedge clk) begin : proc_output_counters
  // Both 'done' strobes are one clock cycle long: low unless raised further down.
  o_control_to_lane.data[0][0].done <= 1'b0;
  o_control_to_config.done          <= 1'b0;

  // The cascade only advances when the core presents a valid result. Each level either wraps
  // to zero and advances the next level, or steps by its own increment.
  if (i_lane_to_control.core_output_valid) begin
    if (count_out_vector >= VECTOR_RATIO-1) begin
      // shallow channels counter wraps -> advance the column counter
      count_out_vector <= '0;

      if (count_out_width >= (i_config_to_control.data[0][0].tile_width - i_config_to_control.data[0][0].window_width) - kernel_x_dilation_horiz) begin
        // last output column of the line -> advance the line counter
        count_out_width <= '0;

        if (count_out_height >= i_config_to_control.data[0][0].tile_height - i_config_to_control.data[0][0].window_height - kernel_x_dilation_vert) begin
          // last output line of the plane -> advance the channels counter and tell the lanes
          // that a new set of filters is due
          count_out_height <= '0;
          o_control_to_lane.data[0][0].done <= 1'b1;

          if (count_out_channels >= i_config_to_control.data[0][0].tile_channels - NATIVE_VECTOR_SIZE) begin
            // last channel group -> whole tensor finished, notify the config decoder
            count_out_channels <= '0;
            o_control_to_config.done <= 1'b1;
          end else begin
            count_out_channels <= $bits(count_out_channels)'(count_out_channels + NATIVE_VECTOR_SIZE);
          end
        end else begin
          // lines advance in steps of the vertical stride
          count_out_height <= $bits(count_out_height)'(count_out_height + i_config_to_control.data[0][0].stride_vertical);
        end
      end else begin
        // columns advance in steps of the horizontal stride
        count_out_width <= $bits(count_out_width)'(count_out_width + i_config_to_control.data[0][0].stride_horizontal);
      end
    end else begin
      count_out_vector <= count_out_vector + 1'b1;
    end
  end

  // Highest priority: hold all counters at zero while in reset or not configured.
  // The 'done' strobes are deliberately not touched here.
  if (~i_resetn || ~i_config_to_control.data[0][0].configured) begin
    count_out_vector   <= '0;
    count_out_width    <= '0;
    count_out_height   <= '0;
    count_out_channels <= '0;
  end
end : proc_output_counters

//
// ------------------------------ END EDITING ------------------------------
//

endmodule