python/openvino/demo/ip/intel_ai_ip/verilog/dla_acl_ecc_decoder.sv


1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257

// Copyright 2020 Intel Corporation.
//
// This software and the related documents are Intel copyrighted materials,
// and your use of them is governed by the express license under which they
// were provided to you ("License"). Unless the License provides otherwise,
// you may not use, modify, copy, publish, distribute, disclose or transmit
// this software or the related documents without Intel's prior written
// permission.
//
// This software and the related documents are provided as is, with no express
// or implied warranties, other than those that are expressly stated in the
// License.

//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
//
//  ACL ECC DECODER
//
//  This module decodes data using a single error correct, double error detect Hamming code. As the data width gets large,
//  so does the xor network, and that limits fmax. To resolve this, we slice the data into smaller groups and decode
//  each independently. Essentially we trade off more memory overhead for parity bits in order to limit the fmax
//  degradation due to ECC.
//
//  The user must specify the data width and the slicing size. From this, one can compute the number of parity bits and
//  total encoded bits (see the calculations in localparams below).
//
//  Error reporting: for each decoder (after slicing), there are 2 status signals: single error corrected, and double
//  error detected. Each of these signal types is OR-ed together across all of the decoders (from slicing) before being
//  reported to the outside world. Beware that if there are two bit errors but they are in separate slicing groups, two
//  independent decoders can correct one bit each, so this will be reported as single error corrected.
//
//  Reset: there is no reset. Pipeline stages are purely feed-forward, the intent is that reset will propagate through.
//
//  This module is actually a wrapper around the actual ECC implementation in secded_decoder. Here is the architecture.
//  For example, suppose DATA_WIDTH is 70 and ECC_GROUP_SIZE is 32, then the decoded data is sliced into 32 + 32 + 6 bits,
//  and 3 decoders are used to consume 39 + 39 + 11 encoded bits.
//
//                                i_encoded[88:0]
//                                      |
//  +------------------------------------------------------------------------+
//  |                     optional input pipeline stages                     |
//  +------------------------------------------------------------------------+
//          |                           |                           |
//    encoded[88:78]              encoded[77:39]              encoded[38:0]
//          |                           |                           |
//  +----------------+          +----------------+          +----------------+
//  | secded_decoder |          | secded_decoder |          | secded_decoder |
//  +----------------+          +----------------+          +----------------+
//          |                           |                           |
//      data[69:64]                 data[63:32]                 data[31:0]
//          |                           |                           |
//  +------------------------------------------------------------------------+
//  |                     optional output pipeline stages                    |
//  +------------------------------------------------------------------------+
//                                      |
//                                o_data[69:0]
//
//  Everything decoder related is contained within this file. The related file that does the corresponding encoding is
//  dla_acl_ecc_encoder.sv. Note both encoder and decoder require dla_acl_ecc_pkg.sv.
//
//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////

`default_nettype none

//BEWARE: do not leave the "clock_enable" input port disconnected if any pipeline stages are used, it will default to 0 and nothing will go through

//Sliced SECDED decoder wrapper: optionally registers i_encoded, splits it into ECC_NUM_GROUPS slices, decodes every slice with its
//own secded_decoder instance, then optionally registers the decoded data and the per-group error flags before driving the outputs.
//o_data passes through INPUT_PIPELINE_STAGES + OUTPUT_PIPELINE_STAGES register stages, while the two status outputs pass through
//INPUT_PIPELINE_STAGES + STATUS_PIPELINE_STAGES register stages, so data and status only line up when the last two parameters match.
module dla_acl_ecc_decoder
import dla_acl_ecc_pkg::*;
#(
    parameter int DATA_WIDTH,                   //number of bits in the decoded output data (no default, must be set by the instantiator)
    parameter int ECC_GROUP_SIZE,               //how many bits of unencoded data to group into one ecc block, see description in header comments (no default, must be set by the instantiator)
    parameter int INPUT_PIPELINE_STAGES = 0,    //number of pipeline stages between i_encoded and the ecc decoder
    parameter int OUTPUT_PIPELINE_STAGES = 0,   //number of pipeline stages between the ecc decoder and o_data
    parameter int STATUS_PIPELINE_STAGES = 0    //number of pipeline stages between the ecc decoder and o_single_error_corrected/o_double_error_detected
)
(
    input  wire                                                          clock,                     //clock is only needed if pipeline stages are nonzero
    input  wire                                                          clock_enable,              //set to 1 to sample i_encoded, intended for integration with altera_syncram, only needed if pipeline stages are nonzero
    input  wire  [getEncodedBitsEccGroup(DATA_WIDTH,ECC_GROUP_SIZE)-1:0] i_encoded,                 //encoded input data
    output logic [DATA_WIDTH-1:0]                                        o_data,                    //decoded output data
    output logic                                                         o_single_error_corrected,  //at least one ecc decoder corrected a single bit error within their ecc group
    output logic                                                         o_double_error_detected    //at least one ecc decoder detected a double bit error within their ecc group
);

    //helper functions for determining number of bits are expected to come from dla_acl_ecc_pkg, which is imported in the module header
    //NOTE(review): this comment previously named dla_acl_ecc.svh, but the file header and the import both point at dla_acl_ecc_pkg -- confirm
    localparam int ECC_NUM_GROUPS  = getNumGroups(DATA_WIDTH,ECC_GROUP_SIZE);           //how many groups to slice the data into
    localparam int LAST_GROUP_SIZE = getLastGroupSize(DATA_WIDTH,ECC_GROUP_SIZE);       //all groups have size ECC_GROUP_SIZE except possibly the last group which may be smaller since it gets the remaining bits
    localparam int ENCODED_BITS    = getEncodedBitsEccGroup(DATA_WIDTH,ECC_GROUP_SIZE); //total width of i_encoded, summed over all groups

    //internal signals
    genvar g;                                               //shared loop index for every generate-for in this module
    logic [ENCODED_BITS-1:0] encoded;                       //i_encoded after the optional input pipeline
    logic [DATA_WIDTH-1:0] data;                            //decoded data straight out of the decoders, before the optional output pipeline
    logic [2*ECC_NUM_GROUPS-1:0] error_status;              //raw decoder flags: bit g is "single error corrected" for group g, bit g+ECC_NUM_GROUPS is "double error detected" for group g
    logic [ECC_NUM_GROUPS-1:0] single_error_corrected;      //per-group single error flags after the optional status pipeline
    logic [ECC_NUM_GROUPS-1:0] double_error_detected;       //per-group double error flags after the optional status pipeline

    //input pipeline stages
    //with zero stages this is a plain wire, otherwise i_encoded is shifted through INPUT_PIPELINE_STAGES registers
    generate
    if (INPUT_PIPELINE_STAGES == 0) begin
        assign encoded = i_encoded;
    end
    else begin
        logic [ENCODED_BITS-1:0] encoded_pipe [INPUT_PIPELINE_STAGES-1:0];
        always_ff @(posedge clock) begin    //only the first pipeline stage needs a clock enable, the remaining pipeline stages will load the same data when the clock enable propagates there
            if (clock_enable) encoded_pipe[0] <= i_encoded;
        end
        for (g=1; g<INPUT_PIPELINE_STAGES; g++) begin : gen_input_pipe
            always_ff @(posedge clock) begin
                encoded_pipe[g] <= encoded_pipe[g-1];
            end
        end
        assign encoded = encoded_pipe[INPUT_PIPELINE_STAGES-1];     //the last stage feeds the decoders
    end
    endgenerate

    //slice the data for each decoder
    //one secded_decoder is instantiated per group, each working on its own slice of encoded and driving its own slice of data
    generate
    for (g=0; g<ECC_NUM_GROUPS; g++) begin : gen_decoder
        localparam int RAW_BASE = ECC_GROUP_SIZE*g;                     //lowest bit of this group within data
        localparam int ENC_BASE = getEncodedBits(ECC_GROUP_SIZE)*g;     //lowest bit of this group within encoded, valid because every group below index g is a full size group
        localparam int RAW_WIDTH = (g==ECC_NUM_GROUPS-1) ? LAST_GROUP_SIZE : ECC_GROUP_SIZE;   //only the last group may be narrower
        localparam int ENC_WIDTH = getEncodedBits(RAW_WIDTH);           //encoded width of this particular group

        //NOTE(review): the instance below is a decoder even though it is named secded_encoder_inst, the name is left as is
        //since renaming it would change the hierarchical path of everything underneath it
        secded_decoder #(
            .DATA_WIDTH               (RAW_WIDTH)
        )
        secded_encoder_inst
        (
            .i_encoded                (encoded[ENC_BASE +: ENC_WIDTH]),
            .o_data                   (data[RAW_BASE +: RAW_WIDTH]),
            .o_single_error_corrected (error_status[g]),
            .o_double_error_detected  (error_status[g+ECC_NUM_GROUPS])
        );
    end
    endgenerate

    //output pipeline stages
    //same structure as the input pipeline, but the first stage is only gated by clock_enable when no input pipeline stage exists
    generate
    if (OUTPUT_PIPELINE_STAGES == 0) begin
        assign o_data = data;
    end
    else begin
        logic [DATA_WIDTH-1:0] data_pipe [OUTPUT_PIPELINE_STAGES-1:0];
        if (INPUT_PIPELINE_STAGES == 0) begin    //this is the first pipeline stage
            always_ff @(posedge clock) begin
                if (clock_enable) data_pipe[0] <= data;
            end
        end
        else begin  //there was a previous pipeline in the input stage which would have captured the clock enable
            always_ff @(posedge clock) begin
                data_pipe[0] <= data;
            end
        end
        for (g=1; g<OUTPUT_PIPELINE_STAGES; g++) begin : gen_output_pipe
            always_ff @(posedge clock) begin
                data_pipe[g] <= data_pipe[g-1];
            end
        end
        assign o_data = data_pipe[OUTPUT_PIPELINE_STAGES-1];
    end
    endgenerate

    //error status pipeline stages
    //the per-group flags are pipelined as one packed vector and only split and OR-reduced after the last stage
    //this pipeline is independent of the output data pipeline, its depth is set by STATUS_PIPELINE_STAGES
    generate
    if (STATUS_PIPELINE_STAGES == 0) begin
        assign {double_error_detected, single_error_corrected} = error_status;
    end
    else begin
        logic [2*ECC_NUM_GROUPS-1:0] error_status_pipe [STATUS_PIPELINE_STAGES-1:0];
        if (INPUT_PIPELINE_STAGES == 0) begin    //this is the first pipeline stage
            always_ff @(posedge clock) begin
                if (clock_enable) error_status_pipe[0] <= error_status;
            end
        end
        else begin  //there was a previous pipeline in the input stage which would have captured the clock enable
            always_ff @(posedge clock) begin
                error_status_pipe[0] <= error_status;
            end
        end
        for (g=1; g<STATUS_PIPELINE_STAGES; g++) begin : gen_status_pipe
            always_ff @(posedge clock) begin
                error_status_pipe[g] <= error_status_pipe[g-1];
            end
        end
        assign {double_error_detected, single_error_corrected} = error_status_pipe[STATUS_PIPELINE_STAGES-1];
    end
    endgenerate
    //collapse the per-group flags into one bit each, a flag is raised if any group raised it
    assign o_single_error_corrected = |single_error_corrected;
    assign o_double_error_detected = |double_error_detected;

endmodule
//end dla_acl_ecc_decoder


// Hamming code decoder, single error correct, double error detect
//
// This implementation follows the bit mapping as shown on Wikipedia, parity bits are added at power of 2 locations, data bits go in between
// For example, with DATA_WIDTH = 11, we have 4 Hamming parity bits and one overall parity bit, so the bit locations will look like this (d means data, p means parity):
// [0] = p0, [1] = p1, [2] = p2, [3] = d0, [4] = p3, [5] = d1, [6] = d2, [7] = d3, [8] = p4, [9] = d4, [10] = d5, [11] = d6, [12] = d7, [13] = d8, [14] = d9, [15] = d10

module secded_decoder
import dla_acl_ecc_pkg::*;
#(
    parameter int DATA_WIDTH
) (
    input  wire  [getEncodedBits(DATA_WIDTH)-1:0] i_encoded,                //code word to decode: check bits at index 0 and at every power of 2 index, data bits everywhere else
    output logic [DATA_WIDTH-1:0]                 o_data,                   //data bits pulled out of the code word, with a single flipped bit repaired
    output logic                                  o_single_error_corrected, //high when the overall parity of the code word is odd
    output logic                                  o_double_error_detected   //high when the overall parity is even but the syndrome is nonzero, data is passed through uncorrected
);

    //widths are derived with the helper functions made visible by the dla_acl_ecc_pkg import
    localparam int NUM_CHECK_BITS = getParityBits(DATA_WIDTH);      //Hamming check bits plus the one overall parity bit
    localparam int CODEWORD_BITS  = getEncodedBits(DATA_WIDTH);     //total width of i_encoded

    //overall parity covers every bit of the code word
    //syndrome bit k is the xor of every code word bit whose index has bit k set, so a single flipped bit makes the syndrome equal to its index
    logic                      overall_parity;
    logic [NUM_CHECK_BITS-2:0] error_position;
    always_comb begin
        overall_parity = ^i_encoded;
        error_position = '0;
        for (int pos=0; pos<CODEWORD_BITS; pos++) begin
            for (int chk=0; chk<NUM_CHECK_BITS-1; chk++) begin
                if ((pos >> chk) & 1) begin     //bit chk of pos is 1, so this code word bit takes part in check chk
                    error_position[chk] = error_position[chk] ^ i_encoded[pos];
                end
            end
        end
    end

    //status flags
    //odd overall parity means an odd number of flipped bits, 1 flipped bit is corrected, 3 flipped bits would be miscorrected
    //even overall parity with a nonzero syndrome means 2 flipped bits, even overall parity with a zero syndrome means no error
    assign o_single_error_corrected = overall_parity;
    assign o_double_error_detected  = !overall_parity && (|error_position);

    //walk the code word and copy out the data bits in order, flipping the one bit the syndrome points at (if any)
    //index 0 and the power of 2 indexes hold check bits and are skipped
    //for example, with DATA_WIDTH = 11 the layout is
    //[0] = p0, [1] = p1, [2] = p2, [3] = d0, [4] = p3, [5] = d1, [6] = d2, [7] = d3, [8] = p4, [9] = d4, [10] = d5, [11] = d6, [12] = d7, [13] = d8, [14] = d9, [15] = d10
    always_comb begin
        int dst;
        dst = 0;
        for (int pos=0; pos<CODEWORD_BITS; pos++) begin
            if ((pos & (pos-1)) != 0) begin     //more than one bit of pos is set, so pos is neither 0 nor a power of 2
                o_data[dst] = i_encoded[pos] ^ (error_position == pos);
                dst++;
            end
        end
    end

endmodule
//end secded_decoder

`default_nettype wire