/*
 * Copyright 2021 Advanced Micro Devices, Inc.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
 * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR
 * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
 * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
 * OTHER DEALINGS IN THE SOFTWARE.
 *
 */
#include "umc_v6_7.h"
#include "amdgpu_ras.h"
#include "amdgpu_umc.h"
#include "amdgpu.h"

#include "umc/umc_6_7_0_offset.h"
#include "umc/umc_6_7_0_sh_mask.h"

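/* lookup tables from (UMC instance, channel instance) to the physical memory
 * channel index; the UMC setup code points adev->umc.channel_idx_tbl at
 * whichever of the two mappings matches the board configuration */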
const uint32_t
        umc_v6_7_channel_idx_tbl_second[UMC_V6_7_UMC_INSTANCE_NUM][UMC_V6_7_CHANNEL_INSTANCE_NUM] = {
                {28, 20, 24, 16, 12, 4, 8, 0},
                {6, 30, 2, 26, 22, 14, 18, 10},
                {19, 11, 15, 7, 3, 27, 31, 23},
                {9, 1, 5, 29, 25, 17, 21, 13}
};
const uint32_t
        umc_v6_7_channel_idx_tbl_first[UMC_V6_7_UMC_INSTANCE_NUM][UMC_V6_7_CHANNEL_INSTANCE_NUM] = {
                {19, 11, 15, 7, 3, 27, 31, 23},
                {9, 1, 5, 29, 25, 17, 21, 13},
                {28, 20, 24, 16, 12, 4, 8, 0},
                {6, 30, 2, 26, 22, 14, 18, 10},
};

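/* compute the register offset (in dwords) of a UMC channel instance;
 * callers add it to a SOC15 register offset and multiply by 4 to form
 * the byte address used by the PCIE indirect register accessors */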
static inline uint32_t get_umc_v6_7_reg_offset(struct amdgpu_device *adev,
                                              uint32_t umc_inst,
                                              uint32_t ch_inst)
{
        uint32_t index = umc_inst * adev->umc.channel_inst_num + ch_inst;

        /* adjust umc and channel index offset,
         * the register address is not linear on each umc instance */
        umc_inst = index / 4;
        ch_inst = index % 4;

        return adev->umc.channel_offs * ch_inst + UMC_V6_7_INST_DIST * umc_inst;
}

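/* look up the physical memory channel index for a UMC/channel instance pair */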
static inline uint32_t get_umc_v6_7_channel_index(struct amdgpu_device *adev,
                                              uint32_t umc_inst,
                                              uint32_t ch_inst)
{
        return adev->umc.channel_idx_tbl[umc_inst * adev->umc.channel_inst_num + ch_inst];
}

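/* count correctable errors using the MCA status cached in the RAS ECC info
 * table (ras->umc_ecc) instead of reading UMC registers directly */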
static void umc_v6_7_ecc_info_query_correctable_error_count(struct amdgpu_device *adev,
                                                   uint32_t umc_inst, uint32_t ch_inst,
                                                   unsigned long *error_count)
{
        uint64_t mc_umc_status;
        uint32_t eccinfo_table_idx;
        struct amdgpu_ras *ras = amdgpu_ras_get_context(adev);

        eccinfo_table_idx = umc_inst * adev->umc.channel_inst_num + ch_inst;
        /* check for SRAM correctable error,
         * MCUMC_STATUS is a 64 bit register */
        mc_umc_status = ras->umc_ecc.ecc[eccinfo_table_idx].mca_umc_status;
        if (REG_GET_FIELD(mc_umc_status, MCA_UMC_UMC0_MCUMC_STATUST0, Val) == 1 &&
            REG_GET_FIELD(mc_umc_status, MCA_UMC_UMC0_MCUMC_STATUST0, CECC) == 1)
                *error_count += 1;
}

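/* count uncorrectable/deferred errors from the cached MCA status and log the
 * matching IPID, SYND and MISC0 registers when an error is present */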
static void umc_v6_7_ecc_info_querry_uncorrectable_error_count(struct amdgpu_device *adev,
                                                          uint32_t umc_inst, uint32_t ch_inst,
                                                          unsigned long *error_count)
{
        uint64_t mc_umc_status;
        uint32_t eccinfo_table_idx;
        uint32_t umc_reg_offset;
        uint32_t mc_umc_addr;
        uint64_t reg_value;
        struct amdgpu_ras *ras = amdgpu_ras_get_context(adev);

        umc_reg_offset = get_umc_v6_7_reg_offset(adev,
                                                umc_inst, ch_inst);

        eccinfo_table_idx = umc_inst * adev->umc.channel_inst_num + ch_inst;
        /* check the MCUMC_STATUS */
        mc_umc_status = ras->umc_ecc.ecc[eccinfo_table_idx].mca_umc_status;
        if ((REG_GET_FIELD(mc_umc_status, MCA_UMC_UMC0_MCUMC_STATUST0, Val) == 1) &&
            (REG_GET_FIELD(mc_umc_status, MCA_UMC_UMC0_MCUMC_STATUST0, Deferred) == 1 ||
            REG_GET_FIELD(mc_umc_status, MCA_UMC_UMC0_MCUMC_STATUST0, UECC) == 1 ||
            REG_GET_FIELD(mc_umc_status, MCA_UMC_UMC0_MCUMC_STATUST0, PCC) == 1 ||
            REG_GET_FIELD(mc_umc_status, MCA_UMC_UMC0_MCUMC_STATUST0, UC) == 1 ||
            REG_GET_FIELD(mc_umc_status, MCA_UMC_UMC0_MCUMC_STATUST0, TCC) == 1)) {
                *error_count += 1;

                if (REG_GET_FIELD(mc_umc_status, MCA_UMC_UMC0_MCUMC_STATUST0, Deferred) == 1)
                        dev_info(adev->dev, "Deferred error, no user action is needed.\n");

                if (mc_umc_status)
                        dev_info(adev->dev, "MCA STATUS 0x%llx, umc_reg_offset 0x%x\n", mc_umc_status, umc_reg_offset);

                /* print IPID registers value */
                mc_umc_addr =
                        SOC15_REG_OFFSET(UMC, 0, regMCA_UMC_UMC0_MCUMC_IPIDT0);
                reg_value = RREG64_PCIE((mc_umc_addr + umc_reg_offset) * 4);
                if (reg_value)
                        dev_info(adev->dev, "MCA IPID 0x%llx, umc_reg_offset 0x%x\n", reg_value, umc_reg_offset);

                /* print SYND registers value */
                mc_umc_addr =
                        SOC15_REG_OFFSET(UMC, 0, regMCA_UMC_UMC0_MCUMC_SYNDT0);
                reg_value = RREG64_PCIE((mc_umc_addr + umc_reg_offset) * 4);
                if (reg_value)
                        dev_info(adev->dev, "MCA SYND 0x%llx, umc_reg_offset 0x%x\n", reg_value, umc_reg_offset);

                /* print MISC0 registers value */
                mc_umc_addr =
                        SOC15_REG_OFFSET(UMC, 0, regMCA_UMC_UMC0_MCUMC_MISC0T0);
                reg_value = RREG64_PCIE((mc_umc_addr + umc_reg_offset) * 4);
                if (reg_value)
                        dev_info(adev->dev, "MCA MISC0 0x%llx, umc_reg_offset 0x%x\n", reg_value, umc_reg_offset);
        }
}

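/* accumulate CE/UE counts for every UMC instance and channel from the cached
 * ECC info table */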
static void umc_v6_7_ecc_info_query_ras_error_count(struct amdgpu_device *adev,
                                           void *ras_error_status)
{
        struct ras_err_data *err_data = (struct ras_err_data *)ras_error_status;

        uint32_t umc_inst        = 0;
        uint32_t ch_inst         = 0;

        /*TODO: driver needs to toggle DF Cstate to ensure
         * safe access of UMC registers. Will add the protection */
        LOOP_UMC_INST_AND_CH(umc_inst, ch_inst) {
                umc_v6_7_ecc_info_query_correctable_error_count(adev,
                                                      umc_inst, ch_inst,
                                                      &(err_data->ce_count));
                umc_v6_7_ecc_info_querry_uncorrectable_error_count(adev,
                                                      umc_inst, ch_inst,
                                                      &(err_data->ue_count));
        }
}

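/* translate the cached MCA error address into SoC physical addresses and
 * record each page that needs to be retired for an uncorrectable error */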
static void umc_v6_7_ecc_info_query_error_address(struct amdgpu_device *adev,
                                         struct ras_err_data *err_data,
                                         uint32_t ch_inst,
                                         uint32_t umc_inst)
{
        uint64_t mc_umc_status, err_addr, soc_pa, retired_page, column;
        uint32_t channel_index;
        uint32_t eccinfo_table_idx;
        struct amdgpu_ras *ras = amdgpu_ras_get_context(adev);

        eccinfo_table_idx = umc_inst * adev->umc.channel_inst_num + ch_inst;
        channel_index =
                adev->umc.channel_idx_tbl[umc_inst * adev->umc.channel_inst_num + ch_inst];

        mc_umc_status = ras->umc_ecc.ecc[eccinfo_table_idx].mca_umc_status;

        if (mc_umc_status == 0)
                return;

        if (!err_data->err_addr)
                return;

        /* calculate error address if ue/ce error is detected */
        if (REG_GET_FIELD(mc_umc_status, MCA_UMC_UMC0_MCUMC_STATUST0, Val) == 1 &&
            (REG_GET_FIELD(mc_umc_status, MCA_UMC_UMC0_MCUMC_STATUST0, UECC) == 1 ||
            REG_GET_FIELD(mc_umc_status, MCA_UMC_UMC0_MCUMC_STATUST0, CECC) == 1)) {

                err_addr = ras->umc_ecc.ecc[eccinfo_table_idx].mca_umc_addr;
                err_addr = REG_GET_FIELD(err_addr, MCA_UMC_UMC0_MCUMC_ADDRT0, ErrorAddr);

                /* translate umc channel address to soc pa, 3 parts are included */
                soc_pa = ADDR_OF_8KB_BLOCK(err_addr) |
                                ADDR_OF_256B_BLOCK(channel_index) |
                                OFFSET_IN_256B_BLOCK(err_addr);

                /* The umc channel bits are not original values, they are hashed */
                SET_CHANNEL_HASH(channel_index, soc_pa);

                /* clear [C4 C3 C2] in soc physical address */
                soc_pa &= ~(0x7ULL << UMC_V6_7_PA_C2_BIT);

                /* we only save ue error information currently, ce is skipped */
                if (REG_GET_FIELD(mc_umc_status, MCA_UMC_UMC0_MCUMC_STATUST0, UECC)
                                == 1) {
                        /* loop for all possibilities of [C4 C3 C2] */
                        for (column = 0; column < UMC_V6_7_NA_MAP_PA_NUM; column++) {
                                retired_page = soc_pa | (column << UMC_V6_7_PA_C2_BIT);
                                dev_info(adev->dev, "Error Address(PA): 0x%llx\n", retired_page);
                                amdgpu_umc_fill_error_record(err_data, err_addr,
                                        retired_page, channel_index, umc_inst);

                                /* shift R14 bit */
                                retired_page ^= (0x1ULL << UMC_V6_7_PA_R14_BIT);
                                dev_info(adev->dev, "Error Address(PA): 0x%llx\n", retired_page);
                                amdgpu_umc_fill_error_record(err_data, err_addr,
                                        retired_page, channel_index, umc_inst);
                        }
                }
        }
}

static void umc_v6_7_ecc_info_query_ras_error_address(struct amdgpu_device *adev,
                                             void *ras_error_status)
{
        struct ras_err_data *err_data = (struct ras_err_data *)ras_error_status;

        uint32_t umc_inst        = 0;
        uint32_t ch_inst         = 0;

        /*TODO: driver needs to toggle DF Cstate to ensure
         * safe access of UMC registers. Will add the protection
         * when firmware interface is ready */
        LOOP_UMC_INST_AND_CH(umc_inst, ch_inst) {
                umc_v6_7_ecc_info_query_error_address(adev,
                                             err_data,
                                             ch_inst,
                                             umc_inst);
        }
}

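/* read the DRAM ECC error counters of both chip selects plus the MCA status
 * register and add any new correctable errors to *error_count */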
static void umc_v6_7_query_correctable_error_count(struct amdgpu_device *adev,
                                                   uint32_t umc_reg_offset,
                                                   unsigned long *error_count)
{
        uint32_t ecc_err_cnt_sel, ecc_err_cnt_sel_addr;
        uint32_t ecc_err_cnt, ecc_err_cnt_addr;
        uint64_t mc_umc_status;
        uint32_t mc_umc_status_addr;

        /* UMC 6_7_0 registers */
        ecc_err_cnt_sel_addr =
                SOC15_REG_OFFSET(UMC, 0, regUMCCH0_0_EccErrCntSel);
        ecc_err_cnt_addr =
                SOC15_REG_OFFSET(UMC, 0, regUMCCH0_0_EccErrCnt);
        mc_umc_status_addr =
                SOC15_REG_OFFSET(UMC, 0, regMCA_UMC_UMC0_MCUMC_STATUST0);

        /* select the lower chip and check the error count */
        ecc_err_cnt_sel = RREG32_PCIE((ecc_err_cnt_sel_addr + umc_reg_offset) * 4);
        ecc_err_cnt_sel = REG_SET_FIELD(ecc_err_cnt_sel, UMCCH0_0_EccErrCntSel,
                                        EccErrCntCsSel, 0);
        WREG32_PCIE((ecc_err_cnt_sel_addr + umc_reg_offset) * 4, ecc_err_cnt_sel);

        ecc_err_cnt = RREG32_PCIE((ecc_err_cnt_addr + umc_reg_offset) * 4);
        *error_count +=
                (REG_GET_FIELD(ecc_err_cnt, UMCCH0_0_EccErrCnt, EccErrCnt) -
                 UMC_V6_7_CE_CNT_INIT);

        /* select the higher chip and check the err counter */
        ecc_err_cnt_sel = REG_SET_FIELD(ecc_err_cnt_sel, UMCCH0_0_EccErrCntSel,
                                        EccErrCntCsSel, 1);
        WREG32_PCIE((ecc_err_cnt_sel_addr + umc_reg_offset) * 4, ecc_err_cnt_sel);

        ecc_err_cnt = RREG32_PCIE((ecc_err_cnt_addr + umc_reg_offset) * 4);
        *error_count +=
                (REG_GET_FIELD(ecc_err_cnt, UMCCH0_0_EccErrCnt, EccErrCnt) -
                 UMC_V6_7_CE_CNT_INIT);

        /* check for SRAM correctable error,
         * MCUMC_STATUS is a 64 bit register */
        mc_umc_status = RREG64_PCIE((mc_umc_status_addr + umc_reg_offset) * 4);
        if (REG_GET_FIELD(mc_umc_status, MCA_UMC_UMC0_MCUMC_STATUST0, Val) == 1 &&
            REG_GET_FIELD(mc_umc_status, MCA_UMC_UMC0_MCUMC_STATUST0, CECC) == 1)
                *error_count += 1;
}

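/* count uncorrectable/deferred errors by reading the MCA status register and
 * log the matching IPID, SYND and MISC0 registers when an error is present */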
static void umc_v6_7_querry_uncorrectable_error_count(struct amdgpu_device *adev,
                                                      uint32_t umc_reg_offset,
                                                      unsigned long *error_count)
{
        uint64_t mc_umc_status;
        uint32_t mc_umc_status_addr;
        uint32_t mc_umc_addr;
        uint64_t reg_value;

        mc_umc_status_addr =
                SOC15_REG_OFFSET(UMC, 0, regMCA_UMC_UMC0_MCUMC_STATUST0);

        /* check the MCUMC_STATUS */
        mc_umc_status = RREG64_PCIE((mc_umc_status_addr + umc_reg_offset) * 4);
        if ((REG_GET_FIELD(mc_umc_status, MCA_UMC_UMC0_MCUMC_STATUST0, Val) == 1) &&
            (REG_GET_FIELD(mc_umc_status, MCA_UMC_UMC0_MCUMC_STATUST0, Deferred) == 1 ||
            REG_GET_FIELD(mc_umc_status, MCA_UMC_UMC0_MCUMC_STATUST0, UECC) == 1 ||
            REG_GET_FIELD(mc_umc_status, MCA_UMC_UMC0_MCUMC_STATUST0, PCC) == 1 ||
            REG_GET_FIELD(mc_umc_status, MCA_UMC_UMC0_MCUMC_STATUST0, UC) == 1 ||
            REG_GET_FIELD(mc_umc_status, MCA_UMC_UMC0_MCUMC_STATUST0, TCC) == 1)) {
                *error_count += 1;

                if (REG_GET_FIELD(mc_umc_status, MCA_UMC_UMC0_MCUMC_STATUST0, Deferred) == 1)
                        dev_info(adev->dev, "Deferred error, no user action is needed.\n");

                if (mc_umc_status)
                        dev_info(adev->dev, "MCA STATUS 0x%llx, umc_reg_offset 0x%x\n", mc_umc_status, umc_reg_offset);

                /* print IPID registers value */
                mc_umc_addr =
                        SOC15_REG_OFFSET(UMC, 0, regMCA_UMC_UMC0_MCUMC_IPIDT0);
                reg_value = RREG64_PCIE((mc_umc_addr + umc_reg_offset) * 4);
                if (reg_value)
                        dev_info(adev->dev, "MCA IPID 0x%llx, umc_reg_offset 0x%x\n", reg_value, umc_reg_offset);

                /* print SYND registers value */
                mc_umc_addr =
                        SOC15_REG_OFFSET(UMC, 0, regMCA_UMC_UMC0_MCUMC_SYNDT0);
                reg_value = RREG64_PCIE((mc_umc_addr + umc_reg_offset) * 4);
                if (reg_value)
                        dev_info(adev->dev, "MCA SYND 0x%llx, umc_reg_offset 0x%x\n", reg_value, umc_reg_offset);

                /* print MISC0 registers value */
                mc_umc_addr =
                        SOC15_REG_OFFSET(UMC, 0, regMCA_UMC_UMC0_MCUMC_MISC0T0);
                reg_value = RREG64_PCIE((mc_umc_addr + umc_reg_offset) * 4);
                if (reg_value)
                        dev_info(adev->dev, "MCA MISC0 0x%llx, umc_reg_offset 0x%x\n", reg_value, umc_reg_offset);
        }
}

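/* re-arm the DRAM ECC error counters of both chip selects by writing the
 * initial count value back */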
static void umc_v6_7_reset_error_count_per_channel(struct amdgpu_device *adev,
                                                   uint32_t umc_reg_offset)
{
        uint32_t ecc_err_cnt_addr;
        uint32_t ecc_err_cnt_sel, ecc_err_cnt_sel_addr;

        ecc_err_cnt_sel_addr =
                SOC15_REG_OFFSET(UMC, 0,
                                regUMCCH0_0_EccErrCntSel);
        ecc_err_cnt_addr =
                SOC15_REG_OFFSET(UMC, 0,
                                regUMCCH0_0_EccErrCnt);

        /* select the lower chip */
        ecc_err_cnt_sel = RREG32_PCIE((ecc_err_cnt_sel_addr +
                                       umc_reg_offset) * 4);
        ecc_err_cnt_sel = REG_SET_FIELD(ecc_err_cnt_sel,
                                        UMCCH0_0_EccErrCntSel,
                                        EccErrCntCsSel, 0);
        WREG32_PCIE((ecc_err_cnt_sel_addr + umc_reg_offset) * 4,
                        ecc_err_cnt_sel);

        /* clear lower chip error count */
        WREG32_PCIE((ecc_err_cnt_addr + umc_reg_offset) * 4,
                        UMC_V6_7_CE_CNT_INIT);

        /* select the higher chip */
        ecc_err_cnt_sel = RREG32_PCIE((ecc_err_cnt_sel_addr +
                                        umc_reg_offset) * 4);
        ecc_err_cnt_sel = REG_SET_FIELD(ecc_err_cnt_sel,
                                        UMCCH0_0_EccErrCntSel,
                                        EccErrCntCsSel, 1);
        WREG32_PCIE((ecc_err_cnt_sel_addr + umc_reg_offset) * 4,
                        ecc_err_cnt_sel);

        /* clear higher chip error count */
        WREG32_PCIE((ecc_err_cnt_addr + umc_reg_offset) * 4,
                        UMC_V6_7_CE_CNT_INIT);
}

static void umc_v6_7_reset_error_count(struct amdgpu_device *adev)
{
        uint32_t umc_inst        = 0;
        uint32_t ch_inst         = 0;
        uint32_t umc_reg_offset  = 0;

        LOOP_UMC_INST_AND_CH(umc_inst, ch_inst) {
                umc_reg_offset = get_umc_v6_7_reg_offset(adev,
                                                         umc_inst,
                                                         ch_inst);

                umc_v6_7_reset_error_count_per_channel(adev,
                                                       umc_reg_offset);
        }
}

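/* accumulate CE/UE counts for every UMC instance and channel by reading the
 * hardware registers, then reset the error counters */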
static void umc_v6_7_query_ras_error_count(struct amdgpu_device *adev,
                                           void *ras_error_status)
{
        struct ras_err_data *err_data = (struct ras_err_data *)ras_error_status;

        uint32_t umc_inst        = 0;
        uint32_t ch_inst         = 0;
        uint32_t umc_reg_offset  = 0;

        /*TODO: driver needs to toggle DF Cstate to ensure
         * safe access of UMC registers. Will add the protection */
        LOOP_UMC_INST_AND_CH(umc_inst, ch_inst) {
                umc_reg_offset = get_umc_v6_7_reg_offset(adev,
                                                         umc_inst,
                                                         ch_inst);
                umc_v6_7_query_correctable_error_count(adev,
                                                       umc_reg_offset,
                                                       &(err_data->ce_count));
                umc_v6_7_querry_uncorrectable_error_count(adev,
                                                          umc_reg_offset,
                                                          &(err_data->ue_count));
        }

        umc_v6_7_reset_error_count(adev);
}

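/* read the MCA error address register, translate it into SoC physical
 * addresses, record pages to retire for uncorrectable errors and clear the
 * MCA status afterwards */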
static void umc_v6_7_query_error_address(struct amdgpu_device *adev,
                                         struct ras_err_data *err_data,
                                         uint32_t umc_reg_offset,
                                         uint32_t ch_inst,
                                         uint32_t umc_inst)
{
        uint32_t mc_umc_status_addr;
        uint32_t channel_index;
        uint64_t mc_umc_status, mc_umc_addrt0;
        uint64_t err_addr, soc_pa, retired_page, column;

        mc_umc_status_addr =
                SOC15_REG_OFFSET(UMC, 0, regMCA_UMC_UMC0_MCUMC_STATUST0);
        mc_umc_addrt0 =
                SOC15_REG_OFFSET(UMC, 0, regMCA_UMC_UMC0_MCUMC_ADDRT0);

        mc_umc_status = RREG64_PCIE((mc_umc_status_addr + umc_reg_offset) * 4);

        if (mc_umc_status == 0)
                return;

        if (!err_data->err_addr) {
                /* clear umc status */
                WREG64_PCIE((mc_umc_status_addr + umc_reg_offset) * 4, 0x0ULL);
                return;
        }

        channel_index =
                adev->umc.channel_idx_tbl[umc_inst * adev->umc.channel_inst_num + ch_inst];

        /* calculate error address if ue/ce error is detected */
        if (REG_GET_FIELD(mc_umc_status, MCA_UMC_UMC0_MCUMC_STATUST0, Val) == 1 &&
            (REG_GET_FIELD(mc_umc_status, MCA_UMC_UMC0_MCUMC_STATUST0, UECC) == 1 ||
            REG_GET_FIELD(mc_umc_status, MCA_UMC_UMC0_MCUMC_STATUST0, CECC) == 1)) {

                err_addr = RREG64_PCIE((mc_umc_addrt0 + umc_reg_offset) * 4);
                err_addr = REG_GET_FIELD(err_addr, MCA_UMC_UMC0_MCUMC_ADDRT0, ErrorAddr);

                /* translate umc channel address to soc pa, 3 parts are included */
                soc_pa = ADDR_OF_8KB_BLOCK(err_addr) |
                                ADDR_OF_256B_BLOCK(channel_index) |
                                OFFSET_IN_256B_BLOCK(err_addr);

                /* The umc channel bits are not original values, they are hashed */
                SET_CHANNEL_HASH(channel_index, soc_pa);

                /* clear [C4 C3 C2] in soc physical address */
                soc_pa &= ~(0x7ULL << UMC_V6_7_PA_C2_BIT);

                /* we only save ue error information currently, ce is skipped */
                if (REG_GET_FIELD(mc_umc_status, MCA_UMC_UMC0_MCUMC_STATUST0, UECC)
                                == 1) {
                        /* loop for all possibilities of [C4 C3 C2] */
                        for (column = 0; column < UMC_V6_7_NA_MAP_PA_NUM; column++) {
                                retired_page = soc_pa | (column << UMC_V6_7_PA_C2_BIT);
                                dev_info(adev->dev, "Error Address(PA): 0x%llx\n", retired_page);
                                amdgpu_umc_fill_error_record(err_data, err_addr,
                                        retired_page, channel_index, umc_inst);

                                /* shift R14 bit */
                                retired_page ^= (0x1ULL << UMC_V6_7_PA_R14_BIT);
                                dev_info(adev->dev, "Error Address(PA): 0x%llx\n", retired_page);
                                amdgpu_umc_fill_error_record(err_data, err_addr,
                                        retired_page, channel_index, umc_inst);
                        }
                }
        }

        /* clear umc status */
        WREG64_PCIE((mc_umc_status_addr + umc_reg_offset) * 4, 0x0ULL);
}

static void umc_v6_7_query_ras_error_address(struct amdgpu_device *adev,
                                             void *ras_error_status)
{
        struct ras_err_data *err_data = (struct ras_err_data *)ras_error_status;

        uint32_t umc_inst        = 0;
        uint32_t ch_inst         = 0;
        uint32_t umc_reg_offset  = 0;

        /*TODO: driver needs to toggle DF Cstate to ensure
         * safe access of UMC registers. Will add the protection
         * when firmware interface is ready */
        LOOP_UMC_INST_AND_CH(umc_inst, ch_inst) {
                umc_reg_offset = get_umc_v6_7_reg_offset(adev,
                                                         umc_inst,
                                                         ch_inst);
                umc_v6_7_query_error_address(adev,
                                             err_data,
                                             umc_reg_offset,
                                             ch_inst,
                                             umc_inst);
        }
}

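/* read the UCFatalEn bit from a channel's EccCtrl register */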
static uint32_t umc_v6_7_query_ras_poison_mode_per_channel(
                                                struct amdgpu_device *adev,
                                                uint32_t umc_reg_offset)
{
        uint32_t ecc_ctrl_addr, ecc_ctrl;

        ecc_ctrl_addr =
                SOC15_REG_OFFSET(UMC, 0, regUMCCH0_0_EccCtrl);
        ecc_ctrl = RREG32_PCIE((ecc_ctrl_addr +
                                        umc_reg_offset) * 4);

        return REG_GET_FIELD(ecc_ctrl, UMCCH0_0_EccCtrl, UCFatalEn);
}

static bool umc_v6_7_query_ras_poison_mode(struct amdgpu_device *adev)
{
        uint32_t umc_reg_offset  = 0;

        /* Enabling fatal error in umc instance0 channel0 will be
         * considered as fatal error mode
         */
        umc_reg_offset = get_umc_v6_7_reg_offset(adev, 0, 0);
        return !umc_v6_7_query_ras_poison_mode_per_channel(adev, umc_reg_offset);
}

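/* register-access based RAS callbacks exposed to the amdgpu RAS framework */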
const struct amdgpu_ras_block_hw_ops umc_v6_7_ras_hw_ops = {
        .query_ras_error_count = umc_v6_7_query_ras_error_count,
        .query_ras_error_address = umc_v6_7_query_ras_error_address,
};

struct amdgpu_umc_ras umc_v6_7_ras = {
        .ras_block = {
                .hw_ops = &umc_v6_7_ras_hw_ops,
        },
        .query_ras_poison_mode = umc_v6_7_query_ras_poison_mode,
        .ecc_info_query_ras_error_count = umc_v6_7_ecc_info_query_ras_error_count,
        .ecc_info_query_ras_error_address = umc_v6_7_ecc_info_query_ras_error_address,
};