/*
 * Copyright 2021 Advanced Micro Devices, Inc.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR
 * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
 * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
 * OTHER DEALINGS IN THE SOFTWARE.
 */

#include "umc_v6_7.h"
#include "amdgpu_ras.h"
#include "amdgpu_umc.h"
#include "amdgpu.h"

#include "umc/umc_6_7_0_offset.h"
#include "umc/umc_6_7_0_sh_mask.h"
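
/*
 * Channel index remap tables for UMC 6.7. Each row corresponds to a UMC
 * instance and each column to a channel instance within it; the entry is the
 * SoC-level memory channel index used for address translation and RAS
 * reporting. Which of the two tables applies is assumed to be board
 * dependent (inferred from the "first"/"second" naming, not spelled out here).
 */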
const uint32_t
	umc_v6_7_channel_idx_tbl_second[UMC_V6_7_UMC_INSTANCE_NUM][UMC_V6_7_CHANNEL_INSTANCE_NUM] = {
		{28, 20, 24, 16, 12, 4, 8, 0},
		{6, 30, 2, 26, 22, 14, 18, 10},
		{19, 11, 15, 7, 3, 27, 31, 23},
		{9, 1, 5, 29, 25, 17, 21, 13}
};

const uint32_t
	umc_v6_7_channel_idx_tbl_first[UMC_V6_7_UMC_INSTANCE_NUM][UMC_V6_7_CHANNEL_INSTANCE_NUM] = {
		{19, 11, 15, 7, 3, 27, 31, 23},
		{9, 1, 5, 29, 25, 17, 21, 13},
		{28, 20, 24, 16, 12, 4, 8, 0},
		{6, 30, 2, 26, 22, 14, 18, 10}
};
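
/*
 * Per-channel register offset: channels within one UMC instance are spaced by
 * adev->umc.channel_offs and UMC instances by UMC_V6_7_INST_DIST, so for
 * example (umc_inst = 1, ch_inst = 2) lands at
 * channel_offs * 2 + UMC_V6_7_INST_DIST * 1. The companion helper below maps
 * (umc_inst, ch_inst) to the SoC channel index through channel_idx_tbl.
 */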
static inline uint32_t get_umc_v6_7_reg_offset(struct amdgpu_device *adev,
					       uint32_t umc_inst, uint32_t ch_inst)
{
	return adev->umc.channel_offs * ch_inst + UMC_V6_7_INST_DIST * umc_inst;
}

static inline uint32_t get_umc_v6_7_channel_index(struct amdgpu_device *adev,
						  uint32_t umc_inst, uint32_t ch_inst)
{
	return adev->umc.channel_idx_tbl[umc_inst * adev->umc.channel_inst_num + ch_inst];
}
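
/*
 * Correctable (CE) count taken from the firmware-provided ECC table
 * (ras->umc_ecc) rather than from MMIO: only an MCA status word flagged
 * Val + CECC bumps the counter; the per-chip CE counters are read but
 * deliberately not accumulated.
 */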
static void umc_v6_7_ecc_info_query_correctable_error_count(struct amdgpu_device *adev,
						   uint32_t channel_index,
						   unsigned long *error_count)
{
	uint32_t ecc_err_cnt;
	uint64_t mc_umc_status;
	struct amdgpu_ras *ras = amdgpu_ras_get_context(adev);

	/*
	 * select the lower chip and check the error count;
	 * skip adding this count, the error counter is derived
	 * from mca_umc_status only
	 */
	ecc_err_cnt = ras->umc_ecc.ecc[channel_index].ce_count_lo_chip;

	/*
	 * select the higher chip and check the error counter;
	 * skip adding this count, the error counter is derived
	 * from mca_umc_status only
	 */
	ecc_err_cnt = ras->umc_ecc.ecc[channel_index].ce_count_hi_chip;

	/* check for SRAM correctable error,
	   MCUMC_STATUS is a 64-bit register */
	mc_umc_status = ras->umc_ecc.ecc[channel_index].mca_umc_status;
	if (REG_GET_FIELD(mc_umc_status, MCA_UMC_UMC0_MCUMC_STATUST0, Val) == 1 &&
	    REG_GET_FIELD(mc_umc_status, MCA_UMC_UMC0_MCUMC_STATUST0, CECC) == 1)
		*error_count += 1;
}
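
/*
 * Uncorrectable (UE) count from the firmware ECC table: any of Deferred,
 * UECC, PCC, UC or TCC set alongside Val in MCA_UMC_UMC0_MCUMC_STATUST0
 * is counted as one uncorrectable error for this channel.
 */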
static void umc_v6_7_ecc_info_querry_uncorrectable_error_count(struct amdgpu_device *adev,
						      uint32_t channel_index,
						      unsigned long *error_count)
{
	uint64_t mc_umc_status;
	struct amdgpu_ras *ras = amdgpu_ras_get_context(adev);

	/* check the MCUMC_STATUS */
	mc_umc_status = ras->umc_ecc.ecc[channel_index].mca_umc_status;
	if ((REG_GET_FIELD(mc_umc_status, MCA_UMC_UMC0_MCUMC_STATUST0, Val) == 1) &&
	    (REG_GET_FIELD(mc_umc_status, MCA_UMC_UMC0_MCUMC_STATUST0, Deferred) == 1 ||
	    REG_GET_FIELD(mc_umc_status, MCA_UMC_UMC0_MCUMC_STATUST0, UECC) == 1 ||
	    REG_GET_FIELD(mc_umc_status, MCA_UMC_UMC0_MCUMC_STATUST0, PCC) == 1 ||
	    REG_GET_FIELD(mc_umc_status, MCA_UMC_UMC0_MCUMC_STATUST0, UC) == 1 ||
	    REG_GET_FIELD(mc_umc_status, MCA_UMC_UMC0_MCUMC_STATUST0, TCC) == 1))
		*error_count += 1;
}
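
/*
 * LOOP_UMC_INST_AND_CH() comes from amdgpu_umc.h; it is assumed to expand to
 * a nested loop roughly equivalent to
 *
 *   for (umc_inst = 0; umc_inst < adev->umc.umc_inst_num; umc_inst++)
 *           for (ch_inst = 0; ch_inst < adev->umc.channel_inst_num; ch_inst++)
 *
 * so the CE/UE helpers above are invoked once per physical channel.
 */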
static void umc_v6_7_ecc_info_query_ras_error_count(struct amdgpu_device *adev,
						     void *ras_error_status)
{
	struct ras_err_data *err_data = (struct ras_err_data *)ras_error_status;

	uint32_t umc_inst = 0;
	uint32_t ch_inst = 0;
	uint32_t umc_reg_offset = 0;
	uint32_t channel_index = 0;

	/*TODO: driver needs to toggle DF Cstate to ensure
	 * safe access of UMC registers. Will add the protection */
	LOOP_UMC_INST_AND_CH(umc_inst, ch_inst) {
		umc_reg_offset = get_umc_v6_7_reg_offset(adev,
							 umc_inst, ch_inst);
		channel_index = get_umc_v6_7_channel_index(adev,
							 umc_inst, ch_inst);

		umc_v6_7_ecc_info_query_correctable_error_count(adev,
						      channel_index,
						      &(err_data->ce_count));
		umc_v6_7_ecc_info_querry_uncorrectable_error_count(adev,
						      channel_index,
						      &(err_data->ue_count));
	}
}
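
/*
 * Error-address harvesting from the firmware ECC table. The normalized UMC
 * address is turned into a SoC physical address by combining three pieces:
 * the 8KB-aligned block of the error address, the 256B slot selected by the
 * channel index, and the offset inside that 256B block (see the
 * ADDR_OF_8KB_BLOCK/ADDR_OF_256B_BLOCK/OFFSET_IN_256B_BLOCK helpers).
 */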
static void umc_v6_7_ecc_info_query_error_address(struct amdgpu_device *adev,
					 struct ras_err_data *err_data,
					 uint32_t umc_reg_offset,
					 uint32_t ch_inst,
					 uint32_t umc_inst)
{
	uint64_t mc_umc_status, err_addr, retired_page;
	struct eeprom_table_record *err_rec;
	uint32_t channel_index;
	struct amdgpu_ras *ras = amdgpu_ras_get_context(adev);

	channel_index =
		adev->umc.channel_idx_tbl[umc_inst * adev->umc.channel_inst_num + ch_inst];

	mc_umc_status = ras->umc_ecc.ecc[channel_index].mca_umc_status;

	if (mc_umc_status == 0)
		return;

	if (!err_data->err_addr)
		return;

	err_rec = &err_data->err_addr[err_data->err_addr_cnt];

	/* calculate error address if ue/ce error is detected */
	if (REG_GET_FIELD(mc_umc_status, MCA_UMC_UMC0_MCUMC_STATUST0, Val) == 1 &&
	    (REG_GET_FIELD(mc_umc_status, MCA_UMC_UMC0_MCUMC_STATUST0, UECC) == 1 ||
	    REG_GET_FIELD(mc_umc_status, MCA_UMC_UMC0_MCUMC_STATUST0, CECC) == 1)) {

		err_addr = ras->umc_ecc.ecc[channel_index].mca_umc_addr;
		err_addr = REG_GET_FIELD(err_addr, MCA_UMC_UMC0_MCUMC_ADDRT0, ErrorAddr);

		/* translate the umc channel address to a soc physical address,
		 * 3 parts are combined */
		retired_page = ADDR_OF_8KB_BLOCK(err_addr) |
				ADDR_OF_256B_BLOCK(channel_index) |
				OFFSET_IN_256B_BLOCK(err_addr);

		/* only UE error information is saved currently, CE is skipped */
		if (REG_GET_FIELD(mc_umc_status, MCA_UMC_UMC0_MCUMC_STATUST0, UECC)
				== 1) {
			err_rec->address = err_addr;
			/* page frame address is saved */
			err_rec->retired_page = retired_page >> AMDGPU_GPU_PAGE_SHIFT;
			err_rec->ts = (uint64_t)ktime_get_real_seconds();
			err_rec->err_type = AMDGPU_RAS_EEPROM_ERR_NON_RECOVERABLE;
			err_rec->mem_channel = channel_index;
			err_rec->mcumc_id = umc_inst;

			err_data->err_addr_cnt++;
		}
	}
}
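
/*
 * Walk every UMC channel and harvest retired-page records from the
 * firmware-provided ECC table; this path reads no UMC registers directly.
 */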
static void umc_v6_7_ecc_info_query_ras_error_address(struct amdgpu_device *adev,
						       void *ras_error_status)
{
	struct ras_err_data *err_data = (struct ras_err_data *)ras_error_status;

	uint32_t umc_inst = 0;
	uint32_t ch_inst = 0;
	uint32_t umc_reg_offset = 0;

	/*TODO: driver needs to toggle DF Cstate to ensure
	 * safe access of UMC registers. Will add the protection
	 * when the firmware interface is ready */
	LOOP_UMC_INST_AND_CH(umc_inst, ch_inst) {
		umc_reg_offset = get_umc_v6_7_reg_offset(adev,
							 umc_inst, ch_inst);
		umc_v6_7_ecc_info_query_error_address(adev,
					     err_data,
					     umc_reg_offset,
					     ch_inst,
					     umc_inst);
	}
}
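
/*
 * MMIO flavour of the CE query: EccErrCntSel picks the lower or higher chip,
 * EccErrCnt is read relative to its UMC_V6_7_CE_CNT_INIT preload value, and
 * an SRAM CE flagged in MCUMC_STATUS adds one more.
 */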
static void umc_v6_7_query_correctable_error_count(struct amdgpu_device *adev,
						   uint32_t umc_reg_offset,
						   unsigned long *error_count)
{
	uint32_t ecc_err_cnt_sel, ecc_err_cnt_sel_addr;
	uint32_t ecc_err_cnt, ecc_err_cnt_addr;
	uint64_t mc_umc_status;
	uint32_t mc_umc_status_addr;

	/* UMC 6_7_0 registers */
	ecc_err_cnt_sel_addr =
		SOC15_REG_OFFSET(UMC, 0, regUMCCH0_0_EccErrCntSel);
	ecc_err_cnt_addr =
		SOC15_REG_OFFSET(UMC, 0, regUMCCH0_0_EccErrCnt);
	mc_umc_status_addr =
		SOC15_REG_OFFSET(UMC, 0, regMCA_UMC_UMC0_MCUMC_STATUST0);

	/* select the lower chip and check the error count */
	ecc_err_cnt_sel = RREG32_PCIE((ecc_err_cnt_sel_addr + umc_reg_offset) * 4);
	ecc_err_cnt_sel = REG_SET_FIELD(ecc_err_cnt_sel, UMCCH0_0_EccErrCntSel,
					EccErrCntCsSel, 0);
	WREG32_PCIE((ecc_err_cnt_sel_addr + umc_reg_offset) * 4, ecc_err_cnt_sel);

	ecc_err_cnt = RREG32_PCIE((ecc_err_cnt_addr + umc_reg_offset) * 4);
	*error_count +=
		(REG_GET_FIELD(ecc_err_cnt, UMCCH0_0_EccErrCnt, EccErrCnt) -
		 UMC_V6_7_CE_CNT_INIT);

	/* select the higher chip and check the error counter */
	ecc_err_cnt_sel = REG_SET_FIELD(ecc_err_cnt_sel, UMCCH0_0_EccErrCntSel,
					EccErrCntCsSel, 1);
	WREG32_PCIE((ecc_err_cnt_sel_addr + umc_reg_offset) * 4, ecc_err_cnt_sel);

	ecc_err_cnt = RREG32_PCIE((ecc_err_cnt_addr + umc_reg_offset) * 4);
	*error_count +=
		(REG_GET_FIELD(ecc_err_cnt, UMCCH0_0_EccErrCnt, EccErrCnt) -
		 UMC_V6_7_CE_CNT_INIT);

	/* check for SRAM correctable error,
	   MCUMC_STATUS is a 64-bit register */
	mc_umc_status = RREG64_PCIE((mc_umc_status_addr + umc_reg_offset) * 4);
	if (REG_GET_FIELD(mc_umc_status, MCA_UMC_UMC0_MCUMC_STATUST0, Val) == 1 &&
	    REG_GET_FIELD(mc_umc_status, MCA_UMC_UMC0_MCUMC_STATUST0, CECC) == 1)
		*error_count += 1;
}
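
/*
 * MMIO flavour of the UE query: a single 64-bit read of MCUMC_STATUS, with
 * the same Deferred/UECC/PCC/UC/TCC criteria as the ecc_info path above.
 */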
static void umc_v6_7_querry_uncorrectable_error_count(struct amdgpu_device *adev,
						      uint32_t umc_reg_offset,
						      unsigned long *error_count)
{
	uint64_t mc_umc_status;
	uint32_t mc_umc_status_addr;

	mc_umc_status_addr =
		SOC15_REG_OFFSET(UMC, 0, regMCA_UMC_UMC0_MCUMC_STATUST0);

	/* check the MCUMC_STATUS */
	mc_umc_status = RREG64_PCIE((mc_umc_status_addr + umc_reg_offset) * 4);
	if ((REG_GET_FIELD(mc_umc_status, MCA_UMC_UMC0_MCUMC_STATUST0, Val) == 1) &&
	    (REG_GET_FIELD(mc_umc_status, MCA_UMC_UMC0_MCUMC_STATUST0, Deferred) == 1 ||
	    REG_GET_FIELD(mc_umc_status, MCA_UMC_UMC0_MCUMC_STATUST0, UECC) == 1 ||
	    REG_GET_FIELD(mc_umc_status, MCA_UMC_UMC0_MCUMC_STATUST0, PCC) == 1 ||
	    REG_GET_FIELD(mc_umc_status, MCA_UMC_UMC0_MCUMC_STATUST0, UC) == 1 ||
	    REG_GET_FIELD(mc_umc_status, MCA_UMC_UMC0_MCUMC_STATUST0, TCC) == 1))
		*error_count += 1;
}
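
/*
 * Re-arm one channel's CE counters: select each chip in turn via
 * EccErrCntSel and write the counter back to its UMC_V6_7_CE_CNT_INIT
 * preload value so that later reads report only new errors.
 */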
static void umc_v6_7_reset_error_count_per_channel(struct amdgpu_device *adev,
						   uint32_t umc_reg_offset)
{
	uint32_t ecc_err_cnt_addr;
	uint32_t ecc_err_cnt_sel, ecc_err_cnt_sel_addr;

	ecc_err_cnt_sel_addr =
		SOC15_REG_OFFSET(UMC, 0,
				regUMCCH0_0_EccErrCntSel);
	ecc_err_cnt_addr =
		SOC15_REG_OFFSET(UMC, 0,
				regUMCCH0_0_EccErrCnt);

	/* select the lower chip */
	ecc_err_cnt_sel = RREG32_PCIE((ecc_err_cnt_sel_addr +
					umc_reg_offset) * 4);
	ecc_err_cnt_sel = REG_SET_FIELD(ecc_err_cnt_sel,
					UMCCH0_0_EccErrCntSel,
					EccErrCntCsSel, 0);
	WREG32_PCIE((ecc_err_cnt_sel_addr + umc_reg_offset) * 4,
			ecc_err_cnt_sel);

	/* clear lower chip error count */
	WREG32_PCIE((ecc_err_cnt_addr + umc_reg_offset) * 4,
			UMC_V6_7_CE_CNT_INIT);

	/* select the higher chip */
	ecc_err_cnt_sel = RREG32_PCIE((ecc_err_cnt_sel_addr +
					umc_reg_offset) * 4);
	ecc_err_cnt_sel = REG_SET_FIELD(ecc_err_cnt_sel,
					UMCCH0_0_EccErrCntSel,
					EccErrCntCsSel, 1);
	WREG32_PCIE((ecc_err_cnt_sel_addr + umc_reg_offset) * 4,
			ecc_err_cnt_sel);

	/* clear higher chip error count */
	WREG32_PCIE((ecc_err_cnt_addr + umc_reg_offset) * 4,
			UMC_V6_7_CE_CNT_INIT);
}
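
/* Reset the CE counters of every channel on every UMC instance. */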
static void umc_v6_7_reset_error_count(struct amdgpu_device *adev)
{
	uint32_t umc_inst = 0;
	uint32_t ch_inst = 0;
	uint32_t umc_reg_offset = 0;

	LOOP_UMC_INST_AND_CH(umc_inst, ch_inst) {
		umc_reg_offset = get_umc_v6_7_reg_offset(adev,
							 umc_inst, ch_inst);
		umc_v6_7_reset_error_count_per_channel(adev,
						       umc_reg_offset);
	}
}
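
/*
 * Top-level RAS count query (MMIO path): accumulate CE/UE counts for all
 * channels into ras_err_data, then reset the hardware counters.
 */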
static void umc_v6_7_query_ras_error_count(struct amdgpu_device *adev,
					   void *ras_error_status)
{
	struct ras_err_data *err_data = (struct ras_err_data *)ras_error_status;

	uint32_t umc_inst = 0;
	uint32_t ch_inst = 0;
	uint32_t umc_reg_offset = 0;

	/*TODO: driver needs to toggle DF Cstate to ensure
	 * safe access of UMC registers. Will add the protection */
	LOOP_UMC_INST_AND_CH(umc_inst, ch_inst) {
		umc_reg_offset = get_umc_v6_7_reg_offset(adev,
							 umc_inst, ch_inst);

		umc_v6_7_query_correctable_error_count(adev,
						       umc_reg_offset,
						       &(err_data->ce_count));
		umc_v6_7_querry_uncorrectable_error_count(adev,
							  umc_reg_offset,
							  &(err_data->ue_count));
	}

	umc_v6_7_reset_error_count(adev);
}
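
/*
 * MMIO flavour of error-address harvesting: read MCUMC_STATUS/ADDRT0 for the
 * channel, translate the normalized address to a SoC physical address the
 * same way as the ecc_info path, record UE pages for EEPROM retirement, and
 * clear the status register when done.
 */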
static void umc_v6_7_query_error_address(struct amdgpu_device *adev,
					 struct ras_err_data *err_data,
					 uint32_t umc_reg_offset,
					 uint32_t ch_inst,
					 uint32_t umc_inst)
{
	uint32_t mc_umc_status_addr;
	uint64_t mc_umc_status, err_addr, retired_page, mc_umc_addrt0;
	struct eeprom_table_record *err_rec;
	uint32_t channel_index;

	mc_umc_status_addr =
		SOC15_REG_OFFSET(UMC, 0, regMCA_UMC_UMC0_MCUMC_STATUST0);
	mc_umc_addrt0 =
		SOC15_REG_OFFSET(UMC, 0, regMCA_UMC_UMC0_MCUMC_ADDRT0);

	mc_umc_status = RREG64_PCIE((mc_umc_status_addr + umc_reg_offset) * 4);

	if (mc_umc_status == 0)
		return;

	if (!err_data->err_addr) {
		/* clear umc status */
		WREG64_PCIE((mc_umc_status_addr + umc_reg_offset) * 4, 0x0ULL);
		return;
	}

	err_rec = &err_data->err_addr[err_data->err_addr_cnt];

	channel_index =
		adev->umc.channel_idx_tbl[umc_inst * adev->umc.channel_inst_num + ch_inst];

	/* calculate error address if ue/ce error is detected */
	if (REG_GET_FIELD(mc_umc_status, MCA_UMC_UMC0_MCUMC_STATUST0, Val) == 1 &&
	    (REG_GET_FIELD(mc_umc_status, MCA_UMC_UMC0_MCUMC_STATUST0, UECC) == 1 ||
	    REG_GET_FIELD(mc_umc_status, MCA_UMC_UMC0_MCUMC_STATUST0, CECC) == 1)) {

		err_addr = RREG64_PCIE((mc_umc_addrt0 + umc_reg_offset) * 4);
		err_addr = REG_GET_FIELD(err_addr, MCA_UMC_UMC0_MCUMC_ADDRT0, ErrorAddr);

		/* translate the umc channel address to a soc physical address,
		 * 3 parts are combined */
		retired_page = ADDR_OF_8KB_BLOCK(err_addr) |
				ADDR_OF_256B_BLOCK(channel_index) |
				OFFSET_IN_256B_BLOCK(err_addr);

		/* only UE error information is saved currently, CE is skipped */
		if (REG_GET_FIELD(mc_umc_status, MCA_UMC_UMC0_MCUMC_STATUST0, UECC)
				== 1) {
			err_rec->address = err_addr;
			/* page frame address is saved */
			err_rec->retired_page = retired_page >> AMDGPU_GPU_PAGE_SHIFT;
			err_rec->ts = (uint64_t)ktime_get_real_seconds();
			err_rec->err_type = AMDGPU_RAS_EEPROM_ERR_NON_RECOVERABLE;
			err_rec->mem_channel = channel_index;
			err_rec->mcumc_id = umc_inst;

			err_data->err_addr_cnt++;
		}
	}

	/* clear umc status */
	WREG64_PCIE((mc_umc_status_addr + umc_reg_offset) * 4, 0x0ULL);
}
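
/* Walk every UMC channel and harvest error addresses via MMIO. */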
static void umc_v6_7_query_ras_error_address(struct amdgpu_device *adev,
					     void *ras_error_status)
{
	struct ras_err_data *err_data = (struct ras_err_data *)ras_error_status;

	uint32_t umc_inst = 0;
	uint32_t ch_inst = 0;
	uint32_t umc_reg_offset = 0;

	/*TODO: driver needs to toggle DF Cstate to ensure
	 * safe access of UMC registers. Will add the protection
	 * when the firmware interface is ready */
	LOOP_UMC_INST_AND_CH(umc_inst, ch_inst) {
		umc_reg_offset = get_umc_v6_7_reg_offset(adev,
							 umc_inst, ch_inst);
		umc_v6_7_query_error_address(adev,
					     err_data,
					     umc_reg_offset,
					     ch_inst,
					     umc_inst);
	}
}
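
/*
 * UCFatalEn in UMCCH0_0_EccCtrl reports whether uncorrectable errors on this
 * channel are promoted to fatal errors (non-zero) rather than being handled
 * as poison.
 */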
static uint32_t umc_v6_7_query_ras_poison_mode_per_channel(
						struct amdgpu_device *adev,
						uint32_t umc_reg_offset)
{
	uint32_t ecc_ctrl_addr, ecc_ctrl;

	ecc_ctrl_addr =
		SOC15_REG_OFFSET(UMC, 0, regUMCCH0_0_EccCtrl);
	ecc_ctrl = RREG32_PCIE((ecc_ctrl_addr +
					umc_reg_offset) * 4);

	return REG_GET_FIELD(ecc_ctrl, UMCCH0_0_EccCtrl, UCFatalEn);
}
static bool umc_v6_7_query_ras_poison_mode(struct amdgpu_device *adev)
{
	uint32_t umc_inst = 0;
	uint32_t ch_inst = 0;
	uint32_t umc_reg_offset = 0;

	LOOP_UMC_INST_AND_CH(umc_inst, ch_inst) {
		umc_reg_offset = get_umc_v6_7_reg_offset(adev,
							 umc_inst, ch_inst);
		/* Enabling fatal error in any one channel is treated
		   as fatal error mode */
		if (umc_v6_7_query_ras_poison_mode_per_channel(adev, umc_reg_offset))
			return false;
	}

	return true;
}
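
/*
 * RAS callback table exported for the SoC code that wires up UMC RAS
 * handling (assumption: assigned to adev->umc.ras_funcs elsewhere).
 */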
const struct amdgpu_umc_ras_funcs umc_v6_7_ras_funcs = {
	.ras_late_init = amdgpu_umc_ras_late_init,
	.ras_fini = amdgpu_umc_ras_fini,
	.query_ras_error_count = umc_v6_7_query_ras_error_count,
	.query_ras_error_address = umc_v6_7_query_ras_error_address,
	.query_ras_poison_mode = umc_v6_7_query_ras_poison_mode,
	.ecc_info_query_ras_error_count = umc_v6_7_ecc_info_query_ras_error_count,
	.ecc_info_query_ras_error_address = umc_v6_7_ecc_info_query_ras_error_address,
};