/*
 * Copyright 2020 Advanced Micro Devices, Inc.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
 * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR
 * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
 * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
 * OTHER DEALINGS IN THE SOFTWARE.
 *
 */
#include "umc_v8_7.h"
#include "amdgpu_ras.h"
#include "amdgpu_umc.h"
#include "amdgpu.h"

#include "rsmu/rsmu_0_0_2_offset.h"
#include "rsmu/rsmu_0_0_2_sh_mask.h"
#include "umc/umc_8_7_0_offset.h"
#include "umc/umc_8_7_0_sh_mask.h"

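/* register address distance between two UMC instances */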
#define UMC_8_INST_DIST                 0x40000

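/*
 * Map each logical {umc instance, channel instance} pair to the physical
 * channel index used when translating a UMC channel address to a SoC
 * physical address.
 */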
const uint32_t
        umc_v8_7_channel_idx_tbl[UMC_V8_7_UMC_INSTANCE_NUM][UMC_V8_7_CHANNEL_INSTANCE_NUM] = {
                {2, 11},  {4, 13},
                {1, 8},   {7, 14},
                {10, 3},  {12, 5},
                {9, 0},   {15, 6}
};

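/* register offset of one channel relative to the UMC register base */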
static inline uint32_t get_umc_v8_7_reg_offset(struct amdgpu_device *adev,
                                            uint32_t umc_inst,
                                            uint32_t ch_inst)
{
        return adev->umc.channel_offs * ch_inst + UMC_8_INST_DIST * umc_inst;
}

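/*
 * Count one correctable error for this channel if both the Val and CECC
 * bits are set in the MCA status saved in the RAS ECC info table.
 */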
static void umc_v8_7_ecc_info_query_correctable_error_count(struct amdgpu_device *adev,
                                                uint32_t umc_inst, uint32_t ch_inst,
                                                unsigned long *error_count)
{
        uint64_t mc_umc_status;
        uint32_t eccinfo_table_idx;
        struct amdgpu_ras *ras = amdgpu_ras_get_context(adev);

        eccinfo_table_idx = umc_inst * adev->umc.channel_inst_num + ch_inst;

        /* check for SRAM correctable error
         * MCUMC_STATUS is a 64 bit register
         */
        mc_umc_status = ras->umc_ecc.ecc[eccinfo_table_idx].mca_umc_status;
        if (REG_GET_FIELD(mc_umc_status, MCA_UMC_UMC0_MCUMC_STATUST0, Val) == 1 &&
            REG_GET_FIELD(mc_umc_status, MCA_UMC_UMC0_MCUMC_STATUST0, CECC) == 1)
                *error_count += 1;
}

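/*
 * Count one uncorrectable error for this channel if the saved MCA status
 * is valid and any of the Deferred/UECC/PCC/UC/TCC bits are set.
 */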
static void umc_v8_7_ecc_info_query_uncorrectable_error_count(struct amdgpu_device *adev,
                                                        uint32_t umc_inst, uint32_t ch_inst,
                                                        unsigned long *error_count)
{
        uint64_t mc_umc_status;
        uint32_t eccinfo_table_idx;
        struct amdgpu_ras *ras = amdgpu_ras_get_context(adev);

        eccinfo_table_idx = umc_inst * adev->umc.channel_inst_num + ch_inst;

        /* check the MCUMC_STATUS */
        mc_umc_status = ras->umc_ecc.ecc[eccinfo_table_idx].mca_umc_status;
        if ((REG_GET_FIELD(mc_umc_status, MCA_UMC_UMC0_MCUMC_STATUST0, Val) == 1) &&
            (REG_GET_FIELD(mc_umc_status, MCA_UMC_UMC0_MCUMC_STATUST0, Deferred) == 1 ||
            REG_GET_FIELD(mc_umc_status, MCA_UMC_UMC0_MCUMC_STATUST0, UECC) == 1 ||
            REG_GET_FIELD(mc_umc_status, MCA_UMC_UMC0_MCUMC_STATUST0, PCC) == 1 ||
            REG_GET_FIELD(mc_umc_status, MCA_UMC_UMC0_MCUMC_STATUST0, UC) == 1 ||
            REG_GET_FIELD(mc_umc_status, MCA_UMC_UMC0_MCUMC_STATUST0, TCC) == 1))
                *error_count += 1;
}

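/*
 * Accumulate CE/UE counts for all UMC channels from the RAS ECC info
 * table instead of reading the UMC registers directly.
 */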
static void umc_v8_7_ecc_info_query_ras_error_count(struct amdgpu_device *adev,
                                        void *ras_error_status)
{
        struct ras_err_data *err_data = (struct ras_err_data *)ras_error_status;

        uint32_t umc_inst        = 0;
        uint32_t ch_inst         = 0;

        /* TODO: driver needs to toggle DF Cstate to ensure
         * safe access of UMC registers. Will add the protection
         * when firmware interface is ready
         */
        LOOP_UMC_INST_AND_CH(umc_inst, ch_inst) {
                umc_v8_7_ecc_info_query_correctable_error_count(adev,
                                                        umc_inst, ch_inst,
                                                        &(err_data->ce_count));
                umc_v8_7_ecc_info_query_uncorrectable_error_count(adev,
                                                        umc_inst, ch_inst,
                                                        &(err_data->ue_count));
        }
}

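/*
 * Check the saved MCA status/address of one channel; when an error is
 * latched, translate the UMC channel address to a SoC physical address
 * and record the page for retirement (UE only, CE is skipped).
 */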
static void umc_v8_7_ecc_info_query_error_address(struct amdgpu_device *adev,
                                        struct ras_err_data *err_data,
                                        uint32_t ch_inst,
                                        uint32_t umc_inst)
{
        uint64_t mc_umc_status, err_addr, retired_page;
        uint32_t channel_index;
        uint32_t eccinfo_table_idx;
        struct amdgpu_ras *ras = amdgpu_ras_get_context(adev);

        eccinfo_table_idx = umc_inst * adev->umc.channel_inst_num + ch_inst;
        channel_index =
                adev->umc.channel_idx_tbl[umc_inst * adev->umc.channel_inst_num + ch_inst];

        mc_umc_status = ras->umc_ecc.ecc[eccinfo_table_idx].mca_umc_status;

        if (mc_umc_status == 0)
                return;

        if (!err_data->err_addr)
                return;

        /* calculate error address if ue/ce error is detected */
        if (REG_GET_FIELD(mc_umc_status, MCA_UMC_UMC0_MCUMC_STATUST0, Val) == 1 &&
            (REG_GET_FIELD(mc_umc_status, MCA_UMC_UMC0_MCUMC_STATUST0, UECC) == 1 ||
            REG_GET_FIELD(mc_umc_status, MCA_UMC_UMC0_MCUMC_STATUST0, CECC) == 1)) {

                err_addr = ras->umc_ecc.ecc[eccinfo_table_idx].mca_umc_addr;
                err_addr = REG_GET_FIELD(err_addr, MCA_UMC_UMC0_MCUMC_ADDRT0, ErrorAddr);

                /* translate umc channel address to soc pa, 3 parts are included */
                retired_page = ADDR_OF_4KB_BLOCK(err_addr) |
                                ADDR_OF_256B_BLOCK(channel_index) |
                                OFFSET_IN_256B_BLOCK(err_addr);

                /* we only save ue error information currently, ce is skipped */
                if (REG_GET_FIELD(mc_umc_status, MCA_UMC_UMC0_MCUMC_STATUST0, UECC)
                                == 1)
                        amdgpu_umc_fill_error_record(err_data, err_addr,
                                        retired_page, channel_index, umc_inst);
        }
}

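/* collect error addresses of all UMC channels from the ECC info table */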
static void umc_v8_7_ecc_info_query_ras_error_address(struct amdgpu_device *adev,
                                        void *ras_error_status)
{
        struct ras_err_data *err_data = (struct ras_err_data *)ras_error_status;

        uint32_t umc_inst        = 0;
        uint32_t ch_inst         = 0;

        /* TODO: driver needs to toggle DF Cstate to ensure
         * safe access of UMC registers. Will add the protection
         * when firmware interface is ready
         */
        LOOP_UMC_INST_AND_CH(umc_inst, ch_inst) {
                umc_v8_7_ecc_info_query_error_address(adev,
                                                err_data,
                                                ch_inst,
                                                umc_inst);
        }
}

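/*
 * Reset the GECC error counters of one channel (both chip selects) to
 * their initial value.
 */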
static void umc_v8_7_clear_error_count_per_channel(struct amdgpu_device *adev,
                                        uint32_t umc_reg_offset)
{
        uint32_t ecc_err_cnt_addr;
        uint32_t ecc_err_cnt_sel, ecc_err_cnt_sel_addr;

        ecc_err_cnt_sel_addr =
                SOC15_REG_OFFSET(UMC, 0, mmUMCCH0_0_GeccErrCntSel);
        ecc_err_cnt_addr =
                SOC15_REG_OFFSET(UMC, 0, mmUMCCH0_0_GeccErrCnt);

        /* select the lower chip */
        ecc_err_cnt_sel = RREG32_PCIE((ecc_err_cnt_sel_addr +
                                        umc_reg_offset) * 4);
        ecc_err_cnt_sel = REG_SET_FIELD(ecc_err_cnt_sel,
                                        UMCCH0_0_GeccErrCntSel,
                                        GeccErrCntCsSel, 0);
        WREG32_PCIE((ecc_err_cnt_sel_addr + umc_reg_offset) * 4,
                        ecc_err_cnt_sel);

        /* clear lower chip error count */
        WREG32_PCIE((ecc_err_cnt_addr + umc_reg_offset) * 4,
                        UMC_V8_7_CE_CNT_INIT);

        /* select the higher chip */
        ecc_err_cnt_sel = RREG32_PCIE((ecc_err_cnt_sel_addr +
                                        umc_reg_offset) * 4);
        ecc_err_cnt_sel = REG_SET_FIELD(ecc_err_cnt_sel,
                                        UMCCH0_0_GeccErrCntSel,
                                        GeccErrCntCsSel, 1);
        WREG32_PCIE((ecc_err_cnt_sel_addr + umc_reg_offset) * 4,
                        ecc_err_cnt_sel);

        /* clear higher chip error count */
        WREG32_PCIE((ecc_err_cnt_addr + umc_reg_offset) * 4,
                        UMC_V8_7_CE_CNT_INIT);
}

static void umc_v8_7_clear_error_count(struct amdgpu_device *adev)
{
        uint32_t umc_inst        = 0;
        uint32_t ch_inst         = 0;
        uint32_t umc_reg_offset  = 0;

        LOOP_UMC_INST_AND_CH(umc_inst, ch_inst) {
                umc_reg_offset = get_umc_v8_7_reg_offset(adev,
                                                umc_inst,
                                                ch_inst);

                umc_v8_7_clear_error_count_per_channel(adev,
                                                umc_reg_offset);
        }
}

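/*
 * Read the GECC error counters of both chip selects, accumulate the
 * delta against UMC_V8_7_CE_CNT_INIT, and count one more CE if
 * MCUMC_STATUS reports an SRAM correctable error (ErrorCodeExt == 6).
 */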
static void umc_v8_7_query_correctable_error_count(struct amdgpu_device *adev,
                                                   uint32_t umc_reg_offset,
                                                   unsigned long *error_count)
{
        uint32_t ecc_err_cnt_sel, ecc_err_cnt_sel_addr;
        uint32_t ecc_err_cnt, ecc_err_cnt_addr;
        uint64_t mc_umc_status;
        uint32_t mc_umc_status_addr;

        /* UMC 8_7_2 registers */
        ecc_err_cnt_sel_addr =
                SOC15_REG_OFFSET(UMC, 0, mmUMCCH0_0_GeccErrCntSel);
        ecc_err_cnt_addr =
                SOC15_REG_OFFSET(UMC, 0, mmUMCCH0_0_GeccErrCnt);
        mc_umc_status_addr =
                SOC15_REG_OFFSET(UMC, 0, mmMCA_UMC_UMC0_MCUMC_STATUST0);

        /* select the lower chip and check the error count */
        ecc_err_cnt_sel = RREG32_PCIE((ecc_err_cnt_sel_addr + umc_reg_offset) * 4);
        ecc_err_cnt_sel = REG_SET_FIELD(ecc_err_cnt_sel, UMCCH0_0_GeccErrCntSel,
                                        GeccErrCntCsSel, 0);
        WREG32_PCIE((ecc_err_cnt_sel_addr + umc_reg_offset) * 4, ecc_err_cnt_sel);

        ecc_err_cnt = RREG32_PCIE((ecc_err_cnt_addr + umc_reg_offset) * 4);
        *error_count +=
                (REG_GET_FIELD(ecc_err_cnt, UMCCH0_0_GeccErrCnt, GeccErrCnt) -
                 UMC_V8_7_CE_CNT_INIT);

        /* select the higher chip and check the err counter */
        ecc_err_cnt_sel = REG_SET_FIELD(ecc_err_cnt_sel, UMCCH0_0_GeccErrCntSel,
                                        GeccErrCntCsSel, 1);
        WREG32_PCIE((ecc_err_cnt_sel_addr + umc_reg_offset) * 4, ecc_err_cnt_sel);

        ecc_err_cnt = RREG32_PCIE((ecc_err_cnt_addr + umc_reg_offset) * 4);
        *error_count +=
                (REG_GET_FIELD(ecc_err_cnt, UMCCH0_0_GeccErrCnt, GeccErrCnt) -
                 UMC_V8_7_CE_CNT_INIT);

        /* check for SRAM correctable error
         * MCUMC_STATUS is a 64 bit register
         */
        mc_umc_status = RREG64_PCIE((mc_umc_status_addr + umc_reg_offset) * 4);
        if (REG_GET_FIELD(mc_umc_status, MCA_UMC_UMC0_MCUMC_STATUST0, ErrorCodeExt) == 6 &&
            REG_GET_FIELD(mc_umc_status, MCA_UMC_UMC0_MCUMC_STATUST0, Val) == 1 &&
            REG_GET_FIELD(mc_umc_status, MCA_UMC_UMC0_MCUMC_STATUST0, CECC) == 1)
                *error_count += 1;
}

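/*
 * Count one uncorrectable error for this channel if MCUMC_STATUS is
 * valid and any of the Deferred/UECC/PCC/UC/TCC bits are set.
 */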
static void umc_v8_7_query_uncorrectable_error_count(struct amdgpu_device *adev,
                                                     uint32_t umc_reg_offset,
                                                     unsigned long *error_count)
{
        uint64_t mc_umc_status;
        uint32_t mc_umc_status_addr;

        mc_umc_status_addr = SOC15_REG_OFFSET(UMC, 0, mmMCA_UMC_UMC0_MCUMC_STATUST0);

        /* check the MCUMC_STATUS */
        mc_umc_status = RREG64_PCIE((mc_umc_status_addr + umc_reg_offset) * 4);
        if ((REG_GET_FIELD(mc_umc_status, MCA_UMC_UMC0_MCUMC_STATUST0, Val) == 1) &&
            (REG_GET_FIELD(mc_umc_status, MCA_UMC_UMC0_MCUMC_STATUST0, Deferred) == 1 ||
            REG_GET_FIELD(mc_umc_status, MCA_UMC_UMC0_MCUMC_STATUST0, UECC) == 1 ||
            REG_GET_FIELD(mc_umc_status, MCA_UMC_UMC0_MCUMC_STATUST0, PCC) == 1 ||
            REG_GET_FIELD(mc_umc_status, MCA_UMC_UMC0_MCUMC_STATUST0, UC) == 1 ||
            REG_GET_FIELD(mc_umc_status, MCA_UMC_UMC0_MCUMC_STATUST0, TCC) == 1))
                *error_count += 1;
}

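/*
 * Query CE/UE counts of all channels through direct register access,
 * then reset the hardware counters for the next query.
 */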
static void umc_v8_7_query_ras_error_count(struct amdgpu_device *adev,
                                           void *ras_error_status)
{
        struct ras_err_data *err_data = (struct ras_err_data *)ras_error_status;

        uint32_t umc_inst        = 0;
        uint32_t ch_inst         = 0;
        uint32_t umc_reg_offset  = 0;

        LOOP_UMC_INST_AND_CH(umc_inst, ch_inst) {
                umc_reg_offset = get_umc_v8_7_reg_offset(adev,
                                                      umc_inst,
                                                      ch_inst);

                umc_v8_7_query_correctable_error_count(adev,
                                                       umc_reg_offset,
                                                       &(err_data->ce_count));
                umc_v8_7_query_uncorrectable_error_count(adev,
                                                         umc_reg_offset,
                                                         &(err_data->ue_count));
        }

        umc_v8_7_clear_error_count(adev);
}

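/*
 * Read MCUMC_ADDRT0 of one channel when a UE/CE is latched, drop the
 * bits below the reported LSB, translate the channel address to a SoC
 * physical address, record UE pages, and clear the status register.
 */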
static void umc_v8_7_query_error_address(struct amdgpu_device *adev,
                                         struct ras_err_data *err_data,
                                         uint32_t umc_reg_offset,
                                         uint32_t ch_inst,
                                         uint32_t umc_inst)
{
        uint32_t lsb, mc_umc_status_addr;
        uint64_t mc_umc_status, err_addr, retired_page, mc_umc_addrt0;
        uint32_t channel_index = adev->umc.channel_idx_tbl[umc_inst * adev->umc.channel_inst_num + ch_inst];

        mc_umc_status_addr =
                SOC15_REG_OFFSET(UMC, 0, mmMCA_UMC_UMC0_MCUMC_STATUST0);
        mc_umc_addrt0 =
                SOC15_REG_OFFSET(UMC, 0, mmMCA_UMC_UMC0_MCUMC_ADDRT0);

        mc_umc_status = RREG64_PCIE((mc_umc_status_addr + umc_reg_offset) * 4);

        if (mc_umc_status == 0)
                return;

        if (!err_data->err_addr) {
                /* clear umc status */
                WREG64_PCIE((mc_umc_status_addr + umc_reg_offset) * 4, 0x0ULL);
                return;
        }

        /* calculate error address if ue/ce error is detected */
        if (REG_GET_FIELD(mc_umc_status, MCA_UMC_UMC0_MCUMC_STATUST0, Val) == 1 &&
            (REG_GET_FIELD(mc_umc_status, MCA_UMC_UMC0_MCUMC_STATUST0, UECC) == 1 ||
            REG_GET_FIELD(mc_umc_status, MCA_UMC_UMC0_MCUMC_STATUST0, CECC) == 1)) {

                err_addr = RREG64_PCIE((mc_umc_addrt0 + umc_reg_offset) * 4);
                /* the lowest lsb bits should be ignored */
                lsb = REG_GET_FIELD(err_addr, MCA_UMC_UMC0_MCUMC_ADDRT0, LSB);
                err_addr = REG_GET_FIELD(err_addr, MCA_UMC_UMC0_MCUMC_ADDRT0, ErrorAddr);
                err_addr &= ~((0x1ULL << lsb) - 1);

                /* translate umc channel address to soc pa, 3 parts are included */
                retired_page = ADDR_OF_4KB_BLOCK(err_addr) |
                                ADDR_OF_256B_BLOCK(channel_index) |
                                OFFSET_IN_256B_BLOCK(err_addr);

                /* we only save ue error information currently, ce is skipped */
                if (REG_GET_FIELD(mc_umc_status, MCA_UMC_UMC0_MCUMC_STATUST0, UECC)
                                == 1)
                        amdgpu_umc_fill_error_record(err_data, err_addr,
                                        retired_page, channel_index, umc_inst);
        }

        /* clear umc status */
        WREG64_PCIE((mc_umc_status_addr + umc_reg_offset) * 4, 0x0ULL);
}

static void umc_v8_7_query_ras_error_address(struct amdgpu_device *adev,
                                             void *ras_error_status)
{
        struct ras_err_data *err_data = (struct ras_err_data *)ras_error_status;

        uint32_t umc_inst        = 0;
        uint32_t ch_inst         = 0;
        uint32_t umc_reg_offset  = 0;

        LOOP_UMC_INST_AND_CH(umc_inst, ch_inst) {
                umc_reg_offset = get_umc_v8_7_reg_offset(adev,
                                                      umc_inst,
                                                      ch_inst);

                umc_v8_7_query_error_address(adev,
                                             err_data,
                                             umc_reg_offset,
                                             ch_inst,
                                             umc_inst);
        }
}

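/*
 * Program one channel: route the CE interrupt to the APIC and preset
 * the GECC error counters of both chip selects to UMC_V8_7_CE_CNT_INIT.
 */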
static void umc_v8_7_err_cnt_init_per_channel(struct amdgpu_device *adev,
                                              uint32_t umc_reg_offset)
{
        uint32_t ecc_err_cnt_sel, ecc_err_cnt_sel_addr;
        uint32_t ecc_err_cnt_addr;

        ecc_err_cnt_sel_addr =
                SOC15_REG_OFFSET(UMC, 0, mmUMCCH0_0_GeccErrCntSel);
        ecc_err_cnt_addr =
                SOC15_REG_OFFSET(UMC, 0, mmUMCCH0_0_GeccErrCnt);

        /* select the lower chip and check the error count */
        ecc_err_cnt_sel = RREG32_PCIE((ecc_err_cnt_sel_addr + umc_reg_offset) * 4);
        ecc_err_cnt_sel = REG_SET_FIELD(ecc_err_cnt_sel, UMCCH0_0_GeccErrCntSel,
                                        GeccErrCntCsSel, 0);
        /* set ce error interrupt type to APIC based interrupt */
        ecc_err_cnt_sel = REG_SET_FIELD(ecc_err_cnt_sel, UMCCH0_0_GeccErrCntSel,
                                        GeccErrInt, 0x1);
        WREG32_PCIE((ecc_err_cnt_sel_addr + umc_reg_offset) * 4, ecc_err_cnt_sel);
        /* set error count to initial value */
        WREG32_PCIE((ecc_err_cnt_addr + umc_reg_offset) * 4, UMC_V8_7_CE_CNT_INIT);

        /* select the higher chip and check the err counter */
        ecc_err_cnt_sel = REG_SET_FIELD(ecc_err_cnt_sel, UMCCH0_0_GeccErrCntSel,
                                        GeccErrCntCsSel, 1);
        WREG32_PCIE((ecc_err_cnt_sel_addr + umc_reg_offset) * 4, ecc_err_cnt_sel);
        WREG32_PCIE((ecc_err_cnt_addr + umc_reg_offset) * 4, UMC_V8_7_CE_CNT_INIT);
}

static void umc_v8_7_err_cnt_init(struct amdgpu_device *adev)
{
        uint32_t umc_inst        = 0;
        uint32_t ch_inst         = 0;
        uint32_t umc_reg_offset  = 0;

        LOOP_UMC_INST_AND_CH(umc_inst, ch_inst) {
                umc_reg_offset = get_umc_v8_7_reg_offset(adev,
                                                      umc_inst,
                                                      ch_inst);

                umc_v8_7_err_cnt_init_per_channel(adev, umc_reg_offset);
        }
}

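/* RAS hooks exposed to the amdgpu RAS framework for UMC v8.7 */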
const struct amdgpu_ras_block_hw_ops umc_v8_7_ras_hw_ops = {
        .query_ras_error_count = umc_v8_7_query_ras_error_count,
        .query_ras_error_address = umc_v8_7_query_ras_error_address,
};

struct amdgpu_umc_ras umc_v8_7_ras = {
        .ras_block = {
                .hw_ops = &umc_v8_7_ras_hw_ops,
        },
        .err_cnt_init = umc_v8_7_err_cnt_init,
        .ecc_info_query_ras_error_count = umc_v8_7_ecc_info_query_ras_error_count,
        .ecc_info_query_ras_error_address = umc_v8_7_ecc_info_query_ras_error_address,
};