]> Git Repo - linux.git/blob - drivers/gpu/drm/amd/amdgpu/umc_v12_0.c
Merge tag 'io_uring-6.9-20240322' of git://git.kernel.dk/linux
[linux.git] / drivers / gpu / drm / amd / amdgpu / umc_v12_0.c
1 /*
2  * Copyright 2023 Advanced Micro Devices, Inc.
3  *
4  * Permission is hereby granted, free of charge, to any person obtaining a
5  * copy of this software and associated documentation files (the "Software"),
6  * to deal in the Software without restriction, including without limitation
7  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8  * and/or sell copies of the Software, and to permit persons to whom the
9  * Software is furnished to do so, subject to the following conditions:
10  *
11  * The above copyright notice and this permission notice shall be included in
12  * all copies or substantial portions of the Software.
13  *
14  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
15  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
16  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
17  * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR
18  * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
19  * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
20  * OTHER DEALINGS IN THE SOFTWARE.
21  *
22  */
23 #include "umc_v12_0.h"
24 #include "amdgpu_ras.h"
25 #include "amdgpu_umc.h"
26 #include "amdgpu.h"
27 #include "umc/umc_12_0_0_offset.h"
28 #include "umc/umc_12_0_0_sh_mask.h"
29 #include "mp/mp_13_0_6_sh_mask.h"
30
31 const uint32_t
32         umc_v12_0_channel_idx_tbl[]
33                         [UMC_V12_0_UMC_INSTANCE_NUM]
34                         [UMC_V12_0_CHANNEL_INSTANCE_NUM] = {
35                 {{3,   7,   11,  15,  2,   6,   10,  14},  {1,   5,   9,   13,  0,   4,   8,   12},
36                  {19,  23,  27,  31,  18,  22,  26,  30},  {17,  21,  25,  29,  16,  20,  24,  28}},
37                 {{47,  43,  39,  35,  46,  42,  38,  34},  {45,  41,  37,  33,  44,  40,  36,  32},
38                  {63,  59,  55,  51,  62,  58,  54,  50},  {61,  57,  53,  49,  60,  56,  52,  48}},
39                 {{79,  75,  71,  67,  78,  74,  70,  66},  {77,  73,  69,  65,  76,  72,  68,  64},
40                  {95,  91,  87,  83,  94,  90,  86,  82},  {93,  89,  85,  81,  92,  88,  84,  80}},
41                 {{99,  103, 107, 111, 98,  102, 106, 110}, {97,  101, 105, 109, 96,  100, 104, 108},
42                  {115, 119, 123, 127, 114, 118, 122, 126}, {113, 117, 121, 125, 112, 116, 120, 124}}
43         };
44
/*
 * Bit mapping from MCA error address to normalized address: entry i is the
 * normalized-address bit position that MCA error address bit i is moved to.
 */
static const uint32_t umc_v12_0_ma2na_mapping[] = {
	/* bits  0..3  */ 0,  5,  6,  8,
	/* bits  4..7  */ 9,  14, 12, 13,
	/* bits  8..11 */ 10, 11, 15, 16,
	/* bits 12..15 */ 17, 18, 19, 20,
	/* bits 16..19 */ 21, 22, 23, 24,
	/* bits 20..23 */ 25, 26, 27, 28,
	/* bits 24..27 */ 24, 7,  29, 30,
};
52
53 static inline uint64_t get_umc_v12_0_reg_offset(struct amdgpu_device *adev,
54                                             uint32_t node_inst,
55                                             uint32_t umc_inst,
56                                             uint32_t ch_inst)
57 {
58         uint32_t index = umc_inst * adev->umc.channel_inst_num + ch_inst;
59         uint64_t cross_node_offset = (node_inst == 0) ? 0 : UMC_V12_0_CROSS_NODE_OFFSET;
60
61         umc_inst = index / 4;
62         ch_inst = index % 4;
63
64         return adev->umc.channel_offs * ch_inst + UMC_V12_0_INST_DIST * umc_inst +
65                 UMC_V12_0_NODE_DIST * node_inst + cross_node_offset;
66 }
67
68 static int umc_v12_0_reset_error_count_per_channel(struct amdgpu_device *adev,
69                                         uint32_t node_inst, uint32_t umc_inst,
70                                         uint32_t ch_inst, void *data)
71 {
72         uint64_t odecc_err_cnt_addr;
73         uint64_t umc_reg_offset =
74                 get_umc_v12_0_reg_offset(adev, node_inst, umc_inst, ch_inst);
75
76         odecc_err_cnt_addr =
77                 SOC15_REG_OFFSET(UMC, 0, regUMCCH0_OdEccErrCnt);
78
79         /* clear error count */
80         WREG32_PCIE_EXT((odecc_err_cnt_addr + umc_reg_offset) * 4,
81                         UMC_V12_0_CE_CNT_INIT);
82
83         return 0;
84 }
85
86 static void umc_v12_0_reset_error_count(struct amdgpu_device *adev)
87 {
88         amdgpu_umc_loop_channels(adev,
89                 umc_v12_0_reset_error_count_per_channel, NULL);
90 }
91
92 bool umc_v12_0_is_deferred_error(struct amdgpu_device *adev, uint64_t mc_umc_status)
93 {
94         dev_info(adev->dev,
95                 "MCA_UMC_STATUS(0x%llx): Val:%llu, Poison:%llu, Deferred:%llu, PCC:%llu, UC:%llu, TCC:%llu\n",
96                 mc_umc_status,
97                 REG_GET_FIELD(mc_umc_status, MCA_UMC_UMC0_MCUMC_STATUST0, Val),
98                 REG_GET_FIELD(mc_umc_status, MCA_UMC_UMC0_MCUMC_STATUST0, Poison),
99                 REG_GET_FIELD(mc_umc_status, MCA_UMC_UMC0_MCUMC_STATUST0, Deferred),
100                 REG_GET_FIELD(mc_umc_status, MCA_UMC_UMC0_MCUMC_STATUST0, PCC),
101                 REG_GET_FIELD(mc_umc_status, MCA_UMC_UMC0_MCUMC_STATUST0, UC),
102                 REG_GET_FIELD(mc_umc_status, MCA_UMC_UMC0_MCUMC_STATUST0, TCC)
103         );
104
105         return (amdgpu_ras_is_poison_mode_supported(adev) &&
106                 (REG_GET_FIELD(mc_umc_status, MCA_UMC_UMC0_MCUMC_STATUST0, Val) == 1) &&
107                 (REG_GET_FIELD(mc_umc_status, MCA_UMC_UMC0_MCUMC_STATUST0, Deferred) == 1));
108 }
109
110 bool umc_v12_0_is_uncorrectable_error(struct amdgpu_device *adev, uint64_t mc_umc_status)
111 {
112         if (umc_v12_0_is_deferred_error(adev, mc_umc_status))
113                 return false;
114
115         return ((REG_GET_FIELD(mc_umc_status, MCA_UMC_UMC0_MCUMC_STATUST0, Val) == 1) &&
116                 (REG_GET_FIELD(mc_umc_status, MCA_UMC_UMC0_MCUMC_STATUST0, PCC) == 1 ||
117                 REG_GET_FIELD(mc_umc_status, MCA_UMC_UMC0_MCUMC_STATUST0, UC) == 1 ||
118                 REG_GET_FIELD(mc_umc_status, MCA_UMC_UMC0_MCUMC_STATUST0, TCC) == 1));
119 }
120
121 bool umc_v12_0_is_correctable_error(struct amdgpu_device *adev, uint64_t mc_umc_status)
122 {
123         if (umc_v12_0_is_deferred_error(adev, mc_umc_status))
124                 return false;
125
126         return (REG_GET_FIELD(mc_umc_status, MCA_UMC_UMC0_MCUMC_STATUST0, Val) == 1 &&
127                 (REG_GET_FIELD(mc_umc_status, MCA_UMC_UMC0_MCUMC_STATUST0, CECC) == 1 ||
128                 (REG_GET_FIELD(mc_umc_status, MCA_UMC_UMC0_MCUMC_STATUST0, UECC) == 1 &&
129                 REG_GET_FIELD(mc_umc_status, MCA_UMC_UMC0_MCUMC_STATUST0, UC) == 0) ||
130                 /* Identify data parity error in replay mode */
131                 ((REG_GET_FIELD(mc_umc_status, MCA_UMC_UMC0_MCUMC_STATUST0, ErrorCodeExt) == 0x5 ||
132                 REG_GET_FIELD(mc_umc_status, MCA_UMC_UMC0_MCUMC_STATUST0, ErrorCodeExt) == 0xb) &&
133                 !(umc_v12_0_is_uncorrectable_error(adev, mc_umc_status)))));
134 }
135
136 static void umc_v12_0_query_error_count_per_type(struct amdgpu_device *adev,
137                                                    uint64_t umc_reg_offset,
138                                                    unsigned long *error_count,
139                                                    check_error_type_func error_type_func)
140 {
141         uint64_t mc_umc_status;
142         uint64_t mc_umc_status_addr;
143
144         mc_umc_status_addr =
145                 SOC15_REG_OFFSET(UMC, 0, regMCA_UMC_UMC0_MCUMC_STATUST0);
146
147         /* Check MCUMC_STATUS */
148         mc_umc_status =
149                 RREG64_PCIE_EXT((mc_umc_status_addr + umc_reg_offset) * 4);
150
151         if (error_type_func(adev, mc_umc_status))
152                 *error_count += 1;
153 }
154
155 static int umc_v12_0_query_error_count(struct amdgpu_device *adev,
156                                         uint32_t node_inst, uint32_t umc_inst,
157                                         uint32_t ch_inst, void *data)
158 {
159         struct ras_err_data *err_data = (struct ras_err_data *)data;
160         unsigned long ue_count = 0, ce_count = 0, de_count = 0;
161
162         /* NOTE: node_inst is converted by adev->umc.active_mask and the range is [0-3],
163          * which can be used as die ID directly */
164         struct amdgpu_smuio_mcm_config_info mcm_info = {
165                 .socket_id = adev->smuio.funcs->get_socket_id(adev),
166                 .die_id = node_inst,
167         };
168
169         uint64_t umc_reg_offset =
170                 get_umc_v12_0_reg_offset(adev, node_inst, umc_inst, ch_inst);
171
172         umc_v12_0_query_error_count_per_type(adev, umc_reg_offset,
173                                             &ce_count, umc_v12_0_is_correctable_error);
174         umc_v12_0_query_error_count_per_type(adev, umc_reg_offset,
175                                             &ue_count, umc_v12_0_is_uncorrectable_error);
176         umc_v12_0_query_error_count_per_type(adev, umc_reg_offset,
177                                             &de_count, umc_v12_0_is_deferred_error);
178
179         amdgpu_ras_error_statistic_ue_count(err_data, &mcm_info, NULL, ue_count);
180         amdgpu_ras_error_statistic_ce_count(err_data, &mcm_info, NULL, ce_count);
181         amdgpu_ras_error_statistic_de_count(err_data, &mcm_info, NULL, de_count);
182
183         return 0;
184 }
185
186 static void umc_v12_0_query_ras_error_count(struct amdgpu_device *adev,
187                                            void *ras_error_status)
188 {
189         amdgpu_umc_loop_channels(adev,
190                 umc_v12_0_query_error_count, ras_error_status);
191
192         umc_v12_0_reset_error_count(adev);
193 }
194
/*
 * XOR all 32 bits of @val together, i.e. return true when an odd number of
 * bits is set.
 */
static bool umc_v12_0_bit_wise_xor(uint32_t val)
{
	/* fold the word onto itself until bit 0 holds the XOR of every bit */
	val ^= val >> 16;
	val ^= val >> 8;
	val ^= val >> 4;
	val ^= val >> 2;
	val ^= val >> 1;

	return val & 0x1;
}
205
206 static void umc_v12_0_mca_addr_to_pa(struct amdgpu_device *adev,
207                                         uint64_t err_addr, uint32_t ch_inst, uint32_t umc_inst,
208                                         uint32_t node_inst,
209                                         struct ta_ras_query_address_output *addr_out)
210 {
211         uint32_t channel_index, i;
212         uint64_t na, soc_pa;
213         uint32_t bank_hash0, bank_hash1, bank_hash2, bank_hash3, col, row;
214         uint32_t bank0, bank1, bank2, bank3, bank;
215
216         bank_hash0 = (err_addr >> UMC_V12_0_MCA_B0_BIT) & 0x1ULL;
217         bank_hash1 = (err_addr >> UMC_V12_0_MCA_B1_BIT) & 0x1ULL;
218         bank_hash2 = (err_addr >> UMC_V12_0_MCA_B2_BIT) & 0x1ULL;
219         bank_hash3 = (err_addr >> UMC_V12_0_MCA_B3_BIT) & 0x1ULL;
220         col = (err_addr >> 1) & 0x1fULL;
221         row = (err_addr >> 10) & 0x3fffULL;
222
223         /* apply bank hash algorithm */
224         bank0 =
225                 bank_hash0 ^ (UMC_V12_0_XOR_EN0 &
226                 (umc_v12_0_bit_wise_xor(col & UMC_V12_0_COL_XOR0) ^
227                 (umc_v12_0_bit_wise_xor(row & UMC_V12_0_ROW_XOR0))));
228         bank1 =
229                 bank_hash1 ^ (UMC_V12_0_XOR_EN1 &
230                 (umc_v12_0_bit_wise_xor(col & UMC_V12_0_COL_XOR1) ^
231                 (umc_v12_0_bit_wise_xor(row & UMC_V12_0_ROW_XOR1))));
232         bank2 =
233                 bank_hash2 ^ (UMC_V12_0_XOR_EN2 &
234                 (umc_v12_0_bit_wise_xor(col & UMC_V12_0_COL_XOR2) ^
235                 (umc_v12_0_bit_wise_xor(row & UMC_V12_0_ROW_XOR2))));
236         bank3 =
237                 bank_hash3 ^ (UMC_V12_0_XOR_EN3 &
238                 (umc_v12_0_bit_wise_xor(col & UMC_V12_0_COL_XOR3) ^
239                 (umc_v12_0_bit_wise_xor(row & UMC_V12_0_ROW_XOR3))));
240
241         bank = bank0 | (bank1 << 1) | (bank2 << 2) | (bank3 << 3);
242         err_addr &= ~0x3c0ULL;
243         err_addr |= (bank << UMC_V12_0_MCA_B0_BIT);
244
245         na = 0x0;
246         /* convert mca error address to normalized address */
247         for (i = 1; i < ARRAY_SIZE(umc_v12_0_ma2na_mapping); i++)
248                 na |= ((err_addr >> i) & 0x1ULL) << umc_v12_0_ma2na_mapping[i];
249
250         channel_index =
251                 adev->umc.channel_idx_tbl[node_inst * adev->umc.umc_inst_num *
252                         adev->umc.channel_inst_num +
253                         umc_inst * adev->umc.channel_inst_num +
254                         ch_inst];
255         /* translate umc channel address to soc pa, 3 parts are included */
256         soc_pa = ADDR_OF_32KB_BLOCK(na) |
257                 ADDR_OF_256B_BLOCK(channel_index) |
258                 OFFSET_IN_256B_BLOCK(na);
259
260         /* the umc channel bits are not original values, they are hashed */
261         UMC_V12_0_SET_CHANNEL_HASH(channel_index, soc_pa);
262
263         addr_out->pa.pa = soc_pa;
264         addr_out->pa.bank = bank;
265         addr_out->pa.channel_idx = channel_index;
266 }
267
268 static void umc_v12_0_convert_error_address(struct amdgpu_device *adev,
269                                             struct ras_err_data *err_data, uint64_t err_addr,
270                                             uint32_t ch_inst, uint32_t umc_inst,
271                                             uint32_t node_inst)
272 {
273         uint32_t col, row, row_xor, bank, channel_index;
274         uint64_t soc_pa, retired_page, column;
275         struct ta_ras_query_address_input addr_in;
276         struct ta_ras_query_address_output addr_out;
277
278         addr_in.addr_type = TA_RAS_MCA_TO_PA;
279         addr_in.ma.err_addr = err_addr;
280         addr_in.ma.ch_inst = ch_inst;
281         addr_in.ma.umc_inst = umc_inst;
282         addr_in.ma.node_inst = node_inst;
283
284         if (psp_ras_query_address(&adev->psp, &addr_in, &addr_out))
285                 /* fallback to old path if fail to get pa from psp */
286                 umc_v12_0_mca_addr_to_pa(adev, err_addr, ch_inst, umc_inst,
287                                 node_inst, &addr_out);
288
289         soc_pa = addr_out.pa.pa;
290         bank = addr_out.pa.bank;
291         channel_index = addr_out.pa.channel_idx;
292
293         col = (err_addr >> 1) & 0x1fULL;
294         row = (err_addr >> 10) & 0x3fffULL;
295         row_xor = row ^ (0x1ULL << 13);
296         /* clear [C3 C2] in soc physical address */
297         soc_pa &= ~(0x3ULL << UMC_V12_0_PA_C2_BIT);
298         /* clear [C4] in soc physical address */
299         soc_pa &= ~(0x1ULL << UMC_V12_0_PA_C4_BIT);
300
301         /* loop for all possibilities of [C4 C3 C2] */
302         for (column = 0; column < UMC_V12_0_NA_MAP_PA_NUM; column++) {
303                 retired_page = soc_pa | ((column & 0x3) << UMC_V12_0_PA_C2_BIT);
304                 retired_page |= (((column & 0x4) >> 2) << UMC_V12_0_PA_C4_BIT);
305                 /* include column bit 0 and 1 */
306                 col &= 0x3;
307                 col |= (column << 2);
308                 dev_info(adev->dev,
309                         "Error Address(PA):0x%-10llx Row:0x%-4x Col:0x%-2x Bank:0x%x Channel:0x%x\n",
310                         retired_page, row, col, bank, channel_index);
311                 amdgpu_umc_fill_error_record(err_data, err_addr,
312                         retired_page, channel_index, umc_inst);
313
314                 /* shift R13 bit */
315                 retired_page ^= (0x1ULL << UMC_V12_0_PA_R13_BIT);
316                 dev_info(adev->dev,
317                         "Error Address(PA):0x%-10llx Row:0x%-4x Col:0x%-2x Bank:0x%x Channel:0x%x\n",
318                         retired_page, row_xor, col, bank, channel_index);
319                 amdgpu_umc_fill_error_record(err_data, err_addr,
320                         retired_page, channel_index, umc_inst);
321         }
322 }
323
324 static int umc_v12_0_query_error_address(struct amdgpu_device *adev,
325                                         uint32_t node_inst, uint32_t umc_inst,
326                                         uint32_t ch_inst, void *data)
327 {
328         uint64_t mc_umc_status_addr;
329         uint64_t mc_umc_status, err_addr;
330         uint64_t mc_umc_addrt0;
331         struct ras_err_data *err_data = (struct ras_err_data *)data;
332         uint64_t umc_reg_offset =
333                 get_umc_v12_0_reg_offset(adev, node_inst, umc_inst, ch_inst);
334
335         mc_umc_status_addr =
336                 SOC15_REG_OFFSET(UMC, 0, regMCA_UMC_UMC0_MCUMC_STATUST0);
337
338         mc_umc_status = RREG64_PCIE_EXT((mc_umc_status_addr + umc_reg_offset) * 4);
339
340         if (mc_umc_status == 0)
341                 return 0;
342
343         if (!err_data->err_addr) {
344                 /* clear umc status */
345                 WREG64_PCIE_EXT((mc_umc_status_addr + umc_reg_offset) * 4, 0x0ULL);
346
347                 return 0;
348         }
349
350         /* calculate error address if ue error is detected */
351         if (umc_v12_0_is_uncorrectable_error(adev, mc_umc_status) ||
352             umc_v12_0_is_deferred_error(adev, mc_umc_status)) {
353                 mc_umc_addrt0 =
354                         SOC15_REG_OFFSET(UMC, 0, regMCA_UMC_UMC0_MCUMC_ADDRT0);
355
356                 err_addr = RREG64_PCIE_EXT((mc_umc_addrt0 + umc_reg_offset) * 4);
357
358                 err_addr = REG_GET_FIELD(err_addr, MCA_UMC_UMC0_MCUMC_ADDRT0, ErrorAddr);
359
360                 umc_v12_0_convert_error_address(adev, err_data, err_addr,
361                                         ch_inst, umc_inst, node_inst);
362         }
363
364         /* clear umc status */
365         WREG64_PCIE_EXT((mc_umc_status_addr + umc_reg_offset) * 4, 0x0ULL);
366
367         return 0;
368 }
369
370 static void umc_v12_0_query_ras_error_address(struct amdgpu_device *adev,
371                                              void *ras_error_status)
372 {
373         amdgpu_umc_loop_channels(adev,
374                 umc_v12_0_query_error_address, ras_error_status);
375 }
376
377 static int umc_v12_0_err_cnt_init_per_channel(struct amdgpu_device *adev,
378                                         uint32_t node_inst, uint32_t umc_inst,
379                                         uint32_t ch_inst, void *data)
380 {
381         uint32_t odecc_cnt_sel;
382         uint64_t odecc_cnt_sel_addr, odecc_err_cnt_addr;
383         uint64_t umc_reg_offset =
384                 get_umc_v12_0_reg_offset(adev, node_inst, umc_inst, ch_inst);
385
386         odecc_cnt_sel_addr =
387                 SOC15_REG_OFFSET(UMC, 0, regUMCCH0_OdEccCntSel);
388         odecc_err_cnt_addr =
389                 SOC15_REG_OFFSET(UMC, 0, regUMCCH0_OdEccErrCnt);
390
391         odecc_cnt_sel = RREG32_PCIE_EXT((odecc_cnt_sel_addr + umc_reg_offset) * 4);
392
393         /* set ce error interrupt type to APIC based interrupt */
394         odecc_cnt_sel = REG_SET_FIELD(odecc_cnt_sel, UMCCH0_OdEccCntSel,
395                                         OdEccErrInt, 0x1);
396         WREG32_PCIE_EXT((odecc_cnt_sel_addr + umc_reg_offset) * 4, odecc_cnt_sel);
397
398         /* set error count to initial value */
399         WREG32_PCIE_EXT((odecc_err_cnt_addr + umc_reg_offset) * 4, UMC_V12_0_CE_CNT_INIT);
400
401         return 0;
402 }
403
404 static void umc_v12_0_ecc_info_query_ras_error_count(struct amdgpu_device *adev,
405                                         void *ras_error_status)
406 {
407         amdgpu_mca_smu_log_ras_error(adev,
408                 AMDGPU_RAS_BLOCK__UMC, AMDGPU_MCA_ERROR_TYPE_CE, ras_error_status);
409         amdgpu_mca_smu_log_ras_error(adev,
410                 AMDGPU_RAS_BLOCK__UMC, AMDGPU_MCA_ERROR_TYPE_UE, ras_error_status);
411 }
412
413 static void umc_v12_0_ecc_info_query_ras_error_address(struct amdgpu_device *adev,
414                                         void *ras_error_status)
415 {
416         struct ras_err_node *err_node;
417         uint64_t mc_umc_status;
418         struct ras_err_info *err_info;
419         struct ras_err_addr *mca_err_addr, *tmp;
420         struct ras_err_data *err_data = (struct ras_err_data *)ras_error_status;
421
422         for_each_ras_error(err_node, err_data) {
423                 err_info = &err_node->err_info;
424                 if (list_empty(&err_info->err_addr_list))
425                         continue;
426
427                 list_for_each_entry_safe(mca_err_addr, tmp, &err_info->err_addr_list, node) {
428                         mc_umc_status = mca_err_addr->err_status;
429                         if (mc_umc_status &&
430                                 (umc_v12_0_is_uncorrectable_error(adev, mc_umc_status) ||
431                                  umc_v12_0_is_deferred_error(adev, mc_umc_status))) {
432                                 uint64_t mca_addr, err_addr, mca_ipid;
433                                 uint32_t InstanceIdLo;
434
435                                 mca_addr = mca_err_addr->err_addr;
436                                 mca_ipid = mca_err_addr->err_ipid;
437
438                                 err_addr = REG_GET_FIELD(mca_addr,
439                                                         MCA_UMC_UMC0_MCUMC_ADDRT0, ErrorAddr);
440                                 InstanceIdLo = REG_GET_FIELD(mca_ipid, MCMP1_IPIDT0, InstanceIdLo);
441
442                                 dev_info(adev->dev, "UMC:IPID:0x%llx, aid:%d, inst:%d, ch:%d, err_addr:0x%llx\n",
443                                         mca_ipid,
444                                         err_info->mcm_info.die_id,
445                                         MCA_IPID_LO_2_UMC_INST(InstanceIdLo),
446                                         MCA_IPID_LO_2_UMC_CH(InstanceIdLo),
447                                         err_addr);
448
449                                 umc_v12_0_convert_error_address(adev,
450                                         err_data, err_addr,
451                                         MCA_IPID_LO_2_UMC_CH(InstanceIdLo),
452                                         MCA_IPID_LO_2_UMC_INST(InstanceIdLo),
453                                         err_info->mcm_info.die_id);
454                         }
455
456                         /* Delete error address node from list and free memory */
457                         amdgpu_ras_del_mca_err_addr(err_info, mca_err_addr);
458                 }
459         }
460 }
461
462 static bool umc_v12_0_check_ecc_err_status(struct amdgpu_device *adev,
463                         enum amdgpu_mca_error_type type, void *ras_error_status)
464 {
465         uint64_t mc_umc_status = *(uint64_t *)ras_error_status;
466
467         switch (type) {
468         case AMDGPU_MCA_ERROR_TYPE_UE:
469                 return umc_v12_0_is_uncorrectable_error(adev, mc_umc_status);
470         case AMDGPU_MCA_ERROR_TYPE_CE:
471                 return umc_v12_0_is_correctable_error(adev, mc_umc_status);
472         case AMDGPU_MCA_ERROR_TYPE_DE:
473                 return umc_v12_0_is_deferred_error(adev, mc_umc_status);
474         default:
475                 return false;
476         }
477
478         return false;
479 }
480
481 static void umc_v12_0_err_cnt_init(struct amdgpu_device *adev)
482 {
483         amdgpu_umc_loop_channels(adev,
484                 umc_v12_0_err_cnt_init_per_channel, NULL);
485 }
486
/*
 * Report whether poison mode is enabled.
 *
 * regUMCCH0_EccCtrl is not accessible from host side, so the answer cannot
 * be read back and true is returned unconditionally.
 */
static bool umc_v12_0_query_ras_poison_mode(struct amdgpu_device *adev)
{
	return true;
}
495
496 const struct amdgpu_ras_block_hw_ops umc_v12_0_ras_hw_ops = {
497         .query_ras_error_count = umc_v12_0_query_ras_error_count,
498         .query_ras_error_address = umc_v12_0_query_ras_error_address,
499 };
500
501 static int umc_v12_0_aca_bank_generate_report(struct aca_handle *handle, struct aca_bank *bank, enum aca_error_type type,
502                                               struct aca_bank_report *report, void *data)
503 {
504         struct amdgpu_device *adev = handle->adev;
505         u64 status;
506         int ret;
507
508         ret = aca_bank_info_decode(bank, &report->info);
509         if (ret)
510                 return ret;
511
512         status = bank->regs[ACA_REG_IDX_STATUS];
513         switch (type) {
514         case ACA_ERROR_TYPE_UE:
515                 if (umc_v12_0_is_uncorrectable_error(adev, status)) {
516                         report->count[type] = 1;
517                 }
518                 break;
519         case ACA_ERROR_TYPE_CE:
520                 if (umc_v12_0_is_correctable_error(adev, status)) {
521                         report->count[type] = 1;
522                 }
523                 break;
524         default:
525                 return -EINVAL;
526         }
527
528         return 0;
529 }
530
531 static const struct aca_bank_ops umc_v12_0_aca_bank_ops = {
532         .aca_bank_generate_report = umc_v12_0_aca_bank_generate_report,
533 };
534
535 const struct aca_info umc_v12_0_aca_info = {
536         .hwip = ACA_HWIP_TYPE_UMC,
537         .mask = ACA_ERROR_UE_MASK | ACA_ERROR_CE_MASK,
538         .bank_ops = &umc_v12_0_aca_bank_ops,
539 };
540
541 static int umc_v12_0_ras_late_init(struct amdgpu_device *adev, struct ras_common_if *ras_block)
542 {
543         int ret;
544
545         ret = amdgpu_umc_ras_late_init(adev, ras_block);
546         if (ret)
547                 return ret;
548
549         ret = amdgpu_ras_bind_aca(adev, AMDGPU_RAS_BLOCK__UMC,
550                                   &umc_v12_0_aca_info, NULL);
551         if (ret)
552                 return ret;
553
554         return 0;
555 }
556
557 struct amdgpu_umc_ras umc_v12_0_ras = {
558         .ras_block = {
559                 .hw_ops = &umc_v12_0_ras_hw_ops,
560                 .ras_late_init = umc_v12_0_ras_late_init,
561         },
562         .err_cnt_init = umc_v12_0_err_cnt_init,
563         .query_ras_poison_mode = umc_v12_0_query_ras_poison_mode,
564         .ecc_info_query_ras_error_count = umc_v12_0_ecc_info_query_ras_error_count,
565         .ecc_info_query_ras_error_address = umc_v12_0_ecc_info_query_ras_error_address,
566         .check_ecc_err_status = umc_v12_0_check_ecc_err_status,
567 };
568
This page took 0.069318 seconds and 4 git commands to generate.