]>
Commit | Line | Data |
---|---|---|
09c434b8 | 1 | // SPDX-License-Identifier: GPL-2.0-only |
1da177e4 LT |
2 | /* |
3 | * File: mca_drv.c | |
4 | * Purpose: Generic MCA handling layer | |
5 | * | |
6 | * Copyright (C) 2004 FUJITSU LIMITED | |
fe77efb8 | 7 | * Copyright (C) 2004 Hidetoshi Seto <[email protected]> |
7f613c7d KO |
8 | * Copyright (C) 2005 Silicon Graphics, Inc |
9 | * Copyright (C) 2005 Keith Owens <[email protected]> | |
d2a28ad9 | 10 | * Copyright (C) 2006 Russ Anderson <[email protected]> |
1da177e4 | 11 | */ |
1da177e4 LT |
12 | #include <linux/types.h> |
13 | #include <linux/init.h> | |
14 | #include <linux/sched.h> | |
15 | #include <linux/interrupt.h> | |
16 | #include <linux/irq.h> | |
17 | #include <linux/kallsyms.h> | |
57c8a661 | 18 | #include <linux/memblock.h> |
1da177e4 LT |
19 | #include <linux/acpi.h> |
20 | #include <linux/timer.h> | |
21 | #include <linux/module.h> | |
22 | #include <linux/kernel.h> | |
23 | #include <linux/smp.h> | |
24 | #include <linux/workqueue.h> | |
25 | #include <linux/mm.h> | |
5a0e3ad6 | 26 | #include <linux/slab.h> |
1da177e4 LT |
27 | |
28 | #include <asm/delay.h> | |
1da177e4 LT |
29 | #include <asm/page.h> |
30 | #include <asm/ptrace.h> | |
1da177e4 LT |
31 | #include <asm/sal.h> |
32 | #include <asm/mca.h> | |
33 | ||
34 | #include <asm/irq.h> | |
35 | #include <asm/hw_irq.h> | |
36 | ||
37 | #include "mca_drv.h" | |
38 | ||
39 | /* max size of SAL error record (default) */ | |
40 | static int sal_rec_max = 10000; | |
41 | ||
1da177e4 LT |
42 | /* from mca_drv_asm.S */ |
43 | extern void *mca_handler_bhhook(void); | |
44 | ||
45 | static DEFINE_SPINLOCK(mca_bh_lock); | |
46 | ||
47 | typedef enum { | |
48 | MCA_IS_LOCAL = 0, | |
49 | MCA_IS_GLOBAL = 1 | |
50 | } mca_type_t; | |
51 | ||
52 | #define MAX_PAGE_ISOLATE 1024 | |
53 | ||
54 | static struct page *page_isolate[MAX_PAGE_ISOLATE]; | |
55 | static int num_page_isolate = 0; | |
56 | ||
57 | typedef enum { | |
4881e2cd HS |
58 | ISOLATE_NG, |
59 | ISOLATE_OK, | |
60 | ISOLATE_NONE | |
1da177e4 LT |
61 | } isolate_status_t; |
62 | ||
18997961 RA |
63 | typedef enum { |
64 | MCA_NOT_RECOVERED = 0, | |
65 | MCA_RECOVERED = 1 | |
66 | } recovery_status_t; | |
67 | ||
1da177e4 LT |
68 | /* |
69 | * This pool keeps pointers to the section part of SAL error record | |
70 | */ | |
71 | static struct { | |
72 | slidx_list_t *buffer; /* section pointer list pool */ | |
73 | int cur_idx; /* Current index of section pointer list pool */ | |
74 | int max_idx; /* Maximum index of section pointer list pool */ | |
75 | } slidx_pool; | |
76 | ||
18997961 RA |
77 | static int |
78 | fatal_mca(const char *fmt, ...) | |
79 | { | |
80 | va_list args; | |
43ed3baf | 81 | char buf[256]; |
18997961 RA |
82 | |
83 | va_start(args, fmt); | |
43ed3baf | 84 | vsnprintf(buf, sizeof(buf), fmt, args); |
18997961 | 85 | va_end(args); |
43ed3baf | 86 | ia64_mca_printk(KERN_ALERT "MCA: %s\n", buf); |
18997961 RA |
87 | |
88 | return MCA_NOT_RECOVERED; | |
89 | } | |
90 | ||
43ed3baf HS |
91 | static int |
92 | mca_recovered(const char *fmt, ...) | |
93 | { | |
94 | va_list args; | |
95 | char buf[256]; | |
96 | ||
97 | va_start(args, fmt); | |
98 | vsnprintf(buf, sizeof(buf), fmt, args); | |
99 | va_end(args); | |
100 | ia64_mca_printk(KERN_INFO "MCA: %s\n", buf); | |
101 | ||
102 | return MCA_RECOVERED; | |
103 | } | |
104 | ||
1da177e4 LT |
105 | /** |
106 | * mca_page_isolate - isolate a poisoned page in order not to use it later | |
107 | * @paddr: poisoned memory location | |
108 | * | |
109 | * Return value: | |
4881e2cd | 110 | * one of isolate_status_t, ISOLATE_OK/NG/NONE. |
1da177e4 LT |
111 | */ |
112 | ||
113 | static isolate_status_t | |
114 | mca_page_isolate(unsigned long paddr) | |
115 | { | |
116 | int i; | |
117 | struct page *p; | |
118 | ||
119 | /* whether physical address is valid or not */ | |
20305e59 | 120 | if (!ia64_phys_addr_valid(paddr)) |
4881e2cd HS |
121 | return ISOLATE_NONE; |
122 | ||
56f87b82 | 123 | if (!pfn_valid(paddr >> PAGE_SHIFT)) |
4881e2cd | 124 | return ISOLATE_NONE; |
1da177e4 LT |
125 | |
126 | /* convert physical address to physical page number */ | |
127 | p = pfn_to_page(paddr>>PAGE_SHIFT); | |
128 | ||
129 | /* check whether a page number have been already registered or not */ | |
20305e59 HS |
130 | for (i = 0; i < num_page_isolate; i++) |
131 | if (page_isolate[i] == p) | |
1da177e4 LT |
132 | return ISOLATE_OK; /* already listed */ |
133 | ||
134 | /* limitation check */ | |
20305e59 | 135 | if (num_page_isolate == MAX_PAGE_ISOLATE) |
1da177e4 LT |
136 | return ISOLATE_NG; |
137 | ||
138 | /* kick pages having attribute 'SLAB' or 'Reserved' */ | |
20305e59 | 139 | if (PageSlab(p) || PageReserved(p)) |
1da177e4 LT |
140 | return ISOLATE_NG; |
141 | ||
142 | /* add attribute 'Reserved' and register the page */ | |
cbb92144 | 143 | get_page(p); |
1da177e4 LT |
144 | SetPageReserved(p); |
145 | page_isolate[num_page_isolate++] = p; | |
146 | ||
147 | return ISOLATE_OK; | |
148 | } | |
149 | ||
150 | /** | |
151 | * mca_hanlder_bh - Kill the process which occurred memory read error | |
152 | * @paddr: poisoned address received from MCA Handler | |
153 | */ | |
154 | ||
155 | void | |
d2a28ad9 | 156 | mca_handler_bh(unsigned long paddr, void *iip, unsigned long ipsr) |
1da177e4 | 157 | { |
43ed3baf | 158 | ia64_mlogbuf_dump(); |
d2a28ad9 RA |
159 | printk(KERN_ERR "OS_MCA: process [cpu %d, pid: %d, uid: %d, " |
160 | "iip: %p, psr: 0x%lx,paddr: 0x%lx](%s) encounters MCA.\n", | |
6c1ee033 EB |
161 | raw_smp_processor_id(), current->pid, |
162 | from_kuid(&init_user_ns, current_uid()), | |
d2a28ad9 | 163 | iip, ipsr, paddr, current->comm); |
1da177e4 LT |
164 | |
165 | spin_lock(&mca_bh_lock); | |
4881e2cd HS |
166 | switch (mca_page_isolate(paddr)) { |
167 | case ISOLATE_OK: | |
1da177e4 | 168 | printk(KERN_DEBUG "Page isolation: ( %lx ) success.\n", paddr); |
4881e2cd HS |
169 | break; |
170 | case ISOLATE_NG: | |
ea0e92a6 | 171 | printk(KERN_CRIT "Page isolation: ( %lx ) failure.\n", paddr); |
4881e2cd HS |
172 | break; |
173 | default: | |
174 | break; | |
1da177e4 LT |
175 | } |
176 | spin_unlock(&mca_bh_lock); | |
177 | ||
178 | /* This process is about to be killed itself */ | |
b1b901c2 | 179 | do_exit(SIGKILL); |
1da177e4 LT |
180 | } |
181 | ||
182 | /** | |
183 | * mca_make_peidx - Make index of processor error section | |
184 | * @slpi: pointer to record of processor error section | |
185 | * @peidx: pointer to index of processor error section | |
186 | */ | |
187 | ||
20305e59 | 188 | static void |
1da177e4 LT |
189 | mca_make_peidx(sal_log_processor_info_t *slpi, peidx_table_t *peidx) |
190 | { | |
20305e59 | 191 | /* |
1da177e4 LT |
192 | * calculate the start address of |
193 | * "struct cpuid_info" and "sal_processor_static_info_t". | |
194 | */ | |
195 | u64 total_check_num = slpi->valid.num_cache_check | |
196 | + slpi->valid.num_tlb_check | |
197 | + slpi->valid.num_bus_check | |
198 | + slpi->valid.num_reg_file_check | |
199 | + slpi->valid.num_ms_check; | |
200 | u64 head_size = sizeof(sal_log_mod_error_info_t) * total_check_num | |
201 | + sizeof(sal_log_processor_info_t); | |
202 | u64 mid_size = slpi->valid.cpuid_info * sizeof(struct sal_cpuid_info); | |
203 | ||
204 | peidx_head(peidx) = slpi; | |
205 | peidx_mid(peidx) = (struct sal_cpuid_info *) | |
206 | (slpi->valid.cpuid_info ? ((char*)slpi + head_size) : NULL); | |
207 | peidx_bottom(peidx) = (sal_processor_static_info_t *) | |
208 | (slpi->valid.psi_static_struct ? | |
209 | ((char*)slpi + head_size + mid_size) : NULL); | |
210 | } | |
211 | ||
212 | /** | |
20305e59 | 213 | * mca_make_slidx - Make index of SAL error record |
1da177e4 LT |
214 | * @buffer: pointer to SAL error record |
215 | * @slidx: pointer to index of SAL error record | |
216 | * | |
217 | * Return value: | |
218 | * 1 if record has platform error / 0 if not | |
219 | */ | |
220 | #define LOG_INDEX_ADD_SECT_PTR(sect, ptr) \ | |
20305e59 HS |
221 | {slidx_list_t *hl = &slidx_pool.buffer[slidx_pool.cur_idx]; \ |
222 | hl->hdr = ptr; \ | |
223 | list_add(&hl->list, &(sect)); \ | |
224 | slidx_pool.cur_idx = (slidx_pool.cur_idx + 1)%slidx_pool.max_idx; } | |
1da177e4 | 225 | |
20305e59 | 226 | static int |
1da177e4 LT |
227 | mca_make_slidx(void *buffer, slidx_table_t *slidx) |
228 | { | |
229 | int platform_err = 0; | |
230 | int record_len = ((sal_log_record_header_t*)buffer)->len; | |
231 | u32 ercd_pos; | |
232 | int sects; | |
233 | sal_log_section_hdr_t *sp; | |
234 | ||
235 | /* | |
236 | * Initialize index referring current record | |
237 | */ | |
238 | INIT_LIST_HEAD(&(slidx->proc_err)); | |
239 | INIT_LIST_HEAD(&(slidx->mem_dev_err)); | |
240 | INIT_LIST_HEAD(&(slidx->sel_dev_err)); | |
241 | INIT_LIST_HEAD(&(slidx->pci_bus_err)); | |
242 | INIT_LIST_HEAD(&(slidx->smbios_dev_err)); | |
243 | INIT_LIST_HEAD(&(slidx->pci_comp_err)); | |
244 | INIT_LIST_HEAD(&(slidx->plat_specific_err)); | |
245 | INIT_LIST_HEAD(&(slidx->host_ctlr_err)); | |
246 | INIT_LIST_HEAD(&(slidx->plat_bus_err)); | |
247 | INIT_LIST_HEAD(&(slidx->unsupported)); | |
248 | ||
249 | /* | |
250 | * Extract a Record Header | |
251 | */ | |
252 | slidx->header = buffer; | |
253 | ||
254 | /* | |
255 | * Extract each section records | |
256 | * (arranged from "int ia64_log_platform_info_print()") | |
257 | */ | |
258 | for (ercd_pos = sizeof(sal_log_record_header_t), sects = 0; | |
259 | ercd_pos < record_len; ercd_pos += sp->len, sects++) { | |
260 | sp = (sal_log_section_hdr_t *)((char*)buffer + ercd_pos); | |
261 | if (!efi_guidcmp(sp->guid, SAL_PROC_DEV_ERR_SECT_GUID)) { | |
262 | LOG_INDEX_ADD_SECT_PTR(slidx->proc_err, sp); | |
20305e59 HS |
263 | } else if (!efi_guidcmp(sp->guid, |
264 | SAL_PLAT_MEM_DEV_ERR_SECT_GUID)) { | |
1da177e4 LT |
265 | platform_err = 1; |
266 | LOG_INDEX_ADD_SECT_PTR(slidx->mem_dev_err, sp); | |
20305e59 HS |
267 | } else if (!efi_guidcmp(sp->guid, |
268 | SAL_PLAT_SEL_DEV_ERR_SECT_GUID)) { | |
1da177e4 LT |
269 | platform_err = 1; |
270 | LOG_INDEX_ADD_SECT_PTR(slidx->sel_dev_err, sp); | |
20305e59 HS |
271 | } else if (!efi_guidcmp(sp->guid, |
272 | SAL_PLAT_PCI_BUS_ERR_SECT_GUID)) { | |
1da177e4 LT |
273 | platform_err = 1; |
274 | LOG_INDEX_ADD_SECT_PTR(slidx->pci_bus_err, sp); | |
20305e59 HS |
275 | } else if (!efi_guidcmp(sp->guid, |
276 | SAL_PLAT_SMBIOS_DEV_ERR_SECT_GUID)) { | |
1da177e4 LT |
277 | platform_err = 1; |
278 | LOG_INDEX_ADD_SECT_PTR(slidx->smbios_dev_err, sp); | |
20305e59 HS |
279 | } else if (!efi_guidcmp(sp->guid, |
280 | SAL_PLAT_PCI_COMP_ERR_SECT_GUID)) { | |
1da177e4 LT |
281 | platform_err = 1; |
282 | LOG_INDEX_ADD_SECT_PTR(slidx->pci_comp_err, sp); | |
20305e59 HS |
283 | } else if (!efi_guidcmp(sp->guid, |
284 | SAL_PLAT_SPECIFIC_ERR_SECT_GUID)) { | |
1da177e4 LT |
285 | platform_err = 1; |
286 | LOG_INDEX_ADD_SECT_PTR(slidx->plat_specific_err, sp); | |
20305e59 HS |
287 | } else if (!efi_guidcmp(sp->guid, |
288 | SAL_PLAT_HOST_CTLR_ERR_SECT_GUID)) { | |
1da177e4 LT |
289 | platform_err = 1; |
290 | LOG_INDEX_ADD_SECT_PTR(slidx->host_ctlr_err, sp); | |
20305e59 HS |
291 | } else if (!efi_guidcmp(sp->guid, |
292 | SAL_PLAT_BUS_ERR_SECT_GUID)) { | |
1da177e4 LT |
293 | platform_err = 1; |
294 | LOG_INDEX_ADD_SECT_PTR(slidx->plat_bus_err, sp); | |
295 | } else { | |
296 | LOG_INDEX_ADD_SECT_PTR(slidx->unsupported, sp); | |
297 | } | |
298 | } | |
299 | slidx->n_sections = sects; | |
300 | ||
301 | return platform_err; | |
302 | } | |
303 | ||
304 | /** | |
305 | * init_record_index_pools - Initialize pool of lists for SAL record index | |
306 | * | |
307 | * Return value: | |
308 | * 0 on Success / -ENOMEM on Failure | |
309 | */ | |
20305e59 | 310 | static int |
1da177e4 LT |
311 | init_record_index_pools(void) |
312 | { | |
313 | int i; | |
314 | int rec_max_size; /* Maximum size of SAL error records */ | |
315 | int sect_min_size; /* Minimum size of SAL error sections */ | |
316 | /* minimum size table of each section */ | |
20305e59 HS |
317 | static int sal_log_sect_min_sizes[] = { |
318 | sizeof(sal_log_processor_info_t) | |
319 | + sizeof(sal_processor_static_info_t), | |
1da177e4 LT |
320 | sizeof(sal_log_mem_dev_err_info_t), |
321 | sizeof(sal_log_sel_dev_err_info_t), | |
322 | sizeof(sal_log_pci_bus_err_info_t), | |
323 | sizeof(sal_log_smbios_dev_err_info_t), | |
324 | sizeof(sal_log_pci_comp_err_info_t), | |
325 | sizeof(sal_log_plat_specific_err_info_t), | |
326 | sizeof(sal_log_host_ctlr_err_info_t), | |
327 | sizeof(sal_log_plat_bus_err_info_t), | |
328 | }; | |
329 | ||
330 | /* | |
331 | * MCA handler cannot allocate new memory on flight, | |
332 | * so we preallocate enough memory to handle a SAL record. | |
333 | * | |
334 | * Initialize a handling set of slidx_pool: | |
335 | * 1. Pick up the max size of SAL error records | |
336 | * 2. Pick up the min size of SAL error sections | |
337 | * 3. Allocate the pool as enough to 2 SAL records | |
338 | * (now we can estimate the maxinum of section in a record.) | |
339 | */ | |
340 | ||
341 | /* - 1 - */ | |
342 | rec_max_size = sal_rec_max; | |
343 | ||
344 | /* - 2 - */ | |
345 | sect_min_size = sal_log_sect_min_sizes[0]; | |
346 | for (i = 1; i < sizeof sal_log_sect_min_sizes/sizeof(size_t); i++) | |
347 | if (sect_min_size > sal_log_sect_min_sizes[i]) | |
348 | sect_min_size = sal_log_sect_min_sizes[i]; | |
349 | ||
350 | /* - 3 - */ | |
351 | slidx_pool.max_idx = (rec_max_size/sect_min_size) * 2 + 1; | |
7c13e0d1 | 352 | slidx_pool.buffer = |
6da2ec56 KC |
353 | kmalloc_array(slidx_pool.max_idx, sizeof(slidx_list_t), |
354 | GFP_KERNEL); | |
1da177e4 LT |
355 | |
356 | return slidx_pool.buffer ? 0 : -ENOMEM; | |
357 | } | |
358 | ||
359 | ||
360 | /***************************************************************************** | |
361 | * Recovery functions * | |
362 | *****************************************************************************/ | |
363 | ||
364 | /** | |
365 | * is_mca_global - Check whether this MCA is global or not | |
366 | * @peidx: pointer of index of processor error section | |
367 | * @pbci: pointer to pal_bus_check_info_t | |
20305e59 | 368 | * @sos: pointer to hand off struct between SAL and OS |
1da177e4 LT |
369 | * |
370 | * Return value: | |
371 | * MCA_IS_LOCAL / MCA_IS_GLOBAL | |
372 | */ | |
373 | ||
374 | static mca_type_t | |
7f613c7d KO |
375 | is_mca_global(peidx_table_t *peidx, pal_bus_check_info_t *pbci, |
376 | struct ia64_sal_os_state *sos) | |
1da177e4 | 377 | { |
20305e59 HS |
378 | pal_processor_state_info_t *psp = |
379 | (pal_processor_state_info_t*)peidx_psp(peidx); | |
1da177e4 | 380 | |
20305e59 | 381 | /* |
1da177e4 | 382 | * PAL can request a rendezvous, if the MCA has a global scope. |
20305e59 | 383 | * If "rz_always" flag is set, SAL requests MCA rendezvous |
1da177e4 LT |
384 | * in spite of global MCA. |
385 | * Therefore it is local MCA when rendezvous has not been requested. | |
386 | * Failed to rendezvous, the system must be down. | |
387 | */ | |
7f613c7d | 388 | switch (sos->rv_rc) { |
1da177e4 LT |
389 | case -1: /* SAL rendezvous unsuccessful */ |
390 | return MCA_IS_GLOBAL; | |
391 | case 0: /* SAL rendezvous not required */ | |
392 | return MCA_IS_LOCAL; | |
393 | case 1: /* SAL rendezvous successful int */ | |
394 | case 2: /* SAL rendezvous successful int with init */ | |
395 | default: | |
396 | break; | |
397 | } | |
398 | ||
399 | /* | |
400 | * If One or more Cache/TLB/Reg_File/Uarch_Check is here, | |
401 | * it would be a local MCA. (i.e. processor internal error) | |
402 | */ | |
403 | if (psp->tc || psp->cc || psp->rc || psp->uc) | |
404 | return MCA_IS_LOCAL; | |
405 | ||
406 | /* | |
407 | * Bus_Check structure with Bus_Check.ib (internal bus error) flag set | |
408 | * would be a global MCA. (e.g. a system bus address parity error) | |
409 | */ | |
410 | if (!pbci || pbci->ib) | |
411 | return MCA_IS_GLOBAL; | |
412 | ||
413 | /* | |
414 | * Bus_Check structure with Bus_Check.eb (external bus error) flag set | |
415 | * could be either a local MCA or a global MCA. | |
416 | * | |
417 | * Referring Bus_Check.bsi: | |
418 | * 0: Unknown/unclassified | |
419 | * 1: BERR# | |
420 | * 2: BINIT# | |
421 | * 3: Hard Fail | |
422 | * (FIXME: Are these SGI specific or generic bsi values?) | |
423 | */ | |
424 | if (pbci->eb) | |
425 | switch (pbci->bsi) { | |
426 | case 0: | |
427 | /* e.g. a load from poisoned memory */ | |
428 | return MCA_IS_LOCAL; | |
429 | case 1: | |
430 | case 2: | |
431 | case 3: | |
432 | return MCA_IS_GLOBAL; | |
433 | } | |
434 | ||
435 | return MCA_IS_GLOBAL; | |
436 | } | |
437 | ||
264b0f99 RA |
438 | /** |
439 | * get_target_identifier - Get the valid Cache or Bus check target identifier. | |
440 | * @peidx: pointer of index of processor error section | |
441 | * | |
442 | * Return value: | |
72fdbdce | 443 | * target address on Success / 0 on Failure |
264b0f99 RA |
444 | */ |
445 | static u64 | |
446 | get_target_identifier(peidx_table_t *peidx) | |
447 | { | |
448 | u64 target_address = 0; | |
449 | sal_log_mod_error_info_t *smei; | |
450 | pal_cache_check_info_t *pcci; | |
451 | int i, level = 9; | |
452 | ||
453 | /* | |
454 | * Look through the cache checks for a valid target identifier | |
455 | * If more than one valid target identifier, return the one | |
456 | * with the lowest cache level. | |
457 | */ | |
458 | for (i = 0; i < peidx_cache_check_num(peidx); i++) { | |
459 | smei = (sal_log_mod_error_info_t *)peidx_cache_check(peidx, i); | |
460 | if (smei->valid.target_identifier && smei->target_identifier) { | |
461 | pcci = (pal_cache_check_info_t *)&(smei->check_info); | |
462 | if (!target_address || (pcci->level < level)) { | |
463 | target_address = smei->target_identifier; | |
464 | level = pcci->level; | |
465 | continue; | |
466 | } | |
467 | } | |
468 | } | |
469 | if (target_address) | |
470 | return target_address; | |
471 | ||
472 | /* | |
473 | * Look at the bus check for a valid target identifier | |
474 | */ | |
475 | smei = peidx_bus_check(peidx, 0); | |
476 | if (smei && smei->valid.target_identifier) | |
477 | return smei->target_identifier; | |
478 | ||
479 | return 0; | |
480 | } | |
481 | ||
1da177e4 LT |
482 | /** |
483 | * recover_from_read_error - Try to recover the errors which type are "read"s. | |
484 | * @slidx: pointer of index of SAL error record | |
485 | * @peidx: pointer of index of processor error section | |
486 | * @pbci: pointer of pal_bus_check_info | |
20305e59 | 487 | * @sos: pointer to hand off struct between SAL and OS |
1da177e4 LT |
488 | * |
489 | * Return value: | |
490 | * 1 on Success / 0 on Failure | |
491 | */ | |
492 | ||
493 | static int | |
20305e59 HS |
494 | recover_from_read_error(slidx_table_t *slidx, |
495 | peidx_table_t *peidx, pal_bus_check_info_t *pbci, | |
7f613c7d | 496 | struct ia64_sal_os_state *sos) |
1da177e4 | 497 | { |
264b0f99 | 498 | u64 target_identifier; |
1da177e4 LT |
499 | pal_min_state_area_t *pmsa; |
500 | struct ia64_psr *psr1, *psr2; | |
501 | ia64_fptr_t *mca_hdlr_bh = (ia64_fptr_t*)mca_handler_bhhook; | |
502 | ||
503 | /* Is target address valid? */ | |
264b0f99 RA |
504 | target_identifier = get_target_identifier(peidx); |
505 | if (!target_identifier) | |
43ed3baf | 506 | return fatal_mca("target address not valid"); |
1da177e4 LT |
507 | |
508 | /* | |
509 | * cpu read or memory-mapped io read | |
510 | * | |
511 | * offending process affected process OS MCA do | |
512 | * kernel mode kernel mode down system | |
513 | * kernel mode user mode kill the process | |
514 | * user mode kernel mode down system (*) | |
515 | * user mode user mode kill the process | |
516 | * | |
517 | * (*) You could terminate offending user-mode process | |
518 | * if (pbci->pv && pbci->pl != 0) *and* if you sure | |
519 | * the process not have any locks of kernel. | |
520 | */ | |
521 | ||
a9474646 HS |
522 | /* Is minstate valid? */ |
523 | if (!peidx_bottom(peidx) || !(peidx_bottom(peidx)->valid.minstate)) | |
43ed3baf | 524 | return fatal_mca("minstate not valid"); |
1da177e4 | 525 | psr1 =(struct ia64_psr *)&(peidx_minstate_area(peidx)->pmsa_ipsr); |
d2a28ad9 | 526 | psr2 =(struct ia64_psr *)&(peidx_minstate_area(peidx)->pmsa_xpsr); |
1da177e4 LT |
527 | |
528 | /* | |
529 | * Check the privilege level of interrupted context. | |
530 | * If it is user-mode, then terminate affected process. | |
531 | */ | |
d2a28ad9 RA |
532 | |
533 | pmsa = sos->pal_min_state; | |
534 | if (psr1->cpl != 0 || | |
535 | ((psr2->cpl != 0) && mca_recover_range(pmsa->pmsa_iip))) { | |
264b0f99 RA |
536 | /* |
537 | * setup for resume to bottom half of MCA, | |
538 | * "mca_handler_bhhook" | |
539 | */ | |
540 | /* pass to bhhook as argument (gr8, ...) */ | |
541 | pmsa->pmsa_gr[8-1] = target_identifier; | |
542 | pmsa->pmsa_gr[9-1] = pmsa->pmsa_iip; | |
543 | pmsa->pmsa_gr[10-1] = pmsa->pmsa_ipsr; | |
544 | /* set interrupted return address (but no use) */ | |
545 | pmsa->pmsa_br0 = pmsa->pmsa_iip; | |
546 | /* change resume address to bottom half */ | |
547 | pmsa->pmsa_iip = mca_hdlr_bh->fp; | |
548 | pmsa->pmsa_gr[1-1] = mca_hdlr_bh->gp; | |
549 | /* set cpl with kernel mode */ | |
550 | psr2 = (struct ia64_psr *)&pmsa->pmsa_ipsr; | |
551 | psr2->cpl = 0; | |
552 | psr2->ri = 0; | |
553 | psr2->bn = 1; | |
554 | psr2->i = 0; | |
555 | ||
556 | return mca_recovered("user memory corruption. " | |
43ed3baf | 557 | "kill affected process - recovered."); |
1da177e4 LT |
558 | } |
559 | ||
43ed3baf HS |
560 | return fatal_mca("kernel context not recovered, iip 0x%lx\n", |
561 | pmsa->pmsa_iip); | |
1da177e4 LT |
562 | } |
563 | ||
564 | /** | |
565 | * recover_from_platform_error - Recover from platform error. | |
566 | * @slidx: pointer of index of SAL error record | |
567 | * @peidx: pointer of index of processor error section | |
568 | * @pbci: pointer of pal_bus_check_info | |
20305e59 | 569 | * @sos: pointer to hand off struct between SAL and OS |
1da177e4 LT |
570 | * |
571 | * Return value: | |
572 | * 1 on Success / 0 on Failure | |
573 | */ | |
574 | ||
575 | static int | |
20305e59 HS |
576 | recover_from_platform_error(slidx_table_t *slidx, peidx_table_t *peidx, |
577 | pal_bus_check_info_t *pbci, | |
7f613c7d | 578 | struct ia64_sal_os_state *sos) |
1da177e4 LT |
579 | { |
580 | int status = 0; | |
20305e59 HS |
581 | pal_processor_state_info_t *psp = |
582 | (pal_processor_state_info_t*)peidx_psp(peidx); | |
1da177e4 LT |
583 | |
584 | if (psp->bc && pbci->eb && pbci->bsi == 0) { | |
585 | switch(pbci->type) { | |
586 | case 1: /* partial read */ | |
587 | case 3: /* full line(cpu) read */ | |
588 | case 9: /* I/O space read */ | |
20305e59 HS |
589 | status = recover_from_read_error(slidx, peidx, pbci, |
590 | sos); | |
1da177e4 LT |
591 | break; |
592 | case 0: /* unknown */ | |
593 | case 2: /* partial write */ | |
594 | case 4: /* full line write */ | |
595 | case 5: /* implicit or explicit write-back operation */ | |
596 | case 6: /* snoop probe */ | |
597 | case 7: /* incoming or outgoing ptc.g */ | |
598 | case 8: /* write coalescing transactions */ | |
599 | case 10: /* I/O space write */ | |
600 | case 11: /* inter-processor interrupt message(IPI) */ | |
20305e59 HS |
601 | case 12: /* interrupt acknowledge or |
602 | external task priority cycle */ | |
1da177e4 LT |
603 | default: |
604 | break; | |
605 | } | |
396e8e76 RA |
606 | } else if (psp->cc && !psp->bc) { /* Cache error */ |
607 | status = recover_from_read_error(slidx, peidx, pbci, sos); | |
1da177e4 LT |
608 | } |
609 | ||
610 | return status; | |
611 | } | |
612 | ||
618b206f RA |
613 | /* |
614 | * recover_from_tlb_check | |
615 | * @peidx: pointer of index of processor error section | |
616 | * | |
617 | * Return value: | |
618 | * 1 on Success / 0 on Failure | |
619 | */ | |
620 | static int | |
621 | recover_from_tlb_check(peidx_table_t *peidx) | |
622 | { | |
623 | sal_log_mod_error_info_t *smei; | |
624 | pal_tlb_check_info_t *ptci; | |
625 | ||
626 | smei = (sal_log_mod_error_info_t *)peidx_tlb_check(peidx, 0); | |
627 | ptci = (pal_tlb_check_info_t *)&(smei->check_info); | |
628 | ||
629 | /* | |
630 | * Look for signature of a duplicate TLB DTC entry, which is | |
631 | * a SW bug and always fatal. | |
632 | */ | |
633 | if (ptci->op == PAL_TLB_CHECK_OP_PURGE | |
634 | && !(ptci->itr || ptci->dtc || ptci->itc)) | |
635 | return fatal_mca("Duplicate TLB entry"); | |
636 | ||
637 | return mca_recovered("TLB check recovered"); | |
638 | } | |
639 | ||
1da177e4 LT |
640 | /** |
641 | * recover_from_processor_error | |
642 | * @platform: whether there are some platform error section or not | |
643 | * @slidx: pointer of index of SAL error record | |
644 | * @peidx: pointer of index of processor error section | |
645 | * @pbci: pointer of pal_bus_check_info | |
20305e59 | 646 | * @sos: pointer to hand off struct between SAL and OS |
1da177e4 LT |
647 | * |
648 | * Return value: | |
649 | * 1 on Success / 0 on Failure | |
650 | */ | |
1da177e4 LT |
651 | |
652 | static int | |
20305e59 HS |
653 | recover_from_processor_error(int platform, slidx_table_t *slidx, |
654 | peidx_table_t *peidx, pal_bus_check_info_t *pbci, | |
7f613c7d | 655 | struct ia64_sal_os_state *sos) |
1da177e4 | 656 | { |
20305e59 HS |
657 | pal_processor_state_info_t *psp = |
658 | (pal_processor_state_info_t*)peidx_psp(peidx); | |
1da177e4 | 659 | |
20305e59 | 660 | /* |
a14f25a0 RA |
661 | * Processor recovery status must key off of the PAL recovery |
662 | * status in the Processor State Parameter. | |
1da177e4 | 663 | */ |
a14f25a0 RA |
664 | |
665 | /* | |
666 | * The machine check is corrected. | |
667 | */ | |
668 | if (psp->cm == 1) | |
43ed3baf | 669 | return mca_recovered("machine check is already corrected."); |
a14f25a0 RA |
670 | |
671 | /* | |
672 | * The error was not contained. Software must be reset. | |
673 | */ | |
674 | if (psp->us || psp->ci == 0) | |
43ed3baf | 675 | return fatal_mca("error not contained"); |
1da177e4 | 676 | |
618b206f RA |
677 | /* |
678 | * Look for recoverable TLB check | |
679 | */ | |
680 | if (psp->tc && !(psp->cc || psp->bc || psp->rc || psp->uc)) | |
681 | return recover_from_tlb_check(peidx); | |
682 | ||
1da177e4 | 683 | /* |
e1c48554 RA |
684 | * The cache check and bus check bits have four possible states |
685 | * cc bc | |
e1c48554 | 686 | * 1 1 Memory error, attempt recovery |
396e8e76 RA |
687 | * 1 0 Cache error, attempt recovery |
688 | * 0 1 I/O error, attempt recovery | |
689 | * 0 0 Other error type, not recovered | |
1da177e4 | 690 | */ |
396e8e76 RA |
691 | if (psp->cc == 0 && (psp->bc == 0 || pbci == NULL)) |
692 | return fatal_mca("No cache or bus check"); | |
1da177e4 LT |
693 | |
694 | /* | |
396e8e76 | 695 | * Cannot handle more than one bus check. |
1da177e4 LT |
696 | */ |
697 | if (peidx_bus_check_num(peidx) > 1) | |
43ed3baf | 698 | return fatal_mca("Too many bus checks"); |
396e8e76 | 699 | |
18997961 | 700 | if (pbci->ib) |
43ed3baf | 701 | return fatal_mca("Internal Bus error"); |
1da177e4 | 702 | if (pbci->eb && pbci->bsi > 0) |
43ed3baf | 703 | return fatal_mca("External bus check fatal status"); |
1da177e4 LT |
704 | |
705 | /* | |
72fdbdce | 706 | * This is a local MCA and estimated as a recoverable error. |
1da177e4 | 707 | */ |
20305e59 | 708 | if (platform) |
7f613c7d | 709 | return recover_from_platform_error(slidx, peidx, pbci, sos); |
396e8e76 | 710 | |
20305e59 HS |
711 | /* |
712 | * On account of strange SAL error record, we cannot recover. | |
1da177e4 | 713 | */ |
43ed3baf | 714 | return fatal_mca("Strange SAL record"); |
1da177e4 LT |
715 | } |
716 | ||
717 | /** | |
718 | * mca_try_to_recover - Try to recover from MCA | |
719 | * @rec: pointer to a SAL error record | |
20305e59 | 720 | * @sos: pointer to hand off struct between SAL and OS |
1da177e4 LT |
721 | * |
722 | * Return value: | |
723 | * 1 on Success / 0 on Failure | |
724 | */ | |
725 | ||
726 | static int | |
20305e59 | 727 | mca_try_to_recover(void *rec, struct ia64_sal_os_state *sos) |
1da177e4 LT |
728 | { |
729 | int platform_err; | |
730 | int n_proc_err; | |
731 | slidx_table_t slidx; | |
732 | peidx_table_t peidx; | |
733 | pal_bus_check_info_t pbci; | |
734 | ||
1da177e4 LT |
735 | /* Make index of SAL error record */ |
736 | platform_err = mca_make_slidx(rec, &slidx); | |
737 | ||
738 | /* Count processor error sections */ | |
739 | n_proc_err = slidx_count(&slidx, proc_err); | |
740 | ||
741 | /* Now, OS can recover when there is one processor error section */ | |
742 | if (n_proc_err > 1) | |
43ed3baf | 743 | return fatal_mca("Too Many Errors"); |
18997961 | 744 | else if (n_proc_err == 0) |
43ed3baf HS |
745 | /* Weird SAL record ... We can't do anything */ |
746 | return fatal_mca("Weird SAL record"); | |
1da177e4 LT |
747 | |
748 | /* Make index of processor error section */ | |
20305e59 HS |
749 | mca_make_peidx((sal_log_processor_info_t*) |
750 | slidx_first_entry(&slidx.proc_err)->hdr, &peidx); | |
1da177e4 LT |
751 | |
752 | /* Extract Processor BUS_CHECK[0] */ | |
753 | *((u64*)&pbci) = peidx_check_info(&peidx, bus_check, 0); | |
754 | ||
755 | /* Check whether MCA is global or not */ | |
7f613c7d | 756 | if (is_mca_global(&peidx, &pbci, sos)) |
43ed3baf | 757 | return fatal_mca("global MCA"); |
1da177e4 LT |
758 | |
759 | /* Try to recover a processor error */ | |
20305e59 HS |
760 | return recover_from_processor_error(platform_err, &slidx, &peidx, |
761 | &pbci, sos); | |
1da177e4 LT |
762 | } |
763 | ||
764 | /* | |
765 | * ============================================================================= | |
766 | */ | |
767 | ||
768 | int __init mca_external_handler_init(void) | |
769 | { | |
770 | if (init_record_index_pools()) | |
771 | return -ENOMEM; | |
772 | ||
773 | /* register external mca handlers */ | |
20305e59 | 774 | if (ia64_reg_MCA_extension(mca_try_to_recover)) { |
1da177e4 LT |
775 | printk(KERN_ERR "ia64_reg_MCA_extension failed.\n"); |
776 | kfree(slidx_pool.buffer); | |
777 | return -EFAULT; | |
778 | } | |
779 | return 0; | |
780 | } | |
781 | ||
782 | void __exit mca_external_handler_exit(void) | |
783 | { | |
784 | /* unregister external mca handlers */ | |
785 | ia64_unreg_MCA_extension(); | |
786 | kfree(slidx_pool.buffer); | |
787 | } | |
788 | ||
789 | module_init(mca_external_handler_init); | |
790 | module_exit(mca_external_handler_exit); | |
791 | ||
792 | module_param(sal_rec_max, int, 0644); | |
793 | MODULE_PARM_DESC(sal_rec_max, "Max size of SAL error record"); | |
794 | ||
795 | MODULE_DESCRIPTION("ia64 platform dependent mca handler driver"); | |
796 | MODULE_LICENSE("GPL"); |