2 * UEFI Common Platform Error Record (CPER) support
4 * Copyright (C) 2010, Intel Corp.
7 * CPER is the format used to describe platform hardware errors by
8 * various tables, such as ERST, BERT and HEST.
10 * For more information about CPER, please refer to Appendix N of UEFI
11 * Specification version 2.4.
13 * This program is free software; you can redistribute it and/or
14 * modify it under the terms of the GNU General Public License version
15 * 2 as published by the Free Software Foundation.
17 * This program is distributed in the hope that it will be useful,
18 * but WITHOUT ANY WARRANTY; without even the implied warranty of
19 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
20 * GNU General Public License for more details.
22 * You should have received a copy of the GNU General Public License
23 * along with this program; if not, write to the Free Software
24 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
27 #include <linux/kernel.h>
28 #include <linux/module.h>
29 #include <linux/time.h>
30 #include <linux/cper.h>
31 #include <linux/dmi.h>
32 #include <linux/acpi.h>
33 #include <linux/pci.h>
34 #include <linux/aer.h>
35 #include <linux/printk.h>
36 #include <linux/bcd.h>
37 #include <acpi/ghes.h>
38 #include <ras/ras_event.h>
/*
 * File-scope scratch buffer of CPER_REC_LEN bytes.  The memory-error
 * decoders in this file format their location strings into it before
 * printing them; no locking around it is visible in this file.
 */
40 static char rcd_decode_str[CPER_REC_LEN];
43 * CPER record ID need to be unique even after reboot, because record
44 * ID is used as index for ERST storage, while CPER records from
45 * multiple boot may co-exist in ERST.
47 u64 cper_next_record_id(void)
49 static atomic64_t seq;
51 if (!atomic64_read(&seq)) {
52 time64_t time = ktime_get_real_seconds();
55 * This code is unlikely to still be needed in year 2106,
56 * but just in case, let's use a few more bits for timestamps
57 * after y2038 to be sure they keep increasing monotonically
58 * for the next few hundred years...
60 if (time < 0x80000000)
61 atomic64_set(&seq, (ktime_get_real_seconds()) << 32);
63 atomic64_set(&seq, 0x8000000000000000ull |
64 ktime_get_real_seconds() << 24);
67 return atomic64_inc_return(&seq);
69 EXPORT_SYMBOL_GPL(cper_next_record_id);
/* Severity names; cper_severity_str() indexes this table by severity value. */
71 static const char * const severity_strs[] = {
/*
 * cper_severity_str - map a severity value to its name
 * @severity: index into severity_strs
 *
 * Returns the matching table entry, or "unknown" when @severity is past
 * the end of the table.
 */
78 const char *cper_severity_str(unsigned int severity)
80 return severity < ARRAY_SIZE(severity_strs) ?
81 severity_strs[severity] : "unknown";
83 EXPORT_SYMBOL_GPL(cper_severity_str);
/*
 * cper_print_bits - print strings for set bits
 * @pfx: prefix for each line, including log level and prefix string
 * @bits: bit mask
 * @strs: string array, indexed by bit position
 * @strs_size: size of the string array: @strs
 *
 * For each set bit in @bits, print the corresponding string in @strs.
 * If the output length is longer than 80, multiple lines will be
 * printed, with @pfx printed at the beginning of each line.
 * NULL entries in @strs are skipped.
 */
void cper_print_bits(const char *pfx, unsigned int bits,
		     const char * const strs[], unsigned int strs_size)
{
	unsigned int i;
	int len = 0;
	const char *str;
	char buf[84];

	/* @bits has no positions past its own width; never shift that far. */
	if (strs_size > 8 * sizeof(bits))
		strs_size = 8 * sizeof(bits);

	for (i = 0; i < strs_size; i++) {
		if (!(bits & (1U << i)))
			continue;
		str = strs[i];
		if (!str)
			continue;
		/* Flush the pending line before it would grow past 80 columns. */
		if (len && len + strlen(str) + 2 > 80) {
			printk("%s\n", buf);
			len = 0;
		}
		/*
		 * scnprintf() returns the number of characters actually
		 * stored, so len can never run past buf even when @pfx or
		 * an entry of @strs is too long to fit.
		 */
		if (!len)
			len = scnprintf(buf, sizeof(buf), "%s%s", pfx, str);
		else
			len += scnprintf(buf+len, sizeof(buf)-len, ", %s", str);
	}
	if (len)
		printk("%s\n", buf);
}
/* Processor type names; cper_print_proc_generic() indexes this by proc_type. */
122 static const char * const proc_type_strs[] = {
/* Processor ISA names; cper_print_proc_generic() indexes this by proc_isa. */
128 static const char * const proc_isa_strs[] = {
/*
 * Processor error type names, one per bit position of proc_error_type;
 * passed to cper_print_bits() by cper_print_proc_generic().  Unlike the
 * neighbouring tables this one is not declared static.
 */
136 const char * const cper_proc_error_type_strs[] = {
140 "micro-architectural error",
/* Processor operation names; cper_print_proc_generic() indexes this by operation. */
143 static const char * const proc_op_strs[] = {
144 "unknown or generic",
147 "instruction execution",
/* Processor flag names, one per bit position of flags; used with cper_print_bits(). */
150 static const char * const proc_flag_strs[] = {
/*
 * cper_print_proc_generic - print the fields of a generic processor error
 * @pfx: prefix printed at the start of every line
 * @proc: generic processor error section to decode
 *
 * Each field is printed only when its CPER_PROC_VALID_* bit is set in
 * @proc->validation_bits.  Enumerated fields are shown as the raw number
 * plus the name from the matching string table ("unknown" when the value
 * is past the end of the table); bit-mask fields are shown in hex and then
 * expanded by cper_print_bits().
 */
157 static void cper_print_proc_generic(const char *pfx,
158 const struct cper_sec_proc_generic *proc)
160 if (proc->validation_bits & CPER_PROC_VALID_TYPE)
161 printk("%s""processor_type: %d, %s\n", pfx, proc->proc_type,
162 proc->proc_type < ARRAY_SIZE(proc_type_strs) ?
163 proc_type_strs[proc->proc_type] : "unknown");
164 if (proc->validation_bits & CPER_PROC_VALID_ISA)
165 printk("%s""processor_isa: %d, %s\n", pfx, proc->proc_isa,
166 proc->proc_isa < ARRAY_SIZE(proc_isa_strs) ?
167 proc_isa_strs[proc->proc_isa] : "unknown");
/* Bit mask: raw hex value first, then one name per set bit. */
168 if (proc->validation_bits & CPER_PROC_VALID_ERROR_TYPE) {
169 printk("%s""error_type: 0x%02x\n", pfx, proc->proc_error_type);
170 cper_print_bits(pfx, proc->proc_error_type,
171 cper_proc_error_type_strs,
172 ARRAY_SIZE(cper_proc_error_type_strs));
174 if (proc->validation_bits & CPER_PROC_VALID_OPERATION)
175 printk("%s""operation: %d, %s\n", pfx, proc->operation,
176 proc->operation < ARRAY_SIZE(proc_op_strs) ?
177 proc_op_strs[proc->operation] : "unknown");
/* Bit mask: raw hex value first, then one name per set bit. */
178 if (proc->validation_bits & CPER_PROC_VALID_FLAGS) {
179 printk("%s""flags: 0x%02x\n", pfx, proc->flags);
180 cper_print_bits(pfx, proc->flags, proc_flag_strs,
181 ARRAY_SIZE(proc_flag_strs));
183 if (proc->validation_bits & CPER_PROC_VALID_LEVEL)
184 printk("%s""level: %d\n", pfx, proc->level);
/* The remaining fields are printed as 16-digit hex values. */
185 if (proc->validation_bits & CPER_PROC_VALID_VERSION)
186 printk("%s""version_info: 0x%016llx\n", pfx, proc->cpu_version);
187 if (proc->validation_bits & CPER_PROC_VALID_ID)
188 printk("%s""processor_id: 0x%016llx\n", pfx, proc->proc_id);
189 if (proc->validation_bits & CPER_PROC_VALID_TARGET_ADDRESS)
190 printk("%s""target_address: 0x%016llx\n",
191 pfx, proc->target_addr);
192 if (proc->validation_bits & CPER_PROC_VALID_REQUESTOR_ID)
193 printk("%s""requestor_id: 0x%016llx\n",
194 pfx, proc->requestor_id);
195 if (proc->validation_bits & CPER_PROC_VALID_RESPONDER_ID)
196 printk("%s""responder_id: 0x%016llx\n",
197 pfx, proc->responder_id);
198 if (proc->validation_bits & CPER_PROC_VALID_IP)
199 printk("%s""IP: 0x%016llx\n", pfx, proc->ip);
/* Memory error type names; cper_mem_err_type_str() indexes this by error type. */
202 static const char * const mem_err_type_strs[] = {
207 "single-symbol chipkill ECC",
208 "multi-symbol chipkill ECC",
216 "scrub corrected error",
217 "scrub uncorrected error",
218 "physical memory map-out event",
/*
 * cper_mem_err_type_str - map a memory error type to its name
 * @etype: index into mem_err_type_strs
 *
 * Returns the matching table entry, or "unknown" when @etype is past the
 * end of the table.
 */
221 const char *cper_mem_err_type_str(unsigned int etype)
223 return etype < ARRAY_SIZE(mem_err_type_strs) ?
224 mem_err_type_strs[etype] : "unknown";
226 EXPORT_SYMBOL_GPL(cper_mem_err_type_str);
228 static int cper_mem_err_location(struct cper_mem_err_compact *mem, char *msg)
236 len = CPER_REC_LEN - 1;
237 if (mem->validation_bits & CPER_MEM_VALID_NODE)
238 n += scnprintf(msg + n, len - n, "node: %d ", mem->node);
239 if (mem->validation_bits & CPER_MEM_VALID_CARD)
240 n += scnprintf(msg + n, len - n, "card: %d ", mem->card);
241 if (mem->validation_bits & CPER_MEM_VALID_MODULE)
242 n += scnprintf(msg + n, len - n, "module: %d ", mem->module);
243 if (mem->validation_bits & CPER_MEM_VALID_RANK_NUMBER)
244 n += scnprintf(msg + n, len - n, "rank: %d ", mem->rank);
245 if (mem->validation_bits & CPER_MEM_VALID_BANK)
246 n += scnprintf(msg + n, len - n, "bank: %d ", mem->bank);
247 if (mem->validation_bits & CPER_MEM_VALID_DEVICE)
248 n += scnprintf(msg + n, len - n, "device: %d ", mem->device);
249 if (mem->validation_bits & CPER_MEM_VALID_ROW)
250 n += scnprintf(msg + n, len - n, "row: %d ", mem->row);
251 if (mem->validation_bits & CPER_MEM_VALID_COLUMN)
252 n += scnprintf(msg + n, len - n, "column: %d ", mem->column);
253 if (mem->validation_bits & CPER_MEM_VALID_BIT_POSITION)
254 n += scnprintf(msg + n, len - n, "bit_position: %d ",
256 if (mem->validation_bits & CPER_MEM_VALID_REQUESTOR_ID)
257 n += scnprintf(msg + n, len - n, "requestor_id: 0x%016llx ",
259 if (mem->validation_bits & CPER_MEM_VALID_RESPONDER_ID)
260 n += scnprintf(msg + n, len - n, "responder_id: 0x%016llx ",
262 if (mem->validation_bits & CPER_MEM_VALID_TARGET_ID)
263 scnprintf(msg + n, len - n, "target_id: 0x%016llx ",
270 static int cper_dimm_err_location(struct cper_mem_err_compact *mem, char *msg)
273 const char *bank = NULL, *device = NULL;
275 if (!msg || !(mem->validation_bits & CPER_MEM_VALID_MODULE_HANDLE))
279 len = CPER_REC_LEN - 1;
280 dmi_memdev_name(mem->mem_dev_handle, &bank, &device);
282 n = snprintf(msg, len, "DIMM location: %s %s ", bank, device);
284 n = snprintf(msg, len,
285 "DIMM location: not present. DMI handle: 0x%.4x ",
286 mem->mem_dev_handle);
/*
 * cper_mem_err_pack - copy the location fields of a memory error section
 * @mem: source memory error section
 * @cmem: destination compact record
 *
 * Copies validation_bits and the location/handle fields listed below,
 * one-to-one, from @mem into @cmem.
 */
292 void cper_mem_err_pack(const struct cper_sec_mem_err *mem,
293 struct cper_mem_err_compact *cmem)
295 cmem->validation_bits = mem->validation_bits;
296 cmem->node = mem->node;
297 cmem->card = mem->card;
298 cmem->module = mem->module;
299 cmem->bank = mem->bank;
300 cmem->device = mem->device;
301 cmem->row = mem->row;
302 cmem->column = mem->column;
303 cmem->bit_pos = mem->bit_pos;
304 cmem->requestor_id = mem->requestor_id;
305 cmem->responder_id = mem->responder_id;
306 cmem->target_id = mem->target_id;
307 cmem->rank = mem->rank;
308 cmem->mem_array_handle = mem->mem_array_handle;
309 cmem->mem_dev_handle = mem->mem_dev_handle;
/*
 * cper_mem_err_unpack - render a compact memory error into a trace sequence
 * @p: trace sequence to append to
 * @cmem: compact memory error record
 *
 * Appends the location string and then the DIMM string, each only when
 * its helper returns non-zero, followed by a NUL byte.  Both strings are
 * staged in the shared rcd_decode_str buffer.
 *
 * NOTE(review): ret is captured from trace_seq_buffer_ptr() before
 * anything is appended; presumably that pointer is the return value --
 * confirm.
 */
312 const char *cper_mem_err_unpack(struct trace_seq *p,
313 struct cper_mem_err_compact *cmem)
315 const char *ret = trace_seq_buffer_ptr(p);
317 if (cper_mem_err_location(cmem, rcd_decode_str))
318 trace_seq_printf(p, "%s", rcd_decode_str);
319 if (cper_dimm_err_location(cmem, rcd_decode_str))
320 trace_seq_printf(p, "%s", rcd_decode_str);
321 trace_seq_putc(p, '\0');
/*
 * cper_print_mem - print a platform memory error section
 * @pfx: prefix printed at the start of every line
 * @mem: memory error section to decode
 *
 * The body also uses a length value, len, which it compares against
 * sizeof(struct cper_sec_mem_err_old) to detect the older, shorter
 * structure layout.
 *
 * Prints error status, physical address and mask when marked valid, then
 * the location string, the error type, and finally the DIMM location.
 * The location strings are produced from a compact copy of @mem and staged
 * in rcd_decode_str.
 */
326 static void cper_print_mem(const char *pfx, const struct cper_sec_mem_err *mem,
329 struct cper_mem_err_compact cmem;
331 /* Don't trust UEFI 2.1/2.2 structure with bad validation bits */
/*
 * ~(CPER_MEM_VALID_RANK_NUMBER - 1) selects that bit and every higher
 * one; per the warning below, those describe fields beyond the old
 * structure.
 */
332 if (len == sizeof(struct cper_sec_mem_err_old) &&
333 (mem->validation_bits & ~(CPER_MEM_VALID_RANK_NUMBER - 1))) {
334 pr_err(FW_WARN "valid bits set for fields beyond structure\n");
337 if (mem->validation_bits & CPER_MEM_VALID_ERROR_STATUS)
338 printk("%s""error_status: 0x%016llx\n", pfx, mem->error_status);
339 if (mem->validation_bits & CPER_MEM_VALID_PA)
340 printk("%s""physical_address: 0x%016llx\n",
341 pfx, mem->physical_addr);
342 if (mem->validation_bits & CPER_MEM_VALID_PA_MASK)
343 printk("%s""physical_address_mask: 0x%016llx\n",
344 pfx, mem->physical_addr_mask);
/* The location helpers take the compact form, so convert first. */
345 cper_mem_err_pack(mem, &cmem);
346 if (cper_mem_err_location(&cmem, rcd_decode_str))
347 printk("%s%s\n", pfx, rcd_decode_str);
348 if (mem->validation_bits & CPER_MEM_VALID_ERROR_TYPE) {
349 u8 etype = mem->error_type;
350 printk("%s""error_type: %d, %s\n", pfx, etype,
351 cper_mem_err_type_str(etype));
/* rcd_decode_str is reused here; the location line was already printed. */
353 if (cper_dimm_err_location(&cmem, rcd_decode_str))
354 printk("%s%s\n", pfx, rcd_decode_str);
/* PCIe port type names; cper_print_pcie() indexes this by port_type. */
357 static const char * const pcie_port_type_strs[] = {
359 "legacy PCI end point",
363 "upstream switch port",
364 "downstream switch port",
365 "PCIe to PCI/PCI-X bridge",
366 "PCI/PCI-X to PCIe bridge",
367 "root complex integrated endpoint device",
368 "root complex event collector",
/*
 * cper_print_pcie - print a PCIe error section
 * @pfx: prefix printed at the start of every line
 * @pcie: PCIe error section to decode
 * @gdata: generic data entry that carries @pcie (not referenced by the
 *         statements shown here)
 *
 * Each group of fields is printed only when its CPER_PCIE_VALID_* bit is
 * set in @pcie->validation_bits.  An out-of-range port type is shown as
 * "unknown".
 */
371 static void cper_print_pcie(const char *pfx, const struct cper_sec_pcie *pcie,
372 const struct acpi_hest_generic_data *gdata)
374 if (pcie->validation_bits & CPER_PCIE_VALID_PORT_TYPE)
375 printk("%s""port_type: %d, %s\n", pfx, pcie->port_type,
376 pcie->port_type < ARRAY_SIZE(pcie_port_type_strs) ?
377 pcie_port_type_strs[pcie->port_type] : "unknown");
378 if (pcie->validation_bits & CPER_PCIE_VALID_VERSION)
379 printk("%s""version: %d.%d\n", pfx,
380 pcie->version.major, pcie->version.minor);
381 if (pcie->validation_bits & CPER_PCIE_VALID_COMMAND_STATUS)
382 printk("%s""command: 0x%04x, status: 0x%04x\n", pfx,
383 pcie->command, pcie->status);
384 if (pcie->validation_bits & CPER_PCIE_VALID_DEVICE_ID) {
/* Printed as segment:bus:device.function. */
386 printk("%s""device_id: %04x:%02x:%02x.%x\n", pfx,
387 pcie->device_id.segment, pcie->device_id.bus,
388 pcie->device_id.device, pcie->device_id.function);
/* The slot number sits above CPER_PCIE_SLOT_SHIFT in the slot field. */
389 printk("%s""slot: %d\n", pfx,
390 pcie->device_id.slot >> CPER_PCIE_SLOT_SHIFT);
391 printk("%s""secondary_bus: 0x%02x\n", pfx,
392 pcie->device_id.secondary_bus);
393 printk("%s""vendor_id: 0x%04x, device_id: 0x%04x\n", pfx,
394 pcie->device_id.vendor_id, pcie->device_id.device_id);
/* class_code is printed byte by byte, in array order. */
395 p = pcie->device_id.class_code;
396 printk("%s""class_code: %02x%02x%02x\n", pfx, p[0], p[1], p[2]);
398 if (pcie->validation_bits & CPER_PCIE_VALID_SERIAL_NUMBER)
399 printk("%s""serial number: 0x%04x, 0x%04x\n", pfx,
400 pcie->serial_number.lower, pcie->serial_number.upper);
401 if (pcie->validation_bits & CPER_PCIE_VALID_BRIDGE_CONTROL_STATUS)
403 "%s""bridge: secondary_status: 0x%04x, control: 0x%04x\n",
404 pfx, pcie->bridge.secondary_status, pcie->bridge.control);
/*
 * cper_print_tstamp - print the timestamp of a v3 generic data entry
 * @pfx: prefix printed at the start of the line
 * @gdata: v3 generic data entry
 *
 * Nothing is printed unless ACPI_HEST_GEN_VALID_TIMESTAMP is set.  The
 * timestamp is read as eight bytes: seconds, minutes, hours, a flags byte,
 * day, month, year and century.  All but the flags byte are BCD-encoded.
 * Bit 0 of the flags byte selects the "precise" or "imprecise" label.
 */
407 static void cper_print_tstamp(const char *pfx,
408 struct acpi_hest_generic_data_v300 *gdata)
410 __u8 hour, min, sec, day, mon, year, century, *timestamp;
412 if (gdata->validation_bits & ACPI_HEST_GEN_VALID_TIMESTAMP) {
413 timestamp = (__u8 *)&(gdata->time_stamp);
414 sec = bcd2bin(timestamp[0]);
415 min = bcd2bin(timestamp[1]);
416 hour = bcd2bin(timestamp[2]);
/* timestamp[3] is the flags byte; it is tested below, not converted. */
417 day = bcd2bin(timestamp[4]);
418 mon = bcd2bin(timestamp[5]);
419 year = bcd2bin(timestamp[6]);
420 century = bcd2bin(timestamp[7]);
/* Century and year are printed back to back to form a 4-digit year. */
422 printk("%s%ststamp: %02d%02d-%02d-%02d %02d:%02d:%02d\n", pfx,
423 (timestamp[3] & 0x1 ? "precise " : "imprecise "),
424 century, year, mon, day, hour, min, sec);
/*
 * cper_estatus_print_section - print one generic error data entry
 * @pfx: prefix printed at the start of every line
 * @gdata: generic data entry to decode
 *
 * The body also uses a section number, sec_no, in the "Error %d" line.
 *
 * Prints the timestamp (HEST version 3 and later only), the severity and
 * the FRU id/text when marked valid.  It then dispatches on the section
 * type GUID to the matching decoder, using a prefix derived from @pfx.  A
 * decoder is called only when error_data_length is at least the size of
 * the structure it expects; otherwise control jumps to the
 * err_section_too_small label.  Sections of unknown type are hex-dumped.
 */
429 cper_estatus_print_section(const char *pfx, struct acpi_hest_generic_data *gdata,
432 guid_t *sec_type = (guid_t *)gdata->section_type;
436 if (acpi_hest_get_version(gdata) >= 3)
437 cper_print_tstamp(pfx, (struct acpi_hest_generic_data_v300 *)gdata);
439 severity = gdata->error_severity;
440 printk("%s""Error %d, type: %s\n", pfx, sec_no,
441 cper_severity_str(severity));
442 if (gdata->validation_bits & CPER_SEC_VALID_FRU_ID)
443 printk("%s""fru_id: %pUl\n", pfx, gdata->fru_id);
/* %.20s bounds the read of fru_text to 20 characters. */
444 if (gdata->validation_bits & CPER_SEC_VALID_FRU_TEXT)
445 printk("%s""fru_text: %.20s\n", pfx, gdata->fru_text);
/* Section details are printed with newpfx, built from pfx. */
447 snprintf(newpfx, sizeof(newpfx), "%s ", pfx);
448 if (guid_equal(sec_type, &CPER_SEC_PROC_GENERIC)) {
449 struct cper_sec_proc_generic *proc_err = acpi_hest_get_payload(gdata);
451 printk("%s""section_type: general processor error\n", newpfx);
452 if (gdata->error_data_length >= sizeof(*proc_err))
453 cper_print_proc_generic(newpfx, proc_err);
455 goto err_section_too_small;
456 } else if (guid_equal(sec_type, &CPER_SEC_PLATFORM_MEM)) {
457 struct cper_sec_mem_err *mem_err = acpi_hest_get_payload(gdata);
459 printk("%s""section_type: memory error\n", newpfx);
/* The shorter old layout is accepted; cper_print_mem() gets the length. */
460 if (gdata->error_data_length >=
461 sizeof(struct cper_sec_mem_err_old))
462 cper_print_mem(newpfx, mem_err,
463 gdata->error_data_length);
465 goto err_section_too_small;
466 } else if (guid_equal(sec_type, &CPER_SEC_PCIE)) {
467 struct cper_sec_pcie *pcie = acpi_hest_get_payload(gdata);
469 printk("%s""section_type: PCIe error\n", newpfx);
470 if (gdata->error_data_length >= sizeof(*pcie))
471 cper_print_pcie(newpfx, pcie, gdata);
473 goto err_section_too_small;
/* ARM processor sections are decoded only on ARM/ARM64 builds. */
474 #if defined(CONFIG_ARM64) || defined(CONFIG_ARM)
475 } else if (guid_equal(sec_type, &CPER_SEC_PROC_ARM)) {
476 struct cper_sec_proc_arm *arm_err = acpi_hest_get_payload(gdata);
478 printk("%ssection_type: ARM processor error\n", newpfx);
479 if (gdata->error_data_length >= sizeof(*arm_err))
480 cper_print_proc_arm(newpfx, arm_err);
482 goto err_section_too_small;
/* IA32/X64 processor sections are decoded only with CONFIG_UEFI_CPER_X86. */
484 #if defined(CONFIG_UEFI_CPER_X86)
485 } else if (guid_equal(sec_type, &CPER_SEC_PROC_IA)) {
486 struct cper_sec_proc_ia *ia_err = acpi_hest_get_payload(gdata);
488 printk("%ssection_type: IA32/X64 processor error\n", newpfx);
489 if (gdata->error_data_length >= sizeof(*ia_err))
490 cper_print_proc_ia(newpfx, ia_err);
492 goto err_section_too_small;
/* Unrecognised section type: print its GUID and length, then hex-dump it. */
495 const void *err = acpi_hest_get_payload(gdata);
497 printk("%ssection type: unknown, %pUl\n", newpfx, sec_type);
498 printk("%ssection length: %#x\n", newpfx,
499 gdata->error_data_length);
500 print_hex_dump(newpfx, "", DUMP_PREFIX_OFFSET, 16, 4, err,
501 gdata->error_data_length, true);
506 err_section_too_small:
507 pr_err(FW_WARN "error section length is too small\n");
/*
 * cper_estatus_print - print a generic error status block
 * @pfx: prefix printed at the start of every line
 * @estatus: generic error status block to decode
 *
 * Prints the event severity, preceded by a note that no further action
 * is needed when the severity is CPER_SEV_CORRECTED.  It then prints
 * every section in @estatus through cper_estatus_print_section(), using
 * a prefix derived from @pfx.
 */
510 void cper_estatus_print(const char *pfx,
511 const struct acpi_hest_generic_status *estatus)
513 struct acpi_hest_generic_data *gdata;
518 severity = estatus->error_severity;
519 if (severity == CPER_SEV_CORRECTED)
520 printk("%s%s\n", pfx,
521 "It has been corrected by h/w "
522 "and requires no further action");
523 printk("%s""event severity: %s\n", pfx, cper_severity_str(severity));
/* Sections are printed with newpfx, built from pfx. */
524 snprintf(newpfx, sizeof(newpfx), "%s ", pfx);
526 apei_estatus_for_each_section(estatus, gdata) {
527 cper_estatus_print_section(newpfx, gdata, sec_no);
531 EXPORT_SYMBOL_GPL(cper_estatus_print);
/*
 * cper_estatus_check_header - sanity-check the lengths in a status header
 * @estatus: generic error status block to check
 *
 * Two conditions are tested:
 *  - data_length is non-zero but smaller than one generic data entry
 *    header;
 *  - raw data is present (raw_data_length non-zero) but raw_data_offset
 *    lies before the end of the header plus the section data.
 *
 * NOTE(review): presumably either condition yields a negative error code
 * and the function returns 0 otherwise -- confirm against the return
 * statements.
 */
533 int cper_estatus_check_header(const struct acpi_hest_generic_status *estatus)
535 if (estatus->data_length &&
536 estatus->data_length < sizeof(struct acpi_hest_generic_data))
538 if (estatus->raw_data_length &&
539 estatus->raw_data_offset < sizeof(*estatus) + estatus->data_length)
544 EXPORT_SYMBOL_GPL(cper_estatus_check_header);
546 int cper_estatus_check(const struct acpi_hest_generic_status *estatus)
548 struct acpi_hest_generic_data *gdata;
549 unsigned int data_len, gedata_len;
552 rc = cper_estatus_check_header(estatus);
555 data_len = estatus->data_length;
557 apei_estatus_for_each_section(estatus, gdata) {
558 gedata_len = acpi_hest_get_error_length(gdata);
559 if (gedata_len > data_len - acpi_hest_get_size(gdata))
561 data_len -= acpi_hest_get_record_size(gdata);
568 EXPORT_SYMBOL_GPL(cper_estatus_check);