From 0a00fd5e20fd5dc89e976e163588d7c54edaf745 Mon Sep 17 00:00:00 2001 From: Lv Zheng Date: Tue, 3 Jun 2014 16:32:53 +0800 Subject: ACPICA: Restore error table definitions to reduce code differences between Linux and ACPICA upstream. The following commit has changed ACPICA table header definitions: Commit: 88f074f4871a8c212b212b725e4dcdcdb09613c1 Subject: ACPI, CPER: Update cper info While such definitions are currently maintained in ACPICA. As the modifications applying to the table definitions affect other OSPMs' drivers, it is very difficult for ACPICA to initiate a process to complete the merge. Thus this commit finally only leaves us divergences. Revert such naming modifications to reduce the source code differences between Linux and ACPICA upstream. No functional changes. Signed-off-by: Lv Zheng Cc: Bob Moore Cc: Chen, Gong Cc: Tony Luck Cc: Borislav Petkov Signed-off-by: Rafael J. Wysocki --- drivers/acpi/acpi_extlog.c | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) (limited to 'drivers/acpi/acpi_extlog.c') diff --git a/drivers/acpi/acpi_extlog.c b/drivers/acpi/acpi_extlog.c index 185334114d71..340d09518f8e 100644 --- a/drivers/acpi/acpi_extlog.c +++ b/drivers/acpi/acpi_extlog.c @@ -69,11 +69,11 @@ static u32 l1_percpu_entry; #define ELOG_ENTRY_ADDR(phyaddr) \ (phyaddr - elog_base + (u8 *)elog_addr) -static struct acpi_generic_status *extlog_elog_entry_check(int cpu, int bank) +static struct acpi_hest_generic_status *extlog_elog_entry_check(int cpu, int bank) { int idx; u64 data; - struct acpi_generic_status *estatus; + struct acpi_hest_generic_status *estatus; WARN_ON(cpu < 0); idx = ELOG_IDX(cpu, bank); @@ -82,7 +82,7 @@ static struct acpi_generic_status *extlog_elog_entry_check(int cpu, int bank) return NULL; data &= EXT_ELOG_ENTRY_MASK; - estatus = (struct acpi_generic_status *)ELOG_ENTRY_ADDR(data); + estatus = (struct acpi_hest_generic_status *)ELOG_ENTRY_ADDR(data); /* if no valid data in elog entry, just return */ if
(estatus->block_status == 0) @@ -92,7 +92,7 @@ static struct acpi_generic_status *extlog_elog_entry_check(int cpu, int bank) } static void __print_extlog_rcd(const char *pfx, - struct acpi_generic_status *estatus, int cpu) + struct acpi_hest_generic_status *estatus, int cpu) { static atomic_t seqno; unsigned int curr_seqno; @@ -111,7 +111,7 @@ static void __print_extlog_rcd(const char *pfx, } static int print_extlog_rcd(const char *pfx, - struct acpi_generic_status *estatus, int cpu) + struct acpi_hest_generic_status *estatus, int cpu) { /* Not more than 2 messages every 5 seconds */ static DEFINE_RATELIMIT_STATE(ratelimit_corrected, 5*HZ, 2); @@ -137,7 +137,7 @@ static int extlog_print(struct notifier_block *nb, unsigned long val, struct mce *mce = (struct mce *)data; int bank = mce->bank; int cpu = mce->extcpu; - struct acpi_generic_status *estatus; + struct acpi_hest_generic_status *estatus; int rc; estatus = extlog_elog_entry_check(cpu, bank); @@ -148,7 +148,7 @@ static int extlog_print(struct notifier_block *nb, unsigned long val, /* clear record status to enable BIOS to update it again */ estatus->block_status = 0; - rc = print_extlog_rcd(NULL, (struct acpi_generic_status *)elog_buf, cpu); + rc = print_extlog_rcd(NULL, (struct acpi_hest_generic_status *)elog_buf, cpu); return NOTIFY_STOP; } -- cgit v1.2.3 From 2dfb7d51a61d7ca91b131c8db612f27d9390f2d5 Mon Sep 17 00:00:00 2001 From: "Chen, Gong" Date: Tue, 17 Jun 2014 22:33:07 -0400 Subject: trace, RAS: Add eMCA trace event interface Add trace interface to elaborate all H/W error related information. 
Signed-off-by: Chen, Gong Acked-by: Borislav Petkov Signed-off-by: Tony Luck --- drivers/acpi/Kconfig | 4 ++- drivers/acpi/acpi_extlog.c | 27 ++++++++++++++++--- drivers/firmware/efi/cper.c | 45 ++++++++++++++++++++++++++++--- drivers/ras/ras.c | 3 +++ include/linux/cper.h | 23 ++++++++++++++++ include/ras/ras_event.h | 64 +++++++++++++++++++++++++++++++++++++++++++++ 6 files changed, 158 insertions(+), 8 deletions(-) (limited to 'drivers/acpi/acpi_extlog.c') diff --git a/drivers/acpi/Kconfig b/drivers/acpi/Kconfig index a34a22841002..206942b8d105 100644 --- a/drivers/acpi/Kconfig +++ b/drivers/acpi/Kconfig @@ -370,6 +370,7 @@ config ACPI_EXTLOG tristate "Extended Error Log support" depends on X86_MCE && X86_LOCAL_APIC select UEFI_CPER + select RAS default n help Certain usages such as Predictive Failure Analysis (PFA) require @@ -384,6 +385,7 @@ config ACPI_EXTLOG Enhanced MCA Logging allows firmware to provide additional error information to system software, synchronous with MCE or CMCI. This - driver adds support for that functionality. + driver adds support for that functionality with corresponding + tracepoint which carries that information to userspace. 
endif # ACPI diff --git a/drivers/acpi/acpi_extlog.c b/drivers/acpi/acpi_extlog.c index 185334114d71..e61da957f30f 100644 --- a/drivers/acpi/acpi_extlog.c +++ b/drivers/acpi/acpi_extlog.c @@ -16,6 +16,7 @@ #include #include "apei/apei-internal.h" +#include #define EXT_ELOG_ENTRY_MASK GENMASK_ULL(51, 0) /* elog entry address mask */ @@ -137,8 +138,12 @@ static int extlog_print(struct notifier_block *nb, unsigned long val, struct mce *mce = (struct mce *)data; int bank = mce->bank; int cpu = mce->extcpu; - struct acpi_generic_status *estatus; - int rc; + struct acpi_generic_status *estatus, *tmp; + struct acpi_generic_data *gdata; + const uuid_le *fru_id = &NULL_UUID_LE; + char *fru_text = ""; + uuid_le *sec_type; + static u32 err_seq; estatus = extlog_elog_entry_check(cpu, bank); if (estatus == NULL) @@ -148,7 +153,23 @@ static int extlog_print(struct notifier_block *nb, unsigned long val, /* clear record status to enable BIOS to update it again */ estatus->block_status = 0; - rc = print_extlog_rcd(NULL, (struct acpi_generic_status *)elog_buf, cpu); + tmp = (struct acpi_generic_status *)elog_buf; + print_extlog_rcd(NULL, tmp, cpu); + + /* log event via trace */ + err_seq++; + gdata = (struct acpi_generic_data *)(tmp + 1); + if (gdata->validation_bits & CPER_SEC_VALID_FRU_ID) + fru_id = (uuid_le *)gdata->fru_id; + if (gdata->validation_bits & CPER_SEC_VALID_FRU_TEXT) + fru_text = gdata->fru_text; + sec_type = (uuid_le *)gdata->section_type; + if (!uuid_le_cmp(*sec_type, CPER_SEC_PLATFORM_MEM)) { + struct cper_sec_mem_err *mem = (void *)(gdata + 1); + if (gdata->error_data_length >= sizeof(*mem)) + trace_extlog_mem_event(mem, err_seq, fru_id, fru_text, + (u8)gdata->error_severity); + } return NOTIFY_STOP; } diff --git a/drivers/firmware/efi/cper.c b/drivers/firmware/efi/cper.c index ac33a9fed341..437e6fd47311 100644 --- a/drivers/firmware/efi/cper.c +++ b/drivers/firmware/efi/cper.c @@ -207,7 +207,7 @@ const char *cper_mem_err_type_str(unsigned int etype) } 
EXPORT_SYMBOL_GPL(cper_mem_err_type_str); -static int cper_mem_err_location(const struct cper_sec_mem_err *mem, char *msg) +static int cper_mem_err_location(struct cper_mem_err_compact *mem, char *msg) { u32 len, n; @@ -249,7 +249,7 @@ static int cper_mem_err_location(const struct cper_sec_mem_err *mem, char *msg) return n; } -static int cper_dimm_err_location(const struct cper_sec_mem_err *mem, char *msg) +static int cper_dimm_err_location(struct cper_mem_err_compact *mem, char *msg) { u32 len, n; const char *bank = NULL, *device = NULL; @@ -271,8 +271,44 @@ static int cper_dimm_err_location(const struct cper_sec_mem_err *mem, char *msg) return n; } +void cper_mem_err_pack(const struct cper_sec_mem_err *mem, + struct cper_mem_err_compact *cmem) +{ + cmem->validation_bits = mem->validation_bits; + cmem->node = mem->node; + cmem->card = mem->card; + cmem->module = mem->module; + cmem->bank = mem->bank; + cmem->device = mem->device; + cmem->row = mem->row; + cmem->column = mem->column; + cmem->bit_pos = mem->bit_pos; + cmem->requestor_id = mem->requestor_id; + cmem->responder_id = mem->responder_id; + cmem->target_id = mem->target_id; + cmem->rank = mem->rank; + cmem->mem_array_handle = mem->mem_array_handle; + cmem->mem_dev_handle = mem->mem_dev_handle; +} + +const char *cper_mem_err_unpack(struct trace_seq *p, + struct cper_mem_err_compact *cmem) +{ + const char *ret = p->buffer + p->len; + + if (cper_mem_err_location(cmem, rcd_decode_str)) + trace_seq_printf(p, "%s", rcd_decode_str); + if (cper_dimm_err_location(cmem, rcd_decode_str)) + trace_seq_printf(p, "%s", rcd_decode_str); + trace_seq_putc(p, '\0'); + + return ret; +} + static void cper_print_mem(const char *pfx, const struct cper_sec_mem_err *mem) { + struct cper_mem_err_compact cmem; + if (mem->validation_bits & CPER_MEM_VALID_ERROR_STATUS) printk("%s""error_status: 0x%016llx\n", pfx, mem->error_status); if (mem->validation_bits & CPER_MEM_VALID_PA) @@ -281,14 +317,15 @@ static void cper_print_mem(const 
char *pfx, const struct cper_sec_mem_err *mem) if (mem->validation_bits & CPER_MEM_VALID_PA_MASK) printk("%s""physical_address_mask: 0x%016llx\n", pfx, mem->physical_addr_mask); - if (cper_mem_err_location(mem, rcd_decode_str)) + cper_mem_err_pack(mem, &cmem); + if (cper_mem_err_location(&cmem, rcd_decode_str)) printk("%s%s\n", pfx, rcd_decode_str); if (mem->validation_bits & CPER_MEM_VALID_ERROR_TYPE) { u8 etype = mem->error_type; printk("%s""error_type: %d, %s\n", pfx, etype, cper_mem_err_type_str(etype)); } - if (cper_dimm_err_location(mem, rcd_decode_str)) + if (cper_dimm_err_location(&cmem, rcd_decode_str)) printk("%s%s\n", pfx, rcd_decode_str); } diff --git a/drivers/ras/ras.c b/drivers/ras/ras.c index 4cac43a1e25c..b67dd362b7b6 100644 --- a/drivers/ras/ras.c +++ b/drivers/ras/ras.c @@ -23,4 +23,7 @@ static int __init ras_init(void) } subsys_initcall(ras_init); +#if defined(CONFIG_ACPI_EXTLOG) || defined(CONFIG_ACPI_EXTLOG_MODULE) +EXPORT_TRACEPOINT_SYMBOL_GPL(extlog_mem_event); +#endif EXPORT_TRACEPOINT_SYMBOL_GPL(mc_event); diff --git a/include/linux/cper.h b/include/linux/cper.h index ed088b9c1298..76abba4b238e 100644 --- a/include/linux/cper.h +++ b/include/linux/cper.h @@ -22,6 +22,7 @@ #define LINUX_CPER_H #include +#include /* CPER record signature and the size */ #define CPER_SIG_RECORD "CPER" @@ -363,6 +364,24 @@ struct cper_sec_mem_err { __u16 mem_dev_handle; /* module handle in UEFI 2.4 */ }; +struct cper_mem_err_compact { + __u64 validation_bits; + __u16 node; + __u16 card; + __u16 module; + __u16 bank; + __u16 device; + __u16 row; + __u16 column; + __u16 bit_pos; + __u64 requestor_id; + __u64 responder_id; + __u64 target_id; + __u16 rank; + __u16 mem_array_handle; + __u16 mem_dev_handle; +}; + struct cper_sec_pcie { __u64 validation_bits; __u32 port_type; @@ -406,5 +425,9 @@ const char *cper_severity_str(unsigned int); const char *cper_mem_err_type_str(unsigned int); void cper_print_bits(const char *prefix, unsigned int bits, const char * const 
strs[], unsigned int strs_size); +void cper_mem_err_pack(const struct cper_sec_mem_err *, + struct cper_mem_err_compact *); +const char *cper_mem_err_unpack(struct trace_seq *, + struct cper_mem_err_compact *); #endif diff --git a/include/ras/ras_event.h b/include/ras/ras_event.h index acbcbb88eaaa..47da53c27ffa 100644 --- a/include/ras/ras_event.h +++ b/include/ras/ras_event.h @@ -9,6 +9,70 @@ #include #include #include +#include + +/* + * MCE Extended Error Log trace event + * + * These events are generated when hardware detects a corrected or + * uncorrected event. + */ + +/* memory trace event */ + +#if defined(CONFIG_ACPI_EXTLOG) || defined(CONFIG_ACPI_EXTLOG_MODULE) +TRACE_EVENT(extlog_mem_event, + TP_PROTO(struct cper_sec_mem_err *mem, + u32 err_seq, + const uuid_le *fru_id, + const char *fru_text, + u8 sev), + + TP_ARGS(mem, err_seq, fru_id, fru_text, sev), + + TP_STRUCT__entry( + __field(u32, err_seq) + __field(u8, etype) + __field(u8, sev) + __field(u64, pa) + __field(u8, pa_mask_lsb) + __field_struct(uuid_le, fru_id) + __string(fru_text, fru_text) + __field_struct(struct cper_mem_err_compact, data) + ), + + TP_fast_assign( + __entry->err_seq = err_seq; + if (mem->validation_bits & CPER_MEM_VALID_ERROR_TYPE) + __entry->etype = mem->error_type; + else + __entry->etype = ~0; + __entry->sev = sev; + if (mem->validation_bits & CPER_MEM_VALID_PA) + __entry->pa = mem->physical_addr; + else + __entry->pa = ~0ull; + + if (mem->validation_bits & CPER_MEM_VALID_PA_MASK) + __entry->pa_mask_lsb = (u8)__ffs64(mem->physical_addr_mask); + else + __entry->pa_mask_lsb = ~0; + __entry->fru_id = *fru_id; + __assign_str(fru_text, fru_text); + cper_mem_err_pack(mem, &__entry->data); + ), + + TP_printk("{%d} %s error: %s physical addr: %016llx (mask lsb: %x) %sFRU: %pUl %.20s", + __entry->err_seq, + cper_severity_str(__entry->sev), + cper_mem_err_type_str(__entry->etype), + __entry->pa, + __entry->pa_mask_lsb, + cper_mem_err_unpack(p, &__entry->data), + &__entry->fru_id, + 
__get_str(fru_text)) +); +#endif /* * Hardware Events Report -- cgit v1.2.3 From d6cae935ec5b7873a8ccd8f0331bef2df729e86a Mon Sep 17 00:00:00 2001 From: "Chen, Gong" Date: Wed, 11 Jun 2014 04:34:50 -0400 Subject: trace, eMCA: Add a knob to adjust where to save event log To avoid saving two copies for one H/W event, add a new file under debugfs to control how to save event log. Once this file is opened, perf/trace will be used and, in the meantime, the kernel will stop printing the event log to the console. On the other hand, if this file is closed, the kernel will print the event log to the console again. Signed-off-by: Chen, Gong Acked-by: Borislav Petkov Signed-off-by: Tony Luck --- drivers/acpi/acpi_extlog.c | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) (limited to 'drivers/acpi/acpi_extlog.c') diff --git a/drivers/acpi/acpi_extlog.c b/drivers/acpi/acpi_extlog.c index e61da957f30f..a99d4a6156dc 100644 --- a/drivers/acpi/acpi_extlog.c +++ b/drivers/acpi/acpi_extlog.c @@ -12,6 +12,7 @@ #include #include #include +#include #include #include @@ -154,7 +155,11 @@ static int extlog_print(struct notifier_block *nb, unsigned long val, estatus->block_status = 0; tmp = (struct acpi_generic_status *)elog_buf; - print_extlog_rcd(NULL, tmp, cpu); + + if (!ras_userspace_consumers()) { + print_extlog_rcd(NULL, tmp, cpu); + goto out; + } /* log event via trace */ err_seq++; @@ -171,6 +176,7 @@ static int extlog_print(struct notifier_block *nb, unsigned long val, (u8)gdata->error_severity); } +out: return NOTIFY_STOP; } -- cgit v1.2.3 From 7c76bb5f7a3d052339b873374333dd0dcc35ce28 Mon Sep 17 00:00:00 2001 From: "Chen, Gong" Date: Wed, 11 Jun 2014 04:34:51 -0400 Subject: RAS, extlog: Adjust init flow Unless the platform has eMCA related capability, there is no need to check whether there is a conflict with the EDAC driver.
Signed-off-by: Chen, Gong Acked-by: Borislav Petkov Signed-off-by: Tony Luck --- drivers/acpi/acpi_extlog.c | 13 +++++-------- 1 file changed, 5 insertions(+), 8 deletions(-) (limited to 'drivers/acpi/acpi_extlog.c') diff --git a/drivers/acpi/acpi_extlog.c b/drivers/acpi/acpi_extlog.c index a99d4a6156dc..0ad6f389d922 100644 --- a/drivers/acpi/acpi_extlog.c +++ b/drivers/acpi/acpi_extlog.c @@ -223,19 +223,16 @@ static int __init extlog_init(void) u64 cap; int rc; + rdmsrl(MSR_IA32_MCG_CAP, cap); + + if (!(cap & MCG_ELOG_P) || !extlog_get_l1addr()) + return -ENODEV; + if (get_edac_report_status() == EDAC_REPORTING_FORCE) { pr_warn("Not loading eMCA, error reporting force-enabled through EDAC.\n"); return -EPERM; } - rc = -ENODEV; - rdmsrl(MSR_IA32_MCG_CAP, cap); - if (!(cap & MCG_ELOG_P)) - return rc; - - if (!extlog_get_l1addr()) - return rc; - rc = -EINVAL; /* get L1 header to fetch necessary information */ l1_hdr_size = sizeof(struct extlog_l1_head); -- cgit v1.2.3