From 80f5ab097b87c86581cb9736a8e55c5a3047d4bb Mon Sep 17 00:00:00 2001 From: Shaun Ruffell Date: Sun, 19 Aug 2012 01:11:24 -0300 Subject: edac: edac_mc no longer deals with kobjects directly There are no more embedded kobjects in struct mem_ctl_info. Remove a header and a comment that does not reflect the code anymore. Signed-off-by: Shaun Ruffell Signed-off-by: Mauro Carvalho Chehab --- include/linux/edac.h | 1 - 1 file changed, 1 deletion(-) (limited to 'include/linux/edac.h') diff --git a/include/linux/edac.h b/include/linux/edac.h index 1b8c02b36f76..4784213c819d 100644 --- a/include/linux/edac.h +++ b/include/linux/edac.h @@ -14,7 +14,6 @@ #include #include -#include #include #include #include -- cgit v1.2.3 From c66b5a79a9348ccd6d1cd81416027d0e12da965d Mon Sep 17 00:00:00 2001 From: Mauro Carvalho Chehab Date: Fri, 15 Feb 2013 07:21:08 -0300 Subject: edac: add a new memory layer type There are some cases where the memory controller layout is completely hidden. This is the case of firmware-driven error code, like the one provided by GHES. Add a new layer to be used on such memory error report mechanisms. Signed-off-by: Mauro Carvalho Chehab --- drivers/edac/edac_mc.c | 1 + include/linux/edac.h | 4 ++++ 2 files changed, 5 insertions(+) (limited to 'include/linux/edac.h') diff --git a/drivers/edac/edac_mc.c b/drivers/edac/edac_mc.c index fb219bc5cb2c..78d8c7d6e76a 100644 --- a/drivers/edac/edac_mc.c +++ b/drivers/edac/edac_mc.c @@ -900,6 +900,7 @@ const char *edac_layer_name[] = { [EDAC_MC_LAYER_CHANNEL] = "channel", [EDAC_MC_LAYER_SLOT] = "slot", [EDAC_MC_LAYER_CHIP_SELECT] = "csrow", + [EDAC_MC_LAYER_ALL_MEM] = "memory", }; EXPORT_SYMBOL_GPL(edac_layer_name); diff --git a/include/linux/edac.h b/include/linux/edac.h index 4784213c819d..1b7744c219b8 100644 --- a/include/linux/edac.h +++ b/include/linux/edac.h @@ -375,6 +375,9 @@ enum scrub_type { * @EDAC_MC_LAYER_CHANNEL: memory layer is named "channel" * @EDAC_MC_LAYER_SLOT: memory layer is named "slot" * @EDAC_MC_LAYER_CHIP_SELECT: memory layer is named "chip select" + * @EDAC_MC_LAYER_ALL_MEM: memory layout is unknown. All memory is mapped + * as a single memory area. This is used when + * retrieving errors from a firmware driven driver. * * This enum is used by the drivers to tell edac_mc_sysfs what name should * be used when describing a memory stick location. @@ -384,6 +387,7 @@ enum edac_mc_layer_type { EDAC_MC_LAYER_CHANNEL, EDAC_MC_LAYER_SLOT, EDAC_MC_LAYER_CHIP_SELECT, + EDAC_MC_LAYER_ALL_MEM, }; /** -- cgit v1.2.3 From c2c93dbc97622e26dc19edc71e50ebaa996d7804 Mon Sep 17 00:00:00 2001 From: Mauro Carvalho Chehab Date: Tue, 19 Feb 2013 06:50:05 -0300 Subject: edac: remove proc_name from mci structure proc_name isn't used anywhere. Remove it. Signed-off-by: Mauro Carvalho Chehab --- include/linux/edac.h | 2 -- 1 file changed, 2 deletions(-) (limited to 'include/linux/edac.h') diff --git a/include/linux/edac.h b/include/linux/edac.h index 1b7744c219b8..ff18efc754f3 100644 --- a/include/linux/edac.h +++ b/include/linux/edac.h @@ -48,7 +48,6 @@ static inline void opstate_init(void) } #define EDAC_MC_LABEL_LEN 31 -#define MC_PROC_NAME_MAX_LEN 7 /** * enum dev_type - describe the type of memory DRAM chips used at the stick @@ -633,7 +632,6 @@ struct mem_ctl_info { const char *mod_ver; const char *ctl_name; const char *dev_name; - char proc_name[MC_PROC_NAME_MAX_LEN + 1]; void *pvt_info; unsigned long start_time; /* mci load start time (in jiffies) */ -- cgit v1.2.3 From c7ef7645544131b0750478d1cf94cdfa945c809d Mon Sep 17 00:00:00 2001 From: Mauro Carvalho Chehab Date: Thu, 21 Feb 2013 13:36:45 -0300 Subject: edac: reduce stack pressure by using a pre-allocated buffer The number of variables at the stack is too big. Reduces the stack usage by using a pre-allocated error buffer. Signed-off-by: Mauro Carvalho Chehab --- drivers/edac/edac_mc.c | 81 ++++++++++++++++++++++++++++++-------------------- include/linux/edac.h | 56 ++++++++++++++++++++++++++++++++++ 2 files changed, 104 insertions(+), 33 deletions(-) (limited to 'include/linux/edac.h') diff --git a/drivers/edac/edac_mc.c b/drivers/edac/edac_mc.c index 34eb9703ed33..4f18dd755939 100644 --- a/drivers/edac/edac_mc.c +++ b/drivers/edac/edac_mc.c @@ -1065,7 +1065,6 @@ static void edac_ue_error(struct mem_ctl_info *mci, edac_inc_ue_error(mci, enable_per_layer_report, pos, error_count); } -#define OTHER_LABEL " or " /** * edac_mc_handle_error - reports a memory event to userspace @@ -1097,19 +1096,28 @@ void edac_mc_handle_error(const enum hw_event_mc_err_type type, const char *msg, const char *other_detail) { - /* FIXME: too much for stack: move it to some pre-alocated area */ - char detail[80], location[80]; - char label[(EDAC_MC_LABEL_LEN + 1 + sizeof(OTHER_LABEL)) * mci->tot_dimms]; + char detail[80]; char *p; int row = -1, chan = -1; int pos[EDAC_MAX_LAYERS] = { top_layer, mid_layer, low_layer }; - int i; - long grain; - bool enable_per_layer_report = false; + int i, n_labels = 0; u8 grain_bits; + struct edac_raw_error_desc *e = &mci->error_desc; edac_dbg(3, "MC%d\n", mci->mc_idx); + /* Fills the error report buffer */ + memset(e, 0, sizeof (*e)); + e->error_count = error_count; + e->top_layer = top_layer; + e->mid_layer = mid_layer; + e->low_layer = low_layer; + e->page_frame_number = page_frame_number; + e->offset_in_page = offset_in_page; + e->syndrome = syndrome; + e->msg = msg; + e->other_detail = other_detail; + /* * Check if the event report is consistent and if the memory * location is known. If it is known, enable_per_layer_report will be @@ -1132,7 +1140,7 @@ void edac_mc_handle_error(const enum hw_event_mc_err_type type, pos[i] = -1; } if (pos[i] >= 0) - enable_per_layer_report = true; + e->enable_per_layer_report = true; } /* @@ -1146,8 +1154,7 @@ void edac_mc_handle_error(const enum hw_event_mc_err_type type, * where each memory belongs to a separate channel within the same * branch. */ - grain = 0; - p = label; + p = e->label; *p = '\0'; for (i = 0; i < mci->tot_dimms; i++) { @@ -1161,8 +1168,8 @@ void edac_mc_handle_error(const enum hw_event_mc_err_type type, continue; /* get the max grain, over the error match range */ - if (dimm->grain > grain) - grain = dimm->grain; + if (dimm->grain > e->grain) + e->grain = dimm->grain; /* * If the error is memory-controller wide, there's no need to @@ -1170,8 +1177,13 @@ void edac_mc_handle_error(const enum hw_event_mc_err_type type, * channel/memory controller/... may be affected. * Also, don't show errors for empty DIMM slots. */ - if (enable_per_layer_report && dimm->nr_pages) { - if (p != label) { + if (e->enable_per_layer_report && dimm->nr_pages) { + if (n_labels >= EDAC_MAX_LABELS) { + e->enable_per_layer_report = false; + break; + } + n_labels++; + if (p != e->label) { strcpy(p, OTHER_LABEL); p += strlen(OTHER_LABEL); } @@ -1198,12 +1210,12 @@ void edac_mc_handle_error(const enum hw_event_mc_err_type type, } } - if (!enable_per_layer_report) { - strcpy(label, "any memory"); + if (!e->enable_per_layer_report) { + strcpy(e->label, "any memory"); } else { edac_dbg(4, "csrow/channel to increment: (%d,%d)\n", row, chan); - if (p == label) - strcpy(label, "unknown memory"); + if (p == e->label) + strcpy(e->label, "unknown memory"); if (type == HW_EVENT_ERR_CORRECTED) { if (row >= 0) { mci->csrows[row]->ce_count += error_count; @@ -1216,7 +1228,7 @@ void edac_mc_handle_error(const enum hw_event_mc_err_type type, } /* Fill the RAM location data */ - p = location; + p = e->location; for (i = 0; i < mci->n_layers; i++) { if (pos[i] < 0) @@ -1226,32 +1238,35 @@ void edac_mc_handle_error(const enum hw_event_mc_err_type type, edac_layer_name[mci->layers[i].type], pos[i]); } - if (p > location) + if (p > e->location) *(p - 1) = '\0'; /* Report the error via the trace interface */ - grain_bits = fls_long(grain) + 1; - trace_mc_event(type, msg, label, error_count, - mci->mc_idx, top_layer, mid_layer, low_layer, - PAGES_TO_MiB(page_frame_number) | offset_in_page, - grain_bits, syndrome, other_detail); + grain_bits = fls_long(e->grain) + 1; + trace_mc_event(type, e->msg, e->label, e->error_count, + mci->mc_idx, e->top_layer, e->mid_layer, e->low_layer, + PAGES_TO_MiB(e->page_frame_number) | e->offset_in_page, + grain_bits, e->syndrome, other_detail); /* Memory type dependent details about the error */ if (type == HW_EVENT_ERR_CORRECTED) { snprintf(detail, sizeof(detail), "page:0x%lx offset:0x%lx grain:%ld syndrome:0x%lx", - page_frame_number, offset_in_page, - grain, syndrome); - edac_ce_error(mci, error_count, pos, msg, location, label, - detail, other_detail, enable_per_layer_report, - page_frame_number, offset_in_page, grain); + e->page_frame_number, e->offset_in_page, + e->grain, e->syndrome); + edac_ce_error(mci, e->error_count, pos, e->msg, e->location, + e->label, detail, other_detail, + e->enable_per_layer_report, + e->page_frame_number, e->offset_in_page, + e->grain); } else { snprintf(detail, sizeof(detail), "page:0x%lx offset:0x%lx grain:%ld", - page_frame_number, offset_in_page, grain); + page_frame_number, offset_in_page, e->grain); - edac_ue_error(mci, error_count, pos, msg, location, label, - detail, other_detail, enable_per_layer_report); + edac_ue_error(mci, e->error_count, pos, e->msg, e->location, + e->label, detail, other_detail, + e->enable_per_layer_report); } } EXPORT_SYMBOL_GPL(edac_mc_handle_error); diff --git a/include/linux/edac.h b/include/linux/edac.h index ff18efc754f3..096b7fcdf484 100644 --- a/include/linux/edac.h +++ b/include/linux/edac.h @@ -47,8 +47,18 @@ static inline void opstate_init(void) return; } +/* Max length of a DIMM label*/ #define EDAC_MC_LABEL_LEN 31 +/* Maximum size of the location string */ +#define LOCATION_SIZE 80 + +/* Defines the maximum number of labels that can be reported */ +#define EDAC_MAX_LABELS 8 + +/* String used to join two or more labels */ +#define OTHER_LABEL " or " + /** * enum dev_type - describe the type of memory DRAM chips used at the stick * @DEV_UNKNOWN: Can't be determined, or MC doesn't support detect it @@ -553,6 +563,46 @@ struct errcount_attribute_data { int layer0, layer1, layer2; }; +/** + * edac_raw_error_desc - Raw error report structure + * @grain: minimum granularity for an error report, in bytes + * @error_count: number of errors of the same type + * @top_layer: top layer of the error (layer[0]) + * @mid_layer: middle layer of the error (layer[1]) + * @low_layer: low layer of the error (layer[2]) + * @page_frame_number: page where the error happened + * @offset_in_page: page offset + * @syndrome: syndrome of the error (or 0 if unknown or if + * the syndrome is not applicable) + * @msg: error message + * @location: location of the error + * @label: label of the affected DIMM(s) + * @other_detail: other driver-specific detail about the error + * @enable_per_layer_report: if false, the error affects all layers + * (typically, a memory controller error) + */ +struct edac_raw_error_desc { + /* + * NOTE: everything before grain won't be cleaned by + * edac_raw_error_desc_clean() + */ + char location[LOCATION_SIZE]; + char label[(EDAC_MC_LABEL_LEN + 1 + sizeof(OTHER_LABEL)) * EDAC_MAX_LABELS]; + long grain; + + /* the vars below and grain will be cleaned on every new error report */ + u16 error_count; + int top_layer; + int mid_layer; + int low_layer; + unsigned long page_frame_number; + unsigned long offset_in_page; + unsigned long syndrome; + const char *msg; + const char *other_detail; + bool enable_per_layer_report; +}; + /* MEMORY controller information structure */ struct mem_ctl_info { @@ -660,6 +710,12 @@ struct mem_ctl_info { /* work struct for this MC */ struct delayed_work work; + /* + * Used to report an error - by being at the global struct + * makes the memory allocated by the EDAC core + */ + struct edac_raw_error_desc error_desc; + /* the internal state of this controller instance */ int op_state; -- cgit v1.2.3 From 8dd93d450bff251575c56b8f058393124e1f00fb Mon Sep 17 00:00:00 2001 From: Mauro Carvalho Chehab Date: Tue, 19 Feb 2013 21:26:22 -0300 Subject: edac: add support for error type "Info" The CPER spec defines a forth type of error: informational logs. Add support for it at the edac API and at the trace event interface. Signed-off-by: Mauro Carvalho Chehab --- include/linux/edac.h | 16 ++++++++++++++++ include/ras/ras_event.h | 4 +--- 2 files changed, 17 insertions(+), 3 deletions(-) (limited to 'include/linux/edac.h') diff --git a/include/linux/edac.h b/include/linux/edac.h index 096b7fcdf484..4fd4999ccb5b 100644 --- a/include/linux/edac.h +++ b/include/linux/edac.h @@ -109,8 +109,24 @@ enum hw_event_mc_err_type { HW_EVENT_ERR_CORRECTED, HW_EVENT_ERR_UNCORRECTED, HW_EVENT_ERR_FATAL, + HW_EVENT_ERR_INFO, }; +static inline char *mc_event_error_type(const unsigned int err_type) +{ + switch (err_type) { + case HW_EVENT_ERR_CORRECTED: + return "Corrected"; + case HW_EVENT_ERR_UNCORRECTED: + return "Uncorrected"; + case HW_EVENT_ERR_FATAL: + return "Fatal"; + default: + case HW_EVENT_ERR_INFO: + return "Info"; + } +} + /** * enum mem_type - memory types. For a more detailed reference, please see * http://en.wikipedia.org/wiki/DRAM diff --git a/include/ras/ras_event.h b/include/ras/ras_event.h index 260470e72483..21cdb0b7b0fb 100644 --- a/include/ras/ras_event.h +++ b/include/ras/ras_event.h @@ -78,9 +78,7 @@ TRACE_EVENT(mc_event, TP_printk("%d %s error%s:%s%s on %s (mc:%d location:%d:%d:%d address:0x%08lx grain:%d syndrome:0x%08lx%s%s)", __entry->error_count, - (__entry->error_type == HW_EVENT_ERR_CORRECTED) ? "Corrected" : - ((__entry->error_type == HW_EVENT_ERR_FATAL) ? - "Fatal" : "Uncorrected"), + mc_event_error_type(__entry->error_type), __entry->error_count > 1 ? "s" : "", ((char *)__get_str(msg))[0] ? " " : "", __get_str(msg), -- cgit v1.2.3