Skip to content

Commit

Permalink
trace, RAS: Add eMCA trace event interface
Browse files Browse the repository at this point in the history
Add trace interface to elaborate all H/W error related information.

Signed-off-by: Chen, Gong <[email protected]>
Acked-by: Borislav Petkov <[email protected]>
Signed-off-by: Tony Luck <[email protected]>
  • Loading branch information
Chen, Gong authored and aegl committed Jun 25, 2014
1 parent d963cd9 commit 2dfb7d5
Show file tree
Hide file tree
Showing 6 changed files with 158 additions and 8 deletions.
4 changes: 3 additions & 1 deletion drivers/acpi/Kconfig
Original file line number Diff line number Diff line change
Expand Up @@ -370,6 +370,7 @@ config ACPI_EXTLOG
tristate "Extended Error Log support"
depends on X86_MCE && X86_LOCAL_APIC
select UEFI_CPER
select RAS
default n
help
Certain usages such as Predictive Failure Analysis (PFA) require
Expand All @@ -384,6 +385,7 @@ config ACPI_EXTLOG

Enhanced MCA Logging allows firmware to provide additional error
information to system software, synchronous with MCE or CMCI. This
driver adds support for that functionality.
driver adds support for that functionality with corresponding
tracepoint which carries that information to userspace.

endif # ACPI
27 changes: 24 additions & 3 deletions drivers/acpi/acpi_extlog.c
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@
#include <asm/mce.h>

#include "apei/apei-internal.h"
#include <ras/ras_event.h>

#define EXT_ELOG_ENTRY_MASK GENMASK_ULL(51, 0) /* elog entry address mask */

Expand Down Expand Up @@ -137,8 +138,12 @@ static int extlog_print(struct notifier_block *nb, unsigned long val,
struct mce *mce = (struct mce *)data;
int bank = mce->bank;
int cpu = mce->extcpu;
struct acpi_generic_status *estatus;
int rc;
struct acpi_generic_status *estatus, *tmp;
struct acpi_generic_data *gdata;
const uuid_le *fru_id = &NULL_UUID_LE;
char *fru_text = "";
uuid_le *sec_type;
static u32 err_seq;

estatus = extlog_elog_entry_check(cpu, bank);
if (estatus == NULL)
Expand All @@ -148,7 +153,23 @@ static int extlog_print(struct notifier_block *nb, unsigned long val,
/* clear record status to enable BIOS to update it again */
estatus->block_status = 0;

rc = print_extlog_rcd(NULL, (struct acpi_generic_status *)elog_buf, cpu);
tmp = (struct acpi_generic_status *)elog_buf;
print_extlog_rcd(NULL, tmp, cpu);

/* log event via trace */
err_seq++;
gdata = (struct acpi_generic_data *)(tmp + 1);
if (gdata->validation_bits & CPER_SEC_VALID_FRU_ID)
fru_id = (uuid_le *)gdata->fru_id;
if (gdata->validation_bits & CPER_SEC_VALID_FRU_TEXT)
fru_text = gdata->fru_text;
sec_type = (uuid_le *)gdata->section_type;
if (!uuid_le_cmp(*sec_type, CPER_SEC_PLATFORM_MEM)) {
struct cper_sec_mem_err *mem = (void *)(gdata + 1);
if (gdata->error_data_length >= sizeof(*mem))
trace_extlog_mem_event(mem, err_seq, fru_id, fru_text,
(u8)gdata->error_severity);
}

return NOTIFY_STOP;
}
Expand Down
45 changes: 41 additions & 4 deletions drivers/firmware/efi/cper.c
Original file line number Diff line number Diff line change
Expand Up @@ -207,7 +207,7 @@ const char *cper_mem_err_type_str(unsigned int etype)
}
EXPORT_SYMBOL_GPL(cper_mem_err_type_str);

static int cper_mem_err_location(const struct cper_sec_mem_err *mem, char *msg)
static int cper_mem_err_location(struct cper_mem_err_compact *mem, char *msg)
{
u32 len, n;

Expand Down Expand Up @@ -249,7 +249,7 @@ static int cper_mem_err_location(const struct cper_sec_mem_err *mem, char *msg)
return n;
}

static int cper_dimm_err_location(const struct cper_sec_mem_err *mem, char *msg)
static int cper_dimm_err_location(struct cper_mem_err_compact *mem, char *msg)
{
u32 len, n;
const char *bank = NULL, *device = NULL;
Expand All @@ -271,8 +271,44 @@ static int cper_dimm_err_location(const struct cper_sec_mem_err *mem, char *msg)
return n;
}

void cper_mem_err_pack(const struct cper_sec_mem_err *mem,
struct cper_mem_err_compact *cmem)
{
cmem->validation_bits = mem->validation_bits;
cmem->node = mem->node;
cmem->card = mem->card;
cmem->module = mem->module;
cmem->bank = mem->bank;
cmem->device = mem->device;
cmem->row = mem->row;
cmem->column = mem->column;
cmem->bit_pos = mem->bit_pos;
cmem->requestor_id = mem->requestor_id;
cmem->responder_id = mem->responder_id;
cmem->target_id = mem->target_id;
cmem->rank = mem->rank;
cmem->mem_array_handle = mem->mem_array_handle;
cmem->mem_dev_handle = mem->mem_dev_handle;
}

const char *cper_mem_err_unpack(struct trace_seq *p,
struct cper_mem_err_compact *cmem)
{
const char *ret = p->buffer + p->len;

if (cper_mem_err_location(cmem, rcd_decode_str))
trace_seq_printf(p, "%s", rcd_decode_str);
if (cper_dimm_err_location(cmem, rcd_decode_str))
trace_seq_printf(p, "%s", rcd_decode_str);
trace_seq_putc(p, '\0');

return ret;
}

static void cper_print_mem(const char *pfx, const struct cper_sec_mem_err *mem)
{
struct cper_mem_err_compact cmem;

if (mem->validation_bits & CPER_MEM_VALID_ERROR_STATUS)
printk("%s""error_status: 0x%016llx\n", pfx, mem->error_status);
if (mem->validation_bits & CPER_MEM_VALID_PA)
Expand All @@ -281,14 +317,15 @@ static void cper_print_mem(const char *pfx, const struct cper_sec_mem_err *mem)
if (mem->validation_bits & CPER_MEM_VALID_PA_MASK)
printk("%s""physical_address_mask: 0x%016llx\n",
pfx, mem->physical_addr_mask);
if (cper_mem_err_location(mem, rcd_decode_str))
cper_mem_err_pack(mem, &cmem);
if (cper_mem_err_location(&cmem, rcd_decode_str))
printk("%s%s\n", pfx, rcd_decode_str);
if (mem->validation_bits & CPER_MEM_VALID_ERROR_TYPE) {
u8 etype = mem->error_type;
printk("%s""error_type: %d, %s\n", pfx, etype,
cper_mem_err_type_str(etype));
}
if (cper_dimm_err_location(mem, rcd_decode_str))
if (cper_dimm_err_location(&cmem, rcd_decode_str))
printk("%s%s\n", pfx, rcd_decode_str);
}

Expand Down
3 changes: 3 additions & 0 deletions drivers/ras/ras.c
Original file line number Diff line number Diff line change
Expand Up @@ -23,4 +23,7 @@ static int __init ras_init(void)
}
subsys_initcall(ras_init);

#if defined(CONFIG_ACPI_EXTLOG) || defined(CONFIG_ACPI_EXTLOG_MODULE)
EXPORT_TRACEPOINT_SYMBOL_GPL(extlog_mem_event);
#endif
EXPORT_TRACEPOINT_SYMBOL_GPL(mc_event);
23 changes: 23 additions & 0 deletions include/linux/cper.h
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,7 @@
#define LINUX_CPER_H

#include <linux/uuid.h>
#include <linux/trace_seq.h>

/* CPER record signature and the size */
#define CPER_SIG_RECORD "CPER"
Expand Down Expand Up @@ -363,6 +364,24 @@ struct cper_sec_mem_err {
__u16 mem_dev_handle; /* module handle in UEFI 2.4 */
};

struct cper_mem_err_compact {
__u64 validation_bits;
__u16 node;
__u16 card;
__u16 module;
__u16 bank;
__u16 device;
__u16 row;
__u16 column;
__u16 bit_pos;
__u64 requestor_id;
__u64 responder_id;
__u64 target_id;
__u16 rank;
__u16 mem_array_handle;
__u16 mem_dev_handle;
};

struct cper_sec_pcie {
__u64 validation_bits;
__u32 port_type;
Expand Down Expand Up @@ -406,5 +425,9 @@ const char *cper_severity_str(unsigned int);
const char *cper_mem_err_type_str(unsigned int);
void cper_print_bits(const char *prefix, unsigned int bits,
const char * const strs[], unsigned int strs_size);
void cper_mem_err_pack(const struct cper_sec_mem_err *,
struct cper_mem_err_compact *);
const char *cper_mem_err_unpack(struct trace_seq *,
struct cper_mem_err_compact *);

#endif
64 changes: 64 additions & 0 deletions include/ras/ras_event.h
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,70 @@
#include <linux/edac.h>
#include <linux/ktime.h>
#include <linux/aer.h>
#include <linux/cper.h>

/*
* MCE Extended Error Log trace event
*
* These events are generated when hardware detects a corrected or
* uncorrected event.
*/

/* memory trace event */

#if defined(CONFIG_ACPI_EXTLOG) || defined(CONFIG_ACPI_EXTLOG_MODULE)
TRACE_EVENT(extlog_mem_event,
TP_PROTO(struct cper_sec_mem_err *mem,
u32 err_seq,
const uuid_le *fru_id,
const char *fru_text,
u8 sev),

TP_ARGS(mem, err_seq, fru_id, fru_text, sev),

TP_STRUCT__entry(
__field(u32, err_seq)
__field(u8, etype)
__field(u8, sev)
__field(u64, pa)
__field(u8, pa_mask_lsb)
__field_struct(uuid_le, fru_id)
__string(fru_text, fru_text)
__field_struct(struct cper_mem_err_compact, data)
),

TP_fast_assign(
__entry->err_seq = err_seq;
if (mem->validation_bits & CPER_MEM_VALID_ERROR_TYPE)
__entry->etype = mem->error_type;
else
__entry->etype = ~0;
__entry->sev = sev;
if (mem->validation_bits & CPER_MEM_VALID_PA)
__entry->pa = mem->physical_addr;
else
__entry->pa = ~0ull;

if (mem->validation_bits & CPER_MEM_VALID_PA_MASK)
__entry->pa_mask_lsb = (u8)__ffs64(mem->physical_addr_mask);
else
__entry->pa_mask_lsb = ~0;
__entry->fru_id = *fru_id;
__assign_str(fru_text, fru_text);
cper_mem_err_pack(mem, &__entry->data);
),

TP_printk("{%d} %s error: %s physical addr: %016llx (mask lsb: %x) %sFRU: %pUl %.20s",
__entry->err_seq,
cper_severity_str(__entry->sev),
cper_mem_err_type_str(__entry->etype),
__entry->pa,
__entry->pa_mask_lsb,
cper_mem_err_unpack(p, &__entry->data),
&__entry->fru_id,
__get_str(fru_text))
);
#endif

/*
* Hardware Events Report
Expand Down

0 comments on commit 2dfb7d5

Please sign in to comment.