Skip to content

Commit

Permalink
Merge branch 'ras-core-for-linus' of git://git.kernel.org/pub/scm/lin…
Browse files Browse the repository at this point in the history
…ux/kernel/git/tip/tip

Pull x86 RAS updates from Ingo Molnar:

 - various AMD SMCA error parsing/reporting improvements (Yazen Ghannam)

 - extend Intel CMCI error reporting to more cases (Xie XiuQi)

* 'ras-core-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip:
  x86/MCE: Make correctable error detection look at the Deferred bit
  x86/MCE: Report only DRAM ECC as memory errors on AMD systems
  x86/MCE/AMD: Define a function to get SMCA bank type
  x86/mce/AMD: Don't set DEF_INT_TYPE in MSR_CU_DEF_ERR on SMCA systems
  x86/MCE: Extend table to report action optional errors through CMCI too
  • Loading branch information
torvalds committed Jan 30, 2018
2 parents d8b91dd + 179eb85 commit a1c75e1
Show file tree
Hide file tree
Showing 4 changed files with 60 additions and 14 deletions.
2 changes: 2 additions & 0 deletions arch/x86/include/asm/mce.h
Original file line number Diff line number Diff line change
Expand Up @@ -376,6 +376,7 @@ struct smca_bank {
extern struct smca_bank smca_banks[MAX_NR_BANKS];

extern const char *smca_get_long_name(enum smca_bank_types t);
extern bool amd_mce_is_memory_error(struct mce *m);

extern int mce_threshold_create_device(unsigned int cpu);
extern int mce_threshold_remove_device(unsigned int cpu);
Expand All @@ -384,6 +385,7 @@ extern int mce_threshold_remove_device(unsigned int cpu);

static inline int mce_threshold_create_device(unsigned int cpu) { return 0; };
static inline int mce_threshold_remove_device(unsigned int cpu) { return 0; };
static inline bool amd_mce_is_memory_error(struct mce *m) { return false; };

#endif

Expand Down
26 changes: 17 additions & 9 deletions arch/x86/kernel/cpu/mcheck/mce-severity.c
Original file line number Diff line number Diff line change
Expand Up @@ -59,6 +59,7 @@ static struct severity {
#define MCGMASK(x, y) .mcgmask = x, .mcgres = y
#define MASK(x, y) .mask = x, .result = y
#define MCI_UC_S (MCI_STATUS_UC|MCI_STATUS_S)
#define MCI_UC_AR (MCI_STATUS_UC|MCI_STATUS_AR)
#define MCI_UC_SAR (MCI_STATUS_UC|MCI_STATUS_S|MCI_STATUS_AR)
#define MCI_ADDR (MCI_STATUS_ADDRV|MCI_STATUS_MISCV)

Expand Down Expand Up @@ -101,6 +102,22 @@ static struct severity {
NOSER, BITCLR(MCI_STATUS_UC)
),

/*
* known AO MCACODs reported via MCE or CMC:
*
* SRAO could be signaled either via a machine check exception or
* CMCI with the corresponding bit S 1 or 0. So we don't need to
* check bit S for SRAO.
*/
MCESEV(
AO, "Action optional: memory scrubbing error",
SER, MASK(MCI_STATUS_OVER|MCI_UC_AR|MCACOD_SCRUBMSK, MCI_STATUS_UC|MCACOD_SCRUB)
),
MCESEV(
AO, "Action optional: last level cache writeback error",
SER, MASK(MCI_STATUS_OVER|MCI_UC_AR|MCACOD, MCI_STATUS_UC|MCACOD_L3WB)
),

/* ignore OVER for UCNA */
MCESEV(
UCNA, "Uncorrected no action required",
Expand Down Expand Up @@ -149,15 +166,6 @@ static struct severity {
SER, MASK(MCI_STATUS_OVER|MCI_UC_SAR, MCI_UC_SAR)
),

/* known AO MCACODs: */
MCESEV(
AO, "Action optional: memory scrubbing error",
SER, MASK(MCI_STATUS_OVER|MCI_UC_SAR|MCACOD_SCRUBMSK, MCI_UC_S|MCACOD_SCRUB)
),
MCESEV(
AO, "Action optional: last level cache writeback error",
SER, MASK(MCI_STATUS_OVER|MCI_UC_SAR|MCACOD, MCI_UC_S|MCACOD_L3WB)
),
MCESEV(
SOME, "Action optional: unknown MCACOD",
SER, MASK(MCI_STATUS_OVER|MCI_UC_SAR, MCI_UC_S)
Expand Down
17 changes: 13 additions & 4 deletions arch/x86/kernel/cpu/mcheck/mce.c
Original file line number Diff line number Diff line change
Expand Up @@ -503,10 +503,8 @@ static int mce_usable_address(struct mce *m)
bool mce_is_memory_error(struct mce *m)
{
if (m->cpuvendor == X86_VENDOR_AMD) {
/* ErrCodeExt[20:16] */
u8 xec = (m->status >> 16) & 0x1f;
return amd_mce_is_memory_error(m);

return (xec == 0x0 || xec == 0x8);
} else if (m->cpuvendor == X86_VENDOR_INTEL) {
/*
* Intel SDM Volume 3B - 15.9.2 Compound Error Codes
Expand All @@ -530,14 +528,25 @@ bool mce_is_memory_error(struct mce *m)
}
EXPORT_SYMBOL_GPL(mce_is_memory_error);

static bool mce_is_correctable(struct mce *m)
{
if (m->cpuvendor == X86_VENDOR_AMD && m->status & MCI_STATUS_DEFERRED)
return false;

if (m->status & MCI_STATUS_UC)
return false;

return true;
}

static bool cec_add_mce(struct mce *m)
{
if (!m)
return false;

/* We eat only correctable DRAM errors with usable addresses. */
if (mce_is_memory_error(m) &&
!(m->status & MCI_STATUS_UC) &&
mce_is_correctable(m) &&
mce_usable_address(m))
if (!cec_add_elem(m->addr >> PAGE_SHIFT))
return true;
Expand Down
29 changes: 28 additions & 1 deletion arch/x86/kernel/cpu/mcheck/mce_amd.c
Original file line number Diff line number Diff line change
Expand Up @@ -110,6 +110,20 @@ const char *smca_get_long_name(enum smca_bank_types t)
}
EXPORT_SYMBOL_GPL(smca_get_long_name);

static enum smca_bank_types smca_get_bank_type(struct mce *m)
{
struct smca_bank *b;

if (m->bank >= N_SMCA_BANK_TYPES)
return N_SMCA_BANK_TYPES;

b = &smca_banks[m->bank];
if (!b->hwid)
return N_SMCA_BANK_TYPES;

return b->hwid->bank_type;
}

static struct smca_hwid smca_hwid_mcatypes[] = {
/* { bank_type, hwid_mcatype, xec_bitmap } */

Expand Down Expand Up @@ -407,7 +421,9 @@ static void deferred_error_interrupt_enable(struct cpuinfo_x86 *c)
(deferred_error_int_vector != amd_deferred_error_interrupt))
deferred_error_int_vector = amd_deferred_error_interrupt;

low = (low & ~MASK_DEF_INT_TYPE) | DEF_INT_TYPE_APIC;
if (!mce_flags.smca)
low = (low & ~MASK_DEF_INT_TYPE) | DEF_INT_TYPE_APIC;

wrmsr(MSR_CU_DEF_ERR, low, high);
}

Expand Down Expand Up @@ -738,6 +754,17 @@ int umc_normaddr_to_sysaddr(u64 norm_addr, u16 nid, u8 umc, u64 *sys_addr)
}
EXPORT_SYMBOL_GPL(umc_normaddr_to_sysaddr);

bool amd_mce_is_memory_error(struct mce *m)
{
/* ErrCodeExt[20:16] */
u8 xec = (m->status >> 16) & 0x1f;

if (mce_flags.smca)
return smca_get_bank_type(m) == SMCA_UMC && xec == 0x0;

return m->bank == 4 && xec == 0x8;
}

static void __log_error(unsigned int bank, u64 status, u64 addr, u64 misc)
{
struct mce m;
Expand Down

0 comments on commit a1c75e1

Please sign in to comment.