Skip to content

Commit

Permalink
Merge branch 'x86-ras-for-linus' of git://git.kernel.org/pub/scm/linu…
Browse files Browse the repository at this point in the history
…x/kernel/git/tip/tip

Pull RAS updates from Ingo Molnar:
 "The main changes in this cycle are:

   - RAS tracing/events infrastructure, by Gong Chen.

   - Various generalizations of the APEI code to make it available to
     non-x86 architectures, by Tomasz Nowicki"

* 'x86-ras-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip:
  x86/ras: Fix build warnings in <linux/aer.h>
  acpi, apei, ghes: Factor out ioremap virtual memory for IRQ and NMI context.
  acpi, apei, ghes: Make NMI error notification to be GHES architecture extension.
  apei, mce: Factor out APEI architecture specific MCE calls.
  RAS, extlog: Adjust init flow
  trace, eMCA: Add a knob to adjust where to save event log
  trace, RAS: Add eMCA trace event interface
  RAS, debugfs: Add debugfs interface for RAS subsystem
  CPER: Adjust code flow of some functions
  x86, MCE: Robustify mcheck_init_device
  trace, AER: Move trace into unified interface
  trace, RAS: Add basic RAS trace event
  x86, MCE: Kill CPU_POST_DEAD
  • Loading branch information
torvalds committed Aug 5, 2014
2 parents 8556d44 + c3107e3 commit d782ceb
Show file tree
Hide file tree
Showing 28 changed files with 653 additions and 247 deletions.
2 changes: 2 additions & 0 deletions arch/x86/Kconfig
Original file line number Diff line number Diff line change
Expand Up @@ -131,6 +131,8 @@ config X86
select GENERIC_CPU_AUTOPROBE
select HAVE_ARCH_AUDITSYSCALL
select ARCH_SUPPORTS_ATOMIC_RMW
select HAVE_ACPI_APEI if ACPI
select HAVE_ACPI_APEI_NMI if ACPI

config INSTRUCTION_DECODER
def_bool y
Expand Down
1 change: 1 addition & 0 deletions arch/x86/kernel/acpi/Makefile
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
obj-$(CONFIG_ACPI) += boot.o
obj-$(CONFIG_ACPI_SLEEP) += sleep.o wakeup_$(BITS).o
obj-$(CONFIG_ACPI_APEI) += apei.o

ifneq ($(CONFIG_ACPI_PROCESSOR),)
obj-y += cstate.o
Expand Down
62 changes: 62 additions & 0 deletions arch/x86/kernel/acpi/apei.c
Original file line number Diff line number Diff line change
@@ -0,0 +1,62 @@
/*
* Arch-specific APEI-related functions.
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*/

#include <acpi/apei.h>

#include <asm/mce.h>
#include <asm/tlbflush.h>

int arch_apei_enable_cmcff(struct acpi_hest_header *hest_hdr, void *data)
{
#ifdef CONFIG_X86_MCE
int i;
struct acpi_hest_ia_corrected *cmc;
struct acpi_hest_ia_error_bank *mc_bank;

if (hest_hdr->type != ACPI_HEST_TYPE_IA32_CORRECTED_CHECK)
return 0;

cmc = (struct acpi_hest_ia_corrected *)hest_hdr;
if (!cmc->enabled)
return 0;

/*
* We expect HEST to provide a list of MC banks that report errors
* in firmware first mode. Otherwise, return non-zero value to
* indicate that we are done parsing HEST.
*/
if (!(cmc->flags & ACPI_HEST_FIRMWARE_FIRST) ||
!cmc->num_hardware_banks)
return 1;

pr_info("HEST: Enabling Firmware First mode for corrected errors.\n");

mc_bank = (struct acpi_hest_ia_error_bank *)(cmc + 1);
for (i = 0; i < cmc->num_hardware_banks; i++, mc_bank++)
mce_disable_bank(mc_bank->bank_number);
#endif
return 1;
}

void arch_apei_report_mem_error(int sev, struct cper_sec_mem_err *mem_err)
{
#ifdef CONFIG_X86_MCE
apei_mce_report_mem_error(sev, mem_err);
#endif
}

void arch_apei_flush_tlb_one(unsigned long addr)
{
__flush_tlb_one(addr);
}
9 changes: 4 additions & 5 deletions arch/x86/kernel/cpu/mcheck/mce.c
Original file line number Diff line number Diff line change
Expand Up @@ -2385,6 +2385,10 @@ mce_cpu_callback(struct notifier_block *nfb, unsigned long action, void *hcpu)
threshold_cpu_callback(action, cpu);
mce_device_remove(cpu);
mce_intel_hcpu_update(cpu);

/* intentionally ignoring frozen here */
if (!(action & CPU_TASKS_FROZEN))
cmci_rediscover();
break;
case CPU_DOWN_PREPARE:
smp_call_function_single(cpu, mce_disable_cpu, &action, 1);
Expand All @@ -2396,11 +2400,6 @@ mce_cpu_callback(struct notifier_block *nfb, unsigned long action, void *hcpu)
break;
}

if (action == CPU_POST_DEAD) {
/* intentionally ignoring frozen here */
cmci_rediscover();
}

return NOTIFY_OK;
}

Expand Down
2 changes: 2 additions & 0 deletions drivers/Kconfig
Original file line number Diff line number Diff line change
Expand Up @@ -176,4 +176,6 @@ source "drivers/powercap/Kconfig"

source "drivers/mcb/Kconfig"

source "drivers/ras/Kconfig"

endmenu
1 change: 1 addition & 0 deletions drivers/Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -158,3 +158,4 @@ obj-$(CONFIG_NTB) += ntb/
obj-$(CONFIG_FMC) += fmc/
obj-$(CONFIG_POWERCAP) += powercap/
obj-$(CONFIG_MCB) += mcb/
obj-$(CONFIG_RAS) += ras/
4 changes: 3 additions & 1 deletion drivers/acpi/Kconfig
Original file line number Diff line number Diff line change
Expand Up @@ -370,6 +370,7 @@ config ACPI_EXTLOG
tristate "Extended Error Log support"
depends on X86_MCE && X86_LOCAL_APIC
select UEFI_CPER
select RAS
default n
help
Certain usages such as Predictive Failure Analysis (PFA) require
Expand All @@ -384,6 +385,7 @@ config ACPI_EXTLOG

Enhanced MCA Logging allows firmware to provide additional error
information to system software, synchronous with MCE or CMCI. This
driver adds support for that functionality.
driver adds support for that functionality with corresponding
tracepoint which carries that information to userspace.

endif # ACPI
46 changes: 35 additions & 11 deletions drivers/acpi/acpi_extlog.c
Original file line number Diff line number Diff line change
Expand Up @@ -12,10 +12,12 @@
#include <linux/cper.h>
#include <linux/ratelimit.h>
#include <linux/edac.h>
#include <linux/ras.h>
#include <asm/cpu.h>
#include <asm/mce.h>

#include "apei/apei-internal.h"
#include <ras/ras_event.h>

#define EXT_ELOG_ENTRY_MASK GENMASK_ULL(51, 0) /* elog entry address mask */

Expand Down Expand Up @@ -137,8 +139,12 @@ static int extlog_print(struct notifier_block *nb, unsigned long val,
struct mce *mce = (struct mce *)data;
int bank = mce->bank;
int cpu = mce->extcpu;
struct acpi_generic_status *estatus;
int rc;
struct acpi_generic_status *estatus, *tmp;
struct acpi_generic_data *gdata;
const uuid_le *fru_id = &NULL_UUID_LE;
char *fru_text = "";
uuid_le *sec_type;
static u32 err_seq;

estatus = extlog_elog_entry_check(cpu, bank);
if (estatus == NULL)
Expand All @@ -148,8 +154,29 @@ static int extlog_print(struct notifier_block *nb, unsigned long val,
/* clear record status to enable BIOS to update it again */
estatus->block_status = 0;

rc = print_extlog_rcd(NULL, (struct acpi_generic_status *)elog_buf, cpu);
tmp = (struct acpi_generic_status *)elog_buf;

if (!ras_userspace_consumers()) {
print_extlog_rcd(NULL, tmp, cpu);
goto out;
}

/* log event via trace */
err_seq++;
gdata = (struct acpi_generic_data *)(tmp + 1);
if (gdata->validation_bits & CPER_SEC_VALID_FRU_ID)
fru_id = (uuid_le *)gdata->fru_id;
if (gdata->validation_bits & CPER_SEC_VALID_FRU_TEXT)
fru_text = gdata->fru_text;
sec_type = (uuid_le *)gdata->section_type;
if (!uuid_le_cmp(*sec_type, CPER_SEC_PLATFORM_MEM)) {
struct cper_sec_mem_err *mem = (void *)(gdata + 1);
if (gdata->error_data_length >= sizeof(*mem))
trace_extlog_mem_event(mem, err_seq, fru_id, fru_text,
(u8)gdata->error_severity);
}

out:
return NOTIFY_STOP;
}

Expand Down Expand Up @@ -196,19 +223,16 @@ static int __init extlog_init(void)
u64 cap;
int rc;

rdmsrl(MSR_IA32_MCG_CAP, cap);

if (!(cap & MCG_ELOG_P) || !extlog_get_l1addr())
return -ENODEV;

if (get_edac_report_status() == EDAC_REPORTING_FORCE) {
pr_warn("Not loading eMCA, error reporting force-enabled through EDAC.\n");
return -EPERM;
}

rc = -ENODEV;
rdmsrl(MSR_IA32_MCG_CAP, cap);
if (!(cap & MCG_ELOG_P))
return rc;

if (!extlog_get_l1addr())
return rc;

rc = -EINVAL;
/* get L1 header to fetch necessary information */
l1_hdr_size = sizeof(struct extlog_l1_head);
Expand Down
8 changes: 7 additions & 1 deletion drivers/acpi/apei/Kconfig
Original file line number Diff line number Diff line change
@@ -1,9 +1,15 @@
config HAVE_ACPI_APEI
bool

config HAVE_ACPI_APEI_NMI
bool

config ACPI_APEI
bool "ACPI Platform Error Interface (APEI)"
select MISC_FILESYSTEMS
select PSTORE
select UEFI_CPER
depends on X86
depends on HAVE_ACPI_APEI
help
APEI allows to report errors (for example from the chipset)
to the operating system. This improves NMI handling
Expand Down
13 changes: 13 additions & 0 deletions drivers/acpi/apei/apei-base.c
Original file line number Diff line number Diff line change
Expand Up @@ -745,6 +745,19 @@ struct dentry *apei_get_debugfs_dir(void)
}
EXPORT_SYMBOL_GPL(apei_get_debugfs_dir);

int __weak arch_apei_enable_cmcff(struct acpi_hest_header *hest_hdr,
void *data)
{
return 1;
}
EXPORT_SYMBOL_GPL(arch_apei_enable_cmcff);

void __weak arch_apei_report_mem_error(int sev,
struct cper_sec_mem_err *mem_err)
{
}
EXPORT_SYMBOL_GPL(arch_apei_report_mem_error);

int apei_osc_setup(void)
{
static u8 whea_uuid_str[] = "ed855e0c-6c90-47bf-a62a-26de0fc5ad5c";
Expand Down
Loading

0 comments on commit d782ceb

Please sign in to comment.