Skip to content

Commit

Permalink
powerpc/powernv: Dump PHB diag-data immediately
Browse files Browse the repository at this point in the history
The PHB diag-data is important to help locating the root cause for
EEH errors such as frozen PE or fenced PHB. However, the EEH core
enables IO path by clearing part of HW registers before collecting
this data causing it to be corrupted.

This patch fixes this by dumping the PHB diag-data immediately when
frozen/fenced state on PE or PHB is detected for the first time in
eeh_ops::get_state() or next_error() backend.

Signed-off-by: Gavin Shan <[email protected]>
CC: <[email protected]>
Signed-off-by: Benjamin Herrenschmidt <[email protected]>
  • Loading branch information
shangw authored and ozbenh committed Feb 28, 2014
1 parent 573ebfa commit 9471660
Showing 1 changed file with 43 additions and 53 deletions.
96 changes: 43 additions & 53 deletions arch/powerpc/platforms/powernv/eeh-ioda.c
Original file line number Diff line number Diff line change
Expand Up @@ -114,6 +114,7 @@ DEFINE_SIMPLE_ATTRIBUTE(ioda_eeh_inbB_dbgfs_ops, ioda_eeh_inbB_dbgfs_get,
ioda_eeh_inbB_dbgfs_set, "0x%llx\n");
#endif /* CONFIG_DEBUG_FS */


/**
* ioda_eeh_post_init - Chip dependent post initialization
* @hose: PCI controller
Expand Down Expand Up @@ -221,6 +222,22 @@ static int ioda_eeh_set_option(struct eeh_pe *pe, int option)
return ret;
}

static void ioda_eeh_phb_diag(struct pci_controller *hose)
{
struct pnv_phb *phb = hose->private_data;
long rc;

rc = opal_pci_get_phb_diag_data2(phb->opal_id, phb->diag.blob,
PNV_PCI_DIAG_BUF_SIZE);
if (rc != OPAL_SUCCESS) {
pr_warning("%s: Failed to get diag-data for PHB#%x (%ld)\n",
__func__, hose->global_number, rc);
return;
}

pnv_pci_dump_phb_diag_data(hose, phb->diag.blob);
}

/**
* ioda_eeh_get_state - Retrieve the state of PE
* @pe: EEH PE
Expand Down Expand Up @@ -272,6 +289,9 @@ static int ioda_eeh_get_state(struct eeh_pe *pe)
result |= EEH_STATE_DMA_ACTIVE;
result |= EEH_STATE_MMIO_ENABLED;
result |= EEH_STATE_DMA_ENABLED;
} else if (!(pe->state & EEH_PE_ISOLATED)) {
eeh_pe_state_mark(pe, EEH_PE_ISOLATED);
ioda_eeh_phb_diag(hose);
}

return result;
Expand Down Expand Up @@ -315,6 +335,15 @@ static int ioda_eeh_get_state(struct eeh_pe *pe)
__func__, fstate, hose->global_number, pe_no);
}

/* Dump PHB diag-data for frozen PE */
if (result != EEH_STATE_NOT_SUPPORT &&
(result & (EEH_STATE_MMIO_ACTIVE | EEH_STATE_DMA_ACTIVE)) !=
(EEH_STATE_MMIO_ACTIVE | EEH_STATE_DMA_ACTIVE) &&
!(pe->state & EEH_PE_ISOLATED)) {
eeh_pe_state_mark(pe, EEH_PE_ISOLATED);
ioda_eeh_phb_diag(hose);
}

return result;
}

Expand Down Expand Up @@ -529,42 +558,6 @@ static int ioda_eeh_reset(struct eeh_pe *pe, int option)
return ret;
}

/**
* ioda_eeh_get_log - Retrieve error log
* @pe: EEH PE
* @severity: Severity level of the log
* @drv_log: buffer to store the log
* @len: space of the log buffer
*
* The function is used to retrieve error log from P7IOC.
*/
static int ioda_eeh_get_log(struct eeh_pe *pe, int severity,
char *drv_log, unsigned long len)
{
s64 ret;
unsigned long flags;
struct pci_controller *hose = pe->phb;
struct pnv_phb *phb = hose->private_data;

spin_lock_irqsave(&phb->lock, flags);

ret = opal_pci_get_phb_diag_data2(phb->opal_id,
phb->diag.blob, PNV_PCI_DIAG_BUF_SIZE);
if (ret) {
spin_unlock_irqrestore(&phb->lock, flags);
pr_warning("%s: Can't get log for PHB#%x-PE#%x (%lld)\n",
__func__, hose->global_number, pe->addr, ret);
return -EIO;
}

/* The PHB diag-data is always indicative */
pnv_pci_dump_phb_diag_data(hose, phb->diag.blob);

spin_unlock_irqrestore(&phb->lock, flags);

return 0;
}

/**
* ioda_eeh_configure_bridge - Configure the PCI bridges for the indicated PE
* @pe: EEH PE
Expand Down Expand Up @@ -646,22 +639,6 @@ static void ioda_eeh_hub_diag(struct pci_controller *hose)
}
}

static void ioda_eeh_phb_diag(struct pci_controller *hose)
{
struct pnv_phb *phb = hose->private_data;
long rc;

rc = opal_pci_get_phb_diag_data2(phb->opal_id, phb->diag.blob,
PNV_PCI_DIAG_BUF_SIZE);
if (rc != OPAL_SUCCESS) {
pr_warning("%s: Failed to get diag-data for PHB#%x (%ld)\n",
__func__, hose->global_number, rc);
return;
}

pnv_pci_dump_phb_diag_data(hose, phb->diag.blob);
}

static int ioda_eeh_get_phb_pe(struct pci_controller *hose,
struct eeh_pe **pe)
{
Expand Down Expand Up @@ -834,6 +811,20 @@ static int ioda_eeh_next_error(struct eeh_pe **pe)
__func__, err_type);
}

/*
* EEH core will try recover from fenced PHB or
* frozen PE. In the time for frozen PE, EEH core
* enable IO path for that before collecting logs,
* but it ruins the site. So we have to dump the
* log in advance here.
*/
if ((ret == EEH_NEXT_ERR_FROZEN_PE ||
ret == EEH_NEXT_ERR_FENCED_PHB) &&
!((*pe)->state & EEH_PE_ISOLATED)) {
eeh_pe_state_mark(*pe, EEH_PE_ISOLATED);
ioda_eeh_phb_diag(hose);
}

/*
* If we have no errors on the specific PHB or only
* informative error there, we continue poking it.
Expand All @@ -852,7 +843,6 @@ struct pnv_eeh_ops ioda_eeh_ops = {
.set_option = ioda_eeh_set_option,
.get_state = ioda_eeh_get_state,
.reset = ioda_eeh_reset,
.get_log = ioda_eeh_get_log,
.configure_bridge = ioda_eeh_configure_bridge,
.next_error = ioda_eeh_next_error
};

0 comments on commit 9471660

Please sign in to comment.