Skip to content

Commit

Permalink
net/mlx5: PCI error recovery health care simulation
Browse files Browse the repository at this point in the history
In case that the kernel PCI error handlers are not called, we will
trigger our own recovery flow.

The health work will give priority to the kernel pci error handlers to
recover the PCI by waiting for a small period, if the pci error handlers
are not triggered the manual recovery flow will be executed.

We don't save pci state in case of manual recovery because it will ruin the
pci configuration space and we will lose dma sync.

Fixes: 89d44f0 ('net/mlx5_core: Add pci error handlers to mlx5_core driver')
Signed-off-by: Mohamad Haj Yahia <[email protected]>
Signed-off-by: Saeed Mahameed <[email protected]>
Signed-off-by: David S. Miller <[email protected]>
  • Loading branch information
Mohamad Haj Yahia authored and davem330 committed Oct 29, 2016
1 parent 05ac2c0 commit 04c0c1a
Show file tree
Hide file tree
Showing 4 changed files with 56 additions and 9 deletions.
45 changes: 41 additions & 4 deletions drivers/net/ethernet/mellanox/mlx5/core/health.c
Original file line number Diff line number Diff line change
Expand Up @@ -61,14 +61,15 @@ enum {
enum {
MLX5_NIC_IFC_FULL = 0,
MLX5_NIC_IFC_DISABLED = 1,
MLX5_NIC_IFC_NO_DRAM_NIC = 2
MLX5_NIC_IFC_NO_DRAM_NIC = 2,
MLX5_NIC_IFC_INVALID = 3
};

enum {
MLX5_DROP_NEW_HEALTH_WORK,
};

static u8 get_nic_interface(struct mlx5_core_dev *dev)
static u8 get_nic_state(struct mlx5_core_dev *dev)
{
return (ioread32be(&dev->iseg->cmdq_addr_l_sz) >> 8) & 3;
}
Expand Down Expand Up @@ -101,7 +102,7 @@ static int in_fatal(struct mlx5_core_dev *dev)
struct mlx5_core_health *health = &dev->priv.health;
struct health_buffer __iomem *h = health->health;

if (get_nic_interface(dev) == MLX5_NIC_IFC_DISABLED)
if (get_nic_state(dev) == MLX5_NIC_IFC_DISABLED)
return 1;

if (ioread32be(&h->fw_ver) == 0xffffffff)
Expand Down Expand Up @@ -131,7 +132,7 @@ void mlx5_enter_error_state(struct mlx5_core_dev *dev)

static void mlx5_handle_bad_state(struct mlx5_core_dev *dev)
{
u8 nic_interface = get_nic_interface(dev);
u8 nic_interface = get_nic_state(dev);

switch (nic_interface) {
case MLX5_NIC_IFC_FULL:
Expand All @@ -153,8 +154,34 @@ static void mlx5_handle_bad_state(struct mlx5_core_dev *dev)
mlx5_disable_device(dev);
}

static void health_recover(struct work_struct *work)
{
struct mlx5_core_health *health;
struct delayed_work *dwork;
struct mlx5_core_dev *dev;
struct mlx5_priv *priv;
u8 nic_state;

dwork = container_of(work, struct delayed_work, work);
health = container_of(dwork, struct mlx5_core_health, recover_work);
priv = container_of(health, struct mlx5_priv, health);
dev = container_of(priv, struct mlx5_core_dev, priv);

nic_state = get_nic_state(dev);
if (nic_state == MLX5_NIC_IFC_INVALID) {
dev_err(&dev->pdev->dev, "health recovery flow aborted since the nic state is invalid\n");
return;
}

dev_err(&dev->pdev->dev, "starting health recovery flow\n");
mlx5_recover_device(dev);
}

/* How much time to wait until health resetting the driver (in msecs) */
#define MLX5_RECOVERY_DELAY_MSECS 60000
static void health_care(struct work_struct *work)
{
unsigned long recover_delay = msecs_to_jiffies(MLX5_RECOVERY_DELAY_MSECS);
struct mlx5_core_health *health;
struct mlx5_core_dev *dev;
struct mlx5_priv *priv;
Expand All @@ -164,6 +191,14 @@ static void health_care(struct work_struct *work)
dev = container_of(priv, struct mlx5_core_dev, priv);
mlx5_core_warn(dev, "handling bad device here\n");
mlx5_handle_bad_state(dev);

spin_lock(&health->wq_lock);
if (!test_bit(MLX5_DROP_NEW_HEALTH_WORK, &health->flags))
schedule_delayed_work(&health->recover_work, recover_delay);
else
dev_err(&dev->pdev->dev,
"new health works are not permitted at this stage\n");
spin_unlock(&health->wq_lock);
}

static const char *hsynd_str(u8 synd)
Expand Down Expand Up @@ -316,6 +351,7 @@ void mlx5_drain_health_wq(struct mlx5_core_dev *dev)
spin_lock(&health->wq_lock);
set_bit(MLX5_DROP_NEW_HEALTH_WORK, &health->flags);
spin_unlock(&health->wq_lock);
cancel_delayed_work_sync(&health->recover_work);
cancel_work_sync(&health->work);
}

Expand Down Expand Up @@ -344,6 +380,7 @@ int mlx5_health_init(struct mlx5_core_dev *dev)
return -ENOMEM;
spin_lock_init(&health->wq_lock);
INIT_WORK(&health->work, health_care);
INIT_DELAYED_WORK(&health->recover_work, health_recover);

return 0;
}
18 changes: 13 additions & 5 deletions drivers/net/ethernet/mellanox/mlx5/core/main.c
Original file line number Diff line number Diff line change
Expand Up @@ -1313,6 +1313,7 @@ static pci_ers_result_t mlx5_pci_err_detected(struct pci_dev *pdev,
struct mlx5_priv *priv = &dev->priv;

dev_info(&pdev->dev, "%s was called\n", __func__);

mlx5_enter_error_state(dev);
mlx5_unload_one(dev, priv, false);
/* In case of kernel call save the pci state and drain health wq */
Expand Down Expand Up @@ -1378,11 +1379,6 @@ static pci_ers_result_t mlx5_pci_slot_reset(struct pci_dev *pdev)
return PCI_ERS_RESULT_RECOVERED;
}

void mlx5_disable_device(struct mlx5_core_dev *dev)
{
mlx5_pci_err_detected(dev->pdev, 0);
}

static void mlx5_pci_resume(struct pci_dev *pdev)
{
struct mlx5_core_dev *dev = pci_get_drvdata(pdev);
Expand Down Expand Up @@ -1432,6 +1428,18 @@ static const struct pci_device_id mlx5_core_pci_table[] = {

MODULE_DEVICE_TABLE(pci, mlx5_core_pci_table);

void mlx5_disable_device(struct mlx5_core_dev *dev)
{
mlx5_pci_err_detected(dev->pdev, 0);
}

void mlx5_recover_device(struct mlx5_core_dev *dev)
{
mlx5_pci_disable_device(dev);
if (mlx5_pci_slot_reset(dev->pdev) == PCI_ERS_RESULT_RECOVERED)
mlx5_pci_resume(dev->pdev);
}

static struct pci_driver mlx5_core_driver = {
.name = DRIVER_NAME,
.id_table = mlx5_core_pci_table,
Expand Down
1 change: 1 addition & 0 deletions drivers/net/ethernet/mellanox/mlx5/core/mlx5_core.h
Original file line number Diff line number Diff line change
Expand Up @@ -83,6 +83,7 @@ void mlx5_core_event(struct mlx5_core_dev *dev, enum mlx5_dev_event event,
unsigned long param);
void mlx5_enter_error_state(struct mlx5_core_dev *dev);
void mlx5_disable_device(struct mlx5_core_dev *dev);
void mlx5_recover_device(struct mlx5_core_dev *dev);
int mlx5_sriov_init(struct mlx5_core_dev *dev);
void mlx5_sriov_cleanup(struct mlx5_core_dev *dev);
int mlx5_sriov_attach(struct mlx5_core_dev *dev);
Expand Down
1 change: 1 addition & 0 deletions include/linux/mlx5/driver.h
Original file line number Diff line number Diff line change
Expand Up @@ -423,6 +423,7 @@ struct mlx5_core_health {
struct workqueue_struct *wq;
unsigned long flags;
struct work_struct work;
struct delayed_work recover_work;
};

struct mlx5_cq_table {
Expand Down

0 comments on commit 04c0c1a

Please sign in to comment.