Skip to content

Commit

Permalink
net/mlx5: Fix deadlock in sync reset flow
Browse files Browse the repository at this point in the history
The sync reset flow can lead to the following deadlock when
poll_sync_reset() is called by timer softirq and waiting on
del_timer_sync() for the same timer. Fix that by moving the part of the
flow that waits for the timer to reset_reload_work.

It fixes the following kernel Trace:
RIP: 0010:del_timer_sync+0x32/0x40
...
Call Trace:
 <IRQ>
 mlx5_sync_reset_clear_reset_requested+0x26/0x50 [mlx5_core]
 poll_sync_reset.cold+0x36/0x52 [mlx5_core]
 call_timer_fn+0x32/0x130
 __run_timers.part.0+0x180/0x280
 ? tick_sched_handle+0x33/0x60
 ? tick_sched_timer+0x3d/0x80
 ? ktime_get+0x3e/0xa0
 run_timer_softirq+0x2a/0x50
 __do_softirq+0xe1/0x2d6
 ? hrtimer_interrupt+0x136/0x220
 irq_exit+0xae/0xb0
 smp_apic_timer_interrupt+0x7b/0x140
 apic_timer_interrupt+0xf/0x20
 </IRQ>

Fixes: 3c5193a ("net/mlx5: Use del_timer_sync in fw reset flow of halting poll")
Signed-off-by: Moshe Shemesh <[email protected]>
Reviewed-by: Maher Sanalla <[email protected]>
Signed-off-by: Saeed Mahameed <[email protected]>
  • Loading branch information
mosheshemesh2 authored and Saeed Mahameed committed May 4, 2022
1 parent b781bff commit cb7786a
Showing 1 changed file with 17 additions and 17 deletions.
34 changes: 17 additions & 17 deletions drivers/net/ethernet/mellanox/mlx5/core/fw_reset.c
Original file line number Diff line number Diff line change
Expand Up @@ -155,22 +155,6 @@ static void mlx5_fw_reset_complete_reload(struct mlx5_core_dev *dev)
}
}

static void mlx5_sync_reset_reload_work(struct work_struct *work)
{
struct mlx5_fw_reset *fw_reset = container_of(work, struct mlx5_fw_reset,
reset_reload_work);
struct mlx5_core_dev *dev = fw_reset->dev;
int err;

mlx5_enter_error_state(dev, true);
mlx5_unload_one(dev);
err = mlx5_health_wait_pci_up(dev);
if (err)
mlx5_core_err(dev, "reset reload flow aborted, PCI reads still not working\n");
fw_reset->ret = err;
mlx5_fw_reset_complete_reload(dev);
}

static void mlx5_stop_sync_reset_poll(struct mlx5_core_dev *dev)
{
struct mlx5_fw_reset *fw_reset = dev->priv.fw_reset;
Expand All @@ -188,6 +172,23 @@ static void mlx5_sync_reset_clear_reset_requested(struct mlx5_core_dev *dev, boo
mlx5_start_health_poll(dev);
}

static void mlx5_sync_reset_reload_work(struct work_struct *work)
{
struct mlx5_fw_reset *fw_reset = container_of(work, struct mlx5_fw_reset,
reset_reload_work);
struct mlx5_core_dev *dev = fw_reset->dev;
int err;

mlx5_sync_reset_clear_reset_requested(dev, false);
mlx5_enter_error_state(dev, true);
mlx5_unload_one(dev);
err = mlx5_health_wait_pci_up(dev);
if (err)
mlx5_core_err(dev, "reset reload flow aborted, PCI reads still not working\n");
fw_reset->ret = err;
mlx5_fw_reset_complete_reload(dev);
}

#define MLX5_RESET_POLL_INTERVAL (HZ / 10)
static void poll_sync_reset(struct timer_list *t)
{
Expand All @@ -202,7 +203,6 @@ static void poll_sync_reset(struct timer_list *t)

if (fatal_error) {
mlx5_core_warn(dev, "Got Device Reset\n");
mlx5_sync_reset_clear_reset_requested(dev, false);
queue_work(fw_reset->wq, &fw_reset->reset_reload_work);
return;
}
Expand Down

0 comments on commit cb7786a

Please sign in to comment.