Skip to content

Commit

Permalink
rcu: Improve diagnostics for spurious RCU CPU stall warnings
Browse files Browse the repository at this point in the history
The current RCU CPU stall warning code will print "Stall ended before
state dump start" any time that the stall-warning code is triggered on
a CPU that has already reported a quiescent state for the current grace
period and if all quiescent states have been reported for the current
grace period.  However, a true stall can result in these symptoms, for
example, by preventing RCU's grace-period kthreads from ever running

This commit therefore checks for this condition, reporting the end of
the stall only if one of the grace-period counters has actually advanced.
Otherwise, it reports the last time that the grace-period kthread made
meaningful progress.  (In normal situations, the grace-period kthread
should make meaningful progress at least every jiffies_till_next_fqs
jiffies.)

Reported-by: Miroslav Benes <[email protected]>
Signed-off-by: Paul E. McKenney <[email protected]>
Tested-by: Miroslav Benes <[email protected]>
  • Loading branch information
paulmck committed Jan 6, 2015
1 parent fc908ed commit 6ccd2ec
Show file tree
Hide file tree
Showing 3 changed files with 34 additions and 5 deletions.
5 changes: 5 additions & 0 deletions Documentation/RCU/stallwarn.txt
Original file line number Diff line number Diff line change
Expand Up @@ -187,6 +187,11 @@ o For !CONFIG_PREEMPT kernels, a CPU looping anywhere in the
behavior, you might need to replace some of the cond_resched()
calls with calls to cond_resched_rcu_qs().

o Anything that prevents RCU's grace-period kthreads from running.
This can result in the "All QSes seen" console-log message.
This message will include information on when the kthread last
ran and how often it should be expected to run.

o A CPU-bound real-time task in a CONFIG_PREEMPT kernel, which might
happen to preempt a low-priority task in the middle of an RCU
read-side critical section. This is especially damaging if
Expand Down
32 changes: 27 additions & 5 deletions kernel/rcu/tree.c
Original file line number Diff line number Diff line change
Expand Up @@ -1066,11 +1066,13 @@ static void rcu_dump_cpu_stacks(struct rcu_state *rsp)
}
}

static void print_other_cpu_stall(struct rcu_state *rsp)
static void print_other_cpu_stall(struct rcu_state *rsp, unsigned long gpnum)
{
int cpu;
long delta;
unsigned long flags;
unsigned long gpa;
unsigned long j;
int ndetected = 0;
struct rcu_node *rnp = rcu_get_root(rsp);
long totqlen = 0;
Expand Down Expand Up @@ -1123,10 +1125,22 @@ static void print_other_cpu_stall(struct rcu_state *rsp)
pr_cont("(detected by %d, t=%ld jiffies, g=%ld, c=%ld, q=%lu)\n",
smp_processor_id(), (long)(jiffies - rsp->gp_start),
(long)rsp->gpnum, (long)rsp->completed, totqlen);
if (ndetected == 0)
pr_err("INFO: Stall ended before state dump start\n");
else
if (ndetected) {
rcu_dump_cpu_stacks(rsp);
} else {
if (ACCESS_ONCE(rsp->gpnum) != gpnum ||
ACCESS_ONCE(rsp->completed) == gpnum) {
pr_err("INFO: Stall ended before state dump start\n");
} else {
j = jiffies;
gpa = ACCESS_ONCE(rsp->gp_activity);
pr_err("All QSes seen, last %s kthread activity %ld (%ld-%ld), jiffies_till_next_fqs=%ld\n",
rsp->name, j - gpa, j, gpa,
jiffies_till_next_fqs);
/* In this case, the current CPU might be at fault. */
sched_show_task(current);
}
}

/* Complain about tasks blocking the grace period. */

Expand Down Expand Up @@ -1226,7 +1240,7 @@ static void check_cpu_stall(struct rcu_state *rsp, struct rcu_data *rdp)
ULONG_CMP_GE(j, js + RCU_STALL_RAT_DELAY)) {

/* They had a few time units to dump stack, so complain. */
print_other_cpu_stall(rsp);
print_other_cpu_stall(rsp, gpnum);
}
}

Expand Down Expand Up @@ -1622,6 +1636,7 @@ static int rcu_gp_init(struct rcu_state *rsp)
struct rcu_data *rdp;
struct rcu_node *rnp = rcu_get_root(rsp);

ACCESS_ONCE(rsp->gp_activity) = jiffies;
rcu_bind_gp_kthread();
raw_spin_lock_irq(&rnp->lock);
smp_mb__after_unlock_lock();
Expand Down Expand Up @@ -1682,6 +1697,7 @@ static int rcu_gp_init(struct rcu_state *rsp)
rnp->grphi, rnp->qsmask);
raw_spin_unlock_irq(&rnp->lock);
cond_resched_rcu_qs();
ACCESS_ONCE(rsp->gp_activity) = jiffies;
}

mutex_unlock(&rsp->onoff_mutex);
Expand All @@ -1698,6 +1714,7 @@ static int rcu_gp_fqs(struct rcu_state *rsp, int fqs_state_in)
unsigned long maxj;
struct rcu_node *rnp = rcu_get_root(rsp);

ACCESS_ONCE(rsp->gp_activity) = jiffies;
rsp->n_force_qs++;
if (fqs_state == RCU_SAVE_DYNTICK) {
/* Collect dyntick-idle snapshots. */
Expand Down Expand Up @@ -1736,6 +1753,7 @@ static void rcu_gp_cleanup(struct rcu_state *rsp)
struct rcu_data *rdp;
struct rcu_node *rnp = rcu_get_root(rsp);

ACCESS_ONCE(rsp->gp_activity) = jiffies;
raw_spin_lock_irq(&rnp->lock);
smp_mb__after_unlock_lock();
gp_duration = jiffies - rsp->gp_start;
Expand Down Expand Up @@ -1772,6 +1790,7 @@ static void rcu_gp_cleanup(struct rcu_state *rsp)
nocb += rcu_future_gp_cleanup(rsp, rnp);
raw_spin_unlock_irq(&rnp->lock);
cond_resched_rcu_qs();
ACCESS_ONCE(rsp->gp_activity) = jiffies;
}
rnp = rcu_get_root(rsp);
raw_spin_lock_irq(&rnp->lock);
Expand Down Expand Up @@ -1821,6 +1840,7 @@ static int __noreturn rcu_gp_kthread(void *arg)
if (rcu_gp_init(rsp))
break;
cond_resched_rcu_qs();
ACCESS_ONCE(rsp->gp_activity) = jiffies;
WARN_ON(signal_pending(current));
trace_rcu_grace_period(rsp->name,
ACCESS_ONCE(rsp->gpnum),
Expand Down Expand Up @@ -1864,9 +1884,11 @@ static int __noreturn rcu_gp_kthread(void *arg)
ACCESS_ONCE(rsp->gpnum),
TPS("fqsend"));
cond_resched_rcu_qs();
ACCESS_ONCE(rsp->gp_activity) = jiffies;
} else {
/* Deal with stray signal. */
cond_resched_rcu_qs();
ACCESS_ONCE(rsp->gp_activity) = jiffies;
WARN_ON(signal_pending(current));
trace_rcu_grace_period(rsp->name,
ACCESS_ONCE(rsp->gpnum),
Expand Down
2 changes: 2 additions & 0 deletions kernel/rcu/tree.h
Original file line number Diff line number Diff line change
Expand Up @@ -488,6 +488,8 @@ struct rcu_state {
/* due to no GP active. */
unsigned long gp_start; /* Time at which GP started, */
/* but in jiffies. */
unsigned long gp_activity; /* Time of last GP kthread */
/* activity in jiffies. */
unsigned long jiffies_stall; /* Time at which to check */
/* for CPU stalls. */
unsigned long jiffies_resched; /* Time at which to resched */
Expand Down

0 comments on commit 6ccd2ec

Please sign in to comment.