Merge tag 'sched_urgent_for_v6.12_rc4' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip

Pull scheduling fixes from Borislav Petkov:

 - Add PREEMPT_RT maintainers

 - Fix another aspect of delayed-dequeue tasks wrt determining their
   state, i.e., whether they're runnable or blocked (a short
   illustrative sketch follows this list)

 - Handle delayed dequeued tasks and their migration wrt PSI properly

 - Fix the situation where a delayed dequeue task gets enqueued into a
   new class, which should not happen

 - Fix a case where memory allocation would happen while the runqueue
   lock is held, which is a no-no (an illustrative sketch follows the
   commit list below)

 - Do not over-schedule when tasks with shorter slices preempt the
   currently running task

 - Make sure delayed-dequeue entities are properly handled before
   unthrottling

 - Other smaller cleanups and improvements
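
For illustration, a minimal sketch of what the new state test looks like for
out-of-scheduler code, built on the task_is_runnable() helper added in
include/linux/sched.h below; the example_wants_to_sample() wrapper is a
hypothetical caller, not something in the tree:

/*
 * Illustrative sketch only. A delayed-dequeue task still has p->on_rq set
 * (it stays on its runqueue until it is picked again and dropped), but it
 * is logically blocked, so external ->on_rq users go through
 * task_is_runnable(), which also checks p->se.sched_delayed.
 */
#include <linux/sched.h>

static bool example_wants_to_sample(struct task_struct *p)
{
	/* hypothetical helper: only look at tasks that can actually run */
	return task_is_runnable(p);
}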

* tag 'sched_urgent_for_v6.12_rc4' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip:
  MAINTAINERS: Add an entry for PREEMPT_RT.
  sched/fair: Fix external p->on_rq users
  sched/psi: Fix mistaken CPU pressure indication after corrupted task state bug
  sched/core: Dequeue PSI signals for blocked tasks that are delayed
  sched: Fix delayed_dequeue vs switched_from_fair()
  sched/core: Disable page allocation in task_tick_mm_cid()
  sched/deadline: Use hrtick_enabled_dl() before start_hrtick_dl()
  sched/eevdf: Fix wakeup-preempt by checking cfs_rq->nr_running
  sched: Fix sched_delayed vs cfs_bandwidth
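
An illustrative sketch of the allocation fix (queue_deferred_scan() is an
invented name; the real change is the task_tick_mm_cid() hunk in
kernel/sched/core.c below): work queued from a context holding the runqueue
lock now ORs TWAF_NO_ALLOC into the task_work notify mode so that
task_work_add() will not allocate memory.

/*
 * Illustrative sketch only. queue_deferred_scan() is a hypothetical caller
 * running with rq->lock held; TWAF_NO_ALLOC (added to
 * include/linux/task_work.h below) marks the request as one that must not
 * allocate.
 */
#include <linux/task_work.h>

static void queue_deferred_scan(struct task_struct *curr, struct callback_head *work)
{
	/* no page allocation while the runqueue lock is held */
	task_work_add(curr, work, TWA_RESUME | TWAF_NO_ALLOC);
}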
torvalds committed Oct 20, 2024
2 parents a5ee44c + 5ec36fe commit 2b4d250
Showing 17 changed files with 146 additions and 72 deletions.
8 changes: 8 additions & 0 deletions MAINTAINERS
@@ -19527,6 +19527,14 @@ S: Maintained
F: Documentation/tools/rtla/
F: tools/tracing/rtla/

Real-time Linux (PREEMPT_RT)
M: Sebastian Andrzej Siewior <[email protected]>
M: Clark Williams <[email protected]>
M: Steven Rostedt <[email protected]>
L: [email protected]
S: Supported
K: PREEMPT_RT

REALTEK AUDIO CODECS
M: Oder Chiou <[email protected]>
S: Maintained
5 changes: 5 additions & 0 deletions include/linux/sched.h
@@ -2133,6 +2133,11 @@ static inline void set_task_cpu(struct task_struct *p, unsigned int cpu)

#endif /* CONFIG_SMP */

static inline bool task_is_runnable(struct task_struct *p)
{
return p->on_rq && !p->se.sched_delayed;
}

extern bool sched_task_on_rq(struct task_struct *p);
extern unsigned long get_wchan(struct task_struct *p);
extern struct task_struct *cpu_curr_snapshot(int cpu);
5 changes: 4 additions & 1 deletion include/linux/task_work.h
@@ -14,11 +14,14 @@ init_task_work(struct callback_head *twork, task_work_func_t func)
}

enum task_work_notify_mode {
TWA_NONE,
TWA_NONE = 0,
TWA_RESUME,
TWA_SIGNAL,
TWA_SIGNAL_NO_IPI,
TWA_NMI_CURRENT,

TWA_FLAGS = 0xff00,
TWAF_NO_ALLOC = 0x0100,
};

static inline bool task_work_pending(struct task_struct *task)
2 changes: 1 addition & 1 deletion kernel/events/core.c
@@ -9251,7 +9251,7 @@ static void perf_event_switch(struct task_struct *task,
},
};

if (!sched_in && task->on_rq) {
if (!sched_in && task_is_runnable(task)) {
switch_event.event_id.header.misc |=
PERF_RECORD_MISC_SWITCH_OUT_PREEMPT;
}
7 changes: 6 additions & 1 deletion kernel/freezer.c
@@ -109,7 +109,12 @@ static int __set_task_frozen(struct task_struct *p, void *arg)
{
unsigned int state = READ_ONCE(p->__state);

if (p->on_rq)
/*
* Allow freezing the sched_delayed tasks; they will not execute until
* ttwu() fixes them up, so it is safe to swap their state now, instead
* of waiting for them to get fully dequeued.
*/
if (task_is_runnable(p))
return 0;

if (p != current && task_curr(p))
9 changes: 9 additions & 0 deletions kernel/rcu/tasks.h
@@ -985,6 +985,15 @@ static bool rcu_tasks_is_holdout(struct task_struct *t)
if (!READ_ONCE(t->on_rq))
return false;

/*
* t->on_rq && !t->se.sched_delayed *could* be considered sleeping but
* since it is a spurious state (it will transition into the
* traditional blocked state or get woken up without outside
* dependencies), not considering it such should only affect timing.
*
* Be conservative for now and not include it.
*/

/*
* Idle tasks (or idle injection) within the idle loop are RCU-tasks
* quiescent states. But CPU boot code performed by the idle task
61 changes: 39 additions & 22 deletions kernel/sched/core.c
@@ -548,6 +548,11 @@ sched_core_dequeue(struct rq *rq, struct task_struct *p, int flags) { }
* ON_RQ_MIGRATING state is used for migration without holding both
* rq->locks. It indicates task_cpu() is not stable, see task_rq_lock().
*
* Additionally it is possible to be ->on_rq but still be considered not
* runnable when p->se.sched_delayed is true. These tasks are on the runqueue
* but will be dequeued as soon as they get picked again. See the
* task_is_runnable() helper.
*
* p->on_cpu <- { 0, 1 }:
*
* is set by prepare_task() and cleared by finish_task() such that it will be
@@ -2012,18 +2017,18 @@ void enqueue_task(struct rq *rq, struct task_struct *p, int flags)
if (!(flags & ENQUEUE_NOCLOCK))
update_rq_clock(rq);

if (!(flags & ENQUEUE_RESTORE)) {
sched_info_enqueue(rq, p);
psi_enqueue(p, (flags & ENQUEUE_WAKEUP) && !(flags & ENQUEUE_MIGRATED));
}

p->sched_class->enqueue_task(rq, p, flags);
/*
* Must be after ->enqueue_task() because ENQUEUE_DELAYED can clear
* ->sched_delayed.
*/
uclamp_rq_inc(rq, p);

if (!(flags & ENQUEUE_RESTORE)) {
sched_info_enqueue(rq, p);
psi_enqueue(p, flags & ENQUEUE_MIGRATED);
}

if (sched_core_enabled(rq))
sched_core_enqueue(rq, p);
}
@@ -2041,7 +2046,7 @@ inline bool dequeue_task(struct rq *rq, struct task_struct *p, int flags)

if (!(flags & DEQUEUE_SAVE)) {
sched_info_dequeue(rq, p);
psi_dequeue(p, flags & DEQUEUE_SLEEP);
psi_dequeue(p, !(flags & DEQUEUE_SLEEP));
}

/*
@@ -4323,9 +4328,10 @@ static bool __task_needs_rq_lock(struct task_struct *p)
* @arg: Argument to function.
*
* Fix the task in it's current state by avoiding wakeups and or rq operations
* and call @func(@arg) on it. This function can use ->on_rq and task_curr()
* to work out what the state is, if required. Given that @func can be invoked
* with a runqueue lock held, it had better be quite lightweight.
* and call @func(@arg) on it. This function can use task_is_runnable() and
* task_curr() to work out what the state is, if required. Given that @func
* can be invoked with a runqueue lock held, it had better be quite
* lightweight.
*
* Returns:
* Whatever @func returns
@@ -6544,6 +6550,7 @@ static void __sched notrace __schedule(int sched_mode)
* as a preemption by schedule_debug() and RCU.
*/
bool preempt = sched_mode > SM_NONE;
bool block = false;
unsigned long *switch_count;
unsigned long prev_state;
struct rq_flags rf;
@@ -6629,6 +6636,7 @@ static void __sched notrace __schedule(int sched_mode)
* After this, schedule() must not care about p->state any more.
*/
block_task(rq, prev, flags);
block = true;
}
switch_count = &prev->nvcsw;
}
@@ -6674,7 +6682,7 @@ static void __sched notrace __schedule(int sched_mode)

migrate_disable_switch(rq, prev);
psi_account_irqtime(rq, prev, next);
psi_sched_switch(prev, next, !task_on_rq_queued(prev));
psi_sched_switch(prev, next, block);

trace_sched_switch(preempt, prev, next, prev_state);

@@ -7017,20 +7025,20 @@ int default_wake_function(wait_queue_entry_t *curr, unsigned mode, int wake_flag
}
EXPORT_SYMBOL(default_wake_function);

void __setscheduler_prio(struct task_struct *p, int prio)
const struct sched_class *__setscheduler_class(struct task_struct *p, int prio)
{
if (dl_prio(prio))
p->sched_class = &dl_sched_class;
else if (rt_prio(prio))
p->sched_class = &rt_sched_class;
return &dl_sched_class;

if (rt_prio(prio))
return &rt_sched_class;

#ifdef CONFIG_SCHED_CLASS_EXT
else if (task_should_scx(p))
p->sched_class = &ext_sched_class;
if (task_should_scx(p))
return &ext_sched_class;
#endif
else
p->sched_class = &fair_sched_class;

p->prio = prio;
return &fair_sched_class;
}

#ifdef CONFIG_RT_MUTEXES
@@ -7076,7 +7084,7 @@ void rt_mutex_setprio(struct task_struct *p, struct task_struct *pi_task)
{
int prio, oldprio, queued, running, queue_flag =
DEQUEUE_SAVE | DEQUEUE_MOVE | DEQUEUE_NOCLOCK;
const struct sched_class *prev_class;
const struct sched_class *prev_class, *next_class;
struct rq_flags rf;
struct rq *rq;

@@ -7134,6 +7142,11 @@ void rt_mutex_setprio(struct task_struct *p, struct task_struct *pi_task)
queue_flag &= ~DEQUEUE_MOVE;

prev_class = p->sched_class;
next_class = __setscheduler_class(p, prio);

if (prev_class != next_class && p->se.sched_delayed)
dequeue_task(rq, p, DEQUEUE_SLEEP | DEQUEUE_DELAYED | DEQUEUE_NOCLOCK);

queued = task_on_rq_queued(p);
running = task_current(rq, p);
if (queued)
Expand Down Expand Up @@ -7171,7 +7184,9 @@ void rt_mutex_setprio(struct task_struct *p, struct task_struct *pi_task)
p->rt.timeout = 0;
}

__setscheduler_prio(p, prio);
p->sched_class = next_class;
p->prio = prio;

check_class_changing(rq, p, prev_class);

if (queued)
@@ -10465,7 +10480,9 @@ void task_tick_mm_cid(struct rq *rq, struct task_struct *curr)
return;
if (time_before(now, READ_ONCE(curr->mm->mm_cid_next_scan)))
return;
task_work_add(curr, work, TWA_RESUME);

/* No page allocation under rq lock */
task_work_add(curr, work, TWA_RESUME | TWAF_NO_ALLOC);
}

void sched_mm_cid_exit_signals(struct task_struct *t)
2 changes: 1 addition & 1 deletion kernel/sched/deadline.c
@@ -2385,7 +2385,7 @@ static void set_next_task_dl(struct rq *rq, struct task_struct *p, bool first)

deadline_queue_push_tasks(rq);

if (hrtick_enabled(rq))
if (hrtick_enabled_dl(rq))
start_hrtick_dl(rq, &p->dl);
}

4 changes: 2 additions & 2 deletions kernel/sched/ext.c
@@ -4493,7 +4493,7 @@ static void scx_ops_disable_workfn(struct kthread_work *work)

sched_deq_and_put_task(p, DEQUEUE_SAVE | DEQUEUE_MOVE, &ctx);

__setscheduler_prio(p, p->prio);
p->sched_class = __setscheduler_class(p, p->prio);
check_class_changing(task_rq(p), p, old_class);

sched_enq_and_set_task(&ctx);
@@ -5204,7 +5204,7 @@ static int scx_ops_enable(struct sched_ext_ops *ops, struct bpf_link *link)
sched_deq_and_put_task(p, DEQUEUE_SAVE | DEQUEUE_MOVE, &ctx);

p->scx.slice = SCX_SLICE_DFL;
__setscheduler_prio(p, p->prio);
p->sched_class = __setscheduler_class(p, p->prio);
check_class_changing(task_rq(p), p, old_class);

sched_enq_and_set_task(&ctx);
27 changes: 7 additions & 20 deletions kernel/sched/fair.c
@@ -1247,7 +1247,7 @@ static void update_curr(struct cfs_rq *cfs_rq)

account_cfs_rq_runtime(cfs_rq, delta_exec);

if (rq->nr_running == 1)
if (cfs_rq->nr_running == 1)
return;

if (resched || did_preempt_short(cfs_rq, curr)) {
@@ -6058,10 +6058,13 @@ void unthrottle_cfs_rq(struct cfs_rq *cfs_rq)
for_each_sched_entity(se) {
struct cfs_rq *qcfs_rq = cfs_rq_of(se);

if (se->on_rq) {
SCHED_WARN_ON(se->sched_delayed);
/* Handle any unfinished DELAY_DEQUEUE business first. */
if (se->sched_delayed) {
int flags = DEQUEUE_SLEEP | DEQUEUE_DELAYED;

dequeue_entity(qcfs_rq, se, flags);
} else if (se->on_rq)
break;
}
enqueue_entity(qcfs_rq, se, ENQUEUE_WAKEUP);

if (cfs_rq_is_idle(group_cfs_rq(se)))
@@ -13174,22 +13177,6 @@ static void attach_task_cfs_rq(struct task_struct *p)
static void switched_from_fair(struct rq *rq, struct task_struct *p)
{
detach_task_cfs_rq(p);
/*
* Since this is called after changing class, this is a little weird
* and we cannot use DEQUEUE_DELAYED.
*/
if (p->se.sched_delayed) {
/* First, dequeue it from its new class' structures */
dequeue_task(rq, p, DEQUEUE_NOCLOCK | DEQUEUE_SLEEP);
/*
* Now, clean up the fair_sched_class side of things
* related to sched_delayed being true and that wasn't done
* due to the generic dequeue not using DEQUEUE_DELAYED.
*/
finish_delayed_dequeue_entity(&p->se);
p->se.rel_deadline = 0;
__block_task(rq, p);
}
}

static void switched_to_fair(struct rq *rq, struct task_struct *p)
2 changes: 1 addition & 1 deletion kernel/sched/sched.h
@@ -3800,7 +3800,7 @@ static inline int rt_effective_prio(struct task_struct *p, int prio)

extern int __sched_setscheduler(struct task_struct *p, const struct sched_attr *attr, bool user, bool pi);
extern int __sched_setaffinity(struct task_struct *p, struct affinity_context *ctx);
extern void __setscheduler_prio(struct task_struct *p, int prio);
extern const struct sched_class *__setscheduler_class(struct task_struct *p, int prio);
extern void set_load_weight(struct task_struct *p, bool update_load);
extern void enqueue_task(struct rq *rq, struct task_struct *p, int flags);
extern bool dequeue_task(struct rq *rq, struct task_struct *p, int flags);