sched: Make schedstats helpers independent of fair sched class
The original prototypes of the schedstats helpers are

  update_stats_wait_*(struct cfs_rq *cfs_rq, struct sched_entity *se)

The cfs_rq argument in these helpers is used to get the rq_clock, and the
se is used to get the struct sched_statistics and the struct task_struct.
To make these helpers available to all sched classes, we can pass the rq,
the sched_statistics and the task_struct directly instead.
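
Internally, those fair-specific arguments were only used to derive the
generic objects; condensed from the code below, this is roughly:

  rq    = rq_of(cfs_rq);                  /* for rq_clock() */
  stats = __schedstats_from_se(se);       /* per-entity statistics */
  p     = entity_is_task(se) ? task_of(se) : NULL;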

Then the new helpers are

  update_stats_wait_*(struct rq *rq, struct task_struct *p,
                      struct sched_statistics *stats)

which are independent of fair sched class.

To avoid bloating vmlinux or introducing overhead when
!schedstat_enabled(), new helpers that are called only after the
schedstat_enabled() check are also introduced, as suggested by Mel. These
helpers live in kernel/sched/stats.c:

  __update_stats_wait_*(struct rq *rq, struct task_struct *p,
                        struct sched_statistics *stats)


The size of vmlinux is as follows (in bytes):

                      Before          After
  Size of vmlinux     826308552       826304640

The size is a little smaller because some functions are no longer inlined
after the change.

I also compared scheduler performance with 'perf bench sched pipe', as
suggested by Mel. The results are as follows (in usecs/op):
                             Before                After
  kernel.sched_schedstats=0  5.2~5.4               5.2~5.4
  kernel.sched_schedstats=1  5.3~5.5               5.3~5.5

[These numbers differ slightly from those in the previous version because
my old test machine was destroyed and I had to use a different machine.]
Almost no difference.

No functional change.

[[email protected]: reported a build failure in the previous version]

Signed-off-by: Yafang Shao <[email protected]>
Signed-off-by: Peter Zijlstra (Intel) <[email protected]>
Acked-by: Mel Gorman <[email protected]>
Link: https://lore.kernel.org/r/[email protected]
laoar authored and Peter Zijlstra committed Oct 5, 2021
1 parent ceeadb8 commit 60f2415
Showing 3 changed files with 152 additions and 115 deletions.
134 changes: 19 additions & 115 deletions kernel/sched/fair.c
@@ -887,32 +887,27 @@ static void update_curr_fair(struct rq *rq)
}

static inline void
update_stats_wait_start(struct cfs_rq *cfs_rq, struct sched_entity *se)
update_stats_wait_start_fair(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
u64 wait_start, prev_wait_start;
struct sched_statistics *stats;
struct task_struct *p = NULL;

if (!schedstat_enabled())
return;

stats = __schedstats_from_se(se);

wait_start = rq_clock(rq_of(cfs_rq));
prev_wait_start = schedstat_val(stats->wait_start);

if (entity_is_task(se) && task_on_rq_migrating(task_of(se)) &&
likely(wait_start > prev_wait_start))
wait_start -= prev_wait_start;
if (entity_is_task(se))
p = task_of(se);

__schedstat_set(stats->wait_start, wait_start);
__update_stats_wait_start(rq_of(cfs_rq), p, stats);
}

static inline void
update_stats_wait_end(struct cfs_rq *cfs_rq, struct sched_entity *se)
update_stats_wait_end_fair(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
struct sched_statistics *stats;
struct task_struct *p = NULL;
u64 delta;

if (!schedstat_enabled())
return;
@@ -928,105 +923,34 @@ update_stats_wait_end(struct cfs_rq *cfs_rq, struct sched_entity *se)
if (unlikely(!schedstat_val(stats->wait_start)))
return;

delta = rq_clock(rq_of(cfs_rq)) - schedstat_val(stats->wait_start);

if (entity_is_task(se)) {
if (entity_is_task(se))
p = task_of(se);
if (task_on_rq_migrating(p)) {
/*
* Preserve migrating task's wait time so wait_start
* time stamp can be adjusted to accumulate wait time
* prior to migration.
*/
__schedstat_set(stats->wait_start, delta);
return;
}
trace_sched_stat_wait(p, delta);
}

__schedstat_set(stats->wait_max,
max(schedstat_val(stats->wait_max), delta));
__schedstat_inc(stats->wait_count);
__schedstat_add(stats->wait_sum, delta);
__schedstat_set(stats->wait_start, 0);
__update_stats_wait_end(rq_of(cfs_rq), p, stats);
}

static inline void
update_stats_enqueue_sleeper(struct cfs_rq *cfs_rq, struct sched_entity *se)
update_stats_enqueue_sleeper_fair(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
struct sched_statistics *stats;
struct task_struct *tsk = NULL;
u64 sleep_start, block_start;

if (!schedstat_enabled())
return;

stats = __schedstats_from_se(se);

sleep_start = schedstat_val(stats->sleep_start);
block_start = schedstat_val(stats->block_start);

if (entity_is_task(se))
tsk = task_of(se);

if (sleep_start) {
u64 delta = rq_clock(rq_of(cfs_rq)) - sleep_start;

if ((s64)delta < 0)
delta = 0;

if (unlikely(delta > schedstat_val(stats->sleep_max)))
__schedstat_set(stats->sleep_max, delta);

__schedstat_set(stats->sleep_start, 0);
__schedstat_add(stats->sum_sleep_runtime, delta);

if (tsk) {
account_scheduler_latency(tsk, delta >> 10, 1);
trace_sched_stat_sleep(tsk, delta);
}
}
if (block_start) {
u64 delta = rq_clock(rq_of(cfs_rq)) - block_start;

if ((s64)delta < 0)
delta = 0;

if (unlikely(delta > schedstat_val(stats->block_max)))
__schedstat_set(stats->block_max, delta);

__schedstat_set(stats->block_start, 0);
__schedstat_add(stats->sum_sleep_runtime, delta);

if (tsk) {
if (tsk->in_iowait) {
__schedstat_add(stats->iowait_sum, delta);
__schedstat_inc(stats->iowait_count);
trace_sched_stat_iowait(tsk, delta);
}

trace_sched_stat_blocked(tsk, delta);

/*
* Blocking time is in units of nanosecs, so shift by
* 20 to get a milliseconds-range estimation of the
* amount of time that the task spent sleeping:
*/
if (unlikely(prof_on == SLEEP_PROFILING)) {
profile_hits(SLEEP_PROFILING,
(void *)get_wchan(tsk),
delta >> 20);
}
account_scheduler_latency(tsk, delta >> 10, 0);
}
}
__update_stats_enqueue_sleeper(rq_of(cfs_rq), tsk, stats);
}

/*
* Task is being enqueued - update stats:
*/
static inline void
update_stats_enqueue(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
update_stats_enqueue_fair(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
{
if (!schedstat_enabled())
return;
@@ -1036,14 +960,14 @@ update_stats_enqueue(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
* a dequeue/enqueue event is a NOP)
*/
if (se != cfs_rq->curr)
update_stats_wait_start(cfs_rq, se);
update_stats_wait_start_fair(cfs_rq, se);

if (flags & ENQUEUE_WAKEUP)
update_stats_enqueue_sleeper(cfs_rq, se);
update_stats_enqueue_sleeper_fair(cfs_rq, se);
}

static inline void
update_stats_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
update_stats_dequeue_fair(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
{

if (!schedstat_enabled())
@@ -1054,7 +978,7 @@ update_stats_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
* waiting task:
*/
if (se != cfs_rq->curr)
update_stats_wait_end(cfs_rq, se);
update_stats_wait_end_fair(cfs_rq, se);

if ((flags & DEQUEUE_SLEEP) && entity_is_task(se)) {
struct task_struct *tsk = task_of(se);
@@ -4267,26 +4191,6 @@ place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int initial)

static void check_enqueue_throttle(struct cfs_rq *cfs_rq);

static inline void check_schedstat_required(void)
{
#ifdef CONFIG_SCHEDSTATS
if (schedstat_enabled())
return;

/* Force schedstat enabled if a dependent tracepoint is active */
if (trace_sched_stat_wait_enabled() ||
trace_sched_stat_sleep_enabled() ||
trace_sched_stat_iowait_enabled() ||
trace_sched_stat_blocked_enabled() ||
trace_sched_stat_runtime_enabled()) {
printk_deferred_once("Scheduler tracepoints stat_sleep, stat_iowait, "
"stat_blocked and stat_runtime require the "
"kernel parameter schedstats=enable or "
"kernel.sched_schedstats=1\n");
}
#endif
}

static inline bool cfs_bandwidth_used(void);

/*
@@ -4360,7 +4264,7 @@ enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
place_entity(cfs_rq, se, 0);

check_schedstat_required();
update_stats_enqueue(cfs_rq, se, flags);
update_stats_enqueue_fair(cfs_rq, se, flags);
check_spread(cfs_rq, se);
if (!curr)
__enqueue_entity(cfs_rq, se);
@@ -4444,7 +4348,7 @@ dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
update_load_avg(cfs_rq, se, UPDATE_TG);
se_update_runnable(se);

update_stats_dequeue(cfs_rq, se, flags);
update_stats_dequeue_fair(cfs_rq, se, flags);

clear_buddies(cfs_rq, se);

@@ -4529,7 +4433,7 @@ set_next_entity(struct cfs_rq *cfs_rq, struct sched_entity *se)
* a CPU. So account for the time it spent waiting on the
* runqueue.
*/
update_stats_wait_end(cfs_rq, se);
update_stats_wait_end_fair(cfs_rq, se);
__dequeue_entity(cfs_rq, se);
update_load_avg(cfs_rq, se, UPDATE_TG);
}
@@ -4631,7 +4535,7 @@ static void put_prev_entity(struct cfs_rq *cfs_rq, struct sched_entity *prev)
check_spread(cfs_rq, prev);

if (prev->on_rq) {
update_stats_wait_start(cfs_rq, prev);
update_stats_wait_start_fair(cfs_rq, prev);
/* Put 'current' back into the tree. */
__enqueue_entity(cfs_rq, prev);
/* in !on_rq case, update occurred at dequeue */
103 changes: 103 additions & 0 deletions kernel/sched/stats.c
@@ -4,6 +4,109 @@
*/
#include "sched.h"

void __update_stats_wait_start(struct rq *rq, struct task_struct *p,
struct sched_statistics *stats)
{
u64 wait_start, prev_wait_start;

wait_start = rq_clock(rq);
prev_wait_start = schedstat_val(stats->wait_start);

if (p && likely(wait_start > prev_wait_start))
wait_start -= prev_wait_start;

__schedstat_set(stats->wait_start, wait_start);
}

void __update_stats_wait_end(struct rq *rq, struct task_struct *p,
struct sched_statistics *stats)
{
u64 delta = rq_clock(rq) - schedstat_val(stats->wait_start);

if (p) {
if (task_on_rq_migrating(p)) {
/*
* Preserve migrating task's wait time so wait_start
* time stamp can be adjusted to accumulate wait time
* prior to migration.
*/
__schedstat_set(stats->wait_start, delta);

return;
}

trace_sched_stat_wait(p, delta);
}

__schedstat_set(stats->wait_max,
max(schedstat_val(stats->wait_max), delta));
__schedstat_inc(stats->wait_count);
__schedstat_add(stats->wait_sum, delta);
__schedstat_set(stats->wait_start, 0);
}

void __update_stats_enqueue_sleeper(struct rq *rq, struct task_struct *p,
struct sched_statistics *stats)
{
u64 sleep_start, block_start;

sleep_start = schedstat_val(stats->sleep_start);
block_start = schedstat_val(stats->block_start);

if (sleep_start) {
u64 delta = rq_clock(rq) - sleep_start;

if ((s64)delta < 0)
delta = 0;

if (unlikely(delta > schedstat_val(stats->sleep_max)))
__schedstat_set(stats->sleep_max, delta);

__schedstat_set(stats->sleep_start, 0);
__schedstat_add(stats->sum_sleep_runtime, delta);

if (p) {
account_scheduler_latency(p, delta >> 10, 1);
trace_sched_stat_sleep(p, delta);
}
}

if (block_start) {
u64 delta = rq_clock(rq) - block_start;

if ((s64)delta < 0)
delta = 0;

if (unlikely(delta > schedstat_val(stats->block_max)))
__schedstat_set(stats->block_max, delta);

__schedstat_set(stats->block_start, 0);
__schedstat_add(stats->sum_sleep_runtime, delta);

if (p) {
if (p->in_iowait) {
__schedstat_add(stats->iowait_sum, delta);
__schedstat_inc(stats->iowait_count);
trace_sched_stat_iowait(p, delta);
}

trace_sched_stat_blocked(p, delta);

/*
* Blocking time is in units of nanosecs, so shift by
* 20 to get a milliseconds-range estimation of the
* amount of time that the task spent sleeping:
*/
if (unlikely(prof_on == SLEEP_PROFILING)) {
profile_hits(SLEEP_PROFILING,
(void *)get_wchan(p),
delta >> 20);
}
account_scheduler_latency(p, delta >> 10, 0);
}
}
}

/*
* Current schedstat API version.
*
