Skip to content

Commit

Permalink
sched/uclamp: Add CPU's clamp buckets refcounting
Browse files Browse the repository at this point in the history
Utilization clamping allows to clamp the CPU's utilization within a
[util_min, util_max] range, depending on the set of RUNNABLE tasks on
that CPU. Each task references two "clamp buckets" defining its minimum
and maximum (util_{min,max}) utilization "clamp values". A CPU's clamp
bucket is active if there is at least one RUNNABLE tasks enqueued on
that CPU and refcounting that bucket.

When a task is {en,de}queued {on,from} a rq, the set of active clamp
buckets on that CPU can change. If the set of active clamp buckets
changes for a CPU a new "aggregated" clamp value is computed for that
CPU. This is because each clamp bucket enforces a different utilization
clamp value.

Clamp values are always MAX aggregated for both util_min and util_max.
This ensures that no task can affect the performance of other
co-scheduled tasks which are more boosted (i.e. with higher util_min
clamp) or less capped (i.e. with higher util_max clamp).

A task has:
   task_struct::uclamp[clamp_id]::bucket_id
to track the "bucket index" of the CPU's clamp bucket it refcounts while
enqueued, for each clamp index (clamp_id).

A runqueue has:
   rq::uclamp[clamp_id]::bucket[bucket_id].tasks
to track how many RUNNABLE tasks on that CPU refcount each
clamp bucket (bucket_id) of a clamp index (clamp_id).
It also has a:
   rq::uclamp[clamp_id]::bucket[bucket_id].value
to track the clamp value of each clamp bucket (bucket_id) of a clamp
index (clamp_id).

The rq::uclamp::bucket[clamp_id][] array is scanned every time it's
needed to find a new MAX aggregated clamp value for a clamp_id. This
operation is required only when it's dequeued the last task of a clamp
bucket tracking the current MAX aggregated clamp value. In this case,
the CPU is either entering IDLE or going to schedule a less boosted or
more clamped task.
The expected number of different clamp values configured at build time
is small enough to fit the full unordered array into a single cache
line, for configurations of up to 7 buckets.

Add to struct rq the basic data structures required to refcount the
number of RUNNABLE tasks for each clamp bucket. Add also the max
aggregation required to update the rq's clamp value at each
enqueue/dequeue event.

Use a simple linear mapping of clamp values into clamp buckets.
Pre-compute and cache bucket_id to avoid integer divisions at
enqueue/dequeue time.

Signed-off-by: Patrick Bellasi <[email protected]>
Signed-off-by: Peter Zijlstra (Intel) <[email protected]>
Cc: Alessio Balsini <[email protected]>
Cc: Dietmar Eggemann <[email protected]>
Cc: Joel Fernandes <[email protected]>
Cc: Juri Lelli <[email protected]>
Cc: Linus Torvalds <[email protected]>
Cc: Morten Rasmussen <[email protected]>
Cc: Paul Turner <[email protected]>
Cc: Peter Zijlstra <[email protected]>
Cc: Quentin Perret <[email protected]>
Cc: Rafael J . Wysocki <[email protected]>
Cc: Steve Muckle <[email protected]>
Cc: Suren Baghdasaryan <[email protected]>
Cc: Tejun Heo <[email protected]>
Cc: Thomas Gleixner <[email protected]>
Cc: Todd Kjos <[email protected]>
Cc: Vincent Guittot <[email protected]>
Cc: Viresh Kumar <[email protected]>
Link: https://lkml.kernel.org/r/[email protected]
Signed-off-by: Ingo Molnar <[email protected]>
  • Loading branch information
derkling authored and Ingo Molnar committed Jun 24, 2019
1 parent a3df067 commit 69842cb
Show file tree
Hide file tree
Showing 6 changed files with 343 additions and 6 deletions.
34 changes: 34 additions & 0 deletions include/linux/log2.h
Original file line number Diff line number Diff line change
Expand Up @@ -220,4 +220,38 @@ int __order_base_2(unsigned long n)
ilog2((n) - 1) + 1) : \
__order_base_2(n) \
)

static inline __attribute__((const))
int __bits_per(unsigned long n)
{
if (n < 2)
return 1;
if (is_power_of_2(n))
return order_base_2(n) + 1;
return order_base_2(n);
}

/**
* bits_per - calculate the number of bits required for the argument
* @n: parameter
*
* This is constant-capable and can be used for compile time
* initializations, e.g bitfields.
*
* The first few values calculated by this routine:
* bf(0) = 1
* bf(1) = 1
* bf(2) = 2
* bf(3) = 2
* bf(4) = 3
* ... and so on.
*/
#define bits_per(n) \
( \
__builtin_constant_p(n) ? ( \
((n) == 0 || (n) == 1) \
? 1 : ilog2(n) + 1 \
) : \
__bits_per(n) \
)
#endif /* _LINUX_LOG2_H */
39 changes: 39 additions & 0 deletions include/linux/sched.h
Original file line number Diff line number Diff line change
Expand Up @@ -283,6 +283,18 @@ struct vtime {
u64 gtime;
};

/*
* Utilization clamp constraints.
* @UCLAMP_MIN: Minimum utilization
* @UCLAMP_MAX: Maximum utilization
* @UCLAMP_CNT: Utilization clamp constraints count
*/
enum uclamp_id {
UCLAMP_MIN = 0,
UCLAMP_MAX,
UCLAMP_CNT
};

struct sched_info {
#ifdef CONFIG_SCHED_INFO
/* Cumulative counters: */
Expand Down Expand Up @@ -314,6 +326,10 @@ struct sched_info {
# define SCHED_FIXEDPOINT_SHIFT 10
# define SCHED_FIXEDPOINT_SCALE (1L << SCHED_FIXEDPOINT_SHIFT)

/* Increase resolution of cpu_capacity calculations */
# define SCHED_CAPACITY_SHIFT SCHED_FIXEDPOINT_SHIFT
# define SCHED_CAPACITY_SCALE (1L << SCHED_CAPACITY_SHIFT)

struct load_weight {
unsigned long weight;
u32 inv_weight;
Expand Down Expand Up @@ -562,6 +578,25 @@ struct sched_dl_entity {
struct hrtimer inactive_timer;
};

#ifdef CONFIG_UCLAMP_TASK
/* Number of utilization clamp buckets (shorter alias) */
#define UCLAMP_BUCKETS CONFIG_UCLAMP_BUCKETS_COUNT

/*
* Utilization clamp for a scheduling entity
* @value: clamp value "assigned" to a se
* @bucket_id: bucket index corresponding to the "assigned" value
*
* The bucket_id is the index of the clamp bucket matching the clamp value
* which is pre-computed and stored to avoid expensive integer divisions from
* the fast path.
*/
struct uclamp_se {
unsigned int value : bits_per(SCHED_CAPACITY_SCALE);
unsigned int bucket_id : bits_per(UCLAMP_BUCKETS);
};
#endif /* CONFIG_UCLAMP_TASK */

union rcu_special {
struct {
u8 blocked;
Expand Down Expand Up @@ -642,6 +677,10 @@ struct task_struct {
#endif
struct sched_dl_entity dl;

#ifdef CONFIG_UCLAMP_TASK
struct uclamp_se uclamp[UCLAMP_CNT];
#endif

#ifdef CONFIG_PREEMPT_NOTIFIERS
/* List of struct preempt_notifier: */
struct hlist_head preempt_notifiers;
Expand Down
6 changes: 0 additions & 6 deletions include/linux/sched/topology.h
Original file line number Diff line number Diff line change
Expand Up @@ -6,12 +6,6 @@

#include <linux/sched/idle.h>

/*
* Increase resolution of cpu_capacity calculations
*/
#define SCHED_CAPACITY_SHIFT SCHED_FIXEDPOINT_SHIFT
#define SCHED_CAPACITY_SCALE (1L << SCHED_CAPACITY_SHIFT)

/*
* sched-domains (multiprocessor balancing) declarations:
*/
Expand Down
53 changes: 53 additions & 0 deletions init/Kconfig
Original file line number Diff line number Diff line change
Expand Up @@ -677,6 +677,59 @@ config HAVE_UNSTABLE_SCHED_CLOCK
config GENERIC_SCHED_CLOCK
bool

menu "Scheduler features"

config UCLAMP_TASK
bool "Enable utilization clamping for RT/FAIR tasks"
depends on CPU_FREQ_GOV_SCHEDUTIL
help
This feature enables the scheduler to track the clamped utilization
of each CPU based on RUNNABLE tasks scheduled on that CPU.

With this option, the user can specify the min and max CPU
utilization allowed for RUNNABLE tasks. The max utilization defines
the maximum frequency a task should use while the min utilization
defines the minimum frequency it should use.

Both min and max utilization clamp values are hints to the scheduler,
aiming at improving its frequency selection policy, but they do not
enforce or grant any specific bandwidth for tasks.

If in doubt, say N.

config UCLAMP_BUCKETS_COUNT
int "Number of supported utilization clamp buckets"
range 5 20
default 5
depends on UCLAMP_TASK
help
Defines the number of clamp buckets to use. The range of each bucket
will be SCHED_CAPACITY_SCALE/UCLAMP_BUCKETS_COUNT. The higher the
number of clamp buckets the finer their granularity and the higher
the precision of clamping aggregation and tracking at run-time.

For example, with the minimum configuration value we will have 5
clamp buckets tracking 20% utilization each. A 25% boosted tasks will
be refcounted in the [20..39]% bucket and will set the bucket clamp
effective value to 25%.
If a second 30% boosted task should be co-scheduled on the same CPU,
that task will be refcounted in the same bucket of the first task and
it will boost the bucket clamp effective value to 30%.
The clamp effective value of a bucket is reset to its nominal value
(20% in the example above) when there are no more tasks refcounted in
that bucket.

An additional boost/capping margin can be added to some tasks. In the
example above the 25% task will be boosted to 30% until it exits the
CPU. If that should be considered not acceptable on certain systems,
it's always possible to reduce the margin by increasing the number of
clamp buckets to trade off used memory for run-time tracking
precision.

If in doubt, use the default value.

endmenu

#
# For architectures that want to enable the support for NUMA-affine scheduler
# balancing logic:
Expand Down
166 changes: 166 additions & 0 deletions kernel/sched/core.c
Original file line number Diff line number Diff line change
Expand Up @@ -772,6 +772,168 @@ static void set_load_weight(struct task_struct *p, bool update_load)
}
}

#ifdef CONFIG_UCLAMP_TASK

/* Integer rounded range for each bucket */
#define UCLAMP_BUCKET_DELTA DIV_ROUND_CLOSEST(SCHED_CAPACITY_SCALE, UCLAMP_BUCKETS)

#define for_each_clamp_id(clamp_id) \
for ((clamp_id) = 0; (clamp_id) < UCLAMP_CNT; (clamp_id)++)

static inline unsigned int uclamp_bucket_id(unsigned int clamp_value)
{
return clamp_value / UCLAMP_BUCKET_DELTA;
}

static inline unsigned int uclamp_none(int clamp_id)
{
if (clamp_id == UCLAMP_MIN)
return 0;
return SCHED_CAPACITY_SCALE;
}

static inline void uclamp_se_set(struct uclamp_se *uc_se, unsigned int value)
{
uc_se->value = value;
uc_se->bucket_id = uclamp_bucket_id(value);
}

static inline
unsigned int uclamp_rq_max_value(struct rq *rq, unsigned int clamp_id)
{
struct uclamp_bucket *bucket = rq->uclamp[clamp_id].bucket;
int bucket_id = UCLAMP_BUCKETS - 1;

/*
* Since both min and max clamps are max aggregated, find the
* top most bucket with tasks in.
*/
for ( ; bucket_id >= 0; bucket_id--) {
if (!bucket[bucket_id].tasks)
continue;
return bucket[bucket_id].value;
}

/* No tasks -- default clamp values */
return uclamp_none(clamp_id);
}

/*
* When a task is enqueued on a rq, the clamp bucket currently defined by the
* task's uclamp::bucket_id is refcounted on that rq. This also immediately
* updates the rq's clamp value if required.
*/
static inline void uclamp_rq_inc_id(struct rq *rq, struct task_struct *p,
unsigned int clamp_id)
{
struct uclamp_rq *uc_rq = &rq->uclamp[clamp_id];
struct uclamp_se *uc_se = &p->uclamp[clamp_id];
struct uclamp_bucket *bucket;

lockdep_assert_held(&rq->lock);

bucket = &uc_rq->bucket[uc_se->bucket_id];
bucket->tasks++;

if (uc_se->value > READ_ONCE(uc_rq->value))
WRITE_ONCE(uc_rq->value, bucket->value);
}

/*
* When a task is dequeued from a rq, the clamp bucket refcounted by the task
* is released. If this is the last task reference counting the rq's max
* active clamp value, then the rq's clamp value is updated.
*
* Both refcounted tasks and rq's cached clamp values are expected to be
* always valid. If it's detected they are not, as defensive programming,
* enforce the expected state and warn.
*/
static inline void uclamp_rq_dec_id(struct rq *rq, struct task_struct *p,
unsigned int clamp_id)
{
struct uclamp_rq *uc_rq = &rq->uclamp[clamp_id];
struct uclamp_se *uc_se = &p->uclamp[clamp_id];
struct uclamp_bucket *bucket;
unsigned int rq_clamp;

lockdep_assert_held(&rq->lock);

bucket = &uc_rq->bucket[uc_se->bucket_id];
SCHED_WARN_ON(!bucket->tasks);
if (likely(bucket->tasks))
bucket->tasks--;

if (likely(bucket->tasks))
return;

rq_clamp = READ_ONCE(uc_rq->value);
/*
* Defensive programming: this should never happen. If it happens,
* e.g. due to future modification, warn and fixup the expected value.
*/
SCHED_WARN_ON(bucket->value > rq_clamp);
if (bucket->value >= rq_clamp)
WRITE_ONCE(uc_rq->value, uclamp_rq_max_value(rq, clamp_id));
}

static inline void uclamp_rq_inc(struct rq *rq, struct task_struct *p)
{
unsigned int clamp_id;

if (unlikely(!p->sched_class->uclamp_enabled))
return;

for_each_clamp_id(clamp_id)
uclamp_rq_inc_id(rq, p, clamp_id);
}

static inline void uclamp_rq_dec(struct rq *rq, struct task_struct *p)
{
unsigned int clamp_id;

if (unlikely(!p->sched_class->uclamp_enabled))
return;

for_each_clamp_id(clamp_id)
uclamp_rq_dec_id(rq, p, clamp_id);
}

static void __init init_uclamp(void)
{
unsigned int clamp_id;
int cpu;

for_each_possible_cpu(cpu) {
struct uclamp_bucket *bucket;
struct uclamp_rq *uc_rq;
unsigned int bucket_id;

memset(&cpu_rq(cpu)->uclamp, 0, sizeof(struct uclamp_rq));

for_each_clamp_id(clamp_id) {
uc_rq = &cpu_rq(cpu)->uclamp[clamp_id];

bucket_id = 1;
while (bucket_id < UCLAMP_BUCKETS) {
bucket = &uc_rq->bucket[bucket_id];
bucket->value = bucket_id * UCLAMP_BUCKET_DELTA;
++bucket_id;
}
}
}

for_each_clamp_id(clamp_id) {
uclamp_se_set(&init_task.uclamp[clamp_id],
uclamp_none(clamp_id));
}
}

#else /* CONFIG_UCLAMP_TASK */
static inline void uclamp_rq_inc(struct rq *rq, struct task_struct *p) { }
static inline void uclamp_rq_dec(struct rq *rq, struct task_struct *p) { }
static inline void init_uclamp(void) { }
#endif /* CONFIG_UCLAMP_TASK */

static inline void enqueue_task(struct rq *rq, struct task_struct *p, int flags)
{
if (!(flags & ENQUEUE_NOCLOCK))
Expand All @@ -782,6 +944,7 @@ static inline void enqueue_task(struct rq *rq, struct task_struct *p, int flags)
psi_enqueue(p, flags & ENQUEUE_WAKEUP);
}

uclamp_rq_inc(rq, p);
p->sched_class->enqueue_task(rq, p, flags);
}

Expand All @@ -795,6 +958,7 @@ static inline void dequeue_task(struct rq *rq, struct task_struct *p, int flags)
psi_dequeue(p, flags & DEQUEUE_SLEEP);
}

uclamp_rq_dec(rq, p);
p->sched_class->dequeue_task(rq, p, flags);
}

Expand Down Expand Up @@ -6093,6 +6257,8 @@ void __init sched_init(void)

psi_init();

init_uclamp();

scheduler_running = 1;
}

Expand Down
Loading

0 comments on commit 69842cb

Please sign in to comment.