Merge branch 'sched-core-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip

Pull scheduler changes from Ingo Molnar:
 "Major changes:

   - Reworked CPU capacity code, for better SMP load balancing on
     systems with asymmetric CPUs. (Vincent Guittot, Morten Rasmussen)

   - Reworked RT task SMP balancing to be push based instead of pull
     based, to reduce latencies on large CPU count systems. (Steven
     Rostedt)

   - SCHED_DEADLINE support updates and fixes. (Juri Lelli)

   - SCHED_DEADLINE task migration support during CPU hotplug. (Wanpeng Li)

   - x86 mwait-idle optimizations and fixes. (Mike Galbraith, Len Brown)

   - sched/numa improvements. (Rik van Riel)

   - various cleanups"
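
Several of the capacity-tracking commits listed below ("Make sched entity usage tracking scale-invariant", "Optimize freq invariant accounting", "Make scale_rt invariant with frequency") revolve around one idea: weight the tracked running time by how fast the CPU was actually going. The sketch below is a standalone illustration of that idea only, not the kernel's implementation; the function name, parameters and example numbers are invented for this note.

#include <stdio.h>
#include <stdint.h>

#define SCHED_CAPACITY_SCALE 1024  /* fixed-point scale, as in the scheduler */

/*
 * Illustrative only: scale a raw running-time delta by the CPU's current
 * frequency relative to its maximum, and by the CPU's relative capacity,
 * so 10ms on a half-speed little CPU counts for less than 10ms on a
 * full-speed big CPU.
 */
static uint64_t scale_running_delta(uint64_t delta_us,
                                    unsigned int curr_freq,
                                    unsigned int max_freq,
                                    unsigned long cpu_capacity)
{
        uint64_t scaled;

        scaled = delta_us * curr_freq / max_freq;              /* frequency invariance */
        scaled = scaled * cpu_capacity / SCHED_CAPACITY_SCALE; /* capacity invariance */
        return scaled;
}

int main(void)
{
        /* 10ms of runtime on a CPU at half frequency with capacity 512/1024 */
        printf("%llu us\n",
               (unsigned long long)scale_running_delta(10000, 1000, 2000, 512));
        return 0;
}

With these assumed numbers the 10000us delta is accounted as 2500us, which is what lets load balancing compare utilization across asymmetric CPUs.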

* 'sched-core-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip: (28 commits)
  sched/core: Drop debugging leftover trace_printk call
  sched/deadline: Support DL task migration during CPU hotplug
  sched/core: Check for available DL bandwidth in cpuset_cpu_inactive()
  sched/deadline: Always enqueue on previous rq when dl_task_timer() fires
  sched/core: Remove unused argument from init_[rt|dl]_rq()
  sched/deadline: Fix rt runtime corruption when dl fails its global constraints
  sched/deadline: Avoid a superfluous check
  sched: Improve load balancing in the presence of idle CPUs
  sched: Optimize freq invariant accounting
  sched: Move CFS tasks to CPUs with higher capacity
  sched: Add SD_PREFER_SIBLING for SMT level
  sched: Remove unused struct sched_group_capacity::capacity_orig
  sched: Replace capacity_factor by usage
  sched: Calculate CPU's usage statistic and put it into struct sg_lb_stats::group_usage
  sched: Add struct rq::cpu_capacity_orig
  sched: Make scale_rt invariant with frequency
  sched: Make sched entity usage tracking scale-invariant
  sched: Remove frequency scaling from cpu_capacity
  sched: Track group sched_entity usage contributions
  sched: Add sched_avg::utilization_avg_contrib
  ...
torvalds committed Apr 13, 2015
2 parents cc76ee7 + 62a935b commit 49d2953
Showing 11 changed files with 690 additions and 235 deletions.
8 changes: 8 additions & 0 deletions arch/x86/include/asm/mwait.h
@@ -30,6 +30,14 @@ static inline void __mwait(unsigned long eax, unsigned long ecx)
                     :: "a" (eax), "c" (ecx));
}

static inline void __sti_mwait(unsigned long eax, unsigned long ecx)
{
        trace_hardirqs_on();
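        /*
         * "sti" has a one-instruction interrupt shadow, so no interrupt can
         * be delivered between it and the mwait below; a wakeup interrupt
         * cannot be lost in that window.
         */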
        /* "mwait %eax, %ecx;" */
        asm volatile("sti; .byte 0x0f, 0x01, 0xc9;"
                     :: "a" (eax), "c" (ecx));
}

/*
 * This uses new MONITOR/MWAIT instructions on P4 processors with PNI,
 * which can obviate IPI to trigger checking of need_resched.
51 changes: 51 additions & 0 deletions arch/x86/kernel/process.c
@@ -24,6 +24,7 @@
#include <asm/syscalls.h>
#include <asm/idle.h>
#include <asm/uaccess.h>
#include <asm/mwait.h>
#include <asm/i387.h>
#include <asm/fpu-internal.h>
#include <asm/debugreg.h>
@@ -399,6 +400,53 @@ static void amd_e400_idle(void)
        default_idle();
}

/*
 * Intel Core2 and older machines prefer MWAIT over HALT for C1.
 * We can't rely on cpuidle installing MWAIT, because it will not load
 * on systems that support only C1 -- so the boot default must be MWAIT.
 *
 * Some AMD machines are the opposite, they depend on using HALT.
 *
 * So for default C1, which is used during boot until cpuidle loads,
 * use MWAIT-C1 on Intel HW that has it, else use HALT.
 */
static int prefer_mwait_c1_over_halt(const struct cpuinfo_x86 *c)
{
        if (c->x86_vendor != X86_VENDOR_INTEL)
                return 0;

        if (!cpu_has(c, X86_FEATURE_MWAIT))
                return 0;

        return 1;
}

/*
 * MONITOR/MWAIT with no hints, used for the default C1 state.
 * This invokes MWAIT with interrupts enabled and no flags,
 * which is backwards compatible with the original MWAIT implementation.
 */

static void mwait_idle(void)
{
        if (!current_set_polling_and_test()) {
                if (this_cpu_has(X86_BUG_CLFLUSH_MONITOR)) {
                        smp_mb(); /* quirk */
                        clflush((void *)&current_thread_info()->flags);
                        smp_mb(); /* quirk */
                }

                __monitor((void *)&current_thread_info()->flags, 0, 0);
                if (!need_resched())
                        __sti_mwait(0, 0);
                else
                        local_irq_enable();
        } else {
                local_irq_enable();
        }
        __current_clr_polling();
}

void select_idle_routine(const struct cpuinfo_x86 *c)
{
#ifdef CONFIG_SMP
@@ -412,6 +460,9 @@ void select_idle_routine(const struct cpuinfo_x86 *c)
                /* E400: APIC timer interrupt does not wake up CPU from C1e */
                pr_info("using AMD E400 aware idle routine\n");
                x86_idle = amd_e400_idle;
        } else if (prefer_mwait_c1_over_halt(c)) {
                pr_info("using mwait in idle threads\n");
                x86_idle = mwait_idle;
        } else
                x86_idle = default_idle;
}
3 changes: 2 additions & 1 deletion include/linux/irq_work.h
@@ -38,16 +38,17 @@ bool irq_work_queue(struct irq_work *work);
bool irq_work_queue_on(struct irq_work *work, int cpu);
#endif

void irq_work_run(void);
void irq_work_tick(void);
void irq_work_sync(struct irq_work *work);

#ifdef CONFIG_IRQ_WORK
#include <asm/irq_work.h>

void irq_work_run(void);
bool irq_work_needs_cpu(void);
#else
static inline bool irq_work_needs_cpu(void) { return false; }
static inline void irq_work_run(void) { }
#endif

#endif /* _LINUX_IRQ_WORK_H */
21 changes: 17 additions & 4 deletions include/linux/sched.h
@@ -1123,15 +1123,28 @@ struct load_weight {
};

struct sched_avg {
        u64 last_runnable_update;
        s64 decay_count;
        /*
         * utilization_avg_contrib describes the amount of time that a
         * sched_entity is running on a CPU. It is based on running_avg_sum
         * and is scaled in the range [0..SCHED_LOAD_SCALE].
         * load_avg_contrib describes the amount of time that a sched_entity
         * is runnable on a rq. It is based on both runnable_avg_sum and the
         * weight of the task.
         */
        unsigned long load_avg_contrib, utilization_avg_contrib;
        /*
         * These sums represent an infinite geometric series and so are bound
         * above by 1024/(1-y). Thus we only need a u32 to store them for all
         * choices of y < 1-2^(-32)*1024.
         * running_avg_sum reflects the time that the sched_entity is
         * effectively running on the CPU.
         * runnable_avg_sum represents the amount of time a sched_entity is on
         * a runqueue which includes the running time that is monitored by
         * running_avg_sum.
         */
        u32 runnable_avg_sum, runnable_avg_period;
        u64 last_runnable_update;
        s64 decay_count;
        unsigned long load_avg_contrib;
        u32 runnable_avg_sum, avg_period, running_avg_sum;
};

#ifdef CONFIG_SCHEDSTATS
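
The struct sched_avg comment above notes that the running sums form a geometric series bounded by 1024/(1-y), so a u32 suffices. That bound is easy to sanity-check with a few lines of standalone C; the decay factor below assumes y^32 = 1/2 (a contribution halves every 32 periods, which is how the load-tracking code chooses y), and the program is illustrative rather than kernel code.

#include <stdio.h>
#include <math.h>

int main(void)
{
        /* assumed decay: y^32 = 1/2, i.e. a period's contribution halves every 32 periods */
        double y = pow(0.5, 1.0 / 32.0);

        /* each period contributes at most 1024, decayed geometrically forever */
        double bound = 1024.0 / (1.0 - y);

        /* prints roughly 47788 -- far below 2^32, so a u32 is plenty */
        printf("y = %.6f, bound = %.0f\n", y, bound);
        return 0;
}
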
96 changes: 50 additions & 46 deletions kernel/sched/core.c
@@ -689,6 +689,23 @@ static inline bool got_nohz_idle_kick(void)
#ifdef CONFIG_NO_HZ_FULL
bool sched_can_stop_tick(void)
{
        /*
         * FIFO realtime policy runs the highest priority task. Other runnable
         * tasks are of a lower priority. The scheduler tick does nothing.
         */
        if (current->policy == SCHED_FIFO)
                return true;

        /*
         * Round-robin realtime tasks time slice with other tasks at the same
         * realtime priority. Is this task the only one at this priority?
         */
        if (current->policy == SCHED_RR) {
                struct sched_rt_entity *rt_se = &current->rt;

                return rt_se->run_list.prev == rt_se->run_list.next;
        }

        /*
         * More than one running task need preemption.
         * nr_running update is assumed to be visible
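
The SCHED_RR check above leans on a property of the kernel's circular run lists: an entry whose prev and next pointers are equal is the only entry on its list (both point at the list head), i.e. the only task queued at that priority. A tiny standalone illustration of that invariant, using a hand-rolled circular list rather than the kernel's list.h:

#include <assert.h>

/* minimal circular doubly-linked list node, mirroring list_head semantics */
struct node {
        struct node *prev, *next;
};

static void list_init(struct node *head)
{
        head->prev = head->next = head;
}

static void list_add_tail(struct node *n, struct node *head)
{
        n->prev = head->prev;
        n->next = head;
        head->prev->next = n;
        head->prev = n;
}

int main(void)
{
        struct node queue, a, b;

        list_init(&queue);
        list_add_tail(&a, &queue);
        assert(a.prev == a.next);   /* sole entry: both neighbours are the head */

        list_add_tail(&b, &queue);
        assert(a.prev != a.next);   /* a second entry at the same priority breaks the equality */
        return 0;
}
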
@@ -5335,36 +5352,13 @@ static int sched_cpu_active(struct notifier_block *nfb,
static int sched_cpu_inactive(struct notifier_block *nfb,
                              unsigned long action, void *hcpu)
{
        unsigned long flags;
        long cpu = (long)hcpu;
        struct dl_bw *dl_b;

        switch (action & ~CPU_TASKS_FROZEN) {
        case CPU_DOWN_PREPARE:
                set_cpu_active(cpu, false);

                /* explicitly allow suspend */
                if (!(action & CPU_TASKS_FROZEN)) {
                        bool overflow;
                        int cpus;

                        rcu_read_lock_sched();
                        dl_b = dl_bw_of(cpu);

                        raw_spin_lock_irqsave(&dl_b->lock, flags);
                        cpus = dl_bw_cpus(cpu);
                        overflow = __dl_overflow(dl_b, cpus, 0, 0);
                        raw_spin_unlock_irqrestore(&dl_b->lock, flags);

                        rcu_read_unlock_sched();

                        if (overflow)
                                return notifier_from_errno(-EBUSY);
                }
                set_cpu_active((long)hcpu, false);
                return NOTIFY_OK;
        default:
                return NOTIFY_DONE;
        }

        return NOTIFY_DONE;
}

static int __init migration_init(void)
@@ -5445,17 +5439,6 @@ static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level,
                        break;
                }

                /*
                 * Even though we initialize ->capacity to something semi-sane,
                 * we leave capacity_orig unset. This allows us to detect if
                 * domain iteration is still funny without causing /0 traps.
                 */
                if (!group->sgc->capacity_orig) {
                        printk(KERN_CONT "\n");
                        printk(KERN_ERR "ERROR: domain->cpu_capacity not set\n");
                        break;
                }

                if (!cpumask_weight(sched_group_cpus(group))) {
                        printk(KERN_CONT "\n");
                        printk(KERN_ERR "ERROR: empty group\n");
@@ -5939,7 +5922,6 @@ build_overlap_sched_groups(struct sched_domain *sd, int cpu)
                 * die on a /0 trap.
                 */
                sg->sgc->capacity = SCHED_CAPACITY_SCALE * cpumask_weight(sg_span);
                sg->sgc->capacity_orig = sg->sgc->capacity;

                /*
                 * Make sure the first group of this domain contains the
@@ -6250,6 +6232,7 @@ sd_init(struct sched_domain_topology_level *tl, int cpu)
         */

        if (sd->flags & SD_SHARE_CPUCAPACITY) {
                sd->flags |= SD_PREFER_SIBLING;
                sd->imbalance_pct = 110;
                sd->smt_gain = 1178; /* ~15% */

@@ -7015,7 +6998,6 @@ static int cpuset_cpu_active(struct notifier_block *nfb, unsigned long action,
                 */

        case CPU_ONLINE:
        case CPU_DOWN_FAILED:
                cpuset_update_active_cpus(true);
                break;
        default:
@@ -7027,8 +7009,30 @@ static int cpuset_cpu_active(struct notifier_block *nfb, unsigned long action,
static int cpuset_cpu_inactive(struct notifier_block *nfb, unsigned long action,
                               void *hcpu)
{
        switch (action) {
        unsigned long flags;
        long cpu = (long)hcpu;
        struct dl_bw *dl_b;

        switch (action & ~CPU_TASKS_FROZEN) {
        case CPU_DOWN_PREPARE:
                /* explicitly allow suspend */
                if (!(action & CPU_TASKS_FROZEN)) {
                        bool overflow;
                        int cpus;

                        rcu_read_lock_sched();
                        dl_b = dl_bw_of(cpu);

                        raw_spin_lock_irqsave(&dl_b->lock, flags);
                        cpus = dl_bw_cpus(cpu);
                        overflow = __dl_overflow(dl_b, cpus, 0, 0);
                        raw_spin_unlock_irqrestore(&dl_b->lock, flags);

                        rcu_read_unlock_sched();

                        if (overflow)
                                return notifier_from_errno(-EBUSY);
                }
                cpuset_update_active_cpus(false);
                break;
        case CPU_DOWN_PREPARE_FROZEN:
@@ -7173,8 +7177,8 @@ void __init sched_init(void)
                rq->calc_load_active = 0;
                rq->calc_load_update = jiffies + LOAD_FREQ;
                init_cfs_rq(&rq->cfs);
                init_rt_rq(&rq->rt, rq);
                init_dl_rq(&rq->dl, rq);
                init_rt_rq(&rq->rt);
                init_dl_rq(&rq->dl);
#ifdef CONFIG_FAIR_GROUP_SCHED
                root_task_group.shares = ROOT_TASK_GROUP_LOAD;
                INIT_LIST_HEAD(&rq->leaf_cfs_rq_list);
@@ -7214,7 +7218,7 @@ void __init sched_init(void)
#ifdef CONFIG_SMP
                rq->sd = NULL;
                rq->rd = NULL;
                rq->cpu_capacity = SCHED_CAPACITY_SCALE;
                rq->cpu_capacity = rq->cpu_capacity_orig = SCHED_CAPACITY_SCALE;
                rq->post_schedule = 0;
                rq->active_balance = 0;
                rq->next_balance = jiffies;
@@ -7813,7 +7817,7 @@ static int sched_rt_global_constraints(void)
}
#endif /* CONFIG_RT_GROUP_SCHED */

static int sched_dl_global_constraints(void)
static int sched_dl_global_validate(void)
{
        u64 runtime = global_rt_runtime();
        u64 period = global_rt_period();
@@ -7914,11 +7918,11 @@ int sched_rt_handler(struct ctl_table *table, int write,
                if (ret)
                        goto undo;

                ret = sched_rt_global_constraints();
                ret = sched_dl_global_validate();
                if (ret)
                        goto undo;

                ret = sched_dl_global_constraints();
                ret = sched_rt_global_constraints();
                if (ret)
                        goto undo;

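
The DL-bandwidth check that moved into cpuset_cpu_inactive() above refuses to take a CPU down when the remaining CPUs could no longer carry the admitted SCHED_DEADLINE bandwidth, returning -EBUSY from the notifier. The toy program below sketches that admission arithmetic only; the 0.95 per-CPU bandwidth mirrors the default rt_runtime/rt_period ratio, the admitted utilization of 3.0 is an invented figure, and dl_overflows() is not the kernel's __dl_overflow(), so treat the whole thing as an illustration.

#include <stdbool.h>
#include <stdio.h>

/* toy admission test: total admitted bandwidth must fit on the remaining CPUs */
static bool dl_overflows(double per_cpu_bw, int cpus, double total_bw)
{
        return total_bw > per_cpu_bw * cpus;
}

int main(void)
{
        double per_cpu_bw = 950000.0 / 1000000.0; /* assumed rt_runtime / rt_period */
        double total_bw = 3.0;                    /* invented admitted DL utilization */

        /* 4 CPUs offer 3.8 units of bandwidth, 3 CPUs only 2.85 */
        printf("4 CPUs remaining: %s\n", dl_overflows(per_cpu_bw, 4, total_bw) ? "-EBUSY" : "ok");
        printf("3 CPUs remaining: %s\n", dl_overflows(per_cpu_bw, 3, total_bw) ? "-EBUSY" : "ok");
        return 0;
}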
