Merge branch 'sched-core-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip

Pull scheduler changes from Ingo Molnar:
 "The main changes are:

   - irqtime accounting cleanups and enhancements. (Frederic Weisbecker)

   - schedstat debugging enhancements, make it more broadly runtime
     available. (Josh Poimboeuf)

   - More work on asymmetric topology/capacity scheduling. (Morten
     Rasmussen)

   - sched/wait fixes and cleanups. (Oleg Nesterov)

   - PELT (per entity load tracking) improvements. (Peter Zijlstra)

   - Rewrite and enhance select_idle_siblings(). (Peter Zijlstra)

   - sched/numa enhancements/fixes (Rik van Riel)

   - sched/cputime scalability improvements (Stanislaw Gruszka)

   - Load calculation arithmetics fixes. (Dietmar Eggemann)

   - sched/deadline enhancements (Tommaso Cucinotta)

   - Fix utilization accounting when switching to the SCHED_NORMAL
     policy. (Vincent Guittot)

   - ... plus misc cleanups and enhancements"

* 'sched-core-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip: (64 commits)
  sched/irqtime: Consolidate irqtime flushing code
  sched/irqtime: Consolidate accounting synchronization with u64_stats API
  u64_stats: Introduce IRQs disabled helpers
  sched/irqtime: Remove needless IRQs disablement on kcpustat update
  sched/irqtime: No need for preempt-safe accessors
  sched/fair: Fix min_vruntime tracking
  sched/debug: Add SCHED_WARN_ON()
  sched/core: Fix set_user_nice()
  sched/fair: Introduce set_curr_task() helper
  sched/core, ia64: Rename set_curr_task()
  sched/core: Fix incorrect utilization accounting when switching to fair class
  sched/core: Optimize SCHED_SMT
  sched/core: Rewrite and improve select_idle_siblings()
  sched/core: Replace sd_busy/nr_busy_cpus with sched_domain_shared
  sched/core: Introduce 'struct sched_domain_shared'
  sched/core: Restructure destroy_sched_domain()
  sched/core: Remove unused @cpu argument from destroy_sched_domain*()
  sched/wait: Introduce init_wait_entry()
  sched/wait: Avoid abort_exclusive_wait() in __wait_on_bit_lock()
  sched/wait: Avoid abort_exclusive_wait() in ___wait_event()
  ...
torvalds committed Oct 3, 2016
2 parents e606d81 + 447976e commit af79ad2
Showing 24 changed files with 1,198 additions and 746 deletions.
18 changes: 18 additions & 0 deletions Documentation/scheduler/sched-deadline.txt
@@ -16,6 +16,7 @@ CONTENTS
4.1 System-wide settings
4.2 Task interface
4.3 Default behavior
4.4 Behavior of sched_yield()
5. Tasks CPU affinity
5.1 SCHED_DEADLINE and cpusets HOWTO
6. Future plans
@@ -426,6 +427,23 @@ CONTENTS
Finally, notice that in order not to jeopardize the admission control a
-deadline task cannot fork.


4.4 Behavior of sched_yield()
-----------------------------

When a SCHED_DEADLINE task calls sched_yield(), it gives up its
remaining runtime and is immediately throttled until the next
period, when its runtime will be replenished (a special flag
dl_yielded is set and used to correctly handle throttling and runtime
replenishment after a call to sched_yield()).

This behavior of sched_yield() allows the task to wake up exactly at
the beginning of the next period. Also, this may be useful in the
future with bandwidth reclaiming mechanisms, where sched_yield() will
make the leftover runtime available for reclamation by other
SCHED_DEADLINE tasks.
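
As a rough illustration of the behavior described above (an editor's sketch, not part of this patch or of sched-deadline.txt), a periodic SCHED_DEADLINE task can finish its per-period work early and call sched_yield() to hand back the leftover runtime until the next replenishment. The 10 ms / 100 ms reservation, the do_periodic_work() helper and the locally defined struct sched_attr are assumptions made for the example; sched_setattr() is invoked directly through syscall(2) on the assumption that no libc wrapper is available, which requires headers that define SYS_sched_setattr.

#define _GNU_SOURCE
#include <stdio.h>
#include <stdint.h>
#include <unistd.h>
#include <sched.h>
#include <sys/syscall.h>

/* Local definition of the sched_setattr() argument block (illustrative). */
struct sched_attr {
	uint32_t size;
	uint32_t sched_policy;
	uint64_t sched_flags;
	int32_t  sched_nice;
	uint32_t sched_priority;
	/* SCHED_DEADLINE parameters, in nanoseconds */
	uint64_t sched_runtime;
	uint64_t sched_deadline;
	uint64_t sched_period;
};

#ifndef SCHED_DEADLINE
#define SCHED_DEADLINE	6
#endif

static void do_periodic_work(void)
{
	/* Stand-in for the real per-period job; assumed shorter than the runtime. */
}

int main(void)
{
	struct sched_attr attr = {
		.size           = sizeof(attr),
		.sched_policy   = SCHED_DEADLINE,
		.sched_runtime  =  10 * 1000 * 1000,	/*  10 ms every */
		.sched_deadline = 100 * 1000 * 1000,	/* 100 ms, deadline == period */
		.sched_period   = 100 * 1000 * 1000,
	};

	if (syscall(SYS_sched_setattr, 0, &attr, 0)) {
		perror("sched_setattr");
		return 1;
	}

	for (;;) {
		do_periodic_work();
		/* Done early: give up the leftover runtime and stay throttled
		 * until the runtime is replenished at the next period. */
		sched_yield();
	}
}

With these parameters the task is reserved 10 ms of CPU time every 100 ms; yielding after finishing early simply makes the unused part of that budget unavailable to the task until the next period begins.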


5. Tasks CPU affinity
=====================

10 changes: 5 additions & 5 deletions arch/ia64/kernel/mca.c
@@ -986,7 +986,7 @@ ia64_mca_modify_original_stack(struct pt_regs *regs,
int cpu = smp_processor_id();

previous_current = curr_task(cpu);
set_curr_task(cpu, current);
ia64_set_curr_task(cpu, current);
if ((p = strchr(current->comm, ' ')))
*p = '\0';

@@ -1360,14 +1360,14 @@ ia64_mca_handler(struct pt_regs *regs, struct switch_stack *sw,
cpumask_clear_cpu(i, &mca_cpu); /* wake next cpu */
while (monarch_cpu != -1)
cpu_relax(); /* spin until last cpu leaves */
set_curr_task(cpu, previous_current);
ia64_set_curr_task(cpu, previous_current);
ia64_mc_info.imi_rendez_checkin[cpu]
= IA64_MCA_RENDEZ_CHECKIN_NOTDONE;
return;
}
}
}
set_curr_task(cpu, previous_current);
ia64_set_curr_task(cpu, previous_current);
ia64_mc_info.imi_rendez_checkin[cpu] = IA64_MCA_RENDEZ_CHECKIN_NOTDONE;
monarch_cpu = -1; /* This frees the slaves and previous monarchs */
}
@@ -1729,7 +1729,7 @@ ia64_init_handler(struct pt_regs *regs, struct switch_stack *sw,
NOTIFY_INIT(DIE_INIT_SLAVE_LEAVE, regs, (long)&nd, 1);

mprintk("Slave on cpu %d returning to normal service.\n", cpu);
set_curr_task(cpu, previous_current);
ia64_set_curr_task(cpu, previous_current);
ia64_mc_info.imi_rendez_checkin[cpu] = IA64_MCA_RENDEZ_CHECKIN_NOTDONE;
atomic_dec(&slaves);
return;
@@ -1756,7 +1756,7 @@ ia64_init_handler(struct pt_regs *regs, struct switch_stack *sw,

mprintk("\nINIT dump complete. Monarch on cpu %d returning to normal service.\n", cpu);
atomic_dec(&monarchs);
set_curr_task(cpu, previous_current);
ia64_set_curr_task(cpu, previous_current);
monarch_cpu = -1;
return;
}
46 changes: 30 additions & 16 deletions arch/x86/kernel/smpboot.c
@@ -471,7 +471,7 @@ static bool match_die(struct cpuinfo_x86 *c, struct cpuinfo_x86 *o)
return false;
}

static struct sched_domain_topology_level numa_inside_package_topology[] = {
static struct sched_domain_topology_level x86_numa_in_package_topology[] = {
#ifdef CONFIG_SCHED_SMT
{ cpu_smt_mask, cpu_smt_flags, SD_INIT_NAME(SMT) },
#endif
@@ -480,22 +480,23 @@ static struct sched_domain_topology_level numa_inside_package_topology[] = {
#endif
{ NULL, },
};

static struct sched_domain_topology_level x86_topology[] = {
#ifdef CONFIG_SCHED_SMT
{ cpu_smt_mask, cpu_smt_flags, SD_INIT_NAME(SMT) },
#endif
#ifdef CONFIG_SCHED_MC
{ cpu_coregroup_mask, cpu_core_flags, SD_INIT_NAME(MC) },
#endif
{ cpu_cpu_mask, SD_INIT_NAME(DIE) },
{ NULL, },
};

/*
* set_sched_topology() sets the topology internal to a CPU. The
* NUMA topologies are layered on top of it to build the full
* system topology.
*
* If NUMA nodes are observed to occur within a CPU package, this
* function should be called. It forces the sched domain code to
* only use the SMT level for the CPU portion of the topology.
* This essentially falls back to relying on NUMA information
* from the SRAT table to describe the entire system topology
* (except for hyperthreads).
* Set if a package/die has multiple NUMA nodes inside.
* AMD Magny-Cours and Intel Cluster-on-Die have this.
*/
static void primarily_use_numa_for_topology(void)
{
set_sched_topology(numa_inside_package_topology);
}
static bool x86_has_numa_in_package;

void set_cpu_sibling_map(int cpu)
{
@@ -558,7 +559,7 @@ void set_cpu_sibling_map(int cpu)
c->booted_cores = cpu_data(i).booted_cores;
}
if (match_die(c, o) && !topology_same_node(c, o))
primarily_use_numa_for_topology();
x86_has_numa_in_package = true;
}

threads = cpumask_weight(topology_sibling_cpumask(cpu));
@@ -1304,6 +1305,16 @@ void __init native_smp_prepare_cpus(unsigned int max_cpus)
zalloc_cpumask_var(&per_cpu(cpu_core_map, i), GFP_KERNEL);
zalloc_cpumask_var(&per_cpu(cpu_llc_shared_map, i), GFP_KERNEL);
}

/*
* Set 'default' x86 topology; this matches default_topology() in that
* it has NUMA nodes as a topology level. See also
* native_smp_cpus_done().
*
* Must be done before set_cpu_sibling_map() is run.
*/
set_sched_topology(x86_topology);

set_cpu_sibling_map(0);

switch (smp_sanity_check(max_cpus)) {
@@ -1370,6 +1381,9 @@ void __init native_smp_cpus_done(unsigned int max_cpus)
{
pr_debug("Boot done\n");

if (x86_has_numa_in_package)
set_sched_topology(x86_numa_in_package_topology);

nmi_selftest();
impress_friends();
setup_ioapic_dest();
9 changes: 3 additions & 6 deletions include/linux/kernel.h
@@ -259,17 +259,14 @@ static inline void might_fault(void) { }
extern struct atomic_notifier_head panic_notifier_list;
extern long (*panic_blink)(int state);
__printf(1, 2)
void panic(const char *fmt, ...)
__noreturn __cold;
void panic(const char *fmt, ...) __noreturn __cold;
void nmi_panic(struct pt_regs *regs, const char *msg);
extern void oops_enter(void);
extern void oops_exit(void);
void print_oops_end_marker(void);
extern int oops_may_print(void);
void do_exit(long error_code)
__noreturn;
void complete_and_exit(struct completion *, long)
__noreturn;
void do_exit(long error_code) __noreturn;
void complete_and_exit(struct completion *, long) __noreturn;

/* Internal, do not use. */
int __must_check _kstrtoul(const char *s, unsigned int base, unsigned long *res);
30 changes: 28 additions & 2 deletions include/linux/sched.h
@@ -448,6 +448,8 @@ static inline void io_schedule(void)
io_schedule_timeout(MAX_SCHEDULE_TIMEOUT);
}

void __noreturn do_task_dead(void);

struct nsproxy;
struct user_namespace;

@@ -1022,7 +1024,8 @@ extern void wake_up_q(struct wake_q_head *head);
#define SD_BALANCE_FORK 0x0008 /* Balance on fork, clone */
#define SD_BALANCE_WAKE 0x0010 /* Balance on wakeup */
#define SD_WAKE_AFFINE 0x0020 /* Wake task to waking CPU */
#define SD_SHARE_CPUCAPACITY 0x0080 /* Domain members share cpu power */
#define SD_ASYM_CPUCAPACITY 0x0040 /* Groups have different max cpu capacities */
#define SD_SHARE_CPUCAPACITY 0x0080 /* Domain members share cpu capacity */
#define SD_SHARE_POWERDOMAIN 0x0100 /* Domain members share power domain */
#define SD_SHARE_PKG_RESOURCES 0x0200 /* Domain members share cpu pkg resources */
#define SD_SERIALIZE 0x0400 /* Only a single load balancing instance */
@@ -1064,6 +1067,12 @@ extern int sched_domain_level_max;

struct sched_group;

struct sched_domain_shared {
atomic_t ref;
atomic_t nr_busy_cpus;
int has_idle_cores;
};

struct sched_domain {
/* These fields must be setup */
struct sched_domain *parent; /* top domain must be null terminated */
@@ -1094,6 +1103,8 @@ struct sched_domain {
u64 max_newidle_lb_cost;
unsigned long next_decay_max_lb_cost;

u64 avg_scan_cost; /* select_idle_sibling */

#ifdef CONFIG_SCHEDSTATS
/* load_balance() stats */
unsigned int lb_count[CPU_MAX_IDLE_TYPES];
@@ -1132,6 +1143,7 @@ struct sched_domain {
void *private; /* used during construction */
struct rcu_head rcu; /* used during destruction */
};
struct sched_domain_shared *shared;

unsigned int span_weight;
/*
@@ -1165,6 +1177,7 @@ typedef int (*sched_domain_flags_f)(void);

struct sd_data {
struct sched_domain **__percpu sd;
struct sched_domain_shared **__percpu sds;
struct sched_group **__percpu sg;
struct sched_group_capacity **__percpu sgc;
};
@@ -2568,7 +2581,7 @@ static inline bool is_idle_task(const struct task_struct *p)
return p->pid == 0;
}
extern struct task_struct *curr_task(int cpu);
extern void set_curr_task(int cpu, struct task_struct *p);
extern void ia64_set_curr_task(int cpu, struct task_struct *p);

void yield(void);

@@ -3206,7 +3219,11 @@ static inline int signal_pending_state(long state, struct task_struct *p)
* cond_resched_lock() will drop the spinlock before scheduling,
* cond_resched_softirq() will enable bhs before scheduling.
*/
#ifndef CONFIG_PREEMPT
extern int _cond_resched(void);
#else
static inline int _cond_resched(void) { return 0; }
#endif

#define cond_resched() ({ \
___might_sleep(__FILE__, __LINE__, 0); \
@@ -3236,6 +3253,15 @@ static inline void cond_resched_rcu(void)
#endif
}

static inline unsigned long get_preempt_disable_ip(struct task_struct *p)
{
#ifdef CONFIG_DEBUG_PREEMPT
return p->preempt_disable_ip;
#else
return 0;
#endif
}

/*
* Does a critical section need to be broken due to another
* task waiting?: (technically does not depend on CONFIG_PREEMPT,
45 changes: 24 additions & 21 deletions include/linux/u64_stats_sync.h
@@ -103,31 +103,42 @@ static inline void u64_stats_update_end_raw(struct u64_stats_sync *syncp)
#endif
}

static inline unsigned int u64_stats_fetch_begin(const struct u64_stats_sync *syncp)
static inline unsigned int __u64_stats_fetch_begin(const struct u64_stats_sync *syncp)
{
#if BITS_PER_LONG==32 && defined(CONFIG_SMP)
return read_seqcount_begin(&syncp->seq);
#else
#if BITS_PER_LONG==32
preempt_disable();
#endif
return 0;
#endif
}

static inline bool u64_stats_fetch_retry(const struct u64_stats_sync *syncp,
static inline unsigned int u64_stats_fetch_begin(const struct u64_stats_sync *syncp)
{
#if BITS_PER_LONG==32 && !defined(CONFIG_SMP)
preempt_disable();
#endif
return __u64_stats_fetch_begin(syncp);
}

static inline bool __u64_stats_fetch_retry(const struct u64_stats_sync *syncp,
unsigned int start)
{
#if BITS_PER_LONG==32 && defined(CONFIG_SMP)
return read_seqcount_retry(&syncp->seq, start);
#else
#if BITS_PER_LONG==32
preempt_enable();
#endif
return false;
#endif
}

static inline bool u64_stats_fetch_retry(const struct u64_stats_sync *syncp,
unsigned int start)
{
#if BITS_PER_LONG==32 && !defined(CONFIG_SMP)
preempt_enable();
#endif
return __u64_stats_fetch_retry(syncp, start);
}

/*
* In case irq handlers can update u64 counters, readers can use the following helpers
* - SMP 32bit arches use seqcount protection, irq safe.
@@ -136,27 +147,19 @@ static inline bool u64_stats_fetch_retry(const struct u64_stats_sync *syncp,
*/
static inline unsigned int u64_stats_fetch_begin_irq(const struct u64_stats_sync *syncp)
{
#if BITS_PER_LONG==32 && defined(CONFIG_SMP)
return read_seqcount_begin(&syncp->seq);
#else
#if BITS_PER_LONG==32
#if BITS_PER_LONG==32 && !defined(CONFIG_SMP)
local_irq_disable();
#endif
return 0;
#endif
return __u64_stats_fetch_begin(syncp);
}

static inline bool u64_stats_fetch_retry_irq(const struct u64_stats_sync *syncp,
unsigned int start)
unsigned int start)
{
#if BITS_PER_LONG==32 && defined(CONFIG_SMP)
return read_seqcount_retry(&syncp->seq, start);
#else
#if BITS_PER_LONG==32
#if BITS_PER_LONG==32 && !defined(CONFIG_SMP)
local_irq_enable();
#endif
return false;
#endif
return __u64_stats_fetch_retry(syncp, start);
}

#endif /* _LINUX_U64_STATS_SYNC_H */
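
To put the refactoring above in context, the pattern the u64_stats API serves looks roughly like the sketch below (an editor's illustration, not part of the patch; struct my_stats, my_stats_add() and my_stats_read() are made-up names). A writer wraps updates of 64-bit counters in u64_stats_update_begin()/u64_stats_update_end(); a reader snapshots the counters inside a fetch_begin/fetch_retry loop, which maps onto a seqcount on 32-bit SMP and compiles away on 64-bit.

#include <linux/types.h>
#include <linux/u64_stats_sync.h>

struct my_stats {
	u64			packets;
	u64			bytes;
	/* Must be initialized with u64_stats_init() before first use. */
	struct u64_stats_sync	syncp;
};

/* Writer side, e.g. a driver's per-CPU transmit path. */
static void my_stats_add(struct my_stats *s, unsigned int len)
{
	u64_stats_update_begin(&s->syncp);
	s->packets++;
	s->bytes += len;
	u64_stats_update_end(&s->syncp);
}

/* Reader side: retry until no writer raced with the snapshot. */
static void my_stats_read(struct my_stats *s, u64 *packets, u64 *bytes)
{
	unsigned int start;

	do {
		start = u64_stats_fetch_begin(&s->syncp);
		*packets = s->packets;
		*bytes   = s->bytes;
	} while (u64_stats_fetch_retry(&s->syncp, start));
}

The consolidation in this merge keeps that calling convention intact while moving the seqcount handling into __u64_stats_fetch_begin()/__u64_stats_fetch_retry(), so the plain, _irq and (per the changelog) IRQs-disabled variants differ only in how they protect the UP 32-bit case.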