Merge branch 'sched-core-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip

Pull scheduler updates from Ingo Molnar:
 "The main scheduler changes in this cycle were:

   - NUMA balancing improvements (Mel Gorman)

   - Further load tracking improvements (Patrick Bellasi)

   - Various NOHZ balancing cleanups and optimizations (Peter Zijlstra)

   - Improve blocked load handling, in particular we can now reduce and
     eventually stop periodic load updates on 'very idle' CPUs. (Vincent
     Guittot)

   - On isolated CPUs offload the final 1Hz scheduler tick as well, plus
     related cleanups and reorganization. (Frederic Weisbecker)

   - Core scheduler code cleanups (Ingo Molnar)"

* 'sched-core-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip: (45 commits)
  sched/core: Update preempt_notifier_key to modern API
  sched/cpufreq: Rate limits for SCHED_DEADLINE
  sched/fair: Update util_est only on util_avg updates
  sched/cpufreq/schedutil: Use util_est for OPP selection
  sched/fair: Use util_est in LB and WU paths
  sched/fair: Add util_est on top of PELT
  sched/core: Remove TASK_ALL
  sched/completions: Use bool in try_wait_for_completion()
  sched/fair: Update blocked load when newly idle
  sched/fair: Move idle_balance()
  sched/nohz: Merge CONFIG_NO_HZ_COMMON blocks
  sched/fair: Move rebalance_domains()
  sched/nohz: Optimize nohz_idle_balance()
  sched/fair: Reduce the periodic update duration
  sched/nohz: Stop NOHZ stats when decayed
  sched/cpufreq: Provide migration hint
  sched/nohz: Clean up nohz enter/exit
  sched/fair: Update blocked load from NEWIDLE
  sched/fair: Add NOHZ stats balancing
  sched/fair: Restructure nohz_balance_kick()
  ...
torvalds committed Apr 2, 2018
2 parents 86bbbeb + b720342 commit 46e0d28
Showing 41 changed files with 2,114 additions and 1,561 deletions.
11 changes: 11 additions & 0 deletions Documentation/admin-guide/kernel-parameters.txt
@@ -1766,6 +1766,17 @@

nohz
Disable the tick when a single task runs.

A residual 1Hz tick is offloaded to workqueues, which you
need to affine to housekeeping through the global
workqueue's affinity configured via the
/sys/devices/virtual/workqueue/cpumask sysfs file, or
by using the 'domain' flag described below.

NOTE: by default the global workqueue runs on all CPUs,
so to protect individual CPUs the 'cpumask' file has to
be configured manually after bootup.
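
For illustration only (not part of this patch): a minimal user-space C sketch of affining the global workqueue to housekeeping CPUs by writing the sysfs file mentioned above. The mask value 0x3 (CPUs 0-1) is just an example assumption.

	#include <stdio.h>

	/* Illustrative only: restrict unbound workqueues to CPUs 0 and 1 so
	 * that isolated CPUs do not service the offloaded 1Hz tick work. */
	int main(void)
	{
		FILE *f = fopen("/sys/devices/virtual/workqueue/cpumask", "w");

		if (!f) {
			perror("fopen");
			return 1;
		}
		fputs("3\n", f);	/* hex CPU mask: CPUs 0 and 1 */
		fclose(f);
		return 0;
	}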

domain
Isolate from the general SMP balancing and scheduling
algorithms. Note that performing domain isolation this way
30 changes: 29 additions & 1 deletion include/linux/sched.h
@@ -93,7 +93,6 @@ struct task_group;

/* Convenience macros for the sake of wake_up(): */
#define TASK_NORMAL (TASK_INTERRUPTIBLE | TASK_UNINTERRUPTIBLE)
#define TASK_ALL (TASK_NORMAL | __TASK_STOPPED | __TASK_TRACED)

/* get_task_state(): */
#define TASK_REPORT (TASK_RUNNING | TASK_INTERRUPTIBLE | \
@@ -275,6 +274,34 @@ struct load_weight {
u32 inv_weight;
};

/**
* struct util_est - Estimation utilization of FAIR tasks
* @enqueued: instantaneous estimated utilization of a task/cpu
* @ewma: the Exponential Weighted Moving Average (EWMA)
* utilization of a task
*
* Support data structure to track an Exponential Weighted Moving Average
* (EWMA) of a FAIR task's utilization. New samples are added to the moving
* average each time a task completes an activation. Sample's weight is chosen
* so that the EWMA will be relatively insensitive to transient changes to the
* task's workload.
*
* The enqueued attribute has a slightly different meaning for tasks and cpus:
* - task: the task's util_avg at last task dequeue time
* - cfs_rq: the sum of util_est.enqueued for each RUNNABLE task on that CPU
* Thus, the util_est.enqueued of a task represents the contribution on the
* estimated utilization of the CPU where that task is currently enqueued.
*
* Only for tasks we track a moving average of the past instantaneous
* estimated utilization. This allows to absorb sporadic drops in utilization
* of an otherwise almost periodic task.
*/
struct util_est {
unsigned int enqueued;
unsigned int ewma;
#define UTIL_EST_WEIGHT_SHIFT 2
};
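
For illustration only (not part of this patch): a minimal sketch of the EWMA update this structure supports. With UTIL_EST_WEIGHT_SHIFT == 2, a new sample is weighted by 1/4 and the history by 3/4; the helper name is hypothetical.

	/* Hypothetical helper: fold one utilization sample into the EWMA. */
	static inline unsigned int util_est_fold_sample(unsigned int ewma,
							unsigned int sample)
	{
		unsigned int w = 1 << UTIL_EST_WEIGHT_SHIFT;

		/* new_ewma = ((w - 1) * old_ewma + sample) / w */
		return (ewma * (w - 1) + sample) / w;
	}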

/*
* The load_avg/util_avg accumulates an infinite geometric series
* (see __update_load_avg() in kernel/sched/fair.c).
@@ -336,6 +363,7 @@ struct sched_avg {
unsigned long load_avg;
unsigned long runnable_load_avg;
unsigned long util_avg;
struct util_est util_est;
};

struct sched_statistics {
5 changes: 2 additions & 3 deletions include/linux/sched/cpufreq.h
@@ -8,9 +8,8 @@
* Interface between cpufreq drivers and the scheduler:
*/

#define SCHED_CPUFREQ_RT (1U << 0)
#define SCHED_CPUFREQ_DL (1U << 1)
#define SCHED_CPUFREQ_IOWAIT (1U << 2)
#define SCHED_CPUFREQ_IOWAIT (1U << 0)
#define SCHED_CPUFREQ_MIGRATION (1U << 1)

#ifdef CONFIG_CPU_FREQ
struct update_util_data {
6 changes: 0 additions & 6 deletions include/linux/sched/deadline.h
@@ -1,8 +1,4 @@
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _LINUX_SCHED_DEADLINE_H
#define _LINUX_SCHED_DEADLINE_H

#include <linux/sched.h>

/*
* SCHED_DEADLINE tasks has negative priorities, reflecting
@@ -28,5 +24,3 @@ static inline bool dl_time_before(u64 a, u64 b)
{
return (s64)(a - b) < 0;
}

#endif /* _LINUX_SCHED_DEADLINE_H */
1 change: 1 addition & 0 deletions include/linux/sched/isolation.h
@@ -12,6 +12,7 @@ enum hk_flags {
HK_FLAG_SCHED = (1 << 3),
HK_FLAG_TICK = (1 << 4),
HK_FLAG_DOMAIN = (1 << 5),
HK_FLAG_WQ = (1 << 6),
};

#ifdef CONFIG_CPU_ISOLATION
Expand Down
6 changes: 0 additions & 6 deletions include/linux/sched/nohz.h
@@ -16,11 +16,9 @@ static inline void cpu_load_update_nohz_stop(void) { }

#if defined(CONFIG_SMP) && defined(CONFIG_NO_HZ_COMMON)
extern void nohz_balance_enter_idle(int cpu);
extern void set_cpu_sd_state_idle(void);
extern int get_nohz_timer_target(void);
#else
static inline void nohz_balance_enter_idle(int cpu) { }
static inline void set_cpu_sd_state_idle(void) { }
#endif

#ifdef CONFIG_NO_HZ_COMMON
@@ -37,8 +35,4 @@ extern void wake_up_nohz_cpu(int cpu);
static inline void wake_up_nohz_cpu(int cpu) { }
#endif

#ifdef CONFIG_NO_HZ_FULL
extern u64 scheduler_tick_max_deferment(void);
#endif

#endif /* _LINUX_SCHED_NOHZ_H */
4 changes: 3 additions & 1 deletion include/linux/tick.h
@@ -113,7 +113,8 @@ enum tick_dep_bits {

#ifdef CONFIG_NO_HZ_COMMON
extern bool tick_nohz_enabled;
extern int tick_nohz_tick_stopped(void);
extern bool tick_nohz_tick_stopped(void);
extern bool tick_nohz_tick_stopped_cpu(int cpu);
extern void tick_nohz_idle_enter(void);
extern void tick_nohz_idle_exit(void);
extern void tick_nohz_irq_exit(void);
@@ -125,6 +126,7 @@ extern u64 get_cpu_iowait_time_us(int cpu, u64 *last_update_time);
#else /* !CONFIG_NO_HZ_COMMON */
#define tick_nohz_enabled (0)
static inline int tick_nohz_tick_stopped(void) { return 0; }
static inline int tick_nohz_tick_stopped_cpu(int cpu) { return 0; }
static inline void tick_nohz_idle_enter(void) { }
static inline void tick_nohz_idle_exit(void) { }

5 changes: 3 additions & 2 deletions kernel/sched/Makefile
@@ -17,8 +17,9 @@ CFLAGS_core.o := $(PROFILING) -fno-omit-frame-pointer
endif

obj-y += core.o loadavg.o clock.o cputime.o
obj-y += idle_task.o fair.o rt.o deadline.o
obj-y += wait.o wait_bit.o swait.o completion.o idle.o
obj-y += idle.o fair.o rt.o deadline.o
obj-y += wait.o wait_bit.o swait.o completion.o

obj-$(CONFIG_SMP) += cpupri.o cpudeadline.o topology.o stop_task.o
obj-$(CONFIG_SCHED_AUTOGROUP) += autogroup.o
obj-$(CONFIG_SCHEDSTATS) += stats.o
21 changes: 9 additions & 12 deletions kernel/sched/autogroup.c
@@ -1,10 +1,7 @@
// SPDX-License-Identifier: GPL-2.0
#include <linux/proc_fs.h>
#include <linux/seq_file.h>
#include <linux/utsname.h>
#include <linux/security.h>
#include <linux/export.h>

/*
* Auto-group scheduling implementation:
*/
#include "sched.h"

unsigned int __read_mostly sysctl_sched_autogroup_enabled = 1;
@@ -168,18 +165,19 @@ autogroup_move_group(struct task_struct *p, struct autogroup *ag)
autogroup_kref_put(prev);
}

/* Allocates GFP_KERNEL, cannot be called under any spinlock */
/* Allocates GFP_KERNEL, cannot be called under any spinlock: */
void sched_autogroup_create_attach(struct task_struct *p)
{
struct autogroup *ag = autogroup_create();

autogroup_move_group(p, ag);
/* drop extra reference added by autogroup_create() */

/* Drop extra reference added by autogroup_create(): */
autogroup_kref_put(ag);
}
EXPORT_SYMBOL(sched_autogroup_create_attach);

/* Cannot be called under siglock. Currently has no users */
/* Cannot be called under siglock. Currently has no users: */
void sched_autogroup_detach(struct task_struct *p)
{
autogroup_move_group(p, &autogroup_default);
@@ -202,7 +200,6 @@ static int __init setup_autogroup(char *str)

return 1;
}

__setup("noautogroup", setup_autogroup);

#ifdef CONFIG_PROC_FS
@@ -224,7 +221,7 @@ int proc_sched_autogroup_set_nice(struct task_struct *p, int nice)
if (nice < 0 && !can_nice(current, nice))
return -EPERM;

/* this is a heavy operation taking global locks.. */
/* This is a heavy operation, taking global locks.. */
if (!capable(CAP_SYS_ADMIN) && time_before(jiffies, next))
return -EAGAIN;

@@ -267,4 +264,4 @@ int autogroup_path(struct task_group *tg, char *buf, int buflen)

return snprintf(buf, buflen, "%s-%ld", "/autogroup", tg->autogroup->id);
}
#endif /* CONFIG_SCHED_DEBUG */
#endif
12 changes: 3 additions & 9 deletions kernel/sched/autogroup.h
@@ -1,15 +1,11 @@
/* SPDX-License-Identifier: GPL-2.0 */
#ifdef CONFIG_SCHED_AUTOGROUP

#include <linux/kref.h>
#include <linux/rwsem.h>
#include <linux/sched/autogroup.h>

struct autogroup {
/*
* reference doesn't mean how many thread attach to this
* autogroup now. It just stands for the number of task
* could use this autogroup.
* Reference doesn't mean how many threads attach to this
* autogroup now. It just stands for the number of tasks
* which could use this autogroup.
*/
struct kref kref;
struct task_group *tg;
@@ -56,11 +52,9 @@ autogroup_task_group(struct task_struct *p, struct task_group *tg)
return tg;
}

#ifdef CONFIG_SCHED_DEBUG
static inline int autogroup_path(struct task_group *tg, char *buf, int buflen)
{
return 0;
}
#endif

#endif /* CONFIG_SCHED_AUTOGROUP */
36 changes: 12 additions & 24 deletions kernel/sched/clock.c
@@ -1,5 +1,5 @@
/*
* sched_clock for unstable cpu clocks
* sched_clock() for unstable CPU clocks
*
* Copyright (C) 2008 Red Hat, Inc., Peter Zijlstra
*
@@ -11,7 +11,7 @@
* Guillaume Chazarain <[email protected]>
*
*
* What:
* What this file implements:
*
* cpu_clock(i) provides a fast (execution time) high resolution
* clock with bounded drift between CPUs. The value of cpu_clock(i)
@@ -26,11 +26,11 @@
* at 0 on boot (but people really shouldn't rely on that).
*
* cpu_clock(i) -- can be used from any context, including NMI.
* local_clock() -- is cpu_clock() on the current cpu.
* local_clock() -- is cpu_clock() on the current CPU.
*
* sched_clock_cpu(i)
*
* How:
* How it is implemented:
*
* The implementation either uses sched_clock() when
* !CONFIG_HAVE_UNSTABLE_SCHED_CLOCK, which means in that case the
@@ -52,19 +52,7 @@
* that is otherwise invisible (TSC gets stopped).
*
*/
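
For illustration only (not part of this patch): a minimal kernel-style sketch using the interface described in the comment above. local_clock() is cpu_clock() on the current CPU and returns nanoseconds; do_work() is a hypothetical stand-in for the code being timed.

	/* Hypothetical usage: time a short section with local_clock(). */
	static void time_one_section(void)
	{
		u64 t0 = local_clock();

		do_work();		/* hypothetical workload */

		pr_info("section took %llu ns\n",
			(unsigned long long)(local_clock() - t0));
	}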
#include <linux/spinlock.h>
#include <linux/hardirq.h>
#include <linux/export.h>
#include <linux/percpu.h>
#include <linux/ktime.h>
#include <linux/sched.h>
#include <linux/nmi.h>
#include <linux/sched/clock.h>
#include <linux/static_key.h>
#include <linux/workqueue.h>
#include <linux/compiler.h>
#include <linux/tick.h>
#include <linux/init.h>
#include "sched.h"

/*
* Scheduler clock - returns current time in nanosec units.
@@ -302,21 +290,21 @@ static u64 sched_clock_remote(struct sched_clock_data *scd)
* cmpxchg64 below only protects one readout.
*
* We must reread via sched_clock_local() in the retry case on
* 32bit as an NMI could use sched_clock_local() via the
* 32-bit kernels as an NMI could use sched_clock_local() via the
* tracer and hit between the readout of
* the low32bit and the high 32bit portion.
* the low 32-bit and the high 32-bit portion.
*/
this_clock = sched_clock_local(my_scd);
/*
* We must enforce atomic readout on 32bit, otherwise the
* update on the remote cpu can hit inbetween the readout of
* the low32bit and the high 32bit portion.
* We must enforce atomic readout on 32-bit, otherwise the
* update on the remote CPU can hit inbetween the readout of
* the low 32-bit and the high 32-bit portion.
*/
remote_clock = cmpxchg64(&scd->clock, 0, 0);
#else
/*
* On 64bit the read of [my]scd->clock is atomic versus the
* update, so we can avoid the above 32bit dance.
* On 64-bit kernels the read of [my]scd->clock is atomic versus the
* update, so we can avoid the above 32-bit dance.
*/
sched_clock_local(my_scd);
again:
11 changes: 4 additions & 7 deletions kernel/sched/completion.c
@@ -11,10 +11,7 @@
* typically be used for exclusion which gives rise to priority inversion.
* Waiting for completion is a typically sync point, but not an exclusion point.
*/

#include <linux/sched/signal.h>
#include <linux/sched/debug.h>
#include <linux/completion.h>
#include "sched.h"

/**
* complete: - signals a single thread waiting on this completion
@@ -283,7 +280,7 @@ EXPORT_SYMBOL(wait_for_completion_killable_timeout);
bool try_wait_for_completion(struct completion *x)
{
unsigned long flags;
int ret = 1;
bool ret = true;

/*
* Since x->done will need to be locked only
@@ -292,11 +289,11 @@ bool try_wait_for_completion(struct completion *x)
* return early in the blocking case.
*/
if (!READ_ONCE(x->done))
return 0;
return false;

spin_lock_irqsave(&x->wait.lock, flags);
if (!x->done)
ret = 0;
ret = false;
else if (x->done != UINT_MAX)
x->done--;
spin_unlock_irqrestore(&x->wait.lock, flags);