From eab172294d5e24464f332dd8e94a57a9819c81c4 Mon Sep 17 00:00:00 2001 From: Li Zefan Date: Wed, 29 Oct 2008 17:03:22 +0800 Subject: [PATCH 01/31] sched: cleanup for alloc_rt/fair_sched_group() Impact: cleanup Remove checking parent == NULL. It won't be NULL, because we dynamically create sub task_group only, and sub task_group always has its parent. (root task_group is statically defined) Also replace kmalloc_node(GFP_ZERO) with kzalloc_node(). Signed-off-by: Li Zefan Signed-off-by: Ingo Molnar --- kernel/sched.c | 26 ++++++++++++-------------- 1 file changed, 12 insertions(+), 14 deletions(-) diff --git a/kernel/sched.c b/kernel/sched.c index e8819bc6f462c1..7dd6c860773b85 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -8472,7 +8472,7 @@ static int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent) { struct cfs_rq *cfs_rq; - struct sched_entity *se, *parent_se; + struct sched_entity *se; struct rq *rq; int i; @@ -8488,18 +8488,17 @@ int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent) for_each_possible_cpu(i) { rq = cpu_rq(i); - cfs_rq = kmalloc_node(sizeof(struct cfs_rq), - GFP_KERNEL|__GFP_ZERO, cpu_to_node(i)); + cfs_rq = kzalloc_node(sizeof(struct cfs_rq), + GFP_KERNEL, cpu_to_node(i)); if (!cfs_rq) goto err; - se = kmalloc_node(sizeof(struct sched_entity), - GFP_KERNEL|__GFP_ZERO, cpu_to_node(i)); + se = kzalloc_node(sizeof(struct sched_entity), + GFP_KERNEL, cpu_to_node(i)); if (!se) goto err; - parent_se = parent ? parent->se[i] : NULL; - init_tg_cfs_entry(tg, cfs_rq, se, i, 0, parent_se); + init_tg_cfs_entry(tg, cfs_rq, se, i, 0, parent->se[i]); } return 1; @@ -8560,7 +8559,7 @@ static int alloc_rt_sched_group(struct task_group *tg, struct task_group *parent) { struct rt_rq *rt_rq; - struct sched_rt_entity *rt_se, *parent_se; + struct sched_rt_entity *rt_se; struct rq *rq; int i; @@ -8577,18 +8576,17 @@ int alloc_rt_sched_group(struct task_group *tg, struct task_group *parent) for_each_possible_cpu(i) { rq = cpu_rq(i); - rt_rq = kmalloc_node(sizeof(struct rt_rq), - GFP_KERNEL|__GFP_ZERO, cpu_to_node(i)); + rt_rq = kzalloc_node(sizeof(struct rt_rq), + GFP_KERNEL, cpu_to_node(i)); if (!rt_rq) goto err; - rt_se = kmalloc_node(sizeof(struct sched_rt_entity), - GFP_KERNEL|__GFP_ZERO, cpu_to_node(i)); + rt_se = kzalloc_node(sizeof(struct sched_rt_entity), + GFP_KERNEL, cpu_to_node(i)); if (!rt_se) goto err; - parent_se = parent ? parent->rt_se[i] : NULL; - init_tg_rt_entry(tg, rt_rq, rt_se, i, 0, parent_se); + init_tg_rt_entry(tg, rt_rq, rt_se, i, 0, parent->rt_se[i]); } return 1; From 34f3a814eef8069a24e5b3ebcf27aba9dabac2ea Mon Sep 17 00:00:00 2001 From: Li Zefan Date: Thu, 30 Oct 2008 15:23:32 +0800 Subject: [PATCH 02/31] sched: switch sched_features to seqfile Impact: cleanup So handling of sched_features read is simplified.
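For reference, the single_open()/seq_file pattern the read side is switched to looks like this in isolation (a minimal sketch with hypothetical foo_* names, not code from this patch):

#include <linux/fs.h>
#include <linux/seq_file.h>

/* illustrative only: single_open() remembers the show callback, and
 * seq_read()/seq_lseek() do the buffering that sched_feat_read() used
 * to do by hand */
static int foo_show(struct seq_file *m, void *v)
{
	seq_printf(m, "%s\n", "FEATURE_A NO_FEATURE_B");
	return 0;
}

static int foo_open(struct inode *inode, struct file *filp)
{
	return single_open(filp, foo_show, inode->i_private);
}

static const struct file_operations foo_fops = {
	.open		= foo_open,
	.read		= seq_read,
	.llseek		= seq_lseek,
	.release	= single_release,
};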
Signed-off-by: Li Zefan Acked-by: Peter Zijlstra Signed-off-by: Ingo Molnar --- kernel/sched.c | 52 ++++++++++++++++---------------------------------- 1 file changed, 16 insertions(+), 36 deletions(-) diff --git a/kernel/sched.c b/kernel/sched.c index 7dd6c860773b85..5419df9cc5c472 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -703,45 +703,18 @@ static __read_mostly char *sched_feat_names[] = { #undef SCHED_FEAT -static int sched_feat_open(struct inode *inode, struct file *filp) +static int sched_feat_show(struct seq_file *m, void *v) { - filp->private_data = inode->i_private; - return 0; -} - -static ssize_t -sched_feat_read(struct file *filp, char __user *ubuf, - size_t cnt, loff_t *ppos) -{ - char *buf; - int r = 0; - int len = 0; int i; for (i = 0; sched_feat_names[i]; i++) { - len += strlen(sched_feat_names[i]); - len += 4; + if (!(sysctl_sched_features & (1UL << i))) + seq_puts(m, "NO_"); + seq_printf(m, "%s ", sched_feat_names[i]); } + seq_puts(m, "\n"); - buf = kmalloc(len + 2, GFP_KERNEL); - if (!buf) - return -ENOMEM; - - for (i = 0; sched_feat_names[i]; i++) { - if (sysctl_sched_features & (1UL << i)) - r += sprintf(buf + r, "%s ", sched_feat_names[i]); - else - r += sprintf(buf + r, "NO_%s ", sched_feat_names[i]); - } - - r += sprintf(buf + r, "\n"); - WARN_ON(r >= len + 2); - - r = simple_read_from_buffer(ubuf, cnt, ppos, buf, r); - - kfree(buf); - - return r; + return 0; } static ssize_t @@ -786,10 +759,17 @@ sched_feat_write(struct file *filp, const char __user *ubuf, return cnt; } +static int sched_feat_open(struct inode *inode, struct file *filp) +{ + return single_open(filp, sched_feat_show, NULL); +} + static struct file_operations sched_feat_fops = { - .open = sched_feat_open, - .read = sched_feat_read, - .write = sched_feat_write, + .open = sched_feat_open, + .write = sched_feat_write, + .read = seq_read, + .llseek = seq_lseek, + .release = single_release, }; static __init int sched_init_debug(void) From 8bb8c4386d08f2cc5d871d22f220d35032213f84 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Sat, 1 Nov 2008 00:13:49 +0100 Subject: [PATCH 03/31] sched, ftrace: trace sched.c Impact: allow function tracing within sched.c Its useful to see what happens in sched.c. Signed-off-by: Peter Zijlstra Acked-by: Steven Rostedt Signed-off-by: Ingo Molnar --- kernel/Makefile | 1 - 1 file changed, 1 deletion(-) diff --git a/kernel/Makefile b/kernel/Makefile index 9a3ec66a9d841c..e1af0397214837 100644 --- a/kernel/Makefile +++ b/kernel/Makefile @@ -21,7 +21,6 @@ CFLAGS_REMOVE_mutex-debug.o = -pg CFLAGS_REMOVE_rtmutex-debug.o = -pg CFLAGS_REMOVE_cgroup-debug.o = -pg CFLAGS_REMOVE_sched_clock.o = -pg -CFLAGS_REMOVE_sched.o = -mno-spe -pg endif obj-$(CONFIG_FREEZER) += freezer.o From e113a745f693af196c8081b328bf42def086989b Mon Sep 17 00:00:00 2001 From: Dimitri Sivanich Date: Fri, 31 Oct 2008 08:03:41 -0500 Subject: [PATCH 04/31] sched/rt: small optimization to update_curr_rt() Impact: micro-optimization to SCHED_FIFO/RR scheduling A very minor improvement, but might it be better to check sched_rt_runtime(rt_rq) before taking the rt_runtime_lock? Peter Zijlstra observes: > Yes, I think its ok to do so. > > Like pointed out in the other thread, there are two races: > > - sched_rt_runtime() going to RUNTIME_INF, and that will be handled > properly by sched_rt_runtime_exceeded() > > - sched_rt_runtime() going to !RUNTIME_INF, and here we can miss an > accounting cycle, but I don't think that is something to worry too > much about. 
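The idea being applied here, checking a condition that tolerates a stale read before paying for the lock, can be pictured with a generic userspace sketch (illustrative names only, not scheduler code):

#include <pthread.h>
#include <stdatomic.h>
#include <stdint.h>

/* Illustrative analogue: the flag is read without the lock; both race
 * outcomes (one extra or one missed accounting cycle) are harmless,
 * so the common case skips the lock entirely. */
static atomic_int accounting_enabled = 1;
static pthread_mutex_t stats_lock = PTHREAD_MUTEX_INITIALIZER;
static uint64_t accounted_time;

static void charge(uint64_t delta)
{
	if (!atomic_load(&accounting_enabled))	/* cheap, possibly stale check */
		return;

	pthread_mutex_lock(&stats_lock);
	accounted_time += delta;
	pthread_mutex_unlock(&stats_lock);
}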
Signed-off-by: Dimitri Sivanich Acked-by: Peter Zijlstra Signed-off-by: Ingo Molnar -- kernel/sched_rt.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) --- kernel/sched_rt.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/kernel/sched_rt.c b/kernel/sched_rt.c index d9ba9d5f99d6af..c7963d5d0625f5 100644 --- a/kernel/sched_rt.c +++ b/kernel/sched_rt.c @@ -537,13 +537,13 @@ static void update_curr_rt(struct rq *rq) for_each_sched_rt_entity(rt_se) { rt_rq = rt_rq_of_se(rt_se); - spin_lock(&rt_rq->rt_runtime_lock); if (sched_rt_runtime(rt_rq) != RUNTIME_INF) { + spin_lock(&rt_rq->rt_runtime_lock); rt_rq->rt_time += delta_exec; if (sched_rt_runtime_exceeded(rt_rq)) resched_task(curr); + spin_unlock(&rt_rq->rt_runtime_lock); } - spin_unlock(&rt_rq->rt_runtime_lock); } } From eefd796a8e831408ce17e633d73d70430748c47a Mon Sep 17 00:00:00 2001 From: Li Zefan Date: Tue, 4 Nov 2008 16:15:37 +0800 Subject: [PATCH 05/31] sched debug: remove sd_level_to_string() Impact: cleanup Just use the newly introduced sd->name. Signed-off-by: Li Zefan Acked-by: Peter Zijlstra Signed-off-by: Ingo Molnar --- kernel/sched.c | 25 +------------------------ 1 file changed, 1 insertion(+), 24 deletions(-) diff --git a/kernel/sched.c b/kernel/sched.c index 5419df9cc5c472..7ac59bae87d2c2 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -6602,28 +6602,6 @@ early_initcall(migration_init); #ifdef CONFIG_SCHED_DEBUG -static inline const char *sd_level_to_string(enum sched_domain_level lvl) -{ - switch (lvl) { - case SD_LV_NONE: - return "NONE"; - case SD_LV_SIBLING: - return "SIBLING"; - case SD_LV_MC: - return "MC"; - case SD_LV_CPU: - return "CPU"; - case SD_LV_NODE: - return "NODE"; - case SD_LV_ALLNODES: - return "ALLNODES"; - case SD_LV_MAX: - return "MAX"; - - } - return "MAX"; -} - static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level, cpumask_t *groupmask) { @@ -6643,8 +6621,7 @@ static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level, return -1; } - printk(KERN_CONT "span %s level %s\n", - str, sd_level_to_string(sd->level)); + printk(KERN_CONT "span %s level %s\n", str, sd->name); if (!cpu_isset(cpu, sd->span)) { printk(KERN_ERR "ERROR: domain->span does not contain " From 0a0db8f5c9d4bbb9bbfcc2b6cb6bce2d0ef4d73d Mon Sep 17 00:00:00 2001 From: Li Zefan Date: Tue, 4 Nov 2008 16:17:05 +0800 Subject: [PATCH 06/31] sched debug: remove NULL checking in print_cfs/rt_rq() Impact: cleanup cfs->tg is initialized in init_tg_cfs_entry() with tg != NULL, and will never be invalidated to NULL. And the underlying cgroup of a valid task_group is always valid. Same for rt->tg. 
Signed-off-by: Li Zefan Acked-by: Peter Zijlstra Signed-off-by: Ingo Molnar --- kernel/sched_debug.c | 14 ++------------ 1 file changed, 2 insertions(+), 12 deletions(-) diff --git a/kernel/sched_debug.c b/kernel/sched_debug.c index 5ae17762ec3253..d25cefe3f0eb45 100644 --- a/kernel/sched_debug.c +++ b/kernel/sched_debug.c @@ -121,14 +121,9 @@ void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq) #if defined(CONFIG_CGROUP_SCHED) && defined(CONFIG_FAIR_GROUP_SCHED) char path[128] = ""; - struct cgroup *cgroup = NULL; struct task_group *tg = cfs_rq->tg; - if (tg) - cgroup = tg->css.cgroup; - - if (cgroup) - cgroup_path(cgroup, path, sizeof(path)); + cgroup_path(tg->css.cgroup, path, sizeof(path)); SEQ_printf(m, "\ncfs_rq[%d]:%s\n", cpu, path); #else @@ -193,14 +188,9 @@ void print_rt_rq(struct seq_file *m, int cpu, struct rt_rq *rt_rq) { #if defined(CONFIG_CGROUP_SCHED) && defined(CONFIG_RT_GROUP_SCHED) char path[128] = ""; - struct cgroup *cgroup = NULL; struct task_group *tg = rt_rq->tg; - if (tg) - cgroup = tg->css.cgroup; - - if (cgroup) - cgroup_path(cgroup, path, sizeof(path)); + cgroup_path(tg->css.cgroup, path, sizeof(path)); SEQ_printf(m, "\nrt_rq[%d]:%s\n", cpu, path); #else From a17e2260926f681a0eb983c1e3cb859ba2064bce Mon Sep 17 00:00:00 2001 From: Li Zefan Date: Tue, 4 Nov 2008 16:19:13 +0800 Subject: [PATCH 07/31] sched: remove redundant call to unregister_sched_domain_sysctl() Impact: cleanup The sysctl has been unregistered by partition_sched_domains(). Signed-off-by: Li Zefan Acked-by: Peter Zijlstra Signed-off-by: Ingo Molnar --- kernel/sched.c | 2 -- 1 file changed, 2 deletions(-) diff --git a/kernel/sched.c b/kernel/sched.c index 7ac59bae87d2c2..3cb94fad33cae5 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -7691,8 +7691,6 @@ static void detach_destroy_domains(const cpumask_t *cpu_map) cpumask_t tmpmask; int i; - unregister_sched_domain_sysctl(); - for_each_cpu_mask_nr(i, *cpu_map) cpu_attach_domain(NULL, &def_root_domain, i); synchronize_sched(); From faa2f98f856e89d1afb6e4a91707284d242e816e Mon Sep 17 00:00:00 2001 From: Li Zefan Date: Tue, 4 Nov 2008 16:20:23 +0800 Subject: [PATCH 08/31] sched: add sanity check in partition_sched_domains() Impact: cleanup, add debug check It's wrong to make dattr_new = NULL if doms_new == NULL, it introduces memory leak if dattr_new != NULL. Fortunately dattr_new is always NULL in this case. So remove the code and add a sanity check. Signed-off-by: Li Zefan Acked-by: Peter Zijlstra Signed-off-by: Ingo Molnar --- kernel/sched.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/sched.c b/kernel/sched.c index 3cb94fad33cae5..213cad5e50aa48 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -7767,7 +7767,7 @@ void partition_sched_domains(int ndoms_new, cpumask_t *doms_new, ndoms_cur = 0; doms_new = &fallback_doms; cpus_andnot(doms_new[0], cpu_online_map, cpu_isolated_map); - dattr_new = NULL; + WARN_ON_ONCE(dattr_new); } /* Build new domains */ From 90080bf4b803efbc9b9cd3a7ed1639f1e036238e Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Tue, 4 Nov 2008 14:37:25 -0800 Subject: [PATCH 09/31] sched, documentation: update scheduler header file paths Impact: update documentation Update Documentation/scheduler/ files to reflect changed header files locations. 
Signed-off-by: Randy Dunlap Signed-off-by: Ingo Molnar --- Documentation/scheduler/sched-arch.txt | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/Documentation/scheduler/sched-arch.txt b/Documentation/scheduler/sched-arch.txt index 941615a9769b91..d43dbcbd163b4d 100644 --- a/Documentation/scheduler/sched-arch.txt +++ b/Documentation/scheduler/sched-arch.txt @@ -8,7 +8,7 @@ Context switch By default, the switch_to arch function is called with the runqueue locked. This is usually not a problem unless switch_to may need to take the runqueue lock. This is usually due to a wake up operation in -the context switch. See include/asm-ia64/system.h for an example. +the context switch. See arch/ia64/include/asm/system.h for an example. To request the scheduler call switch_to with the runqueue unlocked, you must `#define __ARCH_WANT_UNLOCKED_CTXSW` in a header file @@ -23,7 +23,7 @@ disabled. Interrupts may be enabled over the call if it is likely to introduce a significant interrupt latency by adding the line `#define __ARCH_WANT_INTERRUPTS_ON_CTXSW` in the same place as for unlocked context switches. This define also implies -`__ARCH_WANT_UNLOCKED_CTXSW`. See include/asm-arm/system.h for an +`__ARCH_WANT_UNLOCKED_CTXSW`. See arch/arm/include/asm/system.h for an example. From cf7f8690e864c6fe11e77202dd847fa60f483418 Mon Sep 17 00:00:00 2001 From: Sripathi Kodi Date: Wed, 5 Nov 2008 18:57:14 +0530 Subject: [PATCH 10/31] sched, lockdep: inline double_unlock_balance() We have a test case which measures the variation in the amount of time needed to perform a fixed amount of work on the preempt_rt kernel. We started seeing deterioration in it's performance recently. The test should never take more than 10 microseconds, but we started 5-10% failure rate. Using elimination method, we traced the problem to commit 1b12bbc747560ea68bcc132c3d05699e52271da0 (lockdep: re-annotate scheduler runqueues). When LOCKDEP is disabled, this patch only adds an additional function call to double_unlock_balance(). Hence I inlined double_unlock_balance() and the problem went away. Here is a patch to make this change. 
Signed-off-by: Sripathi Kodi Acked-by: Peter Zijlstra Signed-off-by: Ingo Molnar --- kernel/sched.c | 2 +- kernel/sched_rt.c | 3 ++- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/kernel/sched.c b/kernel/sched.c index e8819bc6f462c1..ad10d0aae1d7e6 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -2825,7 +2825,7 @@ static int double_lock_balance(struct rq *this_rq, struct rq *busiest) return ret; } -static void double_unlock_balance(struct rq *this_rq, struct rq *busiest) +static inline void double_unlock_balance(struct rq *this_rq, struct rq *busiest) __releases(busiest->lock) { spin_unlock(&busiest->lock); diff --git a/kernel/sched_rt.c b/kernel/sched_rt.c index c7963d5d0625f5..2bdd4442359991 100644 --- a/kernel/sched_rt.c +++ b/kernel/sched_rt.c @@ -910,7 +910,8 @@ static void put_prev_task_rt(struct rq *rq, struct task_struct *p) #define RT_MAX_TRIES 3 static int double_lock_balance(struct rq *this_rq, struct rq *busiest); -static void double_unlock_balance(struct rq *this_rq, struct rq *busiest); +static inline void double_unlock_balance(struct rq *this_rq, + struct rq *busiest); static void deactivate_task(struct rq *rq, struct task_struct *p, int sleep); From a87d091434ed2a34d647979ab12084139ee1fe41 Mon Sep 17 00:00:00 2001 From: Ken Chen Date: Thu, 6 Nov 2008 11:10:49 -0800 Subject: [PATCH 11/31] x86, sched: enable wchan config menu item on 64-bit Enable the wchan config menu item for now on x86-64 arch? This will at least allow people to enable/disable frame pointers on scheduler functions. Signed-off-by: Ken Chen Signed-off-by: Ingo Molnar --- arch/x86/Kconfig | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index 6f20718d3156b2..488a4ecd0b590e 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -367,7 +367,7 @@ config X86_RDC321X config SCHED_NO_NO_OMIT_FRAME_POINTER def_bool y prompt "Single-depth WCHAN output" - depends on X86_32 + depends on X86 help Calculate simpler /proc//wchan values. If this option is disabled then wchan values will recurse back to the From 6d21cd62516a9697cb7ec33cc52e6b814fb65a13 Mon Sep 17 00:00:00 2001 From: Li Zefan Date: Fri, 7 Nov 2008 17:03:18 +0800 Subject: [PATCH 12/31] sched: clean up SCHED_CPUMASK_ALLOC Impact: cleanup The #if/#endif is ugly. Change SCHED_CPUMASK_ALLOC and SCHED_CPUMASK_FREE to static inline functions. 
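The general pattern (shown below with hypothetical names, not the scheduler code) is to give the disabled configuration static inline stubs, so both variants are type-checked and callers carry no #ifdef of their own:

#include <stdio.h>

#ifdef DEBUG_TRACE
static inline void trace_event(const char *name, int value)
{
	fprintf(stderr, "%s=%d\n", name, value);
}
#else
/* compiles away entirely, but the arguments are still type-checked */
static inline void trace_event(const char *name, int value) { }
#endif

int main(void)
{
	trace_event("pulled_task", 1);
	return 0;
}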
Signed-off-by: Li Zefan Acked-by: Peter Zijlstra Signed-off-by: Ingo Molnar --- kernel/sched.c | 29 ++++++++++++++++++----------- 1 file changed, 18 insertions(+), 11 deletions(-) diff --git a/kernel/sched.c b/kernel/sched.c index b24e57a10f6fb1..59db86c915f9d9 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -7279,13 +7279,21 @@ struct allmasks { }; #if NR_CPUS > 128 -#define SCHED_CPUMASK_ALLOC 1 -#define SCHED_CPUMASK_FREE(v) kfree(v) -#define SCHED_CPUMASK_DECLARE(v) struct allmasks *v +#define SCHED_CPUMASK_DECLARE(v) struct allmasks *v +static inline void sched_cpumask_alloc(struct allmasks **masks) +{ + *masks = kmalloc(sizeof(**masks), GFP_KERNEL); +} +static inline void sched_cpumask_free(struct allmasks *masks) +{ + kfree(masks); +} #else -#define SCHED_CPUMASK_ALLOC 0 -#define SCHED_CPUMASK_FREE(v) -#define SCHED_CPUMASK_DECLARE(v) struct allmasks _v, *v = &_v +#define SCHED_CPUMASK_DECLARE(v) struct allmasks _v, *v = &_v +static inline void sched_cpumask_alloc(struct allmasks **masks) +{ } +static inline void sched_cpumask_free(struct allmasks *masks) +{ } #endif #define SCHED_CPUMASK_VAR(v, a) cpumask_t *v = (cpumask_t *) \ @@ -7361,9 +7369,8 @@ static int __build_sched_domains(const cpumask_t *cpu_map, return -ENOMEM; } -#if SCHED_CPUMASK_ALLOC /* get space for all scratch cpumask variables */ - allmasks = kmalloc(sizeof(*allmasks), GFP_KERNEL); + sched_cpumask_alloc(&allmasks); if (!allmasks) { printk(KERN_WARNING "Cannot alloc cpumask array\n"); kfree(rd); @@ -7372,7 +7379,7 @@ static int __build_sched_domains(const cpumask_t *cpu_map, #endif return -ENOMEM; } -#endif + tmpmask = (cpumask_t *)allmasks; @@ -7626,13 +7633,13 @@ static int __build_sched_domains(const cpumask_t *cpu_map, cpu_attach_domain(sd, rd, i); } - SCHED_CPUMASK_FREE((void *)allmasks); + sched_cpumask_free(allmasks); return 0; #ifdef CONFIG_NUMA error: free_sched_groups(cpu_map, tmpmask); - SCHED_CPUMASK_FREE((void *)allmasks); + sched_cpumask_free(allmasks); kfree(rd); return -ENOMEM; #endif From ae1e9130bfb9ad55eb97ec3fb17a122b7a118f98 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Tue, 11 Nov 2008 09:05:16 +0100 Subject: [PATCH 13/31] sched: rename SCHED_NO_NO_OMIT_FRAME_POINTER => SCHED_OMIT_FRAME_POINTER Impact: cleanup, change .config option name We had this ugly config name for a long time for hysteric raisons. Rename it to a saner name. We still cannot get rid of it completely, until /proc//stack usage replaces WCHAN usage for good. We'll be able to do that in the v2.6.29/v2.6.30 timeframe. 
Signed-off-by: Ingo Molnar --- arch/ia64/Kconfig | 2 +- arch/m32r/Kconfig | 2 +- arch/mips/Kconfig | 2 +- arch/powerpc/Kconfig | 2 +- arch/x86/Kconfig | 2 +- include/asm-m32r/system.h | 2 +- kernel/Makefile | 2 +- 7 files changed, 7 insertions(+), 7 deletions(-) diff --git a/arch/ia64/Kconfig b/arch/ia64/Kconfig index 27eec71429b052..59d12788b60c67 100644 --- a/arch/ia64/Kconfig +++ b/arch/ia64/Kconfig @@ -99,7 +99,7 @@ config GENERIC_IOMAP bool default y -config SCHED_NO_NO_OMIT_FRAME_POINTER +config SCHED_OMIT_FRAME_POINTER bool default y diff --git a/arch/m32r/Kconfig b/arch/m32r/Kconfig index dbaed4a638153b..29047d5c259aa9 100644 --- a/arch/m32r/Kconfig +++ b/arch/m32r/Kconfig @@ -273,7 +273,7 @@ config GENERIC_CALIBRATE_DELAY bool default y -config SCHED_NO_NO_OMIT_FRAME_POINTER +config SCHED_OMIT_FRAME_POINTER bool default y diff --git a/arch/mips/Kconfig b/arch/mips/Kconfig index f4af967a6b3081..a5255e7c79e004 100644 --- a/arch/mips/Kconfig +++ b/arch/mips/Kconfig @@ -653,7 +653,7 @@ config GENERIC_CMOS_UPDATE bool default y -config SCHED_NO_NO_OMIT_FRAME_POINTER +config SCHED_OMIT_FRAME_POINTER bool default y diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig index 525c13a4de93ed..adb23ea1c1ef1b 100644 --- a/arch/powerpc/Kconfig +++ b/arch/powerpc/Kconfig @@ -141,7 +141,7 @@ config GENERIC_NVRAM bool default y if PPC32 -config SCHED_NO_NO_OMIT_FRAME_POINTER +config SCHED_OMIT_FRAME_POINTER bool default y diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index 1d5550d19b663f..74db682ec1ce15 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -364,7 +364,7 @@ config X86_RDC321X as R-8610-(G). If you don't have one of these chips, you should say N here. -config SCHED_NO_NO_OMIT_FRAME_POINTER +config SCHED_OMIT_FRAME_POINTER def_bool y prompt "Single-depth WCHAN output" depends on X86 diff --git a/include/asm-m32r/system.h b/include/asm-m32r/system.h index 70a57c8c002b59..c980f5ba8de7dc 100644 --- a/include/asm-m32r/system.h +++ b/include/asm-m32r/system.h @@ -23,7 +23,7 @@ */ #if defined(CONFIG_FRAME_POINTER) || \ - !defined(CONFIG_SCHED_NO_NO_OMIT_FRAME_POINTER) + !defined(CONFIG_SCHED_OMIT_FRAME_POINTER) #define M32R_PUSH_FP " push fp\n" #define M32R_POP_FP " pop fp\n" #else diff --git a/kernel/Makefile b/kernel/Makefile index e1af0397214837..46e67a3984959b 100644 --- a/kernel/Makefile +++ b/kernel/Makefile @@ -91,7 +91,7 @@ obj-$(CONFIG_FUNCTION_TRACER) += trace/ obj-$(CONFIG_TRACING) += trace/ obj-$(CONFIG_SMP) += sched_cpupri.o -ifneq ($(CONFIG_SCHED_NO_NO_OMIT_FRAME_POINTER),y) +ifneq ($(CONFIG_SCHED_OMIT_FRAME_POINTER),y) # According to Alan Modra , the -fno-omit-frame-pointer is # needed for x86 only. Why this used to be enabled for all architectures is beyond # me. I suspect most platforms don't need this, but until we know that for sure From ff9b48c3598732926fa09afd7f526981c32a48cc Mon Sep 17 00:00:00 2001 From: Bharata B Rao Date: Mon, 10 Nov 2008 21:34:09 +0530 Subject: [PATCH 14/31] sched: include group statistics in /proc/sched_debug Impact: extend /proc/sched_debug info Since the statistics of a group entity isn't exported directly from the kernel, it becomes difficult to obtain some of the group statistics. For example, the current method to obtain exec time of a group entity is not always accurate. One has to read the exec times of all the tasks(/proc//sched) in the group and add them. This method fails (or becomes difficult) if we want to collect stats of a group over a duration where tasks get created and terminated. 
This patch makes it easier to obtain group stats by directly including them in /proc/sched_debug. Stats like group exec time would help user programs (like LTP) to accurately measure the group fairness. An example output of group stats from /proc/sched_debug: cfs_rq[3]:/3/a/1 .exec_clock : 89.598007 .MIN_vruntime : 0.000001 .min_vruntime : 256300.970506 .max_vruntime : 0.000001 .spread : 0.000000 .spread0 : -25373.372248 .nr_running : 0 .load : 0 .yld_exp_empty : 0 .yld_act_empty : 0 .yld_both_empty : 0 .yld_count : 4474 .sched_switch : 0 .sched_count : 40507 .sched_goidle : 12686 .ttwu_count : 15114 .ttwu_local : 11950 .bkl_count : 67 .nr_spread_over : 0 .shares : 0 .se->exec_start : 113676.727170 .se->vruntime : 1592.612714 .se->sum_exec_runtime : 89.598007 .se->wait_start : 0.000000 .se->sleep_start : 0.000000 .se->block_start : 0.000000 .se->sleep_max : 0.000000 .se->block_max : 0.000000 .se->exec_max : 1.000282 .se->slice_max : 1.999750 .se->wait_max : 54.981093 .se->wait_sum : 217.610521 .se->wait_count : 50 .se->load.weight : 2 Signed-off-by: Bharata B Rao Acked-by: Srivatsa Vaddagiri Acked-by: Dhaval Giani Acked-by: Peter Zijlstra Signed-off-by: Ingo Molnar --- kernel/sched_debug.c | 37 ++++++++++++++++++++++++++++++++++++- 1 file changed, 36 insertions(+), 1 deletion(-) diff --git a/kernel/sched_debug.c b/kernel/sched_debug.c index d25cefe3f0eb45..3625a659869945 100644 --- a/kernel/sched_debug.c +++ b/kernel/sched_debug.c @@ -53,6 +53,40 @@ static unsigned long nsec_low(unsigned long long nsec) #define SPLIT_NS(x) nsec_high(x), nsec_low(x) +#ifdef CONFIG_FAIR_GROUP_SCHED +static void print_cfs_group_stats(struct seq_file *m, int cpu, + struct task_group *tg) +{ + struct sched_entity *se = tg->se[cpu]; + if (!se) + return; + +#define P(F) \ + SEQ_printf(m, " .%-30s: %lld\n", #F, (long long)F) +#define PN(F) \ + SEQ_printf(m, " .%-30s: %lld.%06ld\n", #F, SPLIT_NS((long long)F)) + + PN(se->exec_start); + PN(se->vruntime); + PN(se->sum_exec_runtime); +#ifdef CONFIG_SCHEDSTATS + PN(se->wait_start); + PN(se->sleep_start); + PN(se->block_start); + PN(se->sleep_max); + PN(se->block_max); + PN(se->exec_max); + PN(se->slice_max); + PN(se->wait_max); + PN(se->wait_sum); + P(se->wait_count); +#endif + P(se->load.weight); +#undef PN +#undef P +} +#endif + static void print_task(struct seq_file *m, struct rq *rq, struct task_struct *p) { @@ -181,6 +215,7 @@ void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq) #ifdef CONFIG_SMP SEQ_printf(m, " .%-30s: %lu\n", "shares", cfs_rq->shares); #endif + print_cfs_group_stats(m, cpu, cfs_rq->tg); #endif } @@ -261,7 +296,7 @@ static int sched_debug_show(struct seq_file *m, void *v) u64 now = ktime_to_ns(ktime_get()); int cpu; - SEQ_printf(m, "Sched Debug Version: v0.07, %s %.*s\n", + SEQ_printf(m, "Sched Debug Version: v0.08, %s %.*s\n", init_utsname()->release, (int)strcspn(init_utsname()->version, " "), init_utsname()->version); From 934352f214b3251eb0793c1209d346595a661d80 Mon Sep 17 00:00:00 2001 From: Bharata B Rao Date: Mon, 10 Nov 2008 20:41:13 +0530 Subject: [PATCH 15/31] sched: add hierarchical accounting to cpu accounting controller Impact: improve CPU time accounting of tasks under the cpu accounting controller Add hierarchical accounting to cpu accounting controller and include cpuacct documentation. Currently, while charging the task's cputime to its accounting group, the accounting group hierarchy isn't updated. This patch charges the cputime of a task to its accounting group and all its parent accounting groups. 
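A minimal sketch of the hierarchical charge (hypothetical types, not the kernel structures): walk the parent chain and add the delta at every level, so a group's usage always includes that of its children.

#include <stdint.h>

struct acct_group {
	uint64_t usage;
	struct acct_group *parent;	/* NULL for the root group */
};

/* mirrors the "for (; ca; ca = ca->parent)" walk added by the patch */
static void charge_hierarchy(struct acct_group *grp, uint64_t cputime)
{
	for (; grp; grp = grp->parent)
		grp->usage += cputime;
}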
Reported-by: Srivatsa Vaddagiri Signed-off-by: Bharata B Rao Reviewed-by: Paul Menage Acked-by: Balbir Singh Acked-by: Peter Zijlstra Signed-off-by: Ingo Molnar --- Documentation/controllers/cpuacct.txt | 32 +++++++++++++++++++++++++++ kernel/sched.c | 12 +++++++--- 2 files changed, 41 insertions(+), 3 deletions(-) create mode 100644 Documentation/controllers/cpuacct.txt diff --git a/Documentation/controllers/cpuacct.txt b/Documentation/controllers/cpuacct.txt new file mode 100644 index 00000000000000..bb775fbe43d78f --- /dev/null +++ b/Documentation/controllers/cpuacct.txt @@ -0,0 +1,32 @@ +CPU Accounting Controller +------------------------- + +The CPU accounting controller is used to group tasks using cgroups and +account the CPU usage of these groups of tasks. + +The CPU accounting controller supports multi-hierarchy groups. An accounting +group accumulates the CPU usage of all of its child groups and the tasks +directly present in its group. + +Accounting groups can be created by first mounting the cgroup filesystem. + +# mkdir /cgroups +# mount -t cgroup -ocpuacct none /cgroups + +With the above step, the initial or the parent accounting group +becomes visible at /cgroups. At bootup, this group includes all the +tasks in the system. /cgroups/tasks lists the tasks in this cgroup. +/cgroups/cpuacct.usage gives the CPU time (in nanoseconds) obtained by +this group which is essentially the CPU time obtained by all the tasks +in the system. + +New accounting groups can be created under the parent group /cgroups. + +# cd /cgroups +# mkdir g1 +# echo $$ > g1 + +The above steps create a new group g1 and move the current shell +process (bash) into it. CPU time consumed by this bash and its children +can be obtained from g1/cpuacct.usage and the same is accumulated in +/cgroups/cpuacct.usage also. diff --git a/kernel/sched.c b/kernel/sched.c index 59db86c915f9d9..ebaf432365f685 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -9196,11 +9196,12 @@ struct cgroup_subsys cpu_cgroup_subsys = { * (balbir@in.ibm.com). */ -/* track cpu usage of a group of tasks */ +/* track cpu usage of a group of tasks and its child groups */ struct cpuacct { struct cgroup_subsys_state css; /* cpuusage holds pointer to a u64-type object on every cpu */ u64 *cpuusage; + struct cpuacct *parent; }; struct cgroup_subsys cpuacct_subsys; @@ -9234,6 +9235,9 @@ static struct cgroup_subsys_state *cpuacct_create( return ERR_PTR(-ENOMEM); } + if (cgrp->parent) + ca->parent = cgroup_ca(cgrp->parent); + return &ca->css; } @@ -9313,14 +9317,16 @@ static int cpuacct_populate(struct cgroup_subsys *ss, struct cgroup *cgrp) static void cpuacct_charge(struct task_struct *tsk, u64 cputime) { struct cpuacct *ca; + int cpu; if (!cpuacct_subsys.active) return; + cpu = task_cpu(tsk); ca = task_ca(tsk); - if (ca) { - u64 *cpuusage = percpu_ptr(ca->cpuusage, task_cpu(tsk)); + for (; ca; ca = ca->parent) { + u64 *cpuusage = percpu_ptr(ca->cpuusage, cpu); *cpuusage += cputime; } } From 50ee91765e25e7967a7b69cd5cc2bcab85e2eeb8 Mon Sep 17 00:00:00 2001 From: Dhaval Giani Date: Tue, 11 Nov 2008 18:13:23 +0530 Subject: [PATCH 16/31] sched/rt: removed unneeded defintion Impact: cleanup This function no longer exists, so remove the defintion. 
Signed-off-by: Dhaval Giani Signed-off-by: Ingo Molnar --- include/linux/sched.h | 2 -- 1 file changed, 2 deletions(-) diff --git a/include/linux/sched.h b/include/linux/sched.h index b483f39a711207..c6bfb34d978ea6 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -258,8 +258,6 @@ static inline int select_nohz_load_balancer(int cpu) } #endif -extern unsigned long rt_needs_cpu(int cpu); - /* * Only dump TASK_* tasks. (0 for all tasks) */ From ec4e0e2fe018992d980910db901637c814575914 Mon Sep 17 00:00:00 2001 From: Ken Chen Date: Tue, 18 Nov 2008 22:41:57 -0800 Subject: [PATCH 17/31] sched: fix inconsistency when redistribute per-cpu tg->cfs_rq shares Impact: make load-balancing more consistent In the update_shares() path leading to tg_shares_up(), the calculation of per-cpu cfs_rq shares is rather erratic even under moderate task wake up rate. The problem is that the per-cpu tg->cfs_rq load weight used in the sd_rq_weight aggregation and actual redistribution of the cfs_rq->shares are collected at different time. Under moderate system load, we've seen quite a bit of variation on the cfs_rq->shares and ultimately wildly affects sched_entity's load weight. This patch caches the result of initial per-cpu load weight when doing the sum calculation, and then pass it down to update_group_shares_cpu() for redistributing per-cpu cfs_rq shares. This allows consistent total cfs_rq shares across all CPUs. It also simplifies the rounding and zero load weight check. Signed-off-by: Ken Chen Acked-by: Peter Zijlstra Signed-off-by: Ingo Molnar --- kernel/sched.c | 41 +++++++++++++++-------------------------- 1 file changed, 15 insertions(+), 26 deletions(-) diff --git a/kernel/sched.c b/kernel/sched.c index a4c156d9a4a56d..93bfb086e60fe6 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -1453,27 +1453,13 @@ static void update_group_shares_cpu(struct task_group *tg, int cpu, unsigned long sd_shares, unsigned long sd_rq_weight) { - int boost = 0; unsigned long shares; unsigned long rq_weight; if (!tg->se[cpu]) return; - rq_weight = tg->cfs_rq[cpu]->load.weight; - - /* - * If there are currently no tasks on the cpu pretend there is one of - * average load so that when a new task gets to run here it will not - * get delayed by group starvation. - */ - if (!rq_weight) { - boost = 1; - rq_weight = NICE_0_LOAD; - } - - if (unlikely(rq_weight > sd_rq_weight)) - rq_weight = sd_rq_weight; + rq_weight = tg->cfs_rq[cpu]->rq_weight; /* * \Sum shares * rq_weight @@ -1481,7 +1467,7 @@ update_group_shares_cpu(struct task_group *tg, int cpu, * \Sum rq_weight * */ - shares = (sd_shares * rq_weight) / (sd_rq_weight + 1); + shares = (sd_shares * rq_weight) / sd_rq_weight; shares = clamp_t(unsigned long, shares, MIN_SHARES, MAX_SHARES); if (abs(shares - tg->se[cpu]->load.weight) > @@ -1490,11 +1476,7 @@ update_group_shares_cpu(struct task_group *tg, int cpu, unsigned long flags; spin_lock_irqsave(&rq->lock, flags); - /* - * record the actual number of shares, not the boosted amount. - */ - tg->cfs_rq[cpu]->shares = boost ? 
0 : shares; - tg->cfs_rq[cpu]->rq_weight = rq_weight; + tg->cfs_rq[cpu]->shares = shares; __set_se_shares(tg->se[cpu], shares); spin_unlock_irqrestore(&rq->lock, flags); @@ -1508,13 +1490,23 @@ update_group_shares_cpu(struct task_group *tg, int cpu, */ static int tg_shares_up(struct task_group *tg, void *data) { - unsigned long rq_weight = 0; + unsigned long weight, rq_weight = 0; unsigned long shares = 0; struct sched_domain *sd = data; int i; for_each_cpu_mask(i, sd->span) { - rq_weight += tg->cfs_rq[i]->load.weight; + /* + * If there are currently no tasks on the cpu pretend there + * is one of average load so that when a new task gets to + * run here it will not get delayed by group starvation. + */ + weight = tg->cfs_rq[i]->load.weight; + if (!weight) + weight = NICE_0_LOAD; + + tg->cfs_rq[i]->rq_weight = weight; + rq_weight += weight; shares += tg->cfs_rq[i]->shares; } @@ -1524,9 +1516,6 @@ static int tg_shares_up(struct task_group *tg, void *data) if (!sd->parent || !(sd->parent->flags & SD_LOAD_BALANCE)) shares = tg->shares; - if (!rq_weight) - rq_weight = cpus_weight(sd->span) * NICE_0_LOAD; - for_each_cpu_mask(i, sd->span) update_group_shares_cpu(tg, i, shares, rq_weight); From 957ad0166e9f76a8561dafa5e14ef5bd3f5e9a3b Mon Sep 17 00:00:00 2001 From: Vegard Nossum Date: Fri, 21 Nov 2008 01:30:36 +0100 Subject: [PATCH 18/31] sched: update comment for move_task_off_dead_cpu Impact: cleanup This commit: commit f7b4cddcc5aca03e80e357360c9424dfba5056c2 Author: Oleg Nesterov Date: Tue Oct 16 23:30:56 2007 -0700 do CPU_DEAD migrating under read_lock(tasklist) instead of write_lock_irq(ta Currently move_task_off_dead_cpu() is called under write_lock_irq(tasklist). This means it can't use task_lock() which is needed to improve migrating to take task's ->cpuset into account. Change the code to call move_task_off_dead_cpu() with irqs enabled, and change migrate_live_tasks() to use read_lock(tasklist). ...forgot to update the comment in front of move_task_off_dead_cpu. Reference: http://lkml.org/lkml/2008/6/23/135 Signed-off-by: Vegard Nossum Signed-off-by: Ingo Molnar --- kernel/sched.c | 1 - 1 file changed, 1 deletion(-) diff --git a/kernel/sched.c b/kernel/sched.c index 93bfb086e60fe6..a6085d5166ddb7 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -6094,7 +6094,6 @@ static int __migrate_task_irq(struct task_struct *p, int src_cpu, int dest_cpu) /* * Figure out where task on dead CPU should go, use force if necessary. 
- * NOTE: interrupts should be disabled by the caller */ static void move_task_off_dead_cpu(int dead_cpu, struct task_struct *p) { From 70574a996fc7a70c5586eb56bd92a544eccf18b6 Mon Sep 17 00:00:00 2001 From: Alexey Dobriyan Date: Fri, 28 Nov 2008 22:08:00 +0300 Subject: [PATCH 19/31] sched: move double_unlock_balance() higher Move double_lock_balance()/double_unlock_balance() higher to fix the following with gcc-3.4.6: CC kernel/sched.o In file included from kernel/sched.c:1605: kernel/sched_rt.c: In function `find_lock_lowest_rq': kernel/sched_rt.c:914: sorry, unimplemented: inlining failed in call to 'double_unlock_balance': function body not available kernel/sched_rt.c:1077: sorry, unimplemented: called from here make[2]: *** [kernel/sched.o] Error 1 Signed-off-by: Alexey Dobriyan Signed-off-by: Ingo Molnar --- kernel/sched.c | 67 +++++++++++++++++++++++------------------------ kernel/sched_rt.c | 4 --- 2 files changed, 33 insertions(+), 38 deletions(-) diff --git a/kernel/sched.c b/kernel/sched.c index 3d1ee429219bdf..6a99703e0eb080 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -1581,6 +1581,39 @@ static inline void update_shares_locked(struct rq *rq, struct sched_domain *sd) #endif +/* + * double_lock_balance - lock the busiest runqueue, this_rq is locked already. + */ +static int double_lock_balance(struct rq *this_rq, struct rq *busiest) + __releases(this_rq->lock) + __acquires(busiest->lock) + __acquires(this_rq->lock) +{ + int ret = 0; + + if (unlikely(!irqs_disabled())) { + /* printk() doesn't work good under rq->lock */ + spin_unlock(&this_rq->lock); + BUG_ON(1); + } + if (unlikely(!spin_trylock(&busiest->lock))) { + if (busiest < this_rq) { + spin_unlock(&this_rq->lock); + spin_lock(&busiest->lock); + spin_lock_nested(&this_rq->lock, SINGLE_DEPTH_NESTING); + ret = 1; + } else + spin_lock_nested(&busiest->lock, SINGLE_DEPTH_NESTING); + } + return ret; +} + +static inline void double_unlock_balance(struct rq *this_rq, struct rq *busiest) + __releases(busiest->lock) +{ + spin_unlock(&busiest->lock); + lock_set_subclass(&this_rq->lock.dep_map, 0, _RET_IP_); +} #endif #ifdef CONFIG_FAIR_GROUP_SCHED @@ -2780,40 +2813,6 @@ static void double_rq_unlock(struct rq *rq1, struct rq *rq2) __release(rq2->lock); } -/* - * double_lock_balance - lock the busiest runqueue, this_rq is locked already. - */ -static int double_lock_balance(struct rq *this_rq, struct rq *busiest) - __releases(this_rq->lock) - __acquires(busiest->lock) - __acquires(this_rq->lock) -{ - int ret = 0; - - if (unlikely(!irqs_disabled())) { - /* printk() doesn't work good under rq->lock */ - spin_unlock(&this_rq->lock); - BUG_ON(1); - } - if (unlikely(!spin_trylock(&busiest->lock))) { - if (busiest < this_rq) { - spin_unlock(&this_rq->lock); - spin_lock(&busiest->lock); - spin_lock_nested(&this_rq->lock, SINGLE_DEPTH_NESTING); - ret = 1; - } else - spin_lock_nested(&busiest->lock, SINGLE_DEPTH_NESTING); - } - return ret; -} - -static inline void double_unlock_balance(struct rq *this_rq, struct rq *busiest) - __releases(busiest->lock) -{ - spin_unlock(&busiest->lock); - lock_set_subclass(&this_rq->lock.dep_map, 0, _RET_IP_); -} - /* * If dest_cpu is allowed for this process, migrate the task to it. 
* This is accomplished by forcing the cpu_allowed mask to only diff --git a/kernel/sched_rt.c b/kernel/sched_rt.c index 2bdd4442359991..587a16e2a8f596 100644 --- a/kernel/sched_rt.c +++ b/kernel/sched_rt.c @@ -909,10 +909,6 @@ static void put_prev_task_rt(struct rq *rq, struct task_struct *p) /* Only try algorithms three times */ #define RT_MAX_TRIES 3 -static int double_lock_balance(struct rq *this_rq, struct rq *busiest); -static inline void double_unlock_balance(struct rq *this_rq, - struct rq *busiest); - static void deactivate_task(struct rq *rq, struct task_struct *p, int sleep); static int pick_rt_task(struct rq *rq, struct task_struct *p, int cpu) From 6c415b9234a8c71f290e5d4fddc467f103f32719 Mon Sep 17 00:00:00 2001 From: Arun R Bharadwaj Date: Mon, 1 Dec 2008 20:49:05 +0530 Subject: [PATCH 20/31] sched: add uid information to sched_debug for CONFIG_USER_SCHED Impact: extend information in /proc/sched_debug This patch adds uid information in sched_debug for CONFIG_USER_SCHED Signed-off-by: Arun R Bharadwaj Acked-by: Peter Zijlstra Signed-off-by: Ingo Molnar --- include/linux/sched.h | 1 + kernel/sched.c | 10 ++++++++++ kernel/sched_debug.c | 6 +++++- kernel/user.c | 2 ++ 4 files changed, 18 insertions(+), 1 deletion(-) diff --git a/include/linux/sched.h b/include/linux/sched.h index 7a69c4d224eeda..d8733f07d80bdc 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -2218,6 +2218,7 @@ extern void normalize_rt_tasks(void); extern struct task_group init_task_group; #ifdef CONFIG_USER_SCHED extern struct task_group root_task_group; +extern void set_tg_uid(struct user_struct *user); #endif extern struct task_group *sched_create_group(struct task_group *parent); diff --git a/kernel/sched.c b/kernel/sched.c index 6a99703e0eb080..4c7388ef5be7c5 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -261,6 +261,10 @@ struct task_group { struct cgroup_subsys_state css; #endif +#ifdef CONFIG_USER_SCHED + uid_t uid; +#endif + #ifdef CONFIG_FAIR_GROUP_SCHED /* schedulable entities of this group on each cpu */ struct sched_entity **se; @@ -286,6 +290,12 @@ struct task_group { #ifdef CONFIG_USER_SCHED +/* Helper function to pass uid information to create_sched_user() */ +void set_tg_uid(struct user_struct *user) +{ + user->tg->uid = user->uid; +} + /* * Root task group. 
* Every UID task group (including init_task_group aka UID-0) will diff --git a/kernel/sched_debug.c b/kernel/sched_debug.c index baf2f17af46217..4293cfa9681d74 100644 --- a/kernel/sched_debug.c +++ b/kernel/sched_debug.c @@ -160,10 +160,14 @@ void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq) cgroup_path(tg->css.cgroup, path, sizeof(path)); SEQ_printf(m, "\ncfs_rq[%d]:%s\n", cpu, path); +#elif defined(CONFIG_USER_SCHED) && defined(CONFIG_FAIR_GROUP_SCHED) + { + uid_t uid = cfs_rq->tg->uid; + SEQ_printf(m, "\ncfs_rq[%d] for UID: %u\n", cpu, uid); + } #else SEQ_printf(m, "\ncfs_rq[%d]:\n", cpu); #endif - SEQ_printf(m, " .%-30s: %Ld.%06ld\n", "exec_clock", SPLIT_NS(cfs_rq->exec_clock)); diff --git a/kernel/user.c b/kernel/user.c index 39d6159fae430c..cec2224bc9f524 100644 --- a/kernel/user.c +++ b/kernel/user.c @@ -101,6 +101,8 @@ static int sched_create_user(struct user_struct *up) if (IS_ERR(up->tg)) rc = -ENOMEM; + set_tg_uid(up); + return rc; } From 5436499e6098759c2340f8b906ea52f993dc4efb Mon Sep 17 00:00:00 2001 From: Ken Chen Date: Sun, 7 Dec 2008 18:47:37 -0800 Subject: [PATCH 21/31] sched: fix sd_parent_degenerate on non-numa smp machine Impact: optimize the sched domains tree some more The addition of SD_SERIALIZE flag added to SD_NODE_INIT prevented top level dummy numa sched_domain to be properly degenerated on non-numa smp machine. The reason is that in sd_parent_degenerate(), it found that the child and parent does not have comon sched_domain flags due to SD_SERIALIZE. However, for non-numa smp box, the top level is a dummy with a single sched_group. Filter out SD_SERIALIZE if it is on non-numa machine to properly degenerate top level node sched_domain. this will cut back some of the sd domain walk in the load balancer code. Signed-off-by: Ken Chen Acked-by: Peter Zijlstra Signed-off-by: Ingo Molnar --- kernel/sched.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/kernel/sched.c b/kernel/sched.c index 152828239ef01f..74498c840f933e 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -6768,6 +6768,8 @@ sd_parent_degenerate(struct sched_domain *sd, struct sched_domain *parent) SD_BALANCE_EXEC | SD_SHARE_CPUPOWER | SD_SHARE_PKG_RESOURCES); + if (nr_node_ids == 1) + pflags &= ~SD_SERIALIZE; } if (~cflags & pflags) return 0; From efbe027e95dc13ac343b6130948418d7ead7ddf1 Mon Sep 17 00:00:00 2001 From: Vaidyanathan Srinivasan Date: Mon, 8 Dec 2008 20:52:49 +0530 Subject: [PATCH 22/31] sched: idle_balance() does not call load_balance_newidle() Impact: fix SD_BALANCE_NEWIDLEand broaden its use load_balance_newidle() does not get called if SD_BALANCE_NEWIDLE is set at higher level domain (3-CPU) and not in low level domain (2-MC). pulled_task is initialised to -1 and checked for non-zero which is always true if the lowest level sched_domain does not have SD_BALANCE_NEWIDLE flag set. 
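A self-contained illustration of the sentinel bug (not scheduler code; the array below stands in for the domain hierarchy): seeding the result with -1 makes the early-exit test fire even when nothing was balanced, so higher domains are never tried.

#include <stdio.h>

int main(void)
{
	int domain_has_newidle[3] = { 0, 1, 1 };	/* lowest level lacks the flag */
	int pulled_task = 0;				/* the fix: start at 0, not -1 */
	int i;

	for (i = 0; i < 3; i++) {
		if (domain_has_newidle[i])
			pulled_task = 1;	/* stand-in for load_balance_newidle() */
		if (pulled_task)
			break;			/* with -1 this broke out at i == 0 */
	}
	printf("pulled_task = %d\n", pulled_task);
	return 0;
}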
Signed-off-by: Vaidyanathan Srinivasan Signed-off-by: Ingo Molnar --- kernel/sched.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/sched.c b/kernel/sched.c index 74498c840f933e..bb9c6384d07740 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -3685,7 +3685,7 @@ load_balance_newidle(int this_cpu, struct rq *this_rq, struct sched_domain *sd, static void idle_balance(int this_cpu, struct rq *this_rq) { struct sched_domain *sd; - int pulled_task = -1; + int pulled_task = 0; unsigned long next_balance = jiffies + HZ; cpumask_t tmpmask; From ee79d1bdb6a10499e53f80b1e8d14110215178ba Mon Sep 17 00:00:00 2001 From: Heiko Carstens Date: Tue, 9 Dec 2008 18:49:50 +0100 Subject: [PATCH 23/31] sched: let arch_update_cpu_topology indicate if topology changed Change arch_update_cpu_topology so it returns 1 if the cpu topology changed and 0 if it didn't change. This will be useful for the next patch which adds a call to this function in partition_sched_domains. Signed-off-by: Heiko Carstens Signed-off-by: Ingo Molnar --- arch/s390/kernel/topology.c | 5 +++-- include/linux/topology.h | 2 +- kernel/sched.c | 8 +++++++- 3 files changed, 11 insertions(+), 4 deletions(-) diff --git a/arch/s390/kernel/topology.c b/arch/s390/kernel/topology.c index a947899dcba1d5..bf96f1b5c6ec20 100644 --- a/arch/s390/kernel/topology.c +++ b/arch/s390/kernel/topology.c @@ -212,7 +212,7 @@ static void update_cpu_core_map(void) cpu_core_map[cpu] = cpu_coregroup_map(cpu); } -void arch_update_cpu_topology(void) +int arch_update_cpu_topology(void) { struct tl_info *info = tl_info; struct sys_device *sysdev; @@ -221,7 +221,7 @@ void arch_update_cpu_topology(void) if (!machine_has_topology) { update_cpu_core_map(); topology_update_polarization_simple(); - return; + return 0; } stsi(info, 15, 1, 2); tl_to_cores(info); @@ -230,6 +230,7 @@ void arch_update_cpu_topology(void) sysdev = get_cpu_sysdev(cpu); kobject_uevent(&sysdev->kobj, KOBJ_CHANGE); } + return 1; } static void topology_work_fn(struct work_struct *work) diff --git a/include/linux/topology.h b/include/linux/topology.h index 117f1b7405cfc9..0c5b5ac36d8edc 100644 --- a/include/linux/topology.h +++ b/include/linux/topology.h @@ -49,7 +49,7 @@ for_each_online_node(node) \ if (nr_cpus_node(node)) -void arch_update_cpu_topology(void); +int arch_update_cpu_topology(void); /* Conform to ACPI 2.0 SLIT distance definitions */ #define LOCAL_DISTANCE 10 diff --git a/kernel/sched.c b/kernel/sched.c index ef212da928e895..fcfbbd9dbd6083 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -7675,8 +7675,14 @@ static struct sched_domain_attr *dattr_cur; */ static cpumask_t fallback_doms; -void __attribute__((weak)) arch_update_cpu_topology(void) +/* + * arch_update_cpu_topology lets virtualized architectures update the + * cpu core maps. It is supposed to return 1 if the topology changed + * or 0 if it stayed the same. + */ +int __attribute__((weak)) arch_update_cpu_topology(void) { + return 0; } /* From d65bd5ecb2bd166cea4952a59b7e16cc3ad6ef6c Mon Sep 17 00:00:00 2001 From: Heiko Carstens Date: Tue, 9 Dec 2008 18:49:51 +0100 Subject: [PATCH 24/31] sched: add missing arch_update_cpu_topology() call arch_reinit_sched_domains() used to call arch_update_cpu_topology() via arch_init_sched_domains(). This call got lost with e761b7725234276a802322549cee5255305a0930 ("cpu hotplug, sched: Introduce cpu_active_map and redo sched domain managment (take 2)". 
So we might end up with outdated and missing cpus in the cpu core maps (architecture used to call arch_reinit_sched_domains if cpu topology changed). This adds a call to arch_update_cpu_topology in partition_sched_domains which gets called whenever scheduling domains get updated. Which is what is supposed to happen when cpu topology changes. Signed-off-by: Heiko Carstens Signed-off-by: Ingo Molnar --- kernel/sched.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/kernel/sched.c b/kernel/sched.c index fcfbbd9dbd6083..ad7b93be56918e 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -7774,17 +7774,21 @@ void partition_sched_domains(int ndoms_new, cpumask_t *doms_new, struct sched_domain_attr *dattr_new) { int i, j, n; + int new_topology; mutex_lock(&sched_domains_mutex); /* always unregister in case we don't destroy any domains */ unregister_sched_domain_sysctl(); + /* Let architecture update cpu core mappings. */ + new_topology = arch_update_cpu_topology(); + n = doms_new ? ndoms_new : 0; /* Destroy deleted domains */ for (i = 0; i < ndoms_cur; i++) { - for (j = 0; j < n; j++) { + for (j = 0; j < n && !new_topology; j++) { if (cpus_equal(doms_cur[i], doms_new[j]) && dattrs_equal(dattr_cur, i, dattr_new, j)) goto match1; @@ -7804,7 +7808,7 @@ void partition_sched_domains(int ndoms_new, cpumask_t *doms_new, /* Build new domains */ for (i = 0; i < ndoms_new; i++) { - for (j = 0; j < ndoms_cur; j++) { + for (j = 0; j < ndoms_cur && !new_topology; j++) { if (cpus_equal(doms_new[i], doms_cur[j]) && dattrs_equal(dattr_new, i, dattr_cur, j)) goto match2; From 03e89e4574a680af15f59329b061f35d9813aff4 Mon Sep 17 00:00:00 2001 From: Mike Galbraith Date: Tue, 16 Dec 2008 08:45:30 +0100 Subject: [PATCH 25/31] sched: fix wakeup preemption clock Impact: sharpen the wakeup-granularity to always be against current scheduler time It was possible to do the preemption check against an old time stamp. 
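The ordering the patch enforces, as a bare sketch (made-up names, not the kernel code): refresh the queue clock once, right after taking the queue lock, and run all subsequent accounting and preemption checks against that single reading.

#include <stdint.h>

struct toy_rq {
	uint64_t clock;		/* ns, only advanced while the queue is locked */
};

static void toy_wakeup(struct toy_rq *rq, uint64_t now_ns)
{
	/* lock the queue ... */
	rq->clock = now_ns;	/* refresh first; before the fix this happened late */
	/* ... account the current task up to rq->clock,
	 * then do the wakeup-preemption check ... */
	/* ... unlock the queue */
}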
Signed-off-by: Mike Galbraith Signed-off-by: Peter Zijlstra Signed-off-by: Ingo Molnar --- kernel/sched.c | 2 +- kernel/sched_fair.c | 7 +++---- 2 files changed, 4 insertions(+), 5 deletions(-) diff --git a/kernel/sched.c b/kernel/sched.c index ad7b93be56918e..88215066efaec3 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -2266,6 +2266,7 @@ static int try_to_wake_up(struct task_struct *p, unsigned int state, int sync) smp_wmb(); rq = task_rq_lock(p, &flags); + update_rq_clock(rq); old_state = p->state; if (!(old_state & state)) goto out; @@ -2323,7 +2324,6 @@ static int try_to_wake_up(struct task_struct *p, unsigned int state, int sync) schedstat_inc(p, se.nr_wakeups_local); else schedstat_inc(p, se.nr_wakeups_remote); - update_rq_clock(rq); activate_task(rq, p, 1); success = 1; diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c index 98345e45b059ac..928cd74cff0db3 100644 --- a/kernel/sched_fair.c +++ b/kernel/sched_fair.c @@ -1345,12 +1345,11 @@ static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int sync) { struct task_struct *curr = rq->curr; struct sched_entity *se = &curr->se, *pse = &p->se; + struct cfs_rq *cfs_rq = task_cfs_rq(curr); - if (unlikely(rt_prio(p->prio))) { - struct cfs_rq *cfs_rq = task_cfs_rq(curr); + update_curr(cfs_rq); - update_rq_clock(rq); - update_curr(cfs_rq); + if (unlikely(rt_prio(p->prio))) { resched_task(curr); return; } From 34f28ecd0f4bdc733c681294d02d9fab5880591b Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Tue, 16 Dec 2008 08:45:31 +0100 Subject: [PATCH 26/31] sched: optimize update_curr() Impact: micro-optimization Skip the hard work when there is none. Signed-off-by: Peter Zijlstra Acked-by: Mike Galbraith Signed-off-by: Ingo Molnar --- kernel/sched_fair.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c index 928cd74cff0db3..5ad4440f0fc447 100644 --- a/kernel/sched_fair.c +++ b/kernel/sched_fair.c @@ -492,6 +492,8 @@ static void update_curr(struct cfs_rq *cfs_rq) * overflow on 32 bits): */ delta_exec = (unsigned long)(now - curr->exec_start); + if (!delta_exec) + return; __update_curr(cfs_rq, curr, delta_exec); curr->exec_start = now; From 720f54988e17b33f3f477010b3a68ee872d20d5a Mon Sep 17 00:00:00 2001 From: Ken Chen Date: Mon, 15 Dec 2008 22:02:01 -0800 Subject: [PATCH 27/31] sched, cpuacct: refactoring cpuusage_read / cpuusage_write Impact: micro-optimize the code on 64-bit architectures In the thread regarding to 'export percpu cpuacct cgroup stats' http://lkml.org/lkml/2008/12/7/13 akpm pointed out that current cpuacct code is inefficient. This patch refactoring the following: * make cpu_rq locking only on 32-bit * change iterator to each_present_cpu instead of each_possible_cpu to make it hotplug friendly. It's a bit of code churn, but I was rewarded with 160 byte code size saving on x86-64 arch and zero code size change on i386. 
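The 32-bit-only locking can be pictured with a generic userspace sketch (illustrative, not the cpuacct code): a 64-bit load or store is not single-copy atomic on a 32-bit target, so the lock is only needed there.

#include <stdint.h>
#include <pthread.h>

static pthread_mutex_t counter_lock = PTHREAD_MUTEX_INITIALIZER;
static uint64_t counter;

static uint64_t counter_read(void)
{
	uint64_t val;
#if UINTPTR_MAX == UINT32_MAX	/* 32-bit build: guard against torn reads */
	pthread_mutex_lock(&counter_lock);
	val = counter;
	pthread_mutex_unlock(&counter_lock);
#else				/* 64-bit build: a plain load is sufficient here */
	val = counter;
#endif
	return val;
}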
Signed-off-by: Ken Chen Cc: Paul Menage Cc: Li Zefan Signed-off-by: Ingo Molnar --- kernel/sched.c | 56 +++++++++++++++++++++++++++++++++++--------------- 1 file changed, 39 insertions(+), 17 deletions(-) diff --git a/kernel/sched.c b/kernel/sched.c index 88215066efaec3..41b7e2d524d680 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -9275,6 +9275,41 @@ cpuacct_destroy(struct cgroup_subsys *ss, struct cgroup *cgrp) kfree(ca); } +static u64 cpuacct_cpuusage_read(struct cpuacct *ca, int cpu) +{ + u64 *cpuusage = percpu_ptr(ca->cpuusage, cpu); + u64 data; + +#ifndef CONFIG_64BIT + /* + * Take rq->lock to make 64-bit read safe on 32-bit platforms. + */ + spin_lock_irq(&cpu_rq(cpu)->lock); + data = *cpuusage; + spin_unlock_irq(&cpu_rq(cpu)->lock); +#else + data = *cpuusage; +#endif + + return data; +} + +static void cpuacct_cpuusage_write(struct cpuacct *ca, int cpu, u64 val) +{ + u64 *cpuusage = percpu_ptr(ca->cpuusage, cpu); + +#ifndef CONFIG_64BIT + /* + * Take rq->lock to make 64-bit write safe on 32-bit platforms. + */ + spin_lock_irq(&cpu_rq(cpu)->lock); + *cpuusage = val; + spin_unlock_irq(&cpu_rq(cpu)->lock); +#else + *cpuusage = val; +#endif +} + /* return total cpu usage (in nanoseconds) of a group */ static u64 cpuusage_read(struct cgroup *cgrp, struct cftype *cft) { @@ -9282,17 +9317,8 @@ static u64 cpuusage_read(struct cgroup *cgrp, struct cftype *cft) u64 totalcpuusage = 0; int i; - for_each_possible_cpu(i) { - u64 *cpuusage = percpu_ptr(ca->cpuusage, i); - - /* - * Take rq->lock to make 64-bit addition safe on 32-bit - * platforms. - */ - spin_lock_irq(&cpu_rq(i)->lock); - totalcpuusage += *cpuusage; - spin_unlock_irq(&cpu_rq(i)->lock); - } + for_each_present_cpu(i) + totalcpuusage += cpuacct_cpuusage_read(ca, i); return totalcpuusage; } @@ -9309,13 +9335,9 @@ static int cpuusage_write(struct cgroup *cgrp, struct cftype *cftype, goto out; } - for_each_possible_cpu(i) { - u64 *cpuusage = percpu_ptr(ca->cpuusage, i); + for_each_present_cpu(i) + cpuacct_cpuusage_write(ca, i, 0); - spin_lock_irq(&cpu_rq(i)->lock); - *cpuusage = 0; - spin_unlock_irq(&cpu_rq(i)->lock); - } out: return err; } From e9515c3c9feecd74174c2998add0db51e02abb8d Mon Sep 17 00:00:00 2001 From: Ken Chen Date: Mon, 15 Dec 2008 22:04:15 -0800 Subject: [PATCH 28/31] sched, cpuacct: export percpu cpuacct cgroup stats This patch export per-cpu CPU cycle usage for a given cpuacct cgroup. There is a need for a user space monitor daemon to track group CPU usage on per-cpu base. It is also useful for monitoring CFS load balancer behavior by tracking per CPU group usage. 
Signed-off-by: Ken Chen Reviewed-by: Li Zefan Reviewed-by: Andrew Morton Signed-off-by: Ingo Molnar --- kernel/sched.c | 20 ++++++++++++++++++++ 1 file changed, 20 insertions(+) diff --git a/kernel/sched.c b/kernel/sched.c index 41b7e2d524d680..f53e2b8ef52153 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -9342,12 +9342,32 @@ static int cpuusage_write(struct cgroup *cgrp, struct cftype *cftype, return err; } +static int cpuacct_percpu_seq_read(struct cgroup *cgroup, struct cftype *cft, + struct seq_file *m) +{ + struct cpuacct *ca = cgroup_ca(cgroup); + u64 percpu; + int i; + + for_each_present_cpu(i) { + percpu = cpuacct_cpuusage_read(ca, i); + seq_printf(m, "%llu ", (unsigned long long) percpu); + } + seq_printf(m, "\n"); + return 0; +} + static struct cftype files[] = { { .name = "usage", .read_u64 = cpuusage_read, .write_u64 = cpuusage_write, }, + { + .name = "usage_percpu", + .read_seq_string = cpuacct_percpu_seq_read, + }, + }; static int cpuacct_populate(struct cgroup_subsys *ss, struct cgroup *cgrp) From 80f40ee4a07530cc3acbc239a9299ec47025825b Mon Sep 17 00:00:00 2001 From: Bharata B Rao Date: Mon, 15 Dec 2008 11:56:48 +0530 Subject: [PATCH 29/31] sched: use RCU variant of list traversal in for_each_leaf_rt_rq() Impact: fix potential of rare crash for_each_leaf_rt_rq() walks an RCU protected list (rq->leaf_rt_rq_list), but doesn't use list_for_each_entry_rcu(). Fix this. Signed-off-by: Bharata B Rao Cc: Peter Zijlstra Signed-off-by: Ingo Molnar --- kernel/sched_rt.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/sched_rt.c b/kernel/sched_rt.c index d9ba9d5f99d6af..7bdf84c85ccd82 100644 --- a/kernel/sched_rt.c +++ b/kernel/sched_rt.c @@ -77,7 +77,7 @@ static inline u64 sched_rt_period(struct rt_rq *rt_rq) } #define for_each_leaf_rt_rq(rt_rq, rq) \ - list_for_each_entry(rt_rq, &rq->leaf_rt_rq_list, leaf_rt_rq_list) + list_for_each_entry_rcu(rt_rq, &rq->leaf_rt_rq_list, leaf_rt_rq_list) static inline struct rq *rq_of_rt_rq(struct rt_rq *rt_rq) { From 9c2c48020ec0dd6ecd27e5a1298f73b40d85a595 Mon Sep 17 00:00:00 2001 From: Ken Chen Date: Tue, 16 Dec 2008 23:41:22 -0800 Subject: [PATCH 30/31] schedstat: consolidate per-task cpu runtime stats Impact: simplify code When we turn on CONFIG_SCHEDSTATS, per-task cpu runtime is accumulated twice. Once in task->se.sum_exec_runtime and once in sched_info.cpu_time. These two stats are exactly the same. Given that task->se.sum_exec_runtime is always accumulated by the core scheduler, sched_info can reuse that data instead of duplicate the accounting. 
Signed-off-by: Ken Chen Acked-by: Peter Zijlstra Signed-off-by: Ingo Molnar --- fs/proc/base.c | 2 +- include/linux/sched.h | 3 +-- kernel/delayacct.c | 2 +- kernel/sched.c | 2 ++ kernel/sched_stats.h | 5 ++--- 5 files changed, 7 insertions(+), 7 deletions(-) diff --git a/fs/proc/base.c b/fs/proc/base.c index d4677603c88920..4d745bac768c2b 100644 --- a/fs/proc/base.c +++ b/fs/proc/base.c @@ -347,7 +347,7 @@ static int proc_pid_wchan(struct task_struct *task, char *buffer) static int proc_pid_schedstat(struct task_struct *task, char *buffer) { return sprintf(buffer, "%llu %llu %lu\n", - task->sched_info.cpu_time, + task->se.sum_exec_runtime, task->sched_info.run_delay, task->sched_info.pcount); } diff --git a/include/linux/sched.h b/include/linux/sched.h index 8cccd6dc5d6672..2d1e840ddd35ae 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -670,8 +670,7 @@ struct reclaim_state; struct sched_info { /* cumulative counters */ unsigned long pcount; /* # of times run on this cpu */ - unsigned long long cpu_time, /* time spent on the cpu */ - run_delay; /* time spent waiting on a runqueue */ + unsigned long long run_delay; /* time spent waiting on a runqueue */ /* timestamps */ unsigned long long last_arrival,/* when we last ran on a cpu */ diff --git a/kernel/delayacct.c b/kernel/delayacct.c index b3179dad71be87..abb6e17505e289 100644 --- a/kernel/delayacct.c +++ b/kernel/delayacct.c @@ -127,7 +127,7 @@ int __delayacct_add_tsk(struct taskstats *d, struct task_struct *tsk) */ t1 = tsk->sched_info.pcount; t2 = tsk->sched_info.run_delay; - t3 = tsk->sched_info.cpu_time; + t3 = tsk->se.sum_exec_runtime; d->cpu_count += t1; diff --git a/kernel/sched.c b/kernel/sched.c index f53e2b8ef52153..fd835fc320b895 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -596,6 +596,8 @@ struct rq { #ifdef CONFIG_SCHEDSTATS /* latency stats */ struct sched_info rq_sched_info; + unsigned long long rq_cpu_time; + /* could above be rq->cfs_rq.exec_clock + rq->rt_rq.rt_runtime ? */ /* sys_sched_yield() stats */ unsigned int yld_exp_empty; diff --git a/kernel/sched_stats.h b/kernel/sched_stats.h index 7dbf72a2b02cf1..3b01098164c896 100644 --- a/kernel/sched_stats.h +++ b/kernel/sched_stats.h @@ -31,7 +31,7 @@ static int show_schedstat(struct seq_file *seq, void *v) rq->yld_act_empty, rq->yld_exp_empty, rq->yld_count, rq->sched_switch, rq->sched_count, rq->sched_goidle, rq->ttwu_count, rq->ttwu_local, - rq->rq_sched_info.cpu_time, + rq->rq_cpu_time, rq->rq_sched_info.run_delay, rq->rq_sched_info.pcount); seq_printf(seq, "\n"); @@ -123,7 +123,7 @@ static inline void rq_sched_info_depart(struct rq *rq, unsigned long long delta) { if (rq) - rq->rq_sched_info.cpu_time += delta; + rq->rq_cpu_time += delta; } static inline void @@ -236,7 +236,6 @@ static inline void sched_info_depart(struct task_struct *t) unsigned long long delta = task_rq(t)->clock - t->sched_info.last_arrival; - t->sched_info.cpu_time += delta; rq_sched_info_depart(task_rq(t), delta); if (t->state == TASK_RUNNING) From 826e08b0157c0ce8a80dfe3c0a6c5a1540dd0b1d Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Mon, 22 Dec 2008 07:37:41 +0100 Subject: [PATCH 31/31] sched: fix warning in fs/proc/base.c Stephen Rothwell reported this new (harmless) build warning on platforms that define u64 to long: fs/proc/base.c: In function 'proc_pid_schedstat': fs/proc/base.c:352: warning: format '%llu' expects type 'long long unsigned int', but argument 3 has type 'u64' asm-generic/int-l64.h platforms strike again: that file should be eliminated. 
Fix it by casting the parameters to long long. Reported-by: Stephen Rothwell Signed-off-by: Ingo Molnar --- fs/proc/base.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/fs/proc/base.c b/fs/proc/base.c index 4d745bac768c2b..a48200c71127f3 100644 --- a/fs/proc/base.c +++ b/fs/proc/base.c @@ -347,8 +347,8 @@ static int proc_pid_wchan(struct task_struct *task, char *buffer) static int proc_pid_schedstat(struct task_struct *task, char *buffer) { return sprintf(buffer, "%llu %llu %lu\n", - task->se.sum_exec_runtime, - task->sched_info.run_delay, + (unsigned long long)task->se.sum_exec_runtime, + (unsigned long long)task->sched_info.run_delay, task->sched_info.pcount); } #endif
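A minimal userspace illustration of the fix (assuming a typedef of u64 to uint64_t, which is unsigned long on LP64 targets): the explicit cast makes %llu correct on every architecture.

#include <stdio.h>
#include <stdint.h>

typedef uint64_t u64;	/* 'unsigned long' on LP64, 'unsigned long long' on 32-bit */

int main(void)
{
	u64 runtime = 123456789ULL;

	printf("%llu\n", (unsigned long long)runtime);	/* cast keeps the format portable */
	return 0;
}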