From f6d5b33125c4fa63c16f7f54c533338c9695d82c Mon Sep 17 00:00:00 2001
From: John Stultz <john.stultz@linaro.org>
Date: Tue, 29 Mar 2011 18:00:27 -0700
Subject: [PATCH 01/13] RTC: Fix early irqs caused by calling rtc_set_alarm too
 early

When we register an rtc device at boot, we read the alarm value
in hardware and set the rtc device's aie_timer to that value.

The initial method was to simply call rtc_set_alarm() with the value
read from hardware. However, this may cause problems, as rtc_set_alarm()
may enable interrupts and the RTC alarm might fire, which can cause an
invalid pointer dereference since the RTC registration is not yet
complete.

This patch solves the issue by initializing the rtc_device.aie_timer
by hand via rtc_initialize_alarm(). This avoids any calls to the RTC
hardware which might enable interrupts too early.

CC: Thomas Gleixner <tglx@linutronix.de>
CC: Alessandro Zummo <a.zummo@towertech.it>
Reported-by: Konrad Rzeszutek Wilk <konrad.wilk@oracle.com>
Tested-by: Konrad Rzeszutek Wilk <konrad.wilk@oracle.com>
Signed-off-by: John Stultz <john.stultz@linaro.org>
---
 drivers/rtc/class.c     |  2 +-
 drivers/rtc/interface.c | 26 ++++++++++++++++++++++++++
 include/linux/rtc.h     |  2 ++
 3 files changed, 29 insertions(+), 1 deletion(-)

diff --git a/drivers/rtc/class.c b/drivers/rtc/class.c
index 09b4437b3e616a..39013867cbd6cf 100644
--- a/drivers/rtc/class.c
+++ b/drivers/rtc/class.c
@@ -171,7 +171,7 @@ struct rtc_device *rtc_device_register(const char *name, struct device *dev,
 	err = __rtc_read_alarm(rtc, &alrm);
 
 	if (!err && !rtc_valid_tm(&alrm.time))
-		rtc_set_alarm(rtc, &alrm);
+		rtc_initialize_alarm(rtc, &alrm);
 
 	strlcpy(rtc->name, name, RTC_DEVICE_NAME_SIZE);
 	dev_set_name(&rtc->dev, "rtc%d", id);
diff --git a/drivers/rtc/interface.c b/drivers/rtc/interface.c
index 8ec6b069a7f580..b2fea80dfb65ea 100644
--- a/drivers/rtc/interface.c
+++ b/drivers/rtc/interface.c
@@ -375,6 +375,32 @@ int rtc_set_alarm(struct rtc_device *rtc, struct rtc_wkalrm *alarm)
 }
 EXPORT_SYMBOL_GPL(rtc_set_alarm);
 
+/* Called once per device from rtc_device_register */
+int rtc_initialize_alarm(struct rtc_device *rtc, struct rtc_wkalrm *alarm)
+{
+	int err;
+
+	err = rtc_valid_tm(&alarm->time);
+	if (err != 0)
+		return err;
+
+	err = mutex_lock_interruptible(&rtc->ops_lock);
+	if (err)
+		return err;
+
+	rtc->aie_timer.node.expires = rtc_tm_to_ktime(alarm->time);
+	rtc->aie_timer.period = ktime_set(0, 0);
+	if (alarm->enabled) {
+		rtc->aie_timer.enabled = 1;
+		timerqueue_add(&rtc->timerqueue, &rtc->aie_timer.node);
+	}
+	mutex_unlock(&rtc->ops_lock);
+	return err;
+}
+EXPORT_SYMBOL_GPL(rtc_initialize_alarm);
+
+
+
 int rtc_alarm_irq_enable(struct rtc_device *rtc, unsigned int enabled)
 {
 	int err = mutex_lock_interruptible(&rtc->ops_lock);
diff --git a/include/linux/rtc.h b/include/linux/rtc.h
index 2ca7e8a78060f2..877ece45426f6c 100644
--- a/include/linux/rtc.h
+++ b/include/linux/rtc.h
@@ -228,6 +228,8 @@ extern int rtc_read_alarm(struct rtc_device *rtc,
 			struct rtc_wkalrm *alrm);
 extern int rtc_set_alarm(struct rtc_device *rtc,
 				struct rtc_wkalrm *alrm);
+extern int rtc_initialize_alarm(struct rtc_device *rtc,
+				struct rtc_wkalrm *alrm);
 extern void rtc_update_irq(struct rtc_device *rtc,
 			unsigned long num, unsigned long events);
 

From a54aba87bb8e90f9e39bcfe151717b86abbbdd79 Mon Sep 17 00:00:00 2001
From: Vasily Khoruzhick <anarsoul@gmail.com>
Date: Thu, 24 Mar 2011 22:09:33 +0200
Subject: [PATCH 02/13] RTC: Fix s3c compile error due to missing
 s3c_rtc_setpie

s3c_rtc_setpie was removed, which resulted in a compile error:

drivers/rtc/rtc-s3c.c: In function 's3c_rtc_release':
drivers/rtc/rtc-s3c.c:339:2: error: implicit declaration of function
's3c_rtc_setpie'

Fix it by removing the s3c_rtc_setpie() calls.

[jstultz: An identical fix was also sent in by Jiri Pinkava
 <jiri.pinkava@vscht.cz>]

CC: Thomas Gleixner <tglx@linutronix.de>
CC: Alessandro Zummo <a.zummo@towertech.it>
Signed-off-by: Vasily Khoruzhick <anarsoul@gmail.com>
Signed-off-by: John Stultz <john.stultz@linaro.org>
---
 drivers/rtc/rtc-s3c.c | 2 --
 1 file changed, 2 deletions(-)

diff --git a/drivers/rtc/rtc-s3c.c b/drivers/rtc/rtc-s3c.c
index 714964913e5eb3..b3466c491cd35c 100644
--- a/drivers/rtc/rtc-s3c.c
+++ b/drivers/rtc/rtc-s3c.c
@@ -336,7 +336,6 @@ static void s3c_rtc_release(struct device *dev)
 
 	/* do not clear AIE here, it may be needed for wake */
 
-	s3c_rtc_setpie(dev, 0);
 	free_irq(s3c_rtc_alarmno, rtc_dev);
 	free_irq(s3c_rtc_tickno, rtc_dev);
 }
@@ -408,7 +407,6 @@ static int __devexit s3c_rtc_remove(struct platform_device *dev)
 	platform_set_drvdata(dev, NULL);
 	rtc_device_unregister(rtc);
 
-	s3c_rtc_setpie(&dev->dev, 0);
 	s3c_rtc_setaie(&dev->dev, 0);
 
 	clk_disable(rtc_clk);

From 8c122b96866580c99e44f3f07ac93a993d964ec3 Mon Sep 17 00:00:00 2001
From: Mike Frysinger <vapier@gentoo.org>
Date: Fri, 18 Mar 2011 04:26:24 -0400
Subject: [PATCH 03/13] RTC: add missing "return 0" in new alarm func for
 rtc-bfin.c

The new bfin_rtc_alarm_irq_enable() function is missing a "return 0" at
the end, leading to this build warning:
	drivers/rtc/rtc-bfin.c: In function 'bfin_rtc_alarm_irq_enable':
	drivers/rtc/rtc-bfin.c:253: warning: control reaches end of non-void function

CC: stable@kernel.org
CC: Thomas Gleixner <tglx@linutronix.de>
CC: Alessandro Zummo <a.zummo@towertech.it>
Signed-off-by: Mike Frysinger <vapier@gentoo.org>
Signed-off-by: John Stultz <john.stultz@linaro.org>
---
 drivers/rtc/rtc-bfin.c | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/drivers/rtc/rtc-bfin.c b/drivers/rtc/rtc-bfin.c
index ca9cff85ab8a8f..f2496440d07642 100644
--- a/drivers/rtc/rtc-bfin.c
+++ b/drivers/rtc/rtc-bfin.c
@@ -250,6 +250,8 @@ static int bfin_rtc_alarm_irq_enable(struct device *dev, unsigned int enabled)
 		bfin_rtc_int_set_alarm(rtc);
 	else
 		bfin_rtc_int_clear(~(RTC_ISTAT_ALARM | RTC_ISTAT_ALARM_DAY));
+
+	return 0;
 }
 
 static int bfin_rtc_read_time(struct device *dev, struct rtc_time *tm)

From 67c1b8c6aa354aad14aad85d36508fd73d1c6361 Mon Sep 17 00:00:00 2001
From: Feng Tang <feng.tang@intel.com>
Date: Thu, 7 Apr 2011 09:39:44 +0800
Subject: [PATCH 04/13] RTC: rtc-mrst: follow on to the change of
 rtc_device_register()

Commit f44f7f96a20 ("RTC: Initialize kernel state from RTC") makes
rtc_device_register() call rtc_read_alarm(), so the rtc-mrst driver
needs to call dev_set_drvdata() before rtc_device_register() is called.
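
For context on why the ordering matters: RTC driver callbacks typically
retrieve their driver state via dev_get_drvdata(), so if
rtc_device_register() invokes the alarm-read path before probe has
stored that pointer, the callback dereferences NULL. A minimal sketch of
such a callback (illustrative only; "example_rtc" and its field are
made-up names, not the actual rtc-mrst structures):

	#include <linux/device.h>
	#include <linux/rtc.h>

	struct example_rtc {
		int alarm_enabled;
	};

	static int example_rtc_read_alarm(struct device *dev,
					  struct rtc_wkalrm *alrm)
	{
		struct example_rtc *priv = dev_get_drvdata(dev);

		/* NULL dereference if probe has not yet called dev_set_drvdata() */
		alrm->enabled = priv->alarm_enabled;
		return 0;
	}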

Cc: Alessandro Zummo <a.zummo@towertech.it>
Cc: John Stultz <john.stultz@linaro.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Feng Tang <feng.tang@intel.com>
Signed-off-by: John Stultz <john.stultz@linaro.org>
---
 drivers/rtc/rtc-mrst.c | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/drivers/rtc/rtc-mrst.c b/drivers/rtc/rtc-mrst.c
index b86bc328463b2e..d2c397c369fc5e 100644
--- a/drivers/rtc/rtc-mrst.c
+++ b/drivers/rtc/rtc-mrst.c
@@ -342,6 +342,8 @@ vrtc_mrst_do_probe(struct device *dev, struct resource *iomem, int rtc_irq)
 
 	mrst_rtc.irq = rtc_irq;
 	mrst_rtc.iomem = iomem;
+	mrst_rtc.dev = dev;
+	dev_set_drvdata(dev, &mrst_rtc);
 
 	mrst_rtc.rtc = rtc_device_register(driver_name, dev,
 				&mrst_rtc_ops, THIS_MODULE);
@@ -350,8 +352,6 @@ vrtc_mrst_do_probe(struct device *dev, struct resource *iomem, int rtc_irq)
 		goto cleanup0;
 	}
 
-	mrst_rtc.dev = dev;
-	dev_set_drvdata(dev, &mrst_rtc);
 	rename_region(iomem, dev_name(&mrst_rtc.rtc->dev));
 
 	spin_lock_irq(&rtc_lock);
@@ -376,9 +376,10 @@ vrtc_mrst_do_probe(struct device *dev, struct resource *iomem, int rtc_irq)
 	return 0;
 
 cleanup1:
-	mrst_rtc.dev = NULL;
 	rtc_device_unregister(mrst_rtc.rtc);
 cleanup0:
+	dev_set_drvdata(dev, NULL);
+	mrst_rtc.dev = NULL;
 	release_region(iomem->start, iomem->end + 1 - iomem->start);
 	dev_err(dev, "rtc-mrst: unable to initialise\n");
 	return retval;

From 621d26567fd0c222f419e3b5ddf39e529e0fdcb3 Mon Sep 17 00:00:00 2001
From: Eric Dumazet <eric.dumazet@gmail.com>
Date: Fri, 8 Apr 2011 12:11:06 +0200
Subject: [PATCH 05/13] perf: Fix a build error with some GCC versions
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Fix this:

 util/cgroup.c: In function ‘open_cgroup’:
 util/cgroup.c:16:16: error: ‘saved_ptr’ may be used uninitialized in this function
 util/cgroup.c:16:16: note: ‘saved_ptr’ was declared here

Apparently newer GCC (4.6) can figure out that this variable is properly
initialized - but some versions of GCC (such as 4.5.2) need help.
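
For reference, the warning comes from a strtok_r() parsing loop. On the
first call (non-NULL first argument) the initial value of *saveptr is
ignored, which is why GCC 4.6 can prove the variable is never read
uninitialized while GCC 4.5.2 cannot. A minimal standalone sketch of the
pattern (not the exact perf code):

	#include <stdio.h>
	#include <string.h>

	int main(void)
	{
		char line[] = "cgroup /sys/fs/cgroup cgroup rw 0 0";
		char *token, *saved_ptr = NULL;	/* = NULL only to placate older GCC */

		for (token = strtok_r(line, " ", &saved_ptr); token;
		     token = strtok_r(NULL, " ", &saved_ptr))
			printf("token: %s\n", token);

		return 0;
	}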

Signed-off-by: Eric Dumazet <eric.dumazet@gmail.com>
Cc: Arnaldo Carvalho de Melo <acme@infradead.org>
LKML-Reference: <new-submission>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 tools/perf/util/cgroup.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tools/perf/util/cgroup.c b/tools/perf/util/cgroup.c
index 9fea75535221a2..96bee5c4600845 100644
--- a/tools/perf/util/cgroup.c
+++ b/tools/perf/util/cgroup.c
@@ -13,7 +13,7 @@ cgroupfs_find_mountpoint(char *buf, size_t maxlen)
 {
 	FILE *fp;
 	char mountpoint[MAX_PATH+1], tokens[MAX_PATH+1], type[MAX_PATH+1];
-	char *token, *saved_ptr;
+	char *token, *saved_ptr = NULL;
 	int found = 0;
 
 	fp = fopen("/proc/mounts", "r");

From e566b76ed30768140df8f0023904aed5a41244f7 Mon Sep 17 00:00:00 2001
From: Stephane Eranian <eranian@google.com>
Date: Wed, 6 Apr 2011 02:54:54 +0200
Subject: [PATCH 06/13] perf_event: Fix cgrp event scheduling bug in
 perf_enable_on_exec()

There is a bug in perf_event_enable_on_exec() when cgroup events are
active on a CPU: the cgroup events may be scheduled twice causing event
state corruptions which eventually may lead to kernel panics.

The reason is that the function needs to first schedule out the cgroup
events, just like for the per-thread events. The cgroup events are
scheduled back in automatically from the perf_event_context_sched_in()
function.

The patch also adds a WARN_ON_ONCE() in perf_cgroup_switch() to catch
any bogus state.

Signed-off-by: Stephane Eranian <eranian@google.com>
Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Link: http://lkml.kernel.org/r/20110406005454.GA1062@quad
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/perf_event.c | 12 ++++++++++++
 1 file changed, 12 insertions(+)

diff --git a/kernel/perf_event.c b/kernel/perf_event.c
index 27960f114efd73..8e81a9860a0d54 100644
--- a/kernel/perf_event.c
+++ b/kernel/perf_event.c
@@ -364,6 +364,7 @@ void perf_cgroup_switch(struct task_struct *task, int mode)
 			}
 
 			if (mode & PERF_CGROUP_SWIN) {
+				WARN_ON_ONCE(cpuctx->cgrp);
 				/* set cgrp before ctxsw in to
 				 * allow event_filter_match() to not
 				 * have to pass task around
@@ -2423,6 +2424,14 @@ static void perf_event_enable_on_exec(struct perf_event_context *ctx)
 	if (!ctx || !ctx->nr_events)
 		goto out;
 
+	/*
+	 * We must ctxsw out cgroup events to avoid conflict
+	 * when invoking perf_task_event_sched_in() later on
+	 * in this function. Otherwise we end up trying to
+	 * ctxswin cgroup events which are already scheduled
+	 * in.
+	 */
+	perf_cgroup_sched_out(current);
 	task_ctx_sched_out(ctx, EVENT_ALL);
 
 	raw_spin_lock(&ctx->lock);
@@ -2447,6 +2456,9 @@ static void perf_event_enable_on_exec(struct perf_event_context *ctx)
 
 	raw_spin_unlock(&ctx->lock);
 
+	/*
+	 * Also calls ctxswin for cgroup events, if any:
+	 */
 	perf_event_context_sched_in(ctx, ctx->task);
 out:
 	local_irq_restore(flags);

From b0432d8f162c7d5d9537b4cb749d44076b76a783 Mon Sep 17 00:00:00 2001
From: Ken Chen <kenchen@google.com>
Date: Thu, 7 Apr 2011 17:23:22 -0700
Subject: [PATCH 07/13] sched: Fix sched-domain avg_load calculation

In function find_busiest_group(), the sched-domain avg_load isn't
calculated at all if there is a group imbalance within the domain. This
will cause erroneous imbalance calculation.

The reason is that calculate_imbalance() sees sds->avg_load = 0 and
dumps the entire sds->max_load into the imbalance variable, which is
used later on to migrate the entire load from the busiest CPU to the
puller CPU.

This has two really bad effects:

1. A stampede of task migrations, which cannot break out of the bad
   state because of a positive feedback loop: large load delta ->
   heavier load migration -> larger imbalance, and the cycle goes on.

2. Severe imbalance in CPU queue depth. This causes long scheduling
   latency blips, which hurt applications with tight latency
   requirements.

The fix is to have the kernel calculate the domain avg_load in both
cases. This ensures that the imbalance calculation is always sensible
and that the target is usually halfway between the busiest and the
puller CPU.
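
To make the effect concrete (rough numbers only, ignoring
SCHED_LOAD_SCALE and cpu_power weighting): suppose the busiest CPU
carries a load of 3000 and the pulling CPU 1000, so the domain average
is 2000. Then, per the description above:

	avg_load computed:   imbalance ~ max_load - avg_load = 3000 - 2000 = 1000
	avg_load left at 0:  imbalance = max_load             = 3000

In the first case roughly half the gap moves and both CPUs end up near
the average; in the second the puller tries to take the busiest CPU's
entire load, which is what feeds the migration stampede described
above.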

Signed-off-by: Ken Chen <kenchen@google.com>
Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: <stable@kernel.org>
Link: http://lkml.kernel.org/r/20110408002322.3A0D812217F@elm.corp.google.com
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/sched_fair.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c
index 7f00772e57c90b..60f9d407c5ec09 100644
--- a/kernel/sched_fair.c
+++ b/kernel/sched_fair.c
@@ -3127,6 +3127,8 @@ find_busiest_group(struct sched_domain *sd, int this_cpu,
 	if (!sds.busiest || sds.busiest_nr_running == 0)
 		goto out_balanced;
 
+	sds.avg_load = (SCHED_LOAD_SCALE * sds.total_load) / sds.total_pwr;
+
 	/*
 	 * If the busiest group is imbalanced the below checks don't
 	 * work because they assumes all things are equal, which typically
@@ -3151,7 +3153,6 @@ find_busiest_group(struct sched_domain *sd, int this_cpu,
 	 * Don't pull any tasks if this group is already above the domain
 	 * average load.
 	 */
-	sds.avg_load = (SCHED_LOAD_SCALE * sds.total_load) / sds.total_pwr;
 	if (sds.this_load >= sds.avg_load)
 		goto out_balanced;
 

From b30aef17f71cf9e24b10c11cbb5e5f0ebe8a85ab Mon Sep 17 00:00:00 2001
From: Ken Chen <kenchen@google.com>
Date: Fri, 8 Apr 2011 12:20:16 -0700
Subject: [PATCH 08/13] sched: Fix erroneous all_pinned logic

The scheduler load balancer has specific code to deal with an
unbalanced system caused by lots of unmovable tasks (for example,
because of hard CPU affinity). In that situation, it excludes the
busiest CPU with pinned tasks from load-balance consideration so that
it can perform a second load-balance pass over the rest of the system.

This all works as designed if there is only one cgroup in the system.

However, when we have multiple cgroups, this logic has false positives
and triggers multiple load-balance passes even though there are
actually no pinned tasks at all.

The reason it has false positives is that the all_pinned logic sits in
the lowest-level function, can_migrate_task(), and is too low level:

load_balance_fair() iterates over each task group and calls
balance_tasks() to migrate the target load. Along the way,
balance_tasks() also sets an all_pinned variable. Because the task
groups are iterated, this all_pinned variable ends up holding only the
status of the last group in the scanning process. A task group can have
any number of reasons why no load is migrated, none of them due to CPU
affinity. However, this status bit is propagated back up to the
higher-level load_balance(), which incorrectly concludes that no tasks
could be moved. That kicks off the all_pinned logic and starts multiple
passes attempting to move load onto the puller CPU.

To fix this, move the all_pinned aggregation up to the iterator level.
This ensures that the status is aggregated over all task groups, not
just the last one in the list.
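
The pattern is a generic one: a status flag written back unconditionally
from a per-group helper ends up reflecting only the last iteration. A
stripped-down before/after sketch (generic illustration with made-up
names, not the actual sched_fair.c code):

	/* Generic illustration with made-up names, not the kernel code. */
	struct group { int nr_movable; };

	/* Clears *pinned if this group had anything movable to pull. */
	static int scan_group(struct group *g, int *pinned)
	{
		if (g->nr_movable)
			*pinned = 0;
		return g->nr_movable;
	}

	/* Before: the flag is re-armed and written back for every group,
	 * so the caller only ever sees the status of the last group. */
	static int scan_all_old(struct group *groups, int n, int *all_pinned)
	{
		int moved = 0, i;

		for (i = 0; i < n; i++) {
			int pinned = 1;
			moved += scan_group(&groups[i], &pinned);
			*all_pinned = pinned;	/* clobbers earlier groups' result */
		}
		return moved;
	}

	/* After: the caller arms the flag once and the helper can only
	 * clear it, so the result is aggregated over every group. */
	static int scan_all_new(struct group *groups, int n, int *all_pinned)
	{
		int moved = 0, i;

		*all_pinned = 1;
		for (i = 0; i < n; i++)
			moved += scan_group(&groups[i], all_pinned);
		return moved;
	}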

Signed-off-by: Ken Chen <kenchen@google.com>
Cc: stable@kernel.org
Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Link: http://lkml.kernel.org/r/BANLkTi=ernzNawaR5tJZEsV_QVnfxqXmsQ@mail.gmail.com
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/sched_fair.c | 11 ++++-------
 1 file changed, 4 insertions(+), 7 deletions(-)

diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c
index 60f9d407c5ec09..6fa833ab2cb80e 100644
--- a/kernel/sched_fair.c
+++ b/kernel/sched_fair.c
@@ -2104,21 +2104,20 @@ balance_tasks(struct rq *this_rq, int this_cpu, struct rq *busiest,
 	      enum cpu_idle_type idle, int *all_pinned,
 	      int *this_best_prio, struct cfs_rq *busiest_cfs_rq)
 {
-	int loops = 0, pulled = 0, pinned = 0;
+	int loops = 0, pulled = 0;
 	long rem_load_move = max_load_move;
 	struct task_struct *p, *n;
 
 	if (max_load_move == 0)
 		goto out;
 
-	pinned = 1;
-
 	list_for_each_entry_safe(p, n, &busiest_cfs_rq->tasks, se.group_node) {
 		if (loops++ > sysctl_sched_nr_migrate)
 			break;
 
 		if ((p->se.load.weight >> 1) > rem_load_move ||
-		    !can_migrate_task(p, busiest, this_cpu, sd, idle, &pinned))
+		    !can_migrate_task(p, busiest, this_cpu, sd, idle,
+				      all_pinned))
 			continue;
 
 		pull_task(busiest, p, this_rq, this_cpu);
@@ -2153,9 +2152,6 @@ balance_tasks(struct rq *this_rq, int this_cpu, struct rq *busiest,
 	 */
 	schedstat_add(sd, lb_gained[idle], pulled);
 
-	if (all_pinned)
-		*all_pinned = pinned;
-
 	return max_load_move - rem_load_move;
 }
 
@@ -3341,6 +3337,7 @@ static int load_balance(int this_cpu, struct rq *this_rq,
 		 * still unbalanced. ld_moved simply stays zero, so it is
 		 * correctly treated as an imbalance.
 		 */
+		all_pinned = 1;
 		local_irq_save(flags);
 		double_rq_lock(this_rq, busiest);
 		ld_moved = move_tasks(this_rq, this_cpu, busiest,

From 30d746c68025ee69ac17219aacc9b1614d951f01 Mon Sep 17 00:00:00 2001
From: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
Date: Thu, 7 Apr 2011 14:13:15 +0200
Subject: [PATCH 09/13] x86/ce4100: Add reg property to bridges

Without the reg property, Ben's new code won't find the PCI & ISA
bridges, and the devices won't get their DT nodes attached.

Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
Acked-by: Thomas Gleixner <tglx@linutronix.de>
Cc: davem@davemloft.net
Cc: monstr@monstr.eu
Cc: Benjamin Herrenschmidt <benh@kernel.crashing.org>
Link: http://lkml.kernel.org/r/20110407121315.GA9204@linutronix.de
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/platform/ce4100/falconfalls.dts | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/arch/x86/platform/ce4100/falconfalls.dts b/arch/x86/platform/ce4100/falconfalls.dts
index dc701ea585461a..2d6d226f2b10fc 100644
--- a/arch/x86/platform/ce4100/falconfalls.dts
+++ b/arch/x86/platform/ce4100/falconfalls.dts
@@ -74,6 +74,7 @@
 				compatible = "intel,ce4100-pci", "pci";
 				device_type = "pci";
 				bus-range = <1 1>;
+				reg = <0x0800 0x0 0x0 0x0 0x0>;
 				ranges = <0x2000000 0 0xdffe0000 0x2000000 0 0xdffe0000 0 0x1000>;
 
 				interrupt-parent = <&ioapic2>;
@@ -412,6 +413,7 @@
 				#address-cells = <2>;
 				#size-cells = <1>;
 				compatible = "isa";
+				reg = <0xf800 0x0 0x0 0x0 0x0>;
 				ranges = <1 0 0 0 0 0x100>;
 
 				rtc@70 {

From 9d90e49da57fe73a2f35334fdd2fb60dbf3933ed Mon Sep 17 00:00:00 2001
From: Jacob Pan <jacob.jun.pan@linux.intel.com>
Date: Fri, 8 Apr 2011 11:23:00 -0700
Subject: [PATCH 10/13] x86/mrst: Fix boot crash caused by incorrect pin to irq
 mapping

Moorestown systems crash on boot because the secondary CPU
clockevent (apbt1) will fail to request irq#1, which does not
have ioapic chip in its irq_desc[] entry.

Background:

The Moorestown platform has neither an ISA bus nor legacy IRQs. It
reuses the range of legacy IRQs for regular device interrupts. The
routing information for early system device IRQs (timers) is obtained
from firmware-provided SFI tables. We reuse/fake the MP configuration
table to facilitate IRQ setup with the IOAPIC.

Though optional, maintaining a 1:1 mapping between IOAPIC pin (RTE
entry) and IRQ# keeps the routing information clean and easy to
understand on Moorestown.

This patch allows the SFI timer and vRTC IRQs to be treated as ISA
IRQs so that the pin-to-IRQ mapping is 1:1.

Also fix the MP table type and use macros to set the MP IRQ entries
clearly. As a result, the apbt timer and RTC interrupts on Moorestown
fall within the legacy IRQ range:

 # cat /proc/interrupts
            CPU0       CPU1
   0:      11249          0   IO-APIC-edge      apbt0
   1:          0      12271   IO-APIC-edge      apbt1
   8:        887          0   IO-APIC-fasteoi   dw_spi
  13:          0          0   IO-APIC-fasteoi   INTEL_MID_DMAC2
  14:          0          0   IO-APIC-fasteoi   rtc0

Further discussion of this patch can be found at:

  https://lkml.org/lkml/2010/6/10/70

Suggested-by: "Eric W. Biederman" <ebiederm@xmission.com>
Signed-off-by: Jacob Pan <jacob.jun.pan@linux.intel.com>
Cc: Feng Tang <feng.tang@intel.com>
Cc: Alan Cox <alan@linux.intel.com>
Cc: Arjan van de Ven <arjan@linux.intel.com>
Link: http://lkml.kernel.org/r/1302286980-21139-1-git-send-email-jacob.jun.pan@linux.intel.com
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/platform/mrst/mrst.c | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/arch/x86/platform/mrst/mrst.c b/arch/x86/platform/mrst/mrst.c
index 5c0207bf959bc4..275dbc19e2cf55 100644
--- a/arch/x86/platform/mrst/mrst.c
+++ b/arch/x86/platform/mrst/mrst.c
@@ -97,11 +97,11 @@ static int __init sfi_parse_mtmr(struct sfi_table_header *table)
 			pentry->freq_hz, pentry->irq);
 			if (!pentry->irq)
 				continue;
-			mp_irq.type = MP_IOAPIC;
+			mp_irq.type = MP_INTSRC;
 			mp_irq.irqtype = mp_INT;
 /* triggering mode edge bit 2-3, active high polarity bit 0-1 */
 			mp_irq.irqflag = 5;
-			mp_irq.srcbus = 0;
+			mp_irq.srcbus = MP_BUS_ISA;
 			mp_irq.srcbusirq = pentry->irq;	/* IRQ */
 			mp_irq.dstapic = MP_APIC_ALL;
 			mp_irq.dstirq = pentry->irq;
@@ -168,10 +168,10 @@ int __init sfi_parse_mrtc(struct sfi_table_header *table)
 	for (totallen = 0; totallen < sfi_mrtc_num; totallen++, pentry++) {
 		pr_debug("RTC[%d]: paddr = 0x%08x, irq = %d\n",
 			totallen, (u32)pentry->phys_addr, pentry->irq);
-		mp_irq.type = MP_IOAPIC;
+		mp_irq.type = MP_INTSRC;
 		mp_irq.irqtype = mp_INT;
 		mp_irq.irqflag = 0xf;	/* level trigger and active low */
-		mp_irq.srcbus = 0;
+		mp_irq.srcbus = MP_BUS_ISA;
 		mp_irq.srcbusirq = pentry->irq;	/* IRQ */
 		mp_irq.dstapic = MP_APIC_ALL;
 		mp_irq.dstirq = pentry->irq;
@@ -282,7 +282,7 @@ void __init x86_mrst_early_setup(void)
 	/* Avoid searching for BIOS MP tables */
 	x86_init.mpparse.find_smp_config = x86_init_noop;
 	x86_init.mpparse.get_smp_config = x86_init_uint_noop;
-
+	set_bit(MP_BUS_ISA, mp_bus_not_pci);
 }
 
 /*

From 0cd9c6494ee5c19aef085152bc37f3a4e774a9e1 Mon Sep 17 00:00:00 2001
From: Darren Hart <dvhart@linux.intel.com>
Date: Thu, 14 Apr 2011 15:41:57 -0700
Subject: [PATCH 11/13] futex: Set FLAGS_HAS_TIMEOUT during futex_wait restart
 setup

The FLAGS_HAS_TIMEOUT flag was not getting set, causing the restart_block to
restart futex_wait() without a timeout after a signal.

Commit b41277dc7a18ee332d in 2.6.38 introduced the regression by
accidentally removing the FLAGS_HAS_TIMEOUT assignment from futex_wait()
during the setup of the restart block. Restore the original behavior.

Fixes: https://bugzilla.kernel.org/show_bug.cgi?id=32922

Reported-by: Tim Smith <tsmith201104@yahoo.com>
Reported-by: Torsten Hilbrich <torsten.hilbrich@secunet.com>
Signed-off-by: Darren Hart <dvhart@linux.intel.com>
Signed-off-by: Eric Dumazet <eric.dumazet@gmail.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: John Kacur <jkacur@redhat.com>
Cc: stable@kernel.org
Link: http://lkml.kernel.org/r/%3Cdaac0eb3af607f72b9a4d3126b2ba8fb5ed3b883.1302820917.git.dvhart%40linux.intel.com%3E
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 kernel/futex.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/kernel/futex.c b/kernel/futex.c
index dfb924ffe65ba7..fe28dc282eae43 100644
--- a/kernel/futex.c
+++ b/kernel/futex.c
@@ -1886,7 +1886,7 @@ static int futex_wait(u32 __user *uaddr, unsigned int flags, u32 val,
 	restart->futex.val = val;
 	restart->futex.time = abs_time->tv64;
 	restart->futex.bitset = bitset;
-	restart->futex.flags = flags;
+	restart->futex.flags = flags | FLAGS_HAS_TIMEOUT;
 
 	ret = -ERESTART_RESTARTBLOCK;
 

From 7d6b46707f2491a94f4bd3b4329d2d7f809e9368 Mon Sep 17 00:00:00 2001
From: KOSAKI Motohiro <kosaki.motohiro@jp.fujitsu.com>
Date: Fri, 15 Apr 2011 20:39:01 +0900
Subject: [PATCH 12/13] x86, NUMA: Fix fakenuma boot failure

Currently, the numa=fake boot parameter is broken. If it is used, the
kernel may panic due to a divide-by-zero error, depending on the CPU
configuration.

Call Trace:
 [<ffffffff8104ad4c>] find_busiest_group+0x38c/0xd30
 [<ffffffff81086aff>] ? local_clock+0x6f/0x80
 [<ffffffff81050533>] load_balance+0xa3/0x600
 [<ffffffff81050f53>] idle_balance+0xf3/0x180
 [<ffffffff81550092>] schedule+0x722/0x7d0
 [<ffffffff81550538>] ? wait_for_common+0x128/0x190
 [<ffffffff81550a65>] schedule_timeout+0x265/0x320
 [<ffffffff81095815>] ? lock_release_holdtime+0x35/0x1a0
 [<ffffffff81550538>] ? wait_for_common+0x128/0x190
 [<ffffffff8109bb6c>] ? __lock_release+0x9c/0x1d0
 [<ffffffff815534e0>] ? _raw_spin_unlock_irq+0x30/0x40
 [<ffffffff815534e0>] ? _raw_spin_unlock_irq+0x30/0x40
 [<ffffffff81550540>] wait_for_common+0x130/0x190
 [<ffffffff81051920>] ? try_to_wake_up+0x510/0x510
 [<ffffffff8155067d>] wait_for_completion+0x1d/0x20
 [<ffffffff8107f36c>] kthread_create_on_node+0xac/0x150
 [<ffffffff81077bb0>] ? process_scheduled_works+0x40/0x40
 [<ffffffff8155045f>] ? wait_for_common+0x4f/0x190
 [<ffffffff8107a283>] __alloc_workqueue_key+0x1a3/0x590
 [<ffffffff81e0cce2>] cpuset_init_smp+0x6b/0x7b
 [<ffffffff81df3d07>] kernel_init+0xc3/0x182
 [<ffffffff8155d5e4>] kernel_thread_helper+0x4/0x10
 [<ffffffff81553cd4>] ? retint_restore_args+0x13/0x13
 [<ffffffff81df3c44>] ? start_kernel+0x400/0x400
 [<ffffffff8155d5e0>] ? gs_change+0x13/0x13

The divide by zero is caused by the following line when
group->cpu_power == 0:

 kernel/sched_fair.c::update_sg_lb_stats()
        /* Adjust by relative CPU power of the group */
        sgs->avg_load = (sgs->group_load * SCHED_LOAD_SCALE) / group->cpu_power;

This regression was caused by commit e23bba6044 ("x86-64, NUMA: Unify
emulated distance mapping") because it changes cpu -> node
mapping in the process of dropping fake_physnodes().

  old) all cpus are assigned node 0
  now) cpus are assigned round robin
       (the logic is implemented by numa_init_array())

  Note: The change in behavior only happens if the system has neither
        an ACPI SRAT table nor AMD northbridge NUMA information.

Round-robin assignment doesn't work because init_numa_sched_groups_power()
assumes that all logical cpus in the same physical cpu share the same node
(it only accounts for group_first_cpu()), and the simple round robin breaks
that assumption.

Thus, this patch reassigns node ids if buggy firmware or NUMA emulation
produces a wrong cpu-to-node map. It enforces that all logical cpus in
the same physical cpu share the same node.

Signed-off-by: KOSAKI Motohiro <kosaki.motohiro@jp.fujitsu.com>
Acked-by: Tejun Heo <tj@kernel.org>
Cc: Yinghai Lu <yinghai@kernel.org>
Cc: Brian Gerst <brgerst@gmail.com>
Cc: Cyrill Gorcunov <gorcunov@gmail.com>
Cc: Shaohui Zheng <shaohui.zheng@intel.com>
Cc: David Rientjes <rientjes@google.com>
Cc: H. Peter Anvin <hpa@linux.intel.com>
Link: http://lkml.kernel.org/r/20110415203928.1303.A69D9226@jp.fujitsu.com
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/smpboot.c | 23 +++++++++++++++++++++++
 1 file changed, 23 insertions(+)

diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c
index c2871d3c71b64a..8ed8908cc9f79a 100644
--- a/arch/x86/kernel/smpboot.c
+++ b/arch/x86/kernel/smpboot.c
@@ -312,6 +312,26 @@ void __cpuinit smp_store_cpu_info(int id)
 		identify_secondary_cpu(c);
 }
 
+static void __cpuinit check_cpu_siblings_on_same_node(int cpu1, int cpu2)
+{
+	int node1 = early_cpu_to_node(cpu1);
+	int node2 = early_cpu_to_node(cpu2);
+
+	/*
+	 * Our CPU scheduler assumes all logical cpus in the same physical cpu
+	 * share the same node. But, buggy ACPI or NUMA emulation might assign
+	 * them to a different node. Fix it.
+	 */
+	if (node1 != node2) {
+		pr_warning("CPU %d in node %d and CPU %d in node %d are in the same physical CPU. forcing same node %d\n",
+			   cpu1, node1, cpu2, node2, node2);
+
+		numa_remove_cpu(cpu1);
+		numa_set_node(cpu1, node2);
+		numa_add_cpu(cpu1);
+	}
+}
+
 static void __cpuinit link_thread_siblings(int cpu1, int cpu2)
 {
 	cpumask_set_cpu(cpu1, cpu_sibling_mask(cpu2));
@@ -320,6 +340,7 @@ static void __cpuinit link_thread_siblings(int cpu1, int cpu2)
 	cpumask_set_cpu(cpu2, cpu_core_mask(cpu1));
 	cpumask_set_cpu(cpu1, cpu_llc_shared_mask(cpu2));
 	cpumask_set_cpu(cpu2, cpu_llc_shared_mask(cpu1));
+	check_cpu_siblings_on_same_node(cpu1, cpu2);
 }
 
 
@@ -361,10 +382,12 @@ void __cpuinit set_cpu_sibling_map(int cpu)
 		    per_cpu(cpu_llc_id, cpu) == per_cpu(cpu_llc_id, i)) {
 			cpumask_set_cpu(i, cpu_llc_shared_mask(cpu));
 			cpumask_set_cpu(cpu, cpu_llc_shared_mask(i));
+			check_cpu_siblings_on_same_node(cpu, i);
 		}
 		if (c->phys_proc_id == cpu_data(i).phys_proc_id) {
 			cpumask_set_cpu(i, cpu_core_mask(cpu));
 			cpumask_set_cpu(cpu, cpu_core_mask(i));
+			check_cpu_siblings_on_same_node(cpu, i);
 			/*
 			 *  Does this new cpu bringup a new core?
 			 */

From 5bbc097d890409d8eff4e3f1d26f11a9d6b7c07e Mon Sep 17 00:00:00 2001
From: Joerg Roedel <joerg.roedel@amd.com>
Date: Fri, 15 Apr 2011 14:47:40 +0200
Subject: [PATCH 13/13] x86, amd: Disable GartTlbWlkErr when BIOS forgets it

This patch disables GartTlbWlk errors on AMD Fam10h CPUs if
the BIOS forgets to do it (or is just too old). Leaving these
errors enabled can cause a sync-flood on the CPU, causing a
reboot.

The AMD BKDG recommends disabling GART TLB Wlk Error completely.

This patch is the fix for

	https://bugzilla.kernel.org/show_bug.cgi?id=33012

on my machine.

Signed-off-by: Joerg Roedel <joerg.roedel@amd.com>
Link: http://lkml.kernel.org/r/20110415131152.GJ18463@8bytes.org
Tested-by: Alexandre Demers <alexandre.f.demers@gmail.com>
Cc: <stable@kernel.org>
Signed-off-by: H. Peter Anvin <hpa@linux.intel.com>
---
 arch/x86/include/asm/msr-index.h |  4 ++++
 arch/x86/kernel/cpu/amd.c        | 19 +++++++++++++++++++
 2 files changed, 23 insertions(+)

diff --git a/arch/x86/include/asm/msr-index.h b/arch/x86/include/asm/msr-index.h
index fd5a1f365c9510..3cce71413d0be8 100644
--- a/arch/x86/include/asm/msr-index.h
+++ b/arch/x86/include/asm/msr-index.h
@@ -96,11 +96,15 @@
 #define MSR_IA32_MC0_ADDR		0x00000402
 #define MSR_IA32_MC0_MISC		0x00000403
 
+#define MSR_AMD64_MC0_MASK		0xc0010044
+
 #define MSR_IA32_MCx_CTL(x)		(MSR_IA32_MC0_CTL + 4*(x))
 #define MSR_IA32_MCx_STATUS(x)		(MSR_IA32_MC0_STATUS + 4*(x))
 #define MSR_IA32_MCx_ADDR(x)		(MSR_IA32_MC0_ADDR + 4*(x))
 #define MSR_IA32_MCx_MISC(x)		(MSR_IA32_MC0_MISC + 4*(x))
 
+#define MSR_AMD64_MCx_MASK(x)		(MSR_AMD64_MC0_MASK + (x))
+
 /* These are consecutive and not in the normal 4er MCE bank block */
 #define MSR_IA32_MC0_CTL2		0x00000280
 #define MSR_IA32_MCx_CTL2(x)		(MSR_IA32_MC0_CTL2 + (x))
diff --git a/arch/x86/kernel/cpu/amd.c b/arch/x86/kernel/cpu/amd.c
index 3ecece0217ef1c..3532d3bf81050e 100644
--- a/arch/x86/kernel/cpu/amd.c
+++ b/arch/x86/kernel/cpu/amd.c
@@ -615,6 +615,25 @@ static void __cpuinit init_amd(struct cpuinfo_x86 *c)
 	/* As a rule processors have APIC timer running in deep C states */
 	if (c->x86 >= 0xf && !cpu_has_amd_erratum(amd_erratum_400))
 		set_cpu_cap(c, X86_FEATURE_ARAT);
+
+	/*
+	 * Disable GART TLB Walk Errors on Fam10h. We do this here
+	 * because this is always needed when GART is enabled, even in a
+	 * kernel which has no MCE support built in.
+	 */
+	if (c->x86 == 0x10) {
+		/*
+		 * The BIOS should disable GartTlbWlk errors itself. If
+		 * it doesn't, do it here, as suggested by the BKDG.
+		 *
+		 * Fixes: https://bugzilla.kernel.org/show_bug.cgi?id=33012
+		 */
+		u64 mask;
+
+		rdmsrl(MSR_AMD64_MCx_MASK(4), mask);
+		mask |= (1 << 10);
+		wrmsrl(MSR_AMD64_MCx_MASK(4), mask);
+	}
 }
 
 #ifdef CONFIG_X86_32