From aed5ed47724f6a7453fa62e3c90f3cee93edbfe3 Mon Sep 17 00:00:00 2001
From: Frederic Weisbecker
Date: Wed, 6 May 2015 18:04:23 +0200
Subject: [PATCH 1/4] context_tracking: Protect against recursion

Context tracking recursion can happen when an exception triggers in the
middle of a call to a context tracking probe.

This special case can be caused by vmalloc faults. If an access to a memory
area allocated by vmalloc happens in the middle of context_tracking_enter(),
we may run into an endless fault loop because the exception in turn calls
context_tracking_enter() which faults on the same vmalloc'ed memory,
triggering an exception again, etc...

Some rare crashes have been reported, so let's protect against this with a
recursion counter.

Reported-by: Dave Jones
Signed-off-by: Frederic Weisbecker
Reviewed-by: Rik van Riel
Acked-by: Peter Zijlstra (Intel)
Cc: Borislav Petkov
Cc: Chris Metcalf
Cc: H. Peter Anvin
Cc: Martin Schwidefsky
Cc: Mike Galbraith
Cc: Oleg Nesterov
Cc: Paul E. McKenney
Cc: Rafael J . Wysocki
Cc: Thomas Gleixner
Link: http://lkml.kernel.org/r/1430928266-24888-2-git-send-email-fweisbec@gmail.com
Signed-off-by: Ingo Molnar
---
 include/linux/context_tracking_state.h |  1 +
 kernel/context_tracking.c              | 29 ++++++++++++++++++++++++++
 2 files changed, 30 insertions(+)

diff --git a/include/linux/context_tracking_state.h b/include/linux/context_tracking_state.h
index 6b7b96a32b753a..678ecdf90cf660 100644
--- a/include/linux/context_tracking_state.h
+++ b/include/linux/context_tracking_state.h
@@ -12,6 +12,7 @@ struct context_tracking {
 	 * may be further optimized using static keys.
 	 */
 	bool active;
+	int recursion;
 	enum ctx_state {
 		CONTEXT_KERNEL = 0,
 		CONTEXT_USER,
diff --git a/kernel/context_tracking.c b/kernel/context_tracking.c
index 72d59a1a6eb66f..5b11a10e196a82 100644
--- a/kernel/context_tracking.c
+++ b/kernel/context_tracking.c
@@ -38,6 +38,25 @@ void context_tracking_cpu_set(int cpu)
 	}
 }
 
+static bool context_tracking_recursion_enter(void)
+{
+	int recursion;
+
+	recursion = __this_cpu_inc_return(context_tracking.recursion);
+	if (recursion == 1)
+		return true;
+
+	WARN_ONCE((recursion < 1), "Invalid context tracking recursion value %d\n", recursion);
+	__this_cpu_dec(context_tracking.recursion);
+
+	return false;
+}
+
+static void context_tracking_recursion_exit(void)
+{
+	__this_cpu_dec(context_tracking.recursion);
+}
+
 /**
  * context_tracking_enter - Inform the context tracking that the CPU is going
  *                          enter user or guest space mode.
@@ -75,6 +94,9 @@ void context_tracking_enter(enum ctx_state state)
 	WARN_ON_ONCE(!current->mm);
 
 	local_irq_save(flags);
+	if (!context_tracking_recursion_enter())
+		goto out_irq_restore;
+
 	if ( __this_cpu_read(context_tracking.state) != state) {
 		if (__this_cpu_read(context_tracking.active)) {
 			/*
@@ -105,6 +127,8 @@ void context_tracking_enter(enum ctx_state state)
 		 */
 		__this_cpu_write(context_tracking.state, state);
 	}
+	context_tracking_recursion_exit();
+out_irq_restore:
 	local_irq_restore(flags);
 }
 NOKPROBE_SYMBOL(context_tracking_enter);
@@ -139,6 +163,9 @@ void context_tracking_exit(enum ctx_state state)
 		return;
 
 	local_irq_save(flags);
+	if (!context_tracking_recursion_enter())
+		goto out_irq_restore;
+
 	if (__this_cpu_read(context_tracking.state) == state) {
 		if (__this_cpu_read(context_tracking.active)) {
 			/*
@@ -153,6 +180,8 @@ void context_tracking_exit(enum ctx_state state)
 		}
 		__this_cpu_write(context_tracking.state, CONTEXT_KERNEL);
 	}
+	context_tracking_recursion_exit();
+out_irq_restore:
 	local_irq_restore(flags);
 }
 NOKPROBE_SYMBOL(context_tracking_exit);

From fafe870f31212a72f3c2d74e7b90e4ef39e83ee1 Mon Sep 17 00:00:00 2001
From: Frederic Weisbecker
Date: Wed, 6 May 2015 18:04:24 +0200
Subject: [PATCH 2/4] context_tracking: Inherit TIF_NOHZ through forks instead of context switches

TIF_NOHZ is used by context_tracking to force the syscall slow path on
every task in order to track userspace roundtrips. As such, it must be set
on all running tasks.

It's currently explicitly inherited through context switches. There is no
need to do it in this fast path though. The flag could simply be set once
and for all on all tasks, whether they are running or not. Let's do this
by setting the flag for the init task on early boot, and let it propagate
through fork inheritance.

While at it, mark context_tracking_cpu_set() as init code, since we only
need it at early boot time.

Suggested-by: Oleg Nesterov
Signed-off-by: Frederic Weisbecker
Reviewed-by: Rik van Riel
Cc: Borislav Petkov
Cc: Chris Metcalf
Cc: Dave Jones
Cc: H. Peter Anvin
Cc: Martin Schwidefsky
Cc: Mike Galbraith
Cc: Paul E . McKenney
Cc: Paul E. McKenney
Cc: Peter Zijlstra
Cc: Rafael J . Wysocki
Cc: Thomas Gleixner
Link: http://lkml.kernel.org/r/1430928266-24888-3-git-send-email-fweisbec@gmail.com
Signed-off-by: Ingo Molnar
---
 include/linux/context_tracking.h | 10 --------
 include/linux/sched.h            |  3 +++
 kernel/context_tracking.c        | 44 ++++++++++++++------------------
 kernel/sched/core.c              |  1 -
 4 files changed, 22 insertions(+), 36 deletions(-)

diff --git a/include/linux/context_tracking.h b/include/linux/context_tracking.h
index 2821838256b4f7..b96bd299966f04 100644
--- a/include/linux/context_tracking.h
+++ b/include/linux/context_tracking.h
@@ -14,8 +14,6 @@ extern void context_tracking_enter(enum ctx_state state);
 extern void context_tracking_exit(enum ctx_state state);
 extern void context_tracking_user_enter(void);
 extern void context_tracking_user_exit(void);
-extern void __context_tracking_task_switch(struct task_struct *prev,
-					   struct task_struct *next);
 
 static inline void user_enter(void)
 {
@@ -51,19 +49,11 @@ static inline void exception_exit(enum ctx_state prev_ctx)
 	}
 }
 
-static inline void context_tracking_task_switch(struct task_struct *prev,
-						struct task_struct *next)
-{
-	if (context_tracking_is_enabled())
-		__context_tracking_task_switch(prev, next);
-}
 #else
 static inline void user_enter(void) { }
 static inline void user_exit(void) { }
 static inline enum ctx_state exception_enter(void) { return 0; }
 static inline void exception_exit(enum ctx_state prev_ctx) { }
-static inline void context_tracking_task_switch(struct task_struct *prev,
-						struct task_struct *next) { }
 #endif /* !CONFIG_CONTEXT_TRACKING */
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 26a2e6122734f8..185a750e4ed454 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -2532,6 +2532,9 @@ static inline unsigned long wait_task_inactive(struct task_struct *p,
 }
 #endif
 
+#define tasklist_empty() \
+	list_empty(&init_task.tasks)
+
 #define next_task(p) \
 	list_entry_rcu((p)->tasks.next, struct task_struct, tasks)
 
diff --git a/kernel/context_tracking.c b/kernel/context_tracking.c
index 5b11a10e196a82..0a495ab35bc72b 100644
--- a/kernel/context_tracking.c
+++ b/kernel/context_tracking.c
@@ -30,14 +30,6 @@ EXPORT_SYMBOL_GPL(context_tracking_enabled);
 DEFINE_PER_CPU(struct context_tracking, context_tracking);
 EXPORT_SYMBOL_GPL(context_tracking);
 
-void context_tracking_cpu_set(int cpu)
-{
-	if (!per_cpu(context_tracking.active, cpu)) {
-		per_cpu(context_tracking.active, cpu) = true;
-		static_key_slow_inc(&context_tracking_enabled);
-	}
-}
-
 static bool context_tracking_recursion_enter(void)
 {
 	int recursion;
@@ -193,24 +185,26 @@ void context_tracking_user_exit(void)
 }
 NOKPROBE_SYMBOL(context_tracking_user_exit);
 
-/**
- * __context_tracking_task_switch - context switch the syscall callbacks
- * @prev: the task that is being switched out
- * @next: the task that is being switched in
- *
- * The context tracking uses the syscall slow path to implement its user-kernel
- * boundaries probes on syscalls. This way it doesn't impact the syscall fast
- * path on CPUs that don't do context tracking.
- *
- * But we need to clear the flag on the previous task because it may later
- * migrate to some CPU that doesn't do the context tracking. As such the TIF
- * flag may not be desired there.
- */
-void __context_tracking_task_switch(struct task_struct *prev,
-				    struct task_struct *next)
+void __init context_tracking_cpu_set(int cpu)
 {
-	clear_tsk_thread_flag(prev, TIF_NOHZ);
-	set_tsk_thread_flag(next, TIF_NOHZ);
+	static __initdata bool initialized = false;
+
+	if (!per_cpu(context_tracking.active, cpu)) {
+		per_cpu(context_tracking.active, cpu) = true;
+		static_key_slow_inc(&context_tracking_enabled);
+	}
+
+	if (initialized)
+		return;
+
+	/*
+	 * Set TIF_NOHZ to init/0 and let it propagate to all tasks through fork.
+	 * This assumes that init is the only task at this early boot stage.
+	 */
+	set_tsk_thread_flag(&init_task, TIF_NOHZ);
+	WARN_ON_ONCE(!tasklist_empty());
+
+	initialized = true;
 }
 
 #ifdef CONFIG_CONTEXT_TRACKING_FORCE
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index fe22f7510bceab..54f032cea04060 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -2332,7 +2332,6 @@ context_switch(struct rq *rq, struct task_struct *prev,
 	 */
 	spin_release(&rq->lock.dep_map, 1, _THIS_IP_);
 
-	context_tracking_task_switch(prev, next);
 	/* Here we just switch the register state and the stack. */
 	switch_to(prev, next, prev);
 	barrier();

From 83dedea8a07fb4bf91863764b15c1c4ec00330f9 Mon Sep 17 00:00:00 2001
From: Chris Metcalf
Date: Wed, 6 May 2015 18:04:25 +0200
Subject: [PATCH 3/4] nohz: Add tick_nohz_full_add_cpus_to() API

This API is useful to modify a cpumask indicating some special nohz-type
functionality, so that the nohz cores are automatically added to that set.

Signed-off-by: Chris Metcalf
Signed-off-by: Frederic Weisbecker
Acked-by: Peter Zijlstra (Intel)
Cc: Borislav Petkov
Cc: Dave Jones
Cc: H. Peter Anvin
Cc: Martin Schwidefsky
Cc: Mike Galbraith
Cc: Oleg Nesterov
Cc: Paul E. McKenney
Cc: Rafael J. Wysocki
Cc: Rik van Riel
Cc: Thomas Gleixner
Link: http://lkml.kernel.org/r/1429024675-18938-1-git-send-email-cmetcalf@ezchip.com
Link: http://lkml.kernel.org/r/1430928266-24888-4-git-send-email-fweisbec@gmail.com
Signed-off-by: Ingo Molnar
---
 include/linux/tick.h | 7 +++++++
 1 file changed, 7 insertions(+)

diff --git a/include/linux/tick.h b/include/linux/tick.h
index f8492da57ad32e..4191b5623a286c 100644
--- a/include/linux/tick.h
+++ b/include/linux/tick.h
@@ -134,6 +134,12 @@ static inline bool tick_nohz_full_cpu(int cpu)
 	return cpumask_test_cpu(cpu, tick_nohz_full_mask);
 }
 
+static inline void tick_nohz_full_add_cpus_to(struct cpumask *mask)
+{
+	if (tick_nohz_full_enabled())
+		cpumask_or(mask, mask, tick_nohz_full_mask);
+}
+
 extern void __tick_nohz_full_check(void);
 extern void tick_nohz_full_kick(void);
 extern void tick_nohz_full_kick_cpu(int cpu);
@@ -142,6 +148,7 @@ extern void __tick_nohz_task_switch(struct task_struct *tsk);
 #else
 static inline bool tick_nohz_full_enabled(void) { return false; }
 static inline bool tick_nohz_full_cpu(int cpu) { return false; }
+static inline void tick_nohz_full_add_cpus_to(struct cpumask *mask) { }
 static inline void __tick_nohz_full_check(void) { }
 static inline void tick_nohz_full_kick_cpu(int cpu) { }
 static inline void tick_nohz_full_kick(void) { }

From 8cb9764fc88b41db11f251e8b2a0d006578b7eb4 Mon Sep 17 00:00:00 2001
From: Chris Metcalf
Date: Wed, 6 May 2015 18:04:26 +0200
Subject: [PATCH 4/4] nohz: Set isolcpus when nohz_full is set

nohz_full is only useful when isolcpus is also set, since otherwise the
scheduler has to run periodically to try to determine whether to steal
work from other cores.
Accordingly, when booting with nohz_full=xxx on the command line, we
should act as if isolcpus=xxx was also set, and set (or extend) the
isolcpus set to include the nohz_full cpus.

Signed-off-by: Chris Metcalf
Signed-off-by: Frederic Weisbecker
Acked-by: Rik van Riel
Acked-by: Mike Galbraith
Acked-by: Peter Zijlstra (Intel)
Cc: Borislav Petkov
Cc: Dave Jones
Cc: H. Peter Anvin
Cc: Martin Schwidefsky
Cc: Oleg Nesterov
Cc: Paul E. McKenney
Cc: Rafael J. Wysocki
Cc: Thomas Gleixner
Link: http://lkml.kernel.org/r/1430928266-24888-5-git-send-email-fweisbec@gmail.com
Signed-off-by: Ingo Molnar
---
 kernel/sched/core.c | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 54f032cea04060..b8f48763579bf5 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -7036,6 +7036,9 @@ void __init sched_init_smp(void)
 	alloc_cpumask_var(&non_isolated_cpus, GFP_KERNEL);
 	alloc_cpumask_var(&fallback_doms, GFP_KERNEL);
 
+	/* nohz_full won't take effect without isolating the cpus. */
+	tick_nohz_full_add_cpus_to(cpu_isolated_map);
+
 	sched_init_numa();
 
 	/*
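
Editor's usage note (not part of the series): below is a minimal, hypothetical
sketch of how some other subsystem might use the tick_nohz_full_add_cpus_to()
helper added in PATCH 3/4 to fold the nohz_full CPUs into a private cpumask,
in the same spirit as the sched_init_smp() caller added in PATCH 4/4. The
names my_subsys_cpus and my_subsys_init_mask are illustrative assumptions,
not code from these patches.

	/*
	 * Hypothetical example: fold nohz_full CPUs into a subsystem-private
	 * cpumask using the helper from PATCH 3/4.
	 */
	#include <linux/cpumask.h>
	#include <linux/errno.h>
	#include <linux/gfp.h>
	#include <linux/init.h>
	#include <linux/tick.h>

	static cpumask_var_t my_subsys_cpus;	/* illustrative name */

	static int __init my_subsys_init_mask(void)
	{
		if (!alloc_cpumask_var(&my_subsys_cpus, GFP_KERNEL))
			return -ENOMEM;

		/* Start from whatever CPUs the subsystem would isolate anyway... */
		cpumask_clear(my_subsys_cpus);

		/*
		 * ...then add the nohz_full set. This is a no-op when nohz_full=
		 * was not given on the command line (tick_nohz_full_enabled() is
		 * false), mirroring how sched_init_smp() extends cpu_isolated_map.
		 */
		tick_nohz_full_add_cpus_to(my_subsys_cpus);

		return 0;
	}
	early_initcall(my_subsys_init_mask);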