From 8da0b4f692c6d90b09c91f271517db746a22ff67 Mon Sep 17 00:00:00 2001 From: Benjamin Gordon Date: Thu, 3 Jan 2019 15:25:56 -0800 Subject: [PATCH 01/58] fs/proc/base.c: use ns_capable instead of capable for timerslack_ns Access to timerslack_ns is controlled by a process having CAP_SYS_NICE in its effective capability set, but the current check looks in the root namespace instead of the process' user namespace. Since a process is allowed to do other activities controlled by CAP_SYS_NICE inside a namespace, it should also be able to adjust timerslack_ns. Link: http://lkml.kernel.org/r/20181030180012.232896-1-bmgordon@google.com Signed-off-by: Benjamin Gordon Acked-by: "Eric W. Biederman" Cc: John Stultz Cc: "Eric W. Biederman" Cc: Kees Cook Cc: "Serge E. Hallyn" Cc: Thomas Gleixner Cc: Arjan van de Ven Cc: Oren Laadan Cc: Ruchi Kandoi Cc: Rom Lemarchand Cc: Todd Kjos Cc: Colin Cross Cc: Nick Kralevich Cc: Dmitry Shmidt Cc: Elliott Hughes Cc: Alexey Dobriyan Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/proc/base.c | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) diff --git a/fs/proc/base.c b/fs/proc/base.c index d7fd1ca807d2c3..58a8dc3fd6c612 100644 --- a/fs/proc/base.c +++ b/fs/proc/base.c @@ -2356,10 +2356,13 @@ static ssize_t timerslack_ns_write(struct file *file, const char __user *buf, return -ESRCH; if (p != current) { - if (!capable(CAP_SYS_NICE)) { + rcu_read_lock(); + if (!ns_capable(__task_cred(p)->user_ns, CAP_SYS_NICE)) { + rcu_read_unlock(); count = -EPERM; goto out; } + rcu_read_unlock(); err = security_task_setscheduler(p); if (err) { @@ -2392,11 +2395,14 @@ static int timerslack_ns_show(struct seq_file *m, void *v) return -ESRCH; if (p != current) { - - if (!capable(CAP_SYS_NICE)) { + rcu_read_lock(); + if (!ns_capable(__task_cred(p)->user_ns, CAP_SYS_NICE)) { + rcu_read_unlock(); err = -EPERM; goto out; } + rcu_read_unlock(); + err = security_task_getscheduler(p); if (err) goto out; From 81966d83492620bf42d94d580370c59ff8d02772 Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Thu, 3 Jan 2019 15:26:00 -0800 Subject: [PATCH 02/58] fs/proc/util.c: include fs/proc/internal.h for name_to_int() name_to_int() is defined in fs/proc/util.c and declared in fs/proc/internal.h, but the declaration isn't included at the point of the definition. Include the header to enforce that the definition matches the declaration. This addresses a gcc warning when -Wmissing-prototypes is enabled. Link: http://lkml.kernel.org/r/20181115001833.49371-1-ebiggers@kernel.org Signed-off-by: Eric Biggers Reviewed-by: Alexey Dobriyan Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/proc/util.c | 1 + 1 file changed, 1 insertion(+) diff --git a/fs/proc/util.c b/fs/proc/util.c index b161cfa0f9fa15..98f8adc1734531 100644 --- a/fs/proc/util.c +++ b/fs/proc/util.c @@ -1,4 +1,5 @@ #include +#include "internal.h" unsigned name_to_int(const struct qstr *qstr) { From 230f72e9f6dc7b22ee92dc03a393429447b4395c Mon Sep 17 00:00:00 2001 From: Alexey Dobriyan Date: Thu, 3 Jan 2019 15:26:05 -0800 Subject: [PATCH 03/58] fs/proc/inode.c: delete unnecessary variable in proc_alloc_inode() Link: http://lkml.kernel.org/r/20181203164015.GA6904@avx2 Signed-off-by: Alexey Dobriyan Reviewed-by: Andrew Morton Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/proc/inode.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/fs/proc/inode.c b/fs/proc/inode.c index 5792f9e39466ef..da649ccd680431 100644 --- a/fs/proc/inode.c +++ b/fs/proc/inode.c @@ -59,7 +59,6 @@ static struct kmem_cache *pde_opener_cache __ro_after_init; static struct inode *proc_alloc_inode(struct super_block *sb) { struct proc_inode *ei; - struct inode *inode; ei = kmem_cache_alloc(proc_inode_cachep, GFP_KERNEL); if (!ei) @@ -71,8 +70,7 @@ static struct inode *proc_alloc_inode(struct super_block *sb) ei->sysctl = NULL; ei->sysctl_entry = NULL; ei->ns_ops = NULL; - inode = &ei->vfs_inode; - return inode; + return &ei->vfs_inode; } static void proc_i_callback(struct rcu_head *head) From afe922c2daae4a8f0101a30658c886c2b6eb2a96 Mon Sep 17 00:00:00 2001 From: Alexey Dobriyan Date: Thu, 3 Jan 2019 15:26:09 -0800 Subject: [PATCH 04/58] fs/proc/base.c: slightly faster /proc/*/limits Header of /proc/*/limits is a fixed string, so print it directly without formatting specifiers. Link: http://lkml.kernel.org/r/20181203164242.GB6904@avx2 Signed-off-by: Alexey Dobriyan Reviewed-by: Andrew Morton Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/proc/base.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/fs/proc/base.c b/fs/proc/base.c index 58a8dc3fd6c612..633a6346257346 100644 --- a/fs/proc/base.c +++ b/fs/proc/base.c @@ -581,8 +581,10 @@ static int proc_pid_limits(struct seq_file *m, struct pid_namespace *ns, /* * print the file header */ - seq_printf(m, "%-25s %-20s %-20s %-10s\n", - "Limit", "Soft Limit", "Hard Limit", "Units"); + seq_puts(m, "Limit " + "Soft Limit " + "Hard Limit " + "Units \n"); for (i = 0; i < RLIM_NLIMITS; i++) { if (rlim[i].rlim_cur == RLIM_INFINITY) From 09be178400829dddc1189b50a7888495dd26aa84 Mon Sep 17 00:00:00 2001 From: Cheng Lin Date: Thu, 3 Jan 2019 15:26:13 -0800 Subject: [PATCH 05/58] proc/sysctl: fix return error for proc_doulongvec_minmax() If the number of input parameters is less than the total parameters, an EINVAL error will be returned. For example, we use proc_doulongvec_minmax to pass up to two parameters with kern_table: { .procname = "monitor_signals", .data = &monitor_sigs, .maxlen = 2*sizeof(unsigned long), .mode = 0644, .proc_handler = proc_doulongvec_minmax, }, Reproduce: When passing two parameters, it's work normal. But passing only one parameter, an error "Invalid argument"(EINVAL) is returned. [root@cl150 ~]# echo 1 2 > /proc/sys/kernel/monitor_signals [root@cl150 ~]# cat /proc/sys/kernel/monitor_signals 1 2 [root@cl150 ~]# echo 3 > /proc/sys/kernel/monitor_signals -bash: echo: write error: Invalid argument [root@cl150 ~]# echo $? 1 [root@cl150 ~]# cat /proc/sys/kernel/monitor_signals 3 2 [root@cl150 ~]# The following is the result after apply this patch. No error is returned when the number of input parameters is less than the total parameters. [root@cl150 ~]# echo 1 2 > /proc/sys/kernel/monitor_signals [root@cl150 ~]# cat /proc/sys/kernel/monitor_signals 1 2 [root@cl150 ~]# echo 3 > /proc/sys/kernel/monitor_signals [root@cl150 ~]# echo $? 0 [root@cl150 ~]# cat /proc/sys/kernel/monitor_signals 3 2 [root@cl150 ~]# There are three processing functions dealing with digital parameters, __do_proc_dointvec/__do_proc_douintvec/__do_proc_doulongvec_minmax. This patch deals with __do_proc_doulongvec_minmax, just as __do_proc_dointvec does, adding a check for parameters 'left'. In __do_proc_douintvec, its code implementation explicitly does not support multiple inputs. static int __do_proc_douintvec(...){ ... /* * Arrays are not supported, keep this simple. *Do not* add * support for them. */ if (vleft != 1) { *lenp = 0; return -EINVAL; } ... } So, just __do_proc_doulongvec_minmax has the problem. And most use of proc_doulongvec_minmax/proc_doulongvec_ms_jiffies_minmax just have one parameter. Link: http://lkml.kernel.org/r/1544081775-15720-1-git-send-email-cheng.lin130@zte.com.cn Signed-off-by: Cheng Lin Acked-by: Luis Chamberlain Reviewed-by: Kees Cook Cc: Alexey Dobriyan Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/sysctl.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 1825f712e73bb7..7f6c1a3b3485f1 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -2787,6 +2787,8 @@ static int __do_proc_doulongvec_minmax(void *data, struct ctl_table *table, int bool neg; left -= proc_skip_spaces(&p); + if (!left) + break; err = proc_get_long(&p, &left, &val, &neg, proc_wspace_sep, From 3fe5dbfef47e992b810cbe82af1df02d8255fb8c Mon Sep 17 00:00:00 2001 From: Alexey Dobriyan Date: Thu, 3 Jan 2019 15:26:16 -0800 Subject: [PATCH 06/58] Documentation/process/coding-style.rst: don't use "extern" with function prototypes `extern' with function prototypes makes lines longer and creates more characters on the screen. Do not bug people with checkpatch.pl warnings for now as fallout can be devastating. Link: http://lkml.kernel.org/r/20181101134153.GA29267@avx2 Signed-off-by: Alexey Dobriyan Reviewed-by: Andrew Morton Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- Documentation/process/coding-style.rst | 3 +++ 1 file changed, 3 insertions(+) diff --git a/Documentation/process/coding-style.rst b/Documentation/process/coding-style.rst index 277c113376a6f0..b78dd680c03809 100644 --- a/Documentation/process/coding-style.rst +++ b/Documentation/process/coding-style.rst @@ -443,6 +443,9 @@ In function prototypes, include parameter names with their data types. Although this is not required by the C language, it is preferred in Linux because it is a simple way to add valuable information for the reader. +Do not use the `extern' keyword with function prototypes as this makes +lines longer and isn't strictly necessary. + 7) Centralized exiting of functions ----------------------------------- From c60d3b79423aab402085c30b33bfff5354a61d8b Mon Sep 17 00:00:00 2001 From: Masahiro Yamada Date: Thu, 3 Jan 2019 15:26:20 -0800 Subject: [PATCH 07/58] build_bug.h: remove negative-array fallback for BUILD_BUG_ON() The kernel can only be compiled with an optimization option (-O2, -Os, or the currently proposed -Og). Hence, __OPTIMIZE__ is always defined in the kernel source. The fallback for the -O0 case is just hypothetical and pointless. Moreover, commit 0bb95f80a38f ("Makefile: Globally enable VLA warning") enabled -Wvla warning. The use of variable length arrays is banned. Link: http://lkml.kernel.org/r/1542856462-18836-2-git-send-email-yamada.masahiro@socionext.com Signed-off-by: Masahiro Yamada Acked-by: Kees Cook Reviewed-by: Nick Desaulniers Tested-by: Nick Desaulniers Cc: Luc Van Oostenryck Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/build_bug.h | 14 -------------- 1 file changed, 14 deletions(-) diff --git a/include/linux/build_bug.h b/include/linux/build_bug.h index 43d1fd50d433d3..d415c6431441b4 100644 --- a/include/linux/build_bug.h +++ b/include/linux/build_bug.h @@ -51,23 +51,9 @@ * If you have some code which relies on certain constants being equal, or * some other compile-time-evaluated condition, you should use BUILD_BUG_ON to * detect if someone changes it. - * - * The implementation uses gcc's reluctance to create a negative array, but gcc - * (as of 4.4) only emits that error for obvious cases (e.g. not arguments to - * inline functions). Luckily, in 4.3 they added the "error" function - * attribute just for this type of case. Thus, we use a negative sized array - * (should always create an error on gcc versions older than 4.4) and then call - * an undefined function with the error attribute (should always create an - * error on gcc 4.3 and later). If for some reason, neither creates a - * compile-time error, we'll still have a link-time error, which is harder to - * track down. */ -#ifndef __OPTIMIZE__ -#define BUILD_BUG_ON(condition) ((void)sizeof(char[1 - 2*!!(condition)])) -#else #define BUILD_BUG_ON(condition) \ BUILD_BUG_ON_MSG(condition, "BUILD_BUG_ON failed: " #condition) -#endif /** * BUILD_BUG - break compile if used. From 527edbc18a70e745740ef31edb0ffefb2f161afa Mon Sep 17 00:00:00 2001 From: Masahiro Yamada Date: Thu, 3 Jan 2019 15:26:23 -0800 Subject: [PATCH 08/58] build_bug.h: remove most of dummy BUILD_BUG_ON stubs for Sparse The introduction of these dummy BUILD_BUG_ON stubs dates back to commmit 903c0c7cdc21 ("sparse: define dummy BUILD_BUG_ON definition for sparse"). At that time, BUILD_BUG_ON() was implemented with the negative array trick *and* the link-time trick, like this: extern int __build_bug_on_failed; #define BUILD_BUG_ON(condition) \ do { \ ((void)sizeof(char[1 - 2*!!(condition)])); \ if (condition) __build_bug_on_failed = 1; \ } while(0) Sparse is more strict about the negative array trick than GCC because Sparse requires the array length to be really constant. Here is the simple test code for the macro above: static const int x = 0; BUILD_BUG_ON(x); GCC is absolutely fine with it (-Wvla was enabled only very recently), but Sparse warns like this: error: bad constant expression error: cannot size expression (If you are using a newer version of Sparse, you will see a different warning message, "warning: Variable length array is used".) Anyway, Sparse was producing many false positives, and noisier than it should be at that time. With the previous commit, the leftover negative array trick is gone. Sparse is fine with the current BUILD_BUG_ON(), which is implemented by using the 'error' attribute. I am keeping the stub for BUILD_BUG_ON_ZERO(). Otherwise, Sparse would complain about the following code, which GCC is fine with: static const int x = 0; int y = BUILD_BUG_ON_ZERO(x); Link: http://lkml.kernel.org/r/1542856462-18836-3-git-send-email-yamada.masahiro@socionext.com Signed-off-by: Masahiro Yamada Acked-by: Kees Cook Reviewed-by: Luc Van Oostenryck Reviewed-by: Nick Desaulniers Tested-by: Nick Desaulniers Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/build_bug.h | 22 +++++++--------------- 1 file changed, 7 insertions(+), 15 deletions(-) diff --git a/include/linux/build_bug.h b/include/linux/build_bug.h index d415c6431441b4..faeec7433aab02 100644 --- a/include/linux/build_bug.h +++ b/include/linux/build_bug.h @@ -5,21 +5,8 @@ #include #ifdef __CHECKER__ -#define __BUILD_BUG_ON_NOT_POWER_OF_2(n) (0) -#define BUILD_BUG_ON_NOT_POWER_OF_2(n) (0) #define BUILD_BUG_ON_ZERO(e) (0) -#define BUILD_BUG_ON_INVALID(e) (0) -#define BUILD_BUG_ON_MSG(cond, msg) (0) -#define BUILD_BUG_ON(condition) (0) -#define BUILD_BUG() (0) #else /* __CHECKER__ */ - -/* Force a compilation error if a constant expression is not a power of 2 */ -#define __BUILD_BUG_ON_NOT_POWER_OF_2(n) \ - BUILD_BUG_ON(((n) & ((n) - 1)) != 0) -#define BUILD_BUG_ON_NOT_POWER_OF_2(n) \ - BUILD_BUG_ON((n) == 0 || (((n) & ((n) - 1)) != 0)) - /* * Force a compilation error if condition is true, but also produce a * result (of value 0 and type size_t), so the expression can be used @@ -27,6 +14,13 @@ * aren't permitted). */ #define BUILD_BUG_ON_ZERO(e) (sizeof(struct { int:(-!!(e)); })) +#endif /* __CHECKER__ */ + +/* Force a compilation error if a constant expression is not a power of 2 */ +#define __BUILD_BUG_ON_NOT_POWER_OF_2(n) \ + BUILD_BUG_ON(((n) & ((n) - 1)) != 0) +#define BUILD_BUG_ON_NOT_POWER_OF_2(n) \ + BUILD_BUG_ON((n) == 0 || (((n) & ((n) - 1)) != 0)) /* * BUILD_BUG_ON_INVALID() permits the compiler to check the validity of the @@ -64,6 +58,4 @@ */ #define BUILD_BUG() BUILD_BUG_ON_MSG(1, "BUILD_BUG failed") -#endif /* __CHECKER__ */ - #endif /* _LINUX_BUILD_BUG_H */ From 168e06f7937d96c7222037d8a05565e8a6eb00fe Mon Sep 17 00:00:00 2001 From: "Liu, Chuansheng" Date: Thu, 3 Jan 2019 15:26:27 -0800 Subject: [PATCH 09/58] kernel/hung_task.c: force console verbose before panic Based on commit 401c636a0eeb ("kernel/hung_task.c: show all hung tasks before panic"), we could get the call stack of hung task. However, if the console loglevel is not high, we still can not see the useful panic information in practice, and in most cases users don't set console loglevel to high level. This patch is to force console verbose before system panic, so that the real useful information can be seen in the console, instead of being like the following, which doesn't have hung task information. INFO: task init:1 blocked for more than 120 seconds. Tainted: G U W 4.19.0-quilt-2e5dc0ac-g51b6c21d76cc #1 "echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message. Kernel panic - not syncing: hung_task: blocked tasks CPU: 2 PID: 479 Comm: khungtaskd Tainted: G U W 4.19.0-quilt-2e5dc0ac-g51b6c21d76cc #1 Call Trace: dump_stack+0x4f/0x65 panic+0xde/0x231 watchdog+0x290/0x410 kthread+0x12c/0x150 ret_from_fork+0x35/0x40 reboot: panic mode set: p,w Kernel Offset: 0x34000000 from 0xffffffff81000000 (relocation range: 0xffffffff80000000-0xffffffffbfffffff) Link: http://lkml.kernel.org/r/27240C0AC20F114CBF8149A2696CBE4A6015B675@SHSMSX101.ccr.corp.intel.com Signed-off-by: Chuansheng Liu Reviewed-by: Petr Mladek Reviewed-by: Sergey Senozhatsky Cc: Tetsuo Handa Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/hung_task.c | 12 +++++------- 1 file changed, 5 insertions(+), 7 deletions(-) diff --git a/kernel/hung_task.c b/kernel/hung_task.c index cb8e3e8ac7b90b..85af0cde7f46a4 100644 --- a/kernel/hung_task.c +++ b/kernel/hung_task.c @@ -112,8 +112,11 @@ static void check_hung_task(struct task_struct *t, unsigned long timeout) trace_sched_process_hang(t); - if (!sysctl_hung_task_warnings && !sysctl_hung_task_panic) - return; + if (sysctl_hung_task_panic) { + console_verbose(); + hung_task_show_lock = true; + hung_task_call_panic = true; + } /* * Ok, the task did not get scheduled for more than 2 minutes, @@ -135,11 +138,6 @@ static void check_hung_task(struct task_struct *t, unsigned long timeout) } touch_nmi_watchdog(); - - if (sysctl_hung_task_panic) { - hung_task_show_lock = true; - hung_task_call_panic = true; - } } /* From 304ae42739b108305f8d7b3eb3c1aec7c2b643a9 Mon Sep 17 00:00:00 2001 From: Tetsuo Handa Date: Thu, 3 Jan 2019 15:26:31 -0800 Subject: [PATCH 10/58] kernel/hung_task.c: break RCU locks based on jiffies check_hung_uninterruptible_tasks() is currently calling rcu_lock_break() for every 1024 threads. But check_hung_task() is very slow if printk() was called, and is very fast otherwise. If many threads within some 1024 threads called printk(), the RCU grace period might be extended enough to trigger RCU stall warnings. Therefore, calling rcu_lock_break() for every some fixed jiffies will be safer. Link: http://lkml.kernel.org/r/1544800658-11423-1-git-send-email-penguin-kernel@I-love.SAKURA.ne.jp Signed-off-by: Tetsuo Handa Acked-by: Paul E. McKenney Cc: Petr Mladek Cc: Sergey Senozhatsky Cc: Dmitry Vyukov Cc: "Rafael J. Wysocki" Cc: Vitaly Kuznetsov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/hung_task.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/kernel/hung_task.c b/kernel/hung_task.c index 85af0cde7f46a4..4a9191617076fa 100644 --- a/kernel/hung_task.c +++ b/kernel/hung_task.c @@ -34,7 +34,7 @@ int __read_mostly sysctl_hung_task_check_count = PID_MAX_LIMIT; * is disabled during the critical section. It also controls the size of * the RCU grace period. So it needs to be upper-bound. */ -#define HUNG_TASK_BATCHING 1024 +#define HUNG_TASK_LOCK_BREAK (HZ / 10) /* * Zero means infinite timeout - no checking done: @@ -171,7 +171,7 @@ static bool rcu_lock_break(struct task_struct *g, struct task_struct *t) static void check_hung_uninterruptible_tasks(unsigned long timeout) { int max_count = sysctl_hung_task_check_count; - int batch_count = HUNG_TASK_BATCHING; + unsigned long last_break = jiffies; struct task_struct *g, *t; /* @@ -186,10 +186,10 @@ static void check_hung_uninterruptible_tasks(unsigned long timeout) for_each_process_thread(g, t) { if (!max_count--) goto unlock; - if (!--batch_count) { - batch_count = HUNG_TASK_BATCHING; + if (time_after(jiffies, last_break + HUNG_TASK_LOCK_BREAK)) { if (!rcu_lock_break(g, t)) goto unlock; + last_break = jiffies; } /* use "==" to skip the TASK_KILLABLE tasks waiting on NFS */ if (t->state == TASK_UNINTERRUPTIBLE) From 300133d372b7b541c7e7c5e8d63ea5439f9865b6 Mon Sep 17 00:00:00 2001 From: Souptick Joarder Date: Thu, 3 Jan 2019 15:26:34 -0800 Subject: [PATCH 11/58] drivers/dma-buf/udmabuf.c: convert to use vm_fault_t Use new return type vm_fault_t for fault handler. Link: http://lkml.kernel.org/r/20181106173628.GA12989@jordon-HP-15-Notebook-PC Signed-off-by: Souptick Joarder Cc: Gerd Hoffmann Cc: Sumit Semwal Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/dma-buf/udmabuf.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/dma-buf/udmabuf.c b/drivers/dma-buf/udmabuf.c index fc359ca4503d12..cd57747286f286 100644 --- a/drivers/dma-buf/udmabuf.c +++ b/drivers/dma-buf/udmabuf.c @@ -20,7 +20,7 @@ struct udmabuf { struct page **pages; }; -static int udmabuf_vm_fault(struct vm_fault *vmf) +static vm_fault_t udmabuf_vm_fault(struct vm_fault *vmf) { struct vm_area_struct *vma = vmf->vma; struct udmabuf *ubuf = vma->vm_private_data; From e6310f0fb5cd3f65244dbdef2fb264859891c7ec Mon Sep 17 00:00:00 2001 From: Alexey Dobriyan Date: Thu, 3 Jan 2019 15:26:37 -0800 Subject: [PATCH 12/58] include/linux/printk.h: drop silly "static inline asmlinkage" from dump_stack() Empty function will be inlined so asmlinkage doesn't do anything. Link: http://lkml.kernel.org/r/20181124093530.GE10969@avx2 Signed-off-by: Alexey Dobriyan Reviewed-by: Andrew Morton Acked-by: Joey Pabalinas Cc: Petr Mladek Cc: Sergey Senozhatsky Cc: Steven Rostedt Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/printk.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/linux/printk.h b/include/linux/printk.h index 55aa96975fa268..77740a506ebbb9 100644 --- a/include/linux/printk.h +++ b/include/linux/printk.h @@ -264,7 +264,7 @@ static inline void show_regs_print_info(const char *log_lvl) { } -static inline asmlinkage void dump_stack(void) +static inline void dump_stack(void) { } From 3fc2579e6f162fcff964f5aa01c8a29438ca5c05 Mon Sep 17 00:00:00 2001 From: Matthew Wilcox Date: Thu, 3 Jan 2019 15:26:41 -0800 Subject: [PATCH 13/58] fls: change parameter to unsigned int When testing in userspace, UBSAN pointed out that shifting into the sign bit is undefined behaviour. It doesn't really make sense to ask for the highest set bit of a negative value, so just turn the argument type into an unsigned int. Some architectures (eg ppc) already had it declared as an unsigned int, so I don't expect too many problems. Link: http://lkml.kernel.org/r/20181105221117.31828-1-willy@infradead.org Signed-off-by: Matthew Wilcox Acked-by: Thomas Gleixner Acked-by: Geert Uytterhoeven Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/alpha/include/asm/bitops.h | 4 ++-- arch/arc/include/asm/bitops.h | 4 ++-- arch/c6x/include/asm/bitops.h | 2 +- arch/csky/include/asm/bitops.h | 2 +- arch/hexagon/include/asm/bitops.h | 2 +- arch/ia64/include/asm/bitops.h | 3 +-- arch/m68k/include/asm/bitops.h | 2 +- arch/mips/include/asm/bitops.h | 2 +- arch/openrisc/include/asm/bitops/fls.h | 2 +- arch/parisc/include/asm/bitops.h | 2 +- arch/s390/include/asm/bitops.h | 4 ++-- arch/unicore32/include/asm/bitops.h | 2 +- arch/x86/include/asm/bitops.h | 2 +- include/asm-generic/bitops/builtin-fls.h | 2 +- include/asm-generic/bitops/fls.h | 2 +- tools/include/asm-generic/bitops/fls.h | 2 +- 16 files changed, 19 insertions(+), 20 deletions(-) diff --git a/arch/alpha/include/asm/bitops.h b/arch/alpha/include/asm/bitops.h index ca43f4d0b93751..5adca78830b5e9 100644 --- a/arch/alpha/include/asm/bitops.h +++ b/arch/alpha/include/asm/bitops.h @@ -391,9 +391,9 @@ static inline unsigned long __fls(unsigned long x) return fls64(x) - 1; } -static inline int fls(int x) +static inline int fls(unsigned int x) { - return fls64((unsigned int) x); + return fls64(x); } /* diff --git a/arch/arc/include/asm/bitops.h b/arch/arc/include/asm/bitops.h index 8da87feec59aab..ee9246184033b3 100644 --- a/arch/arc/include/asm/bitops.h +++ b/arch/arc/include/asm/bitops.h @@ -278,7 +278,7 @@ static inline __attribute__ ((const)) int clz(unsigned int x) return res; } -static inline int constant_fls(int x) +static inline int constant_fls(unsigned int x) { int r = 32; @@ -312,7 +312,7 @@ static inline int constant_fls(int x) * @result: [1-32] * fls(1) = 1, fls(0x80000000) = 32, fls(0) = 0 */ -static inline __attribute__ ((const)) int fls(unsigned long x) +static inline __attribute__ ((const)) int fls(unsigned int x) { if (__builtin_constant_p(x)) return constant_fls(x); diff --git a/arch/c6x/include/asm/bitops.h b/arch/c6x/include/asm/bitops.h index f0ab012401b67c..8b68234ace181b 100644 --- a/arch/c6x/include/asm/bitops.h +++ b/arch/c6x/include/asm/bitops.h @@ -54,7 +54,7 @@ static inline unsigned long __ffs(unsigned long x) * This is defined the same way as ffs. * Note fls(0) = 0, fls(1) = 1, fls(0x80000000) = 32. */ -static inline int fls(int x) +static inline int fls(unsigned int x) { if (!x) return 0; diff --git a/arch/csky/include/asm/bitops.h b/arch/csky/include/asm/bitops.h index 335f2883fb1ee8..43b9838bff63d1 100644 --- a/arch/csky/include/asm/bitops.h +++ b/arch/csky/include/asm/bitops.h @@ -40,7 +40,7 @@ static __always_inline unsigned long __ffs(unsigned long x) /* * asm-generic/bitops/fls.h */ -static __always_inline int fls(int x) +static __always_inline int fls(unsigned int x) { asm volatile( "ff1 %0\n" diff --git a/arch/hexagon/include/asm/bitops.h b/arch/hexagon/include/asm/bitops.h index 2691a1857d203d..bee97426238705 100644 --- a/arch/hexagon/include/asm/bitops.h +++ b/arch/hexagon/include/asm/bitops.h @@ -211,7 +211,7 @@ static inline long ffz(int x) * This is defined the same way as ffs. * Note fls(0) = 0, fls(1) = 1, fls(0x80000000) = 32. */ -static inline int fls(int x) +static inline int fls(unsigned int x) { int r; diff --git a/arch/ia64/include/asm/bitops.h b/arch/ia64/include/asm/bitops.h index 56a774bf13fa73..2f24ee6459d20f 100644 --- a/arch/ia64/include/asm/bitops.h +++ b/arch/ia64/include/asm/bitops.h @@ -388,8 +388,7 @@ ia64_fls (unsigned long x) * Find the last (most significant) bit set. Returns 0 for x==0 and * bits are numbered from 1..32 (e.g., fls(9) == 4). */ -static inline int -fls (int t) +static inline int fls(unsigned int t) { unsigned long x = t & 0xffffffffu; diff --git a/arch/m68k/include/asm/bitops.h b/arch/m68k/include/asm/bitops.h index d979f38af751cf..10133a968c8e15 100644 --- a/arch/m68k/include/asm/bitops.h +++ b/arch/m68k/include/asm/bitops.h @@ -502,7 +502,7 @@ static inline unsigned long __ffs(unsigned long x) /* * fls: find last bit set. */ -static inline int fls(int x) +static inline int fls(unsigned int x) { int cnt; diff --git a/arch/mips/include/asm/bitops.h b/arch/mips/include/asm/bitops.h index f2a840fb6a9af6..c4675957b21bca 100644 --- a/arch/mips/include/asm/bitops.h +++ b/arch/mips/include/asm/bitops.h @@ -555,7 +555,7 @@ static inline unsigned long __ffs(unsigned long word) * This is defined the same way as ffs. * Note fls(0) = 0, fls(1) = 1, fls(0x80000000) = 32. */ -static inline int fls(int x) +static inline int fls(unsigned int x) { int r; diff --git a/arch/openrisc/include/asm/bitops/fls.h b/arch/openrisc/include/asm/bitops/fls.h index 9efbf9ad86c409..57de5a1115bfd9 100644 --- a/arch/openrisc/include/asm/bitops/fls.h +++ b/arch/openrisc/include/asm/bitops/fls.h @@ -15,7 +15,7 @@ #ifdef CONFIG_OPENRISC_HAVE_INST_FL1 -static inline int fls(int x) +static inline int fls(unsigned int x) { int ret; diff --git a/arch/parisc/include/asm/bitops.h b/arch/parisc/include/asm/bitops.h index 53252d4f9a570c..a09eaebfdfd07f 100644 --- a/arch/parisc/include/asm/bitops.h +++ b/arch/parisc/include/asm/bitops.h @@ -188,7 +188,7 @@ static __inline__ int ffs(int x) * fls(0) = 0, fls(1) = 1, fls(0x80000000) = 32. */ -static __inline__ int fls(int x) +static __inline__ int fls(unsigned int x) { int ret; if (!x) diff --git a/arch/s390/include/asm/bitops.h b/arch/s390/include/asm/bitops.h index 86e5b2fdee3c8e..d1f8a4d94cca02 100644 --- a/arch/s390/include/asm/bitops.h +++ b/arch/s390/include/asm/bitops.h @@ -397,9 +397,9 @@ static inline int fls64(unsigned long word) * This is defined the same way as ffs. * Note fls(0) = 0, fls(1) = 1, fls(0x80000000) = 32. */ -static inline int fls(int word) +static inline int fls(unsigned int word) { - return fls64((unsigned int)word); + return fls64(word); } #else /* CONFIG_HAVE_MARCH_Z9_109_FEATURES */ diff --git a/arch/unicore32/include/asm/bitops.h b/arch/unicore32/include/asm/bitops.h index c0cbdbe1716809..de5853761c22ca 100644 --- a/arch/unicore32/include/asm/bitops.h +++ b/arch/unicore32/include/asm/bitops.h @@ -22,7 +22,7 @@ * the cntlz instruction for much better code efficiency. */ -static inline int fls(int x) +static inline int fls(unsigned int x) { int ret; diff --git a/arch/x86/include/asm/bitops.h b/arch/x86/include/asm/bitops.h index 124f9195eb3ef5..ad7b210aa3f621 100644 --- a/arch/x86/include/asm/bitops.h +++ b/arch/x86/include/asm/bitops.h @@ -448,7 +448,7 @@ static __always_inline int ffs(int x) * set bit if value is nonzero. The last (most significant) bit is * at position 32. */ -static __always_inline int fls(int x) +static __always_inline int fls(unsigned int x) { int r; diff --git a/include/asm-generic/bitops/builtin-fls.h b/include/asm-generic/bitops/builtin-fls.h index 62daf940989df9..c8455cc28841af 100644 --- a/include/asm-generic/bitops/builtin-fls.h +++ b/include/asm-generic/bitops/builtin-fls.h @@ -9,7 +9,7 @@ * This is defined the same way as ffs. * Note fls(0) = 0, fls(1) = 1, fls(0x80000000) = 32. */ -static __always_inline int fls(int x) +static __always_inline int fls(unsigned int x) { return x ? sizeof(x) * 8 - __builtin_clz(x) : 0; } diff --git a/include/asm-generic/bitops/fls.h b/include/asm-generic/bitops/fls.h index 753aecaab641a8..b168bb10e1be17 100644 --- a/include/asm-generic/bitops/fls.h +++ b/include/asm-generic/bitops/fls.h @@ -10,7 +10,7 @@ * Note fls(0) = 0, fls(1) = 1, fls(0x80000000) = 32. */ -static __always_inline int fls(int x) +static __always_inline int fls(unsigned int x) { int r = 32; diff --git a/tools/include/asm-generic/bitops/fls.h b/tools/include/asm-generic/bitops/fls.h index 753aecaab641a8..b168bb10e1be17 100644 --- a/tools/include/asm-generic/bitops/fls.h +++ b/tools/include/asm-generic/bitops/fls.h @@ -10,7 +10,7 @@ * Note fls(0) = 0, fls(1) = 1, fls(0x80000000) = 32. */ -static __always_inline int fls(int x) +static __always_inline int fls(unsigned int x) { int r = 32; From 52fbf1134d479234d7e64ba9dcbaea23405f229e Mon Sep 17 00:00:00 2001 From: Alexey Skidanov Date: Thu, 3 Jan 2019 15:26:44 -0800 Subject: [PATCH 14/58] lib/genalloc.c: fix allocation of aligned buffer from non-aligned chunk gen_pool_alloc_algo() uses different allocation functions implementing different allocation algorithms. With gen_pool_first_fit_align() allocation function, the returned address should be aligned on the requested boundary. If chunk start address isn't aligned on the requested boundary, the returned address isn't aligned too. The only way to get properly aligned address is to initialize the pool with chunks aligned on the requested boundary. If want to have an ability to allocate buffers aligned on different boundaries (for example, 4K, 1MB, ...), the chunk start address should be aligned on the max possible alignment. This happens because gen_pool_first_fit_align() looks for properly aligned memory block without taking into account the chunk start address alignment. To fix this, we provide chunk start address to gen_pool_first_fit_align() and change its implementation such that it starts looking for properly aligned block with appropriate offset (exactly as is done in CMA). Link: https://lkml.kernel.org/lkml/a170cf65-6884-3592-1de9-4c235888cc8a@intel.com Link: http://lkml.kernel.org/r/1541690953-4623-1-git-send-email-alexey.skidanov@intel.com Signed-off-by: Alexey Skidanov Reviewed-by: Andrew Morton Cc: Logan Gunthorpe Cc: Daniel Mentz Cc: Mathieu Desnoyers Cc: Laura Abbott Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/genalloc.h | 13 +++++++------ lib/genalloc.c | 20 ++++++++++++-------- 2 files changed, 19 insertions(+), 14 deletions(-) diff --git a/include/linux/genalloc.h b/include/linux/genalloc.h index 872f930f1b06d4..dd0a452373e715 100644 --- a/include/linux/genalloc.h +++ b/include/linux/genalloc.h @@ -51,7 +51,8 @@ typedef unsigned long (*genpool_algo_t)(unsigned long *map, unsigned long size, unsigned long start, unsigned int nr, - void *data, struct gen_pool *pool); + void *data, struct gen_pool *pool, + unsigned long start_addr); /* * General purpose special memory pool descriptor. @@ -131,24 +132,24 @@ extern void gen_pool_set_algo(struct gen_pool *pool, genpool_algo_t algo, extern unsigned long gen_pool_first_fit(unsigned long *map, unsigned long size, unsigned long start, unsigned int nr, void *data, - struct gen_pool *pool); + struct gen_pool *pool, unsigned long start_addr); extern unsigned long gen_pool_fixed_alloc(unsigned long *map, unsigned long size, unsigned long start, unsigned int nr, - void *data, struct gen_pool *pool); + void *data, struct gen_pool *pool, unsigned long start_addr); extern unsigned long gen_pool_first_fit_align(unsigned long *map, unsigned long size, unsigned long start, unsigned int nr, - void *data, struct gen_pool *pool); + void *data, struct gen_pool *pool, unsigned long start_addr); extern unsigned long gen_pool_first_fit_order_align(unsigned long *map, unsigned long size, unsigned long start, unsigned int nr, - void *data, struct gen_pool *pool); + void *data, struct gen_pool *pool, unsigned long start_addr); extern unsigned long gen_pool_best_fit(unsigned long *map, unsigned long size, unsigned long start, unsigned int nr, void *data, - struct gen_pool *pool); + struct gen_pool *pool, unsigned long start_addr); extern struct gen_pool *devm_gen_pool_create(struct device *dev, diff --git a/lib/genalloc.c b/lib/genalloc.c index ca06adc4f44510..5deb25c40a5a14 100644 --- a/lib/genalloc.c +++ b/lib/genalloc.c @@ -311,7 +311,7 @@ unsigned long gen_pool_alloc_algo(struct gen_pool *pool, size_t size, end_bit = chunk_size(chunk) >> order; retry: start_bit = algo(chunk->bits, end_bit, start_bit, - nbits, data, pool); + nbits, data, pool, chunk->start_addr); if (start_bit >= end_bit) continue; remain = bitmap_set_ll(chunk->bits, start_bit, nbits); @@ -525,7 +525,7 @@ EXPORT_SYMBOL(gen_pool_set_algo); */ unsigned long gen_pool_first_fit(unsigned long *map, unsigned long size, unsigned long start, unsigned int nr, void *data, - struct gen_pool *pool) + struct gen_pool *pool, unsigned long start_addr) { return bitmap_find_next_zero_area(map, size, start, nr, 0); } @@ -543,16 +543,19 @@ EXPORT_SYMBOL(gen_pool_first_fit); */ unsigned long gen_pool_first_fit_align(unsigned long *map, unsigned long size, unsigned long start, unsigned int nr, void *data, - struct gen_pool *pool) + struct gen_pool *pool, unsigned long start_addr) { struct genpool_data_align *alignment; - unsigned long align_mask; + unsigned long align_mask, align_off; int order; alignment = data; order = pool->min_alloc_order; align_mask = ((alignment->align + (1UL << order) - 1) >> order) - 1; - return bitmap_find_next_zero_area(map, size, start, nr, align_mask); + align_off = (start_addr & (alignment->align - 1)) >> order; + + return bitmap_find_next_zero_area_off(map, size, start, nr, + align_mask, align_off); } EXPORT_SYMBOL(gen_pool_first_fit_align); @@ -567,7 +570,7 @@ EXPORT_SYMBOL(gen_pool_first_fit_align); */ unsigned long gen_pool_fixed_alloc(unsigned long *map, unsigned long size, unsigned long start, unsigned int nr, void *data, - struct gen_pool *pool) + struct gen_pool *pool, unsigned long start_addr) { struct genpool_data_fixed *fixed_data; int order; @@ -601,7 +604,8 @@ EXPORT_SYMBOL(gen_pool_fixed_alloc); */ unsigned long gen_pool_first_fit_order_align(unsigned long *map, unsigned long size, unsigned long start, - unsigned int nr, void *data, struct gen_pool *pool) + unsigned int nr, void *data, struct gen_pool *pool, + unsigned long start_addr) { unsigned long align_mask = roundup_pow_of_two(nr) - 1; @@ -624,7 +628,7 @@ EXPORT_SYMBOL(gen_pool_first_fit_order_align); */ unsigned long gen_pool_best_fit(unsigned long *map, unsigned long size, unsigned long start, unsigned int nr, void *data, - struct gen_pool *pool) + struct gen_pool *pool, unsigned long start_addr) { unsigned long start_bit = size; unsigned long len = size + 1; From 439e00b76a5fb1662e3ae49fc48bd3f950575b9d Mon Sep 17 00:00:00 2001 From: Yury Norov Date: Thu, 3 Jan 2019 15:26:48 -0800 Subject: [PATCH 15/58] lib/find_bit_benchmark.c: align test_find_next_and_bit with others Contrary to other tests, test_find_next_and_bit() test uses tab formatting in output and get_cycles() instead of ktime_get(). get_cycles() is not supported by some arches, so ktime_get() fits better in generic code. Fix it and minor style issues, so the output looks like this: Start testing find_bit() with random-filled bitmap find_next_bit: 7142816 ns, 163282 iterations find_next_zero_bit: 8545712 ns, 164399 iterations find_last_bit: 6332032 ns, 163282 iterations find_first_bit: 20509424 ns, 16606 iterations find_next_and_bit: 4060016 ns, 73424 iterations Start testing find_bit() with sparse bitmap find_next_bit: 55984 ns, 656 iterations find_next_zero_bit: 19197536 ns, 327025 iterations find_last_bit: 65088 ns, 656 iterations find_first_bit: 5923712 ns, 656 iterations find_next_and_bit: 29088 ns, 1 iterations Link: http://lkml.kernel.org/r/20181123174803.10916-1-ynorov@caviumnetworks.com Signed-off-by: Yury Norov Reviewed-by: Andrew Morton Cc: "Norov, Yuri" Cc: Clement Courbet Cc: Geert Uytterhoeven Cc: Alexey Dobriyan Cc: Matthew Wilcox Cc: Rasmus Villemoes Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- lib/find_bit_benchmark.c | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) diff --git a/lib/find_bit_benchmark.c b/lib/find_bit_benchmark.c index 5367ffa5c18f9c..f0e394dd2beba0 100644 --- a/lib/find_bit_benchmark.c +++ b/lib/find_bit_benchmark.c @@ -108,14 +108,13 @@ static int __init test_find_next_and_bit(const void *bitmap, const void *bitmap2, unsigned long len) { unsigned long i, cnt; - cycles_t cycles; + ktime_t time; - cycles = get_cycles(); + time = ktime_get(); for (cnt = i = 0; i < BITMAP_LEN; cnt++) - i = find_next_and_bit(bitmap, bitmap2, BITMAP_LEN, i+1); - cycles = get_cycles() - cycles; - pr_err("find_next_and_bit:\t\t%llu cycles, %ld iterations\n", - (u64)cycles, cnt); + i = find_next_and_bit(bitmap, bitmap2, BITMAP_LEN, i + 1); + time = ktime_get() - time; + pr_err("find_next_and_bit: %18llu ns, %6ld iterations\n", time, cnt); return 0; } From 6862d2fc81859f88c1f3f660886427893f2b4f3f Mon Sep 17 00:00:00 2001 From: Huang Shijie Date: Thu, 3 Jan 2019 15:26:51 -0800 Subject: [PATCH 16/58] lib/genalloc.c: use vzalloc_node() to allocate the bitmap Some devices may have big memory on chip, such as over 1G. In some cases, the nbytes maybe bigger then 4M which is the bounday of the memory buddy system (4K default). So use vzalloc_node() to allocate the bitmap. Also use vfree to free it. Link: http://lkml.kernel.org/r/20181225015701.6289-1-sjhuang@iluvatar.ai Signed-off-by: Huang Shijie Reviewed-by: Andrew Morton Cc: Alexey Skidanov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- lib/genalloc.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/lib/genalloc.c b/lib/genalloc.c index 5deb25c40a5a14..f365d71cdc774c 100644 --- a/lib/genalloc.c +++ b/lib/genalloc.c @@ -187,7 +187,7 @@ int gen_pool_add_virt(struct gen_pool *pool, unsigned long virt, phys_addr_t phy int nbytes = sizeof(struct gen_pool_chunk) + BITS_TO_LONGS(nbits) * sizeof(long); - chunk = kzalloc_node(nbytes, GFP_KERNEL, nid); + chunk = vzalloc_node(nbytes, nid); if (unlikely(chunk == NULL)) return -ENOMEM; @@ -251,7 +251,7 @@ void gen_pool_destroy(struct gen_pool *pool) bit = find_next_bit(chunk->bits, end_bit, 0); BUG_ON(bit < end_bit); - kfree(chunk); + vfree(chunk); } kfree_const(pool->name); kfree(pool); From 05391772a72d49c37e6e4cb8e871be57beb66155 Mon Sep 17 00:00:00 2001 From: "huang.zijiang" Date: Thu, 3 Jan 2019 15:26:55 -0800 Subject: [PATCH 17/58] drivers/firmware/memmap.c: modify memblock_alloc to memblock_alloc_nopanic memblock_alloc() never returns NULL because panic never returns. Link: http://lkml.kernel.org/r/1545640882-42009-1-git-send-email-huang.zijiang@zte.com.cn Signed-off-by: huang.zijiang Acked-by: Mike Rapoport Cc: Michal Hocko Cc: Stephen Rothwell Cc: Michael Ellerman Cc: Yi Wang Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/firmware/memmap.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/firmware/memmap.c b/drivers/firmware/memmap.c index d168c87c7d3085..ec4fd253a4e92a 100644 --- a/drivers/firmware/memmap.c +++ b/drivers/firmware/memmap.c @@ -333,7 +333,7 @@ int __init firmware_map_add_early(u64 start, u64 end, const char *type) { struct firmware_map_entry *entry; - entry = memblock_alloc(sizeof(struct firmware_map_entry), + entry = memblock_alloc_nopanic(sizeof(struct firmware_map_entry), SMP_CACHE_BYTES); if (WARN_ON(!entry)) return -ENOMEM; From 77b8c0a8e47484e205b01dfedcd224770aa9d800 Mon Sep 17 00:00:00 2001 From: Joe Perches Date: Thu, 3 Jan 2019 15:26:59 -0800 Subject: [PATCH 18/58] checkpatch: warn on const char foo[] = "bar"; declarations These declarations should generally be static const to avoid poor compilation and runtime performance where compilers tend to initialize the const declaration for every call instead of using .rodata for the string. Miscellanea: - Convert spaces to tabs for indentation in 2 adjacent checks Link: http://lkml.kernel.org/r/10ea5f4b087dc911e41e187a4a2b5e79c7529aa3.camel@perches.com Signed-off-by: Joe Perches Cc: Rasmus Villemoes Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- scripts/checkpatch.pl | 13 +++++++++++-- 1 file changed, 11 insertions(+), 2 deletions(-) diff --git a/scripts/checkpatch.pl b/scripts/checkpatch.pl index 377f373db6c002..93e84c9504a15f 100755 --- a/scripts/checkpatch.pl +++ b/scripts/checkpatch.pl @@ -3890,14 +3890,23 @@ sub process { WARN("STATIC_CONST_CHAR_ARRAY", "static const char * array should probably be static const char * const\n" . $herecurr); - } + } + +# check for initialized const char arrays that should be static const + if ($line =~ /^\+\s*const\s+(char|unsigned\s+char|_*u8|(?:[us]_)?int8_t)\s+\w+\s*\[\s*(?:\w+\s*)?\]\s*=\s*"/) { + if (WARN("STATIC_CONST_CHAR_ARRAY", + "const array should probably be static const\n" . $herecurr) && + $fix) { + $fixed[$fixlinenr] =~ s/(^.\s*)const\b/${1}static const/; + } + } # check for static char foo[] = "bar" declarations. if ($line =~ /\bstatic\s+char\s+(\w+)\s*\[\s*\]\s*=\s*"/) { WARN("STATIC_CONST_CHAR_ARRAY", "static char array declaration should probably be static const char\n" . $herecurr); - } + } # check for const const where is not a pointer or array type if ($sline =~ /\bconst\s+($BasicType)\s+const\b/) { From 74bdc129850c32eaddc625ce557da560303fbf25 Mon Sep 17 00:00:00 2001 From: Davidlohr Bueso Date: Thu, 3 Jan 2019 15:27:02 -0800 Subject: [PATCH 19/58] fs/epoll: remove max_nests argument from ep_call_nested() Patch series "epoll: some miscellaneous optimizations". The following are some incremental optimizations on some of the epoll core. Each patch has the details, but together, the series is seen to shave off measurable cycles on a number of systems and workloads. For example, on a 40-core IB, a pipetest as well as parallel epoll_wait() benchmark show around a 20-30% increase in raw operations per second when the box is fully occupied (incremental thread counts), and up to 15% performance improvement with lower counts. Passes ltp epoll related testcases. This patch(of 6): All callers pass the EP_MAX_NESTS constant already, so lets simplify this a tad and get rid of the redundant parameter for nested eventpolls. Link: http://lkml.kernel.org/r/20181108051006.18751-2-dave@stgolabs.net Signed-off-by: Davidlohr Bueso Reviewed-by: Andrew Morton Cc: Al Viro Cc: Jason Baron Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/eventpoll.c | 14 ++++++-------- 1 file changed, 6 insertions(+), 8 deletions(-) diff --git a/fs/eventpoll.c b/fs/eventpoll.c index 8a5a1010886bad..be50799737f4c3 100644 --- a/fs/eventpoll.c +++ b/fs/eventpoll.c @@ -471,7 +471,6 @@ static inline void ep_set_busy_poll_napi_id(struct epitem *epi) * no re-entered. * * @ncalls: Pointer to the nested_calls structure to be used for this call. - * @max_nests: Maximum number of allowed nesting calls. * @nproc: Nested call core function pointer. * @priv: Opaque data to be passed to the @nproc callback. * @cookie: Cookie to be used to identify this nested call. @@ -480,7 +479,7 @@ static inline void ep_set_busy_poll_napi_id(struct epitem *epi) * Returns: Returns the code returned by the @nproc callback, or -1 if * the maximum recursion limit has been exceeded. */ -static int ep_call_nested(struct nested_calls *ncalls, int max_nests, +static int ep_call_nested(struct nested_calls *ncalls, int (*nproc)(void *, void *, int), void *priv, void *cookie, void *ctx) { @@ -499,7 +498,7 @@ static int ep_call_nested(struct nested_calls *ncalls, int max_nests, */ list_for_each_entry(tncur, lsthead, llink) { if (tncur->ctx == ctx && - (tncur->cookie == cookie || ++call_nests > max_nests)) { + (tncur->cookie == cookie || ++call_nests > EP_MAX_NESTS)) { /* * Ops ... loop detected or maximum nest level reached. * We abort this wake by breaking the cycle itself. @@ -573,7 +572,7 @@ static void ep_poll_safewake(wait_queue_head_t *wq) { int this_cpu = get_cpu(); - ep_call_nested(&poll_safewake_ncalls, EP_MAX_NESTS, + ep_call_nested(&poll_safewake_ncalls, ep_poll_wakeup_proc, NULL, wq, (void *) (long) this_cpu); put_cpu(); @@ -1333,7 +1332,6 @@ static int reverse_path_check_proc(void *priv, void *cookie, int call_nests) } } else { error = ep_call_nested(&poll_loop_ncalls, - EP_MAX_NESTS, reverse_path_check_proc, child_file, child_file, current); @@ -1367,7 +1365,7 @@ static int reverse_path_check(void) /* let's call this for all tfiles */ list_for_each_entry(current_file, &tfile_check_list, f_tfile_llink) { path_count_init(); - error = ep_call_nested(&poll_loop_ncalls, EP_MAX_NESTS, + error = ep_call_nested(&poll_loop_ncalls, reverse_path_check_proc, current_file, current_file, current); if (error) @@ -1876,7 +1874,7 @@ static int ep_loop_check_proc(void *priv, void *cookie, int call_nests) ep_tovisit = epi->ffd.file->private_data; if (ep_tovisit->visited) continue; - error = ep_call_nested(&poll_loop_ncalls, EP_MAX_NESTS, + error = ep_call_nested(&poll_loop_ncalls, ep_loop_check_proc, epi->ffd.file, ep_tovisit, current); if (error != 0) @@ -1916,7 +1914,7 @@ static int ep_loop_check(struct eventpoll *ep, struct file *file) int ret; struct eventpoll *ep_cur, *ep_next; - ret = ep_call_nested(&poll_loop_ncalls, EP_MAX_NESTS, + ret = ep_call_nested(&poll_loop_ncalls, ep_loop_check_proc, file, ep, current); /* clear visited list */ list_for_each_entry_safe(ep_cur, ep_next, &visited_list, From 4e0982a00564c80cb849a892043450860ef91e14 Mon Sep 17 00:00:00 2001 From: Davidlohr Bueso Date: Thu, 3 Jan 2019 15:27:05 -0800 Subject: [PATCH 20/58] fs/epoll: simplify ep_send_events_proc() ready-list loop The current logic is a bit convoluted. Lets simplify this with a standard list_for_each_entry_safe() loop instead and just break out after maxevents is reached. While at it, remove an unnecessary indentation level in the loop when there are in fact ready events. Link: http://lkml.kernel.org/r/20181108051006.18751-3-dave@stgolabs.net Signed-off-by: Davidlohr Bueso Reviewed-by: Andrew Morton Cc: Al Viro Cc: Jason Baron Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/eventpoll.c | 73 +++++++++++++++++++++++++------------------------- 1 file changed, 37 insertions(+), 36 deletions(-) diff --git a/fs/eventpoll.c b/fs/eventpoll.c index be50799737f4c3..aaf614ee08e4f1 100644 --- a/fs/eventpoll.c +++ b/fs/eventpoll.c @@ -1624,21 +1624,22 @@ static __poll_t ep_send_events_proc(struct eventpoll *ep, struct list_head *head { struct ep_send_events_data *esed = priv; __poll_t revents; - struct epitem *epi; - struct epoll_event __user *uevent; + struct epitem *epi, *tmp; + struct epoll_event __user *uevent = esed->events; struct wakeup_source *ws; poll_table pt; init_poll_funcptr(&pt, NULL); + esed->res = 0; /* * We can loop without lock because we are passed a task private list. * Items cannot vanish during the loop because ep_scan_ready_list() is * holding "mtx" during this call. */ - for (esed->res = 0, uevent = esed->events; - !list_empty(head) && esed->res < esed->maxevents;) { - epi = list_first_entry(head, struct epitem, rdllink); + list_for_each_entry_safe(epi, tmp, head, rdllink) { + if (esed->res >= esed->maxevents) + break; /* * Activate ep->ws before deactivating epi->ws to prevent @@ -1658,42 +1659,42 @@ static __poll_t ep_send_events_proc(struct eventpoll *ep, struct list_head *head list_del_init(&epi->rdllink); - revents = ep_item_poll(epi, &pt, 1); - /* * If the event mask intersect the caller-requested one, * deliver the event to userspace. Again, ep_scan_ready_list() - * is holding "mtx", so no operations coming from userspace + * is holding ep->mtx, so no operations coming from userspace * can change the item. */ - if (revents) { - if (__put_user(revents, &uevent->events) || - __put_user(epi->event.data, &uevent->data)) { - list_add(&epi->rdllink, head); - ep_pm_stay_awake(epi); - if (!esed->res) - esed->res = -EFAULT; - return 0; - } - esed->res++; - uevent++; - if (epi->event.events & EPOLLONESHOT) - epi->event.events &= EP_PRIVATE_BITS; - else if (!(epi->event.events & EPOLLET)) { - /* - * If this file has been added with Level - * Trigger mode, we need to insert back inside - * the ready list, so that the next call to - * epoll_wait() will check again the events - * availability. At this point, no one can insert - * into ep->rdllist besides us. The epoll_ctl() - * callers are locked out by - * ep_scan_ready_list() holding "mtx" and the - * poll callback will queue them in ep->ovflist. - */ - list_add_tail(&epi->rdllink, &ep->rdllist); - ep_pm_stay_awake(epi); - } + revents = ep_item_poll(epi, &pt, 1); + if (!revents) + continue; + + if (__put_user(revents, &uevent->events) || + __put_user(epi->event.data, &uevent->data)) { + list_add(&epi->rdllink, head); + ep_pm_stay_awake(epi); + if (!esed->res) + esed->res = -EFAULT; + return 0; + } + esed->res++; + uevent++; + if (epi->event.events & EPOLLONESHOT) + epi->event.events &= EP_PRIVATE_BITS; + else if (!(epi->event.events & EPOLLET)) { + /* + * If this file has been added with Level + * Trigger mode, we need to insert back inside + * the ready list, so that the next call to + * epoll_wait() will check again the events + * availability. At this point, no one can insert + * into ep->rdllist besides us. The epoll_ctl() + * callers are locked out by + * ep_scan_ready_list() holding "mtx" and the + * poll callback will queue them in ep->ovflist. + */ + list_add_tail(&epi->rdllink, &ep->rdllist); + ep_pm_stay_awake(epi); } } From 76699a67f3041ff4c7af6d6ee9be2bfbf1ffb671 Mon Sep 17 00:00:00 2001 From: Davidlohr Bueso Date: Thu, 3 Jan 2019 15:27:09 -0800 Subject: [PATCH 21/58] fs/epoll: drop ovflist branch prediction The ep->ovflist is a secondary ready-list to temporarily store events that might occur when doing sproc without holding the ep->wq.lock. This accounts for every time we check for ready events and also send events back to userspace; both callbacks, particularly the latter because of copy_to_user, can account for a non-trivial time. As such, the unlikely() check to see if the pointer is being used, seems both misleading and sub-optimal. In fact, we go to an awful lot of trouble to sync both lists, and populating the ovflist is far from an uncommon scenario. For example, profiling a concurrent epoll_wait(2) benchmark, with CONFIG_PROFILE_ANNOTATED_BRANCHES shows that for a two threads a 33% incorrect rate was seen; and when incrementally increasing the number of epoll instances (which is used, for example for multiple queuing load balancing models), up to a 90% incorrect rate was seen. Similarly, by deleting the prediction, 3% throughput boost was seen across incremental threads. Link: http://lkml.kernel.org/r/20181108051006.18751-4-dave@stgolabs.net Signed-off-by: Davidlohr Bueso Reviewed-by: Andrew Morton Cc: Al Viro Cc: Jason Baron Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/eventpoll.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/eventpoll.c b/fs/eventpoll.c index aaf614ee08e4f1..fb7f05f4099d20 100644 --- a/fs/eventpoll.c +++ b/fs/eventpoll.c @@ -1153,7 +1153,7 @@ static int ep_poll_callback(wait_queue_entry_t *wait, unsigned mode, int sync, v * semantics). All the events that happen during that period of time are * chained in ep->ovflist and requeued later on. */ - if (unlikely(ep->ovflist != EP_UNACTIVE_PTR)) { + if (ep->ovflist != EP_UNACTIVE_PTR) { if (epi->next == EP_UNACTIVE_PTR) { epi->next = ep->ovflist; ep->ovflist = epi; From 21877e1a5b520132f54515f8835c963056418b4c Mon Sep 17 00:00:00 2001 From: Davidlohr Bueso Date: Thu, 3 Jan 2019 15:27:12 -0800 Subject: [PATCH 22/58] fs/epoll: robustify ep->mtx held checks Insted of just commenting how important it is, lets make it more robust and add a lockdep_assert_held() call. Link: http://lkml.kernel.org/r/20181108051006.18751-5-dave@stgolabs.net Signed-off-by: Davidlohr Bueso Reviewed-by: Andrew Morton Cc: Al Viro Cc: Jason Baron Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/eventpoll.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/fs/eventpoll.c b/fs/eventpoll.c index fb7f05f4099d20..68e27d1cb5cd58 100644 --- a/fs/eventpoll.c +++ b/fs/eventpoll.c @@ -1637,6 +1637,8 @@ static __poll_t ep_send_events_proc(struct eventpoll *ep, struct list_head *head * Items cannot vanish during the loop because ep_scan_ready_list() is * holding "mtx" during this call. */ + lockdep_assert_held(&ep->mtx); + list_for_each_entry_safe(epi, tmp, head, rdllink) { if (esed->res >= esed->maxevents) break; From c5a282e9635e9c7382821565083db5d260085e3e Mon Sep 17 00:00:00 2001 From: Davidlohr Bueso Date: Thu, 3 Jan 2019 15:27:15 -0800 Subject: [PATCH 23/58] fs/epoll: reduce the scope of wq lock in epoll_wait() This patch aims at reducing ep wq.lock hold times in epoll_wait(2). For the blocking case, there is no need to constantly take and drop the spinlock, which is only needed to manipulate the waitqueue. The call to ep_events_available() is now lockless, and only exposed to benign races. Here, if false positive (returns available events and does not see another thread deleting an epi from the list) we call into send_events and then the list's state is correctly seen. Otoh, if a false negative and we don't see a list_add_tail(), for example, from irq callback, then it is rechecked again before blocking, which will see the correct state. In order for more accuracy to see concurrent list_del_init(), use the list_empty_careful() variant -- of course, this won't be safe against insertions from wakeup. For the overflow list we obviously need to prevent load/store tearing as we don't want to see partial values while the ready list is disabled. [dave@stgolabs.net: forgotten fixlets] Link: http://lkml.kernel.org/r/20181109155258.jxcr4t2pnz6zqct3@linux-r8p5 Link: http://lkml.kernel.org/r/20181108051006.18751-6-dave@stgolabs.net Signed-off-by: Davidlohr Bueso Suggested-by: Jason Baron Cc: Al Viro Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/eventpoll.c | 114 ++++++++++++++++++++++++++----------------------- 1 file changed, 60 insertions(+), 54 deletions(-) diff --git a/fs/eventpoll.c b/fs/eventpoll.c index 68e27d1cb5cd58..b42f53d64d11b1 100644 --- a/fs/eventpoll.c +++ b/fs/eventpoll.c @@ -381,7 +381,8 @@ static void ep_nested_calls_init(struct nested_calls *ncalls) */ static inline int ep_events_available(struct eventpoll *ep) { - return !list_empty(&ep->rdllist) || ep->ovflist != EP_UNACTIVE_PTR; + return !list_empty_careful(&ep->rdllist) || + READ_ONCE(ep->ovflist) != EP_UNACTIVE_PTR; } #ifdef CONFIG_NET_RX_BUSY_POLL @@ -698,7 +699,7 @@ static __poll_t ep_scan_ready_list(struct eventpoll *ep, */ spin_lock_irq(&ep->wq.lock); list_splice_init(&ep->rdllist, &txlist); - ep->ovflist = NULL; + WRITE_ONCE(ep->ovflist, NULL); spin_unlock_irq(&ep->wq.lock); /* @@ -712,7 +713,7 @@ static __poll_t ep_scan_ready_list(struct eventpoll *ep, * other events might have been queued by the poll callback. * We re-insert them inside the main ready-list here. */ - for (nepi = ep->ovflist; (epi = nepi) != NULL; + for (nepi = READ_ONCE(ep->ovflist); (epi = nepi) != NULL; nepi = epi->next, epi->next = EP_UNACTIVE_PTR) { /* * We need to check if the item is already in the list. @@ -730,7 +731,7 @@ static __poll_t ep_scan_ready_list(struct eventpoll *ep, * releasing the lock, events will be queued in the normal way inside * ep->rdllist. */ - ep->ovflist = EP_UNACTIVE_PTR; + WRITE_ONCE(ep->ovflist, EP_UNACTIVE_PTR); /* * Quickly re-inject items left on "txlist". @@ -1153,10 +1154,10 @@ static int ep_poll_callback(wait_queue_entry_t *wait, unsigned mode, int sync, v * semantics). All the events that happen during that period of time are * chained in ep->ovflist and requeued later on. */ - if (ep->ovflist != EP_UNACTIVE_PTR) { + if (READ_ONCE(ep->ovflist) != EP_UNACTIVE_PTR) { if (epi->next == EP_UNACTIVE_PTR) { - epi->next = ep->ovflist; - ep->ovflist = epi; + epi->next = READ_ONCE(ep->ovflist); + WRITE_ONCE(ep->ovflist, epi); if (epi->ws) { /* * Activate ep->ws since epi->ws may get @@ -1762,10 +1763,17 @@ static int ep_poll(struct eventpoll *ep, struct epoll_event __user *events, } else if (timeout == 0) { /* * Avoid the unnecessary trip to the wait queue loop, if the - * caller specified a non blocking operation. + * caller specified a non blocking operation. We still need + * lock because we could race and not see an epi being added + * to the ready list while in irq callback. Thus incorrectly + * returning 0 back to userspace. */ timed_out = 1; + spin_lock_irq(&ep->wq.lock); + eavail = ep_events_available(ep); + spin_unlock_irq(&ep->wq.lock); + goto check_events; } @@ -1774,64 +1782,62 @@ static int ep_poll(struct eventpoll *ep, struct epoll_event __user *events, if (!ep_events_available(ep)) ep_busy_loop(ep, timed_out); + eavail = ep_events_available(ep); + if (eavail) + goto check_events; + + /* + * Busy poll timed out. Drop NAPI ID for now, we can add + * it back in when we have moved a socket with a valid NAPI + * ID onto the ready list. + */ + ep_reset_busy_poll_napi_id(ep); + + /* + * We don't have any available event to return to the caller. + * We need to sleep here, and we will be wake up by + * ep_poll_callback() when events will become available. + */ + init_waitqueue_entry(&wait, current); spin_lock_irq(&ep->wq.lock); + __add_wait_queue_exclusive(&ep->wq, &wait); + spin_unlock_irq(&ep->wq.lock); - if (!ep_events_available(ep)) { + for (;;) { /* - * Busy poll timed out. Drop NAPI ID for now, we can add - * it back in when we have moved a socket with a valid NAPI - * ID onto the ready list. + * We don't want to sleep if the ep_poll_callback() sends us + * a wakeup in between. That's why we set the task state + * to TASK_INTERRUPTIBLE before doing the checks. */ - ep_reset_busy_poll_napi_id(ep); - + set_current_state(TASK_INTERRUPTIBLE); /* - * We don't have any available event to return to the caller. - * We need to sleep here, and we will be wake up by - * ep_poll_callback() when events will become available. + * Always short-circuit for fatal signals to allow + * threads to make a timely exit without the chance of + * finding more events available and fetching + * repeatedly. */ - init_waitqueue_entry(&wait, current); - __add_wait_queue_exclusive(&ep->wq, &wait); - - for (;;) { - /* - * We don't want to sleep if the ep_poll_callback() sends us - * a wakeup in between. That's why we set the task state - * to TASK_INTERRUPTIBLE before doing the checks. - */ - set_current_state(TASK_INTERRUPTIBLE); - /* - * Always short-circuit for fatal signals to allow - * threads to make a timely exit without the chance of - * finding more events available and fetching - * repeatedly. - */ - if (fatal_signal_pending(current)) { - res = -EINTR; - break; - } - if (ep_events_available(ep) || timed_out) - break; - if (signal_pending(current)) { - res = -EINTR; - break; - } - - spin_unlock_irq(&ep->wq.lock); - if (!schedule_hrtimeout_range(to, slack, HRTIMER_MODE_ABS)) - timed_out = 1; - - spin_lock_irq(&ep->wq.lock); + if (fatal_signal_pending(current)) { + res = -EINTR; + break; + } + if (ep_events_available(ep) || timed_out) + break; + if (signal_pending(current)) { + res = -EINTR; + break; } - __remove_wait_queue(&ep->wq, &wait); - __set_current_state(TASK_RUNNING); + if (!schedule_hrtimeout_range(to, slack, HRTIMER_MODE_ABS)) + timed_out = 1; } -check_events: - /* Is it worth to try to dig for events ? */ - eavail = ep_events_available(ep); + __set_current_state(TASK_RUNNING); + + spin_lock_irq(&ep->wq.lock); + __remove_wait_queue(&ep->wq, &wait); spin_unlock_irq(&ep->wq.lock); +check_events: /* * Try to transfer events to user space. In case we get 0 events and * there's still timeout left over, we go trying again in search of From abc610e01c663e25c41a3bdcbc4115cd7fbb047b Mon Sep 17 00:00:00 2001 From: Davidlohr Bueso Date: Thu, 3 Jan 2019 15:27:19 -0800 Subject: [PATCH 24/58] fs/epoll: avoid barrier after an epoll_wait(2) timeout Upon timeout, we can just exit out of the loop, without the cost of the changing the task's state with an smp_store_mb call. Just exit out of the loop and be done - setting the task state afterwards will be, of course, redundant. [dave@stgolabs.net: forgotten fixlets] Link: http://lkml.kernel.org/r/20181109155258.jxcr4t2pnz6zqct3@linux-r8p5 Link: http://lkml.kernel.org/r/20181108051006.18751-7-dave@stgolabs.net Signed-off-by: Davidlohr Bueso Reviewed-by: Andrew Morton Cc: Al Viro Cc: Davidlohr Bueso Cc: Jason Baron Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/eventpoll.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/fs/eventpoll.c b/fs/eventpoll.c index b42f53d64d11b1..9beb166b7264a6 100644 --- a/fs/eventpoll.c +++ b/fs/eventpoll.c @@ -1820,15 +1820,19 @@ static int ep_poll(struct eventpoll *ep, struct epoll_event __user *events, res = -EINTR; break; } - if (ep_events_available(ep) || timed_out) + + eavail = ep_events_available(ep); + if (eavail) break; if (signal_pending(current)) { res = -EINTR; break; } - if (!schedule_hrtimeout_range(to, slack, HRTIMER_MODE_ABS)) + if (!schedule_hrtimeout_range(to, slack, HRTIMER_MODE_ABS)) { timed_out = 1; + break; + } } __set_current_state(TASK_RUNNING); From 35cff1a6e0236500584a8ae227fe08120d9b5ee2 Mon Sep 17 00:00:00 2001 From: Davidlohr Bueso Date: Thu, 3 Jan 2019 15:27:22 -0800 Subject: [PATCH 25/58] fs/epoll: rename check_events label to send_events It is currently called check_events because it, well, did exactly that. However, since the lockless ep_events_available() call, the label no longer checks, but just sends the events. Rename as such. Link: http://lkml.kernel.org/r/20181114182532.27981-1-dave@stgolabs.net Signed-off-by: Davidlohr Bueso Reviewed-by: Andrew Morton Cc: Al Viro Cc: Jason Baron Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/eventpoll.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/fs/eventpoll.c b/fs/eventpoll.c index 9beb166b7264a6..752dbc41c94156 100644 --- a/fs/eventpoll.c +++ b/fs/eventpoll.c @@ -1774,7 +1774,7 @@ static int ep_poll(struct eventpoll *ep, struct epoll_event __user *events, eavail = ep_events_available(ep); spin_unlock_irq(&ep->wq.lock); - goto check_events; + goto send_events; } fetch_events: @@ -1784,7 +1784,7 @@ static int ep_poll(struct eventpoll *ep, struct epoll_event __user *events, eavail = ep_events_available(ep); if (eavail) - goto check_events; + goto send_events; /* * Busy poll timed out. Drop NAPI ID for now, we can add @@ -1841,7 +1841,7 @@ static int ep_poll(struct eventpoll *ep, struct epoll_event __user *events, __remove_wait_queue(&ep->wq, &wait); spin_unlock_irq(&ep->wq.lock); -check_events: +send_events: /* * Try to transfer events to user space. In case we get 0 events and * there's still timeout left over, we go trying again in search of From 86c051793b4c941ee4481725d57cf2a27f6b3aaf Mon Sep 17 00:00:00 2001 From: Davidlohr Bueso Date: Thu, 3 Jan 2019 15:27:26 -0800 Subject: [PATCH 26/58] fs/epoll: deal with wait_queue only once There is no reason why we rearm the waitiqueue upon every fetch_events retry (for when events are found yet send_events() fails). If nothing else, this saves four lock operations per retry, and furthermore reduces the scope of the lock even further. [akpm@linux-foundation.org: restore code to original position, fix and reflow comment] Link: http://lkml.kernel.org/r/20181114182532.27981-2-dave@stgolabs.net Signed-off-by: Davidlohr Bueso Cc: Jason Baron Cc: Al Viro Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/eventpoll.c | 29 ++++++++++++++++++----------- 1 file changed, 18 insertions(+), 11 deletions(-) diff --git a/fs/eventpoll.c b/fs/eventpoll.c index 752dbc41c94156..2329f96469e2d7 100644 --- a/fs/eventpoll.c +++ b/fs/eventpoll.c @@ -1749,6 +1749,7 @@ static int ep_poll(struct eventpoll *ep, struct epoll_event __user *events, { int res = 0, eavail, timed_out = 0; u64 slack = 0; + bool waiter = false; wait_queue_entry_t wait; ktime_t expires, *to = NULL; @@ -1794,14 +1795,18 @@ static int ep_poll(struct eventpoll *ep, struct epoll_event __user *events, ep_reset_busy_poll_napi_id(ep); /* - * We don't have any available event to return to the caller. - * We need to sleep here, and we will be wake up by - * ep_poll_callback() when events will become available. + * We don't have any available event to return to the caller. We need + * to sleep here, and we will be woken by ep_poll_callback() when events + * become available. */ - init_waitqueue_entry(&wait, current); - spin_lock_irq(&ep->wq.lock); - __add_wait_queue_exclusive(&ep->wq, &wait); - spin_unlock_irq(&ep->wq.lock); + if (!waiter) { + waiter = true; + init_waitqueue_entry(&wait, current); + + spin_lock_irq(&ep->wq.lock); + __add_wait_queue_exclusive(&ep->wq, &wait); + spin_unlock_irq(&ep->wq.lock); + } for (;;) { /* @@ -1837,10 +1842,6 @@ static int ep_poll(struct eventpoll *ep, struct epoll_event __user *events, __set_current_state(TASK_RUNNING); - spin_lock_irq(&ep->wq.lock); - __remove_wait_queue(&ep->wq, &wait); - spin_unlock_irq(&ep->wq.lock); - send_events: /* * Try to transfer events to user space. In case we get 0 events and @@ -1851,6 +1852,12 @@ static int ep_poll(struct eventpoll *ep, struct epoll_event __user *events, !(res = ep_send_events(ep, events, maxevents)) && !timed_out) goto fetch_events; + if (waiter) { + spin_lock_irq(&ep->wq.lock); + __remove_wait_queue(&ep->wq, &wait); + spin_unlock_irq(&ep->wq.lock); + } + return res; } From 7c8f71935a65a584c48cbe478aaffc52292d6e00 Mon Sep 17 00:00:00 2001 From: Alexey Dobriyan Date: Thu, 3 Jan 2019 15:27:29 -0800 Subject: [PATCH 27/58] init/main.c: make "initcall_level_names[]" const char * Initcall names should not be changed. Link: http://lkml.kernel.org/r/20181124091829.GD10969@avx2 Signed-off-by: Alexey Dobriyan Reviewed-by: Andrew Morton Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- init/main.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/init/main.c b/init/main.c index 86d894852bef8b..6a74ba0892d223 100644 --- a/init/main.c +++ b/init/main.c @@ -930,7 +930,7 @@ static initcall_entry_t *initcall_levels[] __initdata = { }; /* Keep these in sync with initcalls in include/linux/init.h */ -static char *initcall_level_names[] __initdata = { +static const char *initcall_level_names[] __initdata = { "pure", "core", "postcore", From 55f0d8205dc6399826332c21bc56626868cd453d Mon Sep 17 00:00:00 2001 From: Ian Kent Date: Thu, 3 Jan 2019 15:27:33 -0800 Subject: [PATCH 28/58] autofs: improve ioctl sbi checks Al Viro made some suggestions to improve the implementation of commit 0633da48f0 ("fix autofs_sbi() does not check super block type"). The check is unnecessary in all cases except for ioctl usage so placing the check in the super block accessor function adds a small overhead to the common case where it isn't needed. So it's sufficient to do this in the ioctl code only. Also the check in the ioctl code is needlessly complex. [akpm@linux-foundation.org: declare autofs_fs_type in .h, not .c] Link: http://lkml.kernel.org/r/154296970987.9889.1597442413573683096.stgit@pluto-themaw-net Signed-off-by: Ian Kent Cc: Al Viro Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/autofs/autofs_i.h | 5 +++-- fs/autofs/dev-ioctl.c | 23 +++++------------------ fs/autofs/init.c | 2 +- 3 files changed, 9 insertions(+), 21 deletions(-) diff --git a/fs/autofs/autofs_i.h b/fs/autofs/autofs_i.h index 9f9cadbfbd7a34..64f693d355ada0 100644 --- a/fs/autofs/autofs_i.h +++ b/fs/autofs/autofs_i.h @@ -42,6 +42,8 @@ #endif #define pr_fmt(fmt) KBUILD_MODNAME ":pid:%d:%s: " fmt, current->pid, __func__ +extern struct file_system_type autofs_fs_type; + /* * Unified info structure. This is pointed to by both the dentry and * inode structures. Each file in the filesystem has an instance of this @@ -126,8 +128,7 @@ struct autofs_sb_info { static inline struct autofs_sb_info *autofs_sbi(struct super_block *sb) { - return sb->s_magic != AUTOFS_SUPER_MAGIC ? - NULL : (struct autofs_sb_info *)(sb->s_fs_info); + return (struct autofs_sb_info *)(sb->s_fs_info); } static inline struct autofs_info *autofs_dentry_ino(struct dentry *dentry) diff --git a/fs/autofs/dev-ioctl.c b/fs/autofs/dev-ioctl.c index 86eafda4a65226..752983aafb846b 100644 --- a/fs/autofs/dev-ioctl.c +++ b/fs/autofs/dev-ioctl.c @@ -151,22 +151,6 @@ static int validate_dev_ioctl(int cmd, struct autofs_dev_ioctl *param) return err; } -/* - * Get the autofs super block info struct from the file opened on - * the autofs mount point. - */ -static struct autofs_sb_info *autofs_dev_ioctl_sbi(struct file *f) -{ - struct autofs_sb_info *sbi = NULL; - struct inode *inode; - - if (f) { - inode = file_inode(f); - sbi = autofs_sbi(inode->i_sb); - } - return sbi; -} - /* Return autofs dev ioctl version */ static int autofs_dev_ioctl_version(struct file *fp, struct autofs_sb_info *sbi, @@ -658,6 +642,8 @@ static int _autofs_dev_ioctl(unsigned int command, if (cmd != AUTOFS_DEV_IOCTL_VERSION_CMD && cmd != AUTOFS_DEV_IOCTL_OPENMOUNT_CMD && cmd != AUTOFS_DEV_IOCTL_CLOSEMOUNT_CMD) { + struct super_block *sb; + fp = fget(param->ioctlfd); if (!fp) { if (cmd == AUTOFS_DEV_IOCTL_ISMOUNTPOINT_CMD) @@ -666,12 +652,13 @@ static int _autofs_dev_ioctl(unsigned int command, goto out; } - sbi = autofs_dev_ioctl_sbi(fp); - if (!sbi || sbi->magic != AUTOFS_SBI_MAGIC) { + sb = file_inode(fp)->i_sb; + if (sb->s_type != &autofs_fs_type) { err = -EINVAL; fput(fp); goto out; } + sbi = autofs_sbi(sb); /* * Admin needs to be able to set the mount catatonic in diff --git a/fs/autofs/init.c b/fs/autofs/init.c index 79ae07d9592f55..c0c1db2cc6ea7a 100644 --- a/fs/autofs/init.c +++ b/fs/autofs/init.c @@ -16,7 +16,7 @@ static struct dentry *autofs_mount(struct file_system_type *fs_type, return mount_nodev(fs_type, flags, data, autofs_fill_super); } -static struct file_system_type autofs_fs_type = { +struct file_system_type autofs_fs_type = { .owner = THIS_MODULE, .name = "autofs", .mount = autofs_mount, From 9bf964c9cee40285808ce973be7a266876404501 Mon Sep 17 00:00:00 2001 From: Ian Kent Date: Thu, 3 Jan 2019 15:27:36 -0800 Subject: [PATCH 29/58] autofs: simplify parse_options() function call The parse_options() function uses a long list of parameters, most of which are present in the super block info structure already. The mount parameters set in parse_options() options don't require cleanup so using the super block info struct directly is simpler. Link: http://lkml.kernel.org/r/154296972423.9889.9368859245676473329.stgit@pluto-themaw-net Signed-off-by: Ian Kent Cc: Al Viro Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/autofs/inode.c | 55 +++++++++++++++++++++++++---------------------- 1 file changed, 29 insertions(+), 26 deletions(-) diff --git a/fs/autofs/inode.c b/fs/autofs/inode.c index 846c052569dd49..bd49b16f43fdd9 100644 --- a/fs/autofs/inode.c +++ b/fs/autofs/inode.c @@ -124,21 +124,24 @@ static const match_table_t tokens = { {Opt_err, NULL} }; -static int parse_options(char *options, int *pipefd, kuid_t *uid, kgid_t *gid, - int *pgrp, bool *pgrp_set, unsigned int *type, - int *minproto, int *maxproto) +static int parse_options(char *options, + struct inode *root, int *pgrp, bool *pgrp_set, + struct autofs_sb_info *sbi) { char *p; substring_t args[MAX_OPT_ARGS]; int option; + int pipefd = -1; + kuid_t uid; + kgid_t gid; - *uid = current_uid(); - *gid = current_gid(); + root->i_uid = current_uid(); + root->i_gid = current_gid(); - *minproto = AUTOFS_MIN_PROTO_VERSION; - *maxproto = AUTOFS_MAX_PROTO_VERSION; + sbi->min_proto = AUTOFS_MIN_PROTO_VERSION; + sbi->max_proto = AUTOFS_MAX_PROTO_VERSION; - *pipefd = -1; + sbi->pipefd = -1; if (!options) return 1; @@ -152,22 +155,25 @@ static int parse_options(char *options, int *pipefd, kuid_t *uid, kgid_t *gid, token = match_token(p, tokens, args); switch (token) { case Opt_fd: - if (match_int(args, pipefd)) + if (match_int(args, &pipefd)) return 1; + sbi->pipefd = pipefd; break; case Opt_uid: if (match_int(args, &option)) return 1; - *uid = make_kuid(current_user_ns(), option); - if (!uid_valid(*uid)) + uid = make_kuid(current_user_ns(), option); + if (!uid_valid(uid)) return 1; + root->i_uid = uid; break; case Opt_gid: if (match_int(args, &option)) return 1; - *gid = make_kgid(current_user_ns(), option); - if (!gid_valid(*gid)) + gid = make_kgid(current_user_ns(), option); + if (!gid_valid(gid)) return 1; + root->i_gid = gid; break; case Opt_pgrp: if (match_int(args, &option)) @@ -178,27 +184,27 @@ static int parse_options(char *options, int *pipefd, kuid_t *uid, kgid_t *gid, case Opt_minproto: if (match_int(args, &option)) return 1; - *minproto = option; + sbi->min_proto = option; break; case Opt_maxproto: if (match_int(args, &option)) return 1; - *maxproto = option; + sbi->max_proto = option; break; case Opt_indirect: - set_autofs_type_indirect(type); + set_autofs_type_indirect(&sbi->type); break; case Opt_direct: - set_autofs_type_direct(type); + set_autofs_type_direct(&sbi->type); break; case Opt_offset: - set_autofs_type_offset(type); + set_autofs_type_offset(&sbi->type); break; default: return 1; } } - return (*pipefd < 0); + return (sbi->pipefd < 0); } int autofs_fill_super(struct super_block *s, void *data, int silent) @@ -206,7 +212,6 @@ int autofs_fill_super(struct super_block *s, void *data, int silent) struct inode *root_inode; struct dentry *root; struct file *pipe; - int pipefd; struct autofs_sb_info *sbi; struct autofs_info *ino; int pgrp = 0; @@ -262,9 +267,7 @@ int autofs_fill_super(struct super_block *s, void *data, int silent) root->d_fsdata = ino; /* Can this call block? */ - if (parse_options(data, &pipefd, &root_inode->i_uid, &root_inode->i_gid, - &pgrp, &pgrp_set, &sbi->type, &sbi->min_proto, - &sbi->max_proto)) { + if (parse_options(data, root_inode, &pgrp, &pgrp_set, sbi)) { pr_err("called with bogus options\n"); goto fail_dput; } @@ -303,8 +306,9 @@ int autofs_fill_super(struct super_block *s, void *data, int silent) root_inode->i_fop = &autofs_root_operations; root_inode->i_op = &autofs_dir_inode_operations; - pr_debug("pipe fd = %d, pgrp = %u\n", pipefd, pid_nr(sbi->oz_pgrp)); - pipe = fget(pipefd); + pr_debug("pipe fd = %d, pgrp = %u\n", + sbi->pipefd, pid_nr(sbi->oz_pgrp)); + pipe = fget(sbi->pipefd); if (!pipe) { pr_err("could not open pipe file descriptor\n"); @@ -314,7 +318,6 @@ int autofs_fill_super(struct super_block *s, void *data, int silent) if (ret < 0) goto fail_fput; sbi->pipe = pipe; - sbi->pipefd = pipefd; sbi->catatonic = 0; /* From 9d8719a42e4671cfe27733d82b5a071295ab9975 Mon Sep 17 00:00:00 2001 From: Ian Kent Date: Thu, 3 Jan 2019 15:27:39 -0800 Subject: [PATCH 30/58] autofs: change catatonic setting to a bit flag Change the superblock info. catatonic setting to be part of a flags bit field. Link: http://lkml.kernel.org/r/154296973142.9889.17275721668508589639.stgit@pluto-themaw-net Signed-off-by: Ian Kent Cc: Al Viro Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/autofs/autofs_i.h | 7 +++++-- fs/autofs/dev-ioctl.c | 4 ++-- fs/autofs/inode.c | 4 ++-- fs/autofs/root.c | 11 ++++++----- fs/autofs/waitq.c | 10 +++++----- 5 files changed, 20 insertions(+), 16 deletions(-) diff --git a/fs/autofs/autofs_i.h b/fs/autofs/autofs_i.h index 64f693d355ada0..9b81c10ef251dc 100644 --- a/fs/autofs/autofs_i.h +++ b/fs/autofs/autofs_i.h @@ -103,16 +103,18 @@ struct autofs_wait_queue { #define AUTOFS_SBI_MAGIC 0x6d4a556d +#define AUTOFS_SBI_CATATONIC 0x0001 + struct autofs_sb_info { u32 magic; int pipefd; struct file *pipe; struct pid *oz_pgrp; - int catatonic; int version; int sub_version; int min_proto; int max_proto; + unsigned int flags; unsigned long exp_timeout; unsigned int type; struct super_block *sb; @@ -142,7 +144,8 @@ static inline struct autofs_info *autofs_dentry_ino(struct dentry *dentry) */ static inline int autofs_oz_mode(struct autofs_sb_info *sbi) { - return sbi->catatonic || task_pgrp(current) == sbi->oz_pgrp; + return ((sbi->flags & AUTOFS_SBI_CATATONIC) || + task_pgrp(current) == sbi->oz_pgrp); } struct inode *autofs_get_inode(struct super_block *, umode_t); diff --git a/fs/autofs/dev-ioctl.c b/fs/autofs/dev-ioctl.c index 752983aafb846b..e9fe74d1541bba 100644 --- a/fs/autofs/dev-ioctl.c +++ b/fs/autofs/dev-ioctl.c @@ -350,7 +350,7 @@ static int autofs_dev_ioctl_setpipefd(struct file *fp, pipefd = param->setpipefd.pipefd; mutex_lock(&sbi->wq_mutex); - if (!sbi->catatonic) { + if (!(sbi->flags & AUTOFS_SBI_CATATONIC)) { mutex_unlock(&sbi->wq_mutex); return -EBUSY; } else { @@ -377,7 +377,7 @@ static int autofs_dev_ioctl_setpipefd(struct file *fp, swap(sbi->oz_pgrp, new_pid); sbi->pipefd = pipefd; sbi->pipe = pipe; - sbi->catatonic = 0; + sbi->flags &= ~AUTOFS_SBI_CATATONIC; } out: put_pid(new_pid); diff --git a/fs/autofs/inode.c b/fs/autofs/inode.c index bd49b16f43fdd9..3a94dbda36dd2e 100644 --- a/fs/autofs/inode.c +++ b/fs/autofs/inode.c @@ -227,12 +227,12 @@ int autofs_fill_super(struct super_block *s, void *data, int silent) sbi->magic = AUTOFS_SBI_MAGIC; sbi->pipefd = -1; sbi->pipe = NULL; - sbi->catatonic = 1; sbi->exp_timeout = 0; sbi->oz_pgrp = NULL; sbi->sb = s; sbi->version = 0; sbi->sub_version = 0; + sbi->flags = AUTOFS_SBI_CATATONIC; set_autofs_type_indirect(&sbi->type); sbi->min_proto = 0; sbi->max_proto = 0; @@ -318,7 +318,7 @@ int autofs_fill_super(struct super_block *s, void *data, int silent) if (ret < 0) goto fail_fput; sbi->pipe = pipe; - sbi->catatonic = 0; + sbi->flags &= ~AUTOFS_SBI_CATATONIC; /* * Success! Install the root dentry now to indicate completion. diff --git a/fs/autofs/root.c b/fs/autofs/root.c index 782e57b911abcf..164ccd3402cf3d 100644 --- a/fs/autofs/root.c +++ b/fs/autofs/root.c @@ -510,7 +510,8 @@ static struct dentry *autofs_lookup(struct inode *dir, sbi = autofs_sbi(dir->i_sb); pr_debug("pid = %u, pgrp = %u, catatonic = %d, oz_mode = %d\n", - current->pid, task_pgrp_nr(current), sbi->catatonic, + current->pid, task_pgrp_nr(current), + sbi->flags & AUTOFS_SBI_CATATONIC, autofs_oz_mode(sbi)); active = autofs_lookup_active(dentry); @@ -563,7 +564,7 @@ static int autofs_dir_symlink(struct inode *dir, * autofs mount is catatonic but the state of an autofs * file system needs to be preserved over restarts. */ - if (sbi->catatonic) + if (sbi->flags & AUTOFS_SBI_CATATONIC) return -EACCES; BUG_ON(!ino); @@ -626,7 +627,7 @@ static int autofs_dir_unlink(struct inode *dir, struct dentry *dentry) * autofs mount is catatonic but the state of an autofs * file system needs to be preserved over restarts. */ - if (sbi->catatonic) + if (sbi->flags & AUTOFS_SBI_CATATONIC) return -EACCES; if (atomic_dec_and_test(&ino->count)) { @@ -714,7 +715,7 @@ static int autofs_dir_rmdir(struct inode *dir, struct dentry *dentry) * autofs mount is catatonic but the state of an autofs * file system needs to be preserved over restarts. */ - if (sbi->catatonic) + if (sbi->flags & AUTOFS_SBI_CATATONIC) return -EACCES; spin_lock(&sbi->lookup_lock); @@ -759,7 +760,7 @@ static int autofs_dir_mkdir(struct inode *dir, * autofs mount is catatonic but the state of an autofs * file system needs to be preserved over restarts. */ - if (sbi->catatonic) + if (sbi->flags & AUTOFS_SBI_CATATONIC) return -EACCES; pr_debug("dentry %p, creating %pd\n", dentry, dentry); diff --git a/fs/autofs/waitq.c b/fs/autofs/waitq.c index f6385c6ef0a56a..15a3e31d09045a 100644 --- a/fs/autofs/waitq.c +++ b/fs/autofs/waitq.c @@ -20,14 +20,14 @@ void autofs_catatonic_mode(struct autofs_sb_info *sbi) struct autofs_wait_queue *wq, *nwq; mutex_lock(&sbi->wq_mutex); - if (sbi->catatonic) { + if (sbi->flags & AUTOFS_SBI_CATATONIC) { mutex_unlock(&sbi->wq_mutex); return; } pr_debug("entering catatonic mode\n"); - sbi->catatonic = 1; + sbi->flags |= AUTOFS_SBI_CATATONIC; wq = sbi->queues; sbi->queues = NULL; /* Erase all wait queues */ while (wq) { @@ -255,7 +255,7 @@ static int validate_request(struct autofs_wait_queue **wait, struct autofs_wait_queue *wq; struct autofs_info *ino; - if (sbi->catatonic) + if (sbi->flags & AUTOFS_SBI_CATATONIC) return -ENOENT; /* Wait in progress, continue; */ @@ -290,7 +290,7 @@ static int validate_request(struct autofs_wait_queue **wait, if (mutex_lock_interruptible(&sbi->wq_mutex)) return -EINTR; - if (sbi->catatonic) + if (sbi->flags & AUTOFS_SBI_CATATONIC) return -ENOENT; wq = autofs_find_wait(sbi, qstr); @@ -359,7 +359,7 @@ int autofs_wait(struct autofs_sb_info *sbi, pid_t tgid; /* In catatonic mode, we don't wait for nobody */ - if (sbi->catatonic) + if (sbi->flags & AUTOFS_SBI_CATATONIC) return -ENOENT; /* From f5162216b7dab0c07e070b8b7f98891a85047f59 Mon Sep 17 00:00:00 2001 From: Ian Kent Date: Thu, 3 Jan 2019 15:27:43 -0800 Subject: [PATCH 31/58] autofs: add strictexpire mount option Commit 092a53452bb7 ("autofs: take more care to not update last_used on path walk") helped to (partially) resolve a problem where automounts were not expiring due to aggressive accesses from user space. This patch was later reverted because, for very large environments, it meant more mount requests from clients and when there are a lot of clients this caused a fairly significant increase in server load. But there is a need for both types of expire check, depending on use case, so add a mount option to allow for strict update of last use of autofs dentrys (which just means not updating the last use on path walk access). Link: http://lkml.kernel.org/r/154296973880.9889.14085372741514507967.stgit@pluto-themaw-net Signed-off-by: Ian Kent Cc: Al Viro Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/autofs/autofs_i.h | 1 + fs/autofs/inode.c | 8 +++++++- fs/autofs/root.c | 5 ++++- include/uapi/linux/auto_fs.h | 2 +- 4 files changed, 13 insertions(+), 3 deletions(-) diff --git a/fs/autofs/autofs_i.h b/fs/autofs/autofs_i.h index 9b81c10ef251dc..3e59f0ed777bff 100644 --- a/fs/autofs/autofs_i.h +++ b/fs/autofs/autofs_i.h @@ -104,6 +104,7 @@ struct autofs_wait_queue { #define AUTOFS_SBI_MAGIC 0x6d4a556d #define AUTOFS_SBI_CATATONIC 0x0001 +#define AUTOFS_SBI_STRICTEXPIRE 0x0002 struct autofs_sb_info { u32 magic; diff --git a/fs/autofs/inode.c b/fs/autofs/inode.c index 3a94dbda36dd2e..0e8ea2d9a2bba2 100644 --- a/fs/autofs/inode.c +++ b/fs/autofs/inode.c @@ -87,6 +87,8 @@ static int autofs_show_options(struct seq_file *m, struct dentry *root) seq_printf(m, ",direct"); else seq_printf(m, ",indirect"); + if (sbi->flags & AUTOFS_SBI_STRICTEXPIRE) + seq_printf(m, ",strictexpire"); #ifdef CONFIG_CHECKPOINT_RESTORE if (sbi->pipe) seq_printf(m, ",pipe_ino=%ld", file_inode(sbi->pipe)->i_ino); @@ -109,7 +111,7 @@ static const struct super_operations autofs_sops = { }; enum {Opt_err, Opt_fd, Opt_uid, Opt_gid, Opt_pgrp, Opt_minproto, Opt_maxproto, - Opt_indirect, Opt_direct, Opt_offset}; + Opt_indirect, Opt_direct, Opt_offset, Opt_strictexpire}; static const match_table_t tokens = { {Opt_fd, "fd=%u"}, @@ -121,6 +123,7 @@ static const match_table_t tokens = { {Opt_indirect, "indirect"}, {Opt_direct, "direct"}, {Opt_offset, "offset"}, + {Opt_strictexpire, "strictexpire"}, {Opt_err, NULL} }; @@ -200,6 +203,9 @@ static int parse_options(char *options, case Opt_offset: set_autofs_type_offset(&sbi->type); break; + case Opt_strictexpire: + sbi->flags |= AUTOFS_SBI_STRICTEXPIRE; + break; default: return 1; } diff --git a/fs/autofs/root.c b/fs/autofs/root.c index 164ccd3402cf3d..1246f396bf0e9d 100644 --- a/fs/autofs/root.c +++ b/fs/autofs/root.c @@ -275,8 +275,11 @@ static int autofs_mount_wait(const struct path *path, bool rcu_walk) pr_debug("waiting for mount name=%pd\n", path->dentry); status = autofs_wait(sbi, path, NFY_MOUNT); pr_debug("mount wait done status=%d\n", status); + ino->last_used = jiffies; + return status; } - ino->last_used = jiffies; + if (!(sbi->flags & AUTOFS_SBI_STRICTEXPIRE)) + ino->last_used = jiffies; return status; } diff --git a/include/uapi/linux/auto_fs.h b/include/uapi/linux/auto_fs.h index df31aa9c9a8c3a..082119630b49c2 100644 --- a/include/uapi/linux/auto_fs.h +++ b/include/uapi/linux/auto_fs.h @@ -23,7 +23,7 @@ #define AUTOFS_MIN_PROTO_VERSION 3 #define AUTOFS_MAX_PROTO_VERSION 5 -#define AUTOFS_PROTO_SUBVERSION 3 +#define AUTOFS_PROTO_SUBVERSION 4 /* * The wait_queue_token (autofs_wqt_t) is part of a structure which is passed From f93ca1ed9ba09fa54d372ab17649d781384e34f7 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ernesto=20A=2E=20Fern=C3=A1ndez?= Date: Thu, 3 Jan 2019 15:27:46 -0800 Subject: [PATCH 32/58] hfsplus: return file attributes on statx MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The immutable, append-only and no-dump attributes can only be retrieved with an ioctl; implement the ->getattr() method to return them on statx. Do not return the inode birthtime yet, because the issue of how best to handle the post-2038 timestamps is still under discussion. This patch is needed to pass xfstests generic/424. Link: http://lkml.kernel.org/r/20181014163558.sxorxlzjqccq2lpw@eaf Signed-off-by: Ernesto A. Fernández Cc: Viacheslav Dubeyko Cc: Al Viro Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/hfsplus/dir.c | 1 + fs/hfsplus/hfsplus_fs.h | 2 ++ fs/hfsplus/inode.c | 21 +++++++++++++++++++++ 3 files changed, 24 insertions(+) diff --git a/fs/hfsplus/dir.c b/fs/hfsplus/dir.c index f37662675c3a34..29a9dcfbe81f72 100644 --- a/fs/hfsplus/dir.c +++ b/fs/hfsplus/dir.c @@ -565,6 +565,7 @@ const struct inode_operations hfsplus_dir_inode_operations = { .symlink = hfsplus_symlink, .mknod = hfsplus_mknod, .rename = hfsplus_rename, + .getattr = hfsplus_getattr, .listxattr = hfsplus_listxattr, }; diff --git a/fs/hfsplus/hfsplus_fs.h b/fs/hfsplus/hfsplus_fs.h index dd7ad9f13e3aad..b8471bf05def95 100644 --- a/fs/hfsplus/hfsplus_fs.h +++ b/fs/hfsplus/hfsplus_fs.h @@ -488,6 +488,8 @@ void hfsplus_inode_write_fork(struct inode *inode, struct hfsplus_fork_raw *fork); int hfsplus_cat_read_inode(struct inode *inode, struct hfs_find_data *fd); int hfsplus_cat_write_inode(struct inode *inode); +int hfsplus_getattr(const struct path *path, struct kstat *stat, + u32 request_mask, unsigned int query_flags); int hfsplus_file_fsync(struct file *file, loff_t start, loff_t end, int datasync); diff --git a/fs/hfsplus/inode.c b/fs/hfsplus/inode.c index d7ab9d8c4b6743..d131c8ea7eb616 100644 --- a/fs/hfsplus/inode.c +++ b/fs/hfsplus/inode.c @@ -270,6 +270,26 @@ static int hfsplus_setattr(struct dentry *dentry, struct iattr *attr) return 0; } +int hfsplus_getattr(const struct path *path, struct kstat *stat, + u32 request_mask, unsigned int query_flags) +{ + struct inode *inode = d_inode(path->dentry); + struct hfsplus_inode_info *hip = HFSPLUS_I(inode); + + if (inode->i_flags & S_APPEND) + stat->attributes |= STATX_ATTR_APPEND; + if (inode->i_flags & S_IMMUTABLE) + stat->attributes |= STATX_ATTR_IMMUTABLE; + if (hip->userflags & HFSPLUS_FLG_NODUMP) + stat->attributes |= STATX_ATTR_NODUMP; + + stat->attributes_mask |= STATX_ATTR_APPEND | STATX_ATTR_IMMUTABLE | + STATX_ATTR_NODUMP; + + generic_fillattr(inode, stat); + return 0; +} + int hfsplus_file_fsync(struct file *file, loff_t start, loff_t end, int datasync) { @@ -329,6 +349,7 @@ int hfsplus_file_fsync(struct file *file, loff_t start, loff_t end, static const struct inode_operations hfsplus_file_inode_operations = { .setattr = hfsplus_setattr, + .getattr = hfsplus_getattr, .listxattr = hfsplus_listxattr, }; From 9da22854761a76c45d78aa2ae2b4bbd504b4f171 Mon Sep 17 00:00:00 2001 From: Carmeli Tamir Date: Thu, 3 Jan 2019 15:27:49 -0800 Subject: [PATCH 33/58] include/uapi/linux/msdos_fs.h: use MSDOS_NAME for volume label size The FAT file system volume label file stored in the root directory should match the volume label field in the FAT boot sector. As consequence, the max length of these fields ought to be the same. This patch replaces the magic '11' usef in the struct fat_boot_sector with MSDOS_NAME, which is used in struct msdos_dir_entry. Please check the following references: 1. Microsoft FAT specification 2005 (http://read.pudn.com/downloads77/ebook/294884/FAT32%20Spec%20%28SDA%20Contribution%29.pdf). Search for 'volume label'. 2. Microsoft Extensible Firmware Initiative, FAT32 File System Specification (https://staff.washington.edu/dittrich/misc/fatgen103.pdf). Search for 'volume label'. 3. User space code that creates FAT filesystem sometimes uses MSDOS_NAME for the label, sometimes not. Search for 'if (memcmp(label, NO_NAME, MSDOS_NAME))'. I consider to make the same patch there as well. https://github.com/dosfstools/dosfstools/blob/master/src/mkfs.fat.c Link: http://lkml.kernel.org/r/1543096879-82837-1-git-send-email-carmeli.tamir@gmail.com Signed-off-by: Carmeli Tamir Reviewed-by: Sergey Senozhatsky Reviewed-by: Johannes Thumshirn Acked-by: OGAWA Hirofumi Cc: Jens Axboe Cc: Bart Van Assche Cc: Martin K. Petersen Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/uapi/linux/msdos_fs.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/include/uapi/linux/msdos_fs.h b/include/uapi/linux/msdos_fs.h index fde753735abaec..1216e6caf59b0b 100644 --- a/include/uapi/linux/msdos_fs.h +++ b/include/uapi/linux/msdos_fs.h @@ -135,7 +135,7 @@ struct fat_boot_sector { for mount state. */ __u8 signature; /* extended boot signature */ __u8 vol_id[4]; /* volume ID */ - __u8 vol_label[11]; /* volume label */ + __u8 vol_label[MSDOS_NAME]; /* volume label */ __u8 fs_type[8]; /* file system type */ /* other fields are not added here */ } fat16; @@ -158,7 +158,7 @@ struct fat_boot_sector { for mount state. */ __u8 signature; /* extended boot signature */ __u8 vol_id[4]; /* volume ID */ - __u8 vol_label[11]; /* volume label */ + __u8 vol_label[MSDOS_NAME]; /* volume label */ __u8 fs_type[8]; /* file system type */ /* other fields are not added here */ } fat32; From b553337a57cf4f077464292520f4e975ea4cda83 Mon Sep 17 00:00:00 2001 From: Carmeli Tamir Date: Thu, 3 Jan 2019 15:27:53 -0800 Subject: [PATCH 34/58] fat: remove FAT_FIRST_ENT macro The comment edited in this patch was the only reference to the FAT_FIRST_ENT macro, which is not used anymore. Moreover, the commented line of code does not compile with the current code. Since the FAT_FIRST_ENT macro checks the FAT variant in a way that the patch series changes, I removed it, and instead wrote a clear explanation of what was checked. I verified that the changed comment is correct according to Microsoft FAT spec, search for "BPB_Media" in the following references: 1. Microsoft FAT specification 2005 (http://read.pudn.com/downloads77/ebook/294884/FAT32%20Spec%20%28SDA%20Contribution%29.pdf). Search for 'volume label'. 2. Microsoft Extensible Firmware Initiative, FAT32 File System Specification (https://staff.washington.edu/dittrich/misc/fatgen103.pdf). Search for 'volume label'. Link: http://lkml.kernel.org/r/1544990640-11604-2-git-send-email-carmeli.tamir@gmail.com Signed-off-by: Carmeli Tamir Acked-by: OGAWA Hirofumi Reviewed-by: Sergey Senozhatsky Cc: Bart Van Assche Cc: Johannes Thumshirn Cc: Martin K. Petersen Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/fat/inode.c | 12 ++++++++---- include/uapi/linux/msdos_fs.h | 3 --- 2 files changed, 8 insertions(+), 7 deletions(-) diff --git a/fs/fat/inode.c b/fs/fat/inode.c index c0b5b5c3373bd9..269e7b91b8b131 100644 --- a/fs/fat/inode.c +++ b/fs/fat/inode.c @@ -1803,11 +1803,15 @@ int fat_fill_super(struct super_block *sb, void *data, int silent, int isvfat, fat_ent_access_init(sb); /* - * The low byte of FAT's first entry must have same value with - * media-field. But in real world, too many devices is - * writing wrong value. So, removed that validity check. + * The low byte of the first FAT entry must have the same value as + * the media field of the boot sector. But in real world, too many + * devices are writing wrong values. So, removed that validity check. * - * if (FAT_FIRST_ENT(sb, media) != first) + * The removed check compared the first FAT entry to a value dependent + * on the media field like this: + * == (0x0F00 | media), for FAT12 + * == (0XFF00 | media), for FAT16 + * == (0x0FFFFF | media), for FAT32 */ error = -EINVAL; diff --git a/include/uapi/linux/msdos_fs.h b/include/uapi/linux/msdos_fs.h index 1216e6caf59b0b..833c7079a1e3db 100644 --- a/include/uapi/linux/msdos_fs.h +++ b/include/uapi/linux/msdos_fs.h @@ -58,9 +58,6 @@ #define MSDOS_DOT ". " /* ".", padded to MSDOS_NAME chars */ #define MSDOS_DOTDOT ".. " /* "..", padded to MSDOS_NAME chars */ -#define FAT_FIRST_ENT(s, x) ((MSDOS_SB(s)->fat_bits == 32 ? 0x0FFFFF00 : \ - MSDOS_SB(s)->fat_bits == 16 ? 0xFF00 : 0xF00) | (x)) - /* start of data cluster's entry (number of reserved clusters) */ #define FAT_START_ENT 2 From d19dc016187502dda6b8095e44eb46a18e89b2b3 Mon Sep 17 00:00:00 2001 From: Carmeli Tamir Date: Thu, 3 Jan 2019 15:27:56 -0800 Subject: [PATCH 35/58] fat: move MAX_FAT to fat.h and change it to inline function MAX_FAT is useless in msdos_fs.h, since it uses the MSDOS_SB function that is defined in fat.h. So really, this macro can be only called from code that already includes fat.h. Hence, this patch moves it to fat.h, right after MSDOS_SB is defined. I also changed it to an inline function in order to save the double call to MSDOS_SB. This was suggested by joe@perches.com in the previous version. This patch is required for the next in the series, in which the variant (whether this is FAT12, FAT16 or FAT32) checks are replaced with new macros. Link: http://lkml.kernel.org/r/1544990640-11604-3-git-send-email-carmeli.tamir@gmail.com Signed-off-by: Carmeli Tamir Acked-by: OGAWA Hirofumi Reviewed-by: Sergey Senozhatsky Cc: Bart Van Assche Cc: Johannes Thumshirn Cc: Martin K. Petersen Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/fat/fat.h | 9 +++++++++ fs/fat/inode.c | 2 +- include/uapi/linux/msdos_fs.h | 2 -- 3 files changed, 10 insertions(+), 3 deletions(-) diff --git a/fs/fat/fat.h b/fs/fat/fat.h index 4e1b2f6df5e6a6..979bb11832f6b4 100644 --- a/fs/fat/fat.h +++ b/fs/fat/fat.h @@ -142,6 +142,15 @@ static inline struct msdos_sb_info *MSDOS_SB(struct super_block *sb) return sb->s_fs_info; } +/* Maximum number of clusters */ +static inline u32 max_fat(struct super_block *sb) +{ + struct msdos_sb_info *sbi = MSDOS_SB(sb); + + return sbi->fat_bits == 32 ? MAX_FAT32 : + sbi->fat_bits == 16 ? MAX_FAT16 : MAX_FAT12; +} + static inline struct msdos_inode_info *MSDOS_I(struct inode *inode) { return container_of(inode, struct msdos_inode_info, vfs_inode); diff --git a/fs/fat/inode.c b/fs/fat/inode.c index 269e7b91b8b131..8d0de1aac4eb98 100644 --- a/fs/fat/inode.c +++ b/fs/fat/inode.c @@ -1781,7 +1781,7 @@ int fat_fill_super(struct super_block *sb, void *data, int silent, int isvfat, /* check that FAT table does not overflow */ fat_clusters = calc_fat_clusters(sb); total_clusters = min(total_clusters, fat_clusters - FAT_START_ENT); - if (total_clusters > MAX_FAT(sb)) { + if (total_clusters > max_fat(sb)) { if (!silent) fat_msg(sb, KERN_ERR, "count of clusters too big (%u)", total_clusters); diff --git a/include/uapi/linux/msdos_fs.h b/include/uapi/linux/msdos_fs.h index 833c7079a1e3db..a5773899f4d910 100644 --- a/include/uapi/linux/msdos_fs.h +++ b/include/uapi/linux/msdos_fs.h @@ -65,8 +65,6 @@ #define MAX_FAT12 0xFF4 #define MAX_FAT16 0xFFF4 #define MAX_FAT32 0x0FFFFFF6 -#define MAX_FAT(s) (MSDOS_SB(s)->fat_bits == 32 ? MAX_FAT32 : \ - MSDOS_SB(s)->fat_bits == 16 ? MAX_FAT16 : MAX_FAT12) /* bad cluster mark */ #define BAD_FAT12 0xFF7 From 306790f75ac2fe021a900395255e468807002c42 Mon Sep 17 00:00:00 2001 From: Carmeli Tamir Date: Thu, 3 Jan 2019 15:28:00 -0800 Subject: [PATCH 36/58] fat: new inline functions to determine the FAT variant (32, 16 or 12) This patch introduces 3 new inline functions - is_fat12, is_fat16 and is_fat32, and replaces every occurrence in the code in which the FS variant (whether this is FAT12, FAT16 or FAT32) was previously checked using msdos_sb_info->fat_bits. Link: http://lkml.kernel.org/r/1544990640-11604-4-git-send-email-carmeli.tamir@gmail.com Signed-off-by: Carmeli Tamir Acked-by: OGAWA Hirofumi Reviewed-by: Sergey Senozhatsky Cc: Johannes Thumshirn Cc: Bart Van Assche Cc: Martin K. Petersen Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/fat/cache.c | 2 +- fs/fat/dir.c | 4 ++-- fs/fat/fat.h | 25 ++++++++++++++++++++++--- fs/fat/fatent.c | 16 +++++++--------- fs/fat/inode.c | 12 ++++++------ fs/fat/misc.c | 2 +- 6 files changed, 39 insertions(+), 22 deletions(-) diff --git a/fs/fat/cache.c b/fs/fat/cache.c index 78d501c1fb658f..738e427e2d2150 100644 --- a/fs/fat/cache.c +++ b/fs/fat/cache.c @@ -363,7 +363,7 @@ int fat_bmap(struct inode *inode, sector_t sector, sector_t *phys, *phys = 0; *mapped_blocks = 0; - if ((sbi->fat_bits != 32) && (inode->i_ino == MSDOS_ROOT_INO)) { + if (!is_fat32(sbi) && (inode->i_ino == MSDOS_ROOT_INO)) { if (sector < (sbi->dir_entries >> sbi->dir_per_block_bits)) { *phys = sector + sbi->dir_start; *mapped_blocks = 1; diff --git a/fs/fat/dir.c b/fs/fat/dir.c index c8366cb8eccd7c..20acaea8a7e6f2 100644 --- a/fs/fat/dir.c +++ b/fs/fat/dir.c @@ -57,7 +57,7 @@ static inline void fat_dir_readahead(struct inode *dir, sector_t iblock, if ((iblock & (sbi->sec_per_clus - 1)) || sbi->sec_per_clus == 1) return; /* root dir of FAT12/FAT16 */ - if ((sbi->fat_bits != 32) && (dir->i_ino == MSDOS_ROOT_INO)) + if (!is_fat32(sbi) && (dir->i_ino == MSDOS_ROOT_INO)) return; bh = sb_find_get_block(sb, phys); @@ -1313,7 +1313,7 @@ int fat_add_entries(struct inode *dir, void *slots, int nr_slots, } } if (dir->i_ino == MSDOS_ROOT_INO) { - if (sbi->fat_bits != 32) + if (!is_fat32(sbi)) goto error; } else if (MSDOS_I(dir)->i_start == 0) { fat_msg(sb, KERN_ERR, "Corrupted directory (i_pos %lld)", diff --git a/fs/fat/fat.h b/fs/fat/fat.h index 979bb11832f6b4..922a0c6ba46c49 100644 --- a/fs/fat/fat.h +++ b/fs/fat/fat.h @@ -142,13 +142,32 @@ static inline struct msdos_sb_info *MSDOS_SB(struct super_block *sb) return sb->s_fs_info; } +/* + * Functions that determine the variant of the FAT file system (i.e., + * whether this is FAT12, FAT16 or FAT32. + */ +static inline bool is_fat12(const struct msdos_sb_info *sbi) +{ + return sbi->fat_bits == 12; +} + +static inline bool is_fat16(const struct msdos_sb_info *sbi) +{ + return sbi->fat_bits == 16; +} + +static inline bool is_fat32(const struct msdos_sb_info *sbi) +{ + return sbi->fat_bits == 32; +} + /* Maximum number of clusters */ static inline u32 max_fat(struct super_block *sb) { struct msdos_sb_info *sbi = MSDOS_SB(sb); - return sbi->fat_bits == 32 ? MAX_FAT32 : - sbi->fat_bits == 16 ? MAX_FAT16 : MAX_FAT12; + return is_fat32(sbi) ? MAX_FAT32 : + is_fat16(sbi) ? MAX_FAT16 : MAX_FAT12; } static inline struct msdos_inode_info *MSDOS_I(struct inode *inode) @@ -266,7 +285,7 @@ static inline int fat_get_start(const struct msdos_sb_info *sbi, const struct msdos_dir_entry *de) { int cluster = le16_to_cpu(de->start); - if (sbi->fat_bits == 32) + if (is_fat32(sbi)) cluster |= (le16_to_cpu(de->starthi) << 16); return cluster; } diff --git a/fs/fat/fatent.c b/fs/fat/fatent.c index f58c0cacc531df..495edeafd60a0c 100644 --- a/fs/fat/fatent.c +++ b/fs/fat/fatent.c @@ -290,19 +290,17 @@ void fat_ent_access_init(struct super_block *sb) mutex_init(&sbi->fat_lock); - switch (sbi->fat_bits) { - case 32: + if (is_fat32(sbi)) { sbi->fatent_shift = 2; sbi->fatent_ops = &fat32_ops; - break; - case 16: + } else if (is_fat16(sbi)) { sbi->fatent_shift = 1; sbi->fatent_ops = &fat16_ops; - break; - case 12: + } else if (is_fat12(sbi)) { sbi->fatent_shift = -1; sbi->fatent_ops = &fat12_ops; - break; + } else { + fat_fs_error(sb, "invalid FAT variant, %u bits", sbi->fat_bits); } } @@ -310,7 +308,7 @@ static void mark_fsinfo_dirty(struct super_block *sb) { struct msdos_sb_info *sbi = MSDOS_SB(sb); - if (sb_rdonly(sb) || sbi->fat_bits != 32) + if (sb_rdonly(sb) || !is_fat32(sbi)) return; __mark_inode_dirty(sbi->fsinfo_inode, I_DIRTY_SYNC); @@ -327,7 +325,7 @@ static inline int fat_ent_update_ptr(struct super_block *sb, /* Is this fatent's blocks including this entry? */ if (!fatent->nr_bhs || bhs[0]->b_blocknr != blocknr) return 0; - if (sbi->fat_bits == 12) { + if (is_fat12(sbi)) { if ((offset + 1) < sb->s_blocksize) { /* This entry is on bhs[0]. */ if (fatent->nr_bhs == 2) { diff --git a/fs/fat/inode.c b/fs/fat/inode.c index 8d0de1aac4eb98..79bb0e73a65f48 100644 --- a/fs/fat/inode.c +++ b/fs/fat/inode.c @@ -686,7 +686,7 @@ static void fat_set_state(struct super_block *sb, b = (struct fat_boot_sector *) bh->b_data; - if (sbi->fat_bits == 32) { + if (is_fat32(sbi)) { if (set) b->fat32.state |= FAT_STATE_DIRTY; else @@ -1396,7 +1396,7 @@ static int fat_read_root(struct inode *inode) inode->i_mode = fat_make_mode(sbi, ATTR_DIR, S_IRWXUGO); inode->i_op = sbi->dir_ops; inode->i_fop = &fat_dir_operations; - if (sbi->fat_bits == 32) { + if (is_fat32(sbi)) { MSDOS_I(inode)->i_start = sbi->root_cluster; error = fat_calc_dir_size(inode); if (error < 0) @@ -1423,7 +1423,7 @@ static unsigned long calc_fat_clusters(struct super_block *sb) struct msdos_sb_info *sbi = MSDOS_SB(sb); /* Divide first to avoid overflow */ - if (sbi->fat_bits != 12) { + if (!is_fat12(sbi)) { unsigned long ent_per_sec = sb->s_blocksize * 8 / sbi->fat_bits; return ent_per_sec * sbi->fat_length; } @@ -1743,7 +1743,7 @@ int fat_fill_super(struct super_block *sb, void *data, int silent, int isvfat, } /* interpret volume ID as a little endian 32 bit integer */ - if (sbi->fat_bits == 32) + if (is_fat32(sbi)) sbi->vol_id = bpb.fat32_vol_id; else /* fat 16 or 12 */ sbi->vol_id = bpb.fat16_vol_id; @@ -1769,11 +1769,11 @@ int fat_fill_super(struct super_block *sb, void *data, int silent, int isvfat, total_clusters = (total_sectors - sbi->data_start) / sbi->sec_per_clus; - if (sbi->fat_bits != 32) + if (!is_fat32(sbi)) sbi->fat_bits = (total_clusters > MAX_FAT12) ? 16 : 12; /* some OSes set FAT_STATE_DIRTY and clean it on unmount. */ - if (sbi->fat_bits == 32) + if (is_fat32(sbi)) sbi->dirty = bpb.fat32_state & FAT_STATE_DIRTY; else /* fat 16 or 12 */ sbi->dirty = bpb.fat16_state & FAT_STATE_DIRTY; diff --git a/fs/fat/misc.c b/fs/fat/misc.c index fce0a76f3f1e4c..4fc950bb64335c 100644 --- a/fs/fat/misc.c +++ b/fs/fat/misc.c @@ -64,7 +64,7 @@ int fat_clusters_flush(struct super_block *sb) struct buffer_head *bh; struct fat_boot_fsinfo *fsinfo; - if (sbi->fat_bits != 32) + if (!is_fat32(sbi)) return 0; bh = sb_bread(sb, sbi->fsinfo_sector); From fb5bf31722d0805a3f394f7d59f2e8cd07acccb7 Mon Sep 17 00:00:00 2001 From: Yi Wang Date: Thu, 3 Jan 2019 15:28:03 -0800 Subject: [PATCH 37/58] fork: fix some -Wmissing-prototypes warnings We get a warning when building kernel with W=1: kernel/fork.c:167:13: warning: no previous prototype for `arch_release_thread_stack' [-Wmissing-prototypes] kernel/fork.c:779:13: warning: no previous prototype for `fork_init' [-Wmissing-prototypes] Add the missing declaration in head file to fix this. Also, remove arch_release_thread_stack() completely because no arch seems to implement it since bb9d81264 (arch: remove tile port). Link: http://lkml.kernel.org/r/1542170087-23645-1-git-send-email-wang.yi59@zte.com.cn Signed-off-by: Yi Wang Acked-by: Michal Hocko Acked-by: Mike Rapoport Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/sched/task.h | 2 ++ init/main.c | 1 - kernel/fork.c | 5 ----- 3 files changed, 2 insertions(+), 6 deletions(-) diff --git a/include/linux/sched/task.h b/include/linux/sched/task.h index 108ede99e53350..44c6f15800ff55 100644 --- a/include/linux/sched/task.h +++ b/include/linux/sched/task.h @@ -39,6 +39,8 @@ void __noreturn do_task_dead(void); extern void proc_caches_init(void); +extern void fork_init(void); + extern void release_task(struct task_struct * p); #ifdef CONFIG_HAVE_COPY_THREAD_TLS diff --git a/init/main.c b/init/main.c index 6a74ba0892d223..e2e80ca3165a87 100644 --- a/init/main.c +++ b/init/main.c @@ -105,7 +105,6 @@ static int kernel_init(void *); extern void init_IRQ(void); -extern void fork_init(void); extern void radix_tree_init(void); /* diff --git a/kernel/fork.c b/kernel/fork.c index d439c48ecf181c..a60459947f186d 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -164,10 +164,6 @@ static inline void free_task_struct(struct task_struct *tsk) } #endif -void __weak arch_release_thread_stack(unsigned long *stack) -{ -} - #ifndef CONFIG_ARCH_THREAD_STACK_ALLOCATOR /* @@ -422,7 +418,6 @@ static void release_task_stack(struct task_struct *tsk) return; /* Better to leak the stack than to free prematurely */ account_kernel_stack(tsk, -1); - arch_release_thread_stack(tsk->stack); free_thread_stack(tsk); tsk->stack = NULL; #ifdef CONFIG_VMAP_STACK From 8099b047ecc431518b9bb6bdbba3549bbecdc343 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Thu, 3 Jan 2019 15:28:07 -0800 Subject: [PATCH 38/58] exec: load_script: don't blindly truncate shebang string load_script() simply truncates bprm->buf and this is very wrong if the length of shebang string exceeds BINPRM_BUF_SIZE-2. This can silently truncate i_arg or (worse) we can execute the wrong binary if buf[2:126] happens to be the valid executable path. Change load_script() to return ENOEXEC if it can't find '\n' or zero in bprm->buf. Note that '\0' can come from either prepare_binprm()->memset() or from kernel_read(), we do not care. Link: http://lkml.kernel.org/r/20181112160931.GA28463@redhat.com Signed-off-by: Oleg Nesterov Acked-by: Kees Cook Acked-by: Michal Hocko Cc: Ben Woodard Cc: "Eric W. Biederman" Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/binfmt_script.c | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/fs/binfmt_script.c b/fs/binfmt_script.c index 7cde3f46ad263a..d0078cbb718b48 100644 --- a/fs/binfmt_script.c +++ b/fs/binfmt_script.c @@ -42,10 +42,14 @@ static int load_script(struct linux_binprm *bprm) fput(bprm->file); bprm->file = NULL; - bprm->buf[BINPRM_BUF_SIZE - 1] = '\0'; - if ((cp = strchr(bprm->buf, '\n')) == NULL) - cp = bprm->buf+BINPRM_BUF_SIZE-1; + for (cp = bprm->buf+2;; cp++) { + if (cp >= bprm->buf + BINPRM_BUF_SIZE) + return -ENOEXEC; + if (!*cp || (*cp == '\n')) + break; + } *cp = '\0'; + while (cp > bprm->buf) { cp--; if ((*cp == ' ') || (*cp == '\t')) From 655c16a8ce9c15842547f40ce23fd148aeccc074 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Thu, 3 Jan 2019 15:28:11 -0800 Subject: [PATCH 39/58] exec: separate MM_ANONPAGES and RLIMIT_STACK accounting get_arg_page() checks bprm->rlim_stack.rlim_cur and re-calculates the "extra" size for argv/envp pointers every time, this is a bit ugly and even not strictly correct: acct_arg_size() must not account this size. Remove all the rlimit code in get_arg_page(). Instead, add bprm->argmin calculated once at the start of __do_execve_file() and change copy_strings to check bprm->p >= bprm->argmin. The patch adds the new helper, prepare_arg_pages() which initializes bprm->argc/envc and bprm->argmin. [oleg@redhat.com: fix !CONFIG_MMU version of get_arg_page()] Link: http://lkml.kernel.org/r/20181126122307.GA1660@redhat.com [akpm@linux-foundation.org: use max_t] Link: http://lkml.kernel.org/r/20181112160910.GA28440@redhat.com Signed-off-by: Oleg Nesterov Acked-by: Kees Cook Tested-by: Guenter Roeck Cc: "Eric W. Biederman" Cc: Michal Hocko Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/exec.c | 105 ++++++++++++++++++++-------------------- include/linux/binfmts.h | 1 + 2 files changed, 53 insertions(+), 53 deletions(-) diff --git a/fs/exec.c b/fs/exec.c index fc281b738a9822..ea7d439cf79efe 100644 --- a/fs/exec.c +++ b/fs/exec.c @@ -218,55 +218,10 @@ static struct page *get_arg_page(struct linux_binprm *bprm, unsigned long pos, if (ret <= 0) return NULL; - if (write) { - unsigned long size = bprm->vma->vm_end - bprm->vma->vm_start; - unsigned long ptr_size, limit; - - /* - * Since the stack will hold pointers to the strings, we - * must account for them as well. - * - * The size calculation is the entire vma while each arg page is - * built, so each time we get here it's calculating how far it - * is currently (rather than each call being just the newly - * added size from the arg page). As a result, we need to - * always add the entire size of the pointers, so that on the - * last call to get_arg_page() we'll actually have the entire - * correct size. - */ - ptr_size = (bprm->argc + bprm->envc) * sizeof(void *); - if (ptr_size > ULONG_MAX - size) - goto fail; - size += ptr_size; - - acct_arg_size(bprm, size / PAGE_SIZE); - - /* - * We've historically supported up to 32 pages (ARG_MAX) - * of argument strings even with small stacks - */ - if (size <= ARG_MAX) - return page; - - /* - * Limit to 1/4 of the max stack size or 3/4 of _STK_LIM - * (whichever is smaller) for the argv+env strings. - * This ensures that: - * - the remaining binfmt code will not run out of stack space, - * - the program will have a reasonable amount of stack left - * to work from. - */ - limit = _STK_LIM / 4 * 3; - limit = min(limit, bprm->rlim_stack.rlim_cur / 4); - if (size > limit) - goto fail; - } + if (write) + acct_arg_size(bprm, vma_pages(bprm->vma)); return page; - -fail: - put_page(page); - return NULL; } static void put_arg_page(struct page *page) @@ -492,6 +447,50 @@ static int count(struct user_arg_ptr argv, int max) return i; } +static int prepare_arg_pages(struct linux_binprm *bprm, + struct user_arg_ptr argv, struct user_arg_ptr envp) +{ + unsigned long limit, ptr_size; + + bprm->argc = count(argv, MAX_ARG_STRINGS); + if (bprm->argc < 0) + return bprm->argc; + + bprm->envc = count(envp, MAX_ARG_STRINGS); + if (bprm->envc < 0) + return bprm->envc; + + /* + * Limit to 1/4 of the max stack size or 3/4 of _STK_LIM + * (whichever is smaller) for the argv+env strings. + * This ensures that: + * - the remaining binfmt code will not run out of stack space, + * - the program will have a reasonable amount of stack left + * to work from. + */ + limit = _STK_LIM / 4 * 3; + limit = min(limit, bprm->rlim_stack.rlim_cur / 4); + /* + * We've historically supported up to 32 pages (ARG_MAX) + * of argument strings even with small stacks + */ + limit = max_t(unsigned long, limit, ARG_MAX); + /* + * We must account for the size of all the argv and envp pointers to + * the argv and envp strings, since they will also take up space in + * the stack. They aren't stored until much later when we can't + * signal to the parent that the child has run out of stack space. + * Instead, calculate it here so it's possible to fail gracefully. + */ + ptr_size = (bprm->argc + bprm->envc) * sizeof(void *); + if (limit <= ptr_size) + return -E2BIG; + limit -= ptr_size; + + bprm->argmin = bprm->p - limit; + return 0; +} + /* * 'copy_strings()' copies argument/environment strings from the old * processes's memory to the new process's stack. The call to get_user_pages() @@ -527,6 +526,10 @@ static int copy_strings(int argc, struct user_arg_ptr argv, pos = bprm->p; str += len; bprm->p -= len; +#ifdef CONFIG_MMU + if (bprm->p < bprm->argmin) + goto out; +#endif while (len > 0) { int offset, bytes_to_copy; @@ -1789,12 +1792,8 @@ static int __do_execve_file(int fd, struct filename *filename, if (retval) goto out_unmark; - bprm->argc = count(argv, MAX_ARG_STRINGS); - if ((retval = bprm->argc) < 0) - goto out; - - bprm->envc = count(envp, MAX_ARG_STRINGS); - if ((retval = bprm->envc) < 0) + retval = prepare_arg_pages(bprm, argv, envp); + if (retval < 0) goto out; retval = prepare_binprm(bprm); diff --git a/include/linux/binfmts.h b/include/linux/binfmts.h index e9f5fe69df312d..03200a8c01787c 100644 --- a/include/linux/binfmts.h +++ b/include/linux/binfmts.h @@ -25,6 +25,7 @@ struct linux_binprm { #endif struct mm_struct *mm; unsigned long p; /* current top of mem */ + unsigned long argmin; /* rlimit marker for copy_strings() */ unsigned int /* * True after the bprm_set_creds hook has been called once From d1877155891020cb26ad4fba45bfee52d8da9951 Mon Sep 17 00:00:00 2001 From: Tigran Aivazian Date: Thu, 3 Jan 2019 15:28:14 -0800 Subject: [PATCH 40/58] bfs: extra sanity checking and static inode bitmap Strengthen validation of BFS superblock against corruption. Make in-core inode bitmap static part of superblock info structure. Print a warning when mounting a BFS filesystem created with "-N 512" option as only 510 files can be created in the root directory. Make the kernel messages more uniform. Update the 'prefix' passed to bfs_dump_imap() to match the current naming of operations. White space and comments cleanup. Link: http://lkml.kernel.org/r/CAK+_RLkFZMduoQF36wZFd3zLi-6ZutWKsydjeHFNdtRvZZEb4w@mail.gmail.com Signed-off-by: Tigran Aivazian Reported-by: Tetsuo Handa Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/bfs/bfs.h | 11 +++++-- fs/bfs/dir.c | 4 +-- fs/bfs/file.c | 2 +- fs/bfs/inode.c | 65 ++++++++++++++++--------------------- include/uapi/linux/bfs_fs.h | 2 +- 5 files changed, 41 insertions(+), 43 deletions(-) diff --git a/fs/bfs/bfs.h b/fs/bfs/bfs.h index 67aef3bb89e4c4..606f9378b2f03d 100644 --- a/fs/bfs/bfs.h +++ b/fs/bfs/bfs.h @@ -1,13 +1,20 @@ /* SPDX-License-Identifier: GPL-2.0 */ /* * fs/bfs/bfs.h - * Copyright (C) 1999 Tigran Aivazian + * Copyright (C) 1999-2018 Tigran Aivazian */ #ifndef _FS_BFS_BFS_H #define _FS_BFS_BFS_H #include +/* In theory BFS supports up to 512 inodes, numbered from 2 (for /) up to 513 inclusive. + In actual fact, attempting to create the 512th inode (i.e. inode No. 513 or file No. 511) + will fail with ENOSPC in bfs_add_entry(): the root directory cannot contain so many entries, counting '..'. + So, mkfs.bfs(8) should really limit its -N option to 511 and not 512. For now, we just print a warning + if a filesystem is mounted with such "impossible to fill up" number of inodes */ +#define BFS_MAX_LASTI 513 + /* * BFS file system in-core superblock info */ @@ -17,7 +24,7 @@ struct bfs_sb_info { unsigned long si_freei; unsigned long si_lf_eblk; unsigned long si_lasti; - unsigned long *si_imap; + DECLARE_BITMAP(si_imap, BFS_MAX_LASTI+1); struct mutex bfs_lock; }; diff --git a/fs/bfs/dir.c b/fs/bfs/dir.c index f32f21c3bbc7b0..d8dfe3a0cb390d 100644 --- a/fs/bfs/dir.c +++ b/fs/bfs/dir.c @@ -2,8 +2,8 @@ /* * fs/bfs/dir.c * BFS directory operations. - * Copyright (C) 1999,2000 Tigran Aivazian - * Made endianness-clean by Andrew Stribblehill 2005 + * Copyright (C) 1999-2018 Tigran Aivazian + * Made endianness-clean by Andrew Stribblehill 2005 */ #include diff --git a/fs/bfs/file.c b/fs/bfs/file.c index 1476cdd90cfbb1..0dceefc54b48ab 100644 --- a/fs/bfs/file.c +++ b/fs/bfs/file.c @@ -2,7 +2,7 @@ /* * fs/bfs/file.c * BFS file operations. - * Copyright (C) 1999,2000 Tigran Aivazian + * Copyright (C) 1999-2018 Tigran Aivazian * * Make the file block allocation algorithm understand the size * of the underlying block device. diff --git a/fs/bfs/inode.c b/fs/bfs/inode.c index d81c148682e715..d136b2aaafb3a9 100644 --- a/fs/bfs/inode.c +++ b/fs/bfs/inode.c @@ -1,10 +1,9 @@ /* * fs/bfs/inode.c * BFS superblock and inode operations. - * Copyright (C) 1999-2006 Tigran Aivazian + * Copyright (C) 1999-2018 Tigran Aivazian * From fs/minix, Copyright (C) 1991, 1992 Linus Torvalds. - * - * Made endianness-clean by Andrew Stribblehill , 2005. + * Made endianness-clean by Andrew Stribblehill , 2005. */ #include @@ -118,12 +117,12 @@ static int bfs_write_inode(struct inode *inode, struct writeback_control *wbc) { struct bfs_sb_info *info = BFS_SB(inode->i_sb); unsigned int ino = (u16)inode->i_ino; - unsigned long i_sblock; + unsigned long i_sblock; struct bfs_inode *di; struct buffer_head *bh; int err = 0; - dprintf("ino=%08x\n", ino); + dprintf("ino=%08x\n", ino); di = find_inode(inode->i_sb, ino, &bh); if (IS_ERR(di)) @@ -144,7 +143,7 @@ static int bfs_write_inode(struct inode *inode, struct writeback_control *wbc) di->i_atime = cpu_to_le32(inode->i_atime.tv_sec); di->i_mtime = cpu_to_le32(inode->i_mtime.tv_sec); di->i_ctime = cpu_to_le32(inode->i_ctime.tv_sec); - i_sblock = BFS_I(inode)->i_sblock; + i_sblock = BFS_I(inode)->i_sblock; di->i_sblock = cpu_to_le32(i_sblock); di->i_eblock = cpu_to_le32(BFS_I(inode)->i_eblock); di->i_eoffset = cpu_to_le32(i_sblock * BFS_BSIZE + inode->i_size - 1); @@ -188,13 +187,13 @@ static void bfs_evict_inode(struct inode *inode) mark_buffer_dirty(bh); brelse(bh); - if (bi->i_dsk_ino) { + if (bi->i_dsk_ino) { if (bi->i_sblock) info->si_freeb += bi->i_eblock + 1 - bi->i_sblock; info->si_freei++; clear_bit(ino, info->si_imap); - bfs_dump_imap("delete_inode", s); - } + bfs_dump_imap("evict_inode", s); + } /* * If this was the last file, make the previous block @@ -214,7 +213,6 @@ static void bfs_put_super(struct super_block *s) return; mutex_destroy(&info->bfs_lock); - kfree(info->si_imap); kfree(info); s->s_fs_info = NULL; } @@ -311,8 +309,7 @@ void bfs_dump_imap(const char *prefix, struct super_block *s) else strcat(tmpbuf, "0"); } - printf("BFS-fs: %s: lasti=%08lx <%s>\n", - prefix, BFS_SB(s)->si_lasti, tmpbuf); + printf("%s: lasti=%08lx <%s>\n", prefix, BFS_SB(s)->si_lasti, tmpbuf); free_page((unsigned long)tmpbuf); #endif } @@ -322,7 +319,7 @@ static int bfs_fill_super(struct super_block *s, void *data, int silent) struct buffer_head *bh, *sbh; struct bfs_super_block *bfs_sb; struct inode *inode; - unsigned i, imap_len; + unsigned i; struct bfs_sb_info *info; int ret = -EINVAL; unsigned long i_sblock, i_eblock, i_eoff, s_size; @@ -341,8 +338,7 @@ static int bfs_fill_super(struct super_block *s, void *data, int silent) bfs_sb = (struct bfs_super_block *)sbh->b_data; if (le32_to_cpu(bfs_sb->s_magic) != BFS_MAGIC) { if (!silent) - printf("No BFS filesystem on %s (magic=%08x)\n", - s->s_id, le32_to_cpu(bfs_sb->s_magic)); + printf("No BFS filesystem on %s (magic=%08x)\n", s->s_id, le32_to_cpu(bfs_sb->s_magic)); goto out1; } if (BFS_UNCLEAN(bfs_sb, s) && !silent) @@ -351,18 +347,16 @@ static int bfs_fill_super(struct super_block *s, void *data, int silent) s->s_magic = BFS_MAGIC; if (le32_to_cpu(bfs_sb->s_start) > le32_to_cpu(bfs_sb->s_end) || - le32_to_cpu(bfs_sb->s_start) < BFS_BSIZE) { - printf("Superblock is corrupted\n"); + le32_to_cpu(bfs_sb->s_start) < sizeof(struct bfs_super_block) + sizeof(struct bfs_dirent)) { + printf("Superblock is corrupted on %s\n", s->s_id); goto out1; } - info->si_lasti = (le32_to_cpu(bfs_sb->s_start) - BFS_BSIZE) / - sizeof(struct bfs_inode) - + BFS_ROOT_INO - 1; - imap_len = (info->si_lasti / 8) + 1; - info->si_imap = kzalloc(imap_len, GFP_KERNEL | __GFP_NOWARN); - if (!info->si_imap) { - printf("Cannot allocate %u bytes\n", imap_len); + info->si_lasti = (le32_to_cpu(bfs_sb->s_start) - BFS_BSIZE) / sizeof(struct bfs_inode) + BFS_ROOT_INO - 1; + if (info->si_lasti == BFS_MAX_LASTI) + printf("WARNING: filesystem %s was created with 512 inodes, the real maximum is 511, mounting anyway\n", s->s_id); + else if (info->si_lasti > BFS_MAX_LASTI) { + printf("Impossible last inode number %lu > %d on %s\n", info->si_lasti, BFS_MAX_LASTI, s->s_id); goto out1; } for (i = 0; i < BFS_ROOT_INO; i++) @@ -372,26 +366,25 @@ static int bfs_fill_super(struct super_block *s, void *data, int silent) inode = bfs_iget(s, BFS_ROOT_INO); if (IS_ERR(inode)) { ret = PTR_ERR(inode); - goto out2; + goto out1; } s->s_root = d_make_root(inode); if (!s->s_root) { ret = -ENOMEM; - goto out2; + goto out1; } info->si_blocks = (le32_to_cpu(bfs_sb->s_end) + 1) >> BFS_BSIZE_BITS; - info->si_freeb = (le32_to_cpu(bfs_sb->s_end) + 1 - - le32_to_cpu(bfs_sb->s_start)) >> BFS_BSIZE_BITS; + info->si_freeb = (le32_to_cpu(bfs_sb->s_end) + 1 - le32_to_cpu(bfs_sb->s_start)) >> BFS_BSIZE_BITS; info->si_freei = 0; info->si_lf_eblk = 0; /* can we read the last block? */ bh = sb_bread(s, info->si_blocks - 1); if (!bh) { - printf("Last block not available: %lu\n", info->si_blocks - 1); + printf("Last block not available on %s: %lu\n", s->s_id, info->si_blocks - 1); ret = -EIO; - goto out3; + goto out2; } brelse(bh); @@ -425,11 +418,11 @@ static int bfs_fill_super(struct super_block *s, void *data, int silent) (i_eoff != le32_to_cpu(-1) && i_eoff > s_size) || i_sblock * BFS_BSIZE > i_eoff) { - printf("Inode 0x%08x corrupted\n", i); + printf("Inode 0x%08x corrupted on %s\n", i, s->s_id); brelse(bh); ret = -EIO; - goto out3; + goto out2; } if (!di->i_ino) { @@ -445,14 +438,12 @@ static int bfs_fill_super(struct super_block *s, void *data, int silent) } brelse(bh); brelse(sbh); - bfs_dump_imap("read_super", s); + bfs_dump_imap("fill_super", s); return 0; -out3: +out2: dput(s->s_root); s->s_root = NULL; -out2: - kfree(info->si_imap); out1: brelse(sbh); out: @@ -482,7 +473,7 @@ static int __init init_bfs_fs(void) int err = init_inodecache(); if (err) goto out1; - err = register_filesystem(&bfs_fs_type); + err = register_filesystem(&bfs_fs_type); if (err) goto out; return 0; diff --git a/include/uapi/linux/bfs_fs.h b/include/uapi/linux/bfs_fs.h index 940b04772af801..08f6b495635982 100644 --- a/include/uapi/linux/bfs_fs.h +++ b/include/uapi/linux/bfs_fs.h @@ -1,7 +1,7 @@ /* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */ /* * include/linux/bfs_fs.h - BFS data structures on disk. - * Copyright (C) 1999 Tigran Aivazian + * Copyright (C) 1999-2018 Tigran Aivazian */ #ifndef _LINUX_BFS_FS_H From d999bd9392dea7c1a9ac43b8680b22c4425ae4c7 Mon Sep 17 00:00:00 2001 From: Feng Tang Date: Thu, 3 Jan 2019 15:28:17 -0800 Subject: [PATCH 41/58] panic: add options to print system info when panic happens Kernel panic issues are always painful to debug, partially because it's not easy to get enough information of the context when panic happens. And we have ramoops and kdump for that, while this commit tries to provide a easier way to show the system info by adding a cmdline parameter, referring some idea from sysrq handler. Link: http://lkml.kernel.org/r/1543398842-19295-2-git-send-email-feng.tang@intel.com Signed-off-by: Feng Tang Reviewed-by: Kees Cook Acked-by: Steven Rostedt (VMware) Cc: Thomas Gleixner Cc: John Stultz Cc: Ingo Molnar Cc: Peter Zijlstra Cc: Steven Rostedt Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- .../admin-guide/kernel-parameters.txt | 8 ++++++ kernel/panic.c | 28 +++++++++++++++++++ 2 files changed, 36 insertions(+) diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt index 408781ee142c9f..e7b5c49702bbf2 100644 --- a/Documentation/admin-guide/kernel-parameters.txt +++ b/Documentation/admin-guide/kernel-parameters.txt @@ -3092,6 +3092,14 @@ timeout < 0: reboot immediately Format: + panic_print= Bitmask for printing system info when panic happens. + User can chose combination of the following bits: + bit 0: print all tasks info + bit 1: print system memory info + bit 2: print timer info + bit 3: print locks info if CONFIG_LOCKDEP is on + bit 4: print ftrace buffer + panic_on_warn panic() instead of WARN(). Useful to cause kdump on a WARN(). diff --git a/kernel/panic.c b/kernel/panic.c index d10c340c43b0e1..855f66738bc7b1 100644 --- a/kernel/panic.c +++ b/kernel/panic.c @@ -46,6 +46,13 @@ int panic_on_warn __read_mostly; int panic_timeout = CONFIG_PANIC_TIMEOUT; EXPORT_SYMBOL_GPL(panic_timeout); +#define PANIC_PRINT_TASK_INFO 0x00000001 +#define PANIC_PRINT_MEM_INFO 0x00000002 +#define PANIC_PRINT_TIMER_INFO 0x00000004 +#define PANIC_PRINT_LOCK_INFO 0x00000008 +#define PANIC_PRINT_FTRACE_INFO 0x00000010 +static unsigned long panic_print; + ATOMIC_NOTIFIER_HEAD(panic_notifier_list); EXPORT_SYMBOL(panic_notifier_list); @@ -125,6 +132,24 @@ void nmi_panic(struct pt_regs *regs, const char *msg) } EXPORT_SYMBOL(nmi_panic); +static void panic_print_sys_info(void) +{ + if (panic_print & PANIC_PRINT_TASK_INFO) + show_state(); + + if (panic_print & PANIC_PRINT_MEM_INFO) + show_mem(0, NULL); + + if (panic_print & PANIC_PRINT_TIMER_INFO) + sysrq_timer_list_show(); + + if (panic_print & PANIC_PRINT_LOCK_INFO) + debug_show_all_locks(); + + if (panic_print & PANIC_PRINT_FTRACE_INFO) + ftrace_dump(DUMP_ALL); +} + /** * panic - halt the system * @fmt: The text string to print @@ -254,6 +279,8 @@ void panic(const char *fmt, ...) debug_locks_off(); console_flush_on_panic(); + panic_print_sys_info(); + if (!panic_blink) panic_blink = no_blink; @@ -658,6 +685,7 @@ void refcount_error_report(struct pt_regs *regs, const char *err) #endif core_param(panic, panic_timeout, int, 0644); +core_param(panic_print, panic_print, ulong, 0644); core_param(pause_on_oops, pause_on_oops, int, 0644); core_param(panic_on_warn, panic_on_warn, int, 0644); core_param(crash_kexec_post_notifiers, crash_kexec_post_notifiers, bool, 0644); From 81c9d43f94870be66146739c6e61df40dc17bb64 Mon Sep 17 00:00:00 2001 From: Feng Tang Date: Thu, 3 Jan 2019 15:28:20 -0800 Subject: [PATCH 42/58] kernel/sysctl: add panic_print into sysctl So that we can also runtime chose to print out the needed system info for panic, other than setting the kernel cmdline. Link: http://lkml.kernel.org/r/1543398842-19295-3-git-send-email-feng.tang@intel.com Signed-off-by: Feng Tang Suggested-by: Steven Rostedt Acked-by: Steven Rostedt (VMware) Cc: Thomas Gleixner Cc: John Stultz Cc: Ingo Molnar Cc: Peter Zijlstra Cc: Kees Cook Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- Documentation/sysctl/kernel.txt | 17 +++++++++++++++++ include/linux/kernel.h | 1 + include/uapi/linux/sysctl.h | 1 + kernel/panic.c | 2 +- kernel/sysctl.c | 7 +++++++ kernel/sysctl_binary.c | 1 + 6 files changed, 28 insertions(+), 1 deletion(-) diff --git a/Documentation/sysctl/kernel.txt b/Documentation/sysctl/kernel.txt index 1b8775298cf7a0..c0527d8a468a1d 100644 --- a/Documentation/sysctl/kernel.txt +++ b/Documentation/sysctl/kernel.txt @@ -60,6 +60,7 @@ show up in /proc/sys/kernel: - panic_on_stackoverflow - panic_on_unrecovered_nmi - panic_on_warn +- panic_print - panic_on_rcu_stall - perf_cpu_time_max_percent - perf_event_paranoid @@ -654,6 +655,22 @@ a kernel rebuild when attempting to kdump at the location of a WARN(). ============================================================== +panic_print: + +Bitmask for printing system info when panic happens. User can chose +combination of the following bits: + +bit 0: print all tasks info +bit 1: print system memory info +bit 2: print timer info +bit 3: print locks info if CONFIG_LOCKDEP is on +bit 4: print ftrace buffer + +So for example to print tasks and memory info on panic, user can: + echo 3 > /proc/sys/kernel/panic_print + +============================================================== + panic_on_rcu_stall: When set to 1, calls panic() after RCU stall detection messages. This diff --git a/include/linux/kernel.h b/include/linux/kernel.h index d6aac75b51baab..8f0e68e250a760 100644 --- a/include/linux/kernel.h +++ b/include/linux/kernel.h @@ -527,6 +527,7 @@ static inline u32 int_sqrt64(u64 x) extern void bust_spinlocks(int yes); extern int oops_in_progress; /* If set, an oops, panic(), BUG() or die() is in progress */ extern int panic_timeout; +extern unsigned long panic_print; extern int panic_on_oops; extern int panic_on_unrecovered_nmi; extern int panic_on_io_nmi; diff --git a/include/uapi/linux/sysctl.h b/include/uapi/linux/sysctl.h index d71013fffaf65b..87aa2a6d912566 100644 --- a/include/uapi/linux/sysctl.h +++ b/include/uapi/linux/sysctl.h @@ -153,6 +153,7 @@ enum KERN_NMI_WATCHDOG=75, /* int: enable/disable nmi watchdog */ KERN_PANIC_ON_NMI=76, /* int: whether we will panic on an unrecovered */ KERN_PANIC_ON_WARN=77, /* int: call panic() in WARN() functions */ + KERN_PANIC_PRINT=78, /* ulong: bitmask to print system info on panic */ }; diff --git a/kernel/panic.c b/kernel/panic.c index 855f66738bc7b1..f121e6ba7e114e 100644 --- a/kernel/panic.c +++ b/kernel/panic.c @@ -51,7 +51,7 @@ EXPORT_SYMBOL_GPL(panic_timeout); #define PANIC_PRINT_TIMER_INFO 0x00000004 #define PANIC_PRINT_LOCK_INFO 0x00000008 #define PANIC_PRINT_FTRACE_INFO 0x00000010 -static unsigned long panic_print; +unsigned long panic_print; ATOMIC_NOTIFIER_HEAD(panic_notifier_list); diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 7f6c1a3b3485f1..ba4d9e85feb8ca 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -807,6 +807,13 @@ static struct ctl_table kern_table[] = { .mode = 0644, .proc_handler = proc_dointvec, }, + { + .procname = "panic_print", + .data = &panic_print, + .maxlen = sizeof(unsigned long), + .mode = 0644, + .proc_handler = proc_doulongvec_minmax, + }, #if defined CONFIG_PRINTK { .procname = "printk", diff --git a/kernel/sysctl_binary.c b/kernel/sysctl_binary.c index 07148b4974516c..73c132095a7bd4 100644 --- a/kernel/sysctl_binary.c +++ b/kernel/sysctl_binary.c @@ -140,6 +140,7 @@ static const struct bin_table bin_kern_table[] = { { CTL_INT, KERN_MAX_LOCK_DEPTH, "max_lock_depth" }, { CTL_INT, KERN_PANIC_ON_NMI, "panic_on_unrecovered_nmi" }, { CTL_INT, KERN_PANIC_ON_WARN, "panic_on_warn" }, + { CTL_ULONG, KERN_PANIC_PRINT, "panic_print" }, {} }; From 634724431607f6f46c495dfef801a1c8b44a96d9 Mon Sep 17 00:00:00 2001 From: Anders Roxell Date: Thu, 3 Jan 2019 15:28:24 -0800 Subject: [PATCH 43/58] kernel/kcov.c: mark write_comp_data() as notrace Since __sanitizer_cov_trace_const_cmp4 is marked as notrace, the function called from __sanitizer_cov_trace_const_cmp4 shouldn't be traceable either. ftrace_graph_caller() gets called every time func write_comp_data() gets called if it isn't marked 'notrace'. This is the backtrace from gdb: #0 ftrace_graph_caller () at ../arch/arm64/kernel/entry-ftrace.S:179 #1 0xffffff8010201920 in ftrace_caller () at ../arch/arm64/kernel/entry-ftrace.S:151 #2 0xffffff8010439714 in write_comp_data (type=5, arg1=0, arg2=0, ip=18446743524224276596) at ../kernel/kcov.c:116 #3 0xffffff8010439894 in __sanitizer_cov_trace_const_cmp4 (arg1=, arg2=) at ../kernel/kcov.c:188 #4 0xffffff8010201874 in prepare_ftrace_return (self_addr=18446743524226602768, parent=0xffffff801014b918, frame_pointer=18446743524223531344) at ./include/generated/atomic-instrumented.h:27 #5 0xffffff801020194c in ftrace_graph_caller () at ../arch/arm64/kernel/entry-ftrace.S:182 Rework so that write_comp_data() that are called from __sanitizer_cov_trace_*_cmp*() are marked as 'notrace'. Commit 903e8ff86753 ("kernel/kcov.c: mark funcs in __sanitizer_cov_trace_pc() as notrace") missed to mark write_comp_data() as 'notrace'. When that patch was created gcc-7 was used. In lib/Kconfig.debug config KCOV_ENABLE_COMPARISONS depends on $(cc-option,-fsanitize-coverage=trace-cmp) That code path isn't hit with gcc-7. However, it were that with gcc-8. Link: http://lkml.kernel.org/r/20181206143011.23719-1-anders.roxell@linaro.org Signed-off-by: Anders Roxell Signed-off-by: Arnd Bergmann Co-developed-by: Arnd Bergmann Acked-by: Steven Rostedt (VMware) Cc: Will Deacon Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/kcov.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/kcov.c b/kernel/kcov.c index 97959d7b77e2a9..c2277dbdbfb148 100644 --- a/kernel/kcov.c +++ b/kernel/kcov.c @@ -112,7 +112,7 @@ void notrace __sanitizer_cov_trace_pc(void) EXPORT_SYMBOL(__sanitizer_cov_trace_pc); #ifdef CONFIG_KCOV_ENABLE_COMPARISONS -static void write_comp_data(u64 type, u64 arg1, u64 arg2, u64 ip) +static void notrace write_comp_data(u64 type, u64 arg1, u64 arg2, u64 ip) { struct task_struct *t; u64 *area; From b058809bfc8faeb7b7cae047666e23375a060059 Mon Sep 17 00:00:00 2001 From: Du Changbin Date: Thu, 3 Jan 2019 15:28:27 -0800 Subject: [PATCH 44/58] scripts/gdb: fix lx-version string output A bug is present in GDB which causes early string termination when parsing variables. This has been reported [0], but we should ensure that we can support at least basic printing of the core kernel strings. For current gdb version (has been tested with 7.3 and 8.1), 'lx-version' only prints one character. (gdb) lx-version L(gdb) This can be fixed by casting 'linux_banner' as (char *). (gdb) lx-version Linux version 4.19.0-rc1+ (changbin@acer) (gcc version 7.3.0 (Ubuntu 7.3.0-16ubuntu3)) #21 SMP Sat Sep 1 21:43:30 CST 2018 [0] https://sourceware.org/bugzilla/show_bug.cgi?id=20077 [kbingham@kernel.org: add detail to commit message] Link: http://lkml.kernel.org/r/20181111162035.8356-1-kieran.bingham@ideasonboard.com Fixes: 2d061d999424 ("scripts/gdb: add version command") Signed-off-by: Du Changbin Signed-off-by: Kieran Bingham Acked-by: Jan Kiszka Cc: Jan Kiszka Cc: Jason Wessel Cc: Daniel Thompson Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- scripts/gdb/linux/proc.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/gdb/linux/proc.py b/scripts/gdb/linux/proc.py index 086d27223c0cf3..0aebd7565b032f 100644 --- a/scripts/gdb/linux/proc.py +++ b/scripts/gdb/linux/proc.py @@ -41,7 +41,7 @@ def __init__(self): def invoke(self, arg, from_tty): # linux_banner should contain a newline - gdb.write(gdb.parse_and_eval("linux_banner").string()) + gdb.write(gdb.parse_and_eval("(char *)linux_banner").string()) LxVersion() From ff1522bb7d98450c72aea729f0b4147bc9986aed Mon Sep 17 00:00:00 2001 From: David Engraf Date: Thu, 3 Jan 2019 15:28:31 -0800 Subject: [PATCH 45/58] initramfs: cleanup incomplete rootfs Unpacking an external initrd may fail e.g. not enough memory. This leads to an incomplete rootfs because some files might be extracted already. Fixed by cleaning the rootfs so the kernel is not using an incomplete rootfs. Link: http://lkml.kernel.org/r/20181030151805.5519-1-david.engraf@sysgo.com Signed-off-by: David Engraf Cc: Dominik Brodowski Cc: Greg Kroah-Hartman Cc: Philippe Ombredanne Cc: Arnd Bergmann Cc: Luc Van Oostenryck Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- init/initramfs.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/init/initramfs.c b/init/initramfs.c index fca899622937f4..7cea802d00efa3 100644 --- a/init/initramfs.c +++ b/init/initramfs.c @@ -550,7 +550,6 @@ static void __init free_initrd(void) initrd_end = 0; } -#ifdef CONFIG_BLK_DEV_RAM #define BUF_SIZE 1024 static void __init clean_rootfs(void) { @@ -597,7 +596,6 @@ static void __init clean_rootfs(void) ksys_close(fd); kfree(buf); } -#endif static int __init populate_rootfs(void) { @@ -640,8 +638,10 @@ static int __init populate_rootfs(void) printk(KERN_INFO "Unpacking initramfs...\n"); err = unpack_to_rootfs((char *)initrd_start, initrd_end - initrd_start); - if (err) + if (err) { printk(KERN_EMERG "Initramfs unpacking failed: %s\n", err); + clean_rootfs(); + } free_initrd(); #endif } From 4cf58924951ef80eec636b863e7a53973c44261a Mon Sep 17 00:00:00 2001 From: "Joel Fernandes (Google)" Date: Thu, 3 Jan 2019 15:28:34 -0800 Subject: [PATCH 46/58] mm: treewide: remove unused address argument from pte_alloc functions Patch series "Add support for fast mremap". This series speeds up the mremap(2) syscall by copying page tables at the PMD level even for non-THP systems. There is concern that the extra 'address' argument that mremap passes to pte_alloc may do something subtle architecture related in the future that may make the scheme not work. Also we find that there is no point in passing the 'address' to pte_alloc since its unused. This patch therefore removes this argument tree-wide resulting in a nice negative diff as well. Also ensuring along the way that the enabled architectures do not do anything funky with the 'address' argument that goes unnoticed by the optimization. Build and boot tested on x86-64. Build tested on arm64. The config enablement patch for arm64 will be posted in the future after more testing. The changes were obtained by applying the following Coccinelle script. (thanks Julia for answering all Coccinelle questions!). Following fix ups were done manually: * Removal of address argument from pte_fragment_alloc * Removal of pte_alloc_one_fast definitions from m68k and microblaze. // Options: --include-headers --no-includes // Note: I split the 'identifier fn' line, so if you are manually // running it, please unsplit it so it runs for you. virtual patch @pte_alloc_func_def depends on patch exists@ identifier E2; identifier fn =~ "^(__pte_alloc|pte_alloc_one|pte_alloc|__pte_alloc_kernel|pte_alloc_one_kernel)$"; type T2; @@ fn(... - , T2 E2 ) { ... } @pte_alloc_func_proto_noarg depends on patch exists@ type T1, T2, T3, T4; identifier fn =~ "^(__pte_alloc|pte_alloc_one|pte_alloc|__pte_alloc_kernel|pte_alloc_one_kernel)$"; @@ ( - T3 fn(T1, T2); + T3 fn(T1); | - T3 fn(T1, T2, T4); + T3 fn(T1, T2); ) @pte_alloc_func_proto depends on patch exists@ identifier E1, E2, E4; type T1, T2, T3, T4; identifier fn =~ "^(__pte_alloc|pte_alloc_one|pte_alloc|__pte_alloc_kernel|pte_alloc_one_kernel)$"; @@ ( - T3 fn(T1 E1, T2 E2); + T3 fn(T1 E1); | - T3 fn(T1 E1, T2 E2, T4 E4); + T3 fn(T1 E1, T2 E2); ) @pte_alloc_func_call depends on patch exists@ expression E2; identifier fn =~ "^(__pte_alloc|pte_alloc_one|pte_alloc|__pte_alloc_kernel|pte_alloc_one_kernel)$"; @@ fn(... -, E2 ) @pte_alloc_macro depends on patch exists@ identifier fn =~ "^(__pte_alloc|pte_alloc_one|pte_alloc|__pte_alloc_kernel|pte_alloc_one_kernel)$"; identifier a, b, c; expression e; position p; @@ ( - #define fn(a, b, c) e + #define fn(a, b) e | - #define fn(a, b) e + #define fn(a) e ) Link: http://lkml.kernel.org/r/20181108181201.88826-2-joelaf@google.com Signed-off-by: Joel Fernandes (Google) Suggested-by: Kirill A. Shutemov Acked-by: Kirill A. Shutemov Cc: Michal Hocko Cc: Julia Lawall Cc: Kirill A. Shutemov Cc: William Kucharski Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/alpha/include/asm/pgalloc.h | 6 +++--- arch/arc/include/asm/pgalloc.h | 5 ++--- arch/arm/include/asm/pgalloc.h | 4 ++-- arch/arm64/include/asm/pgalloc.h | 4 ++-- arch/hexagon/include/asm/pgalloc.h | 6 ++---- arch/ia64/include/asm/pgalloc.h | 5 ++--- arch/m68k/include/asm/mcf_pgalloc.h | 8 ++------ arch/m68k/include/asm/motorola_pgalloc.h | 4 ++-- arch/m68k/include/asm/sun3_pgalloc.h | 6 ++---- arch/microblaze/include/asm/pgalloc.h | 19 ++----------------- arch/microblaze/mm/pgtable.c | 3 +-- arch/mips/include/asm/pgalloc.h | 6 ++---- arch/nds32/include/asm/pgalloc.h | 5 ++--- arch/nios2/include/asm/pgalloc.h | 6 ++---- arch/openrisc/include/asm/pgalloc.h | 5 ++--- arch/openrisc/mm/ioremap.c | 3 +-- arch/parisc/include/asm/pgalloc.h | 4 ++-- arch/powerpc/include/asm/book3s/32/pgalloc.h | 6 +++--- arch/powerpc/include/asm/book3s/64/pgalloc.h | 12 +++++------- arch/powerpc/include/asm/nohash/32/pgalloc.h | 6 +++--- arch/powerpc/include/asm/nohash/64/pgalloc.h | 6 ++---- arch/powerpc/mm/pgtable-frag.c | 2 +- arch/powerpc/mm/pgtable_32.c | 8 ++++---- arch/riscv/include/asm/pgalloc.h | 6 ++---- arch/s390/include/asm/pgalloc.h | 4 ++-- arch/sh/include/asm/pgalloc.h | 6 ++---- arch/sparc/include/asm/pgalloc_32.h | 5 ++--- arch/sparc/include/asm/pgalloc_64.h | 6 ++---- arch/sparc/mm/init_64.c | 6 ++---- arch/sparc/mm/srmmu.c | 4 ++-- arch/um/include/asm/pgalloc.h | 4 ++-- arch/um/kernel/mem.c | 4 ++-- arch/unicore32/include/asm/pgalloc.h | 4 ++-- arch/x86/include/asm/pgalloc.h | 4 ++-- arch/x86/mm/pgtable.c | 4 ++-- arch/xtensa/include/asm/pgalloc.h | 8 +++----- include/linux/mm.h | 13 ++++++------- mm/huge_memory.c | 8 ++++---- mm/kasan/init.c | 2 +- mm/memory.c | 17 ++++++++--------- mm/migrate.c | 2 +- mm/mremap.c | 2 +- mm/userfaultfd.c | 2 +- virt/kvm/arm/mmu.c | 2 +- 44 files changed, 101 insertions(+), 151 deletions(-) diff --git a/arch/alpha/include/asm/pgalloc.h b/arch/alpha/include/asm/pgalloc.h index ab3e3a8638fbe5..02f9f91bb4f069 100644 --- a/arch/alpha/include/asm/pgalloc.h +++ b/arch/alpha/include/asm/pgalloc.h @@ -52,7 +52,7 @@ pmd_free(struct mm_struct *mm, pmd_t *pmd) } static inline pte_t * -pte_alloc_one_kernel(struct mm_struct *mm, unsigned long address) +pte_alloc_one_kernel(struct mm_struct *mm) { pte_t *pte = (pte_t *)__get_free_page(GFP_KERNEL|__GFP_ZERO); return pte; @@ -65,9 +65,9 @@ pte_free_kernel(struct mm_struct *mm, pte_t *pte) } static inline pgtable_t -pte_alloc_one(struct mm_struct *mm, unsigned long address) +pte_alloc_one(struct mm_struct *mm) { - pte_t *pte = pte_alloc_one_kernel(mm, address); + pte_t *pte = pte_alloc_one_kernel(mm); struct page *page; if (!pte) diff --git a/arch/arc/include/asm/pgalloc.h b/arch/arc/include/asm/pgalloc.h index 3749234b74196c..9c9b5a5ebf2e74 100644 --- a/arch/arc/include/asm/pgalloc.h +++ b/arch/arc/include/asm/pgalloc.h @@ -90,8 +90,7 @@ static inline int __get_order_pte(void) return get_order(PTRS_PER_PTE * sizeof(pte_t)); } -static inline pte_t *pte_alloc_one_kernel(struct mm_struct *mm, - unsigned long address) +static inline pte_t *pte_alloc_one_kernel(struct mm_struct *mm) { pte_t *pte; @@ -102,7 +101,7 @@ static inline pte_t *pte_alloc_one_kernel(struct mm_struct *mm, } static inline pgtable_t -pte_alloc_one(struct mm_struct *mm, unsigned long address) +pte_alloc_one(struct mm_struct *mm) { pgtable_t pte_pg; struct page *page; diff --git a/arch/arm/include/asm/pgalloc.h b/arch/arm/include/asm/pgalloc.h index 2d7344f0e2085b..17ab72f0cc4e61 100644 --- a/arch/arm/include/asm/pgalloc.h +++ b/arch/arm/include/asm/pgalloc.h @@ -81,7 +81,7 @@ static inline void clean_pte_table(pte_t *pte) * +------------+ */ static inline pte_t * -pte_alloc_one_kernel(struct mm_struct *mm, unsigned long addr) +pte_alloc_one_kernel(struct mm_struct *mm) { pte_t *pte; @@ -93,7 +93,7 @@ pte_alloc_one_kernel(struct mm_struct *mm, unsigned long addr) } static inline pgtable_t -pte_alloc_one(struct mm_struct *mm, unsigned long addr) +pte_alloc_one(struct mm_struct *mm) { struct page *pte; diff --git a/arch/arm64/include/asm/pgalloc.h b/arch/arm64/include/asm/pgalloc.h index 2e05bcd944c839..52fa47c73bf042 100644 --- a/arch/arm64/include/asm/pgalloc.h +++ b/arch/arm64/include/asm/pgalloc.h @@ -91,13 +91,13 @@ extern pgd_t *pgd_alloc(struct mm_struct *mm); extern void pgd_free(struct mm_struct *mm, pgd_t *pgdp); static inline pte_t * -pte_alloc_one_kernel(struct mm_struct *mm, unsigned long addr) +pte_alloc_one_kernel(struct mm_struct *mm) { return (pte_t *)__get_free_page(PGALLOC_GFP); } static inline pgtable_t -pte_alloc_one(struct mm_struct *mm, unsigned long addr) +pte_alloc_one(struct mm_struct *mm) { struct page *pte; diff --git a/arch/hexagon/include/asm/pgalloc.h b/arch/hexagon/include/asm/pgalloc.h index eeebf862c46c6a..d36183887b60aa 100644 --- a/arch/hexagon/include/asm/pgalloc.h +++ b/arch/hexagon/include/asm/pgalloc.h @@ -59,8 +59,7 @@ static inline void pgd_free(struct mm_struct *mm, pgd_t *pgd) free_page((unsigned long) pgd); } -static inline struct page *pte_alloc_one(struct mm_struct *mm, - unsigned long address) +static inline struct page *pte_alloc_one(struct mm_struct *mm) { struct page *pte; @@ -75,8 +74,7 @@ static inline struct page *pte_alloc_one(struct mm_struct *mm, } /* _kernel variant gets to use a different allocator */ -static inline pte_t *pte_alloc_one_kernel(struct mm_struct *mm, - unsigned long address) +static inline pte_t *pte_alloc_one_kernel(struct mm_struct *mm) { gfp_t flags = GFP_KERNEL | __GFP_ZERO; return (pte_t *) __get_free_page(flags); diff --git a/arch/ia64/include/asm/pgalloc.h b/arch/ia64/include/asm/pgalloc.h index 3ee5362f266145..c9e481023c25bd 100644 --- a/arch/ia64/include/asm/pgalloc.h +++ b/arch/ia64/include/asm/pgalloc.h @@ -83,7 +83,7 @@ pmd_populate_kernel(struct mm_struct *mm, pmd_t * pmd_entry, pte_t * pte) pmd_val(*pmd_entry) = __pa(pte); } -static inline pgtable_t pte_alloc_one(struct mm_struct *mm, unsigned long addr) +static inline pgtable_t pte_alloc_one(struct mm_struct *mm) { struct page *page; void *pg; @@ -99,8 +99,7 @@ static inline pgtable_t pte_alloc_one(struct mm_struct *mm, unsigned long addr) return page; } -static inline pte_t *pte_alloc_one_kernel(struct mm_struct *mm, - unsigned long addr) +static inline pte_t *pte_alloc_one_kernel(struct mm_struct *mm) { return quicklist_alloc(0, GFP_KERNEL, NULL); } diff --git a/arch/m68k/include/asm/mcf_pgalloc.h b/arch/m68k/include/asm/mcf_pgalloc.h index 12fe700632f458..4399d712f6db72 100644 --- a/arch/m68k/include/asm/mcf_pgalloc.h +++ b/arch/m68k/include/asm/mcf_pgalloc.h @@ -12,8 +12,7 @@ extern inline void pte_free_kernel(struct mm_struct *mm, pte_t *pte) extern const char bad_pmd_string[]; -extern inline pte_t *pte_alloc_one_kernel(struct mm_struct *mm, - unsigned long address) +extern inline pte_t *pte_alloc_one_kernel(struct mm_struct *mm) { unsigned long page = __get_free_page(GFP_DMA); @@ -32,8 +31,6 @@ extern inline pmd_t *pmd_alloc_kernel(pgd_t *pgd, unsigned long address) #define pmd_alloc_one_fast(mm, address) ({ BUG(); ((pmd_t *)1); }) #define pmd_alloc_one(mm, address) ({ BUG(); ((pmd_t *)2); }) -#define pte_alloc_one_fast(mm, addr) pte_alloc_one(mm, addr) - #define pmd_populate(mm, pmd, page) (pmd_val(*pmd) = \ (unsigned long)(page_address(page))) @@ -50,8 +47,7 @@ static inline void __pte_free_tlb(struct mmu_gather *tlb, pgtable_t page, #define __pmd_free_tlb(tlb, pmd, address) do { } while (0) -static inline struct page *pte_alloc_one(struct mm_struct *mm, - unsigned long address) +static inline struct page *pte_alloc_one(struct mm_struct *mm) { struct page *page = alloc_pages(GFP_DMA, 0); pte_t *pte; diff --git a/arch/m68k/include/asm/motorola_pgalloc.h b/arch/m68k/include/asm/motorola_pgalloc.h index 7859a86319cf7b..d04d9ba9b97674 100644 --- a/arch/m68k/include/asm/motorola_pgalloc.h +++ b/arch/m68k/include/asm/motorola_pgalloc.h @@ -8,7 +8,7 @@ extern pmd_t *get_pointer_table(void); extern int free_pointer_table(pmd_t *); -static inline pte_t *pte_alloc_one_kernel(struct mm_struct *mm, unsigned long address) +static inline pte_t *pte_alloc_one_kernel(struct mm_struct *mm) { pte_t *pte; @@ -28,7 +28,7 @@ static inline void pte_free_kernel(struct mm_struct *mm, pte_t *pte) free_page((unsigned long) pte); } -static inline pgtable_t pte_alloc_one(struct mm_struct *mm, unsigned long address) +static inline pgtable_t pte_alloc_one(struct mm_struct *mm) { struct page *page; pte_t *pte; diff --git a/arch/m68k/include/asm/sun3_pgalloc.h b/arch/m68k/include/asm/sun3_pgalloc.h index 11485d38de4e6e..1456c5eecbd940 100644 --- a/arch/m68k/include/asm/sun3_pgalloc.h +++ b/arch/m68k/include/asm/sun3_pgalloc.h @@ -35,8 +35,7 @@ do { \ tlb_remove_page((tlb), pte); \ } while (0) -static inline pte_t *pte_alloc_one_kernel(struct mm_struct *mm, - unsigned long address) +static inline pte_t *pte_alloc_one_kernel(struct mm_struct *mm) { unsigned long page = __get_free_page(GFP_KERNEL); @@ -47,8 +46,7 @@ static inline pte_t *pte_alloc_one_kernel(struct mm_struct *mm, return (pte_t *) (page); } -static inline pgtable_t pte_alloc_one(struct mm_struct *mm, - unsigned long address) +static inline pgtable_t pte_alloc_one(struct mm_struct *mm) { struct page *page = alloc_pages(GFP_KERNEL, 0); diff --git a/arch/microblaze/include/asm/pgalloc.h b/arch/microblaze/include/asm/pgalloc.h index 7c89390c0c13dc..f4cc9ffc449e10 100644 --- a/arch/microblaze/include/asm/pgalloc.h +++ b/arch/microblaze/include/asm/pgalloc.h @@ -108,10 +108,9 @@ static inline void free_pgd_slow(pgd_t *pgd) #define pmd_alloc_one_fast(mm, address) ({ BUG(); ((pmd_t *)1); }) #define pmd_alloc_one(mm, address) ({ BUG(); ((pmd_t *)2); }) -extern pte_t *pte_alloc_one_kernel(struct mm_struct *mm, unsigned long addr); +extern pte_t *pte_alloc_one_kernel(struct mm_struct *mm); -static inline struct page *pte_alloc_one(struct mm_struct *mm, - unsigned long address) +static inline struct page *pte_alloc_one(struct mm_struct *mm) { struct page *ptepage; @@ -132,20 +131,6 @@ static inline struct page *pte_alloc_one(struct mm_struct *mm, return ptepage; } -static inline pte_t *pte_alloc_one_fast(struct mm_struct *mm, - unsigned long address) -{ - unsigned long *ret; - - ret = pte_quicklist; - if (ret != NULL) { - pte_quicklist = (unsigned long *)(*ret); - ret[0] = 0; - pgtable_cache_size--; - } - return (pte_t *)ret; -} - static inline void pte_free_fast(pte_t *pte) { *(unsigned long **)pte = pte_quicklist; diff --git a/arch/microblaze/mm/pgtable.c b/arch/microblaze/mm/pgtable.c index 7f525962cdfaa0..c2ce1e42b88871 100644 --- a/arch/microblaze/mm/pgtable.c +++ b/arch/microblaze/mm/pgtable.c @@ -235,8 +235,7 @@ unsigned long iopa(unsigned long addr) return pa; } -__ref pte_t *pte_alloc_one_kernel(struct mm_struct *mm, - unsigned long address) +__ref pte_t *pte_alloc_one_kernel(struct mm_struct *mm) { pte_t *pte; if (mem_init_done) { diff --git a/arch/mips/include/asm/pgalloc.h b/arch/mips/include/asm/pgalloc.h index 39b9f311c4ef47..27808d9461f477 100644 --- a/arch/mips/include/asm/pgalloc.h +++ b/arch/mips/include/asm/pgalloc.h @@ -50,14 +50,12 @@ static inline void pgd_free(struct mm_struct *mm, pgd_t *pgd) free_pages((unsigned long)pgd, PGD_ORDER); } -static inline pte_t *pte_alloc_one_kernel(struct mm_struct *mm, - unsigned long address) +static inline pte_t *pte_alloc_one_kernel(struct mm_struct *mm) { return (pte_t *)__get_free_pages(GFP_KERNEL | __GFP_ZERO, PTE_ORDER); } -static inline struct page *pte_alloc_one(struct mm_struct *mm, - unsigned long address) +static inline struct page *pte_alloc_one(struct mm_struct *mm) { struct page *pte; diff --git a/arch/nds32/include/asm/pgalloc.h b/arch/nds32/include/asm/pgalloc.h index 27448869131afc..3c5fee5b575953 100644 --- a/arch/nds32/include/asm/pgalloc.h +++ b/arch/nds32/include/asm/pgalloc.h @@ -22,8 +22,7 @@ extern void pgd_free(struct mm_struct *mm, pgd_t * pgd); #define check_pgt_cache() do { } while (0) -static inline pte_t *pte_alloc_one_kernel(struct mm_struct *mm, - unsigned long addr) +static inline pte_t *pte_alloc_one_kernel(struct mm_struct *mm) { pte_t *pte; @@ -34,7 +33,7 @@ static inline pte_t *pte_alloc_one_kernel(struct mm_struct *mm, return pte; } -static inline pgtable_t pte_alloc_one(struct mm_struct *mm, unsigned long addr) +static inline pgtable_t pte_alloc_one(struct mm_struct *mm) { pgtable_t pte; diff --git a/arch/nios2/include/asm/pgalloc.h b/arch/nios2/include/asm/pgalloc.h index bb47d08c8ef76f..3a149ead120716 100644 --- a/arch/nios2/include/asm/pgalloc.h +++ b/arch/nios2/include/asm/pgalloc.h @@ -37,8 +37,7 @@ static inline void pgd_free(struct mm_struct *mm, pgd_t *pgd) free_pages((unsigned long)pgd, PGD_ORDER); } -static inline pte_t *pte_alloc_one_kernel(struct mm_struct *mm, - unsigned long address) +static inline pte_t *pte_alloc_one_kernel(struct mm_struct *mm) { pte_t *pte; @@ -47,8 +46,7 @@ static inline pte_t *pte_alloc_one_kernel(struct mm_struct *mm, return pte; } -static inline pgtable_t pte_alloc_one(struct mm_struct *mm, - unsigned long address) +static inline pgtable_t pte_alloc_one(struct mm_struct *mm) { struct page *pte; diff --git a/arch/openrisc/include/asm/pgalloc.h b/arch/openrisc/include/asm/pgalloc.h index 8999b922651210..149c82ee4b8b22 100644 --- a/arch/openrisc/include/asm/pgalloc.h +++ b/arch/openrisc/include/asm/pgalloc.h @@ -70,10 +70,9 @@ static inline void pgd_free(struct mm_struct *mm, pgd_t *pgd) free_page((unsigned long)pgd); } -extern pte_t *pte_alloc_one_kernel(struct mm_struct *mm, unsigned long address); +extern pte_t *pte_alloc_one_kernel(struct mm_struct *mm); -static inline struct page *pte_alloc_one(struct mm_struct *mm, - unsigned long address) +static inline struct page *pte_alloc_one(struct mm_struct *mm) { struct page *pte; pte = alloc_pages(GFP_KERNEL, 0); diff --git a/arch/openrisc/mm/ioremap.c b/arch/openrisc/mm/ioremap.c index c9697529b3f079..270d1c9bc0d669 100644 --- a/arch/openrisc/mm/ioremap.c +++ b/arch/openrisc/mm/ioremap.c @@ -118,8 +118,7 @@ EXPORT_SYMBOL(iounmap); * the memblock infrastructure. */ -pte_t __ref *pte_alloc_one_kernel(struct mm_struct *mm, - unsigned long address) +pte_t __ref *pte_alloc_one_kernel(struct mm_struct *mm) { pte_t *pte; diff --git a/arch/parisc/include/asm/pgalloc.h b/arch/parisc/include/asm/pgalloc.h index cf13275f7c6d8c..d05c678c77c43e 100644 --- a/arch/parisc/include/asm/pgalloc.h +++ b/arch/parisc/include/asm/pgalloc.h @@ -122,7 +122,7 @@ pmd_populate_kernel(struct mm_struct *mm, pmd_t *pmd, pte_t *pte) #define pmd_pgtable(pmd) pmd_page(pmd) static inline pgtable_t -pte_alloc_one(struct mm_struct *mm, unsigned long address) +pte_alloc_one(struct mm_struct *mm) { struct page *page = alloc_page(GFP_KERNEL|__GFP_ZERO); if (!page) @@ -135,7 +135,7 @@ pte_alloc_one(struct mm_struct *mm, unsigned long address) } static inline pte_t * -pte_alloc_one_kernel(struct mm_struct *mm, unsigned long addr) +pte_alloc_one_kernel(struct mm_struct *mm) { pte_t *pte = (pte_t *)__get_free_page(GFP_KERNEL|__GFP_ZERO); return pte; diff --git a/arch/powerpc/include/asm/book3s/32/pgalloc.h b/arch/powerpc/include/asm/book3s/32/pgalloc.h index b5b955eb2fb733..3633502e102cec 100644 --- a/arch/powerpc/include/asm/book3s/32/pgalloc.h +++ b/arch/powerpc/include/asm/book3s/32/pgalloc.h @@ -61,10 +61,10 @@ static inline void pmd_populate(struct mm_struct *mm, pmd_t *pmdp, #define pmd_pgtable(pmd) ((pgtable_t)pmd_page_vaddr(pmd)) -extern pte_t *pte_alloc_one_kernel(struct mm_struct *mm, unsigned long addr); -extern pgtable_t pte_alloc_one(struct mm_struct *mm, unsigned long addr); +extern pte_t *pte_alloc_one_kernel(struct mm_struct *mm); +extern pgtable_t pte_alloc_one(struct mm_struct *mm); void pte_frag_destroy(void *pte_frag); -pte_t *pte_fragment_alloc(struct mm_struct *mm, unsigned long vmaddr, int kernel); +pte_t *pte_fragment_alloc(struct mm_struct *mm, int kernel); void pte_fragment_free(unsigned long *table, int kernel); static inline void pte_free_kernel(struct mm_struct *mm, pte_t *pte) diff --git a/arch/powerpc/include/asm/book3s/64/pgalloc.h b/arch/powerpc/include/asm/book3s/64/pgalloc.h index 4aba625389c47a..9c1173283b96bf 100644 --- a/arch/powerpc/include/asm/book3s/64/pgalloc.h +++ b/arch/powerpc/include/asm/book3s/64/pgalloc.h @@ -39,7 +39,7 @@ extern struct vmemmap_backing *vmemmap_list; extern struct kmem_cache *pgtable_cache[]; #define PGT_CACHE(shift) pgtable_cache[shift] -extern pte_t *pte_fragment_alloc(struct mm_struct *, unsigned long, int); +extern pte_t *pte_fragment_alloc(struct mm_struct *, int); extern pmd_t *pmd_fragment_alloc(struct mm_struct *, unsigned long); extern void pte_fragment_free(unsigned long *, int); extern void pmd_fragment_free(unsigned long *); @@ -190,16 +190,14 @@ static inline pgtable_t pmd_pgtable(pmd_t pmd) return (pgtable_t)pmd_page_vaddr(pmd); } -static inline pte_t *pte_alloc_one_kernel(struct mm_struct *mm, - unsigned long address) +static inline pte_t *pte_alloc_one_kernel(struct mm_struct *mm) { - return (pte_t *)pte_fragment_alloc(mm, address, 1); + return (pte_t *)pte_fragment_alloc(mm, 1); } -static inline pgtable_t pte_alloc_one(struct mm_struct *mm, - unsigned long address) +static inline pgtable_t pte_alloc_one(struct mm_struct *mm) { - return (pgtable_t)pte_fragment_alloc(mm, address, 0); + return (pgtable_t)pte_fragment_alloc(mm, 0); } static inline void pte_free_kernel(struct mm_struct *mm, pte_t *pte) diff --git a/arch/powerpc/include/asm/nohash/32/pgalloc.h b/arch/powerpc/include/asm/nohash/32/pgalloc.h index 17963951bdb065..bd186e85b4f701 100644 --- a/arch/powerpc/include/asm/nohash/32/pgalloc.h +++ b/arch/powerpc/include/asm/nohash/32/pgalloc.h @@ -79,10 +79,10 @@ static inline void pmd_populate(struct mm_struct *mm, pmd_t *pmdp, #define pmd_pgtable(pmd) ((pgtable_t)pmd_page_vaddr(pmd)) #endif -extern pte_t *pte_alloc_one_kernel(struct mm_struct *mm, unsigned long addr); -extern pgtable_t pte_alloc_one(struct mm_struct *mm, unsigned long addr); +extern pte_t *pte_alloc_one_kernel(struct mm_struct *mm); +extern pgtable_t pte_alloc_one(struct mm_struct *mm); void pte_frag_destroy(void *pte_frag); -pte_t *pte_fragment_alloc(struct mm_struct *mm, unsigned long vmaddr, int kernel); +pte_t *pte_fragment_alloc(struct mm_struct *mm, int kernel); void pte_fragment_free(unsigned long *table, int kernel); static inline void pte_free_kernel(struct mm_struct *mm, pte_t *pte) diff --git a/arch/powerpc/include/asm/nohash/64/pgalloc.h b/arch/powerpc/include/asm/nohash/64/pgalloc.h index e95eb499a1744b..66d086f85bd5b1 100644 --- a/arch/powerpc/include/asm/nohash/64/pgalloc.h +++ b/arch/powerpc/include/asm/nohash/64/pgalloc.h @@ -93,14 +93,12 @@ static inline void pmd_free(struct mm_struct *mm, pmd_t *pmd) } -static inline pte_t *pte_alloc_one_kernel(struct mm_struct *mm, - unsigned long address) +static inline pte_t *pte_alloc_one_kernel(struct mm_struct *mm) { return (pte_t *)__get_free_page(GFP_KERNEL | __GFP_ZERO); } -static inline pgtable_t pte_alloc_one(struct mm_struct *mm, - unsigned long address) +static inline pgtable_t pte_alloc_one(struct mm_struct *mm) { struct page *page; pte_t *pte; diff --git a/arch/powerpc/mm/pgtable-frag.c b/arch/powerpc/mm/pgtable-frag.c index af23a587f0193b..a7b05214760ce3 100644 --- a/arch/powerpc/mm/pgtable-frag.c +++ b/arch/powerpc/mm/pgtable-frag.c @@ -95,7 +95,7 @@ static pte_t *__alloc_for_ptecache(struct mm_struct *mm, int kernel) return (pte_t *)ret; } -pte_t *pte_fragment_alloc(struct mm_struct *mm, unsigned long vmaddr, int kernel) +pte_t *pte_fragment_alloc(struct mm_struct *mm, int kernel) { pte_t *pte; diff --git a/arch/powerpc/mm/pgtable_32.c b/arch/powerpc/mm/pgtable_32.c index d67215248d8226..ded71126ce4c67 100644 --- a/arch/powerpc/mm/pgtable_32.c +++ b/arch/powerpc/mm/pgtable_32.c @@ -43,17 +43,17 @@ EXPORT_SYMBOL(ioremap_bot); /* aka VMALLOC_END */ extern char etext[], _stext[], _sinittext[], _einittext[]; -__ref pte_t *pte_alloc_one_kernel(struct mm_struct *mm, unsigned long address) +__ref pte_t *pte_alloc_one_kernel(struct mm_struct *mm) { if (!slab_is_available()) return memblock_alloc(PTE_FRAG_SIZE, PTE_FRAG_SIZE); - return (pte_t *)pte_fragment_alloc(mm, address, 1); + return (pte_t *)pte_fragment_alloc(mm, 1); } -pgtable_t pte_alloc_one(struct mm_struct *mm, unsigned long address) +pgtable_t pte_alloc_one(struct mm_struct *mm) { - return (pgtable_t)pte_fragment_alloc(mm, address, 0); + return (pgtable_t)pte_fragment_alloc(mm, 0); } void __iomem * diff --git a/arch/riscv/include/asm/pgalloc.h b/arch/riscv/include/asm/pgalloc.h index a79ed5faff3aaa..94043cf83c90fe 100644 --- a/arch/riscv/include/asm/pgalloc.h +++ b/arch/riscv/include/asm/pgalloc.h @@ -82,15 +82,13 @@ static inline void pmd_free(struct mm_struct *mm, pmd_t *pmd) #endif /* __PAGETABLE_PMD_FOLDED */ -static inline pte_t *pte_alloc_one_kernel(struct mm_struct *mm, - unsigned long address) +static inline pte_t *pte_alloc_one_kernel(struct mm_struct *mm) { return (pte_t *)__get_free_page( GFP_KERNEL | __GFP_RETRY_MAYFAIL | __GFP_ZERO); } -static inline struct page *pte_alloc_one(struct mm_struct *mm, - unsigned long address) +static inline struct page *pte_alloc_one(struct mm_struct *mm) { struct page *pte; diff --git a/arch/s390/include/asm/pgalloc.h b/arch/s390/include/asm/pgalloc.h index 5ee733720a5716..bccb8f4a63e204 100644 --- a/arch/s390/include/asm/pgalloc.h +++ b/arch/s390/include/asm/pgalloc.h @@ -139,8 +139,8 @@ static inline void pmd_populate(struct mm_struct *mm, /* * page table entry allocation/free routines. */ -#define pte_alloc_one_kernel(mm, vmaddr) ((pte_t *) page_table_alloc(mm)) -#define pte_alloc_one(mm, vmaddr) ((pte_t *) page_table_alloc(mm)) +#define pte_alloc_one_kernel(mm) ((pte_t *)page_table_alloc(mm)) +#define pte_alloc_one(mm) ((pte_t *)page_table_alloc(mm)) #define pte_free_kernel(mm, pte) page_table_free(mm, (unsigned long *) pte) #define pte_free(mm, pte) page_table_free(mm, (unsigned long *) pte) diff --git a/arch/sh/include/asm/pgalloc.h b/arch/sh/include/asm/pgalloc.h index ed053a359ab746..8ad73cb311216a 100644 --- a/arch/sh/include/asm/pgalloc.h +++ b/arch/sh/include/asm/pgalloc.h @@ -32,14 +32,12 @@ static inline void pmd_populate(struct mm_struct *mm, pmd_t *pmd, /* * Allocate and free page tables. */ -static inline pte_t *pte_alloc_one_kernel(struct mm_struct *mm, - unsigned long address) +static inline pte_t *pte_alloc_one_kernel(struct mm_struct *mm) { return quicklist_alloc(QUICK_PT, GFP_KERNEL, NULL); } -static inline pgtable_t pte_alloc_one(struct mm_struct *mm, - unsigned long address) +static inline pgtable_t pte_alloc_one(struct mm_struct *mm) { struct page *page; void *pg; diff --git a/arch/sparc/include/asm/pgalloc_32.h b/arch/sparc/include/asm/pgalloc_32.h index 90459481c6c7a8..282be50a4adfcf 100644 --- a/arch/sparc/include/asm/pgalloc_32.h +++ b/arch/sparc/include/asm/pgalloc_32.h @@ -58,10 +58,9 @@ void pmd_populate(struct mm_struct *mm, pmd_t *pmdp, struct page *ptep); void pmd_set(pmd_t *pmdp, pte_t *ptep); #define pmd_populate_kernel(MM, PMD, PTE) pmd_set(PMD, PTE) -pgtable_t pte_alloc_one(struct mm_struct *mm, unsigned long address); +pgtable_t pte_alloc_one(struct mm_struct *mm); -static inline pte_t *pte_alloc_one_kernel(struct mm_struct *mm, - unsigned long address) +static inline pte_t *pte_alloc_one_kernel(struct mm_struct *mm) { return srmmu_get_nocache(PTE_SIZE, PTE_SIZE); } diff --git a/arch/sparc/include/asm/pgalloc_64.h b/arch/sparc/include/asm/pgalloc_64.h index 874632f34f624b..48abccba49915f 100644 --- a/arch/sparc/include/asm/pgalloc_64.h +++ b/arch/sparc/include/asm/pgalloc_64.h @@ -60,10 +60,8 @@ static inline void pmd_free(struct mm_struct *mm, pmd_t *pmd) kmem_cache_free(pgtable_cache, pmd); } -pte_t *pte_alloc_one_kernel(struct mm_struct *mm, - unsigned long address); -pgtable_t pte_alloc_one(struct mm_struct *mm, - unsigned long address); +pte_t *pte_alloc_one_kernel(struct mm_struct *mm); +pgtable_t pte_alloc_one(struct mm_struct *mm); void pte_free_kernel(struct mm_struct *mm, pte_t *pte); void pte_free(struct mm_struct *mm, pgtable_t ptepage); diff --git a/arch/sparc/mm/init_64.c b/arch/sparc/mm/init_64.c index 3c8aac21f4260b..b4221d3727d0a5 100644 --- a/arch/sparc/mm/init_64.c +++ b/arch/sparc/mm/init_64.c @@ -2925,8 +2925,7 @@ void __flush_tlb_all(void) : : "r" (pstate)); } -pte_t *pte_alloc_one_kernel(struct mm_struct *mm, - unsigned long address) +pte_t *pte_alloc_one_kernel(struct mm_struct *mm) { struct page *page = alloc_page(GFP_KERNEL | __GFP_ZERO); pte_t *pte = NULL; @@ -2937,8 +2936,7 @@ pte_t *pte_alloc_one_kernel(struct mm_struct *mm, return pte; } -pgtable_t pte_alloc_one(struct mm_struct *mm, - unsigned long address) +pgtable_t pte_alloc_one(struct mm_struct *mm) { struct page *page = alloc_page(GFP_KERNEL | __GFP_ZERO); if (!page) diff --git a/arch/sparc/mm/srmmu.c b/arch/sparc/mm/srmmu.c index a6142c5abf6141..b609362e846fcc 100644 --- a/arch/sparc/mm/srmmu.c +++ b/arch/sparc/mm/srmmu.c @@ -364,12 +364,12 @@ pgd_t *get_pgd_fast(void) * Alignments up to the page size are the same for physical and virtual * addresses of the nocache area. */ -pgtable_t pte_alloc_one(struct mm_struct *mm, unsigned long address) +pgtable_t pte_alloc_one(struct mm_struct *mm) { unsigned long pte; struct page *page; - if ((pte = (unsigned long)pte_alloc_one_kernel(mm, address)) == 0) + if ((pte = (unsigned long)pte_alloc_one_kernel(mm)) == 0) return NULL; page = pfn_to_page(__nocache_pa(pte) >> PAGE_SHIFT); if (!pgtable_page_ctor(page)) { diff --git a/arch/um/include/asm/pgalloc.h b/arch/um/include/asm/pgalloc.h index bf90b2aa200254..99eb5682792a30 100644 --- a/arch/um/include/asm/pgalloc.h +++ b/arch/um/include/asm/pgalloc.h @@ -25,8 +25,8 @@ extern pgd_t *pgd_alloc(struct mm_struct *); extern void pgd_free(struct mm_struct *mm, pgd_t *pgd); -extern pte_t *pte_alloc_one_kernel(struct mm_struct *, unsigned long); -extern pgtable_t pte_alloc_one(struct mm_struct *, unsigned long); +extern pte_t *pte_alloc_one_kernel(struct mm_struct *); +extern pgtable_t pte_alloc_one(struct mm_struct *); static inline void pte_free_kernel(struct mm_struct *mm, pte_t *pte) { diff --git a/arch/um/kernel/mem.c b/arch/um/kernel/mem.c index 8d21a83dd28933..799b571a8f884b 100644 --- a/arch/um/kernel/mem.c +++ b/arch/um/kernel/mem.c @@ -199,7 +199,7 @@ void pgd_free(struct mm_struct *mm, pgd_t *pgd) free_page((unsigned long) pgd); } -pte_t *pte_alloc_one_kernel(struct mm_struct *mm, unsigned long address) +pte_t *pte_alloc_one_kernel(struct mm_struct *mm) { pte_t *pte; @@ -207,7 +207,7 @@ pte_t *pte_alloc_one_kernel(struct mm_struct *mm, unsigned long address) return pte; } -pgtable_t pte_alloc_one(struct mm_struct *mm, unsigned long address) +pgtable_t pte_alloc_one(struct mm_struct *mm) { struct page *pte; diff --git a/arch/unicore32/include/asm/pgalloc.h b/arch/unicore32/include/asm/pgalloc.h index f0fdb268f8f2ef..7cceabecf4e37f 100644 --- a/arch/unicore32/include/asm/pgalloc.h +++ b/arch/unicore32/include/asm/pgalloc.h @@ -34,7 +34,7 @@ extern void free_pgd_slow(struct mm_struct *mm, pgd_t *pgd); * Allocate one PTE table. */ static inline pte_t * -pte_alloc_one_kernel(struct mm_struct *mm, unsigned long addr) +pte_alloc_one_kernel(struct mm_struct *mm) { pte_t *pte; @@ -46,7 +46,7 @@ pte_alloc_one_kernel(struct mm_struct *mm, unsigned long addr) } static inline pgtable_t -pte_alloc_one(struct mm_struct *mm, unsigned long addr) +pte_alloc_one(struct mm_struct *mm) { struct page *pte; diff --git a/arch/x86/include/asm/pgalloc.h b/arch/x86/include/asm/pgalloc.h index 1ea41aaef68bf4..a281e61ec60c55 100644 --- a/arch/x86/include/asm/pgalloc.h +++ b/arch/x86/include/asm/pgalloc.h @@ -47,8 +47,8 @@ extern gfp_t __userpte_alloc_gfp; extern pgd_t *pgd_alloc(struct mm_struct *); extern void pgd_free(struct mm_struct *mm, pgd_t *pgd); -extern pte_t *pte_alloc_one_kernel(struct mm_struct *, unsigned long); -extern pgtable_t pte_alloc_one(struct mm_struct *, unsigned long); +extern pte_t *pte_alloc_one_kernel(struct mm_struct *); +extern pgtable_t pte_alloc_one(struct mm_struct *); /* Should really implement gc for free page table pages. This could be done with a reference count in struct page. */ diff --git a/arch/x86/mm/pgtable.c b/arch/x86/mm/pgtable.c index b0284eab14dc6c..7bd01709a0914a 100644 --- a/arch/x86/mm/pgtable.c +++ b/arch/x86/mm/pgtable.c @@ -23,12 +23,12 @@ EXPORT_SYMBOL(physical_mask); gfp_t __userpte_alloc_gfp = PGALLOC_GFP | PGALLOC_USER_GFP; -pte_t *pte_alloc_one_kernel(struct mm_struct *mm, unsigned long address) +pte_t *pte_alloc_one_kernel(struct mm_struct *mm) { return (pte_t *)__get_free_page(PGALLOC_GFP & ~__GFP_ACCOUNT); } -pgtable_t pte_alloc_one(struct mm_struct *mm, unsigned long address) +pgtable_t pte_alloc_one(struct mm_struct *mm) { struct page *pte; diff --git a/arch/xtensa/include/asm/pgalloc.h b/arch/xtensa/include/asm/pgalloc.h index 1065bc8bcae56b..b3b388ff2f01e4 100644 --- a/arch/xtensa/include/asm/pgalloc.h +++ b/arch/xtensa/include/asm/pgalloc.h @@ -38,8 +38,7 @@ static inline void pgd_free(struct mm_struct *mm, pgd_t *pgd) free_page((unsigned long)pgd); } -static inline pte_t *pte_alloc_one_kernel(struct mm_struct *mm, - unsigned long address) +static inline pte_t *pte_alloc_one_kernel(struct mm_struct *mm) { pte_t *ptep; int i; @@ -52,13 +51,12 @@ static inline pte_t *pte_alloc_one_kernel(struct mm_struct *mm, return ptep; } -static inline pgtable_t pte_alloc_one(struct mm_struct *mm, - unsigned long addr) +static inline pgtable_t pte_alloc_one(struct mm_struct *mm) { pte_t *pte; struct page *page; - pte = pte_alloc_one_kernel(mm, addr); + pte = pte_alloc_one_kernel(mm); if (!pte) return NULL; page = virt_to_page(pte); diff --git a/include/linux/mm.h b/include/linux/mm.h index ea1f12d15365bd..0d946f063cba38 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -1873,8 +1873,8 @@ static inline void mm_inc_nr_ptes(struct mm_struct *mm) {} static inline void mm_dec_nr_ptes(struct mm_struct *mm) {} #endif -int __pte_alloc(struct mm_struct *mm, pmd_t *pmd, unsigned long address); -int __pte_alloc_kernel(pmd_t *pmd, unsigned long address); +int __pte_alloc(struct mm_struct *mm, pmd_t *pmd); +int __pte_alloc_kernel(pmd_t *pmd); /* * The following ifdef needed to get the 4level-fixup.h header to work. @@ -2005,18 +2005,17 @@ static inline void pgtable_page_dtor(struct page *page) pte_unmap(pte); \ } while (0) -#define pte_alloc(mm, pmd, address) \ - (unlikely(pmd_none(*(pmd))) && __pte_alloc(mm, pmd, address)) +#define pte_alloc(mm, pmd) (unlikely(pmd_none(*(pmd))) && __pte_alloc(mm, pmd)) #define pte_alloc_map(mm, pmd, address) \ - (pte_alloc(mm, pmd, address) ? NULL : pte_offset_map(pmd, address)) + (pte_alloc(mm, pmd) ? NULL : pte_offset_map(pmd, address)) #define pte_alloc_map_lock(mm, pmd, address, ptlp) \ - (pte_alloc(mm, pmd, address) ? \ + (pte_alloc(mm, pmd) ? \ NULL : pte_offset_map_lock(mm, pmd, address, ptlp)) #define pte_alloc_kernel(pmd, address) \ - ((unlikely(pmd_none(*(pmd))) && __pte_alloc_kernel(pmd, address))? \ + ((unlikely(pmd_none(*(pmd))) && __pte_alloc_kernel(pmd))? \ NULL: pte_offset_kernel(pmd, address)) #if USE_SPLIT_PMD_PTLOCKS diff --git a/mm/huge_memory.c b/mm/huge_memory.c index cbd977b1d60d40..faf357eaf0cee5 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -568,7 +568,7 @@ static vm_fault_t __do_huge_pmd_anonymous_page(struct vm_fault *vmf, return VM_FAULT_FALLBACK; } - pgtable = pte_alloc_one(vma->vm_mm, haddr); + pgtable = pte_alloc_one(vma->vm_mm); if (unlikely(!pgtable)) { ret = VM_FAULT_OOM; goto release; @@ -702,7 +702,7 @@ vm_fault_t do_huge_pmd_anonymous_page(struct vm_fault *vmf) struct page *zero_page; bool set; vm_fault_t ret; - pgtable = pte_alloc_one(vma->vm_mm, haddr); + pgtable = pte_alloc_one(vma->vm_mm); if (unlikely(!pgtable)) return VM_FAULT_OOM; zero_page = mm_get_huge_zero_page(vma->vm_mm); @@ -791,7 +791,7 @@ vm_fault_t vmf_insert_pfn_pmd(struct vm_area_struct *vma, unsigned long addr, return VM_FAULT_SIGBUS; if (arch_needs_pgtable_deposit()) { - pgtable = pte_alloc_one(vma->vm_mm, addr); + pgtable = pte_alloc_one(vma->vm_mm); if (!pgtable) return VM_FAULT_OOM; } @@ -927,7 +927,7 @@ int copy_huge_pmd(struct mm_struct *dst_mm, struct mm_struct *src_mm, if (!vma_is_anonymous(vma)) return 0; - pgtable = pte_alloc_one(dst_mm, addr); + pgtable = pte_alloc_one(dst_mm); if (unlikely(!pgtable)) goto out; diff --git a/mm/kasan/init.c b/mm/kasan/init.c index 34afad56497bd3..45a1b5e38e1e24 100644 --- a/mm/kasan/init.c +++ b/mm/kasan/init.c @@ -123,7 +123,7 @@ static int __ref zero_pmd_populate(pud_t *pud, unsigned long addr, pte_t *p; if (slab_is_available()) - p = pte_alloc_one_kernel(&init_mm, addr); + p = pte_alloc_one_kernel(&init_mm); else p = early_alloc(PAGE_SIZE, NUMA_NO_NODE); if (!p) diff --git a/mm/memory.c b/mm/memory.c index 2dd2f9ab57f465..a52663c0612d4d 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -400,10 +400,10 @@ void free_pgtables(struct mmu_gather *tlb, struct vm_area_struct *vma, } } -int __pte_alloc(struct mm_struct *mm, pmd_t *pmd, unsigned long address) +int __pte_alloc(struct mm_struct *mm, pmd_t *pmd) { spinlock_t *ptl; - pgtable_t new = pte_alloc_one(mm, address); + pgtable_t new = pte_alloc_one(mm); if (!new) return -ENOMEM; @@ -434,9 +434,9 @@ int __pte_alloc(struct mm_struct *mm, pmd_t *pmd, unsigned long address) return 0; } -int __pte_alloc_kernel(pmd_t *pmd, unsigned long address) +int __pte_alloc_kernel(pmd_t *pmd) { - pte_t *new = pte_alloc_one_kernel(&init_mm, address); + pte_t *new = pte_alloc_one_kernel(&init_mm); if (!new) return -ENOMEM; @@ -2896,7 +2896,7 @@ static vm_fault_t do_anonymous_page(struct vm_fault *vmf) * * Here we only have down_read(mmap_sem). */ - if (pte_alloc(vma->vm_mm, vmf->pmd, vmf->address)) + if (pte_alloc(vma->vm_mm, vmf->pmd)) return VM_FAULT_OOM; /* See the comment in pte_alloc_one_map() */ @@ -3043,7 +3043,7 @@ static vm_fault_t pte_alloc_one_map(struct vm_fault *vmf) pmd_populate(vma->vm_mm, vmf->pmd, vmf->prealloc_pte); spin_unlock(vmf->ptl); vmf->prealloc_pte = NULL; - } else if (unlikely(pte_alloc(vma->vm_mm, vmf->pmd, vmf->address))) { + } else if (unlikely(pte_alloc(vma->vm_mm, vmf->pmd))) { return VM_FAULT_OOM; } map_pte: @@ -3122,7 +3122,7 @@ static vm_fault_t do_set_pmd(struct vm_fault *vmf, struct page *page) * related to pte entry. Use the preallocated table for that. */ if (arch_needs_pgtable_deposit() && !vmf->prealloc_pte) { - vmf->prealloc_pte = pte_alloc_one(vma->vm_mm, vmf->address); + vmf->prealloc_pte = pte_alloc_one(vma->vm_mm); if (!vmf->prealloc_pte) return VM_FAULT_OOM; smp_wmb(); /* See comment in __pte_alloc() */ @@ -3360,8 +3360,7 @@ static vm_fault_t do_fault_around(struct vm_fault *vmf) start_pgoff + nr_pages - 1); if (pmd_none(*vmf->pmd)) { - vmf->prealloc_pte = pte_alloc_one(vmf->vma->vm_mm, - vmf->address); + vmf->prealloc_pte = pte_alloc_one(vmf->vma->vm_mm); if (!vmf->prealloc_pte) goto out; smp_wmb(); /* See comment in __pte_alloc() */ diff --git a/mm/migrate.c b/mm/migrate.c index 5d1839a9148df0..ccf8966caf6fe1 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -2636,7 +2636,7 @@ static void migrate_vma_insert_page(struct migrate_vma *migrate, * * Here we only have down_read(mmap_sem). */ - if (pte_alloc(mm, pmdp, addr)) + if (pte_alloc(mm, pmdp)) goto abort; /* See the comment in pte_alloc_one_map() */ diff --git a/mm/mremap.c b/mm/mremap.c index def01d86e36fd3..fc3f92962a7e13 100644 --- a/mm/mremap.c +++ b/mm/mremap.c @@ -236,7 +236,7 @@ unsigned long move_page_tables(struct vm_area_struct *vma, if (pmd_trans_unstable(old_pmd)) continue; } - if (pte_alloc(new_vma->vm_mm, new_pmd, new_addr)) + if (pte_alloc(new_vma->vm_mm, new_pmd)) break; next = (new_addr + PMD_SIZE) & PMD_MASK; if (extent > next - new_addr) diff --git a/mm/userfaultfd.c b/mm/userfaultfd.c index 48368589f51990..065c1ce191c47f 100644 --- a/mm/userfaultfd.c +++ b/mm/userfaultfd.c @@ -550,7 +550,7 @@ static __always_inline ssize_t __mcopy_atomic(struct mm_struct *dst_mm, break; } if (unlikely(pmd_none(dst_pmdval)) && - unlikely(__pte_alloc(dst_mm, dst_pmd, dst_addr))) { + unlikely(__pte_alloc(dst_mm, dst_pmd))) { err = -ENOMEM; break; } diff --git a/virt/kvm/arm/mmu.c b/virt/kvm/arm/mmu.c index 3053bf2584f899..fbdf3ac2f00106 100644 --- a/virt/kvm/arm/mmu.c +++ b/virt/kvm/arm/mmu.c @@ -647,7 +647,7 @@ static int create_hyp_pmd_mappings(pud_t *pud, unsigned long start, BUG_ON(pmd_sect(*pmd)); if (pmd_none(*pmd)) { - pte = pte_alloc_one_kernel(NULL, addr); + pte = pte_alloc_one_kernel(NULL); if (!pte) { kvm_err("Cannot allocate Hyp pte\n"); return -ENOMEM; From 2c91bd4a4e2e530582d6fd643ea7b86b27907151 Mon Sep 17 00:00:00 2001 From: "Joel Fernandes (Google)" Date: Thu, 3 Jan 2019 15:28:38 -0800 Subject: [PATCH 47/58] mm: speed up mremap by 20x on large regions Android needs to mremap large regions of memory during memory management related operations. The mremap system call can be really slow if THP is not enabled. The bottleneck is move_page_tables, which is copying each pte at a time, and can be really slow across a large map. Turning on THP may not be a viable option, and is not for us. This patch speeds up the performance for non-THP system by copying at the PMD level when possible. The speedup is an order of magnitude on x86 (~20x). On a 1GB mremap, the mremap completion times drops from 3.4-3.6 milliseconds to 144-160 microseconds. Before: Total mremap time for 1GB data: 3521942 nanoseconds. Total mremap time for 1GB data: 3449229 nanoseconds. Total mremap time for 1GB data: 3488230 nanoseconds. After: Total mremap time for 1GB data: 150279 nanoseconds. Total mremap time for 1GB data: 144665 nanoseconds. Total mremap time for 1GB data: 158708 nanoseconds. If THP is enabled the optimization is mostly skipped except in certain situations. [joel@joelfernandes.org: fix 'move_normal_pmd' unused function warning] Link: http://lkml.kernel.org/r/20181108224457.GB209347@google.com Link: http://lkml.kernel.org/r/20181108181201.88826-3-joelaf@google.com Signed-off-by: Joel Fernandes (Google) Acked-by: Kirill A. Shutemov Reviewed-by: William Kucharski Cc: Julia Lawall Cc: Michal Hocko Cc: Will Deacon Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/Kconfig | 5 ++++ mm/mremap.c | 64 ++++++++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 69 insertions(+) diff --git a/arch/Kconfig b/arch/Kconfig index e1e540ffa9793d..b70c952ac838c8 100644 --- a/arch/Kconfig +++ b/arch/Kconfig @@ -535,6 +535,11 @@ config HAVE_IRQ_TIME_ACCOUNTING Archs need to ensure they use a high enough resolution clock to support irq time accounting and then call enable_sched_clock_irqtime(). +config HAVE_MOVE_PMD + bool + help + Archs that select this are able to move page tables at the PMD level. + config HAVE_ARCH_TRANSPARENT_HUGEPAGE bool diff --git a/mm/mremap.c b/mm/mremap.c index fc3f92962a7e13..3320616ed93f4b 100644 --- a/mm/mremap.c +++ b/mm/mremap.c @@ -191,6 +191,52 @@ static void move_ptes(struct vm_area_struct *vma, pmd_t *old_pmd, drop_rmap_locks(vma); } +#ifdef CONFIG_HAVE_MOVE_PMD +static bool move_normal_pmd(struct vm_area_struct *vma, unsigned long old_addr, + unsigned long new_addr, unsigned long old_end, + pmd_t *old_pmd, pmd_t *new_pmd) +{ + spinlock_t *old_ptl, *new_ptl; + struct mm_struct *mm = vma->vm_mm; + pmd_t pmd; + + if ((old_addr & ~PMD_MASK) || (new_addr & ~PMD_MASK) + || old_end - old_addr < PMD_SIZE) + return false; + + /* + * The destination pmd shouldn't be established, free_pgtables() + * should have release it. + */ + if (WARN_ON(!pmd_none(*new_pmd))) + return false; + + /* + * We don't have to worry about the ordering of src and dst + * ptlocks because exclusive mmap_sem prevents deadlock. + */ + old_ptl = pmd_lock(vma->vm_mm, old_pmd); + new_ptl = pmd_lockptr(mm, new_pmd); + if (new_ptl != old_ptl) + spin_lock_nested(new_ptl, SINGLE_DEPTH_NESTING); + + /* Clear the pmd */ + pmd = *old_pmd; + pmd_clear(old_pmd); + + VM_BUG_ON(!pmd_none(*new_pmd)); + + /* Set the new pmd */ + set_pmd_at(mm, new_addr, new_pmd, pmd); + flush_tlb_range(vma, old_addr, old_addr + PMD_SIZE); + if (new_ptl != old_ptl) + spin_unlock(new_ptl); + spin_unlock(old_ptl); + + return true; +} +#endif + unsigned long move_page_tables(struct vm_area_struct *vma, unsigned long old_addr, struct vm_area_struct *new_vma, unsigned long new_addr, unsigned long len, @@ -235,7 +281,25 @@ unsigned long move_page_tables(struct vm_area_struct *vma, split_huge_pmd(vma, old_pmd, old_addr); if (pmd_trans_unstable(old_pmd)) continue; + } else if (extent == PMD_SIZE) { +#ifdef CONFIG_HAVE_MOVE_PMD + /* + * If the extent is PMD-sized, try to speed the move by + * moving at the PMD level if possible. + */ + bool moved; + + if (need_rmap_locks) + take_rmap_locks(vma); + moved = move_normal_pmd(vma, old_addr, new_addr, + old_end, old_pmd, new_pmd); + if (need_rmap_locks) + drop_rmap_locks(vma); + if (moved) + continue; +#endif } + if (pte_alloc(new_vma->vm_mm, new_pmd)) break; next = (new_addr + PMD_SIZE) & PMD_MASK; From 9f132f7e145506efc0744426cb338b18a54afc3b Mon Sep 17 00:00:00 2001 From: "Joel Fernandes (Google)" Date: Thu, 3 Jan 2019 15:28:41 -0800 Subject: [PATCH 48/58] mm: select HAVE_MOVE_PMD on x86 for faster mremap Moving page-tables at the PMD-level on x86 is known to be safe. Enable this option so that we can do fast mremap when possible. Link: http://lkml.kernel.org/r/20181108181201.88826-4-joelaf@google.com Signed-off-by: Joel Fernandes (Google) Suggested-by: Kirill A. Shutemov Acked-by: Kirill A. Shutemov Cc: Julia Lawall Cc: Michal Hocko Cc: William Kucharski Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/x86/Kconfig | 1 + 1 file changed, 1 insertion(+) diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index e260460210e17a..6185d4f332965e 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -172,6 +172,7 @@ config X86 select HAVE_MEMBLOCK_NODE_MAP select HAVE_MIXED_BREAKPOINTS_REGS select HAVE_MOD_ARCH_SPECIFIC + select HAVE_MOVE_PMD select HAVE_NMI select HAVE_OPROFILE select HAVE_OPTPROBES From 3bb5f4ac55dd91d516e7e36b45c94bd57efbb068 Mon Sep 17 00:00:00 2001 From: Davidlohr Bueso Date: Thu, 3 Jan 2019 15:28:44 -0800 Subject: [PATCH 49/58] kernel/locking/mutex.c: remove caller signal_pending branch predictions This is already done for us internally by the signal machinery. Link: http://lkml.kernel.org/r/20181116002713.8474-2-dave@stgolabs.net Signed-off-by: Davidlohr Bueso Reviewed-by: Andrew Morton Cc: Peter Zijlstra Cc: Ingo Molnar Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/locking/mutex.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/locking/mutex.c b/kernel/locking/mutex.c index 3f8a35104285ab..db578783dd36a3 100644 --- a/kernel/locking/mutex.c +++ b/kernel/locking/mutex.c @@ -987,7 +987,7 @@ __mutex_lock_common(struct mutex *lock, long state, unsigned int subclass, * wait_lock. This ensures the lock cancellation is ordered * against mutex_unlock() and wake-ups do not go missing. */ - if (unlikely(signal_pending_state(state, current))) { + if (signal_pending_state(state, current)) { ret = -EINTR; goto err; } From 34ec35ad8f5f4624e8391dbb83afb4c791f027e3 Mon Sep 17 00:00:00 2001 From: Davidlohr Bueso Date: Thu, 3 Jan 2019 15:28:48 -0800 Subject: [PATCH 50/58] kernel/sched/: remove caller signal_pending branch predictions This is already done for us internally by the signal machinery. Link: http://lkml.kernel.org/r/20181116002713.8474-3-dave@stgolabs.net Signed-off-by: Davidlohr Bueso Reviewed-by: Andrew Morton Cc: Peter Zijlstra Cc: Ingo Molnar Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/sched/core.c | 2 +- kernel/sched/swait.c | 2 +- kernel/sched/wait.c | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/kernel/sched/core.c b/kernel/sched/core.c index f6692017337032..17a954c9e15377 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -3416,7 +3416,7 @@ static void __sched notrace __schedule(bool preempt) switch_count = &prev->nivcsw; if (!preempt && prev->state) { - if (unlikely(signal_pending_state(prev->state, prev))) { + if (signal_pending_state(prev->state, prev)) { prev->state = TASK_RUNNING; } else { deactivate_task(rq, prev, DEQUEUE_SLEEP | DEQUEUE_NOCLOCK); diff --git a/kernel/sched/swait.c b/kernel/sched/swait.c index 66b59ac77c2209..e83a3f8449f653 100644 --- a/kernel/sched/swait.c +++ b/kernel/sched/swait.c @@ -93,7 +93,7 @@ long prepare_to_swait_event(struct swait_queue_head *q, struct swait_queue *wait long ret = 0; raw_spin_lock_irqsave(&q->lock, flags); - if (unlikely(signal_pending_state(state, current))) { + if (signal_pending_state(state, current)) { /* * See prepare_to_wait_event(). TL;DR, subsequent swake_up_one() * must not see us. diff --git a/kernel/sched/wait.c b/kernel/sched/wait.c index 5dd47f1103d18b..6eb1f8efd221c5 100644 --- a/kernel/sched/wait.c +++ b/kernel/sched/wait.c @@ -264,7 +264,7 @@ long prepare_to_wait_event(struct wait_queue_head *wq_head, struct wait_queue_en long ret = 0; spin_lock_irqsave(&wq_head->lock, flags); - if (unlikely(signal_pending_state(state, current))) { + if (signal_pending_state(state, current)) { /* * Exclusive waiter must not fail if it was selected by wakeup, * it should "consume" the condition we were waiting for. From d8d7d842e8286a98fb56df9caf5d19b46e01ba4b Mon Sep 17 00:00:00 2001 From: Davidlohr Bueso Date: Thu, 3 Jan 2019 15:28:51 -0800 Subject: [PATCH 51/58] arch/arc/mm/fault.c: remove caller signal_pending_branch predictions This is already done for us internally by the signal machinery. Link: http://lkml.kernel.org/r/20181116002713.8474-4-dave@stgolabs.net Signed-off-by: Davidlohr Bueso Reviewed-by: Andrew Morton Cc: Vineet Gupta Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/arc/mm/fault.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/arc/mm/fault.c b/arch/arc/mm/fault.c index e2d9fc3fea01e7..a1d7231970848d 100644 --- a/arch/arc/mm/fault.c +++ b/arch/arc/mm/fault.c @@ -142,7 +142,7 @@ void do_page_fault(unsigned long address, struct pt_regs *regs) fault = handle_mm_fault(vma, address, flags); /* If Pagefault was interrupted by SIGKILL, exit page fault "early" */ - if (unlikely(fatal_signal_pending(current))) { + if (fatal_signal_pending(current)) { if ((fault & VM_FAULT_ERROR) && !(fault & VM_FAULT_RETRY)) up_read(&mm->mmap_sem); if (user_mode(regs)) From fa45f1162f28cbba6c38180647b7b300f317ecb4 Mon Sep 17 00:00:00 2001 From: Davidlohr Bueso Date: Thu, 3 Jan 2019 15:28:55 -0800 Subject: [PATCH 52/58] mm/: remove caller signal_pending branch predictions This is already done for us internally by the signal machinery. Link: http://lkml.kernel.org/r/20181116002713.8474-5-dave@stgolabs.net Signed-off-by: Davidlohr Bueso Reviewed-by: Andrew Morton Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/filemap.c | 2 +- mm/gup.c | 2 +- mm/hugetlb.c | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/mm/filemap.c b/mm/filemap.c index 29655fb47a2c4b..9f5e323e883e6f 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -1125,7 +1125,7 @@ static inline int wait_on_page_bit_common(wait_queue_head_t *q, break; } - if (unlikely(signal_pending_state(state, current))) { + if (signal_pending_state(state, current)) { ret = -EINTR; break; } diff --git a/mm/gup.c b/mm/gup.c index 8cb68a50dbdf28..6dd33e16a8063c 100644 --- a/mm/gup.c +++ b/mm/gup.c @@ -727,7 +727,7 @@ static long __get_user_pages(struct task_struct *tsk, struct mm_struct *mm, * If we have a pending SIGKILL, don't keep faulting pages and * potentially allocating memory. */ - if (unlikely(fatal_signal_pending(current))) { + if (fatal_signal_pending(current)) { ret = -ERESTARTSYS; goto out; } diff --git a/mm/hugetlb.c b/mm/hugetlb.c index e37efd5d831830..7450888109651f 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -4231,7 +4231,7 @@ long follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma, * If we have a pending SIGKILL, don't keep faulting pages and * potentially allocating memory. */ - if (unlikely(fatal_signal_pending(current))) { + if (fatal_signal_pending(current)) { remainder = 0; break; } From 08d405c8b845a4b871fa3606c9ebe0d0f3b74614 Mon Sep 17 00:00:00 2001 From: Davidlohr Bueso Date: Thu, 3 Jan 2019 15:28:58 -0800 Subject: [PATCH 53/58] fs/: remove caller signal_pending branch predictions This is already done for us internally by the signal machinery. [akpm@linux-foundation.org: fix fs/buffer.c] Link: http://lkml.kernel.org/r/20181116002713.8474-7-dave@stgolabs.net Signed-off-by: Davidlohr Bueso Reviewed-by: Andrew Morton Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/afs/fs_probe.c | 2 +- fs/afs/vl_probe.c | 2 +- fs/buffer.c | 2 +- fs/exec.c | 4 ++-- fs/orangefs/orangefs-bufmap.c | 2 +- 5 files changed, 6 insertions(+), 6 deletions(-) diff --git a/fs/afs/fs_probe.c b/fs/afs/fs_probe.c index fde6b4d4121e38..3a9eaec0675699 100644 --- a/fs/afs/fs_probe.c +++ b/fs/afs/fs_probe.c @@ -247,7 +247,7 @@ int afs_wait_for_fs_probes(struct afs_server_list *slist, unsigned long untried) } } - if (!still_probing || unlikely(signal_pending(current))) + if (!still_probing || signal_pending(current)) goto stop; schedule(); } diff --git a/fs/afs/vl_probe.c b/fs/afs/vl_probe.c index f0b032976487cd..f402ee8171a1d7 100644 --- a/fs/afs/vl_probe.c +++ b/fs/afs/vl_probe.c @@ -248,7 +248,7 @@ int afs_wait_for_vl_probes(struct afs_vlserver_list *vllist, } } - if (!still_probing || unlikely(signal_pending(current))) + if (!still_probing || signal_pending(current)) goto stop; schedule(); } diff --git a/fs/buffer.c b/fs/buffer.c index d60d61e8ed7de4..52d024bfdbc121 100644 --- a/fs/buffer.c +++ b/fs/buffer.c @@ -2366,7 +2366,7 @@ static int cont_expand_zero(struct file *file, struct address_space *mapping, balance_dirty_pages_ratelimited(mapping); - if (unlikely(fatal_signal_pending(current))) { + if (fatal_signal_pending(current)) { err = -EINTR; goto out; } diff --git a/fs/exec.c b/fs/exec.c index ea7d439cf79efe..44320d893f1a38 100644 --- a/fs/exec.c +++ b/fs/exec.c @@ -1087,7 +1087,7 @@ static int de_thread(struct task_struct *tsk) __set_current_state(TASK_KILLABLE); spin_unlock_irq(lock); schedule(); - if (unlikely(__fatal_signal_pending(tsk))) + if (__fatal_signal_pending(tsk)) goto killed; spin_lock_irq(lock); } @@ -1115,7 +1115,7 @@ static int de_thread(struct task_struct *tsk) write_unlock_irq(&tasklist_lock); cgroup_threadgroup_change_end(tsk); schedule(); - if (unlikely(__fatal_signal_pending(tsk))) + if (__fatal_signal_pending(tsk)) goto killed; } diff --git a/fs/orangefs/orangefs-bufmap.c b/fs/orangefs/orangefs-bufmap.c index c4e98c9c16217b..443bcd8c3c1991 100644 --- a/fs/orangefs/orangefs-bufmap.c +++ b/fs/orangefs/orangefs-bufmap.c @@ -105,7 +105,7 @@ static int wait_for_free(struct slot_map *m) left = t; else left = t + (left - n); - if (unlikely(signal_pending(current))) + if (signal_pending(current)) left = -EINTR; } while (left > 0); From f86196ea8737c98ea96e5f95c99d0367be39a5d2 Mon Sep 17 00:00:00 2001 From: Nikolay Borisov Date: Thu, 3 Jan 2019 15:29:02 -0800 Subject: [PATCH 54/58] fs: don't open code lru_to_page() Multiple filesystems open code lru_to_page(). Rectify this by moving the macro from mm_inline (which is specific to lru stuff) to the more generic mm.h header and start using the macro where appropriate. No functional changes. Link: http://lkml.kernel.org/r/20181129104810.23361-1-nborisov@suse.com Link: https://lkml.kernel.org/r/20181129075301.29087-1-nborisov@suse.com Signed-off-by: Nikolay Borisov Acked-by: Michal Hocko Reviewed-by: David Hildenbrand Reviewed-by: Mike Rapoport Acked-by: Pankaj gupta Acked-by: "Yan, Zheng" [ceph] Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/afs/file.c | 5 +++-- fs/btrfs/extent_io.c | 3 +-- fs/ceph/addr.c | 5 ++--- fs/cifs/file.c | 3 ++- fs/ext4/readpage.c | 2 +- fs/ocfs2/aops.c | 3 ++- fs/orangefs/inode.c | 2 +- include/linux/mm.h | 2 ++ include/linux/mm_inline.h | 3 --- mm/swap.c | 2 +- 10 files changed, 15 insertions(+), 15 deletions(-) diff --git a/fs/afs/file.c b/fs/afs/file.c index d6bc3f5d784b56..323ae991220399 100644 --- a/fs/afs/file.c +++ b/fs/afs/file.c @@ -17,6 +17,7 @@ #include #include #include +#include #include "internal.h" static int afs_file_mmap(struct file *file, struct vm_area_struct *vma); @@ -441,7 +442,7 @@ static int afs_readpages_one(struct file *file, struct address_space *mapping, /* Count the number of contiguous pages at the front of the list. Note * that the list goes prev-wards rather than next-wards. */ - first = list_entry(pages->prev, struct page, lru); + first = lru_to_page(pages); index = first->index + 1; n = 1; for (p = first->lru.prev; p != pages; p = p->prev) { @@ -473,7 +474,7 @@ static int afs_readpages_one(struct file *file, struct address_space *mapping, * page at the end of the file. */ do { - page = list_entry(pages->prev, struct page, lru); + page = lru_to_page(pages); list_del(&page->lru); index = page->index; if (add_to_page_cache_lru(page, mapping, index, diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index fc126b92ea5961..52abe408268088 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -4103,8 +4103,7 @@ int extent_readpages(struct address_space *mapping, struct list_head *pages, while (!list_empty(pages)) { for (nr = 0; nr < ARRAY_SIZE(pagepool) && !list_empty(pages);) { - struct page *page = list_entry(pages->prev, - struct page, lru); + struct page *page = lru_to_page(pages); prefetchw(&page->flags); list_del(&page->lru); diff --git a/fs/ceph/addr.c b/fs/ceph/addr.c index 8eade7a993c1f5..5d0c05e288ccb3 100644 --- a/fs/ceph/addr.c +++ b/fs/ceph/addr.c @@ -306,7 +306,7 @@ static int start_read(struct inode *inode, struct ceph_rw_context *rw_ctx, struct ceph_osd_client *osdc = &ceph_inode_to_client(inode)->client->osdc; struct ceph_inode_info *ci = ceph_inode(inode); - struct page *page = list_entry(page_list->prev, struct page, lru); + struct page *page = lru_to_page(page_list); struct ceph_vino vino; struct ceph_osd_request *req; u64 off; @@ -333,8 +333,7 @@ static int start_read(struct inode *inode, struct ceph_rw_context *rw_ctx, if (got) ceph_put_cap_refs(ci, got); while (!list_empty(page_list)) { - page = list_entry(page_list->prev, - struct page, lru); + page = lru_to_page(page_list); list_del(&page->lru); put_page(page); } diff --git a/fs/cifs/file.c b/fs/cifs/file.c index 5e405164394a1c..e3e3a755020561 100644 --- a/fs/cifs/file.c +++ b/fs/cifs/file.c @@ -33,6 +33,7 @@ #include #include #include +#include #include #include "cifsfs.h" #include "cifspdu.h" @@ -3964,7 +3965,7 @@ readpages_get_pages(struct address_space *mapping, struct list_head *page_list, INIT_LIST_HEAD(tmplist); - page = list_entry(page_list->prev, struct page, lru); + page = lru_to_page(page_list); /* * Lock the page and put it in the cache. Since no one else diff --git a/fs/ext4/readpage.c b/fs/ext4/readpage.c index f461d75ac049f0..6aa282ee455a92 100644 --- a/fs/ext4/readpage.c +++ b/fs/ext4/readpage.c @@ -128,7 +128,7 @@ int ext4_mpage_readpages(struct address_space *mapping, prefetchw(&page->flags); if (pages) { - page = list_entry(pages->prev, struct page, lru); + page = lru_to_page(pages); list_del(&page->lru); if (add_to_page_cache_lru(page, mapping, page->index, readahead_gfp_mask(mapping))) diff --git a/fs/ocfs2/aops.c b/fs/ocfs2/aops.c index eb1ce30412dc3e..832c1759a09af4 100644 --- a/fs/ocfs2/aops.c +++ b/fs/ocfs2/aops.c @@ -30,6 +30,7 @@ #include #include #include +#include #include @@ -397,7 +398,7 @@ static int ocfs2_readpages(struct file *filp, struct address_space *mapping, * Check whether a remote node truncated this file - we just * drop out in that case as it's not worth handling here. */ - last = list_entry(pages->prev, struct page, lru); + last = lru_to_page(pages); start = (loff_t)last->index << PAGE_SHIFT; if (start >= i_size_read(inode)) goto out_unlock; diff --git a/fs/orangefs/inode.c b/fs/orangefs/inode.c index fe53381b26b184..f038235c64bdf6 100644 --- a/fs/orangefs/inode.c +++ b/fs/orangefs/inode.c @@ -77,7 +77,7 @@ static int orangefs_readpages(struct file *file, for (page_idx = 0; page_idx < nr_pages; page_idx++) { struct page *page; - page = list_entry(pages->prev, struct page, lru); + page = lru_to_page(pages); list_del(&page->lru); if (!add_to_page_cache(page, mapping, diff --git a/include/linux/mm.h b/include/linux/mm.h index 0d946f063cba38..80bb6408fe73a8 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -171,6 +171,8 @@ extern int overcommit_kbytes_handler(struct ctl_table *, int, void __user *, /* test whether an address (unsigned long or pointer) is aligned to PAGE_SIZE */ #define PAGE_ALIGNED(addr) IS_ALIGNED((unsigned long)(addr), PAGE_SIZE) +#define lru_to_page(head) (list_entry((head)->prev, struct page, lru)) + /* * Linux kernel virtual memory manager primitives. * The idea being to have a "virtual" mm in the same way diff --git a/include/linux/mm_inline.h b/include/linux/mm_inline.h index 10191c28fc04ce..04ec454d44cefd 100644 --- a/include/linux/mm_inline.h +++ b/include/linux/mm_inline.h @@ -124,7 +124,4 @@ static __always_inline enum lru_list page_lru(struct page *page) } return lru; } - -#define lru_to_page(head) (list_entry((head)->prev, struct page, lru)) - #endif diff --git a/mm/swap.c b/mm/swap.c index 4d8a1f1afaab44..4929bc1be60efa 100644 --- a/mm/swap.c +++ b/mm/swap.c @@ -126,7 +126,7 @@ void put_pages_list(struct list_head *pages) while (!list_empty(pages)) { struct page *victim; - victim = list_entry(pages->prev, struct page, lru); + victim = lru_to_page(pages); list_del(&victim->lru); put_page(victim); } From 967d3010df8b6f6f9aa95c198edc5fe3646ebf36 Mon Sep 17 00:00:00 2001 From: Qian Cai Date: Thu, 3 Jan 2019 15:29:05 -0800 Subject: [PATCH 55/58] drivers/base/platform.c: kmemleak ignore a known leak unreferenced object 0xffff808ec6dc5a80 (size 128): comm "swapper/0", pid 1, jiffies 4294938063 (age 2560.530s) hex dump (first 32 bytes): ff ff ff ff 00 00 00 00 6b 6b 6b 6b 6b 6b 6b 6b ........kkkkkkkk 6b 6b 6b 6b 6b 6b 6b 6b 6b 6b 6b 6b 6b 6b 6b 6b kkkkkkkkkkkkkkkk backtrace: [<00000000476dcf8c>] kmem_cache_alloc_trace+0x430/0x500 [<000000004f708d37>] platform_device_register_full+0xbc/0x1e8 [<000000006c2a7ec7>] acpi_create_platform_device+0x370/0x450 [<00000000ef135642>] acpi_default_enumeration+0x34/0x78 [<000000003bd9a052>] acpi_bus_attach+0x2dc/0x3e0 [<000000003cf4f7f2>] acpi_bus_attach+0x108/0x3e0 [<000000003cf4f7f2>] acpi_bus_attach+0x108/0x3e0 [<000000002968643e>] acpi_bus_scan+0xb0/0x110 [<0000000010dd0bd7>] acpi_scan_init+0x1a8/0x410 [<00000000965b3c5a>] acpi_init+0x408/0x49c [<00000000ed4b9fe2>] do_one_initcall+0x178/0x7f4 [<00000000a5ac5a74>] kernel_init_freeable+0x9d4/0xa9c [<0000000070ea6c15>] kernel_init+0x18/0x138 [<00000000fb8fff06>] ret_from_fork+0x10/0x1c [<0000000041273a0d>] 0xffffffffffffffff Then, faddr2line pointed out this line, /* * This memory isn't freed when the device is put, * I don't have a nice idea for that though. Conceptually * dma_mask in struct device should not be a pointer. * See http://thread.gmane.org/gmane.linux.kernel.pci/9081 */ pdev->dev.dma_mask = kmalloc(sizeof(*pdev->dev.dma_mask), GFP_KERNEL); Since this leak has existed for more than 8 years and it does not reference other parts of the memory, let kmemleak ignore it, so users don't need to waste time reporting this in the future. Link: http://lkml.kernel.org/r/20181206160751.36211-1-cai@gmx.us Signed-off-by: Qian Cai Reviewed-by: Andrew Morton Cc: Greg Kroah-Hartman Cc: "Rafael J . Wysocki" Cc: Catalin Marinas Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/base/platform.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/drivers/base/platform.c b/drivers/base/platform.c index be6c1eb3cbe204..1c958eb33ef4d2 100644 --- a/drivers/base/platform.c +++ b/drivers/base/platform.c @@ -26,6 +26,7 @@ #include #include #include +#include #include "base.h" #include "power/power.h" @@ -524,6 +525,8 @@ struct platform_device *platform_device_register_full( if (!pdev->dev.dma_mask) goto err; + kmemleak_ignore(pdev->dev.dma_mask); + *pdev->dev.dma_mask = pdevinfo->dma_mask; pdev->dev.coherent_dma_mask = pdevinfo->dma_mask; } From ae67ee6c5e1d5b6acdb0d51fddde651834096d75 Mon Sep 17 00:00:00 2001 From: Jorge Ramirez-Ortiz Date: Thu, 3 Jan 2019 15:29:09 -0800 Subject: [PATCH 56/58] docs: fix Co-Developed-by docs The accepted terminology will be Co-developed-by therefore lose the capital letter from now on. Link: http://lkml.kernel.org/r/1544808928-20002-2-git-send-email-jorge.ramirez-ortiz@linaro.org Signed-off-by: Jorge Ramirez-Ortiz Acked-by: Himanshu Jha Cc: Jonathan Cameron Cc: Joe Perches Cc: Greg Kroah-Hartman Cc: Niklas Cassel Cc: Jonathan Corbet Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- Documentation/process/submitting-patches.rst | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/Documentation/process/submitting-patches.rst b/Documentation/process/submitting-patches.rst index c0917107b90ab5..30dc00a364e894 100644 --- a/Documentation/process/submitting-patches.rst +++ b/Documentation/process/submitting-patches.rst @@ -510,7 +510,7 @@ tracking your trees, and to people trying to troubleshoot bugs in your tree. -12) When to use Acked-by:, Cc:, and Co-Developed-by: +12) When to use Acked-by:, Cc:, and Co-developed-by: ------------------------------------------------------- The Signed-off-by: tag indicates that the signer was involved in the @@ -543,7 +543,7 @@ person it names - but it should indicate that this person was copied on the patch. This tag documents that potentially interested parties have been included in the discussion. -A Co-Developed-by: states that the patch was also created by another developer +A Co-developed-by: states that the patch was also created by another developer along with the original author. This is useful at times when multiple people work on a single patch. Note, this person also needs to have a Signed-off-by: line in the patch as well. From d499480cc435ff153bf7e2e7ee6ab6ac44306c28 Mon Sep 17 00:00:00 2001 From: Jorge Ramirez-Ortiz Date: Thu, 3 Jan 2019 15:29:12 -0800 Subject: [PATCH 57/58] checkpatch: add Co-developed-by to signature tags As per Documentation/process/submitting-patches, Co-developed-by is a valid signature. This commit removes the warning. Link: http://lkml.kernel.org/r/1544808928-20002-3-git-send-email-jorge.ramirez-ortiz@linaro.org Signed-off-by: Jorge Ramirez-Ortiz Cc: Greg Kroah-Hartman Cc: Himanshu Jha Cc: Joe Perches Cc: Jonathan Cameron Cc: Jonathan Corbet Cc: Niklas Cassel Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- scripts/checkpatch.pl | 1 + 1 file changed, 1 insertion(+) diff --git a/scripts/checkpatch.pl b/scripts/checkpatch.pl index 93e84c9504a15f..b737ca9d720441 100755 --- a/scripts/checkpatch.pl +++ b/scripts/checkpatch.pl @@ -468,6 +468,7 @@ sub hash_show_words { our $signature_tags = qr{(?xi: Signed-off-by:| + Co-developed-by:| Acked-by:| Tested-by:| Reviewed-by:| From b685a7350ae76bc0f388e24b36d06a63776c68ee Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Thu, 3 Jan 2019 15:29:15 -0800 Subject: [PATCH 58/58] mm/page_io.c: fix polled swap page in swap_readpage() wants to do polling to bring in pages if asked to, but it doesn't mark the bio as being polled. Additionally, the looping around the blk_poll() check isn't correct - if we get a zero return, we should call io_schedule(), we can't just assume that the bio has completed. The regular bio->bi_private check should be used for that. Link: http://lkml.kernel.org/r/e15243a8-2cdf-c32c-ecee-f289377c8ef9@kernel.dk Signed-off-by: Jens Axboe Reviewed-by: Andrew Morton Cc: Christoph Hellwig Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_io.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/mm/page_io.c b/mm/page_io.c index d975fa3f02aa56..2e8019d0e048dd 100644 --- a/mm/page_io.c +++ b/mm/page_io.c @@ -401,6 +401,8 @@ int swap_readpage(struct page *page, bool synchronous) get_task_struct(current); bio->bi_private = current; bio_set_op_attrs(bio, REQ_OP_READ, 0); + if (synchronous) + bio->bi_opf |= REQ_HIPRI; count_vm_event(PSWPIN); bio_get(bio); qc = submit_bio(bio); @@ -410,7 +412,7 @@ int swap_readpage(struct page *page, bool synchronous) break; if (!blk_poll(disk->queue, qc, true)) - break; + io_schedule(); } __set_current_state(TASK_RUNNING); bio_put(bio);