From dcea7964764aad41c2994084a4c0292371b14e36 Mon Sep 17 00:00:00 2001 From: Joe Perches Date: Wed, 25 May 2022 12:03:17 -0700 Subject: [PATCH 01/72] checkpatch: add XA_STATE and XA_STATE_ORDER to the macro declaration list XA_STATE() and XA_STATE_ORDER macro uses are declarations. Add them to the declaration macro list to avoid suggesting a blank line after declarations when used. Link: https://lkml.kernel.org/r/144314f4bf2c58cf2336028a75a5127e848abd81.camel@perches.com Signed-off-by: Joe Perches Reported-by: David Howells Cc: Matthew Wilcox Signed-off-by: Andrew Morton --- scripts/checkpatch.pl | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/scripts/checkpatch.pl b/scripts/checkpatch.pl index 503e8abbb2c1e4..205bf5055acffb 100755 --- a/scripts/checkpatch.pl +++ b/scripts/checkpatch.pl @@ -1042,7 +1042,8 @@ sub build_types { our $declaration_macros = qr{(?x: (?:$Storage\s+)?(?:[A-Z_][A-Z0-9]*_){0,2}(?:DEFINE|DECLARE)(?:_[A-Z0-9]+){1,6}\s*\(| (?:$Storage\s+)?[HLP]?LIST_HEAD\s*\(| - (?:SKCIPHER_REQUEST|SHASH_DESC|AHASH_REQUEST)_ON_STACK\s*\( + (?:SKCIPHER_REQUEST|SHASH_DESC|AHASH_REQUEST)_ON_STACK\s*\(| + (?:$Storage\s+)?(?:XA_STATE|XA_STATE_ORDER)\s*\( )}; our %allow_repeated_words = ( From 0fe6ee8f123a4dfb529a5aff07536bb481f34043 Mon Sep 17 00:00:00 2001 From: Chen Zhongjin Date: Tue, 31 May 2022 09:28:54 +0800 Subject: [PATCH 02/72] profiling: fix shift too large makes kernel panic 2d186afd04d6 ("profiling: fix shift-out-of-bounds bugs") limits shift value by [0, BITS_PER_LONG -1], which means [0, 63]. However, syzbot found that the max shift value should be the bit number of (_etext - _stext). If shift is outside of this, the "buffer_bytes" will be zero and will cause kzalloc(0). Then the kernel panics due to dereferencing the returned pointer 16. This can be easily reproduced by passing a large number like 60 to enable profiling and then run readprofile. LOGS: BUG: kernel NULL pointer dereference, address: 0000000000000010 #PF: supervisor write access in kernel mode #PF: error_code(0x0002) - not-present page PGD 6148067 P4D 6148067 PUD 6142067 PMD 0 PREEMPT SMP CPU: 4 PID: 184 Comm: readprofile Not tainted 5.18.0+ #162 Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS rel-1.15.0-0-g2dd4b9b3f840-prebuilt.qemu.org 04/01/2014 RIP: 0010:read_profile+0x104/0x220 RSP: 0018:ffffc900006fbe80 EFLAGS: 00000202 RAX: 0000000000000000 RBX: 0000000000000000 RCX: 0000000000000000 RDX: ffff888006150000 RSI: 0000000000000001 RDI: ffffffff82aba4a0 RBP: 000000000188bb60 R08: 0000000000000010 R09: ffff888006151000 R10: 0000000000000000 R11: 0000000000000000 R12: ffffffff82aba4a0 R13: 0000000000000000 R14: ffffc900006fbf08 R15: 0000000000020c30 FS: 000000000188a8c0(0000) GS:ffff88803ed00000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 0000000000000010 CR3: 0000000006144000 CR4: 00000000000006e0 Call Trace: proc_reg_read+0x56/0x70 vfs_read+0x9a/0x1b0 ksys_read+0xa1/0xe0 ? fpregs_assert_state_consistent+0x1e/0x40 do_syscall_64+0x3a/0x80 entry_SYSCALL_64_after_hwframe+0x46/0xb0 RIP: 0033:0x4d4b4e RSP: 002b:00007ffebb668d58 EFLAGS: 00000246 ORIG_RAX: 0000000000000000 RAX: ffffffffffffffda RBX: 000000000188a8a0 RCX: 00000000004d4b4e RDX: 0000000000000400 RSI: 000000000188bb60 RDI: 0000000000000003 RBP: 0000000000000003 R08: 000000000000006e R09: 0000000000000000 R10: 0000000000000041 R11: 0000000000000246 R12: 000000000188bb60 R13: 0000000000000400 R14: 0000000000000000 R15: 000000000188bb60 Modules linked in: CR2: 0000000000000010 Killed ---[ end trace 0000000000000000 ]--- Check prof_len in profile_init() to prevent it be zero. Link: https://lkml.kernel.org/r/20220531012854.229439-1-chenzhongjin@huawei.com Fixes: 1da177e4c3f4 ("Linux-2.6.12-rc2") Signed-off-by: Chen Zhongjin Signed-off-by: Andrew Morton --- kernel/profile.c | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/kernel/profile.c b/kernel/profile.c index 37640a0bd8a3c7..ae82ddfc6a6845 100644 --- a/kernel/profile.c +++ b/kernel/profile.c @@ -109,6 +109,13 @@ int __ref profile_init(void) /* only text is profiled */ prof_len = (_etext - _stext) >> prof_shift; + + if (!prof_len) { + pr_warn("profiling shift: %u too large\n", prof_shift); + prof_on = 0; + return -EINVAL; + } + buffer_bytes = prof_len*sizeof(atomic_t); if (!alloc_cpumask_var(&prof_cpu_mask, GFP_KERNEL)) From 53fd5ffbb5197b8cc2d73d2bbc0f688afd45736c Mon Sep 17 00:00:00 2001 From: Junxiao Bi Date: Tue, 7 Jun 2022 10:12:26 -0700 Subject: [PATCH 03/72] ocfs2: kill EBUSY from dlmfs_evict_inode When unlinking a dlmfs, first it will invoke dlmfs_unlink(), and then invoke dlmfs_evict_inode(), user_dlm_destroy_lock() is invoked in both places, the second one from dlmfs_evict_inode() will get EBUSY error because USER_LOCK_IN_TEARDOWN is already set in lockres. This doesn't affect any function, just the error log is annoying. Link: https://lkml.kernel.org/r/20220607171226.86672-1-junxiao.bi@oracle.com Signed-off-by: Junxiao Bi Reviewed-by: Joseph Qi Cc: Mark Fasheh Cc: Joel Becker Cc: Junxiao Bi Cc: Changwei Ge Cc: Gang He Cc: Jun Piao Signed-off-by: Andrew Morton --- fs/ocfs2/dlmfs/dlmfs.c | 14 +++++++++++--- 1 file changed, 11 insertions(+), 3 deletions(-) diff --git a/fs/ocfs2/dlmfs/dlmfs.c b/fs/ocfs2/dlmfs/dlmfs.c index e360543ad7e714..8b2020f92b5f07 100644 --- a/fs/ocfs2/dlmfs/dlmfs.c +++ b/fs/ocfs2/dlmfs/dlmfs.c @@ -296,17 +296,25 @@ static void dlmfs_evict_inode(struct inode *inode) { int status; struct dlmfs_inode_private *ip; + struct user_lock_res *lockres; + int teardown; clear_inode(inode); mlog(0, "inode %lu\n", inode->i_ino); ip = DLMFS_I(inode); + lockres = &ip->ip_lockres; if (S_ISREG(inode->i_mode)) { - status = user_dlm_destroy_lock(&ip->ip_lockres); - if (status < 0) - mlog_errno(status); + spin_lock(&lockres->l_lock); + teardown = !!(lockres->l_flags & USER_LOCK_IN_TEARDOWN); + spin_unlock(&lockres->l_lock); + if (!teardown) { + status = user_dlm_destroy_lock(lockres); + if (status < 0) + mlog_errno(status); + } iput(ip->ip_parent); goto clear_fields; } From 0cc011c576aaa4de505046f7a6c90933d7c749a9 Mon Sep 17 00:00:00 2001 From: Guenter Roeck Date: Tue, 31 May 2022 15:29:51 -0700 Subject: [PATCH 04/72] lib/list_debug.c: Detect uninitialized lists In some circumstances, attempts are made to add entries to or to remove entries from an uninitialized list. A prime example is amdgpu_bo_vm_destroy(): It is indirectly called from ttm_bo_init_reserved() if that function fails, and tries to remove an entry from a list. However, that list is only initialized in amdgpu_bo_create_vm() after the call to ttm_bo_init_reserved() returned success. This results in crashes such as BUG: kernel NULL pointer dereference, address: 0000000000000000 #PF: supervisor read access in kernel mode #PF: error_code(0x0000) - not-present page PGD 0 P4D 0 Oops: 0000 [#1] PREEMPT SMP NOPTI CPU: 1 PID: 1479 Comm: chrome Not tainted 5.10.110-15768-g29a72e65dae5 Hardware name: Google Grunt/Grunt, BIOS Google_Grunt.11031.149.0 07/15/2020 RIP: 0010:__list_del_entry_valid+0x26/0x7d ... Call Trace: amdgpu_bo_vm_destroy+0x48/0x8b ttm_bo_init_reserved+0x1d7/0x1e0 amdgpu_bo_create+0x212/0x476 ? amdgpu_bo_user_destroy+0x23/0x23 ? kmem_cache_alloc+0x60/0x271 amdgpu_bo_create_vm+0x40/0x7d amdgpu_vm_pt_create+0xe8/0x24b ... Check if the list's prev and next pointers are NULL to catch such problems. Link: https://lkml.kernel.org/r/20220531222951.92073-1-linux@roeck-us.net Signed-off-by: Guenter Roeck Cc: Steven Rostedt Signed-off-by: Andrew Morton --- lib/list_debug.c | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/lib/list_debug.c b/lib/list_debug.c index 9daa3fb9d1cd61..d98d43f80958b8 100644 --- a/lib/list_debug.c +++ b/lib/list_debug.c @@ -20,7 +20,11 @@ bool __list_add_valid(struct list_head *new, struct list_head *prev, struct list_head *next) { - if (CHECK_DATA_CORRUPTION(next->prev != prev, + if (CHECK_DATA_CORRUPTION(prev == NULL, + "list_add corruption. prev is NULL.\n") || + CHECK_DATA_CORRUPTION(next == NULL, + "list_add corruption. next is NULL.\n") || + CHECK_DATA_CORRUPTION(next->prev != prev, "list_add corruption. next->prev should be prev (%px), but was %px. (next=%px).\n", prev, next->prev, next) || CHECK_DATA_CORRUPTION(prev->next != next, @@ -42,7 +46,11 @@ bool __list_del_entry_valid(struct list_head *entry) prev = entry->prev; next = entry->next; - if (CHECK_DATA_CORRUPTION(next == LIST_POISON1, + if (CHECK_DATA_CORRUPTION(next == NULL, + "list_del corruption, %px->next is NULL\n", entry) || + CHECK_DATA_CORRUPTION(prev == NULL, + "list_del corruption, %px->prev is NULL\n", entry) || + CHECK_DATA_CORRUPTION(next == LIST_POISON1, "list_del corruption, %px->next is LIST_POISON1 (%px)\n", entry, LIST_POISON1) || CHECK_DATA_CORRUPTION(prev == LIST_POISON2, From a91befde350375b1ff954635acdde14dc92cd9a8 Mon Sep 17 00:00:00 2001 From: wuchi Date: Sat, 4 Jun 2022 21:15:02 +0800 Subject: [PATCH 05/72] lib/flex_proportions.c: remove local_irq_ops in fprop_new_period() commit e78d4833c03e28> "lib: Fix possible deadlock in flexible proportion code" adds the local_irq_ops because percpu_counter_{sum |add} ops'lock can cause deadlock by interrupts. Now percpu_counter _{sum|add} ops use raw_spin_(un)lock_irq*, so revert the commit and resolve the conflict. Link: https://lkml.kernel.org/r/20220604131502.5190-1-wuchi.zero@gmail.com Signed-off-by: wuchi Reviewed-by: Jan Kara Cc: Christoph Hellwig Signed-off-by: Andrew Morton --- lib/flex_proportions.c | 10 ++-------- 1 file changed, 2 insertions(+), 8 deletions(-) diff --git a/lib/flex_proportions.c b/lib/flex_proportions.c index 53e7eb1dd76c96..05cccbcf1661a3 100644 --- a/lib/flex_proportions.c +++ b/lib/flex_proportions.c @@ -63,18 +63,13 @@ void fprop_global_destroy(struct fprop_global *p) */ bool fprop_new_period(struct fprop_global *p, int periods) { - s64 events; - unsigned long flags; + s64 events = percpu_counter_sum(&p->events); - local_irq_save(flags); - events = percpu_counter_sum(&p->events); /* * Don't do anything if there are no events. */ - if (events <= 1) { - local_irq_restore(flags); + if (events <= 1) return false; - } write_seqcount_begin(&p->sequence); if (periods < 64) events -= events >> periods; @@ -82,7 +77,6 @@ bool fprop_new_period(struct fprop_global *p, int periods) percpu_counter_add(&p->events, -events); p->period += periods; write_seqcount_end(&p->sequence); - local_irq_restore(flags); return true; } From 4815a36009044ba69a9b8d781943ec6505c451a2 Mon Sep 17 00:00:00 2001 From: Andy Shevchenko Date: Fri, 3 Jun 2022 20:10:12 +0300 Subject: [PATCH 06/72] include/linux/rbtree.h: replace kernel.h with the necessary inclusions When kernel.h is used in the headers it adds a lot into dependency hell, especially when there are circular dependencies are involved. Replace kernel.h inclusion with the list of what is really being used. Link: https://lkml.kernel.org/r/20220603171012.48880-1-andriy.shevchenko@linux.intel.com Signed-off-by: Andy Shevchenko Cc: Matthew Wilcox Signed-off-by: Andrew Morton --- include/linux/rbtree.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/linux/rbtree.h b/include/linux/rbtree.h index 235047d7a1b5e8..f7edca369edadd 100644 --- a/include/linux/rbtree.h +++ b/include/linux/rbtree.h @@ -17,9 +17,9 @@ #ifndef _LINUX_RBTREE_H #define _LINUX_RBTREE_H +#include #include -#include #include #include From 9776e3861e0e30330f6c8ca9c30348f336d24b1c Mon Sep 17 00:00:00 2001 From: Luc Van Oostenryck Date: Sun, 5 Jun 2022 18:07:38 +0200 Subject: [PATCH 07/72] ia64: fix sparse warnings with cmpxchg() & xchg() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit On IA64, new sparse's warnings where issued after fixing some __rcu annotations in kernel/bpf/. These new warnings are false positives and appear on IA64 because on this architecture, the macros for cmpxchg() and xchg() make casts that ignore sparse annotations. This patch contains the minimal patch to fix this issue: adding a missing cast and some missing '__force'. Link: https://lore.kernel.org/r/20220601120013.bq5a3ynbkc3hngm5@mail Link: https://lkml.kernel.org/r/20220605160738.79736-1-luc.vanoostenryck@gmail.com Signed-off-by: Luc Van Oostenryck Reported-by: kernel test robot Acked-by: Paul E. McKenney Acked-by: Toke Høiland-Jørgensen Cc: Arnd Bergmann Signed-off-by: Andrew Morton --- arch/ia64/include/uapi/asm/cmpxchg.h | 28 ++++++++++++++-------------- 1 file changed, 14 insertions(+), 14 deletions(-) diff --git a/arch/ia64/include/uapi/asm/cmpxchg.h b/arch/ia64/include/uapi/asm/cmpxchg.h index 2c2f3cfeaa77b3..ca2e0268534384 100644 --- a/arch/ia64/include/uapi/asm/cmpxchg.h +++ b/arch/ia64/include/uapi/asm/cmpxchg.h @@ -33,24 +33,24 @@ extern void ia64_xchg_called_with_bad_pointer(void); \ switch (size) { \ case 1: \ - __xchg_result = ia64_xchg1((__u8 *)ptr, x); \ + __xchg_result = ia64_xchg1((__u8 __force *)ptr, x); \ break; \ \ case 2: \ - __xchg_result = ia64_xchg2((__u16 *)ptr, x); \ + __xchg_result = ia64_xchg2((__u16 __force *)ptr, x); \ break; \ \ case 4: \ - __xchg_result = ia64_xchg4((__u32 *)ptr, x); \ + __xchg_result = ia64_xchg4((__u32 __force *)ptr, x); \ break; \ \ case 8: \ - __xchg_result = ia64_xchg8((__u64 *)ptr, x); \ + __xchg_result = ia64_xchg8((__u64 __force *)ptr, x); \ break; \ default: \ ia64_xchg_called_with_bad_pointer(); \ } \ - __xchg_result; \ + (__typeof__ (*(ptr)) __force) __xchg_result; \ }) #ifndef __KERNEL__ @@ -76,42 +76,42 @@ extern long ia64_cmpxchg_called_with_bad_pointer(void); \ switch (size) { \ case 1: \ - _o_ = (__u8) (long) (old); \ + _o_ = (__u8) (long __force) (old); \ break; \ case 2: \ - _o_ = (__u16) (long) (old); \ + _o_ = (__u16) (long __force) (old); \ break; \ case 4: \ - _o_ = (__u32) (long) (old); \ + _o_ = (__u32) (long __force) (old); \ break; \ case 8: \ - _o_ = (__u64) (long) (old); \ + _o_ = (__u64) (long __force) (old); \ break; \ default: \ break; \ } \ switch (size) { \ case 1: \ - _r_ = ia64_cmpxchg1_##sem((__u8 *) ptr, new, _o_); \ + _r_ = ia64_cmpxchg1_##sem((__u8 __force *) ptr, new, _o_); \ break; \ \ case 2: \ - _r_ = ia64_cmpxchg2_##sem((__u16 *) ptr, new, _o_); \ + _r_ = ia64_cmpxchg2_##sem((__u16 __force *) ptr, new, _o_); \ break; \ \ case 4: \ - _r_ = ia64_cmpxchg4_##sem((__u32 *) ptr, new, _o_); \ + _r_ = ia64_cmpxchg4_##sem((__u32 __force *) ptr, new, _o_); \ break; \ \ case 8: \ - _r_ = ia64_cmpxchg8_##sem((__u64 *) ptr, new, _o_); \ + _r_ = ia64_cmpxchg8_##sem((__u64 __force *) ptr, new, _o_); \ break; \ \ default: \ _r_ = ia64_cmpxchg_called_with_bad_pointer(); \ break; \ } \ - (__typeof__(old)) _r_; \ + (__typeof__(old) __force) _r_; \ }) #define cmpxchg_acq(ptr, o, n) \ From c0af32fdc625c0e7f03465a813b04cbfb5419a1e Mon Sep 17 00:00:00 2001 From: wuchi Date: Tue, 7 Jun 2022 21:35:56 +0800 Subject: [PATCH 08/72] lib/btree: simplify btree_{lookup|update} btree_{lookup|update} both need to look up node by key, using the common parts(add function btree_lookup_node) to simplify code. Link: https://lkml.kernel.org/r/20220607133556.34732-1-wuchi.zero@gmail.com Signed-off-by: wuchi Cc: Peter Zijlstra (Intel) Cc: Arnaldo Carvalho de Melo Cc: Jiri Olsa Signed-off-by: Andrew Morton --- lib/btree.c | 30 +++++++++++++----------------- 1 file changed, 13 insertions(+), 17 deletions(-) diff --git a/lib/btree.c b/lib/btree.c index b4cf08a5c26789..a82100c73b5597 100644 --- a/lib/btree.c +++ b/lib/btree.c @@ -238,7 +238,7 @@ static int keyzero(struct btree_geo *geo, unsigned long *key) return 1; } -void *btree_lookup(struct btree_head *head, struct btree_geo *geo, +static void *btree_lookup_node(struct btree_head *head, struct btree_geo *geo, unsigned long *key) { int i, height = head->height; @@ -257,7 +257,16 @@ void *btree_lookup(struct btree_head *head, struct btree_geo *geo, if (!node) return NULL; } + return node; +} +void *btree_lookup(struct btree_head *head, struct btree_geo *geo, + unsigned long *key) +{ + int i; + unsigned long *node; + + node = btree_lookup_node(head, geo, key); if (!node) return NULL; @@ -271,23 +280,10 @@ EXPORT_SYMBOL_GPL(btree_lookup); int btree_update(struct btree_head *head, struct btree_geo *geo, unsigned long *key, void *val) { - int i, height = head->height; - unsigned long *node = head->node; - - if (height == 0) - return -ENOENT; - - for ( ; height > 1; height--) { - for (i = 0; i < geo->no_pairs; i++) - if (keycmp(geo, node, i, key) <= 0) - break; - if (i == geo->no_pairs) - return -ENOENT; - node = bval(geo, node, i); - if (!node) - return -ENOENT; - } + int i; + unsigned long *node; + node = btree_lookup_node(head, geo, key); if (!node) return -ENOENT; From d30dfd490f7dc4cb6a7c11a647bd1ff7a22139e7 Mon Sep 17 00:00:00 2001 From: Justin Stitt Date: Wed, 8 Jun 2022 15:35:39 -0700 Subject: [PATCH 09/72] include/uapi/linux/swab.h: move explicit cast outside ternary A cast inside __builtin_constant_p doesn't do anything since it should evaluate as constant at compile time irrespective of this cast. Instead, I moved this cast outside the ternary to ensure the return type is as expected. Additionally, if __HAVE_BUILTIN_BSWAP16__ was not defined then __swab16 is actually returning an `int` not a `u16` due to integer promotion. As Al Viro notes: You *can't* get smaller-than-int out of ? :, same as you can't get it out of addition, etc. This also fixes some clang -Wformat warnings involving default argument promotion. Link: https://github.com/ClangBuiltLinux/linux/issues/378 Link: https://lkml.kernel.org/r/20220608223539.470472-1-justinstitt@google.com Signed-off-by: Justin Stitt Suggested-by: Al Viro Suggested-by: Nathan Chancellor Reviewed-by: Nathan Chancellor Suggested-by: Nick Desaulniers Signed-off-by: Andrew Morton --- include/uapi/linux/swab.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/include/uapi/linux/swab.h b/include/uapi/linux/swab.h index 7272f85d6d6ab5..0723a9cce747c8 100644 --- a/include/uapi/linux/swab.h +++ b/include/uapi/linux/swab.h @@ -102,7 +102,7 @@ static inline __attribute_const__ __u32 __fswahb32(__u32 val) #define __swab16(x) (__u16)__builtin_bswap16((__u16)(x)) #else #define __swab16(x) \ - (__builtin_constant_p((__u16)(x)) ? \ + (__u16)(__builtin_constant_p(x) ? \ ___constant_swab16(x) : \ __fswab16(x)) #endif @@ -115,7 +115,7 @@ static inline __attribute_const__ __u32 __fswahb32(__u32 val) #define __swab32(x) (__u32)__builtin_bswap32((__u32)(x)) #else #define __swab32(x) \ - (__builtin_constant_p((__u32)(x)) ? \ + (__u32)(__builtin_constant_p(x) ? \ ___constant_swab32(x) : \ __fswab32(x)) #endif @@ -128,7 +128,7 @@ static inline __attribute_const__ __u32 __fswahb32(__u32 val) #define __swab64(x) (__u64)__builtin_bswap64((__u64)(x)) #else #define __swab64(x) \ - (__builtin_constant_p((__u64)(x)) ? \ + (__u64)(__builtin_constant_p(x) ? \ ___constant_swab64(x) : \ __fswab64(x)) #endif From dabba87229411a5e9d20ac03ffc36463c53ae672 Mon Sep 17 00:00:00 2001 From: Pasha Tatashin Date: Fri, 27 May 2022 02:55:34 +0000 Subject: [PATCH 10/72] fs/kernel_read_file: allow to read files up-to ssize_t Patch series "Allow to kexec with initramfs larger than 2G", v2. Currently, the largest initramfs that is supported by kexec_file_load() syscall is 2G. This is because kernel_read_file() returns int, and is limited to INT_MAX or 2G. On the other hand, there are kexec based boot loaders (i.e. u-root), that may need to boot netboot images that might be larger than 2G. The first patch changes the return type from int to ssize_t in kernel_read_file* functions. The second patch increases the maximum initramfs file size to 4G. Tested: verified that can kexec_file_load() works with 4G initramfs on x86_64. This patch (of 2): Currently, the maximum file size that is supported is 2G. This may be too small in some cases. For example, kexec_file_load() system call loads initramfs. In some netboot cases initramfs can be rather large. Allow to use up-to ssize_t bytes. The callers still can limit the maximum file size via buf_size. Link: https://lkml.kernel.org/r/20220527025535.3953665-1-pasha.tatashin@soleen.com Link: https://lkml.kernel.org/r/20220527025535.3953665-2-pasha.tatashin@soleen.com Signed-off-by: Pasha Tatashin Cc: Al Viro Cc: Baoquan He Cc: "Eric W. Biederman" Cc: Greg Thelen Cc: Sasha Levin Signed-off-by: Andrew Morton --- fs/kernel_read_file.c | 38 ++++++++++++++++---------------- include/linux/kernel_read_file.h | 32 +++++++++++++-------------- include/linux/limits.h | 1 + 3 files changed, 36 insertions(+), 35 deletions(-) diff --git a/fs/kernel_read_file.c b/fs/kernel_read_file.c index 1b07550485b964..5d826274570cab 100644 --- a/fs/kernel_read_file.c +++ b/fs/kernel_read_file.c @@ -29,15 +29,15 @@ * change between calls to kernel_read_file(). * * Returns number of bytes read (no single read will be bigger - * than INT_MAX), or negative on error. + * than SSIZE_MAX), or negative on error. * */ -int kernel_read_file(struct file *file, loff_t offset, void **buf, - size_t buf_size, size_t *file_size, - enum kernel_read_file_id id) +ssize_t kernel_read_file(struct file *file, loff_t offset, void **buf, + size_t buf_size, size_t *file_size, + enum kernel_read_file_id id) { loff_t i_size, pos; - size_t copied; + ssize_t copied; void *allocated = NULL; bool whole_file; int ret; @@ -58,7 +58,7 @@ int kernel_read_file(struct file *file, loff_t offset, void **buf, goto out; } /* The file is too big for sane activities. */ - if (i_size > INT_MAX) { + if (i_size > SSIZE_MAX) { ret = -EFBIG; goto out; } @@ -124,12 +124,12 @@ int kernel_read_file(struct file *file, loff_t offset, void **buf, } EXPORT_SYMBOL_GPL(kernel_read_file); -int kernel_read_file_from_path(const char *path, loff_t offset, void **buf, - size_t buf_size, size_t *file_size, - enum kernel_read_file_id id) +ssize_t kernel_read_file_from_path(const char *path, loff_t offset, void **buf, + size_t buf_size, size_t *file_size, + enum kernel_read_file_id id) { struct file *file; - int ret; + ssize_t ret; if (!path || !*path) return -EINVAL; @@ -144,14 +144,14 @@ int kernel_read_file_from_path(const char *path, loff_t offset, void **buf, } EXPORT_SYMBOL_GPL(kernel_read_file_from_path); -int kernel_read_file_from_path_initns(const char *path, loff_t offset, - void **buf, size_t buf_size, - size_t *file_size, - enum kernel_read_file_id id) +ssize_t kernel_read_file_from_path_initns(const char *path, loff_t offset, + void **buf, size_t buf_size, + size_t *file_size, + enum kernel_read_file_id id) { struct file *file; struct path root; - int ret; + ssize_t ret; if (!path || !*path) return -EINVAL; @@ -171,12 +171,12 @@ int kernel_read_file_from_path_initns(const char *path, loff_t offset, } EXPORT_SYMBOL_GPL(kernel_read_file_from_path_initns); -int kernel_read_file_from_fd(int fd, loff_t offset, void **buf, - size_t buf_size, size_t *file_size, - enum kernel_read_file_id id) +ssize_t kernel_read_file_from_fd(int fd, loff_t offset, void **buf, + size_t buf_size, size_t *file_size, + enum kernel_read_file_id id) { struct fd f = fdget(fd); - int ret = -EBADF; + ssize_t ret = -EBADF; if (!f.file || !(f.file->f_mode & FMODE_READ)) goto out; diff --git a/include/linux/kernel_read_file.h b/include/linux/kernel_read_file.h index 575ffa1031d348..90451e2e12bd19 100644 --- a/include/linux/kernel_read_file.h +++ b/include/linux/kernel_read_file.h @@ -35,21 +35,21 @@ static inline const char *kernel_read_file_id_str(enum kernel_read_file_id id) return kernel_read_file_str[id]; } -int kernel_read_file(struct file *file, loff_t offset, - void **buf, size_t buf_size, - size_t *file_size, - enum kernel_read_file_id id); -int kernel_read_file_from_path(const char *path, loff_t offset, - void **buf, size_t buf_size, - size_t *file_size, - enum kernel_read_file_id id); -int kernel_read_file_from_path_initns(const char *path, loff_t offset, - void **buf, size_t buf_size, - size_t *file_size, - enum kernel_read_file_id id); -int kernel_read_file_from_fd(int fd, loff_t offset, - void **buf, size_t buf_size, - size_t *file_size, - enum kernel_read_file_id id); +ssize_t kernel_read_file(struct file *file, loff_t offset, + void **buf, size_t buf_size, + size_t *file_size, + enum kernel_read_file_id id); +ssize_t kernel_read_file_from_path(const char *path, loff_t offset, + void **buf, size_t buf_size, + size_t *file_size, + enum kernel_read_file_id id); +ssize_t kernel_read_file_from_path_initns(const char *path, loff_t offset, + void **buf, size_t buf_size, + size_t *file_size, + enum kernel_read_file_id id); +ssize_t kernel_read_file_from_fd(int fd, loff_t offset, + void **buf, size_t buf_size, + size_t *file_size, + enum kernel_read_file_id id); #endif /* _LINUX_KERNEL_READ_FILE_H */ diff --git a/include/linux/limits.h b/include/linux/limits.h index b568b9c30bbf58..f6bcc936901071 100644 --- a/include/linux/limits.h +++ b/include/linux/limits.h @@ -7,6 +7,7 @@ #include #define SIZE_MAX (~(size_t)0) +#define SSIZE_MAX ((ssize_t)(SIZE_MAX >> 1)) #define PHYS_ADDR_MAX (~(phys_addr_t)0) #define U8_MAX ((u8)~0U) From f4da7afe07523ff8930c4466b09a15db18508cd4 Mon Sep 17 00:00:00 2001 From: Pasha Tatashin Date: Fri, 27 May 2022 02:55:35 +0000 Subject: [PATCH 11/72] kexec_file: increase maximum file size to 4G In some case initrd can be large. For example, it could be a netboot image loaded by u-root, that is kexec'ing into it. The maximum size of initrd is arbitrary set to 2G. Also, the limit is not very obvious because it is hidden behind a generic INT_MAX macro. Theoretically, we could make it LONG_MAX, but it is safer to keep it sane, and just increase it to 4G. Increase the size to 4G, and make it obvious by having a new macro that specifies the maximum file size supported by kexec_file_load() syscall: KEXEC_FILE_SIZE_MAX. Link: https://lkml.kernel.org/r/20220527025535.3953665-3-pasha.tatashin@soleen.com Signed-off-by: Pasha Tatashin Cc: Sasha Levin Cc: "Eric W. Biederman" Cc: Greg Thelen Cc: Al Viro Cc: Baoquan He Signed-off-by: Andrew Morton --- kernel/kexec_file.c | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/kernel/kexec_file.c b/kernel/kexec_file.c index 145321a5e798a6..9b2839775c837b 100644 --- a/kernel/kexec_file.c +++ b/kernel/kexec_file.c @@ -31,6 +31,9 @@ static int kexec_calculate_store_digests(struct kimage *image); +/* Maximum size in bytes for kernel/initrd files. */ +#define KEXEC_FILE_SIZE_MAX min_t(s64, 4LL << 30, SSIZE_MAX) + /* * Currently this is the only default function that is exported as some * architectures need it to do additional handlings. @@ -189,11 +192,12 @@ kimage_file_prepare_segments(struct kimage *image, int kernel_fd, int initrd_fd, const char __user *cmdline_ptr, unsigned long cmdline_len, unsigned flags) { - int ret; + ssize_t ret; void *ldata; ret = kernel_read_file_from_fd(kernel_fd, 0, &image->kernel_buf, - INT_MAX, NULL, READING_KEXEC_IMAGE); + KEXEC_FILE_SIZE_MAX, NULL, + READING_KEXEC_IMAGE); if (ret < 0) return ret; image->kernel_buf_len = ret; @@ -213,7 +217,7 @@ kimage_file_prepare_segments(struct kimage *image, int kernel_fd, int initrd_fd, /* It is possible that there no initramfs is being loaded */ if (!(flags & KEXEC_FILE_NO_INITRAMFS)) { ret = kernel_read_file_from_fd(initrd_fd, 0, &image->initrd_buf, - INT_MAX, NULL, + KEXEC_FILE_SIZE_MAX, NULL, READING_KEXEC_INITRAMFS); if (ret < 0) goto out; From 0aed4724a8392f2567f83c9c4b9decf447d752a2 Mon Sep 17 00:00:00 2001 From: cxbing Date: Thu, 9 Jun 2022 07:44:59 -0700 Subject: [PATCH 12/72] delayacct: remove some unused variables Drop the unused variables *done* and *count*. Link: https://lkml.kernel.org/r/20220609144459.86379-1-zhangkkoo@126.com Signed-off-by: cxbing Signed-off-by: Andrew Morton --- tools/accounting/getdelays.c | 4 ---- 1 file changed, 4 deletions(-) diff --git a/tools/accounting/getdelays.c b/tools/accounting/getdelays.c index e83e6e47a21ea1..938dec0dfaad84 100644 --- a/tools/accounting/getdelays.c +++ b/tools/accounting/getdelays.c @@ -45,7 +45,6 @@ exit(code); \ } while (0) -int done; int rcvbufsz; char name[100]; int dbg; @@ -285,7 +284,6 @@ int main(int argc, char *argv[]) pid_t rtid = 0; int fd = 0; - int count = 0; int write_file = 0; int maskset = 0; char *logfile = NULL; @@ -495,7 +493,6 @@ int main(int argc, char *argv[]) len2 = 0; /* For nested attributes, na follows */ na = (struct nlattr *) NLA_DATA(na); - done = 0; while (len2 < aggr_len) { switch (na->nla_type) { case TASKSTATS_TYPE_PID: @@ -509,7 +506,6 @@ int main(int argc, char *argv[]) printf("TGID\t%d\n", rtid); break; case TASKSTATS_TYPE_STATS: - count++; if (print_delays) print_delayacct((struct taskstats *) NLA_DATA(na)); if (print_io_accounting) From f268eedddf3595e85f8883dc50aed29654785696 Mon Sep 17 00:00:00 2001 From: Phillip Lougher Date: Sat, 11 Jun 2022 04:21:32 +0100 Subject: [PATCH 13/72] squashfs: extend "page actor" to handle missing pages Patch series "Squashfs: handle missing pages decompressing into page cache". This patchset enables Squashfs to handle missing pages when directly decompressing datablocks into the page cache. Previously if the full set of pages needed was not available, Squashfs would have to fall back to using an intermediate buffer (the older method), which is slower, involving a memcopy, and it introduces contention on a shared buffer. The first patch extends the "page actor" code to handle missing pages. The second patch updates Squashfs_readpage_block() to use the new functionality, and removes the code that falls back to using an intermediate buffer. This patchset is independent of the readahead work, and it is standalone. It can be merged on its own. But the readahead patch for efficiency also needs this patch-set. This patch (of 2): This patch extends the "page actor" code to handle missing pages. Previously if the full set of pages needed to decompress a Squashfs datablock was unavailable, this would cause decompression to fail on the missing pages. In this case direct decompression into the page cache could not be achieved and the code would fall back to using the older intermediate buffer method. With this patch, direct decompression into the page cache can be achieved with missing pages. For "multi-shot" decompressors (zlib, xz, zstd), the page actor will allocate a temporary buffer which is passed to the decompressor, and then freed by the page actor. For "single shot" decompressors (lz4, lzo) which decompress into a contiguous "bounce buffer", and which is then copied into the page cache, it would be pointless to allocate a temporary buffer, memcpy into it, and then free it. For these decompressors -ENOMEM is returned, which signifies that the memcpy for that page should be skipped. This also happens if the data block is uncompressed. Link: https://lkml.kernel.org/r/20220611032133.5743-1-phillip@squashfs.org.uk Link: https://lkml.kernel.org/r/20220611032133.5743-2-phillip@squashfs.org.uk Signed-off-by: Phillip Lougher Cc: Matthew Wilcox (Oracle) Cc: Hsin-Yi Wang Cc: Xiongwei Song Signed-off-by: Andrew Morton --- fs/squashfs/block.c | 10 ++++--- fs/squashfs/decompressor.h | 1 + fs/squashfs/file_direct.c | 21 ++++++++------- fs/squashfs/lz4_wrapper.c | 7 +++-- fs/squashfs/lzo_wrapper.c | 7 +++-- fs/squashfs/page_actor.c | 55 ++++++++++++++++++++++++++++++++------ fs/squashfs/page_actor.h | 21 ++++++++++++--- fs/squashfs/xz_wrapper.c | 11 +++++++- fs/squashfs/zlib_wrapper.c | 12 ++++++++- fs/squashfs/zstd_wrapper.c | 12 ++++++++- 10 files changed, 126 insertions(+), 31 deletions(-) diff --git a/fs/squashfs/block.c b/fs/squashfs/block.c index 8879d052f96c6a..833aca92301f0e 100644 --- a/fs/squashfs/block.c +++ b/fs/squashfs/block.c @@ -34,12 +34,15 @@ static int copy_bio_to_actor(struct bio *bio, struct squashfs_page_actor *actor, int offset, int req_length) { - void *actor_addr = squashfs_first_page(actor); + void *actor_addr; struct bvec_iter_all iter_all = {}; struct bio_vec *bvec = bvec_init_iter_all(&iter_all); int copied_bytes = 0; int actor_offset = 0; + squashfs_actor_nobuff(actor); + actor_addr = squashfs_first_page(actor); + if (WARN_ON_ONCE(!bio_next_segment(bio, &iter_all))) return 0; @@ -49,8 +52,9 @@ static int copy_bio_to_actor(struct bio *bio, bytes_to_copy = min_t(int, bytes_to_copy, req_length - copied_bytes); - memcpy(actor_addr + actor_offset, bvec_virt(bvec) + offset, - bytes_to_copy); + if (!IS_ERR(actor_addr)) + memcpy(actor_addr + actor_offset, bvec_virt(bvec) + + offset, bytes_to_copy); actor_offset += bytes_to_copy; copied_bytes += bytes_to_copy; diff --git a/fs/squashfs/decompressor.h b/fs/squashfs/decompressor.h index 1b9ccfd0aa519b..19ab608343895f 100644 --- a/fs/squashfs/decompressor.h +++ b/fs/squashfs/decompressor.h @@ -20,6 +20,7 @@ struct squashfs_decompressor { struct bio *, int, int, struct squashfs_page_actor *); int id; char *name; + int alloc_buffer; int supported; }; diff --git a/fs/squashfs/file_direct.c b/fs/squashfs/file_direct.c index a4894cc5944719..5af5802f5626cb 100644 --- a/fs/squashfs/file_direct.c +++ b/fs/squashfs/file_direct.c @@ -47,14 +47,6 @@ int squashfs_readpage_block(struct page *target_page, u64 block, int bsize, if (page == NULL) return res; - /* - * Create a "page actor" which will kmap and kunmap the - * page cache pages appropriately within the decompressor - */ - actor = squashfs_page_actor_init_special(page, pages, 0); - if (actor == NULL) - goto out; - /* Try to grab all the pages covered by the Squashfs block */ for (missing_pages = 0, i = 0, n = start_index; i < pages; i++, n++) { page[i] = (n == target_page->index) ? target_page : @@ -89,8 +81,19 @@ int squashfs_readpage_block(struct page *target_page, u64 block, int bsize, goto out; } + /* + * Create a "page actor" which will kmap and kunmap the + * page cache pages appropriately within the decompressor + */ + actor = squashfs_page_actor_init_special(msblk, page, pages, 0); + if (actor == NULL) + goto out; + /* Decompress directly into the page cache buffers */ res = squashfs_read_data(inode->i_sb, block, bsize, NULL, actor); + + kfree(actor); + if (res < 0) goto mark_errored; @@ -116,7 +119,6 @@ int squashfs_readpage_block(struct page *target_page, u64 block, int bsize, put_page(page[i]); } - kfree(actor); kfree(page); return 0; @@ -135,7 +137,6 @@ int squashfs_readpage_block(struct page *target_page, u64 block, int bsize, } out: - kfree(actor); kfree(page); return res; } diff --git a/fs/squashfs/lz4_wrapper.c b/fs/squashfs/lz4_wrapper.c index b685b6238316c7..49797729f14383 100644 --- a/fs/squashfs/lz4_wrapper.c +++ b/fs/squashfs/lz4_wrapper.c @@ -119,10 +119,12 @@ static int lz4_uncompress(struct squashfs_sb_info *msblk, void *strm, buff = stream->output; while (data) { if (bytes <= PAGE_SIZE) { - memcpy(data, buff, bytes); + if (!IS_ERR(data)) + memcpy(data, buff, bytes); break; } - memcpy(data, buff, PAGE_SIZE); + if (!IS_ERR(data)) + memcpy(data, buff, PAGE_SIZE); buff += PAGE_SIZE; bytes -= PAGE_SIZE; data = squashfs_next_page(output); @@ -139,5 +141,6 @@ const struct squashfs_decompressor squashfs_lz4_comp_ops = { .decompress = lz4_uncompress, .id = LZ4_COMPRESSION, .name = "lz4", + .alloc_buffer = 0, .supported = 1 }; diff --git a/fs/squashfs/lzo_wrapper.c b/fs/squashfs/lzo_wrapper.c index cb510a63196836..d216aeefa865ce 100644 --- a/fs/squashfs/lzo_wrapper.c +++ b/fs/squashfs/lzo_wrapper.c @@ -93,10 +93,12 @@ static int lzo_uncompress(struct squashfs_sb_info *msblk, void *strm, buff = stream->output; while (data) { if (bytes <= PAGE_SIZE) { - memcpy(data, buff, bytes); + if (!IS_ERR(data)) + memcpy(data, buff, bytes); break; } else { - memcpy(data, buff, PAGE_SIZE); + if (!IS_ERR(data)) + memcpy(data, buff, PAGE_SIZE); buff += PAGE_SIZE; bytes -= PAGE_SIZE; data = squashfs_next_page(output); @@ -116,5 +118,6 @@ const struct squashfs_decompressor squashfs_lzo_comp_ops = { .decompress = lzo_uncompress, .id = LZO_COMPRESSION, .name = "lzo", + .alloc_buffer = 0, .supported = 1 }; diff --git a/fs/squashfs/page_actor.c b/fs/squashfs/page_actor.c index 520d323a99ce67..b23b780d8f42ec 100644 --- a/fs/squashfs/page_actor.c +++ b/fs/squashfs/page_actor.c @@ -7,6 +7,8 @@ #include #include #include +#include "squashfs_fs_sb.h" +#include "decompressor.h" #include "page_actor.h" /* @@ -57,29 +59,62 @@ struct squashfs_page_actor *squashfs_page_actor_init(void **buffer, } /* Implementation of page_actor for decompressing directly into page cache. */ +static void *handle_next_page(struct squashfs_page_actor *actor) +{ + int max_pages = (actor->length + PAGE_SIZE - 1) >> PAGE_SHIFT; + + if (actor->returned_pages == max_pages) + return NULL; + + if ((actor->next_page == actor->pages) || + (actor->next_index != actor->page[actor->next_page]->index)) { + if (actor->alloc_buffer) { + void *tmp_buffer = kmalloc(PAGE_SIZE, GFP_KERNEL); + + if (tmp_buffer) { + actor->tmp_buffer = tmp_buffer; + actor->next_index++; + actor->returned_pages++; + return tmp_buffer; + } + } + + actor->next_index++; + actor->returned_pages++; + return ERR_PTR(-ENOMEM); + } + + actor->next_index++; + actor->returned_pages++; + return actor->pageaddr = kmap_local_page(actor->page[actor->next_page++]); +} + static void *direct_first_page(struct squashfs_page_actor *actor) { - actor->next_page = 1; - return actor->pageaddr = kmap_atomic(actor->page[0]); + return handle_next_page(actor); } static void *direct_next_page(struct squashfs_page_actor *actor) { if (actor->pageaddr) - kunmap_atomic(actor->pageaddr); + kunmap_local(actor->pageaddr); + + kfree(actor->tmp_buffer); + actor->pageaddr = actor->tmp_buffer = NULL; - return actor->pageaddr = actor->next_page == actor->pages ? NULL : - kmap_atomic(actor->page[actor->next_page++]); + return handle_next_page(actor); } static void direct_finish_page(struct squashfs_page_actor *actor) { if (actor->pageaddr) - kunmap_atomic(actor->pageaddr); + kunmap_local(actor->pageaddr); + + kfree(actor->tmp_buffer); } -struct squashfs_page_actor *squashfs_page_actor_init_special(struct page **page, - int pages, int length) +struct squashfs_page_actor *squashfs_page_actor_init_special(struct squashfs_sb_info *msblk, + struct page **page, int pages, int length) { struct squashfs_page_actor *actor = kmalloc(sizeof(*actor), GFP_KERNEL); @@ -90,7 +125,11 @@ struct squashfs_page_actor *squashfs_page_actor_init_special(struct page **page, actor->page = page; actor->pages = pages; actor->next_page = 0; + actor->returned_pages = 0; + actor->next_index = page[0]->index & ~((1 << (msblk->block_log - PAGE_SHIFT)) - 1); actor->pageaddr = NULL; + actor->tmp_buffer = NULL; + actor->alloc_buffer = msblk->decompressor->alloc_buffer; actor->squashfs_first_page = direct_first_page; actor->squashfs_next_page = direct_next_page; actor->squashfs_finish_page = direct_finish_page; diff --git a/fs/squashfs/page_actor.h b/fs/squashfs/page_actor.h index 2e3073ace0097b..37523c54256fa7 100644 --- a/fs/squashfs/page_actor.h +++ b/fs/squashfs/page_actor.h @@ -45,6 +45,11 @@ static inline void squashfs_finish_page(struct squashfs_page_actor *actor) { /* empty */ } + +static inline void squashfs_actor_nobuff(struct squashfs_page_actor *actor) +{ + /* empty */ +} #else struct squashfs_page_actor { union { @@ -52,17 +57,23 @@ struct squashfs_page_actor { struct page **page; }; void *pageaddr; + void *tmp_buffer; void *(*squashfs_first_page)(struct squashfs_page_actor *); void *(*squashfs_next_page)(struct squashfs_page_actor *); void (*squashfs_finish_page)(struct squashfs_page_actor *); int pages; int length; int next_page; + int alloc_buffer; + int returned_pages; + pgoff_t next_index; }; -extern struct squashfs_page_actor *squashfs_page_actor_init(void **, int, int); -extern struct squashfs_page_actor *squashfs_page_actor_init_special(struct page - **, int, int); +extern struct squashfs_page_actor *squashfs_page_actor_init(void **buffer, + int pages, int length); +extern struct squashfs_page_actor *squashfs_page_actor_init_special( + struct squashfs_sb_info *msblk, + struct page **page, int pages, int length); static inline void *squashfs_first_page(struct squashfs_page_actor *actor) { return actor->squashfs_first_page(actor); @@ -75,5 +86,9 @@ static inline void squashfs_finish_page(struct squashfs_page_actor *actor) { actor->squashfs_finish_page(actor); } +static inline void squashfs_actor_nobuff(struct squashfs_page_actor *actor) +{ + actor->alloc_buffer = 0; +} #endif #endif diff --git a/fs/squashfs/xz_wrapper.c b/fs/squashfs/xz_wrapper.c index 68f6d09bb3a2bc..6c49481a2f8c43 100644 --- a/fs/squashfs/xz_wrapper.c +++ b/fs/squashfs/xz_wrapper.c @@ -131,6 +131,10 @@ static int squashfs_xz_uncompress(struct squashfs_sb_info *msblk, void *strm, stream->buf.out_pos = 0; stream->buf.out_size = PAGE_SIZE; stream->buf.out = squashfs_first_page(output); + if (IS_ERR(stream->buf.out)) { + error = PTR_ERR(stream->buf.out); + goto finish; + } for (;;) { enum xz_ret xz_err; @@ -156,7 +160,10 @@ static int squashfs_xz_uncompress(struct squashfs_sb_info *msblk, void *strm, if (stream->buf.out_pos == stream->buf.out_size) { stream->buf.out = squashfs_next_page(output); - if (stream->buf.out != NULL) { + if (IS_ERR(stream->buf.out)) { + error = PTR_ERR(stream->buf.out); + break; + } else if (stream->buf.out != NULL) { stream->buf.out_pos = 0; total += PAGE_SIZE; } @@ -171,6 +178,7 @@ static int squashfs_xz_uncompress(struct squashfs_sb_info *msblk, void *strm, } } +finish: squashfs_finish_page(output); return error ? error : total + stream->buf.out_pos; @@ -183,5 +191,6 @@ const struct squashfs_decompressor squashfs_xz_comp_ops = { .decompress = squashfs_xz_uncompress, .id = XZ_COMPRESSION, .name = "xz", + .alloc_buffer = 1, .supported = 1 }; diff --git a/fs/squashfs/zlib_wrapper.c b/fs/squashfs/zlib_wrapper.c index a20e9042146bd6..cbb7afe7bc4679 100644 --- a/fs/squashfs/zlib_wrapper.c +++ b/fs/squashfs/zlib_wrapper.c @@ -62,6 +62,11 @@ static int zlib_uncompress(struct squashfs_sb_info *msblk, void *strm, stream->next_out = squashfs_first_page(output); stream->avail_in = 0; + if (IS_ERR(stream->next_out)) { + error = PTR_ERR(stream->next_out); + goto finish; + } + for (;;) { int zlib_err; @@ -85,7 +90,10 @@ static int zlib_uncompress(struct squashfs_sb_info *msblk, void *strm, if (stream->avail_out == 0) { stream->next_out = squashfs_next_page(output); - if (stream->next_out != NULL) + if (IS_ERR(stream->next_out)) { + error = PTR_ERR(stream->next_out); + break; + } else if (stream->next_out != NULL) stream->avail_out = PAGE_SIZE; } @@ -107,6 +115,7 @@ static int zlib_uncompress(struct squashfs_sb_info *msblk, void *strm, } } +finish: squashfs_finish_page(output); if (!error) @@ -122,6 +131,7 @@ const struct squashfs_decompressor squashfs_zlib_comp_ops = { .decompress = zlib_uncompress, .id = ZLIB_COMPRESSION, .name = "zlib", + .alloc_buffer = 1, .supported = 1 }; diff --git a/fs/squashfs/zstd_wrapper.c b/fs/squashfs/zstd_wrapper.c index c40445dbf38c77..0e407c4d8b3bc3 100644 --- a/fs/squashfs/zstd_wrapper.c +++ b/fs/squashfs/zstd_wrapper.c @@ -80,6 +80,10 @@ static int zstd_uncompress(struct squashfs_sb_info *msblk, void *strm, out_buf.size = PAGE_SIZE; out_buf.dst = squashfs_first_page(output); + if (IS_ERR(out_buf.dst)) { + error = PTR_ERR(out_buf.dst); + goto finish; + } for (;;) { size_t zstd_err; @@ -104,7 +108,10 @@ static int zstd_uncompress(struct squashfs_sb_info *msblk, void *strm, if (out_buf.pos == out_buf.size) { out_buf.dst = squashfs_next_page(output); - if (out_buf.dst == NULL) { + if (IS_ERR(out_buf.dst)) { + error = PTR_ERR(out_buf.dst); + break; + } else if (out_buf.dst == NULL) { /* Shouldn't run out of pages * before stream is done. */ @@ -129,6 +136,8 @@ static int zstd_uncompress(struct squashfs_sb_info *msblk, void *strm, } } +finish: + squashfs_finish_page(output); return error ? error : total_out; @@ -140,5 +149,6 @@ const struct squashfs_decompressor squashfs_zstd_comp_ops = { .decompress = zstd_uncompress, .id = ZSTD_COMPRESSION, .name = "zstd", + .alloc_buffer = 1, .supported = 1 }; From 1bb1a07afad97303f14b8d1b319b03f1f01a0091 Mon Sep 17 00:00:00 2001 From: Phillip Lougher Date: Sat, 11 Jun 2022 04:21:33 +0100 Subject: [PATCH 14/72] squashfs: don't use intermediate buffer if pages missing Now that the "page actor" can handle missing pages, we don't have to fall back to using an intermediate buffer in Squashfs_readpage_block() if all the pages necessary can't be obtained. Link: https://lkml.kernel.org/r/20220611032133.5743-3-phillip@squashfs.org.uk Signed-off-by: Phillip Lougher Cc: Hsin-Yi Wang Cc: Matthew Wilcox (Oracle) Cc: Xiongwei Song Signed-off-by: Andrew Morton --- fs/squashfs/file_direct.c | 75 +++++++-------------------------------- 1 file changed, 12 insertions(+), 63 deletions(-) diff --git a/fs/squashfs/file_direct.c b/fs/squashfs/file_direct.c index 5af5802f5626cb..be4b12d31e0c36 100644 --- a/fs/squashfs/file_direct.c +++ b/fs/squashfs/file_direct.c @@ -18,9 +18,6 @@ #include "squashfs.h" #include "page_actor.h" -static int squashfs_read_cache(struct page *target_page, u64 block, int bsize, - int pages, struct page **page, int bytes); - /* Read separately compressed datablock directly into page cache */ int squashfs_readpage_block(struct page *target_page, u64 block, int bsize, int expected) @@ -33,7 +30,7 @@ int squashfs_readpage_block(struct page *target_page, u64 block, int bsize, int mask = (1 << (msblk->block_log - PAGE_SHIFT)) - 1; int start_index = target_page->index & ~mask; int end_index = start_index | mask; - int i, n, pages, missing_pages, bytes, res = -ENOMEM; + int i, n, pages, bytes, res = -ENOMEM; struct page **page; struct squashfs_page_actor *actor; void *pageaddr; @@ -48,44 +45,29 @@ int squashfs_readpage_block(struct page *target_page, u64 block, int bsize, return res; /* Try to grab all the pages covered by the Squashfs block */ - for (missing_pages = 0, i = 0, n = start_index; i < pages; i++, n++) { + for (i = 0, n = start_index; n <= end_index; n++) { page[i] = (n == target_page->index) ? target_page : grab_cache_page_nowait(target_page->mapping, n); - if (page[i] == NULL) { - missing_pages++; + if (page[i] == NULL) continue; - } if (PageUptodate(page[i])) { unlock_page(page[i]); put_page(page[i]); - page[i] = NULL; - missing_pages++; + continue; } - } - - if (missing_pages) { - /* - * Couldn't get one or more pages, this page has either - * been VM reclaimed, but others are still in the page cache - * and uptodate, or we're racing with another thread in - * squashfs_readpage also trying to grab them. Fall back to - * using an intermediate buffer. - */ - res = squashfs_read_cache(target_page, block, bsize, pages, - page, expected); - if (res < 0) - goto mark_errored; - goto out; + i++; } + pages = i; + /* * Create a "page actor" which will kmap and kunmap the * page cache pages appropriately within the decompressor */ - actor = squashfs_page_actor_init_special(msblk, page, pages, 0); + actor = squashfs_page_actor_init_special(msblk, page, pages, expected); if (actor == NULL) goto out; @@ -102,12 +84,12 @@ int squashfs_readpage_block(struct page *target_page, u64 block, int bsize, goto mark_errored; } - /* Last page may have trailing bytes not filled */ + /* Last page (if present) may have trailing bytes not filled */ bytes = res % PAGE_SIZE; - if (bytes) { - pageaddr = kmap_atomic(page[pages - 1]); + if (page[pages - 1]->index == end_index && bytes) { + pageaddr = kmap_local_page(page[pages - 1]); memset(pageaddr + bytes, 0, PAGE_SIZE - bytes); - kunmap_atomic(pageaddr); + kunmap_local(pageaddr); } /* Mark pages as uptodate, unlock and release */ @@ -140,36 +122,3 @@ int squashfs_readpage_block(struct page *target_page, u64 block, int bsize, kfree(page); return res; } - - -static int squashfs_read_cache(struct page *target_page, u64 block, int bsize, - int pages, struct page **page, int bytes) -{ - struct inode *i = target_page->mapping->host; - struct squashfs_cache_entry *buffer = squashfs_get_datablock(i->i_sb, - block, bsize); - int res = buffer->error, n, offset = 0; - - if (res) { - ERROR("Unable to read page, block %llx, size %x\n", block, - bsize); - goto out; - } - - for (n = 0; n < pages && bytes > 0; n++, - bytes -= PAGE_SIZE, offset += PAGE_SIZE) { - int avail = min_t(int, bytes, PAGE_SIZE); - - if (page[n] == NULL) - continue; - - squashfs_fill_page(page[n], buffer, offset, avail); - unlock_page(page[n]); - if (page[n] != target_page) - put_page(page[n]); - } - -out: - squashfs_cache_put(buffer); - return res; -} From 019a0c9e377c9f7bd477a0742706d93cdddaee4d Mon Sep 17 00:00:00 2001 From: Javier Martinez Canillas Date: Fri, 10 Jun 2022 09:57:18 +0200 Subject: [PATCH 15/72] fat: add a vfat_rename2() and make existing .rename callback a helper Patch series "fat: add support for the renameat2 RENAME_EXCHANGE flag", v6. The series adds support for the renameat2 system call RENAME_EXCHANGE flag (which allows to atomically replace two paths) to the vfat filesystem code. There are many use cases for this, but we are particularly interested in making possible for vfat filesystems to be part of OSTree [0] deployments. Currently OSTree relies on symbolic links to make the deployment updates an atomic transactional operation. But RENAME_EXCHANGE could be used [1] to achieve a similar level of robustness when using a vfat filesystem. Patch #1 is just a preparatory patch to introduce the RENAME_EXCHANGE support, patch #2 moves some code blocks in vfat_rename() to a set of helper functions, that can be reused by tvfat_rename_exchange() that's added by patch #3 and finally patch #4 adds some kselftests to test it. This patch (of 4): Currently vfat only supports the RENAME_NOREPLACE flag which is handled by the virtual file system layer but doesn't support the RENAME_EXCHANGE flag. Add a vfat_rename2() function to be used as the .rename callback and move the current vfat_rename() handler to a helper. This is in preparation for implementing the RENAME_NOREPLACE flag using a different helper function. Link: https://lkml.kernel.org/r/20220610075721.1182745-1-javierm@redhat.com Link: https://lkml.kernel.org/r/20220610075721.1182745-2-javierm@redhat.com Signed-off-by: Javier Martinez Canillas Acked-by: OGAWA Hirofumi Cc: Christian Kellner Cc: Peter Jones Cc: Chung-Chiang Cheng Cc: Lennart Poettering Cc: Alexander Larsson Cc: Colin Walters Cc: Muhammad Usama Anjum Signed-off-by: Andrew Morton --- fs/fat/namei_vfat.c | 21 ++++++++++++++------- 1 file changed, 14 insertions(+), 7 deletions(-) diff --git a/fs/fat/namei_vfat.c b/fs/fat/namei_vfat.c index c573314806cf82..88ccb2ee3537bc 100644 --- a/fs/fat/namei_vfat.c +++ b/fs/fat/namei_vfat.c @@ -889,9 +889,8 @@ static int vfat_mkdir(struct user_namespace *mnt_userns, struct inode *dir, return err; } -static int vfat_rename(struct user_namespace *mnt_userns, struct inode *old_dir, - struct dentry *old_dentry, struct inode *new_dir, - struct dentry *new_dentry, unsigned int flags) +static int vfat_rename(struct inode *old_dir, struct dentry *old_dentry, + struct inode *new_dir, struct dentry *new_dentry) { struct buffer_head *dotdot_bh; struct msdos_dir_entry *dotdot_de; @@ -902,9 +901,6 @@ static int vfat_rename(struct user_namespace *mnt_userns, struct inode *old_dir, int err, is_dir, update_dotdot, corrupt = 0; struct super_block *sb = old_dir->i_sb; - if (flags & ~RENAME_NOREPLACE) - return -EINVAL; - old_sinfo.bh = sinfo.bh = dotdot_bh = NULL; old_inode = d_inode(old_dentry); new_inode = d_inode(new_dentry); @@ -1021,13 +1017,24 @@ static int vfat_rename(struct user_namespace *mnt_userns, struct inode *old_dir, goto out; } +static int vfat_rename2(struct user_namespace *mnt_userns, struct inode *old_dir, + struct dentry *old_dentry, struct inode *new_dir, + struct dentry *new_dentry, unsigned int flags) +{ + if (flags & ~RENAME_NOREPLACE) + return -EINVAL; + + /* VFS already handled RENAME_NOREPLACE, handle it as a normal rename */ + return vfat_rename(old_dir, old_dentry, new_dir, new_dentry); +} + static const struct inode_operations vfat_dir_inode_operations = { .create = vfat_create, .lookup = vfat_lookup, .unlink = vfat_unlink, .mkdir = vfat_mkdir, .rmdir = vfat_rmdir, - .rename = vfat_rename, + .rename = vfat_rename2, .setattr = fat_setattr, .getattr = fat_getattr, .update_time = fat_update_time, From 204d03203a145b443cd8676dc12dbb47e1a3751f Mon Sep 17 00:00:00 2001 From: OGAWA Hirofumi Date: Fri, 10 Jun 2022 09:57:19 +0200 Subject: [PATCH 16/72] fat: factor out reusable code in vfat_rename() as helper functions The vfat_rename() function is quite big and there are code blocks that can be moved into helper functions. This not only simplify the implementation of that function but also allows these helpers to be reused. For example, the helpers can be used by the handler of the RENAME_EXCHANGE flag once this is implemented in a subsequent patch. Link: https://lkml.kernel.org/r/20220610075721.1182745-3-javierm@redhat.com Signed-off-by: OGAWA Hirofumi Signed-off-by: Javier Martinez Canillas Acked-by: OGAWA Hirofumi Cc: Alexander Larsson Cc: Christian Kellner Cc: Chung-Chiang Cheng Cc: Colin Walters Cc: Lennart Poettering Cc: Muhammad Usama Anjum Cc: Peter Jones Signed-off-by: Andrew Morton --- fs/fat/namei_vfat.c | 89 +++++++++++++++++++++++++++++---------------- 1 file changed, 57 insertions(+), 32 deletions(-) diff --git a/fs/fat/namei_vfat.c b/fs/fat/namei_vfat.c index 88ccb2ee3537bc..9c04053a8f1cc5 100644 --- a/fs/fat/namei_vfat.c +++ b/fs/fat/namei_vfat.c @@ -889,16 +889,55 @@ static int vfat_mkdir(struct user_namespace *mnt_userns, struct inode *dir, return err; } +static int vfat_get_dotdot_de(struct inode *inode, struct buffer_head **bh, + struct msdos_dir_entry **de) +{ + if (S_ISDIR(inode->i_mode)) { + if (fat_get_dotdot_entry(inode, bh, de)) + return -EIO; + } + return 0; +} + +static int vfat_sync_ipos(struct inode *dir, struct inode *inode) +{ + if (IS_DIRSYNC(dir)) + return fat_sync_inode(inode); + mark_inode_dirty(inode); + return 0; +} + +static int vfat_update_dotdot_de(struct inode *dir, struct inode *inode, + struct buffer_head *dotdot_bh, + struct msdos_dir_entry *dotdot_de) +{ + fat_set_start(dotdot_de, MSDOS_I(dir)->i_logstart); + mark_buffer_dirty_inode(dotdot_bh, inode); + if (IS_DIRSYNC(dir)) + return sync_dirty_buffer(dotdot_bh); + return 0; +} + +static void vfat_update_dir_metadata(struct inode *dir, struct timespec64 *ts) +{ + inode_inc_iversion(dir); + fat_truncate_time(dir, ts, S_CTIME | S_MTIME); + if (IS_DIRSYNC(dir)) + (void)fat_sync_inode(dir); + else + mark_inode_dirty(dir); +} + static int vfat_rename(struct inode *old_dir, struct dentry *old_dentry, struct inode *new_dir, struct dentry *new_dentry) { struct buffer_head *dotdot_bh; - struct msdos_dir_entry *dotdot_de; + struct msdos_dir_entry *dotdot_de = NULL; struct inode *old_inode, *new_inode; struct fat_slot_info old_sinfo, sinfo; struct timespec64 ts; loff_t new_i_pos; - int err, is_dir, update_dotdot, corrupt = 0; + int err, is_dir, corrupt = 0; struct super_block *sb = old_dir->i_sb; old_sinfo.bh = sinfo.bh = dotdot_bh = NULL; @@ -909,15 +948,13 @@ static int vfat_rename(struct inode *old_dir, struct dentry *old_dentry, if (err) goto out; - is_dir = S_ISDIR(old_inode->i_mode); - update_dotdot = (is_dir && old_dir != new_dir); - if (update_dotdot) { - if (fat_get_dotdot_entry(old_inode, &dotdot_bh, &dotdot_de)) { - err = -EIO; + if (old_dir != new_dir) { + err = vfat_get_dotdot_de(old_inode, &dotdot_bh, &dotdot_de); + if (err) goto out; - } } + is_dir = S_ISDIR(old_inode->i_mode); ts = current_time(old_dir); if (new_inode) { if (is_dir) { @@ -938,21 +975,15 @@ static int vfat_rename(struct inode *old_dir, struct dentry *old_dentry, fat_detach(old_inode); fat_attach(old_inode, new_i_pos); - if (IS_DIRSYNC(new_dir)) { - err = fat_sync_inode(old_inode); - if (err) - goto error_inode; - } else - mark_inode_dirty(old_inode); + err = vfat_sync_ipos(new_dir, old_inode); + if (err) + goto error_inode; - if (update_dotdot) { - fat_set_start(dotdot_de, MSDOS_I(new_dir)->i_logstart); - mark_buffer_dirty_inode(dotdot_bh, old_inode); - if (IS_DIRSYNC(new_dir)) { - err = sync_dirty_buffer(dotdot_bh); - if (err) - goto error_dotdot; - } + if (dotdot_de) { + err = vfat_update_dotdot_de(new_dir, old_inode, dotdot_bh, + dotdot_de); + if (err) + goto error_dotdot; drop_nlink(old_dir); if (!new_inode) inc_nlink(new_dir); @@ -962,12 +993,7 @@ static int vfat_rename(struct inode *old_dir, struct dentry *old_dentry, old_sinfo.bh = NULL; if (err) goto error_dotdot; - inode_inc_iversion(old_dir); - fat_truncate_time(old_dir, &ts, S_CTIME|S_MTIME); - if (IS_DIRSYNC(old_dir)) - (void)fat_sync_inode(old_dir); - else - mark_inode_dirty(old_dir); + vfat_update_dir_metadata(old_dir, &ts); if (new_inode) { drop_nlink(new_inode); @@ -987,10 +1013,9 @@ static int vfat_rename(struct inode *old_dir, struct dentry *old_dentry, /* data cluster is shared, serious corruption */ corrupt = 1; - if (update_dotdot) { - fat_set_start(dotdot_de, MSDOS_I(old_dir)->i_logstart); - mark_buffer_dirty_inode(dotdot_bh, old_inode); - corrupt |= sync_dirty_buffer(dotdot_bh); + if (dotdot_de) { + corrupt |= vfat_update_dotdot_de(old_dir, old_inode, dotdot_bh, + dotdot_de); } error_inode: fat_detach(old_inode); From da87e1725ae2136baeb9aac04c572c283afc917f Mon Sep 17 00:00:00 2001 From: Javier Martinez Canillas Date: Fri, 10 Jun 2022 09:57:20 +0200 Subject: [PATCH 17/72] fat: add renameat2 RENAME_EXCHANGE flag support The renameat2 RENAME_EXCHANGE flag allows to atomically exchange two paths but is currently not supported by the Linux vfat filesystem driver. Add a vfat_rename_exchange() helper function that implements this support. The super block lock is acquired during the operation to ensure atomicity, and in the error path actions made are reversed also with the mutex held. It makes the operation as transactional as possible, within the limitation impossed by vfat due not having a journal with logs to replay. Link: https://lkml.kernel.org/r/20220610075721.1182745-4-javierm@redhat.com Signed-off-by: Javier Martinez Canillas Acked-by: OGAWA Hirofumi Cc: Alexander Larsson Cc: Christian Kellner Cc: Chung-Chiang Cheng Cc: Colin Walters Cc: Lennart Poettering Cc: Muhammad Usama Anjum Cc: Peter Jones Signed-off-by: Andrew Morton --- fs/fat/namei_vfat.c | 123 +++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 122 insertions(+), 1 deletion(-) diff --git a/fs/fat/namei_vfat.c b/fs/fat/namei_vfat.c index 9c04053a8f1cc5..21620054e1c44e 100644 --- a/fs/fat/namei_vfat.c +++ b/fs/fat/namei_vfat.c @@ -1042,13 +1042,134 @@ static int vfat_rename(struct inode *old_dir, struct dentry *old_dentry, goto out; } +static void vfat_exchange_ipos(struct inode *old_inode, struct inode *new_inode, + loff_t old_i_pos, loff_t new_i_pos) +{ + fat_detach(old_inode); + fat_detach(new_inode); + fat_attach(old_inode, new_i_pos); + fat_attach(new_inode, old_i_pos); +} + +static void vfat_move_nlink(struct inode *src, struct inode *dst) +{ + drop_nlink(src); + inc_nlink(dst); +} + +static int vfat_rename_exchange(struct inode *old_dir, struct dentry *old_dentry, + struct inode *new_dir, struct dentry *new_dentry) +{ + struct buffer_head *old_dotdot_bh = NULL, *new_dotdot_bh = NULL; + struct msdos_dir_entry *old_dotdot_de = NULL, *new_dotdot_de = NULL; + struct inode *old_inode, *new_inode; + struct timespec64 ts = current_time(old_dir); + loff_t old_i_pos, new_i_pos; + int err, corrupt = 0; + struct super_block *sb = old_dir->i_sb; + + old_inode = d_inode(old_dentry); + new_inode = d_inode(new_dentry); + + /* Acquire super block lock for the operation to be atomic */ + mutex_lock(&MSDOS_SB(sb)->s_lock); + + /* if directories are not the same, get ".." info to update */ + if (old_dir != new_dir) { + err = vfat_get_dotdot_de(old_inode, &old_dotdot_bh, + &old_dotdot_de); + if (err) + goto out; + + err = vfat_get_dotdot_de(new_inode, &new_dotdot_bh, + &new_dotdot_de); + if (err) + goto out; + } + + old_i_pos = MSDOS_I(old_inode)->i_pos; + new_i_pos = MSDOS_I(new_inode)->i_pos; + + vfat_exchange_ipos(old_inode, new_inode, old_i_pos, new_i_pos); + + err = vfat_sync_ipos(old_dir, new_inode); + if (err) + goto error_exchange; + err = vfat_sync_ipos(new_dir, old_inode); + if (err) + goto error_exchange; + + /* update ".." directory entry info */ + if (old_dotdot_de) { + err = vfat_update_dotdot_de(new_dir, old_inode, old_dotdot_bh, + old_dotdot_de); + if (err) + goto error_old_dotdot; + } + if (new_dotdot_de) { + err = vfat_update_dotdot_de(old_dir, new_inode, new_dotdot_bh, + new_dotdot_de); + if (err) + goto error_new_dotdot; + } + + /* if cross directory and only one is a directory, adjust nlink */ + if (!old_dotdot_de != !new_dotdot_de) { + if (old_dotdot_de) + vfat_move_nlink(old_dir, new_dir); + else + vfat_move_nlink(new_dir, old_dir); + } + + vfat_update_dir_metadata(old_dir, &ts); + /* if directories are not the same, update new_dir as well */ + if (old_dir != new_dir) + vfat_update_dir_metadata(new_dir, &ts); + +out: + brelse(old_dotdot_bh); + brelse(new_dotdot_bh); + mutex_unlock(&MSDOS_SB(sb)->s_lock); + + return err; + +error_new_dotdot: + if (new_dotdot_de) { + corrupt |= vfat_update_dotdot_de(new_dir, new_inode, + new_dotdot_bh, new_dotdot_de); + } + +error_old_dotdot: + if (old_dotdot_de) { + corrupt |= vfat_update_dotdot_de(old_dir, old_inode, + old_dotdot_bh, old_dotdot_de); + } + +error_exchange: + vfat_exchange_ipos(old_inode, new_inode, new_i_pos, old_i_pos); + corrupt |= vfat_sync_ipos(new_dir, new_inode); + corrupt |= vfat_sync_ipos(old_dir, old_inode); + + if (corrupt < 0) { + fat_fs_error(new_dir->i_sb, + "%s: Filesystem corrupted (i_pos %lld, %lld)", + __func__, old_i_pos, new_i_pos); + } + goto out; +} + static int vfat_rename2(struct user_namespace *mnt_userns, struct inode *old_dir, struct dentry *old_dentry, struct inode *new_dir, struct dentry *new_dentry, unsigned int flags) { - if (flags & ~RENAME_NOREPLACE) + if (flags & ~(RENAME_NOREPLACE | RENAME_EXCHANGE)) return -EINVAL; + if (flags & RENAME_EXCHANGE) { + return vfat_rename_exchange(old_dir, old_dentry, + new_dir, new_dentry); + } + /* VFS already handled RENAME_NOREPLACE, handle it as a normal rename */ return vfat_rename(old_dir, old_dentry, new_dir, new_dentry); } From dd7c9be330d87732766a95cfd7a6de38bf7a39c3 Mon Sep 17 00:00:00 2001 From: Javier Martinez Canillas Date: Fri, 10 Jun 2022 09:57:21 +0200 Subject: [PATCH 18/72] selftests/filesystems: add a vfat RENAME_EXCHANGE test Add a test for the renameat2 RENAME_EXCHANGE support in vfat, but split it in a tool that just does the rename exchange and a script that is run by the kselftests framework on `make TARGETS="filesystems/fat" kselftest`. That way the script can be easily extended to test other file operations. The script creates a 1 MiB disk image, that is then formated with a vfat filesystem and mounted using a loop device. That way all file operations are done on an ephemeral filesystem. Link: https://lkml.kernel.org/r/20220610075721.1182745-5-javierm@redhat.com Signed-off-by: Javier Martinez Canillas Acked-by: Muhammad Usama Anjum Acked-by: OGAWA Hirofumi Cc: Alexander Larsson Cc: Christian Kellner Cc: Chung-Chiang Cheng Cc: Colin Walters Cc: Lennart Poettering Cc: Peter Jones Signed-off-by: Andrew Morton --- MAINTAINERS | 1 + tools/testing/selftests/Makefile | 1 + .../selftests/filesystems/fat/.gitignore | 2 + .../selftests/filesystems/fat/Makefile | 7 ++ .../testing/selftests/filesystems/fat/config | 2 + .../filesystems/fat/rename_exchange.c | 37 +++++++++ .../filesystems/fat/run_fat_tests.sh | 82 +++++++++++++++++++ 7 files changed, 132 insertions(+) create mode 100644 tools/testing/selftests/filesystems/fat/.gitignore create mode 100644 tools/testing/selftests/filesystems/fat/Makefile create mode 100644 tools/testing/selftests/filesystems/fat/config create mode 100644 tools/testing/selftests/filesystems/fat/rename_exchange.c create mode 100644 tools/testing/selftests/filesystems/fat/run_fat_tests.sh diff --git a/MAINTAINERS b/MAINTAINERS index 1fc9ead83d2aa3..addcc0cca3211a 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -20891,6 +20891,7 @@ M: OGAWA Hirofumi S: Maintained F: Documentation/filesystems/vfat.rst F: fs/fat/ +F: tools/testing/selftests/filesystems/fat/ VFIO DRIVER M: Alex Williamson diff --git a/tools/testing/selftests/Makefile b/tools/testing/selftests/Makefile index de11992dc57763..67668a9fa115ed 100644 --- a/tools/testing/selftests/Makefile +++ b/tools/testing/selftests/Makefile @@ -17,6 +17,7 @@ TARGETS += exec TARGETS += filesystems TARGETS += filesystems/binderfs TARGETS += filesystems/epoll +TARGETS += filesystems/fat TARGETS += firmware TARGETS += fpu TARGETS += ftrace diff --git a/tools/testing/selftests/filesystems/fat/.gitignore b/tools/testing/selftests/filesystems/fat/.gitignore new file mode 100644 index 00000000000000..b89920ed841cce --- /dev/null +++ b/tools/testing/selftests/filesystems/fat/.gitignore @@ -0,0 +1,2 @@ +# SPDX-License-Identifier: GPL-2.0-only +rename_exchange diff --git a/tools/testing/selftests/filesystems/fat/Makefile b/tools/testing/selftests/filesystems/fat/Makefile new file mode 100644 index 00000000000000..902033f6ef098b --- /dev/null +++ b/tools/testing/selftests/filesystems/fat/Makefile @@ -0,0 +1,7 @@ +# SPDX-License-Identifier: GPL-2.0 + +TEST_PROGS := run_fat_tests.sh +TEST_GEN_PROGS_EXTENDED := rename_exchange +CFLAGS += -O2 -g -Wall $(KHDR_INCLUDES) + +include ../../lib.mk diff --git a/tools/testing/selftests/filesystems/fat/config b/tools/testing/selftests/filesystems/fat/config new file mode 100644 index 00000000000000..6cf95e787a17b5 --- /dev/null +++ b/tools/testing/selftests/filesystems/fat/config @@ -0,0 +1,2 @@ +CONFIG_BLK_DEV_LOOP=y +CONFIG_VFAT_FS=y diff --git a/tools/testing/selftests/filesystems/fat/rename_exchange.c b/tools/testing/selftests/filesystems/fat/rename_exchange.c new file mode 100644 index 00000000000000..e488ad354fce4f --- /dev/null +++ b/tools/testing/selftests/filesystems/fat/rename_exchange.c @@ -0,0 +1,37 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * Program that atomically exchanges two paths using + * the renameat2() system call RENAME_EXCHANGE flag. + * + * Copyright 2022 Red Hat Inc. + * Author: Javier Martinez Canillas + */ + +#define _GNU_SOURCE +#include +#include +#include + +void print_usage(const char *program) +{ + printf("Usage: %s [oldpath] [newpath]\n", program); + printf("Atomically exchange oldpath and newpath\n"); +} + +int main(int argc, char *argv[]) +{ + int ret; + + if (argc != 3) { + print_usage(argv[0]); + exit(EXIT_FAILURE); + } + + ret = renameat2(AT_FDCWD, argv[1], AT_FDCWD, argv[2], RENAME_EXCHANGE); + if (ret) { + perror("rename exchange failed"); + exit(EXIT_FAILURE); + } + + exit(EXIT_SUCCESS); +} diff --git a/tools/testing/selftests/filesystems/fat/run_fat_tests.sh b/tools/testing/selftests/filesystems/fat/run_fat_tests.sh new file mode 100644 index 00000000000000..7f35dc3d15dfa6 --- /dev/null +++ b/tools/testing/selftests/filesystems/fat/run_fat_tests.sh @@ -0,0 +1,82 @@ +#!/bin/bash +# SPDX-License-Identifier: GPL-2.0 +# +# Run filesystem operations tests on an 1 MiB disk image that is formatted with +# a vfat filesystem and mounted in a temporary directory using a loop device. +# +# Copyright 2022 Red Hat Inc. +# Author: Javier Martinez Canillas + +set -e +set -u +set -o pipefail + +BASE_DIR="$(dirname $0)" +TMP_DIR="$(mktemp -d /tmp/fat_tests_tmp.XXXX)" +IMG_PATH="${TMP_DIR}/fat.img" +MNT_PATH="${TMP_DIR}/mnt" + +cleanup() +{ + mountpoint -q "${MNT_PATH}" && unmount_image + rm -rf "${TMP_DIR}" +} +trap cleanup SIGINT SIGTERM EXIT + +create_loopback() +{ + touch "${IMG_PATH}" + chattr +C "${IMG_PATH}" >/dev/null 2>&1 || true + + truncate -s 1M "${IMG_PATH}" + mkfs.vfat "${IMG_PATH}" >/dev/null 2>&1 +} + +mount_image() +{ + mkdir -p "${MNT_PATH}" + sudo mount -o loop "${IMG_PATH}" "${MNT_PATH}" +} + +rename_exchange_test() +{ + local rename_exchange="${BASE_DIR}/rename_exchange" + local old_path="${MNT_PATH}/old_file" + local new_path="${MNT_PATH}/new_file" + + echo old | sudo tee "${old_path}" >/dev/null 2>&1 + echo new | sudo tee "${new_path}" >/dev/null 2>&1 + sudo "${rename_exchange}" "${old_path}" "${new_path}" >/dev/null 2>&1 + sudo sync -f "${MNT_PATH}" + grep new "${old_path}" >/dev/null 2>&1 + grep old "${new_path}" >/dev/null 2>&1 +} + +rename_exchange_subdir_test() +{ + local rename_exchange="${BASE_DIR}/rename_exchange" + local dir_path="${MNT_PATH}/subdir" + local old_path="${MNT_PATH}/old_file" + local new_path="${dir_path}/new_file" + + sudo mkdir -p "${dir_path}" + echo old | sudo tee "${old_path}" >/dev/null 2>&1 + echo new | sudo tee "${new_path}" >/dev/null 2>&1 + sudo "${rename_exchange}" "${old_path}" "${new_path}" >/dev/null 2>&1 + sudo sync -f "${MNT_PATH}" + grep new "${old_path}" >/dev/null 2>&1 + grep old "${new_path}" >/dev/null 2>&1 +} + +unmount_image() +{ + sudo umount "${MNT_PATH}" &> /dev/null +} + +create_loopback +mount_image +rename_exchange_test +rename_exchange_subdir_test +unmount_image + +exit 0 From f858e23a29740757fe1ca602cb1f57845034b1c5 Mon Sep 17 00:00:00 2001 From: Antonio Borneo Date: Mon, 13 Jun 2022 12:00:55 +0200 Subject: [PATCH 19/72] checkpatch: fix incorrect camelcase detection on numeric constant The code fragment below int foo(int *array, int index) { return array[index & 0xFF]; } triggers an incorrect camelcase detection by checking a substring of the hex constant: CHECK: Avoid CamelCase: #3: FILE: test.c:3: + return array[index & 0xFF]; This is caused by passing the whole string "array[index & 0xFF]" to the inner loop that iterates over a "$Ident" match. The numeric constant is not a $Ident as it doesn't start with [A-Za-z_] and should be excluded from the match. Similar issue can be detected with other constants like "1uL", "0xffffU". Force the match to start at word boundary so the $Ident will be properly checked starting from its first char and the constants will be filtered-out. Link: https://lkml.kernel.org/r/20220613100055.77821-1-borneo.antonio@gmail.com Signed-off-by: Antonio Borneo Cc: Joe Perches Cc: Andy Whitcroft Cc: Dwaipayan Ray Cc: Lukas Bulwahn Signed-off-by: Andrew Morton --- scripts/checkpatch.pl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/checkpatch.pl b/scripts/checkpatch.pl index 205bf5055acffb..79e759aac543b8 100755 --- a/scripts/checkpatch.pl +++ b/scripts/checkpatch.pl @@ -5721,7 +5721,7 @@ sub process { $var !~ /^(?:[a-z0-9_]*|[A-Z0-9_]*)?_?[a-z][A-Z](?:_[a-z0-9_]+|_[A-Z0-9_]+)?$/ && #Ignore some three character SI units explicitly, like MiB and KHz $var !~ /^(?:[a-z_]*?)_?(?:[KMGT]iB|[KMGT]?Hz)(?:_[a-z_]+)?$/) { - while ($var =~ m{($Ident)}g) { + while ($var =~ m{\b($Ident)}g) { my $word = $1; next if ($word !~ /[A-Z][a-z]|[a-z][A-Z]/); if ($check) { From 00c9d5632277b21ba8802e26c27254cd9d0dfa13 Mon Sep 17 00:00:00 2001 From: wuchi Date: Sun, 12 Jun 2022 13:20:15 +0800 Subject: [PATCH 20/72] lib/error-inject: convert to DEFINE_SEQ_ATTRIBUTE Use DEFINE_SEQ_ATTRIBUTE helper macro to simplify the code. Link: https://lkml.kernel.org/r/20220612052015.23283-1-wuchi.zero@gmail.com Signed-off-by: wuchi Cc: Masami Hiramatsu (Google) Cc: Martin KaFai Lau Cc: Song Liu Cc: Yonghong Song Cc: John Fastabend Cc: KP Singh Signed-off-by: Andrew Morton --- lib/error-inject.c | 16 +++------------- 1 file changed, 3 insertions(+), 13 deletions(-) diff --git a/lib/error-inject.c b/lib/error-inject.c index 2ff5ef689d7271..4a4f1278c41916 100644 --- a/lib/error-inject.c +++ b/lib/error-inject.c @@ -197,24 +197,14 @@ static int ei_seq_show(struct seq_file *m, void *v) return 0; } -static const struct seq_operations ei_seq_ops = { +static const struct seq_operations ei_sops = { .start = ei_seq_start, .next = ei_seq_next, .stop = ei_seq_stop, .show = ei_seq_show, }; -static int ei_open(struct inode *inode, struct file *filp) -{ - return seq_open(filp, &ei_seq_ops); -} - -static const struct file_operations debugfs_ei_ops = { - .open = ei_open, - .read = seq_read, - .llseek = seq_lseek, - .release = seq_release, -}; +DEFINE_SEQ_ATTRIBUTE(ei); static int __init ei_debugfs_init(void) { @@ -224,7 +214,7 @@ static int __init ei_debugfs_init(void) if (!dir) return -ENOMEM; - file = debugfs_create_file("list", 0444, dir, NULL, &debugfs_ei_ops); + file = debugfs_create_file("list", 0444, dir, NULL, &ei_fops); if (!file) { debugfs_remove(dir); return -ENOMEM; From 5a704629f2c1ba33bbb444cb18e6957e97c76e8f Mon Sep 17 00:00:00 2001 From: Dan Moulding Date: Sun, 17 Jul 2022 17:31:37 -0700 Subject: [PATCH 21/72] init: add "hostname" kernel parameter The gethostname system call returns the hostname for the current machine. However, the kernel has no mechanism to initially set the current machine's name in such a way as to guarantee that the first userspace process to call gethostname will receive a meaningful result. It relies on some unspecified userspace process to first call sethostname before gethostname can produce a meaningful name. Traditionally the machine's hostname is set from userspace by the init system. The init system, in turn, often relies on a configuration file (say, /etc/hostname) to provide the value that it will supply in the call to sethostname. Consequently, the file system containing /etc/hostname usually must be available before the hostname will be set. There may, however, be earlier userspace processes that could call gethostname before the file system containing /etc/hostname is mounted. Such a process will get some other, likely meaningless, name from gethostname (such as "(none)", "localhost", or "darkstar"). A real-world example where this can happen, and lead to undesirable results, is with mdadm. When assembling arrays, mdadm distinguishes between "local" arrays and "foreign" arrays. A local array is one that properly belongs to the current machine, and a foreign array is one that is (possibly temporarily) attached to the current machine, but properly belongs to some other machine. To determine if an array is local or foreign, mdadm may compare the "homehost" recorded on the array with the current hostname. If mdadm is run before the root file system is mounted, perhaps because the root file system itself resides on an md-raid array, then /etc/hostname isn't yet available and the init system will not yet have called sethostname, causing mdadm to incorrectly conclude that all of the local arrays are foreign. Solving this problem *could* be delegated to the init system. It could be left up to the init system (including any init system that starts within an initramfs, if one is in use) to ensure that sethostname is called before any other userspace process could possibly call gethostname. However, it may not always be obvious which processes could call gethostname (for example, udev itself might not call gethostname, but it could via udev rules invoke processes that do). Additionally, the init system has to ensure that the hostname configuration value is stored in some place where it will be readily accessible during early boot. Unfortunately, every init system will attempt to (or has already attempted to) solve this problem in a different, possibly incorrect, way. This makes getting consistently working configurations harder for users. I believe it is better for the kernel to provide the means by which the hostname may be set early, rather than making this a problem for the init system to solve. The option to set the hostname during early startup, via a kernel parameter, provides a simple, reliable way to solve this problem. It also could make system configuration easier for some embedded systems. [dmoulding@me.com: v2] Link: https://lkml.kernel.org/r/20220506060310.7495-2-dmoulding@me.com Link: https://lkml.kernel.org/r/20220505180651.22849-2-dmoulding@me.com Signed-off-by: Dan Moulding Cc: Thomas Gleixner Cc: Jonathan Corbet Signed-off-by: Andrew Morton --- Documentation/admin-guide/kernel-parameters.txt | 13 +++++++++++++ init/version.c | 17 +++++++++++++++++ 2 files changed, 30 insertions(+) diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt index 2522b11e593f23..2c9c0229b77071 100644 --- a/Documentation/admin-guide/kernel-parameters.txt +++ b/Documentation/admin-guide/kernel-parameters.txt @@ -1667,6 +1667,19 @@ hlt [BUGS=ARM,SH] + hostname= [KNL] Set the hostname (aka UTS nodename). + Format: + This allows setting the system's hostname during early + startup. This sets the name returned by gethostname. + Using this parameter to set the hostname makes it + possible to ensure the hostname is correctly set before + any userspace processes run, avoiding the possibility + that a process may call gethostname before the hostname + has been explicitly set, resulting in the calling + process getting an incorrect result. The string must + not exceed the maximum allowed hostname length (usually + 64 characters) and will be truncated otherwise. + hpet= [X86-32,HPET] option to control HPET usage Format: { enable (default) | disable | force | verbose } diff --git a/init/version.c b/init/version.c index 1a356f5493e853..b7f9559d417c74 100644 --- a/init/version.c +++ b/init/version.c @@ -11,6 +11,8 @@ #include #include #include +#include +#include #include #include #include @@ -35,6 +37,21 @@ struct uts_namespace init_uts_ns = { }; EXPORT_SYMBOL_GPL(init_uts_ns); +static int __init early_hostname(char *arg) +{ + size_t bufsize = sizeof(init_uts_ns.name.nodename); + size_t maxlen = bufsize - 1; + size_t arglen; + + arglen = strlcpy(init_uts_ns.name.nodename, arg, bufsize); + if (arglen > maxlen) { + pr_warn("hostname parameter exceeds %zd characters and will be truncated", + maxlen); + } + return 0; +} +early_param("hostname", early_hostname); + /* FIXED STRINGS! Don't touch! */ const char linux_banner[] = "Linux version " UTS_RELEASE " (" LINUX_COMPILE_BY "@" From 5a66fce95b72e6359527415b33a7ae13f0d6b7eb Mon Sep 17 00:00:00 2001 From: wuchi Date: Sat, 18 Jun 2022 16:25:21 +0800 Subject: [PATCH 22/72] lib/lru_cache: fix error free handing in lc_create When kmem_cache_alloc in function lc_create returns null, we will free the memory already allocated. The loop of kmem_cache_free is wrong, especially: i = 0 ==> do wrong loop i > 0 ==> do not free element[0] Link: https://lkml.kernel.org/r/20220618082521.7082-1-wuchi.zero@gmail.com Signed-off-by: wuchi Cc: Philipp Reisner Cc: Lars Ellenberg Cc: Christoph Bhmwalder Signed-off-by: Andrew Morton --- lib/lru_cache.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/lib/lru_cache.c b/lib/lru_cache.c index 52313acbfa6284..dc35464216d3cc 100644 --- a/lib/lru_cache.c +++ b/lib/lru_cache.c @@ -147,8 +147,8 @@ struct lru_cache *lc_create(const char *name, struct kmem_cache *cache, return lc; /* else: could not allocate all elements, give up */ - for (i--; i; i--) { - void *p = element[i]; + while (i) { + void *p = element[--i]; kmem_cache_free(cache, p - e_off); } kfree(lc); From 62df90b53e6f332bb69b73621998826c49a17323 Mon Sep 17 00:00:00 2001 From: wuchi Date: Sun, 19 Jun 2022 15:46:41 +0800 Subject: [PATCH 23/72] net, lib/once: remove {net_}get_random_once_wait macro DO_ONCE(func, ...) will call func with spinlock which acquired by spin_lock_irqsave in __do_once_start. But the get_random_once_wait will sleep in get_random_bytes_wait -> wait_for_random_bytes. Fortunately, there is no place to use {net_}get_random_once_wait, so we could remove them simply. Link: https://lkml.kernel.org/r/20220619074641.40916-1-wuchi.zero@gmail.com Signed-off-by: wuchi Acked-by: Jakub Kicinski Cc: David S. Miller Cc: Eric Dumazet Cc: Paolo Abeni Signed-off-by: Andrew Morton --- include/linux/net.h | 2 -- include/linux/once.h | 2 -- 2 files changed, 4 deletions(-) diff --git a/include/linux/net.h b/include/linux/net.h index 12093f4db50c42..8613772a1f580e 100644 --- a/include/linux/net.h +++ b/include/linux/net.h @@ -303,8 +303,6 @@ do { \ #define net_get_random_once(buf, nbytes) \ get_random_once((buf), (nbytes)) -#define net_get_random_once_wait(buf, nbytes) \ - get_random_once_wait((buf), (nbytes)) /* * E.g. XFS meta- & log-data is in slab pages, or bcache meta diff --git a/include/linux/once.h b/include/linux/once.h index f54523052bbcb5..b14d8b309d52b1 100644 --- a/include/linux/once.h +++ b/include/linux/once.h @@ -54,7 +54,5 @@ void __do_once_done(bool *done, struct static_key_true *once_key, #define get_random_once(buf, nbytes) \ DO_ONCE(get_random_bytes, (buf), (nbytes)) -#define get_random_once_wait(buf, nbytes) \ - DO_ONCE(get_random_bytes_wait, (buf), (nbytes)) \ #endif /* _LINUX_ONCE_H */ From f9987921cb541b1187a648141a9048547ea89ffb Mon Sep 17 00:00:00 2001 From: Vlastimil Babka Date: Mon, 20 Jun 2022 17:02:49 +0200 Subject: [PATCH 24/72] lib/stackdepot: replace CONFIG_STACK_HASH_ORDER with automatic sizing As Linus explained [1], setting the stackdepot hash table size as a config option is suboptimal, especially as stackdepot becomes a dependency of less "expert" subsystems than initially (e.g. DRM, networking, SLUB_DEBUG): : (a) it introduces a new compile-time question that isn't sane to ask : a regular user, but is now exposed to regular users. : (b) this by default uses 1MB of memory for a feature that didn't in : the past, so now if you have small machines you need to make sure you : make a special kernel config for them. Ideally we would employ rhashtable for fully automatic resizing, which should be feasible for many of the new users, but problematic for the original users with restricted context that call __stack_depot_save() with can_alloc == false, i.e. KASAN. However we can easily remove the config option and scale the hash table automatically with system memory. The STACK_HASH_MASK constant becomes stack_hash_mask variable and is used only in one mask operation, so the overhead should be negligible to none. For early allocation we can employ the existing alloc_large_system_hash() function and perform similar scaling for the late allocation. The existing limits of the config option (between 4k and 1M buckets) are preserved, and scaling factor is set to one bucket per 16kB memory so on 64bit the max 1M buckets (8MB memory) is achieved with 16GB system, while a 1GB system will use 512kB. Because KASAN is reported to need the maximum number of buckets even with smaller amounts of memory [2], set it as such when kasan_enabled(). If needed, the automatic scaling could be complemented with a boot-time kernel parameter, but it feels pointless to add it without a specific use case. [1] https://lore.kernel.org/all/CAHk-=wjC5nS+fnf6EzRD9yQRJApAhxx7gRB87ZV+pAWo9oVrTg@mail.gmail.com/ [2] https://lore.kernel.org/all/CACT4Y+Y4GZfXOru2z5tFPzFdaSUd+GFc6KVL=bsa0+1m197cQQ@mail.gmail.com/ Link: https://lkml.kernel.org/r/20220620150249.16814-1-vbabka@suse.cz Signed-off-by: Vlastimil Babka Reported-by: Linus Torvalds Acked-by: Dmitry Vyukov Cc: Marco Elver Cc: Alexander Potapenko Cc: Andrey Konovalov Signed-off-by: Andrew Morton --- lib/Kconfig | 9 -------- lib/stackdepot.c | 59 ++++++++++++++++++++++++++++++++++++++++-------- 2 files changed, 49 insertions(+), 19 deletions(-) diff --git a/lib/Kconfig b/lib/Kconfig index eaaad4d85bf24b..986ea474836c3a 100644 --- a/lib/Kconfig +++ b/lib/Kconfig @@ -685,15 +685,6 @@ config STACKDEPOT_ALWAYS_INIT bool select STACKDEPOT -config STACK_HASH_ORDER - int "stack depot hash size (12 => 4KB, 20 => 1024KB)" - range 12 20 - default 20 - depends on STACKDEPOT - help - Select the hash size as a power of 2 for the stackdepot hash table. - Choose a lower value to reduce the memory impact. - config REF_TRACKER bool depends on STACKTRACE_SUPPORT diff --git a/lib/stackdepot.c b/lib/stackdepot.c index 5ca0d086ef4a3a..e73fda23388d8c 100644 --- a/lib/stackdepot.c +++ b/lib/stackdepot.c @@ -32,6 +32,7 @@ #include #include #include +#include #define DEPOT_STACK_BITS (sizeof(depot_stack_handle_t) * 8) @@ -145,10 +146,16 @@ depot_alloc_stack(unsigned long *entries, int size, u32 hash, void **prealloc) return stack; } -#define STACK_HASH_SIZE (1L << CONFIG_STACK_HASH_ORDER) -#define STACK_HASH_MASK (STACK_HASH_SIZE - 1) +/* one hash table bucket entry per 16kB of memory */ +#define STACK_HASH_SCALE 14 +/* limited between 4k and 1M buckets */ +#define STACK_HASH_ORDER_MIN 12 +#define STACK_HASH_ORDER_MAX 20 #define STACK_HASH_SEED 0x9747b28c +static unsigned int stack_hash_order; +static unsigned int stack_hash_mask; + static bool stack_depot_disable; static struct stack_record **stack_table; @@ -175,7 +182,7 @@ void __init stack_depot_want_early_init(void) int __init stack_depot_early_init(void) { - size_t size; + unsigned long entries = 0; /* This is supposed to be called only once, from mm_init() */ if (WARN_ON(__stack_depot_early_init_passed)) @@ -183,13 +190,23 @@ int __init stack_depot_early_init(void) __stack_depot_early_init_passed = true; + if (kasan_enabled() && !stack_hash_order) + stack_hash_order = STACK_HASH_ORDER_MAX; + if (!__stack_depot_want_early_init || stack_depot_disable) return 0; - size = (STACK_HASH_SIZE * sizeof(struct stack_record *)); - pr_info("Stack Depot early init allocating hash table with memblock_alloc, %zu bytes\n", - size); - stack_table = memblock_alloc(size, SMP_CACHE_BYTES); + if (stack_hash_order) + entries = 1UL << stack_hash_order; + stack_table = alloc_large_system_hash("stackdepot", + sizeof(struct stack_record *), + entries, + STACK_HASH_SCALE, + HASH_EARLY | HASH_ZERO, + NULL, + &stack_hash_mask, + 1UL << STACK_HASH_ORDER_MIN, + 1UL << STACK_HASH_ORDER_MAX); if (!stack_table) { pr_err("Stack Depot hash table allocation failed, disabling\n"); @@ -207,13 +224,35 @@ int stack_depot_init(void) mutex_lock(&stack_depot_init_mutex); if (!stack_depot_disable && !stack_table) { - pr_info("Stack Depot allocating hash table with kvcalloc\n"); - stack_table = kvcalloc(STACK_HASH_SIZE, sizeof(struct stack_record *), GFP_KERNEL); + unsigned long entries; + int scale = STACK_HASH_SCALE; + + if (stack_hash_order) { + entries = 1UL << stack_hash_order; + } else { + entries = nr_free_buffer_pages(); + entries = roundup_pow_of_two(entries); + + if (scale > PAGE_SHIFT) + entries >>= (scale - PAGE_SHIFT); + else + entries <<= (PAGE_SHIFT - scale); + } + + if (entries < 1UL << STACK_HASH_ORDER_MIN) + entries = 1UL << STACK_HASH_ORDER_MIN; + if (entries > 1UL << STACK_HASH_ORDER_MAX) + entries = 1UL << STACK_HASH_ORDER_MAX; + + pr_info("Stack Depot allocating hash table of %lu entries with kvcalloc\n", + entries); + stack_table = kvcalloc(entries, sizeof(struct stack_record *), GFP_KERNEL); if (!stack_table) { pr_err("Stack Depot hash table allocation failed, disabling\n"); stack_depot_disable = true; ret = -ENOMEM; } + stack_hash_mask = entries - 1; } mutex_unlock(&stack_depot_init_mutex); return ret; @@ -386,7 +425,7 @@ depot_stack_handle_t __stack_depot_save(unsigned long *entries, goto fast_exit; hash = hash_stack(entries, nr_entries); - bucket = &stack_table[hash & STACK_HASH_MASK]; + bucket = &stack_table[hash & stack_hash_mask]; /* * Fast path: look the stack trace up without locking. From 86e5908ec293bf6505a59d02542da006226bcaa7 Mon Sep 17 00:00:00 2001 From: wuchi Date: Mon, 20 Jun 2022 18:02:44 +0800 Subject: [PATCH 25/72] lib/error-inject: traverse list with mutex Traversing list without mutex in get_injectable_error_type will race with the following code: list_del_init(&ent->list) kfree(ent) in module_unload_ei_list. So fix that. Link: https://lkml.kernel.org/r/20220620100244.82896-1-wuchi.zero@gmail.com Signed-off-by: wuchi Cc: Masami Hiramatsu (Google) Cc: Martin KaFai Lau Cc: Song Liu Cc: Yonghong Song Cc: John Fastabend Cc: KP Singh Signed-off-by: Andrew Morton --- lib/error-inject.c | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) diff --git a/lib/error-inject.c b/lib/error-inject.c index 4a4f1278c41916..1afca1b1cdead0 100644 --- a/lib/error-inject.c +++ b/lib/error-inject.c @@ -40,12 +40,18 @@ bool within_error_injection_list(unsigned long addr) int get_injectable_error_type(unsigned long addr) { struct ei_entry *ent; + int ei_type = EI_ETYPE_NONE; + mutex_lock(&ei_mutex); list_for_each_entry(ent, &error_injection_list, list) { - if (addr >= ent->start_addr && addr < ent->end_addr) - return ent->etype; + if (addr >= ent->start_addr && addr < ent->end_addr) { + ei_type = ent->etype; + break; + } } - return EI_ETYPE_NONE; + mutex_unlock(&ei_mutex); + + return ei_type; } /* From 43c249ea0b1e10baac4a1264a25d69723ce5d2c2 Mon Sep 17 00:00:00 2001 From: Uros Bizjak Date: Fri, 24 Jun 2022 16:14:12 +0200 Subject: [PATCH 26/72] compiler-gcc.h: remove ancient workaround for gcc PR 58670 The workaround for 'asm goto' miscompilation introduces a compiler barrier quirk that inhibits many useful compiler optimizations. For example, __try_cmpxchg_user compiles to: 11375: 41 8b 4d 00 mov 0x0(%r13),%ecx 11379: 41 8b 02 mov (%r10),%eax 1137c: f0 0f b1 0a lock cmpxchg %ecx,(%rdx) 11380: 0f 94 c2 sete %dl 11383: 84 d2 test %dl,%dl 11385: 75 c4 jne 1134b <...> 11387: 41 89 02 mov %eax,(%r10) where the barrier inhibits flags propagation from asm when compiled with gcc-12. When the mentioned quirk is removed, the following code is generated: 11553: 41 8b 4d 00 mov 0x0(%r13),%ecx 11557: 41 8b 02 mov (%r10),%eax 1155a: f0 0f b1 0a lock cmpxchg %ecx,(%rdx) 1155e: 74 c9 je 11529 <...> 11560: 41 89 02 mov %eax,(%r10) The refered compiler bug: http://gcc.gnu.org/bugzilla/show_bug.cgi?id=58670 was fixed for gcc-4.8.2. Current minimum required version of GCC is version 5.1 which has the above 'asm goto' miscompilation fixed, so remove the workaround. Link: https://lkml.kernel.org/r/20220624141412.72274-1-ubizjak@gmail.com Signed-off-by: Uros Bizjak Cc: Ingo Molnar Cc: "H. Peter Anvin" Cc: Thomas Gleixner Signed-off-by: Andrew Morton --- include/linux/compiler-gcc.h | 11 ----------- 1 file changed, 11 deletions(-) diff --git a/include/linux/compiler-gcc.h b/include/linux/compiler-gcc.h index a0c55eeaeaf163..9b157b71036f18 100644 --- a/include/linux/compiler-gcc.h +++ b/include/linux/compiler-gcc.h @@ -66,17 +66,6 @@ __builtin_unreachable(); \ } while (0) -/* - * GCC 'asm goto' miscompiles certain code sequences: - * - * http://gcc.gnu.org/bugzilla/show_bug.cgi?id=58670 - * - * Work it around via a compiler barrier quirk suggested by Jakub Jelinek. - * - * (asm goto is automatically volatile - the naming reflects this.) - */ -#define asm_volatile_goto(x...) do { asm goto(x); asm (""); } while (0) - #if defined(CONFIG_ARCH_USE_BUILTIN_BSWAP) #define __HAVE_BUILTIN_BSWAP32__ #define __HAVE_BUILTIN_BSWAP64__ From 045ed31e23aea840648c290dbde04797064960db Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Fri, 24 Jun 2022 08:30:04 +0300 Subject: [PATCH 27/72] kfifo: fix kfifo_to_user() return type The kfifo_to_user() macro is supposed to return zero for success or negative error codes. Unfortunately, there is a signedness bug so it returns unsigned int. This only affects callers which try to save the result in ssize_t and as far as I can see the only place which does that is line6_hwdep_read(). TL;DR: s/_uint/_int/. Link: https://lkml.kernel.org/r/YrVL3OJVLlNhIMFs@kili Fixes: 144ecf310eb5 ("kfifo: fix kfifo_alloc() to return a signed int value") Signed-off-by: Dan Carpenter Cc: Stefani Seibold Cc: Randy Dunlap Signed-off-by: Andrew Morton --- include/linux/kfifo.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/linux/kfifo.h b/include/linux/kfifo.h index 86249476b57f43..0b35a41440ff13 100644 --- a/include/linux/kfifo.h +++ b/include/linux/kfifo.h @@ -688,7 +688,7 @@ __kfifo_uint_must_check_helper( \ * writer, you don't need extra locking to use these macro. */ #define kfifo_to_user(fifo, to, len, copied) \ -__kfifo_uint_must_check_helper( \ +__kfifo_int_must_check_helper( \ ({ \ typeof((fifo) + 1) __tmp = (fifo); \ void __user *__to = (to); \ From cda83bb8a61e6d7ce231dc1c2b78a9b79b1f1411 Mon Sep 17 00:00:00 2001 From: wuchi Date: Sat, 25 Jun 2022 21:53:24 +0800 Subject: [PATCH 28/72] lib/radix-tree: remove unused argument of insert_entries insert_entries() doesn't use the 'bool replace' argument, and the function is only used locally, remove the argument. The historical context of the unused argument is as follow: 2: commit <3a08cd52c37c79> (radix tree: Remove multiorder support) Remove the code related to macro CONFIG_RADIX_TREE_MULTIORDER to convert to the xArray. Without the macro, there is no need to retain the argument. 1: commit <175542f575723e> (radix-tree: add radix_tree_join) Add insert_entries(..., bool replace) function, depending on the macro CONFIG_RADIX_TREE_MULTIORDER definition, the implementation is different. Notice that the implementation without the macro doesn't use the argument. [Matthew Wilcox: add historical context for argument] Link: https://lkml.kernel.org/r/20220625135324.72574-1-wuchi.zero@gmail.com Signed-off-by: wuchi Cc: Matthew Wilcox Signed-off-by: Andrew Morton --- lib/radix-tree.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/lib/radix-tree.c b/lib/radix-tree.c index b3afafe46fffbc..3c78e1e8b2ad60 100644 --- a/lib/radix-tree.c +++ b/lib/radix-tree.c @@ -677,7 +677,7 @@ static void radix_tree_free_nodes(struct radix_tree_node *node) } static inline int insert_entries(struct radix_tree_node *node, - void __rcu **slot, void *item, bool replace) + void __rcu **slot, void *item) { if (*slot) return -EEXIST; @@ -711,7 +711,7 @@ int radix_tree_insert(struct radix_tree_root *root, unsigned long index, if (error) return error; - error = insert_entries(node, slot, item, false); + error = insert_entries(node, slot, item); if (error < 0) return error; From 2d8867f3e0833963dd4af64bed2fb47ed5cf55d8 Mon Sep 17 00:00:00 2001 From: Tiezhu Yang Date: Mon, 27 Jun 2022 11:02:45 +0800 Subject: [PATCH 29/72] lib: make LZ4_decompress_safe_forceExtDict() static LZ4_decompress_safe_forceExtDict() is only used in lib/lz4/lz4_decompress.c, make it static to fix the build warning about "no previous prototype" [1]. [1] https://lore.kernel.org/lkml/202206260948.akgsho1q-lkp@intel.com/ Link: https://lkml.kernel.org/r/1656298965-8698-1-git-send-email-yangtiezhu@loongson.cn Signed-off-by: Tiezhu Yang Reported-by: kernel test robot Signed-off-by: Andrew Morton --- lib/lz4/lz4_decompress.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/lib/lz4/lz4_decompress.c b/lib/lz4/lz4_decompress.c index fd1728d94babb2..59fe69a638000e 100644 --- a/lib/lz4/lz4_decompress.c +++ b/lib/lz4/lz4_decompress.c @@ -507,9 +507,9 @@ static int LZ4_decompress_safe_withSmallPrefix(const char *source, char *dest, (BYTE *)dest - prefixSize, NULL, 0); } -int LZ4_decompress_safe_forceExtDict(const char *source, char *dest, - int compressedSize, int maxOutputSize, - const void *dictStart, size_t dictSize) +static int LZ4_decompress_safe_forceExtDict(const char *source, char *dest, + int compressedSize, int maxOutputSize, + const void *dictStart, size_t dictSize) { return LZ4_decompress_generic(source, dest, compressedSize, maxOutputSize, From 6d529ea80b8a03401195506f45c052c4937545d5 Mon Sep 17 00:00:00 2001 From: wuchi Date: Wed, 29 Jun 2022 11:02:41 +0800 Subject: [PATCH 30/72] lib/scatterlist: use matched parameter type when calling __sg_free_table() commit 4635873c561a ("scsi: lib/sg_pool.c: improve APIs for allocating sg pool") changeed @(bool)skip_first_chunk of __sg_free_table() to @(unsigned int)nents_first_chunk, so use unsigend int type instead of bool type (false -> 0) when calling the function in sg_free_append_table() and sg_free_table(). Link: https://lkml.kernel.org/r/20220629030241.84559-1-wuchi.zero@gmail.com Signed-off-by: wuchi Reviewed-by: Ming Lei Cc: Maor Gottlieb Cc: Christoph Hellwig Signed-off-by: Andrew Morton --- lib/scatterlist.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/lib/scatterlist.c b/lib/scatterlist.c index d5e82e4a57ad06..c8c3d675845c37 100644 --- a/lib/scatterlist.c +++ b/lib/scatterlist.c @@ -240,7 +240,7 @@ EXPORT_SYMBOL(__sg_free_table); **/ void sg_free_append_table(struct sg_append_table *table) { - __sg_free_table(&table->sgt, SG_MAX_SINGLE_ALLOC, false, sg_kfree, + __sg_free_table(&table->sgt, SG_MAX_SINGLE_ALLOC, 0, sg_kfree, table->total_nents); } EXPORT_SYMBOL(sg_free_append_table); @@ -253,7 +253,7 @@ EXPORT_SYMBOL(sg_free_append_table); **/ void sg_free_table(struct sg_table *table) { - __sg_free_table(table, SG_MAX_SINGLE_ALLOC, false, sg_kfree, + __sg_free_table(table, SG_MAX_SINGLE_ALLOC, 0, sg_kfree, table->orig_nents); } EXPORT_SYMBOL(sg_free_table); From 4a70ce5f93aaeb0aa81f29c4a3c70f39d8f21087 Mon Sep 17 00:00:00 2001 From: Colin Ian King Date: Mon, 4 Jul 2022 22:53:25 +0100 Subject: [PATCH 31/72] lib/ts_bm.c: remove redundant store to variable consumed after addition There is no need to store the result of the addition back to variable consumed after the addition. The store is redundant, replace += with just + Cleans up clang scan build warning: lib/ts_bm.c:83:11: warning: Although the value stored to 'consumed' is used in the enclosing expression, the value is never actually read from 'consumed' [deadcode.DeadStores] Link: https://lkml.kernel.org/r/20220704215325.600993-1-colin.i.king@gmail.com Signed-off-by: Colin Ian King Signed-off-by: Andrew Morton --- lib/ts_bm.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lib/ts_bm.c b/lib/ts_bm.c index 4cf250031f0f0a..1f2234221dd114 100644 --- a/lib/ts_bm.c +++ b/lib/ts_bm.c @@ -80,7 +80,7 @@ static unsigned int bm_find(struct ts_config *conf, struct ts_state *state) /* London calling... */ DEBUGP("found!\n"); - return consumed += (shift-(bm->patlen-1)); + return consumed + (shift-(bm->patlen-1)); next: bs = bm->bad_shift[text[shift-i]]; From 71f8c15565d0f3d2f5b3339845e05cf4f03725cd Mon Sep 17 00:00:00 2001 From: Stephen Brennan Date: Mon, 16 May 2022 17:05:07 -0700 Subject: [PATCH 32/72] kallsyms: move declarations to internal header Patch series "Expose kallsyms data in vmcoreinfo note". The kernel can be configured to contain a lot of introspection or debugging information built-in, such as ORC for unwinding stack traces, BTF for type information, and of course kallsyms. Debuggers could use this information to navigate a core dump or live system, but they need to be able to find it. This patch series adds the necessary symbols into vmcoreinfo, which would allow a debugger to find and interpret the kallsyms table. Using the kallsyms data, the debugger can then lookup any symbol, allowing it to find ORC, BTF, or any other useful data. This would allow a live kernel, or core dump, to be debugged without any DWARF debuginfo. This is useful for many cases: the debuginfo may not have been generated, or you may not want to deploy the large files everywhere you need them. I've demonstrated a proof of concept for this at LSF/MM+BPF during a lighting talk. Using a work-in-progress branch of the drgn debugger, and an extended set of BTF generated by a patched version of dwarves, I've been able to open a core dump without any DWARF info and do basic tasks such as enumerating slab caches, block devices, tasks, and doing backtraces. I hope this series can be a first step toward a new possibility of "DWARFless debugging". Related discussion around the BTF side of this: https://lore.kernel.org/bpf/586a6288-704a-f7a7-b256-e18a675927df@oracle.com/T/#u Some work-in-progress branches using this feature: https://github.com/brenns10/dwarves/tree/remove_percpu_restriction_1 https://github.com/brenns10/drgn/tree/kallsyms_plus_btf This patch (of 2): To include kallsyms data in the vmcoreinfo note, we must make the symbol declarations visible outside of kallsyms.c. Move these to a new internal header file. Link: https://lkml.kernel.org/r/20220517000508.777145-1-stephen.s.brennan@oracle.com Link: https://lkml.kernel.org/r/20220517000508.777145-2-stephen.s.brennan@oracle.com Signed-off-by: Stephen Brennan Acked-by: Baoquan He Cc: Nick Desaulniers Cc: Dave Young Cc: Kees Cook Cc: Jiri Olsa Cc: Stephen Boyd Cc: Bixuan Cui Cc: David Vernet Cc: Vivek Goyal Cc: Sami Tolvanen Cc: "Eric W. Biederman" Signed-off-by: Andrew Morton --- kernel/kallsyms.c | 23 +---------------------- kernel/kallsyms_internal.h | 30 ++++++++++++++++++++++++++++++ 2 files changed, 31 insertions(+), 22 deletions(-) create mode 100644 kernel/kallsyms_internal.h diff --git a/kernel/kallsyms.c b/kernel/kallsyms.c index fbdf8d3279aca3..510fba0ba5b4a5 100644 --- a/kernel/kallsyms.c +++ b/kernel/kallsyms.c @@ -31,28 +31,7 @@ #include #include -/* - * These will be re-linked against their real values - * during the second link stage. - */ -extern const unsigned long kallsyms_addresses[] __weak; -extern const int kallsyms_offsets[] __weak; -extern const u8 kallsyms_names[] __weak; - -/* - * Tell the compiler that the count isn't in the small data section if the arch - * has one (eg: FRV). - */ -extern const unsigned int kallsyms_num_syms -__section(".rodata") __attribute__((weak)); - -extern const unsigned long kallsyms_relative_base -__section(".rodata") __attribute__((weak)); - -extern const char kallsyms_token_table[] __weak; -extern const u16 kallsyms_token_index[] __weak; - -extern const unsigned int kallsyms_markers[] __weak; +#include "kallsyms_internal.h" /* * Expand a compressed symbol data into the resulting uncompressed string, diff --git a/kernel/kallsyms_internal.h b/kernel/kallsyms_internal.h new file mode 100644 index 00000000000000..2d0c6f2f0243a2 --- /dev/null +++ b/kernel/kallsyms_internal.h @@ -0,0 +1,30 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +#ifndef LINUX_KALLSYMS_INTERNAL_H_ +#define LINUX_KALLSYMS_INTERNAL_H_ + +#include + +/* + * These will be re-linked against their real values + * during the second link stage. + */ +extern const unsigned long kallsyms_addresses[] __weak; +extern const int kallsyms_offsets[] __weak; +extern const u8 kallsyms_names[] __weak; + +/* + * Tell the compiler that the count isn't in the small data section if the arch + * has one (eg: FRV). + */ +extern const unsigned int kallsyms_num_syms +__section(".rodata") __attribute__((weak)); + +extern const unsigned long kallsyms_relative_base +__section(".rodata") __attribute__((weak)); + +extern const char kallsyms_token_table[] __weak; +extern const u16 kallsyms_token_index[] __weak; + +extern const unsigned int kallsyms_markers[] __weak; + +#endif // LINUX_KALLSYMS_INTERNAL_H_ From 5fd8fea935a1091083506d0b982fcc5d35062f06 Mon Sep 17 00:00:00 2001 From: Stephen Brennan Date: Mon, 16 May 2022 17:05:08 -0700 Subject: [PATCH 33/72] vmcoreinfo: include kallsyms symbols The internal kallsyms tables contain information which could be quite useful to a debugging tool in the absence of other debuginfo. If kallsyms is enabled, then a debugging tool could parse it and use it as a fallback symbol table. Combined with BTF data, live & post-mortem debuggers can support basic operations without needing a large DWARF debuginfo file available. As many as five symbols are necessary to properly parse kallsyms names and addresses. Add these to the vmcoreinfo note. CONFIG_KALLSYMS_ABSOLUTE_PERCPU does impact the computation of symbol addresses. However, a debugger can infer this configuration value by comparing the address of _stext in the vmcoreinfo with the address computed via kallsyms. So there's no need to include information about this config value in the vmcoreinfo note. To verify that we're still well below the maximum of 4096 bytes, I created a script[1] to compute a rough upper bound on the possible size of vmcoreinfo. On v5.18-rc7, the script reports 3106 bytes, and with this patch, the maximum become 3370 bytes. [1]: https://github.com/brenns10/kernel_stuff/blob/master/vmcoreinfosize/ Link: https://lkml.kernel.org/r/20220517000508.777145-3-stephen.s.brennan@oracle.com Signed-off-by: Stephen Brennan Acked-by: Baoquan He Cc: Bixuan Cui Cc: Dave Young Cc: David Vernet Cc: "Eric W. Biederman" Cc: Jiri Olsa Cc: Kees Cook Cc: Nick Desaulniers Cc: Sami Tolvanen Cc: Stephen Boyd Cc: Vivek Goyal Signed-off-by: Andrew Morton --- kernel/crash_core.c | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/kernel/crash_core.c b/kernel/crash_core.c index 71122e01623cc2..f64d35e2841198 100644 --- a/kernel/crash_core.c +++ b/kernel/crash_core.c @@ -15,6 +15,8 @@ #include +#include "kallsyms_internal.h" + /* vmcoreinfo stuff */ unsigned char *vmcoreinfo_data; size_t vmcoreinfo_size; @@ -480,6 +482,18 @@ static int __init crash_save_vmcoreinfo_init(void) VMCOREINFO_NUMBER(PAGE_OFFLINE_MAPCOUNT_VALUE); #endif +#ifdef CONFIG_KALLSYMS + VMCOREINFO_SYMBOL(kallsyms_names); + VMCOREINFO_SYMBOL(kallsyms_token_table); + VMCOREINFO_SYMBOL(kallsyms_token_index); +#ifdef CONFIG_KALLSYMS_BASE_RELATIVE + VMCOREINFO_SYMBOL(kallsyms_offsets); + VMCOREINFO_SYMBOL(kallsyms_relative_base); +#else + VMCOREINFO_SYMBOL(kallsyms_addresses); +#endif /* CONFIG_KALLSYMS_BASE_RELATIVE */ +#endif /* CONFIG_KALLSYMS */ + arch_crash_save_vmcoreinfo(); update_vmcoreinfo_note(); From 376b0c266143a1dda162db6d5bc9b3a7f0ae97c9 Mon Sep 17 00:00:00 2001 From: Alexey Dobriyan Date: Wed, 15 Jun 2022 14:22:06 +0300 Subject: [PATCH 34/72] proc: delete unused includes Those aren't necessary after seq files won. Link: https://lkml.kernel.org/r/YqnA3mS7KBt8Z4If@localhost.localdomain Signed-off-by: Alexey Dobriyan Signed-off-by: Andrew Morton --- fs/proc/array.c | 1 - fs/proc/inode.c | 2 -- fs/proc/kmsg.c | 1 - fs/proc/nommu.c | 1 - fs/proc/proc_net.c | 3 --- fs/proc/proc_tty.c | 2 -- fs/proc/root.c | 3 --- fs/proc/vmcore.c | 1 - 8 files changed, 14 deletions(-) diff --git a/fs/proc/array.c b/fs/proc/array.c index eb815759842ce4..65fa603422e04d 100644 --- a/fs/proc/array.c +++ b/fs/proc/array.c @@ -69,7 +69,6 @@ #include #include #include -#include #include #include #include diff --git a/fs/proc/inode.c b/fs/proc/inode.c index 73aeb4e6d32e51..fd40d60169b5a2 100644 --- a/fs/proc/inode.c +++ b/fs/proc/inode.c @@ -26,8 +26,6 @@ #include #include -#include - #include "internal.h" static void proc_evict_inode(struct inode *inode) diff --git a/fs/proc/kmsg.c b/fs/proc/kmsg.c index b38ad552887fb5..592e6dc7c1102b 100644 --- a/fs/proc/kmsg.c +++ b/fs/proc/kmsg.c @@ -15,7 +15,6 @@ #include #include -#include #include extern wait_queue_head_t log_wait; diff --git a/fs/proc/nommu.c b/fs/proc/nommu.c index 13452b32e2bd57..4d3493579458f0 100644 --- a/fs/proc/nommu.c +++ b/fs/proc/nommu.c @@ -21,7 +21,6 @@ #include #include #include -#include #include #include #include "internal.h" diff --git a/fs/proc/proc_net.c b/fs/proc/proc_net.c index 913e5acefbb66b..bbce6fbe779c8c 100644 --- a/fs/proc/proc_net.c +++ b/fs/proc/proc_net.c @@ -8,9 +8,6 @@ * * proc net directory handling functions */ - -#include - #include #include #include diff --git a/fs/proc/proc_tty.c b/fs/proc/proc_tty.c index c69ff191e5d8f5..5c6a5ceab2f1b7 100644 --- a/fs/proc/proc_tty.c +++ b/fs/proc/proc_tty.c @@ -4,8 +4,6 @@ * * Copyright 1997, Theodore Ts'o */ - -#include #include #include #include diff --git a/fs/proc/root.c b/fs/proc/root.c index c7e3b1350ef84f..5a7d15d197f8e0 100644 --- a/fs/proc/root.c +++ b/fs/proc/root.c @@ -6,9 +6,6 @@ * * proc root directory handling functions */ - -#include - #include #include #include diff --git a/fs/proc/vmcore.c b/fs/proc/vmcore.c index 4eaeb645e75966..f2aa86c421f2d4 100644 --- a/fs/proc/vmcore.c +++ b/fs/proc/vmcore.c @@ -25,7 +25,6 @@ #include #include #include -#include #include #include #include From 46d36b1be18b745fc9f6be2087633ba2f9895ffe Mon Sep 17 00:00:00 2001 From: Tao Liu Date: Mon, 27 Jun 2022 15:44:41 +0800 Subject: [PATCH 35/72] kdump: round up the total memory size to 128M for crashkernel reservation The total memory size we get in kernel is usually slightly less than the actual memory size because BIOS/firmware will reserve some memory region. So it won't export all memory as usable. E.g, on my x86_64 kvm guest with 1G memory, the total_mem value shows: UEFI boot with ovmf: 0x3faef000 Legacy boot kvm guest: 0x3ff7ec00 When specifying crashkernel=1G-2G:128M, if we have a 1G memory machine, we get total size 1023M from firmware. Then it will not fall into 1G-2G, thus no memory reserved. User will never know this, it is hard to let user know the exact total value in kernel. One way is to use dmi/smbios to get physical memory size, but it's not reliable as well. According to Prarit hardware vendors sometimes screw this up. Thus round up total size to 128M to work around this problem. This patch is a resend of [1] and rebased onto v5.19-rc2, and the original credit goes to Dave Young. [1]: http://lists.infradead.org/pipermail/kexec/2018-April/020568.html Link: https://lkml.kernel.org/r/20220627074440.187222-1-ltao@redhat.com Signed-off-by: Tao Liu Acked-by: Baoquan He Cc: Dave Young Signed-off-by: Andrew Morton --- kernel/crash_core.c | 14 ++++++++++++-- 1 file changed, 12 insertions(+), 2 deletions(-) diff --git a/kernel/crash_core.c b/kernel/crash_core.c index f64d35e2841198..07b26df453a977 100644 --- a/kernel/crash_core.c +++ b/kernel/crash_core.c @@ -9,6 +9,7 @@ #include #include #include +#include #include #include @@ -45,6 +46,15 @@ static int __init parse_crashkernel_mem(char *cmdline, unsigned long long *crash_base) { char *cur = cmdline, *tmp; + unsigned long long total_mem = system_ram; + + /* + * Firmware sometimes reserves some memory regions for its own use, + * so the system memory size is less than the actual physical memory + * size. Work around this by rounding up the total size to 128M, + * which is enough for most test cases. + */ + total_mem = roundup(total_mem, SZ_128M); /* for each entry of the comma-separated list */ do { @@ -89,13 +99,13 @@ static int __init parse_crashkernel_mem(char *cmdline, return -EINVAL; } cur = tmp; - if (size >= system_ram) { + if (size >= total_mem) { pr_warn("crashkernel: invalid size\n"); return -EINVAL; } /* match ? */ - if (system_ram >= start && system_ram < end) { + if (total_mem >= start && total_mem < end) { *crash_size = size; break; } From 2c795fb03f138e9602e1f1ee31b8bfc00a96c7e5 Mon Sep 17 00:00:00 2001 From: Yu Zhe Date: Tue, 28 Jun 2022 10:12:51 +0800 Subject: [PATCH 36/72] ipc/mqueue: remove unnecessary (void*) conversion Remove unnecessary void* type casting. Link: https://lkml.kernel.org/r/20220628021251.17197-1-yuzhe@nfschina.com Signed-off-by: Yu Zhe Signed-off-by: Andrew Morton --- ipc/mqueue.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ipc/mqueue.c b/ipc/mqueue.c index 12ad7860bb88f1..f98de32aeea174 100644 --- a/ipc/mqueue.c +++ b/ipc/mqueue.c @@ -489,7 +489,7 @@ static struct vfsmount *mq_create_mount(struct ipc_namespace *ns) static void init_once(void *foo) { - struct mqueue_inode_info *p = (struct mqueue_inode_info *) foo; + struct mqueue_inode_info *p = foo; inode_init_once(&p->vfs_inode); } From a16ceb13961068f7209e34d7984f8e42d2c06159 Mon Sep 17 00:00:00 2001 From: Benjamin Segall Date: Wed, 15 Jun 2022 14:24:23 -0700 Subject: [PATCH 37/72] epoll: autoremove wakers even more aggressively If a process is killed or otherwise exits while having active network connections and many threads waiting on epoll_wait, the threads will all be woken immediately, but not removed from ep->wq. Then when network traffic scans ep->wq in wake_up, every wakeup attempt will fail, and will not remove the entries from the list. This means that the cost of the wakeup attempt is far higher than usual, does not decrease, and this also competes with the dying threads trying to actually make progress and remove themselves from the wq. Handle this by removing visited epoll wq entries unconditionally, rather than only when the wakeup succeeds - the structure of ep_poll means that the only potential loss is the timed_out->eavail heuristic, which now can race and result in a redundant ep_send_events attempt. (But only when incoming data and a timeout actually race, not on every timeout) Shakeel added: : We are seeing this issue in production with real workloads and it has : caused hard lockups. Particularly network heavy workloads with a lot : of threads in epoll_wait() can easily trigger this issue if they get : killed (oom-killed in our case). Link: https://lkml.kernel.org/r/xm26fsjotqda.fsf@google.com Signed-off-by: Ben Segall Tested-by: Shakeel Butt Cc: Alexander Viro Cc: Linus Torvalds Cc: Shakeel Butt Cc: Eric Dumazet Cc: Roman Penyaev Cc: Jason Baron Cc: Khazhismel Kumykov Cc: Heiher Cc: Signed-off-by: Andrew Morton --- fs/eventpoll.c | 22 ++++++++++++++++++++++ 1 file changed, 22 insertions(+) diff --git a/fs/eventpoll.c b/fs/eventpoll.c index e2daa940ebce7c..8b56b94e2f56f8 100644 --- a/fs/eventpoll.c +++ b/fs/eventpoll.c @@ -1747,6 +1747,21 @@ static struct timespec64 *ep_timeout_to_timespec(struct timespec64 *to, long ms) return to; } +/* + * autoremove_wake_function, but remove even on failure to wake up, because we + * know that default_wake_function/ttwu will only fail if the thread is already + * woken, and in that case the ep_poll loop will remove the entry anyways, not + * try to reuse it. + */ +static int ep_autoremove_wake_function(struct wait_queue_entry *wq_entry, + unsigned int mode, int sync, void *key) +{ + int ret = default_wake_function(wq_entry, mode, sync, key); + + list_del_init(&wq_entry->entry); + return ret; +} + /** * ep_poll - Retrieves ready events, and delivers them to the caller-supplied * event buffer. @@ -1828,8 +1843,15 @@ static int ep_poll(struct eventpoll *ep, struct epoll_event __user *events, * normal wakeup path no need to call __remove_wait_queue() * explicitly, thus ep->lock is not taken, which halts the * event delivery. + * + * In fact, we now use an even more aggressive function that + * unconditionally removes, because we don't reuse the wait + * entry between loop iterations. This lets us also avoid the + * performance issue if a process is killed, causing all of its + * threads to wake up without being removed normally. */ init_wait(&wait); + wait.func = ep_autoremove_wake_function; write_lock_irq(&ep->lock); /* From b62eb2731e17e83c32e1a6089b4463da1a75e66e Mon Sep 17 00:00:00 2001 From: Nikolay Borisov Date: Fri, 1 Jul 2022 14:35:12 +0300 Subject: [PATCH 38/72] scripts/bloat-o-meter: switch argument parsing to using argparse This will facilitate further extension to the arguments the script takes. As an added benefit it also produces saner usage output, where mutual exclusivity of the c|d|t parameters is clearly visible: ./scripts/bloat-o-meter -h usage: bloat-o-meter [-h] [-c | -d | -t] file1 file2 Simple script used to compare the symbol sizes of 2 object files positional arguments: file1 First file to compare file2 Second file to compare optional arguments: -h, --help show this help message and exit -c categorize output based on symbol type -d Show delta of Data Section -t Show delta of text Section Link: https://lkml.kernel.org/r/20220701113513.1938008-1-nborisov@suse.com Signed-off-by: Nikolay Borisov Signed-off-by: Andrew Morton --- scripts/bloat-o-meter | 40 +++++++++++++++++++++------------------- 1 file changed, 21 insertions(+), 19 deletions(-) diff --git a/scripts/bloat-o-meter b/scripts/bloat-o-meter index 4dd6a804ce41b2..2a360118710e53 100755 --- a/scripts/bloat-o-meter +++ b/scripts/bloat-o-meter @@ -7,18 +7,20 @@ # This software may be used and distributed according to the terms # of the GNU General Public License, incorporated herein by reference. -import sys, os, re +import sys, os, re, argparse from signal import signal, SIGPIPE, SIG_DFL signal(SIGPIPE, SIG_DFL) -if len(sys.argv) < 3: - sys.stderr.write("usage: %s [option] file1 file2\n" % sys.argv[0]) - sys.stderr.write("The options are:\n") - sys.stderr.write("-c categorize output based on symbol type\n") - sys.stderr.write("-d Show delta of Data Section\n") - sys.stderr.write("-t Show delta of text Section\n") - sys.exit(-1) +parser = argparse.ArgumentParser(description="Simple script used to compare the symbol sizes of 2 object files") +group = parser.add_mutually_exclusive_group() +group.add_argument('-c', help='categorize output based on symbol type', action='store_true') +group.add_argument('-d', help='Show delta of Data Section', action='store_true') +group.add_argument('-t', help='Show delta of text Section', action='store_true') +parser.add_argument('file1', help='First file to compare') +parser.add_argument('file2', help='Second file to compare') + +args = parser.parse_args() re_NUMBER = re.compile(r'\.[0-9]+') @@ -77,9 +79,9 @@ def calc(oldfile, newfile, format): delta.reverse() return grow, shrink, add, remove, up, down, delta, old, new, otot, ntot -def print_result(symboltype, symbolformat, argc): +def print_result(symboltype, symbolformat): grow, shrink, add, remove, up, down, delta, old, new, otot, ntot = \ - calc(sys.argv[argc - 1], sys.argv[argc], symbolformat) + calc(args.file1, args.file2, symbolformat) print("add/remove: %s/%s grow/shrink: %s/%s up/down: %s/%s (%s)" % \ (add, remove, grow, shrink, up, -down, up-down)) @@ -93,13 +95,13 @@ def print_result(symboltype, symbolformat, argc): percent = 0 print("Total: Before=%d, After=%d, chg %+.2f%%" % (otot, ntot, percent)) -if sys.argv[1] == "-c": - print_result("Function", "tT", 3) - print_result("Data", "dDbB", 3) - print_result("RO Data", "rR", 3) -elif sys.argv[1] == "-d": - print_result("Data", "dDbBrR", 3) -elif sys.argv[1] == "-t": - print_result("Function", "tT", 3) +if args.c: + print_result("Function", "tT") + print_result("Data", "dDbB") + print_result("RO Data", "rR") +elif args.d: + print_result("Data", "dDbBrR") +elif args.t: + print_result("Function", "tT") else: - print_result("Function", "tTdDbBrR", 2) + print_result("Function", "tTdDbBrR") From 8b5db6679807fd0ab1154375ea6e5aa6b11c4350 Mon Sep 17 00:00:00 2001 From: Nikolay Borisov Date: Fri, 1 Jul 2022 14:35:13 +0300 Subject: [PATCH 39/72] scripts/bloat-o-meter: add -p argument When doing cross platform development on a machine sometimes it might be useful to invoke bloat-o-meter for files which haven't been build with the native toolchain. In cases when the host nm doesn't support the target one then a toolchain-specific nm could be used. Add this ability by adding the -p allowing invocations as: ./scripts/bloat-o-meter -p riscv64-unknown-linux-gnu- file1.o file2.o Link: https://lkml.kernel.org/r/20220701113513.1938008-2-nborisov@suse.com Signed-off-by: Nikolay Borisov Signed-off-by: Andrew Morton --- scripts/bloat-o-meter | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/scripts/bloat-o-meter b/scripts/bloat-o-meter index 2a360118710e53..f9553f60a14a89 100755 --- a/scripts/bloat-o-meter +++ b/scripts/bloat-o-meter @@ -17,6 +17,7 @@ group = parser.add_mutually_exclusive_group() group.add_argument('-c', help='categorize output based on symbol type', action='store_true') group.add_argument('-d', help='Show delta of Data Section', action='store_true') group.add_argument('-t', help='Show delta of text Section', action='store_true') +parser.add_argument('-p', dest='prefix', help='Arch prefix for the tool being used. Useful in cross build scenarios') parser.add_argument('file1', help='First file to compare') parser.add_argument('file2', help='Second file to compare') @@ -26,7 +27,11 @@ re_NUMBER = re.compile(r'\.[0-9]+') def getsizes(file, format): sym = {} - with os.popen("nm --size-sort " + file) as f: + nm = "nm" + if args.prefix: + nm = "{}nm".format(args.prefix) + + with os.popen("{} --size-sort {}".format(nm, file)) as f: for line in f: if line.startswith("\n") or ":" in line: continue From adbcaef84088713bae9af5690cf566ce4fadfa22 Mon Sep 17 00:00:00 2001 From: Sander Vanheule Date: Sat, 2 Jul 2022 18:08:24 +0200 Subject: [PATCH 40/72] x86/cacheinfo: move shared cache map definitions Patch series "cpumask: Fix invalid uniprocessor assumptions", v4. On uniprocessor builds, it is currently assumed that any cpumask will contain the single CPU: cpu0. This assumption is used to provide optimised implementations. The current assumption also appears to be wrong, by ignoring the fact that users can provide empty cpumasks. This can result in bugs as explained in [1] - for_each_cpu() will run one iteration of the loop even when passed an empty cpumask. This series introduces some basic tests, and updates the optimisations for uniprocessor builds. The x86 patch was written after the kernel test robot [2] ran into a failed build. I have tried to list the files potentially affected by the changes to cpumask.h, in an attempt to find any other cases that fail on !SMP. I've gone through some of the files manually, and ran a few cross builds, but nothing else popped up. I (build) checked about half of the potientally affected files, but I do not have the resources to do them all. I hope we can fix other issues if/when they pop up later. [1] https://lore.kernel.org/all/20220530082552.46113-1-sander@svanheule.net/ [2] https://lore.kernel.org/all/202206060858.wA0FOzRy-lkp@intel.com/ This patch (of 5): The maps to keep track of shared caches between CPUs on SMP systems are declared in asm/smp.h, among them specifically cpu_llc_shared_map. These maps are externally defined in cpu/smpboot.c. The latter is only compiled on CONFIG_SMP=y, which means the declared extern symbols from asm/smp.h do not have a corresponding definition on uniprocessor builds. The inline cpu_llc_shared_mask() function from asm/smp.h refers to the map declaration mentioned above. This function is referenced in cacheinfo.c inside for_each_cpu() loop macros, to provide cpumask for the loop. On uniprocessor builds, the symbol for the cpu_llc_shared_map does not exist. However, the current implementation of for_each_cpu() also (wrongly) ignores the provided mask. By sheer luck, the compiler thus optimises out this unused reference to cpu_llc_shared_map, and the linker therefore does not require the cpu_llc_shared_mask to actually exist on uniprocessor builds. Only on SMP bulids does smpboot.o exist to provide the required symbols. To no longer rely on compiler optimisations for successful uniprocessor builds, move the definitions of cpu_llc_shared_map and cpu_l2c_shared_map from smpboot.c to cacheinfo.c. Link: https://lkml.kernel.org/r/cover.1656777646.git.sander@svanheule.net Link: https://lkml.kernel.org/r/e8167ddb570f56744a3dc12c2149a660a324d969.1656777646.git.sander@svanheule.net Signed-off-by: Sander Vanheule Cc: Andy Shevchenko Cc: Marco Elver Cc: Greg Kroah-Hartman Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: Valentin Schneider Cc: Yury Norov Cc: Ingo Molnar Cc: Borislav Petkov Cc: Dave Hansen Cc: "H. Peter Anvin" Signed-off-by: Andrew Morton --- arch/x86/kernel/cpu/cacheinfo.c | 6 ++++++ arch/x86/kernel/smpboot.c | 4 ---- 2 files changed, 6 insertions(+), 4 deletions(-) diff --git a/arch/x86/kernel/cpu/cacheinfo.c b/arch/x86/kernel/cpu/cacheinfo.c index fe98a1465be6ac..66556833d7af5d 100644 --- a/arch/x86/kernel/cpu/cacheinfo.c +++ b/arch/x86/kernel/cpu/cacheinfo.c @@ -29,6 +29,12 @@ #define LVL_3 4 #define LVL_TRACE 5 +/* Shared last level cache maps */ +DEFINE_PER_CPU_READ_MOSTLY(cpumask_var_t, cpu_llc_shared_map); + +/* Shared L2 cache maps */ +DEFINE_PER_CPU_READ_MOSTLY(cpumask_var_t, cpu_l2c_shared_map); + struct _cache_table { unsigned char descriptor; char cache_type; diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c index 5e7f9532a10d07..f24227bc3220a0 100644 --- a/arch/x86/kernel/smpboot.c +++ b/arch/x86/kernel/smpboot.c @@ -95,10 +95,6 @@ EXPORT_PER_CPU_SYMBOL(cpu_core_map); DEFINE_PER_CPU_READ_MOSTLY(cpumask_var_t, cpu_die_map); EXPORT_PER_CPU_SYMBOL(cpu_die_map); -DEFINE_PER_CPU_READ_MOSTLY(cpumask_var_t, cpu_llc_shared_map); - -DEFINE_PER_CPU_READ_MOSTLY(cpumask_var_t, cpu_l2c_shared_map); - /* Per CPU bogomips and other parameters */ DEFINE_PER_CPU_READ_MOSTLY(struct cpuinfo_x86, cpu_info); EXPORT_PER_CPU_SYMBOL(cpu_info); From 4f09903078eeb9138cddce8db06100b82f8620e8 Mon Sep 17 00:00:00 2001 From: Sander Vanheule Date: Sat, 2 Jul 2022 18:08:27 +0200 Subject: [PATCH 41/72] cpumask: add UP optimised for_each_*_cpu versions On uniprocessor builds, the following loops will always run over a mask that contains one enabled CPU (cpu0): - for_each_possible_cpu - for_each_online_cpu - for_each_present_cpu Provide uniprocessor-specific macros for these loops, that always run exactly once. Link: https://lkml.kernel.org/r/3a92869b902a075b97be5d1452c9c6badbbff0df.1656777646.git.sander@svanheule.net Signed-off-by: Sander Vanheule Acked-by: Yury Norov Cc: Andy Shevchenko Cc: Borislav Petkov Cc: Dave Hansen Cc: Greg Kroah-Hartman Cc: "H. Peter Anvin" Cc: Ingo Molnar Cc: Marco Elver Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: Valentin Schneider Signed-off-by: Andrew Morton --- include/linux/cpumask.h | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/include/linux/cpumask.h b/include/linux/cpumask.h index fe29ac7cc469c2..533612770bc077 100644 --- a/include/linux/cpumask.h +++ b/include/linux/cpumask.h @@ -811,9 +811,16 @@ extern const DECLARE_BITMAP(cpu_all_bits, NR_CPUS); /* First bits of cpu_bit_bitmap are in fact unset. */ #define cpu_none_mask to_cpumask(cpu_bit_bitmap[0]) +#if NR_CPUS == 1 +/* Uniprocessor: the possible/online/present masks are always "1" */ +#define for_each_possible_cpu(cpu) for ((cpu) = 0; (cpu) < 1; (cpu)++) +#define for_each_online_cpu(cpu) for ((cpu) = 0; (cpu) < 1; (cpu)++) +#define for_each_present_cpu(cpu) for ((cpu) = 0; (cpu) < 1; (cpu)++) +#else #define for_each_possible_cpu(cpu) for_each_cpu((cpu), cpu_possible_mask) #define for_each_online_cpu(cpu) for_each_cpu((cpu), cpu_online_mask) #define for_each_present_cpu(cpu) for_each_cpu((cpu), cpu_present_mask) +#endif /* Wrappers for arch boot code to manipulate normally-constant masks */ void init_cpu_present(const struct cpumask *src); From b81dce77cedcea6f00292f02d4b1ebbfc2c5988d Mon Sep 17 00:00:00 2001 From: Sander Vanheule Date: Sat, 2 Jul 2022 18:08:25 +0200 Subject: [PATCH 42/72] cpumask: Fix invalid uniprocessor mask assumption On uniprocessor builds, any CPU mask is assumed to contain exactly one CPU (cpu0). This assumption ignores the existence of empty masks, resulting in incorrect behaviour. cpumask_first_zero(), cpumask_next_zero(), and for_each_cpu_not() don't provide behaviour matching the assumption that a UP mask is always "1", and instead provide behaviour matching the empty mask. Drop the incorrectly optimised code and use the generic implementations in all cases. Link: https://lkml.kernel.org/r/86bf3f005abba2d92120ddd0809235cab4f759a6.1656777646.git.sander@svanheule.net Signed-off-by: Sander Vanheule Suggested-by: Yury Norov Cc: Andy Shevchenko Cc: Borislav Petkov Cc: Dave Hansen Cc: Greg Kroah-Hartman Cc: "H. Peter Anvin" Cc: Ingo Molnar Cc: Marco Elver Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: Valentin Schneider Signed-off-by: Andrew Morton --- include/linux/cpumask.h | 99 ++++++++--------------------------------- lib/Makefile | 3 +- lib/cpumask.c | 2 + 3 files changed, 22 insertions(+), 82 deletions(-) diff --git a/include/linux/cpumask.h b/include/linux/cpumask.h index 533612770bc077..6c5b4ee000f2ea 100644 --- a/include/linux/cpumask.h +++ b/include/linux/cpumask.h @@ -116,85 +116,6 @@ static __always_inline unsigned int cpumask_check(unsigned int cpu) return cpu; } -#if NR_CPUS == 1 -/* Uniprocessor. Assume all masks are "1". */ -static inline unsigned int cpumask_first(const struct cpumask *srcp) -{ - return 0; -} - -static inline unsigned int cpumask_first_zero(const struct cpumask *srcp) -{ - return 0; -} - -static inline unsigned int cpumask_first_and(const struct cpumask *srcp1, - const struct cpumask *srcp2) -{ - return 0; -} - -static inline unsigned int cpumask_last(const struct cpumask *srcp) -{ - return 0; -} - -/* Valid inputs for n are -1 and 0. */ -static inline unsigned int cpumask_next(int n, const struct cpumask *srcp) -{ - return n+1; -} - -static inline unsigned int cpumask_next_zero(int n, const struct cpumask *srcp) -{ - return n+1; -} - -static inline unsigned int cpumask_next_and(int n, - const struct cpumask *srcp, - const struct cpumask *andp) -{ - return n+1; -} - -static inline unsigned int cpumask_next_wrap(int n, const struct cpumask *mask, - int start, bool wrap) -{ - /* cpu0 unless stop condition, wrap and at cpu0, then nr_cpumask_bits */ - return (wrap && n == 0); -} - -/* cpu must be a valid cpu, ie 0, so there's no other choice. */ -static inline unsigned int cpumask_any_but(const struct cpumask *mask, - unsigned int cpu) -{ - return 1; -} - -static inline unsigned int cpumask_local_spread(unsigned int i, int node) -{ - return 0; -} - -static inline int cpumask_any_and_distribute(const struct cpumask *src1p, - const struct cpumask *src2p) { - return cpumask_first_and(src1p, src2p); -} - -static inline int cpumask_any_distribute(const struct cpumask *srcp) -{ - return cpumask_first(srcp); -} - -#define for_each_cpu(cpu, mask) \ - for ((cpu) = 0; (cpu) < 1; (cpu)++, (void)mask) -#define for_each_cpu_not(cpu, mask) \ - for ((cpu) = 0; (cpu) < 1; (cpu)++, (void)mask) -#define for_each_cpu_wrap(cpu, mask, start) \ - for ((cpu) = 0; (cpu) < 1; (cpu)++, (void)mask, (void)(start)) -#define for_each_cpu_and(cpu, mask1, mask2) \ - for ((cpu) = 0; (cpu) < 1; (cpu)++, (void)mask1, (void)mask2) -#else /** * cpumask_first - get the first cpu in a cpumask * @srcp: the cpumask pointer @@ -260,10 +181,29 @@ static inline unsigned int cpumask_next_zero(int n, const struct cpumask *srcp) int __pure cpumask_next_and(int n, const struct cpumask *, const struct cpumask *); int __pure cpumask_any_but(const struct cpumask *mask, unsigned int cpu); + +#if NR_CPUS == 1 +/* Uniprocessor: there is only one valid CPU */ +static inline unsigned int cpumask_local_spread(unsigned int i, int node) +{ + return 0; +} + +static inline int cpumask_any_and_distribute(const struct cpumask *src1p, + const struct cpumask *src2p) { + return cpumask_first_and(src1p, src2p); +} + +static inline int cpumask_any_distribute(const struct cpumask *srcp) +{ + return cpumask_first(srcp); +} +#else unsigned int cpumask_local_spread(unsigned int i, int node); int cpumask_any_and_distribute(const struct cpumask *src1p, const struct cpumask *src2p); int cpumask_any_distribute(const struct cpumask *srcp); +#endif /* NR_CPUS */ /** * for_each_cpu - iterate over every cpu in a mask @@ -324,7 +264,6 @@ extern int cpumask_next_wrap(int n, const struct cpumask *mask, int start, bool for ((cpu) = -1; \ (cpu) = cpumask_next_and((cpu), (mask1), (mask2)), \ (cpu) < nr_cpu_ids;) -#endif /* SMP */ #define CPU_BITS_NONE \ { \ diff --git a/lib/Makefile b/lib/Makefile index f99bf61f8bbc67..bcc7e8ea0cde5c 100644 --- a/lib/Makefile +++ b/lib/Makefile @@ -34,10 +34,9 @@ lib-y := ctype.o string.o vsprintf.o cmdline.o \ is_single_threaded.o plist.o decompress.o kobject_uevent.o \ earlycpio.o seq_buf.o siphash.o dec_and_lock.o \ nmi_backtrace.o nodemask.o win_minmax.o memcat_p.o \ - buildid.o + buildid.o cpumask.o lib-$(CONFIG_PRINTK) += dump_stack.o -lib-$(CONFIG_SMP) += cpumask.o lib-y += kobject.o klist.o obj-y += lockref.o diff --git a/lib/cpumask.c b/lib/cpumask.c index a971a82d2f4360..b9728513a4d401 100644 --- a/lib/cpumask.c +++ b/lib/cpumask.c @@ -192,6 +192,7 @@ void __init free_bootmem_cpumask_var(cpumask_var_t mask) } #endif +#if NR_CPUS > 1 /** * cpumask_local_spread - select the i'th cpu with local numa cpu's first * @i: index number @@ -279,3 +280,4 @@ int cpumask_any_distribute(const struct cpumask *srcp) return next; } EXPORT_SYMBOL(cpumask_any_distribute); +#endif /* NR_CPUS */ From c41e8866c28c4d1a88a085fc3c3d6ba403510804 Mon Sep 17 00:00:00 2001 From: Sander Vanheule Date: Sat, 2 Jul 2022 18:08:26 +0200 Subject: [PATCH 43/72] lib/test: introduce cpumask KUnit test suite Add a basic suite of tests for cpumask, providing some tests for empty and completely filled cpumasks. Link: https://lkml.kernel.org/r/c96980ec35c3bd23f17c3374bf42c22971545e85.1656777646.git.sander@svanheule.net Signed-off-by: Sander Vanheule Reviewed-by: Andy Shevchenko Suggested-by: Yury Norov Cc: Borislav Petkov Cc: Dave Hansen Cc: Greg Kroah-Hartman Cc: "H. Peter Anvin" Cc: Ingo Molnar Cc: Marco Elver Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: Valentin Schneider Signed-off-by: Andrew Morton --- lib/Kconfig.debug | 9 +++ lib/Makefile | 1 + lib/test_cpumask.c | 138 +++++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 148 insertions(+) create mode 100644 lib/test_cpumask.c diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug index 2e24db4bff1921..04aaa20d50f982 100644 --- a/lib/Kconfig.debug +++ b/lib/Kconfig.debug @@ -2021,6 +2021,15 @@ config LKDTM Documentation on how to use the module can be found in Documentation/fault-injection/provoke-crashes.rst +config TEST_CPUMASK + tristate "cpumask tests" if !KUNIT_ALL_TESTS + depends on KUNIT + default KUNIT_ALL_TESTS + help + Enable to turn on cpumask tests, running at boot or module load time. + + If unsure, say N. + config TEST_LIST_SORT tristate "Linked list sorting test" if !KUNIT_ALL_TESTS depends on KUNIT diff --git a/lib/Makefile b/lib/Makefile index bcc7e8ea0cde5c..de3e47453fe8eb 100644 --- a/lib/Makefile +++ b/lib/Makefile @@ -99,6 +99,7 @@ obj-$(CONFIG_TEST_HMM) += test_hmm.o obj-$(CONFIG_TEST_FREE_PAGES) += test_free_pages.o obj-$(CONFIG_KPROBES_SANITY_TEST) += test_kprobes.o obj-$(CONFIG_TEST_REF_TRACKER) += test_ref_tracker.o +obj-$(CONFIG_TEST_CPUMASK) += test_cpumask.o CFLAGS_test_fprobe.o += $(CC_FLAGS_FTRACE) obj-$(CONFIG_FPROBE_SANITY_TEST) += test_fprobe.o # diff --git a/lib/test_cpumask.c b/lib/test_cpumask.c new file mode 100644 index 00000000000000..a31a1622f1f6e8 --- /dev/null +++ b/lib/test_cpumask.c @@ -0,0 +1,138 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * KUnit tests for cpumask. + * + * Author: Sander Vanheule + */ + +#include +#include +#include + +#define EXPECT_FOR_EACH_CPU_EQ(test, mask) \ + do { \ + const cpumask_t *m = (mask); \ + int mask_weight = cpumask_weight(m); \ + int cpu, iter = 0; \ + for_each_cpu(cpu, m) \ + iter++; \ + KUNIT_EXPECT_EQ((test), mask_weight, iter); \ + } while (0) + +#define EXPECT_FOR_EACH_CPU_NOT_EQ(test, mask) \ + do { \ + const cpumask_t *m = (mask); \ + int mask_weight = cpumask_weight(m); \ + int cpu, iter = 0; \ + for_each_cpu_not(cpu, m) \ + iter++; \ + KUNIT_EXPECT_EQ((test), nr_cpu_ids - mask_weight, iter); \ + } while (0) + +#define EXPECT_FOR_EACH_CPU_WRAP_EQ(test, mask) \ + do { \ + const cpumask_t *m = (mask); \ + int mask_weight = cpumask_weight(m); \ + int cpu, iter = 0; \ + for_each_cpu_wrap(cpu, m, nr_cpu_ids / 2) \ + iter++; \ + KUNIT_EXPECT_EQ((test), mask_weight, iter); \ + } while (0) + +#define EXPECT_FOR_EACH_CPU_BUILTIN_EQ(test, name) \ + do { \ + int mask_weight = num_##name##_cpus(); \ + int cpu, iter = 0; \ + for_each_##name##_cpu(cpu) \ + iter++; \ + KUNIT_EXPECT_EQ((test), mask_weight, iter); \ + } while (0) + +static cpumask_t mask_empty; +static cpumask_t mask_all; + +static void test_cpumask_weight(struct kunit *test) +{ + KUNIT_EXPECT_TRUE(test, cpumask_empty(&mask_empty)); + KUNIT_EXPECT_TRUE(test, cpumask_full(cpu_possible_mask)); + KUNIT_EXPECT_TRUE(test, cpumask_full(&mask_all)); + + KUNIT_EXPECT_EQ(test, 0, cpumask_weight(&mask_empty)); + KUNIT_EXPECT_EQ(test, nr_cpu_ids, cpumask_weight(cpu_possible_mask)); + KUNIT_EXPECT_EQ(test, nr_cpumask_bits, cpumask_weight(&mask_all)); +} + +static void test_cpumask_first(struct kunit *test) +{ + KUNIT_EXPECT_LE(test, nr_cpu_ids, cpumask_first(&mask_empty)); + KUNIT_EXPECT_EQ(test, 0, cpumask_first(cpu_possible_mask)); + + KUNIT_EXPECT_EQ(test, 0, cpumask_first_zero(&mask_empty)); + KUNIT_EXPECT_LE(test, nr_cpu_ids, cpumask_first_zero(cpu_possible_mask)); +} + +static void test_cpumask_last(struct kunit *test) +{ + KUNIT_EXPECT_LE(test, nr_cpumask_bits, cpumask_last(&mask_empty)); + KUNIT_EXPECT_EQ(test, nr_cpumask_bits - 1, cpumask_last(cpu_possible_mask)); +} + +static void test_cpumask_next(struct kunit *test) +{ + KUNIT_EXPECT_EQ(test, 0, cpumask_next_zero(-1, &mask_empty)); + KUNIT_EXPECT_LE(test, nr_cpu_ids, cpumask_next_zero(-1, cpu_possible_mask)); + + KUNIT_EXPECT_LE(test, nr_cpu_ids, cpumask_next(-1, &mask_empty)); + KUNIT_EXPECT_EQ(test, 0, cpumask_next(-1, cpu_possible_mask)); +} + +static void test_cpumask_iterators(struct kunit *test) +{ + EXPECT_FOR_EACH_CPU_EQ(test, &mask_empty); + EXPECT_FOR_EACH_CPU_NOT_EQ(test, &mask_empty); + EXPECT_FOR_EACH_CPU_WRAP_EQ(test, &mask_empty); + + EXPECT_FOR_EACH_CPU_EQ(test, cpu_possible_mask); + EXPECT_FOR_EACH_CPU_NOT_EQ(test, cpu_possible_mask); + EXPECT_FOR_EACH_CPU_WRAP_EQ(test, cpu_possible_mask); +} + +static void test_cpumask_iterators_builtin(struct kunit *test) +{ + EXPECT_FOR_EACH_CPU_BUILTIN_EQ(test, possible); + + /* Ensure the dynamic masks are stable while running the tests */ + cpu_hotplug_disable(); + + EXPECT_FOR_EACH_CPU_BUILTIN_EQ(test, online); + EXPECT_FOR_EACH_CPU_BUILTIN_EQ(test, present); + + cpu_hotplug_enable(); +} + +static int test_cpumask_init(struct kunit *test) +{ + cpumask_clear(&mask_empty); + cpumask_setall(&mask_all); + + return 0; +} + +static struct kunit_case test_cpumask_cases[] = { + KUNIT_CASE(test_cpumask_weight), + KUNIT_CASE(test_cpumask_first), + KUNIT_CASE(test_cpumask_last), + KUNIT_CASE(test_cpumask_next), + KUNIT_CASE(test_cpumask_iterators), + KUNIT_CASE(test_cpumask_iterators_builtin), + {} +}; + +static struct kunit_suite test_cpumask_suite = { + .name = "cpumask", + .init = test_cpumask_init, + .test_cases = test_cpumask_cases, +}; +kunit_test_suite(test_cpumask_suite); + +MODULE_LICENSE("GPL"); From 953257a9252a9b3c58ca68fc5bf26fc65e5b1cb8 Mon Sep 17 00:00:00 2001 From: Sander Vanheule Date: Sat, 2 Jul 2022 18:08:28 +0200 Subject: [PATCH 44/72] cpumask: update cpumask_next_wrap() signature The extern specifier is not needed for this declaration, so drop it. The function also depends only on the input parameters, and has no side effects, so it can be marked __pure like other functions in cpumask.h. Link: https://lkml.kernel.org/r/72ab755695b74bb5fbaa756ae4c0edd708d172f1.1656777646.git.sander@svanheule.net Signed-off-by: Sander Vanheule Reviewed-by: Andy Shevchenko Cc: Borislav Petkov Cc: Dave Hansen Cc: Greg Kroah-Hartman Cc: "H. Peter Anvin" Cc: Ingo Molnar Cc: Marco Elver Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: Valentin Schneider Cc: Yury Norov Signed-off-by: Andrew Morton --- include/linux/cpumask.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/linux/cpumask.h b/include/linux/cpumask.h index 6c5b4ee000f2ea..523857884ae440 100644 --- a/include/linux/cpumask.h +++ b/include/linux/cpumask.h @@ -229,7 +229,7 @@ int cpumask_any_distribute(const struct cpumask *srcp); (cpu) = cpumask_next_zero((cpu), (mask)), \ (cpu) < nr_cpu_ids;) -extern int cpumask_next_wrap(int n, const struct cpumask *mask, int start, bool wrap); +int __pure cpumask_next_wrap(int n, const struct cpumask *mask, int start, bool wrap); /** * for_each_cpu_wrap - iterate over every cpu in a mask, starting at a specified location From bd27acaac24e4b252ee28dddcabaee80456d0faf Mon Sep 17 00:00:00 2001 From: Tetsuo Handa Date: Wed, 22 Jun 2022 14:46:31 +0900 Subject: [PATCH 45/72] lib/smp_processor_id: fix imbalanced instrumentation_end() call Currently instrumentation_end() won't be called if printk_ratelimit() returned false. Link: https://lkml.kernel.org/r/a636d8e0-ad32-5888-acac-671f7f553bb3@I-love.SAKURA.ne.jp Fixes: 126f21f0e8d46e2c ("lib/smp_processor_id: Move it into noinstr section") Signed-off-by: Tetsuo Handa Cc: Thomas Gleixner Cc: Alexandre Chartre Cc: Peter Zijlstra Signed-off-by: Andrew Morton --- lib/smp_processor_id.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lib/smp_processor_id.c b/lib/smp_processor_id.c index 046ac6297c7811..a2bb7738c373cd 100644 --- a/lib/smp_processor_id.c +++ b/lib/smp_processor_id.c @@ -47,9 +47,9 @@ unsigned int check_preemption_disabled(const char *what1, const char *what2) printk("caller is %pS\n", __builtin_return_address(0)); dump_stack(); - instrumentation_end(); out_enable: + instrumentation_end(); preempt_enable_no_resched_notrace(); out: return this_cpu; From 55656016daa7155d95471627c1b1438d488f011b Mon Sep 17 00:00:00 2001 From: Mark-PK Tsai Date: Fri, 8 Jul 2022 21:19:47 +0800 Subject: [PATCH 46/72] lib: devres: use numa aware allocation Allocate device resource from local node memory when the numa locality of the device is specified. Link: https://lkml.kernel.org/r/20220708131952.14500-1-mark-pk.tsai@mediatek.com Signed-off-by: Mark-PK Tsai Cc: Matthias Brugger Cc: YJ Chiang Cc: Hans de Goede Cc: Thomas Zimmermann Cc: Zhen Lei Cc: Jacob Keller Signed-off-by: Andrew Morton --- lib/devres.c | 15 ++++++++++----- 1 file changed, 10 insertions(+), 5 deletions(-) diff --git a/lib/devres.c b/lib/devres.c index 14664bbb48757d..55eb07e80cbb3b 100644 --- a/lib/devres.c +++ b/lib/devres.c @@ -29,7 +29,8 @@ static void __iomem *__devm_ioremap(struct device *dev, resource_size_t offset, { void __iomem **ptr, *addr = NULL; - ptr = devres_alloc(devm_ioremap_release, sizeof(*ptr), GFP_KERNEL); + ptr = devres_alloc_node(devm_ioremap_release, sizeof(*ptr), GFP_KERNEL, + dev_to_node(dev)); if (!ptr) return NULL; @@ -292,7 +293,8 @@ void __iomem *devm_ioport_map(struct device *dev, unsigned long port, { void __iomem **ptr, *addr; - ptr = devres_alloc(devm_ioport_map_release, sizeof(*ptr), GFP_KERNEL); + ptr = devres_alloc_node(devm_ioport_map_release, sizeof(*ptr), GFP_KERNEL, + dev_to_node(dev)); if (!ptr) return NULL; @@ -366,7 +368,8 @@ void __iomem * const *pcim_iomap_table(struct pci_dev *pdev) if (dr) return dr->table; - new_dr = devres_alloc(pcim_iomap_release, sizeof(*new_dr), GFP_KERNEL); + new_dr = devres_alloc_node(pcim_iomap_release, sizeof(*new_dr), GFP_KERNEL, + dev_to_node(&pdev->dev)); if (!new_dr) return NULL; dr = devres_get(&pdev->dev, new_dr, NULL, NULL); @@ -548,7 +551,8 @@ int devm_arch_phys_wc_add(struct device *dev, unsigned long base, unsigned long int *mtrr; int ret; - mtrr = devres_alloc(devm_arch_phys_ac_add_release, sizeof(*mtrr), GFP_KERNEL); + mtrr = devres_alloc_node(devm_arch_phys_ac_add_release, sizeof(*mtrr), GFP_KERNEL, + dev_to_node(dev)); if (!mtrr) return -ENOMEM; @@ -593,7 +597,8 @@ int devm_arch_io_reserve_memtype_wc(struct device *dev, resource_size_t start, struct arch_io_reserve_memtype_wc_devres *dr; int ret; - dr = devres_alloc(devm_arch_io_free_memtype_wc_release, sizeof(*dr), GFP_KERNEL); + dr = devres_alloc_node(devm_arch_io_free_memtype_wc_release, sizeof(*dr), GFP_KERNEL, + dev_to_node(dev)); if (!dr) return -ENOMEM; From f71381fcdc3ab615f55278d435a9f35542dc9e63 Mon Sep 17 00:00:00 2001 From: Ian Kent Date: Fri, 8 Jul 2022 09:43:01 +0800 Subject: [PATCH 47/72] autofs: use inode permission method for write access Patch series "autofs: misc patches". This series contains several patches that resulted mostly from comments made by Al Viro (quite a long time ago now). This patch (of 5): Eliminate some code duplication from mkdir/rmdir/symlink/unlink methods by using the inode operation .permission(). Link: https://lkml.kernel.org/r/165724445154.30914.10970894936827635879.stgit@donald.themaw.net Link: https://lkml.kernel.org/r/165724458096.30914.13499431569758625806.stgit@donald.themaw.net Signed-off-by: Ian Kent Cc: Al Viro Cc: David Howells Cc: Miklos Szeredi Signed-off-by: Andrew Morton --- fs/autofs/root.c | 63 +++++++++++++++++------------------------------- 1 file changed, 22 insertions(+), 41 deletions(-) diff --git a/fs/autofs/root.c b/fs/autofs/root.c index 91fe4548c25657..fef6ed9910221d 100644 --- a/fs/autofs/root.c +++ b/fs/autofs/root.c @@ -10,6 +10,7 @@ #include "autofs_i.h" +static int autofs_dir_permission(struct user_namespace *, struct inode *, int); static int autofs_dir_symlink(struct user_namespace *, struct inode *, struct dentry *, const char *); static int autofs_dir_unlink(struct inode *, struct dentry *); @@ -50,6 +51,7 @@ const struct file_operations autofs_dir_operations = { const struct inode_operations autofs_dir_inode_operations = { .lookup = autofs_lookup, + .permission = autofs_dir_permission, .unlink = autofs_dir_unlink, .symlink = autofs_dir_symlink, .mkdir = autofs_dir_mkdir, @@ -526,11 +528,30 @@ static struct dentry *autofs_lookup(struct inode *dir, return NULL; } +static int autofs_dir_permission(struct user_namespace *mnt_userns, + struct inode *inode, int mask) +{ + if (mask & MAY_WRITE) { + struct autofs_sb_info *sbi = autofs_sbi(inode->i_sb); + + if (!autofs_oz_mode(sbi)) + return -EACCES; + + /* autofs_oz_mode() needs to allow path walks when the + * autofs mount is catatonic but the state of an autofs + * file system needs to be preserved over restarts. + */ + if (sbi->flags & AUTOFS_SBI_CATATONIC) + return -EACCES; + } + + return generic_permission(mnt_userns, inode, mask); +} + static int autofs_dir_symlink(struct user_namespace *mnt_userns, struct inode *dir, struct dentry *dentry, const char *symname) { - struct autofs_sb_info *sbi = autofs_sbi(dir->i_sb); struct autofs_info *ino = autofs_dentry_ino(dentry); struct autofs_info *p_ino; struct inode *inode; @@ -539,16 +560,6 @@ static int autofs_dir_symlink(struct user_namespace *mnt_userns, pr_debug("%s <- %pd\n", symname, dentry); - if (!autofs_oz_mode(sbi)) - return -EACCES; - - /* autofs_oz_mode() needs to allow path walks when the - * autofs mount is catatonic but the state of an autofs - * file system needs to be preserved over restarts. - */ - if (sbi->flags & AUTOFS_SBI_CATATONIC) - return -EACCES; - BUG_ON(!ino); autofs_clean_ino(ino); @@ -601,16 +612,6 @@ static int autofs_dir_unlink(struct inode *dir, struct dentry *dentry) struct autofs_info *ino = autofs_dentry_ino(dentry); struct autofs_info *p_ino; - if (!autofs_oz_mode(sbi)) - return -EACCES; - - /* autofs_oz_mode() needs to allow path walks when the - * autofs mount is catatonic but the state of an autofs - * file system needs to be preserved over restarts. - */ - if (sbi->flags & AUTOFS_SBI_CATATONIC) - return -EACCES; - ino->count--; p_ino = autofs_dentry_ino(dentry->d_parent); p_ino->count--; @@ -683,16 +684,6 @@ static int autofs_dir_rmdir(struct inode *dir, struct dentry *dentry) pr_debug("dentry %p, removing %pd\n", dentry, dentry); - if (!autofs_oz_mode(sbi)) - return -EACCES; - - /* autofs_oz_mode() needs to allow path walks when the - * autofs mount is catatonic but the state of an autofs - * file system needs to be preserved over restarts. - */ - if (sbi->flags & AUTOFS_SBI_CATATONIC) - return -EACCES; - if (ino->count != 1) return -ENOTEMPTY; @@ -726,16 +717,6 @@ static int autofs_dir_mkdir(struct user_namespace *mnt_userns, struct autofs_info *p_ino; struct inode *inode; - if (!autofs_oz_mode(sbi)) - return -EACCES; - - /* autofs_oz_mode() needs to allow path walks when the - * autofs mount is catatonic but the state of an autofs - * file system needs to be preserved over restarts. - */ - if (sbi->flags & AUTOFS_SBI_CATATONIC) - return -EACCES; - pr_debug("dentry %p, creating %pd\n", dentry, dentry); BUG_ON(!ino); From 9ccbac76e71de411b9c4beea9d91ba98f3fad690 Mon Sep 17 00:00:00 2001 From: Ian Kent Date: Fri, 8 Jul 2022 09:43:06 +0800 Subject: [PATCH 48/72] autofs: make dentry info count consistent If an autofs dentry is a mount root directory there's no ->mkdir() call to set its count to one. To make the dentry info count consistent for all autofs dentries set count to one when the dentry info struct is allocated. Link: https://lkml.kernel.org/r/165724458671.30914.2902424437132835325.stgit@donald.themaw.net Signed-off-by: Ian Kent Cc: Al Viro Cc: David Howells Cc: Miklos Szeredi Signed-off-by: Andrew Morton --- fs/autofs/inode.c | 1 + fs/autofs/root.c | 4 ---- 2 files changed, 1 insertion(+), 4 deletions(-) diff --git a/fs/autofs/inode.c b/fs/autofs/inode.c index 9edf243713eb6e..affa70360b1f8f 100644 --- a/fs/autofs/inode.c +++ b/fs/autofs/inode.c @@ -20,6 +20,7 @@ struct autofs_info *autofs_new_ino(struct autofs_sb_info *sbi) INIT_LIST_HEAD(&ino->expiring); ino->last_used = jiffies; ino->sbi = sbi; + ino->count = 1; } return ino; } diff --git a/fs/autofs/root.c b/fs/autofs/root.c index fef6ed9910221d..442d27d9cb1b99 100644 --- a/fs/autofs/root.c +++ b/fs/autofs/root.c @@ -582,7 +582,6 @@ static int autofs_dir_symlink(struct user_namespace *mnt_userns, d_add(dentry, inode); dget(dentry); - ino->count++; p_ino = autofs_dentry_ino(dentry->d_parent); p_ino->count++; @@ -612,7 +611,6 @@ static int autofs_dir_unlink(struct inode *dir, struct dentry *dentry) struct autofs_info *ino = autofs_dentry_ino(dentry); struct autofs_info *p_ino; - ino->count--; p_ino = autofs_dentry_ino(dentry->d_parent); p_ino->count--; dput(ino->dentry); @@ -695,7 +693,6 @@ static int autofs_dir_rmdir(struct inode *dir, struct dentry *dentry) if (sbi->version < 5) autofs_clear_leaf_automount_flags(dentry); - ino->count--; p_ino = autofs_dentry_ino(dentry->d_parent); p_ino->count--; dput(ino->dentry); @@ -734,7 +731,6 @@ static int autofs_dir_mkdir(struct user_namespace *mnt_userns, autofs_set_leaf_automount_flags(dentry); dget(dentry); - ino->count++; p_ino = autofs_dentry_ino(dentry->d_parent); p_ino->count++; inc_nlink(dir); From a4a87303874c1a7d49cc18a8fe33676b0002ffbf Mon Sep 17 00:00:00 2001 From: Ian Kent Date: Fri, 8 Jul 2022 09:43:12 +0800 Subject: [PATCH 49/72] autofs: use dentry info count instead of simple_empty() The dentry info. field count is used to check if a dentry is in use during expire. But, to be used for this the count field must account for the presence of child dentries in a directory dentry. Therefore it can also be used to check for an empty directory dentry which can be done without having to to take an additional lock or account for the presence of a readdir cursor dentry as is done by simple_empty(). Link: https://lkml.kernel.org/r/165724459238.30914.1504611159945950108.stgit@donald.themaw.net Signed-off-by: Ian Kent Cc: Al Viro Cc: David Howells Cc: Miklos Szeredi Signed-off-by: Andrew Morton --- fs/autofs/autofs_i.h | 5 +++++ fs/autofs/expire.c | 2 +- fs/autofs/root.c | 18 ++++++++---------- 3 files changed, 14 insertions(+), 11 deletions(-) diff --git a/fs/autofs/autofs_i.h b/fs/autofs/autofs_i.h index 918826eaceea34..0117d6e0630089 100644 --- a/fs/autofs/autofs_i.h +++ b/fs/autofs/autofs_i.h @@ -148,6 +148,11 @@ static inline int autofs_oz_mode(struct autofs_sb_info *sbi) task_pgrp(current) == sbi->oz_pgrp); } +static inline bool autofs_empty(struct autofs_info *ino) +{ + return ino->count < 2; +} + struct inode *autofs_get_inode(struct super_block *, umode_t); void autofs_free_ino(struct autofs_info *); diff --git a/fs/autofs/expire.c b/fs/autofs/expire.c index b3fefd6237c362..038b3d2d9f572e 100644 --- a/fs/autofs/expire.c +++ b/fs/autofs/expire.c @@ -371,7 +371,7 @@ static struct dentry *should_expire(struct dentry *dentry, return NULL; } - if (simple_empty(dentry)) + if (autofs_empty(ino)) return NULL; /* Case 2: tree mount, expire iff entire tree is not busy */ diff --git a/fs/autofs/root.c b/fs/autofs/root.c index 442d27d9cb1b99..e0fa71eb5c0587 100644 --- a/fs/autofs/root.c +++ b/fs/autofs/root.c @@ -79,6 +79,7 @@ static int autofs_dir_open(struct inode *inode, struct file *file) { struct dentry *dentry = file->f_path.dentry; struct autofs_sb_info *sbi = autofs_sbi(dentry->d_sb); + struct autofs_info *ino = autofs_dentry_ino(dentry); pr_debug("file=%p dentry=%p %pd\n", file, dentry, dentry); @@ -95,7 +96,7 @@ static int autofs_dir_open(struct inode *inode, struct file *file) * it. */ spin_lock(&sbi->lookup_lock); - if (!path_is_mountpoint(&file->f_path) && simple_empty(dentry)) { + if (!path_is_mountpoint(&file->f_path) && autofs_empty(ino)) { spin_unlock(&sbi->lookup_lock); return -ENOENT; } @@ -364,7 +365,7 @@ static struct vfsmount *autofs_d_automount(struct path *path) * the mount never trigger mounts themselves (they have an * autofs trigger mount mounted on them). But v4 pseudo direct * mounts do need the leaves to trigger mounts. In this case - * we have no choice but to use the list_empty() check and + * we have no choice but to use the autofs_empty() check and * require user space behave. */ if (sbi->version > 4) { @@ -373,7 +374,7 @@ static struct vfsmount *autofs_d_automount(struct path *path) goto done; } } else { - if (!simple_empty(dentry)) { + if (!autofs_empty(ino)) { spin_unlock(&sbi->fs_lock); goto done; } @@ -428,9 +429,8 @@ static int autofs_d_manage(const struct path *path, bool rcu_walk) if (rcu_walk) { /* We don't need fs_lock in rcu_walk mode, - * just testing 'AUTOFS_INFO_NO_RCU' is enough. - * simple_empty() takes a spinlock, so leave it - * to last. + * just testing 'AUTOFS_INF_WANT_EXPIRE' is enough. + * * We only return -EISDIR when certain this isn't * a mount-trap. */ @@ -443,9 +443,7 @@ static int autofs_d_manage(const struct path *path, bool rcu_walk) inode = d_inode_rcu(dentry); if (inode && S_ISLNK(inode->i_mode)) return -EISDIR; - if (list_empty(&dentry->d_subdirs)) - return 0; - if (!simple_empty(dentry)) + if (!autofs_empty(ino)) return -EISDIR; return 0; } @@ -465,7 +463,7 @@ static int autofs_d_manage(const struct path *path, bool rcu_walk) * we can avoid needless calls ->d_automount() and avoid * an incorrect ELOOP error return. */ - if ((!path_is_mountpoint(path) && !simple_empty(dentry)) || + if ((!path_is_mountpoint(path) && !autofs_empty(ino)) || (d_really_is_positive(dentry) && d_is_symlink(dentry))) status = -EISDIR; } From ba97a0a3a31a2451607ebf601c0b7c4b1322ce9a Mon Sep 17 00:00:00 2001 From: Ian Kent Date: Fri, 8 Jul 2022 09:43:18 +0800 Subject: [PATCH 50/72] autofs: add comment about autofs_mountpoint_changed() The function autofs_mountpoint_changed() is unusual, add a comment about two cases for which it is needed. Link: https://lkml.kernel.org/r/165724459804.30914.10974834416046555127.stgit@donald.themaw.net Signed-off-by: Ian Kent Cc: Al Viro Cc: David Howells Cc: Miklos Szeredi Signed-off-by: Andrew Morton --- fs/autofs/root.c | 23 ++++++++++++++++++++--- 1 file changed, 20 insertions(+), 3 deletions(-) diff --git a/fs/autofs/root.c b/fs/autofs/root.c index e0fa71eb5c0587..ca03c1cae2be17 100644 --- a/fs/autofs/root.c +++ b/fs/autofs/root.c @@ -291,9 +291,26 @@ static struct dentry *autofs_mountpoint_changed(struct path *path) struct dentry *dentry = path->dentry; struct autofs_sb_info *sbi = autofs_sbi(dentry->d_sb); - /* - * If this is an indirect mount the dentry could have gone away - * as a result of an expire and a new one created. + /* If this is an indirect mount the dentry could have gone away + * and a new one created. + * + * This is unusual and I can't remember the case for which it + * was originally added now. But an example of how this can + * happen is an autofs indirect mount that has the "browse" + * option set and also has the "symlink" option in the autofs + * map entry. In this case the daemon will remove the browse + * directory and create a symlink as the mount leaving the + * struct path stale. + * + * Another not so obvious case is when a mount in an autofs + * indirect mount that uses the "nobrowse" option is being + * expired at the same time as a path walk. If the mount has + * been umounted but the mount point directory seen before + * becoming unhashed (during a lockless path walk) when a stat + * family system call is made the mount won't be re-mounted as + * it should. In this case the mount point that's been removed + * (by the daemon) will be stale and the a new mount point + * dentry created. */ if (autofs_type_indirect(sbi->type) && d_unhashed(dentry)) { struct dentry *parent = dentry->d_parent; From 7ffe4e90a061a2f612b3b8c29b583ec3b707781f Mon Sep 17 00:00:00 2001 From: Ian Kent Date: Fri, 8 Jul 2022 09:43:23 +0800 Subject: [PATCH 51/72] autofs: remove unused ino field inode Remove the unused inode field of the autofs dentry info structure. Link: https://lkml.kernel.org/r/165724460393.30914.6511330213821246793.stgit@donald.themaw.net Signed-off-by: Ian Kent Cc: Al Viro Cc: David Howells Cc: Miklos Szeredi Signed-off-by: Andrew Morton --- fs/autofs/autofs_i.h | 2 -- 1 file changed, 2 deletions(-) diff --git a/fs/autofs/autofs_i.h b/fs/autofs/autofs_i.h index 0117d6e0630089..d5a44fa88acf9a 100644 --- a/fs/autofs/autofs_i.h +++ b/fs/autofs/autofs_i.h @@ -51,8 +51,6 @@ extern struct file_system_type autofs_fs_type; */ struct autofs_info { struct dentry *dentry; - struct inode *inode; - int flags; struct completion expire_complete; From d919a1e79bac890421537cf02ae773007bf55e6b Mon Sep 17 00:00:00 2001 From: Zhihao Cheng Date: Wed, 13 Jul 2022 21:00:29 +0800 Subject: [PATCH 52/72] proc: fix a dentry lock race between release_task and lookup MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Commit 7bc3e6e55acf06 ("proc: Use a list of inodes to flush from proc") moved proc_flush_task() behind __exit_signal(). Then, process systemd can take long period high cpu usage during releasing task in following concurrent processes: systemd ps kernel_waitid stat(/proc/tgid) do_wait filename_lookup wait_consider_task lookup_fast release_task __exit_signal __unhash_process detach_pid __change_pid // remove task->pid_links d_revalidate -> pid_revalidate // 0 d_invalidate(/proc/tgid) shrink_dcache_parent(/proc/tgid) d_walk(/proc/tgid) spin_lock_nested(/proc/tgid/fd) // iterating opened fd proc_flush_pid | d_invalidate (/proc/tgid/fd) | shrink_dcache_parent(/proc/tgid/fd) | shrink_dentry_list(subdirs) ↓ shrink_lock_dentry(/proc/tgid/fd) --> race on dentry lock Function d_invalidate() will remove dentry from hash firstly, but why does proc_flush_pid() process dentry '/proc/tgid/fd' before dentry '/proc/tgid'? That's because proc_pid_make_inode() adds proc inode in reverse order by invoking hlist_add_head_rcu(). But proc should not add any inodes under '/proc/tgid' except '/proc/tgid/task/pid', fix it by adding inode into 'pid->inodes' only if the inode is /proc/tgid or /proc/tgid/task/pid. Performance regression: Create 200 tasks, each task open one file for 50,000 times. Kill all tasks when opened files exceed 10,000,000 (cat /proc/sys/fs/file-nr). Before fix: $ time killall -wq aa real 4m40.946s # During this period, we can see 'ps' and 'systemd' taking high cpu usage. After fix: $ time killall -wq aa real 1m20.732s # During this period, we can see 'systemd' taking high cpu usage. Link: https://lkml.kernel.org/r/20220713130029.4133533-1-chengzhihao1@huawei.com Fixes: 7bc3e6e55acf06 ("proc: Use a list of inodes to flush from proc") Link: https://bugzilla.kernel.org/show_bug.cgi?id=216054 Signed-off-by: Zhihao Cheng Signed-off-by: Zhang Yi Suggested-by: Brian Foster Reviewed-by: Brian Foster Cc: Al Viro Cc: Alexey Dobriyan Cc: Eric Biederman Cc: Matthew Wilcox Cc: Baoquan He Cc: Kalesh Singh Cc: Yu Kuai Signed-off-by: Andrew Morton --- fs/proc/base.c | 46 ++++++++++++++++++++++++++++++++++++++-------- 1 file changed, 38 insertions(+), 8 deletions(-) diff --git a/fs/proc/base.c b/fs/proc/base.c index 8dfa36a99c7421..93f7e3d971e4bb 100644 --- a/fs/proc/base.c +++ b/fs/proc/base.c @@ -1885,7 +1885,7 @@ void proc_pid_evict_inode(struct proc_inode *ei) put_pid(pid); } -struct inode *proc_pid_make_inode(struct super_block * sb, +struct inode *proc_pid_make_inode(struct super_block *sb, struct task_struct *task, umode_t mode) { struct inode * inode; @@ -1914,11 +1914,6 @@ struct inode *proc_pid_make_inode(struct super_block * sb, /* Let the pid remember us for quick removal */ ei->pid = pid; - if (S_ISDIR(mode)) { - spin_lock(&pid->lock); - hlist_add_head_rcu(&ei->sibling_inodes, &pid->inodes); - spin_unlock(&pid->lock); - } task_dump_owner(task, 0, &inode->i_uid, &inode->i_gid); security_task_to_inode(task, inode); @@ -1931,6 +1926,39 @@ struct inode *proc_pid_make_inode(struct super_block * sb, return NULL; } +/* + * Generating an inode and adding it into @pid->inodes, so that task will + * invalidate inode's dentry before being released. + * + * This helper is used for creating dir-type entries under '/proc' and + * '/proc//task'. Other entries(eg. fd, stat) under '/proc/' + * can be released by invalidating '/proc/' dentry. + * In theory, dentries under '/proc//task' can also be released by + * invalidating '/proc/' dentry, we reserve it to handle single + * thread exiting situation: Any one of threads should invalidate its + * '/proc//task/' dentry before released. + */ +static struct inode *proc_pid_make_base_inode(struct super_block *sb, + struct task_struct *task, umode_t mode) +{ + struct inode *inode; + struct proc_inode *ei; + struct pid *pid; + + inode = proc_pid_make_inode(sb, task, mode); + if (!inode) + return NULL; + + /* Let proc_flush_pid find this directory inode */ + ei = PROC_I(inode); + pid = ei->pid; + spin_lock(&pid->lock); + hlist_add_head_rcu(&ei->sibling_inodes, &pid->inodes); + spin_unlock(&pid->lock); + + return inode; +} + int pid_getattr(struct user_namespace *mnt_userns, const struct path *path, struct kstat *stat, u32 request_mask, unsigned int query_flags) { @@ -3369,7 +3397,8 @@ static struct dentry *proc_pid_instantiate(struct dentry * dentry, { struct inode *inode; - inode = proc_pid_make_inode(dentry->d_sb, task, S_IFDIR | S_IRUGO | S_IXUGO); + inode = proc_pid_make_base_inode(dentry->d_sb, task, + S_IFDIR | S_IRUGO | S_IXUGO); if (!inode) return ERR_PTR(-ENOENT); @@ -3671,7 +3700,8 @@ static struct dentry *proc_task_instantiate(struct dentry *dentry, struct task_struct *task, const void *ptr) { struct inode *inode; - inode = proc_pid_make_inode(dentry->d_sb, task, S_IFDIR | S_IRUGO | S_IXUGO); + inode = proc_pid_make_base_inode(dentry->d_sb, task, + S_IFDIR | S_IRUGO | S_IXUGO); if (!inode) return ERR_PTR(-ENOENT); From 3adb2d87238dea5e05bab747238bb47306b9cb56 Mon Sep 17 00:00:00 2001 From: Alexey Dobriyan Date: Tue, 12 Jul 2022 17:51:45 +0300 Subject: [PATCH 53/72] proc: fix test for "vsyscall=xonly" boot option Booting with vsyscall=xonly results in the following vsyscall VMA: ffffffffff600000-ffffffffff601000 --xp ... [vsyscall] Test does read from fixed vsyscall address to determine if kernel supports vsyscall page but it doesn't work because, well, vsyscall page is execute only. Fix test by trying to execute from the first byte of the page which contains gettimeofday() stub. This should work because vsyscall entry points have stable addresses by design. Alexey, avoiding parsing .config, /proc/config.gz and /proc/cmdline at all costs. Link: https://lkml.kernel.org/r/Ys2KgeiEMboU8Ytu@localhost.localdomain Signed-off-by: Alexey Dobriyan Cc: Shuah Khan Cc: Signed-off-by: Andrew Morton --- tools/testing/selftests/proc/proc-pid-vm.c | 75 ++++++++++++++++++++-- 1 file changed, 68 insertions(+), 7 deletions(-) diff --git a/tools/testing/selftests/proc/proc-pid-vm.c b/tools/testing/selftests/proc/proc-pid-vm.c index 28604c9f805c75..e5962f4794f566 100644 --- a/tools/testing/selftests/proc/proc-pid-vm.c +++ b/tools/testing/selftests/proc/proc-pid-vm.c @@ -211,10 +211,19 @@ static int make_exe(const uint8_t *payload, size_t len) } #endif -static bool g_vsyscall = false; +/* + * 0: vsyscall VMA doesn't exist vsyscall=none + * 1: vsyscall VMA is r-xp vsyscall=emulate + * 2: vsyscall VMA is --xp vsyscall=xonly + */ +static int g_vsyscall; +static const char *str_vsyscall; -static const char str_vsyscall[] = +static const char str_vsyscall_0[] = ""; +static const char str_vsyscall_1[] = "ffffffffff600000-ffffffffff601000 r-xp 00000000 00:00 0 [vsyscall]\n"; +static const char str_vsyscall_2[] = +"ffffffffff600000-ffffffffff601000 --xp 00000000 00:00 0 [vsyscall]\n"; #ifdef __x86_64__ static void sigaction_SIGSEGV(int _, siginfo_t *__, void *___) @@ -223,13 +232,47 @@ static void sigaction_SIGSEGV(int _, siginfo_t *__, void *___) } /* - * vsyscall page can't be unmapped, probe it with memory load. + * vsyscall page can't be unmapped, probe it directly. */ static void vsyscall(void) { pid_t pid; int wstatus; + pid = fork(); + if (pid < 0) { + fprintf(stderr, "fork, errno %d\n", errno); + exit(1); + } + if (pid == 0) { + struct rlimit rlim = {0, 0}; + (void)setrlimit(RLIMIT_CORE, &rlim); + + /* Hide "segfault at ffffffffff600000" messages. */ + struct sigaction act; + memset(&act, 0, sizeof(struct sigaction)); + act.sa_flags = SA_SIGINFO; + act.sa_sigaction = sigaction_SIGSEGV; + (void)sigaction(SIGSEGV, &act, NULL); + + /* gettimeofday(NULL, NULL); */ + asm volatile ( + "call %P0" + : + : "i" (0xffffffffff600000), "D" (NULL), "S" (NULL) + : "rax", "rcx", "r11" + ); + exit(0); + } + waitpid(pid, &wstatus, 0); + if (WIFEXITED(wstatus) && WEXITSTATUS(wstatus) == 0) { + /* vsyscall page exists and is executable. */ + } else { + /* vsyscall page doesn't exist. */ + g_vsyscall = 0; + return; + } + pid = fork(); if (pid < 0) { fprintf(stderr, "fork, errno %d\n", errno); @@ -251,8 +294,13 @@ static void vsyscall(void) } waitpid(pid, &wstatus, 0); if (WIFEXITED(wstatus) && WEXITSTATUS(wstatus) == 0) { - g_vsyscall = true; + /* vsyscall page is readable and executable. */ + g_vsyscall = 1; + return; } + + /* vsyscall page is executable but unreadable. */ + g_vsyscall = 2; } int main(void) @@ -261,6 +309,19 @@ int main(void) int exec_fd; vsyscall(); + switch (g_vsyscall) { + case 0: + str_vsyscall = str_vsyscall_0; + break; + case 1: + str_vsyscall = str_vsyscall_1; + break; + case 2: + str_vsyscall = str_vsyscall_2; + break; + default: + abort(); + } atexit(ate); @@ -314,7 +375,7 @@ int main(void) /* Test /proc/$PID/maps */ { - const size_t len = strlen(buf0) + (g_vsyscall ? strlen(str_vsyscall) : 0); + const size_t len = strlen(buf0) + strlen(str_vsyscall); char buf[256]; ssize_t rv; int fd; @@ -327,7 +388,7 @@ int main(void) rv = read(fd, buf, sizeof(buf)); assert(rv == len); assert(memcmp(buf, buf0, strlen(buf0)) == 0); - if (g_vsyscall) { + if (g_vsyscall > 0) { assert(memcmp(buf + strlen(buf0), str_vsyscall, strlen(str_vsyscall)) == 0); } } @@ -374,7 +435,7 @@ int main(void) assert(memmem(buf, rv, S[i], strlen(S[i]))); } - if (g_vsyscall) { + if (g_vsyscall > 0) { assert(memmem(buf, rv, str_vsyscall, strlen(str_vsyscall))); } } From 1298f83b546921506afa4050a833f21433ed4b88 Mon Sep 17 00:00:00 2001 From: "Souptick Joarder (HPE)" Date: Sun, 26 Jun 2022 07:51:14 +0530 Subject: [PATCH 54/72] ia64: old_rr4 added under CONFIG_HUGETLB_PAGE kernel test robot throws below warning -> arch/ia64/include/asm/mmu_context.h: In function 'reload_context': arch/ia64/include/asm/mmu_context.h:127:48: warning: variable 'old_rr4' set but not used [-Wunused-but-set-variable] 127 | unsigned long rr0, rr1, rr2, rr3, rr4, old_rr4; Add it under CONFIG_HUGETLB_PAGE Link: https://lkml.kernel.org/r/20220626022114.4020-1-jrdr.linux@gmail.com Signed-off-by: Souptick Joarder (HPE) Reported-by: kernel test robot Signed-off-by: Andrew Morton --- arch/ia64/include/asm/mmu_context.h | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/arch/ia64/include/asm/mmu_context.h b/arch/ia64/include/asm/mmu_context.h index 87a0d5bc11ef04..06257e355d0076 100644 --- a/arch/ia64/include/asm/mmu_context.h +++ b/arch/ia64/include/asm/mmu_context.h @@ -124,9 +124,12 @@ reload_context (nv_mm_context_t context) { unsigned long rid; unsigned long rid_incr = 0; - unsigned long rr0, rr1, rr2, rr3, rr4, old_rr4; + unsigned long rr0, rr1, rr2, rr3, rr4; +#ifdef CONFIG_HUGETLB_PAGE + unsigned long old_rr4; old_rr4 = ia64_get_rr(RGN_BASE(RGN_HPAGE)); +#endif rid = context << 3; /* make space for encoding the region number */ rid_incr = 1 << 8; From 233eb8d6894ed3349e9971a51dd8a9b5586e2598 Mon Sep 17 00:00:00 2001 From: Jiangshan Yi Date: Fri, 15 Jul 2022 14:00:35 +0800 Subject: [PATCH 55/72] fs/ocfs2: Fix spelling typo in comment Fix spelling typo in comment. Link: https://lkml.kernel.org/r/20220715060035.632903-1-13667453960@163.com Signed-off-by: Jiangshan Yi Reported-by: k2ci Acked-by: Joseph Qi Signed-off-by: Andrew Morton --- fs/ocfs2/quota_global.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/ocfs2/quota_global.c b/fs/ocfs2/quota_global.c index 0b6f551a342a17..dc9f76ab7e13c3 100644 --- a/fs/ocfs2/quota_global.c +++ b/fs/ocfs2/quota_global.c @@ -412,7 +412,7 @@ int ocfs2_global_read_info(struct super_block *sb, int type) goto out_err; } -/* Write information to global quota file. Expects exlusive lock on quota +/* Write information to global quota file. Expects exclusive lock on quota * file inode and quota info */ static int __ocfs2_global_write_info(struct super_block *sb, int type) { From 0c12185728d602c27cd12a845249e7f37197f71f Mon Sep 17 00:00:00 2001 From: Hsin-Yi Wang Date: Fri, 17 Jun 2022 16:38:09 +0800 Subject: [PATCH 56/72] Revert "squashfs: provide backing_dev_info in order to disable read-ahead" Patch series "Implement readahead for squashfs", v7. Commit 9eec1d897139("squashfs: provide backing_dev_info in order to disable read-ahead") mitigates the performance drop issue for squashfs by closing readahead for it. This series implements readahead callback for squashfs. This patch (of 4): This reverts 9eec1d897139e5 ("squashfs: provide backing_dev_info in order to disable read-ahead"). Revert closing the readahead to squashfs since the readahead callback for squashfs is implemented. Link: https://lkml.kernel.org/r/20220617083810.337573-1-hsinyi@chromium.org Link: https://lkml.kernel.org/r/20220617083810.337573-2-hsinyi@chromium.org Signed-off-by: Hsin-Yi Wang Suggested-by: Xiongwei Song Cc: Phillip Lougher Cc: Matthew Wilcox Cc: Marek Szyprowski Cc: Zheng Liang Cc: Zhang Yi Cc: Hou Tao Cc: Miao Xie Cc: kernel test robot Signed-off-by: Andrew Morton --- fs/squashfs/super.c | 33 --------------------------------- 1 file changed, 33 deletions(-) diff --git a/fs/squashfs/super.c b/fs/squashfs/super.c index 6d594ba2ed28ff..32565dafa7f3ba 100644 --- a/fs/squashfs/super.c +++ b/fs/squashfs/super.c @@ -29,7 +29,6 @@ #include #include #include -#include #include "squashfs_fs.h" #include "squashfs_fs_sb.h" @@ -113,24 +112,6 @@ static const struct squashfs_decompressor *supported_squashfs_filesystem( return decompressor; } -static int squashfs_bdi_init(struct super_block *sb) -{ - int err; - unsigned int major = MAJOR(sb->s_dev); - unsigned int minor = MINOR(sb->s_dev); - - bdi_put(sb->s_bdi); - sb->s_bdi = &noop_backing_dev_info; - - err = super_setup_bdi_name(sb, "squashfs_%u_%u", major, minor); - if (err) - return err; - - sb->s_bdi->ra_pages = 0; - sb->s_bdi->io_pages = 0; - - return 0; -} static int squashfs_fill_super(struct super_block *sb, struct fs_context *fc) { @@ -146,20 +127,6 @@ static int squashfs_fill_super(struct super_block *sb, struct fs_context *fc) TRACE("Entered squashfs_fill_superblock\n"); - /* - * squashfs provides 'backing_dev_info' in order to disable read-ahead. For - * squashfs, I/O is not deferred, it is done immediately in read_folio, - * which means the user would always have to wait their own I/O. So the effect - * of readahead is very weak for squashfs. squashfs_bdi_init will set - * sb->s_bdi->ra_pages and sb->s_bdi->io_pages to 0 and close readahead for - * squashfs. - */ - err = squashfs_bdi_init(sb); - if (err) { - errorf(fc, "squashfs init bdi failed"); - return err; - } - sb->s_fs_info = kzalloc(sizeof(*msblk), GFP_KERNEL); if (sb->s_fs_info == NULL) { ERROR("Failed to allocate squashfs_sb_info\n"); From db98b43086275350294f5c6f797249b714d6316d Mon Sep 17 00:00:00 2001 From: Phillip Lougher Date: Fri, 17 Jun 2022 16:38:11 +0800 Subject: [PATCH 57/72] squashfs: always build "file direct" version of page actor Squashfs_readahead uses the "file direct" version of the page actor, and so build it unconditionally. Link: https://lkml.kernel.org/r/20220617083810.337573-3-hsinyi@chromium.org Signed-off-by: Phillip Lougher Signed-off-by: Hsin-Yi Wang Reported-by: kernel test robot Cc: Hou Tao Cc: Marek Szyprowski Cc: Matthew Wilcox Cc: Miao Xie Cc: Xiongwei Song Cc: Zhang Yi Cc: Zheng Liang Signed-off-by: Andrew Morton --- fs/squashfs/Makefile | 4 ++-- fs/squashfs/page_actor.h | 46 ---------------------------------------- 2 files changed, 2 insertions(+), 48 deletions(-) diff --git a/fs/squashfs/Makefile b/fs/squashfs/Makefile index 7bd9b8b856d0bf..477c89a519ee88 100644 --- a/fs/squashfs/Makefile +++ b/fs/squashfs/Makefile @@ -5,9 +5,9 @@ obj-$(CONFIG_SQUASHFS) += squashfs.o squashfs-y += block.o cache.o dir.o export.o file.o fragment.o id.o inode.o -squashfs-y += namei.o super.o symlink.o decompressor.o +squashfs-y += namei.o super.o symlink.o decompressor.o page_actor.o squashfs-$(CONFIG_SQUASHFS_FILE_CACHE) += file_cache.o -squashfs-$(CONFIG_SQUASHFS_FILE_DIRECT) += file_direct.o page_actor.o +squashfs-$(CONFIG_SQUASHFS_FILE_DIRECT) += file_direct.o squashfs-$(CONFIG_SQUASHFS_DECOMP_SINGLE) += decompressor_single.o squashfs-$(CONFIG_SQUASHFS_DECOMP_MULTI) += decompressor_multi.o squashfs-$(CONFIG_SQUASHFS_DECOMP_MULTI_PERCPU) += decompressor_multi_percpu.o diff --git a/fs/squashfs/page_actor.h b/fs/squashfs/page_actor.h index 37523c54256fa7..24841d28bc0fb8 100644 --- a/fs/squashfs/page_actor.h +++ b/fs/squashfs/page_actor.h @@ -6,51 +6,6 @@ * Phillip Lougher */ -#ifndef CONFIG_SQUASHFS_FILE_DIRECT -struct squashfs_page_actor { - void **page; - int pages; - int length; - int next_page; -}; - -static inline struct squashfs_page_actor *squashfs_page_actor_init(void **page, - int pages, int length) -{ - struct squashfs_page_actor *actor = kmalloc(sizeof(*actor), GFP_KERNEL); - - if (actor == NULL) - return NULL; - - actor->length = length ? : pages * PAGE_SIZE; - actor->page = page; - actor->pages = pages; - actor->next_page = 0; - return actor; -} - -static inline void *squashfs_first_page(struct squashfs_page_actor *actor) -{ - actor->next_page = 1; - return actor->page[0]; -} - -static inline void *squashfs_next_page(struct squashfs_page_actor *actor) -{ - return actor->next_page == actor->pages ? NULL : - actor->page[actor->next_page++]; -} - -static inline void squashfs_finish_page(struct squashfs_page_actor *actor) -{ - /* empty */ -} - -static inline void squashfs_actor_nobuff(struct squashfs_page_actor *actor) -{ - /* empty */ -} -#else struct squashfs_page_actor { union { void **buffer; @@ -91,4 +46,3 @@ static inline void squashfs_actor_nobuff(struct squashfs_page_actor *actor) actor->alloc_buffer = 0; } #endif -#endif From 8fc78b6fe24c36b151ac98d7546591ed92083d4f Mon Sep 17 00:00:00 2001 From: Hsin-Yi Wang Date: Fri, 17 Jun 2022 16:38:13 +0800 Subject: [PATCH 58/72] squashfs: implement readahead Implement readahead callback for squashfs. It will read datablocks which cover pages in readahead request. For a few cases it will not mark page as uptodate, including: - file end is 0. - zero filled blocks. - current batch of pages isn't in the same datablock. - decompressor error. Otherwise pages will be marked as uptodate. The unhandled pages will be updated by readpage later. Link: https://lkml.kernel.org/r/20220617083810.337573-4-hsinyi@chromium.org Signed-off-by: Hsin-Yi Wang Suggested-by: Matthew Wilcox Reported-by: Matthew Wilcox Reported-by: Phillip Lougher Reported-by: Xiongwei Song Reported-by: Andrew Morton Cc: Hou Tao Cc: kernel test robot Cc: Marek Szyprowski Cc: Miao Xie Cc: Zhang Yi Cc: Zheng Liang Signed-off-by: Andrew Morton --- fs/squashfs/file.c | 92 +++++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 91 insertions(+), 1 deletion(-) diff --git a/fs/squashfs/file.c b/fs/squashfs/file.c index a8e495d8eb8600..128ebe9aded87d 100644 --- a/fs/squashfs/file.c +++ b/fs/squashfs/file.c @@ -39,6 +39,7 @@ #include "squashfs_fs_sb.h" #include "squashfs_fs_i.h" #include "squashfs.h" +#include "page_actor.h" /* * Locate cache slot in range [offset, index] for specified inode. If @@ -495,7 +496,96 @@ static int squashfs_read_folio(struct file *file, struct folio *folio) return 0; } +static void squashfs_readahead(struct readahead_control *ractl) +{ + struct inode *inode = ractl->mapping->host; + struct squashfs_sb_info *msblk = inode->i_sb->s_fs_info; + size_t mask = (1UL << msblk->block_log) - 1; + unsigned short shift = msblk->block_log - PAGE_SHIFT; + loff_t start = readahead_pos(ractl) & ~mask; + size_t len = readahead_length(ractl) + readahead_pos(ractl) - start; + struct squashfs_page_actor *actor; + unsigned int nr_pages = 0; + struct page **pages; + int i, file_end = i_size_read(inode) >> msblk->block_log; + unsigned int max_pages = 1UL << shift; + + readahead_expand(ractl, start, (len | mask) + 1); + + if (file_end == 0) + return; + + pages = kmalloc_array(max_pages, sizeof(void *), GFP_KERNEL); + if (!pages) + return; + + for (;;) { + pgoff_t index; + int res, bsize; + u64 block = 0; + unsigned int expected; + + nr_pages = __readahead_batch(ractl, pages, max_pages); + if (!nr_pages) + break; + + if (readahead_pos(ractl) >= i_size_read(inode)) + goto skip_pages; + + index = pages[0]->index >> shift; + if ((pages[nr_pages - 1]->index >> shift) != index) + goto skip_pages; + + expected = index == file_end ? + (i_size_read(inode) & (msblk->block_size - 1)) : + msblk->block_size; + + bsize = read_blocklist(inode, index, &block); + if (bsize == 0) + goto skip_pages; + + actor = squashfs_page_actor_init_special(msblk, pages, nr_pages, + expected); + if (!actor) + goto skip_pages; + + res = squashfs_read_data(inode->i_sb, block, bsize, NULL, actor); + + kfree(actor); + + if (res == expected) { + int bytes; + + /* Last page (if present) may have trailing bytes not filled */ + bytes = res % PAGE_SIZE; + if (pages[nr_pages - 1]->index == file_end && bytes) + memzero_page(pages[nr_pages - 1], bytes, + PAGE_SIZE - bytes); + + for (i = 0; i < nr_pages; i++) { + flush_dcache_page(pages[i]); + SetPageUptodate(pages[i]); + } + } + + for (i = 0; i < nr_pages; i++) { + unlock_page(pages[i]); + put_page(pages[i]); + } + } + + kfree(pages); + return; + +skip_pages: + for (i = 0; i < nr_pages; i++) { + unlock_page(pages[i]); + put_page(pages[i]); + } + kfree(pages); +} const struct address_space_operations squashfs_aops = { - .read_folio = squashfs_read_folio + .read_folio = squashfs_read_folio, + .readahead = squashfs_readahead }; From b09a7a036d2035b14636cd4c4c69518d73770f65 Mon Sep 17 00:00:00 2001 From: Phillip Lougher Date: Fri, 17 Jun 2022 16:38:15 +0800 Subject: [PATCH 59/72] squashfs: support reading fragments in readahead call Add a function which can be used to read fragments in the readahead call. This function is necessary because filesystems built with the -tailends (or -always-use-fragments) option may have fragments present which cannot be currently handled. Link: https://lkml.kernel.org/r/20220617083810.337573-5-hsinyi@chromium.org Signed-off-by: Phillip Lougher Signed-off-by: Hsin-Yi Wang Cc: Hou Tao Cc: kernel test robot Cc: Marek Szyprowski Cc: Matthew Wilcox Cc: Miao Xie Cc: Xiongwei Song Cc: Zhang Yi Cc: Zheng Liang Signed-off-by: Andrew Morton --- fs/squashfs/file.c | 47 +++++++++++++++++++++++++++++++++++++++++++--- 1 file changed, 44 insertions(+), 3 deletions(-) diff --git a/fs/squashfs/file.c b/fs/squashfs/file.c index 128ebe9aded87d..7ff0b03cceab01 100644 --- a/fs/squashfs/file.c +++ b/fs/squashfs/file.c @@ -496,6 +496,41 @@ static int squashfs_read_folio(struct file *file, struct folio *folio) return 0; } +static int squashfs_readahead_fragment(struct page **page, + unsigned int pages, unsigned int expected) +{ + struct inode *inode = page[0]->mapping->host; + struct squashfs_cache_entry *buffer = squashfs_get_fragment(inode->i_sb, + squashfs_i(inode)->fragment_block, + squashfs_i(inode)->fragment_size); + struct squashfs_sb_info *msblk = inode->i_sb->s_fs_info; + unsigned int n, mask = (1 << (msblk->block_log - PAGE_SHIFT)) - 1; + + if (buffer->error) + goto out; + + expected += squashfs_i(inode)->fragment_offset; + + for (n = 0; n < pages; n++) { + unsigned int base = (page[n]->index & mask) << PAGE_SHIFT; + unsigned int offset = base + squashfs_i(inode)->fragment_offset; + + if (expected > offset) { + unsigned int avail = min_t(unsigned int, expected - + offset, PAGE_SIZE); + + squashfs_fill_page(page[n], buffer, offset, avail); + } + + unlock_page(page[n]); + put_page(page[n]); + } + +out: + squashfs_cache_put(buffer); + return buffer->error; +} + static void squashfs_readahead(struct readahead_control *ractl) { struct inode *inode = ractl->mapping->host; @@ -512,9 +547,6 @@ static void squashfs_readahead(struct readahead_control *ractl) readahead_expand(ractl, start, (len | mask) + 1); - if (file_end == 0) - return; - pages = kmalloc_array(max_pages, sizeof(void *), GFP_KERNEL); if (!pages) return; @@ -540,6 +572,15 @@ static void squashfs_readahead(struct readahead_control *ractl) (i_size_read(inode) & (msblk->block_size - 1)) : msblk->block_size; + if (index == file_end && squashfs_i(inode)->fragment_block != + SQUASHFS_INVALID_BLK) { + res = squashfs_readahead_fragment(pages, nr_pages, + expected); + if (res) + goto skip_pages; + continue; + } + bsize = read_blocklist(inode, index, &block); if (bsize == 0) goto skip_pages; From a10c9ede9913fd54be61bbb01884e647e83dfcae Mon Sep 17 00:00:00 2001 From: Jiangshan Yi Date: Thu, 14 Jul 2022 09:54:41 +0800 Subject: [PATCH 60/72] lib/lzo/lzo1x_compress.c: replace ternary operator with min() and min_t() Fix the following coccicheck warning: lib/lzo/lzo1x_compress.c:54: WARNING opportunity for min(). lib/lzo/lzo1x_compress.c:329: WARNING opportunity for min(). min() and min_t() macro is defined in include/linux/minmax.h. It avoids multiple evaluations of the arguments when non-constant and performs strict type-checking. Link: https://lkml.kernel.org/r/20220714015441.1313036-1-13667453960@163.com Signed-off-by: Jiangshan Yi Tested-by: Dave Rodgman Cc: Jonathan Corbet Signed-off-by: Andrew Morton --- lib/lzo/lzo1x_compress.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/lib/lzo/lzo1x_compress.c b/lib/lzo/lzo1x_compress.c index 76758e9296ba65..9d31e7126606ac 100644 --- a/lib/lzo/lzo1x_compress.c +++ b/lib/lzo/lzo1x_compress.c @@ -50,9 +50,7 @@ lzo1x_1_do_compress(const unsigned char *in, size_t in_len, if (dv == 0 && bitstream_version) { const unsigned char *ir = ip + 4; - const unsigned char *limit = ip_end - < (ip + MAX_ZERO_RUN_LENGTH + 1) - ? ip_end : ip + MAX_ZERO_RUN_LENGTH + 1; + const unsigned char *limit = min(ip_end, ip + MAX_ZERO_RUN_LENGTH + 1); #if defined(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS) && \ defined(LZO_FAST_64BIT_MEMORY_ACCESS) u64 dv64; @@ -326,7 +324,7 @@ static int lzogeneric1x_1_compress(const unsigned char *in, size_t in_len, data_start = op; while (l > 20) { - size_t ll = l <= (m4_max_offset + 1) ? l : (m4_max_offset + 1); + size_t ll = min_t(size_t, l, m4_max_offset + 1); uintptr_t ll_end = (uintptr_t) ip + ll; if ((ll_end + ((t + ll) >> 5)) <= ll_end) break; From 591c32bddbe20ba0e172d9def3c7f22b9c926ad9 Mon Sep 17 00:00:00 2001 From: Ben Dooks Date: Thu, 14 Jul 2022 08:47:44 +0100 Subject: [PATCH 61/72] kernel/hung_task: fix address space of proc_dohung_task_timeout_secs The proc_dohung_task_timeout_secs() function is incorrectly marked as having a __user buffer as argument 3. However this is not the case and it is casing multiple sparse warnings. Fix the following warnings by removing __user from the argument: kernel/hung_task.c:237:52: warning: incorrect type in argument 3 (different address spaces) kernel/hung_task.c:237:52: expected void * kernel/hung_task.c:237:52: got void [noderef] __user *buffer kernel/hung_task.c:287:35: warning: incorrect type in initializer (incompatible argument 3 (different address spaces)) kernel/hung_task.c:287:35: expected int ( [usertype] *proc_handler )( ... ) kernel/hung_task.c:287:35: got int ( * )( ... ) kernel/hung_task.c:295:35: warning: incorrect type in initializer (incompatible argument 3 (different address spaces)) kernel/hung_task.c:295:35: expected int ( [usertype] *proc_handler )( ... ) kernel/hung_task.c:295:35: got int ( * )( ... ) Link: https://lkml.kernel.org/r/20220714074744.189017-1-ben.dooks@sifive.com Signed-off-by: Ben Dooks Cc: Signed-off-by: Andrew Morton --- kernel/hung_task.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/hung_task.c b/kernel/hung_task.c index cff3ae8c818fd3..bb2354f73dedca 100644 --- a/kernel/hung_task.c +++ b/kernel/hung_task.c @@ -229,7 +229,7 @@ static long hung_timeout_jiffies(unsigned long last_checked, * Process updating of timeout sysctl */ static int proc_dohung_task_timeout_secs(struct ctl_table *table, int write, - void __user *buffer, + void *buffer, size_t *lenp, loff_t *ppos) { int ret; From fa7d574ba4f4f3f4f78d432c8545d9045daa89b1 Mon Sep 17 00:00:00 2001 From: Xiu Jianfeng Date: Tue, 19 Jul 2022 16:33:49 +0800 Subject: [PATCH 62/72] bdi: remove enum wb_congested_state enum wb_congested_state and the member 'congested' in bdi_writeback are useless since commit a88f2096d5a2 ("remove congestion tracking framework"), so remove it. Link: https://lkml.kernel.org/r/20220719083349.87547-1-xiujianfeng@huawei.com Signed-off-by: Xiu Jianfeng Reviewed-by: Jan Kara Cc: NeilBrown Signed-off-by: Andrew Morton --- include/linux/backing-dev-defs.h | 7 ------- 1 file changed, 7 deletions(-) diff --git a/include/linux/backing-dev-defs.h b/include/linux/backing-dev-defs.h index e863c88df95f97..ae12696ec492c6 100644 --- a/include/linux/backing-dev-defs.h +++ b/include/linux/backing-dev-defs.h @@ -28,11 +28,6 @@ enum wb_state { WB_start_all, /* nr_pages == 0 (all) work pending */ }; -enum wb_congested_state { - WB_async_congested, /* The async (write) queue is getting full */ - WB_sync_congested, /* The sync queue is getting full */ -}; - enum wb_stat_item { WB_RECLAIMABLE, WB_WRITEBACK, @@ -122,8 +117,6 @@ struct bdi_writeback { atomic_t writeback_inodes; /* number of inodes under writeback */ struct percpu_counter stat[NR_WB_STAT_ITEMS]; - unsigned long congested; /* WB_[a]sync_congested flags */ - unsigned long bw_time_stamp; /* last time write bw is updated */ unsigned long dirtied_stamp; unsigned long written_stamp; /* pages written at bw_time_stamp */ From ed8fb78d7ecdeb3e2e86df0027e2c2cc55f9908b Mon Sep 17 00:00:00 2001 From: Alexey Dobriyan Date: Sat, 23 Jul 2022 20:09:07 +0300 Subject: [PATCH 63/72] proc: add some (hopefully) insightful comments * /proc/${pid}/net status * removing PDE vs last close stuff (again!) * random small stuff Link: https://lkml.kernel.org/r/YtwrM6sDC0OQ53YB@localhost.localdomain Signed-off-by: Alexey Dobriyan Signed-off-by: Andrew Morton --- fs/proc/array.c | 4 ++++ fs/proc/inode.c | 17 ++++++++++++----- fs/proc/proc_net.c | 6 ++++++ fs/proc/root.c | 5 +++++ 4 files changed, 27 insertions(+), 5 deletions(-) diff --git a/fs/proc/array.c b/fs/proc/array.c index 65fa603422e04d..99fcbfda8e2593 100644 --- a/fs/proc/array.c +++ b/fs/proc/array.c @@ -99,6 +99,10 @@ void proc_task_name(struct seq_file *m, struct task_struct *p, bool escape) { char tcomm[64]; + /* + * Test before PF_KTHREAD because all workqueue worker threads are + * kernel threads. + */ if (p->flags & PF_WQ_WORKER) wq_worker_comm(tcomm, sizeof(tcomm), p); else if (p->flags & PF_KTHREAD) diff --git a/fs/proc/inode.c b/fs/proc/inode.c index fd40d60169b5a2..f130499ad8432d 100644 --- a/fs/proc/inode.c +++ b/fs/proc/inode.c @@ -212,7 +212,15 @@ static void unuse_pde(struct proc_dir_entry *pde) complete(pde->pde_unload_completion); } -/* pde is locked on entry, unlocked on exit */ +/* + * At most 2 contexts can enter this function: the one doing the last + * close on the descriptor and whoever is deleting PDE itself. + * + * First to enter calls ->proc_release hook and signals its completion + * to the second one which waits and then does nothing. + * + * PDE is locked on entry, unlocked on exit. + */ static void close_pdeo(struct proc_dir_entry *pde, struct pde_opener *pdeo) __releases(&pde->pde_unload_lock) { @@ -222,9 +230,6 @@ static void close_pdeo(struct proc_dir_entry *pde, struct pde_opener *pdeo) * * rmmod (remove_proc_entry() et al) can't delete an entry and proceed: * "struct file" needs to be available at the right moment. - * - * Therefore, first process to enter this function does ->release() and - * signals its completion to the other process which does nothing. */ if (pdeo->closing) { /* somebody else is doing that, just wait */ @@ -238,10 +243,12 @@ static void close_pdeo(struct proc_dir_entry *pde, struct pde_opener *pdeo) pdeo->closing = true; spin_unlock(&pde->pde_unload_lock); + file = pdeo->file; pde->proc_ops->proc_release(file_inode(file), file); + spin_lock(&pde->pde_unload_lock); - /* After ->release. */ + /* Strictly after ->proc_release, see above. */ list_del(&pdeo->lh); c = pdeo->c; spin_unlock(&pde->pde_unload_lock); diff --git a/fs/proc/proc_net.c b/fs/proc/proc_net.c index bbce6fbe779c8c..856839b8ae8b7e 100644 --- a/fs/proc/proc_net.c +++ b/fs/proc/proc_net.c @@ -350,6 +350,12 @@ static __net_init int proc_net_ns_init(struct net *net) kgid_t gid; int err; + /* + * This PDE acts only as an anchor for /proc/${pid}/net hierarchy. + * Corresponding inode (PDE(inode) == net->proc_net) is never + * instantiated therefore blanket zeroing is fine. + * net->proc_net_stat inode is instantiated normally. + */ err = -ENOMEM; netd = kmem_cache_zalloc(proc_dir_entry_cache, GFP_KERNEL); if (!netd) diff --git a/fs/proc/root.c b/fs/proc/root.c index 5a7d15d197f8e0..3c2ee3eb1138aa 100644 --- a/fs/proc/root.c +++ b/fs/proc/root.c @@ -302,6 +302,11 @@ void __init proc_root_init(void) proc_mkdir("bus", NULL); proc_sys_init(); + /* + * Last things last. It is not like userspace processes eager + * to open /proc files exist at this point but register last + * anyway. + */ register_filesystem(&proc_fs_type); } From cf069c3b47fed4e475a13e3ec89451fbdb88869a Mon Sep 17 00:00:00 2001 From: Slark Xiao Date: Fri, 22 Jul 2022 18:19:22 +0800 Subject: [PATCH 64/72] lib/mpi: fix typo 'the the' in comment Replace 'the the' with 'the' in the comment. Link: https://lkml.kernel.org/r/20220722101922.81126-1-slark_xiao@163.com Signed-off-by: Slark Xiao Cc: Hongbo Li Cc: Herbert Xu Signed-off-by: Andrew Morton --- lib/mpi/mpiutil.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lib/mpi/mpiutil.c b/lib/mpi/mpiutil.c index bc81419f400c55..aa8c46544af8e0 100644 --- a/lib/mpi/mpiutil.c +++ b/lib/mpi/mpiutil.c @@ -272,7 +272,7 @@ MPI mpi_set_ui(MPI w, unsigned long u) if (!w) w = mpi_alloc(1); /* FIXME: If U is 0 we have no need to resize and thus possible - * allocating the the limbs. + * allocating the limbs. */ RESIZE_IF_NEEDED(w, 1); w->d[0] = u; From 97d3b2676fc6bc4865eb825037f4492f0fb804eb Mon Sep 17 00:00:00 2001 From: Christophe JAILLET Date: Thu, 21 Jul 2022 22:49:25 +0200 Subject: [PATCH 65/72] ocfs2: remove some useless functions Patch series "ocfs2: A few clean_ups", v2. __ocfs2_node_map_set_bit() and __ocfs2_node_map_clear_bit() are just wrapper around set_bit() and clear_bit(). The leading __ also makes think that these functions are non-atomic just like __set_bit() and __clear_bit(). So, just remove these wrappers and call set_bit() and clear_bit() directly. Link: https://lkml.kernel.org/r/cover.1658436259.git.christophe.jaillet@wanadoo.fr Link: https://lkml.kernel.org/r/bd1429c84ec7d174c96dbb67a2b42b1b456d9394.1658436259.git.christophe.jaillet@wanadoo.fr Signed-off-by: Christophe JAILLET Reviewed-by: Joseph Qi Cc: Mark Fasheh Cc: Joel Becker Cc: Junxiao Bi Cc: Changwei Ge Cc: Gang He Signed-off-by: Andrew Morton --- fs/ocfs2/heartbeat.c | 21 ++------------------- 1 file changed, 2 insertions(+), 19 deletions(-) diff --git a/fs/ocfs2/heartbeat.c b/fs/ocfs2/heartbeat.c index 9099d8fc759999..1d72e078894384 100644 --- a/fs/ocfs2/heartbeat.c +++ b/fs/ocfs2/heartbeat.c @@ -24,11 +24,6 @@ #include "buffer_head_io.h" -static inline void __ocfs2_node_map_set_bit(struct ocfs2_node_map *map, - int bit); -static inline void __ocfs2_node_map_clear_bit(struct ocfs2_node_map *map, - int bit); - /* special case -1 for now * TODO: should *really* make sure the calling func never passes -1!! */ static void ocfs2_node_map_init(struct ocfs2_node_map *map) @@ -65,12 +60,6 @@ void ocfs2_do_node_down(int node_num, void *data) ocfs2_recovery_thread(osb, node_num); } -static inline void __ocfs2_node_map_set_bit(struct ocfs2_node_map *map, - int bit) -{ - set_bit(bit, map->map); -} - void ocfs2_node_map_set_bit(struct ocfs2_super *osb, struct ocfs2_node_map *map, int bit) @@ -79,16 +68,10 @@ void ocfs2_node_map_set_bit(struct ocfs2_super *osb, return; BUG_ON(bit >= map->num_nodes); spin_lock(&osb->node_map_lock); - __ocfs2_node_map_set_bit(map, bit); + set_bit(bit, map->map); spin_unlock(&osb->node_map_lock); } -static inline void __ocfs2_node_map_clear_bit(struct ocfs2_node_map *map, - int bit) -{ - clear_bit(bit, map->map); -} - void ocfs2_node_map_clear_bit(struct ocfs2_super *osb, struct ocfs2_node_map *map, int bit) @@ -97,7 +80,7 @@ void ocfs2_node_map_clear_bit(struct ocfs2_super *osb, return; BUG_ON(bit >= map->num_nodes); spin_lock(&osb->node_map_lock); - __ocfs2_node_map_clear_bit(map, bit); + clear_bit(bit, map->map); spin_unlock(&osb->node_map_lock); } From 702f3cf374b85d2e77431c80e870ee31ea03cdd8 Mon Sep 17 00:00:00 2001 From: Christophe JAILLET Date: Thu, 21 Jul 2022 22:49:37 +0200 Subject: [PATCH 66/72] ocfs2: use the bitmap API to simplify code Use bitmap_zero() instead of hand-writing it. It is less verbose. While at it, add an explicit #include . Link: https://lkml.kernel.org/r/86d2a027c319db12055c98f00c65f7d01e703722.1658436259.git.christophe.jaillet@wanadoo.fr Signed-off-by: Christophe JAILLET Reviewed-by: Joseph Qi Cc: Mark Fasheh Cc: Joel Becker Cc: Junxiao Bi Cc: Changwei Ge Cc: Gang He Signed-off-by: Andrew Morton --- fs/ocfs2/heartbeat.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/fs/ocfs2/heartbeat.c b/fs/ocfs2/heartbeat.c index 1d72e078894384..dd29d60af1547d 100644 --- a/fs/ocfs2/heartbeat.c +++ b/fs/ocfs2/heartbeat.c @@ -8,6 +8,7 @@ * Copyright (C) 2002, 2004 Oracle. All rights reserved. */ +#include #include #include #include @@ -29,8 +30,7 @@ static void ocfs2_node_map_init(struct ocfs2_node_map *map) { map->num_nodes = OCFS2_NODE_MAP_MAX_NODES; - memset(map->map, 0, BITS_TO_LONGS(OCFS2_NODE_MAP_MAX_NODES) * - sizeof(unsigned long)); + bitmap_zero(map->map, OCFS2_NODE_MAP_MAX_NODES); } void ocfs2_init_node_maps(struct ocfs2_super *osb) From 45ee6d1e935d879d86aebd1fd15afb3bc015c4a0 Mon Sep 17 00:00:00 2001 From: Christophe JAILLET Date: Thu, 21 Jul 2022 22:49:48 +0200 Subject: [PATCH 67/72] ocfs2: fix a typo in a comment s/heartbaet/heartbeat Link: https://lkml.kernel.org/r/4d4a6786e8ad522bfad6d2401b7f6634f8af0e5d.1658436259.git.christophe.jaillet@wanadoo.fr Signed-off-by: Christophe JAILLET Reviewed-by: Joseph Qi Cc: Mark Fasheh Cc: Joel Becker Cc: Junxiao Bi Cc: Changwei Ge Cc: Gang He Signed-off-by: Andrew Morton --- fs/ocfs2/heartbeat.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/ocfs2/heartbeat.c b/fs/ocfs2/heartbeat.c index dd29d60af1547d..22da768e65b7ce 100644 --- a/fs/ocfs2/heartbeat.c +++ b/fs/ocfs2/heartbeat.c @@ -2,7 +2,7 @@ /* * heartbeat.c * - * Register ourselves with the heartbaet service, keep our node maps + * Register ourselves with the heartbeat service, keep our node maps * up to date, and fire off recovery when needed. * * Copyright (C) 2002, 2004 Oracle. All rights reserved. From 787dbea11a5d6843999ff71a3fb9aa1ed6d5d889 Mon Sep 17 00:00:00 2001 From: Ben Dooks Date: Thu, 21 Jul 2022 20:55:09 +0100 Subject: [PATCH 68/72] profile: setup_profiling_timer() is moslty not implemented The setup_profiling_timer() is mostly un-implemented by many architectures. In many places it isn't guarded by CONFIG_PROFILE which is needed for it to be used. Make it a weak symbol in kernel/profile.c and remove the 'return -EINVAL' implementations from the kenrel. There are a couple of architectures which do return 0 from the setup_profiling_timer() function but they don't seem to do anything else with it. To keep the /proc compatibility for now, leave these for a future update or removal. On ARM, this fixes the following sparse warning: arch/arm/kernel/smp.c:793:5: warning: symbol 'setup_profiling_timer' was not declared. Should it be static? Link: https://lkml.kernel.org/r/20220721195509.418205-1-ben-linux@fluff.org Signed-off-by: Ben Dooks Signed-off-by: Andrew Morton --- arch/alpha/kernel/smp.c | 6 ------ arch/arc/kernel/smp.c | 8 -------- arch/arm/kernel/smp.c | 8 -------- arch/arm64/kernel/smp.c | 8 -------- arch/csky/kernel/smp.c | 5 ----- arch/hexagon/kernel/smp.c | 5 ----- arch/ia64/kernel/smp.c | 6 ------ arch/openrisc/kernel/smp.c | 6 ------ arch/parisc/kernel/smp.c | 7 ------- arch/powerpc/kernel/smp.c | 7 ------- arch/riscv/kernel/smp.c | 6 ------ arch/sparc/kernel/smp_32.c | 5 ----- arch/sparc/kernel/smp_64.c | 6 ------ arch/x86/include/asm/apic.h | 2 -- arch/x86/kernel/apic/apic.c | 5 ----- kernel/profile.c | 8 ++++++-- 16 files changed, 6 insertions(+), 92 deletions(-) diff --git a/arch/alpha/kernel/smp.c b/arch/alpha/kernel/smp.c index cb64e4797d2a89..f4e20f75438f8b 100644 --- a/arch/alpha/kernel/smp.c +++ b/arch/alpha/kernel/smp.c @@ -497,12 +497,6 @@ smp_cpus_done(unsigned int max_cpus) ((bogosum + 2500) / (5000/HZ)) % 100); } -int -setup_profiling_timer(unsigned int multiplier) -{ - return -EINVAL; -} - static void send_ipi_message(const struct cpumask *to_whom, enum ipi_message_type operation) { diff --git a/arch/arc/kernel/smp.c b/arch/arc/kernel/smp.c index d947473f1e6da5..ab9e75e90f729d 100644 --- a/arch/arc/kernel/smp.c +++ b/arch/arc/kernel/smp.c @@ -232,14 +232,6 @@ int __cpu_up(unsigned int cpu, struct task_struct *idle) return 0; } -/* - * not supported here - */ -int setup_profiling_timer(unsigned int multiplier) -{ - return -EINVAL; -} - /*****************************************************************************/ /* Inter Processor Interrupt Handling */ /*****************************************************************************/ diff --git a/arch/arm/kernel/smp.c b/arch/arm/kernel/smp.c index 73fc645fc4c7e3..978db2d96b4469 100644 --- a/arch/arm/kernel/smp.c +++ b/arch/arm/kernel/smp.c @@ -787,14 +787,6 @@ void panic_smp_self_stop(void) cpu_relax(); } -/* - * not supported here - */ -int setup_profiling_timer(unsigned int multiplier) -{ - return -EINVAL; -} - #ifdef CONFIG_CPU_FREQ static DEFINE_PER_CPU(unsigned long, l_p_j_ref); diff --git a/arch/arm64/kernel/smp.c b/arch/arm64/kernel/smp.c index 62ed361a4376ba..ffc5d76cf69555 100644 --- a/arch/arm64/kernel/smp.c +++ b/arch/arm64/kernel/smp.c @@ -1078,14 +1078,6 @@ bool smp_crash_stop_failed(void) } #endif -/* - * not supported here - */ -int setup_profiling_timer(unsigned int multiplier) -{ - return -EINVAL; -} - static bool have_cpu_die(void) { #ifdef CONFIG_HOTPLUG_CPU diff --git a/arch/csky/kernel/smp.c b/arch/csky/kernel/smp.c index 6bb38bc2f39b43..4b605aa2e1d65d 100644 --- a/arch/csky/kernel/smp.c +++ b/arch/csky/kernel/smp.c @@ -243,11 +243,6 @@ void __init smp_cpus_done(unsigned int max_cpus) { } -int setup_profiling_timer(unsigned int multiplier) -{ - return -EINVAL; -} - void csky_start_secondary(void) { struct mm_struct *mm = &init_mm; diff --git a/arch/hexagon/kernel/smp.c b/arch/hexagon/kernel/smp.c index 619c56420aa0c7..4ba93e59370c41 100644 --- a/arch/hexagon/kernel/smp.c +++ b/arch/hexagon/kernel/smp.c @@ -240,11 +240,6 @@ void arch_send_call_function_ipi_mask(const struct cpumask *mask) send_ipi(mask, IPI_CALL_FUNC); } -int setup_profiling_timer(unsigned int multiplier) -{ - return -EINVAL; -} - void smp_start_cpus(void) { int i; diff --git a/arch/ia64/kernel/smp.c b/arch/ia64/kernel/smp.c index 7b7b64eb312975..e2cc59db86bc2d 100644 --- a/arch/ia64/kernel/smp.c +++ b/arch/ia64/kernel/smp.c @@ -333,9 +333,3 @@ smp_send_stop (void) { send_IPI_allbutself(IPI_CPU_STOP); } - -int -setup_profiling_timer (unsigned int multiplier) -{ - return -EINVAL; -} diff --git a/arch/openrisc/kernel/smp.c b/arch/openrisc/kernel/smp.c index 27041db2c8b0f5..e1419095a6f0af 100644 --- a/arch/openrisc/kernel/smp.c +++ b/arch/openrisc/kernel/smp.c @@ -197,12 +197,6 @@ void smp_send_stop(void) smp_call_function(stop_this_cpu, NULL, 0); } -/* not supported, yet */ -int setup_profiling_timer(unsigned int multiplier) -{ - return -EINVAL; -} - void __init set_smp_cross_call(void (*fn)(const struct cpumask *, unsigned int)) { smp_cross_call = fn; diff --git a/arch/parisc/kernel/smp.c b/arch/parisc/kernel/smp.c index 24d0744c3b3abc..7dbd92cafae38a 100644 --- a/arch/parisc/kernel/smp.c +++ b/arch/parisc/kernel/smp.c @@ -513,10 +513,3 @@ void __cpu_die(unsigned int cpu) pdc_cpu_rendezvous_unlock(); } - -#ifdef CONFIG_PROC_FS -int setup_profiling_timer(unsigned int multiplier) -{ - return -EINVAL; -} -#endif diff --git a/arch/powerpc/kernel/smp.c b/arch/powerpc/kernel/smp.c index bcefab484ea61d..c037c26540ddc6 100644 --- a/arch/powerpc/kernel/smp.c +++ b/arch/powerpc/kernel/smp.c @@ -1674,13 +1674,6 @@ void start_secondary(void *unused) BUG(); } -#ifdef CONFIG_PROFILING -int setup_profiling_timer(unsigned int multiplier) -{ - return 0; -} -#endif - static void __init fixup_topology(void) { int i; diff --git a/arch/riscv/kernel/smp.c b/arch/riscv/kernel/smp.c index b5d30ea922925f..441d0ceb80adbc 100644 --- a/arch/riscv/kernel/smp.c +++ b/arch/riscv/kernel/smp.c @@ -64,12 +64,6 @@ bool arch_match_cpu_phys_id(int cpu, u64 phys_id) return phys_id == cpuid_to_hartid_map(cpu); } -/* Unsupported */ -int setup_profiling_timer(unsigned int multiplier) -{ - return -EINVAL; -} - static void ipi_stop(void) { set_cpu_online(smp_processor_id(), false); diff --git a/arch/sparc/kernel/smp_32.c b/arch/sparc/kernel/smp_32.c index 22b148e5a5f88c..ad8094d955eba6 100644 --- a/arch/sparc/kernel/smp_32.c +++ b/arch/sparc/kernel/smp_32.c @@ -174,11 +174,6 @@ void smp_call_function_interrupt(void) irq_exit(); } -int setup_profiling_timer(unsigned int multiplier) -{ - return -EINVAL; -} - void __init smp_prepare_cpus(unsigned int max_cpus) { int i, cpuid, extra; diff --git a/arch/sparc/kernel/smp_64.c b/arch/sparc/kernel/smp_64.c index a1f78e9ddaf371..a55295d1b92448 100644 --- a/arch/sparc/kernel/smp_64.c +++ b/arch/sparc/kernel/smp_64.c @@ -1186,12 +1186,6 @@ void __irq_entry smp_penguin_jailcell(int irq, struct pt_regs *regs) preempt_enable(); } -/* /proc/profile writes can call this, don't __init it please. */ -int setup_profiling_timer(unsigned int multiplier) -{ - return -EINVAL; -} - void __init smp_prepare_cpus(unsigned int max_cpus) { } diff --git a/arch/x86/include/asm/apic.h b/arch/x86/include/asm/apic.h index bd8ae0a7010ae5..3415321c8240c4 100644 --- a/arch/x86/include/asm/apic.h +++ b/arch/x86/include/asm/apic.h @@ -98,8 +98,6 @@ static inline bool apic_from_smp_config(void) #include #endif -extern int setup_profiling_timer(unsigned int); - static inline void native_apic_mem_write(u32 reg, u32 v) { volatile u32 *addr = (volatile u32 *)(APIC_BASE + reg); diff --git a/arch/x86/kernel/apic/apic.c b/arch/x86/kernel/apic/apic.c index 189d3a5e471adc..df764ceac2c85d 100644 --- a/arch/x86/kernel/apic/apic.c +++ b/arch/x86/kernel/apic/apic.c @@ -1115,11 +1115,6 @@ DEFINE_IDTENTRY_SYSVEC(sysvec_apic_timer_interrupt) set_irq_regs(old_regs); } -int setup_profiling_timer(unsigned int multiplier) -{ - return -EINVAL; -} - /* * Local APIC start and shutdown */ diff --git a/kernel/profile.c b/kernel/profile.c index ae82ddfc6a6845..7ea01ba30e757e 100644 --- a/kernel/profile.c +++ b/kernel/profile.c @@ -425,6 +425,12 @@ read_profile(struct file *file, char __user *buf, size_t count, loff_t *ppos) return read; } +/* default is to not implement this call */ +int __weak setup_profiling_timer(unsigned mult) +{ + return -EINVAL; +} + /* * Writing to /proc/profile resets the counters * @@ -435,8 +441,6 @@ static ssize_t write_profile(struct file *file, const char __user *buf, size_t count, loff_t *ppos) { #ifdef CONFIG_SMP - extern int setup_profiling_timer(unsigned int multiplier); - if (count == sizeof(int)) { unsigned int multiplier; From 50feece7f770cd5d850334716d71bb8ea4868810 Mon Sep 17 00:00:00 2001 From: Kirill Tkhai Date: Mon, 25 Jul 2022 23:37:15 +0300 Subject: [PATCH 69/72] mailmap: update Kirill's email I disconnected from both Virtuozzo and OpenVZ, so this updates my email to point to my own. I haven't used @openvz address for patches, so let's rewrite the line instead of to add a new one. CC all previous addresses. Link: https://lkml.kernel.org/r/14ca895b-e745-6ba2-8be8-652feacbc907@ya.ru Signed-off-by: Kirill Tkhai Signed-off-by: Andrew Morton --- .mailmap | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.mailmap b/.mailmap index 2ed1cf86917530..04561cd90a0994 100644 --- a/.mailmap +++ b/.mailmap @@ -221,7 +221,7 @@ Kees Cook Keith Busch Keith Busch Kenneth W Chen -Kirill Tkhai +Kirill Tkhai Konstantin Khlebnikov Konstantin Khlebnikov Koushik From 9f3cebf0bb84a2b94cb59df869f1463e4954e150 Mon Sep 17 00:00:00 2001 From: Brendan Higgins Date: Mon, 25 Jul 2022 17:58:33 -0400 Subject: [PATCH 70/72] mailmap: add linux.dev alias for Brendan Higgins Because of my new work remote setup at Google, I can no longer use command line tools with my google.com email address, for this reason I got a linux.dev account. So update the mailmap to show the new alias I will be using. Link: https://lkml.kernel.org/r/20220725215833.789133-1-brendan.higgins@linux.dev Signed-off-by: Brendan Higgins Reviewed-by: David Gow Cc: Shuah Khan Cc: Daniel Latypov Signed-off-by: Andrew Morton --- .mailmap | 1 + 1 file changed, 1 insertion(+) diff --git a/.mailmap b/.mailmap index 04561cd90a0994..23e98625d1ce6f 100644 --- a/.mailmap +++ b/.mailmap @@ -71,6 +71,7 @@ Boris Brezillon Boris Brezillon Boris Brezillon Boris Brezillon +Brendan Higgins Brian Avery Brian King Brian Silverman From 9f98911a9d6e08037903e8ecf44d50f4bcf2368d Mon Sep 17 00:00:00 2001 From: Brendan Higgins Date: Mon, 25 Jul 2022 18:07:37 -0400 Subject: [PATCH 71/72] MAINTAINERS: kunit: add David Gow as a maintainer of KUnit David has been a de facto maintainer of KUnit for a long time now. Formalize this in the MAINTAINERS file. Link: https://lkml.kernel.org/r/20220725220737.790976-1-brendan.higgins@linux.dev Signed-off-by: Brendan Higgins Reviewed-by: David Gow Cc: Shuah Khan Cc: Daniel Latypov Signed-off-by: Andrew Morton --- MAINTAINERS | 1 + 1 file changed, 1 insertion(+) diff --git a/MAINTAINERS b/MAINTAINERS index 52d1c5d0ca9514..ba745a624c756d 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -10818,6 +10818,7 @@ F: fs/smbfs_common/ KERNEL UNIT TESTING FRAMEWORK (KUnit) M: Brendan Higgins +M: David Gow L: linux-kselftest@vger.kernel.org L: kunit-dev@googlegroups.com S: Maintained From b99695580bfc1f91364023c673681ddb88e375dc Mon Sep 17 00:00:00 2001 From: Aaron Tomlin Date: Tue, 12 Jul 2022 12:02:48 +0100 Subject: [PATCH 72/72] scripts/gdb: ensure the absolute path is generated on initial source Post 'make scripts_gdb' a symbolic link to scripts/gdb/vmlinux-gdb.py is created. Currently 'os.path.dirname(__file__)' does not generate the absolute path to scripts/gdb resulting in the following: (gdb) source vmlinux-gdb.py Traceback (most recent call last): File "scripts/gdb/vmlinux-gdb.py", line 25, in import linux.utils ModuleNotFoundError: No module named 'linux' This patch ensures that the absolute path to scripts/gdb in relation to the given file is generated so each module can be located accordingly. Link: https://lkml.kernel.org/r/20220712110248.1404125-1-atomlin@redhat.com Signed-off-by: Aaron Tomlin Reviewed-by: Douglas Anderson Cc: Jan Kiszka Cc: Kieran Bingham Signed-off-by: Andrew Morton --- scripts/gdb/vmlinux-gdb.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/gdb/vmlinux-gdb.py b/scripts/gdb/vmlinux-gdb.py index 4136dc2c59df23..3e8d3669f0ce00 100644 --- a/scripts/gdb/vmlinux-gdb.py +++ b/scripts/gdb/vmlinux-gdb.py @@ -13,7 +13,7 @@ import os -sys.path.insert(0, os.path.dirname(__file__) + "/scripts/gdb") +sys.path.insert(0, os.path.dirname(os.path.abspath(__file__)) + "/scripts/gdb") try: gdb.parse_and_eval("0")