From dcea7964764aad41c2994084a4c0292371b14e36 Mon Sep 17 00:00:00 2001
From: Joe Perches <joe@perches.com>
Date: Wed, 25 May 2022 12:03:17 -0700
Subject: [PATCH 01/72] checkpatch: add XA_STATE and XA_STATE_ORDER to the
 macro declaration list

XA_STATE() and XA_STATE_ORDER() macro uses are declarations.

Add them to the declaration macro list to avoid suggesting a blank line
after declarations when used.

Link: https://lkml.kernel.org/r/144314f4bf2c58cf2336028a75a5127e848abd81.camel@perches.com
Signed-off-by: Joe Perches <joe@perches.com>
Reported-by: David Howells <dhowells@redhat.com>
Cc: Matthew Wilcox <willy@infradead.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 scripts/checkpatch.pl | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)
diff --git a/scripts/checkpatch.pl b/scripts/checkpatch.pl
index 503e8abbb2c1e4..205bf5055acffb 100755
--- a/scripts/checkpatch.pl
+++ b/scripts/checkpatch.pl
@@ -1042,7 +1042,8 @@ sub build_types {
 our $declaration_macros = qr{(?x:
 	(?:$Storage\s+)?(?:[A-Z_][A-Z0-9]*_){0,2}(?:DEFINE|DECLARE)(?:_[A-Z0-9]+){1,6}\s*\(|
 	(?:$Storage\s+)?[HLP]?LIST_HEAD\s*\(|
-	(?:SKCIPHER_REQUEST|SHASH_DESC|AHASH_REQUEST)_ON_STACK\s*\(
+	(?:SKCIPHER_REQUEST|SHASH_DESC|AHASH_REQUEST)_ON_STACK\s*\(|
+	(?:$Storage\s+)?(?:XA_STATE|XA_STATE_ORDER)\s*\(
 )};
 
 our %allow_repeated_words = (

From 0fe6ee8f123a4dfb529a5aff07536bb481f34043 Mon Sep 17 00:00:00 2001
From: Chen Zhongjin <chenzhongjin@huawei.com>
Date: Tue, 31 May 2022 09:28:54 +0800
Subject: [PATCH 02/72] profiling: fix shift too large makes kernel panic

Commit 2d186afd04d6 ("profiling: fix shift-out-of-bounds bugs") limits the
shift value to the range [0, BITS_PER_LONG - 1], i.e. [0, 63] on 64-bit.

However, syzbot found that the maximum shift value should be the number of
bits in (_etext - _stext).  If the shift is larger than this, "prof_len"
and hence "buffer_bytes" will be zero, resulting in kzalloc(0).  The
kernel then panics when it dereferences the returned pointer, whose value
is 16 (see the fault address 0x10 in the log below).

This can be easily reproduced by passing a large shift value such as 60
when enabling profiling and then running readprofile.

LOGS:
 BUG: kernel NULL pointer dereference, address: 0000000000000010
 #PF: supervisor write access in kernel mode
 #PF: error_code(0x0002) - not-present page
 PGD 6148067 P4D 6148067 PUD 6142067 PMD 0
 PREEMPT SMP
 CPU: 4 PID: 184 Comm: readprofile Not tainted 5.18.0+ #162
 Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS rel-1.15.0-0-g2dd4b9b3f840-prebuilt.qemu.org 04/01/2014
 RIP: 0010:read_profile+0x104/0x220
 RSP: 0018:ffffc900006fbe80 EFLAGS: 00000202
 RAX: 0000000000000000 RBX: 0000000000000000 RCX: 0000000000000000
 RDX: ffff888006150000 RSI: 0000000000000001 RDI: ffffffff82aba4a0
 RBP: 000000000188bb60 R08: 0000000000000010 R09: ffff888006151000
 R10: 0000000000000000 R11: 0000000000000000 R12: ffffffff82aba4a0
 R13: 0000000000000000 R14: ffffc900006fbf08 R15: 0000000000020c30
 FS:  000000000188a8c0(0000) GS:ffff88803ed00000(0000) knlGS:0000000000000000
 CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
 CR2: 0000000000000010 CR3: 0000000006144000 CR4: 00000000000006e0
 Call Trace:
  <TASK>
  proc_reg_read+0x56/0x70
  vfs_read+0x9a/0x1b0
  ksys_read+0xa1/0xe0
  ? fpregs_assert_state_consistent+0x1e/0x40
  do_syscall_64+0x3a/0x80
  entry_SYSCALL_64_after_hwframe+0x46/0xb0
 RIP: 0033:0x4d4b4e
 RSP: 002b:00007ffebb668d58 EFLAGS: 00000246 ORIG_RAX: 0000000000000000
 RAX: ffffffffffffffda RBX: 000000000188a8a0 RCX: 00000000004d4b4e
 RDX: 0000000000000400 RSI: 000000000188bb60 RDI: 0000000000000003
 RBP: 0000000000000003 R08: 000000000000006e R09: 0000000000000000
 R10: 0000000000000041 R11: 0000000000000246 R12: 000000000188bb60
 R13: 0000000000000400 R14: 0000000000000000 R15: 000000000188bb60
  </TASK>
 Modules linked in:
 CR2: 0000000000000010
Killed
 ---[ end trace 0000000000000000 ]---

Check prof_len in profile_init() to prevent it from being zero.

Link: https://lkml.kernel.org/r/20220531012854.229439-1-chenzhongjin@huawei.com
Fixes: 1da177e4c3f4 ("Linux-2.6.12-rc2")
Signed-off-by: Chen Zhongjin <chenzhongjin@huawei.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 kernel/profile.c | 7 +++++++
 1 file changed, 7 insertions(+)

diff --git a/kernel/profile.c b/kernel/profile.c
index 37640a0bd8a3c7..ae82ddfc6a6845 100644
--- a/kernel/profile.c
+++ b/kernel/profile.c
@@ -109,6 +109,13 @@ int __ref profile_init(void)
 
 	/* only text is profiled */
 	prof_len = (_etext - _stext) >> prof_shift;
+
+	if (!prof_len) {
+		pr_warn("profiling shift: %u too large\n", prof_shift);
+		prof_on = 0;
+		return -EINVAL;
+	}
+
 	buffer_bytes = prof_len*sizeof(atomic_t);
 
 	if (!alloc_cpumask_var(&prof_cpu_mask, GFP_KERNEL))

From 53fd5ffbb5197b8cc2d73d2bbc0f688afd45736c Mon Sep 17 00:00:00 2001
From: Junxiao Bi <junxiao.bi@oracle.com>
Date: Tue, 7 Jun 2022 10:12:26 -0700
Subject: [PATCH 03/72] ocfs2: kill EBUSY from dlmfs_evict_inode

When unlinking a dlmfs file, dlmfs_unlink() is invoked first, followed by
dlmfs_evict_inode().  user_dlm_destroy_lock() is invoked in both places,
and the second call, from dlmfs_evict_inode(), gets an EBUSY error
because USER_LOCK_IN_TEARDOWN is already set in the lockres.  This doesn't
affect functionality; the error log is just annoying.

Link: https://lkml.kernel.org/r/20220607171226.86672-1-junxiao.bi@oracle.com
Signed-off-by: Junxiao Bi <junxiao.bi@oracle.com>
Reviewed-by: Joseph Qi <joseph.qi@linux.alibaba.com>
Cc: Mark Fasheh <mark@fasheh.com>
Cc: Joel Becker <jlbec@evilplan.org>
Cc: Junxiao Bi <junxiao.bi@oracle.com>
Cc: Changwei Ge <gechangwei@live.cn>
Cc: Gang He <ghe@suse.com>
Cc: Jun Piao <piaojun@huawei.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 fs/ocfs2/dlmfs/dlmfs.c | 14 +++++++++++---
 1 file changed, 11 insertions(+), 3 deletions(-)

diff --git a/fs/ocfs2/dlmfs/dlmfs.c b/fs/ocfs2/dlmfs/dlmfs.c
index e360543ad7e714..8b2020f92b5f07 100644
--- a/fs/ocfs2/dlmfs/dlmfs.c
+++ b/fs/ocfs2/dlmfs/dlmfs.c
@@ -296,17 +296,25 @@ static void dlmfs_evict_inode(struct inode *inode)
 {
 	int status;
 	struct dlmfs_inode_private *ip;
+	struct user_lock_res *lockres;
+	int teardown;
 
 	clear_inode(inode);
 
 	mlog(0, "inode %lu\n", inode->i_ino);
 
 	ip = DLMFS_I(inode);
+	lockres = &ip->ip_lockres;
 
 	if (S_ISREG(inode->i_mode)) {
-		status = user_dlm_destroy_lock(&ip->ip_lockres);
-		if (status < 0)
-			mlog_errno(status);
+		spin_lock(&lockres->l_lock);
+		teardown = !!(lockres->l_flags & USER_LOCK_IN_TEARDOWN);
+		spin_unlock(&lockres->l_lock);
+		if (!teardown) {
+			status = user_dlm_destroy_lock(lockres);
+			if (status < 0)
+				mlog_errno(status);
+		}
 		iput(ip->ip_parent);
 		goto clear_fields;
 	}

From 0cc011c576aaa4de505046f7a6c90933d7c749a9 Mon Sep 17 00:00:00 2001
From: Guenter Roeck <linux@roeck-us.net>
Date: Tue, 31 May 2022 15:29:51 -0700
Subject: [PATCH 04/72] lib/list_debug.c: Detect uninitialized lists

In some circumstances, attempts are made to add entries to or to remove
entries from an uninitialized list.  A prime example is
amdgpu_bo_vm_destroy(): It is indirectly called from
ttm_bo_init_reserved() if that function fails, and tries to remove an
entry from a list.  However, that list is only initialized in
amdgpu_bo_create_vm() after the call to ttm_bo_init_reserved() returned
success.  This results in crashes such as

 BUG: kernel NULL pointer dereference, address: 0000000000000000
 #PF: supervisor read access in kernel mode
 #PF: error_code(0x0000) - not-present page
 PGD 0 P4D 0
 Oops: 0000 [#1] PREEMPT SMP NOPTI
 CPU: 1 PID: 1479 Comm: chrome Not tainted 5.10.110-15768-g29a72e65dae5
 Hardware name: Google Grunt/Grunt, BIOS Google_Grunt.11031.149.0 07/15/2020
 RIP: 0010:__list_del_entry_valid+0x26/0x7d
 ...
 Call Trace:
  amdgpu_bo_vm_destroy+0x48/0x8b
  ttm_bo_init_reserved+0x1d7/0x1e0
  amdgpu_bo_create+0x212/0x476
  ? amdgpu_bo_user_destroy+0x23/0x23
  ? kmem_cache_alloc+0x60/0x271
  amdgpu_bo_create_vm+0x40/0x7d
  amdgpu_vm_pt_create+0xe8/0x24b
 ...

Check if the list's prev and next pointers are NULL to catch such problems.

Link: https://lkml.kernel.org/r/20220531222951.92073-1-linux@roeck-us.net
Signed-off-by: Guenter Roeck <linux@roeck-us.net>
Cc: Steven Rostedt <rostedt@goodmis.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 lib/list_debug.c | 12 ++++++++++--
 1 file changed, 10 insertions(+), 2 deletions(-)

diff --git a/lib/list_debug.c b/lib/list_debug.c
index 9daa3fb9d1cd61..d98d43f80958b8 100644
--- a/lib/list_debug.c
+++ b/lib/list_debug.c
@@ -20,7 +20,11 @@
 bool __list_add_valid(struct list_head *new, struct list_head *prev,
 		      struct list_head *next)
 {
-	if (CHECK_DATA_CORRUPTION(next->prev != prev,
+	if (CHECK_DATA_CORRUPTION(prev == NULL,
+			"list_add corruption. prev is NULL.\n") ||
+	    CHECK_DATA_CORRUPTION(next == NULL,
+			"list_add corruption. next is NULL.\n") ||
+	    CHECK_DATA_CORRUPTION(next->prev != prev,
 			"list_add corruption. next->prev should be prev (%px), but was %px. (next=%px).\n",
 			prev, next->prev, next) ||
 	    CHECK_DATA_CORRUPTION(prev->next != next,
@@ -42,7 +46,11 @@ bool __list_del_entry_valid(struct list_head *entry)
 	prev = entry->prev;
 	next = entry->next;
 
-	if (CHECK_DATA_CORRUPTION(next == LIST_POISON1,
+	if (CHECK_DATA_CORRUPTION(next == NULL,
+			"list_del corruption, %px->next is NULL\n", entry) ||
+	    CHECK_DATA_CORRUPTION(prev == NULL,
+			"list_del corruption, %px->prev is NULL\n", entry) ||
+	    CHECK_DATA_CORRUPTION(next == LIST_POISON1,
 			"list_del corruption, %px->next is LIST_POISON1 (%px)\n",
 			entry, LIST_POISON1) ||
 	    CHECK_DATA_CORRUPTION(prev == LIST_POISON2,

From a91befde350375b1ff954635acdde14dc92cd9a8 Mon Sep 17 00:00:00 2001
From: wuchi <wuchi.zero@gmail.com>
Date: Sat, 4 Jun 2022 21:15:02 +0800
Subject: [PATCH 05/72] lib/flex_proportions.c: remove local_irq_ops in
 fprop_new_period()

Commit e78d4833c03e28 ("lib: Fix possible deadlock in flexible proportion
code") added the local_irq_save()/local_irq_restore() calls because the
lock taken by percpu_counter_{sum|add}() could deadlock against
interrupts.  Now that percpu_counter_{sum|add}() use
raw_spin_(un)lock_irq*, revert that commit and resolve the conflict.

Link: https://lkml.kernel.org/r/20220604131502.5190-1-wuchi.zero@gmail.com
Signed-off-by: wuchi <wuchi.zero@gmail.com>
Reviewed-by: Jan Kara <jack@suse.cz>
Cc: Christoph Hellwig <hch@lst.de>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 lib/flex_proportions.c | 10 ++--------
 1 file changed, 2 insertions(+), 8 deletions(-)

diff --git a/lib/flex_proportions.c b/lib/flex_proportions.c
index 53e7eb1dd76c96..05cccbcf1661a3 100644
--- a/lib/flex_proportions.c
+++ b/lib/flex_proportions.c
@@ -63,18 +63,13 @@ void fprop_global_destroy(struct fprop_global *p)
  */
 bool fprop_new_period(struct fprop_global *p, int periods)
 {
-	s64 events;
-	unsigned long flags;
+	s64 events = percpu_counter_sum(&p->events);
 
-	local_irq_save(flags);
-	events = percpu_counter_sum(&p->events);
 	/*
 	 * Don't do anything if there are no events.
 	 */
-	if (events <= 1) {
-		local_irq_restore(flags);
+	if (events <= 1)
 		return false;
-	}
 	write_seqcount_begin(&p->sequence);
 	if (periods < 64)
 		events -= events >> periods;
@@ -82,7 +77,6 @@ bool fprop_new_period(struct fprop_global *p, int periods)
 	percpu_counter_add(&p->events, -events);
 	p->period += periods;
 	write_seqcount_end(&p->sequence);
-	local_irq_restore(flags);
 
 	return true;
 }

From 4815a36009044ba69a9b8d781943ec6505c451a2 Mon Sep 17 00:00:00 2001
From: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
Date: Fri, 3 Jun 2022 20:10:12 +0300
Subject: [PATCH 06/72] include/linux/rbtree.h: replace kernel.h with the
 necessary inclusions

When kernel.h is used in the headers it adds a lot into dependency hell,
especially when circular dependencies are involved.

Replace kernel.h inclusion with the list of what is really being used.

Link: https://lkml.kernel.org/r/20220603171012.48880-1-andriy.shevchenko@linux.intel.com
Signed-off-by: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
Cc: Matthew Wilcox <willy@infradead.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/rbtree.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/include/linux/rbtree.h b/include/linux/rbtree.h
index 235047d7a1b5e8..f7edca369edadd 100644
--- a/include/linux/rbtree.h
+++ b/include/linux/rbtree.h
@@ -17,9 +17,9 @@
 #ifndef	_LINUX_RBTREE_H
 #define	_LINUX_RBTREE_H
 
+#include <linux/container_of.h>
 #include <linux/rbtree_types.h>
 
-#include <linux/kernel.h>
 #include <linux/stddef.h>
 #include <linux/rcupdate.h>
 

From 9776e3861e0e30330f6c8ca9c30348f336d24b1c Mon Sep 17 00:00:00 2001
From: Luc Van Oostenryck <luc.vanoostenryck@gmail.com>
Date: Sun, 5 Jun 2022 18:07:38 +0200
Subject: [PATCH 07/72] ia64: fix sparse warnings with cmpxchg() & xchg()
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

On IA64, new sparse warnings were issued after fixing some __rcu
annotations in kernel/bpf/.

These new warnings are false positives and appear on IA64 because on this
architecture, the macros for cmpxchg() and xchg() make casts that ignore
sparse annotations.

This patch contains the minimal patch to fix this issue: adding a missing
cast and some missing '__force'.

Link: https://lore.kernel.org/r/20220601120013.bq5a3ynbkc3hngm5@mail
Link: https://lkml.kernel.org/r/20220605160738.79736-1-luc.vanoostenryck@gmail.com
Signed-off-by: Luc Van Oostenryck <luc.vanoostenryck@gmail.com>
Reported-by: kernel test robot <lkp@intel.com>
Acked-by: Paul E. McKenney <paulmck@kernel.org>
Acked-by: Toke Høiland-Jørgensen <toke@redhat.com>
Cc: Arnd Bergmann <arnd@arndb.de>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 arch/ia64/include/uapi/asm/cmpxchg.h | 28 ++++++++++++++--------------
 1 file changed, 14 insertions(+), 14 deletions(-)

diff --git a/arch/ia64/include/uapi/asm/cmpxchg.h b/arch/ia64/include/uapi/asm/cmpxchg.h
index 2c2f3cfeaa77b3..ca2e0268534384 100644
--- a/arch/ia64/include/uapi/asm/cmpxchg.h
+++ b/arch/ia64/include/uapi/asm/cmpxchg.h
@@ -33,24 +33,24 @@ extern void ia64_xchg_called_with_bad_pointer(void);
 									\
 	switch (size) {							\
 	case 1:								\
-		__xchg_result = ia64_xchg1((__u8 *)ptr, x);		\
+		__xchg_result = ia64_xchg1((__u8 __force *)ptr, x);	\
 		break;							\
 									\
 	case 2:								\
-		__xchg_result = ia64_xchg2((__u16 *)ptr, x);		\
+		__xchg_result = ia64_xchg2((__u16 __force *)ptr, x);	\
 		break;							\
 									\
 	case 4:								\
-		__xchg_result = ia64_xchg4((__u32 *)ptr, x);		\
+		__xchg_result = ia64_xchg4((__u32 __force *)ptr, x);	\
 		break;							\
 									\
 	case 8:								\
-		__xchg_result = ia64_xchg8((__u64 *)ptr, x);		\
+		__xchg_result = ia64_xchg8((__u64 __force *)ptr, x);	\
 		break;							\
 	default:							\
 		ia64_xchg_called_with_bad_pointer();			\
 	}								\
-	__xchg_result;							\
+	(__typeof__ (*(ptr)) __force) __xchg_result;			\
 })
 
 #ifndef __KERNEL__
@@ -76,42 +76,42 @@ extern long ia64_cmpxchg_called_with_bad_pointer(void);
 									\
 	switch (size) {							\
 	case 1:								\
-		_o_ = (__u8) (long) (old);				\
+		_o_ = (__u8) (long __force) (old);			\
 		break;							\
 	case 2:								\
-		_o_ = (__u16) (long) (old);				\
+		_o_ = (__u16) (long __force) (old);			\
 		break;							\
 	case 4:								\
-		_o_ = (__u32) (long) (old);				\
+		_o_ = (__u32) (long __force) (old);			\
 		break;							\
 	case 8:								\
-		_o_ = (__u64) (long) (old);				\
+		_o_ = (__u64) (long __force) (old);			\
 		break;							\
 	default:							\
 		break;							\
 	}								\
 	switch (size) {							\
 	case 1:								\
-		_r_ = ia64_cmpxchg1_##sem((__u8 *) ptr, new, _o_);	\
+		_r_ = ia64_cmpxchg1_##sem((__u8 __force *) ptr, new, _o_);	\
 		break;							\
 									\
 	case 2:								\
-		_r_ = ia64_cmpxchg2_##sem((__u16 *) ptr, new, _o_);	\
+		_r_ = ia64_cmpxchg2_##sem((__u16 __force *) ptr, new, _o_);	\
 		break;							\
 									\
 	case 4:								\
-		_r_ = ia64_cmpxchg4_##sem((__u32 *) ptr, new, _o_);	\
+		_r_ = ia64_cmpxchg4_##sem((__u32 __force *) ptr, new, _o_);	\
 		break;							\
 									\
 	case 8:								\
-		_r_ = ia64_cmpxchg8_##sem((__u64 *) ptr, new, _o_);	\
+		_r_ = ia64_cmpxchg8_##sem((__u64 __force *) ptr, new, _o_);	\
 		break;							\
 									\
 	default:							\
 		_r_ = ia64_cmpxchg_called_with_bad_pointer();		\
 		break;							\
 	}								\
-	(__typeof__(old)) _r_;						\
+	(__typeof__(old) __force) _r_;					\
 })
 
 #define cmpxchg_acq(ptr, o, n)	\

From c0af32fdc625c0e7f03465a813b04cbfb5419a1e Mon Sep 17 00:00:00 2001
From: wuchi <wuchi.zero@gmail.com>
Date: Tue, 7 Jun 2022 21:35:56 +0800
Subject: [PATCH 08/72] lib/btree: simplify btree_{lookup|update}

btree_{lookup|update} both need to look up a node by key.  Move the common
part into a new function, btree_lookup_node(), to simplify the code.

Link: https://lkml.kernel.org/r/20220607133556.34732-1-wuchi.zero@gmail.com
Signed-off-by: wuchi <wuchi.zero@gmail.com>
Cc: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: Jiri Olsa <jolsa@redhat.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 lib/btree.c | 30 +++++++++++++-----------------
 1 file changed, 13 insertions(+), 17 deletions(-)

diff --git a/lib/btree.c b/lib/btree.c
index b4cf08a5c26789..a82100c73b5597 100644
--- a/lib/btree.c
+++ b/lib/btree.c
@@ -238,7 +238,7 @@ static int keyzero(struct btree_geo *geo, unsigned long *key)
 	return 1;
 }
 
-void *btree_lookup(struct btree_head *head, struct btree_geo *geo,
+static void *btree_lookup_node(struct btree_head *head, struct btree_geo *geo,
 		unsigned long *key)
 {
 	int i, height = head->height;
@@ -257,7 +257,16 @@ void *btree_lookup(struct btree_head *head, struct btree_geo *geo,
 		if (!node)
 			return NULL;
 	}
+	return node;
+}
 
+void *btree_lookup(struct btree_head *head, struct btree_geo *geo,
+		unsigned long *key)
+{
+	int i;
+	unsigned long *node;
+
+	node = btree_lookup_node(head, geo, key);
 	if (!node)
 		return NULL;
 
@@ -271,23 +280,10 @@ EXPORT_SYMBOL_GPL(btree_lookup);
 int btree_update(struct btree_head *head, struct btree_geo *geo,
 		 unsigned long *key, void *val)
 {
-	int i, height = head->height;
-	unsigned long *node = head->node;
-
-	if (height == 0)
-		return -ENOENT;
-
-	for ( ; height > 1; height--) {
-		for (i = 0; i < geo->no_pairs; i++)
-			if (keycmp(geo, node, i, key) <= 0)
-				break;
-		if (i == geo->no_pairs)
-			return -ENOENT;
-		node = bval(geo, node, i);
-		if (!node)
-			return -ENOENT;
-	}
+	int i;
+	unsigned long *node;
 
+	node = btree_lookup_node(head, geo, key);
 	if (!node)
 		return -ENOENT;
 

From d30dfd490f7dc4cb6a7c11a647bd1ff7a22139e7 Mon Sep 17 00:00:00 2001
From: Justin Stitt <jstitt007@gmail.com>
Date: Wed, 8 Jun 2022 15:35:39 -0700
Subject: [PATCH 09/72] include/uapi/linux/swab.h: move explicit cast outside
 ternary

A cast inside __builtin_constant_p doesn't do anything since it should
evaluate as constant at compile time irrespective of this cast.  Instead,
I moved this cast outside the ternary to ensure the return type is as
expected.

Additionally, if __HAVE_BUILTIN_BSWAP16__ was not defined then __swab16 is
actually returning an `int` not a `u16` due to integer promotion.

As Al Viro notes:
You *can't* get smaller-than-int out of ? :, same as you can't get it
out of addition, etc.

This also fixes some clang -Wformat warnings involving default
argument promotion.

Link: https://github.com/ClangBuiltLinux/linux/issues/378
Link: https://lkml.kernel.org/r/20220608223539.470472-1-justinstitt@google.com
Signed-off-by: Justin Stitt <jstitt007@gmail.com>
Suggested-by: Al Viro <viro@zeniv.linux.org.uk>
Suggested-by: Nathan Chancellor <nathan@kernel.org>
Reviewed-by: Nathan Chancellor <nathan@kernel.org>
Suggested-by: Nick Desaulniers <ndesaulniers@google.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/uapi/linux/swab.h | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/include/uapi/linux/swab.h b/include/uapi/linux/swab.h
index 7272f85d6d6ab5..0723a9cce747c8 100644
--- a/include/uapi/linux/swab.h
+++ b/include/uapi/linux/swab.h
@@ -102,7 +102,7 @@ static inline __attribute_const__ __u32 __fswahb32(__u32 val)
 #define __swab16(x) (__u16)__builtin_bswap16((__u16)(x))
 #else
 #define __swab16(x)				\
-	(__builtin_constant_p((__u16)(x)) ?	\
+	(__u16)(__builtin_constant_p(x) ?	\
 	___constant_swab16(x) :			\
 	__fswab16(x))
 #endif
@@ -115,7 +115,7 @@ static inline __attribute_const__ __u32 __fswahb32(__u32 val)
 #define __swab32(x) (__u32)__builtin_bswap32((__u32)(x))
 #else
 #define __swab32(x)				\
-	(__builtin_constant_p((__u32)(x)) ?	\
+	(__u32)(__builtin_constant_p(x) ?	\
 	___constant_swab32(x) :			\
 	__fswab32(x))
 #endif
@@ -128,7 +128,7 @@ static inline __attribute_const__ __u32 __fswahb32(__u32 val)
 #define __swab64(x) (__u64)__builtin_bswap64((__u64)(x))
 #else
 #define __swab64(x)				\
-	(__builtin_constant_p((__u64)(x)) ?	\
+	(__u64)(__builtin_constant_p(x) ?	\
 	___constant_swab64(x) :			\
 	__fswab64(x))
 #endif

From dabba87229411a5e9d20ac03ffc36463c53ae672 Mon Sep 17 00:00:00 2001
From: Pasha Tatashin <pasha.tatashin@soleen.com>
Date: Fri, 27 May 2022 02:55:34 +0000
Subject: [PATCH 10/72] fs/kernel_read_file: allow to read files up-to ssize_t

Patch series "Allow to kexec with initramfs larger than 2G", v2.

Currently, the largest initramfs that is supported by kexec_file_load()
syscall is 2G.

This is because kernel_read_file() returns int, and is limited to INT_MAX
or 2G.

On the other hand, there are kexec based boot loaders (e.g. u-root) that
may need to boot netboot images that might be larger than 2G.

The first patch changes the return type from int to ssize_t in
kernel_read_file* functions.

The second patch increases the maximum initramfs file size to 4G.

Tested: verified that kexec_file_load() works with a 4G initramfs
on x86_64.


This patch (of 2):

Currently, the maximum file size that is supported is 2G.  This may be too
small in some cases.  For example, kexec_file_load() system call loads
initramfs.  In some netboot cases initramfs can be rather large.

Allow reading up to SSIZE_MAX bytes.  The callers can still limit the
maximum file size via buf_size.

Link: https://lkml.kernel.org/r/20220527025535.3953665-1-pasha.tatashin@soleen.com
Link: https://lkml.kernel.org/r/20220527025535.3953665-2-pasha.tatashin@soleen.com
Signed-off-by: Pasha Tatashin <pasha.tatashin@soleen.com>
Cc: Al Viro <viro@zeniv.linux.org.uk>
Cc: Baoquan He <bhe@redhat.com>
Cc: "Eric W. Biederman" <ebiederm@xmission.com>
Cc: Greg Thelen <gthelen@google.com>
Cc: Sasha Levin <sashal@kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 fs/kernel_read_file.c            | 38 ++++++++++++++++----------------
 include/linux/kernel_read_file.h | 32 +++++++++++++--------------
 include/linux/limits.h           |  1 +
 3 files changed, 36 insertions(+), 35 deletions(-)

diff --git a/fs/kernel_read_file.c b/fs/kernel_read_file.c
index 1b07550485b964..5d826274570cab 100644
--- a/fs/kernel_read_file.c
+++ b/fs/kernel_read_file.c
@@ -29,15 +29,15 @@
  * change between calls to kernel_read_file().
  *
  * Returns number of bytes read (no single read will be bigger
- * than INT_MAX), or negative on error.
+ * than SSIZE_MAX), or negative on error.
  *
  */
-int kernel_read_file(struct file *file, loff_t offset, void **buf,
-		     size_t buf_size, size_t *file_size,
-		     enum kernel_read_file_id id)
+ssize_t kernel_read_file(struct file *file, loff_t offset, void **buf,
+			 size_t buf_size, size_t *file_size,
+			 enum kernel_read_file_id id)
 {
 	loff_t i_size, pos;
-	size_t copied;
+	ssize_t copied;
 	void *allocated = NULL;
 	bool whole_file;
 	int ret;
@@ -58,7 +58,7 @@ int kernel_read_file(struct file *file, loff_t offset, void **buf,
 		goto out;
 	}
 	/* The file is too big for sane activities. */
-	if (i_size > INT_MAX) {
+	if (i_size > SSIZE_MAX) {
 		ret = -EFBIG;
 		goto out;
 	}
@@ -124,12 +124,12 @@ int kernel_read_file(struct file *file, loff_t offset, void **buf,
 }
 EXPORT_SYMBOL_GPL(kernel_read_file);
 
-int kernel_read_file_from_path(const char *path, loff_t offset, void **buf,
-			       size_t buf_size, size_t *file_size,
-			       enum kernel_read_file_id id)
+ssize_t kernel_read_file_from_path(const char *path, loff_t offset, void **buf,
+				   size_t buf_size, size_t *file_size,
+				   enum kernel_read_file_id id)
 {
 	struct file *file;
-	int ret;
+	ssize_t ret;
 
 	if (!path || !*path)
 		return -EINVAL;
@@ -144,14 +144,14 @@ int kernel_read_file_from_path(const char *path, loff_t offset, void **buf,
 }
 EXPORT_SYMBOL_GPL(kernel_read_file_from_path);
 
-int kernel_read_file_from_path_initns(const char *path, loff_t offset,
-				      void **buf, size_t buf_size,
-				      size_t *file_size,
-				      enum kernel_read_file_id id)
+ssize_t kernel_read_file_from_path_initns(const char *path, loff_t offset,
+					  void **buf, size_t buf_size,
+					  size_t *file_size,
+					  enum kernel_read_file_id id)
 {
 	struct file *file;
 	struct path root;
-	int ret;
+	ssize_t ret;
 
 	if (!path || !*path)
 		return -EINVAL;
@@ -171,12 +171,12 @@ int kernel_read_file_from_path_initns(const char *path, loff_t offset,
 }
 EXPORT_SYMBOL_GPL(kernel_read_file_from_path_initns);
 
-int kernel_read_file_from_fd(int fd, loff_t offset, void **buf,
-			     size_t buf_size, size_t *file_size,
-			     enum kernel_read_file_id id)
+ssize_t kernel_read_file_from_fd(int fd, loff_t offset, void **buf,
+				 size_t buf_size, size_t *file_size,
+				 enum kernel_read_file_id id)
 {
 	struct fd f = fdget(fd);
-	int ret = -EBADF;
+	ssize_t ret = -EBADF;
 
 	if (!f.file || !(f.file->f_mode & FMODE_READ))
 		goto out;
diff --git a/include/linux/kernel_read_file.h b/include/linux/kernel_read_file.h
index 575ffa1031d348..90451e2e12bd19 100644
--- a/include/linux/kernel_read_file.h
+++ b/include/linux/kernel_read_file.h
@@ -35,21 +35,21 @@ static inline const char *kernel_read_file_id_str(enum kernel_read_file_id id)
 	return kernel_read_file_str[id];
 }
 
-int kernel_read_file(struct file *file, loff_t offset,
-		     void **buf, size_t buf_size,
-		     size_t *file_size,
-		     enum kernel_read_file_id id);
-int kernel_read_file_from_path(const char *path, loff_t offset,
-			       void **buf, size_t buf_size,
-			       size_t *file_size,
-			       enum kernel_read_file_id id);
-int kernel_read_file_from_path_initns(const char *path, loff_t offset,
-				      void **buf, size_t buf_size,
-				      size_t *file_size,
-				      enum kernel_read_file_id id);
-int kernel_read_file_from_fd(int fd, loff_t offset,
-			     void **buf, size_t buf_size,
-			     size_t *file_size,
-			     enum kernel_read_file_id id);
+ssize_t kernel_read_file(struct file *file, loff_t offset,
+			 void **buf, size_t buf_size,
+			 size_t *file_size,
+			 enum kernel_read_file_id id);
+ssize_t kernel_read_file_from_path(const char *path, loff_t offset,
+				   void **buf, size_t buf_size,
+				   size_t *file_size,
+				   enum kernel_read_file_id id);
+ssize_t kernel_read_file_from_path_initns(const char *path, loff_t offset,
+					  void **buf, size_t buf_size,
+					  size_t *file_size,
+					  enum kernel_read_file_id id);
+ssize_t kernel_read_file_from_fd(int fd, loff_t offset,
+				 void **buf, size_t buf_size,
+				 size_t *file_size,
+				 enum kernel_read_file_id id);
 
 #endif /* _LINUX_KERNEL_READ_FILE_H */
diff --git a/include/linux/limits.h b/include/linux/limits.h
index b568b9c30bbf58..f6bcc936901071 100644
--- a/include/linux/limits.h
+++ b/include/linux/limits.h
@@ -7,6 +7,7 @@
 #include <vdso/limits.h>
 
 #define SIZE_MAX	(~(size_t)0)
+#define SSIZE_MAX	((ssize_t)(SIZE_MAX >> 1))
 #define PHYS_ADDR_MAX	(~(phys_addr_t)0)
 
 #define U8_MAX		((u8)~0U)

From f4da7afe07523ff8930c4466b09a15db18508cd4 Mon Sep 17 00:00:00 2001
From: Pasha Tatashin <pasha.tatashin@soleen.com>
Date: Fri, 27 May 2022 02:55:35 +0000
Subject: [PATCH 11/72] kexec_file: increase maximum file size to 4G

In some cases the initrd can be large.  For example, it could be a netboot
image loaded by u-root, which then kexecs into it.

The maximum size of the initrd is arbitrarily set to 2G.  Also, the limit
is not very obvious because it is hidden behind a generic INT_MAX macro.

Theoretically, we could make it LONG_MAX, but it is safer to keep it sane,
and just increase it to 4G.

Increase the size to 4G, and make it obvious by having a new macro that
specifies the maximum file size supported by kexec_file_load() syscall:
KEXEC_FILE_SIZE_MAX.

Link: https://lkml.kernel.org/r/20220527025535.3953665-3-pasha.tatashin@soleen.com
Signed-off-by: Pasha Tatashin <pasha.tatashin@soleen.com>
Cc: Sasha Levin <sashal@kernel.org>
Cc: "Eric W. Biederman" <ebiederm@xmission.com>
Cc: Greg Thelen <gthelen@google.com>
Cc: Al Viro <viro@zeniv.linux.org.uk>
Cc: Baoquan He <bhe@redhat.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 kernel/kexec_file.c | 10 +++++++---
 1 file changed, 7 insertions(+), 3 deletions(-)

diff --git a/kernel/kexec_file.c b/kernel/kexec_file.c
index 145321a5e798a6..9b2839775c837b 100644
--- a/kernel/kexec_file.c
+++ b/kernel/kexec_file.c
@@ -31,6 +31,9 @@
 
 static int kexec_calculate_store_digests(struct kimage *image);
 
+/* Maximum size in bytes for kernel/initrd files. */
+#define KEXEC_FILE_SIZE_MAX	min_t(s64, 4LL << 30, SSIZE_MAX)
+
 /*
  * Currently this is the only default function that is exported as some
  * architectures need it to do additional handlings.
@@ -189,11 +192,12 @@ kimage_file_prepare_segments(struct kimage *image, int kernel_fd, int initrd_fd,
 			     const char __user *cmdline_ptr,
 			     unsigned long cmdline_len, unsigned flags)
 {
-	int ret;
+	ssize_t ret;
 	void *ldata;
 
 	ret = kernel_read_file_from_fd(kernel_fd, 0, &image->kernel_buf,
-				       INT_MAX, NULL, READING_KEXEC_IMAGE);
+				       KEXEC_FILE_SIZE_MAX, NULL,
+				       READING_KEXEC_IMAGE);
 	if (ret < 0)
 		return ret;
 	image->kernel_buf_len = ret;
@@ -213,7 +217,7 @@ kimage_file_prepare_segments(struct kimage *image, int kernel_fd, int initrd_fd,
 	/* It is possible that there no initramfs is being loaded */
 	if (!(flags & KEXEC_FILE_NO_INITRAMFS)) {
 		ret = kernel_read_file_from_fd(initrd_fd, 0, &image->initrd_buf,
-					       INT_MAX, NULL,
+					       KEXEC_FILE_SIZE_MAX, NULL,
 					       READING_KEXEC_INITRAMFS);
 		if (ret < 0)
 			goto out;

From 0aed4724a8392f2567f83c9c4b9decf447d752a2 Mon Sep 17 00:00:00 2001
From: cxbing <chenxuebing@jari.cn>
Date: Thu, 9 Jun 2022 07:44:59 -0700
Subject: [PATCH 12/72] delayacct: remove some unused variables

Drop the unused variables *done* and *count*.

Link: https://lkml.kernel.org/r/20220609144459.86379-1-zhangkkoo@126.com
Signed-off-by: cxbing <chenxuebing@jari.cn>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 tools/accounting/getdelays.c | 4 ----
 1 file changed, 4 deletions(-)

diff --git a/tools/accounting/getdelays.c b/tools/accounting/getdelays.c
index e83e6e47a21ea1..938dec0dfaad84 100644
--- a/tools/accounting/getdelays.c
+++ b/tools/accounting/getdelays.c
@@ -45,7 +45,6 @@
 		exit(code);			\
 	} while (0)
 
-int done;
 int rcvbufsz;
 char name[100];
 int dbg;
@@ -285,7 +284,6 @@ int main(int argc, char *argv[])
 	pid_t rtid = 0;
 
 	int fd = 0;
-	int count = 0;
 	int write_file = 0;
 	int maskset = 0;
 	char *logfile = NULL;
@@ -495,7 +493,6 @@ int main(int argc, char *argv[])
 				len2 = 0;
 				/* For nested attributes, na follows */
 				na = (struct nlattr *) NLA_DATA(na);
-				done = 0;
 				while (len2 < aggr_len) {
 					switch (na->nla_type) {
 					case TASKSTATS_TYPE_PID:
@@ -509,7 +506,6 @@ int main(int argc, char *argv[])
 							printf("TGID\t%d\n", rtid);
 						break;
 					case TASKSTATS_TYPE_STATS:
-						count++;
 						if (print_delays)
 							print_delayacct((struct taskstats *) NLA_DATA(na));
 						if (print_io_accounting)

From f268eedddf3595e85f8883dc50aed29654785696 Mon Sep 17 00:00:00 2001
From: Phillip Lougher <phillip@squashfs.org.uk>
Date: Sat, 11 Jun 2022 04:21:32 +0100
Subject: [PATCH 13/72] squashfs: extend "page actor" to handle missing pages

Patch series "Squashfs: handle missing pages decompressing into page
cache".

This patchset enables Squashfs to handle missing pages when directly
decompressing datablocks into the page cache.

Previously if the full set of pages needed was not available, Squashfs
would have to fall back to using an intermediate buffer (the older
method), which is slower, involving a memcopy, and it introduces
contention on a shared buffer.

The first patch extends the "page actor" code to handle missing pages.

The second patch updates squashfs_readpage_block() to use the new
functionality, and removes the code that falls back to using an
intermediate buffer.

This patchset is independent of the readahead work, and it is standalone.
It can be merged on its own.

But the readahead patch for efficiency also needs this patch-set.


This patch (of 2):

This patch extends the "page actor" code to handle missing pages.

Previously if the full set of pages needed to decompress a Squashfs
datablock was unavailable, this would cause decompression to fail on the
missing pages.

In this case direct decompression into the page cache could not be
achieved and the code would fall back to using the older intermediate
buffer method.

With this patch, direct decompression into the page cache can be achieved
with missing pages.

For "multi-shot" decompressors (zlib, xz, zstd), the page actor will
allocate a temporary buffer which is passed to the decompressor, and then
freed by the page actor.

For "single shot" decompressors (lz4, lzo), which decompress into a
contiguous "bounce buffer" that is then copied into the page cache, it
would be pointless to allocate a temporary buffer, memcpy into it, and
then free it.  For these decompressors -ENOMEM is returned, which
signifies that the memcpy for that page should be skipped.

This also happens if the data block is uncompressed.

Link: https://lkml.kernel.org/r/20220611032133.5743-1-phillip@squashfs.org.uk
Link: https://lkml.kernel.org/r/20220611032133.5743-2-phillip@squashfs.org.uk
Signed-off-by: Phillip Lougher <phillip@squashfs.org.uk>
Cc: Matthew Wilcox (Oracle) <willy@infradead.org>
Cc: Hsin-Yi Wang <hsinyi@chromium.org>
Cc: Xiongwei Song <Xiongwei.Song@windriver.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 fs/squashfs/block.c        | 10 ++++---
 fs/squashfs/decompressor.h |  1 +
 fs/squashfs/file_direct.c  | 21 ++++++++-------
 fs/squashfs/lz4_wrapper.c  |  7 +++--
 fs/squashfs/lzo_wrapper.c  |  7 +++--
 fs/squashfs/page_actor.c   | 55 ++++++++++++++++++++++++++++++++------
 fs/squashfs/page_actor.h   | 21 ++++++++++++---
 fs/squashfs/xz_wrapper.c   | 11 +++++++-
 fs/squashfs/zlib_wrapper.c | 12 ++++++++-
 fs/squashfs/zstd_wrapper.c | 12 ++++++++-
 10 files changed, 126 insertions(+), 31 deletions(-)

diff --git a/fs/squashfs/block.c b/fs/squashfs/block.c
index 8879d052f96c6a..833aca92301f0e 100644
--- a/fs/squashfs/block.c
+++ b/fs/squashfs/block.c
@@ -34,12 +34,15 @@ static int copy_bio_to_actor(struct bio *bio,
 			     struct squashfs_page_actor *actor,
 			     int offset, int req_length)
 {
-	void *actor_addr = squashfs_first_page(actor);
+	void *actor_addr;
 	struct bvec_iter_all iter_all = {};
 	struct bio_vec *bvec = bvec_init_iter_all(&iter_all);
 	int copied_bytes = 0;
 	int actor_offset = 0;
 
+	squashfs_actor_nobuff(actor);
+	actor_addr = squashfs_first_page(actor);
+
 	if (WARN_ON_ONCE(!bio_next_segment(bio, &iter_all)))
 		return 0;
 
@@ -49,8 +52,9 @@ static int copy_bio_to_actor(struct bio *bio,
 
 		bytes_to_copy = min_t(int, bytes_to_copy,
 				      req_length - copied_bytes);
-		memcpy(actor_addr + actor_offset, bvec_virt(bvec) + offset,
-		       bytes_to_copy);
+		if (!IS_ERR(actor_addr))
+			memcpy(actor_addr + actor_offset, bvec_virt(bvec) +
+					offset, bytes_to_copy);
 
 		actor_offset += bytes_to_copy;
 		copied_bytes += bytes_to_copy;
diff --git a/fs/squashfs/decompressor.h b/fs/squashfs/decompressor.h
index 1b9ccfd0aa519b..19ab608343895f 100644
--- a/fs/squashfs/decompressor.h
+++ b/fs/squashfs/decompressor.h
@@ -20,6 +20,7 @@ struct squashfs_decompressor {
 		struct bio *, int, int, struct squashfs_page_actor *);
 	int	id;
 	char	*name;
+	int	alloc_buffer;
 	int	supported;
 };
 
diff --git a/fs/squashfs/file_direct.c b/fs/squashfs/file_direct.c
index a4894cc5944719..5af5802f5626cb 100644
--- a/fs/squashfs/file_direct.c
+++ b/fs/squashfs/file_direct.c
@@ -47,14 +47,6 @@ int squashfs_readpage_block(struct page *target_page, u64 block, int bsize,
 	if (page == NULL)
 		return res;
 
-	/*
-	 * Create a "page actor" which will kmap and kunmap the
-	 * page cache pages appropriately within the decompressor
-	 */
-	actor = squashfs_page_actor_init_special(page, pages, 0);
-	if (actor == NULL)
-		goto out;
-
 	/* Try to grab all the pages covered by the Squashfs block */
 	for (missing_pages = 0, i = 0, n = start_index; i < pages; i++, n++) {
 		page[i] = (n == target_page->index) ? target_page :
@@ -89,8 +81,19 @@ int squashfs_readpage_block(struct page *target_page, u64 block, int bsize,
 		goto out;
 	}
 
+	/*
+	 * Create a "page actor" which will kmap and kunmap the
+	 * page cache pages appropriately within the decompressor
+	 */
+	actor = squashfs_page_actor_init_special(msblk, page, pages, 0);
+	if (actor == NULL)
+		goto out;
+
 	/* Decompress directly into the page cache buffers */
 	res = squashfs_read_data(inode->i_sb, block, bsize, NULL, actor);
+
+	kfree(actor);
+
 	if (res < 0)
 		goto mark_errored;
 
@@ -116,7 +119,6 @@ int squashfs_readpage_block(struct page *target_page, u64 block, int bsize,
 			put_page(page[i]);
 	}
 
-	kfree(actor);
 	kfree(page);
 
 	return 0;
@@ -135,7 +137,6 @@ int squashfs_readpage_block(struct page *target_page, u64 block, int bsize,
 	}
 
 out:
-	kfree(actor);
 	kfree(page);
 	return res;
 }
diff --git a/fs/squashfs/lz4_wrapper.c b/fs/squashfs/lz4_wrapper.c
index b685b6238316c7..49797729f14383 100644
--- a/fs/squashfs/lz4_wrapper.c
+++ b/fs/squashfs/lz4_wrapper.c
@@ -119,10 +119,12 @@ static int lz4_uncompress(struct squashfs_sb_info *msblk, void *strm,
 	buff = stream->output;
 	while (data) {
 		if (bytes <= PAGE_SIZE) {
-			memcpy(data, buff, bytes);
+			if (!IS_ERR(data))
+				memcpy(data, buff, bytes);
 			break;
 		}
-		memcpy(data, buff, PAGE_SIZE);
+		if (!IS_ERR(data))
+			memcpy(data, buff, PAGE_SIZE);
 		buff += PAGE_SIZE;
 		bytes -= PAGE_SIZE;
 		data = squashfs_next_page(output);
@@ -139,5 +141,6 @@ const struct squashfs_decompressor squashfs_lz4_comp_ops = {
 	.decompress = lz4_uncompress,
 	.id = LZ4_COMPRESSION,
 	.name = "lz4",
+	.alloc_buffer = 0,
 	.supported = 1
 };
diff --git a/fs/squashfs/lzo_wrapper.c b/fs/squashfs/lzo_wrapper.c
index cb510a63196836..d216aeefa865ce 100644
--- a/fs/squashfs/lzo_wrapper.c
+++ b/fs/squashfs/lzo_wrapper.c
@@ -93,10 +93,12 @@ static int lzo_uncompress(struct squashfs_sb_info *msblk, void *strm,
 	buff = stream->output;
 	while (data) {
 		if (bytes <= PAGE_SIZE) {
-			memcpy(data, buff, bytes);
+			if (!IS_ERR(data))
+				memcpy(data, buff, bytes);
 			break;
 		} else {
-			memcpy(data, buff, PAGE_SIZE);
+			if (!IS_ERR(data))
+				memcpy(data, buff, PAGE_SIZE);
 			buff += PAGE_SIZE;
 			bytes -= PAGE_SIZE;
 			data = squashfs_next_page(output);
@@ -116,5 +118,6 @@ const struct squashfs_decompressor squashfs_lzo_comp_ops = {
 	.decompress = lzo_uncompress,
 	.id = LZO_COMPRESSION,
 	.name = "lzo",
+	.alloc_buffer = 0,
 	.supported = 1
 };
diff --git a/fs/squashfs/page_actor.c b/fs/squashfs/page_actor.c
index 520d323a99ce67..b23b780d8f42ec 100644
--- a/fs/squashfs/page_actor.c
+++ b/fs/squashfs/page_actor.c
@@ -7,6 +7,8 @@
 #include <linux/kernel.h>
 #include <linux/slab.h>
 #include <linux/pagemap.h>
+#include "squashfs_fs_sb.h"
+#include "decompressor.h"
 #include "page_actor.h"
 
 /*
@@ -57,29 +59,62 @@ struct squashfs_page_actor *squashfs_page_actor_init(void **buffer,
 }
 
 /* Implementation of page_actor for decompressing directly into page cache. */
+static void *handle_next_page(struct squashfs_page_actor *actor)
+{
+	int max_pages = (actor->length + PAGE_SIZE - 1) >> PAGE_SHIFT;
+
+	if (actor->returned_pages == max_pages)
+		return NULL;
+
+	if ((actor->next_page == actor->pages) ||
+			(actor->next_index != actor->page[actor->next_page]->index)) {
+		if (actor->alloc_buffer) {
+			void *tmp_buffer = kmalloc(PAGE_SIZE, GFP_KERNEL);
+
+			if (tmp_buffer) {
+				actor->tmp_buffer = tmp_buffer;
+				actor->next_index++;
+				actor->returned_pages++;
+				return tmp_buffer;
+			}
+		}
+
+		actor->next_index++;
+		actor->returned_pages++;
+		return ERR_PTR(-ENOMEM);
+	}
+
+	actor->next_index++;
+	actor->returned_pages++;
+	return actor->pageaddr = kmap_local_page(actor->page[actor->next_page++]);
+}
+
 static void *direct_first_page(struct squashfs_page_actor *actor)
 {
-	actor->next_page = 1;
-	return actor->pageaddr = kmap_atomic(actor->page[0]);
+	return handle_next_page(actor);
 }
 
 static void *direct_next_page(struct squashfs_page_actor *actor)
 {
 	if (actor->pageaddr)
-		kunmap_atomic(actor->pageaddr);
+		kunmap_local(actor->pageaddr);
+
+	kfree(actor->tmp_buffer);
+	actor->pageaddr = actor->tmp_buffer = NULL;
 
-	return actor->pageaddr = actor->next_page == actor->pages ? NULL :
-		kmap_atomic(actor->page[actor->next_page++]);
+	return handle_next_page(actor);
 }
 
 static void direct_finish_page(struct squashfs_page_actor *actor)
 {
 	if (actor->pageaddr)
-		kunmap_atomic(actor->pageaddr);
+		kunmap_local(actor->pageaddr);
+
+	kfree(actor->tmp_buffer);
 }
 
-struct squashfs_page_actor *squashfs_page_actor_init_special(struct page **page,
-	int pages, int length)
+struct squashfs_page_actor *squashfs_page_actor_init_special(struct squashfs_sb_info *msblk,
+	struct page **page, int pages, int length)
 {
 	struct squashfs_page_actor *actor = kmalloc(sizeof(*actor), GFP_KERNEL);
 
@@ -90,7 +125,11 @@ struct squashfs_page_actor *squashfs_page_actor_init_special(struct page **page,
 	actor->page = page;
 	actor->pages = pages;
 	actor->next_page = 0;
+	actor->returned_pages = 0;
+	actor->next_index = page[0]->index & ~((1 << (msblk->block_log - PAGE_SHIFT)) - 1);
 	actor->pageaddr = NULL;
+	actor->tmp_buffer = NULL;
+	actor->alloc_buffer = msblk->decompressor->alloc_buffer;
 	actor->squashfs_first_page = direct_first_page;
 	actor->squashfs_next_page = direct_next_page;
 	actor->squashfs_finish_page = direct_finish_page;
diff --git a/fs/squashfs/page_actor.h b/fs/squashfs/page_actor.h
index 2e3073ace0097b..37523c54256fa7 100644
--- a/fs/squashfs/page_actor.h
+++ b/fs/squashfs/page_actor.h
@@ -45,6 +45,11 @@ static inline void squashfs_finish_page(struct squashfs_page_actor *actor)
 {
 	/* empty */
 }
+
+static inline void squashfs_actor_nobuff(struct squashfs_page_actor *actor)
+{
+	/* empty */
+}
 #else
 struct squashfs_page_actor {
 	union {
@@ -52,17 +57,23 @@ struct squashfs_page_actor {
 		struct page	**page;
 	};
 	void	*pageaddr;
+	void	*tmp_buffer;
 	void    *(*squashfs_first_page)(struct squashfs_page_actor *);
 	void    *(*squashfs_next_page)(struct squashfs_page_actor *);
 	void    (*squashfs_finish_page)(struct squashfs_page_actor *);
 	int	pages;
 	int	length;
 	int	next_page;
+	int	alloc_buffer;
+	int	returned_pages;
+	pgoff_t	next_index;
 };
 
-extern struct squashfs_page_actor *squashfs_page_actor_init(void **, int, int);
-extern struct squashfs_page_actor *squashfs_page_actor_init_special(struct page
-							 **, int, int);
+extern struct squashfs_page_actor *squashfs_page_actor_init(void **buffer,
+				int pages, int length);
+extern struct squashfs_page_actor *squashfs_page_actor_init_special(
+				struct squashfs_sb_info *msblk,
+				struct page **page, int pages, int length);
 static inline void *squashfs_first_page(struct squashfs_page_actor *actor)
 {
 	return actor->squashfs_first_page(actor);
@@ -75,5 +86,9 @@ static inline void squashfs_finish_page(struct squashfs_page_actor *actor)
 {
 	actor->squashfs_finish_page(actor);
 }
+static inline void squashfs_actor_nobuff(struct squashfs_page_actor *actor)
+{
+	actor->alloc_buffer = 0;
+}
 #endif
 #endif
diff --git a/fs/squashfs/xz_wrapper.c b/fs/squashfs/xz_wrapper.c
index 68f6d09bb3a2bc..6c49481a2f8c43 100644
--- a/fs/squashfs/xz_wrapper.c
+++ b/fs/squashfs/xz_wrapper.c
@@ -131,6 +131,10 @@ static int squashfs_xz_uncompress(struct squashfs_sb_info *msblk, void *strm,
 	stream->buf.out_pos = 0;
 	stream->buf.out_size = PAGE_SIZE;
 	stream->buf.out = squashfs_first_page(output);
+	if (IS_ERR(stream->buf.out)) {
+		error = PTR_ERR(stream->buf.out);
+		goto finish;
+	}
 
 	for (;;) {
 		enum xz_ret xz_err;
@@ -156,7 +160,10 @@ static int squashfs_xz_uncompress(struct squashfs_sb_info *msblk, void *strm,
 
 		if (stream->buf.out_pos == stream->buf.out_size) {
 			stream->buf.out = squashfs_next_page(output);
-			if (stream->buf.out != NULL) {
+			if (IS_ERR(stream->buf.out)) {
+				error = PTR_ERR(stream->buf.out);
+				break;
+			} else if (stream->buf.out != NULL) {
 				stream->buf.out_pos = 0;
 				total += PAGE_SIZE;
 			}
@@ -171,6 +178,7 @@ static int squashfs_xz_uncompress(struct squashfs_sb_info *msblk, void *strm,
 		}
 	}
 
+finish:
 	squashfs_finish_page(output);
 
 	return error ? error : total + stream->buf.out_pos;
@@ -183,5 +191,6 @@ const struct squashfs_decompressor squashfs_xz_comp_ops = {
 	.decompress = squashfs_xz_uncompress,
 	.id = XZ_COMPRESSION,
 	.name = "xz",
+	.alloc_buffer = 1,
 	.supported = 1
 };
diff --git a/fs/squashfs/zlib_wrapper.c b/fs/squashfs/zlib_wrapper.c
index a20e9042146bd6..cbb7afe7bc4679 100644
--- a/fs/squashfs/zlib_wrapper.c
+++ b/fs/squashfs/zlib_wrapper.c
@@ -62,6 +62,11 @@ static int zlib_uncompress(struct squashfs_sb_info *msblk, void *strm,
 	stream->next_out = squashfs_first_page(output);
 	stream->avail_in = 0;
 
+	if (IS_ERR(stream->next_out)) {
+		error = PTR_ERR(stream->next_out);
+		goto finish;
+	}
+
 	for (;;) {
 		int zlib_err;
 
@@ -85,7 +90,10 @@ static int zlib_uncompress(struct squashfs_sb_info *msblk, void *strm,
 
 		if (stream->avail_out == 0) {
 			stream->next_out = squashfs_next_page(output);
-			if (stream->next_out != NULL)
+			if (IS_ERR(stream->next_out)) {
+				error = PTR_ERR(stream->next_out);
+				break;
+			} else if (stream->next_out != NULL)
 				stream->avail_out = PAGE_SIZE;
 		}
 
@@ -107,6 +115,7 @@ static int zlib_uncompress(struct squashfs_sb_info *msblk, void *strm,
 		}
 	}
 
+finish:
 	squashfs_finish_page(output);
 
 	if (!error)
@@ -122,6 +131,7 @@ const struct squashfs_decompressor squashfs_zlib_comp_ops = {
 	.decompress = zlib_uncompress,
 	.id = ZLIB_COMPRESSION,
 	.name = "zlib",
+	.alloc_buffer = 1,
 	.supported = 1
 };
 
diff --git a/fs/squashfs/zstd_wrapper.c b/fs/squashfs/zstd_wrapper.c
index c40445dbf38c77..0e407c4d8b3bc3 100644
--- a/fs/squashfs/zstd_wrapper.c
+++ b/fs/squashfs/zstd_wrapper.c
@@ -80,6 +80,10 @@ static int zstd_uncompress(struct squashfs_sb_info *msblk, void *strm,
 
 	out_buf.size = PAGE_SIZE;
 	out_buf.dst = squashfs_first_page(output);
+	if (IS_ERR(out_buf.dst)) {
+		error = PTR_ERR(out_buf.dst);
+		goto finish;
+	}
 
 	for (;;) {
 		size_t zstd_err;
@@ -104,7 +108,10 @@ static int zstd_uncompress(struct squashfs_sb_info *msblk, void *strm,
 
 		if (out_buf.pos == out_buf.size) {
 			out_buf.dst = squashfs_next_page(output);
-			if (out_buf.dst == NULL) {
+			if (IS_ERR(out_buf.dst)) {
+				error = PTR_ERR(out_buf.dst);
+				break;
+			} else if (out_buf.dst == NULL) {
 				/* Shouldn't run out of pages
 				 * before stream is done.
 				 */
@@ -129,6 +136,8 @@ static int zstd_uncompress(struct squashfs_sb_info *msblk, void *strm,
 		}
 	}
 
+finish:
+
 	squashfs_finish_page(output);
 
 	return error ? error : total_out;
@@ -140,5 +149,6 @@ const struct squashfs_decompressor squashfs_zstd_comp_ops = {
 	.decompress = zstd_uncompress,
 	.id = ZSTD_COMPRESSION,
 	.name = "zstd",
+	.alloc_buffer = 1,
 	.supported = 1
 };

From 1bb1a07afad97303f14b8d1b319b03f1f01a0091 Mon Sep 17 00:00:00 2001
From: Phillip Lougher <phillip@squashfs.org.uk>
Date: Sat, 11 Jun 2022 04:21:33 +0100
Subject: [PATCH 14/72] squashfs: don't use intermediate buffer if pages
 missing

Now that the "page actor" can handle missing pages, we don't have to fall
back to using an intermediate buffer in squashfs_readpage_block() if all
the pages necessary can't be obtained.

Link: https://lkml.kernel.org/r/20220611032133.5743-3-phillip@squashfs.org.uk
Signed-off-by: Phillip Lougher <phillip@squashfs.org.uk>
Cc: Hsin-Yi Wang <hsinyi@chromium.org>
Cc: Matthew Wilcox (Oracle) <willy@infradead.org>
Cc: Xiongwei Song <Xiongwei.Song@windriver.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 fs/squashfs/file_direct.c | 75 +++++++--------------------------------
 1 file changed, 12 insertions(+), 63 deletions(-)

diff --git a/fs/squashfs/file_direct.c b/fs/squashfs/file_direct.c
index 5af5802f5626cb..be4b12d31e0c36 100644
--- a/fs/squashfs/file_direct.c
+++ b/fs/squashfs/file_direct.c
@@ -18,9 +18,6 @@
 #include "squashfs.h"
 #include "page_actor.h"
 
-static int squashfs_read_cache(struct page *target_page, u64 block, int bsize,
-	int pages, struct page **page, int bytes);
-
 /* Read separately compressed datablock directly into page cache */
 int squashfs_readpage_block(struct page *target_page, u64 block, int bsize,
 	int expected)
@@ -33,7 +30,7 @@ int squashfs_readpage_block(struct page *target_page, u64 block, int bsize,
 	int mask = (1 << (msblk->block_log - PAGE_SHIFT)) - 1;
 	int start_index = target_page->index & ~mask;
 	int end_index = start_index | mask;
-	int i, n, pages, missing_pages, bytes, res = -ENOMEM;
+	int i, n, pages, bytes, res = -ENOMEM;
 	struct page **page;
 	struct squashfs_page_actor *actor;
 	void *pageaddr;
@@ -48,44 +45,29 @@ int squashfs_readpage_block(struct page *target_page, u64 block, int bsize,
 		return res;
 
 	/* Try to grab all the pages covered by the Squashfs block */
-	for (missing_pages = 0, i = 0, n = start_index; i < pages; i++, n++) {
+	for (i = 0, n = start_index; n <= end_index; n++) {
 		page[i] = (n == target_page->index) ? target_page :
 			grab_cache_page_nowait(target_page->mapping, n);
 
-		if (page[i] == NULL) {
-			missing_pages++;
+		if (page[i] == NULL)
 			continue;
-		}
 
 		if (PageUptodate(page[i])) {
 			unlock_page(page[i]);
 			put_page(page[i]);
-			page[i] = NULL;
-			missing_pages++;
+			continue;
 		}
-	}
-
-	if (missing_pages) {
-		/*
-		 * Couldn't get one or more pages, this page has either
-		 * been VM reclaimed, but others are still in the page cache
-		 * and uptodate, or we're racing with another thread in
-		 * squashfs_readpage also trying to grab them.  Fall back to
-		 * using an intermediate buffer.
-		 */
-		res = squashfs_read_cache(target_page, block, bsize, pages,
-							page, expected);
-		if (res < 0)
-			goto mark_errored;
 
-		goto out;
+		i++;
 	}
 
+	pages = i;
+
 	/*
 	 * Create a "page actor" which will kmap and kunmap the
 	 * page cache pages appropriately within the decompressor
 	 */
-	actor = squashfs_page_actor_init_special(msblk, page, pages, 0);
+	actor = squashfs_page_actor_init_special(msblk, page, pages, expected);
 	if (actor == NULL)
 		goto out;
 
@@ -102,12 +84,12 @@ int squashfs_readpage_block(struct page *target_page, u64 block, int bsize,
 		goto mark_errored;
 	}
 
-	/* Last page may have trailing bytes not filled */
+	/* Last page (if present) may have trailing bytes not filled */
 	bytes = res % PAGE_SIZE;
-	if (bytes) {
-		pageaddr = kmap_atomic(page[pages - 1]);
+	if (page[pages - 1]->index == end_index && bytes) {
+		pageaddr = kmap_local_page(page[pages - 1]);
 		memset(pageaddr + bytes, 0, PAGE_SIZE - bytes);
-		kunmap_atomic(pageaddr);
+		kunmap_local(pageaddr);
 	}
 
 	/* Mark pages as uptodate, unlock and release */
@@ -140,36 +122,3 @@ int squashfs_readpage_block(struct page *target_page, u64 block, int bsize,
 	kfree(page);
 	return res;
 }
-
-
-static int squashfs_read_cache(struct page *target_page, u64 block, int bsize,
-	int pages, struct page **page, int bytes)
-{
-	struct inode *i = target_page->mapping->host;
-	struct squashfs_cache_entry *buffer = squashfs_get_datablock(i->i_sb,
-						 block, bsize);
-	int res = buffer->error, n, offset = 0;
-
-	if (res) {
-		ERROR("Unable to read page, block %llx, size %x\n", block,
-			bsize);
-		goto out;
-	}
-
-	for (n = 0; n < pages && bytes > 0; n++,
-			bytes -= PAGE_SIZE, offset += PAGE_SIZE) {
-		int avail = min_t(int, bytes, PAGE_SIZE);
-
-		if (page[n] == NULL)
-			continue;
-
-		squashfs_fill_page(page[n], buffer, offset, avail);
-		unlock_page(page[n]);
-		if (page[n] != target_page)
-			put_page(page[n]);
-	}
-
-out:
-	squashfs_cache_put(buffer);
-	return res;
-}

From 019a0c9e377c9f7bd477a0742706d93cdddaee4d Mon Sep 17 00:00:00 2001
From: Javier Martinez Canillas <javierm@redhat.com>
Date: Fri, 10 Jun 2022 09:57:18 +0200
Subject: [PATCH 15/72] fat: add a vfat_rename2() and make existing .rename
 callback a helper

Patch series "fat: add support for the renameat2 RENAME_EXCHANGE flag", v6.

The series adds support for the renameat2 system call RENAME_EXCHANGE flag
(which allows two paths to be exchanged atomically) to the vfat filesystem
code.

There are many use cases for this, but we are particularly interested in
making it possible for vfat filesystems to be part of OSTree [0]
deployments.

Currently OSTree relies on symbolic links to make the deployment updates
an atomic transactional operation.  But RENAME_EXCHANGE could be used [1]
to achieve a similar level of robustness when using a vfat filesystem.

Patch #1 is just a preparatory patch to introduce the RENAME_EXCHANGE
support, patch #2 moves some code blocks in vfat_rename() to a set of
helper functions, that can be reused by vfat_rename_exchange() that's
added by patch #3 and finally patch #4 adds some kselftests to test it.


This patch (of 4):

Currently vfat only supports the RENAME_NOREPLACE flag which is handled by
the virtual file system layer but doesn't support the RENAME_EXCHANGE
flag.

Add a vfat_rename2() function to be used as the .rename callback and move
the current vfat_rename() handler to a helper.  This is in preparation for
implementing the RENAME_EXCHANGE flag using a different helper function.

Link: https://lkml.kernel.org/r/20220610075721.1182745-1-javierm@redhat.com
Link: https://lkml.kernel.org/r/20220610075721.1182745-2-javierm@redhat.com
Signed-off-by: Javier Martinez Canillas <javierm@redhat.com>
Acked-by: OGAWA Hirofumi <hirofumi@mail.parknet.co.jp>
Cc: Christian Kellner <ckellner@redhat.com>
Cc: Peter Jones <pjones@redhat.com>
Cc: Chung-Chiang Cheng <cccheng@synology.com>
Cc: Lennart Poettering <lennart@poettering.net>
Cc: Alexander Larsson <alexl@redhat.com>
Cc: Colin Walters <walters@verbum.org>
Cc: Muhammad Usama Anjum <usama.anjum@collabora.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 fs/fat/namei_vfat.c | 21 ++++++++++++++-------
 1 file changed, 14 insertions(+), 7 deletions(-)

diff --git a/fs/fat/namei_vfat.c b/fs/fat/namei_vfat.c
index c573314806cf82..88ccb2ee3537bc 100644
--- a/fs/fat/namei_vfat.c
+++ b/fs/fat/namei_vfat.c
@@ -889,9 +889,8 @@ static int vfat_mkdir(struct user_namespace *mnt_userns, struct inode *dir,
 	return err;
 }
 
-static int vfat_rename(struct user_namespace *mnt_userns, struct inode *old_dir,
-		       struct dentry *old_dentry, struct inode *new_dir,
-		       struct dentry *new_dentry, unsigned int flags)
+static int vfat_rename(struct inode *old_dir, struct dentry *old_dentry,
+		       struct inode *new_dir, struct dentry *new_dentry)
 {
 	struct buffer_head *dotdot_bh;
 	struct msdos_dir_entry *dotdot_de;
@@ -902,9 +901,6 @@ static int vfat_rename(struct user_namespace *mnt_userns, struct inode *old_dir,
 	int err, is_dir, update_dotdot, corrupt = 0;
 	struct super_block *sb = old_dir->i_sb;
 
-	if (flags & ~RENAME_NOREPLACE)
-		return -EINVAL;
-
 	old_sinfo.bh = sinfo.bh = dotdot_bh = NULL;
 	old_inode = d_inode(old_dentry);
 	new_inode = d_inode(new_dentry);
@@ -1021,13 +1017,24 @@ static int vfat_rename(struct user_namespace *mnt_userns, struct inode *old_dir,
 	goto out;
 }
 
+static int vfat_rename2(struct user_namespace *mnt_userns, struct inode *old_dir,
+			struct dentry *old_dentry, struct inode *new_dir,
+			struct dentry *new_dentry, unsigned int flags)
+{
+	if (flags & ~RENAME_NOREPLACE)
+		return -EINVAL;
+
+	/* VFS already handled RENAME_NOREPLACE, handle it as a normal rename */
+	return vfat_rename(old_dir, old_dentry, new_dir, new_dentry);
+}
+
 static const struct inode_operations vfat_dir_inode_operations = {
 	.create		= vfat_create,
 	.lookup		= vfat_lookup,
 	.unlink		= vfat_unlink,
 	.mkdir		= vfat_mkdir,
 	.rmdir		= vfat_rmdir,
-	.rename		= vfat_rename,
+	.rename		= vfat_rename2,
 	.setattr	= fat_setattr,
 	.getattr	= fat_getattr,
 	.update_time	= fat_update_time,

From 204d03203a145b443cd8676dc12dbb47e1a3751f Mon Sep 17 00:00:00 2001
From: OGAWA Hirofumi <hirofumi@mail.parknet.co.jp>
Date: Fri, 10 Jun 2022 09:57:19 +0200
Subject: [PATCH 16/72] fat: factor out reusable code in vfat_rename() as
 helper functions

The vfat_rename() function is quite big and there are code blocks that can
be moved into helper functions.  This not only simplifies the implementation
of that function but also allows these helpers to be reused.

For example, the helpers can be used by the handler of the RENAME_EXCHANGE
flag once this is implemented in a subsequent patch.

Link: https://lkml.kernel.org/r/20220610075721.1182745-3-javierm@redhat.com
Signed-off-by: OGAWA Hirofumi <hirofumi@mail.parknet.co.jp>
Signed-off-by: Javier Martinez Canillas <javierm@redhat.com>
Acked-by: OGAWA Hirofumi <hirofumi@mail.parknet.co.jp>
Cc: Alexander Larsson <alexl@redhat.com>
Cc: Christian Kellner <ckellner@redhat.com>
Cc: Chung-Chiang Cheng <cccheng@synology.com>
Cc: Colin Walters <walters@verbum.org>
Cc: Lennart Poettering <lennart@poettering.net>
Cc: Muhammad Usama Anjum <usama.anjum@collabora.com>
Cc: Peter Jones <pjones@redhat.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 fs/fat/namei_vfat.c | 89 +++++++++++++++++++++++++++++----------------
 1 file changed, 57 insertions(+), 32 deletions(-)

diff --git a/fs/fat/namei_vfat.c b/fs/fat/namei_vfat.c
index 88ccb2ee3537bc..9c04053a8f1cc5 100644
--- a/fs/fat/namei_vfat.c
+++ b/fs/fat/namei_vfat.c
@@ -889,16 +889,55 @@ static int vfat_mkdir(struct user_namespace *mnt_userns, struct inode *dir,
 	return err;
 }
 
+static int vfat_get_dotdot_de(struct inode *inode, struct buffer_head **bh,
+			      struct msdos_dir_entry **de)
+{
+	if (S_ISDIR(inode->i_mode)) {
+		if (fat_get_dotdot_entry(inode, bh, de))
+			return -EIO;
+	}
+	return 0;
+}
+
+static int vfat_sync_ipos(struct inode *dir, struct inode *inode)
+{
+	if (IS_DIRSYNC(dir))
+		return fat_sync_inode(inode);
+	mark_inode_dirty(inode);
+	return 0;
+}
+
+static int vfat_update_dotdot_de(struct inode *dir, struct inode *inode,
+				 struct buffer_head *dotdot_bh,
+				 struct msdos_dir_entry *dotdot_de)
+{
+	fat_set_start(dotdot_de, MSDOS_I(dir)->i_logstart);
+	mark_buffer_dirty_inode(dotdot_bh, inode);
+	if (IS_DIRSYNC(dir))
+		return sync_dirty_buffer(dotdot_bh);
+	return 0;
+}
+
+static void vfat_update_dir_metadata(struct inode *dir, struct timespec64 *ts)
+{
+	inode_inc_iversion(dir);
+	fat_truncate_time(dir, ts, S_CTIME | S_MTIME);
+	if (IS_DIRSYNC(dir))
+		(void)fat_sync_inode(dir);
+	else
+		mark_inode_dirty(dir);
+}
+
 static int vfat_rename(struct inode *old_dir, struct dentry *old_dentry,
 		       struct inode *new_dir, struct dentry *new_dentry)
 {
 	struct buffer_head *dotdot_bh;
-	struct msdos_dir_entry *dotdot_de;
+	struct msdos_dir_entry *dotdot_de = NULL;
 	struct inode *old_inode, *new_inode;
 	struct fat_slot_info old_sinfo, sinfo;
 	struct timespec64 ts;
 	loff_t new_i_pos;
-	int err, is_dir, update_dotdot, corrupt = 0;
+	int err, is_dir, corrupt = 0;
 	struct super_block *sb = old_dir->i_sb;
 
 	old_sinfo.bh = sinfo.bh = dotdot_bh = NULL;
@@ -909,15 +948,13 @@ static int vfat_rename(struct inode *old_dir, struct dentry *old_dentry,
 	if (err)
 		goto out;
 
-	is_dir = S_ISDIR(old_inode->i_mode);
-	update_dotdot = (is_dir && old_dir != new_dir);
-	if (update_dotdot) {
-		if (fat_get_dotdot_entry(old_inode, &dotdot_bh, &dotdot_de)) {
-			err = -EIO;
+	if (old_dir != new_dir) {
+		err = vfat_get_dotdot_de(old_inode, &dotdot_bh, &dotdot_de);
+		if (err)
 			goto out;
-		}
 	}
 
+	is_dir = S_ISDIR(old_inode->i_mode);
 	ts = current_time(old_dir);
 	if (new_inode) {
 		if (is_dir) {
@@ -938,21 +975,15 @@ static int vfat_rename(struct inode *old_dir, struct dentry *old_dentry,
 
 	fat_detach(old_inode);
 	fat_attach(old_inode, new_i_pos);
-	if (IS_DIRSYNC(new_dir)) {
-		err = fat_sync_inode(old_inode);
-		if (err)
-			goto error_inode;
-	} else
-		mark_inode_dirty(old_inode);
+	err = vfat_sync_ipos(new_dir, old_inode);
+	if (err)
+		goto error_inode;
 
-	if (update_dotdot) {
-		fat_set_start(dotdot_de, MSDOS_I(new_dir)->i_logstart);
-		mark_buffer_dirty_inode(dotdot_bh, old_inode);
-		if (IS_DIRSYNC(new_dir)) {
-			err = sync_dirty_buffer(dotdot_bh);
-			if (err)
-				goto error_dotdot;
-		}
+	if (dotdot_de) {
+		err = vfat_update_dotdot_de(new_dir, old_inode, dotdot_bh,
+					    dotdot_de);
+		if (err)
+			goto error_dotdot;
 		drop_nlink(old_dir);
 		if (!new_inode)
  			inc_nlink(new_dir);
@@ -962,12 +993,7 @@ static int vfat_rename(struct inode *old_dir, struct dentry *old_dentry,
 	old_sinfo.bh = NULL;
 	if (err)
 		goto error_dotdot;
-	inode_inc_iversion(old_dir);
-	fat_truncate_time(old_dir, &ts, S_CTIME|S_MTIME);
-	if (IS_DIRSYNC(old_dir))
-		(void)fat_sync_inode(old_dir);
-	else
-		mark_inode_dirty(old_dir);
+	vfat_update_dir_metadata(old_dir, &ts);
 
 	if (new_inode) {
 		drop_nlink(new_inode);
@@ -987,10 +1013,9 @@ static int vfat_rename(struct inode *old_dir, struct dentry *old_dentry,
 	/* data cluster is shared, serious corruption */
 	corrupt = 1;
 
-	if (update_dotdot) {
-		fat_set_start(dotdot_de, MSDOS_I(old_dir)->i_logstart);
-		mark_buffer_dirty_inode(dotdot_bh, old_inode);
-		corrupt |= sync_dirty_buffer(dotdot_bh);
+	if (dotdot_de) {
+		corrupt |= vfat_update_dotdot_de(old_dir, old_inode, dotdot_bh,
+						 dotdot_de);
 	}
 error_inode:
 	fat_detach(old_inode);

From da87e1725ae2136baeb9aac04c572c283afc917f Mon Sep 17 00:00:00 2001
From: Javier Martinez Canillas <javierm@redhat.com>
Date: Fri, 10 Jun 2022 09:57:20 +0200
Subject: [PATCH 17/72] fat: add renameat2 RENAME_EXCHANGE flag support

The renameat2 RENAME_EXCHANGE flag allows two paths to be exchanged
atomically but is currently not supported by the Linux vfat filesystem
driver.

Add a vfat_rename_exchange() helper function that implements this support.

The super block lock is acquired during the operation to ensure atomicity,
and in the error path actions made are reversed also with the mutex held.

It makes the operation as transactional as possible, within the limitation
imposed by vfat due to not having a journal with logs to replay.

Link: https://lkml.kernel.org/r/20220610075721.1182745-4-javierm@redhat.com
Signed-off-by: Javier Martinez Canillas <javierm@redhat.com>
Acked-by: OGAWA Hirofumi <hirofumi@mail.parknet.co.jp>
Cc: Alexander Larsson <alexl@redhat.com>
Cc: Christian Kellner <ckellner@redhat.com>
Cc: Chung-Chiang Cheng <cccheng@synology.com>
Cc: Colin Walters <walters@verbum.org>
Cc: Lennart Poettering <lennart@poettering.net>
Cc: Muhammad Usama Anjum <usama.anjum@collabora.com>
Cc: Peter Jones <pjones@redhat.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 fs/fat/namei_vfat.c | 123 +++++++++++++++++++++++++++++++++++++++++++-
 1 file changed, 122 insertions(+), 1 deletion(-)

diff --git a/fs/fat/namei_vfat.c b/fs/fat/namei_vfat.c
index 9c04053a8f1cc5..21620054e1c44e 100644
--- a/fs/fat/namei_vfat.c
+++ b/fs/fat/namei_vfat.c
@@ -1042,13 +1042,134 @@ static int vfat_rename(struct inode *old_dir, struct dentry *old_dentry,
 	goto out;
 }
 
+static void vfat_exchange_ipos(struct inode *old_inode, struct inode *new_inode,
+			       loff_t old_i_pos, loff_t new_i_pos)
+{
+	fat_detach(old_inode);
+	fat_detach(new_inode);
+	fat_attach(old_inode, new_i_pos);
+	fat_attach(new_inode, old_i_pos);
+}
+
+static void vfat_move_nlink(struct inode *src, struct inode *dst)
+{
+	drop_nlink(src);
+	inc_nlink(dst);
+}
+
+static int vfat_rename_exchange(struct inode *old_dir, struct dentry *old_dentry,
+				struct inode *new_dir, struct dentry *new_dentry)
+{
+	struct buffer_head *old_dotdot_bh = NULL, *new_dotdot_bh = NULL;
+	struct msdos_dir_entry *old_dotdot_de = NULL, *new_dotdot_de = NULL;
+	struct inode *old_inode, *new_inode;
+	struct timespec64 ts = current_time(old_dir);
+	loff_t old_i_pos, new_i_pos;
+	int err, corrupt = 0;
+	struct super_block *sb = old_dir->i_sb;
+
+	old_inode = d_inode(old_dentry);
+	new_inode = d_inode(new_dentry);
+
+	/* Acquire super block lock for the operation to be atomic */
+	mutex_lock(&MSDOS_SB(sb)->s_lock);
+
+	/* if directories are not the same, get ".." info to update */
+	if (old_dir != new_dir) {
+		err = vfat_get_dotdot_de(old_inode, &old_dotdot_bh,
+					 &old_dotdot_de);
+		if (err)
+			goto out;
+
+		err = vfat_get_dotdot_de(new_inode, &new_dotdot_bh,
+					 &new_dotdot_de);
+		if (err)
+			goto out;
+	}
+
+	old_i_pos = MSDOS_I(old_inode)->i_pos;
+	new_i_pos = MSDOS_I(new_inode)->i_pos;
+
+	vfat_exchange_ipos(old_inode, new_inode, old_i_pos, new_i_pos);
+
+	err = vfat_sync_ipos(old_dir, new_inode);
+	if (err)
+		goto error_exchange;
+	err = vfat_sync_ipos(new_dir, old_inode);
+	if (err)
+		goto error_exchange;
+
+	/* update ".." directory entry info */
+	if (old_dotdot_de) {
+		err = vfat_update_dotdot_de(new_dir, old_inode, old_dotdot_bh,
+					    old_dotdot_de);
+		if (err)
+			goto error_old_dotdot;
+	}
+	if (new_dotdot_de) {
+		err = vfat_update_dotdot_de(old_dir, new_inode, new_dotdot_bh,
+					    new_dotdot_de);
+		if (err)
+			goto error_new_dotdot;
+	}
+
+	/* if cross directory and only one is a directory, adjust nlink */
+	if (!old_dotdot_de != !new_dotdot_de) {
+		if (old_dotdot_de)
+			vfat_move_nlink(old_dir, new_dir);
+		else
+			vfat_move_nlink(new_dir, old_dir);
+	}
+
+	vfat_update_dir_metadata(old_dir, &ts);
+	/* if directories are not the same, update new_dir as well */
+	if (old_dir != new_dir)
+		vfat_update_dir_metadata(new_dir, &ts);
+
+out:
+	brelse(old_dotdot_bh);
+	brelse(new_dotdot_bh);
+	mutex_unlock(&MSDOS_SB(sb)->s_lock);
+
+	return err;
+
+error_new_dotdot:
+	if (new_dotdot_de) {
+		corrupt |= vfat_update_dotdot_de(new_dir, new_inode,
+						 new_dotdot_bh, new_dotdot_de);
+	}
+
+error_old_dotdot:
+	if (old_dotdot_de) {
+		corrupt |= vfat_update_dotdot_de(old_dir, old_inode,
+						 old_dotdot_bh, old_dotdot_de);
+	}
+
+error_exchange:
+	vfat_exchange_ipos(old_inode, new_inode, new_i_pos, old_i_pos);
+	corrupt |= vfat_sync_ipos(new_dir, new_inode);
+	corrupt |= vfat_sync_ipos(old_dir, old_inode);
+
+	if (corrupt < 0) {
+		fat_fs_error(new_dir->i_sb,
+			     "%s: Filesystem corrupted (i_pos %lld, %lld)",
+			     __func__, old_i_pos, new_i_pos);
+	}
+	goto out;
+}
+
 static int vfat_rename2(struct user_namespace *mnt_userns, struct inode *old_dir,
 			struct dentry *old_dentry, struct inode *new_dir,
 			struct dentry *new_dentry, unsigned int flags)
 {
-	if (flags & ~RENAME_NOREPLACE)
+	if (flags & ~(RENAME_NOREPLACE | RENAME_EXCHANGE))
 		return -EINVAL;
 
+	if (flags & RENAME_EXCHANGE) {
+		return vfat_rename_exchange(old_dir, old_dentry,
+					    new_dir, new_dentry);
+	}
+
 	/* VFS already handled RENAME_NOREPLACE, handle it as a normal rename */
 	return vfat_rename(old_dir, old_dentry, new_dir, new_dentry);
 }

From dd7c9be330d87732766a95cfd7a6de38bf7a39c3 Mon Sep 17 00:00:00 2001
From: Javier Martinez Canillas <javierm@redhat.com>
Date: Fri, 10 Jun 2022 09:57:21 +0200
Subject: [PATCH 18/72] selftests/filesystems: add a vfat RENAME_EXCHANGE test

Add a test for the renameat2 RENAME_EXCHANGE support in vfat, split into a
tool that just does the rename exchange and a script that is run by the
kselftests framework on `make TARGETS="filesystems/fat" kselftest`.

That way the script can be easily extended to test other file operations.

The script creates a 1 MiB disk image, which is then formatted with a vfat
filesystem and mounted using a loop device.  That way all file operations
are done on an ephemeral filesystem.

Link: https://lkml.kernel.org/r/20220610075721.1182745-5-javierm@redhat.com
Signed-off-by: Javier Martinez Canillas <javierm@redhat.com>
Acked-by: Muhammad Usama Anjum <usama.anjum@collabora.com>
Acked-by: OGAWA Hirofumi <hirofumi@mail.parknet.co.jp>
Cc: Alexander Larsson <alexl@redhat.com>
Cc: Christian Kellner <ckellner@redhat.com>
Cc: Chung-Chiang Cheng <cccheng@synology.com>
Cc: Colin Walters <walters@verbum.org>
Cc: Lennart Poettering <lennart@poettering.net>
Cc: Peter Jones <pjones@redhat.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 MAINTAINERS                                   |  1 +
 tools/testing/selftests/Makefile              |  1 +
 .../selftests/filesystems/fat/.gitignore      |  2 +
 .../selftests/filesystems/fat/Makefile        |  7 ++
 .../testing/selftests/filesystems/fat/config  |  2 +
 .../filesystems/fat/rename_exchange.c         | 37 +++++++++
 .../filesystems/fat/run_fat_tests.sh          | 82 +++++++++++++++++++
 7 files changed, 132 insertions(+)
 create mode 100644 tools/testing/selftests/filesystems/fat/.gitignore
 create mode 100644 tools/testing/selftests/filesystems/fat/Makefile
 create mode 100644 tools/testing/selftests/filesystems/fat/config
 create mode 100644 tools/testing/selftests/filesystems/fat/rename_exchange.c
 create mode 100644 tools/testing/selftests/filesystems/fat/run_fat_tests.sh

diff --git a/MAINTAINERS b/MAINTAINERS
index 1fc9ead83d2aa3..addcc0cca3211a 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -20891,6 +20891,7 @@ M:	OGAWA Hirofumi <hirofumi@mail.parknet.co.jp>
 S:	Maintained
 F:	Documentation/filesystems/vfat.rst
 F:	fs/fat/
+F:	tools/testing/selftests/filesystems/fat/
 
 VFIO DRIVER
 M:	Alex Williamson <alex.williamson@redhat.com>
diff --git a/tools/testing/selftests/Makefile b/tools/testing/selftests/Makefile
index de11992dc57763..67668a9fa115ed 100644
--- a/tools/testing/selftests/Makefile
+++ b/tools/testing/selftests/Makefile
@@ -17,6 +17,7 @@ TARGETS += exec
 TARGETS += filesystems
 TARGETS += filesystems/binderfs
 TARGETS += filesystems/epoll
+TARGETS += filesystems/fat
 TARGETS += firmware
 TARGETS += fpu
 TARGETS += ftrace
diff --git a/tools/testing/selftests/filesystems/fat/.gitignore b/tools/testing/selftests/filesystems/fat/.gitignore
new file mode 100644
index 00000000000000..b89920ed841cce
--- /dev/null
+++ b/tools/testing/selftests/filesystems/fat/.gitignore
@@ -0,0 +1,2 @@
+# SPDX-License-Identifier: GPL-2.0-only
+rename_exchange
diff --git a/tools/testing/selftests/filesystems/fat/Makefile b/tools/testing/selftests/filesystems/fat/Makefile
new file mode 100644
index 00000000000000..902033f6ef098b
--- /dev/null
+++ b/tools/testing/selftests/filesystems/fat/Makefile
@@ -0,0 +1,7 @@
+# SPDX-License-Identifier: GPL-2.0
+
+TEST_PROGS := run_fat_tests.sh
+TEST_GEN_PROGS_EXTENDED := rename_exchange
+CFLAGS += -O2 -g -Wall $(KHDR_INCLUDES)
+
+include ../../lib.mk
diff --git a/tools/testing/selftests/filesystems/fat/config b/tools/testing/selftests/filesystems/fat/config
new file mode 100644
index 00000000000000..6cf95e787a17b5
--- /dev/null
+++ b/tools/testing/selftests/filesystems/fat/config
@@ -0,0 +1,2 @@
+CONFIG_BLK_DEV_LOOP=y
+CONFIG_VFAT_FS=y
diff --git a/tools/testing/selftests/filesystems/fat/rename_exchange.c b/tools/testing/selftests/filesystems/fat/rename_exchange.c
new file mode 100644
index 00000000000000..e488ad354fce4f
--- /dev/null
+++ b/tools/testing/selftests/filesystems/fat/rename_exchange.c
@@ -0,0 +1,37 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * Program that atomically exchanges two paths using
+ * the renameat2() system call RENAME_EXCHANGE flag.
+ *
+ * Copyright 2022 Red Hat Inc.
+ * Author: Javier Martinez Canillas <javierm@redhat.com>
+ */
+
+#define _GNU_SOURCE
+#include <fcntl.h>
+#include <stdio.h>
+#include <stdlib.h>
+
+void print_usage(const char *program)
+{
+	printf("Usage: %s [oldpath] [newpath]\n", program);
+	printf("Atomically exchange oldpath and newpath\n");
+}
+
+int main(int argc, char *argv[])
+{
+	int ret;
+
+	if (argc != 3) {
+		print_usage(argv[0]);
+		exit(EXIT_FAILURE);
+	}
+
+	ret = renameat2(AT_FDCWD, argv[1], AT_FDCWD, argv[2], RENAME_EXCHANGE);
+	if (ret) {
+		perror("rename exchange failed");
+		exit(EXIT_FAILURE);
+	}
+
+	exit(EXIT_SUCCESS);
+}
diff --git a/tools/testing/selftests/filesystems/fat/run_fat_tests.sh b/tools/testing/selftests/filesystems/fat/run_fat_tests.sh
new file mode 100644
index 00000000000000..7f35dc3d15dfa6
--- /dev/null
+++ b/tools/testing/selftests/filesystems/fat/run_fat_tests.sh
@@ -0,0 +1,82 @@
+#!/bin/bash
+# SPDX-License-Identifier: GPL-2.0
+#
+# Run filesystem operations tests on an 1 MiB disk image that is formatted with
+# a vfat filesystem and mounted in a temporary directory using a loop device.
+#
+# Copyright 2022 Red Hat Inc.
+# Author: Javier Martinez Canillas <javierm@redhat.com>
+
+set -e
+set -u
+set -o pipefail
+
+BASE_DIR="$(dirname "$0")"	# quote $0: the script path may contain spaces
+TMP_DIR="$(mktemp -d /tmp/fat_tests_tmp.XXXX)"
+IMG_PATH="${TMP_DIR}/fat.img"
+MNT_PATH="${TMP_DIR}/mnt"
+
+cleanup()
+{
+    mountpoint -q "${MNT_PATH}" && unmount_image
+    rm -rf "${TMP_DIR}"
+}
+trap cleanup SIGINT SIGTERM EXIT
+
+create_loopback()
+{
+    touch "${IMG_PATH}"
+    chattr +C "${IMG_PATH}" >/dev/null 2>&1 || true
+
+    truncate -s 1M "${IMG_PATH}"
+    mkfs.vfat "${IMG_PATH}" >/dev/null 2>&1
+}
+
+mount_image()
+{
+    mkdir -p "${MNT_PATH}"
+    sudo mount -o loop "${IMG_PATH}" "${MNT_PATH}"
+}
+
+rename_exchange_test()
+{
+    local rename_exchange="${BASE_DIR}/rename_exchange"
+    local old_path="${MNT_PATH}/old_file"
+    local new_path="${MNT_PATH}/new_file"
+
+    echo old | sudo tee "${old_path}" >/dev/null 2>&1
+    echo new | sudo tee "${new_path}" >/dev/null 2>&1
+    sudo "${rename_exchange}" "${old_path}" "${new_path}" >/dev/null 2>&1
+    sudo sync -f "${MNT_PATH}"
+    grep new "${old_path}" >/dev/null 2>&1
+    grep old "${new_path}" >/dev/null 2>&1
+}
+
+rename_exchange_subdir_test()
+{
+    local rename_exchange="${BASE_DIR}/rename_exchange"
+    local dir_path="${MNT_PATH}/subdir"
+    local old_path="${MNT_PATH}/old_file"
+    local new_path="${dir_path}/new_file"
+
+    sudo mkdir -p "${dir_path}"
+    echo old | sudo tee "${old_path}" >/dev/null 2>&1
+    echo new | sudo tee "${new_path}" >/dev/null 2>&1
+    sudo "${rename_exchange}" "${old_path}" "${new_path}" >/dev/null 2>&1
+    sudo sync -f "${MNT_PATH}"
+    grep new "${old_path}" >/dev/null 2>&1
+    grep old "${new_path}" >/dev/null 2>&1
+}
+
+unmount_image()
+{
+    sudo umount "${MNT_PATH}" &> /dev/null
+}
+
+create_loopback
+mount_image
+rename_exchange_test
+rename_exchange_subdir_test
+unmount_image
+
+exit 0

From f858e23a29740757fe1ca602cb1f57845034b1c5 Mon Sep 17 00:00:00 2001
From: Antonio Borneo <borneo.antonio@gmail.com>
Date: Mon, 13 Jun 2022 12:00:55 +0200
Subject: [PATCH 19/72] checkpatch: fix incorrect camelcase detection on
 numeric constant

The code fragment below

	int foo(int *array, int index)
	{
		return array[index & 0xFF];
	}

triggers an incorrect camelcase detection by checking a substring of the
hex constant:

	CHECK: Avoid CamelCase: <xFF>
	#3: FILE: test.c:3:
	+	return array[index & 0xFF];

This is caused by passing the whole string "array[index & 0xFF]" to the
inner loop that iterates over a "$Ident" match.  The numeric constant is
not a $Ident as it doesn't start with [A-Za-z_] and should be excluded
from the match.

Similar issue can be detected with other constants like "1uL", "0xffffU".

Force the match to start at word boundary so the $Ident will be properly
checked starting from its first char and the constants will be
filtered-out.

Link: https://lkml.kernel.org/r/20220613100055.77821-1-borneo.antonio@gmail.com
Signed-off-by: Antonio Borneo <borneo.antonio@gmail.com>
Cc: Joe Perches <joe@perches.com>
Cc: Andy Whitcroft <apw@canonical.com>
Cc: Dwaipayan Ray <dwaipayanray1@gmail.com>
Cc: Lukas Bulwahn <lukas.bulwahn@gmail.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 scripts/checkpatch.pl | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/scripts/checkpatch.pl b/scripts/checkpatch.pl
index 205bf5055acffb..79e759aac543b8 100755
--- a/scripts/checkpatch.pl
+++ b/scripts/checkpatch.pl
@@ -5721,7 +5721,7 @@ sub process {
 			    $var !~ /^(?:[a-z0-9_]*|[A-Z0-9_]*)?_?[a-z][A-Z](?:_[a-z0-9_]+|_[A-Z0-9_]+)?$/ &&
 #Ignore some three character SI units explicitly, like MiB and KHz
 			    $var !~ /^(?:[a-z_]*?)_?(?:[KMGT]iB|[KMGT]?Hz)(?:_[a-z_]+)?$/) {
-				while ($var =~ m{($Ident)}g) {
+				while ($var =~ m{\b($Ident)}g) {
 					my $word = $1;
 					next if ($word !~ /[A-Z][a-z]|[a-z][A-Z]/);
 					if ($check) {

From 00c9d5632277b21ba8802e26c27254cd9d0dfa13 Mon Sep 17 00:00:00 2001
From: wuchi <wuchi.zero@gmail.com>
Date: Sun, 12 Jun 2022 13:20:15 +0800
Subject: [PATCH 20/72] lib/error-inject: convert to DEFINE_SEQ_ATTRIBUTE

Use DEFINE_SEQ_ATTRIBUTE helper macro to simplify the code.

Link: https://lkml.kernel.org/r/20220612052015.23283-1-wuchi.zero@gmail.com
Signed-off-by: wuchi <wuchi.zero@gmail.com>
Cc: Masami Hiramatsu (Google) <mhiramat@kernel.org>
Cc: Martin KaFai Lau <kafai@fb.com>
Cc: Song Liu <songliubraving@fb.com>
Cc: Yonghong Song <yhs@fb.com>
Cc: John Fastabend <john.fastabend@gmail.com>
Cc: KP Singh <kpsingh@kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 lib/error-inject.c | 16 +++-------------
 1 file changed, 3 insertions(+), 13 deletions(-)

diff --git a/lib/error-inject.c b/lib/error-inject.c
index 2ff5ef689d7271..4a4f1278c41916 100644
--- a/lib/error-inject.c
+++ b/lib/error-inject.c
@@ -197,24 +197,14 @@ static int ei_seq_show(struct seq_file *m, void *v)
 	return 0;
 }
 
-static const struct seq_operations ei_seq_ops = {
+static const struct seq_operations ei_sops = {
 	.start = ei_seq_start,
 	.next  = ei_seq_next,
 	.stop  = ei_seq_stop,
 	.show  = ei_seq_show,
 };
 
-static int ei_open(struct inode *inode, struct file *filp)
-{
-	return seq_open(filp, &ei_seq_ops);
-}
-
-static const struct file_operations debugfs_ei_ops = {
-	.open           = ei_open,
-	.read           = seq_read,
-	.llseek         = seq_lseek,
-	.release        = seq_release,
-};
+DEFINE_SEQ_ATTRIBUTE(ei);
 
 static int __init ei_debugfs_init(void)
 {
@@ -224,7 +214,7 @@ static int __init ei_debugfs_init(void)
 	if (!dir)
 		return -ENOMEM;
 
-	file = debugfs_create_file("list", 0444, dir, NULL, &debugfs_ei_ops);
+	file = debugfs_create_file("list", 0444, dir, NULL, &ei_fops);
 	if (!file) {
 		debugfs_remove(dir);
 		return -ENOMEM;

From 5a704629f2c1ba33bbb444cb18e6957e97c76e8f Mon Sep 17 00:00:00 2001
From: Dan Moulding <dmoulding@me.com>
Date: Sun, 17 Jul 2022 17:31:37 -0700
Subject: [PATCH 21/72] init: add "hostname" kernel parameter

The gethostname system call returns the hostname for the current machine.
However, the kernel has no mechanism to initially set the current
machine's name in such a way as to guarantee that the first userspace
process to call gethostname will receive a meaningful result.  It relies
on some unspecified userspace process to first call sethostname before
gethostname can produce a meaningful name.

Traditionally the machine's hostname is set from userspace by the init
system.  The init system, in turn, often relies on a configuration file
(say, /etc/hostname) to provide the value that it will supply in the call
to sethostname.  Consequently, the file system containing /etc/hostname
usually must be available before the hostname will be set.  There may,
however, be earlier userspace processes that could call gethostname before
the file system containing /etc/hostname is mounted.  Such a process will
get some other, likely meaningless, name from gethostname (such as
"(none)", "localhost", or "darkstar").

A real-world example where this can happen, and lead to undesirable
results, is with mdadm.  When assembling arrays, mdadm distinguishes
between "local" arrays and "foreign" arrays.  A local array is one that
properly belongs to the current machine, and a foreign array is one that
is (possibly temporarily) attached to the current machine, but properly
belongs to some other machine.  To determine if an array is local or
foreign, mdadm may compare the "homehost" recorded on the array with the
current hostname.  If mdadm is run before the root file system is mounted,
perhaps because the root file system itself resides on an md-raid array,
then /etc/hostname isn't yet available and the init system will not yet
have called sethostname, causing mdadm to incorrectly conclude that all of
the local arrays are foreign.

Solving this problem *could* be delegated to the init system.  It could be
left up to the init system (including any init system that starts within
an initramfs, if one is in use) to ensure that sethostname is called
before any other userspace process could possibly call gethostname.
However, it may not always be obvious which processes could call
gethostname (for example, udev itself might not call gethostname, but it
could via udev rules invoke processes that do).  Additionally, the init
system has to ensure that the hostname configuration value is stored in
some place where it will be readily accessible during early boot.
Unfortunately, every init system will attempt to (or has already attempted
to) solve this problem in a different, possibly incorrect, way.  This
makes getting consistently working configurations harder for users.

I believe it is better for the kernel to provide the means by which the
hostname may be set early, rather than making this a problem for the init
system to solve.  The option to set the hostname during early startup, via
a kernel parameter, provides a simple, reliable way to solve this problem.
It also could make system configuration easier for some embedded systems.

[dmoulding@me.com: v2]
  Link: https://lkml.kernel.org/r/20220506060310.7495-2-dmoulding@me.com
Link: https://lkml.kernel.org/r/20220505180651.22849-2-dmoulding@me.com
Signed-off-by: Dan Moulding <dmoulding@me.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Jonathan Corbet <corbet@lwn.net>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 Documentation/admin-guide/kernel-parameters.txt | 13 +++++++++++++
 init/version.c                                  | 17 +++++++++++++++++
 2 files changed, 30 insertions(+)

diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt
index 2522b11e593f23..2c9c0229b77071 100644
--- a/Documentation/admin-guide/kernel-parameters.txt
+++ b/Documentation/admin-guide/kernel-parameters.txt
@@ -1667,6 +1667,19 @@
 
 	hlt		[BUGS=ARM,SH]
 
+	hostname=	[KNL] Set the hostname (aka UTS nodename).
+			Format: <string>
+			This allows setting the system's hostname during early
+			startup. This sets the name returned by gethostname.
+			Using this parameter to set the hostname makes it
+			possible to ensure the hostname is correctly set before
+			any userspace processes run, avoiding the possibility
+			that a process may call gethostname before the hostname
+			has been explicitly set, resulting in the calling
+			process getting an incorrect result. The string must
+			not exceed the maximum allowed hostname length (usually
+			64 characters) and will be truncated otherwise.
+
 	hpet=		[X86-32,HPET] option to control HPET usage
 			Format: { enable (default) | disable | force |
 				verbose }
diff --git a/init/version.c b/init/version.c
index 1a356f5493e853..b7f9559d417c74 100644
--- a/init/version.c
+++ b/init/version.c
@@ -11,6 +11,8 @@
 #include <linux/build-salt.h>
 #include <linux/elfnote-lto.h>
 #include <linux/export.h>
+#include <linux/init.h>
+#include <linux/printk.h>
 #include <linux/uts.h>
 #include <linux/utsname.h>
 #include <generated/utsrelease.h>
@@ -35,6 +37,21 @@ struct uts_namespace init_uts_ns = {
 };
 EXPORT_SYMBOL_GPL(init_uts_ns);
 
+static int __init early_hostname(char *arg)
+{
+	size_t bufsize = sizeof(init_uts_ns.name.nodename);
+	size_t maxlen  = bufsize - 1;	/* longest name that fits with its NUL */
+	size_t arglen;			/* length strlcpy() reports for arg */
+
+	arglen = strlcpy(init_uts_ns.name.nodename, arg, bufsize);
+	if (arglen > maxlen) {
+		pr_warn("hostname parameter exceeds %zu characters and will be truncated\n",
+			maxlen);
+	}
+	return 0;
+}
+early_param("hostname", early_hostname);
+
 /* FIXED STRINGS! Don't touch! */
 const char linux_banner[] =
 	"Linux version " UTS_RELEASE " (" LINUX_COMPILE_BY "@"

From 5a66fce95b72e6359527415b33a7ae13f0d6b7eb Mon Sep 17 00:00:00 2001
From: wuchi <wuchi.zero@gmail.com>
Date: Sat, 18 Jun 2022 16:25:21 +0800
Subject: [PATCH 22/72] lib/lru_cache: fix error free handling in lc_create

When kmem_cache_alloc() fails in lc_create(), the elements that were
already allocated have to be freed.  The existing kmem_cache_free() loop
gets this wrong:
  i = 0  ==> "i--" makes i non-zero, so the loop runs although nothing
             was allocated
  i > 0  ==> the loop stops before index 0, so element[0] is never freed

Link: https://lkml.kernel.org/r/20220618082521.7082-1-wuchi.zero@gmail.com
Signed-off-by: wuchi <wuchi.zero@gmail.com>
Cc: Philipp Reisner <philipp.reisner@linbit.com>
Cc: Lars Ellenberg <lars.ellenberg@linbit.com>
Cc: Christoph Böhmwalder <christoph.boehmwalder@linbit.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 lib/lru_cache.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/lib/lru_cache.c b/lib/lru_cache.c
index 52313acbfa6284..dc35464216d3cc 100644
--- a/lib/lru_cache.c
+++ b/lib/lru_cache.c
@@ -147,8 +147,8 @@ struct lru_cache *lc_create(const char *name, struct kmem_cache *cache,
 		return lc;
 
 	/* else: could not allocate all elements, give up */
-	for (i--; i; i--) {
-		void *p = element[i];
+	while (i) {
+		void *p = element[--i];
 		kmem_cache_free(cache, p - e_off);
 	}
 	kfree(lc);

From 62df90b53e6f332bb69b73621998826c49a17323 Mon Sep 17 00:00:00 2001
From: wuchi <wuchi.zero@gmail.com>
Date: Sun, 19 Jun 2022 15:46:41 +0800
Subject: [PATCH 23/72] net, lib/once: remove {net_}get_random_once_wait macro

DO_ONCE(func, ...) calls func while holding a spinlock that is acquired
with spin_lock_irqsave() in __do_once_start().  However,
get_random_once_wait() can sleep in get_random_bytes_wait ->
wait_for_random_bytes.

Fortunately, {net_}get_random_once_wait has no users, so we can simply
remove both macros.

Link: https://lkml.kernel.org/r/20220619074641.40916-1-wuchi.zero@gmail.com
Signed-off-by: wuchi <wuchi.zero@gmail.com>
Acked-by: Jakub Kicinski <kuba@kernel.org>
Cc: David S. Miller <davem@davemloft.net>
Cc: Eric Dumazet <edumazet@google.com>
Cc: Paolo Abeni <pabeni@redhat.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/net.h  | 2 --
 include/linux/once.h | 2 --
 2 files changed, 4 deletions(-)

diff --git a/include/linux/net.h b/include/linux/net.h
index 12093f4db50c42..8613772a1f580e 100644
--- a/include/linux/net.h
+++ b/include/linux/net.h
@@ -303,8 +303,6 @@ do {									\
 
 #define net_get_random_once(buf, nbytes)			\
 	get_random_once((buf), (nbytes))
-#define net_get_random_once_wait(buf, nbytes)			\
-	get_random_once_wait((buf), (nbytes))
 
 /*
  * E.g. XFS meta- & log-data is in slab pages, or bcache meta
diff --git a/include/linux/once.h b/include/linux/once.h
index f54523052bbcb5..b14d8b309d52b1 100644
--- a/include/linux/once.h
+++ b/include/linux/once.h
@@ -54,7 +54,5 @@ void __do_once_done(bool *done, struct static_key_true *once_key,
 
 #define get_random_once(buf, nbytes)					     \
 	DO_ONCE(get_random_bytes, (buf), (nbytes))
-#define get_random_once_wait(buf, nbytes)                                    \
-	DO_ONCE(get_random_bytes_wait, (buf), (nbytes))                      \
 
 #endif /* _LINUX_ONCE_H */

From f9987921cb541b1187a648141a9048547ea89ffb Mon Sep 17 00:00:00 2001
From: Vlastimil Babka <vbabka@suse.cz>
Date: Mon, 20 Jun 2022 17:02:49 +0200
Subject: [PATCH 24/72] lib/stackdepot: replace CONFIG_STACK_HASH_ORDER with
 automatic sizing

As Linus explained [1], setting the stackdepot hash table size as a config
option is suboptimal, especially as stackdepot becomes a dependency of
less "expert" subsystems than initially (e.g.  DRM, networking,
SLUB_DEBUG):

: (a) it introduces a new compile-time question that isn't sane to ask
: a regular user, but is now exposed to regular users.

: (b) this by default uses 1MB of memory for a feature that didn't in
: the past, so now if you have small machines you need to make sure you
: make a special kernel config for them.

Ideally we would employ rhashtable for fully automatic resizing, which
should be feasible for many of the new users, but problematic for the
original users with restricted context that call __stack_depot_save() with
can_alloc == false, i.e.  KASAN.

However we can easily remove the config option and scale the hash table
automatically with system memory.  The STACK_HASH_MASK constant becomes
stack_hash_mask variable and is used only in one mask operation, so the
overhead should be negligible to none.  For early allocation we can employ
the existing alloc_large_system_hash() function and perform similar
scaling for the late allocation.

The existing limits of the config option (between 4k and 1M buckets) are
preserved, and scaling factor is set to one bucket per 16kB memory so on
64bit the max 1M buckets (8MB memory) is achieved with 16GB system, while
a 1GB system will use 512kB.

Because KASAN is reported to need the maximum number of buckets even with
smaller amounts of memory [2], set it as such when kasan_enabled().

If needed, the automatic scaling could be complemented with a boot-time
kernel parameter, but it feels pointless to add it without a specific use
case.

[1] https://lore.kernel.org/all/CAHk-=wjC5nS+fnf6EzRD9yQRJApAhxx7gRB87ZV+pAWo9oVrTg@mail.gmail.com/
[2] https://lore.kernel.org/all/CACT4Y+Y4GZfXOru2z5tFPzFdaSUd+GFc6KVL=bsa0+1m197cQQ@mail.gmail.com/

Link: https://lkml.kernel.org/r/20220620150249.16814-1-vbabka@suse.cz
Signed-off-by: Vlastimil Babka <vbabka@suse.cz>
Reported-by: Linus Torvalds <torvalds@linux-foundation.org>
Acked-by: Dmitry Vyukov <dvyukov@google.com>
Cc: Marco Elver <elver@google.com>
Cc: Alexander Potapenko <glider@google.com>
Cc: Andrey Konovalov <andreyknvl@gmail.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 lib/Kconfig      |  9 --------
 lib/stackdepot.c | 59 ++++++++++++++++++++++++++++++++++++++++--------
 2 files changed, 49 insertions(+), 19 deletions(-)

diff --git a/lib/Kconfig b/lib/Kconfig
index eaaad4d85bf24b..986ea474836c3a 100644
--- a/lib/Kconfig
+++ b/lib/Kconfig
@@ -685,15 +685,6 @@ config STACKDEPOT_ALWAYS_INIT
 	bool
 	select STACKDEPOT
 
-config STACK_HASH_ORDER
-	int "stack depot hash size (12 => 4KB, 20 => 1024KB)"
-	range 12 20
-	default 20
-	depends on STACKDEPOT
-	help
-	 Select the hash size as a power of 2 for the stackdepot hash table.
-	 Choose a lower value to reduce the memory impact.
-
 config REF_TRACKER
 	bool
 	depends on STACKTRACE_SUPPORT
diff --git a/lib/stackdepot.c b/lib/stackdepot.c
index 5ca0d086ef4a3a..e73fda23388d8c 100644
--- a/lib/stackdepot.c
+++ b/lib/stackdepot.c
@@ -32,6 +32,7 @@
 #include <linux/string.h>
 #include <linux/types.h>
 #include <linux/memblock.h>
+#include <linux/kasan-enabled.h>
 
 #define DEPOT_STACK_BITS (sizeof(depot_stack_handle_t) * 8)
 
@@ -145,10 +146,16 @@ depot_alloc_stack(unsigned long *entries, int size, u32 hash, void **prealloc)
 	return stack;
 }
 
-#define STACK_HASH_SIZE (1L << CONFIG_STACK_HASH_ORDER)
-#define STACK_HASH_MASK (STACK_HASH_SIZE - 1)
+/* one hash table bucket entry per 16kB of memory */
+#define STACK_HASH_SCALE	14
+/* limited between 4k and 1M buckets */
+#define STACK_HASH_ORDER_MIN	12
+#define STACK_HASH_ORDER_MAX	20
 #define STACK_HASH_SEED 0x9747b28c
 
+static unsigned int stack_hash_order;
+static unsigned int stack_hash_mask;
+
 static bool stack_depot_disable;
 static struct stack_record **stack_table;
 
@@ -175,7 +182,7 @@ void __init stack_depot_want_early_init(void)
 
 int __init stack_depot_early_init(void)
 {
-	size_t size;
+	unsigned long entries = 0;
 
 	/* This is supposed to be called only once, from mm_init() */
 	if (WARN_ON(__stack_depot_early_init_passed))
@@ -183,13 +190,23 @@ int __init stack_depot_early_init(void)
 
 	__stack_depot_early_init_passed = true;
 
+	if (kasan_enabled() && !stack_hash_order)
+		stack_hash_order = STACK_HASH_ORDER_MAX;
+
 	if (!__stack_depot_want_early_init || stack_depot_disable)
 		return 0;
 
-	size = (STACK_HASH_SIZE * sizeof(struct stack_record *));
-	pr_info("Stack Depot early init allocating hash table with memblock_alloc, %zu bytes\n",
-		size);
-	stack_table = memblock_alloc(size, SMP_CACHE_BYTES);
+	if (stack_hash_order)
+		entries = 1UL <<  stack_hash_order;
+	stack_table = alloc_large_system_hash("stackdepot",
+						sizeof(struct stack_record *),
+						entries,
+						STACK_HASH_SCALE,
+						HASH_EARLY | HASH_ZERO,
+						NULL,
+						&stack_hash_mask,
+						1UL << STACK_HASH_ORDER_MIN,
+						1UL << STACK_HASH_ORDER_MAX);
 
 	if (!stack_table) {
 		pr_err("Stack Depot hash table allocation failed, disabling\n");
@@ -207,13 +224,35 @@ int stack_depot_init(void)
 
 	mutex_lock(&stack_depot_init_mutex);
 	if (!stack_depot_disable && !stack_table) {
-		pr_info("Stack Depot allocating hash table with kvcalloc\n");
-		stack_table = kvcalloc(STACK_HASH_SIZE, sizeof(struct stack_record *), GFP_KERNEL);
+		unsigned long entries;
+		int scale = STACK_HASH_SCALE;
+
+		if (stack_hash_order) {
+			entries = 1UL << stack_hash_order;
+		} else {
+			entries = nr_free_buffer_pages();
+			entries = roundup_pow_of_two(entries);
+
+			if (scale > PAGE_SHIFT)
+				entries >>= (scale - PAGE_SHIFT);
+			else
+				entries <<= (PAGE_SHIFT - scale);
+		}
+
+		if (entries < 1UL << STACK_HASH_ORDER_MIN)
+			entries = 1UL << STACK_HASH_ORDER_MIN;
+		if (entries > 1UL << STACK_HASH_ORDER_MAX)
+			entries = 1UL << STACK_HASH_ORDER_MAX;
+
+		pr_info("Stack Depot allocating hash table of %lu entries with kvcalloc\n",
+				entries);
+		stack_table = kvcalloc(entries, sizeof(struct stack_record *), GFP_KERNEL);
 		if (!stack_table) {
 			pr_err("Stack Depot hash table allocation failed, disabling\n");
 			stack_depot_disable = true;
 			ret = -ENOMEM;
 		}
+		stack_hash_mask = entries - 1;
 	}
 	mutex_unlock(&stack_depot_init_mutex);
 	return ret;
@@ -386,7 +425,7 @@ depot_stack_handle_t __stack_depot_save(unsigned long *entries,
 		goto fast_exit;
 
 	hash = hash_stack(entries, nr_entries);
-	bucket = &stack_table[hash & STACK_HASH_MASK];
+	bucket = &stack_table[hash & stack_hash_mask];
 
 	/*
 	 * Fast path: look the stack trace up without locking.

From 86e5908ec293bf6505a59d02542da006226bcaa7 Mon Sep 17 00:00:00 2001
From: wuchi <wuchi.zero@gmail.com>
Date: Mon, 20 Jun 2022 18:02:44 +0800
Subject: [PATCH 25/72] lib/error-inject: traverse list with mutex

Traversing the list without holding ei_mutex in
get_injectable_error_type() races with the following code:
    list_del_init(&ent->list)
    kfree(ent)
in module_unload_ei_list().  Take ei_mutex around the traversal.

Link: https://lkml.kernel.org/r/20220620100244.82896-1-wuchi.zero@gmail.com
Signed-off-by: wuchi <wuchi.zero@gmail.com>
Cc: Masami Hiramatsu (Google) <mhiramat@kernel.org>
Cc: Martin KaFai Lau <kafai@fb.com>
Cc: Song Liu <songliubraving@fb.com>
Cc: Yonghong Song <yhs@fb.com>
Cc: John Fastabend <john.fastabend@gmail.com>
Cc: KP Singh <kpsingh@kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 lib/error-inject.c | 12 +++++++++---
 1 file changed, 9 insertions(+), 3 deletions(-)

diff --git a/lib/error-inject.c b/lib/error-inject.c
index 4a4f1278c41916..1afca1b1cdead0 100644
--- a/lib/error-inject.c
+++ b/lib/error-inject.c
@@ -40,12 +40,18 @@ bool within_error_injection_list(unsigned long addr)
 int get_injectable_error_type(unsigned long addr)
 {
 	struct ei_entry *ent;
+	int ei_type = EI_ETYPE_NONE;
 
+	mutex_lock(&ei_mutex);
 	list_for_each_entry(ent, &error_injection_list, list) {
-		if (addr >= ent->start_addr && addr < ent->end_addr)
-			return ent->etype;
+		if (addr >= ent->start_addr && addr < ent->end_addr) {
+			ei_type = ent->etype;
+			break;
+		}
 	}
-	return EI_ETYPE_NONE;
+	mutex_unlock(&ei_mutex);
+
+	return ei_type;
 }
 
 /*

From 43c249ea0b1e10baac4a1264a25d69723ce5d2c2 Mon Sep 17 00:00:00 2001
From: Uros Bizjak <ubizjak@gmail.com>
Date: Fri, 24 Jun 2022 16:14:12 +0200
Subject: [PATCH 26/72] compiler-gcc.h: remove ancient workaround for gcc PR
 58670

The workaround for 'asm goto' miscompilation introduces a compiler barrier
quirk that inhibits many useful compiler optimizations.  For example,
__try_cmpxchg_user compiles to:

   11375:	41 8b 4d 00          	mov    0x0(%r13),%ecx
   11379:	41 8b 02             	mov    (%r10),%eax
   1137c:	f0 0f b1 0a          	lock cmpxchg %ecx,(%rdx)
   11380:	0f 94 c2             	sete   %dl
   11383:	84 d2                	test   %dl,%dl
   11385:	75 c4                	jne    1134b <...>
   11387:	41 89 02             	mov    %eax,(%r10)

where the barrier inhibits flags propagation from asm when compiled with
gcc-12.

When the mentioned quirk is removed, the following code is generated:

   11553:	41 8b 4d 00          	mov    0x0(%r13),%ecx
   11557:	41 8b 02             	mov    (%r10),%eax
   1155a:	f0 0f b1 0a          	lock cmpxchg %ecx,(%rdx)
   1155e:	74 c9                	je     11529 <...>
   11560:	41 89 02             	mov    %eax,(%r10)

The referred compiler bug:

http://gcc.gnu.org/bugzilla/show_bug.cgi?id=58670

was fixed for gcc-4.8.2.

Current minimum required version of GCC is version 5.1 which has the above
'asm goto' miscompilation fixed, so remove the workaround.

Link: https://lkml.kernel.org/r/20220624141412.72274-1-ubizjak@gmail.com
Signed-off-by: Uros Bizjak <ubizjak@gmail.com>
Cc: Ingo Molnar <mingo@kernel.org>
Cc: "H. Peter Anvin" <hpa@zytor.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/compiler-gcc.h | 11 -----------
 1 file changed, 11 deletions(-)

diff --git a/include/linux/compiler-gcc.h b/include/linux/compiler-gcc.h
index a0c55eeaeaf163..9b157b71036f18 100644
--- a/include/linux/compiler-gcc.h
+++ b/include/linux/compiler-gcc.h
@@ -66,17 +66,6 @@
 		__builtin_unreachable();	\
 	} while (0)
 
-/*
- * GCC 'asm goto' miscompiles certain code sequences:
- *
- *   http://gcc.gnu.org/bugzilla/show_bug.cgi?id=58670
- *
- * Work it around via a compiler barrier quirk suggested by Jakub Jelinek.
- *
- * (asm goto is automatically volatile - the naming reflects this.)
- */
-#define asm_volatile_goto(x...)	do { asm goto(x); asm (""); } while (0)
-
 #if defined(CONFIG_ARCH_USE_BUILTIN_BSWAP)
 #define __HAVE_BUILTIN_BSWAP32__
 #define __HAVE_BUILTIN_BSWAP64__

From 045ed31e23aea840648c290dbde04797064960db Mon Sep 17 00:00:00 2001
From: Dan Carpenter <dan.carpenter@oracle.com>
Date: Fri, 24 Jun 2022 08:30:04 +0300
Subject: [PATCH 27/72] kfifo: fix kfifo_to_user() return type

The kfifo_to_user() macro is supposed to return zero for success or
negative error codes.  Unfortunately, there is a signedness bug so it
returns unsigned int.  This only affects callers which try to save the
result in ssize_t and as far as I can see the only place which does that
is line6_hwdep_read().

TL;DR: s/_uint/_int/.

Link: https://lkml.kernel.org/r/YrVL3OJVLlNhIMFs@kili
Fixes: 144ecf310eb5 ("kfifo: fix kfifo_alloc() to return a signed int value")
Signed-off-by: Dan Carpenter <dan.carpenter@oracle.com>
Cc: Stefani Seibold <stefani@seibold.net>
Cc: Randy Dunlap <randy.dunlap@oracle.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/kfifo.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/include/linux/kfifo.h b/include/linux/kfifo.h
index 86249476b57f43..0b35a41440ff13 100644
--- a/include/linux/kfifo.h
+++ b/include/linux/kfifo.h
@@ -688,7 +688,7 @@ __kfifo_uint_must_check_helper( \
  * writer, you don't need extra locking to use these macro.
  */
 #define	kfifo_to_user(fifo, to, len, copied) \
-__kfifo_uint_must_check_helper( \
+__kfifo_int_must_check_helper( \
 ({ \
 	typeof((fifo) + 1) __tmp = (fifo); \
 	void __user *__to = (to); \

From cda83bb8a61e6d7ce231dc1c2b78a9b79b1f1411 Mon Sep 17 00:00:00 2001
From: wuchi <wuchi.zero@gmail.com>
Date: Sat, 25 Jun 2022 21:53:24 +0800
Subject: [PATCH 28/72] lib/radix-tree: remove unused argument of
 insert_entries

insert_entries() doesn't use the 'bool replace' argument, and the function
is only used locally, remove the argument.

The historical context of the unused argument is as follows:

2: commit <3a08cd52c37c79> (radix tree: Remove multiorder support)
  Remove the code related to macro CONFIG_RADIX_TREE_MULTIORDER
to convert to the xArray.
  Without the macro, there is no need to retain the argument.

1: commit <175542f575723e> (radix-tree: add radix_tree_join)
  Add insert_entries(..., bool replace) function, depending on the
macro CONFIG_RADIX_TREE_MULTIORDER definition, the implementation
is different. Notice that the implementation without the macro doesn't
use the argument.

[Matthew Wilcox: add historical context for argument]

Link: https://lkml.kernel.org/r/20220625135324.72574-1-wuchi.zero@gmail.com
Signed-off-by: wuchi <wuchi.zero@gmail.com>
Cc: Matthew Wilcox <willy@infradead.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 lib/radix-tree.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/lib/radix-tree.c b/lib/radix-tree.c
index b3afafe46fffbc..3c78e1e8b2ad60 100644
--- a/lib/radix-tree.c
+++ b/lib/radix-tree.c
@@ -677,7 +677,7 @@ static void radix_tree_free_nodes(struct radix_tree_node *node)
 }
 
 static inline int insert_entries(struct radix_tree_node *node,
-		void __rcu **slot, void *item, bool replace)
+		void __rcu **slot, void *item)
 {
 	if (*slot)
 		return -EEXIST;
@@ -711,7 +711,7 @@ int radix_tree_insert(struct radix_tree_root *root, unsigned long index,
 	if (error)
 		return error;
 
-	error = insert_entries(node, slot, item, false);
+	error = insert_entries(node, slot, item);
 	if (error < 0)
 		return error;
 

From 2d8867f3e0833963dd4af64bed2fb47ed5cf55d8 Mon Sep 17 00:00:00 2001
From: Tiezhu Yang <yangtiezhu@loongson.cn>
Date: Mon, 27 Jun 2022 11:02:45 +0800
Subject: [PATCH 29/72] lib: make LZ4_decompress_safe_forceExtDict() static

LZ4_decompress_safe_forceExtDict() is only used in
lib/lz4/lz4_decompress.c, make it static to fix the build warning about
"no previous prototype" [1].

[1] https://lore.kernel.org/lkml/202206260948.akgsho1q-lkp@intel.com/

Link: https://lkml.kernel.org/r/1656298965-8698-1-git-send-email-yangtiezhu@loongson.cn
Signed-off-by: Tiezhu Yang <yangtiezhu@loongson.cn>
Reported-by: kernel test robot <lkp@intel.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 lib/lz4/lz4_decompress.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/lib/lz4/lz4_decompress.c b/lib/lz4/lz4_decompress.c
index fd1728d94babb2..59fe69a638000e 100644
--- a/lib/lz4/lz4_decompress.c
+++ b/lib/lz4/lz4_decompress.c
@@ -507,9 +507,9 @@ static int LZ4_decompress_safe_withSmallPrefix(const char *source, char *dest,
 				      (BYTE *)dest - prefixSize, NULL, 0);
 }
 
-int LZ4_decompress_safe_forceExtDict(const char *source, char *dest,
-				     int compressedSize, int maxOutputSize,
-				     const void *dictStart, size_t dictSize)
+static int LZ4_decompress_safe_forceExtDict(const char *source, char *dest,
+					    int compressedSize, int maxOutputSize,
+					    const void *dictStart, size_t dictSize)
 {
 	return LZ4_decompress_generic(source, dest,
 				      compressedSize, maxOutputSize,

From 6d529ea80b8a03401195506f45c052c4937545d5 Mon Sep 17 00:00:00 2001
From: wuchi <wuchi.zero@gmail.com>
Date: Wed, 29 Jun 2022 11:02:41 +0800
Subject: [PATCH 30/72] lib/scatterlist: use matched parameter type when
 calling __sg_free_table()

commit 4635873c561a ("scsi: lib/sg_pool.c: improve APIs for allocating sg
pool") changed @(bool)skip_first_chunk of __sg_free_table() to @(unsigned
int)nents_first_chunk, so use unsigned int type instead of bool type
(false -> 0) when calling the function in sg_free_append_table() and
sg_free_table().

Link: https://lkml.kernel.org/r/20220629030241.84559-1-wuchi.zero@gmail.com
Signed-off-by: wuchi <wuchi.zero@gmail.com>
Reviewed-by: Ming Lei <ming.lei@redhat.com>
Cc: Maor Gottlieb <maorg@nvidia.com>
Cc: Christoph Hellwig <hch@lst.de>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 lib/scatterlist.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/lib/scatterlist.c b/lib/scatterlist.c
index d5e82e4a57ad06..c8c3d675845c37 100644
--- a/lib/scatterlist.c
+++ b/lib/scatterlist.c
@@ -240,7 +240,7 @@ EXPORT_SYMBOL(__sg_free_table);
  **/
 void sg_free_append_table(struct sg_append_table *table)
 {
-	__sg_free_table(&table->sgt, SG_MAX_SINGLE_ALLOC, false, sg_kfree,
+	__sg_free_table(&table->sgt, SG_MAX_SINGLE_ALLOC, 0, sg_kfree,
 			table->total_nents);
 }
 EXPORT_SYMBOL(sg_free_append_table);
@@ -253,7 +253,7 @@ EXPORT_SYMBOL(sg_free_append_table);
  **/
 void sg_free_table(struct sg_table *table)
 {
-	__sg_free_table(table, SG_MAX_SINGLE_ALLOC, false, sg_kfree,
+	__sg_free_table(table, SG_MAX_SINGLE_ALLOC, 0, sg_kfree,
 			table->orig_nents);
 }
 EXPORT_SYMBOL(sg_free_table);

From 4a70ce5f93aaeb0aa81f29c4a3c70f39d8f21087 Mon Sep 17 00:00:00 2001
From: Colin Ian King <colin.i.king@gmail.com>
Date: Mon, 4 Jul 2022 22:53:25 +0100
Subject: [PATCH 31/72] lib/ts_bm.c: remove redundant store to variable
 consumed after addition

There is no need to store the result of the addition back to variable
consumed after the addition.  The store is redundant, replace += with just
+

Cleans up clang scan build warning: lib/ts_bm.c:83:11: warning: Although
the value stored to 'consumed' is used in the enclosing expression, the
value is never actually read from 'consumed' [deadcode.DeadStores]

Link: https://lkml.kernel.org/r/20220704215325.600993-1-colin.i.king@gmail.com
Signed-off-by: Colin Ian King <colin.i.king@gmail.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 lib/ts_bm.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/lib/ts_bm.c b/lib/ts_bm.c
index 4cf250031f0f0a..1f2234221dd114 100644
--- a/lib/ts_bm.c
+++ b/lib/ts_bm.c
@@ -80,7 +80,7 @@ static unsigned int bm_find(struct ts_config *conf, struct ts_state *state)
 
 			/* London calling... */
 			DEBUGP("found!\n");
-			return consumed += (shift-(bm->patlen-1));
+			return consumed + (shift-(bm->patlen-1));
 
 next:			bs = bm->bad_shift[text[shift-i]];
 

From 71f8c15565d0f3d2f5b3339845e05cf4f03725cd Mon Sep 17 00:00:00 2001
From: Stephen Brennan <stephen.s.brennan@oracle.com>
Date: Mon, 16 May 2022 17:05:07 -0700
Subject: [PATCH 32/72] kallsyms: move declarations to internal header

Patch series "Expose kallsyms data in vmcoreinfo note".

The kernel can be configured to contain a lot of introspection or
debugging information built-in, such as ORC for unwinding stack traces,
BTF for type information, and of course kallsyms.  Debuggers could use
this information to navigate a core dump or live system, but they need to
be able to find it.

This patch series adds the necessary symbols into vmcoreinfo, which would
allow a debugger to find and interpret the kallsyms table.  Using the
kallsyms data, the debugger can then lookup any symbol, allowing it to
find ORC, BTF, or any other useful data.

This would allow a live kernel, or core dump, to be debugged without any
DWARF debuginfo.  This is useful for many cases: the debuginfo may not
have been generated, or you may not want to deploy the large files
everywhere you need them.

I've demonstrated a proof of concept for this at LSF/MM+BPF during a
lightning talk.  Using a work-in-progress branch of the drgn debugger, and
an extended set of BTF generated by a patched version of dwarves, I've
been able to open a core dump without any DWARF info and do basic tasks
such as enumerating slab caches, block devices, tasks, and doing
backtraces.  I hope this series can be a first step toward a new
possibility of "DWARFless debugging".

Related discussion around the BTF side of this:
https://lore.kernel.org/bpf/586a6288-704a-f7a7-b256-e18a675927df@oracle.com/T/#u

Some work-in-progress branches using this feature:
https://github.com/brenns10/dwarves/tree/remove_percpu_restriction_1
https://github.com/brenns10/drgn/tree/kallsyms_plus_btf


This patch (of 2):

To include kallsyms data in the vmcoreinfo note, we must make the symbol
declarations visible outside of kallsyms.c.  Move these to a new internal
header file.

Link: https://lkml.kernel.org/r/20220517000508.777145-1-stephen.s.brennan@oracle.com
Link: https://lkml.kernel.org/r/20220517000508.777145-2-stephen.s.brennan@oracle.com
Signed-off-by: Stephen Brennan <stephen.s.brennan@oracle.com>
Acked-by: Baoquan He <bhe@redhat.com>
Cc: Nick Desaulniers <ndesaulniers@google.com>
Cc: Dave Young <dyoung@redhat.com>
Cc: Kees Cook <keescook@chromium.org>
Cc: Jiri Olsa <jolsa@kernel.org>
Cc: Stephen Boyd <swboyd@chromium.org>
Cc: Bixuan Cui <cuibixuan@huawei.com>
Cc: David Vernet <void@manifault.com>
Cc: Vivek Goyal <vgoyal@redhat.com>
Cc: Sami Tolvanen <samitolvanen@google.com>
Cc: "Eric W. Biederman" <ebiederm@xmission.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 kernel/kallsyms.c          | 23 +----------------------
 kernel/kallsyms_internal.h | 30 ++++++++++++++++++++++++++++++
 2 files changed, 31 insertions(+), 22 deletions(-)
 create mode 100644 kernel/kallsyms_internal.h

diff --git a/kernel/kallsyms.c b/kernel/kallsyms.c
index fbdf8d3279aca3..510fba0ba5b4a5 100644
--- a/kernel/kallsyms.c
+++ b/kernel/kallsyms.c
@@ -31,28 +31,7 @@
 #include <linux/kernel.h>
 #include <linux/bsearch.h>
 
-/*
- * These will be re-linked against their real values
- * during the second link stage.
- */
-extern const unsigned long kallsyms_addresses[] __weak;
-extern const int kallsyms_offsets[] __weak;
-extern const u8 kallsyms_names[] __weak;
-
-/*
- * Tell the compiler that the count isn't in the small data section if the arch
- * has one (eg: FRV).
- */
-extern const unsigned int kallsyms_num_syms
-__section(".rodata") __attribute__((weak));
-
-extern const unsigned long kallsyms_relative_base
-__section(".rodata") __attribute__((weak));
-
-extern const char kallsyms_token_table[] __weak;
-extern const u16 kallsyms_token_index[] __weak;
-
-extern const unsigned int kallsyms_markers[] __weak;
+#include "kallsyms_internal.h"
 
 /*
  * Expand a compressed symbol data into the resulting uncompressed string,
diff --git a/kernel/kallsyms_internal.h b/kernel/kallsyms_internal.h
new file mode 100644
index 00000000000000..2d0c6f2f0243a2
--- /dev/null
+++ b/kernel/kallsyms_internal.h
@@ -0,0 +1,30 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+#ifndef LINUX_KALLSYMS_INTERNAL_H_
+#define LINUX_KALLSYMS_INTERNAL_H_
+
+#include <linux/types.h>
+
+/*
+ * These will be re-linked against their real values
+ * during the second link stage.
+ */
+extern const unsigned long kallsyms_addresses[] __weak;
+extern const int kallsyms_offsets[] __weak;
+extern const u8 kallsyms_names[] __weak;
+
+/*
+ * Tell the compiler that the count isn't in the small data section if the arch
+ * has one (eg: FRV).
+ */
+extern const unsigned int kallsyms_num_syms
+__section(".rodata") __attribute__((weak));
+
+extern const unsigned long kallsyms_relative_base
+__section(".rodata") __attribute__((weak));
+
+extern const char kallsyms_token_table[] __weak;
+extern const u16 kallsyms_token_index[] __weak;
+
+extern const unsigned int kallsyms_markers[] __weak;
+
+#endif // LINUX_KALLSYMS_INTERNAL_H_

From 5fd8fea935a1091083506d0b982fcc5d35062f06 Mon Sep 17 00:00:00 2001
From: Stephen Brennan <stephen.s.brennan@oracle.com>
Date: Mon, 16 May 2022 17:05:08 -0700
Subject: [PATCH 33/72] vmcoreinfo: include kallsyms symbols

The internal kallsyms tables contain information which could be quite
useful to a debugging tool in the absence of other debuginfo.  If kallsyms
is enabled, then a debugging tool could parse it and use it as a fallback
symbol table.  Combined with BTF data, live & post-mortem debuggers can
support basic operations without needing a large DWARF debuginfo file
available.  As many as five symbols are necessary to properly parse
kallsyms names and addresses.  Add these to the vmcoreinfo note.

CONFIG_KALLSYMS_ABSOLUTE_PERCPU does impact the computation of symbol
addresses.  However, a debugger can infer this configuration value by
comparing the address of _stext in the vmcoreinfo with the address
computed via kallsyms.  So there's no need to include information about
this config value in the vmcoreinfo note.

To verify that we're still well below the maximum of 4096 bytes, I created
a script[1] to compute a rough upper bound on the possible size of
vmcoreinfo.  On v5.18-rc7, the script reports 3106 bytes, and with this
patch, the maximum becomes 3370 bytes.

[1]: https://github.com/brenns10/kernel_stuff/blob/master/vmcoreinfosize/

Link: https://lkml.kernel.org/r/20220517000508.777145-3-stephen.s.brennan@oracle.com
Signed-off-by: Stephen Brennan <stephen.s.brennan@oracle.com>
Acked-by: Baoquan He <bhe@redhat.com>
Cc: Bixuan Cui <cuibixuan@huawei.com>
Cc: Dave Young <dyoung@redhat.com>
Cc: David Vernet <void@manifault.com>
Cc: "Eric W. Biederman" <ebiederm@xmission.com>
Cc: Jiri Olsa <jolsa@kernel.org>
Cc: Kees Cook <keescook@chromium.org>
Cc: Nick Desaulniers <ndesaulniers@google.com>
Cc: Sami Tolvanen <samitolvanen@google.com>
Cc: Stephen Boyd <swboyd@chromium.org>
Cc: Vivek Goyal <vgoyal@redhat.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 kernel/crash_core.c | 14 ++++++++++++++
 1 file changed, 14 insertions(+)

diff --git a/kernel/crash_core.c b/kernel/crash_core.c
index 71122e01623cc2..f64d35e2841198 100644
--- a/kernel/crash_core.c
+++ b/kernel/crash_core.c
@@ -15,6 +15,8 @@
 
 #include <crypto/sha1.h>
 
+#include "kallsyms_internal.h"
+
 /* vmcoreinfo stuff */
 unsigned char *vmcoreinfo_data;
 size_t vmcoreinfo_size;
@@ -480,6 +482,18 @@ static int __init crash_save_vmcoreinfo_init(void)
 	VMCOREINFO_NUMBER(PAGE_OFFLINE_MAPCOUNT_VALUE);
 #endif
 
+#ifdef CONFIG_KALLSYMS
+	VMCOREINFO_SYMBOL(kallsyms_names);
+	VMCOREINFO_SYMBOL(kallsyms_token_table);
+	VMCOREINFO_SYMBOL(kallsyms_token_index);
+#ifdef CONFIG_KALLSYMS_BASE_RELATIVE
+	VMCOREINFO_SYMBOL(kallsyms_offsets);
+	VMCOREINFO_SYMBOL(kallsyms_relative_base);
+#else
+	VMCOREINFO_SYMBOL(kallsyms_addresses);
+#endif /* CONFIG_KALLSYMS_BASE_RELATIVE */
+#endif /* CONFIG_KALLSYMS */
+
 	arch_crash_save_vmcoreinfo();
 	update_vmcoreinfo_note();
 

From 376b0c266143a1dda162db6d5bc9b3a7f0ae97c9 Mon Sep 17 00:00:00 2001
From: Alexey Dobriyan <adobriyan@gmail.com>
Date: Wed, 15 Jun 2022 14:22:06 +0300
Subject: [PATCH 34/72] proc: delete unused <linux/uaccess.h> includes

Those aren't necessary after seq files won.

Link: https://lkml.kernel.org/r/YqnA3mS7KBt8Z4If@localhost.localdomain
Signed-off-by: Alexey Dobriyan <adobriyan@gmail.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 fs/proc/array.c    | 1 -
 fs/proc/inode.c    | 2 --
 fs/proc/kmsg.c     | 1 -
 fs/proc/nommu.c    | 1 -
 fs/proc/proc_net.c | 3 ---
 fs/proc/proc_tty.c | 2 --
 fs/proc/root.c     | 3 ---
 fs/proc/vmcore.c   | 1 -
 8 files changed, 14 deletions(-)

diff --git a/fs/proc/array.c b/fs/proc/array.c
index eb815759842ce4..65fa603422e04d 100644
--- a/fs/proc/array.c
+++ b/fs/proc/array.c
@@ -69,7 +69,6 @@
 #include <linux/sched/cputime.h>
 #include <linux/proc_fs.h>
 #include <linux/ioport.h>
-#include <linux/uaccess.h>
 #include <linux/io.h>
 #include <linux/mm.h>
 #include <linux/hugetlb.h>
diff --git a/fs/proc/inode.c b/fs/proc/inode.c
index 73aeb4e6d32e51..fd40d60169b5a2 100644
--- a/fs/proc/inode.c
+++ b/fs/proc/inode.c
@@ -26,8 +26,6 @@
 #include <linux/mount.h>
 #include <linux/bug.h>
 
-#include <linux/uaccess.h>
-
 #include "internal.h"
 
 static void proc_evict_inode(struct inode *inode)
diff --git a/fs/proc/kmsg.c b/fs/proc/kmsg.c
index b38ad552887fb5..592e6dc7c1102b 100644
--- a/fs/proc/kmsg.c
+++ b/fs/proc/kmsg.c
@@ -15,7 +15,6 @@
 #include <linux/fs.h>
 #include <linux/syslog.h>
 
-#include <linux/uaccess.h>
 #include <asm/io.h>
 
 extern wait_queue_head_t log_wait;
diff --git a/fs/proc/nommu.c b/fs/proc/nommu.c
index 13452b32e2bd57..4d3493579458f0 100644
--- a/fs/proc/nommu.c
+++ b/fs/proc/nommu.c
@@ -21,7 +21,6 @@
 #include <linux/seq_file.h>
 #include <linux/hugetlb.h>
 #include <linux/vmalloc.h>
-#include <linux/uaccess.h>
 #include <asm/tlb.h>
 #include <asm/div64.h>
 #include "internal.h"
diff --git a/fs/proc/proc_net.c b/fs/proc/proc_net.c
index 913e5acefbb66b..bbce6fbe779c8c 100644
--- a/fs/proc/proc_net.c
+++ b/fs/proc/proc_net.c
@@ -8,9 +8,6 @@
  *
  *  proc net directory handling functions
  */
-
-#include <linux/uaccess.h>
-
 #include <linux/errno.h>
 #include <linux/time.h>
 #include <linux/proc_fs.h>
diff --git a/fs/proc/proc_tty.c b/fs/proc/proc_tty.c
index c69ff191e5d8f5..5c6a5ceab2f1b7 100644
--- a/fs/proc/proc_tty.c
+++ b/fs/proc/proc_tty.c
@@ -4,8 +4,6 @@
  *
  * Copyright 1997, Theodore Ts'o
  */
-
-#include <linux/uaccess.h>
 #include <linux/module.h>
 #include <linux/init.h>
 #include <linux/errno.h>
diff --git a/fs/proc/root.c b/fs/proc/root.c
index c7e3b1350ef84f..5a7d15d197f8e0 100644
--- a/fs/proc/root.c
+++ b/fs/proc/root.c
@@ -6,9 +6,6 @@
  *
  *  proc root directory handling functions
  */
-
-#include <linux/uaccess.h>
-
 #include <linux/errno.h>
 #include <linux/time.h>
 #include <linux/proc_fs.h>
diff --git a/fs/proc/vmcore.c b/fs/proc/vmcore.c
index 4eaeb645e75966..f2aa86c421f2d4 100644
--- a/fs/proc/vmcore.c
+++ b/fs/proc/vmcore.c
@@ -25,7 +25,6 @@
 #include <linux/mutex.h>
 #include <linux/vmalloc.h>
 #include <linux/pagemap.h>
-#include <linux/uaccess.h>
 #include <linux/uio.h>
 #include <linux/cc_platform.h>
 #include <asm/io.h>

From 46d36b1be18b745fc9f6be2087633ba2f9895ffe Mon Sep 17 00:00:00 2001
From: Tao Liu <ltao@redhat.com>
Date: Mon, 27 Jun 2022 15:44:41 +0800
Subject: [PATCH 35/72] kdump: round up the total memory size to 128M for
 crashkernel reservation

The total memory size we get in kernel is usually slightly less than the
actual memory size because BIOS/firmware will reserve some memory region.
So it won't export all memory as usable.

E.g., on my x86_64 kvm guest with 1G memory, the total_mem value shows:
  UEFI boot with ovmf:   0x3faef000
  Legacy boot kvm guest: 0x3ff7ec00

When specifying crashkernel=1G-2G:128M, if we have a 1G memory machine, we
get total size 1023M from firmware.  Then it will not fall into 1G-2G,
thus no memory reserved.  User will never know this, it is hard to let
user know the exact total value in kernel.

One way is to use dmi/smbios to get physical memory size, but it's not
reliable either.  According to Prarit, hardware vendors sometimes screw
this up.  Thus round up total size to 128M to work around this problem.

This patch is a resend of [1] and rebased onto v5.19-rc2, and the
original credit goes to Dave Young.

[1]: http://lists.infradead.org/pipermail/kexec/2018-April/020568.html

Link: https://lkml.kernel.org/r/20220627074440.187222-1-ltao@redhat.com
Signed-off-by: Tao Liu <ltao@redhat.com>
Acked-by: Baoquan He <bhe@redhat.com>
Cc: Dave Young <dyoung@redhat.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 kernel/crash_core.c | 14 ++++++++++++--
 1 file changed, 12 insertions(+), 2 deletions(-)

diff --git a/kernel/crash_core.c b/kernel/crash_core.c
index f64d35e2841198..07b26df453a977 100644
--- a/kernel/crash_core.c
+++ b/kernel/crash_core.c
@@ -9,6 +9,7 @@
 #include <linux/init.h>
 #include <linux/utsname.h>
 #include <linux/vmalloc.h>
+#include <linux/sizes.h>
 
 #include <asm/page.h>
 #include <asm/sections.h>
@@ -45,6 +46,15 @@ static int __init parse_crashkernel_mem(char *cmdline,
 					unsigned long long *crash_base)
 {
 	char *cur = cmdline, *tmp;
+	unsigned long long total_mem = system_ram;
+
+	/*
+	 * Firmware sometimes reserves some memory regions for its own use,
+	 * so the system memory size is less than the actual physical memory
+	 * size. Work around this by rounding up the total size to 128M,
+	 * which is enough for most test cases.
+	 */
+	total_mem = roundup(total_mem, SZ_128M);
 
 	/* for each entry of the comma-separated list */
 	do {
@@ -89,13 +99,13 @@ static int __init parse_crashkernel_mem(char *cmdline,
 			return -EINVAL;
 		}
 		cur = tmp;
-		if (size >= system_ram) {
+		if (size >= total_mem) {
 			pr_warn("crashkernel: invalid size\n");
 			return -EINVAL;
 		}
 
 		/* match ? */
-		if (system_ram >= start && system_ram < end) {
+		if (total_mem >= start && total_mem < end) {
 			*crash_size = size;
 			break;
 		}

From 2c795fb03f138e9602e1f1ee31b8bfc00a96c7e5 Mon Sep 17 00:00:00 2001
From: Yu Zhe <yuzhe@nfschina.com>
Date: Tue, 28 Jun 2022 10:12:51 +0800
Subject: [PATCH 36/72] ipc/mqueue: remove unnecessary (void*) conversion

Remove unnecessary void* type casting.

Link: https://lkml.kernel.org/r/20220628021251.17197-1-yuzhe@nfschina.com
Signed-off-by: Yu Zhe <yuzhe@nfschina.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 ipc/mqueue.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/ipc/mqueue.c b/ipc/mqueue.c
index 12ad7860bb88f1..f98de32aeea174 100644
--- a/ipc/mqueue.c
+++ b/ipc/mqueue.c
@@ -489,7 +489,7 @@ static struct vfsmount *mq_create_mount(struct ipc_namespace *ns)
 
 static void init_once(void *foo)
 {
-	struct mqueue_inode_info *p = (struct mqueue_inode_info *) foo;
+	struct mqueue_inode_info *p = foo;
 
 	inode_init_once(&p->vfs_inode);
 }

From a16ceb13961068f7209e34d7984f8e42d2c06159 Mon Sep 17 00:00:00 2001
From: Benjamin Segall <bsegall@google.com>
Date: Wed, 15 Jun 2022 14:24:23 -0700
Subject: [PATCH 37/72] epoll: autoremove wakers even more aggressively

If a process is killed or otherwise exits while having active network
connections and many threads waiting on epoll_wait, the threads will all
be woken immediately, but not removed from ep->wq.  Then when network
traffic scans ep->wq in wake_up, every wakeup attempt will fail, and will
not remove the entries from the list.

This means that the cost of the wakeup attempt is far higher than usual,
does not decrease, and this also competes with the dying threads trying to
actually make progress and remove themselves from the wq.

Handle this by removing visited epoll wq entries unconditionally, rather
than only when the wakeup succeeds - the structure of ep_poll means that
the only potential loss is the timed_out->eavail heuristic, which now can
race and result in a redundant ep_send_events attempt.  (But only when
incoming data and a timeout actually race, not on every timeout)

Shakeel added:

: We are seeing this issue in production with real workloads and it has
: caused hard lockups.  Particularly network heavy workloads with a lot
: of threads in epoll_wait() can easily trigger this issue if they get
: killed (oom-killed in our case).

Link: https://lkml.kernel.org/r/xm26fsjotqda.fsf@google.com
Signed-off-by: Ben Segall <bsegall@google.com>
Tested-by: Shakeel Butt <shakeelb@google.com>
Cc: Alexander Viro <viro@zeniv.linux.org.uk>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Shakeel Butt <shakeelb@google.com>
Cc: Eric Dumazet <edumazet@google.com>
Cc: Roman Penyaev <rpenyaev@suse.de>
Cc: Jason Baron <jbaron@akamai.com>
Cc: Khazhismel Kumykov <khazhy@google.com>
Cc: Heiher <r@hev.cc>
Cc: <stable@kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 fs/eventpoll.c | 22 ++++++++++++++++++++++
 1 file changed, 22 insertions(+)

diff --git a/fs/eventpoll.c b/fs/eventpoll.c
index e2daa940ebce7c..8b56b94e2f56f8 100644
--- a/fs/eventpoll.c
+++ b/fs/eventpoll.c
@@ -1747,6 +1747,21 @@ static struct timespec64 *ep_timeout_to_timespec(struct timespec64 *to, long ms)
 	return to;
 }
 
+/*
+ * autoremove_wake_function, but remove even on failure to wake up, because we
+ * know that default_wake_function/ttwu will only fail if the thread is already
+ * woken, and in that case the ep_poll loop will remove the entry anyways, not
+ * try to reuse it.
+ */
+static int ep_autoremove_wake_function(struct wait_queue_entry *wq_entry,
+				       unsigned int mode, int sync, void *key)
+{
+	int ret = default_wake_function(wq_entry, mode, sync, key);
+
+	list_del_init(&wq_entry->entry);
+	return ret;
+}
+
 /**
  * ep_poll - Retrieves ready events, and delivers them to the caller-supplied
  *           event buffer.
@@ -1828,8 +1843,15 @@ static int ep_poll(struct eventpoll *ep, struct epoll_event __user *events,
 		 * normal wakeup path no need to call __remove_wait_queue()
 		 * explicitly, thus ep->lock is not taken, which halts the
 		 * event delivery.
+		 *
+		 * In fact, we now use an even more aggressive function that
+		 * unconditionally removes, because we don't reuse the wait
+		 * entry between loop iterations. This lets us also avoid the
+		 * performance issue if a process is killed, causing all of its
+		 * threads to wake up without being removed normally.
 		 */
 		init_wait(&wait);
+		wait.func = ep_autoremove_wake_function;
 
 		write_lock_irq(&ep->lock);
 		/*

From b62eb2731e17e83c32e1a6089b4463da1a75e66e Mon Sep 17 00:00:00 2001
From: Nikolay Borisov <nborisov@suse.com>
Date: Fri, 1 Jul 2022 14:35:12 +0300
Subject: [PATCH 38/72] scripts/bloat-o-meter: switch argument parsing to using
 argparse

This will facilitate further extension to the arguments the script takes.
As an added benefit it also produces saner usage output, where mutual
exclusivity of the c|d|t parameters is clearly visible:

./scripts/bloat-o-meter  -h
usage: bloat-o-meter [-h] [-c | -d | -t] file1 file2

Simple script used to compare the symbol sizes of 2 object files

positional arguments:
  file1       First file to compare
  file2       Second file to compare

optional arguments:
  -h, --help  show this help message and exit
  -c          categorize output based on symbol type
  -d          Show delta of Data Section
  -t          Show delta of text Section

Link: https://lkml.kernel.org/r/20220701113513.1938008-1-nborisov@suse.com
Signed-off-by: Nikolay Borisov <nborisov@suse.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 scripts/bloat-o-meter | 40 +++++++++++++++++++++-------------------
 1 file changed, 21 insertions(+), 19 deletions(-)

diff --git a/scripts/bloat-o-meter b/scripts/bloat-o-meter
index 4dd6a804ce41b2..2a360118710e53 100755
--- a/scripts/bloat-o-meter
+++ b/scripts/bloat-o-meter
@@ -7,18 +7,20 @@
 # This software may be used and distributed according to the terms
 # of the GNU General Public License, incorporated herein by reference.
 
-import sys, os, re
+import sys, os, re, argparse
 from signal import signal, SIGPIPE, SIG_DFL
 
 signal(SIGPIPE, SIG_DFL)
 
-if len(sys.argv) < 3:
-    sys.stderr.write("usage: %s [option] file1 file2\n" % sys.argv[0])
-    sys.stderr.write("The options are:\n")
-    sys.stderr.write("-c	categorize output based on symbol type\n")
-    sys.stderr.write("-d	Show delta of Data Section\n")
-    sys.stderr.write("-t	Show delta of text Section\n")
-    sys.exit(-1)
+parser = argparse.ArgumentParser(description="Simple script used to compare the symbol sizes of 2 object files")
+group = parser.add_mutually_exclusive_group()
+group.add_argument('-c', help='categorize output based on symbol type', action='store_true')
+group.add_argument('-d', help='Show delta of Data Section', action='store_true')
+group.add_argument('-t', help='Show delta of text Section', action='store_true')
+parser.add_argument('file1', help='First file to compare')
+parser.add_argument('file2', help='Second file to compare')
+
+args = parser.parse_args()
 
 re_NUMBER = re.compile(r'\.[0-9]+')
 
@@ -77,9 +79,9 @@ def calc(oldfile, newfile, format):
     delta.reverse()
     return grow, shrink, add, remove, up, down, delta, old, new, otot, ntot
 
-def print_result(symboltype, symbolformat, argc):
+def print_result(symboltype, symbolformat):
     grow, shrink, add, remove, up, down, delta, old, new, otot, ntot = \
-    calc(sys.argv[argc - 1], sys.argv[argc], symbolformat)
+    calc(args.file1, args.file2, symbolformat)
 
     print("add/remove: %s/%s grow/shrink: %s/%s up/down: %s/%s (%s)" % \
           (add, remove, grow, shrink, up, -down, up-down))
@@ -93,13 +95,13 @@ def print_result(symboltype, symbolformat, argc):
         percent = 0
     print("Total: Before=%d, After=%d, chg %+.2f%%" % (otot, ntot, percent))
 
-if sys.argv[1] == "-c":
-    print_result("Function", "tT", 3)
-    print_result("Data", "dDbB", 3)
-    print_result("RO Data", "rR", 3)
-elif sys.argv[1] == "-d":
-    print_result("Data", "dDbBrR", 3)
-elif sys.argv[1] == "-t":
-    print_result("Function", "tT", 3)
+if args.c:
+    print_result("Function", "tT")
+    print_result("Data", "dDbB")
+    print_result("RO Data", "rR")
+elif args.d:
+    print_result("Data", "dDbBrR")
+elif args.t:
+    print_result("Function", "tT")
 else:
-    print_result("Function", "tTdDbBrR", 2)
+    print_result("Function", "tTdDbBrR")

From 8b5db6679807fd0ab1154375ea6e5aa6b11c4350 Mon Sep 17 00:00:00 2001
From: Nikolay Borisov <nborisov@suse.com>
Date: Fri, 1 Jul 2022 14:35:13 +0300
Subject: [PATCH 39/72] scripts/bloat-o-meter: add -p argument

When doing cross-platform development on a machine, it is sometimes
useful to invoke bloat-o-meter on files which haven't been built with the
native toolchain.  In cases where the host nm doesn't support the target
format, a toolchain-specific nm can be used instead.  Add this ability by
adding the -p option, allowing invocations such as:

./scripts/bloat-o-meter -p riscv64-unknown-linux-gnu- file1.o file2.o

Link: https://lkml.kernel.org/r/20220701113513.1938008-2-nborisov@suse.com
Signed-off-by: Nikolay Borisov <nborisov@suse.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 scripts/bloat-o-meter | 7 ++++++-
 1 file changed, 6 insertions(+), 1 deletion(-)

diff --git a/scripts/bloat-o-meter b/scripts/bloat-o-meter
index 2a360118710e53..f9553f60a14a89 100755
--- a/scripts/bloat-o-meter
+++ b/scripts/bloat-o-meter
@@ -17,6 +17,7 @@ group = parser.add_mutually_exclusive_group()
 group.add_argument('-c', help='categorize output based on symbol type', action='store_true')
 group.add_argument('-d', help='Show delta of Data Section', action='store_true')
 group.add_argument('-t', help='Show delta of text Section', action='store_true')
+parser.add_argument('-p', dest='prefix', help='Arch prefix for the tool being used. Useful in cross build scenarios')
 parser.add_argument('file1', help='First file to compare')
 parser.add_argument('file2', help='Second file to compare')
 
@@ -26,7 +27,11 @@ re_NUMBER = re.compile(r'\.[0-9]+')
 
 def getsizes(file, format):
     sym = {}
-    with os.popen("nm --size-sort " + file) as f:
+    nm = "nm"
+    if args.prefix:
+        nm = "{}nm".format(args.prefix)
+
+    with os.popen("{} --size-sort {}".format(nm, file)) as f:
         for line in f:
             if line.startswith("\n") or ":" in line:
                 continue

From adbcaef84088713bae9af5690cf566ce4fadfa22 Mon Sep 17 00:00:00 2001
From: Sander Vanheule <sander@svanheule.net>
Date: Sat, 2 Jul 2022 18:08:24 +0200
Subject: [PATCH 40/72] x86/cacheinfo: move shared cache map definitions

Patch series "cpumask: Fix invalid uniprocessor assumptions", v4.

On uniprocessor builds, it is currently assumed that any cpumask will
contain the single CPU: cpu0.  This assumption is used to provide
optimised implementations.

The current assumption also appears to be wrong, by ignoring the fact that
users can provide empty cpumasks.  This can result in bugs as explained in
[1] - for_each_cpu() will run one iteration of the loop even when passed
an empty cpumask.

This series introduces some basic tests, and updates the optimisations for
uniprocessor builds.

The x86 patch was written after the kernel test robot [2] ran into a
failed build.  I have tried to list the files potentially affected by the
changes to cpumask.h, in an attempt to find any other cases that fail on
!SMP.  I've gone through some of the files manually, and ran a few cross
builds, but nothing else popped up.  I (build) checked about half of the
potentially affected files, but I do not have the resources to do them
all.  I hope we can fix other issues if/when they pop up later.

[1] https://lore.kernel.org/all/20220530082552.46113-1-sander@svanheule.net/
[2] https://lore.kernel.org/all/202206060858.wA0FOzRy-lkp@intel.com/


This patch (of 5):

The maps to keep track of shared caches between CPUs on SMP systems are
declared in asm/smp.h, among them specifically cpu_llc_shared_map.  These
maps are externally defined in cpu/smpboot.c.  The latter is only compiled
on CONFIG_SMP=y, which means the declared extern symbols from asm/smp.h do
not have a corresponding definition on uniprocessor builds.

The inline cpu_llc_shared_mask() function from asm/smp.h refers to the map
declaration mentioned above.  This function is referenced in cacheinfo.c
inside for_each_cpu() loop macros, to provide cpumask for the loop.  On
uniprocessor builds, the symbol for the cpu_llc_shared_map does not exist.
However, the current implementation of for_each_cpu() also (wrongly)
ignores the provided mask.

By sheer luck, the compiler thus optimises out this unused reference to
cpu_llc_shared_map, and the linker therefore does not require the
cpu_llc_shared_map to actually exist on uniprocessor builds.  Only on SMP
builds does smpboot.o exist to provide the required symbols.

To no longer rely on compiler optimisations for successful uniprocessor
builds, move the definitions of cpu_llc_shared_map and cpu_l2c_shared_map
from smpboot.c to cacheinfo.c.

Link: https://lkml.kernel.org/r/cover.1656777646.git.sander@svanheule.net
Link: https://lkml.kernel.org/r/e8167ddb570f56744a3dc12c2149a660a324d969.1656777646.git.sander@svanheule.net
Signed-off-by: Sander Vanheule <sander@svanheule.net>
Cc: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
Cc: Marco Elver <elver@google.com>
Cc: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Valentin Schneider <vschneid@redhat.com>
Cc: Yury Norov <yury.norov@gmail.com>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Dave Hansen <dave.hansen@linux.intel.com>
Cc: "H. Peter Anvin" <hpa@zytor.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 arch/x86/kernel/cpu/cacheinfo.c | 6 ++++++
 arch/x86/kernel/smpboot.c       | 4 ----
 2 files changed, 6 insertions(+), 4 deletions(-)

diff --git a/arch/x86/kernel/cpu/cacheinfo.c b/arch/x86/kernel/cpu/cacheinfo.c
index fe98a1465be6ac..66556833d7af5d 100644
--- a/arch/x86/kernel/cpu/cacheinfo.c
+++ b/arch/x86/kernel/cpu/cacheinfo.c
@@ -29,6 +29,12 @@
 #define LVL_3		4
 #define LVL_TRACE	5
 
+/* Shared last level cache maps */
+DEFINE_PER_CPU_READ_MOSTLY(cpumask_var_t, cpu_llc_shared_map);
+
+/* Shared L2 cache maps */
+DEFINE_PER_CPU_READ_MOSTLY(cpumask_var_t, cpu_l2c_shared_map);
+
 struct _cache_table {
 	unsigned char descriptor;
 	char cache_type;
diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c
index 5e7f9532a10d07..f24227bc3220a0 100644
--- a/arch/x86/kernel/smpboot.c
+++ b/arch/x86/kernel/smpboot.c
@@ -95,10 +95,6 @@ EXPORT_PER_CPU_SYMBOL(cpu_core_map);
 DEFINE_PER_CPU_READ_MOSTLY(cpumask_var_t, cpu_die_map);
 EXPORT_PER_CPU_SYMBOL(cpu_die_map);
 
-DEFINE_PER_CPU_READ_MOSTLY(cpumask_var_t, cpu_llc_shared_map);
-
-DEFINE_PER_CPU_READ_MOSTLY(cpumask_var_t, cpu_l2c_shared_map);
-
 /* Per CPU bogomips and other parameters */
 DEFINE_PER_CPU_READ_MOSTLY(struct cpuinfo_x86, cpu_info);
 EXPORT_PER_CPU_SYMBOL(cpu_info);

From 4f09903078eeb9138cddce8db06100b82f8620e8 Mon Sep 17 00:00:00 2001
From: Sander Vanheule <sander@svanheule.net>
Date: Sat, 2 Jul 2022 18:08:27 +0200
Subject: [PATCH 41/72] cpumask: add UP optimised for_each_*_cpu versions

On uniprocessor builds, the following loops will always run over a mask
that contains one enabled CPU (cpu0):

    - for_each_possible_cpu
    - for_each_online_cpu
    - for_each_present_cpu

Provide uniprocessor-specific macros for these loops, that always run
exactly once.

Link: https://lkml.kernel.org/r/3a92869b902a075b97be5d1452c9c6badbbff0df.1656777646.git.sander@svanheule.net
Signed-off-by: Sander Vanheule <sander@svanheule.net>
Acked-by: Yury Norov <yury.norov@gmail.com>
Cc: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Dave Hansen <dave.hansen@linux.intel.com>
Cc: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Cc: "H. Peter Anvin" <hpa@zytor.com>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: Marco Elver <elver@google.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Valentin Schneider <vschneid@redhat.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/cpumask.h | 7 +++++++
 1 file changed, 7 insertions(+)

diff --git a/include/linux/cpumask.h b/include/linux/cpumask.h
index fe29ac7cc469c2..533612770bc077 100644
--- a/include/linux/cpumask.h
+++ b/include/linux/cpumask.h
@@ -811,9 +811,16 @@ extern const DECLARE_BITMAP(cpu_all_bits, NR_CPUS);
 /* First bits of cpu_bit_bitmap are in fact unset. */
 #define cpu_none_mask to_cpumask(cpu_bit_bitmap[0])
 
+#if NR_CPUS == 1
+/* Uniprocessor: the possible/online/present masks are always "1" */
+#define for_each_possible_cpu(cpu)	for ((cpu) = 0; (cpu) < 1; (cpu)++)
+#define for_each_online_cpu(cpu)	for ((cpu) = 0; (cpu) < 1; (cpu)++)
+#define for_each_present_cpu(cpu)	for ((cpu) = 0; (cpu) < 1; (cpu)++)
+#else
 #define for_each_possible_cpu(cpu) for_each_cpu((cpu), cpu_possible_mask)
 #define for_each_online_cpu(cpu)   for_each_cpu((cpu), cpu_online_mask)
 #define for_each_present_cpu(cpu)  for_each_cpu((cpu), cpu_present_mask)
+#endif
 
 /* Wrappers for arch boot code to manipulate normally-constant masks */
 void init_cpu_present(const struct cpumask *src);

From b81dce77cedcea6f00292f02d4b1ebbfc2c5988d Mon Sep 17 00:00:00 2001
From: Sander Vanheule <sander@svanheule.net>
Date: Sat, 2 Jul 2022 18:08:25 +0200
Subject: [PATCH 42/72] cpumask: Fix invalid uniprocessor mask assumption

On uniprocessor builds, any CPU mask is assumed to contain exactly one CPU
(cpu0).  This assumption ignores the existence of empty masks, resulting
in incorrect behaviour.

cpumask_first_zero(), cpumask_next_zero(), and for_each_cpu_not() don't
provide behaviour matching the assumption that a UP mask is always "1",
and instead provide behaviour matching the empty mask.

Drop the incorrectly optimised code and use the generic implementations in
all cases.

Link: https://lkml.kernel.org/r/86bf3f005abba2d92120ddd0809235cab4f759a6.1656777646.git.sander@svanheule.net
Signed-off-by: Sander Vanheule <sander@svanheule.net>
Suggested-by: Yury Norov <yury.norov@gmail.com>
Cc: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Dave Hansen <dave.hansen@linux.intel.com>
Cc: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Cc: "H. Peter Anvin" <hpa@zytor.com>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: Marco Elver <elver@google.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Valentin Schneider <vschneid@redhat.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/cpumask.h | 99 ++++++++---------------------------------
 lib/Makefile            |  3 +-
 lib/cpumask.c           |  2 +
 3 files changed, 22 insertions(+), 82 deletions(-)

diff --git a/include/linux/cpumask.h b/include/linux/cpumask.h
index 533612770bc077..6c5b4ee000f2ea 100644
--- a/include/linux/cpumask.h
+++ b/include/linux/cpumask.h
@@ -116,85 +116,6 @@ static __always_inline unsigned int cpumask_check(unsigned int cpu)
 	return cpu;
 }
 
-#if NR_CPUS == 1
-/* Uniprocessor.  Assume all masks are "1". */
-static inline unsigned int cpumask_first(const struct cpumask *srcp)
-{
-	return 0;
-}
-
-static inline unsigned int cpumask_first_zero(const struct cpumask *srcp)
-{
-	return 0;
-}
-
-static inline unsigned int cpumask_first_and(const struct cpumask *srcp1,
-					     const struct cpumask *srcp2)
-{
-	return 0;
-}
-
-static inline unsigned int cpumask_last(const struct cpumask *srcp)
-{
-	return 0;
-}
-
-/* Valid inputs for n are -1 and 0. */
-static inline unsigned int cpumask_next(int n, const struct cpumask *srcp)
-{
-	return n+1;
-}
-
-static inline unsigned int cpumask_next_zero(int n, const struct cpumask *srcp)
-{
-	return n+1;
-}
-
-static inline unsigned int cpumask_next_and(int n,
-					    const struct cpumask *srcp,
-					    const struct cpumask *andp)
-{
-	return n+1;
-}
-
-static inline unsigned int cpumask_next_wrap(int n, const struct cpumask *mask,
-					     int start, bool wrap)
-{
-	/* cpu0 unless stop condition, wrap and at cpu0, then nr_cpumask_bits */
-	return (wrap && n == 0);
-}
-
-/* cpu must be a valid cpu, ie 0, so there's no other choice. */
-static inline unsigned int cpumask_any_but(const struct cpumask *mask,
-					   unsigned int cpu)
-{
-	return 1;
-}
-
-static inline unsigned int cpumask_local_spread(unsigned int i, int node)
-{
-	return 0;
-}
-
-static inline int cpumask_any_and_distribute(const struct cpumask *src1p,
-					     const struct cpumask *src2p) {
-	return cpumask_first_and(src1p, src2p);
-}
-
-static inline int cpumask_any_distribute(const struct cpumask *srcp)
-{
-	return cpumask_first(srcp);
-}
-
-#define for_each_cpu(cpu, mask)			\
-	for ((cpu) = 0; (cpu) < 1; (cpu)++, (void)mask)
-#define for_each_cpu_not(cpu, mask)		\
-	for ((cpu) = 0; (cpu) < 1; (cpu)++, (void)mask)
-#define for_each_cpu_wrap(cpu, mask, start)	\
-	for ((cpu) = 0; (cpu) < 1; (cpu)++, (void)mask, (void)(start))
-#define for_each_cpu_and(cpu, mask1, mask2)	\
-	for ((cpu) = 0; (cpu) < 1; (cpu)++, (void)mask1, (void)mask2)
-#else
 /**
  * cpumask_first - get the first cpu in a cpumask
  * @srcp: the cpumask pointer
@@ -260,10 +181,29 @@ static inline unsigned int cpumask_next_zero(int n, const struct cpumask *srcp)
 
 int __pure cpumask_next_and(int n, const struct cpumask *, const struct cpumask *);
 int __pure cpumask_any_but(const struct cpumask *mask, unsigned int cpu);
+
+#if NR_CPUS == 1
+/* Uniprocessor: there is only one valid CPU */
+static inline unsigned int cpumask_local_spread(unsigned int i, int node)
+{
+	return 0;
+}
+
+static inline int cpumask_any_and_distribute(const struct cpumask *src1p,
+					     const struct cpumask *src2p) {
+	return cpumask_first_and(src1p, src2p);
+}
+
+static inline int cpumask_any_distribute(const struct cpumask *srcp)
+{
+	return cpumask_first(srcp);
+}
+#else
 unsigned int cpumask_local_spread(unsigned int i, int node);
 int cpumask_any_and_distribute(const struct cpumask *src1p,
 			       const struct cpumask *src2p);
 int cpumask_any_distribute(const struct cpumask *srcp);
+#endif /* NR_CPUS */
 
 /**
  * for_each_cpu - iterate over every cpu in a mask
@@ -324,7 +264,6 @@ extern int cpumask_next_wrap(int n, const struct cpumask *mask, int start, bool
 	for ((cpu) = -1;						\
 		(cpu) = cpumask_next_and((cpu), (mask1), (mask2)),	\
 		(cpu) < nr_cpu_ids;)
-#endif /* SMP */
 
 #define CPU_BITS_NONE						\
 {								\
diff --git a/lib/Makefile b/lib/Makefile
index f99bf61f8bbc67..bcc7e8ea0cde5c 100644
--- a/lib/Makefile
+++ b/lib/Makefile
@@ -34,10 +34,9 @@ lib-y := ctype.o string.o vsprintf.o cmdline.o \
 	 is_single_threaded.o plist.o decompress.o kobject_uevent.o \
 	 earlycpio.o seq_buf.o siphash.o dec_and_lock.o \
 	 nmi_backtrace.o nodemask.o win_minmax.o memcat_p.o \
-	 buildid.o
+	 buildid.o cpumask.o
 
 lib-$(CONFIG_PRINTK) += dump_stack.o
-lib-$(CONFIG_SMP) += cpumask.o
 
 lib-y	+= kobject.o klist.o
 obj-y	+= lockref.o
diff --git a/lib/cpumask.c b/lib/cpumask.c
index a971a82d2f4360..b9728513a4d401 100644
--- a/lib/cpumask.c
+++ b/lib/cpumask.c
@@ -192,6 +192,7 @@ void __init free_bootmem_cpumask_var(cpumask_var_t mask)
 }
 #endif
 
+#if NR_CPUS > 1
 /**
  * cpumask_local_spread - select the i'th cpu with local numa cpu's first
  * @i: index number
@@ -279,3 +280,4 @@ int cpumask_any_distribute(const struct cpumask *srcp)
 	return next;
 }
 EXPORT_SYMBOL(cpumask_any_distribute);
+#endif /* NR_CPUS */

From c41e8866c28c4d1a88a085fc3c3d6ba403510804 Mon Sep 17 00:00:00 2001
From: Sander Vanheule <sander@svanheule.net>
Date: Sat, 2 Jul 2022 18:08:26 +0200
Subject: [PATCH 43/72] lib/test: introduce cpumask KUnit test suite

Add a basic suite of tests for cpumask, providing some tests for empty and
completely filled cpumasks.

Link: https://lkml.kernel.org/r/c96980ec35c3bd23f17c3374bf42c22971545e85.1656777646.git.sander@svanheule.net
Signed-off-by: Sander Vanheule <sander@svanheule.net>
Reviewed-by: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
Suggested-by: Yury Norov <yury.norov@gmail.com>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Dave Hansen <dave.hansen@linux.intel.com>
Cc: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Cc: "H. Peter Anvin" <hpa@zytor.com>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: Marco Elver <elver@google.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Valentin Schneider <vschneid@redhat.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 lib/Kconfig.debug  |   9 +++
 lib/Makefile       |   1 +
 lib/test_cpumask.c | 138 +++++++++++++++++++++++++++++++++++++++++++++
 3 files changed, 148 insertions(+)
 create mode 100644 lib/test_cpumask.c

diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug
index 2e24db4bff1921..04aaa20d50f982 100644
--- a/lib/Kconfig.debug
+++ b/lib/Kconfig.debug
@@ -2021,6 +2021,15 @@ config LKDTM
 	Documentation on how to use the module can be found in
 	Documentation/fault-injection/provoke-crashes.rst
 
+config TEST_CPUMASK
+	tristate "cpumask tests" if !KUNIT_ALL_TESTS
+	depends on KUNIT
+	default KUNIT_ALL_TESTS
+	help
+	  Enable to turn on cpumask tests, running at boot or module load time.
+
+	  If unsure, say N.
+
 config TEST_LIST_SORT
 	tristate "Linked list sorting test" if !KUNIT_ALL_TESTS
 	depends on KUNIT
diff --git a/lib/Makefile b/lib/Makefile
index bcc7e8ea0cde5c..de3e47453fe8eb 100644
--- a/lib/Makefile
+++ b/lib/Makefile
@@ -99,6 +99,7 @@ obj-$(CONFIG_TEST_HMM) += test_hmm.o
 obj-$(CONFIG_TEST_FREE_PAGES) += test_free_pages.o
 obj-$(CONFIG_KPROBES_SANITY_TEST) += test_kprobes.o
 obj-$(CONFIG_TEST_REF_TRACKER) += test_ref_tracker.o
+obj-$(CONFIG_TEST_CPUMASK) += test_cpumask.o
 CFLAGS_test_fprobe.o += $(CC_FLAGS_FTRACE)
 obj-$(CONFIG_FPROBE_SANITY_TEST) += test_fprobe.o
 #
diff --git a/lib/test_cpumask.c b/lib/test_cpumask.c
new file mode 100644
index 00000000000000..a31a1622f1f6e8
--- /dev/null
+++ b/lib/test_cpumask.c
@@ -0,0 +1,138 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * KUnit tests for cpumask.
+ *
+ * Author: Sander Vanheule <sander@svanheule.net>
+ */
+
+#include <kunit/test.h>
+#include <linux/cpu.h>
+#include <linux/cpumask.h>
+
+#define EXPECT_FOR_EACH_CPU_EQ(test, mask)			\
+	do {							\
+		const cpumask_t *m = (mask);			\
+		int mask_weight = cpumask_weight(m);		\
+		int cpu, iter = 0;				\
+		for_each_cpu(cpu, m)				\
+			iter++;					\
+		KUNIT_EXPECT_EQ((test), mask_weight, iter);	\
+	} while (0)
+
+#define EXPECT_FOR_EACH_CPU_NOT_EQ(test, mask)					\
+	do {									\
+		const cpumask_t *m = (mask);					\
+		int mask_weight = cpumask_weight(m);				\
+		int cpu, iter = 0;						\
+		for_each_cpu_not(cpu, m)					\
+			iter++;							\
+		KUNIT_EXPECT_EQ((test), nr_cpu_ids - mask_weight, iter);	\
+	} while (0)
+
+#define EXPECT_FOR_EACH_CPU_WRAP_EQ(test, mask)			\
+	do {							\
+		const cpumask_t *m = (mask);			\
+		int mask_weight = cpumask_weight(m);		\
+		int cpu, iter = 0;				\
+		for_each_cpu_wrap(cpu, m, nr_cpu_ids / 2)	\
+			iter++;					\
+		KUNIT_EXPECT_EQ((test), mask_weight, iter);	\
+	} while (0)
+
+#define EXPECT_FOR_EACH_CPU_BUILTIN_EQ(test, name)		\
+	do {							\
+		int mask_weight = num_##name##_cpus();		\
+		int cpu, iter = 0;				\
+		for_each_##name##_cpu(cpu)			\
+			iter++;					\
+		KUNIT_EXPECT_EQ((test), mask_weight, iter);	\
+	} while (0)
+
+static cpumask_t mask_empty;
+static cpumask_t mask_all;
+
+static void test_cpumask_weight(struct kunit *test)
+{
+	KUNIT_EXPECT_TRUE(test, cpumask_empty(&mask_empty));
+	KUNIT_EXPECT_TRUE(test, cpumask_full(cpu_possible_mask));
+	KUNIT_EXPECT_TRUE(test, cpumask_full(&mask_all));
+
+	KUNIT_EXPECT_EQ(test, 0, cpumask_weight(&mask_empty));
+	KUNIT_EXPECT_EQ(test, nr_cpu_ids, cpumask_weight(cpu_possible_mask));
+	KUNIT_EXPECT_EQ(test, nr_cpumask_bits, cpumask_weight(&mask_all));
+}
+
+static void test_cpumask_first(struct kunit *test)
+{
+	KUNIT_EXPECT_LE(test, nr_cpu_ids, cpumask_first(&mask_empty));
+	KUNIT_EXPECT_EQ(test, 0, cpumask_first(cpu_possible_mask));
+
+	KUNIT_EXPECT_EQ(test, 0, cpumask_first_zero(&mask_empty));
+	KUNIT_EXPECT_LE(test, nr_cpu_ids, cpumask_first_zero(cpu_possible_mask));
+}
+
+static void test_cpumask_last(struct kunit *test)
+{
+	KUNIT_EXPECT_LE(test, nr_cpumask_bits, cpumask_last(&mask_empty));
+	KUNIT_EXPECT_EQ(test, nr_cpumask_bits - 1, cpumask_last(cpu_possible_mask));
+}
+
+static void test_cpumask_next(struct kunit *test)
+{
+	KUNIT_EXPECT_EQ(test, 0, cpumask_next_zero(-1, &mask_empty));
+	KUNIT_EXPECT_LE(test, nr_cpu_ids, cpumask_next_zero(-1, cpu_possible_mask));
+
+	KUNIT_EXPECT_LE(test, nr_cpu_ids, cpumask_next(-1, &mask_empty));
+	KUNIT_EXPECT_EQ(test, 0, cpumask_next(-1, cpu_possible_mask));
+}
+
+static void test_cpumask_iterators(struct kunit *test)
+{
+	EXPECT_FOR_EACH_CPU_EQ(test, &mask_empty);
+	EXPECT_FOR_EACH_CPU_NOT_EQ(test, &mask_empty);
+	EXPECT_FOR_EACH_CPU_WRAP_EQ(test, &mask_empty);
+
+	EXPECT_FOR_EACH_CPU_EQ(test, cpu_possible_mask);
+	EXPECT_FOR_EACH_CPU_NOT_EQ(test, cpu_possible_mask);
+	EXPECT_FOR_EACH_CPU_WRAP_EQ(test, cpu_possible_mask);
+}
+
+static void test_cpumask_iterators_builtin(struct kunit *test)
+{
+	EXPECT_FOR_EACH_CPU_BUILTIN_EQ(test, possible);
+
+	/* Ensure the dynamic masks are stable while running the tests */
+	cpu_hotplug_disable();
+
+	EXPECT_FOR_EACH_CPU_BUILTIN_EQ(test, online);
+	EXPECT_FOR_EACH_CPU_BUILTIN_EQ(test, present);
+
+	cpu_hotplug_enable();
+}
+
+static int test_cpumask_init(struct kunit *test)
+{
+	cpumask_clear(&mask_empty);
+	cpumask_setall(&mask_all);
+
+	return 0;
+}
+
+static struct kunit_case test_cpumask_cases[] = {
+	KUNIT_CASE(test_cpumask_weight),
+	KUNIT_CASE(test_cpumask_first),
+	KUNIT_CASE(test_cpumask_last),
+	KUNIT_CASE(test_cpumask_next),
+	KUNIT_CASE(test_cpumask_iterators),
+	KUNIT_CASE(test_cpumask_iterators_builtin),
+	{}
+};
+
+static struct kunit_suite test_cpumask_suite = {
+	.name = "cpumask",
+	.init = test_cpumask_init,
+	.test_cases = test_cpumask_cases,
+};
+kunit_test_suite(test_cpumask_suite);
+
+MODULE_LICENSE("GPL");

From 953257a9252a9b3c58ca68fc5bf26fc65e5b1cb8 Mon Sep 17 00:00:00 2001
From: Sander Vanheule <sander@svanheule.net>
Date: Sat, 2 Jul 2022 18:08:28 +0200
Subject: [PATCH 44/72] cpumask: update cpumask_next_wrap() signature

The extern specifier is not needed for this declaration, so drop it.  The
function also depends only on the input parameters, and has no side
effects, so it can be marked __pure like other functions in cpumask.h.

Link: https://lkml.kernel.org/r/72ab755695b74bb5fbaa756ae4c0edd708d172f1.1656777646.git.sander@svanheule.net
Signed-off-by: Sander Vanheule <sander@svanheule.net>
Reviewed-by: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Dave Hansen <dave.hansen@linux.intel.com>
Cc: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Cc: "H. Peter Anvin" <hpa@zytor.com>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: Marco Elver <elver@google.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Valentin Schneider <vschneid@redhat.com>
Cc: Yury Norov <yury.norov@gmail.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/cpumask.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/include/linux/cpumask.h b/include/linux/cpumask.h
index 6c5b4ee000f2ea..523857884ae440 100644
--- a/include/linux/cpumask.h
+++ b/include/linux/cpumask.h
@@ -229,7 +229,7 @@ int cpumask_any_distribute(const struct cpumask *srcp);
 		(cpu) = cpumask_next_zero((cpu), (mask)),	\
 		(cpu) < nr_cpu_ids;)
 
-extern int cpumask_next_wrap(int n, const struct cpumask *mask, int start, bool wrap);
+int __pure cpumask_next_wrap(int n, const struct cpumask *mask, int start, bool wrap);
 
 /**
  * for_each_cpu_wrap - iterate over every cpu in a mask, starting at a specified location

From bd27acaac24e4b252ee28dddcabaee80456d0faf Mon Sep 17 00:00:00 2001
From: Tetsuo Handa <penguin-kernel@I-love.SAKURA.ne.jp>
Date: Wed, 22 Jun 2022 14:46:31 +0900
Subject: [PATCH 45/72] lib/smp_processor_id: fix imbalanced
 instrumentation_end() call

Currently instrumentation_end() won't be called if printk_ratelimit()
returned false.

Link: https://lkml.kernel.org/r/a636d8e0-ad32-5888-acac-671f7f553bb3@I-love.SAKURA.ne.jp
Fixes: 126f21f0e8d46e2c ("lib/smp_processor_id: Move it into noinstr section")
Signed-off-by: Tetsuo Handa <penguin-kernel@I-love.SAKURA.ne.jp>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Alexandre Chartre <alexandre.chartre@oracle.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 lib/smp_processor_id.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/lib/smp_processor_id.c b/lib/smp_processor_id.c
index 046ac6297c7811..a2bb7738c373cd 100644
--- a/lib/smp_processor_id.c
+++ b/lib/smp_processor_id.c
@@ -47,9 +47,9 @@ unsigned int check_preemption_disabled(const char *what1, const char *what2)
 
 	printk("caller is %pS\n", __builtin_return_address(0));
 	dump_stack();
-	instrumentation_end();
 
 out_enable:
+	instrumentation_end();
 	preempt_enable_no_resched_notrace();
 out:
 	return this_cpu;

From 55656016daa7155d95471627c1b1438d488f011b Mon Sep 17 00:00:00 2001
From: Mark-PK Tsai <mark-pk.tsai@mediatek.com>
Date: Fri, 8 Jul 2022 21:19:47 +0800
Subject: [PATCH 46/72] lib: devres: use numa aware allocation

Allocate device resources from local node memory when the NUMA locality of
the device is specified.

Link: https://lkml.kernel.org/r/20220708131952.14500-1-mark-pk.tsai@mediatek.com
Signed-off-by: Mark-PK Tsai <mark-pk.tsai@mediatek.com>
Cc: Matthias Brugger <matthias.bgg@gmail.com>
Cc: YJ Chiang <yj.chiang@mediatek.com>
Cc: Hans de Goede <hdegoede@redhat.com>
Cc: Thomas Zimmermann <tzimmermann@suse.de>
Cc: Zhen Lei <thunder.leizhen@huawei.com>
Cc: Jacob Keller <jacob.e.keller@intel.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 lib/devres.c | 15 ++++++++++-----
 1 file changed, 10 insertions(+), 5 deletions(-)

diff --git a/lib/devres.c b/lib/devres.c
index 14664bbb48757d..55eb07e80cbb3b 100644
--- a/lib/devres.c
+++ b/lib/devres.c
@@ -29,7 +29,8 @@ static void __iomem *__devm_ioremap(struct device *dev, resource_size_t offset,
 {
 	void __iomem **ptr, *addr = NULL;
 
-	ptr = devres_alloc(devm_ioremap_release, sizeof(*ptr), GFP_KERNEL);
+	ptr = devres_alloc_node(devm_ioremap_release, sizeof(*ptr), GFP_KERNEL,
+				dev_to_node(dev));
 	if (!ptr)
 		return NULL;
 
@@ -292,7 +293,8 @@ void __iomem *devm_ioport_map(struct device *dev, unsigned long port,
 {
 	void __iomem **ptr, *addr;
 
-	ptr = devres_alloc(devm_ioport_map_release, sizeof(*ptr), GFP_KERNEL);
+	ptr = devres_alloc_node(devm_ioport_map_release, sizeof(*ptr), GFP_KERNEL,
+				dev_to_node(dev));
 	if (!ptr)
 		return NULL;
 
@@ -366,7 +368,8 @@ void __iomem * const *pcim_iomap_table(struct pci_dev *pdev)
 	if (dr)
 		return dr->table;
 
-	new_dr = devres_alloc(pcim_iomap_release, sizeof(*new_dr), GFP_KERNEL);
+	new_dr = devres_alloc_node(pcim_iomap_release, sizeof(*new_dr), GFP_KERNEL,
+				   dev_to_node(&pdev->dev));
 	if (!new_dr)
 		return NULL;
 	dr = devres_get(&pdev->dev, new_dr, NULL, NULL);
@@ -548,7 +551,8 @@ int devm_arch_phys_wc_add(struct device *dev, unsigned long base, unsigned long
 	int *mtrr;
 	int ret;
 
-	mtrr = devres_alloc(devm_arch_phys_ac_add_release, sizeof(*mtrr), GFP_KERNEL);
+	mtrr = devres_alloc_node(devm_arch_phys_ac_add_release, sizeof(*mtrr), GFP_KERNEL,
+				 dev_to_node(dev));
 	if (!mtrr)
 		return -ENOMEM;
 
@@ -593,7 +597,8 @@ int devm_arch_io_reserve_memtype_wc(struct device *dev, resource_size_t start,
 	struct arch_io_reserve_memtype_wc_devres *dr;
 	int ret;
 
-	dr = devres_alloc(devm_arch_io_free_memtype_wc_release, sizeof(*dr), GFP_KERNEL);
+	dr = devres_alloc_node(devm_arch_io_free_memtype_wc_release, sizeof(*dr), GFP_KERNEL,
+			       dev_to_node(dev));
 	if (!dr)
 		return -ENOMEM;
 

From f71381fcdc3ab615f55278d435a9f35542dc9e63 Mon Sep 17 00:00:00 2001
From: Ian Kent <raven@themaw.net>
Date: Fri, 8 Jul 2022 09:43:01 +0800
Subject: [PATCH 47/72] autofs: use inode permission method for write access

Patch series "autofs: misc patches".

This series contains several patches that resulted mostly from comments
made by Al Viro (quite a long time ago now).


This patch (of 5):

Eliminate some code duplication from mkdir/rmdir/symlink/unlink methods by
using the inode operation .permission().

Link: https://lkml.kernel.org/r/165724445154.30914.10970894936827635879.stgit@donald.themaw.net
Link: https://lkml.kernel.org/r/165724458096.30914.13499431569758625806.stgit@donald.themaw.net
Signed-off-by: Ian Kent <raven@themaw.net>
Cc: Al Viro <viro@ZenIV.linux.org.uk>
Cc: David Howells <dhowells@redhat.com>
Cc: Miklos Szeredi <miklos@szeredi.hu>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 fs/autofs/root.c | 63 +++++++++++++++++-------------------------------
 1 file changed, 22 insertions(+), 41 deletions(-)

diff --git a/fs/autofs/root.c b/fs/autofs/root.c
index 91fe4548c25657..fef6ed9910221d 100644
--- a/fs/autofs/root.c
+++ b/fs/autofs/root.c
@@ -10,6 +10,7 @@
 
 #include "autofs_i.h"
 
+static int autofs_dir_permission(struct user_namespace *, struct inode *, int);
 static int autofs_dir_symlink(struct user_namespace *, struct inode *,
 			      struct dentry *, const char *);
 static int autofs_dir_unlink(struct inode *, struct dentry *);
@@ -50,6 +51,7 @@ const struct file_operations autofs_dir_operations = {
 
 const struct inode_operations autofs_dir_inode_operations = {
 	.lookup		= autofs_lookup,
+	.permission	= autofs_dir_permission,
 	.unlink		= autofs_dir_unlink,
 	.symlink	= autofs_dir_symlink,
 	.mkdir		= autofs_dir_mkdir,
@@ -526,11 +528,30 @@ static struct dentry *autofs_lookup(struct inode *dir,
 	return NULL;
 }
 
+static int autofs_dir_permission(struct user_namespace *mnt_userns,
+				 struct inode *inode, int mask)
+{
+	if (mask & MAY_WRITE) {
+		struct autofs_sb_info *sbi = autofs_sbi(inode->i_sb);
+
+		if (!autofs_oz_mode(sbi))
+			return -EACCES;
+
+		/* autofs_oz_mode() needs to allow path walks when the
+		 * autofs mount is catatonic but the state of an autofs
+		 * file system needs to be preserved over restarts.
+		 */
+		if (sbi->flags & AUTOFS_SBI_CATATONIC)
+			return -EACCES;
+	}
+
+	return generic_permission(mnt_userns, inode, mask);
+}
+
 static int autofs_dir_symlink(struct user_namespace *mnt_userns,
 			      struct inode *dir, struct dentry *dentry,
 			      const char *symname)
 {
-	struct autofs_sb_info *sbi = autofs_sbi(dir->i_sb);
 	struct autofs_info *ino = autofs_dentry_ino(dentry);
 	struct autofs_info *p_ino;
 	struct inode *inode;
@@ -539,16 +560,6 @@ static int autofs_dir_symlink(struct user_namespace *mnt_userns,
 
 	pr_debug("%s <- %pd\n", symname, dentry);
 
-	if (!autofs_oz_mode(sbi))
-		return -EACCES;
-
-	/* autofs_oz_mode() needs to allow path walks when the
-	 * autofs mount is catatonic but the state of an autofs
-	 * file system needs to be preserved over restarts.
-	 */
-	if (sbi->flags & AUTOFS_SBI_CATATONIC)
-		return -EACCES;
-
 	BUG_ON(!ino);
 
 	autofs_clean_ino(ino);
@@ -601,16 +612,6 @@ static int autofs_dir_unlink(struct inode *dir, struct dentry *dentry)
 	struct autofs_info *ino = autofs_dentry_ino(dentry);
 	struct autofs_info *p_ino;
 
-	if (!autofs_oz_mode(sbi))
-		return -EACCES;
-
-	/* autofs_oz_mode() needs to allow path walks when the
-	 * autofs mount is catatonic but the state of an autofs
-	 * file system needs to be preserved over restarts.
-	 */
-	if (sbi->flags & AUTOFS_SBI_CATATONIC)
-		return -EACCES;
-
 	ino->count--;
 	p_ino = autofs_dentry_ino(dentry->d_parent);
 	p_ino->count--;
@@ -683,16 +684,6 @@ static int autofs_dir_rmdir(struct inode *dir, struct dentry *dentry)
 
 	pr_debug("dentry %p, removing %pd\n", dentry, dentry);
 
-	if (!autofs_oz_mode(sbi))
-		return -EACCES;
-
-	/* autofs_oz_mode() needs to allow path walks when the
-	 * autofs mount is catatonic but the state of an autofs
-	 * file system needs to be preserved over restarts.
-	 */
-	if (sbi->flags & AUTOFS_SBI_CATATONIC)
-		return -EACCES;
-
 	if (ino->count != 1)
 		return -ENOTEMPTY;
 
@@ -726,16 +717,6 @@ static int autofs_dir_mkdir(struct user_namespace *mnt_userns,
 	struct autofs_info *p_ino;
 	struct inode *inode;
 
-	if (!autofs_oz_mode(sbi))
-		return -EACCES;
-
-	/* autofs_oz_mode() needs to allow path walks when the
-	 * autofs mount is catatonic but the state of an autofs
-	 * file system needs to be preserved over restarts.
-	 */
-	if (sbi->flags & AUTOFS_SBI_CATATONIC)
-		return -EACCES;
-
 	pr_debug("dentry %p, creating %pd\n", dentry, dentry);
 
 	BUG_ON(!ino);

From 9ccbac76e71de411b9c4beea9d91ba98f3fad690 Mon Sep 17 00:00:00 2001
From: Ian Kent <raven@themaw.net>
Date: Fri, 8 Jul 2022 09:43:06 +0800
Subject: [PATCH 48/72] autofs: make dentry info count consistent

If an autofs dentry is a mount root directory there's no ->mkdir() call to
set its count to one.

To make the dentry info count consistent for all autofs dentries set count
to one when the dentry info struct is allocated.

Link: https://lkml.kernel.org/r/165724458671.30914.2902424437132835325.stgit@donald.themaw.net
Signed-off-by: Ian Kent <raven@themaw.net>
Cc: Al Viro <viro@ZenIV.linux.org.uk>
Cc: David Howells <dhowells@redhat.com>
Cc: Miklos Szeredi <miklos@szeredi.hu>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 fs/autofs/inode.c | 1 +
 fs/autofs/root.c  | 4 ----
 2 files changed, 1 insertion(+), 4 deletions(-)

diff --git a/fs/autofs/inode.c b/fs/autofs/inode.c
index 9edf243713eb6e..affa70360b1f8f 100644
--- a/fs/autofs/inode.c
+++ b/fs/autofs/inode.c
@@ -20,6 +20,7 @@ struct autofs_info *autofs_new_ino(struct autofs_sb_info *sbi)
 		INIT_LIST_HEAD(&ino->expiring);
 		ino->last_used = jiffies;
 		ino->sbi = sbi;
+		ino->count = 1;
 	}
 	return ino;
 }
diff --git a/fs/autofs/root.c b/fs/autofs/root.c
index fef6ed9910221d..442d27d9cb1b99 100644
--- a/fs/autofs/root.c
+++ b/fs/autofs/root.c
@@ -582,7 +582,6 @@ static int autofs_dir_symlink(struct user_namespace *mnt_userns,
 	d_add(dentry, inode);
 
 	dget(dentry);
-	ino->count++;
 	p_ino = autofs_dentry_ino(dentry->d_parent);
 	p_ino->count++;
 
@@ -612,7 +611,6 @@ static int autofs_dir_unlink(struct inode *dir, struct dentry *dentry)
 	struct autofs_info *ino = autofs_dentry_ino(dentry);
 	struct autofs_info *p_ino;
 
-	ino->count--;
 	p_ino = autofs_dentry_ino(dentry->d_parent);
 	p_ino->count--;
 	dput(ino->dentry);
@@ -695,7 +693,6 @@ static int autofs_dir_rmdir(struct inode *dir, struct dentry *dentry)
 	if (sbi->version < 5)
 		autofs_clear_leaf_automount_flags(dentry);
 
-	ino->count--;
 	p_ino = autofs_dentry_ino(dentry->d_parent);
 	p_ino->count--;
 	dput(ino->dentry);
@@ -734,7 +731,6 @@ static int autofs_dir_mkdir(struct user_namespace *mnt_userns,
 		autofs_set_leaf_automount_flags(dentry);
 
 	dget(dentry);
-	ino->count++;
 	p_ino = autofs_dentry_ino(dentry->d_parent);
 	p_ino->count++;
 	inc_nlink(dir);

From a4a87303874c1a7d49cc18a8fe33676b0002ffbf Mon Sep 17 00:00:00 2001
From: Ian Kent <raven@themaw.net>
Date: Fri, 8 Jul 2022 09:43:12 +0800
Subject: [PATCH 49/72] autofs: use dentry info count instead of simple_empty()

The dentry info count field is used to check if a dentry is in use
during expire.  But, to be used for this the count field must account for
the presence of child dentries in a directory dentry.

Therefore it can also be used to check for an empty directory dentry which
can be done without having to take an additional lock or account for
the presence of a readdir cursor dentry as is done by simple_empty().

Link: https://lkml.kernel.org/r/165724459238.30914.1504611159945950108.stgit@donald.themaw.net
Signed-off-by: Ian Kent <raven@themaw.net>
Cc: Al Viro <viro@ZenIV.linux.org.uk>
Cc: David Howells <dhowells@redhat.com>
Cc: Miklos Szeredi <miklos@szeredi.hu>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 fs/autofs/autofs_i.h |  5 +++++
 fs/autofs/expire.c   |  2 +-
 fs/autofs/root.c     | 18 ++++++++----------
 3 files changed, 14 insertions(+), 11 deletions(-)

diff --git a/fs/autofs/autofs_i.h b/fs/autofs/autofs_i.h
index 918826eaceea34..0117d6e0630089 100644
--- a/fs/autofs/autofs_i.h
+++ b/fs/autofs/autofs_i.h
@@ -148,6 +148,11 @@ static inline int autofs_oz_mode(struct autofs_sb_info *sbi)
 		 task_pgrp(current) == sbi->oz_pgrp);
 }
 
+static inline bool autofs_empty(struct autofs_info *ino)
+{
+	return ino->count < 2;
+}
+
 struct inode *autofs_get_inode(struct super_block *, umode_t);
 void autofs_free_ino(struct autofs_info *);
 
diff --git a/fs/autofs/expire.c b/fs/autofs/expire.c
index b3fefd6237c362..038b3d2d9f572e 100644
--- a/fs/autofs/expire.c
+++ b/fs/autofs/expire.c
@@ -371,7 +371,7 @@ static struct dentry *should_expire(struct dentry *dentry,
 		return NULL;
 	}
 
-	if (simple_empty(dentry))
+	if (autofs_empty(ino))
 		return NULL;
 
 	/* Case 2: tree mount, expire iff entire tree is not busy */
diff --git a/fs/autofs/root.c b/fs/autofs/root.c
index 442d27d9cb1b99..e0fa71eb5c0587 100644
--- a/fs/autofs/root.c
+++ b/fs/autofs/root.c
@@ -79,6 +79,7 @@ static int autofs_dir_open(struct inode *inode, struct file *file)
 {
 	struct dentry *dentry = file->f_path.dentry;
 	struct autofs_sb_info *sbi = autofs_sbi(dentry->d_sb);
+	struct autofs_info *ino = autofs_dentry_ino(dentry);
 
 	pr_debug("file=%p dentry=%p %pd\n", file, dentry, dentry);
 
@@ -95,7 +96,7 @@ static int autofs_dir_open(struct inode *inode, struct file *file)
 	 * it.
 	 */
 	spin_lock(&sbi->lookup_lock);
-	if (!path_is_mountpoint(&file->f_path) && simple_empty(dentry)) {
+	if (!path_is_mountpoint(&file->f_path) && autofs_empty(ino)) {
 		spin_unlock(&sbi->lookup_lock);
 		return -ENOENT;
 	}
@@ -364,7 +365,7 @@ static struct vfsmount *autofs_d_automount(struct path *path)
 		 * the mount never trigger mounts themselves (they have an
 		 * autofs trigger mount mounted on them). But v4 pseudo direct
 		 * mounts do need the leaves to trigger mounts. In this case
-		 * we have no choice but to use the list_empty() check and
+		 * we have no choice but to use the autofs_empty() check and
 		 * require user space behave.
 		 */
 		if (sbi->version > 4) {
@@ -373,7 +374,7 @@ static struct vfsmount *autofs_d_automount(struct path *path)
 				goto done;
 			}
 		} else {
-			if (!simple_empty(dentry)) {
+			if (!autofs_empty(ino)) {
 				spin_unlock(&sbi->fs_lock);
 				goto done;
 			}
@@ -428,9 +429,8 @@ static int autofs_d_manage(const struct path *path, bool rcu_walk)
 
 	if (rcu_walk) {
 		/* We don't need fs_lock in rcu_walk mode,
-		 * just testing 'AUTOFS_INFO_NO_RCU' is enough.
-		 * simple_empty() takes a spinlock, so leave it
-		 * to last.
+		 * just testing 'AUTOFS_INF_WANT_EXPIRE' is enough.
+		 *
 		 * We only return -EISDIR when certain this isn't
 		 * a mount-trap.
 		 */
@@ -443,9 +443,7 @@ static int autofs_d_manage(const struct path *path, bool rcu_walk)
 		inode = d_inode_rcu(dentry);
 		if (inode && S_ISLNK(inode->i_mode))
 			return -EISDIR;
-		if (list_empty(&dentry->d_subdirs))
-			return 0;
-		if (!simple_empty(dentry))
+		if (!autofs_empty(ino))
 			return -EISDIR;
 		return 0;
 	}
@@ -465,7 +463,7 @@ static int autofs_d_manage(const struct path *path, bool rcu_walk)
 		 * we can avoid needless calls ->d_automount() and avoid
 		 * an incorrect ELOOP error return.
 		 */
-		if ((!path_is_mountpoint(path) && !simple_empty(dentry)) ||
+		if ((!path_is_mountpoint(path) && !autofs_empty(ino)) ||
 		    (d_really_is_positive(dentry) && d_is_symlink(dentry)))
 			status = -EISDIR;
 	}

From ba97a0a3a31a2451607ebf601c0b7c4b1322ce9a Mon Sep 17 00:00:00 2001
From: Ian Kent <raven@themaw.net>
Date: Fri, 8 Jul 2022 09:43:18 +0800
Subject: [PATCH 50/72] autofs: add comment about autofs_mountpoint_changed()

The function autofs_mountpoint_changed() is unusual, add a comment about
two cases for which it is needed.

Link: https://lkml.kernel.org/r/165724459804.30914.10974834416046555127.stgit@donald.themaw.net
Signed-off-by: Ian Kent <raven@themaw.net>
Cc: Al Viro <viro@ZenIV.linux.org.uk>
Cc: David Howells <dhowells@redhat.com>
Cc: Miklos Szeredi <miklos@szeredi.hu>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 fs/autofs/root.c | 23 ++++++++++++++++++++---
 1 file changed, 20 insertions(+), 3 deletions(-)

diff --git a/fs/autofs/root.c b/fs/autofs/root.c
index e0fa71eb5c0587..ca03c1cae2be17 100644
--- a/fs/autofs/root.c
+++ b/fs/autofs/root.c
@@ -291,9 +291,26 @@ static struct dentry *autofs_mountpoint_changed(struct path *path)
 	struct dentry *dentry = path->dentry;
 	struct autofs_sb_info *sbi = autofs_sbi(dentry->d_sb);
 
-	/*
-	 * If this is an indirect mount the dentry could have gone away
-	 * as a result of an expire and a new one created.
+	/* If this is an indirect mount the dentry could have gone away
+	 * and a new one created.
+	 *
+	 * This is unusual and I can't remember the case for which it
+	 * was originally added now. But an example of how this can
+	 * happen is an autofs indirect mount that has the "browse"
+	 * option set and also has the "symlink" option in the autofs
+	 * map entry. In this case the daemon will remove the browse
+	 * directory and create a symlink as the mount leaving the
+	 * struct path stale.
+	 *
+	 * Another not so obvious case is when a mount in an autofs
+	 * indirect mount that uses the "nobrowse" option is being
+	 * expired at the same time as a path walk. If the mount has
+	 * been umounted but the mount point directory seen before
+	 * becoming unhashed (during a lockless path walk) when a stat
+	 * family system call is made the mount won't be re-mounted as
+	 * it should. In this case the mount point that's been removed
+	 * (by the daemon) will be stale and the a new mount point
+	 * dentry created.
 	 */
 	if (autofs_type_indirect(sbi->type) && d_unhashed(dentry)) {
 		struct dentry *parent = dentry->d_parent;

From 7ffe4e90a061a2f612b3b8c29b583ec3b707781f Mon Sep 17 00:00:00 2001
From: Ian Kent <raven@themaw.net>
Date: Fri, 8 Jul 2022 09:43:23 +0800
Subject: [PATCH 51/72] autofs: remove unused ino field inode

Remove the unused inode field of the autofs dentry info structure.

Link: https://lkml.kernel.org/r/165724460393.30914.6511330213821246793.stgit@donald.themaw.net
Signed-off-by: Ian Kent <raven@themaw.net>
Cc: Al Viro <viro@ZenIV.linux.org.uk>
Cc: David Howells <dhowells@redhat.com>
Cc: Miklos Szeredi <miklos@szeredi.hu>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 fs/autofs/autofs_i.h | 2 --
 1 file changed, 2 deletions(-)

diff --git a/fs/autofs/autofs_i.h b/fs/autofs/autofs_i.h
index 0117d6e0630089..d5a44fa88acf9a 100644
--- a/fs/autofs/autofs_i.h
+++ b/fs/autofs/autofs_i.h
@@ -51,8 +51,6 @@ extern struct file_system_type autofs_fs_type;
  */
 struct autofs_info {
 	struct dentry	*dentry;
-	struct inode	*inode;
-
 	int		flags;
 
 	struct completion expire_complete;

From d919a1e79bac890421537cf02ae773007bf55e6b Mon Sep 17 00:00:00 2001
From: Zhihao Cheng <chengzhihao1@huawei.com>
Date: Wed, 13 Jul 2022 21:00:29 +0800
Subject: [PATCH 52/72] proc: fix a dentry lock race between release_task and
 lookup
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Commit 7bc3e6e55acf06 ("proc: Use a list of inodes to flush from proc")
moved proc_flush_task() behind __exit_signal().  As a result, systemd can
spend a long period at high cpu usage while releasing a task when the
following two processes run concurrently:

  systemd                                 ps
kernel_waitid                 stat(/proc/tgid)
  do_wait                       filename_lookup
    wait_consider_task            lookup_fast
      release_task
        __exit_signal
          __unhash_process
            detach_pid
              __change_pid // remove task->pid_links
                                     d_revalidate -> pid_revalidate  // 0
                                     d_invalidate(/proc/tgid)
                                       shrink_dcache_parent(/proc/tgid)
                                         d_walk(/proc/tgid)
                                           spin_lock_nested(/proc/tgid/fd)
                                           // iterating opened fd
        proc_flush_pid                                    |
           d_invalidate (/proc/tgid/fd)                   |
              shrink_dcache_parent(/proc/tgid/fd)         |
                shrink_dentry_list(subdirs)               ↓
                  shrink_lock_dentry(/proc/tgid/fd) --> race on dentry lock

Function d_invalidate() removes the dentry from the hash first, so why does
proc_flush_pid() process dentry '/proc/tgid/fd' before dentry
'/proc/tgid'?  That's because proc_pid_make_inode() adds proc inodes in
reverse order by invoking hlist_add_head_rcu().  But proc should not add
any inodes under '/proc/tgid' except '/proc/tgid/task/pid' to
'pid->inodes'.  Fix it by adding an inode into 'pid->inodes' only if the
inode is /proc/tgid or /proc/tgid/task/pid.

Performance regression:
Create 200 tasks, each task open one file for 50,000 times. Kill all
tasks when opened files exceed 10,000,000 (cat /proc/sys/fs/file-nr).

Before fix:
$ time killall -wq aa
  real    4m40.946s   # During this period, we can see 'ps' and 'systemd'
			taking high cpu usage.

After fix:
$ time killall -wq aa
  real    1m20.732s   # During this period, we can see 'systemd' taking
			high cpu usage.

Link: https://lkml.kernel.org/r/20220713130029.4133533-1-chengzhihao1@huawei.com
Fixes: 7bc3e6e55acf06 ("proc: Use a list of inodes to flush from proc")
Link: https://bugzilla.kernel.org/show_bug.cgi?id=216054
Signed-off-by: Zhihao Cheng <chengzhihao1@huawei.com>
Signed-off-by: Zhang Yi <yi.zhang@huawei.com>
Suggested-by: Brian Foster <bfoster@redhat.com>
Reviewed-by: Brian Foster <bfoster@redhat.com>
Cc: Al Viro <viro@zeniv.linux.org.uk>
Cc: Alexey Dobriyan <adobriyan@gmail.com>
Cc: Eric Biederman <ebiederm@xmission.com>
Cc: Matthew Wilcox <willy@infradead.org>
Cc: Baoquan He <bhe@redhat.com>
Cc: Kalesh Singh <kaleshsingh@google.com>
Cc: Yu Kuai <yukuai3@huawei.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 fs/proc/base.c | 46 ++++++++++++++++++++++++++++++++++++++--------
 1 file changed, 38 insertions(+), 8 deletions(-)

diff --git a/fs/proc/base.c b/fs/proc/base.c
index 8dfa36a99c7421..93f7e3d971e4bb 100644
--- a/fs/proc/base.c
+++ b/fs/proc/base.c
@@ -1885,7 +1885,7 @@ void proc_pid_evict_inode(struct proc_inode *ei)
 	put_pid(pid);
 }
 
-struct inode *proc_pid_make_inode(struct super_block * sb,
+struct inode *proc_pid_make_inode(struct super_block *sb,
 				  struct task_struct *task, umode_t mode)
 {
 	struct inode * inode;
@@ -1914,11 +1914,6 @@ struct inode *proc_pid_make_inode(struct super_block * sb,
 
 	/* Let the pid remember us for quick removal */
 	ei->pid = pid;
-	if (S_ISDIR(mode)) {
-		spin_lock(&pid->lock);
-		hlist_add_head_rcu(&ei->sibling_inodes, &pid->inodes);
-		spin_unlock(&pid->lock);
-	}
 
 	task_dump_owner(task, 0, &inode->i_uid, &inode->i_gid);
 	security_task_to_inode(task, inode);
@@ -1931,6 +1926,39 @@ struct inode *proc_pid_make_inode(struct super_block * sb,
 	return NULL;
 }
 
+/*
+ * Generating an inode and adding it into @pid->inodes, so that task will
+ * invalidate inode's dentry before being released.
+ *
+ * This helper is used for creating dir-type entries under '/proc' and
+ * '/proc/<tgid>/task'. Other entries(eg. fd, stat) under '/proc/<tgid>'
+ * can be released by invalidating '/proc/<tgid>' dentry.
+ * In theory, dentries under '/proc/<tgid>/task' can also be released by
+ * invalidating '/proc/<tgid>' dentry, we reserve it to handle single
+ * thread exiting situation: Any one of threads should invalidate its
+ * '/proc/<tgid>/task/<pid>' dentry before released.
+ */
+static struct inode *proc_pid_make_base_inode(struct super_block *sb,
+				struct task_struct *task, umode_t mode)
+{
+	struct inode *inode;
+	struct proc_inode *ei;
+	struct pid *pid;
+
+	inode = proc_pid_make_inode(sb, task, mode);
+	if (!inode)
+		return NULL;
+
+	/* Let proc_flush_pid find this directory inode */
+	ei = PROC_I(inode);
+	pid = ei->pid;
+	spin_lock(&pid->lock);
+	hlist_add_head_rcu(&ei->sibling_inodes, &pid->inodes);
+	spin_unlock(&pid->lock);
+
+	return inode;
+}
+
 int pid_getattr(struct user_namespace *mnt_userns, const struct path *path,
 		struct kstat *stat, u32 request_mask, unsigned int query_flags)
 {
@@ -3369,7 +3397,8 @@ static struct dentry *proc_pid_instantiate(struct dentry * dentry,
 {
 	struct inode *inode;
 
-	inode = proc_pid_make_inode(dentry->d_sb, task, S_IFDIR | S_IRUGO | S_IXUGO);
+	inode = proc_pid_make_base_inode(dentry->d_sb, task,
+					 S_IFDIR | S_IRUGO | S_IXUGO);
 	if (!inode)
 		return ERR_PTR(-ENOENT);
 
@@ -3671,7 +3700,8 @@ static struct dentry *proc_task_instantiate(struct dentry *dentry,
 	struct task_struct *task, const void *ptr)
 {
 	struct inode *inode;
-	inode = proc_pid_make_inode(dentry->d_sb, task, S_IFDIR | S_IRUGO | S_IXUGO);
+	inode = proc_pid_make_base_inode(dentry->d_sb, task,
+					 S_IFDIR | S_IRUGO | S_IXUGO);
 	if (!inode)
 		return ERR_PTR(-ENOENT);
 

From 3adb2d87238dea5e05bab747238bb47306b9cb56 Mon Sep 17 00:00:00 2001
From: Alexey Dobriyan <adobriyan@gmail.com>
Date: Tue, 12 Jul 2022 17:51:45 +0300
Subject: [PATCH 53/72] proc: fix test for "vsyscall=xonly" boot option

Booting with vsyscall=xonly results in the following vsyscall VMA:

	ffffffffff600000-ffffffffff601000 --xp ... [vsyscall]


The test reads from the fixed vsyscall address to determine if the kernel
supports the vsyscall page, but that doesn't work because, well, the
vsyscall page is execute-only.

Fix test by trying to execute from the first byte of the page which
contains gettimeofday() stub. This should work because vsyscall
entry points have stable addresses by design.

	Alexey, avoiding parsing .config, /proc/config.gz and
	/proc/cmdline at all costs.

Link: https://lkml.kernel.org/r/Ys2KgeiEMboU8Ytu@localhost.localdomain
Signed-off-by: Alexey Dobriyan <adobriyan@gmail.com>
Cc: Shuah Khan <skhan@linuxfoundation.org>
Cc: <dylanbhatch@google.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 tools/testing/selftests/proc/proc-pid-vm.c | 75 ++++++++++++++++++++--
 1 file changed, 68 insertions(+), 7 deletions(-)

diff --git a/tools/testing/selftests/proc/proc-pid-vm.c b/tools/testing/selftests/proc/proc-pid-vm.c
index 28604c9f805c75..e5962f4794f566 100644
--- a/tools/testing/selftests/proc/proc-pid-vm.c
+++ b/tools/testing/selftests/proc/proc-pid-vm.c
@@ -211,10 +211,19 @@ static int make_exe(const uint8_t *payload, size_t len)
 }
 #endif
 
-static bool g_vsyscall = false;
+/*
+ * 0: vsyscall VMA doesn't exist	vsyscall=none
+ * 1: vsyscall VMA is r-xp		vsyscall=emulate
+ * 2: vsyscall VMA is --xp		vsyscall=xonly
+ */
+static int g_vsyscall;
+static const char *str_vsyscall;
 
-static const char str_vsyscall[] =
+static const char str_vsyscall_0[] = "";
+static const char str_vsyscall_1[] =
 "ffffffffff600000-ffffffffff601000 r-xp 00000000 00:00 0                  [vsyscall]\n";
+static const char str_vsyscall_2[] =
+"ffffffffff600000-ffffffffff601000 --xp 00000000 00:00 0                  [vsyscall]\n";
 
 #ifdef __x86_64__
 static void sigaction_SIGSEGV(int _, siginfo_t *__, void *___)
@@ -223,13 +232,47 @@ static void sigaction_SIGSEGV(int _, siginfo_t *__, void *___)
 }
 
 /*
- * vsyscall page can't be unmapped, probe it with memory load.
+ * vsyscall page can't be unmapped, probe it directly.
  */
 static void vsyscall(void)
 {
 	pid_t pid;
 	int wstatus;
 
+	pid = fork();
+	if (pid < 0) {
+		fprintf(stderr, "fork, errno %d\n", errno);
+		exit(1);
+	}
+	if (pid == 0) {
+		struct rlimit rlim = {0, 0};
+		(void)setrlimit(RLIMIT_CORE, &rlim);
+
+		/* Hide "segfault at ffffffffff600000" messages. */
+		struct sigaction act;
+		memset(&act, 0, sizeof(struct sigaction));
+		act.sa_flags = SA_SIGINFO;
+		act.sa_sigaction = sigaction_SIGSEGV;
+		(void)sigaction(SIGSEGV, &act, NULL);
+
+		/* gettimeofday(NULL, NULL); */
+		asm volatile (
+			"call %P0"
+			:
+			: "i" (0xffffffffff600000), "D" (NULL), "S" (NULL)
+			: "rax", "rcx", "r11"
+		);
+		exit(0);
+	}
+	waitpid(pid, &wstatus, 0);
+	if (WIFEXITED(wstatus) && WEXITSTATUS(wstatus) == 0) {
+		/* vsyscall page exists and is executable. */
+	} else {
+		/* vsyscall page doesn't exist. */
+		g_vsyscall = 0;
+		return;
+	}
+
 	pid = fork();
 	if (pid < 0) {
 		fprintf(stderr, "fork, errno %d\n", errno);
@@ -251,8 +294,13 @@ static void vsyscall(void)
 	}
 	waitpid(pid, &wstatus, 0);
 	if (WIFEXITED(wstatus) && WEXITSTATUS(wstatus) == 0) {
-		g_vsyscall = true;
+		/* vsyscall page is readable and executable. */
+		g_vsyscall = 1;
+		return;
 	}
+
+	/* vsyscall page is executable but unreadable. */
+	g_vsyscall = 2;
 }
 
 int main(void)
@@ -261,6 +309,19 @@ int main(void)
 	int exec_fd;
 
 	vsyscall();
+	switch (g_vsyscall) {
+	case 0:
+		str_vsyscall = str_vsyscall_0;
+		break;
+	case 1:
+		str_vsyscall = str_vsyscall_1;
+		break;
+	case 2:
+		str_vsyscall = str_vsyscall_2;
+		break;
+	default:
+		abort();
+	}
 
 	atexit(ate);
 
@@ -314,7 +375,7 @@ int main(void)
 
 	/* Test /proc/$PID/maps */
 	{
-		const size_t len = strlen(buf0) + (g_vsyscall ? strlen(str_vsyscall) : 0);
+		const size_t len = strlen(buf0) + strlen(str_vsyscall);
 		char buf[256];
 		ssize_t rv;
 		int fd;
@@ -327,7 +388,7 @@ int main(void)
 		rv = read(fd, buf, sizeof(buf));
 		assert(rv == len);
 		assert(memcmp(buf, buf0, strlen(buf0)) == 0);
-		if (g_vsyscall) {
+		if (g_vsyscall > 0) {
 			assert(memcmp(buf + strlen(buf0), str_vsyscall, strlen(str_vsyscall)) == 0);
 		}
 	}
@@ -374,7 +435,7 @@ int main(void)
 			assert(memmem(buf, rv, S[i], strlen(S[i])));
 		}
 
-		if (g_vsyscall) {
+		if (g_vsyscall > 0) {
 			assert(memmem(buf, rv, str_vsyscall, strlen(str_vsyscall)));
 		}
 	}

From 1298f83b546921506afa4050a833f21433ed4b88 Mon Sep 17 00:00:00 2001
From: "Souptick Joarder (HPE)" <jrdr.linux@gmail.com>
Date: Sun, 26 Jun 2022 07:51:14 +0530
Subject: [PATCH 54/72] ia64: old_rr4 added under CONFIG_HUGETLB_PAGE

kernel test robot throws below warning ->

arch/ia64/include/asm/mmu_context.h: In function 'reload_context':
   arch/ia64/include/asm/mmu_context.h:127:48: warning: variable
'old_rr4' set but not used [-Wunused-but-set-variable]
     127 |         unsigned long rr0, rr1, rr2, rr3, rr4, old_rr4;

Add it under CONFIG_HUGETLB_PAGE

Link: https://lkml.kernel.org/r/20220626022114.4020-1-jrdr.linux@gmail.com
Signed-off-by: Souptick Joarder (HPE) <jrdr.linux@gmail.com>
Reported-by: kernel test robot <lkp@intel.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 arch/ia64/include/asm/mmu_context.h | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/arch/ia64/include/asm/mmu_context.h b/arch/ia64/include/asm/mmu_context.h
index 87a0d5bc11ef04..06257e355d0076 100644
--- a/arch/ia64/include/asm/mmu_context.h
+++ b/arch/ia64/include/asm/mmu_context.h
@@ -124,9 +124,12 @@ reload_context (nv_mm_context_t context)
 {
 	unsigned long rid;
 	unsigned long rid_incr = 0;
-	unsigned long rr0, rr1, rr2, rr3, rr4, old_rr4;
+	unsigned long rr0, rr1, rr2, rr3, rr4;
 
+#ifdef CONFIG_HUGETLB_PAGE
+	unsigned long old_rr4;
 	old_rr4 = ia64_get_rr(RGN_BASE(RGN_HPAGE));
+#endif
 	rid = context << 3;	/* make space for encoding the region number */
 	rid_incr = 1 << 8;
 

From 233eb8d6894ed3349e9971a51dd8a9b5586e2598 Mon Sep 17 00:00:00 2001
From: Jiangshan Yi <yijiangshan@kylinos.cn>
Date: Fri, 15 Jul 2022 14:00:35 +0800
Subject: [PATCH 55/72] fs/ocfs2: Fix spelling typo in comment

Fix spelling typo in comment.

Link: https://lkml.kernel.org/r/20220715060035.632903-1-13667453960@163.com
Signed-off-by: Jiangshan Yi <yijiangshan@kylinos.cn>
Reported-by: k2ci <kernel-bot@kylinos.cn>
Acked-by: Joseph Qi <joseph.qi@linux.alibaba.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 fs/ocfs2/quota_global.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/fs/ocfs2/quota_global.c b/fs/ocfs2/quota_global.c
index 0b6f551a342a17..dc9f76ab7e13c3 100644
--- a/fs/ocfs2/quota_global.c
+++ b/fs/ocfs2/quota_global.c
@@ -412,7 +412,7 @@ int ocfs2_global_read_info(struct super_block *sb, int type)
 	goto out_err;
 }
 
-/* Write information to global quota file. Expects exlusive lock on quota
+/* Write information to global quota file. Expects exclusive lock on quota
  * file inode and quota info */
 static int __ocfs2_global_write_info(struct super_block *sb, int type)
 {

From 0c12185728d602c27cd12a845249e7f37197f71f Mon Sep 17 00:00:00 2001
From: Hsin-Yi Wang <hsinyi@chromium.org>
Date: Fri, 17 Jun 2022 16:38:09 +0800
Subject: [PATCH 56/72] Revert "squashfs: provide backing_dev_info in order to
 disable read-ahead"

Patch series "Implement readahead for squashfs", v7.

Commit 9eec1d897139 ("squashfs: provide backing_dev_info in order to
disable read-ahead") mitigates the performance drop issue for squashfs by
disabling readahead for it.

This series implements readahead callback for squashfs.


This patch (of 4):

This reverts 9eec1d897139e5 ("squashfs: provide backing_dev_info in order
to disable read-ahead").

Revert closing the readahead to squashfs since the readahead callback for
squashfs is implemented.

Link: https://lkml.kernel.org/r/20220617083810.337573-1-hsinyi@chromium.org
Link: https://lkml.kernel.org/r/20220617083810.337573-2-hsinyi@chromium.org
Signed-off-by: Hsin-Yi Wang <hsinyi@chromium.org>
Suggested-by: Xiongwei Song <Xiongwei.Song@windriver.com>
Cc: Phillip Lougher <phillip@squashfs.org.uk>
Cc: Matthew Wilcox <willy@infradead.org>
Cc: Marek Szyprowski <m.szyprowski@samsung.com>
Cc: Zheng Liang <zhengliang6@huawei.com>
Cc: Zhang Yi <yi.zhang@huawei.com>
Cc: Hou Tao <houtao1@huawei.com>
Cc: Miao Xie <miaoxie@huawei.com>

Cc: kernel test robot <lkp@intel.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 fs/squashfs/super.c | 33 ---------------------------------
 1 file changed, 33 deletions(-)

diff --git a/fs/squashfs/super.c b/fs/squashfs/super.c
index 6d594ba2ed28ff..32565dafa7f3ba 100644
--- a/fs/squashfs/super.c
+++ b/fs/squashfs/super.c
@@ -29,7 +29,6 @@
 #include <linux/module.h>
 #include <linux/magic.h>
 #include <linux/xattr.h>
-#include <linux/backing-dev.h>
 
 #include "squashfs_fs.h"
 #include "squashfs_fs_sb.h"
@@ -113,24 +112,6 @@ static const struct squashfs_decompressor *supported_squashfs_filesystem(
 	return decompressor;
 }
 
-static int squashfs_bdi_init(struct super_block *sb)
-{
-	int err;
-	unsigned int major = MAJOR(sb->s_dev);
-	unsigned int minor = MINOR(sb->s_dev);
-
-	bdi_put(sb->s_bdi);
-	sb->s_bdi = &noop_backing_dev_info;
-
-	err = super_setup_bdi_name(sb, "squashfs_%u_%u", major, minor);
-	if (err)
-		return err;
-
-	sb->s_bdi->ra_pages = 0;
-	sb->s_bdi->io_pages = 0;
-
-	return 0;
-}
 
 static int squashfs_fill_super(struct super_block *sb, struct fs_context *fc)
 {
@@ -146,20 +127,6 @@ static int squashfs_fill_super(struct super_block *sb, struct fs_context *fc)
 
 	TRACE("Entered squashfs_fill_superblock\n");
 
-	/*
-	 * squashfs provides 'backing_dev_info' in order to disable read-ahead. For
-	 * squashfs, I/O is not deferred, it is done immediately in read_folio,
-	 * which means the user would always have to wait their own I/O. So the effect
-	 * of readahead is very weak for squashfs. squashfs_bdi_init will set
-	 * sb->s_bdi->ra_pages and sb->s_bdi->io_pages to 0 and close readahead for
-	 * squashfs.
-	 */
-	err = squashfs_bdi_init(sb);
-	if (err) {
-		errorf(fc, "squashfs init bdi failed");
-		return err;
-	}
-
 	sb->s_fs_info = kzalloc(sizeof(*msblk), GFP_KERNEL);
 	if (sb->s_fs_info == NULL) {
 		ERROR("Failed to allocate squashfs_sb_info\n");

From db98b43086275350294f5c6f797249b714d6316d Mon Sep 17 00:00:00 2001
From: Phillip Lougher <phillip@squashfs.org.uk>
Date: Fri, 17 Jun 2022 16:38:11 +0800
Subject: [PATCH 57/72] squashfs: always build "file direct" version of page
 actor

Squashfs_readahead uses the "file direct" version of the page actor, and
so build it unconditionally.

Link: https://lkml.kernel.org/r/20220617083810.337573-3-hsinyi@chromium.org
Signed-off-by: Phillip Lougher <phillip@squashfs.org.uk>
Signed-off-by: Hsin-Yi Wang <hsinyi@chromium.org>
Reported-by: kernel test robot <lkp@intel.com>
Cc: Hou Tao <houtao1@huawei.com>
Cc: Marek Szyprowski <m.szyprowski@samsung.com>
Cc: Matthew Wilcox <willy@infradead.org>
Cc: Miao Xie <miaoxie@huawei.com>
Cc: Xiongwei Song <Xiongwei.Song@windriver.com>
Cc: Zhang Yi <yi.zhang@huawei.com>
Cc: Zheng Liang <zhengliang6@huawei.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 fs/squashfs/Makefile     |  4 ++--
 fs/squashfs/page_actor.h | 46 ----------------------------------------
 2 files changed, 2 insertions(+), 48 deletions(-)

diff --git a/fs/squashfs/Makefile b/fs/squashfs/Makefile
index 7bd9b8b856d0bf..477c89a519ee88 100644
--- a/fs/squashfs/Makefile
+++ b/fs/squashfs/Makefile
@@ -5,9 +5,9 @@
 
 obj-$(CONFIG_SQUASHFS) += squashfs.o
 squashfs-y += block.o cache.o dir.o export.o file.o fragment.o id.o inode.o
-squashfs-y += namei.o super.o symlink.o decompressor.o
+squashfs-y += namei.o super.o symlink.o decompressor.o page_actor.o
 squashfs-$(CONFIG_SQUASHFS_FILE_CACHE) += file_cache.o
-squashfs-$(CONFIG_SQUASHFS_FILE_DIRECT) += file_direct.o page_actor.o
+squashfs-$(CONFIG_SQUASHFS_FILE_DIRECT) += file_direct.o
 squashfs-$(CONFIG_SQUASHFS_DECOMP_SINGLE) += decompressor_single.o
 squashfs-$(CONFIG_SQUASHFS_DECOMP_MULTI) += decompressor_multi.o
 squashfs-$(CONFIG_SQUASHFS_DECOMP_MULTI_PERCPU) += decompressor_multi_percpu.o
diff --git a/fs/squashfs/page_actor.h b/fs/squashfs/page_actor.h
index 37523c54256fa7..24841d28bc0fb8 100644
--- a/fs/squashfs/page_actor.h
+++ b/fs/squashfs/page_actor.h
@@ -6,51 +6,6 @@
  * Phillip Lougher <phillip@squashfs.org.uk>
  */
 
-#ifndef CONFIG_SQUASHFS_FILE_DIRECT
-struct squashfs_page_actor {
-	void	**page;
-	int	pages;
-	int	length;
-	int	next_page;
-};
-
-static inline struct squashfs_page_actor *squashfs_page_actor_init(void **page,
-	int pages, int length)
-{
-	struct squashfs_page_actor *actor = kmalloc(sizeof(*actor), GFP_KERNEL);
-
-	if (actor == NULL)
-		return NULL;
-
-	actor->length = length ? : pages * PAGE_SIZE;
-	actor->page = page;
-	actor->pages = pages;
-	actor->next_page = 0;
-	return actor;
-}
-
-static inline void *squashfs_first_page(struct squashfs_page_actor *actor)
-{
-	actor->next_page = 1;
-	return actor->page[0];
-}
-
-static inline void *squashfs_next_page(struct squashfs_page_actor *actor)
-{
-	return actor->next_page == actor->pages ? NULL :
-		actor->page[actor->next_page++];
-}
-
-static inline void squashfs_finish_page(struct squashfs_page_actor *actor)
-{
-	/* empty */
-}
-
-static inline void squashfs_actor_nobuff(struct squashfs_page_actor *actor)
-{
-	/* empty */
-}
-#else
 struct squashfs_page_actor {
 	union {
 		void		**buffer;
@@ -91,4 +46,3 @@ static inline void squashfs_actor_nobuff(struct squashfs_page_actor *actor)
 	actor->alloc_buffer = 0;
 }
 #endif
-#endif

From 8fc78b6fe24c36b151ac98d7546591ed92083d4f Mon Sep 17 00:00:00 2001
From: Hsin-Yi Wang <hsinyi@chromium.org>
Date: Fri, 17 Jun 2022 16:38:13 +0800
Subject: [PATCH 58/72] squashfs: implement readahead

Implement readahead callback for squashfs.  It will read datablocks which
cover pages in readahead request.  For a few cases it will not mark page
as uptodate, including:

- file end is 0.
- zero filled blocks.
- current batch of pages isn't in the same datablock.
- decompressor error.

Otherwise pages will be marked as uptodate.  The unhandled pages will be
updated by readpage later.

Link: https://lkml.kernel.org/r/20220617083810.337573-4-hsinyi@chromium.org
Signed-off-by: Hsin-Yi Wang <hsinyi@chromium.org>
Suggested-by: Matthew Wilcox <willy@infradead.org>
Reported-by: Matthew Wilcox <willy@infradead.org>
Reported-by: Phillip Lougher <phillip@squashfs.org.uk>
Reported-by: Xiongwei Song <Xiongwei.Song@windriver.com>
Reported-by: Andrew Morton <akpm@linux-foundation.org>
Cc: Hou Tao <houtao1@huawei.com>
Cc: kernel test robot <lkp@intel.com>
Cc: Marek Szyprowski <m.szyprowski@samsung.com>
Cc: Miao Xie <miaoxie@huawei.com>
Cc: Zhang Yi <yi.zhang@huawei.com>
Cc: Zheng Liang <zhengliang6@huawei.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 fs/squashfs/file.c | 92 +++++++++++++++++++++++++++++++++++++++++++++-
 1 file changed, 91 insertions(+), 1 deletion(-)

diff --git a/fs/squashfs/file.c b/fs/squashfs/file.c
index a8e495d8eb8600..128ebe9aded87d 100644
--- a/fs/squashfs/file.c
+++ b/fs/squashfs/file.c
@@ -39,6 +39,7 @@
 #include "squashfs_fs_sb.h"
 #include "squashfs_fs_i.h"
 #include "squashfs.h"
+#include "page_actor.h"
 
 /*
  * Locate cache slot in range [offset, index] for specified inode.  If
@@ -495,7 +496,96 @@ static int squashfs_read_folio(struct file *file, struct folio *folio)
 	return 0;
 }
 
+static void squashfs_readahead(struct readahead_control *ractl)
+{
+	struct inode *inode = ractl->mapping->host;
+	struct squashfs_sb_info *msblk = inode->i_sb->s_fs_info;
+	size_t mask = (1UL << msblk->block_log) - 1;
+	unsigned short shift = msblk->block_log - PAGE_SHIFT;
+	loff_t start = readahead_pos(ractl) & ~mask;
+	size_t len = readahead_length(ractl) + readahead_pos(ractl) - start;
+	struct squashfs_page_actor *actor;
+	unsigned int nr_pages = 0;
+	struct page **pages;
+	int i, file_end = i_size_read(inode) >> msblk->block_log;
+	unsigned int max_pages = 1UL << shift;
+
+	readahead_expand(ractl, start, (len | mask) + 1);
+
+	if (file_end == 0)
+		return;
+
+	pages = kmalloc_array(max_pages, sizeof(void *), GFP_KERNEL);
+	if (!pages)
+		return;
+
+	for (;;) {
+		pgoff_t index;
+		int res, bsize;
+		u64 block = 0;
+		unsigned int expected;
+
+		nr_pages = __readahead_batch(ractl, pages, max_pages);
+		if (!nr_pages)
+			break;
+
+		if (readahead_pos(ractl) >= i_size_read(inode))
+			goto skip_pages;
+
+		index = pages[0]->index >> shift;
+		if ((pages[nr_pages - 1]->index >> shift) != index)
+			goto skip_pages;
+
+		expected = index == file_end ?
+			   (i_size_read(inode) & (msblk->block_size - 1)) :
+			    msblk->block_size;
+
+		bsize = read_blocklist(inode, index, &block);
+		if (bsize == 0)
+			goto skip_pages;
+
+		actor = squashfs_page_actor_init_special(msblk, pages, nr_pages,
+							 expected);
+		if (!actor)
+			goto skip_pages;
+
+		res = squashfs_read_data(inode->i_sb, block, bsize, NULL, actor);
+
+		kfree(actor);
+
+		if (res == expected) {
+			int bytes;
+
+			/* Last page (if present) may have trailing bytes not filled */
+			bytes = res % PAGE_SIZE;
+			if (pages[nr_pages - 1]->index == file_end && bytes)
+				memzero_page(pages[nr_pages - 1], bytes,
+					     PAGE_SIZE - bytes);
+
+			for (i = 0; i < nr_pages; i++) {
+				flush_dcache_page(pages[i]);
+				SetPageUptodate(pages[i]);
+			}
+		}
+
+		for (i = 0; i < nr_pages; i++) {
+			unlock_page(pages[i]);
+			put_page(pages[i]);
+		}
+	}
+
+	kfree(pages);
+	return;
+
+skip_pages:
+	for (i = 0; i < nr_pages; i++) {
+		unlock_page(pages[i]);
+		put_page(pages[i]);
+	}
+	kfree(pages);
+}
 
 const struct address_space_operations squashfs_aops = {
-	.read_folio = squashfs_read_folio
+	.read_folio = squashfs_read_folio,
+	.readahead = squashfs_readahead
 };

From b09a7a036d2035b14636cd4c4c69518d73770f65 Mon Sep 17 00:00:00 2001
From: Phillip Lougher <phillip@squashfs.org.uk>
Date: Fri, 17 Jun 2022 16:38:15 +0800
Subject: [PATCH 59/72] squashfs: support reading fragments in readahead call

Add a function which can be used to read fragments in the readahead call.

This function is necessary because filesystems built with the -tailends
(or -always-use-fragments) option may have fragments present which cannot
be currently handled.

Link: https://lkml.kernel.org/r/20220617083810.337573-5-hsinyi@chromium.org
Signed-off-by: Phillip Lougher <phillip@squashfs.org.uk>
Signed-off-by: Hsin-Yi Wang <hsinyi@chromium.org>
Cc: Hou Tao <houtao1@huawei.com>
Cc: kernel test robot <lkp@intel.com>
Cc: Marek Szyprowski <m.szyprowski@samsung.com>
Cc: Matthew Wilcox <willy@infradead.org>
Cc: Miao Xie <miaoxie@huawei.com>
Cc: Xiongwei Song <Xiongwei.Song@windriver.com>
Cc: Zhang Yi <yi.zhang@huawei.com>
Cc: Zheng Liang <zhengliang6@huawei.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 fs/squashfs/file.c | 47 +++++++++++++++++++++++++++++++++++++++++++---
 1 file changed, 44 insertions(+), 3 deletions(-)

diff --git a/fs/squashfs/file.c b/fs/squashfs/file.c
index 128ebe9aded87d..7ff0b03cceab01 100644
--- a/fs/squashfs/file.c
+++ b/fs/squashfs/file.c
@@ -496,6 +496,41 @@ static int squashfs_read_folio(struct file *file, struct folio *folio)
 	return 0;
 }
 
+static int squashfs_readahead_fragment(struct page **page,
+	unsigned int pages, unsigned int expected)
+{
+	struct inode *inode = page[0]->mapping->host;
+	struct squashfs_cache_entry *buffer = squashfs_get_fragment(inode->i_sb,
+		squashfs_i(inode)->fragment_block,
+		squashfs_i(inode)->fragment_size);
+	struct squashfs_sb_info *msblk = inode->i_sb->s_fs_info;
+	unsigned int n, mask = (1 << (msblk->block_log - PAGE_SHIFT)) - 1;
+
+	if (buffer->error)
+		goto out;
+
+	expected += squashfs_i(inode)->fragment_offset;
+
+	for (n = 0; n < pages; n++) {
+		unsigned int base = (page[n]->index & mask) << PAGE_SHIFT;
+		unsigned int offset = base + squashfs_i(inode)->fragment_offset;
+
+		if (expected > offset) {
+			unsigned int avail = min_t(unsigned int, expected -
+				offset, PAGE_SIZE);
+
+			squashfs_fill_page(page[n], buffer, offset, avail);
+		}
+
+		unlock_page(page[n]);
+		put_page(page[n]);
+	}
+
+out:
+	squashfs_cache_put(buffer);
+	return buffer->error;
+}
+
 static void squashfs_readahead(struct readahead_control *ractl)
 {
 	struct inode *inode = ractl->mapping->host;
@@ -512,9 +547,6 @@ static void squashfs_readahead(struct readahead_control *ractl)
 
 	readahead_expand(ractl, start, (len | mask) + 1);
 
-	if (file_end == 0)
-		return;
-
 	pages = kmalloc_array(max_pages, sizeof(void *), GFP_KERNEL);
 	if (!pages)
 		return;
@@ -540,6 +572,15 @@ static void squashfs_readahead(struct readahead_control *ractl)
 			   (i_size_read(inode) & (msblk->block_size - 1)) :
 			    msblk->block_size;
 
+		if (index == file_end && squashfs_i(inode)->fragment_block !=
+						SQUASHFS_INVALID_BLK) {
+			res = squashfs_readahead_fragment(pages, nr_pages,
+							  expected);
+			if (res)
+				goto skip_pages;
+			continue;
+		}
+
 		bsize = read_blocklist(inode, index, &block);
 		if (bsize == 0)
 			goto skip_pages;

From a10c9ede9913fd54be61bbb01884e647e83dfcae Mon Sep 17 00:00:00 2001
From: Jiangshan Yi <yijiangshan@kylinos.cn>
Date: Thu, 14 Jul 2022 09:54:41 +0800
Subject: [PATCH 60/72] lib/lzo/lzo1x_compress.c: replace ternary operator with
 min() and min_t()

Fix the following coccicheck warning:

lib/lzo/lzo1x_compress.c:54: WARNING opportunity for min().
lib/lzo/lzo1x_compress.c:329: WARNING opportunity for min().

The min() and min_t() macros are defined in include/linux/minmax.h.  They
avoid multiple evaluations of the arguments when non-constant and perform
strict type-checking.

Link: https://lkml.kernel.org/r/20220714015441.1313036-1-13667453960@163.com
Signed-off-by: Jiangshan Yi <yijiangshan@kylinos.cn>
Tested-by: Dave Rodgman <dave.rodgman@arm.com>
Cc: Jonathan Corbet <corbet@lwn.net>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 lib/lzo/lzo1x_compress.c | 6 ++----
 1 file changed, 2 insertions(+), 4 deletions(-)

diff --git a/lib/lzo/lzo1x_compress.c b/lib/lzo/lzo1x_compress.c
index 76758e9296ba65..9d31e7126606ac 100644
--- a/lib/lzo/lzo1x_compress.c
+++ b/lib/lzo/lzo1x_compress.c
@@ -50,9 +50,7 @@ lzo1x_1_do_compress(const unsigned char *in, size_t in_len,
 
 		if (dv == 0 && bitstream_version) {
 			const unsigned char *ir = ip + 4;
-			const unsigned char *limit = ip_end
-				< (ip + MAX_ZERO_RUN_LENGTH + 1)
-				? ip_end : ip + MAX_ZERO_RUN_LENGTH + 1;
+			const unsigned char *limit = min(ip_end, ip + MAX_ZERO_RUN_LENGTH + 1);
 #if defined(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS) && \
 	defined(LZO_FAST_64BIT_MEMORY_ACCESS)
 			u64 dv64;
@@ -326,7 +324,7 @@ static int lzogeneric1x_1_compress(const unsigned char *in, size_t in_len,
 	data_start = op;
 
 	while (l > 20) {
-		size_t ll = l <= (m4_max_offset + 1) ? l : (m4_max_offset + 1);
+		size_t ll = min_t(size_t, l, m4_max_offset + 1);
 		uintptr_t ll_end = (uintptr_t) ip + ll;
 		if ((ll_end + ((t + ll) >> 5)) <= ll_end)
 			break;

From 591c32bddbe20ba0e172d9def3c7f22b9c926ad9 Mon Sep 17 00:00:00 2001
From: Ben Dooks <ben.dooks@sifive.com>
Date: Thu, 14 Jul 2022 08:47:44 +0100
Subject: [PATCH 61/72] kernel/hung_task: fix address space of
 proc_dohung_task_timeout_secs

The proc_dohung_task_timeout_secs() function is incorrectly marked
as having a __user buffer as argument 3. However this is not the
case and it is causing multiple sparse warnings. Fix the following
warnings by removing __user from the argument:

kernel/hung_task.c:237:52: warning: incorrect type in argument 3 (different address spaces)
kernel/hung_task.c:237:52:    expected void *
kernel/hung_task.c:237:52:    got void [noderef] __user *buffer
kernel/hung_task.c:287:35: warning: incorrect type in initializer (incompatible argument 3 (different address spaces))
kernel/hung_task.c:287:35:    expected int ( [usertype] *proc_handler )( ... )
kernel/hung_task.c:287:35:    got int ( * )( ... )
kernel/hung_task.c:295:35: warning: incorrect type in initializer (incompatible argument 3 (different address spaces))
kernel/hung_task.c:295:35:    expected int ( [usertype] *proc_handler )( ... )
kernel/hung_task.c:295:35:    got int ( * )( ... )

Link: https://lkml.kernel.org/r/20220714074744.189017-1-ben.dooks@sifive.com
Signed-off-by: Ben Dooks <ben.dooks@sifive.com>
Cc: <Conor.Dooley@microchip.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 kernel/hung_task.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/kernel/hung_task.c b/kernel/hung_task.c
index cff3ae8c818fd3..bb2354f73dedca 100644
--- a/kernel/hung_task.c
+++ b/kernel/hung_task.c
@@ -229,7 +229,7 @@ static long hung_timeout_jiffies(unsigned long last_checked,
  * Process updating of timeout sysctl
  */
 static int proc_dohung_task_timeout_secs(struct ctl_table *table, int write,
-				  void __user *buffer,
+				  void *buffer,
 				  size_t *lenp, loff_t *ppos)
 {
 	int ret;

From fa7d574ba4f4f3f4f78d432c8545d9045daa89b1 Mon Sep 17 00:00:00 2001
From: Xiu Jianfeng <xiujianfeng@huawei.com>
Date: Tue, 19 Jul 2022 16:33:49 +0800
Subject: [PATCH 62/72] bdi: remove enum wb_congested_state

enum wb_congested_state and the member 'congested' in bdi_writeback are
useless since commit a88f2096d5a2 ("remove congestion tracking
framework"), so remove them.

Link: https://lkml.kernel.org/r/20220719083349.87547-1-xiujianfeng@huawei.com
Signed-off-by: Xiu Jianfeng <xiujianfeng@huawei.com>
Reviewed-by: Jan Kara <jack@suse.cz>
Cc: NeilBrown <neilb@suse.de>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/backing-dev-defs.h | 7 -------
 1 file changed, 7 deletions(-)

diff --git a/include/linux/backing-dev-defs.h b/include/linux/backing-dev-defs.h
index e863c88df95f97..ae12696ec492c6 100644
--- a/include/linux/backing-dev-defs.h
+++ b/include/linux/backing-dev-defs.h
@@ -28,11 +28,6 @@ enum wb_state {
 	WB_start_all,		/* nr_pages == 0 (all) work pending */
 };
 
-enum wb_congested_state {
-	WB_async_congested,	/* The async (write) queue is getting full */
-	WB_sync_congested,	/* The sync queue is getting full */
-};
-
 enum wb_stat_item {
 	WB_RECLAIMABLE,
 	WB_WRITEBACK,
@@ -122,8 +117,6 @@ struct bdi_writeback {
 	atomic_t writeback_inodes;	/* number of inodes under writeback */
 	struct percpu_counter stat[NR_WB_STAT_ITEMS];
 
-	unsigned long congested;	/* WB_[a]sync_congested flags */
-
 	unsigned long bw_time_stamp;	/* last time write bw is updated */
 	unsigned long dirtied_stamp;
 	unsigned long written_stamp;	/* pages written at bw_time_stamp */

From ed8fb78d7ecdeb3e2e86df0027e2c2cc55f9908b Mon Sep 17 00:00:00 2001
From: Alexey Dobriyan <adobriyan@gmail.com>
Date: Sat, 23 Jul 2022 20:09:07 +0300
Subject: [PATCH 63/72] proc: add some (hopefully) insightful comments

* /proc/${pid}/net status
* removing PDE vs last close stuff (again!)
* random small stuff

Link: https://lkml.kernel.org/r/YtwrM6sDC0OQ53YB@localhost.localdomain
Signed-off-by: Alexey Dobriyan <adobriyan@gmail.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 fs/proc/array.c    |  4 ++++
 fs/proc/inode.c    | 17 ++++++++++++-----
 fs/proc/proc_net.c |  6 ++++++
 fs/proc/root.c     |  5 +++++
 4 files changed, 27 insertions(+), 5 deletions(-)

diff --git a/fs/proc/array.c b/fs/proc/array.c
index 65fa603422e04d..99fcbfda8e2593 100644
--- a/fs/proc/array.c
+++ b/fs/proc/array.c
@@ -99,6 +99,10 @@ void proc_task_name(struct seq_file *m, struct task_struct *p, bool escape)
 {
 	char tcomm[64];
 
+	/*
+	 * Test before PF_KTHREAD because all workqueue worker threads are
+	 * kernel threads.
+	 */
 	if (p->flags & PF_WQ_WORKER)
 		wq_worker_comm(tcomm, sizeof(tcomm), p);
 	else if (p->flags & PF_KTHREAD)
diff --git a/fs/proc/inode.c b/fs/proc/inode.c
index fd40d60169b5a2..f130499ad8432d 100644
--- a/fs/proc/inode.c
+++ b/fs/proc/inode.c
@@ -212,7 +212,15 @@ static void unuse_pde(struct proc_dir_entry *pde)
 		complete(pde->pde_unload_completion);
 }
 
-/* pde is locked on entry, unlocked on exit */
+/*
+ * At most 2 contexts can enter this function: the one doing the last
+ * close on the descriptor and whoever is deleting PDE itself.
+ *
+ * First to enter calls ->proc_release hook and signals its completion
+ * to the second one which waits and then does nothing.
+ *
+ * PDE is locked on entry, unlocked on exit.
+ */
 static void close_pdeo(struct proc_dir_entry *pde, struct pde_opener *pdeo)
 	__releases(&pde->pde_unload_lock)
 {
@@ -222,9 +230,6 @@ static void close_pdeo(struct proc_dir_entry *pde, struct pde_opener *pdeo)
 	 *
 	 * rmmod (remove_proc_entry() et al) can't delete an entry and proceed:
 	 * "struct file" needs to be available at the right moment.
-	 *
-	 * Therefore, first process to enter this function does ->release() and
-	 * signals its completion to the other process which does nothing.
 	 */
 	if (pdeo->closing) {
 		/* somebody else is doing that, just wait */
@@ -238,10 +243,12 @@ static void close_pdeo(struct proc_dir_entry *pde, struct pde_opener *pdeo)
 
 		pdeo->closing = true;
 		spin_unlock(&pde->pde_unload_lock);
+
 		file = pdeo->file;
 		pde->proc_ops->proc_release(file_inode(file), file);
+
 		spin_lock(&pde->pde_unload_lock);
-		/* After ->release. */
+		/* Strictly after ->proc_release, see above. */
 		list_del(&pdeo->lh);
 		c = pdeo->c;
 		spin_unlock(&pde->pde_unload_lock);
diff --git a/fs/proc/proc_net.c b/fs/proc/proc_net.c
index bbce6fbe779c8c..856839b8ae8b7e 100644
--- a/fs/proc/proc_net.c
+++ b/fs/proc/proc_net.c
@@ -350,6 +350,12 @@ static __net_init int proc_net_ns_init(struct net *net)
 	kgid_t gid;
 	int err;
 
+	/*
+	 * This PDE acts only as an anchor for /proc/${pid}/net hierarchy.
+	 * Corresponding inode (PDE(inode) == net->proc_net) is never
+	 * instantiated therefore blanket zeroing is fine.
+	 * net->proc_net_stat inode is instantiated normally.
+	 */
 	err = -ENOMEM;
 	netd = kmem_cache_zalloc(proc_dir_entry_cache, GFP_KERNEL);
 	if (!netd)
diff --git a/fs/proc/root.c b/fs/proc/root.c
index 5a7d15d197f8e0..3c2ee3eb1138aa 100644
--- a/fs/proc/root.c
+++ b/fs/proc/root.c
@@ -302,6 +302,11 @@ void __init proc_root_init(void)
 	proc_mkdir("bus", NULL);
 	proc_sys_init();
 
+	/*
+	 * Last things last. It is not like userspace processes eager
+	 * to open /proc files exist at this point but register last
+	 * anyway.
+	 */
 	register_filesystem(&proc_fs_type);
 }
 

From cf069c3b47fed4e475a13e3ec89451fbdb88869a Mon Sep 17 00:00:00 2001
From: Slark Xiao <slark_xiao@163.com>
Date: Fri, 22 Jul 2022 18:19:22 +0800
Subject: [PATCH 64/72] lib/mpi: fix typo 'the the' in comment

Replace 'the the' with 'the' in the comment.

Link: https://lkml.kernel.org/r/20220722101922.81126-1-slark_xiao@163.com
Signed-off-by: Slark Xiao <slark_xiao@163.com>
Cc: Hongbo Li <herberthbli@tencent.com>
Cc: Herbert Xu <herbert@gondor.apana.org.au>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 lib/mpi/mpiutil.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/lib/mpi/mpiutil.c b/lib/mpi/mpiutil.c
index bc81419f400c55..aa8c46544af8e0 100644
--- a/lib/mpi/mpiutil.c
+++ b/lib/mpi/mpiutil.c
@@ -272,7 +272,7 @@ MPI mpi_set_ui(MPI w, unsigned long u)
 	if (!w)
 		w = mpi_alloc(1);
 	/* FIXME: If U is 0 we have no need to resize and thus possible
-	 * allocating the the limbs.
+	 * allocating the limbs.
 	 */
 	RESIZE_IF_NEEDED(w, 1);
 	w->d[0] = u;

From 97d3b2676fc6bc4865eb825037f4492f0fb804eb Mon Sep 17 00:00:00 2001
From: Christophe JAILLET <christophe.jaillet@wanadoo.fr>
Date: Thu, 21 Jul 2022 22:49:25 +0200
Subject: [PATCH 65/72] ocfs2: remove some useless functions

Patch series "ocfs2: A few clean_ups", v2.

__ocfs2_node_map_set_bit() and __ocfs2_node_map_clear_bit() are just
wrappers around set_bit() and clear_bit().

The leading __ also makes one think that these functions are non-atomic,
just like __set_bit() and __clear_bit().

So, just remove these wrappers and call set_bit() and clear_bit()
directly.

Link: https://lkml.kernel.org/r/cover.1658436259.git.christophe.jaillet@wanadoo.fr
Link: https://lkml.kernel.org/r/bd1429c84ec7d174c96dbb67a2b42b1b456d9394.1658436259.git.christophe.jaillet@wanadoo.fr
Signed-off-by: Christophe JAILLET <christophe.jaillet@wanadoo.fr>
Reviewed-by: Joseph Qi <joseph.qi@linux.alibaba.com>
Cc: Mark Fasheh <mark@fasheh.com>
Cc: Joel Becker <jlbec@evilplan.org>
Cc: Junxiao Bi <junxiao.bi@oracle.com>
Cc: Changwei Ge <gechangwei@live.cn>
Cc: Gang He <ghe@suse.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 fs/ocfs2/heartbeat.c | 21 ++-------------------
 1 file changed, 2 insertions(+), 19 deletions(-)

diff --git a/fs/ocfs2/heartbeat.c b/fs/ocfs2/heartbeat.c
index 9099d8fc759999..1d72e078894384 100644
--- a/fs/ocfs2/heartbeat.c
+++ b/fs/ocfs2/heartbeat.c
@@ -24,11 +24,6 @@
 
 #include "buffer_head_io.h"
 
-static inline void __ocfs2_node_map_set_bit(struct ocfs2_node_map *map,
-					    int bit);
-static inline void __ocfs2_node_map_clear_bit(struct ocfs2_node_map *map,
-					      int bit);
-
 /* special case -1 for now
  * TODO: should *really* make sure the calling func never passes -1!!  */
 static void ocfs2_node_map_init(struct ocfs2_node_map *map)
@@ -65,12 +60,6 @@ void ocfs2_do_node_down(int node_num, void *data)
 	ocfs2_recovery_thread(osb, node_num);
 }
 
-static inline void __ocfs2_node_map_set_bit(struct ocfs2_node_map *map,
-					    int bit)
-{
-	set_bit(bit, map->map);
-}
-
 void ocfs2_node_map_set_bit(struct ocfs2_super *osb,
 			    struct ocfs2_node_map *map,
 			    int bit)
@@ -79,16 +68,10 @@ void ocfs2_node_map_set_bit(struct ocfs2_super *osb,
 		return;
 	BUG_ON(bit >= map->num_nodes);
 	spin_lock(&osb->node_map_lock);
-	__ocfs2_node_map_set_bit(map, bit);
+	set_bit(bit, map->map);
 	spin_unlock(&osb->node_map_lock);
 }
 
-static inline void __ocfs2_node_map_clear_bit(struct ocfs2_node_map *map,
-					      int bit)
-{
-	clear_bit(bit, map->map);
-}
-
 void ocfs2_node_map_clear_bit(struct ocfs2_super *osb,
 			      struct ocfs2_node_map *map,
 			      int bit)
@@ -97,7 +80,7 @@ void ocfs2_node_map_clear_bit(struct ocfs2_super *osb,
 		return;
 	BUG_ON(bit >= map->num_nodes);
 	spin_lock(&osb->node_map_lock);
-	__ocfs2_node_map_clear_bit(map, bit);
+	clear_bit(bit, map->map);
 	spin_unlock(&osb->node_map_lock);
 }
 

From 702f3cf374b85d2e77431c80e870ee31ea03cdd8 Mon Sep 17 00:00:00 2001
From: Christophe JAILLET <christophe.jaillet@wanadoo.fr>
Date: Thu, 21 Jul 2022 22:49:37 +0200
Subject: [PATCH 66/72] ocfs2: use the bitmap API to simplify code

Use bitmap_zero() instead of hand-writing it.  It is less verbose.

While at it, add an explicit #include <linux/bitmap.h>.

Link: https://lkml.kernel.org/r/86d2a027c319db12055c98f00c65f7d01e703722.1658436259.git.christophe.jaillet@wanadoo.fr
Signed-off-by: Christophe JAILLET <christophe.jaillet@wanadoo.fr>
Reviewed-by: Joseph Qi <joseph.qi@linux.alibaba.com>
Cc: Mark Fasheh <mark@fasheh.com>
Cc: Joel Becker <jlbec@evilplan.org>
Cc: Junxiao Bi <junxiao.bi@oracle.com>
Cc: Changwei Ge <gechangwei@live.cn>
Cc: Gang He <ghe@suse.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 fs/ocfs2/heartbeat.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/fs/ocfs2/heartbeat.c b/fs/ocfs2/heartbeat.c
index 1d72e078894384..dd29d60af1547d 100644
--- a/fs/ocfs2/heartbeat.c
+++ b/fs/ocfs2/heartbeat.c
@@ -8,6 +8,7 @@
  * Copyright (C) 2002, 2004 Oracle.  All rights reserved.
  */
 
+#include <linux/bitmap.h>
 #include <linux/fs.h>
 #include <linux/types.h>
 #include <linux/highmem.h>
@@ -29,8 +30,7 @@
 static void ocfs2_node_map_init(struct ocfs2_node_map *map)
 {
 	map->num_nodes = OCFS2_NODE_MAP_MAX_NODES;
-	memset(map->map, 0, BITS_TO_LONGS(OCFS2_NODE_MAP_MAX_NODES) *
-	       sizeof(unsigned long));
+	bitmap_zero(map->map, OCFS2_NODE_MAP_MAX_NODES);
 }
 
 void ocfs2_init_node_maps(struct ocfs2_super *osb)

From 45ee6d1e935d879d86aebd1fd15afb3bc015c4a0 Mon Sep 17 00:00:00 2001
From: Christophe JAILLET <christophe.jaillet@wanadoo.fr>
Date: Thu, 21 Jul 2022 22:49:48 +0200
Subject: [PATCH 67/72] ocfs2: fix a typo in a comment

s/heartbaet/heartbeat

Link: https://lkml.kernel.org/r/4d4a6786e8ad522bfad6d2401b7f6634f8af0e5d.1658436259.git.christophe.jaillet@wanadoo.fr
Signed-off-by: Christophe JAILLET <christophe.jaillet@wanadoo.fr>
Reviewed-by: Joseph Qi <joseph.qi@linux.alibaba.com>
Cc: Mark Fasheh <mark@fasheh.com>
Cc: Joel Becker <jlbec@evilplan.org>
Cc: Junxiao Bi <junxiao.bi@oracle.com>
Cc: Changwei Ge <gechangwei@live.cn>
Cc: Gang He <ghe@suse.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 fs/ocfs2/heartbeat.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/fs/ocfs2/heartbeat.c b/fs/ocfs2/heartbeat.c
index dd29d60af1547d..22da768e65b7ce 100644
--- a/fs/ocfs2/heartbeat.c
+++ b/fs/ocfs2/heartbeat.c
@@ -2,7 +2,7 @@
 /*
  * heartbeat.c
  *
- * Register ourselves with the heartbaet service, keep our node maps
+ * Register ourselves with the heartbeat service, keep our node maps
  * up to date, and fire off recovery when needed.
  *
  * Copyright (C) 2002, 2004 Oracle.  All rights reserved.

From 787dbea11a5d6843999ff71a3fb9aa1ed6d5d889 Mon Sep 17 00:00:00 2001
From: Ben Dooks <ben-linux@fluff.org>
Date: Thu, 21 Jul 2022 20:55:09 +0100
Subject: [PATCH 68/72] profile: setup_profiling_timer() is mostly not
 implemented

The setup_profiling_timer() function is not implemented by many
architectures.  In many places it isn't guarded by CONFIG_PROFILING,
which is needed for it to be used.  Make it a weak symbol in
kernel/profile.c and remove the 'return -EINVAL' implementations from the
kernel.

There are a couple of architectures which do return 0 from the
setup_profiling_timer() function but they don't seem to do anything else
with it.  To keep the /proc compatibility for now, leave these for a
future update or removal.

On ARM, this fixes the following sparse warning:
arch/arm/kernel/smp.c:793:5: warning: symbol 'setup_profiling_timer' was not declared. Should it be static?

Link: https://lkml.kernel.org/r/20220721195509.418205-1-ben-linux@fluff.org
Signed-off-by: Ben Dooks <ben-linux@fluff.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 arch/alpha/kernel/smp.c     | 6 ------
 arch/arc/kernel/smp.c       | 8 --------
 arch/arm/kernel/smp.c       | 8 --------
 arch/arm64/kernel/smp.c     | 8 --------
 arch/csky/kernel/smp.c      | 5 -----
 arch/hexagon/kernel/smp.c   | 5 -----
 arch/ia64/kernel/smp.c      | 6 ------
 arch/openrisc/kernel/smp.c  | 6 ------
 arch/parisc/kernel/smp.c    | 7 -------
 arch/powerpc/kernel/smp.c   | 7 -------
 arch/riscv/kernel/smp.c     | 6 ------
 arch/sparc/kernel/smp_32.c  | 5 -----
 arch/sparc/kernel/smp_64.c  | 6 ------
 arch/x86/include/asm/apic.h | 2 --
 arch/x86/kernel/apic/apic.c | 5 -----
 kernel/profile.c            | 8 ++++++--
 16 files changed, 6 insertions(+), 92 deletions(-)

diff --git a/arch/alpha/kernel/smp.c b/arch/alpha/kernel/smp.c
index cb64e4797d2a89..f4e20f75438f8b 100644
--- a/arch/alpha/kernel/smp.c
+++ b/arch/alpha/kernel/smp.c
@@ -497,12 +497,6 @@ smp_cpus_done(unsigned int max_cpus)
 	       ((bogosum + 2500) / (5000/HZ)) % 100);
 }
 
-int
-setup_profiling_timer(unsigned int multiplier)
-{
-	return -EINVAL;
-}
-
 static void
 send_ipi_message(const struct cpumask *to_whom, enum ipi_message_type operation)
 {
diff --git a/arch/arc/kernel/smp.c b/arch/arc/kernel/smp.c
index d947473f1e6da5..ab9e75e90f729d 100644
--- a/arch/arc/kernel/smp.c
+++ b/arch/arc/kernel/smp.c
@@ -232,14 +232,6 @@ int __cpu_up(unsigned int cpu, struct task_struct *idle)
 	return 0;
 }
 
-/*
- * not supported here
- */
-int setup_profiling_timer(unsigned int multiplier)
-{
-	return -EINVAL;
-}
-
 /*****************************************************************************/
 /*              Inter Processor Interrupt Handling                           */
 /*****************************************************************************/
diff --git a/arch/arm/kernel/smp.c b/arch/arm/kernel/smp.c
index 73fc645fc4c7e3..978db2d96b4469 100644
--- a/arch/arm/kernel/smp.c
+++ b/arch/arm/kernel/smp.c
@@ -787,14 +787,6 @@ void panic_smp_self_stop(void)
 		cpu_relax();
 }
 
-/*
- * not supported here
- */
-int setup_profiling_timer(unsigned int multiplier)
-{
-	return -EINVAL;
-}
-
 #ifdef CONFIG_CPU_FREQ
 
 static DEFINE_PER_CPU(unsigned long, l_p_j_ref);
diff --git a/arch/arm64/kernel/smp.c b/arch/arm64/kernel/smp.c
index 62ed361a4376ba..ffc5d76cf69555 100644
--- a/arch/arm64/kernel/smp.c
+++ b/arch/arm64/kernel/smp.c
@@ -1078,14 +1078,6 @@ bool smp_crash_stop_failed(void)
 }
 #endif
 
-/*
- * not supported here
- */
-int setup_profiling_timer(unsigned int multiplier)
-{
-	return -EINVAL;
-}
-
 static bool have_cpu_die(void)
 {
 #ifdef CONFIG_HOTPLUG_CPU
diff --git a/arch/csky/kernel/smp.c b/arch/csky/kernel/smp.c
index 6bb38bc2f39b43..4b605aa2e1d65d 100644
--- a/arch/csky/kernel/smp.c
+++ b/arch/csky/kernel/smp.c
@@ -243,11 +243,6 @@ void __init smp_cpus_done(unsigned int max_cpus)
 {
 }
 
-int setup_profiling_timer(unsigned int multiplier)
-{
-	return -EINVAL;
-}
-
 void csky_start_secondary(void)
 {
 	struct mm_struct *mm = &init_mm;
diff --git a/arch/hexagon/kernel/smp.c b/arch/hexagon/kernel/smp.c
index 619c56420aa0c7..4ba93e59370c41 100644
--- a/arch/hexagon/kernel/smp.c
+++ b/arch/hexagon/kernel/smp.c
@@ -240,11 +240,6 @@ void arch_send_call_function_ipi_mask(const struct cpumask *mask)
 	send_ipi(mask, IPI_CALL_FUNC);
 }
 
-int setup_profiling_timer(unsigned int multiplier)
-{
-	return -EINVAL;
-}
-
 void smp_start_cpus(void)
 {
 	int i;
diff --git a/arch/ia64/kernel/smp.c b/arch/ia64/kernel/smp.c
index 7b7b64eb312975..e2cc59db86bc2d 100644
--- a/arch/ia64/kernel/smp.c
+++ b/arch/ia64/kernel/smp.c
@@ -333,9 +333,3 @@ smp_send_stop (void)
 {
 	send_IPI_allbutself(IPI_CPU_STOP);
 }
-
-int
-setup_profiling_timer (unsigned int multiplier)
-{
-	return -EINVAL;
-}
diff --git a/arch/openrisc/kernel/smp.c b/arch/openrisc/kernel/smp.c
index 27041db2c8b0f5..e1419095a6f0af 100644
--- a/arch/openrisc/kernel/smp.c
+++ b/arch/openrisc/kernel/smp.c
@@ -197,12 +197,6 @@ void smp_send_stop(void)
 	smp_call_function(stop_this_cpu, NULL, 0);
 }
 
-/* not supported, yet */
-int setup_profiling_timer(unsigned int multiplier)
-{
-	return -EINVAL;
-}
-
 void __init set_smp_cross_call(void (*fn)(const struct cpumask *, unsigned int))
 {
 	smp_cross_call = fn;
diff --git a/arch/parisc/kernel/smp.c b/arch/parisc/kernel/smp.c
index 24d0744c3b3abc..7dbd92cafae38a 100644
--- a/arch/parisc/kernel/smp.c
+++ b/arch/parisc/kernel/smp.c
@@ -513,10 +513,3 @@ void __cpu_die(unsigned int cpu)
 
 	pdc_cpu_rendezvous_unlock();
 }
-
-#ifdef CONFIG_PROC_FS
-int setup_profiling_timer(unsigned int multiplier)
-{
-	return -EINVAL;
-}
-#endif
diff --git a/arch/powerpc/kernel/smp.c b/arch/powerpc/kernel/smp.c
index bcefab484ea61d..c037c26540ddc6 100644
--- a/arch/powerpc/kernel/smp.c
+++ b/arch/powerpc/kernel/smp.c
@@ -1674,13 +1674,6 @@ void start_secondary(void *unused)
 	BUG();
 }
 
-#ifdef CONFIG_PROFILING
-int setup_profiling_timer(unsigned int multiplier)
-{
-	return 0;
-}
-#endif
-
 static void __init fixup_topology(void)
 {
 	int i;
diff --git a/arch/riscv/kernel/smp.c b/arch/riscv/kernel/smp.c
index b5d30ea922925f..441d0ceb80adbc 100644
--- a/arch/riscv/kernel/smp.c
+++ b/arch/riscv/kernel/smp.c
@@ -64,12 +64,6 @@ bool arch_match_cpu_phys_id(int cpu, u64 phys_id)
 	return phys_id == cpuid_to_hartid_map(cpu);
 }
 
-/* Unsupported */
-int setup_profiling_timer(unsigned int multiplier)
-{
-	return -EINVAL;
-}
-
 static void ipi_stop(void)
 {
 	set_cpu_online(smp_processor_id(), false);
diff --git a/arch/sparc/kernel/smp_32.c b/arch/sparc/kernel/smp_32.c
index 22b148e5a5f88c..ad8094d955eba6 100644
--- a/arch/sparc/kernel/smp_32.c
+++ b/arch/sparc/kernel/smp_32.c
@@ -174,11 +174,6 @@ void smp_call_function_interrupt(void)
 	irq_exit();
 }
 
-int setup_profiling_timer(unsigned int multiplier)
-{
-	return -EINVAL;
-}
-
 void __init smp_prepare_cpus(unsigned int max_cpus)
 {
 	int i, cpuid, extra;
diff --git a/arch/sparc/kernel/smp_64.c b/arch/sparc/kernel/smp_64.c
index a1f78e9ddaf371..a55295d1b92448 100644
--- a/arch/sparc/kernel/smp_64.c
+++ b/arch/sparc/kernel/smp_64.c
@@ -1186,12 +1186,6 @@ void __irq_entry smp_penguin_jailcell(int irq, struct pt_regs *regs)
 	preempt_enable();
 }
 
-/* /proc/profile writes can call this, don't __init it please. */
-int setup_profiling_timer(unsigned int multiplier)
-{
-	return -EINVAL;
-}
-
 void __init smp_prepare_cpus(unsigned int max_cpus)
 {
 }
diff --git a/arch/x86/include/asm/apic.h b/arch/x86/include/asm/apic.h
index bd8ae0a7010ae5..3415321c8240c4 100644
--- a/arch/x86/include/asm/apic.h
+++ b/arch/x86/include/asm/apic.h
@@ -98,8 +98,6 @@ static inline bool apic_from_smp_config(void)
 #include <asm/paravirt.h>
 #endif
 
-extern int setup_profiling_timer(unsigned int);
-
 static inline void native_apic_mem_write(u32 reg, u32 v)
 {
 	volatile u32 *addr = (volatile u32 *)(APIC_BASE + reg);
diff --git a/arch/x86/kernel/apic/apic.c b/arch/x86/kernel/apic/apic.c
index 189d3a5e471adc..df764ceac2c85d 100644
--- a/arch/x86/kernel/apic/apic.c
+++ b/arch/x86/kernel/apic/apic.c
@@ -1115,11 +1115,6 @@ DEFINE_IDTENTRY_SYSVEC(sysvec_apic_timer_interrupt)
 	set_irq_regs(old_regs);
 }
 
-int setup_profiling_timer(unsigned int multiplier)
-{
-	return -EINVAL;
-}
-
 /*
  * Local APIC start and shutdown
  */
diff --git a/kernel/profile.c b/kernel/profile.c
index ae82ddfc6a6845..7ea01ba30e757e 100644
--- a/kernel/profile.c
+++ b/kernel/profile.c
@@ -425,6 +425,12 @@ read_profile(struct file *file, char __user *buf, size_t count, loff_t *ppos)
 	return read;
 }
 
+/* default is to not implement this call */
+int __weak setup_profiling_timer(unsigned mult)
+{
+	return -EINVAL;
+}
+
 /*
  * Writing to /proc/profile resets the counters
  *
@@ -435,8 +441,6 @@ static ssize_t write_profile(struct file *file, const char __user *buf,
 			     size_t count, loff_t *ppos)
 {
 #ifdef CONFIG_SMP
-	extern int setup_profiling_timer(unsigned int multiplier);
-
 	if (count == sizeof(int)) {
 		unsigned int multiplier;
 

From 50feece7f770cd5d850334716d71bb8ea4868810 Mon Sep 17 00:00:00 2001
From: Kirill Tkhai <tkhai@ya.ru>
Date: Mon, 25 Jul 2022 23:37:15 +0300
Subject: [PATCH 69/72] mailmap: update Kirill's email

I disconnected from both Virtuozzo and OpenVZ, so this updates my email to
point to my own.  I haven't used the @openvz address for patches, so let's
rewrite the line instead of adding a new one.  CC all previous addresses.

Link: https://lkml.kernel.org/r/14ca895b-e745-6ba2-8be8-652feacbc907@ya.ru
Signed-off-by: Kirill Tkhai <tkhai@ya.ru>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 .mailmap | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.mailmap b/.mailmap
index 2ed1cf86917530..04561cd90a0994 100644
--- a/.mailmap
+++ b/.mailmap
@@ -221,7 +221,7 @@ Kees Cook <keescook@chromium.org> <kees@ubuntu.com>
 Keith Busch <kbusch@kernel.org> <keith.busch@intel.com>
 Keith Busch <kbusch@kernel.org> <keith.busch@linux.intel.com>
 Kenneth W Chen <kenneth.w.chen@intel.com>
-Kirill Tkhai <kirill.tkhai@openvz.org> <ktkhai@virtuozzo.com>
+Kirill Tkhai <tkhai@ya.ru> <ktkhai@virtuozzo.com>
 Konstantin Khlebnikov <koct9i@gmail.com> <khlebnikov@yandex-team.ru>
 Konstantin Khlebnikov <koct9i@gmail.com> <k.khlebnikov@samsung.com>
 Koushik <raghavendra.koushik@neterion.com>

From 9f3cebf0bb84a2b94cb59df869f1463e4954e150 Mon Sep 17 00:00:00 2001
From: Brendan Higgins <brendan.higgins@linux.dev>
Date: Mon, 25 Jul 2022 17:58:33 -0400
Subject: [PATCH 70/72] mailmap: add linux.dev alias for Brendan Higgins

Because of my new remote work setup at Google, I can no longer use command
line tools with my google.com email address; for this reason I got a
linux.dev account.  Update the mailmap to show the new alias I will be
using.

Link: https://lkml.kernel.org/r/20220725215833.789133-1-brendan.higgins@linux.dev
Signed-off-by: Brendan Higgins <brendan.higgins@linux.dev>
Reviewed-by: David Gow <davidgow@google.com>
Cc: Shuah Khan <shuah@kernel.org>
Cc: Daniel Latypov <dlatypov@google.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 .mailmap | 1 +
 1 file changed, 1 insertion(+)

diff --git a/.mailmap b/.mailmap
index 04561cd90a0994..23e98625d1ce6f 100644
--- a/.mailmap
+++ b/.mailmap
@@ -71,6 +71,7 @@ Boris Brezillon <bbrezillon@kernel.org> <b.brezillon.dev@gmail.com>
 Boris Brezillon <bbrezillon@kernel.org> <b.brezillon@overkiz.com>
 Boris Brezillon <bbrezillon@kernel.org> <boris.brezillon@bootlin.com>
 Boris Brezillon <bbrezillon@kernel.org> <boris.brezillon@free-electrons.com>
+Brendan Higgins <brendan.higgins@linux.dev> <brendanhiggins@google.com>
 Brian Avery <b.avery@hp.com>
 Brian King <brking@us.ibm.com>
 Brian Silverman <bsilver16384@gmail.com> <brian.silverman@bluerivertech.com>

From 9f98911a9d6e08037903e8ecf44d50f4bcf2368d Mon Sep 17 00:00:00 2001
From: Brendan Higgins <brendan.higgins@linux.dev>
Date: Mon, 25 Jul 2022 18:07:37 -0400
Subject: [PATCH 71/72] MAINTAINERS: kunit: add David Gow as a maintainer of
 KUnit

David has been a de facto maintainer of KUnit for a long time now.
Formalize this in the MAINTAINERS file.

Link: https://lkml.kernel.org/r/20220725220737.790976-1-brendan.higgins@linux.dev
Signed-off-by: Brendan Higgins <brendan.higgins@linux.dev>
Reviewed-by: David Gow <davidgow@google.com>
Cc: Shuah Khan <shuah@kernel.org>
Cc: Daniel Latypov <dlatypov@google.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 MAINTAINERS | 1 +
 1 file changed, 1 insertion(+)

diff --git a/MAINTAINERS b/MAINTAINERS
index 52d1c5d0ca9514..ba745a624c756d 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -10818,6 +10818,7 @@ F:	fs/smbfs_common/
 
 KERNEL UNIT TESTING FRAMEWORK (KUnit)
 M:	Brendan Higgins <brendanhiggins@google.com>
+M:	David Gow <davidgow@google.com>
 L:	linux-kselftest@vger.kernel.org
 L:	kunit-dev@googlegroups.com
 S:	Maintained

From b99695580bfc1f91364023c673681ddb88e375dc Mon Sep 17 00:00:00 2001
From: Aaron Tomlin <atomlin@redhat.com>
Date: Tue, 12 Jul 2022 12:02:48 +0100
Subject: [PATCH 72/72] scripts/gdb: ensure the absolute path is generated on
 initial source

After 'make scripts_gdb', a symbolic link to scripts/gdb/vmlinux-gdb.py is
created.  Currently 'os.path.dirname(__file__)' does not generate the
absolute path to scripts/gdb, resulting in the following:

    (gdb) source vmlinux-gdb.py
    Traceback (most recent call last):
      File "scripts/gdb/vmlinux-gdb.py", line 25, in <module>
	import linux.utils
    ModuleNotFoundError: No module named 'linux'

This patch ensures that the absolute path to scripts/gdb in relation to
the given file is generated so each module can be located accordingly.

Link: https://lkml.kernel.org/r/20220712110248.1404125-1-atomlin@redhat.com
Signed-off-by: Aaron Tomlin <atomlin@redhat.com>
Reviewed-by: Douglas Anderson <dianders@chromium.org>
Cc: Jan Kiszka <jan.kiszka@siemens.com>
Cc: Kieran Bingham <kbingham@kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 scripts/gdb/vmlinux-gdb.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/scripts/gdb/vmlinux-gdb.py b/scripts/gdb/vmlinux-gdb.py
index 4136dc2c59df23..3e8d3669f0ce00 100644
--- a/scripts/gdb/vmlinux-gdb.py
+++ b/scripts/gdb/vmlinux-gdb.py
@@ -13,7 +13,7 @@
 
 import os
 
-sys.path.insert(0, os.path.dirname(__file__) + "/scripts/gdb")
+sys.path.insert(0, os.path.dirname(os.path.abspath(__file__)) + "/scripts/gdb")
 
 try:
     gdb.parse_and_eval("0")