Skip to content

Commit

Permalink
proc/vmcore: fix possible deadlock on concurrent mmap and read
Browse files Browse the repository at this point in the history
Lockdep noticed that there is chance for a deadlock if we have concurrent
mmap, concurrent read, and the addition/removal of a callback.

As nicely explained by Boqun:
 "Lockdep warned about the above sequences because rw_semaphore is a
  fair read-write lock, and the following can cause a deadlock:

	TASK 1			TASK 2		TASK 3
	======			======		======
	down_write(mmap_lock);
				down_read(vmcore_cb_rwsem)
						down_write(vmcore_cb_rwsem); // blocked
	down_read(vmcore_cb_rwsem); // cannot get the lock because of the fairness
				down_read(mmap_lock); // blocked

  IOW, a reader can block another read if there is a writer queued by
  the second reader and the lock is fair"

To fix this, convert to srcu to make this deadlock impossible.  We need
srcu as our callbacks can sleep.  With this change, I cannot trigger any
lockdep warnings.

    ======================================================
    WARNING: possible circular locking dependency detected
    5.17.0-0.rc0.20220117git0c947b893d69.68.test.fc36.x86_64 #1 Not tainted
    ------------------------------------------------------
    makedumpfile/542 is trying to acquire lock:
    ffffffff832d2eb8 (vmcore_cb_rwsem){.+.+}-{3:3}, at: mmap_vmcore+0x340/0x580

    but task is already holding lock:
    ffff8880af226438 (&mm->mmap_lock#2){++++}-{3:3}, at: vm_mmap_pgoff+0x84/0x150

    which lock already depends on the new lock.

    the existing dependency chain (in reverse order) is:

    -> #1 (&mm->mmap_lock#2){++++}-{3:3}:
           lock_acquire+0xc3/0x1a0
           __might_fault+0x4e/0x70
           _copy_to_user+0x1f/0x90
           __copy_oldmem_page+0x72/0xc0
           read_from_oldmem+0x77/0x1e0
           read_vmcore+0x2c2/0x310
           proc_reg_read+0x47/0xa0
           vfs_read+0x101/0x340
           __x64_sys_pread64+0x5d/0xa0
           do_syscall_64+0x43/0x90
           entry_SYSCALL_64_after_hwframe+0x44/0xae

    -> #0 (vmcore_cb_rwsem){.+.+}-{3:3}:
           validate_chain+0x9f4/0x2670
           __lock_acquire+0x8f7/0xbc0
           lock_acquire+0xc3/0x1a0
           down_read+0x4a/0x140
           mmap_vmcore+0x340/0x580
           proc_reg_mmap+0x3e/0x90
           mmap_region+0x504/0x880
           do_mmap+0x38a/0x520
           vm_mmap_pgoff+0xc1/0x150
           ksys_mmap_pgoff+0x178/0x200
           do_syscall_64+0x43/0x90
           entry_SYSCALL_64_after_hwframe+0x44/0xae

    other info that might help us debug this:

     Possible unsafe locking scenario:

           CPU0                    CPU1
           ----                    ----
      lock(&mm->mmap_lock#2);
                                   lock(vmcore_cb_rwsem);
                                   lock(&mm->mmap_lock#2);
      lock(vmcore_cb_rwsem);

     *** DEADLOCK ***

    1 lock held by makedumpfile/542:
     #0: ffff8880af226438 (&mm->mmap_lock#2){++++}-{3:3}, at: vm_mmap_pgoff+0x84/0x150

    stack backtrace:
    CPU: 0 PID: 542 Comm: makedumpfile Not tainted 5.17.0-0.rc0.20220117git0c947b893d69.68.test.fc36.x86_64 #1
    Hardware name: Red Hat KVM, BIOS 0.5.1 01/01/2011
    Call Trace:
     __lock_acquire+0x8f7/0xbc0
     lock_acquire+0xc3/0x1a0
     down_read+0x4a/0x140
     mmap_vmcore+0x340/0x580
     proc_reg_mmap+0x3e/0x90
     mmap_region+0x504/0x880
     do_mmap+0x38a/0x520
     vm_mmap_pgoff+0xc1/0x150
     ksys_mmap_pgoff+0x178/0x200
     do_syscall_64+0x43/0x90

Link: https://lkml.kernel.org/r/[email protected]
Fixes: cc5f270 ("proc/vmcore: convert oldmem_pfn_is_ram callback to more generic vmcore callbacks")
Signed-off-by: David Hildenbrand <[email protected]>
Reported-by: Baoquan He <[email protected]>
Acked-by: Baoquan He <[email protected]>
Cc: Vivek Goyal <[email protected]>
Cc: Dave Young <[email protected]>
Cc: "Paul E. McKenney" <[email protected]>
Cc: Josh Triplett <[email protected]>
Cc: Peter Zijlstra <[email protected]>
Cc: Boqun Feng <[email protected]>
Cc: <[email protected]>
Signed-off-by: Andrew Morton <[email protected]>
Signed-off-by: Linus Torvalds <[email protected]>
  • Loading branch information
davidhildenbrand authored and torvalds committed Mar 24, 2022
1 parent 3a72917 commit 5039b17
Showing 1 changed file with 22 additions and 19 deletions.
41 changes: 22 additions & 19 deletions fs/proc/vmcore.c
Original file line number Diff line number Diff line change
Expand Up @@ -62,39 +62,42 @@ core_param(novmcoredd, vmcoredd_disabled, bool, 0);
/* Device Dump Size */
static size_t vmcoredd_orig_sz;

static DECLARE_RWSEM(vmcore_cb_rwsem);
static DEFINE_SPINLOCK(vmcore_cb_lock);
DEFINE_STATIC_SRCU(vmcore_cb_srcu);
/* List of registered vmcore callbacks. */
static LIST_HEAD(vmcore_cb_list);
/* Whether the vmcore has been opened once. */
static bool vmcore_opened;

void register_vmcore_cb(struct vmcore_cb *cb)
{
down_write(&vmcore_cb_rwsem);
INIT_LIST_HEAD(&cb->next);
spin_lock(&vmcore_cb_lock);
list_add_tail(&cb->next, &vmcore_cb_list);
/*
* Registering a vmcore callback after the vmcore was opened is
* very unusual (e.g., manual driver loading).
*/
if (vmcore_opened)
pr_warn_once("Unexpected vmcore callback registration\n");
up_write(&vmcore_cb_rwsem);
spin_unlock(&vmcore_cb_lock);
}
EXPORT_SYMBOL_GPL(register_vmcore_cb);

void unregister_vmcore_cb(struct vmcore_cb *cb)
{
down_write(&vmcore_cb_rwsem);
list_del(&cb->next);
spin_lock(&vmcore_cb_lock);
list_del_rcu(&cb->next);
/*
* Unregistering a vmcore callback after the vmcore was opened is
* very unusual (e.g., forced driver removal), but we cannot stop
* unregistering.
*/
if (vmcore_opened)
pr_warn_once("Unexpected vmcore callback unregistration\n");
up_write(&vmcore_cb_rwsem);
spin_unlock(&vmcore_cb_lock);

synchronize_srcu(&vmcore_cb_srcu);
}
EXPORT_SYMBOL_GPL(unregister_vmcore_cb);

Expand All @@ -103,9 +106,8 @@ static bool pfn_is_ram(unsigned long pfn)
struct vmcore_cb *cb;
bool ret = true;

lockdep_assert_held_read(&vmcore_cb_rwsem);

list_for_each_entry(cb, &vmcore_cb_list, next) {
list_for_each_entry_srcu(cb, &vmcore_cb_list, next,
srcu_read_lock_held(&vmcore_cb_srcu)) {
if (unlikely(!cb->pfn_is_ram))
continue;
ret = cb->pfn_is_ram(cb, pfn);
Expand All @@ -118,9 +120,9 @@ static bool pfn_is_ram(unsigned long pfn)

static int open_vmcore(struct inode *inode, struct file *file)
{
down_read(&vmcore_cb_rwsem);
spin_lock(&vmcore_cb_lock);
vmcore_opened = true;
up_read(&vmcore_cb_rwsem);
spin_unlock(&vmcore_cb_lock);

return 0;
}
Expand All @@ -133,14 +135,15 @@ ssize_t read_from_oldmem(char *buf, size_t count,
unsigned long pfn, offset;
size_t nr_bytes;
ssize_t read = 0, tmp;
int idx;

if (!count)
return 0;

offset = (unsigned long)(*ppos % PAGE_SIZE);
pfn = (unsigned long)(*ppos / PAGE_SIZE);

down_read(&vmcore_cb_rwsem);
idx = srcu_read_lock(&vmcore_cb_srcu);
do {
if (count > (PAGE_SIZE - offset))
nr_bytes = PAGE_SIZE - offset;
Expand All @@ -165,7 +168,7 @@ ssize_t read_from_oldmem(char *buf, size_t count,
offset, userbuf);
}
if (tmp < 0) {
up_read(&vmcore_cb_rwsem);
srcu_read_unlock(&vmcore_cb_srcu, idx);
return tmp;
}

Expand All @@ -176,8 +179,8 @@ ssize_t read_from_oldmem(char *buf, size_t count,
++pfn;
offset = 0;
} while (count);
srcu_read_unlock(&vmcore_cb_srcu, idx);

up_read(&vmcore_cb_rwsem);
return read;
}

Expand Down Expand Up @@ -568,18 +571,18 @@ static int vmcore_remap_oldmem_pfn(struct vm_area_struct *vma,
unsigned long from, unsigned long pfn,
unsigned long size, pgprot_t prot)
{
int ret;
int ret, idx;

/*
* Check if oldmem_pfn_is_ram was registered to avoid
* looping over all pages without a reason.
* Check if a callback was registered to avoid looping over all
* pages without a reason.
*/
down_read(&vmcore_cb_rwsem);
idx = srcu_read_lock(&vmcore_cb_srcu);
if (!list_empty(&vmcore_cb_list))
ret = remap_oldmem_pfn_checked(vma, from, pfn, size, prot);
else
ret = remap_oldmem_pfn_range(vma, from, pfn, size, prot);
up_read(&vmcore_cb_rwsem);
srcu_read_unlock(&vmcore_cb_srcu, idx);
return ret;
}

Expand Down

0 comments on commit 5039b17

Please sign in to comment.