Skip to content

Commit

Permalink
mm, hugetlb: fix racy resv_huge_pages underflow on UFFDIO_COPY
Browse files Browse the repository at this point in the history
On UFFDIO_COPY, if we fail to copy the page contents while holding the
hugetlb_fault_mutex, we will drop the mutex and return to the caller after
allocating a page that consumed a reservation.  In this case there may be
a fault that double consumes the reservation.  To handle this, we free the
allocated page, fix the reservations, and allocate a temporary hugetlb
page and return that to the caller.  When the caller does the copy outside
of the lock, we again check the cache, and allocate a page consuming the
reservation, and copy over the contents.

Test:
Hacked the code locally such that resv_huge_pages underflows produce
a warning and the copy_huge_page_from_user() always fails, then:

./tools/testing/selftests/vm/userfaultfd hugetlb_shared 10
        2 /tmp/kokonut_test/huge/userfaultfd_test && echo test success
./tools/testing/selftests/vm/userfaultfd hugetlb 10
	2 /tmp/kokonut_test/huge/userfaultfd_test && echo test success

Both tests succeed and produce no warnings. After the
test runs number of free/resv hugepages is correct.

[[email protected]: remove set but not used variable 'vm_alloc_shared']
  Link: https://lkml.kernel.org/r/[email protected]
[[email protected]: fix allocation error check and copy func name]
  Link: https://lkml.kernel.org/r/[email protected]

Link: https://lkml.kernel.org/r/[email protected]
Signed-off-by: Mina Almasry <[email protected]>
Signed-off-by: YueHaibing <[email protected]>
Cc: Axel Rasmussen <[email protected]>
Cc: Peter Xu <[email protected]>
Cc: Mike Kravetz <[email protected]>
Signed-off-by: Andrew Morton <[email protected]>
Signed-off-by: Linus Torvalds <[email protected]>
  • Loading branch information
mina authored and torvalds committed Jul 1, 2021
1 parent 22f3c95 commit 8cc5fcb
Show file tree
Hide file tree
Showing 4 changed files with 45 additions and 59 deletions.
4 changes: 4 additions & 0 deletions include/linux/migrate.h
Original file line number Diff line number Diff line change
Expand Up @@ -51,6 +51,7 @@ extern int migrate_huge_page_move_mapping(struct address_space *mapping,
struct page *newpage, struct page *page);
extern int migrate_page_move_mapping(struct address_space *mapping,
struct page *newpage, struct page *page, int extra_count);
extern void copy_huge_page(struct page *dst, struct page *src);
#else

static inline void putback_movable_pages(struct list_head *l) {}
Expand All @@ -77,6 +78,9 @@ static inline int migrate_huge_page_move_mapping(struct address_space *mapping,
return -ENOSYS;
}

static inline void copy_huge_page(struct page *dst, struct page *src)
{
}
#endif /* CONFIG_MIGRATION */

#ifdef CONFIG_COMPACTION
Expand Down
48 changes: 39 additions & 9 deletions mm/hugetlb.c
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,7 @@
#include <linux/numa.h>
#include <linux/llist.h>
#include <linux/cma.h>
#include <linux/migrate.h>

#include <asm/page.h>
#include <asm/pgalloc.h>
Expand Down Expand Up @@ -5076,20 +5077,17 @@ int hugetlb_mcopy_atomic_pte(struct mm_struct *dst_mm,
struct page **pagep)
{
bool is_continue = (mode == MCOPY_ATOMIC_CONTINUE);
struct address_space *mapping;
pgoff_t idx;
struct hstate *h = hstate_vma(dst_vma);
struct address_space *mapping = dst_vma->vm_file->f_mapping;
pgoff_t idx = vma_hugecache_offset(h, dst_vma, dst_addr);
unsigned long size;
int vm_shared = dst_vma->vm_flags & VM_SHARED;
struct hstate *h = hstate_vma(dst_vma);
pte_t _dst_pte;
spinlock_t *ptl;
int ret;
int ret = -ENOMEM;
struct page *page;
int writable;

mapping = dst_vma->vm_file->f_mapping;
idx = vma_hugecache_offset(h, dst_vma, dst_addr);

if (is_continue) {
ret = -EFAULT;
page = find_lock_page(mapping, idx);
Expand Down Expand Up @@ -5118,12 +5116,44 @@ int hugetlb_mcopy_atomic_pte(struct mm_struct *dst_mm,
/* fallback to copy_from_user outside mmap_lock */
if (unlikely(ret)) {
ret = -ENOENT;
/* Free the allocated page which may have
* consumed a reservation.
*/
restore_reserve_on_error(h, dst_vma, dst_addr, page);
put_page(page);

/* Allocate a temporary page to hold the copied
* contents.
*/
page = alloc_huge_page_vma(h, dst_vma, dst_addr);
if (!page) {
ret = -ENOMEM;
goto out;
}
*pagep = page;
/* don't free the page */
/* Set the outparam pagep and return to the caller to
* copy the contents outside the lock. Don't free the
* page.
*/
goto out;
}
} else {
page = *pagep;
if (vm_shared &&
hugetlbfs_pagecache_present(h, dst_vma, dst_addr)) {
put_page(*pagep);
ret = -EEXIST;
*pagep = NULL;
goto out;
}

page = alloc_huge_page(dst_vma, dst_addr, 0);
if (IS_ERR(page)) {
ret = -ENOMEM;
*pagep = NULL;
goto out;
}
copy_huge_page(page, *pagep);
put_page(*pagep);
*pagep = NULL;
}

Expand Down
2 changes: 1 addition & 1 deletion mm/migrate.c
Original file line number Diff line number Diff line change
Expand Up @@ -553,7 +553,7 @@ static void __copy_gigantic_page(struct page *dst, struct page *src,
}
}

static void copy_huge_page(struct page *dst, struct page *src)
void copy_huge_page(struct page *dst, struct page *src)
{
int i;
int nr_pages;
Expand Down
50 changes: 1 addition & 49 deletions mm/userfaultfd.c
Original file line number Diff line number Diff line change
Expand Up @@ -209,7 +209,6 @@ static __always_inline ssize_t __mcopy_atomic_hugetlb(struct mm_struct *dst_mm,
unsigned long len,
enum mcopy_atomic_mode mode)
{
int vm_alloc_shared = dst_vma->vm_flags & VM_SHARED;
int vm_shared = dst_vma->vm_flags & VM_SHARED;
ssize_t err;
pte_t *dst_pte;
Expand Down Expand Up @@ -308,7 +307,6 @@ static __always_inline ssize_t __mcopy_atomic_hugetlb(struct mm_struct *dst_mm,

mutex_unlock(&hugetlb_fault_mutex_table[hash]);
i_mmap_unlock_read(mapping);
vm_alloc_shared = vm_shared;

cond_resched();

Expand Down Expand Up @@ -346,54 +344,8 @@ static __always_inline ssize_t __mcopy_atomic_hugetlb(struct mm_struct *dst_mm,
out_unlock:
mmap_read_unlock(dst_mm);
out:
if (page) {
/*
* We encountered an error and are about to free a newly
* allocated huge page.
*
* Reservation handling is very subtle, and is different for
* private and shared mappings. See the routine
* restore_reserve_on_error for details. Unfortunately, we
* can not call restore_reserve_on_error now as it would
* require holding mmap_lock.
*
* If a reservation for the page existed in the reservation
* map of a private mapping, the map was modified to indicate
* the reservation was consumed when the page was allocated.
* We clear the HPageRestoreReserve flag now so that the global
* reserve count will not be incremented in free_huge_page.
* The reservation map will still indicate the reservation
* was consumed and possibly prevent later page allocation.
* This is better than leaking a global reservation. If no
* reservation existed, it is still safe to clear
* HPageRestoreReserve as no adjustments to reservation counts
* were made during allocation.
*
* The reservation map for shared mappings indicates which
* pages have reservations. When a huge page is allocated
* for an address with a reservation, no change is made to
* the reserve map. In this case HPageRestoreReserve will be
* set to indicate that the global reservation count should be
* incremented when the page is freed. This is the desired
* behavior. However, when a huge page is allocated for an
* address without a reservation a reservation entry is added
* to the reservation map, and HPageRestoreReserve will not be
* set. When the page is freed, the global reserve count will
* NOT be incremented and it will appear as though we have
* leaked reserved page. In this case, set HPageRestoreReserve
* so that the global reserve count will be incremented to
* match the reservation map entry which was created.
*
* Note that vm_alloc_shared is based on the flags of the vma
* for which the page was originally allocated. dst_vma could
* be different or NULL on error.
*/
if (vm_alloc_shared)
SetHPageRestoreReserve(page);
else
ClearHPageRestoreReserve(page);
if (page)
put_page(page);
}
BUG_ON(copied < 0);
BUG_ON(err > 0);
BUG_ON(!copied && !err);
Expand Down

0 comments on commit 8cc5fcb

Please sign in to comment.