mmap: handle mlocked pages during map, remap, unmap

Originally by Nick Piggin <[email protected]> Remove mlocked pages from the LRU using "unevictable infrastructure" during mmap(), munmap(), mremap() and truncate(). Try to move back to normal LRU lists on munmap() when last mlocked mapping removed. Remove PageMlocked() status when page truncated from file. [[email protected]: cleanup] [[email protected]: fix double unlock_page()] [[email protected]: split LRU: munlock rework] [[email protected]: mlock: fix __mlock_vma_pages_range comment block] [[email protected]: remove bogus kerneldoc token] Signed-off-by: Nick Piggin <[email protected]> Signed-off-by: Lee Schermerhorn <[email protected]> Signed-off-by: Rik van Riel <[email protected]> Signed-off-by: KOSAKI Motohiro <[email protected]> Signed-off-by: KAMEZAWA Hiroyuki <[email protected]> Signed-off-by: Andrew Morton <[email protected]> Signed-off-by: Linus Torvalds <[email protected]>
diverger · Oct 20, 2008 · ba470de · ba470de
1 parent 8edb08c
commit ba470de
Show file tree

Hide file tree

Showing 6 changed files with 180 additions and 160 deletions.
diff --git a/mm/fremap.c b/mm/fremap.c
@@ -21,6 +21,8 @@
 #include <asm/cacheflush.h>
 #include <asm/tlbflush.h>
 
+#include "internal.h"
+
 static void zap_pte(struct mm_struct *mm, struct vm_area_struct *vma,
 			unsigned long addr, pte_t *ptep)
 {
@@ -215,15 +217,31 @@ asmlinkage long sys_remap_file_pages(unsigned long start, unsigned long size,
 		spin_unlock(&mapping->i_mmap_lock);
 	}
 
+	if (vma->vm_flags & VM_LOCKED) {
+		/*
+		 * drop PG_Mlocked flag for over-mapped range
+		 */
+		unsigned int saved_flags = vma->vm_flags;
+		munlock_vma_pages_range(vma, start, start + size);
+		vma->vm_flags = saved_flags;
+	}
+
 	mmu_notifier_invalidate_range_start(mm, start, start + size);
 	err = populate_range(mm, vma, start, size, pgoff);
 	mmu_notifier_invalidate_range_end(mm, start, start + size);
 	if (!err && !(flags & MAP_NONBLOCK)) {
-		if (unlikely(has_write_lock)) {
-			downgrade_write(&mm->mmap_sem);
-			has_write_lock = 0;
+		if (vma->vm_flags & VM_LOCKED) {
+			/*
+			 * might be mapping previously unmapped range of file
+			 */
+			mlock_vma_pages_range(vma, start, start + size);
+		} else {
+			if (unlikely(has_write_lock)) {
+				downgrade_write(&mm->mmap_sem);
+				has_write_lock = 0;
+			}
+			make_pages_present(start, start+size);
 		}
-		make_pages_present(start, start+size);
 	}
 
 	/*
@@ -240,4 +258,3 @@ asmlinkage long sys_remap_file_pages(unsigned long start, unsigned long size,
 
 	return err;
 }
-
diff --git a/mm/internal.h b/mm/internal.h
@@ -61,9 +61,14 @@ static inline unsigned long page_order(struct page *page)
 	return page_private(page);
 }
 
-extern int mlock_vma_pages_range(struct vm_area_struct *vma,
+extern long mlock_vma_pages_range(struct vm_area_struct *vma,
 			unsigned long start, unsigned long end);
-extern void munlock_vma_pages_all(struct vm_area_struct *vma);
+extern void munlock_vma_pages_range(struct vm_area_struct *vma,
+			unsigned long start, unsigned long end);
+static inline void munlock_vma_pages_all(struct vm_area_struct *vma)
+{
+	munlock_vma_pages_range(vma, vma->vm_start, vma->vm_end);
+}
 
 #ifdef CONFIG_UNEVICTABLE_LRU
 /*

diff --git a/mm/mlock.c b/mm/mlock.c
@@ -112,26 +112,49 @@ static void munlock_vma_page(struct page *page)
 	}
 }
 
-/*
- * mlock a range of pages in the vma.
+/**
+ * __mlock_vma_pages_range() -  mlock/munlock a range of pages in the vma.
+ * @vma:   target vma
+ * @start: start address
+ * @end:   end address
+ * @mlock: 0 indicate munlock, otherwise mlock.
+ *
+ * If @mlock == 0, unlock an mlocked range;
+ * else mlock the range of pages.  This takes care of making the pages present ,
+ * too.
  *
- * This takes care of making the pages present too.
+ * return 0 on success, negative error code on error.
  *
- * vma->vm_mm->mmap_sem must be held for write.
+ * vma->vm_mm->mmap_sem must be held for at least read.
  */
-static int __mlock_vma_pages_range(struct vm_area_struct *vma,
-			unsigned long start, unsigned long end)
+static long __mlock_vma_pages_range(struct vm_area_struct *vma,
+				   unsigned long start, unsigned long end,
+				   int mlock)
 {
 	struct mm_struct *mm = vma->vm_mm;
 	unsigned long addr = start;
 	struct page *pages[16]; /* 16 gives a reasonable batch */
-	int write = !!(vma->vm_flags & VM_WRITE);
 	int nr_pages = (end - start) / PAGE_SIZE;
 	int ret;
+	int gup_flags = 0;
 
-	VM_BUG_ON(start & ~PAGE_MASK || end & ~PAGE_MASK);
-	VM_BUG_ON(start < vma->vm_start || end > vma->vm_end);
-	VM_BUG_ON(!rwsem_is_locked(&vma->vm_mm->mmap_sem));
+	VM_BUG_ON(start & ~PAGE_MASK);
+	VM_BUG_ON(end   & ~PAGE_MASK);
+	VM_BUG_ON(start < vma->vm_start);
+	VM_BUG_ON(end   > vma->vm_end);
+	VM_BUG_ON((!rwsem_is_locked(&mm->mmap_sem)) &&
+		  (atomic_read(&mm->mm_users) != 0));
+
+	/*
+	 * mlock:   don't page populate if page has PROT_NONE permission.
+	 * munlock: the pages always do munlock althrough
+	 *          its has PROT_NONE permission.
+	 */
+	if (!mlock)
+		gup_flags |= GUP_FLAGS_IGNORE_VMA_PERMISSIONS;
+
+	if (vma->vm_flags & VM_WRITE)
+		gup_flags |= GUP_FLAGS_WRITE;
 
 	lru_add_drain_all();	/* push cached pages to LRU */
 
@@ -146,9 +169,9 @@ static int __mlock_vma_pages_range(struct vm_area_struct *vma,
 		 * disable migration of this page.  However, page may
 		 * still be truncated out from under us.
 		 */
-		ret = get_user_pages(current, mm, addr,
+		ret = __get_user_pages(current, mm, addr,
 				min_t(int, nr_pages, ARRAY_SIZE(pages)),
-				write, 0, pages, NULL);
+				gup_flags, pages, NULL);
 		/*
 		 * This can happen for, e.g., VM_NONLINEAR regions before
 		 * a page has been allocated and mapped at a given offset,
@@ -178,8 +201,12 @@ static int __mlock_vma_pages_range(struct vm_area_struct *vma,
 			 * by the elevated reference, we need only check for
 			 * page truncation (file-cache only).
 			 */
-			if (page->mapping)
-				mlock_vma_page(page);
+			if (page->mapping) {
+				if (mlock)
+					mlock_vma_page(page);
+				else
+					munlock_vma_page(page);
+			}
 			unlock_page(page);
 			put_page(page);		/* ref from get_user_pages() */
 
@@ -197,125 +224,38 @@ static int __mlock_vma_pages_range(struct vm_area_struct *vma,
 	return 0;	/* count entire vma as locked_vm */
 }
 
-/*
- * private structure for munlock page table walk
- */
-struct munlock_page_walk {
-	struct vm_area_struct *vma;
-	pmd_t                 *pmd; /* for migration_entry_wait() */
-};
-
-/*
- * munlock normal pages for present ptes
- */
-static int __munlock_pte_handler(pte_t *ptep, unsigned long addr,
-				   unsigned long end, struct mm_walk *walk)
-{
-	struct munlock_page_walk *mpw = walk->private;
-	swp_entry_t entry;
-	struct page *page;
-	pte_t pte;
-
-retry:
-	pte = *ptep;
-	/*
-	 * If it's a swap pte, we might be racing with page migration.
-	 */
-	if (unlikely(!pte_present(pte))) {
-		if (!is_swap_pte(pte))
-			goto out;
-		entry = pte_to_swp_entry(pte);
-		if (is_migration_entry(entry)) {
-			migration_entry_wait(mpw->vma->vm_mm, mpw->pmd, addr);
-			goto retry;
-		}
-		goto out;
-	}
-
-	page = vm_normal_page(mpw->vma, addr, pte);
-	if (!page)
-		goto out;
-
-	lock_page(page);
-	if (!page->mapping) {
-		unlock_page(page);
-		goto retry;
-	}
-	munlock_vma_page(page);
-	unlock_page(page);
-
-out:
-	return 0;
-}
-
-/*
- * Save pmd for pte handler for waiting on migration entries
- */
-static int __munlock_pmd_handler(pmd_t *pmd, unsigned long addr,
-				 unsigned long end, struct mm_walk *walk)
-{
-	struct munlock_page_walk *mpw = walk->private;
-
-	mpw->pmd = pmd;
-	return 0;
-}
-
-
-/*
- * munlock a range of pages in the vma using standard page table walk.
- *
- * vma->vm_mm->mmap_sem must be held for write.
- */
-static void __munlock_vma_pages_range(struct vm_area_struct *vma,
-			      unsigned long start, unsigned long end)
-{
-	struct mm_struct *mm = vma->vm_mm;
-	struct munlock_page_walk mpw = {
-		.vma = vma,
-	};
-	struct mm_walk munlock_page_walk = {
-		.pmd_entry = __munlock_pmd_handler,
-		.pte_entry = __munlock_pte_handler,
-		.private = &mpw,
-		.mm = mm,
-	};
-
-	VM_BUG_ON(start & ~PAGE_MASK || end & ~PAGE_MASK);
-	VM_BUG_ON(!rwsem_is_locked(&vma->vm_mm->mmap_sem));
-	VM_BUG_ON(start < vma->vm_start);
-	VM_BUG_ON(end > vma->vm_end);
-
-	lru_add_drain_all();	/* push cached pages to LRU */
-	walk_page_range(start, end, &munlock_page_walk);
-	lru_add_drain_all();	/* to update stats */
-}
-
 #else /* CONFIG_UNEVICTABLE_LRU */
 
 /*
  * Just make pages present if VM_LOCKED.  No-op if unlocking.
  */
-static int __mlock_vma_pages_range(struct vm_area_struct *vma,
-			unsigned long start, unsigned long end)
+static long __mlock_vma_pages_range(struct vm_area_struct *vma,
+				   unsigned long start, unsigned long end,
+				   int mlock)
 {
-	if (vma->vm_flags & VM_LOCKED)
+	if (mlock && (vma->vm_flags & VM_LOCKED))
 		make_pages_present(start, end);
 	return 0;
 }
-
-/*
- * munlock a range of pages in the vma -- no-op.
- */
-static void __munlock_vma_pages_range(struct vm_area_struct *vma,
-			      unsigned long start, unsigned long end)
-{
-}
 #endif /* CONFIG_UNEVICTABLE_LRU */
 
-/*
- * mlock all pages in this vma range.  For mmap()/mremap()/...
+/**
+ * mlock_vma_pages_range() - mlock pages in specified vma range.
+ * @vma - the vma containing the specfied address range
+ * @start - starting address in @vma to mlock
+ * @end   - end address [+1] in @vma to mlock
+ *
+ * For mmap()/mremap()/expansion of mlocked vma.
+ *
+ * return 0 on success for "normal" vmas.
+ *
+ * return number of pages [> 0] to be removed from locked_vm on success
+ * of "special" vmas.
+ *
+ * return negative error if vma spanning @start-@range disappears while
+ * mmap semaphore is dropped.  Unlikely?
  */
-int mlock_vma_pages_range(struct vm_area_struct *vma,
+long mlock_vma_pages_range(struct vm_area_struct *vma,
 			unsigned long start, unsigned long end)
 {
 	struct mm_struct *mm = vma->vm_mm;
@@ -331,17 +271,20 @@ int mlock_vma_pages_range(struct vm_area_struct *vma,
 	if (!((vma->vm_flags & (VM_DONTEXPAND | VM_RESERVED)) ||
 			is_vm_hugetlb_page(vma) ||
 			vma == get_gate_vma(current))) {
+		long error;
 		downgrade_write(&mm->mmap_sem);
-		nr_pages = __mlock_vma_pages_range(vma, start, end);
+
+		error = __mlock_vma_pages_range(vma, start, end, 1);
 
 		up_read(&mm->mmap_sem);
 		/* vma can change or disappear */
 		down_write(&mm->mmap_sem);
 		vma = find_vma(mm, start);
 		/* non-NULL vma must contain @start, but need to check @end */
 		if (!vma ||  end > vma->vm_end)
-			return -EAGAIN;
-		return nr_pages;
+			return -ENOMEM;
+
+		return 0;	/* hide other errors from mmap(), et al */
 	}
 
 	/*
@@ -356,17 +299,33 @@ int mlock_vma_pages_range(struct vm_area_struct *vma,
 
 no_mlock:
 	vma->vm_flags &= ~VM_LOCKED;	/* and don't come back! */
-	return nr_pages;		/* pages NOT mlocked */
+	return nr_pages;		/* error or pages NOT mlocked */
 }
 
 
 /*
- * munlock all pages in vma.   For munmap() and exit().
+ * munlock_vma_pages_range() - munlock all pages in the vma range.'
+ * @vma - vma containing range to be munlock()ed.
+ * @start - start address in @vma of the range
+ * @end - end of range in @vma.
+ *
+ *  For mremap(), munmap() and exit().
+ *
+ * Called with @vma VM_LOCKED.
+ *
+ * Returns with VM_LOCKED cleared.  Callers must be prepared to
+ * deal with this.
+ *
+ * We don't save and restore VM_LOCKED here because pages are
+ * still on lru.  In unmap path, pages might be scanned by reclaim
+ * and re-mlocked by try_to_{munlock|unmap} before we unmap and
+ * free them.  This will result in freeing mlocked pages.
  */
-void munlock_vma_pages_all(struct vm_area_struct *vma)
+void munlock_vma_pages_range(struct vm_area_struct *vma,
+			   unsigned long start, unsigned long end)
 {
 	vma->vm_flags &= ~VM_LOCKED;
-	__munlock_vma_pages_range(vma, vma->vm_start, vma->vm_end);
+	__mlock_vma_pages_range(vma, start, end, 0);
 }
 
 /*
@@ -443,7 +402,7 @@ static int mlock_fixup(struct vm_area_struct *vma, struct vm_area_struct **prev,
 		 */
 		downgrade_write(&mm->mmap_sem);
 
-		ret = __mlock_vma_pages_range(vma, start, end);
+		ret = __mlock_vma_pages_range(vma, start, end, 1);
 		if (ret > 0) {
 			mm->locked_vm -= ret;
 			ret = 0;
@@ -460,7 +419,7 @@ static int mlock_fixup(struct vm_area_struct *vma, struct vm_area_struct **prev,
 		*prev = find_vma(mm, start);
 		/* non-NULL *prev must contain @start, but need to check @end */
 		if (!(*prev) || end > (*prev)->vm_end)
-			ret = -EAGAIN;
+			ret = -ENOMEM;
 	} else {
 		/*
 		 * TODO:  for unlocking, pages will already be resident, so
@@ -469,7 +428,7 @@ static int mlock_fixup(struct vm_area_struct *vma, struct vm_area_struct **prev,
 		 * while.  Should we downgrade the semaphore for both lock
 		 * AND unlock ?
 		 */
-		__munlock_vma_pages_range(vma, start, end);
+		__mlock_vma_pages_range(vma, start, end, 0);
 	}
 
 out: