mm: introduce VM_LOCKONFAULT
The cost of faulting in all memory to be locked can be very high when
working with large mappings.  If only portions of the mapping will be
used, locking the entire mapping up front incurs a high, unnecessary
penalty.

Two motivating examples: a large file serving as a sparse data store,
which is the usage pattern of a large statistical language model (and
probably of other statistical or graphical models as well); and, on
the security side, any application transacting in data that must not
be swapped out (credit card data, medical records, etc.).

This patch introduces the ability to request that pages are not
pre-faulted, but are instead placed on the unevictable LRU as they are
faulted in.  The VM_LOCKONFAULT flag is used together with VM_LOCKED
and has no effect when set without VM_LOCKED.  Setting VM_LOCKONFAULT
on a VMA causes pages to be added to the unevictable LRU when they are
faulted in (or immediately, if they are already present), but does not
fault in any missing pages.
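
As a concrete illustration, here is a minimal userspace sketch.  Note
that this commit only adds the kernel-internal VM_LOCKONFAULT flag;
the user-visible mlock2() syscall and its MLOCK_ONFAULT flag arrive in
follow-up patches of this series (with a glibc wrapper since glibc
2.27), so the example assumes those are available:

    #define _GNU_SOURCE
    #include <stdio.h>
    #include <stdlib.h>
    #include <sys/mman.h>

    int main(void)
    {
            size_t len = 1UL << 30;    /* large, sparsely used mapping */
            char *buf = mmap(NULL, len, PROT_READ | PROT_WRITE,
                             MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);

            if (buf == MAP_FAILED) {
                    perror("mmap");
                    return EXIT_FAILURE;
            }

            /*
             * Lock on fault: nothing is faulted in here, but every
             * page touched later is placed on the unevictable LRU
             * and stays resident.
             */
            if (mlock2(buf, len, MLOCK_ONFAULT)) {
                    perror("mlock2");
                    return EXIT_FAILURE;
            }

            buf[0] = 1;  /* only this page is faulted in -- and locked */

            munmap(buf, len);
            return 0;
    }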

Exposing this new lock state means that we can no longer overload the
meaning of the FOLL_POPULATE flag.  Prior to this patch it also implied
that the VMA for a fault was locked, so the new FOLL_MLOCK flag is
needed to communicate a VMA's locked state.  FOLL_POPULATE now controls
only whether the VMA should be populated, and for VM_LOCKONFAULT VMAs
it is not set.
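
Condensed from the mm/gup.c hunks below, the two changed decision
points look like this (a restatement of the hunks, not new code):

    /* populate_vma_page_range(): choose gup flags per VMA */
    gup_flags = FOLL_TOUCH | FOLL_POPULATE | FOLL_MLOCK;
    if (vma->vm_flags & VM_LOCKONFAULT)
            gup_flags &= ~FOLL_POPULATE;

    /* faultin_page(): mlock present pages, but fault nothing new in */
    if ((*flags & (FOLL_POPULATE | FOLL_MLOCK)) == FOLL_MLOCK)
            return -ENOENT;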

Signed-off-by: Eric B Munson <[email protected]>
Acked-by: Kirill A. Shutemov <[email protected]>
Acked-by: Vlastimil Babka <[email protected]>
Cc: Michal Hocko <[email protected]>
Cc: Jonathan Corbet <[email protected]>
Cc: Catalin Marinas <[email protected]>
Cc: Geert Uytterhoeven <[email protected]>
Cc: Guenter Roeck <[email protected]>
Cc: Heiko Carstens <[email protected]>
Cc: Michael Kerrisk <[email protected]>
Cc: Ralf Baechle <[email protected]>
Cc: Shuah Khan <[email protected]>
Cc: Stephen Rothwell <[email protected]>
Signed-off-by: Andrew Morton <[email protected]>
Signed-off-by: Linus Torvalds <[email protected]>
Eric B Munson authored and torvalds committed Nov 6, 2015
1 parent a8ca5d0 commit de60f5f
Showing 8 changed files with 21 additions and 8 deletions.
5 changes: 5 additions & 0 deletions include/linux/mm.h
@@ -139,6 +139,7 @@ extern unsigned int kobjsize(const void *objp);

#define VM_DONTCOPY 0x00020000 /* Do not copy this vma on fork */
#define VM_DONTEXPAND 0x00040000 /* Cannot expand with mremap() */
+#define VM_LOCKONFAULT 0x00080000 /* Lock the pages covered when they are faulted in */
#define VM_ACCOUNT 0x00100000 /* Is a VM accounted object */
#define VM_NORESERVE 0x00200000 /* should the VM suppress accounting */
#define VM_HUGETLB 0x00400000 /* Huge TLB Page VM */
@@ -202,6 +203,9 @@ extern unsigned int kobjsize(const void *objp);
/* This mask defines which mm->def_flags a process can inherit from its parent */
#define VM_INIT_DEF_MASK VM_NOHUGEPAGE

+/* This mask is used to clear all the VMA flags used by mlock */
+#define VM_LOCKED_CLEAR_MASK (~(VM_LOCKED | VM_LOCKONFAULT))
+
/*
* mapping from the currently active vm_flags protection bits (the
* low four bits) to a page protection mask.
@@ -2137,6 +2141,7 @@ static inline struct page *follow_page(struct vm_area_struct *vma,
#define FOLL_NUMA 0x200 /* force NUMA hinting page fault */
#define FOLL_MIGRATION 0x400 /* wait for page to replace migration entry */
#define FOLL_TRIED 0x800 /* a retry, previous pass started an IO */
+#define FOLL_MLOCK 0x1000 /* lock present pages */

typedef int (*pte_fn_t)(pte_t *pte, pgtable_t token, unsigned long addr,
void *data);
3 changes: 2 additions & 1 deletion kernel/fork.c
@@ -454,7 +454,8 @@ static int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm)
tmp->vm_mm = mm;
if (anon_vma_fork(tmp, mpnt))
goto fail_nomem_anon_vma_fork;
-tmp->vm_flags &= ~(VM_LOCKED|VM_UFFD_MISSING|VM_UFFD_WP);
+tmp->vm_flags &=
+        ~(VM_LOCKED|VM_LOCKONFAULT|VM_UFFD_MISSING|VM_UFFD_WP);
tmp->vm_next = tmp->vm_prev = NULL;
tmp->vm_userfaultfd_ctx = NULL_VM_UFFD_CTX;
file = tmp->vm_file;
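
One user-visible consequence of the dup_mmap() change above: like
VM_LOCKED, VM_LOCKONFAULT is cleared in the child, so lock-on-fault
state is not inherited across fork(), matching mlock(2) semantics.
A small demo of that behavior -- again assuming the mlock2() and
MLOCK_ONFAULT userspace API from later patches in this series:

    #define _GNU_SOURCE
    #include <stdio.h>
    #include <string.h>
    #include <sys/mman.h>
    #include <sys/wait.h>
    #include <unistd.h>

    /* Print this process's VmLck line from /proc/self/status. */
    static void print_vmlck(const char *who)
    {
            char line[128];
            FILE *f = fopen("/proc/self/status", "r");

            while (f && fgets(line, sizeof(line), f))
                    if (!strncmp(line, "VmLck:", 6))
                            printf("%s %s", who, line);
            if (f)
                    fclose(f);
    }

    int main(void)
    {
            char *p = mmap(NULL, 4096, PROT_READ | PROT_WRITE,
                           MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);

            if (p == MAP_FAILED || mlock2(p, 4096, MLOCK_ONFAULT))
                    return 1;
            p[0] = 1;               /* fault the page in; now locked */

            print_vmlck("parent,"); /* expect VmLck: 4 kB */
            if (fork() == 0) {
                    print_vmlck("child,"); /* expect VmLck: 0 kB */
                    _exit(0);
            }
            wait(NULL);
            return 0;
    }
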
1 change: 1 addition & 0 deletions mm/debug.c
@@ -125,6 +125,7 @@ static const struct trace_print_flags vmaflags_names[] = {
{VM_GROWSDOWN, "growsdown" },
{VM_PFNMAP, "pfnmap" },
{VM_DENYWRITE, "denywrite" },
+{VM_LOCKONFAULT, "lockonfault" },
{VM_LOCKED, "locked" },
{VM_IO, "io" },
{VM_SEQ_READ, "seqread" },
10 changes: 8 additions & 2 deletions mm/gup.c
@@ -129,7 +129,7 @@ static struct page *follow_page_pte(struct vm_area_struct *vma,
*/
mark_page_accessed(page);
}
-if ((flags & FOLL_POPULATE) && (vma->vm_flags & VM_LOCKED)) {
+if ((flags & FOLL_MLOCK) && (vma->vm_flags & VM_LOCKED)) {
/*
* The preliminary mapping check is mainly to avoid the
* pointless overhead of lock_page on the ZERO_PAGE
@@ -299,6 +299,9 @@ static int faultin_page(struct task_struct *tsk, struct vm_area_struct *vma,
unsigned int fault_flags = 0;
int ret;

+/* mlock all present pages, but do not fault in new pages */
+if ((*flags & (FOLL_POPULATE | FOLL_MLOCK)) == FOLL_MLOCK)
+        return -ENOENT;
/* For mm_populate(), just skip the stack guard page. */
if ((*flags & FOLL_POPULATE) &&
(stack_guard_page_start(vma, address) ||
@@ -890,7 +893,10 @@ long populate_vma_page_range(struct vm_area_struct *vma,
VM_BUG_ON_VMA(end > vma->vm_end, vma);
VM_BUG_ON_MM(!rwsem_is_locked(&mm->mmap_sem), mm);

-gup_flags = FOLL_TOUCH | FOLL_POPULATE;
+gup_flags = FOLL_TOUCH | FOLL_POPULATE | FOLL_MLOCK;
+if (vma->vm_flags & VM_LOCKONFAULT)
+        gup_flags &= ~FOLL_POPULATE;
+
/*
* We want to touch writable mappings with a write fault in order
* to break COW, except for shared mappings because these don't COW
2 changes: 1 addition & 1 deletion mm/huge_memory.c
@@ -1307,7 +1307,7 @@ struct page *follow_trans_huge_pmd(struct vm_area_struct *vma,
pmd, _pmd, 1))
update_mmu_cache_pmd(vma, addr, pmd);
}
-if ((flags & FOLL_POPULATE) && (vma->vm_flags & VM_LOCKED)) {
+if ((flags & FOLL_MLOCK) && (vma->vm_flags & VM_LOCKED)) {
if (page->mapping && trylock_page(page)) {
lru_add_drain();
if (page->mapping)
4 changes: 2 additions & 2 deletions mm/hugetlb.c
@@ -4137,8 +4137,8 @@ static unsigned long page_table_shareable(struct vm_area_struct *svma,
unsigned long s_end = sbase + PUD_SIZE;

/* Allow segments to share if only one is marked locked */
-unsigned long vm_flags = vma->vm_flags & ~VM_LOCKED;
-unsigned long svm_flags = svma->vm_flags & ~VM_LOCKED;
+unsigned long vm_flags = vma->vm_flags & VM_LOCKED_CLEAR_MASK;
+unsigned long svm_flags = svma->vm_flags & VM_LOCKED_CLEAR_MASK;

/*
* match the virtual addresses, permission and the alignment of the
2 changes: 1 addition & 1 deletion mm/mlock.c
@@ -422,7 +422,7 @@ static unsigned long __munlock_pagevec_fill(struct pagevec *pvec,
void munlock_vma_pages_range(struct vm_area_struct *vma,
unsigned long start, unsigned long end)
{
-vma->vm_flags &= ~VM_LOCKED;
+vma->vm_flags &= VM_LOCKED_CLEAR_MASK;

while (start < end) {
struct page *page = NULL;
2 changes: 1 addition & 1 deletion mm/mmap.c
@@ -1661,7 +1661,7 @@ unsigned long mmap_region(struct file *file, unsigned long addr,
vma == get_gate_vma(current->mm)))
mm->locked_vm += (len >> PAGE_SHIFT);
else
-vma->vm_flags &= ~VM_LOCKED;
+vma->vm_flags &= VM_LOCKED_CLEAR_MASK;
}

if (file)
