Skip to content

Commit

Permalink
mlock: only hold mmap_sem in shared mode when faulting in pages
Browse files Browse the repository at this point in the history
Currently mlock() holds mmap_sem in exclusive mode while the pages get
faulted in.  In the case of a large mlock, this can potentially take a
very long time, during which various commands such as 'ps auxw' will
block.  This makes sysadmins unhappy:

real    14m36.232s
user    0m0.003s
sys     0m0.015s
(output from 'time ps auxw' while a 20GB file was being mlocked without
being previously preloaded into page cache)

I propose that mlock() could release mmap_sem after the VM_LOCKED bits
have been set in all appropriate VMAs.  Then a second pass could be done
to actually mlock the pages, in small batches, releasing mmap_sem when we
block on disk access or when we detect some contention.

This patch:

Before this change, mlock() holds mmap_sem in exclusive mode while the
pages get faulted in.  In the case of a large mlock, this can potentially
take a very long time.  Various things will block while mmap_sem is held,
including 'ps auxw'.  This can make sysadmins angry.

I propose that mlock() could release mmap_sem after the VM_LOCKED bits
have been set in all appropriate VMAs.  Then a second pass could be done
to actually mlock the pages with mmap_sem held for reads only.  We need to
recheck the vma flags after we re-acquire mmap_sem, but this is easy.

In the case where a vma has been munlocked before mlock completes, pages
that were already marked as PageMlocked() are handled by the munlock()
call, and mlock() is careful not to mark new page batches as PageMlocked()
after the munlock() call has cleared the VM_LOCKED vma flags.  So, the end
result will be identical to what would happen if munlock() had executed
after the mlock() call.

In a later change, I will allow the second pass to release mmap_sem when
blocking on disk accesses or when it is otherwise contended, so that it
won't be held for long periods of time even in shared mode.

Signed-off-by: Michel Lespinasse <[email protected]>
Tested-by: Valdis Kletnieks <[email protected]>
Cc: Hugh Dickins <[email protected]>
Cc: Rik van Riel <[email protected]>
Cc: Peter Zijlstra <[email protected]>
Cc: Nick Piggin <[email protected]>
Cc: KOSAKI Motohiro <[email protected]>
Cc: Ingo Molnar <[email protected]>
Cc: "H. Peter Anvin" <[email protected]>
Cc: Thomas Gleixner <[email protected]>
Cc: David Howells <[email protected]>
Signed-off-by: Andrew Morton <[email protected]>
Signed-off-by: Linus Torvalds <[email protected]>
  • Loading branch information
walken-google authored and torvalds committed Jan 14, 2011
1 parent 5ecfda0 commit fed067d
Showing 1 changed file with 64 additions and 17 deletions.
81 changes: 64 additions & 17 deletions mm/mlock.c
Original file line number Diff line number Diff line change
Expand Up @@ -377,18 +377,10 @@ static int mlock_fixup(struct vm_area_struct *vma, struct vm_area_struct **prev,
int ret = 0;
int lock = newflags & VM_LOCKED;

if (newflags == vma->vm_flags ||
(vma->vm_flags & (VM_IO | VM_PFNMAP)))
if (newflags == vma->vm_flags || (vma->vm_flags & VM_SPECIAL) ||
is_vm_hugetlb_page(vma) || vma == get_gate_vma(current))
goto out; /* don't set VM_LOCKED, don't count */

if ((vma->vm_flags & (VM_DONTEXPAND | VM_RESERVED)) ||
is_vm_hugetlb_page(vma) ||
vma == get_gate_vma(current)) {
if (lock)
make_pages_present(start, end);
goto out; /* don't set VM_LOCKED, don't count */
}

pgoff = vma->vm_pgoff + ((start - vma->vm_start) >> PAGE_SHIFT);
*prev = vma_merge(mm, *prev, start, end, newflags, vma->anon_vma,
vma->vm_file, pgoff, vma_policy(vma));
Expand Down Expand Up @@ -424,14 +416,10 @@ static int mlock_fixup(struct vm_area_struct *vma, struct vm_area_struct **prev,
* set VM_LOCKED, __mlock_vma_pages_range will bring it back.
*/

if (lock) {
if (lock)
vma->vm_flags = newflags;
ret = __mlock_vma_pages_range(vma, start, end);
if (ret < 0)
ret = __mlock_posix_error_return(ret);
} else {
else
munlock_vma_pages_range(vma, start, end);
}

out:
*prev = vma;
Expand All @@ -444,7 +432,8 @@ static int do_mlock(unsigned long start, size_t len, int on)
struct vm_area_struct * vma, * prev;
int error;

len = PAGE_ALIGN(len);
VM_BUG_ON(start & ~PAGE_MASK);
VM_BUG_ON(len != PAGE_ALIGN(len));
end = start + len;
if (end < start)
return -EINVAL;
Expand Down Expand Up @@ -487,6 +476,58 @@ static int do_mlock(unsigned long start, size_t len, int on)
return error;
}

/*
 * do_mlock_pages - fault in the pages backing [start, start + len)
 *
 * start:         first address of the range; must be page aligned
 * len:           length of the range; must be a multiple of the page size
 * ignore_errors: when non-zero, a negative result from
 *                __mlock_vma_pages_range() is discarded and the walk moves
 *                on to the next VMA instead of stopping
 *
 * Walks the VMAs of current->mm that intersect the range while holding
 * mmap_sem for read only.  VMAs flagged VM_IO or VM_PFNMAP are skipped.
 * VMAs that carry VM_LOCKED go through __mlock_vma_pages_range(); any
 * other VMA is handed to make_pages_present(), whose result is not
 * inspected here.
 *
 * Returns 0, or the error code produced by __mlock_posix_error_return()
 * for the first failure that was not ignored.
 */
static int do_mlock_pages(unsigned long start, size_t len, int ignore_errors)
{
	struct mm_struct *mm = current->mm;
	struct vm_area_struct *area = NULL;
	unsigned long limit, cur, next;
	int err = 0;

	VM_BUG_ON(start & ~PAGE_MASK);
	VM_BUG_ON(len != PAGE_ALIGN(len));
	limit = start + len;

	down_read(&mm->mmap_sem);
	for (cur = start; cur < limit; cur = next) {
		/*
		 * Pages must be faulted in for [cur; limit).  The first
		 * pass looks the VMA up by address; later passes simply
		 * follow the vm_next link.
		 */
		area = area ? area->vm_next : find_vma(mm, cur);
		if (!area || area->vm_start >= limit)
			break;

		/*
		 * Clip the chunk to the end of this VMA.  'next' has to be
		 * assigned before any 'continue' below, because the loop
		 * header advances 'cur' to it.
		 */
		next = min(limit, area->vm_end);

		/* Skip undesirable VMA types. */
		if (area->vm_flags & (VM_IO | VM_PFNMAP))
			continue;

		/* The range may begin in a hole before this VMA. */
		if (cur < area->vm_start)
			cur = area->vm_start;

		/*
		 * Now fault in [cur; next) within this VMA.  A VMA without
		 * VM_LOCKED only has its pages made present.
		 */
		if (!(area->vm_flags & VM_LOCKED)) {
			make_pages_present(cur, next);
			continue;
		}

		err = __mlock_vma_pages_range(area, cur, next);
		if (!err)
			continue;
		if (err < 0 && ignore_errors) {
			err = 0;
			continue;	/* continue at next VMA */
		}

		err = __mlock_posix_error_return(err);
		break;
	}
	up_read(&mm->mmap_sem);

	return err;	/* 0 or negative error code */
}

SYSCALL_DEFINE2(mlock, unsigned long, start, size_t, len)
{
unsigned long locked;
Expand All @@ -512,6 +553,8 @@ SYSCALL_DEFINE2(mlock, unsigned long, start, size_t, len)
if ((locked <= lock_limit) || capable(CAP_IPC_LOCK))
error = do_mlock(start, len, 1);
up_write(&current->mm->mmap_sem);
if (!error)
error = do_mlock_pages(start, len, 0);
return error;
}

Expand Down Expand Up @@ -576,6 +619,10 @@ SYSCALL_DEFINE1(mlockall, int, flags)
capable(CAP_IPC_LOCK))
ret = do_mlockall(flags);
up_write(&current->mm->mmap_sem);
if (!ret && (flags & MCL_CURRENT)) {
/* Ignore errors */
do_mlock_pages(0, TASK_SIZE, 1);
}
out:
return ret;
}
Expand Down

0 comments on commit fed067d

Please sign in to comment.