Skip to content

Commit

Permalink
mm: rid swapoff of quadratic complexity
Browse files Browse the repository at this point in the history
This patch was initially posted by Kelley Nielsen.  This is a repost of
the patch with all review comments addressed, plus minor modifications
and optimizations.  It also folds in the fixes offered by Hugh Dickins
and Huang Ying.  Tests were rerun and the commit message was updated
with the new results.

try_to_unuse() is of quadratic complexity, with a lot of wasted effort.
It unuses swap entries one by one, potentially iterating over all the
page tables for all the processes in the system for each one.

This newly proposed implementation of try_to_unuse() reduces its
complexity to linear.  It iterates over the system's mms once, unusing
all the affected entries as it walks each set of page tables.  It also
makes similar changes to shmem_unuse().

Improvement

swapoff was called on a swap partition containing about 6G of data, in a
VM (8 CPUs, 16G RAM), and calls to unuse_pte_range() were counted.

Present implementation....about 1200M calls(8min, avg 80% cpu util).
Prototype.................about  9.0K calls(3min, avg 5% cpu util).

Details

In shmem_unuse(), iterate over the shmem_swaplist and, for each
shmem_inode_info that contains a swap entry, pass it to
shmem_unuse_inode(), along with the swap type.  In shmem_unuse_inode(),
iterate over its associated xarray, and store the index and value of
each swap entry in an array for passing to shmem_swapin_page() outside
of the RCU critical section.

In try_to_unuse(), instead of iterating over the entries in the type and
unusing them one by one, perhaps walking all the page tables for all the
processes for each one, iterate over the mmlist, making one pass.  Pass
each mm to unuse_mm() to begin its page table walk, and during the walk,
unuse all the ptes that have backing store in the swap type received by
try_to_unuse().  After the walk, check the type for orphaned swap
entries with find_next_to_unuse(), and remove them from the swap cache.
If find_next_to_unuse() starts over at the beginning of the type, repeat
the check of the shmem_swaplist and the walk a maximum of three times.

Change unuse_mm() and the intervening walk functions down to
unuse_pte_range() to take the type as a parameter, and to iterate over
their entire range, calling the next function down on every iteration.
In unuse_pte_range(), make a swap entry from each pte in the range using
the passed in type.  If it has backing store in the type, call
swapin_readahead() to retrieve the page and pass it to unuse_pte().

Pass the count of pages_to_unuse down the page table walks in
try_to_unuse(), and return from the walk when the desired number of
pages has been swapped back in.

Link: http://lkml.kernel.org/r/[email protected]
Signed-off-by: Vineeth Remanan Pillai <[email protected]>
Signed-off-by: Kelley Nielsen <[email protected]>
Signed-off-by: Huang Ying <[email protected]>
Acked-by: Hugh Dickins <[email protected]>
Cc: Rik van Riel <[email protected]>
Signed-off-by: Andrew Morton <[email protected]>
Signed-off-by: Linus Torvalds <[email protected]>
  • Loading branch information
Vineeth Remanan Pillai authored and torvalds committed Mar 6, 2019
1 parent c5bf121 commit b56a2d8
Show file tree
Hide file tree
Showing 4 changed files with 319 additions and 391 deletions.
7 changes: 7 additions & 0 deletions include/linux/frontswap.h
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,13 @@
#include <linux/bitops.h>
#include <linux/jump_label.h>

/*
* Return code to denote that the requested number of
* frontswap pages have been unused (moved to page cache).
* Used in shmem_unuse and try_to_unuse.
*/
#define FRONTSWAP_PAGES_UNUSED 2

struct frontswap_ops {
void (*init)(unsigned); /* this swap type was just swapon'ed */
int (*store)(unsigned, pgoff_t, struct page *); /* store a page */
Expand Down
3 changes: 2 additions & 1 deletion include/linux/shmem_fs.h
Original file line number Diff line number Diff line change
Expand Up @@ -72,7 +72,8 @@ extern void shmem_unlock_mapping(struct address_space *mapping);
extern struct page *shmem_read_mapping_page_gfp(struct address_space *mapping,
pgoff_t index, gfp_t gfp_mask);
extern void shmem_truncate_range(struct inode *inode, loff_t start, loff_t end);
extern int shmem_unuse(swp_entry_t entry, struct page *page);
extern int shmem_unuse(unsigned int type, bool frontswap,
unsigned long *fs_pages_to_unuse);

extern unsigned long shmem_swap_usage(struct vm_area_struct *vma);
extern unsigned long shmem_partial_swap_usage(struct address_space *mapping,
Expand Down
267 changes: 147 additions & 120 deletions mm/shmem.c
Original file line number Diff line number Diff line change
Expand Up @@ -36,6 +36,7 @@
#include <linux/uio.h>
#include <linux/khugepaged.h>
#include <linux/hugetlb.h>
#include <linux/frontswap.h>

#include <asm/tlbflush.h> /* for arch/microblaze update_mmu_cache() */

Expand Down Expand Up @@ -1093,159 +1094,184 @@ static void shmem_evict_inode(struct inode *inode)
clear_inode(inode);
}

static unsigned long find_swap_entry(struct xarray *xa, void *item)
extern struct swap_info_struct *swap_info[];

static int shmem_find_swap_entries(struct address_space *mapping,
pgoff_t start, unsigned int nr_entries,
struct page **entries, pgoff_t *indices,
bool frontswap)
{
XA_STATE(xas, xa, 0);
unsigned int checked = 0;
void *entry;
XA_STATE(xas, &mapping->i_pages, start);
struct page *page;
unsigned int ret = 0;

if (!nr_entries)
return 0;

rcu_read_lock();
xas_for_each(&xas, entry, ULONG_MAX) {
if (xas_retry(&xas, entry))
xas_for_each(&xas, page, ULONG_MAX) {
if (xas_retry(&xas, page))
continue;
if (entry == item)
break;
checked++;
if ((checked % XA_CHECK_SCHED) != 0)

if (!xa_is_value(page))
continue;
xas_pause(&xas);
cond_resched_rcu();

if (frontswap) {
swp_entry_t entry = radix_to_swp_entry(page);

if (!frontswap_test(swap_info[swp_type(entry)],
swp_offset(entry)))
continue;
}

indices[ret] = xas.xa_index;
entries[ret] = page;

if (need_resched()) {
xas_pause(&xas);
cond_resched_rcu();
}
if (++ret == nr_entries)
break;
}
rcu_read_unlock();

return entry ? xas.xa_index : -1;
return ret;
}

/*
* If swap found in inode, free it and move page from swapcache to filecache.
* Move the swapped pages for an inode to page cache. Returns the count
* of pages swapped in, or the error in case of failure.
*/
static int shmem_unuse_inode(struct shmem_inode_info *info,
swp_entry_t swap, struct page **pagep)
static int shmem_unuse_swap_entries(struct inode *inode, struct pagevec pvec,
pgoff_t *indices)
{
struct address_space *mapping = info->vfs_inode.i_mapping;
void *radswap;
pgoff_t index;
gfp_t gfp;
int i = 0;
int ret = 0;
int error = 0;
struct address_space *mapping = inode->i_mapping;

radswap = swp_to_radix_entry(swap);
index = find_swap_entry(&mapping->i_pages, radswap);
if (index == -1)
return -EAGAIN; /* tell shmem_unuse we found nothing */

/*
* Move _head_ to start search for next from here.
* But be careful: shmem_evict_inode checks list_empty without taking
* mutex, and there's an instant in list_move_tail when info->swaplist
* would appear empty, if it were the only one on shmem_swaplist.
*/
if (shmem_swaplist.next != &info->swaplist)
list_move_tail(&shmem_swaplist, &info->swaplist);
for (i = 0; i < pvec.nr; i++) {
struct page *page = pvec.pages[i];

gfp = mapping_gfp_mask(mapping);
if (shmem_should_replace_page(*pagep, gfp)) {
mutex_unlock(&shmem_swaplist_mutex);
error = shmem_replace_page(pagep, gfp, info, index);
mutex_lock(&shmem_swaplist_mutex);
/*
* We needed to drop mutex to make that restrictive page
* allocation, but the inode might have been freed while we
* dropped it: although a racing shmem_evict_inode() cannot
* complete without emptying the page cache, our page lock
* on this swapcache page is not enough to prevent that -
* free_swap_and_cache() of our swap entry will only
* trylock_page(), removing swap from page cache whatever.
*
* We must not proceed to shmem_add_to_page_cache() if the
* inode has been freed, but of course we cannot rely on
* inode or mapping or info to check that. However, we can
* safely check if our swap entry is still in use (and here
* it can't have got reused for another page): if it's still
* in use, then the inode cannot have been freed yet, and we
* can safely proceed (if it's no longer in use, that tells
* nothing about the inode, but we don't need to unuse swap).
*/
if (!page_swapcount(*pagep))
error = -ENOENT;
if (!xa_is_value(page))
continue;
error = shmem_swapin_page(inode, indices[i],
&page, SGP_CACHE,
mapping_gfp_mask(mapping),
NULL, NULL);
if (error == 0) {
unlock_page(page);
put_page(page);
ret++;
}
if (error == -ENOMEM)
break;
error = 0;
}
return error ? error : ret;
}

/*
* We rely on shmem_swaplist_mutex, not only to protect the swaplist,
* but also to hold up shmem_evict_inode(): so inode cannot be freed
* beneath us (pagelock doesn't help until the page is in pagecache).
*/
if (!error)
error = shmem_add_to_page_cache(*pagep, mapping, index,
radswap, gfp);
if (error != -ENOMEM) {
/*
* Truncation and eviction use free_swap_and_cache(), which
* only does trylock page: if we raced, best clean up here.
*/
delete_from_swap_cache(*pagep);
set_page_dirty(*pagep);
if (!error) {
spin_lock_irq(&info->lock);
info->swapped--;
spin_unlock_irq(&info->lock);
swap_free(swap);
/*
* If swap found in inode, free it and move page from swapcache to filecache.
*/
static int shmem_unuse_inode(struct inode *inode, unsigned int type,
bool frontswap, unsigned long *fs_pages_to_unuse)
{
struct address_space *mapping = inode->i_mapping;
pgoff_t start = 0;
struct pagevec pvec;
pgoff_t indices[PAGEVEC_SIZE];
bool frontswap_partial = (frontswap && *fs_pages_to_unuse > 0);
int ret = 0;

pagevec_init(&pvec);
do {
unsigned int nr_entries = PAGEVEC_SIZE;

if (frontswap_partial && *fs_pages_to_unuse < PAGEVEC_SIZE)
nr_entries = *fs_pages_to_unuse;

pvec.nr = shmem_find_swap_entries(mapping, start, nr_entries,
pvec.pages, indices,
frontswap);
if (pvec.nr == 0) {
ret = 0;
break;
}
}
return error;

ret = shmem_unuse_swap_entries(inode, pvec, indices);
if (ret < 0)
break;

if (frontswap_partial) {
*fs_pages_to_unuse -= ret;
if (*fs_pages_to_unuse == 0) {
ret = FRONTSWAP_PAGES_UNUSED;
break;
}
}

start = indices[pvec.nr - 1];
} while (true);

return ret;
}

/*
* Search through swapped inodes to find and replace swap by page.
* Read all the shared memory data that resides in the swap
* device 'type' back into memory, so the swap device can be
* unused.
*/
int shmem_unuse(swp_entry_t swap, struct page *page)
int shmem_unuse(unsigned int type, bool frontswap,
unsigned long *fs_pages_to_unuse)
{
struct list_head *this, *next;
struct shmem_inode_info *info;
struct mem_cgroup *memcg;
struct shmem_inode_info *info, *next;
struct inode *inode;
struct inode *prev_inode = NULL;
int error = 0;

/*
* There's a faint possibility that swap page was replaced before
* caller locked it: caller will come back later with the right page.
*/
if (unlikely(!PageSwapCache(page) || page_private(page) != swap.val))
goto out;
if (list_empty(&shmem_swaplist))
return 0;

mutex_lock(&shmem_swaplist_mutex);

/*
* Charge page using GFP_KERNEL while we can wait, before taking
* the shmem_swaplist_mutex which might hold up shmem_writepage().
* Charged back to the user (not to caller) when swap account is used.
* The extra refcount on the inode is necessary to safely advance from
* info to the next list entry after re-acquiring the lock. New shmem
* inodes with swap get added to the end of the list and we will scan
* them all.
*/
error = mem_cgroup_try_charge_delay(page, current->mm, GFP_KERNEL,
&memcg, false);
if (error)
goto out;
/* No memory allocation: swap entry occupies the slot for the page */
error = -EAGAIN;

mutex_lock(&shmem_swaplist_mutex);
list_for_each_safe(this, next, &shmem_swaplist) {
info = list_entry(this, struct shmem_inode_info, swaplist);
if (info->swapped)
error = shmem_unuse_inode(info, swap, &page);
else
list_for_each_entry_safe(info, next, &shmem_swaplist, swaplist) {
if (!info->swapped) {
list_del_init(&info->swaplist);
continue;
}

inode = igrab(&info->vfs_inode);
if (!inode)
continue;

mutex_unlock(&shmem_swaplist_mutex);
if (prev_inode)
iput(prev_inode);
prev_inode = inode;

error = shmem_unuse_inode(inode, type, frontswap,
fs_pages_to_unuse);
cond_resched();
if (error != -EAGAIN)

mutex_lock(&shmem_swaplist_mutex);
next = list_next_entry(info, swaplist);
if (!info->swapped)
list_del_init(&info->swaplist);
if (error)
break;
/* found nothing in this: move on to search the next */
}
mutex_unlock(&shmem_swaplist_mutex);

if (error) {
if (error != -ENOMEM)
error = 0;
mem_cgroup_cancel_charge(page, memcg, false);
} else
mem_cgroup_commit_charge(page, memcg, true, false);
out:
unlock_page(page);
put_page(page);
if (prev_inode)
iput(prev_inode);

return error;
}

Expand Down Expand Up @@ -1329,7 +1355,7 @@ static int shmem_writepage(struct page *page, struct writeback_control *wbc)
*/
mutex_lock(&shmem_swaplist_mutex);
if (list_empty(&info->swaplist))
list_add_tail(&info->swaplist, &shmem_swaplist);
list_add(&info->swaplist, &shmem_swaplist);

if (add_to_swap_cache(page, swap, GFP_ATOMIC) == 0) {
spin_lock_irq(&info->lock);
Expand Down Expand Up @@ -3886,7 +3912,8 @@ int __init shmem_init(void)
return 0;
}

int shmem_unuse(swp_entry_t swap, struct page *page)
int shmem_unuse(unsigned int type, bool frontswap,
unsigned long *fs_pages_to_unuse)
{
return 0;
}
Expand Down
Loading

0 comments on commit b56a2d8

Please sign in to comment.