Skip to content

Commit

Permalink
swap: make swap discard async
Browse files Browse the repository at this point in the history
swap can do cluster discard for SSD, which is good, but there are some
problems here:

1. swap does the discard just before page reclaim gets a swap entry and
   writes the disk sectors.  This is useless for high-end SSDs, because an
   overwrite of a sector implies a discard of the original sector too.  A
   discard + overwrite == overwrite.

2. the purpose of doing discard is to improve SSD firmware garbage
   collection.  Ideally we should send discard as early as possible, so
   firmware can do something smart.  Sending discard just after a swap entry
   is freed is considered early compared to sending discard before write.
   Of course, if the workload is already bound to gc speed, sending discard
   earlier or later doesn't make a difference.

3. block discard is a sync API, which will delay scan_swap_map()
   significantly.

4. Write and discard commands can be executed in parallel on a PCIe SSD.
   Making swap discard async can make execution more efficient.

This patch makes swap discard async and moves discard to where a swap entry
is freed.  Discard and write no longer depend on each other, so the above
issues can be avoided.  Ideally we should do discard for any freed sectors,
but discard is very slow on some SSDs.  This patch still does discard for a
whole cluster.

My test does several rounds of 'mmap, write, unmap', which will trigger a
lot of swap discard.  On a fusionio card, with this patch, the test
runtime is reduced to 18% of the time without it, so around 5.5x faster.

[[email protected]: coding-style fixes]
Signed-off-by: Shaohua Li <[email protected]>
Cc: Rik van Riel <[email protected]>
Cc: Minchan Kim <[email protected]>
Cc: Kyungmin Park <[email protected]>
Cc: Hugh Dickins <[email protected]>
Cc: Rafael Aquini <[email protected]>
Signed-off-by: Andrew Morton <[email protected]>
Signed-off-by: Linus Torvalds <[email protected]>
  • Loading branch information
Shaohua Li authored and torvalds committed Sep 11, 2013
1 parent 2a8f944 commit 815c2c5
Show file tree
Hide file tree
Showing 2 changed files with 125 additions and 87 deletions.
20 changes: 11 additions & 9 deletions include/linux/swap.h
Original file line number Diff line number Diff line change
Expand Up @@ -217,8 +217,6 @@ struct swap_info_struct {
unsigned int inuse_pages; /* number of those currently in use */
unsigned int cluster_next; /* likely index for next allocation */
unsigned int cluster_nr; /* countdown to next cluster search */
unsigned int lowest_alloc; /* while preparing discard cluster */
unsigned int highest_alloc; /* while preparing discard cluster */
struct swap_extent *curr_swap_extent;
struct swap_extent first_swap_extent;
struct block_device *bdev; /* swap device or bdev of swap file */
Expand All @@ -232,14 +230,18 @@ struct swap_info_struct {
* protect map scan related fields like
* swap_map, lowest_bit, highest_bit,
* inuse_pages, cluster_next,
* cluster_nr, lowest_alloc and
* highest_alloc. other fields are only
* changed at swapon/swapoff, so are
* protected by swap_lock. changing
* flags need hold this lock and
* swap_lock. If both locks need hold,
* hold swap_lock first.
* cluster_nr, free/discard cluster
* list. other fields are only changed
* at swapon/swapoff, so are protected
* by swap_lock. changing flags needs
* both this lock and swap_lock held.
* If both locks need to be held, take
* swap_lock first.
*/
struct work_struct discard_work; /* discard worker */
struct swap_cluster_info discard_cluster_head; /* list head of discard clusters */
struct swap_cluster_info discard_cluster_tail; /* list tail of discard clusters */
};

struct swap_list_t {
Expand Down
192 changes: 114 additions & 78 deletions mm/swapfile.c
Original file line number Diff line number Diff line change
Expand Up @@ -175,12 +175,6 @@ static void discard_swap_cluster(struct swap_info_struct *si,
}
}

/*
 * Action callback handed to wait_on_bit() while waiting on the
 * SWP_DISCARDING bit of si->flags: yield the CPU with schedule() and
 * return 0. The word argument is not used.
 */
static int wait_for_discard(void *word)
{
schedule();
return 0;
}

#define SWAPFILE_CLUSTER 256
#define LATENCY_LIMIT 256

Expand Down Expand Up @@ -242,6 +236,90 @@ static inline void cluster_set_null(struct swap_cluster_info *info)
info->data = 0;
}

/*
 * Queue cluster @idx at the tail of si's discard list and kick the discard
 * worker so the cluster gets discarded asynchronously.
 */
static void swap_cluster_schedule_discard(struct swap_info_struct *si,
		unsigned int idx)
{
	/*
	 * When scan_swap_map() runs out of free clusters it falls back to
	 * scanning si->swap_map directly. Mark every entry of this cluster
	 * as bad (occupied) so it cannot be handed out while the discard is
	 * pending; the marks are cleared again once the discard is done.
	 */
	memset(si->swap_map + idx * SWAPFILE_CLUSTER,
			SWAP_MAP_BAD, SWAPFILE_CLUSTER);

	if (cluster_is_null(&si->discard_cluster_head)) {
		/* Empty list: the new cluster becomes the head as well. */
		cluster_set_next_flag(&si->discard_cluster_head, idx, 0);
	} else {
		/* Non-empty list: chain the new cluster after the old tail. */
		unsigned int last = cluster_next(&si->discard_cluster_tail);

		cluster_set_next(&si->cluster_info[last], idx);
	}
	/* In both cases the new cluster is now the tail. */
	cluster_set_next_flag(&si->discard_cluster_tail, idx, 0);

	schedule_work(&si->discard_work);
}

/*
 * Drain the discard list: issue the discard for each queued cluster and then
 * move that cluster onto the free cluster list. The caller should hold
 * si->lock; the lock is dropped around each discard_swap_cluster() call and
 * re-taken afterwards.
 */
static void swap_do_scheduled_discard(struct swap_info_struct *si)
{
	struct swap_cluster_info *clusters = si->cluster_info;

	for (;;) {
		unsigned int cur;

		if (cluster_is_null(&si->discard_cluster_head))
			break;

		/* Detach the first cluster from the discard list. */
		cur = cluster_next(&si->discard_cluster_head);
		cluster_set_next_flag(&si->discard_cluster_head,
				cluster_next(&clusters[cur]), 0);
		if (cluster_next(&si->discard_cluster_tail) == cur) {
			/* That was the only element: the list is now empty. */
			cluster_set_null(&si->discard_cluster_head);
			cluster_set_null(&si->discard_cluster_tail);
		}

		/* Issue the discard without holding si->lock. */
		spin_unlock(&si->lock);
		discard_swap_cluster(si, cur * SWAPFILE_CLUSTER,
				SWAPFILE_CLUSTER);
		spin_lock(&si->lock);

		/* Append the discarded cluster to the free cluster list. */
		cluster_set_flag(&clusters[cur], CLUSTER_FLAG_FREE);
		if (cluster_is_null(&si->free_cluster_head)) {
			cluster_set_next_flag(&si->free_cluster_head, cur, 0);
		} else {
			unsigned int last = cluster_next(&si->free_cluster_tail);

			cluster_set_next(&clusters[last], cur);
		}
		cluster_set_next_flag(&si->free_cluster_tail, cur, 0);

		/* Clear the swap_map entries of this cluster back to 0. */
		memset(si->swap_map + cur * SWAPFILE_CLUSTER,
				0, SWAPFILE_CLUSTER);
	}
}

/*
 * Work handler for si->discard_work: recover the owning swap_info_struct
 * from the embedded work item and run the pending discards under si->lock.
 */
static void swap_discard_work(struct work_struct *work)
{
	struct swap_info_struct *si =
		container_of(work, struct swap_info_struct, discard_work);

	spin_lock(&si->lock);
	swap_do_scheduled_discard(si);
	spin_unlock(&si->lock);
}

/*
* The cluster corresponding to page_nr will be used. The cluster will be
* removed from free cluster list and its usage counter will be increased.
Expand Down Expand Up @@ -287,6 +365,16 @@ static void dec_cluster_info_page(struct swap_info_struct *p,
cluster_count(&cluster_info[idx]) - 1);

if (cluster_count(&cluster_info[idx]) == 0) {
/*
* If the swap is discardable, prepare discard the cluster
* instead of free it immediately. The cluster will be freed
* after discard.
*/
if (p->flags & SWP_PAGE_DISCARD) {
swap_cluster_schedule_discard(p, idx);
return;
}

cluster_set_flag(&cluster_info[idx], CLUSTER_FLAG_FREE);
if (cluster_is_null(&p->free_cluster_head)) {
cluster_set_next_flag(&p->free_cluster_head, idx, 0);
Expand Down Expand Up @@ -319,7 +407,6 @@ static unsigned long scan_swap_map(struct swap_info_struct *si,
unsigned long scan_base;
unsigned long last_in_cluster = 0;
int latency_ration = LATENCY_LIMIT;
int found_free_cluster = 0;

/*
* We try to cluster swap pages by allocating them sequentially
Expand All @@ -340,35 +427,34 @@ static unsigned long scan_swap_map(struct swap_info_struct *si,
si->cluster_nr = SWAPFILE_CLUSTER - 1;
goto checks;
}
if (si->flags & SWP_PAGE_DISCARD) {
/*
* Start range check on racing allocations, in case
* they overlap the cluster we eventually decide on
* (we scan without swap_lock to allow preemption).
* It's hardly conceivable that cluster_nr could be
* wrapped during our scan, but don't depend on it.
*/
if (si->lowest_alloc)
goto checks;
si->lowest_alloc = si->max;
si->highest_alloc = 0;
}
check_cluster:
if (!cluster_is_null(&si->free_cluster_head)) {
offset = cluster_next(&si->free_cluster_head) *
SWAPFILE_CLUSTER;
last_in_cluster = offset + SWAPFILE_CLUSTER - 1;
si->cluster_next = offset;
si->cluster_nr = SWAPFILE_CLUSTER - 1;
found_free_cluster = 1;
goto checks;
} else if (si->cluster_info) {
/*
* we don't have free cluster but have some clusters in
* discarding, do discard now and reclaim them
*/
if (!cluster_is_null(&si->discard_cluster_head)) {
si->cluster_nr = 0;
swap_do_scheduled_discard(si);
scan_base = offset = si->cluster_next;
if (!si->cluster_nr)
goto check_cluster;
si->cluster_nr--;
goto checks;
}

/*
* Checking free cluster is fast enough, we can do the
* check every time
*/
si->cluster_nr = 0;
si->lowest_alloc = 0;
goto checks;
}

Expand All @@ -395,7 +481,6 @@ static unsigned long scan_swap_map(struct swap_info_struct *si,
offset -= SWAPFILE_CLUSTER - 1;
si->cluster_next = offset;
si->cluster_nr = SWAPFILE_CLUSTER - 1;
found_free_cluster = 1;
goto checks;
}
if (unlikely(--latency_ration < 0)) {
Expand All @@ -416,7 +501,6 @@ static unsigned long scan_swap_map(struct swap_info_struct *si,
offset -= SWAPFILE_CLUSTER - 1;
si->cluster_next = offset;
si->cluster_nr = SWAPFILE_CLUSTER - 1;
found_free_cluster = 1;
goto checks;
}
if (unlikely(--latency_ration < 0)) {
Expand All @@ -428,7 +512,6 @@ static unsigned long scan_swap_map(struct swap_info_struct *si,
offset = scan_base;
spin_lock(&si->lock);
si->cluster_nr = SWAPFILE_CLUSTER - 1;
si->lowest_alloc = 0;
}

checks:
Expand Down Expand Up @@ -470,59 +553,6 @@ static unsigned long scan_swap_map(struct swap_info_struct *si,
si->cluster_next = offset + 1;
si->flags -= SWP_SCANNING;

if (si->lowest_alloc) {
/*
* Only set when SWP_PAGE_DISCARD, and there's a scan
* for a free cluster in progress or just completed.
*/
if (found_free_cluster) {
/*
* To optimize wear-levelling, discard the
* old data of the cluster, taking care not to
* discard any of its pages that have already
* been allocated by racing tasks (offset has
* already stepped over any at the beginning).
*/
if (offset < si->highest_alloc &&
si->lowest_alloc <= last_in_cluster)
last_in_cluster = si->lowest_alloc - 1;
si->flags |= SWP_DISCARDING;
spin_unlock(&si->lock);

if (offset < last_in_cluster)
discard_swap_cluster(si, offset,
last_in_cluster - offset + 1);

spin_lock(&si->lock);
si->lowest_alloc = 0;
si->flags &= ~SWP_DISCARDING;

smp_mb(); /* wake_up_bit advises this */
wake_up_bit(&si->flags, ilog2(SWP_DISCARDING));

} else if (si->flags & SWP_DISCARDING) {
/*
* Delay using pages allocated by racing tasks
* until the whole discard has been issued. We
* could defer that delay until swap_writepage,
* but it's easier to keep this self-contained.
*/
spin_unlock(&si->lock);
wait_on_bit(&si->flags, ilog2(SWP_DISCARDING),
wait_for_discard, TASK_UNINTERRUPTIBLE);
spin_lock(&si->lock);
} else {
/*
* Note pages allocated by racing tasks while
* scan for a free cluster is in progress, so
* that its final discard can exclude them.
*/
if (offset < si->lowest_alloc)
si->lowest_alloc = offset;
if (offset > si->highest_alloc)
si->highest_alloc = offset;
}
}
return offset;

scan:
Expand Down Expand Up @@ -1806,6 +1836,8 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile)
goto out_dput;
}

flush_work(&p->discard_work);

destroy_swap_extents(p);
if (p->flags & SWP_CONTINUED)
free_swap_count_continuations(p);
Expand Down Expand Up @@ -2172,6 +2204,8 @@ static int setup_swap_map_and_extents(struct swap_info_struct *p,

cluster_set_null(&p->free_cluster_head);
cluster_set_null(&p->free_cluster_tail);
cluster_set_null(&p->discard_cluster_head);
cluster_set_null(&p->discard_cluster_tail);

for (i = 0; i < swap_header->info.nr_badpages; i++) {
unsigned int page_nr = swap_header->info.badpages[i];
Expand Down Expand Up @@ -2281,6 +2315,8 @@ SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags)
if (IS_ERR(p))
return PTR_ERR(p);

INIT_WORK(&p->discard_work, swap_discard_work);

name = getname(specialfile);
if (IS_ERR(name)) {
error = PTR_ERR(name);
Expand Down

0 comments on commit 815c2c5

Please sign in to comment.