Skip to content

Commit

Permalink
zsmalloc: fix a race with deferred_handles storing
Browse files Browse the repository at this point in the history
Currently, there is a race between zs_free() and zs_reclaim_page():
zs_reclaim_page() finds a handle to an allocated object, but before the
eviction happens, an independent zs_free() call to the same handle could
come in and overwrite the object value stored at the handle with the last
deferred handle.  When zs_reclaim_page() finally gets to call the eviction
handler, it will see an invalid object value (i.e the previous deferred
handle instead of the original object value).

This race happens quite infrequently.  We only managed to produce it with
out-of-tree developmental code that triggers zsmalloc writeback with a
much higher frequency than usual.

This patch fixes this race by storing the deferred handle in the object
header instead.  We differentiate the deferred handle from the other two
cases (handle for allocated object, and linkage for free object) with a
new tag.  If zspage reclamation succeeds, we will free these deferred
handles by walking through the zspage objects.  On the other hand, if
zspage reclamation fails, we reconstruct the zspage freelist (with the
deferred handle tag and allocated tag) before trying again with the
reclamation.

[[email protected]: avoid unused-function warning]
  Link: https://lkml.kernel.org/r/[email protected]
Link: https://lkml.kernel.org/r/[email protected]
Fixes: 9997bc0 ("zsmalloc: implement writeback mechanism for zsmalloc")
Signed-off-by: Nhat Pham <[email protected]>
Signed-off-by: Arnd Bergmann <[email protected]>
Suggested-by: Johannes Weiner <[email protected]>
Cc: Dan Streetman <[email protected]>
Cc: Minchan Kim <[email protected]>
Cc: Nitin Gupta <[email protected]>
Cc: Sergey Senozhatsky <[email protected]>
Cc: Seth Jennings <[email protected]>
Cc: Vitaly Wool <[email protected]>
Signed-off-by: Andrew Morton <[email protected]>
  • Loading branch information
nhatsmrt authored and akpm00 committed Feb 1, 2023
1 parent 023f47a commit 85b3258
Showing 1 changed file with 205 additions and 32 deletions.
237 changes: 205 additions & 32 deletions mm/zsmalloc.c
Original file line number Diff line number Diff line change
Expand Up @@ -113,7 +113,23 @@
* have room for two bit at least.
*/
#define OBJ_ALLOCATED_TAG 1
#define OBJ_TAG_BITS 1

#ifdef CONFIG_ZPOOL
/*
* The second least-significant bit in the object's header identifies if the
* value stored at the header is a deferred handle from the last reclaim
* attempt.
*
* As noted above, this is valid because we have room for two bits.
*/
#define OBJ_DEFERRED_HANDLE_TAG 2
#define OBJ_TAG_BITS 2
#define OBJ_TAG_MASK (OBJ_ALLOCATED_TAG | OBJ_DEFERRED_HANDLE_TAG)
#else
#define OBJ_TAG_BITS 1
#define OBJ_TAG_MASK OBJ_ALLOCATED_TAG
#endif /* CONFIG_ZPOOL */

#define OBJ_INDEX_BITS (BITS_PER_LONG - _PFN_BITS - OBJ_TAG_BITS)
#define OBJ_INDEX_MASK ((_AC(1, UL) << OBJ_INDEX_BITS) - 1)

Expand Down Expand Up @@ -222,6 +238,12 @@ struct link_free {
* Handle of allocated object.
*/
unsigned long handle;
#ifdef CONFIG_ZPOOL
/*
* Deferred handle of a reclaimed object.
*/
unsigned long deferred_handle;
#endif
};
};

Expand Down Expand Up @@ -272,8 +294,6 @@ struct zspage {
/* links the zspage to the lru list in the pool */
struct list_head lru;
bool under_reclaim;
/* list of unfreed handles whose objects have been reclaimed */
unsigned long *deferred_handles;
#endif

struct zs_pool *pool;
Expand Down Expand Up @@ -897,7 +917,8 @@ static unsigned long handle_to_obj(unsigned long handle)
return *(unsigned long *)handle;
}

static bool obj_allocated(struct page *page, void *obj, unsigned long *phandle)
static bool obj_tagged(struct page *page, void *obj, unsigned long *phandle,
int tag)
{
unsigned long handle;
struct zspage *zspage = get_zspage(page);
Expand All @@ -908,13 +929,27 @@ static bool obj_allocated(struct page *page, void *obj, unsigned long *phandle)
} else
handle = *(unsigned long *)obj;

if (!(handle & OBJ_ALLOCATED_TAG))
if (!(handle & tag))
return false;

*phandle = handle & ~OBJ_ALLOCATED_TAG;
/* Clear all tags before returning the handle */
*phandle = handle & ~OBJ_TAG_MASK;
return true;
}

/*
 * Test whether the object at @obj is a live allocation, i.e. its header
 * carries OBJ_ALLOCATED_TAG.  On success the handle, with all tag bits
 * cleared, is stored in *@phandle.
 */
static inline bool obj_allocated(struct page *page, void *obj, unsigned long *phandle)
{
	return obj_tagged(page, obj, phandle, OBJ_ALLOCATED_TAG);
}

#ifdef CONFIG_ZPOOL
/*
 * Test whether the object at @obj stores a deferred handle left behind by
 * zs_free() while the zspage was under reclaim, i.e. its header carries
 * OBJ_DEFERRED_HANDLE_TAG.  On success the handle, with all tag bits
 * cleared, is stored in *@phandle.
 */
static bool obj_stores_deferred_handle(struct page *page, void *obj,
		unsigned long *phandle)
{
	return obj_tagged(page, obj, phandle, OBJ_DEFERRED_HANDLE_TAG);
}
#endif

static void reset_page(struct page *page)
{
__ClearPageMovable(page);
Expand Down Expand Up @@ -946,22 +981,36 @@ static int trylock_zspage(struct zspage *zspage)
}

#ifdef CONFIG_ZPOOL
static unsigned long find_deferred_handle_obj(struct size_class *class,
struct page *page, int *obj_idx);

/*
 * Free all the deferred handles whose objects are freed in zs_free.
 */
static void free_handles(struct zs_pool *pool, struct size_class *class,
		struct zspage *zspage)
{
	struct page *page;
	unsigned long handle;
	int obj_idx;

	/*
	 * Walk every subpage of the zspage: each object whose header carries
	 * OBJ_DEFERRED_HANDLE_TAG still owns a handle that zs_free() could
	 * not release while reclaim was in flight - release it now.
	 */
	for (page = get_first_page(zspage); page; page = get_next_page(page)) {
		obj_idx = 0;
		while ((handle = find_deferred_handle_obj(class, page, &obj_idx)) != 0) {
			cache_free_handle(pool, handle);
			obj_idx++;
		}
	}
}
#else
/*
 * Without CONFIG_ZPOOL there is no writeback path, hence no deferred
 * handles can ever be stored: nothing to free.
 */
static inline void free_handles(struct zs_pool *pool, struct size_class *class,
		struct zspage *zspage) {}
#endif

static void __free_zspage(struct zs_pool *pool, struct size_class *class,
Expand All @@ -979,7 +1028,7 @@ static void __free_zspage(struct zs_pool *pool, struct size_class *class,
VM_BUG_ON(fg != ZS_EMPTY);

/* Free all deferred handles from zs_free */
free_handles(pool, zspage);
free_handles(pool, class, zspage);

next = page = get_first_page(zspage);
do {
Expand Down Expand Up @@ -1067,7 +1116,6 @@ static void init_zspage(struct size_class *class, struct zspage *zspage)
#ifdef CONFIG_ZPOOL
INIT_LIST_HEAD(&zspage->lru);
zspage->under_reclaim = false;
zspage->deferred_handles = NULL;
#endif

set_freeobj(zspage, 0);
Expand Down Expand Up @@ -1568,7 +1616,7 @@ unsigned long zs_malloc(struct zs_pool *pool, size_t size, gfp_t gfp)
}
EXPORT_SYMBOL_GPL(zs_malloc);

static void obj_free(int class_size, unsigned long obj)
static void obj_free(int class_size, unsigned long obj, unsigned long *handle)
{
struct link_free *link;
struct zspage *zspage;
Expand All @@ -1582,15 +1630,29 @@ static void obj_free(int class_size, unsigned long obj)
zspage = get_zspage(f_page);

vaddr = kmap_atomic(f_page);

/* Insert this object in containing zspage's freelist */
link = (struct link_free *)(vaddr + f_offset);
if (likely(!ZsHugePage(zspage)))
link->next = get_freeobj(zspage) << OBJ_TAG_BITS;
else
f_page->index = 0;

if (handle) {
#ifdef CONFIG_ZPOOL
/* Stores the (deferred) handle in the object's header */
*handle |= OBJ_DEFERRED_HANDLE_TAG;
*handle &= ~OBJ_ALLOCATED_TAG;

if (likely(!ZsHugePage(zspage)))
link->deferred_handle = *handle;
else
f_page->index = *handle;
#endif
} else {
/* Insert this object in containing zspage's freelist */
if (likely(!ZsHugePage(zspage)))
link->next = get_freeobj(zspage) << OBJ_TAG_BITS;
else
f_page->index = 0;
set_freeobj(zspage, f_objidx);
}

kunmap_atomic(vaddr);
set_freeobj(zspage, f_objidx);
mod_zspage_inuse(zspage, -1);
}

Expand All @@ -1615,7 +1677,6 @@ void zs_free(struct zs_pool *pool, unsigned long handle)
zspage = get_zspage(f_page);
class = zspage_class(pool, zspage);

obj_free(class->size, obj);
class_stat_dec(class, OBJ_USED, 1);

#ifdef CONFIG_ZPOOL
Expand All @@ -1624,15 +1685,15 @@ void zs_free(struct zs_pool *pool, unsigned long handle)
* Reclaim needs the handles during writeback. It'll free
* them along with the zspage when it's done with them.
*
* Record current deferred handle at the memory location
* whose address is given by handle.
* Record current deferred handle in the object's header.
*/
record_obj(handle, (unsigned long)zspage->deferred_handles);
zspage->deferred_handles = (unsigned long *)handle;
obj_free(class->size, obj, &handle);
spin_unlock(&pool->lock);
return;
}
#endif
obj_free(class->size, obj, NULL);

fullness = fix_fullness_group(class, zspage);
if (fullness == ZS_EMPTY)
free_zspage(pool, class, zspage);
Expand Down Expand Up @@ -1713,11 +1774,11 @@ static void zs_object_copy(struct size_class *class, unsigned long dst,
}

/*
* Find alloced object in zspage from index object and
* Find object with a certain tag in zspage from index object and
* return handle.
*/
static unsigned long find_alloced_obj(struct size_class *class,
struct page *page, int *obj_idx)
static unsigned long find_tagged_obj(struct size_class *class,
struct page *page, int *obj_idx, int tag)
{
unsigned int offset;
int index = *obj_idx;
Expand All @@ -1728,7 +1789,7 @@ static unsigned long find_alloced_obj(struct size_class *class,
offset += class->size * index;

while (offset < PAGE_SIZE) {
if (obj_allocated(page, addr + offset, &handle))
if (obj_tagged(page, addr + offset, &handle, tag))
break;

offset += class->size;
Expand All @@ -1742,6 +1803,28 @@ static unsigned long find_alloced_obj(struct size_class *class,
return handle;
}

/*
 * Find the next allocated object (header tagged OBJ_ALLOCATED_TAG) in the
 * zspage, scanning @page from index *@obj_idx onward, and return its handle
 * (0 if none found on this page).
 */
static unsigned long find_alloced_obj(struct size_class *class,
		struct page *page, int *obj_idx)
{
	return find_tagged_obj(class, page, obj_idx, OBJ_ALLOCATED_TAG);
}

#ifdef CONFIG_ZPOOL
/*
 * Find the next object whose header stores a deferred handle (header tagged
 * OBJ_DEFERRED_HANDLE_TAG), scanning @page from index *@obj_idx onward, and
 * return that handle (0 if none found on this page).
 */
static unsigned long find_deferred_handle_obj(struct size_class *class,
		struct page *page, int *obj_idx)
{
	return find_tagged_obj(class, page, obj_idx, OBJ_DEFERRED_HANDLE_TAG);
}
#endif

struct zs_compact_control {
/* Source spage for migration which could be a subpage of zspage */
struct page *s_page;
Expand Down Expand Up @@ -1784,7 +1867,7 @@ static int migrate_zspage(struct zs_pool *pool, struct size_class *class,
zs_object_copy(class, free_obj, used_obj);
obj_idx++;
record_obj(handle, free_obj);
obj_free(class->size, used_obj);
obj_free(class->size, used_obj, NULL);
}

/* Remember last position in this iteration */
Expand Down Expand Up @@ -2478,6 +2561,90 @@ void zs_destroy_pool(struct zs_pool *pool)
EXPORT_SYMBOL_GPL(zs_destroy_pool);

#ifdef CONFIG_ZPOOL
/*
 * Rebuild the freelist of @zspage after a failed reclaim attempt.
 *
 * While a zspage is under reclaim, zs_free() stores the deferred handle in
 * the object header (OBJ_DEFERRED_HANDLE_TAG) instead of linking the slot
 * into the freelist.  Before the zspage can be reused, walk every object
 * slot in order: free any deferred handle found, and chain every
 * non-allocated slot back into a valid freelist.
 *
 * The walk keeps at most two pages mapped at a time: the current page and,
 * via @prev_page_vaddr, the earlier page holding the last free slot seen
 * (@prev_free), which still needs its ->next link written.
 */
static void restore_freelist(struct zs_pool *pool, struct size_class *class,
		struct zspage *zspage)
{
	unsigned int obj_idx = 0;
	unsigned long handle, off = 0; /* off is within-page offset */
	struct page *page = get_first_page(zspage);
	/* last free slot found, awaiting its ->next link */
	struct link_free *prev_free = NULL;
	/* mapping prev_free points into, when it lives on an earlier page */
	void *prev_page_vaddr = NULL;

	/* in case no free object found */
	set_freeobj(zspage, (unsigned int)(-1UL));

	while (page) {
		void *vaddr = kmap_atomic(page);
		struct page *next_page;

		while (off < PAGE_SIZE) {
			void *obj_addr = vaddr + off;

			/* skip allocated object */
			if (obj_allocated(page, obj_addr, &handle)) {
				obj_idx++;
				off += class->size;
				continue;
			}

			/* free deferred handle from reclaim attempt */
			if (obj_stores_deferred_handle(page, obj_addr, &handle))
				cache_free_handle(pool, handle);

			/* link this free slot after the previous one */
			if (prev_free)
				prev_free->next = obj_idx << OBJ_TAG_BITS;
			else /* first free object found */
				set_freeobj(zspage, obj_idx);

			prev_free = (struct link_free *)vaddr + off / sizeof(*prev_free);
			/* if last free object in a previous page, need to unmap */
			if (prev_page_vaddr) {
				kunmap_atomic(prev_page_vaddr);
				prev_page_vaddr = NULL;
			}

			obj_idx++;
			off += class->size;
		}

		/*
		 * Handle the last (full or partial) object on this page.
		 */
		next_page = get_next_page(page);
		if (next_page) {
			if (!prev_free || prev_page_vaddr) {
				/*
				 * There is no free object in this page, so we can safely
				 * unmap it.
				 */
				kunmap_atomic(vaddr);
			} else {
				/* update prev_page_vaddr since prev_free is on this page */
				prev_page_vaddr = vaddr;
			}
		} else { /* this is the last page */
			if (prev_free) {
				/*
				 * Reset OBJ_TAG_BITS bit to last link to tell
				 * whether it's allocated object or not.
				 */
				prev_free->next = -1UL << OBJ_TAG_BITS;
			}

			/* unmap previous page (if not done yet) */
			if (prev_page_vaddr) {
				kunmap_atomic(prev_page_vaddr);
				prev_page_vaddr = NULL;
			}

			kunmap_atomic(vaddr);
		}

		page = next_page;
		off %= PAGE_SIZE; /* carry partial-object offset into the next page */
	}
}

static int zs_reclaim_page(struct zs_pool *pool, unsigned int retries)
{
int i, obj_idx, ret = 0;
Expand Down Expand Up @@ -2561,6 +2728,12 @@ static int zs_reclaim_page(struct zs_pool *pool, unsigned int retries)
return 0;
}

/*
* Eviction fails on one of the handles, so we need to restore zspage.
* We need to rebuild its freelist (and free stored deferred handles),
* put it back to the correct size class, and add it to the LRU list.
*/
restore_freelist(pool, class, zspage);
putback_zspage(class, zspage);
list_add(&zspage->lru, &pool->lru);
unlock_zspage(zspage);
Expand Down

0 comments on commit 85b3258

Please sign in to comment.