Skip to content

Commit

Permalink
mm/hwpoison: do not lock page again when me_huge_page() successfully …
Browse files Browse the repository at this point in the history
…recovers

Currently me_huge_page() temporarily unlocks the page to perform some actions
then locks it again later.  My testcase (which calls hard-offline on
some tail page in a hugetlb, then accesses the address of the hugetlb
range) showed that page allocation code detects this page lock on buddy
page and printed out "BUG: Bad page state" message.

check_new_page_bad() does not consider a page with __PG_HWPOISON as bad
page, so this flag works as kind of filter, but this filtering doesn't
work in this case because the "bad page" is not the actual hwpoisoned
page.  So stop locking page again.  Actions to be taken depend on the
page type of the error, so page unlocking should be done in ->action()
callbacks.  So make page unlocking an assumed responsibility of the
->action() callbacks and update all existing callbacks accordingly.

Link: https://lkml.kernel.org/r/[email protected]
Fixes: commit 78bb920 ("mm: hwpoison: dissolve in-use hugepage in unrecoverable memory error")
Signed-off-by: Naoya Horiguchi <[email protected]>
Cc: Oscar Salvador <[email protected]>
Cc: Michal Hocko <[email protected]>
Cc: Tony Luck <[email protected]>
Cc: "Aneesh Kumar K.V" <[email protected]>
Cc: <[email protected]>
Signed-off-by: Andrew Morton <[email protected]>
Signed-off-by: Linus Torvalds <[email protected]>
  • Loading branch information
nhoriguchi authored and torvalds committed Jun 25, 2021
1 parent 47af12b commit ea6d063
Showing 1 changed file with 30 additions and 14 deletions.
44 changes: 30 additions & 14 deletions mm/memory-failure.c
Original file line number Diff line number Diff line change
Expand Up @@ -658,6 +658,7 @@ static int truncate_error_page(struct page *p, unsigned long pfn,
*/
/*
 * Error hit a page in use by the kernel: take no recovery action and
 * report the event as ignored.  Like every ->action() callback, this
 * is responsible for unlocking the page before returning.
 */
static int me_kernel(struct page *p, unsigned long pfn)
{
unlock_page(p);
return MF_IGNORED;
}

Expand All @@ -667,6 +668,7 @@ static int me_kernel(struct page *p, unsigned long pfn)
/*
 * Fallback handler for a page whose state matched no other entry:
 * log the unknown state and report failure.  Like every ->action()
 * callback, this unlocks the page before returning.
 */
static int me_unknown(struct page *p, unsigned long pfn)
{
pr_err("Memory failure: %#lx: Unknown page state\n", pfn);
unlock_page(p);
return MF_FAILED;
}

Expand All @@ -675,6 +677,7 @@ static int me_unknown(struct page *p, unsigned long pfn)
*/
static int me_pagecache_clean(struct page *p, unsigned long pfn)
{
int ret;
struct address_space *mapping;

delete_from_lru_cache(p);
Expand All @@ -683,8 +686,10 @@ static int me_pagecache_clean(struct page *p, unsigned long pfn)
* For anonymous pages we're done the only reference left
* should be the one m_f() holds.
*/
if (PageAnon(p))
return MF_RECOVERED;
if (PageAnon(p)) {
ret = MF_RECOVERED;
goto out;
}

/*
* Now truncate the page in the page cache. This is really
Expand All @@ -698,15 +703,19 @@ static int me_pagecache_clean(struct page *p, unsigned long pfn)
/*
* Page has been torn down in the meantime
*/
return MF_FAILED;
ret = MF_FAILED;
goto out;
}

/*
* Truncation is a bit tricky. Enable it per file system for now.
*
* Open: to take i_mutex or not for this? Right now we don't.
*/
return truncate_error_page(p, pfn, mapping);
ret = truncate_error_page(p, pfn, mapping);
out:
unlock_page(p);
return ret;
}

/*
Expand Down Expand Up @@ -782,24 +791,26 @@ static int me_pagecache_dirty(struct page *p, unsigned long pfn)
*/
/*
 * Handle a dirty swap cache page: the in-memory data is unreliable, so
 * clear Dirty and Uptodate to force EIO on a later access (see the
 * shmem note below).  Returns MF_DELAYED when the page could be removed
 * from the LRU cache, MF_FAILED otherwise.  Per the ->action() callback
 * contract, the page is unlocked here before returning.
 *
 * NOTE(review): the pre-patch early-return version of this function was
 * left in the text above the new code, making the unlock_page() path
 * unreachable; only the post-patch body is kept.
 */
static int me_swapcache_dirty(struct page *p, unsigned long pfn)
{
	int ret;

	ClearPageDirty(p);
	/* Trigger EIO in shmem: */
	ClearPageUptodate(p);

	ret = delete_from_lru_cache(p) ? MF_FAILED : MF_DELAYED;
	unlock_page(p);
	return ret;
}

/*
 * Handle a clean swap cache page: drop it from the swap cache, then
 * report MF_RECOVERED when it could also be removed from the LRU cache,
 * MF_FAILED otherwise.  Per the ->action() callback contract, the page
 * is unlocked here before returning.
 *
 * NOTE(review): the pre-patch early-return version of this function was
 * left in the text above the new code, making the unlock_page() path
 * unreachable; only the post-patch body is kept.
 */
static int me_swapcache_clean(struct page *p, unsigned long pfn)
{
	int ret;

	delete_from_swap_cache(p);

	ret = delete_from_lru_cache(p) ? MF_FAILED : MF_RECOVERED;
	unlock_page(p);
	return ret;
}

/*
Expand All @@ -820,6 +831,7 @@ static int me_huge_page(struct page *p, unsigned long pfn)
mapping = page_mapping(hpage);
if (mapping) {
res = truncate_error_page(hpage, pfn, mapping);
unlock_page(hpage);
} else {
res = MF_FAILED;
unlock_page(hpage);
Expand All @@ -834,7 +846,6 @@ static int me_huge_page(struct page *p, unsigned long pfn)
page_ref_inc(p);
res = MF_RECOVERED;
}
lock_page(hpage);
}

return res;
Expand Down Expand Up @@ -866,6 +877,8 @@ static struct page_state {
unsigned long mask;
unsigned long res;
enum mf_action_page_type type;

/* Callback ->action() has to unlock the relevant page inside it. */
int (*action)(struct page *p, unsigned long pfn);
} error_states[] = {
{ reserved, reserved, MF_MSG_KERNEL, me_kernel },
Expand Down Expand Up @@ -929,6 +942,7 @@ static int page_action(struct page_state *ps, struct page *p,
int result;
int count;

/* page p should be unlocked after returning from ps->action(). */
result = ps->action(p, pfn);

count = page_count(p) - 1;
Expand Down Expand Up @@ -1313,7 +1327,7 @@ static int memory_failure_hugetlb(unsigned long pfn, int flags)
goto out;
}

res = identify_page_state(pfn, p, page_flags);
return identify_page_state(pfn, p, page_flags);
out:
unlock_page(head);
return res;
Expand Down Expand Up @@ -1596,6 +1610,8 @@ int memory_failure(unsigned long pfn, int flags)

identify_page_state:
res = identify_page_state(pfn, p, page_flags);
mutex_unlock(&mf_mutex);
return res;
unlock_page:
unlock_page(p);
unlock_mutex:
Expand Down

0 comments on commit ea6d063

Please sign in to comment.