Skip to content

Commit

Permalink
mm,thp: add read-only THP support for (non-shmem) FS
Browse files Browse the repository at this point in the history
This patch is (hopefully) the first step to enable THP for non-shmem
filesystems.

This patch enables an application to place part of its text sections in THP
via madvise(), for example:

    madvise((void *)0x600000, 0x200000, MADV_HUGEPAGE);

We tried to reuse the logic for THP on tmpfs.

Currently, writes are not supported for non-shmem THP.  khugepaged will only
process vmas with VM_DENYWRITE.  sys_mmap() ignores VM_DENYWRITE requests
(see ksys_mmap_pgoff).  The only way to create a vma with VM_DENYWRITE is
execve().  This requirement limits non-shmem THP to text sections.

The next patch will handle writes, which would only happen when all the
vmas with VM_DENYWRITE are unmapped.

An EXPERIMENTAL config, READ_ONLY_THP_FOR_FS, is added to gate this
feature.

[[email protected]: fix build without CONFIG_SHMEM]
  Link: http://lkml.kernel.org/r/[email protected]
[[email protected]: fix double unlock in collapse_file()]
  Link: http://lkml.kernel.org/r/[email protected]
Link: http://lkml.kernel.org/r/[email protected]
Signed-off-by: Song Liu <[email protected]>
Acked-by: Rik van Riel <[email protected]>
Acked-by: Kirill A. Shutemov <[email protected]>
Acked-by: Johannes Weiner <[email protected]>
Cc: Stephen Rothwell <[email protected]>
Cc: Dan Carpenter <[email protected]>
Cc: Hillf Danton <[email protected]>
Cc: Hugh Dickins <[email protected]>
Cc: William Kucharski <[email protected]>
Cc: Oleg Nesterov <[email protected]>
Signed-off-by: Andrew Morton <[email protected]>
Signed-off-by: Linus Torvalds <[email protected]>
  • Loading branch information
liu-song-6 authored and torvalds committed Sep 24, 2019
1 parent 579c571 commit 99cb0db
Show file tree
Hide file tree
Showing 4 changed files with 128 additions and 48 deletions.
11 changes: 11 additions & 0 deletions mm/Kconfig
Original file line number Diff line number Diff line change
Expand Up @@ -712,6 +712,17 @@ config GUP_BENCHMARK
config GUP_GET_PTE_LOW_HIGH
bool

config READ_ONLY_THP_FOR_FS
bool "Read-only THP for filesystems (EXPERIMENTAL)"
depends on TRANSPARENT_HUGE_PAGECACHE && SHMEM

help
Allow khugepaged to put read-only file-backed pages in THP.

This is marked experimental because it is a new feature. Write
support of file THPs will be developed in the next few release
cycles.

config ARCH_HAS_PTE_SPECIAL
bool

Expand Down
4 changes: 2 additions & 2 deletions mm/filemap.c
Original file line number Diff line number Diff line change
Expand Up @@ -203,8 +203,8 @@ static void unaccount_page_cache_page(struct address_space *mapping,
__mod_node_page_state(page_pgdat(page), NR_SHMEM, -nr);
if (PageTransHuge(page))
__dec_node_page_state(page, NR_SHMEM_THPS);
} else {
VM_BUG_ON_PAGE(PageTransHuge(page), page);
} else if (PageTransHuge(page)) {
__dec_node_page_state(page, NR_FILE_THPS);
}

/*
Expand Down
149 changes: 107 additions & 42 deletions mm/khugepaged.c
Original file line number Diff line number Diff line change
Expand Up @@ -48,6 +48,7 @@ enum scan_result {
SCAN_CGROUP_CHARGE_FAIL,
SCAN_EXCEED_SWAP_PTE,
SCAN_TRUNCATED,
SCAN_PAGE_HAS_PRIVATE,
};

#define CREATE_TRACE_POINTS
Expand Down Expand Up @@ -404,7 +405,11 @@ static bool hugepage_vma_check(struct vm_area_struct *vma,
(vm_flags & VM_NOHUGEPAGE) ||
test_bit(MMF_DISABLE_THP, &vma->vm_mm->flags))
return false;
if (shmem_file(vma->vm_file)) {

if (shmem_file(vma->vm_file) ||
(IS_ENABLED(CONFIG_READ_ONLY_THP_FOR_FS) &&
vma->vm_file &&
(vm_flags & VM_DENYWRITE))) {
if (!IS_ENABLED(CONFIG_TRANSPARENT_HUGE_PAGECACHE))
return false;
return IS_ALIGNED((vma->vm_start >> PAGE_SHIFT) - vma->vm_pgoff,
Expand Down Expand Up @@ -456,8 +461,9 @@ int khugepaged_enter_vma_merge(struct vm_area_struct *vma,
unsigned long hstart, hend;

/*
* khugepaged does not yet work on non-shmem files or special
* mappings. And file-private shmem THP is not supported.
* khugepaged only supports read-only files for non-shmem files.
* khugepaged does not yet work on special mappings. And
* file-private shmem THP is not supported.
*/
if (!hugepage_vma_check(vma, vm_flags))
return 0;
Expand Down Expand Up @@ -1287,12 +1293,12 @@ static void retract_page_tables(struct address_space *mapping, pgoff_t pgoff)
}

/**
* collapse_file - collapse small tmpfs/shmem pages into huge one.
* collapse_file - collapse filemap/tmpfs/shmem pages into huge one.
*
* Basic scheme is simple, details are more complex:
* - allocate and lock a new huge page;
* - scan page cache replacing old pages with the new one
* + swap in pages if necessary;
* + swap/gup in pages if necessary;
* + fill in gaps;
* + keep old pages around in case rollback is required;
* - if replacing succeeds:
Expand All @@ -1316,7 +1322,9 @@ static void collapse_file(struct mm_struct *mm,
LIST_HEAD(pagelist);
XA_STATE_ORDER(xas, &mapping->i_pages, start, HPAGE_PMD_ORDER);
int nr_none = 0, result = SCAN_SUCCEED;
bool is_shmem = shmem_file(file);

VM_BUG_ON(!IS_ENABLED(CONFIG_READ_ONLY_THP_FOR_FS) && !is_shmem);
VM_BUG_ON(start & (HPAGE_PMD_NR - 1));

/* Only allocate from the target node */
Expand Down Expand Up @@ -1348,7 +1356,8 @@ static void collapse_file(struct mm_struct *mm,
} while (1);

__SetPageLocked(new_page);
__SetPageSwapBacked(new_page);
if (is_shmem)
__SetPageSwapBacked(new_page);
new_page->index = start;
new_page->mapping = mapping;

Expand All @@ -1363,41 +1372,75 @@ static void collapse_file(struct mm_struct *mm,
struct page *page = xas_next(&xas);

VM_BUG_ON(index != xas.xa_index);
if (!page) {
/*
* Stop if extent has been truncated or hole-punched,
* and is now completely empty.
*/
if (index == start) {
if (!xas_next_entry(&xas, end - 1)) {
result = SCAN_TRUNCATED;
if (is_shmem) {
if (!page) {
/*
* Stop if extent has been truncated or
* hole-punched, and is now completely
* empty.
*/
if (index == start) {
if (!xas_next_entry(&xas, end - 1)) {
result = SCAN_TRUNCATED;
goto xa_locked;
}
xas_set(&xas, index);
}
if (!shmem_charge(mapping->host, 1)) {
result = SCAN_FAIL;
goto xa_locked;
}
xas_set(&xas, index);
xas_store(&xas, new_page);
nr_none++;
continue;
}
if (!shmem_charge(mapping->host, 1)) {
result = SCAN_FAIL;

if (xa_is_value(page) || !PageUptodate(page)) {
xas_unlock_irq(&xas);
/* swap in or instantiate fallocated page */
if (shmem_getpage(mapping->host, index, &page,
SGP_NOHUGE)) {
result = SCAN_FAIL;
goto xa_unlocked;
}
} else if (trylock_page(page)) {
get_page(page);
xas_unlock_irq(&xas);
} else {
result = SCAN_PAGE_LOCK;
goto xa_locked;
}
xas_store(&xas, new_page);
nr_none++;
continue;
}

if (xa_is_value(page) || !PageUptodate(page)) {
xas_unlock_irq(&xas);
/* swap in or instantiate fallocated page */
if (shmem_getpage(mapping->host, index, &page,
SGP_NOHUGE)) {
} else { /* !is_shmem */
if (!page || xa_is_value(page)) {
xas_unlock_irq(&xas);
page_cache_sync_readahead(mapping, &file->f_ra,
file, index,
PAGE_SIZE);
/* drain pagevecs to help isolate_lru_page() */
lru_add_drain();
page = find_lock_page(mapping, index);
if (unlikely(page == NULL)) {
result = SCAN_FAIL;
goto xa_unlocked;
}
} else if (!PageUptodate(page)) {
xas_unlock_irq(&xas);
wait_on_page_locked(page);
if (!trylock_page(page)) {
result = SCAN_PAGE_LOCK;
goto xa_unlocked;
}
get_page(page);
} else if (PageDirty(page)) {
result = SCAN_FAIL;
goto xa_unlocked;
goto xa_locked;
} else if (trylock_page(page)) {
get_page(page);
xas_unlock_irq(&xas);
} else {
result = SCAN_PAGE_LOCK;
goto xa_locked;
}
} else if (trylock_page(page)) {
get_page(page);
xas_unlock_irq(&xas);
} else {
result = SCAN_PAGE_LOCK;
goto xa_locked;
}

/*
Expand Down Expand Up @@ -1426,6 +1469,12 @@ static void collapse_file(struct mm_struct *mm,
goto out_unlock;
}

if (page_has_private(page) &&
!try_to_release_page(page, GFP_KERNEL)) {
result = SCAN_PAGE_HAS_PRIVATE;
goto out_unlock;
}

if (page_mapped(page))
unmap_mapping_pages(mapping, index, 1, false);

Expand Down Expand Up @@ -1463,12 +1512,18 @@ static void collapse_file(struct mm_struct *mm,
goto xa_unlocked;
}

__inc_node_page_state(new_page, NR_SHMEM_THPS);
if (is_shmem)
__inc_node_page_state(new_page, NR_SHMEM_THPS);
else
__inc_node_page_state(new_page, NR_FILE_THPS);

if (nr_none) {
struct zone *zone = page_zone(new_page);

__mod_node_page_state(zone->zone_pgdat, NR_FILE_PAGES, nr_none);
__mod_node_page_state(zone->zone_pgdat, NR_SHMEM, nr_none);
if (is_shmem)
__mod_node_page_state(zone->zone_pgdat,
NR_SHMEM, nr_none);
}

xa_locked:
Expand Down Expand Up @@ -1506,10 +1561,15 @@ static void collapse_file(struct mm_struct *mm,

SetPageUptodate(new_page);
page_ref_add(new_page, HPAGE_PMD_NR - 1);
set_page_dirty(new_page);
mem_cgroup_commit_charge(new_page, memcg, false, true);

if (is_shmem) {
set_page_dirty(new_page);
lru_cache_add_anon(new_page);
} else {
lru_cache_add_file(new_page);
}
count_memcg_events(memcg, THP_COLLAPSE_ALLOC, 1);
lru_cache_add_anon(new_page);

/*
* Remove pte page tables, so we can re-fault the page as huge.
Expand All @@ -1524,7 +1584,9 @@ static void collapse_file(struct mm_struct *mm,
/* Something went wrong: roll back page cache changes */
xas_lock_irq(&xas);
mapping->nrpages -= nr_none;
shmem_uncharge(mapping->host, nr_none);

if (is_shmem)
shmem_uncharge(mapping->host, nr_none);

xas_set(&xas, start);
xas_for_each(&xas, page, end - 1) {
Expand Down Expand Up @@ -1607,7 +1669,8 @@ static void khugepaged_scan_file(struct mm_struct *mm,
break;
}

if (page_count(page) != 1 + page_mapcount(page)) {
if (page_count(page) !=
1 + page_mapcount(page) + page_has_private(page)) {
result = SCAN_PAGE_COUNT;
break;
}
Expand Down Expand Up @@ -1713,11 +1776,13 @@ static unsigned int khugepaged_scan_mm_slot(unsigned int pages,
VM_BUG_ON(khugepaged_scan.address < hstart ||
khugepaged_scan.address + HPAGE_PMD_SIZE >
hend);
if (shmem_file(vma->vm_file)) {
if (IS_ENABLED(CONFIG_SHMEM) && vma->vm_file) {
struct file *file;
pgoff_t pgoff = linear_page_index(vma,
khugepaged_scan.address);
if (!shmem_huge_enabled(vma))

if (shmem_file(vma->vm_file)
&& !shmem_huge_enabled(vma))
goto skip;
file = get_file(vma->vm_file);
up_read(&mm->mmap_sem);
Expand Down
12 changes: 8 additions & 4 deletions mm/rmap.c
Original file line number Diff line number Diff line change
Expand Up @@ -1189,8 +1189,10 @@ void page_add_file_rmap(struct page *page, bool compound)
}
if (!atomic_inc_and_test(compound_mapcount_ptr(page)))
goto out;
VM_BUG_ON_PAGE(!PageSwapBacked(page), page);
__inc_node_page_state(page, NR_SHMEM_PMDMAPPED);
if (PageSwapBacked(page))
__inc_node_page_state(page, NR_SHMEM_PMDMAPPED);
else
__inc_node_page_state(page, NR_FILE_PMDMAPPED);
} else {
if (PageTransCompound(page) && page_mapping(page)) {
VM_WARN_ON_ONCE(!PageLocked(page));
Expand Down Expand Up @@ -1229,8 +1231,10 @@ static void page_remove_file_rmap(struct page *page, bool compound)
}
if (!atomic_add_negative(-1, compound_mapcount_ptr(page)))
goto out;
VM_BUG_ON_PAGE(!PageSwapBacked(page), page);
__dec_node_page_state(page, NR_SHMEM_PMDMAPPED);
if (PageSwapBacked(page))
__dec_node_page_state(page, NR_SHMEM_PMDMAPPED);
else
__dec_node_page_state(page, NR_FILE_PMDMAPPED);
} else {
if (!atomic_add_negative(-1, &page->_mapcount))
goto out;
Expand Down

0 comments on commit 99cb0db

Please sign in to comment.