diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h index ce44caa40eed56..6370026689e0b3 100644 --- a/include/linux/huge_mm.h +++ b/include/linux/huge_mm.h @@ -342,6 +342,17 @@ void split_huge_pmd_address(struct vm_area_struct *vma, unsigned long address, void __split_huge_pud(struct vm_area_struct *vma, pud_t *pud, unsigned long address); +#ifdef CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD +int change_huge_pud(struct mmu_gather *tlb, struct vm_area_struct *vma, + pud_t *pudp, unsigned long addr, pgprot_t newprot, + unsigned long cp_flags); +#else +static inline int +change_huge_pud(struct mmu_gather *tlb, struct vm_area_struct *vma, + pud_t *pudp, unsigned long addr, pgprot_t newprot, + unsigned long cp_flags) { return 0; } +#endif + #define split_huge_pud(__vma, __pud, __address) \ do { \ pud_t *____pud = (__pud); \ @@ -585,6 +596,19 @@ static inline int next_order(unsigned long *orders, int prev) { return 0; } + +static inline void __split_huge_pud(struct vm_area_struct *vma, pud_t *pud, + unsigned long address) +{ +} + +static inline int change_huge_pud(struct mmu_gather *tlb, + struct vm_area_struct *vma, pud_t *pudp, + unsigned long addr, pgprot_t newprot, + unsigned long cp_flags) +{ + return 0; +} #endif /* CONFIG_TRANSPARENT_HUGEPAGE */ static inline int split_folio_to_list_to_order(struct folio *folio, diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 608ea551da1634..ac9e8a77059d03 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -2104,6 +2104,53 @@ int change_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma, return ret; } +/* + * Returns: + * + * - 0: if pud leaf changed from under us + * - 1: if pud can be skipped + * - HPAGE_PUD_NR: if pud was successfully processed + */ +#ifdef CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD +int change_huge_pud(struct mmu_gather *tlb, struct vm_area_struct *vma, + pud_t *pudp, unsigned long addr, pgprot_t newprot, + unsigned long cp_flags) +{ + struct mm_struct *mm = vma->vm_mm; + pud_t oldpud, entry; + spinlock_t *ptl; + + tlb_change_page_size(tlb, HPAGE_PUD_SIZE); + + /* NUMA balancing doesn't apply to dax */ + if (cp_flags & MM_CP_PROT_NUMA) + return 1; + + /* + * Huge entries on userfault-wp only works with anonymous, while we + * don't have anonymous PUDs yet. + */ + if (WARN_ON_ONCE(cp_flags & MM_CP_UFFD_WP_ALL)) + return 1; + + ptl = __pud_trans_huge_lock(pudp, vma); + if (!ptl) + return 0; + + /* + * Can't clear PUD or it can race with concurrent zapping. See + * change_huge_pmd(). + */ + oldpud = pudp_invalidate(vma, addr, pudp); + entry = pud_modify(oldpud, newprot); + set_pud_at(mm, addr, pudp, entry); + tlb_flush_pud_range(tlb, addr, HPAGE_PUD_SIZE); + + spin_unlock(ptl); + return HPAGE_PUD_NR; +} +#endif + #ifdef CONFIG_USERFAULTFD /* * The PT lock for src_pmd and dst_vma/src_vma (for reading) are locked by @@ -2334,6 +2381,11 @@ void __split_huge_pud(struct vm_area_struct *vma, pud_t *pud, spin_unlock(ptl); mmu_notifier_invalidate_range_end(&range); } +#else +void __split_huge_pud(struct vm_area_struct *vma, pud_t *pud, + unsigned long address) +{ +} #endif /* CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD */ static void __split_huge_zero_page_pmd(struct vm_area_struct *vma, diff --git a/mm/mprotect.c b/mm/mprotect.c index d423080e650980..446f8e5f10d98b 100644 --- a/mm/mprotect.c +++ b/mm/mprotect.c @@ -302,8 +302,9 @@ pgtable_split_needed(struct vm_area_struct *vma, unsigned long cp_flags) { /* * pte markers only resides in pte level, if we need pte markers, - * we need to split. We cannot wr-protect shmem thp because file - * thp is handled differently when split by erasing the pmd so far. + * we need to split. For example, we cannot wr-protect a file thp + * (e.g. 2M shmem) because file thp is handled differently when + * split by erasing the pmd so far. */ return (cp_flags & MM_CP_UFFD_WP) && !vma_is_anonymous(vma); } @@ -430,31 +431,53 @@ static inline long change_pud_range(struct mmu_gather *tlb, unsigned long end, pgprot_t newprot, unsigned long cp_flags) { struct mmu_notifier_range range; - pud_t *pud; + pud_t *pudp, pud; unsigned long next; long pages = 0, ret; range.start = 0; - pud = pud_offset(p4d, addr); + pudp = pud_offset(p4d, addr); do { +again: next = pud_addr_end(addr, end); - ret = change_prepare(vma, pud, pmd, addr, cp_flags); + ret = change_prepare(vma, pudp, pmd, addr, cp_flags); if (ret) { pages = ret; break; } - if (pud_none_or_clear_bad(pud)) + + pud = READ_ONCE(*pudp); + if (pud_none(pud)) continue; + if (!range.start) { mmu_notifier_range_init(&range, MMU_NOTIFY_PROTECTION_VMA, 0, vma->vm_mm, addr, end); mmu_notifier_invalidate_range_start(&range); } - pages += change_pmd_range(tlb, vma, pud, addr, next, newprot, + + if (pud_leaf(pud)) { + if ((next - addr != PUD_SIZE) || + pgtable_split_needed(vma, cp_flags)) { + __split_huge_pud(vma, pudp, addr); + goto again; + } else { + ret = change_huge_pud(tlb, vma, pudp, + addr, newprot, cp_flags); + if (ret == 0) + goto again; + /* huge pud was handled */ + if (ret == HPAGE_PUD_NR) + pages += HPAGE_PUD_NR; + continue; + } + } + + pages += change_pmd_range(tlb, vma, pudp, addr, next, newprot, cp_flags); - } while (pud++, addr = next, addr != end); + } while (pudp++, addr = next, addr != end); if (range.start) mmu_notifier_invalidate_range_end(&range);