diff --git a/drivers/dax/super.c b/drivers/dax/super.c index c2c46f96b18c9d..60d01b5d2a6710 100644 --- a/drivers/dax/super.c +++ b/drivers/dax/super.c @@ -86,6 +86,7 @@ int __bdev_dax_supported(struct super_block *sb, int blocksize) { struct block_device *bdev = sb->s_bdev; struct dax_device *dax_dev; + bool dax_enabled = false; pgoff_t pgoff; int err, id; void *kaddr; @@ -134,14 +135,21 @@ int __bdev_dax_supported(struct super_block *sb, int blocksize) * on being able to do (page_address(pfn_to_page())). */ WARN_ON(IS_ENABLED(CONFIG_ARCH_HAS_PMEM_API)); + dax_enabled = true; } else if (pfn_t_devmap(pfn)) { - /* pass */; - } else { + struct dev_pagemap *pgmap; + + pgmap = get_dev_pagemap(pfn_t_to_pfn(pfn), NULL); + if (pgmap && pgmap->type == MEMORY_DEVICE_FS_DAX) + dax_enabled = true; + put_dev_pagemap(pgmap); + } + + if (!dax_enabled) { pr_debug("VFS (%s): error: dax support not enabled\n", sb->s_id); return -EOPNOTSUPP; } - return 0; } EXPORT_SYMBOL_GPL(__bdev_dax_supported); diff --git a/drivers/nvdimm/pfn_devs.c b/drivers/nvdimm/pfn_devs.c index 30b08791597d7f..3f7ad5bc443ee8 100644 --- a/drivers/nvdimm/pfn_devs.c +++ b/drivers/nvdimm/pfn_devs.c @@ -561,8 +561,6 @@ static int __nvdimm_setup_pfn(struct nd_pfn *nd_pfn, struct dev_pagemap *pgmap) res->start += start_pad; res->end -= end_trunc; - pgmap->type = MEMORY_DEVICE_HOST; - if (nd_pfn->mode == PFN_MODE_RAM) { if (offset < SZ_8K) return -EINVAL; diff --git a/drivers/nvdimm/pmem.c b/drivers/nvdimm/pmem.c index 97b4c39a9267a5..bf2dd2a4a5e600 100644 --- a/drivers/nvdimm/pmem.c +++ b/drivers/nvdimm/pmem.c @@ -289,6 +289,27 @@ static void pmem_release_disk(void *__pmem) put_disk(pmem->disk); } +static void pmem_release_pgmap_ops(void *__pgmap) +{ + dev_pagemap_put_ops(); +} + +static void fsdax_pagefree(struct page *page, void *data) +{ + wake_up_var(&page->_refcount); +} + +static int setup_pagemap_fsdax(struct device *dev, struct dev_pagemap *pgmap) +{ + dev_pagemap_get_ops(); + if (devm_add_action_or_reset(dev, pmem_release_pgmap_ops, pgmap)) + return -ENOMEM; + pgmap->type = MEMORY_DEVICE_FS_DAX; + pgmap->page_free = fsdax_pagefree; + + return 0; +} + static int pmem_attach_disk(struct device *dev, struct nd_namespace_common *ndns) { @@ -347,6 +368,8 @@ static int pmem_attach_disk(struct device *dev, pmem->pfn_flags = PFN_DEV; pmem->pgmap.ref = &q->q_usage_counter; if (is_nd_pfn(dev)) { + if (setup_pagemap_fsdax(dev, &pmem->pgmap)) + return -ENOMEM; addr = devm_memremap_pages(dev, &pmem->pgmap); pfn_sb = nd_pfn->pfn_sb; pmem->data_offset = le64_to_cpu(pfn_sb->dataoff); @@ -358,6 +381,8 @@ static int pmem_attach_disk(struct device *dev, } else if (pmem_should_map_pages(dev)) { memcpy(&pmem->pgmap.res, &nsio->res, sizeof(pmem->pgmap.res)); pmem->pgmap.altmap_valid = false; + if (setup_pagemap_fsdax(dev, &pmem->pgmap)) + return -ENOMEM; addr = devm_memremap_pages(dev, &pmem->pgmap); pmem->pfn_flags |= PFN_MAP; memcpy(&bb_res, &pmem->pgmap.res, sizeof(bb_res)); diff --git a/fs/Kconfig b/fs/Kconfig index bc821a86d96519..1e050e012eb970 100644 --- a/fs/Kconfig +++ b/fs/Kconfig @@ -38,6 +38,7 @@ config FS_DAX bool "Direct Access (DAX) support" depends on MMU depends on !(ARM || MIPS || SPARC) + select DEV_PAGEMAP_OPS if (ZONE_DEVICE && !FS_DAX_LIMITED) select FS_IOMAP select DAX help diff --git a/fs/dax.c b/fs/dax.c index aaec72ded1b63c..31e9f51ac917e2 100644 --- a/fs/dax.c +++ b/fs/dax.c @@ -351,6 +351,19 @@ static void dax_disassociate_entry(void *entry, struct address_space *mapping, } } +static struct page *dax_busy_page(void *entry) +{ 
+        unsigned long pfn;
+
+        for_each_mapped_pfn(entry, pfn) {
+                struct page *page = pfn_to_page(pfn);
+
+                if (page_ref_count(page) > 1)
+                        return page;
+        }
+        return NULL;
+}
+
 /*
  * Find radix tree entry at given index. If it points to an exceptional entry,
  * return it with the radix tree entry locked. If the radix tree doesn't
@@ -492,6 +505,90 @@ static void *grab_mapping_entry(struct address_space *mapping, pgoff_t index,
         return entry;
 }
 
+/**
+ * dax_layout_busy_page - find first pinned page in @mapping
+ * @mapping: address space to scan for a page with ref count > 1
+ *
+ * DAX requires ZONE_DEVICE mapped pages. These pages are never
+ * 'onlined' to the page allocator so they are considered idle when
+ * page->count == 1. A filesystem uses this interface to determine if
+ * any page in the mapping is busy, i.e. for DMA, or other
+ * get_user_pages() usages.
+ *
+ * It is expected that the filesystem is holding locks to block the
+ * establishment of new mappings in this address_space. I.e. it expects
+ * to be able to run unmap_mapping_range() and subsequently not race
+ * mapping_mapped() becoming true.
+ */
+struct page *dax_layout_busy_page(struct address_space *mapping)
+{
+        pgoff_t indices[PAGEVEC_SIZE];
+        struct page *page = NULL;
+        struct pagevec pvec;
+        pgoff_t index, end;
+        unsigned i;
+
+        /*
+         * In the 'limited' case get_user_pages() for dax is disabled.
+         */
+        if (IS_ENABLED(CONFIG_FS_DAX_LIMITED))
+                return NULL;
+
+        if (!dax_mapping(mapping) || !mapping_mapped(mapping))
+                return NULL;
+
+        pagevec_init(&pvec);
+        index = 0;
+        end = -1;
+
+        /*
+         * If we race get_user_pages_fast() here either we'll see the
+         * elevated page count in the pagevec_lookup and wait, or
+         * get_user_pages_fast() will see that the page it took a reference
+         * against is no longer mapped in the page tables and bail to the
+         * get_user_pages() slow path. The slow path is protected by
+         * pte_lock() and pmd_lock(). New references are not taken without
+         * holding those locks, and unmap_mapping_range() will not zero the
+         * pte or pmd without holding the respective lock, so we are
+         * guaranteed to either see new references or prevent new
+         * references from being established.
+ */ + unmap_mapping_range(mapping, 0, 0, 1); + + while (index < end && pagevec_lookup_entries(&pvec, mapping, index, + min(end - index, (pgoff_t)PAGEVEC_SIZE), + indices)) { + for (i = 0; i < pagevec_count(&pvec); i++) { + struct page *pvec_ent = pvec.pages[i]; + void *entry; + + index = indices[i]; + if (index >= end) + break; + + if (!radix_tree_exceptional_entry(pvec_ent)) + continue; + + xa_lock_irq(&mapping->i_pages); + entry = get_unlocked_mapping_entry(mapping, index, NULL); + if (entry) + page = dax_busy_page(entry); + put_unlocked_mapping_entry(mapping, index, entry); + xa_unlock_irq(&mapping->i_pages); + if (page) + break; + } + pagevec_remove_exceptionals(&pvec); + pagevec_release(&pvec); + index++; + + if (page) + break; + } + return page; +} +EXPORT_SYMBOL_GPL(dax_layout_busy_page); + static int __dax_invalidate_mapping_entry(struct address_space *mapping, pgoff_t index, bool trunc) { @@ -912,7 +1009,6 @@ static int dax_load_hole(struct address_space *mapping, void *entry, unsigned long vaddr = vmf->address; int ret = VM_FAULT_NOPAGE; struct page *zero_page; - void *entry2; pfn_t pfn; zero_page = ZERO_PAGE(0); @@ -922,13 +1018,8 @@ static int dax_load_hole(struct address_space *mapping, void *entry, } pfn = page_to_pfn_t(zero_page); - entry2 = dax_insert_mapping_entry(mapping, vmf, entry, pfn, - RADIX_DAX_ZERO_PAGE, false); - if (IS_ERR(entry2)) { - ret = VM_FAULT_SIGBUS; - goto out; - } - + dax_insert_mapping_entry(mapping, vmf, entry, pfn, RADIX_DAX_ZERO_PAGE, + false); vm_insert_mixed(vmf->vma, vaddr, pfn); out: trace_dax_load_hole(inode, vmf, ret); @@ -1240,10 +1331,6 @@ static int dax_iomap_pte_fault(struct vm_fault *vmf, pfn_t *pfnp, entry = dax_insert_mapping_entry(mapping, vmf, entry, pfn, 0, write && !sync); - if (IS_ERR(entry)) { - error = PTR_ERR(entry); - goto error_finish_iomap; - } /* * If we are doing synchronous page fault and inode needs fsync, @@ -1327,8 +1414,6 @@ static int dax_pmd_load_hole(struct vm_fault *vmf, struct iomap *iomap, pfn = page_to_pfn_t(zero_page); ret = dax_insert_mapping_entry(mapping, vmf, entry, pfn, RADIX_DAX_PMD | RADIX_DAX_ZERO_PAGE, false); - if (IS_ERR(ret)) - goto fallback; ptl = pmd_lock(vmf->vma->vm_mm, vmf->pmd); if (!pmd_none(*(vmf->pmd))) { @@ -1450,8 +1535,6 @@ static int dax_iomap_pmd_fault(struct vm_fault *vmf, pfn_t *pfnp, entry = dax_insert_mapping_entry(mapping, vmf, entry, pfn, RADIX_DAX_PMD, write && !sync); - if (IS_ERR(entry)) - goto finish_iomap; /* * If we are doing synchronous page fault and inode needs fsync, diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c index e70fb8cceceaa5..19b0c3e0e23220 100644 --- a/fs/xfs/xfs_file.c +++ b/fs/xfs/xfs_file.c @@ -312,7 +312,7 @@ xfs_file_aio_write_checks( if (error <= 0) return error; - error = xfs_break_layouts(inode, iolock); + error = xfs_break_layouts(inode, iolock, BREAK_WRITE); if (error) return error; @@ -718,6 +718,69 @@ xfs_file_write_iter( return ret; } +static void +xfs_wait_dax_page( + struct inode *inode, + bool *did_unlock) +{ + struct xfs_inode *ip = XFS_I(inode); + + *did_unlock = true; + xfs_iunlock(ip, XFS_MMAPLOCK_EXCL); + schedule(); + xfs_ilock(ip, XFS_MMAPLOCK_EXCL); +} + +static int +xfs_break_dax_layouts( + struct inode *inode, + uint iolock, + bool *did_unlock) +{ + struct page *page; + + ASSERT(xfs_isilocked(XFS_I(inode), XFS_MMAPLOCK_EXCL)); + + page = dax_layout_busy_page(inode->i_mapping); + if (!page) + return 0; + + return ___wait_var_event(&page->_refcount, + atomic_read(&page->_refcount) == 1, TASK_INTERRUPTIBLE, + 0, 0, 
xfs_wait_dax_page(inode, did_unlock)); +} + +int +xfs_break_layouts( + struct inode *inode, + uint *iolock, + enum layout_break_reason reason) +{ + bool retry; + int error; + + ASSERT(xfs_isilocked(XFS_I(inode), XFS_IOLOCK_SHARED|XFS_IOLOCK_EXCL)); + + do { + retry = false; + switch (reason) { + case BREAK_UNMAP: + error = xfs_break_dax_layouts(inode, *iolock, &retry); + if (error || retry) + break; + /* fall through */ + case BREAK_WRITE: + error = xfs_break_leased_layouts(inode, iolock, &retry); + break; + default: + WARN_ON_ONCE(1); + error = -EINVAL; + } + } while (error == 0 && retry); + + return error; +} + #define XFS_FALLOC_FL_SUPPORTED \ (FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE | \ FALLOC_FL_COLLAPSE_RANGE | FALLOC_FL_ZERO_RANGE | \ @@ -734,7 +797,7 @@ xfs_file_fallocate( struct xfs_inode *ip = XFS_I(inode); long error; enum xfs_prealloc_flags flags = 0; - uint iolock = XFS_IOLOCK_EXCL; + uint iolock = XFS_IOLOCK_EXCL | XFS_MMAPLOCK_EXCL; loff_t new_size = 0; bool do_file_insert = false; @@ -744,13 +807,10 @@ xfs_file_fallocate( return -EOPNOTSUPP; xfs_ilock(ip, iolock); - error = xfs_break_layouts(inode, &iolock); + error = xfs_break_layouts(inode, &iolock, BREAK_UNMAP); if (error) goto out_unlock; - xfs_ilock(ip, XFS_MMAPLOCK_EXCL); - iolock |= XFS_MMAPLOCK_EXCL; - if (mode & FALLOC_FL_PUNCH_HOLE) { error = xfs_free_file_space(ip, offset, len); if (error) diff --git a/fs/xfs/xfs_inode.h b/fs/xfs/xfs_inode.h index 1eebc53df7d72f..e5b849815ce186 100644 --- a/fs/xfs/xfs_inode.h +++ b/fs/xfs/xfs_inode.h @@ -378,6 +378,20 @@ static inline void xfs_ifunlock(struct xfs_inode *ip) #define XFS_ILOCK_DEP(flags) (((flags) & XFS_ILOCK_DEP_MASK) \ >> XFS_ILOCK_SHIFT) +/* + * Layouts are broken in the BREAK_WRITE case to ensure that + * layout-holders do not collide with local writes. Additionally, + * layouts are broken in the BREAK_UNMAP case to make sure the + * layout-holder has a consistent view of the file's extent map. While + * BREAK_WRITE breaks can be satisfied by recalling FL_LAYOUT leases, + * BREAK_UNMAP breaks additionally require waiting for busy dax-pages to + * go idle. 
+ */ +enum layout_break_reason { + BREAK_WRITE, + BREAK_UNMAP, +}; + /* * For multiple groups support: if S_ISGID bit is set in the parent * directory, group of new file is set to that of the parent, and @@ -443,6 +457,8 @@ enum xfs_prealloc_flags { int xfs_update_prealloc_flags(struct xfs_inode *ip, enum xfs_prealloc_flags flags); +int xfs_break_layouts(struct inode *inode, uint *iolock, + enum layout_break_reason reason); /* from xfs_iops.c */ extern void xfs_setup_inode(struct xfs_inode *ip); diff --git a/fs/xfs/xfs_ioctl.c b/fs/xfs/xfs_ioctl.c index 89fb1eb80aae88..91e73d66309904 100644 --- a/fs/xfs/xfs_ioctl.c +++ b/fs/xfs/xfs_ioctl.c @@ -39,7 +39,6 @@ #include "xfs_icache.h" #include "xfs_symlink.h" #include "xfs_trans.h" -#include "xfs_pnfs.h" #include "xfs_acl.h" #include "xfs_btree.h" #include @@ -614,7 +613,7 @@ xfs_ioc_space( struct xfs_inode *ip = XFS_I(inode); struct iattr iattr; enum xfs_prealloc_flags flags = 0; - uint iolock = XFS_IOLOCK_EXCL; + uint iolock = XFS_IOLOCK_EXCL | XFS_MMAPLOCK_EXCL; int error; /* @@ -644,13 +643,10 @@ xfs_ioc_space( return error; xfs_ilock(ip, iolock); - error = xfs_break_layouts(inode, &iolock); + error = xfs_break_layouts(inode, &iolock, BREAK_UNMAP); if (error) goto out_unlock; - xfs_ilock(ip, XFS_MMAPLOCK_EXCL); - iolock |= XFS_MMAPLOCK_EXCL; - switch (bf->l_whence) { case 0: /*SEEK_SET*/ break; diff --git a/fs/xfs/xfs_iops.c b/fs/xfs/xfs_iops.c index a3ed3c811dfa4c..ce0c1f9466a8ca 100644 --- a/fs/xfs/xfs_iops.c +++ b/fs/xfs/xfs_iops.c @@ -37,7 +37,6 @@ #include "xfs_da_btree.h" #include "xfs_dir2.h" #include "xfs_trans_space.h" -#include "xfs_pnfs.h" #include "xfs_iomap.h" #include @@ -1030,14 +1029,19 @@ xfs_vn_setattr( int error; if (iattr->ia_valid & ATTR_SIZE) { - struct xfs_inode *ip = XFS_I(d_inode(dentry)); - uint iolock = XFS_IOLOCK_EXCL; + struct inode *inode = d_inode(dentry); + struct xfs_inode *ip = XFS_I(inode); + uint iolock; - error = xfs_break_layouts(d_inode(dentry), &iolock); - if (error) + xfs_ilock(ip, XFS_MMAPLOCK_EXCL); + iolock = XFS_IOLOCK_EXCL | XFS_MMAPLOCK_EXCL; + + error = xfs_break_layouts(inode, &iolock, BREAK_UNMAP); + if (error) { + xfs_iunlock(ip, XFS_MMAPLOCK_EXCL); return error; + } - xfs_ilock(ip, XFS_MMAPLOCK_EXCL); error = xfs_vn_setattr_size(dentry, iattr); xfs_iunlock(ip, XFS_MMAPLOCK_EXCL); } else { diff --git a/fs/xfs/xfs_pnfs.c b/fs/xfs/xfs_pnfs.c index aa6c5c193f4581..f44c3599527d07 100644 --- a/fs/xfs/xfs_pnfs.c +++ b/fs/xfs/xfs_pnfs.c @@ -31,19 +31,20 @@ * rules in the page fault path we don't bother. */ int -xfs_break_layouts( +xfs_break_leased_layouts( struct inode *inode, - uint *iolock) + uint *iolock, + bool *did_unlock) { struct xfs_inode *ip = XFS_I(inode); int error; - ASSERT(xfs_isilocked(ip, XFS_IOLOCK_SHARED|XFS_IOLOCK_EXCL)); - while ((error = break_layout(inode, false) == -EWOULDBLOCK)) { xfs_iunlock(ip, *iolock); + *did_unlock = true; error = break_layout(inode, true); - *iolock = XFS_IOLOCK_EXCL; + *iolock &= ~XFS_IOLOCK_SHARED; + *iolock |= XFS_IOLOCK_EXCL; xfs_ilock(ip, *iolock); } @@ -120,8 +121,8 @@ xfs_fs_map_blocks( * Lock out any other I/O before we flush and invalidate the pagecache, * and then hand out a layout to the remote system. This is very * similar to direct I/O, except that the synchronization is much more - * complicated. See the comment near xfs_break_layouts for a detailed - * explanation. + * complicated. See the comment near xfs_break_leased_layouts + * for a detailed explanation. 
          */
         xfs_ilock(ip, XFS_IOLOCK_EXCL);
 
diff --git a/fs/xfs/xfs_pnfs.h b/fs/xfs/xfs_pnfs.h
index bf45951e28fe54..940c6c2ad88c55 100644
--- a/fs/xfs/xfs_pnfs.h
+++ b/fs/xfs/xfs_pnfs.h
@@ -9,10 +9,11 @@ int xfs_fs_map_blocks(struct inode *inode, loff_t offset, u64 length,
 int xfs_fs_commit_blocks(struct inode *inode, struct iomap *maps, int nr_maps,
                 struct iattr *iattr);
 
-int xfs_break_layouts(struct inode *inode, uint *iolock);
+int xfs_break_leased_layouts(struct inode *inode, uint *iolock,
+                bool *did_unlock);
 #else
 static inline int
-xfs_break_layouts(struct inode *inode, uint *iolock)
+xfs_break_leased_layouts(struct inode *inode, uint *iolock, bool *did_unlock)
 {
         return 0;
 }
diff --git a/include/linux/dax.h b/include/linux/dax.h
index f9eb22ad341e4a..25bab6abb69523 100644
--- a/include/linux/dax.h
+++ b/include/linux/dax.h
@@ -83,6 +83,8 @@ static inline void fs_put_dax(struct dax_device *dax_dev)
 struct dax_device *fs_dax_get_by_bdev(struct block_device *bdev);
 int dax_writeback_mapping_range(struct address_space *mapping,
                 struct block_device *bdev, struct writeback_control *wbc);
+
+struct page *dax_layout_busy_page(struct address_space *mapping);
 #else
 static inline int bdev_dax_supported(struct super_block *sb, int blocksize)
 {
@@ -103,6 +105,11 @@ static inline struct dax_device *fs_dax_get_by_bdev(struct block_device *bdev)
         return NULL;
 }
 
+static inline struct page *dax_layout_busy_page(struct address_space *mapping)
+{
+        return NULL;
+}
+
 static inline int dax_writeback_mapping_range(struct address_space *mapping,
                 struct block_device *bdev, struct writeback_control *wbc)
 {
diff --git a/include/linux/memremap.h b/include/linux/memremap.h
index 7b4899c06f49c7..5ebfff65da4d4a 100644
--- a/include/linux/memremap.h
+++ b/include/linux/memremap.h
@@ -1,7 +1,6 @@
 /* SPDX-License-Identifier: GPL-2.0 */
 #ifndef _LINUX_MEMREMAP_H_
 #define _LINUX_MEMREMAP_H_
-#include
 #include
 
 #include
@@ -30,13 +29,6 @@ struct vmem_altmap {
  * Specialize ZONE_DEVICE memory into multiple types each having differents
  * usage.
  *
- * MEMORY_DEVICE_HOST:
- * Persistent device memory (pmem): struct page might be allocated in different
- * memory and architecture might want to perform special actions. It is similar
- * to regular memory, in that the CPU can access it transparently. However,
- * it is likely to have different bandwidth and latency than regular memory.
- * See Documentation/nvdimm/nvdimm.txt for more information.
- *
  * MEMORY_DEVICE_PRIVATE:
  * Device memory that is not directly addressable by the CPU: CPU can neither
  * read nor write private memory. In this case, we do still have struct pages
@@ -53,11 +45,19 @@ struct vmem_altmap {
  * driver can hotplug the device memory using ZONE_DEVICE and with that memory
  * type. Any page of a process can be migrated to such memory. However no one
  * should be allow to pin such memory so that it can always be evicted.
+ *
+ * MEMORY_DEVICE_FS_DAX:
+ * Host memory that has similar access semantics as System RAM i.e. DMA
+ * coherent and supports page pinning. In support of coordinating page
+ * pinning vs other operations MEMORY_DEVICE_FS_DAX arranges for a
+ * wakeup event whenever a page is unpinned and becomes idle. This
+ * wakeup is used to coordinate physical address space management (ex:
+ * fs truncate/hole punch) vs pinned pages (ex: device dma).
*/ enum memory_type { - MEMORY_DEVICE_HOST = 0, - MEMORY_DEVICE_PRIVATE, + MEMORY_DEVICE_PRIVATE = 1, MEMORY_DEVICE_PUBLIC, + MEMORY_DEVICE_FS_DAX, }; /* @@ -129,8 +129,6 @@ struct dev_pagemap *get_dev_pagemap(unsigned long pfn, unsigned long vmem_altmap_offset(struct vmem_altmap *altmap); void vmem_altmap_free(struct vmem_altmap *altmap, unsigned long nr_pfns); - -static inline bool is_zone_device_page(const struct page *page); #else static inline void *devm_memremap_pages(struct device *dev, struct dev_pagemap *pgmap) @@ -161,20 +159,6 @@ static inline void vmem_altmap_free(struct vmem_altmap *altmap, } #endif /* CONFIG_ZONE_DEVICE */ -#if defined(CONFIG_DEVICE_PRIVATE) || defined(CONFIG_DEVICE_PUBLIC) -static inline bool is_device_private_page(const struct page *page) -{ - return is_zone_device_page(page) && - page->pgmap->type == MEMORY_DEVICE_PRIVATE; -} - -static inline bool is_device_public_page(const struct page *page) -{ - return is_zone_device_page(page) && - page->pgmap->type == MEMORY_DEVICE_PUBLIC; -} -#endif /* CONFIG_DEVICE_PRIVATE || CONFIG_DEVICE_PUBLIC */ - static inline void put_dev_pagemap(struct dev_pagemap *pgmap) { if (pgmap) diff --git a/include/linux/mm.h b/include/linux/mm.h index 02a616e2f17d0f..274d5242bd0d84 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -821,27 +821,65 @@ static inline bool is_zone_device_page(const struct page *page) } #endif -#if defined(CONFIG_DEVICE_PRIVATE) || defined(CONFIG_DEVICE_PUBLIC) -void put_zone_device_private_or_public_page(struct page *page); -DECLARE_STATIC_KEY_FALSE(device_private_key); -#define IS_HMM_ENABLED static_branch_unlikely(&device_private_key) -static inline bool is_device_private_page(const struct page *page); -static inline bool is_device_public_page(const struct page *page); -#else /* CONFIG_DEVICE_PRIVATE || CONFIG_DEVICE_PUBLIC */ -static inline void put_zone_device_private_or_public_page(struct page *page) +#ifdef CONFIG_DEV_PAGEMAP_OPS +void dev_pagemap_get_ops(void); +void dev_pagemap_put_ops(void); +void __put_devmap_managed_page(struct page *page); +DECLARE_STATIC_KEY_FALSE(devmap_managed_key); +static inline bool put_devmap_managed_page(struct page *page) +{ + if (!static_branch_unlikely(&devmap_managed_key)) + return false; + if (!is_zone_device_page(page)) + return false; + switch (page->pgmap->type) { + case MEMORY_DEVICE_PRIVATE: + case MEMORY_DEVICE_PUBLIC: + case MEMORY_DEVICE_FS_DAX: + __put_devmap_managed_page(page); + return true; + default: + break; + } + return false; +} + +static inline bool is_device_private_page(const struct page *page) { + return is_zone_device_page(page) && + page->pgmap->type == MEMORY_DEVICE_PRIVATE; } -#define IS_HMM_ENABLED 0 + +static inline bool is_device_public_page(const struct page *page) +{ + return is_zone_device_page(page) && + page->pgmap->type == MEMORY_DEVICE_PUBLIC; +} + +#else /* CONFIG_DEV_PAGEMAP_OPS */ +static inline void dev_pagemap_get_ops(void) +{ +} + +static inline void dev_pagemap_put_ops(void) +{ +} + +static inline bool put_devmap_managed_page(struct page *page) +{ + return false; +} + static inline bool is_device_private_page(const struct page *page) { return false; } + static inline bool is_device_public_page(const struct page *page) { return false; } -#endif /* CONFIG_DEVICE_PRIVATE || CONFIG_DEVICE_PUBLIC */ - +#endif /* CONFIG_DEV_PAGEMAP_OPS */ static inline void get_page(struct page *page) { @@ -859,16 +897,13 @@ static inline void put_page(struct page *page) page = compound_head(page); /* - * For private device pages we need 
to catch refcount transition from - * 2 to 1, when refcount reach one it means the private device page is - * free and we need to inform the device driver through callback. See + * For devmap managed pages we need to catch refcount transition from + * 2 to 1, when refcount reach one it means the page is free and we + * need to inform the device driver through callback. See * include/linux/memremap.h and HMM for details. */ - if (IS_HMM_ENABLED && unlikely(is_device_private_page(page) || - unlikely(is_device_public_page(page)))) { - put_zone_device_private_or_public_page(page); + if (put_devmap_managed_page(page)) return; - } if (put_page_testzero(page)) __put_page(page); diff --git a/kernel/Makefile b/kernel/Makefile index f85ae5dfa47400..9b924136131158 100644 --- a/kernel/Makefile +++ b/kernel/Makefile @@ -112,7 +112,8 @@ obj-$(CONFIG_JUMP_LABEL) += jump_label.o obj-$(CONFIG_CONTEXT_TRACKING) += context_tracking.o obj-$(CONFIG_TORTURE_TEST) += torture.o -obj-$(CONFIG_HAS_IOMEM) += memremap.o +obj-$(CONFIG_HAS_IOMEM) += iomem.o +obj-$(CONFIG_ZONE_DEVICE) += memremap.o $(obj)/configs.o: $(obj)/config_data.h diff --git a/kernel/iomem.c b/kernel/iomem.c new file mode 100644 index 00000000000000..f7525e14ebc6f1 --- /dev/null +++ b/kernel/iomem.c @@ -0,0 +1,167 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#include +#include +#include +#include + +#ifndef ioremap_cache +/* temporary while we convert existing ioremap_cache users to memremap */ +__weak void __iomem *ioremap_cache(resource_size_t offset, unsigned long size) +{ + return ioremap(offset, size); +} +#endif + +#ifndef arch_memremap_wb +static void *arch_memremap_wb(resource_size_t offset, unsigned long size) +{ + return (__force void *)ioremap_cache(offset, size); +} +#endif + +#ifndef arch_memremap_can_ram_remap +static bool arch_memremap_can_ram_remap(resource_size_t offset, size_t size, + unsigned long flags) +{ + return true; +} +#endif + +static void *try_ram_remap(resource_size_t offset, size_t size, + unsigned long flags) +{ + unsigned long pfn = PHYS_PFN(offset); + + /* In the simple case just return the existing linear address */ + if (pfn_valid(pfn) && !PageHighMem(pfn_to_page(pfn)) && + arch_memremap_can_ram_remap(offset, size, flags)) + return __va(offset); + + return NULL; /* fallback to arch_memremap_wb */ +} + +/** + * memremap() - remap an iomem_resource as cacheable memory + * @offset: iomem resource start address + * @size: size of remap + * @flags: any of MEMREMAP_WB, MEMREMAP_WT, MEMREMAP_WC, + * MEMREMAP_ENC, MEMREMAP_DEC + * + * memremap() is "ioremap" for cases where it is known that the resource + * being mapped does not have i/o side effects and the __iomem + * annotation is not applicable. In the case of multiple flags, the different + * mapping types will be attempted in the order listed below until one of + * them succeeds. + * + * MEMREMAP_WB - matches the default mapping for System RAM on + * the architecture. This is usually a read-allocate write-back cache. + * Morever, if MEMREMAP_WB is specified and the requested remap region is RAM + * memremap() will bypass establishing a new mapping and instead return + * a pointer into the direct map. + * + * MEMREMAP_WT - establish a mapping whereby writes either bypass the + * cache or are written through to memory and never exist in a + * cache-dirty state with respect to program visibility. Attempts to + * map System RAM with this mapping type will fail. + * + * MEMREMAP_WC - establish a writecombine mapping, whereby writes may + * be coalesced together (e.g. 
in the CPU's write buffers), but is otherwise + * uncached. Attempts to map System RAM with this mapping type will fail. + */ +void *memremap(resource_size_t offset, size_t size, unsigned long flags) +{ + int is_ram = region_intersects(offset, size, + IORESOURCE_SYSTEM_RAM, IORES_DESC_NONE); + void *addr = NULL; + + if (!flags) + return NULL; + + if (is_ram == REGION_MIXED) { + WARN_ONCE(1, "memremap attempted on mixed range %pa size: %#lx\n", + &offset, (unsigned long) size); + return NULL; + } + + /* Try all mapping types requested until one returns non-NULL */ + if (flags & MEMREMAP_WB) { + /* + * MEMREMAP_WB is special in that it can be satisifed + * from the direct map. Some archs depend on the + * capability of memremap() to autodetect cases where + * the requested range is potentially in System RAM. + */ + if (is_ram == REGION_INTERSECTS) + addr = try_ram_remap(offset, size, flags); + if (!addr) + addr = arch_memremap_wb(offset, size); + } + + /* + * If we don't have a mapping yet and other request flags are + * present then we will be attempting to establish a new virtual + * address mapping. Enforce that this mapping is not aliasing + * System RAM. + */ + if (!addr && is_ram == REGION_INTERSECTS && flags != MEMREMAP_WB) { + WARN_ONCE(1, "memremap attempted on ram %pa size: %#lx\n", + &offset, (unsigned long) size); + return NULL; + } + + if (!addr && (flags & MEMREMAP_WT)) + addr = ioremap_wt(offset, size); + + if (!addr && (flags & MEMREMAP_WC)) + addr = ioremap_wc(offset, size); + + return addr; +} +EXPORT_SYMBOL(memremap); + +void memunmap(void *addr) +{ + if (is_vmalloc_addr(addr)) + iounmap((void __iomem *) addr); +} +EXPORT_SYMBOL(memunmap); + +static void devm_memremap_release(struct device *dev, void *res) +{ + memunmap(*(void **)res); +} + +static int devm_memremap_match(struct device *dev, void *res, void *match_data) +{ + return *(void **)res == match_data; +} + +void *devm_memremap(struct device *dev, resource_size_t offset, + size_t size, unsigned long flags) +{ + void **ptr, *addr; + + ptr = devres_alloc_node(devm_memremap_release, sizeof(*ptr), GFP_KERNEL, + dev_to_node(dev)); + if (!ptr) + return ERR_PTR(-ENOMEM); + + addr = memremap(offset, size, flags); + if (addr) { + *ptr = addr; + devres_add(dev, ptr); + } else { + devres_free(ptr); + return ERR_PTR(-ENXIO); + } + + return addr; +} +EXPORT_SYMBOL(devm_memremap); + +void devm_memunmap(struct device *dev, void *addr) +{ + WARN_ON(devres_release(dev, devm_memremap_release, + devm_memremap_match, addr)); +} +EXPORT_SYMBOL(devm_memunmap); diff --git a/kernel/memremap.c b/kernel/memremap.c index 895e6b76b25e06..5857267a4af5da 100644 --- a/kernel/memremap.c +++ b/kernel/memremap.c @@ -1,15 +1,5 @@ -/* - * Copyright(c) 2015 Intel Corporation. All rights reserved. - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of version 2 of the GNU General Public License as - * published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, but - * WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License for more details. - */ +/* SPDX-License-Identifier: GPL-2.0 */ +/* Copyright(c) 2015 Intel Corporation. All rights reserved. 
*/ #include #include #include @@ -19,170 +9,8 @@ #include #include #include +#include -#ifndef ioremap_cache -/* temporary while we convert existing ioremap_cache users to memremap */ -__weak void __iomem *ioremap_cache(resource_size_t offset, unsigned long size) -{ - return ioremap(offset, size); -} -#endif - -#ifndef arch_memremap_wb -static void *arch_memremap_wb(resource_size_t offset, unsigned long size) -{ - return (__force void *)ioremap_cache(offset, size); -} -#endif - -#ifndef arch_memremap_can_ram_remap -static bool arch_memremap_can_ram_remap(resource_size_t offset, size_t size, - unsigned long flags) -{ - return true; -} -#endif - -static void *try_ram_remap(resource_size_t offset, size_t size, - unsigned long flags) -{ - unsigned long pfn = PHYS_PFN(offset); - - /* In the simple case just return the existing linear address */ - if (pfn_valid(pfn) && !PageHighMem(pfn_to_page(pfn)) && - arch_memremap_can_ram_remap(offset, size, flags)) - return __va(offset); - - return NULL; /* fallback to arch_memremap_wb */ -} - -/** - * memremap() - remap an iomem_resource as cacheable memory - * @offset: iomem resource start address - * @size: size of remap - * @flags: any of MEMREMAP_WB, MEMREMAP_WT, MEMREMAP_WC, - * MEMREMAP_ENC, MEMREMAP_DEC - * - * memremap() is "ioremap" for cases where it is known that the resource - * being mapped does not have i/o side effects and the __iomem - * annotation is not applicable. In the case of multiple flags, the different - * mapping types will be attempted in the order listed below until one of - * them succeeds. - * - * MEMREMAP_WB - matches the default mapping for System RAM on - * the architecture. This is usually a read-allocate write-back cache. - * Morever, if MEMREMAP_WB is specified and the requested remap region is RAM - * memremap() will bypass establishing a new mapping and instead return - * a pointer into the direct map. - * - * MEMREMAP_WT - establish a mapping whereby writes either bypass the - * cache or are written through to memory and never exist in a - * cache-dirty state with respect to program visibility. Attempts to - * map System RAM with this mapping type will fail. - * - * MEMREMAP_WC - establish a writecombine mapping, whereby writes may - * be coalesced together (e.g. in the CPU's write buffers), but is otherwise - * uncached. Attempts to map System RAM with this mapping type will fail. - */ -void *memremap(resource_size_t offset, size_t size, unsigned long flags) -{ - int is_ram = region_intersects(offset, size, - IORESOURCE_SYSTEM_RAM, IORES_DESC_NONE); - void *addr = NULL; - - if (!flags) - return NULL; - - if (is_ram == REGION_MIXED) { - WARN_ONCE(1, "memremap attempted on mixed range %pa size: %#lx\n", - &offset, (unsigned long) size); - return NULL; - } - - /* Try all mapping types requested until one returns non-NULL */ - if (flags & MEMREMAP_WB) { - /* - * MEMREMAP_WB is special in that it can be satisifed - * from the direct map. Some archs depend on the - * capability of memremap() to autodetect cases where - * the requested range is potentially in System RAM. - */ - if (is_ram == REGION_INTERSECTS) - addr = try_ram_remap(offset, size, flags); - if (!addr) - addr = arch_memremap_wb(offset, size); - } - - /* - * If we don't have a mapping yet and other request flags are - * present then we will be attempting to establish a new virtual - * address mapping. Enforce that this mapping is not aliasing - * System RAM. 
- */ - if (!addr && is_ram == REGION_INTERSECTS && flags != MEMREMAP_WB) { - WARN_ONCE(1, "memremap attempted on ram %pa size: %#lx\n", - &offset, (unsigned long) size); - return NULL; - } - - if (!addr && (flags & MEMREMAP_WT)) - addr = ioremap_wt(offset, size); - - if (!addr && (flags & MEMREMAP_WC)) - addr = ioremap_wc(offset, size); - - return addr; -} -EXPORT_SYMBOL(memremap); - -void memunmap(void *addr) -{ - if (is_vmalloc_addr(addr)) - iounmap((void __iomem *) addr); -} -EXPORT_SYMBOL(memunmap); - -static void devm_memremap_release(struct device *dev, void *res) -{ - memunmap(*(void **)res); -} - -static int devm_memremap_match(struct device *dev, void *res, void *match_data) -{ - return *(void **)res == match_data; -} - -void *devm_memremap(struct device *dev, resource_size_t offset, - size_t size, unsigned long flags) -{ - void **ptr, *addr; - - ptr = devres_alloc_node(devm_memremap_release, sizeof(*ptr), GFP_KERNEL, - dev_to_node(dev)); - if (!ptr) - return ERR_PTR(-ENOMEM); - - addr = memremap(offset, size, flags); - if (addr) { - *ptr = addr; - devres_add(dev, ptr); - } else { - devres_free(ptr); - return ERR_PTR(-ENXIO); - } - - return addr; -} -EXPORT_SYMBOL(devm_memremap); - -void devm_memunmap(struct device *dev, void *addr) -{ - WARN_ON(devres_release(dev, devm_memremap_release, - devm_memremap_match, addr)); -} -EXPORT_SYMBOL(devm_memunmap); - -#ifdef CONFIG_ZONE_DEVICE static DEFINE_MUTEX(pgmap_lock); static RADIX_TREE(pgmap_radix, GFP_KERNEL); #define SECTION_MASK ~((1UL << PA_SECTION_SHIFT) - 1) @@ -473,10 +301,32 @@ struct dev_pagemap *get_dev_pagemap(unsigned long pfn, return pgmap; } -#endif /* CONFIG_ZONE_DEVICE */ +EXPORT_SYMBOL_GPL(get_dev_pagemap); + +#ifdef CONFIG_DEV_PAGEMAP_OPS +DEFINE_STATIC_KEY_FALSE(devmap_managed_key); +EXPORT_SYMBOL_GPL(devmap_managed_key); +static atomic_t devmap_enable; + +/* + * Toggle the static key for ->page_free() callbacks when dev_pagemap + * pages go idle. 
+ */ +void dev_pagemap_get_ops(void) +{ + if (atomic_inc_return(&devmap_enable) == 1) + static_branch_enable(&devmap_managed_key); +} +EXPORT_SYMBOL_GPL(dev_pagemap_get_ops); + +void dev_pagemap_put_ops(void) +{ + if (atomic_dec_and_test(&devmap_enable)) + static_branch_disable(&devmap_managed_key); +} +EXPORT_SYMBOL_GPL(dev_pagemap_put_ops); -#if IS_ENABLED(CONFIG_DEVICE_PRIVATE) || IS_ENABLED(CONFIG_DEVICE_PUBLIC) -void put_zone_device_private_or_public_page(struct page *page) +void __put_devmap_managed_page(struct page *page) { int count = page_ref_dec_return(page); @@ -496,5 +346,5 @@ void put_zone_device_private_or_public_page(struct page *page) } else if (!count) __put_page(page); } -EXPORT_SYMBOL(put_zone_device_private_or_public_page); -#endif /* CONFIG_DEVICE_PRIVATE || CONFIG_DEVICE_PUBLIC */ +EXPORT_SYMBOL_GPL(__put_devmap_managed_page); +#endif /* CONFIG_DEV_PAGEMAP_OPS */ diff --git a/mm/Kconfig b/mm/Kconfig index e14c01513bfd07..5f39bca5d82b00 100644 --- a/mm/Kconfig +++ b/mm/Kconfig @@ -693,6 +693,9 @@ config ARCH_HAS_HMM config MIGRATE_VMA_HELPER bool +config DEV_PAGEMAP_OPS + bool + config HMM bool select MIGRATE_VMA_HELPER @@ -713,6 +716,7 @@ config DEVICE_PRIVATE bool "Unaddressable device memory (GPU memory, ...)" depends on ARCH_HAS_HMM select HMM + select DEV_PAGEMAP_OPS help Allows creation of struct pages to represent unaddressable device @@ -723,6 +727,7 @@ config DEVICE_PUBLIC bool "Addressable device memory (like GPU memory)" depends on ARCH_HAS_HMM select HMM + select DEV_PAGEMAP_OPS help Allows creation of struct pages to represent addressable device diff --git a/mm/gup.c b/mm/gup.c index 541904a7c60fd5..3d8472d48a0b88 100644 --- a/mm/gup.c +++ b/mm/gup.c @@ -1459,32 +1459,48 @@ static int __gup_device_huge(unsigned long pfn, unsigned long addr, return 1; } -static int __gup_device_huge_pmd(pmd_t pmd, unsigned long addr, +static int __gup_device_huge_pmd(pmd_t orig, pmd_t *pmdp, unsigned long addr, unsigned long end, struct page **pages, int *nr) { unsigned long fault_pfn; + int nr_start = *nr; + + fault_pfn = pmd_pfn(orig) + ((addr & ~PMD_MASK) >> PAGE_SHIFT); + if (!__gup_device_huge(fault_pfn, addr, end, pages, nr)) + return 0; - fault_pfn = pmd_pfn(pmd) + ((addr & ~PMD_MASK) >> PAGE_SHIFT); - return __gup_device_huge(fault_pfn, addr, end, pages, nr); + if (unlikely(pmd_val(orig) != pmd_val(*pmdp))) { + undo_dev_pagemap(nr, nr_start, pages); + return 0; + } + return 1; } -static int __gup_device_huge_pud(pud_t pud, unsigned long addr, +static int __gup_device_huge_pud(pud_t orig, pud_t *pudp, unsigned long addr, unsigned long end, struct page **pages, int *nr) { unsigned long fault_pfn; + int nr_start = *nr; + + fault_pfn = pud_pfn(orig) + ((addr & ~PUD_MASK) >> PAGE_SHIFT); + if (!__gup_device_huge(fault_pfn, addr, end, pages, nr)) + return 0; - fault_pfn = pud_pfn(pud) + ((addr & ~PUD_MASK) >> PAGE_SHIFT); - return __gup_device_huge(fault_pfn, addr, end, pages, nr); + if (unlikely(pud_val(orig) != pud_val(*pudp))) { + undo_dev_pagemap(nr, nr_start, pages); + return 0; + } + return 1; } #else -static int __gup_device_huge_pmd(pmd_t pmd, unsigned long addr, +static int __gup_device_huge_pmd(pmd_t orig, pmd_t *pmdp, unsigned long addr, unsigned long end, struct page **pages, int *nr) { BUILD_BUG(); return 0; } -static int __gup_device_huge_pud(pud_t pud, unsigned long addr, +static int __gup_device_huge_pud(pud_t pud, pud_t *pudp, unsigned long addr, unsigned long end, struct page **pages, int *nr) { BUILD_BUG(); @@ -1502,7 +1518,7 @@ static int 
gup_huge_pmd(pmd_t orig, pmd_t *pmdp, unsigned long addr, return 0; if (pmd_devmap(orig)) - return __gup_device_huge_pmd(orig, addr, end, pages, nr); + return __gup_device_huge_pmd(orig, pmdp, addr, end, pages, nr); refs = 0; page = pmd_page(orig) + ((addr & ~PMD_MASK) >> PAGE_SHIFT); @@ -1540,7 +1556,7 @@ static int gup_huge_pud(pud_t orig, pud_t *pudp, unsigned long addr, return 0; if (pud_devmap(orig)) - return __gup_device_huge_pud(orig, addr, end, pages, nr); + return __gup_device_huge_pud(orig, pudp, addr, end, pages, nr); refs = 0; page = pud_page(orig) + ((addr & ~PUD_MASK) >> PAGE_SHIFT); diff --git a/mm/hmm.c b/mm/hmm.c index 486dc394a5a3cd..de7b6bf7720104 100644 --- a/mm/hmm.c +++ b/mm/hmm.c @@ -35,15 +35,6 @@ #define PA_SECTION_SIZE (1UL << PA_SECTION_SHIFT) -#if defined(CONFIG_DEVICE_PRIVATE) || defined(CONFIG_DEVICE_PUBLIC) -/* - * Device private memory see HMM (Documentation/vm/hmm.txt) or hmm.h - */ -DEFINE_STATIC_KEY_FALSE(device_private_key); -EXPORT_SYMBOL(device_private_key); -#endif /* CONFIG_DEVICE_PRIVATE || CONFIG_DEVICE_PUBLIC */ - - #if IS_ENABLED(CONFIG_HMM_MIRROR) static const struct mmu_notifier_ops hmm_mmu_notifier_ops; @@ -1167,7 +1158,7 @@ struct hmm_devmem *hmm_devmem_add(const struct hmm_devmem_ops *ops, resource_size_t addr; int ret; - static_branch_enable(&device_private_key); + dev_pagemap_get_ops(); devmem = devres_alloc_node(&hmm_devmem_release, sizeof(*devmem), GFP_KERNEL, dev_to_node(device)); @@ -1261,7 +1252,7 @@ struct hmm_devmem *hmm_devmem_add_resource(const struct hmm_devmem_ops *ops, if (res->desc != IORES_DESC_DEVICE_PUBLIC_MEMORY) return ERR_PTR(-EINVAL); - static_branch_enable(&device_private_key); + dev_pagemap_get_ops(); devmem = devres_alloc_node(&hmm_devmem_release, sizeof(*devmem), GFP_KERNEL, dev_to_node(device)); diff --git a/mm/swap.c b/mm/swap.c index 3dd518832096ea..26fc9b5f1b6c1b 100644 --- a/mm/swap.c +++ b/mm/swap.c @@ -29,6 +29,7 @@ #include #include #include +#include #include #include #include @@ -743,7 +744,7 @@ void release_pages(struct page **pages, int nr) flags); locked_pgdat = NULL; } - put_zone_device_private_or_public_page(page); + put_devmap_managed_page(page); continue; }
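
For illustration only, not part of the patch above: a minimal sketch of how a filesystem other than XFS might consume the new dax_layout_busy_page() helper before truncating or hole-punching a DAX file. The example_* identifiers are hypothetical stand-ins for that filesystem's own fault-blocking lock (the analogue of XFS_MMAPLOCK_EXCL); the wait itself mirrors xfs_break_dax_layouts() above and relies on fsdax_pagefree() calling wake_up_var(&page->_refcount) when the last pin on a page is dropped.

#include <linux/dax.h>
#include <linux/fs.h>
#include <linux/mm.h>
#include <linux/sched.h>
#include <linux/wait_bit.h>

/* Hypothetical placeholders for the filesystem's own mapping lock. */
static void example_ilock_mapping(struct inode *inode) { }
static void example_iunlock_mapping(struct inode *inode) { }

/* Drop the mapping lock while sleeping so the pin holder can finish. */
static void example_wait_dax_page(struct inode *inode, bool *did_unlock)
{
	*did_unlock = true;
	example_iunlock_mapping(inode);
	schedule();
	example_ilock_mapping(inode);
}

/*
 * Returns 0 once no DAX page in the mapping is pinned, or -ERESTARTSYS
 * if interrupted.  The caller holds the mapping lock on entry and must
 * retry the whole break if *did_unlock comes back true, since dropping
 * the lock may have allowed new pins to be established.
 */
static int example_break_dax_layouts(struct inode *inode, bool *did_unlock)
{
	struct page *page;

	page = dax_layout_busy_page(inode->i_mapping);
	if (!page)
		return 0;

	return ___wait_var_event(&page->_refcount,
			atomic_read(&page->_refcount) == 1, TASK_INTERRUPTIBLE,
			0, 0, example_wait_dax_page(inode, did_unlock));
}

A caller would invoke this in a loop from its truncate/hole-punch path, re-checking after every pass in which the lock was dropped, exactly as xfs_break_layouts() does for the BREAK_UNMAP case.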