From ece84b390ab0ceada9c771749455f3594c36e3df Mon Sep 17 00:00:00 2001 From: "Kirill A. Shutemov" Date: Tue, 10 Feb 2015 14:08:19 -0800 Subject: [PATCH 01/78] hugetlb, x86: register 1G page size if we can allocate them at runtime After commit 944d9fec8d7a ("hugetlb: add support for gigantic page allocation at runtime") we can allocate 1G pages at runtime if CMA is enabled. Let's register 1G pages into hugetlb even if the user hasn't requested them explicitly at boot time with hugepagesz=1G. Signed-off-by: Kirill A. Shutemov Reviewed-by: Luiz Capitulino Cc: Naoya Horiguchi Cc: Andi Kleen Cc: Thomas Gleixner Cc: Ingo Molnar Cc: "H. Peter Anvin" Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/x86/mm/hugetlbpage.c | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/arch/x86/mm/hugetlbpage.c b/arch/x86/mm/hugetlbpage.c index 8b977ebf9388c4..bca0aa3a003f6b 100644 --- a/arch/x86/mm/hugetlbpage.c +++ b/arch/x86/mm/hugetlbpage.c @@ -178,4 +178,15 @@ static __init int setup_hugepagesz(char *opt) return 1; } __setup("hugepagesz=", setup_hugepagesz); + +#ifdef CONFIG_CMA +static __init int gigantic_pages_init(void) +{ + /* With CMA we can allocate gigantic pages at runtime */ + if (cpu_has_gbpages && !size_to_hstate(1UL << PUD_SHIFT)) + hugetlb_add_hstate(PUD_SHIFT - PAGE_SHIFT); + return 0; +} +arch_initcall(gigantic_pages_init); +#endif #endif From a118449a7792ea800db6f23cf6c1d2f937c2629a Mon Sep 17 00:00:00 2001 From: Lino Sanfilippo Date: Tue, 10 Feb 2015 14:08:21 -0800 Subject: [PATCH 02/78] fanotify: only destroy mark when both mask and ignored_mask are cleared In fanotify_mark_remove_from_mask() a mark is destroyed if only one of both bitmasks (mask or ignored_mask) of a mark is cleared. However the other mask may still be set and contain information that should not be lost. So only destroy a mark if both masks are cleared. Signed-off-by: Lino Sanfilippo Reviewed-by: Jan Kara Cc: Eric Paris Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/notify/fanotify/fanotify_user.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/fs/notify/fanotify/fanotify_user.c b/fs/notify/fanotify/fanotify_user.c index bff8567aa42d1b..25adb6d1b3db80 100644 --- a/fs/notify/fanotify/fanotify_user.c +++ b/fs/notify/fanotify/fanotify_user.c @@ -497,10 +497,9 @@ static __u32 fanotify_mark_remove_from_mask(struct fsnotify_mark *fsn_mark, oldmask = fsn_mark->ignored_mask; fsnotify_set_mark_ignored_mask_locked(fsn_mark, (oldmask & ~mask)); } + *destroy = !(fsn_mark->mask | fsn_mark->ignored_mask); spin_unlock(&fsn_mark->lock); - *destroy = !(oldmask & ~mask); - return mask & oldmask; } From d2c1874ce687c175b544bc28b6187bf03735a931 Mon Sep 17 00:00:00 2001 From: Lino Sanfilippo Date: Tue, 10 Feb 2015 14:08:24 -0800 Subject: [PATCH 03/78] fanotify: don't recalculate a marks mask if only the ignored mask changed If removing bits from a mark's ignored mask, the concerning inodes/vfsmounts mask is not affected. So don't recalculate it. [akpm@linux-foundation.org: coding-style fixes] Signed-off-by: Lino Sanfilippo Reviewed-by: Jan Kara Cc: Eric Paris Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/notify/fanotify/fanotify_user.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/fs/notify/fanotify/fanotify_user.c b/fs/notify/fanotify/fanotify_user.c index 25adb6d1b3db80..f4d279807a96f1 100644 --- a/fs/notify/fanotify/fanotify_user.c +++ b/fs/notify/fanotify/fanotify_user.c @@ -487,15 +487,16 @@ static __u32 fanotify_mark_remove_from_mask(struct fsnotify_mark *fsn_mark, unsigned int flags, int *destroy) { - __u32 oldmask; + __u32 oldmask = 0; spin_lock(&fsn_mark->lock); if (!(flags & FAN_MARK_IGNORED_MASK)) { oldmask = fsn_mark->mask; fsnotify_set_mark_mask_locked(fsn_mark, (oldmask & ~mask)); } else { - oldmask = fsn_mark->ignored_mask; - fsnotify_set_mark_ignored_mask_locked(fsn_mark, (oldmask & ~mask)); + __u32 tmask = fsn_mark->ignored_mask & ~mask; + + fsnotify_set_mark_ignored_mask_locked(fsn_mark, tmask); } *destroy = !(fsn_mark->mask | fsn_mark->ignored_mask); spin_unlock(&fsn_mark->lock); From 66ba93c0d7fe63def447ad0afe380307ff9ebcad Mon Sep 17 00:00:00 2001 From: Lino Sanfilippo Date: Tue, 10 Feb 2015 14:08:27 -0800 Subject: [PATCH 04/78] fanotify: don't set FAN_ONDIR implicitly on a marks ignored mask Currently FAN_ONDIR is always set on a mark's ignored mask when the event mask is extended without FAN_MARK_ONDIR being set. This may result in events for directories being ignored unexpectedly for call sequences like fanotify_mark(fd, FAN_MARK_ADD, FAN_OPEN | FAN_ONDIR , AT_FDCWD, "dir"); fanotify_mark(fd, FAN_MARK_ADD, FAN_CLOSE, AT_FDCWD, "dir"); Also FAN_MARK_ONDIR is only honored when adding events to a mark's mask, but not for event removal. Fix both issues by not setting FAN_ONDIR implicitly on the ignore mask any more. Instead treat FAN_ONDIR as any other event flag and require FAN_MARK_ONDIR to be set by the user for both event mask and ignore mask. Furthermore take FAN_MARK_ONDIR into account when set for event removal. [akpm@linux-foundation.org: coding-style fixes] Signed-off-by: Lino Sanfilippo Reviewed-by: Jan Kara Cc: Eric Paris Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/notify/fanotify/fanotify.c | 2 +- fs/notify/fanotify/fanotify_user.c | 25 +++++++++++++++++-------- 2 files changed, 18 insertions(+), 9 deletions(-) diff --git a/fs/notify/fanotify/fanotify.c b/fs/notify/fanotify/fanotify.c index 30d3addfad7583..51ceb810728474 100644 --- a/fs/notify/fanotify/fanotify.c +++ b/fs/notify/fanotify/fanotify.c @@ -140,7 +140,7 @@ static bool fanotify_should_send_event(struct fsnotify_mark *inode_mark, } if (S_ISDIR(path->dentry->d_inode->i_mode) && - (marks_ignored_mask & FS_ISDIR)) + !(marks_mask & FS_ISDIR & ~marks_ignored_mask)) return false; if (event_mask & marks_mask & ~marks_ignored_mask) diff --git a/fs/notify/fanotify/fanotify_user.c b/fs/notify/fanotify/fanotify_user.c index f4d279807a96f1..cf275500a6658e 100644 --- a/fs/notify/fanotify/fanotify_user.c +++ b/fs/notify/fanotify/fanotify_user.c @@ -491,10 +491,17 @@ static __u32 fanotify_mark_remove_from_mask(struct fsnotify_mark *fsn_mark, spin_lock(&fsn_mark->lock); if (!(flags & FAN_MARK_IGNORED_MASK)) { + __u32 tmask = fsn_mark->mask & ~mask; + + if (flags & FAN_MARK_ONDIR) + tmask &= ~FAN_ONDIR; + oldmask = fsn_mark->mask; - fsnotify_set_mark_mask_locked(fsn_mark, (oldmask & ~mask)); + fsnotify_set_mark_mask_locked(fsn_mark, tmask); } else { __u32 tmask = fsn_mark->ignored_mask & ~mask; + if (flags & FAN_MARK_ONDIR) + tmask &= ~FAN_ONDIR; fsnotify_set_mark_ignored_mask_locked(fsn_mark, tmask); } @@ -569,20 +576,22 @@ static __u32 fanotify_mark_add_to_mask(struct fsnotify_mark *fsn_mark, spin_lock(&fsn_mark->lock); if (!(flags & FAN_MARK_IGNORED_MASK)) { + __u32 tmask = fsn_mark->mask | mask; + + if (flags & FAN_MARK_ONDIR) + tmask |= FAN_ONDIR; + oldmask = fsn_mark->mask; - fsnotify_set_mark_mask_locked(fsn_mark, (oldmask | mask)); + fsnotify_set_mark_mask_locked(fsn_mark, tmask); } else { __u32 tmask = fsn_mark->ignored_mask | mask; + if (flags & FAN_MARK_ONDIR) + tmask |= FAN_ONDIR; + fsnotify_set_mark_ignored_mask_locked(fsn_mark, tmask); if (flags & FAN_MARK_IGNORED_SURV_MODIFY) fsn_mark->flags |= FSNOTIFY_MARK_FLAG_IGNORED_SURV_MODIFY; } - - if (!(flags & FAN_MARK_ONDIR)) { - __u32 tmask = fsn_mark->ignored_mask | FAN_ONDIR; - fsnotify_set_mark_ignored_mask_locked(fsn_mark, tmask); - } - spin_unlock(&fsn_mark->lock); return mask & ~oldmask; From a5b2f95d0c1479c0b4400a41cdca57e53721bea5 Mon Sep 17 00:00:00 2001 From: Zhang Zhen Date: Tue, 10 Feb 2015 14:08:30 -0800 Subject: [PATCH 05/78] inotify: update documentation to reflect code changes The inotify interface has changed a lot. The user interface was too old, and the kernel interface was removed by Eric Paris in commit: 2dfc1ca inotify: remove inotify in kernel interface. Signed-off-by: Zhang Zhen Cc: Wang Kai Cc: Eric Paris Cc: Robert Love Cc: John McCutchan Cc: Heinrich Schuchardt Acked-by: Jan Kara Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- Documentation/filesystems/inotify.txt | 197 +------------------------- 1 file changed, 3 insertions(+), 194 deletions(-) diff --git a/Documentation/filesystems/inotify.txt b/Documentation/filesystems/inotify.txt index cfd02712b83ee4..51f61db787fb28 100644 --- a/Documentation/filesystems/inotify.txt +++ b/Documentation/filesystems/inotify.txt @@ -4,201 +4,10 @@ Document started 15 Mar 2005 by Robert Love +Document updated 4 Jan 2015 by Zhang Zhen + --Deleted obsoleted interface, just refer to manpages for user interface. - -(i) User Interface - -Inotify is controlled by a set of three system calls and normal file I/O on a -returned file descriptor. - -First step in using inotify is to initialise an inotify instance: - - int fd = inotify_init (); - -Each instance is associated with a unique, ordered queue. - -Change events are managed by "watches". A watch is an (object,mask) pair where -the object is a file or directory and the mask is a bit mask of one or more -inotify events that the application wishes to receive. See -for valid events. A watch is referenced by a watch descriptor, or wd. - -Watches are added via a path to the file. - -Watches on a directory will return events on any files inside of the directory. - -Adding a watch is simple: - - int wd = inotify_add_watch (fd, path, mask); - -Where "fd" is the return value from inotify_init(), path is the path to the -object to watch, and mask is the watch mask (see ). - -You can update an existing watch in the same manner, by passing in a new mask. - -An existing watch is removed via - - int ret = inotify_rm_watch (fd, wd); - -Events are provided in the form of an inotify_event structure that is read(2) -from a given inotify instance. The filename is of dynamic length and follows -the struct. It is of size len. The filename is padded with null bytes to -ensure proper alignment. This padding is reflected in len. - -You can slurp multiple events by passing a large buffer, for example - - size_t len = read (fd, buf, BUF_LEN); - -Where "buf" is a pointer to an array of "inotify_event" structures at least -BUF_LEN bytes in size. The above example will return as many events as are -available and fit in BUF_LEN. - -Each inotify instance fd is also select()- and poll()-able. - -You can find the size of the current event queue via the standard FIONREAD -ioctl on the fd returned by inotify_init(). - -All watches are destroyed and cleaned up on close. - - -(ii) - -Prototypes: - - int inotify_init (void); - int inotify_add_watch (int fd, const char *path, __u32 mask); - int inotify_rm_watch (int fd, __u32 mask); - - -(iii) Kernel Interface - -Inotify's kernel API consists a set of functions for managing watches and an -event callback. - -To use the kernel API, you must first initialize an inotify instance with a set -of inotify_operations. You are given an opaque inotify_handle, which you use -for any further calls to inotify. - - struct inotify_handle *ih = inotify_init(my_event_handler); - -You must provide a function for processing events and a function for destroying -the inotify watch. - - void handle_event(struct inotify_watch *watch, u32 wd, u32 mask, - u32 cookie, const char *name, struct inode *inode) - - watch - the pointer to the inotify_watch that triggered this call - wd - the watch descriptor - mask - describes the event that occurred - cookie - an identifier for synchronizing events - name - the dentry name for affected files in a directory-based event - inode - the affected inode in a directory-based event - - void destroy_watch(struct inotify_watch *watch) - -You may add watches by providing a pre-allocated and initialized inotify_watch -structure and specifying the inode to watch along with an inotify event mask. -You must pin the inode during the call. You will likely wish to embed the -inotify_watch structure in a structure of your own which contains other -information about the watch. Once you add an inotify watch, it is immediately -subject to removal depending on filesystem events. You must grab a reference if -you depend on the watch hanging around after the call. - - inotify_init_watch(&my_watch->iwatch); - inotify_get_watch(&my_watch->iwatch); // optional - s32 wd = inotify_add_watch(ih, &my_watch->iwatch, inode, mask); - inotify_put_watch(&my_watch->iwatch); // optional - -You may use the watch descriptor (wd) or the address of the inotify_watch for -other inotify operations. You must not directly read or manipulate data in the -inotify_watch. Additionally, you must not call inotify_add_watch() more than -once for a given inotify_watch structure, unless you have first called either -inotify_rm_watch() or inotify_rm_wd(). - -To determine if you have already registered a watch for a given inode, you may -call inotify_find_watch(), which gives you both the wd and the watch pointer for -the inotify_watch, or an error if the watch does not exist. - - wd = inotify_find_watch(ih, inode, &watchp); - -You may use container_of() on the watch pointer to access your own data -associated with a given watch. When an existing watch is found, -inotify_find_watch() bumps the refcount before releasing its locks. You must -put that reference with: - - put_inotify_watch(watchp); - -Call inotify_find_update_watch() to update the event mask for an existing watch. -inotify_find_update_watch() returns the wd of the updated watch, or an error if -the watch does not exist. - - wd = inotify_find_update_watch(ih, inode, mask); - -An existing watch may be removed by calling either inotify_rm_watch() or -inotify_rm_wd(). - - int ret = inotify_rm_watch(ih, &my_watch->iwatch); - int ret = inotify_rm_wd(ih, wd); - -A watch may be removed while executing your event handler with the following: - - inotify_remove_watch_locked(ih, iwatch); - -Call inotify_destroy() to remove all watches from your inotify instance and -release it. If there are no outstanding references, inotify_destroy() will call -your destroy_watch op for each watch. - - inotify_destroy(ih); - -When inotify removes a watch, it sends an IN_IGNORED event to your callback. -You may use this event as an indication to free the watch memory. Note that -inotify may remove a watch due to filesystem events, as well as by your request. -If you use IN_ONESHOT, inotify will remove the watch after the first event, at -which point you may call the final inotify_put_watch. - -(iv) Kernel Interface Prototypes - - struct inotify_handle *inotify_init(struct inotify_operations *ops); - - inotify_init_watch(struct inotify_watch *watch); - - s32 inotify_add_watch(struct inotify_handle *ih, - struct inotify_watch *watch, - struct inode *inode, u32 mask); - - s32 inotify_find_watch(struct inotify_handle *ih, struct inode *inode, - struct inotify_watch **watchp); - - s32 inotify_find_update_watch(struct inotify_handle *ih, - struct inode *inode, u32 mask); - - int inotify_rm_wd(struct inotify_handle *ih, u32 wd); - - int inotify_rm_watch(struct inotify_handle *ih, - struct inotify_watch *watch); - - void inotify_remove_watch_locked(struct inotify_handle *ih, - struct inotify_watch *watch); - - void inotify_destroy(struct inotify_handle *ih); - - void get_inotify_watch(struct inotify_watch *watch); - void put_inotify_watch(struct inotify_watch *watch); - - -(v) Internal Kernel Implementation - -Each inotify instance is represented by an inotify_handle structure. -Inotify's userspace consumers also have an inotify_device which is -associated with the inotify_handle, and on which events are queued. - -Each watch is associated with an inotify_watch structure. Watches are chained -off of each associated inotify_handle and each associated inode. - -See fs/notify/inotify/inotify_fsnotify.c and fs/notify/inotify/inotify_user.c -for the locking and lifetime rules. - - -(vi) Rationale +(i) Rationale Q: What is the design decision behind not tying the watch to the open fd of the watched object? From 6ee8e25fc3e916193bce4ebb43d5439e1e2144ab Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Tue, 10 Feb 2015 14:08:32 -0800 Subject: [PATCH 06/78] fsnotify: fix handling of renames in audit Commit e9fd702a58c4 ("audit: convert audit watches to use fsnotify instead of inotify") broke handling of renames in audit. Audit code wants to update inode number of an inode corresponding to watched name in a directory. When something gets renamed into a directory to a watched name, inotify previously passed moved inode to audit code however new fsnotify code passes directory inode where the change happened. That confuses audit and it starts watching parent directory instead of a file in a directory. This can be observed for example by doing: cd /tmp touch foo bar auditctl -w /tmp/foo touch foo mv bar foo touch foo In audit log we see events like: type=CONFIG_CHANGE msg=audit(1423563584.155:90): auid=1000 ses=2 op="updated rules" path="/tmp/foo" key=(null) list=4 res=1 ... type=PATH msg=audit(1423563584.155:91): item=2 name="bar" inode=1046884 dev=08:0 2 mode=0100644 ouid=0 ogid=0 rdev=00:00 nametype=DELETE type=PATH msg=audit(1423563584.155:91): item=3 name="foo" inode=1046842 dev=08:0 2 mode=0100644 ouid=0 ogid=0 rdev=00:00 nametype=DELETE type=PATH msg=audit(1423563584.155:91): item=4 name="foo" inode=1046884 dev=08:0 2 mode=0100644 ouid=0 ogid=0 rdev=00:00 nametype=CREATE ... and that's it - we see event for the first touch after creating the audit rule, we see events for rename but we don't see any event for the last touch. However we start seeing events for unrelated stuff happening in /tmp. Fix the problem by passing moved inode as data in the FS_MOVED_FROM and FS_MOVED_TO events instead of the directory where the change happens. This doesn't introduce any new problems because noone besides audit_watch.c cares about the passed value: fs/notify/fanotify/fanotify.c cares only about FSNOTIFY_EVENT_PATH events. fs/notify/dnotify/dnotify.c doesn't care about passed 'data' value at all. fs/notify/inotify/inotify_fsnotify.c uses 'data' only for FSNOTIFY_EVENT_PATH. kernel/audit_tree.c doesn't care about passed 'data' at all. kernel/audit_watch.c expects moved inode as 'data'. Fixes: e9fd702a58c49db ("audit: convert audit watches to use fsnotify instead of inotify") Signed-off-by: Jan Kara Cc: Paul Moore Cc: Eric Paris Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/fsnotify.h | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/include/linux/fsnotify.h b/include/linux/fsnotify.h index 1c804b057fb109..7ee1774edee51c 100644 --- a/include/linux/fsnotify.h +++ b/include/linux/fsnotify.h @@ -101,8 +101,10 @@ static inline void fsnotify_move(struct inode *old_dir, struct inode *new_dir, new_dir_mask |= FS_ISDIR; } - fsnotify(old_dir, old_dir_mask, old_dir, FSNOTIFY_EVENT_INODE, old_name, fs_cookie); - fsnotify(new_dir, new_dir_mask, new_dir, FSNOTIFY_EVENT_INODE, new_name, fs_cookie); + fsnotify(old_dir, old_dir_mask, source, FSNOTIFY_EVENT_INODE, old_name, + fs_cookie); + fsnotify(new_dir, new_dir_mask, source, FSNOTIFY_EVENT_INODE, new_name, + fs_cookie); if (target) fsnotify_link_count(target); From 560b8c0ed45ae8951e9a0fba4d89b083dc1d433a Mon Sep 17 00:00:00 2001 From: Rob Landley Date: Tue, 10 Feb 2015 14:08:35 -0800 Subject: [PATCH 07/78] sh: build superh without CONFIG_EXPERT What sh4 actually wants is HAVE_PATA_PLATFORM, so select that instead. Signed-off-by: Rob Landley Acked-by: Randy Dunlap Acked-by: Geert Uytterhoeven Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/sh/Kconfig | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/sh/Kconfig b/arch/sh/Kconfig index 0f09f5285d5e72..eb4ef274ae9bd7 100644 --- a/arch/sh/Kconfig +++ b/arch/sh/Kconfig @@ -1,7 +1,7 @@ config SUPERH def_bool y select ARCH_MIGHT_HAVE_PC_PARPORT - select EXPERT + select HAVE_PATA_PLATFORM select CLKDEV_LOOKUP select HAVE_IDE if HAS_IOPORT_MAP select HAVE_MEMBLOCK From 102ca6606c96979cbef53dff30deac5e909a1914 Mon Sep 17 00:00:00 2001 From: Kevin Cernekee Date: Tue, 10 Feb 2015 14:08:38 -0800 Subject: [PATCH 08/78] sh: eliminate unused irq_reg_{readl,writel} accessors Defining these macros way down in arch/sh/.../irq.c doesn't cause kernel/irq/generic-chip.c to use them. As far as I can tell this code has no effect. Fixes: 332fd7c4fef5f3b1 ("genirq: Generic chip: Change irq_reg_{readl,writel} arguments") Signed-off-by: Kevin Cernekee Signed-off-by: Geert Uytterhoeven Tested-by: Geert Uytterhoeven (cpp/asm comparison) Cc: Thomas Gleixner Cc: Jason Cooper Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/sh/boards/mach-se/7343/irq.c | 3 --- arch/sh/boards/mach-se/7722/irq.c | 3 --- 2 files changed, 6 deletions(-) diff --git a/arch/sh/boards/mach-se/7343/irq.c b/arch/sh/boards/mach-se/7343/irq.c index 7646bf0486c2c9..1087dba9b01523 100644 --- a/arch/sh/boards/mach-se/7343/irq.c +++ b/arch/sh/boards/mach-se/7343/irq.c @@ -14,9 +14,6 @@ #define DRV_NAME "SE7343-FPGA" #define pr_fmt(fmt) DRV_NAME ": " fmt -#define irq_reg_readl ioread16 -#define irq_reg_writel iowrite16 - #include #include #include diff --git a/arch/sh/boards/mach-se/7722/irq.c b/arch/sh/boards/mach-se/7722/irq.c index f5e2af1bf040ff..00e699232621dc 100644 --- a/arch/sh/boards/mach-se/7722/irq.c +++ b/arch/sh/boards/mach-se/7722/irq.c @@ -11,9 +11,6 @@ #define DRV_NAME "SE7722-FPGA" #define pr_fmt(fmt) DRV_NAME ": " fmt -#define irq_reg_readl ioread16 -#define irq_reg_writel iowrite16 - #include #include #include From b934beaf4b920a95db012b3d4476e8375dd4415b Mon Sep 17 00:00:00 2001 From: Xue jiufei Date: Tue, 10 Feb 2015 14:08:40 -0800 Subject: [PATCH 09/78] ocfs2/dlm: add missing dlm_lock_put() when recovery master down When the recovery master is down, the owner of $RECOVERY calls dlm_do_local_recovery_cleanup() to prune any $RECOVERY entries for dead nodes. The lock is in the granted list and the refcount must be 2. We should put twice to remove this lock. Otherwise, it will lead to a memory leak. Signed-off-by: joyce.xue Reported-by: yangwenfang Cc: Mark Fasheh Cc: Joel Becker Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/ocfs2/dlm/dlmrecovery.c | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/fs/ocfs2/dlm/dlmrecovery.c b/fs/ocfs2/dlm/dlmrecovery.c index cecd875653e4cc..ce12e0b1a31f18 100644 --- a/fs/ocfs2/dlm/dlmrecovery.c +++ b/fs/ocfs2/dlm/dlmrecovery.c @@ -1070,6 +1070,9 @@ static void dlm_move_reco_locks_to_list(struct dlm_ctxt *dlm, dead_node, dlm->name); list_del_init(&lock->list); dlm_lock_put(lock); + /* Can't schedule DLM_UNLOCK_FREE_LOCK + * - do manually */ + dlm_lock_put(lock); break; } } @@ -2346,6 +2349,10 @@ static void dlm_do_local_recovery_cleanup(struct dlm_ctxt *dlm, u8 dead_node) dead_node, dlm->name); list_del_init(&lock->list); dlm_lock_put(lock); + /* Can't schedule + * DLM_UNLOCK_FREE_LOCK + * - do manually */ + dlm_lock_put(lock); break; } } From e6e746187dd3fc5f8d9c5dc48a12f65941841467 Mon Sep 17 00:00:00 2001 From: Fabian Frederick Date: Tue, 10 Feb 2015 14:08:43 -0800 Subject: [PATCH 10/78] ocfs2: remove unnecessary else in ocfs2_set_acl() else is unnecessary after return. Signed-off-by: Fabian Frederick Cc: Mark Fasheh Cc: Joel Becker Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/ocfs2/acl.c | 14 ++++++-------- 1 file changed, 6 insertions(+), 8 deletions(-) diff --git a/fs/ocfs2/acl.c b/fs/ocfs2/acl.c index 7e8282dcea2a92..c58a1bcfda0fdf 100644 --- a/fs/ocfs2/acl.c +++ b/fs/ocfs2/acl.c @@ -245,16 +245,14 @@ int ocfs2_set_acl(handle_t *handle, ret = posix_acl_equiv_mode(acl, &mode); if (ret < 0) return ret; - else { - if (ret == 0) - acl = NULL; - ret = ocfs2_acl_set_mode(inode, di_bh, - handle, mode); - if (ret) - return ret; + if (ret == 0) + acl = NULL; - } + ret = ocfs2_acl_set_mode(inode, di_bh, + handle, mode); + if (ret) + return ret; } break; case ACL_TYPE_DEFAULT: From 696cdf730f40c97ebccacddd40c5382608dc8320 Mon Sep 17 00:00:00 2001 From: Junxiao Bi Date: Tue, 10 Feb 2015 14:08:46 -0800 Subject: [PATCH 11/78] ocfs2: fix uninitialized variable access Variable "why" is not yet initialized at line 615, fix it. Signed-off-by: Junxiao Bi Cc: Mark Fasheh Cc: Joel Becker Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/ocfs2/file.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/ocfs2/file.c b/fs/ocfs2/file.c index 3950693dd0f6d9..245db4f504dab4 100644 --- a/fs/ocfs2/file.c +++ b/fs/ocfs2/file.c @@ -569,7 +569,7 @@ static int __ocfs2_extend_allocation(struct inode *inode, u32 logical_start, handle_t *handle = NULL; struct ocfs2_alloc_context *data_ac = NULL; struct ocfs2_alloc_context *meta_ac = NULL; - enum ocfs2_alloc_restarted why; + enum ocfs2_alloc_restarted why = RESTART_NONE; struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); struct ocfs2_extent_tree et; int did_quota = 0; From 41d6247fdd1a30d4145c9c2590db3928ddc3d323 Mon Sep 17 00:00:00 2001 From: Junxiao Bi Date: Tue, 10 Feb 2015 14:08:48 -0800 Subject: [PATCH 12/78] ocfs2: fix wrong comment O2NET_CONN_IDLE_DELAY is not defined, connection attempts will not be canceled due to timeout. Signed-off-by: Junxiao Bi Cc: Mark Fasheh Cc: Joel Becker Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/ocfs2/cluster/tcp_internal.h | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/fs/ocfs2/cluster/tcp_internal.h b/fs/ocfs2/cluster/tcp_internal.h index dc024367110a60..b95e7df5b76ac7 100644 --- a/fs/ocfs2/cluster/tcp_internal.h +++ b/fs/ocfs2/cluster/tcp_internal.h @@ -107,12 +107,12 @@ struct o2net_node { struct list_head nn_status_list; /* connects are attempted from when heartbeat comes up until either hb - * goes down, the node is unconfigured, no connect attempts succeed - * before O2NET_CONN_IDLE_DELAY, or a connect succeeds. connect_work - * is queued from set_nn_state both from hb up and from itself if a - * connect attempt fails and so can be self-arming. shutdown is - * careful to first mark the nn such that no connects will be attempted - * before canceling delayed connect work and flushing the queue. */ + * goes down, the node is unconfigured, or a connect succeeds. + * connect_work is queued from set_nn_state both from hb up and from + * itself if a connect attempt fails and so can be self-arming. + * shutdown is careful to first mark the nn such that no connects will + * be attempted before canceling delayed connect work and flushing the + * queue. */ struct delayed_work nn_connect_work; unsigned long nn_last_connect_attempt; From 79c83ea1aae1536b3e21e59946b97be150033ec7 Mon Sep 17 00:00:00 2001 From: alex chen Date: Tue, 10 Feb 2015 14:08:51 -0800 Subject: [PATCH 13/78] ocfs2: fix snprintf format specifier in dlmdebug.c Use snprintf format specifier "%lu" instead of "%ld" for argument of type 'unsigned long'. Signed-off-by: Alex Chen Reviewed-by: Joseph Qi Reviewed-by: Mark Fasheh Cc: Joel Becker Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/ocfs2/dlm/dlmdebug.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/fs/ocfs2/dlm/dlmdebug.c b/fs/ocfs2/dlm/dlmdebug.c index 149eb556b8c630..825136070d2c11 100644 --- a/fs/ocfs2/dlm/dlmdebug.c +++ b/fs/ocfs2/dlm/dlmdebug.c @@ -406,7 +406,7 @@ static int debug_purgelist_print(struct dlm_ctxt *dlm, char *buf, int len) } spin_unlock(&dlm->spinlock); - out += snprintf(buf + out, len - out, "Total on list: %ld\n", total); + out += snprintf(buf + out, len - out, "Total on list: %lu\n", total); return out; } @@ -464,7 +464,7 @@ static int debug_mle_print(struct dlm_ctxt *dlm, char *buf, int len) spin_unlock(&dlm->master_lock); out += snprintf(buf + out, len - out, - "Total: %ld, Longest: %ld\n", total, longest); + "Total: %lu, Longest: %lu\n", total, longest); return out; } From d6edc87af85988ba126db05b4f26ac9ffa6c04fb Mon Sep 17 00:00:00 2001 From: Rickard Strandqvist Date: Tue, 10 Feb 2015 14:08:54 -0800 Subject: [PATCH 14/78] ocfs2: xattr: remove unused function Remove ocfs2_xattr_bucket_get_val() that is not used anywhere. This was partially found by using a static code analysis program called cppcheck. Signed-off-by: Rickard Strandqvist Cc: Mark Fasheh Cc: Joel Becker Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/ocfs2/xattr.c | 10 ---------- 1 file changed, 10 deletions(-) diff --git a/fs/ocfs2/xattr.c b/fs/ocfs2/xattr.c index 662f8dee149f99..85b190dc132f3a 100644 --- a/fs/ocfs2/xattr.c +++ b/fs/ocfs2/xattr.c @@ -5334,16 +5334,6 @@ static int ocfs2_add_new_xattr_bucket(struct inode *inode, return ret; } -static inline char *ocfs2_xattr_bucket_get_val(struct inode *inode, - struct ocfs2_xattr_bucket *bucket, - int offs) -{ - int block_off = offs >> inode->i_sb->s_blocksize_bits; - - offs = offs % inode->i_sb->s_blocksize; - return bucket_block(bucket, block_off) + offs; -} - /* * Truncate the specified xe_off entry in xattr bucket. * bucket is indicated by header_bh and len is the new length. From 7ea62d70311d92d4c40fa0514940110af055f7a9 Mon Sep 17 00:00:00 2001 From: Rickard Strandqvist Date: Tue, 10 Feb 2015 14:08:56 -0800 Subject: [PATCH 15/78] ocfs2: quota_local: remove unused function Remove ol_dqblk_file_block() that is not used anywhere. This was partially found by using a static code analysis program called cppcheck. Signed-off-by: Rickard Strandqvist Cc: Mark Fasheh Cc: Joel Becker Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/ocfs2/quota_local.c | 6 ------ 1 file changed, 6 deletions(-) diff --git a/fs/ocfs2/quota_local.c b/fs/ocfs2/quota_local.c index 10b653930ee26a..bb72af344475ad 100644 --- a/fs/ocfs2/quota_local.c +++ b/fs/ocfs2/quota_local.c @@ -73,12 +73,6 @@ static loff_t ol_dqblk_off(struct super_block *sb, int c, int off) ol_dqblk_block_off(sb, c, off); } -/* Compute block number from given offset */ -static inline unsigned int ol_dqblk_file_block(struct super_block *sb, loff_t off) -{ - return off >> sb->s_blocksize_bits; -} - static inline unsigned int ol_dqblk_block_offset(struct super_block *sb, loff_t off) { return off & ((1 << sb->s_blocksize_bits) - 1); From 95671c63d5ef3b8794fc9a05d44f0162cc5db425 Mon Sep 17 00:00:00 2001 From: Rickard Strandqvist Date: Tue, 10 Feb 2015 14:08:59 -0800 Subject: [PATCH 16/78] ocfs2: dlm: dlmdomain: remove unused function Remove dlm_joined() that is not used anywhere. This was partially found by using a static code analysis program called cppcheck. Signed-off-by: Rickard Strandqvist Cc: Mark Fasheh Cc: Joel Becker Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/ocfs2/dlm/dlmdomain.c | 14 -------------- fs/ocfs2/dlm/dlmdomain.h | 1 - 2 files changed, 15 deletions(-) diff --git a/fs/ocfs2/dlm/dlmdomain.c b/fs/ocfs2/dlm/dlmdomain.c index 50a59d2337b259..7df88a6dd6260c 100644 --- a/fs/ocfs2/dlm/dlmdomain.c +++ b/fs/ocfs2/dlm/dlmdomain.c @@ -674,20 +674,6 @@ static void dlm_leave_domain(struct dlm_ctxt *dlm) spin_unlock(&dlm->spinlock); } -int dlm_joined(struct dlm_ctxt *dlm) -{ - int ret = 0; - - spin_lock(&dlm_domain_lock); - - if (dlm->dlm_state == DLM_CTXT_JOINED) - ret = 1; - - spin_unlock(&dlm_domain_lock); - - return ret; -} - int dlm_shutting_down(struct dlm_ctxt *dlm) { int ret = 0; diff --git a/fs/ocfs2/dlm/dlmdomain.h b/fs/ocfs2/dlm/dlmdomain.h index 2f7f60bfeb3b57..fd6122a38dbdf0 100644 --- a/fs/ocfs2/dlm/dlmdomain.h +++ b/fs/ocfs2/dlm/dlmdomain.h @@ -28,7 +28,6 @@ extern spinlock_t dlm_domain_lock; extern struct list_head dlm_domains; -int dlm_joined(struct dlm_ctxt *dlm); int dlm_shutting_down(struct dlm_ctxt *dlm); void dlm_fire_domain_eviction_callbacks(struct dlm_ctxt *dlm, int node_num); From 15eba0fe3eeaeb1b80489c1ebb9d47d6d7003f57 Mon Sep 17 00:00:00 2001 From: alex chen Date: Tue, 10 Feb 2015 14:09:02 -0800 Subject: [PATCH 17/78] ocfs2: fix journal commit deadlock in ocfs2_convert_inline_data_to_extents Similar to ocfs2_write_end_nolock() which is metioned at commit 136f49b91710 ("ocfs2: fix journal commit deadlock"), we should unlock pages before ocfs2_commit_trans() in ocfs2_convert_inline_data_to_extents. Otherwise, it will cause a deadlock with journal commit threads. Signed-off-by: Alex Chen Reviewed-by: Joseph Qi Cc: Joel Becker Cc: Mark Fasheh Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/ocfs2/alloc.c | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/fs/ocfs2/alloc.c b/fs/ocfs2/alloc.c index fcae9ef1a32875..044158bd22be1c 100644 --- a/fs/ocfs2/alloc.c +++ b/fs/ocfs2/alloc.c @@ -6873,7 +6873,7 @@ int ocfs2_convert_inline_data_to_extents(struct inode *inode, if (IS_ERR(handle)) { ret = PTR_ERR(handle); mlog_errno(ret); - goto out_unlock; + goto out; } ret = ocfs2_journal_access_di(handle, INODE_CACHE(inode), di_bh, @@ -6931,7 +6931,7 @@ int ocfs2_convert_inline_data_to_extents(struct inode *inode, if (ret) { mlog_errno(ret); need_free = 1; - goto out_commit; + goto out_unlock; } page_end = PAGE_CACHE_SIZE; @@ -6964,12 +6964,16 @@ int ocfs2_convert_inline_data_to_extents(struct inode *inode, if (ret) { mlog_errno(ret); need_free = 1; - goto out_commit; + goto out_unlock; } inode->i_blocks = ocfs2_inode_sector_count(inode); } +out_unlock: + if (pages) + ocfs2_unlock_and_free_pages(pages, num_pages); + out_commit: if (ret < 0 && did_quota) dquot_free_space_nodirty(inode, @@ -6989,15 +6993,11 @@ int ocfs2_convert_inline_data_to_extents(struct inode *inode, ocfs2_commit_trans(osb, handle); -out_unlock: +out: if (data_ac) ocfs2_free_alloc_context(data_ac); - -out: - if (pages) { - ocfs2_unlock_and_free_pages(pages, num_pages); + if (pages) kfree(pages); - } return ret; } From 1dfeb768475dfded66bba03a1744c2e8141d3429 Mon Sep 17 00:00:00 2001 From: alex chen Date: Tue, 10 Feb 2015 14:09:04 -0800 Subject: [PATCH 18/78] ocfs2: add a mount option journal_async_commit on ocfs2 filesystem Add a mount option to support JBD2 feature: JBD2_FEATURE_INCOMPAT_ASYNC_COMMIT. When this feature is opened, journal commit block can be written to disk without waiting for descriptor blocks, which can improve journal commit performance. This option will enable 'journal_checksum' internally. Using the fs_mark benchmark, using journal_async_commit shows a 50% improvement, the files per second go up from 215.2 to 317.5. test script: fs_mark -d /mnt/ocfs2/ -s 10240 -n 1000 default: FSUse% Count Size Files/sec App Overhead 0 1000 10240 215.2 17878 with journal_async_commit option: FSUse% Count Size Files/sec App Overhead 0 1000 10240 317.5 17881 Signed-off-by: Alex Chen Signed-off-by: Weiwei Wang Reviewed-by: Joseph Qi Reviewed-by: Mark Fasheh Cc: Joel Becker Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- Documentation/filesystems/ocfs2.txt | 4 ++++ fs/ocfs2/ocfs2.h | 2 ++ fs/ocfs2/super.c | 17 +++++++++++++++++ 3 files changed, 23 insertions(+) diff --git a/Documentation/filesystems/ocfs2.txt b/Documentation/filesystems/ocfs2.txt index 7618a287aa41f0..28f8c08201e29e 100644 --- a/Documentation/filesystems/ocfs2.txt +++ b/Documentation/filesystems/ocfs2.txt @@ -100,3 +100,7 @@ coherency=full (*) Disallow concurrent O_DIRECT writes, cluster inode coherency=buffered Allow concurrent O_DIRECT writes without EX lock among nodes, which gains high performance at risk of getting stale data on other nodes. +journal_async_commit Commit block can be written to disk without waiting + for descriptor blocks. If enabled older kernels cannot + mount the device. This will enable 'journal_checksum' + internally. diff --git a/fs/ocfs2/ocfs2.h b/fs/ocfs2/ocfs2.h index 7d6b7d09045292..fdbcbfed529ee5 100644 --- a/fs/ocfs2/ocfs2.h +++ b/fs/ocfs2/ocfs2.h @@ -279,6 +279,8 @@ enum ocfs2_mount_options writes */ OCFS2_MOUNT_HB_NONE = 1 << 13, /* No heartbeat */ OCFS2_MOUNT_HB_GLOBAL = 1 << 14, /* Global heartbeat */ + + OCFS2_MOUNT_JOURNAL_ASYNC_COMMIT = 1 << 15, /* Journal Async Commit */ }; #define OCFS2_OSB_SOFT_RO 0x0001 diff --git a/fs/ocfs2/super.c b/fs/ocfs2/super.c index 83723179e1eccf..c09d6da23c3d53 100644 --- a/fs/ocfs2/super.c +++ b/fs/ocfs2/super.c @@ -191,6 +191,7 @@ enum { Opt_coherency_full, Opt_resv_level, Opt_dir_resv_level, + Opt_journal_async_commit, Opt_err, }; @@ -222,6 +223,7 @@ static const match_table_t tokens = { {Opt_coherency_full, "coherency=full"}, {Opt_resv_level, "resv_level=%u"}, {Opt_dir_resv_level, "dir_resv_level=%u"}, + {Opt_journal_async_commit, "journal_async_commit"}, {Opt_err, NULL} }; @@ -1500,6 +1502,9 @@ static int ocfs2_parse_options(struct super_block *sb, option < OCFS2_MAX_RESV_LEVEL) mopt->dir_resv_level = option; break; + case Opt_journal_async_commit: + mopt->mount_opt |= OCFS2_MOUNT_JOURNAL_ASYNC_COMMIT; + break; default: mlog(ML_ERROR, "Unrecognized mount option \"%s\" " @@ -1606,6 +1611,9 @@ static int ocfs2_show_options(struct seq_file *s, struct dentry *root) if (osb->osb_dir_resv_level != osb->osb_resv_level) seq_printf(s, ",dir_resv_level=%d", osb->osb_resv_level); + if (opts & OCFS2_MOUNT_JOURNAL_ASYNC_COMMIT) + seq_printf(s, ",journal_async_commit"); + return 0; } @@ -2475,6 +2483,15 @@ static int ocfs2_check_volume(struct ocfs2_super *osb) goto finally; } + if (osb->s_mount_opt & OCFS2_MOUNT_JOURNAL_ASYNC_COMMIT) + jbd2_journal_set_features(osb->journal->j_journal, + JBD2_FEATURE_COMPAT_CHECKSUM, 0, + JBD2_FEATURE_INCOMPAT_ASYNC_COMMIT); + else + jbd2_journal_clear_features(osb->journal->j_journal, + JBD2_FEATURE_COMPAT_CHECKSUM, 0, + JBD2_FEATURE_INCOMPAT_ASYNC_COMMIT); + if (dirty) { /* recover my local alloc if we didn't unmount cleanly. */ status = ocfs2_begin_local_alloc_recovery(osb, From d9510a20f81467b48cd005e86e17b5288990bc71 Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Tue, 10 Feb 2015 14:09:07 -0800 Subject: [PATCH 19/78] ocfs2: remove pointless assignment from ocfs2_calc_refcount_meta_credits() The assigned value is never used. Coverity-id 1226847. Signed-off-by: Jan Kara Cc: Mark Fasheh Cc: Joel Becker Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/ocfs2/refcounttree.c | 2 -- 1 file changed, 2 deletions(-) diff --git a/fs/ocfs2/refcounttree.c b/fs/ocfs2/refcounttree.c index d81f6e2a97f5d4..ee541f92dab403 100644 --- a/fs/ocfs2/refcounttree.c +++ b/fs/ocfs2/refcounttree.c @@ -2428,8 +2428,6 @@ static int ocfs2_calc_refcount_meta_credits(struct super_block *sb, get_bh(prev_bh); } - rb = (struct ocfs2_refcount_block *)ref_leaf_bh->b_data; - trace_ocfs2_calc_refcount_meta_credits_iterate( recs_add, (unsigned long long)cpos, clusters, (unsigned long long)le64_to_cpu(rec.r_cpos), From e6d9f86d6b80dec23fd0e90bab114e642dac483e Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Tue, 10 Feb 2015 14:09:10 -0800 Subject: [PATCH 20/78] ocfs2: o2net: silence uninitialized variable warning Smatch complains that, if o2net_tx_can_proceed() returns false, then "sc" and "ret" are uninialized or maybe we are re-using the data from previous iteration. I do not know if we can hit this bug in real life but checking the return value is harmless and we may as well silence the static checker warning. Signed-off-by: Dan Carpenter Cc: Mark Fasheh Cc: Joel Becker Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/ocfs2/cluster/tcp.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/fs/ocfs2/cluster/tcp.c b/fs/ocfs2/cluster/tcp.c index 2e355e0f833550..56c403a563bc3d 100644 --- a/fs/ocfs2/cluster/tcp.c +++ b/fs/ocfs2/cluster/tcp.c @@ -1016,7 +1016,8 @@ void o2net_fill_node_map(unsigned long *map, unsigned bytes) memset(map, 0, bytes); for (node = 0; node < O2NM_MAX_NODES; ++node) { - o2net_tx_can_proceed(o2net_nn_from_num(node), &sc, &ret); + if (!o2net_tx_can_proceed(o2net_nn_from_num(node), &sc, &ret)) + continue; if (!ret) { set_bit(node, map); sc_put(sc); From 592a202a3d463d683632c268a3948978237a868d Mon Sep 17 00:00:00 2001 From: Daeseok Youn Date: Tue, 10 Feb 2015 14:09:12 -0800 Subject: [PATCH 21/78] ocfs2: remove unreachable code Signed-off-by: Daeseok Youn Cc: Mark Fasheh Cc: Joel Becker Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/ocfs2/dir.c | 1 - 1 file changed, 1 deletion(-) diff --git a/fs/ocfs2/dir.c b/fs/ocfs2/dir.c index 319e786175afed..940be6d994a927 100644 --- a/fs/ocfs2/dir.c +++ b/fs/ocfs2/dir.c @@ -3513,7 +3513,6 @@ static int ocfs2_find_dir_space_el(struct inode *dir, const char *name, de = (struct ocfs2_dir_entry *)((char *) de + le16_to_cpu(de->rec_len)); } - status = 0; bail: brelse(bh); if (status) From 9b57269170820d707b048f305bb59a7b23cc5448 Mon Sep 17 00:00:00 2001 From: Daeseok Youn Date: Tue, 10 Feb 2015 14:09:15 -0800 Subject: [PATCH 22/78] ocfs2: removes mlog_errno() call twice in ocfs2_find_dir_space_el() mlog_errno() is called twice when some functions are failed. Signed-off-by: Daeseok Youn Cc: Mark Fasheh Cc: Joel Becker Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/ocfs2/dir.c | 9 +++------ 1 file changed, 3 insertions(+), 6 deletions(-) diff --git a/fs/ocfs2/dir.c b/fs/ocfs2/dir.c index 940be6d994a927..b08050bd3f2ef8 100644 --- a/fs/ocfs2/dir.c +++ b/fs/ocfs2/dir.c @@ -3456,10 +3456,8 @@ static int ocfs2_find_dir_space_el(struct inode *dir, const char *name, int blocksize = dir->i_sb->s_blocksize; status = ocfs2_read_dir_block(dir, 0, &bh, 0); - if (status) { - mlog_errno(status); + if (status) goto bail; - } rec_len = OCFS2_DIR_REC_LEN(namelen); offset = 0; @@ -3480,10 +3478,9 @@ static int ocfs2_find_dir_space_el(struct inode *dir, const char *name, status = ocfs2_read_dir_block(dir, offset >> sb->s_blocksize_bits, &bh, 0); - if (status) { - mlog_errno(status); + if (status) goto bail; - } + /* move to next block */ de = (struct ocfs2_dir_entry *) bh->b_data; } From 9d6008c759b6fbca16a5adefdb3477d87fe4a15b Mon Sep 17 00:00:00 2001 From: Daeseok Youn Date: Tue, 10 Feb 2015 14:09:18 -0800 Subject: [PATCH 23/78] ocfs2: remove unreachable code in __ocfs2_recovery_thread() Signed-off-by: Daeseok Youn Cc: Mark Fasheh Cc: Joel Becker Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/ocfs2/journal.c | 1 - 1 file changed, 1 deletion(-) diff --git a/fs/ocfs2/journal.c b/fs/ocfs2/journal.c index 4f502382180f67..d10860fde16545 100644 --- a/fs/ocfs2/journal.c +++ b/fs/ocfs2/journal.c @@ -1447,7 +1447,6 @@ static int __ocfs2_recovery_thread(void *arg) * requires that we call do_exit(). And it isn't exported, but * complete_and_exit() seems to be a minimal wrapper around it. */ complete_and_exit(NULL, status); - return status; } void ocfs2_recovery_thread(struct ocfs2_super *osb, int node_num) From 8989b6733027e6d28d77662113704adb434d0eae Mon Sep 17 00:00:00 2001 From: Fabian Frederick Date: Tue, 10 Feb 2015 14:09:20 -0800 Subject: [PATCH 24/78] ocfs2: make resv_lock spinlock static resv_lock is only used in reservations.c Signed-off-by: Fabian Frederick Cc: Mark Fasheh Cc: Joel Becker Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/ocfs2/reservations.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/ocfs2/reservations.c b/fs/ocfs2/reservations.c index 41ffd36c689c23..6a348b0294abfd 100644 --- a/fs/ocfs2/reservations.c +++ b/fs/ocfs2/reservations.c @@ -39,7 +39,7 @@ #define OCFS2_CHECK_RESERVATIONS #endif -DEFINE_SPINLOCK(resv_lock); +static DEFINE_SPINLOCK(resv_lock); #define OCFS2_MIN_RESV_WINDOW_BITS 8 #define OCFS2_MAX_RESV_WINDOW_BITS 1024 From 10ab88117d069a552a5efdb4b5fb1c087a948c63 Mon Sep 17 00:00:00 2001 From: alex chen Date: Tue, 10 Feb 2015 14:09:23 -0800 Subject: [PATCH 25/78] ocfs2: prune the dcache before deleting the dentry of directory In ocfs2_dentry_convert_worker, we should prune the dcache before deleting the dentry of directory, otherwise, in the following cases the inode of directory will still remain in orphan directory until the device being umounted. Mount point: /mnt/ocfs2 Node A Node B mkdir /mnt/ocfs2/testdir ocfs2_mkdir ->ocfs2_mknod ->ocfs2_dentry_attach_lock ->ocfs2_dentry_lock(dentry, 0) ... ... touch /mnt/ocfs2/testdir/testfile unlink /mnt/test/testdir/testfile rmdir /mnt/ocfs2/testdir ocfs2_unlink ->ocfs2_remote_dentry_delete ->ocfs2_dentry_lock(dentry, 1) ... ... ... ... ocfs2_downconvert_thread ->ocfs2_unblock_lock ->ocfs2_dentry_convert_worker ->ocfs2_find_local_alias ->dget_dlock ->d_delete Here the dentry can not be released because the children's dentry is negative but still exist. Finally, this inode will still remain in orphan directory until its children are destroyed. So before deleting dentry of directory, we should prune the dcache to remove unused children of the parent dentry by shrink_dcache_parent(). Signed-off-by: Alex Chen Reviewed-by: Joseph Qi Reviewed-by: joyce.xue Reviewed-by: Mark Fasheh Cc: Joel Becker Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/ocfs2/dlmglue.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/fs/ocfs2/dlmglue.c b/fs/ocfs2/dlmglue.c index 1c423af04c69e3..11849a44dc5a90 100644 --- a/fs/ocfs2/dlmglue.c +++ b/fs/ocfs2/dlmglue.c @@ -3750,6 +3750,9 @@ static int ocfs2_dentry_convert_worker(struct ocfs2_lock_res *lockres, break; spin_unlock(&dentry_attach_lock); + if (S_ISDIR(dl->dl_inode->i_mode)) + shrink_dcache_parent(dentry); + mlog(0, "d_delete(%pd);\n", dentry); /* From 99b8874e79619498ade354357f64299768642af6 Mon Sep 17 00:00:00 2001 From: Srinivas Eeda Date: Tue, 10 Feb 2015 14:09:26 -0800 Subject: [PATCH 26/78] o2dlm: fix NULL pointer dereference in o2dlm_blocking_ast_wrapper A tiny race between BAST and unlock message causes the NULL dereference. A node sends an unlock request to master and receives a response. Before processing the response it receives a BAST from the master. Since both requests are processed by different threads it creates a race. While the BAST is being processed, lock can get freed by unlock code. This patch makes bast to return immediately if lock is found but unlock is pending. The code should handle this race. We also have to fix master node to skip sending BAST after receiving unlock message. Below is the crash stack BUG: unable to handle kernel NULL pointer dereference at 0000000000000048 IP: o2dlm_blocking_ast_wrapper+0xd/0x16 dlm_do_local_bast+0x8e/0x97 [ocfs2_dlm] dlm_proxy_ast_handler+0x838/0x87e [ocfs2_dlm] o2net_process_message+0x395/0x5b8 [ocfs2_nodemanager] o2net_rx_until_empty+0x762/0x90d [ocfs2_nodemanager] worker_thread+0x14d/0x1ed [akpm@linux-foundation.org: coding-style fixes] Signed-off-by: Srinivas Eeda Reviewed-by: Mark Fasheh Cc: Joel Becker Cc: Joseph Qi Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/ocfs2/dlm/dlmast.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/fs/ocfs2/dlm/dlmast.c b/fs/ocfs2/dlm/dlmast.c index b46278f9ae446a..fd6bbbbd7d78df 100644 --- a/fs/ocfs2/dlm/dlmast.c +++ b/fs/ocfs2/dlm/dlmast.c @@ -385,8 +385,12 @@ int dlm_proxy_ast_handler(struct o2net_msg *msg, u32 len, void *data, head = &res->granted; list_for_each_entry(lock, head, list) { - if (lock->ml.cookie == cookie) + /* if lock is found but unlock is pending ignore the bast */ + if (lock->ml.cookie == cookie) { + if (lock->unlock_pending) + break; goto do_ast; + } } mlog(0, "Got %sast for unknown lock! cookie=%u:%llu, name=%.*s, " From 913e027ca17ee06fa9436a21e54464795b0fa0e8 Mon Sep 17 00:00:00 2001 From: Dmitry Monakhov Date: Tue, 10 Feb 2015 14:09:29 -0800 Subject: [PATCH 27/78] fsioctl.c: make generic_block_fiemap() signal-tolerant __generic_block_fiemap may spin very long time for large sparse files. Without this patch an unprivileged user may abuse system resources simply by spawning a vast number of unkilable busyloops (works on ext2/ext3): truncate --size 1T test for ((i=0;i<1024;i++)) do filefrag test > /dev/null & done Signed-off-by: Dmitry Monakhov Cc: Theodore Ts'o Cc: Al Viro Cc: Michael Kerrisk Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- Documentation/filesystems/fiemap.txt | 3 ++- fs/ioctl.c | 5 +++++ 2 files changed, 7 insertions(+), 1 deletion(-) diff --git a/Documentation/filesystems/fiemap.txt b/Documentation/filesystems/fiemap.txt index 1b805a0efbb027..f6d9c99103a46c 100644 --- a/Documentation/filesystems/fiemap.txt +++ b/Documentation/filesystems/fiemap.txt @@ -196,7 +196,8 @@ struct fiemap_extent_info { }; It is intended that the file system should not need to access any of this -structure directly. +structure directly. Filesystem handlers should be tolerant to signals and return +EINTR once fatal signal received. Flag checking should be done at the beginning of the ->fiemap callback via the diff --git a/fs/ioctl.c b/fs/ioctl.c index 214c3c11fbc24c..5d01d2638ca548 100644 --- a/fs/ioctl.c +++ b/fs/ioctl.c @@ -379,6 +379,11 @@ int __generic_block_fiemap(struct inode *inode, past_eof = true; } cond_resched(); + if (fatal_signal_pending(current)) { + ret = -EINTR; + break; + } + } while (1); /* If ret is 1 then we just hit the end of the extent array */ From 9aabf810a67cd97e2d1a48f0bab338b7680f1929 Mon Sep 17 00:00:00 2001 From: Joonsoo Kim Date: Tue, 10 Feb 2015 14:09:32 -0800 Subject: [PATCH 28/78] mm/slub: optimize alloc/free fastpath by removing preemption on/off We had to insert a preempt enable/disable in the fastpath a while ago in order to guarantee that tid and kmem_cache_cpu are retrieved on the same cpu. It is the problem only for CONFIG_PREEMPT in which scheduler can move the process to other cpu during retrieving data. Now, I reach the solution to remove preempt enable/disable in the fastpath. If tid is matched with kmem_cache_cpu's tid after tid and kmem_cache_cpu are retrieved by separate this_cpu operation, it means that they are retrieved on the same cpu. If not matched, we just have to retry it. With this guarantee, preemption enable/disable isn't need at all even if CONFIG_PREEMPT, so this patch removes it. I saw roughly 5% win in a fast-path loop over kmem_cache_alloc/free in CONFIG_PREEMPT. (14.821 ns -> 14.049 ns) Below is the result of Christoph's slab_test reported by Jesper Dangaard Brouer. * Before Single thread testing ===================== 1. Kmalloc: Repeatedly allocate then free test 10000 times kmalloc(8) -> 49 cycles kfree -> 62 cycles 10000 times kmalloc(16) -> 48 cycles kfree -> 64 cycles 10000 times kmalloc(32) -> 53 cycles kfree -> 70 cycles 10000 times kmalloc(64) -> 64 cycles kfree -> 77 cycles 10000 times kmalloc(128) -> 74 cycles kfree -> 84 cycles 10000 times kmalloc(256) -> 84 cycles kfree -> 114 cycles 10000 times kmalloc(512) -> 83 cycles kfree -> 116 cycles 10000 times kmalloc(1024) -> 81 cycles kfree -> 120 cycles 10000 times kmalloc(2048) -> 104 cycles kfree -> 136 cycles 10000 times kmalloc(4096) -> 142 cycles kfree -> 165 cycles 10000 times kmalloc(8192) -> 238 cycles kfree -> 226 cycles 10000 times kmalloc(16384) -> 403 cycles kfree -> 264 cycles 2. Kmalloc: alloc/free test 10000 times kmalloc(8)/kfree -> 68 cycles 10000 times kmalloc(16)/kfree -> 68 cycles 10000 times kmalloc(32)/kfree -> 69 cycles 10000 times kmalloc(64)/kfree -> 68 cycles 10000 times kmalloc(128)/kfree -> 68 cycles 10000 times kmalloc(256)/kfree -> 68 cycles 10000 times kmalloc(512)/kfree -> 74 cycles 10000 times kmalloc(1024)/kfree -> 75 cycles 10000 times kmalloc(2048)/kfree -> 74 cycles 10000 times kmalloc(4096)/kfree -> 74 cycles 10000 times kmalloc(8192)/kfree -> 75 cycles 10000 times kmalloc(16384)/kfree -> 510 cycles * After Single thread testing ===================== 1. Kmalloc: Repeatedly allocate then free test 10000 times kmalloc(8) -> 46 cycles kfree -> 61 cycles 10000 times kmalloc(16) -> 46 cycles kfree -> 63 cycles 10000 times kmalloc(32) -> 49 cycles kfree -> 69 cycles 10000 times kmalloc(64) -> 57 cycles kfree -> 76 cycles 10000 times kmalloc(128) -> 66 cycles kfree -> 83 cycles 10000 times kmalloc(256) -> 84 cycles kfree -> 110 cycles 10000 times kmalloc(512) -> 77 cycles kfree -> 114 cycles 10000 times kmalloc(1024) -> 80 cycles kfree -> 116 cycles 10000 times kmalloc(2048) -> 102 cycles kfree -> 131 cycles 10000 times kmalloc(4096) -> 135 cycles kfree -> 163 cycles 10000 times kmalloc(8192) -> 238 cycles kfree -> 218 cycles 10000 times kmalloc(16384) -> 399 cycles kfree -> 262 cycles 2. Kmalloc: alloc/free test 10000 times kmalloc(8)/kfree -> 65 cycles 10000 times kmalloc(16)/kfree -> 66 cycles 10000 times kmalloc(32)/kfree -> 65 cycles 10000 times kmalloc(64)/kfree -> 66 cycles 10000 times kmalloc(128)/kfree -> 66 cycles 10000 times kmalloc(256)/kfree -> 71 cycles 10000 times kmalloc(512)/kfree -> 72 cycles 10000 times kmalloc(1024)/kfree -> 71 cycles 10000 times kmalloc(2048)/kfree -> 71 cycles 10000 times kmalloc(4096)/kfree -> 71 cycles 10000 times kmalloc(8192)/kfree -> 65 cycles 10000 times kmalloc(16384)/kfree -> 511 cycles Most of the results are better than before. Note that this change slightly worses performance in !CONFIG_PREEMPT, roughly 0.3%. Implementing each case separately would help performance, but, since it's so marginal, I didn't do that. This would help maintanance since we have same code for all cases. Signed-off-by: Joonsoo Kim Acked-by: Christoph Lameter Tested-by: Jesper Dangaard Brouer Acked-by: Jesper Dangaard Brouer Cc: Pekka Enberg Cc: David Rientjes Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/slub.c | 35 +++++++++++++++++++++++------------ 1 file changed, 23 insertions(+), 12 deletions(-) diff --git a/mm/slub.c b/mm/slub.c index fe376fe1f4fe3c..e7ed6f8304f4c2 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -2398,13 +2398,24 @@ static __always_inline void *slab_alloc_node(struct kmem_cache *s, * reading from one cpu area. That does not matter as long * as we end up on the original cpu again when doing the cmpxchg. * - * Preemption is disabled for the retrieval of the tid because that - * must occur from the current processor. We cannot allow rescheduling - * on a different processor between the determination of the pointer - * and the retrieval of the tid. + * We should guarantee that tid and kmem_cache are retrieved on + * the same cpu. It could be different if CONFIG_PREEMPT so we need + * to check if it is matched or not. */ - preempt_disable(); - c = this_cpu_ptr(s->cpu_slab); + do { + tid = this_cpu_read(s->cpu_slab->tid); + c = raw_cpu_ptr(s->cpu_slab); + } while (IS_ENABLED(CONFIG_PREEMPT) && unlikely(tid != c->tid)); + + /* + * Irqless object alloc/free algorithm used here depends on sequence + * of fetching cpu_slab's data. tid should be fetched before anything + * on c to guarantee that object and page associated with previous tid + * won't be used with current tid. If we fetch tid first, object and + * page could be one associated with next tid and our alloc/free + * request will be failed. In this case, we will retry. So, no problem. + */ + barrier(); /* * The transaction ids are globally unique per cpu and per operation on @@ -2412,8 +2423,6 @@ static __always_inline void *slab_alloc_node(struct kmem_cache *s, * occurs on the right processor and that there was no operation on the * linked list in between. */ - tid = c->tid; - preempt_enable(); object = c->freelist; page = c->page; @@ -2659,11 +2668,13 @@ static __always_inline void slab_free(struct kmem_cache *s, * data is retrieved via this pointer. If we are on the same cpu * during the cmpxchg then the free will succedd. */ - preempt_disable(); - c = this_cpu_ptr(s->cpu_slab); + do { + tid = this_cpu_read(s->cpu_slab->tid); + c = raw_cpu_ptr(s->cpu_slab); + } while (IS_ENABLED(CONFIG_PREEMPT) && unlikely(tid != c->tid)); - tid = c->tid; - preempt_enable(); + /* Same with comment on barrier() in slab_alloc_node() */ + barrier(); if (likely(page == c->page)) { set_freepointer(s, object, c->freelist); From ccaafd7fd039aebc9359a9799f8558b01f1c2adc Mon Sep 17 00:00:00 2001 From: Joonsoo Kim Date: Tue, 10 Feb 2015 14:09:35 -0800 Subject: [PATCH 29/78] mm: don't use compound_head() in virt_to_head_page() compound_head() is implemented with assumption that there would be race condition when checking tail flag. This assumption is only true when we try to access arbitrary positioned struct page. The situation that virt_to_head_page() is called is different case. We call virt_to_head_page() only in the range of allocated pages, so there is no race condition on tail flag. In this case, we don't need to handle race condition and we can reduce overhead slightly. This patch implements compound_head_fast() which is similar with compound_head() except tail flag race handling. And then, virt_to_head_page() uses this optimized function to improve performance. I saw 1.8% win in a fast-path loop over kmem_cache_alloc/free, (14.063 ns -> 13.810 ns) if target object is on tail page. Signed-off-by: Joonsoo Kim Acked-by: Christoph Lameter Cc: Pekka Enberg Cc: David Rientjes Cc: Joonsoo Kim Cc: Jesper Dangaard Brouer Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mm.h | 27 ++++++++++++++++++++++++++- 1 file changed, 26 insertions(+), 1 deletion(-) diff --git a/include/linux/mm.h b/include/linux/mm.h index dd5ea3016fc4e8..2c6fd3c5424a68 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -446,6 +446,12 @@ static inline struct page *compound_head_by_tail(struct page *tail) return tail; } +/* + * Since either compound page could be dismantled asynchronously in THP + * or we access asynchronously arbitrary positioned struct page, there + * would be tail flag race. To handle this race, we should call + * smp_rmb() before checking tail flag. compound_head_by_tail() did it. + */ static inline struct page *compound_head(struct page *page) { if (unlikely(PageTail(page))) @@ -453,6 +459,18 @@ static inline struct page *compound_head(struct page *page) return page; } +/* + * If we access compound page synchronously such as access to + * allocated page, there is no need to handle tail flag race, so we can + * check tail flag directly without any synchronization primitive. + */ +static inline struct page *compound_head_fast(struct page *page) +{ + if (unlikely(PageTail(page))) + return page->first_page; + return page; +} + /* * The atomic page->_mapcount, starts from -1: so that transitions * both from it and to it can be tracked, using atomic_inc_and_test @@ -531,7 +549,14 @@ static inline void get_page(struct page *page) static inline struct page *virt_to_head_page(const void *x) { struct page *page = virt_to_page(x); - return compound_head(page); + + /* + * We don't need to worry about synchronization of tail flag + * when we call virt_to_head_page() since it is only called for + * already allocated page and this page won't be freed until + * this virt_to_head_page() is finished. So use _fast variant. + */ + return compound_head_fast(page); } /* From 94e4d712eb28c9a87c40c898af540725f63e68eb Mon Sep 17 00:00:00 2001 From: Kim Phillips Date: Tue, 10 Feb 2015 14:09:37 -0800 Subject: [PATCH 30/78] mm/slub.c: fix typo in comment Signed-off-by: Kim Phillips Acked-by: Christoph Lameter Acked-by: David Rientjes Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/slub.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mm/slub.c b/mm/slub.c index e7ed6f8304f4c2..8b8508adf9c240 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -2521,7 +2521,7 @@ EXPORT_SYMBOL(kmem_cache_alloc_node_trace); #endif /* - * Slow patch handling. This may still be called frequently since objects + * Slow path handling. This may still be called frequently since objects * have a longer lifetime than the cpu slabs in most processing loads. * * So we still attempt to reduce cache line usage. Just take the slab From 7c4da061f2e953df479b126b9263f0e845bce0ec Mon Sep 17 00:00:00 2001 From: Vaishali Thakkar Date: Tue, 10 Feb 2015 14:09:40 -0800 Subject: [PATCH 31/78] mm/slab_common.c: use kmem_cache_free() Here, free memory is allocated using kmem_cache_zalloc. So, use kmem_cache_free instead of kfree. This is done using Coccinelle and semantic patch used is as follows: @@ expression x,E,c; @@ x = \(kmem_cache_alloc\|kmem_cache_zalloc\|kmem_cache_alloc_node\)(c,...) ... when != x = E when != &x ?-kfree(x) +kmem_cache_free(c,x) Signed-off-by: Vaishali Thakkar Acked-by: Christoph Lameter Cc: Pekka Enberg Acked-by: David Rientjes Cc: Joonsoo Kim Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/slab_common.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mm/slab_common.c b/mm/slab_common.c index e03dd6f2a27212..67f182c10f24d2 100644 --- a/mm/slab_common.c +++ b/mm/slab_common.c @@ -331,7 +331,7 @@ do_kmem_cache_create(char *name, size_t object_size, size_t size, size_t align, out_free_cache: memcg_free_cache_params(s); - kfree(s); + kmem_cache_free(kmem_cache, s); goto out; } From 3c4868710951dd7a6b991d71ca5f46737c4acf28 Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Tue, 10 Feb 2015 14:09:43 -0800 Subject: [PATCH 32/78] mm/vmstat.c: fix/cleanup ifdefs CONFIG_COMPACTION=y, CONFIG_DEBUG_FS=n: mm/vmstat.c:690: warning: 'frag_start' defined but not used mm/vmstat.c:702: warning: 'frag_next' defined but not used mm/vmstat.c:710: warning: 'frag_stop' defined but not used mm/vmstat.c:715: warning: 'walk_zones_in_node' defined but not used It's all a bit of a tangly mess and it's unclear why CONFIG_COMPACTION figures in there at all. Move frag_start/frag_next/frag_stop and migratetype_names[] into the existing CONFIG_PROC_FS block. walk_zones_in_node() gets a special ifdef. Also move the #include lines up to where #include lines live. [axel.lin@ingics.com: fix build error when !CONFIG_PROC_FS] Signed-off-by: Axel Lin Cc: Mel Gorman Cc: Joonsoo Kim Cc: Mel Gorman Cc: Joonsoo Kim Tested-by: David Rientjes Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/vmstat.c | 124 ++++++++++++++++++++++++++-------------------------- 1 file changed, 62 insertions(+), 62 deletions(-) diff --git a/mm/vmstat.c b/mm/vmstat.c index 1284f89fca082a..9943e5fd74e622 100644 --- a/mm/vmstat.c +++ b/mm/vmstat.c @@ -17,6 +17,9 @@ #include #include #include +#include +#include +#include #include #include #include @@ -670,66 +673,6 @@ int fragmentation_index(struct zone *zone, unsigned int order) } #endif -#if defined(CONFIG_PROC_FS) || defined(CONFIG_COMPACTION) -#include -#include - -static char * const migratetype_names[MIGRATE_TYPES] = { - "Unmovable", - "Reclaimable", - "Movable", - "Reserve", -#ifdef CONFIG_CMA - "CMA", -#endif -#ifdef CONFIG_MEMORY_ISOLATION - "Isolate", -#endif -}; - -static void *frag_start(struct seq_file *m, loff_t *pos) -{ - pg_data_t *pgdat; - loff_t node = *pos; - for (pgdat = first_online_pgdat(); - pgdat && node; - pgdat = next_online_pgdat(pgdat)) - --node; - - return pgdat; -} - -static void *frag_next(struct seq_file *m, void *arg, loff_t *pos) -{ - pg_data_t *pgdat = (pg_data_t *)arg; - - (*pos)++; - return next_online_pgdat(pgdat); -} - -static void frag_stop(struct seq_file *m, void *arg) -{ -} - -/* Walk all the zones in a node and print using a callback */ -static void walk_zones_in_node(struct seq_file *m, pg_data_t *pgdat, - void (*print)(struct seq_file *m, pg_data_t *, struct zone *)) -{ - struct zone *zone; - struct zone *node_zones = pgdat->node_zones; - unsigned long flags; - - for (zone = node_zones; zone - node_zones < MAX_NR_ZONES; ++zone) { - if (!populated_zone(zone)) - continue; - - spin_lock_irqsave(&zone->lock, flags); - print(m, pgdat, zone); - spin_unlock_irqrestore(&zone->lock, flags); - } -} -#endif - #if defined(CONFIG_PROC_FS) || defined(CONFIG_SYSFS) || defined(CONFIG_NUMA) #ifdef CONFIG_ZONE_DMA #define TEXT_FOR_DMA(xx) xx "_dma", @@ -907,7 +850,66 @@ const char * const vmstat_text[] = { #endif /* CONFIG_PROC_FS || CONFIG_SYSFS || CONFIG_NUMA */ +#if (defined(CONFIG_DEBUG_FS) && defined(CONFIG_COMPACTION)) || \ + defined(CONFIG_PROC_FS) +static void *frag_start(struct seq_file *m, loff_t *pos) +{ + pg_data_t *pgdat; + loff_t node = *pos; + + for (pgdat = first_online_pgdat(); + pgdat && node; + pgdat = next_online_pgdat(pgdat)) + --node; + + return pgdat; +} + +static void *frag_next(struct seq_file *m, void *arg, loff_t *pos) +{ + pg_data_t *pgdat = (pg_data_t *)arg; + + (*pos)++; + return next_online_pgdat(pgdat); +} + +static void frag_stop(struct seq_file *m, void *arg) +{ +} + +/* Walk all the zones in a node and print using a callback */ +static void walk_zones_in_node(struct seq_file *m, pg_data_t *pgdat, + void (*print)(struct seq_file *m, pg_data_t *, struct zone *)) +{ + struct zone *zone; + struct zone *node_zones = pgdat->node_zones; + unsigned long flags; + + for (zone = node_zones; zone - node_zones < MAX_NR_ZONES; ++zone) { + if (!populated_zone(zone)) + continue; + + spin_lock_irqsave(&zone->lock, flags); + print(m, pgdat, zone); + spin_unlock_irqrestore(&zone->lock, flags); + } +} +#endif + #ifdef CONFIG_PROC_FS +static char * const migratetype_names[MIGRATE_TYPES] = { + "Unmovable", + "Reclaimable", + "Movable", + "Reserve", +#ifdef CONFIG_CMA + "CMA", +#endif +#ifdef CONFIG_MEMORY_ISOLATION + "Isolate", +#endif +}; + static void frag_show_print(struct seq_file *m, pg_data_t *pgdat, struct zone *zone) { @@ -1536,8 +1538,6 @@ static int __init setup_vmstat(void) module_init(setup_vmstat) #if defined(CONFIG_DEBUG_FS) && defined(CONFIG_COMPACTION) -#include - /* * Return an index indicating how much of the available free memory is From c8d78c1823f46519473949d33f0d1d33fe21ea16 Mon Sep 17 00:00:00 2001 From: "Kirill A. Shutemov" Date: Tue, 10 Feb 2015 14:09:46 -0800 Subject: [PATCH 33/78] mm: replace remap_file_pages() syscall with emulation remap_file_pages(2) was invented to be able efficiently map parts of huge file into limited 32-bit virtual address space such as in database workloads. Nonlinear mappings are pain to support and it seems there's no legitimate use-cases nowadays since 64-bit systems are widely available. Let's drop it and get rid of all these special-cased code. The patch replaces the syscall with emulation which creates new VMA on each remap_file_pages(), unless they it can be merged with an adjacent one. I didn't find *any* real code that uses remap_file_pages(2) to test emulation impact on. I've checked Debian code search and source of all packages in ALT Linux. No real users: libc wrappers, mentions in strace, gdb, valgrind and this kind of stuff. There are few basic tests in LTP for the syscall. They work just fine with emulation. To test performance impact, I've written small test case which demonstrate pretty much worst case scenario: map 4G shmfs file, write to begin of every page pgoff of the page, remap pages in reverse order, read every page. The test creates 1 million of VMAs if emulation is in use, so I had to set vm.max_map_count to 1100000 to avoid -ENOMEM. Before: 23.3 ( +- 4.31% ) seconds After: 43.9 ( +- 0.85% ) seconds Slowdown: 1.88x I believe we can live with that. Test case: #define _GNU_SOURCE #include #include #include #include #define MB (1024UL * 1024) #define SIZE (4096 * MB) int main(int argc, char **argv) { unsigned long *p; long i, pass; for (pass = 0; pass < 10; pass++) { p = mmap(NULL, SIZE, PROT_READ|PROT_WRITE, MAP_SHARED | MAP_ANONYMOUS, -1, 0); if (p == MAP_FAILED) { perror("mmap"); return -1; } for (i = 0; i < SIZE / 4096; i++) p[i * 4096 / sizeof(*p)] = i; for (i = 0; i < SIZE / 4096; i++) { if (remap_file_pages(p + i * 4096 / sizeof(*p), 4096, 0, (SIZE - 4096 * (i + 1)) >> 12, 0)) { perror("remap_file_pages"); return -1; } } for (i = SIZE / 4096 - 1; i >= 0; i--) assert(p[i * 4096 / sizeof(*p)] == SIZE / 4096 - i - 1); munmap(p, SIZE); } return 0; } [akpm@linux-foundation.org: fix spello] [sasha.levin@oracle.com: initialize populate before usage] [sasha.levin@oracle.com: grab file ref to prevent race while mmaping] Signed-off-by: "Kirill A. Shutemov" Cc: Peter Zijlstra Cc: Ingo Molnar Cc: Dave Jones Cc: Linus Torvalds Cc: Armin Rigo Signed-off-by: Sasha Levin Cc: Hugh Dickins Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- Documentation/vm/remap_file_pages.txt | 7 +- include/linux/fs.h | 8 +- mm/Makefile | 2 +- mm/fremap.c | 283 -------------------------- mm/mmap.c | 69 +++++++ mm/nommu.c | 8 - 6 files changed, 79 insertions(+), 298 deletions(-) delete mode 100644 mm/fremap.c diff --git a/Documentation/vm/remap_file_pages.txt b/Documentation/vm/remap_file_pages.txt index 560e4363a55d9d..f609142f406a1a 100644 --- a/Documentation/vm/remap_file_pages.txt +++ b/Documentation/vm/remap_file_pages.txt @@ -18,10 +18,9 @@ on 32-bit systems to map files bigger than can linearly fit into 32-bit virtual address space. This use-case is not critical anymore since 64-bit systems are widely available. -The plan is to deprecate the syscall and replace it with an emulation. -The emulation will create new VMAs instead of nonlinear mappings. It's -going to work slower for rare users of remap_file_pages() but ABI is -preserved. +The syscall is deprecated and replaced it with an emulation now. The +emulation creates new VMAs instead of nonlinear mappings. It's going to +work slower for rare users of remap_file_pages() but ABI is preserved. One side effect of emulation (apart from performance) is that user can hit vm.max_map_count limit more easily due to additional VMAs. See comment for diff --git a/include/linux/fs.h b/include/linux/fs.h index 42efe13077b6c1..60c4996df7f3be 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -2481,8 +2481,12 @@ extern int sb_min_blocksize(struct super_block *, int); extern int generic_file_mmap(struct file *, struct vm_area_struct *); extern int generic_file_readonly_mmap(struct file *, struct vm_area_struct *); -extern int generic_file_remap_pages(struct vm_area_struct *, unsigned long addr, - unsigned long size, pgoff_t pgoff); +static inline int generic_file_remap_pages(struct vm_area_struct *vma, + unsigned long addr, unsigned long size, pgoff_t pgoff) +{ + BUG(); + return 0; +} int generic_write_checks(struct file *file, loff_t *pos, size_t *count, int isblk); extern ssize_t generic_file_read_iter(struct kiocb *, struct iov_iter *); extern ssize_t __generic_file_write_iter(struct kiocb *, struct iov_iter *); diff --git a/mm/Makefile b/mm/Makefile index 4bf586e663783a..3548460ab7b65b 100644 --- a/mm/Makefile +++ b/mm/Makefile @@ -3,7 +3,7 @@ # mmu-y := nommu.o -mmu-$(CONFIG_MMU) := fremap.o gup.o highmem.o memory.o mincore.o \ +mmu-$(CONFIG_MMU) := gup.o highmem.o memory.o mincore.o \ mlock.o mmap.o mprotect.o mremap.o msync.o rmap.o \ vmalloc.o pagewalk.o pgtable-generic.o diff --git a/mm/fremap.c b/mm/fremap.c deleted file mode 100644 index 2805d71cf47644..00000000000000 --- a/mm/fremap.c +++ /dev/null @@ -1,283 +0,0 @@ -/* - * linux/mm/fremap.c - * - * Explicit pagetable population and nonlinear (random) mappings support. - * - * started by Ingo Molnar, Copyright (C) 2002, 2003 - */ -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include -#include -#include - -#include "internal.h" - -static int mm_counter(struct page *page) -{ - return PageAnon(page) ? MM_ANONPAGES : MM_FILEPAGES; -} - -static void zap_pte(struct mm_struct *mm, struct vm_area_struct *vma, - unsigned long addr, pte_t *ptep) -{ - pte_t pte = *ptep; - struct page *page; - swp_entry_t entry; - - if (pte_present(pte)) { - flush_cache_page(vma, addr, pte_pfn(pte)); - pte = ptep_clear_flush_notify(vma, addr, ptep); - page = vm_normal_page(vma, addr, pte); - if (page) { - if (pte_dirty(pte)) - set_page_dirty(page); - update_hiwater_rss(mm); - dec_mm_counter(mm, mm_counter(page)); - page_remove_rmap(page); - page_cache_release(page); - } - } else { /* zap_pte() is not called when pte_none() */ - if (!pte_file(pte)) { - update_hiwater_rss(mm); - entry = pte_to_swp_entry(pte); - if (non_swap_entry(entry)) { - if (is_migration_entry(entry)) { - page = migration_entry_to_page(entry); - dec_mm_counter(mm, mm_counter(page)); - } - } else { - free_swap_and_cache(entry); - dec_mm_counter(mm, MM_SWAPENTS); - } - } - pte_clear_not_present_full(mm, addr, ptep, 0); - } -} - -/* - * Install a file pte to a given virtual memory address, release any - * previously existing mapping. - */ -static int install_file_pte(struct mm_struct *mm, struct vm_area_struct *vma, - unsigned long addr, unsigned long pgoff, pgprot_t prot) -{ - int err = -ENOMEM; - pte_t *pte, ptfile; - spinlock_t *ptl; - - pte = get_locked_pte(mm, addr, &ptl); - if (!pte) - goto out; - - ptfile = pgoff_to_pte(pgoff); - - if (!pte_none(*pte)) - zap_pte(mm, vma, addr, pte); - - set_pte_at(mm, addr, pte, pte_file_mksoft_dirty(ptfile)); - /* - * We don't need to run update_mmu_cache() here because the "file pte" - * being installed by install_file_pte() is not a real pte - it's a - * non-present entry (like a swap entry), noting what file offset should - * be mapped there when there's a fault (in a non-linear vma where - * that's not obvious). - */ - pte_unmap_unlock(pte, ptl); - err = 0; -out: - return err; -} - -int generic_file_remap_pages(struct vm_area_struct *vma, unsigned long addr, - unsigned long size, pgoff_t pgoff) -{ - struct mm_struct *mm = vma->vm_mm; - int err; - - do { - err = install_file_pte(mm, vma, addr, pgoff, vma->vm_page_prot); - if (err) - return err; - - size -= PAGE_SIZE; - addr += PAGE_SIZE; - pgoff++; - } while (size); - - return 0; -} -EXPORT_SYMBOL(generic_file_remap_pages); - -/** - * sys_remap_file_pages - remap arbitrary pages of an existing VM_SHARED vma - * @start: start of the remapped virtual memory range - * @size: size of the remapped virtual memory range - * @prot: new protection bits of the range (see NOTE) - * @pgoff: to-be-mapped page of the backing store file - * @flags: 0 or MAP_NONBLOCKED - the later will cause no IO. - * - * sys_remap_file_pages remaps arbitrary pages of an existing VM_SHARED vma - * (shared backing store file). - * - * This syscall works purely via pagetables, so it's the most efficient - * way to map the same (large) file into a given virtual window. Unlike - * mmap()/mremap() it does not create any new vmas. The new mappings are - * also safe across swapout. - * - * NOTE: the @prot parameter right now is ignored (but must be zero), - * and the vma's default protection is used. Arbitrary protections - * might be implemented in the future. - */ -SYSCALL_DEFINE5(remap_file_pages, unsigned long, start, unsigned long, size, - unsigned long, prot, unsigned long, pgoff, unsigned long, flags) -{ - struct mm_struct *mm = current->mm; - struct address_space *mapping; - struct vm_area_struct *vma; - int err = -EINVAL; - int has_write_lock = 0; - vm_flags_t vm_flags = 0; - - pr_warn_once("%s (%d) uses deprecated remap_file_pages() syscall. " - "See Documentation/vm/remap_file_pages.txt.\n", - current->comm, current->pid); - - if (prot) - return err; - /* - * Sanitize the syscall parameters: - */ - start = start & PAGE_MASK; - size = size & PAGE_MASK; - - /* Does the address range wrap, or is the span zero-sized? */ - if (start + size <= start) - return err; - - /* Does pgoff wrap? */ - if (pgoff + (size >> PAGE_SHIFT) < pgoff) - return err; - - /* Can we represent this offset inside this architecture's pte's? */ -#if PTE_FILE_MAX_BITS < BITS_PER_LONG - if (pgoff + (size >> PAGE_SHIFT) >= (1UL << PTE_FILE_MAX_BITS)) - return err; -#endif - - /* We need down_write() to change vma->vm_flags. */ - down_read(&mm->mmap_sem); - retry: - vma = find_vma(mm, start); - - /* - * Make sure the vma is shared, that it supports prefaulting, - * and that the remapped range is valid and fully within - * the single existing vma. - */ - if (!vma || !(vma->vm_flags & VM_SHARED)) - goto out; - - if (!vma->vm_ops || !vma->vm_ops->remap_pages) - goto out; - - if (start < vma->vm_start || start + size > vma->vm_end) - goto out; - - /* Must set VM_NONLINEAR before any pages are populated. */ - if (!(vma->vm_flags & VM_NONLINEAR)) { - /* - * vm_private_data is used as a swapout cursor - * in a VM_NONLINEAR vma. - */ - if (vma->vm_private_data) - goto out; - - /* Don't need a nonlinear mapping, exit success */ - if (pgoff == linear_page_index(vma, start)) { - err = 0; - goto out; - } - - if (!has_write_lock) { -get_write_lock: - up_read(&mm->mmap_sem); - down_write(&mm->mmap_sem); - has_write_lock = 1; - goto retry; - } - mapping = vma->vm_file->f_mapping; - /* - * page_mkclean doesn't work on nonlinear vmas, so if - * dirty pages need to be accounted, emulate with linear - * vmas. - */ - if (mapping_cap_account_dirty(mapping)) { - unsigned long addr; - struct file *file = get_file(vma->vm_file); - /* mmap_region may free vma; grab the info now */ - vm_flags = vma->vm_flags; - - addr = mmap_region(file, start, size, vm_flags, pgoff); - fput(file); - if (IS_ERR_VALUE(addr)) { - err = addr; - } else { - BUG_ON(addr != start); - err = 0; - } - goto out_freed; - } - i_mmap_lock_write(mapping); - flush_dcache_mmap_lock(mapping); - vma->vm_flags |= VM_NONLINEAR; - vma_interval_tree_remove(vma, &mapping->i_mmap); - vma_nonlinear_insert(vma, &mapping->i_mmap_nonlinear); - flush_dcache_mmap_unlock(mapping); - i_mmap_unlock_write(mapping); - } - - if (vma->vm_flags & VM_LOCKED) { - /* - * drop PG_Mlocked flag for over-mapped range - */ - if (!has_write_lock) - goto get_write_lock; - vm_flags = vma->vm_flags; - munlock_vma_pages_range(vma, start, start + size); - vma->vm_flags = vm_flags; - } - - mmu_notifier_invalidate_range_start(mm, start, start + size); - err = vma->vm_ops->remap_pages(vma, start, size, pgoff); - mmu_notifier_invalidate_range_end(mm, start, start + size); - - /* - * We can't clear VM_NONLINEAR because we'd have to do - * it after ->populate completes, and that would prevent - * downgrading the lock. (Locks can't be upgraded). - */ - -out: - if (vma) - vm_flags = vma->vm_flags; -out_freed: - if (likely(!has_write_lock)) - up_read(&mm->mmap_sem); - else - up_write(&mm->mmap_sem); - if (!err && ((vm_flags & VM_LOCKED) || !(flags & MAP_NONBLOCK))) - mm_populate(start, size); - - return err; -} diff --git a/mm/mmap.c b/mm/mmap.c index 7f684d5a808738..e023dc5e59a821 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -2634,6 +2634,75 @@ SYSCALL_DEFINE2(munmap, unsigned long, addr, size_t, len) return vm_munmap(addr, len); } + +/* + * Emulation of deprecated remap_file_pages() syscall. + */ +SYSCALL_DEFINE5(remap_file_pages, unsigned long, start, unsigned long, size, + unsigned long, prot, unsigned long, pgoff, unsigned long, flags) +{ + + struct mm_struct *mm = current->mm; + struct vm_area_struct *vma; + unsigned long populate = 0; + unsigned long ret = -EINVAL; + struct file *file; + + pr_warn_once("%s (%d) uses deprecated remap_file_pages() syscall. " + "See Documentation/vm/remap_file_pages.txt.\n", + current->comm, current->pid); + + if (prot) + return ret; + start = start & PAGE_MASK; + size = size & PAGE_MASK; + + if (start + size <= start) + return ret; + + /* Does pgoff wrap? */ + if (pgoff + (size >> PAGE_SHIFT) < pgoff) + return ret; + + down_write(&mm->mmap_sem); + vma = find_vma(mm, start); + + if (!vma || !(vma->vm_flags & VM_SHARED)) + goto out; + + if (start < vma->vm_start || start + size > vma->vm_end) + goto out; + + if (pgoff == linear_page_index(vma, start)) { + ret = 0; + goto out; + } + + prot |= vma->vm_flags & VM_READ ? PROT_READ : 0; + prot |= vma->vm_flags & VM_WRITE ? PROT_WRITE : 0; + prot |= vma->vm_flags & VM_EXEC ? PROT_EXEC : 0; + + flags &= MAP_NONBLOCK; + flags |= MAP_SHARED | MAP_FIXED | MAP_POPULATE; + if (vma->vm_flags & VM_LOCKED) { + flags |= MAP_LOCKED; + /* drop PG_Mlocked flag for over-mapped range */ + munlock_vma_pages_range(vma, start, start + size); + } + + file = get_file(vma->vm_file); + ret = do_mmap_pgoff(vma->vm_file, start, size, + prot, flags, pgoff, &populate); + fput(file); +out: + up_write(&mm->mmap_sem); + if (populate) + mm_populate(ret, populate); + if (!IS_ERR_VALUE(ret)) + ret = 0; + return ret; +} + static inline void verify_mm_writelocked(struct mm_struct *mm) { #ifdef CONFIG_DEBUG_VM diff --git a/mm/nommu.c b/mm/nommu.c index 28bd8c4dff6feb..541bed64e34870 100644 --- a/mm/nommu.c +++ b/mm/nommu.c @@ -1984,14 +1984,6 @@ void filemap_map_pages(struct vm_area_struct *vma, struct vm_fault *vmf) } EXPORT_SYMBOL(filemap_map_pages); -int generic_file_remap_pages(struct vm_area_struct *vma, unsigned long addr, - unsigned long size, pgoff_t pgoff) -{ - BUG(); - return 0; -} -EXPORT_SYMBOL(generic_file_remap_pages); - static int __access_remote_vm(struct task_struct *tsk, struct mm_struct *mm, unsigned long addr, void *buf, int len, int write) { From 8a5f14a23177061ec11daeaa3d09d0765d785c47 Mon Sep 17 00:00:00 2001 From: "Kirill A. Shutemov" Date: Tue, 10 Feb 2015 14:09:49 -0800 Subject: [PATCH 34/78] mm: drop support of non-linear mapping from unmap/zap codepath We have remap_file_pages(2) emulation in -mm tree for few release cycles and we plan to have it mainline in v3.20. This patchset removes rest of VM_NONLINEAR infrastructure. Patches 1-8 take care about generic code. They are pretty straight-forward and can be applied without other of patches. Rest patches removes pte_file()-related stuff from architecture-specific code. It usually frees up one bit in non-present pte. I've tried to reuse that bit for swap offset, where I was able to figure out how to do that. For obvious reason I cannot test all that arch-specific code and would like to see acks from maintainers. In total, remap_file_pages(2) required about 1.4K lines of not-so-trivial kernel code. That's too much for functionality nobody uses. Tested-by: Felipe Balbi This patch (of 38): We don't create non-linear mappings anymore. Let's drop code which handles them on unmap/zap. Signed-off-by: Kirill A. Shutemov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mm.h | 1 - mm/madvise.c | 9 +---- mm/memory.c | 82 ++++++++++++---------------------------------- 3 files changed, 22 insertions(+), 70 deletions(-) diff --git a/include/linux/mm.h b/include/linux/mm.h index 2c6fd3c5424a68..600ef5ed469822 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -1146,7 +1146,6 @@ extern void user_shm_unlock(size_t, struct user_struct *); * Parameter block passed down to zap_pte_range in exceptional cases. */ struct zap_details { - struct vm_area_struct *nonlinear_vma; /* Check page->index if set */ struct address_space *check_mapping; /* Check page->mapping if set */ pgoff_t first_index; /* Lowest page->index to unmap */ pgoff_t last_index; /* Highest page->index to unmap */ diff --git a/mm/madvise.c b/mm/madvise.c index a271adc93289f7..917754d26c17cd 100644 --- a/mm/madvise.c +++ b/mm/madvise.c @@ -278,14 +278,7 @@ static long madvise_dontneed(struct vm_area_struct *vma, if (vma->vm_flags & (VM_LOCKED|VM_HUGETLB|VM_PFNMAP)) return -EINVAL; - if (unlikely(vma->vm_flags & VM_NONLINEAR)) { - struct zap_details details = { - .nonlinear_vma = vma, - .last_index = ULONG_MAX, - }; - zap_page_range(vma, start, end - start, &details); - } else - zap_page_range(vma, start, end - start, NULL); + zap_page_range(vma, start, end - start, NULL); return 0; } diff --git a/mm/memory.c b/mm/memory.c index 2c3536cc6c6327..9a3e73b69dad87 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -1082,6 +1082,7 @@ static unsigned long zap_pte_range(struct mmu_gather *tlb, spinlock_t *ptl; pte_t *start_pte; pte_t *pte; + swp_entry_t entry; again: init_rss_vec(rss); @@ -1107,28 +1108,12 @@ static unsigned long zap_pte_range(struct mmu_gather *tlb, if (details->check_mapping && details->check_mapping != page->mapping) continue; - /* - * Each page->index must be checked when - * invalidating or truncating nonlinear. - */ - if (details->nonlinear_vma && - (page->index < details->first_index || - page->index > details->last_index)) - continue; } ptent = ptep_get_and_clear_full(mm, addr, pte, tlb->fullmm); tlb_remove_tlb_entry(tlb, pte, addr); if (unlikely(!page)) continue; - if (unlikely(details) && details->nonlinear_vma - && linear_page_index(details->nonlinear_vma, - addr) != page->index) { - pte_t ptfile = pgoff_to_pte(page->index); - if (pte_soft_dirty(ptent)) - ptfile = pte_file_mksoft_dirty(ptfile); - set_pte_at(mm, addr, pte, ptfile); - } if (PageAnon(page)) rss[MM_ANONPAGES]--; else { @@ -1151,33 +1136,25 @@ static unsigned long zap_pte_range(struct mmu_gather *tlb, } continue; } - /* - * If details->check_mapping, we leave swap entries; - * if details->nonlinear_vma, we leave file entries. - */ + /* If details->check_mapping, we leave swap entries. */ if (unlikely(details)) continue; - if (pte_file(ptent)) { - if (unlikely(!(vma->vm_flags & VM_NONLINEAR))) - print_bad_pte(vma, addr, ptent, NULL); - } else { - swp_entry_t entry = pte_to_swp_entry(ptent); - if (!non_swap_entry(entry)) - rss[MM_SWAPENTS]--; - else if (is_migration_entry(entry)) { - struct page *page; + entry = pte_to_swp_entry(ptent); + if (!non_swap_entry(entry)) + rss[MM_SWAPENTS]--; + else if (is_migration_entry(entry)) { + struct page *page; - page = migration_entry_to_page(entry); + page = migration_entry_to_page(entry); - if (PageAnon(page)) - rss[MM_ANONPAGES]--; - else - rss[MM_FILEPAGES]--; - } - if (unlikely(!free_swap_and_cache(entry))) - print_bad_pte(vma, addr, ptent, NULL); + if (PageAnon(page)) + rss[MM_ANONPAGES]--; + else + rss[MM_FILEPAGES]--; } + if (unlikely(!free_swap_and_cache(entry))) + print_bad_pte(vma, addr, ptent, NULL); pte_clear_not_present_full(mm, addr, pte, tlb->fullmm); } while (pte++, addr += PAGE_SIZE, addr != end); @@ -1277,7 +1254,7 @@ static void unmap_page_range(struct mmu_gather *tlb, pgd_t *pgd; unsigned long next; - if (details && !details->check_mapping && !details->nonlinear_vma) + if (details && !details->check_mapping) details = NULL; BUG_ON(addr >= end); @@ -1371,7 +1348,7 @@ void unmap_vmas(struct mmu_gather *tlb, * @vma: vm_area_struct holding the applicable pages * @start: starting address of pages to zap * @size: number of bytes to zap - * @details: details of nonlinear truncation or shared cache invalidation + * @details: details of shared cache invalidation * * Caller must protect the VMA list */ @@ -1397,7 +1374,7 @@ void zap_page_range(struct vm_area_struct *vma, unsigned long start, * @vma: vm_area_struct holding the applicable pages * @address: starting address of pages to zap * @size: number of bytes to zap - * @details: details of nonlinear truncation or shared cache invalidation + * @details: details of shared cache invalidation * * The range must fit into one VMA. */ @@ -2331,25 +2308,11 @@ static inline void unmap_mapping_range_tree(struct rb_root *root, } } -static inline void unmap_mapping_range_list(struct list_head *head, - struct zap_details *details) -{ - struct vm_area_struct *vma; - - /* - * In nonlinear VMAs there is no correspondence between virtual address - * offset and file offset. So we must perform an exhaustive search - * across *all* the pages in each nonlinear VMA, not just the pages - * whose virtual address lies outside the file truncation point. - */ - list_for_each_entry(vma, head, shared.nonlinear) { - details->nonlinear_vma = vma; - unmap_mapping_range_vma(vma, vma->vm_start, vma->vm_end, details); - } -} - /** - * unmap_mapping_range - unmap the portion of all mmaps in the specified address_space corresponding to the specified page range in the underlying file. + * unmap_mapping_range - unmap the portion of all mmaps in the specified + * address_space corresponding to the specified page range in the underlying + * file. + * * @mapping: the address space containing mmaps to be unmapped. * @holebegin: byte in first page to unmap, relative to the start of * the underlying file. This will be rounded down to a PAGE_SIZE @@ -2378,7 +2341,6 @@ void unmap_mapping_range(struct address_space *mapping, } details.check_mapping = even_cows? NULL: mapping; - details.nonlinear_vma = NULL; details.first_index = hba; details.last_index = hba + hlen - 1; if (details.last_index < details.first_index) @@ -2388,8 +2350,6 @@ void unmap_mapping_range(struct address_space *mapping, i_mmap_lock_write(mapping); if (unlikely(!RB_EMPTY_ROOT(&mapping->i_mmap))) unmap_mapping_range_tree(&mapping->i_mmap, &details); - if (unlikely(!list_empty(&mapping->i_mmap_nonlinear))) - unmap_mapping_range_list(&mapping->i_mmap_nonlinear, &details); i_mmap_unlock_write(mapping); } EXPORT_SYMBOL(unmap_mapping_range); From 9b4bdd2ffab9557ac43af7dff02e7dab1c8c58bd Mon Sep 17 00:00:00 2001 From: "Kirill A. Shutemov" Date: Tue, 10 Feb 2015 14:09:51 -0800 Subject: [PATCH 35/78] mm: drop support of non-linear mapping from fault codepath We don't create non-linear mappings anymore. Let's drop code which handles them on page fault. Signed-off-by: Kirill A. Shutemov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mm.h | 16 +++++------- mm/memory.c | 65 +++++++--------------------------------------- 2 files changed, 16 insertions(+), 65 deletions(-) diff --git a/include/linux/mm.h b/include/linux/mm.h index 600ef5ed469822..376e5c325dee02 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -206,21 +206,19 @@ extern unsigned int kobjsize(const void *objp); extern pgprot_t protection_map[16]; #define FAULT_FLAG_WRITE 0x01 /* Fault was a write access */ -#define FAULT_FLAG_NONLINEAR 0x02 /* Fault was via a nonlinear mapping */ -#define FAULT_FLAG_MKWRITE 0x04 /* Fault was mkwrite of existing pte */ -#define FAULT_FLAG_ALLOW_RETRY 0x08 /* Retry fault if blocking */ -#define FAULT_FLAG_RETRY_NOWAIT 0x10 /* Don't drop mmap_sem and wait when retrying */ -#define FAULT_FLAG_KILLABLE 0x20 /* The fault task is in SIGKILL killable region */ -#define FAULT_FLAG_TRIED 0x40 /* second try */ -#define FAULT_FLAG_USER 0x80 /* The fault originated in userspace */ +#define FAULT_FLAG_MKWRITE 0x02 /* Fault was mkwrite of existing pte */ +#define FAULT_FLAG_ALLOW_RETRY 0x04 /* Retry fault if blocking */ +#define FAULT_FLAG_RETRY_NOWAIT 0x08 /* Don't drop mmap_sem and wait when retrying */ +#define FAULT_FLAG_KILLABLE 0x10 /* The fault task is in SIGKILL killable region */ +#define FAULT_FLAG_TRIED 0x20 /* Second try */ +#define FAULT_FLAG_USER 0x40 /* The fault originated in userspace */ /* * vm_fault is filled by the the pagefault handler and passed to the vma's * ->fault function. The vma's ->fault is responsible for returning a bitmask * of VM_FAULT_xxx flags that give details about how the fault was handled. * - * pgoff should be used in favour of virtual_address, if possible. If pgoff - * is used, one may implement ->remap_pages to get nonlinear mapping support. + * pgoff should be used in favour of virtual_address, if possible. */ struct vm_fault { unsigned int flags; /* FAULT_FLAG_xxx flags */ diff --git a/mm/memory.c b/mm/memory.c index 9a3e73b69dad87..43a53743cbb453 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -1899,12 +1899,11 @@ int apply_to_page_range(struct mm_struct *mm, unsigned long addr, EXPORT_SYMBOL_GPL(apply_to_page_range); /* - * handle_pte_fault chooses page fault handler according to an entry - * which was read non-atomically. Before making any commitment, on - * those architectures or configurations (e.g. i386 with PAE) which - * might give a mix of unmatched parts, do_swap_page and do_nonlinear_fault - * must check under lock before unmapping the pte and proceeding - * (but do_wp_page is only called after already making such a check; + * handle_pte_fault chooses page fault handler according to an entry which was + * read non-atomically. Before making any commitment, on those architectures + * or configurations (e.g. i386 with PAE) which might give a mix of unmatched + * parts, do_swap_page must check under lock before unmapping the pte and + * proceeding (but do_wp_page is only called after already making such a check; * and do_anonymous_page can safely check later on). */ static inline int pte_unmap_same(struct mm_struct *mm, pmd_t *pmd, @@ -2710,8 +2709,6 @@ void do_set_pte(struct vm_area_struct *vma, unsigned long address, entry = mk_pte(page, vma->vm_page_prot); if (write) entry = maybe_mkwrite(pte_mkdirty(entry), vma); - else if (pte_file(*pte) && pte_file_soft_dirty(*pte)) - entry = pte_mksoft_dirty(entry); if (anon) { inc_mm_counter_fast(vma->vm_mm, MM_ANONPAGES); page_add_new_anon_rmap(page, vma, address); @@ -2846,8 +2843,7 @@ static int do_read_fault(struct mm_struct *mm, struct vm_area_struct *vma, * if page by the offset is not ready to be mapped (cold cache or * something). */ - if (vma->vm_ops->map_pages && !(flags & FAULT_FLAG_NONLINEAR) && - fault_around_bytes >> PAGE_SHIFT > 1) { + if (vma->vm_ops->map_pages && fault_around_bytes >> PAGE_SHIFT > 1) { pte = pte_offset_map_lock(mm, pmd, address, &ptl); do_fault_around(vma, address, pte, pgoff, flags); if (!pte_same(*pte, orig_pte)) @@ -2992,7 +2988,7 @@ static int do_shared_fault(struct mm_struct *mm, struct vm_area_struct *vma, * The mmap_sem may have been released depending on flags and our * return value. See filemap_fault() and __lock_page_or_retry(). */ -static int do_linear_fault(struct mm_struct *mm, struct vm_area_struct *vma, +static int do_fault(struct mm_struct *mm, struct vm_area_struct *vma, unsigned long address, pte_t *page_table, pmd_t *pmd, unsigned int flags, pte_t orig_pte) { @@ -3009,46 +3005,6 @@ static int do_linear_fault(struct mm_struct *mm, struct vm_area_struct *vma, return do_shared_fault(mm, vma, address, pmd, pgoff, flags, orig_pte); } -/* - * Fault of a previously existing named mapping. Repopulate the pte - * from the encoded file_pte if possible. This enables swappable - * nonlinear vmas. - * - * We enter with non-exclusive mmap_sem (to exclude vma changes, - * but allow concurrent faults), and pte mapped but not yet locked. - * We return with pte unmapped and unlocked. - * The mmap_sem may have been released depending on flags and our - * return value. See filemap_fault() and __lock_page_or_retry(). - */ -static int do_nonlinear_fault(struct mm_struct *mm, struct vm_area_struct *vma, - unsigned long address, pte_t *page_table, pmd_t *pmd, - unsigned int flags, pte_t orig_pte) -{ - pgoff_t pgoff; - - flags |= FAULT_FLAG_NONLINEAR; - - if (!pte_unmap_same(mm, pmd, page_table, orig_pte)) - return 0; - - if (unlikely(!(vma->vm_flags & VM_NONLINEAR))) { - /* - * Page table corrupted: show pte and kill process. - */ - print_bad_pte(vma, address, orig_pte, NULL); - return VM_FAULT_SIGBUS; - } - - pgoff = pte_to_pgoff(orig_pte); - if (!(flags & FAULT_FLAG_WRITE)) - return do_read_fault(mm, vma, address, pmd, pgoff, flags, - orig_pte); - if (!(vma->vm_flags & VM_SHARED)) - return do_cow_fault(mm, vma, address, pmd, pgoff, flags, - orig_pte); - return do_shared_fault(mm, vma, address, pmd, pgoff, flags, orig_pte); -} - static int numa_migrate_prep(struct page *page, struct vm_area_struct *vma, unsigned long addr, int page_nid, int *flags) @@ -3176,15 +3132,12 @@ static int handle_pte_fault(struct mm_struct *mm, if (pte_none(entry)) { if (vma->vm_ops) { if (likely(vma->vm_ops->fault)) - return do_linear_fault(mm, vma, address, - pte, pmd, flags, entry); + return do_fault(mm, vma, address, pte, + pmd, flags, entry); } return do_anonymous_page(mm, vma, address, pte, pmd, flags); } - if (pte_file(entry)) - return do_nonlinear_fault(mm, vma, address, - pte, pmd, flags, entry); return do_swap_page(mm, vma, address, pte, pmd, flags, entry); } From d83a08db5ba6072caa658745881f4baa9bad6a08 Mon Sep 17 00:00:00 2001 From: "Kirill A. Shutemov" Date: Tue, 10 Feb 2015 14:09:54 -0800 Subject: [PATCH 36/78] mm: drop vm_ops->remap_pages and generic_file_remap_pages() stub Nobody uses it anymore. [akpm@linux-foundation.org: fix filemap_xip.c] Signed-off-by: Kirill A. Shutemov Cc: Wu Fengguang Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/9p/vfs_file.c | 2 -- fs/btrfs/file.c | 1 - fs/ceph/addr.c | 1 - fs/cifs/file.c | 1 - fs/ext4/file.c | 1 - fs/f2fs/file.c | 1 - fs/fuse/file.c | 1 - fs/gfs2/file.c | 1 - fs/nfs/file.c | 1 - fs/nilfs2/file.c | 1 - fs/ocfs2/mmap.c | 1 - fs/ubifs/file.c | 1 - fs/xfs/xfs_file.c | 1 - include/linux/fs.h | 6 ------ include/linux/mm.h | 3 --- mm/filemap.c | 1 - mm/filemap_xip.c | 1 - mm/shmem.c | 1 - 18 files changed, 26 deletions(-) diff --git a/fs/9p/vfs_file.c b/fs/9p/vfs_file.c index 5594505e6e730e..b40133796b8734 100644 --- a/fs/9p/vfs_file.c +++ b/fs/9p/vfs_file.c @@ -831,7 +831,6 @@ static const struct vm_operations_struct v9fs_file_vm_ops = { .fault = filemap_fault, .map_pages = filemap_map_pages, .page_mkwrite = v9fs_vm_page_mkwrite, - .remap_pages = generic_file_remap_pages, }; static const struct vm_operations_struct v9fs_mmap_file_vm_ops = { @@ -839,7 +838,6 @@ static const struct vm_operations_struct v9fs_mmap_file_vm_ops = { .fault = filemap_fault, .map_pages = filemap_map_pages, .page_mkwrite = v9fs_vm_page_mkwrite, - .remap_pages = generic_file_remap_pages, }; diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c index e4090259569bcc..a606ab551296e1 100644 --- a/fs/btrfs/file.c +++ b/fs/btrfs/file.c @@ -2081,7 +2081,6 @@ static const struct vm_operations_struct btrfs_file_vm_ops = { .fault = filemap_fault, .map_pages = filemap_map_pages, .page_mkwrite = btrfs_page_mkwrite, - .remap_pages = generic_file_remap_pages, }; static int btrfs_file_mmap(struct file *filp, struct vm_area_struct *vma) diff --git a/fs/ceph/addr.c b/fs/ceph/addr.c index c81c0e004588b9..24be059fd1f8e0 100644 --- a/fs/ceph/addr.c +++ b/fs/ceph/addr.c @@ -1569,7 +1569,6 @@ int ceph_uninline_data(struct file *filp, struct page *locked_page) static struct vm_operations_struct ceph_vmops = { .fault = ceph_filemap_fault, .page_mkwrite = ceph_page_mkwrite, - .remap_pages = generic_file_remap_pages, }; int ceph_mmap(struct file *file, struct vm_area_struct *vma) diff --git a/fs/cifs/file.c b/fs/cifs/file.c index 74f12877493ac6..294ff302a2370f 100644 --- a/fs/cifs/file.c +++ b/fs/cifs/file.c @@ -3248,7 +3248,6 @@ static struct vm_operations_struct cifs_file_vm_ops = { .fault = filemap_fault, .map_pages = filemap_map_pages, .page_mkwrite = cifs_page_mkwrite, - .remap_pages = generic_file_remap_pages, }; int cifs_file_strict_mmap(struct file *file, struct vm_area_struct *vma) diff --git a/fs/ext4/file.c b/fs/ext4/file.c index 8131be8c0af316..7cb59238612119 100644 --- a/fs/ext4/file.c +++ b/fs/ext4/file.c @@ -195,7 +195,6 @@ static const struct vm_operations_struct ext4_file_vm_ops = { .fault = filemap_fault, .map_pages = filemap_map_pages, .page_mkwrite = ext4_page_mkwrite, - .remap_pages = generic_file_remap_pages, }; static int ext4_file_mmap(struct file *file, struct vm_area_struct *vma) diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c index 3c27e0ecb3bcf1..5674ba13102be9 100644 --- a/fs/f2fs/file.c +++ b/fs/f2fs/file.c @@ -92,7 +92,6 @@ static const struct vm_operations_struct f2fs_file_vm_ops = { .fault = filemap_fault, .map_pages = filemap_map_pages, .page_mkwrite = f2fs_vm_page_mkwrite, - .remap_pages = generic_file_remap_pages, }; static int get_parent_ino(struct inode *inode, nid_t *pino) diff --git a/fs/fuse/file.c b/fs/fuse/file.c index 760b2c55219741..d769e594855b0a 100644 --- a/fs/fuse/file.c +++ b/fs/fuse/file.c @@ -2062,7 +2062,6 @@ static const struct vm_operations_struct fuse_file_vm_ops = { .fault = filemap_fault, .map_pages = filemap_map_pages, .page_mkwrite = fuse_page_mkwrite, - .remap_pages = generic_file_remap_pages, }; static int fuse_file_mmap(struct file *file, struct vm_area_struct *vma) diff --git a/fs/gfs2/file.c b/fs/gfs2/file.c index 6e600abf694a20..ec9c2d33477a84 100644 --- a/fs/gfs2/file.c +++ b/fs/gfs2/file.c @@ -498,7 +498,6 @@ static const struct vm_operations_struct gfs2_vm_ops = { .fault = filemap_fault, .map_pages = filemap_map_pages, .page_mkwrite = gfs2_page_mkwrite, - .remap_pages = generic_file_remap_pages, }; /** diff --git a/fs/nfs/file.c b/fs/nfs/file.c index 2ab6f00dba5bd6..94712fc781fa58 100644 --- a/fs/nfs/file.c +++ b/fs/nfs/file.c @@ -646,7 +646,6 @@ static const struct vm_operations_struct nfs_file_vm_ops = { .fault = filemap_fault, .map_pages = filemap_map_pages, .page_mkwrite = nfs_vm_page_mkwrite, - .remap_pages = generic_file_remap_pages, }; static int nfs_need_sync_write(struct file *filp, struct inode *inode) diff --git a/fs/nilfs2/file.c b/fs/nilfs2/file.c index 3a03e0aea1fb6b..a8c728acb7a809 100644 --- a/fs/nilfs2/file.c +++ b/fs/nilfs2/file.c @@ -128,7 +128,6 @@ static const struct vm_operations_struct nilfs_file_vm_ops = { .fault = filemap_fault, .map_pages = filemap_map_pages, .page_mkwrite = nilfs_page_mkwrite, - .remap_pages = generic_file_remap_pages, }; static int nilfs_file_mmap(struct file *file, struct vm_area_struct *vma) diff --git a/fs/ocfs2/mmap.c b/fs/ocfs2/mmap.c index 10d66c75cecbae..9581d190f6e123 100644 --- a/fs/ocfs2/mmap.c +++ b/fs/ocfs2/mmap.c @@ -173,7 +173,6 @@ static int ocfs2_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf) static const struct vm_operations_struct ocfs2_file_vm_ops = { .fault = ocfs2_fault, .page_mkwrite = ocfs2_page_mkwrite, - .remap_pages = generic_file_remap_pages, }; int ocfs2_mmap(struct file *file, struct vm_area_struct *vma) diff --git a/fs/ubifs/file.c b/fs/ubifs/file.c index 538519ee37d939..035e5101144475 100644 --- a/fs/ubifs/file.c +++ b/fs/ubifs/file.c @@ -1536,7 +1536,6 @@ static const struct vm_operations_struct ubifs_file_vm_ops = { .fault = filemap_fault, .map_pages = filemap_map_pages, .page_mkwrite = ubifs_vm_page_mkwrite, - .remap_pages = generic_file_remap_pages, }; static int ubifs_file_mmap(struct file *file, struct vm_area_struct *vma) diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c index 13e974e6a889fa..ac7f1e8f92b35c 100644 --- a/fs/xfs/xfs_file.c +++ b/fs/xfs/xfs_file.c @@ -1384,5 +1384,4 @@ static const struct vm_operations_struct xfs_file_vm_ops = { .fault = filemap_fault, .map_pages = filemap_map_pages, .page_mkwrite = xfs_vm_page_mkwrite, - .remap_pages = generic_file_remap_pages, }; diff --git a/include/linux/fs.h b/include/linux/fs.h index 60c4996df7f3be..47f557c7ef7eda 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -2481,12 +2481,6 @@ extern int sb_min_blocksize(struct super_block *, int); extern int generic_file_mmap(struct file *, struct vm_area_struct *); extern int generic_file_readonly_mmap(struct file *, struct vm_area_struct *); -static inline int generic_file_remap_pages(struct vm_area_struct *vma, - unsigned long addr, unsigned long size, pgoff_t pgoff) -{ - BUG(); - return 0; -} int generic_write_checks(struct file *file, loff_t *pos, size_t *count, int isblk); extern ssize_t generic_file_read_iter(struct kiocb *, struct iov_iter *); extern ssize_t __generic_file_write_iter(struct kiocb *, struct iov_iter *); diff --git a/include/linux/mm.h b/include/linux/mm.h index 376e5c325dee02..2ddd9d1d6268af 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -285,9 +285,6 @@ struct vm_operations_struct { struct mempolicy *(*get_policy)(struct vm_area_struct *vma, unsigned long addr); #endif - /* called by sys_remap_file_pages() to populate non-linear mapping */ - int (*remap_pages)(struct vm_area_struct *vma, unsigned long addr, - unsigned long size, pgoff_t pgoff); }; struct mmu_gather; diff --git a/mm/filemap.c b/mm/filemap.c index 673e4581a2e541..bf7a2714270413 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -2087,7 +2087,6 @@ const struct vm_operations_struct generic_file_vm_ops = { .fault = filemap_fault, .map_pages = filemap_map_pages, .page_mkwrite = filemap_page_mkwrite, - .remap_pages = generic_file_remap_pages, }; /* This is used for a general mmap of a disk file */ diff --git a/mm/filemap_xip.c b/mm/filemap_xip.c index 0d105aeff82fe2..70c09da1a4198c 100644 --- a/mm/filemap_xip.c +++ b/mm/filemap_xip.c @@ -301,7 +301,6 @@ static int xip_file_fault(struct vm_area_struct *vma, struct vm_fault *vmf) static const struct vm_operations_struct xip_file_vm_ops = { .fault = xip_file_fault, .page_mkwrite = filemap_page_mkwrite, - .remap_pages = generic_file_remap_pages, }; int xip_file_mmap(struct file * file, struct vm_area_struct * vma) diff --git a/mm/shmem.c b/mm/shmem.c index 993e6ba689ccd4..b3e4031819815b 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -3201,7 +3201,6 @@ static const struct vm_operations_struct shmem_vm_ops = { .set_policy = shmem_set_policy, .get_policy = shmem_get_policy, #endif - .remap_pages = generic_file_remap_pages, }; static struct dentry *shmem_mount(struct file_system_type *fs_type, From 1da4b35b001481df99a6dcab12d5d39a876f7056 Mon Sep 17 00:00:00 2001 From: "Kirill A. Shutemov" Date: Tue, 10 Feb 2015 14:09:57 -0800 Subject: [PATCH 37/78] proc: drop handling non-linear mappings We have to handle non-linear mappings for /proc/PID/{smaps,clear_refs} which is unused now. Let's drop it. Signed-off-by: Kirill A. Shutemov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/proc/task_mmu.c | 16 ---------------- 1 file changed, 16 deletions(-) diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c index 246eae84b13b1b..6396f88c668738 100644 --- a/fs/proc/task_mmu.c +++ b/fs/proc/task_mmu.c @@ -443,7 +443,6 @@ struct mem_size_stats { unsigned long anonymous; unsigned long anonymous_thp; unsigned long swap; - unsigned long nonlinear; u64 pss; }; @@ -484,7 +483,6 @@ static void smaps_pte_entry(pte_t *pte, unsigned long addr, { struct mem_size_stats *mss = walk->private; struct vm_area_struct *vma = mss->vma; - pgoff_t pgoff = linear_page_index(vma, addr); struct page *page = NULL; if (pte_present(*pte)) { @@ -496,17 +494,10 @@ static void smaps_pte_entry(pte_t *pte, unsigned long addr, mss->swap += PAGE_SIZE; else if (is_migration_entry(swpent)) page = migration_entry_to_page(swpent); - } else if (pte_file(*pte)) { - if (pte_to_pgoff(*pte) != pgoff) - mss->nonlinear += PAGE_SIZE; } if (!page) return; - - if (page->index != pgoff) - mss->nonlinear += PAGE_SIZE; - smaps_account(mss, page, PAGE_SIZE, pte_young(*pte), pte_dirty(*pte)); } @@ -596,7 +587,6 @@ static void show_smap_vma_flags(struct seq_file *m, struct vm_area_struct *vma) [ilog2(VM_ACCOUNT)] = "ac", [ilog2(VM_NORESERVE)] = "nr", [ilog2(VM_HUGETLB)] = "ht", - [ilog2(VM_NONLINEAR)] = "nl", [ilog2(VM_ARCH_1)] = "ar", [ilog2(VM_DONTDUMP)] = "dd", #ifdef CONFIG_MEM_SOFT_DIRTY @@ -668,10 +658,6 @@ static int show_smap(struct seq_file *m, void *v, int is_pid) (vma->vm_flags & VM_LOCKED) ? (unsigned long)(mss.pss >> (10 + PSS_SHIFT)) : 0); - if (vma->vm_flags & VM_NONLINEAR) - seq_printf(m, "Nonlinear: %8lu kB\n", - mss.nonlinear >> 10); - show_smap_vma_flags(m, vma); m_cache_vma(m, vma); return 0; @@ -772,8 +758,6 @@ static inline void clear_soft_dirty(struct vm_area_struct *vma, ptent = pte_clear_flags(ptent, _PAGE_SOFT_DIRTY); } else if (is_swap_pte(ptent)) { ptent = pte_swp_clear_soft_dirty(ptent); - } else if (pte_file(ptent)) { - ptent = pte_file_clear_soft_dirty(ptent); } set_pte_at(vma->vm_mm, addr, pte, ptent); From 27ba0644ea9dfe6e7693abc85837b60e40583b96 Mon Sep 17 00:00:00 2001 From: "Kirill A. Shutemov" Date: Tue, 10 Feb 2015 14:09:59 -0800 Subject: [PATCH 38/78] rmap: drop support of non-linear mappings We don't create non-linear mappings anymore. Let's drop code which handles them in rmap. Signed-off-by: Kirill A. Shutemov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- Documentation/cachetlb.txt | 8 +- fs/inode.c | 1 - include/linux/fs.h | 4 +- include/linux/mm.h | 6 - include/linux/mm_types.h | 4 +- include/linux/rmap.h | 2 - kernel/fork.c | 8 +- mm/migrate.c | 32 ------ mm/mmap.c | 24 ++-- mm/rmap.c | 225 +------------------------------------ mm/swap.c | 4 +- 11 files changed, 18 insertions(+), 300 deletions(-) diff --git a/Documentation/cachetlb.txt b/Documentation/cachetlb.txt index d79b008e4a3289..3f9f808b51198b 100644 --- a/Documentation/cachetlb.txt +++ b/Documentation/cachetlb.txt @@ -317,10 +317,10 @@ maps this page at its virtual address. about doing this. The idea is, first at flush_dcache_page() time, if - page->mapping->i_mmap is an empty tree and ->i_mmap_nonlinear - an empty list, just mark the architecture private page flag bit. - Later, in update_mmu_cache(), a check is made of this flag bit, - and if set the flush is done and the flag bit is cleared. + page->mapping->i_mmap is an empty tree, just mark the architecture + private page flag bit. Later, in update_mmu_cache(), a check is + made of this flag bit, and if set the flush is done and the flag + bit is cleared. IMPORTANT NOTE: It is often important, if you defer the flush, that the actual flush occurs on the same CPU diff --git a/fs/inode.c b/fs/inode.c index aa149e7262acff..c760fac33c92e5 100644 --- a/fs/inode.c +++ b/fs/inode.c @@ -355,7 +355,6 @@ void address_space_init_once(struct address_space *mapping) INIT_LIST_HEAD(&mapping->private_list); spin_lock_init(&mapping->private_lock); mapping->i_mmap = RB_ROOT; - INIT_LIST_HEAD(&mapping->i_mmap_nonlinear); } EXPORT_SYMBOL(address_space_init_once); diff --git a/include/linux/fs.h b/include/linux/fs.h index 47f557c7ef7eda..60acab20970156 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -401,7 +401,6 @@ struct address_space { spinlock_t tree_lock; /* and lock protecting it */ atomic_t i_mmap_writable;/* count VM_SHARED mappings */ struct rb_root i_mmap; /* tree of private and shared mappings */ - struct list_head i_mmap_nonlinear;/*list VM_NONLINEAR mappings */ struct rw_semaphore i_mmap_rwsem; /* protect tree, count, list */ /* Protected by tree_lock together with the radix tree */ unsigned long nrpages; /* number of total pages */ @@ -493,8 +492,7 @@ static inline void i_mmap_unlock_read(struct address_space *mapping) */ static inline int mapping_mapped(struct address_space *mapping) { - return !RB_EMPTY_ROOT(&mapping->i_mmap) || - !list_empty(&mapping->i_mmap_nonlinear); + return !RB_EMPTY_ROOT(&mapping->i_mmap); } /* diff --git a/include/linux/mm.h b/include/linux/mm.h index 2ddd9d1d6268af..18391eec486417 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -1796,12 +1796,6 @@ struct vm_area_struct *vma_interval_tree_iter_next(struct vm_area_struct *node, for (vma = vma_interval_tree_iter_first(root, start, last); \ vma; vma = vma_interval_tree_iter_next(vma, start, last)) -static inline void vma_nonlinear_insert(struct vm_area_struct *vma, - struct list_head *list) -{ - list_add_tail(&vma->shared.nonlinear, list); -} - void anon_vma_interval_tree_insert(struct anon_vma_chain *node, struct rb_root *root); void anon_vma_interval_tree_remove(struct anon_vma_chain *node, diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index 6d34aa266a8ce9..3b1d20fb084879 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -273,15 +273,13 @@ struct vm_area_struct { /* * For areas with an address space and backing store, - * linkage into the address_space->i_mmap interval tree, or - * linkage of vma in the address_space->i_mmap_nonlinear list. + * linkage into the address_space->i_mmap interval tree. */ union { struct { struct rb_node rb; unsigned long rb_subtree_last; } linear; - struct list_head nonlinear; } shared; /* diff --git a/include/linux/rmap.h b/include/linux/rmap.h index d9d7e7e56352a8..b38f559130d5a2 100644 --- a/include/linux/rmap.h +++ b/include/linux/rmap.h @@ -246,7 +246,6 @@ int page_mapped_in_vma(struct page *page, struct vm_area_struct *vma); * arg: passed to rmap_one() and invalid_vma() * rmap_one: executed on each vma where page is mapped * done: for checking traversing termination condition - * file_nonlinear: for handling file nonlinear mapping * anon_lock: for getting anon_lock by optimized way rather than default * invalid_vma: for skipping uninterested vma */ @@ -255,7 +254,6 @@ struct rmap_walk_control { int (*rmap_one)(struct page *page, struct vm_area_struct *vma, unsigned long addr, void *arg); int (*done)(struct page *page); - int (*file_nonlinear)(struct page *, struct address_space *, void *arg); struct anon_vma *(*anon_lock)(struct page *page); bool (*invalid_vma)(struct vm_area_struct *vma, void *arg); }; diff --git a/kernel/fork.c b/kernel/fork.c index 4dc2ddade9f1f2..b379d9abddc74c 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -438,12 +438,8 @@ static int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm) atomic_inc(&mapping->i_mmap_writable); flush_dcache_mmap_lock(mapping); /* insert tmp into the share list, just after mpnt */ - if (unlikely(tmp->vm_flags & VM_NONLINEAR)) - vma_nonlinear_insert(tmp, - &mapping->i_mmap_nonlinear); - else - vma_interval_tree_insert_after(tmp, mpnt, - &mapping->i_mmap); + vma_interval_tree_insert_after(tmp, mpnt, + &mapping->i_mmap); flush_dcache_mmap_unlock(mapping); i_mmap_unlock_write(mapping); } diff --git a/mm/migrate.c b/mm/migrate.c index 344cdf692fc806..6e284bcca8bbfb 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -178,37 +178,6 @@ static int remove_migration_pte(struct page *new, struct vm_area_struct *vma, return SWAP_AGAIN; } -/* - * Congratulations to trinity for discovering this bug. - * mm/fremap.c's remap_file_pages() accepts any range within a single vma to - * convert that vma to VM_NONLINEAR; and generic_file_remap_pages() will then - * replace the specified range by file ptes throughout (maybe populated after). - * If page migration finds a page within that range, while it's still located - * by vma_interval_tree rather than lost to i_mmap_nonlinear list, no problem: - * zap_pte() clears the temporary migration entry before mmap_sem is dropped. - * But if the migrating page is in a part of the vma outside the range to be - * remapped, then it will not be cleared, and remove_migration_ptes() needs to - * deal with it. Fortunately, this part of the vma is of course still linear, - * so we just need to use linear location on the nonlinear list. - */ -static int remove_linear_migration_ptes_from_nonlinear(struct page *page, - struct address_space *mapping, void *arg) -{ - struct vm_area_struct *vma; - /* hugetlbfs does not support remap_pages, so no huge pgoff worries */ - pgoff_t pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT); - unsigned long addr; - - list_for_each_entry(vma, - &mapping->i_mmap_nonlinear, shared.nonlinear) { - - addr = vma->vm_start + ((pgoff - vma->vm_pgoff) << PAGE_SHIFT); - if (addr >= vma->vm_start && addr < vma->vm_end) - remove_migration_pte(page, vma, addr, arg); - } - return SWAP_AGAIN; -} - /* * Get rid of all migration entries and replace them by * references to the indicated page. @@ -218,7 +187,6 @@ static void remove_migration_ptes(struct page *old, struct page *new) struct rmap_walk_control rwc = { .rmap_one = remove_migration_pte, .arg = old, - .file_nonlinear = remove_linear_migration_ptes_from_nonlinear, }; rmap_walk(new, &rwc); diff --git a/mm/mmap.c b/mm/mmap.c index e023dc5e59a821..14d84666e8ba6d 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -243,10 +243,7 @@ static void __remove_shared_vm_struct(struct vm_area_struct *vma, mapping_unmap_writable(mapping); flush_dcache_mmap_lock(mapping); - if (unlikely(vma->vm_flags & VM_NONLINEAR)) - list_del_init(&vma->shared.nonlinear); - else - vma_interval_tree_remove(vma, &mapping->i_mmap); + vma_interval_tree_remove(vma, &mapping->i_mmap); flush_dcache_mmap_unlock(mapping); } @@ -649,10 +646,7 @@ static void __vma_link_file(struct vm_area_struct *vma) atomic_inc(&mapping->i_mmap_writable); flush_dcache_mmap_lock(mapping); - if (unlikely(vma->vm_flags & VM_NONLINEAR)) - vma_nonlinear_insert(vma, &mapping->i_mmap_nonlinear); - else - vma_interval_tree_insert(vma, &mapping->i_mmap); + vma_interval_tree_insert(vma, &mapping->i_mmap); flush_dcache_mmap_unlock(mapping); } } @@ -789,14 +783,11 @@ again: remove_next = 1 + (end > next->vm_end); if (file) { mapping = file->f_mapping; - if (!(vma->vm_flags & VM_NONLINEAR)) { - root = &mapping->i_mmap; - uprobe_munmap(vma, vma->vm_start, vma->vm_end); + root = &mapping->i_mmap; + uprobe_munmap(vma, vma->vm_start, vma->vm_end); - if (adjust_next) - uprobe_munmap(next, next->vm_start, - next->vm_end); - } + if (adjust_next) + uprobe_munmap(next, next->vm_start, next->vm_end); i_mmap_lock_write(mapping); if (insert) { @@ -3177,8 +3168,7 @@ static void vm_lock_mapping(struct mm_struct *mm, struct address_space *mapping) * * mmap_sem in write mode is required in order to block all operations * that could modify pagetables and free pages without need of - * altering the vma layout (for example populate_range() with - * nonlinear vmas). It's also needed in write mode to avoid new + * altering the vma layout. It's also needed in write mode to avoid new * anon_vmas to be associated with existing vmas. * * A single task can't take more than one mm_take_all_locks() in a row diff --git a/mm/rmap.c b/mm/rmap.c index 71cd5bd0c17d76..70b32498d4f24e 100644 --- a/mm/rmap.c +++ b/mm/rmap.c @@ -590,9 +590,8 @@ unsigned long page_address_in_vma(struct page *page, struct vm_area_struct *vma) if (!vma->anon_vma || !page__anon_vma || vma->anon_vma->root != page__anon_vma->root) return -EFAULT; - } else if (page->mapping && !(vma->vm_flags & VM_NONLINEAR)) { - if (!vma->vm_file || - vma->vm_file->f_mapping != page->mapping) + } else if (page->mapping) { + if (!vma->vm_file || vma->vm_file->f_mapping != page->mapping) return -EFAULT; } else return -EFAULT; @@ -1274,7 +1273,6 @@ static int try_to_unmap_one(struct page *page, struct vm_area_struct *vma, if (pte_soft_dirty(pteval)) swp_pte = pte_swp_mksoft_dirty(swp_pte); set_pte_at(mm, address, pte, swp_pte); - BUG_ON(pte_file(*pte)); } else if (IS_ENABLED(CONFIG_MIGRATION) && (flags & TTU_MIGRATION)) { /* Establish migration entry for a file page */ @@ -1316,211 +1314,6 @@ static int try_to_unmap_one(struct page *page, struct vm_area_struct *vma, return ret; } -/* - * objrmap doesn't work for nonlinear VMAs because the assumption that - * offset-into-file correlates with offset-into-virtual-addresses does not hold. - * Consequently, given a particular page and its ->index, we cannot locate the - * ptes which are mapping that page without an exhaustive linear search. - * - * So what this code does is a mini "virtual scan" of each nonlinear VMA which - * maps the file to which the target page belongs. The ->vm_private_data field - * holds the current cursor into that scan. Successive searches will circulate - * around the vma's virtual address space. - * - * So as more replacement pressure is applied to the pages in a nonlinear VMA, - * more scanning pressure is placed against them as well. Eventually pages - * will become fully unmapped and are eligible for eviction. - * - * For very sparsely populated VMAs this is a little inefficient - chances are - * there there won't be many ptes located within the scan cluster. In this case - * maybe we could scan further - to the end of the pte page, perhaps. - * - * Mlocked pages: check VM_LOCKED under mmap_sem held for read, if we can - * acquire it without blocking. If vma locked, mlock the pages in the cluster, - * rather than unmapping them. If we encounter the "check_page" that vmscan is - * trying to unmap, return SWAP_MLOCK, else default SWAP_AGAIN. - */ -#define CLUSTER_SIZE min(32*PAGE_SIZE, PMD_SIZE) -#define CLUSTER_MASK (~(CLUSTER_SIZE - 1)) - -static int try_to_unmap_cluster(unsigned long cursor, unsigned int *mapcount, - struct vm_area_struct *vma, struct page *check_page) -{ - struct mm_struct *mm = vma->vm_mm; - pmd_t *pmd; - pte_t *pte; - pte_t pteval; - spinlock_t *ptl; - struct page *page; - unsigned long address; - unsigned long mmun_start; /* For mmu_notifiers */ - unsigned long mmun_end; /* For mmu_notifiers */ - unsigned long end; - int ret = SWAP_AGAIN; - int locked_vma = 0; - - address = (vma->vm_start + cursor) & CLUSTER_MASK; - end = address + CLUSTER_SIZE; - if (address < vma->vm_start) - address = vma->vm_start; - if (end > vma->vm_end) - end = vma->vm_end; - - pmd = mm_find_pmd(mm, address); - if (!pmd) - return ret; - - mmun_start = address; - mmun_end = end; - mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end); - - /* - * If we can acquire the mmap_sem for read, and vma is VM_LOCKED, - * keep the sem while scanning the cluster for mlocking pages. - */ - if (down_read_trylock(&vma->vm_mm->mmap_sem)) { - locked_vma = (vma->vm_flags & VM_LOCKED); - if (!locked_vma) - up_read(&vma->vm_mm->mmap_sem); /* don't need it */ - } - - pte = pte_offset_map_lock(mm, pmd, address, &ptl); - - /* Update high watermark before we lower rss */ - update_hiwater_rss(mm); - - for (; address < end; pte++, address += PAGE_SIZE) { - if (!pte_present(*pte)) - continue; - page = vm_normal_page(vma, address, *pte); - BUG_ON(!page || PageAnon(page)); - - if (locked_vma) { - if (page == check_page) { - /* we know we have check_page locked */ - mlock_vma_page(page); - ret = SWAP_MLOCK; - } else if (trylock_page(page)) { - /* - * If we can lock the page, perform mlock. - * Otherwise leave the page alone, it will be - * eventually encountered again later. - */ - mlock_vma_page(page); - unlock_page(page); - } - continue; /* don't unmap */ - } - - /* - * No need for _notify because we're within an - * mmu_notifier_invalidate_range_ {start|end} scope. - */ - if (ptep_clear_flush_young(vma, address, pte)) - continue; - - /* Nuke the page table entry. */ - flush_cache_page(vma, address, pte_pfn(*pte)); - pteval = ptep_clear_flush_notify(vma, address, pte); - - /* If nonlinear, store the file page offset in the pte. */ - if (page->index != linear_page_index(vma, address)) { - pte_t ptfile = pgoff_to_pte(page->index); - if (pte_soft_dirty(pteval)) - ptfile = pte_file_mksoft_dirty(ptfile); - set_pte_at(mm, address, pte, ptfile); - } - - /* Move the dirty bit to the physical page now the pte is gone. */ - if (pte_dirty(pteval)) - set_page_dirty(page); - - page_remove_rmap(page); - page_cache_release(page); - dec_mm_counter(mm, MM_FILEPAGES); - (*mapcount)--; - } - pte_unmap_unlock(pte - 1, ptl); - mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end); - if (locked_vma) - up_read(&vma->vm_mm->mmap_sem); - return ret; -} - -static int try_to_unmap_nonlinear(struct page *page, - struct address_space *mapping, void *arg) -{ - struct vm_area_struct *vma; - int ret = SWAP_AGAIN; - unsigned long cursor; - unsigned long max_nl_cursor = 0; - unsigned long max_nl_size = 0; - unsigned int mapcount; - - list_for_each_entry(vma, - &mapping->i_mmap_nonlinear, shared.nonlinear) { - - cursor = (unsigned long) vma->vm_private_data; - if (cursor > max_nl_cursor) - max_nl_cursor = cursor; - cursor = vma->vm_end - vma->vm_start; - if (cursor > max_nl_size) - max_nl_size = cursor; - } - - if (max_nl_size == 0) { /* all nonlinears locked or reserved ? */ - return SWAP_FAIL; - } - - /* - * We don't try to search for this page in the nonlinear vmas, - * and page_referenced wouldn't have found it anyway. Instead - * just walk the nonlinear vmas trying to age and unmap some. - * The mapcount of the page we came in with is irrelevant, - * but even so use it as a guide to how hard we should try? - */ - mapcount = page_mapcount(page); - if (!mapcount) - return ret; - - cond_resched(); - - max_nl_size = (max_nl_size + CLUSTER_SIZE - 1) & CLUSTER_MASK; - if (max_nl_cursor == 0) - max_nl_cursor = CLUSTER_SIZE; - - do { - list_for_each_entry(vma, - &mapping->i_mmap_nonlinear, shared.nonlinear) { - - cursor = (unsigned long) vma->vm_private_data; - while (cursor < max_nl_cursor && - cursor < vma->vm_end - vma->vm_start) { - if (try_to_unmap_cluster(cursor, &mapcount, - vma, page) == SWAP_MLOCK) - ret = SWAP_MLOCK; - cursor += CLUSTER_SIZE; - vma->vm_private_data = (void *) cursor; - if ((int)mapcount <= 0) - return ret; - } - vma->vm_private_data = (void *) max_nl_cursor; - } - cond_resched(); - max_nl_cursor += CLUSTER_SIZE; - } while (max_nl_cursor <= max_nl_size); - - /* - * Don't loop forever (perhaps all the remaining pages are - * in locked vmas). Reset cursor on all unreserved nonlinear - * vmas, now forgetting on which ones it had fallen behind. - */ - list_for_each_entry(vma, &mapping->i_mmap_nonlinear, shared.nonlinear) - vma->vm_private_data = NULL; - - return ret; -} - bool is_vma_temporary_stack(struct vm_area_struct *vma) { int maybe_stack = vma->vm_flags & (VM_GROWSDOWN | VM_GROWSUP); @@ -1566,7 +1359,6 @@ int try_to_unmap(struct page *page, enum ttu_flags flags) .rmap_one = try_to_unmap_one, .arg = (void *)flags, .done = page_not_mapped, - .file_nonlinear = try_to_unmap_nonlinear, .anon_lock = page_lock_anon_vma_read, }; @@ -1612,12 +1404,6 @@ int try_to_munlock(struct page *page) .rmap_one = try_to_unmap_one, .arg = (void *)TTU_MUNLOCK, .done = page_not_mapped, - /* - * We don't bother to try to find the munlocked page in - * nonlinears. It's costly. Instead, later, page reclaim logic - * may call try_to_unmap() and recover PG_mlocked lazily. - */ - .file_nonlinear = NULL, .anon_lock = page_lock_anon_vma_read, }; @@ -1748,13 +1534,6 @@ static int rmap_walk_file(struct page *page, struct rmap_walk_control *rwc) goto done; } - if (!rwc->file_nonlinear) - goto done; - - if (list_empty(&mapping->i_mmap_nonlinear)) - goto done; - - ret = rwc->file_nonlinear(page, mapping, rwc->arg); done: i_mmap_unlock_read(mapping); return ret; diff --git a/mm/swap.c b/mm/swap.c index 8a12b33936b45c..5b3087228b99c2 100644 --- a/mm/swap.c +++ b/mm/swap.c @@ -1140,10 +1140,8 @@ void __init swap_setup(void) if (bdi_init(swapper_spaces[0].backing_dev_info)) panic("Failed to init swap bdi"); - for (i = 0; i < MAX_SWAPFILES; i++) { + for (i = 0; i < MAX_SWAPFILES; i++) spin_lock_init(&swapper_spaces[i].tree_lock); - INIT_LIST_HEAD(&swapper_spaces[i].i_mmap_nonlinear); - } #endif /* Use a smaller cluster for small-memory machines */ From ac51b934f3912582d3c897c6c4d09b32ea57b2c7 Mon Sep 17 00:00:00 2001 From: "Kirill A. Shutemov" Date: Tue, 10 Feb 2015 14:10:02 -0800 Subject: [PATCH 39/78] mm: replace vma->sharead.linear with vma->shared After removing vma->shared.nonlinear we have only one member of vma->shared union, which doesn't make much sense. This patch drops the union and move struct vma->shared.linear to vma->shared. Signed-off-by: Kirill A. Shutemov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mm_types.h | 8 +++----- mm/interval_tree.c | 34 +++++++++++++++++----------------- 2 files changed, 20 insertions(+), 22 deletions(-) diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index 3b1d20fb084879..07c8bd3f7b48b9 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -275,11 +275,9 @@ struct vm_area_struct { * For areas with an address space and backing store, * linkage into the address_space->i_mmap interval tree. */ - union { - struct { - struct rb_node rb; - unsigned long rb_subtree_last; - } linear; + struct { + struct rb_node rb; + unsigned long rb_subtree_last; } shared; /* diff --git a/mm/interval_tree.c b/mm/interval_tree.c index 8da581fa906093..f2c2492681bf14 100644 --- a/mm/interval_tree.c +++ b/mm/interval_tree.c @@ -21,8 +21,8 @@ static inline unsigned long vma_last_pgoff(struct vm_area_struct *v) return v->vm_pgoff + ((v->vm_end - v->vm_start) >> PAGE_SHIFT) - 1; } -INTERVAL_TREE_DEFINE(struct vm_area_struct, shared.linear.rb, - unsigned long, shared.linear.rb_subtree_last, +INTERVAL_TREE_DEFINE(struct vm_area_struct, shared.rb, + unsigned long, shared.rb_subtree_last, vma_start_pgoff, vma_last_pgoff,, vma_interval_tree) /* Insert node immediately after prev in the interval tree */ @@ -36,26 +36,26 @@ void vma_interval_tree_insert_after(struct vm_area_struct *node, VM_BUG_ON_VMA(vma_start_pgoff(node) != vma_start_pgoff(prev), node); - if (!prev->shared.linear.rb.rb_right) { + if (!prev->shared.rb.rb_right) { parent = prev; - link = &prev->shared.linear.rb.rb_right; + link = &prev->shared.rb.rb_right; } else { - parent = rb_entry(prev->shared.linear.rb.rb_right, - struct vm_area_struct, shared.linear.rb); - if (parent->shared.linear.rb_subtree_last < last) - parent->shared.linear.rb_subtree_last = last; - while (parent->shared.linear.rb.rb_left) { - parent = rb_entry(parent->shared.linear.rb.rb_left, - struct vm_area_struct, shared.linear.rb); - if (parent->shared.linear.rb_subtree_last < last) - parent->shared.linear.rb_subtree_last = last; + parent = rb_entry(prev->shared.rb.rb_right, + struct vm_area_struct, shared.rb); + if (parent->shared.rb_subtree_last < last) + parent->shared.rb_subtree_last = last; + while (parent->shared.rb.rb_left) { + parent = rb_entry(parent->shared.rb.rb_left, + struct vm_area_struct, shared.rb); + if (parent->shared.rb_subtree_last < last) + parent->shared.rb_subtree_last = last; } - link = &parent->shared.linear.rb.rb_left; + link = &parent->shared.rb.rb_left; } - node->shared.linear.rb_subtree_last = last; - rb_link_node(&node->shared.linear.rb, &parent->shared.linear.rb, link); - rb_insert_augmented(&node->shared.linear.rb, root, + node->shared.rb_subtree_last = last; + rb_link_node(&node->shared.rb, &parent->shared.rb, link); + rb_insert_augmented(&node->shared.rb, root, &vma_interval_tree_augment); } From 0661a33611fca12570cba48d9344ce68834ee86c Mon Sep 17 00:00:00 2001 From: "Kirill A. Shutemov" Date: Tue, 10 Feb 2015 14:10:04 -0800 Subject: [PATCH 40/78] mm: remove rest usage of VM_NONLINEAR and pte_file() One bit in ->vm_flags is unused now! Signed-off-by: Kirill A. Shutemov Cc: Dan Carpenter Cc: Michal Hocko Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/gpu/drm/drm_vma_manager.c | 3 +- include/linux/mm.h | 1 - include/linux/swapops.h | 4 +- mm/debug.c | 1 - mm/gup.c | 2 +- mm/ksm.c | 2 +- mm/madvise.c | 4 +- mm/memcontrol.c | 7 +-- mm/memory.c | 78 +++++++++++++++---------------- mm/mincore.c | 9 +--- mm/mprotect.c | 2 +- mm/mremap.c | 2 - mm/msync.c | 5 +- 13 files changed, 49 insertions(+), 71 deletions(-) diff --git a/drivers/gpu/drm/drm_vma_manager.c b/drivers/gpu/drm/drm_vma_manager.c index 63b471205072fb..68c1f32fb086ba 100644 --- a/drivers/gpu/drm/drm_vma_manager.c +++ b/drivers/gpu/drm/drm_vma_manager.c @@ -50,8 +50,7 @@ * * You must not use multiple offset managers on a single address_space. * Otherwise, mm-core will be unable to tear down memory mappings as the VM will - * no longer be linear. Please use VM_NONLINEAR in that case and implement your - * own offset managers. + * no longer be linear. * * This offset manager works on page-based addresses. That is, every argument * and return code (with the exception of drm_vma_node_offset_addr()) is given diff --git a/include/linux/mm.h b/include/linux/mm.h index 18391eec486417..a0da685bdb8231 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -138,7 +138,6 @@ extern unsigned int kobjsize(const void *objp); #define VM_ACCOUNT 0x00100000 /* Is a VM accounted object */ #define VM_NORESERVE 0x00200000 /* should the VM suppress accounting */ #define VM_HUGETLB 0x00400000 /* Huge TLB Page VM */ -#define VM_NONLINEAR 0x00800000 /* Is non-linear (remap_file_pages) */ #define VM_ARCH_1 0x01000000 /* Architecture-specific flag */ #define VM_ARCH_2 0x02000000 #define VM_DONTDUMP 0x04000000 /* Do not include in the core dump */ diff --git a/include/linux/swapops.h b/include/linux/swapops.h index 6adfb7bfbf4482..50cbc876be56a2 100644 --- a/include/linux/swapops.h +++ b/include/linux/swapops.h @@ -54,7 +54,7 @@ static inline pgoff_t swp_offset(swp_entry_t entry) /* check whether a pte points to a swap entry */ static inline int is_swap_pte(pte_t pte) { - return !pte_none(pte) && !pte_present_nonuma(pte) && !pte_file(pte); + return !pte_none(pte) && !pte_present_nonuma(pte); } #endif @@ -66,7 +66,6 @@ static inline swp_entry_t pte_to_swp_entry(pte_t pte) { swp_entry_t arch_entry; - BUG_ON(pte_file(pte)); if (pte_swp_soft_dirty(pte)) pte = pte_swp_clear_soft_dirty(pte); arch_entry = __pte_to_swp_entry(pte); @@ -82,7 +81,6 @@ static inline pte_t swp_entry_to_pte(swp_entry_t entry) swp_entry_t arch_entry; arch_entry = __swp_entry(swp_type(entry), swp_offset(entry)); - BUG_ON(pte_file(__swp_entry_to_pte(arch_entry))); return __swp_entry_to_pte(arch_entry); } diff --git a/mm/debug.c b/mm/debug.c index 0e58f3211f8970..d69cb5a7ba9af3 100644 --- a/mm/debug.c +++ b/mm/debug.c @@ -130,7 +130,6 @@ static const struct trace_print_flags vmaflags_names[] = { {VM_ACCOUNT, "account" }, {VM_NORESERVE, "noreserve" }, {VM_HUGETLB, "hugetlb" }, - {VM_NONLINEAR, "nonlinear" }, #if defined(CONFIG_X86) {VM_PAT, "pat" }, #elif defined(CONFIG_PPC) diff --git a/mm/gup.c b/mm/gup.c index 8dd50ce6326fd5..12bc2bc33da759 100644 --- a/mm/gup.c +++ b/mm/gup.c @@ -55,7 +55,7 @@ static struct page *follow_page_pte(struct vm_area_struct *vma, */ if (likely(!(flags & FOLL_MIGRATION))) goto no_page; - if (pte_none(pte) || pte_file(pte)) + if (pte_none(pte)) goto no_page; entry = pte_to_swp_entry(pte); if (!is_migration_entry(entry)) diff --git a/mm/ksm.c b/mm/ksm.c index 15647fb0394fab..4162dce2eb44c3 100644 --- a/mm/ksm.c +++ b/mm/ksm.c @@ -1748,7 +1748,7 @@ int ksm_madvise(struct vm_area_struct *vma, unsigned long start, */ if (*vm_flags & (VM_MERGEABLE | VM_SHARED | VM_MAYSHARE | VM_PFNMAP | VM_IO | VM_DONTEXPAND | - VM_HUGETLB | VM_NONLINEAR | VM_MIXEDMAP)) + VM_HUGETLB | VM_MIXEDMAP)) return 0; /* just ignore the advice */ #ifdef VM_SAO diff --git a/mm/madvise.c b/mm/madvise.c index 917754d26c17cd..d79fb5e8f80a0b 100644 --- a/mm/madvise.c +++ b/mm/madvise.c @@ -155,7 +155,7 @@ static int swapin_walk_pmd_entry(pmd_t *pmd, unsigned long start, pte = *(orig_pte + ((index - start) / PAGE_SIZE)); pte_unmap_unlock(orig_pte, ptl); - if (pte_present(pte) || pte_none(pte) || pte_file(pte)) + if (pte_present(pte) || pte_none(pte)) continue; entry = pte_to_swp_entry(pte); if (unlikely(non_swap_entry(entry))) @@ -296,7 +296,7 @@ static long madvise_remove(struct vm_area_struct *vma, *prev = NULL; /* tell sys_madvise we drop mmap_sem */ - if (vma->vm_flags & (VM_LOCKED|VM_NONLINEAR|VM_HUGETLB)) + if (vma->vm_flags & (VM_LOCKED | VM_HUGETLB)) return -EINVAL; f = vma->vm_file; diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 2f6893c2f01b6c..8b58701b9647b9 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -4926,10 +4926,7 @@ static struct page *mc_handle_file_pte(struct vm_area_struct *vma, return NULL; mapping = vma->vm_file->f_mapping; - if (pte_none(ptent)) - pgoff = linear_page_index(vma, addr); - else /* pte_file(ptent) is true */ - pgoff = pte_to_pgoff(ptent); + pgoff = linear_page_index(vma, addr); /* page is moved even if it's not RSS of this task(page-faulted). */ #ifdef CONFIG_SWAP @@ -4961,7 +4958,7 @@ static enum mc_target_type get_mctgt_type(struct vm_area_struct *vma, page = mc_handle_present_pte(vma, addr, ptent); else if (is_swap_pte(ptent)) page = mc_handle_swap_pte(vma, addr, ptent, &ent); - else if (pte_none(ptent) || pte_file(ptent)) + else if (pte_none(ptent)) page = mc_handle_file_pte(vma, addr, ptent, &ent); if (!page && !ent.val) diff --git a/mm/memory.c b/mm/memory.c index 43a53743cbb453..9aa09217fe2091 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -811,42 +811,40 @@ copy_one_pte(struct mm_struct *dst_mm, struct mm_struct *src_mm, /* pte contains position in swap or file, so copy. */ if (unlikely(!pte_present(pte))) { - if (!pte_file(pte)) { - swp_entry_t entry = pte_to_swp_entry(pte); - - if (likely(!non_swap_entry(entry))) { - if (swap_duplicate(entry) < 0) - return entry.val; - - /* make sure dst_mm is on swapoff's mmlist. */ - if (unlikely(list_empty(&dst_mm->mmlist))) { - spin_lock(&mmlist_lock); - if (list_empty(&dst_mm->mmlist)) - list_add(&dst_mm->mmlist, - &src_mm->mmlist); - spin_unlock(&mmlist_lock); - } - rss[MM_SWAPENTS]++; - } else if (is_migration_entry(entry)) { - page = migration_entry_to_page(entry); - - if (PageAnon(page)) - rss[MM_ANONPAGES]++; - else - rss[MM_FILEPAGES]++; - - if (is_write_migration_entry(entry) && - is_cow_mapping(vm_flags)) { - /* - * COW mappings require pages in both - * parent and child to be set to read. - */ - make_migration_entry_read(&entry); - pte = swp_entry_to_pte(entry); - if (pte_swp_soft_dirty(*src_pte)) - pte = pte_swp_mksoft_dirty(pte); - set_pte_at(src_mm, addr, src_pte, pte); - } + swp_entry_t entry = pte_to_swp_entry(pte); + + if (likely(!non_swap_entry(entry))) { + if (swap_duplicate(entry) < 0) + return entry.val; + + /* make sure dst_mm is on swapoff's mmlist. */ + if (unlikely(list_empty(&dst_mm->mmlist))) { + spin_lock(&mmlist_lock); + if (list_empty(&dst_mm->mmlist)) + list_add(&dst_mm->mmlist, + &src_mm->mmlist); + spin_unlock(&mmlist_lock); + } + rss[MM_SWAPENTS]++; + } else if (is_migration_entry(entry)) { + page = migration_entry_to_page(entry); + + if (PageAnon(page)) + rss[MM_ANONPAGES]++; + else + rss[MM_FILEPAGES]++; + + if (is_write_migration_entry(entry) && + is_cow_mapping(vm_flags)) { + /* + * COW mappings require pages in both + * parent and child to be set to read. + */ + make_migration_entry_read(&entry); + pte = swp_entry_to_pte(entry); + if (pte_swp_soft_dirty(*src_pte)) + pte = pte_swp_mksoft_dirty(pte); + set_pte_at(src_mm, addr, src_pte, pte); } } goto out_set_pte; @@ -1020,11 +1018,9 @@ int copy_page_range(struct mm_struct *dst_mm, struct mm_struct *src_mm, * readonly mappings. The tradeoff is that copy_page_range is more * efficient than faulting. */ - if (!(vma->vm_flags & (VM_HUGETLB | VM_NONLINEAR | - VM_PFNMAP | VM_MIXEDMAP))) { - if (!vma->anon_vma) - return 0; - } + if (!(vma->vm_flags & (VM_HUGETLB | VM_PFNMAP | VM_MIXEDMAP)) && + !vma->anon_vma) + return 0; if (is_vm_hugetlb_page(vma)) return copy_hugetlb_page_range(dst_mm, src_mm, vma); diff --git a/mm/mincore.c b/mm/mincore.c index c8c528b3664195..46527c023e0c84 100644 --- a/mm/mincore.c +++ b/mm/mincore.c @@ -124,17 +124,13 @@ static void mincore_pte_range(struct vm_area_struct *vma, pmd_t *pmd, ptep = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl); do { pte_t pte = *ptep; - pgoff_t pgoff; next = addr + PAGE_SIZE; if (pte_none(pte)) mincore_unmapped_range(vma, addr, next, vec); else if (pte_present(pte)) *vec = 1; - else if (pte_file(pte)) { - pgoff = pte_to_pgoff(pte); - *vec = mincore_page(vma->vm_file->f_mapping, pgoff); - } else { /* pte is a swap entry */ + else { /* pte is a swap entry */ swp_entry_t entry = pte_to_swp_entry(pte); if (non_swap_entry(entry)) { @@ -145,9 +141,8 @@ static void mincore_pte_range(struct vm_area_struct *vma, pmd_t *pmd, *vec = 1; } else { #ifdef CONFIG_SWAP - pgoff = entry.val; *vec = mincore_page(swap_address_space(entry), - pgoff); + entry.val); #else WARN_ON(1); *vec = 1; diff --git a/mm/mprotect.c b/mm/mprotect.c index ace93454ce8ebe..33121662f08b50 100644 --- a/mm/mprotect.c +++ b/mm/mprotect.c @@ -105,7 +105,7 @@ static unsigned long change_pte_range(struct vm_area_struct *vma, pmd_t *pmd, } if (updated) pages++; - } else if (IS_ENABLED(CONFIG_MIGRATION) && !pte_file(oldpte)) { + } else if (IS_ENABLED(CONFIG_MIGRATION)) { swp_entry_t entry = pte_to_swp_entry(oldpte); if (is_write_migration_entry(entry)) { diff --git a/mm/mremap.c b/mm/mremap.c index 17fa018f5f3909..57dadc025c6444 100644 --- a/mm/mremap.c +++ b/mm/mremap.c @@ -81,8 +81,6 @@ static pte_t move_soft_dirty_pte(pte_t pte) pte = pte_mksoft_dirty(pte); else if (is_swap_pte(pte)) pte = pte_swp_mksoft_dirty(pte); - else if (pte_file(pte)) - pte = pte_file_mksoft_dirty(pte); #endif return pte; } diff --git a/mm/msync.c b/mm/msync.c index 992a1673d488db..bb04d53ae85295 100644 --- a/mm/msync.c +++ b/mm/msync.c @@ -86,10 +86,7 @@ SYSCALL_DEFINE3(msync, unsigned long, start, size_t, len, int, flags) (vma->vm_flags & VM_SHARED)) { get_file(file); up_read(&mm->mmap_sem); - if (vma->vm_flags & VM_NONLINEAR) - error = vfs_fsync(file, 1); - else - error = vfs_fsync_range(file, fstart, fend, 1); + error = vfs_fsync_range(file, fstart, fend, 1); fput(file); if (error || start >= end) goto out; From 5064c8e19dc215afae8ffae95570e7f22062d49c Mon Sep 17 00:00:00 2001 From: "Kirill A. Shutemov" Date: Tue, 10 Feb 2015 14:10:07 -0800 Subject: [PATCH 41/78] asm-generic: drop unused pte_file* helpers All users are gone. Signed-off-by: Kirill A. Shutemov Cc: Arnd Bergmann Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/asm-generic/pgtable.h | 15 --------------- 1 file changed, 15 deletions(-) diff --git a/include/asm-generic/pgtable.h b/include/asm-generic/pgtable.h index 177d5973b132fb..129de9204d18e6 100644 --- a/include/asm-generic/pgtable.h +++ b/include/asm-generic/pgtable.h @@ -474,21 +474,6 @@ static inline pte_t pte_swp_clear_soft_dirty(pte_t pte) { return pte; } - -static inline pte_t pte_file_clear_soft_dirty(pte_t pte) -{ - return pte; -} - -static inline pte_t pte_file_mksoft_dirty(pte_t pte) -{ - return pte; -} - -static inline int pte_file_soft_dirty(pte_t pte) -{ - return 0; -} #endif #ifndef __HAVE_PFNMAP_TRACKING From b816157a5366550c5ee29a6431ba1abb88721266 Mon Sep 17 00:00:00 2001 From: "Kirill A. Shutemov" Date: Tue, 10 Feb 2015 14:10:09 -0800 Subject: [PATCH 42/78] alpha: drop _PAGE_FILE and pte_file()-related helpers We've replaced remap_file_pages(2) implementation with emulation. Nobody creates non-linear mapping anymore. Signed-off-by: Kirill A. Shutemov Cc: Richard Henderson Cc: Ivan Kokshaysky Cc: Matt Turner Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/alpha/include/asm/pgtable.h | 7 ------- 1 file changed, 7 deletions(-) diff --git a/arch/alpha/include/asm/pgtable.h b/arch/alpha/include/asm/pgtable.h index d8f9b7e8923483..fce22cf88ee9ca 100644 --- a/arch/alpha/include/asm/pgtable.h +++ b/arch/alpha/include/asm/pgtable.h @@ -73,7 +73,6 @@ struct vm_area_struct; /* .. and these are ours ... */ #define _PAGE_DIRTY 0x20000 #define _PAGE_ACCESSED 0x40000 -#define _PAGE_FILE 0x80000 /* set:pagecache, unset:swap */ /* * NOTE! The "accessed" bit isn't necessarily exact: it can be kept exactly @@ -268,7 +267,6 @@ extern inline void pgd_clear(pgd_t * pgdp) { pgd_val(*pgdp) = 0; } extern inline int pte_write(pte_t pte) { return !(pte_val(pte) & _PAGE_FOW); } extern inline int pte_dirty(pte_t pte) { return pte_val(pte) & _PAGE_DIRTY; } extern inline int pte_young(pte_t pte) { return pte_val(pte) & _PAGE_ACCESSED; } -extern inline int pte_file(pte_t pte) { return pte_val(pte) & _PAGE_FILE; } extern inline int pte_special(pte_t pte) { return 0; } extern inline pte_t pte_wrprotect(pte_t pte) { pte_val(pte) |= _PAGE_FOW; return pte; } @@ -345,11 +343,6 @@ extern inline pte_t mk_swap_pte(unsigned long type, unsigned long offset) #define __pte_to_swp_entry(pte) ((swp_entry_t) { pte_val(pte) }) #define __swp_entry_to_pte(x) ((pte_t) { (x).val }) -#define pte_to_pgoff(pte) (pte_val(pte) >> 32) -#define pgoff_to_pte(off) ((pte_t) { ((off) << 32) | _PAGE_FILE }) - -#define PTE_FILE_MAX_BITS 32 - #ifndef CONFIG_DISCONTIGMEM #define kern_addr_valid(addr) (1) #endif From 18747151308f9e0fb63766057957617ec4afa190 Mon Sep 17 00:00:00 2001 From: "Kirill A. Shutemov" Date: Tue, 10 Feb 2015 14:10:12 -0800 Subject: [PATCH 43/78] arc: drop _PAGE_FILE and pte_file()-related helpers We've replaced remap_file_pages(2) implementation with emulation. Nobody creates non-linear mapping anymore. Signed-off-by: Kirill A. Shutemov Acked-by: Vineet Gupta Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/arc/include/asm/pgtable.h | 13 +------------ 1 file changed, 1 insertion(+), 12 deletions(-) diff --git a/arch/arc/include/asm/pgtable.h b/arch/arc/include/asm/pgtable.h index 6b0b7f7ef783ce..bdc8ccaf390d5a 100644 --- a/arch/arc/include/asm/pgtable.h +++ b/arch/arc/include/asm/pgtable.h @@ -61,7 +61,6 @@ #define _PAGE_WRITE (1<<4) /* Page has user write perm (H) */ #define _PAGE_READ (1<<5) /* Page has user read perm (H) */ #define _PAGE_MODIFIED (1<<6) /* Page modified (dirty) (S) */ -#define _PAGE_FILE (1<<7) /* page cache/ swap (S) */ #define _PAGE_GLOBAL (1<<8) /* Page is global (H) */ #define _PAGE_PRESENT (1<<10) /* TLB entry is valid (H) */ @@ -73,7 +72,6 @@ #define _PAGE_READ (1<<3) /* Page has user read perm (H) */ #define _PAGE_ACCESSED (1<<4) /* Page is accessed (S) */ #define _PAGE_MODIFIED (1<<5) /* Page modified (dirty) (S) */ -#define _PAGE_FILE (1<<6) /* page cache/ swap (S) */ #define _PAGE_GLOBAL (1<<8) /* Page is global (H) */ #define _PAGE_PRESENT (1<<9) /* TLB entry is valid (H) */ #define _PAGE_SHARED_CODE (1<<11) /* Shared Code page with cmn vaddr @@ -268,15 +266,6 @@ static inline void pmd_set(pmd_t *pmdp, pte_t *ptep) pte; \ }) -/* TBD: Non linear mapping stuff */ -static inline int pte_file(pte_t pte) -{ - return pte_val(pte) & _PAGE_FILE; -} - -#define PTE_FILE_MAX_BITS 30 -#define pgoff_to_pte(x) __pte(x) -#define pte_to_pgoff(x) (pte_val(x) >> 2) #define pte_pfn(pte) (pte_val(pte) >> PAGE_SHIFT) #define pfn_pte(pfn, prot) (__pte(((pfn) << PAGE_SHIFT) | pgprot_val(prot))) #define __pte_index(addr) (((addr) >> PAGE_SHIFT) & (PTRS_PER_PTE - 1)) @@ -364,7 +353,7 @@ void update_mmu_cache(struct vm_area_struct *vma, unsigned long address, /* Encode swap {type,off} tuple into PTE * We reserve 13 bits for 5-bit @type, keeping bits 12-5 zero, ensuring that - * both PAGE_FILE and PAGE_PRESENT are zero in a PTE holding swap "identifier" + * PAGE_PRESENT is zero in a PTE holding swap "identifier" */ #define __swp_entry(type, off) ((swp_entry_t) { \ ((type) & 0x1f) | ((off) << 13) }) From 9b3e661e58b90b0c2d5c2168c23408f1e59e9e35 Mon Sep 17 00:00:00 2001 From: "Kirill A. Shutemov" Date: Tue, 10 Feb 2015 14:10:15 -0800 Subject: [PATCH 44/78] arm64: drop PTE_FILE and pte_file()-related helpers We've replaced remap_file_pages(2) implementation with emulation. Nobody creates non-linear mapping anymore. This patch also adjust __SWP_TYPE_SHIFT and increase number of bits availble for swap offset. Signed-off-by: Kirill A. Shutemov Acked-by: Catalin Marinas Cc: Will Deacon Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/arm64/include/asm/pgtable.h | 22 ++++------------------ 1 file changed, 4 insertions(+), 18 deletions(-) diff --git a/arch/arm64/include/asm/pgtable.h b/arch/arm64/include/asm/pgtable.h index 210d632aa5ad38..4c445057169d47 100644 --- a/arch/arm64/include/asm/pgtable.h +++ b/arch/arm64/include/asm/pgtable.h @@ -25,7 +25,6 @@ * Software defined PTE bits definition. */ #define PTE_VALID (_AT(pteval_t, 1) << 0) -#define PTE_FILE (_AT(pteval_t, 1) << 2) /* only when !pte_present() */ #define PTE_DIRTY (_AT(pteval_t, 1) << 55) #define PTE_SPECIAL (_AT(pteval_t, 1) << 56) #define PTE_WRITE (_AT(pteval_t, 1) << 57) @@ -469,13 +468,12 @@ extern pgd_t idmap_pg_dir[PTRS_PER_PGD]; /* * Encode and decode a swap entry: * bits 0-1: present (must be zero) - * bit 2: PTE_FILE - * bits 3-8: swap type - * bits 9-57: swap offset + * bits 2-7: swap type + * bits 8-57: swap offset */ -#define __SWP_TYPE_SHIFT 3 +#define __SWP_TYPE_SHIFT 2 #define __SWP_TYPE_BITS 6 -#define __SWP_OFFSET_BITS 49 +#define __SWP_OFFSET_BITS 50 #define __SWP_TYPE_MASK ((1 << __SWP_TYPE_BITS) - 1) #define __SWP_OFFSET_SHIFT (__SWP_TYPE_BITS + __SWP_TYPE_SHIFT) #define __SWP_OFFSET_MASK ((1UL << __SWP_OFFSET_BITS) - 1) @@ -493,18 +491,6 @@ extern pgd_t idmap_pg_dir[PTRS_PER_PGD]; */ #define MAX_SWAPFILES_CHECK() BUILD_BUG_ON(MAX_SWAPFILES_SHIFT > __SWP_TYPE_BITS) -/* - * Encode and decode a file entry: - * bits 0-1: present (must be zero) - * bit 2: PTE_FILE - * bits 3-57: file offset / PAGE_SIZE - */ -#define pte_file(pte) (pte_val(pte) & PTE_FILE) -#define pte_to_pgoff(x) (pte_val(x) >> 3) -#define pgoff_to_pte(x) __pte(((x) << 3) | PTE_FILE) - -#define PTE_FILE_MAX_BITS 55 - extern int kern_addr_valid(unsigned long addr); #include From b007ea798f5c568d3f464d37288220ef570f062c Mon Sep 17 00:00:00 2001 From: "Kirill A. Shutemov" Date: Tue, 10 Feb 2015 14:10:17 -0800 Subject: [PATCH 45/78] arm: drop L_PTE_FILE and pte_file()-related helpers We've replaced remap_file_pages(2) implementation with emulation. Nobody creates non-linear mapping anymore. This patch also adjust __SWP_TYPE_SHIFT, effectively increase size of possible swap file to 128G. Signed-off-by: Kirill A. Shutemov Cc: Russell King Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/arm/include/asm/pgtable-2level.h | 1 - arch/arm/include/asm/pgtable-3level.h | 1 - arch/arm/include/asm/pgtable-nommu.h | 2 -- arch/arm/include/asm/pgtable.h | 20 +++----------------- arch/arm/mm/proc-macros.S | 2 +- 5 files changed, 4 insertions(+), 22 deletions(-) diff --git a/arch/arm/include/asm/pgtable-2level.h b/arch/arm/include/asm/pgtable-2level.h index f0279411847d72..bcc5e300413f46 100644 --- a/arch/arm/include/asm/pgtable-2level.h +++ b/arch/arm/include/asm/pgtable-2level.h @@ -118,7 +118,6 @@ #define L_PTE_VALID (_AT(pteval_t, 1) << 0) /* Valid */ #define L_PTE_PRESENT (_AT(pteval_t, 1) << 0) #define L_PTE_YOUNG (_AT(pteval_t, 1) << 1) -#define L_PTE_FILE (_AT(pteval_t, 1) << 2) /* only when !PRESENT */ #define L_PTE_DIRTY (_AT(pteval_t, 1) << 6) #define L_PTE_RDONLY (_AT(pteval_t, 1) << 7) #define L_PTE_USER (_AT(pteval_t, 1) << 8) diff --git a/arch/arm/include/asm/pgtable-3level.h b/arch/arm/include/asm/pgtable-3level.h index a31ecdad4b5966..18dbc82f85e55d 100644 --- a/arch/arm/include/asm/pgtable-3level.h +++ b/arch/arm/include/asm/pgtable-3level.h @@ -77,7 +77,6 @@ */ #define L_PTE_VALID (_AT(pteval_t, 1) << 0) /* Valid */ #define L_PTE_PRESENT (_AT(pteval_t, 3) << 0) /* Present */ -#define L_PTE_FILE (_AT(pteval_t, 1) << 2) /* only when !PRESENT */ #define L_PTE_USER (_AT(pteval_t, 1) << 6) /* AP[1] */ #define L_PTE_SHARED (_AT(pteval_t, 3) << 8) /* SH[1:0], inner shareable */ #define L_PTE_YOUNG (_AT(pteval_t, 1) << 10) /* AF */ diff --git a/arch/arm/include/asm/pgtable-nommu.h b/arch/arm/include/asm/pgtable-nommu.h index 0642228ff78562..c35e53ee66630d 100644 --- a/arch/arm/include/asm/pgtable-nommu.h +++ b/arch/arm/include/asm/pgtable-nommu.h @@ -54,8 +54,6 @@ typedef pte_t *pte_addr_t; -static inline int pte_file(pte_t pte) { return 0; } - /* * ZERO_PAGE is a global shared page that is always zero: used * for zero-mapped memory areas etc.. diff --git a/arch/arm/include/asm/pgtable.h b/arch/arm/include/asm/pgtable.h index d5cac545ba3398..f40354198bad40 100644 --- a/arch/arm/include/asm/pgtable.h +++ b/arch/arm/include/asm/pgtable.h @@ -318,12 +318,12 @@ static inline pte_t pte_modify(pte_t pte, pgprot_t newprot) * * 3 3 2 2 2 2 2 2 2 2 2 2 1 1 1 1 1 1 1 1 1 1 * 1 0 9 8 7 6 5 4 3 2 1 0 9 8 7 6 5 4 3 2 1 0 9 8 7 6 5 4 3 2 1 0 - * <--------------- offset ----------------------> < type -> 0 0 0 + * <--------------- offset ------------------------> < type -> 0 0 * - * This gives us up to 31 swap files and 64GB per swap file. Note that + * This gives us up to 31 swap files and 128GB per swap file. Note that * the offset field is always non-zero. */ -#define __SWP_TYPE_SHIFT 3 +#define __SWP_TYPE_SHIFT 2 #define __SWP_TYPE_BITS 5 #define __SWP_TYPE_MASK ((1 << __SWP_TYPE_BITS) - 1) #define __SWP_OFFSET_SHIFT (__SWP_TYPE_BITS + __SWP_TYPE_SHIFT) @@ -342,20 +342,6 @@ static inline pte_t pte_modify(pte_t pte, pgprot_t newprot) */ #define MAX_SWAPFILES_CHECK() BUILD_BUG_ON(MAX_SWAPFILES_SHIFT > __SWP_TYPE_BITS) -/* - * Encode and decode a file entry. File entries are stored in the Linux - * page tables as follows: - * - * 3 3 2 2 2 2 2 2 2 2 2 2 1 1 1 1 1 1 1 1 1 1 - * 1 0 9 8 7 6 5 4 3 2 1 0 9 8 7 6 5 4 3 2 1 0 9 8 7 6 5 4 3 2 1 0 - * <----------------------- offset ------------------------> 1 0 0 - */ -#define pte_file(pte) (pte_val(pte) & L_PTE_FILE) -#define pte_to_pgoff(x) (pte_val(x) >> 3) -#define pgoff_to_pte(x) __pte(((x) << 3) | L_PTE_FILE) - -#define PTE_FILE_MAX_BITS 29 - /* Needs to be defined here and not in linux/mm.h, as it is arch dependent */ /* FIXME: this is not correct */ #define kern_addr_valid(addr) (1) diff --git a/arch/arm/mm/proc-macros.S b/arch/arm/mm/proc-macros.S index ba1196c968d82f..082b9f2f7e9001 100644 --- a/arch/arm/mm/proc-macros.S +++ b/arch/arm/mm/proc-macros.S @@ -98,7 +98,7 @@ #endif #if !defined (CONFIG_ARM_LPAE) && \ (L_PTE_XN+L_PTE_USER+L_PTE_RDONLY+L_PTE_DIRTY+L_PTE_YOUNG+\ - L_PTE_FILE+L_PTE_PRESENT) > L_PTE_SHARED + L_PTE_PRESENT) > L_PTE_SHARED #error Invalid Linux PTE bit settings #endif #endif /* CONFIG_MMU */ From 7a7d2db4b8b3505a3195178619ffcc80985c4be1 Mon Sep 17 00:00:00 2001 From: "Kirill A. Shutemov" Date: Tue, 10 Feb 2015 14:10:20 -0800 Subject: [PATCH 46/78] avr32: drop _PAGE_FILE and pte_file()-related helpers We've replaced remap_file_pages(2) implementation with emulation. Nobody creates non-linear mapping anymore. Signed-off-by: Kirill A. Shutemov Cc: Haavard Skinnemoen Acked-by: Hans-Christian Egtvedt Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/avr32/include/asm/pgtable.h | 25 ------------------------- 1 file changed, 25 deletions(-) diff --git a/arch/avr32/include/asm/pgtable.h b/arch/avr32/include/asm/pgtable.h index 4beff97e2033f1..ac7a817e21267f 100644 --- a/arch/avr32/include/asm/pgtable.h +++ b/arch/avr32/include/asm/pgtable.h @@ -86,9 +86,6 @@ extern struct page *empty_zero_page; #define _PAGE_BIT_PRESENT 10 #define _PAGE_BIT_ACCESSED 11 /* software: page was accessed */ -/* The following flags are only valid when !PRESENT */ -#define _PAGE_BIT_FILE 0 /* software: pagecache or swap? */ - #define _PAGE_WT (1 << _PAGE_BIT_WT) #define _PAGE_DIRTY (1 << _PAGE_BIT_DIRTY) #define _PAGE_EXECUTE (1 << _PAGE_BIT_EXECUTE) @@ -101,7 +98,6 @@ extern struct page *empty_zero_page; /* Software flags */ #define _PAGE_ACCESSED (1 << _PAGE_BIT_ACCESSED) #define _PAGE_PRESENT (1 << _PAGE_BIT_PRESENT) -#define _PAGE_FILE (1 << _PAGE_BIT_FILE) /* * Page types, i.e. sizes. _PAGE_TYPE_NONE corresponds to what is @@ -210,14 +206,6 @@ static inline int pte_special(pte_t pte) return 0; } -/* - * The following only work if pte_present() is not true. - */ -static inline int pte_file(pte_t pte) -{ - return pte_val(pte) & _PAGE_FILE; -} - /* Mutator functions for PTE bits */ static inline pte_t pte_wrprotect(pte_t pte) { @@ -329,7 +317,6 @@ extern void update_mmu_cache(struct vm_area_struct * vma, * Encode and decode a swap entry * * Constraints: - * _PAGE_FILE at bit 0 * _PAGE_TYPE_* at bits 2-3 (for emulating _PAGE_PROTNONE) * _PAGE_PRESENT at bit 10 * @@ -346,18 +333,6 @@ extern void update_mmu_cache(struct vm_area_struct * vma, #define __pte_to_swp_entry(pte) ((swp_entry_t) { pte_val(pte) }) #define __swp_entry_to_pte(x) ((pte_t) { (x).val }) -/* - * Encode and decode a nonlinear file mapping entry. We have to - * preserve _PAGE_FILE and _PAGE_PRESENT here. _PAGE_TYPE_* isn't - * necessary, since _PAGE_FILE implies !_PAGE_PROTNONE (?) - */ -#define PTE_FILE_MAX_BITS 30 -#define pte_to_pgoff(pte) (((pte_val(pte) >> 1) & 0x1ff) \ - | ((pte_val(pte) >> 11) << 9)) -#define pgoff_to_pte(off) ((pte_t) { ((((off) & 0x1ff) << 1) \ - | (((off) >> 9) << 11) \ - | _PAGE_FILE) }) - typedef pte_t *pte_addr_t; #define kern_addr_valid(addr) (1) From 2bc6ff14d46745a7728ed4ed90c5e0edca91f52e Mon Sep 17 00:00:00 2001 From: "Kirill A. Shutemov" Date: Tue, 10 Feb 2015 14:10:23 -0800 Subject: [PATCH 47/78] blackfin: drop pte_file() We've replaced remap_file_pages(2) implementation with emulation. Nobody creates non-linear mapping anymore. Signed-off-by: Kirill A. Shutemov Cc: Steven Miao Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/blackfin/include/asm/pgtable.h | 5 ----- 1 file changed, 5 deletions(-) diff --git a/arch/blackfin/include/asm/pgtable.h b/arch/blackfin/include/asm/pgtable.h index 0b049019eba703..b88a1558b0b9bf 100644 --- a/arch/blackfin/include/asm/pgtable.h +++ b/arch/blackfin/include/asm/pgtable.h @@ -45,11 +45,6 @@ extern void paging_init(void); #define __pte_to_swp_entry(pte) ((swp_entry_t) { pte_val(pte) }) #define __swp_entry_to_pte(x) ((pte_t) { (x).val }) -static inline int pte_file(pte_t pte) -{ - return 0; -} - #define set_pte(pteptr, pteval) (*(pteptr) = pteval) #define set_pte_at(mm, addr, ptep, pteval) set_pte(ptep, pteval) From f5b45de9b00eb53d11ada85c61e4ea1c31ab8218 Mon Sep 17 00:00:00 2001 From: "Kirill A. Shutemov" Date: Tue, 10 Feb 2015 14:10:25 -0800 Subject: [PATCH 48/78] c6x: drop pte_file() We've replaced remap_file_pages(2) implementation with emulation. Nobody creates non-linear mapping anymore. Signed-off-by: Kirill A. Shutemov Acked-by: Mark Salter Cc: Aurelien Jacquiot Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/c6x/include/asm/pgtable.h | 5 ----- 1 file changed, 5 deletions(-) diff --git a/arch/c6x/include/asm/pgtable.h b/arch/c6x/include/asm/pgtable.h index c0eed5b18860fa..78d4483ba40c83 100644 --- a/arch/c6x/include/asm/pgtable.h +++ b/arch/c6x/include/asm/pgtable.h @@ -50,11 +50,6 @@ extern void paging_init(void); #define __pte_to_swp_entry(pte) ((swp_entry_t) { pte_val(pte) }) #define __swp_entry_to_pte(x) ((pte_t) { (x).val }) -static inline int pte_file(pte_t pte) -{ - return 0; -} - #define set_pte(pteptr, pteval) (*(pteptr) = pteval) #define set_pte_at(mm, addr, ptep, pteval) set_pte(ptep, pteval) From 103f3d9a26df944f4c29de190d72dfbf913c71af Mon Sep 17 00:00:00 2001 From: "Kirill A. Shutemov" Date: Tue, 10 Feb 2015 14:10:28 -0800 Subject: [PATCH 49/78] cris: drop _PAGE_FILE and pte_file()-related helpers We've replaced remap_file_pages(2) implementation with emulation. Nobody creates non-linear mapping anymore. Signed-off-by: Kirill A. Shutemov Cc: Mikael Starvik Cc: Jesper Nilsson Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/cris/include/arch-v10/arch/mmu.h | 3 --- arch/cris/include/arch-v32/arch/mmu.h | 3 --- arch/cris/include/asm/pgtable.h | 4 ---- 3 files changed, 10 deletions(-) diff --git a/arch/cris/include/arch-v10/arch/mmu.h b/arch/cris/include/arch-v10/arch/mmu.h index e829e5a37bbe76..47a5dd21749d6d 100644 --- a/arch/cris/include/arch-v10/arch/mmu.h +++ b/arch/cris/include/arch-v10/arch/mmu.h @@ -58,7 +58,6 @@ typedef struct /* Bits the HW doesn't care about but the kernel uses them in SW */ #define _PAGE_PRESENT (1<<4) /* page present in memory */ -#define _PAGE_FILE (1<<5) /* set: pagecache, unset: swap (when !PRESENT) */ #define _PAGE_ACCESSED (1<<5) /* simulated in software using valid bit */ #define _PAGE_MODIFIED (1<<6) /* simulated in software using we bit */ #define _PAGE_READ (1<<7) /* read-enabled */ @@ -105,6 +104,4 @@ typedef struct #define __S110 PAGE_SHARED #define __S111 PAGE_SHARED -#define PTE_FILE_MAX_BITS 26 - #endif diff --git a/arch/cris/include/arch-v32/arch/mmu.h b/arch/cris/include/arch-v32/arch/mmu.h index c1a13e05e96305..e6db1616dee53f 100644 --- a/arch/cris/include/arch-v32/arch/mmu.h +++ b/arch/cris/include/arch-v32/arch/mmu.h @@ -53,7 +53,6 @@ typedef struct * software. */ #define _PAGE_PRESENT (1 << 5) /* Page is present in memory. */ -#define _PAGE_FILE (1 << 6) /* 1=pagecache, 0=swap (when !present) */ #define _PAGE_ACCESSED (1 << 6) /* Simulated in software using valid bit. */ #define _PAGE_MODIFIED (1 << 7) /* Simulated in software using we bit. */ #define _PAGE_READ (1 << 8) /* Read enabled. */ @@ -108,6 +107,4 @@ typedef struct #define __S110 PAGE_SHARED_EXEC #define __S111 PAGE_SHARED_EXEC -#define PTE_FILE_MAX_BITS 25 - #endif /* _ASM_CRIS_ARCH_MMU_H */ diff --git a/arch/cris/include/asm/pgtable.h b/arch/cris/include/asm/pgtable.h index 8b8c86793225fd..e824257971c4dc 100644 --- a/arch/cris/include/asm/pgtable.h +++ b/arch/cris/include/asm/pgtable.h @@ -114,7 +114,6 @@ extern unsigned long empty_zero_page; static inline int pte_write(pte_t pte) { return pte_val(pte) & _PAGE_WRITE; } static inline int pte_dirty(pte_t pte) { return pte_val(pte) & _PAGE_MODIFIED; } static inline int pte_young(pte_t pte) { return pte_val(pte) & _PAGE_ACCESSED; } -static inline int pte_file(pte_t pte) { return pte_val(pte) & _PAGE_FILE; } static inline int pte_special(pte_t pte) { return 0; } static inline pte_t pte_wrprotect(pte_t pte) @@ -290,9 +289,6 @@ static inline void update_mmu_cache(struct vm_area_struct * vma, */ #define pgtable_cache_init() do { } while (0) -#define pte_to_pgoff(x) (pte_val(x) >> 6) -#define pgoff_to_pte(x) __pte(((x) << 6) | _PAGE_FILE) - typedef pte_t *pte_addr_t; #endif /* __ASSEMBLY__ */ From ca5bfa7b390017f053d7581bc701518b87bc3d43 Mon Sep 17 00:00:00 2001 From: "Kirill A. Shutemov" Date: Tue, 10 Feb 2015 14:10:31 -0800 Subject: [PATCH 50/78] frv: drop _PAGE_FILE and pte_file()-related helpers We've replaced remap_file_pages(2) implementation with emulation. Nobody creates non-linear mapping anymore. This patch also increase number of bits availble for swap offset. Signed-off-by: Kirill A. Shutemov Cc: David Howells Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/frv/include/asm/pgtable.h | 27 +++++---------------------- 1 file changed, 5 insertions(+), 22 deletions(-) diff --git a/arch/frv/include/asm/pgtable.h b/arch/frv/include/asm/pgtable.h index eb0110acd19b72..c49699d5902d21 100644 --- a/arch/frv/include/asm/pgtable.h +++ b/arch/frv/include/asm/pgtable.h @@ -62,10 +62,6 @@ typedef pte_t *pte_addr_t; #define __pte_to_swp_entry(pte) ((swp_entry_t) { pte_val(pte) }) #define __swp_entry_to_pte(x) ((pte_t) { (x).val }) -#ifndef __ASSEMBLY__ -static inline int pte_file(pte_t pte) { return 0; } -#endif - #define ZERO_PAGE(vaddr) ({ BUG(); NULL; }) #define swapper_pg_dir ((pgd_t *) NULL) @@ -298,7 +294,6 @@ static inline pmd_t *pmd_offset(pud_t *dir, unsigned long address) #define _PAGE_RESERVED_MASK (xAMPRx_RESERVED8 | xAMPRx_RESERVED13) -#define _PAGE_FILE 0x002 /* set:pagecache unset:swap */ #define _PAGE_PROTNONE 0x000 /* If not present */ #define _PAGE_CHG_MASK (PTE_MASK | _PAGE_ACCESSED | _PAGE_DIRTY) @@ -463,27 +458,15 @@ static inline pte_t pte_modify(pte_t pte, pgprot_t newprot) * Handle swap and file entries * - the PTE is encoded in the following format: * bit 0: Must be 0 (!_PAGE_PRESENT) - * bit 1: Type: 0 for swap, 1 for file (_PAGE_FILE) - * bits 2-7: Swap type - * bits 8-31: Swap offset - * bits 2-31: File pgoff + * bits 1-6: Swap type + * bits 7-31: Swap offset */ -#define __swp_type(x) (((x).val >> 2) & 0x1f) -#define __swp_offset(x) ((x).val >> 8) -#define __swp_entry(type, offset) ((swp_entry_t) { ((type) << 2) | ((offset) << 8) }) +#define __swp_type(x) (((x).val >> 1) & 0x1f) +#define __swp_offset(x) ((x).val >> 7) +#define __swp_entry(type, offset) ((swp_entry_t) { ((type) << 1) | ((offset) << 7) }) #define __pte_to_swp_entry(_pte) ((swp_entry_t) { (_pte).pte }) #define __swp_entry_to_pte(x) ((pte_t) { (x).val }) -static inline int pte_file(pte_t pte) -{ - return pte.pte & _PAGE_FILE; -} - -#define PTE_FILE_MAX_BITS 29 - -#define pte_to_pgoff(PTE) ((PTE).pte >> 2) -#define pgoff_to_pte(off) __pte((off) << 2 | _PAGE_FILE) - /* Needs to be defined here and not in linux/mm.h, as it is arch dependent */ #define PageSkip(page) (0) #define kern_addr_valid(addr) (1) From d99f95e6522db22192c331c75de182023a49fbcc Mon Sep 17 00:00:00 2001 From: "Kirill A. Shutemov" Date: Tue, 10 Feb 2015 14:10:33 -0800 Subject: [PATCH 51/78] hexagon: drop _PAGE_FILE and pte_file()-related helpers We've replaced remap_file_pages(2) implementation with emulation. Nobody creates non-linear mapping anymore. This patch also increase number of bits availble for swap offset. Signed-off-by: Kirill A. Shutemov Cc: Richard Kuo Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/hexagon/include/asm/pgtable.h | 60 ++++++++---------------------- 1 file changed, 16 insertions(+), 44 deletions(-) diff --git a/arch/hexagon/include/asm/pgtable.h b/arch/hexagon/include/asm/pgtable.h index d8bd54fa431ef9..6e35e71d2aeaa3 100644 --- a/arch/hexagon/include/asm/pgtable.h +++ b/arch/hexagon/include/asm/pgtable.h @@ -61,13 +61,6 @@ extern unsigned long zero_page_mask; #define _PAGE_DIRTY (1<<1) #define _PAGE_ACCESSED (1<<2) -/* - * _PAGE_FILE is only meaningful if _PAGE_PRESENT is false, while - * _PAGE_DIRTY is only meaningful if _PAGE_PRESENT is true. - * So we can overload the bit... - */ -#define _PAGE_FILE _PAGE_DIRTY /* set: pagecache, unset = swap */ - /* * For now, let's say that Valid and Present are the same thing. * Alternatively, we could say that it's the "or" of R, W, and X @@ -456,57 +449,36 @@ static inline int pte_exec(pte_t pte) #define pgtable_cache_init() do { } while (0) /* - * Swap/file PTE definitions. If _PAGE_PRESENT is zero, the rest of the - * PTE is interpreted as swap information. Depending on the _PAGE_FILE - * bit, the remaining free bits are eitehr interpreted as a file offset - * or a swap type/offset tuple. Rather than have the TLB fill handler - * test _PAGE_PRESENT, we're going to reserve the permissions bits - * and set them to all zeros for swap entries, which speeds up the - * miss handler at the cost of 3 bits of offset. That trade-off can - * be revisited if necessary, but Hexagon processor architecture and - * target applications suggest a lot of TLB misses and not much swap space. + * Swap/file PTE definitions. If _PAGE_PRESENT is zero, the rest of the PTE is + * interpreted as swap information. The remaining free bits are interpreted as + * swap type/offset tuple. Rather than have the TLB fill handler test + * _PAGE_PRESENT, we're going to reserve the permissions bits and set them to + * all zeros for swap entries, which speeds up the miss handler at the cost of + * 3 bits of offset. That trade-off can be revisited if necessary, but Hexagon + * processor architecture and target applications suggest a lot of TLB misses + * and not much swap space. * * Format of swap PTE: * bit 0: Present (zero) - * bit 1: _PAGE_FILE (zero) - * bits 2-6: swap type (arch independent layer uses 5 bits max) - * bits 7-9: bits 2:0 of offset - * bits 10-12: effectively _PAGE_PROTNONE (all zero) - * bits 13-31: bits 21:3 of swap offset - * - * Format of file PTE: - * bit 0: Present (zero) - * bit 1: _PAGE_FILE (zero) - * bits 2-9: bits 7:0 of offset - * bits 10-12: effectively _PAGE_PROTNONE (all zero) - * bits 13-31: bits 26:8 of swap offset + * bits 1-5: swap type (arch independent layer uses 5 bits max) + * bits 6-9: bits 3:0 of offset + * bits 10-12: effectively _PAGE_PROTNONE (all zero) + * bits 13-31: bits 22:4 of swap offset * * The split offset makes some of the following macros a little gnarly, * but there's plenty of precedent for this sort of thing. */ -#define PTE_FILE_MAX_BITS 27 /* Used for swap PTEs */ -#define __swp_type(swp_pte) (((swp_pte).val >> 2) & 0x1f) +#define __swp_type(swp_pte) (((swp_pte).val >> 1) & 0x1f) #define __swp_offset(swp_pte) \ - ((((swp_pte).val >> 7) & 0x7) | (((swp_pte).val >> 10) & 0x003ffff8)) + ((((swp_pte).val >> 6) & 0xf) | (((swp_pte).val >> 9) & 0x7ffff0)) #define __swp_entry(type, offset) \ ((swp_entry_t) { \ - ((type << 2) | \ - ((offset & 0x3ffff8) << 10) | ((offset & 0x7) << 7)) }) - -/* Used for file PTEs */ -#define pte_file(pte) \ - ((pte_val(pte) & (_PAGE_FILE | _PAGE_PRESENT)) == _PAGE_FILE) - -#define pte_to_pgoff(pte) \ - (((pte_val(pte) >> 2) & 0xff) | ((pte_val(pte) >> 5) & 0x07ffff00)) - -#define pgoff_to_pte(off) \ - ((pte_t) { ((((off) & 0x7ffff00) << 5) | (((off) & 0xff) << 2)\ - | _PAGE_FILE) }) + ((type << 1) | \ + ((offset & 0x7ffff0) << 9) | ((offset & 0xf) << 6)) }) /* Oh boy. There are a lot of possible arch overrides found in this file. */ #include From 636a002b704e0a36cefb5f4cf0293fab858fc46c Mon Sep 17 00:00:00 2001 From: "Kirill A. Shutemov" Date: Tue, 10 Feb 2015 14:10:36 -0800 Subject: [PATCH 52/78] ia64: drop _PAGE_FILE and pte_file()-related helpers We've replaced remap_file_pages(2) implementation with emulation. Nobody creates non-linear mapping anymore. This patch also increase number of bits availble for swap offset. Signed-off-by: Kirill A. Shutemov Cc: Tony Luck Cc: Fenghua Yu Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/ia64/include/asm/pgtable.h | 25 +++++-------------------- 1 file changed, 5 insertions(+), 20 deletions(-) diff --git a/arch/ia64/include/asm/pgtable.h b/arch/ia64/include/asm/pgtable.h index 7935115398a667..2f07bb3dda9122 100644 --- a/arch/ia64/include/asm/pgtable.h +++ b/arch/ia64/include/asm/pgtable.h @@ -57,9 +57,6 @@ #define _PAGE_ED (__IA64_UL(1) << 52) /* exception deferral */ #define _PAGE_PROTNONE (__IA64_UL(1) << 63) -/* Valid only for a PTE with the present bit cleared: */ -#define _PAGE_FILE (1 << 1) /* see swap & file pte remarks below */ - #define _PFN_MASK _PAGE_PPN_MASK /* Mask of bits which may be changed by pte_modify(); the odd bits are there for _PAGE_PROTNONE */ #define _PAGE_CHG_MASK (_PAGE_P | _PAGE_PROTNONE | _PAGE_PL_MASK | _PAGE_AR_MASK | _PAGE_ED) @@ -300,7 +297,6 @@ extern unsigned long VMALLOC_END; #define pte_exec(pte) ((pte_val(pte) & _PAGE_AR_RX) != 0) #define pte_dirty(pte) ((pte_val(pte) & _PAGE_D) != 0) #define pte_young(pte) ((pte_val(pte) & _PAGE_A) != 0) -#define pte_file(pte) ((pte_val(pte) & _PAGE_FILE) != 0) #define pte_special(pte) 0 /* @@ -472,27 +468,16 @@ extern void paging_init (void); * * Format of swap pte: * bit 0 : present bit (must be zero) - * bit 1 : _PAGE_FILE (must be zero) - * bits 2- 8: swap-type - * bits 9-62: swap offset - * bit 63 : _PAGE_PROTNONE bit - * - * Format of file pte: - * bit 0 : present bit (must be zero) - * bit 1 : _PAGE_FILE (must be one) - * bits 2-62: file_offset/PAGE_SIZE + * bits 1- 7: swap-type + * bits 8-62: swap offset * bit 63 : _PAGE_PROTNONE bit */ -#define __swp_type(entry) (((entry).val >> 2) & 0x7f) -#define __swp_offset(entry) (((entry).val << 1) >> 10) -#define __swp_entry(type,offset) ((swp_entry_t) { ((type) << 2) | ((long) (offset) << 9) }) +#define __swp_type(entry) (((entry).val >> 1) & 0x7f) +#define __swp_offset(entry) (((entry).val << 1) >> 9) +#define __swp_entry(type,offset) ((swp_entry_t) { ((type) << 1) | ((long) (offset) << 8) }) #define __pte_to_swp_entry(pte) ((swp_entry_t) { pte_val(pte) }) #define __swp_entry_to_pte(x) ((pte_t) { (x).val }) -#define PTE_FILE_MAX_BITS 61 -#define pte_to_pgoff(pte) ((pte_val(pte) << 1) >> 3) -#define pgoff_to_pte(off) ((pte_t) { ((off) << 2) | _PAGE_FILE }) - /* * ZERO_PAGE is a global shared page that is always zero: used * for zero-mapped memory areas etc.. From 406b16e26d0996516c8d1641008a7d326bf282d6 Mon Sep 17 00:00:00 2001 From: "Kirill A. Shutemov" Date: Tue, 10 Feb 2015 14:10:39 -0800 Subject: [PATCH 53/78] m32r: drop _PAGE_FILE and pte_file()-related helpers We've replaced remap_file_pages(2) implementation with emulation. Nobody creates non-linear mapping anymore. Signed-off-by: Kirill A. Shutemov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/m32r/include/asm/pgtable-2level.h | 4 ---- arch/m32r/include/asm/pgtable.h | 11 ----------- 2 files changed, 15 deletions(-) diff --git a/arch/m32r/include/asm/pgtable-2level.h b/arch/m32r/include/asm/pgtable-2level.h index 9cdaf7350ef6da..8fd8ee70266a13 100644 --- a/arch/m32r/include/asm/pgtable-2level.h +++ b/arch/m32r/include/asm/pgtable-2level.h @@ -70,9 +70,5 @@ static inline pmd_t *pmd_offset(pgd_t * dir, unsigned long address) #define pfn_pte(pfn, prot) __pte(((pfn) << PAGE_SHIFT) | pgprot_val(prot)) #define pfn_pmd(pfn, prot) __pmd(((pfn) << PAGE_SHIFT) | pgprot_val(prot)) -#define PTE_FILE_MAX_BITS 29 -#define pte_to_pgoff(pte) (((pte_val(pte) >> 2) & 0x7f) | (((pte_val(pte) >> 10)) << 7)) -#define pgoff_to_pte(off) ((pte_t) { (((off) & 0x7f) << 2) | (((off) >> 7) << 10) | _PAGE_FILE }) - #endif /* __KERNEL__ */ #endif /* _ASM_M32R_PGTABLE_2LEVEL_H */ diff --git a/arch/m32r/include/asm/pgtable.h b/arch/m32r/include/asm/pgtable.h index 103ce6710f0724..050f7a686e3d53 100644 --- a/arch/m32r/include/asm/pgtable.h +++ b/arch/m32r/include/asm/pgtable.h @@ -80,8 +80,6 @@ extern unsigned long empty_zero_page[1024]; */ #define _PAGE_BIT_DIRTY 0 /* software: page changed */ -#define _PAGE_BIT_FILE 0 /* when !present: nonlinear file - mapping */ #define _PAGE_BIT_PRESENT 1 /* Valid: page is valid */ #define _PAGE_BIT_GLOBAL 2 /* Global */ #define _PAGE_BIT_LARGE 3 /* Large */ @@ -93,7 +91,6 @@ extern unsigned long empty_zero_page[1024]; #define _PAGE_BIT_PROTNONE 9 /* software: if not present */ #define _PAGE_DIRTY (1UL << _PAGE_BIT_DIRTY) -#define _PAGE_FILE (1UL << _PAGE_BIT_FILE) #define _PAGE_PRESENT (1UL << _PAGE_BIT_PRESENT) #define _PAGE_GLOBAL (1UL << _PAGE_BIT_GLOBAL) #define _PAGE_LARGE (1UL << _PAGE_BIT_LARGE) @@ -206,14 +203,6 @@ static inline int pte_write(pte_t pte) return pte_val(pte) & _PAGE_WRITE; } -/* - * The following only works if pte_present() is not true. - */ -static inline int pte_file(pte_t pte) -{ - return pte_val(pte) & _PAGE_FILE; -} - static inline int pte_special(pte_t pte) { return 0; From 1eeda0abf4425c91e7ce3ca32f1908c3a51bf84e Mon Sep 17 00:00:00 2001 From: "Kirill A. Shutemov" Date: Tue, 10 Feb 2015 14:10:41 -0800 Subject: [PATCH 54/78] m68k: drop _PAGE_FILE and pte_file()-related helpers We've replaced remap_file_pages(2) implementation with emulation. Nobody creates non-linear mapping anymore. Signed-off-by: Kirill A. Shutemov Cc: Geert Uytterhoeven Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/m68k/include/asm/mcf_pgtable.h | 23 ++--------------------- arch/m68k/include/asm/motorola_pgtable.h | 15 --------------- arch/m68k/include/asm/pgtable_no.h | 2 -- arch/m68k/include/asm/sun3_pgtable.h | 15 --------------- 4 files changed, 2 insertions(+), 53 deletions(-) diff --git a/arch/m68k/include/asm/mcf_pgtable.h b/arch/m68k/include/asm/mcf_pgtable.h index 3c793682e5d99a..2500ce04fcc412 100644 --- a/arch/m68k/include/asm/mcf_pgtable.h +++ b/arch/m68k/include/asm/mcf_pgtable.h @@ -35,7 +35,6 @@ * hitting hardware. */ #define CF_PAGE_DIRTY 0x00000001 -#define CF_PAGE_FILE 0x00000200 #define CF_PAGE_ACCESSED 0x00001000 #define _PAGE_CACHE040 0x020 /* 68040 cache mode, cachable, copyback */ @@ -243,11 +242,6 @@ static inline int pte_young(pte_t pte) return pte_val(pte) & CF_PAGE_ACCESSED; } -static inline int pte_file(pte_t pte) -{ - return pte_val(pte) & CF_PAGE_FILE; -} - static inline int pte_special(pte_t pte) { return 0; @@ -391,26 +385,13 @@ static inline void cache_page(void *vaddr) *ptep = pte_mkcache(*ptep); } -#define PTE_FILE_MAX_BITS 21 -#define PTE_FILE_SHIFT 11 - -static inline unsigned long pte_to_pgoff(pte_t pte) -{ - return pte_val(pte) >> PTE_FILE_SHIFT; -} - -static inline pte_t pgoff_to_pte(unsigned pgoff) -{ - return __pte((pgoff << PTE_FILE_SHIFT) + CF_PAGE_FILE); -} - /* * Encode and de-code a swap entry (must be !pte_none(e) && !pte_present(e)) */ #define __swp_type(x) ((x).val & 0xFF) -#define __swp_offset(x) ((x).val >> PTE_FILE_SHIFT) +#define __swp_offset(x) ((x).val >> 11) #define __swp_entry(typ, off) ((swp_entry_t) { (typ) | \ - (off << PTE_FILE_SHIFT) }) + (off << 11) }) #define __pte_to_swp_entry(pte) ((swp_entry_t) { pte_val(pte) }) #define __swp_entry_to_pte(x) (__pte((x).val)) diff --git a/arch/m68k/include/asm/motorola_pgtable.h b/arch/m68k/include/asm/motorola_pgtable.h index e0fdd4d080750a..0085aab80e5a76 100644 --- a/arch/m68k/include/asm/motorola_pgtable.h +++ b/arch/m68k/include/asm/motorola_pgtable.h @@ -28,7 +28,6 @@ #define _PAGE_CHG_MASK (PAGE_MASK | _PAGE_ACCESSED | _PAGE_DIRTY | _PAGE_NOCACHE) #define _PAGE_PROTNONE 0x004 -#define _PAGE_FILE 0x008 /* pagecache or swap? */ #ifndef __ASSEMBLY__ @@ -168,7 +167,6 @@ static inline void pgd_set(pgd_t *pgdp, pmd_t *pmdp) static inline int pte_write(pte_t pte) { return !(pte_val(pte) & _PAGE_RONLY); } static inline int pte_dirty(pte_t pte) { return pte_val(pte) & _PAGE_DIRTY; } static inline int pte_young(pte_t pte) { return pte_val(pte) & _PAGE_ACCESSED; } -static inline int pte_file(pte_t pte) { return pte_val(pte) & _PAGE_FILE; } static inline int pte_special(pte_t pte) { return 0; } static inline pte_t pte_wrprotect(pte_t pte) { pte_val(pte) |= _PAGE_RONLY; return pte; } @@ -266,19 +264,6 @@ static inline void cache_page(void *vaddr) } } -#define PTE_FILE_MAX_BITS 28 - -static inline unsigned long pte_to_pgoff(pte_t pte) -{ - return pte.pte >> 4; -} - -static inline pte_t pgoff_to_pte(unsigned off) -{ - pte_t pte = { (off << 4) + _PAGE_FILE }; - return pte; -} - /* Encode and de-code a swap entry (must be !pte_none(e) && !pte_present(e)) */ #define __swp_type(x) (((x).val >> 4) & 0xff) #define __swp_offset(x) ((x).val >> 12) diff --git a/arch/m68k/include/asm/pgtable_no.h b/arch/m68k/include/asm/pgtable_no.h index 11859b86b1f956..ac7d87a02335e2 100644 --- a/arch/m68k/include/asm/pgtable_no.h +++ b/arch/m68k/include/asm/pgtable_no.h @@ -37,8 +37,6 @@ extern void paging_init(void); #define __pte_to_swp_entry(pte) ((swp_entry_t) { pte_val(pte) }) #define __swp_entry_to_pte(x) ((pte_t) { (x).val }) -static inline int pte_file(pte_t pte) { return 0; } - /* * ZERO_PAGE is a global shared page that is always zero: used * for zero-mapped memory areas etc.. diff --git a/arch/m68k/include/asm/sun3_pgtable.h b/arch/m68k/include/asm/sun3_pgtable.h index f55aa04161e8ac..48657f9fdeceac 100644 --- a/arch/m68k/include/asm/sun3_pgtable.h +++ b/arch/m68k/include/asm/sun3_pgtable.h @@ -38,8 +38,6 @@ #define _PAGE_PRESENT (SUN3_PAGE_VALID) #define _PAGE_ACCESSED (SUN3_PAGE_ACCESSED) -#define PTE_FILE_MAX_BITS 28 - /* Compound page protection values. */ //todo: work out which ones *should* have SUN3_PAGE_NOCACHE and fix... // is it just PAGE_KERNEL and PAGE_SHARED? @@ -168,7 +166,6 @@ static inline void pgd_clear (pgd_t *pgdp) {} static inline int pte_write(pte_t pte) { return pte_val(pte) & SUN3_PAGE_WRITEABLE; } static inline int pte_dirty(pte_t pte) { return pte_val(pte) & SUN3_PAGE_MODIFIED; } static inline int pte_young(pte_t pte) { return pte_val(pte) & SUN3_PAGE_ACCESSED; } -static inline int pte_file(pte_t pte) { return pte_val(pte) & SUN3_PAGE_ACCESSED; } static inline int pte_special(pte_t pte) { return 0; } static inline pte_t pte_wrprotect(pte_t pte) { pte_val(pte) &= ~SUN3_PAGE_WRITEABLE; return pte; } @@ -202,18 +199,6 @@ static inline pmd_t *pmd_offset (pgd_t *pgd, unsigned long address) return (pmd_t *) pgd; } -static inline unsigned long pte_to_pgoff(pte_t pte) -{ - return pte.pte & SUN3_PAGE_PGNUM_MASK; -} - -static inline pte_t pgoff_to_pte(unsigned off) -{ - pte_t pte = { off + SUN3_PAGE_ACCESSED }; - return pte; -} - - /* Find an entry in the third-level pagetable. */ #define pte_index(address) ((address >> PAGE_SHIFT) & (PTRS_PER_PTE-1)) #define pte_offset_kernel(pmd, address) ((pte_t *) __pmd_page(*pmd) + pte_index(address)) From 22f9bf3950f20d24198791685f2dccac2c4ef38a Mon Sep 17 00:00:00 2001 From: "Kirill A. Shutemov" Date: Tue, 10 Feb 2015 14:10:45 -0800 Subject: [PATCH 55/78] metag: drop _PAGE_FILE and pte_file()-related helpers We've replaced remap_file_pages(2) implementation with emulation. Nobody creates non-linear mapping anymore. Signed-off-by: Kirill A. Shutemov Cc: James Hogan Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/metag/include/asm/pgtable.h | 6 ------ 1 file changed, 6 deletions(-) diff --git a/arch/metag/include/asm/pgtable.h b/arch/metag/include/asm/pgtable.h index 0d9dc548729686..d0604c0a870222 100644 --- a/arch/metag/include/asm/pgtable.h +++ b/arch/metag/include/asm/pgtable.h @@ -47,7 +47,6 @@ */ #define _PAGE_ACCESSED _PAGE_ALWAYS_ZERO_1 #define _PAGE_DIRTY _PAGE_ALWAYS_ZERO_2 -#define _PAGE_FILE _PAGE_ALWAYS_ZERO_3 /* Pages owned, and protected by, the kernel. */ #define _PAGE_KERNEL _PAGE_PRIV @@ -219,7 +218,6 @@ extern unsigned long empty_zero_page; static inline int pte_write(pte_t pte) { return pte_val(pte) & _PAGE_WRITE; } static inline int pte_dirty(pte_t pte) { return pte_val(pte) & _PAGE_DIRTY; } static inline int pte_young(pte_t pte) { return pte_val(pte) & _PAGE_ACCESSED; } -static inline int pte_file(pte_t pte) { return pte_val(pte) & _PAGE_FILE; } static inline int pte_special(pte_t pte) { return 0; } static inline pte_t pte_wrprotect(pte_t pte) { pte_val(pte) &= (~_PAGE_WRITE); return pte; } @@ -327,10 +325,6 @@ static inline void update_mmu_cache(struct vm_area_struct *vma, #define __pte_to_swp_entry(pte) ((swp_entry_t) { pte_val(pte) }) #define __swp_entry_to_pte(x) ((pte_t) { (x).val }) -#define PTE_FILE_MAX_BITS 22 -#define pte_to_pgoff(x) (pte_val(x) >> 10) -#define pgoff_to_pte(x) __pte(((x) << 10) | _PAGE_FILE) - #define kern_addr_valid(addr) (1) /* From 937fa39fb22fea1c1d8ca9e5f31c452b91ac7239 Mon Sep 17 00:00:00 2001 From: "Kirill A. Shutemov" Date: Tue, 10 Feb 2015 14:10:47 -0800 Subject: [PATCH 56/78] microblaze: drop _PAGE_FILE and pte_file()-related helpers We've replaced remap_file_pages(2) implementation with emulation. Nobody creates non-linear mapping anymore. Signed-off-by: Kirill A. Shutemov Cc: Michal Simek Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/microblaze/include/asm/pgtable.h | 11 ----------- 1 file changed, 11 deletions(-) diff --git a/arch/microblaze/include/asm/pgtable.h b/arch/microblaze/include/asm/pgtable.h index df19d0c47be8df..91b9b46fbb5d5f 100644 --- a/arch/microblaze/include/asm/pgtable.h +++ b/arch/microblaze/include/asm/pgtable.h @@ -40,10 +40,6 @@ extern int mem_init_done; #define __pte_to_swp_entry(pte) ((swp_entry_t) { pte_val(pte) }) #define __swp_entry_to_pte(x) ((pte_t) { (x).val }) -#ifndef __ASSEMBLY__ -static inline int pte_file(pte_t pte) { return 0; } -#endif /* __ASSEMBLY__ */ - #define ZERO_PAGE(vaddr) ({ BUG(); NULL; }) #define swapper_pg_dir ((pgd_t *) NULL) @@ -207,7 +203,6 @@ static inline pte_t pte_mkspecial(pte_t pte) { return pte; } /* Definitions for MicroBlaze. */ #define _PAGE_GUARDED 0x001 /* G: page is guarded from prefetch */ -#define _PAGE_FILE 0x001 /* when !present: nonlinear file mapping */ #define _PAGE_PRESENT 0x002 /* software: PTE contains a translation */ #define _PAGE_NO_CACHE 0x004 /* I: caching is inhibited */ #define _PAGE_WRITETHRU 0x008 /* W: caching is write-through */ @@ -337,7 +332,6 @@ static inline int pte_write(pte_t pte) { return pte_val(pte) & _PAGE_RW; } static inline int pte_exec(pte_t pte) { return pte_val(pte) & _PAGE_EXEC; } static inline int pte_dirty(pte_t pte) { return pte_val(pte) & _PAGE_DIRTY; } static inline int pte_young(pte_t pte) { return pte_val(pte) & _PAGE_ACCESSED; } -static inline int pte_file(pte_t pte) { return pte_val(pte) & _PAGE_FILE; } static inline void pte_uncache(pte_t pte) { pte_val(pte) |= _PAGE_NO_CACHE; } static inline void pte_cache(pte_t pte) { pte_val(pte) &= ~_PAGE_NO_CACHE; } @@ -499,11 +493,6 @@ static inline pmd_t *pmd_offset(pgd_t *dir, unsigned long address) #define pte_unmap(pte) kunmap_atomic(pte) -/* Encode and decode a nonlinear file mapping entry */ -#define PTE_FILE_MAX_BITS 29 -#define pte_to_pgoff(pte) (pte_val(pte) >> 3) -#define pgoff_to_pte(off) ((pte_t) { ((off) << 3) | _PAGE_FILE }) - extern pgd_t swapper_pg_dir[PTRS_PER_PGD]; /* From b32da82e28ce90bff4e371fc15d2816fa3175bb0 Mon Sep 17 00:00:00 2001 From: "Kirill A. Shutemov" Date: Tue, 10 Feb 2015 14:10:50 -0800 Subject: [PATCH 57/78] mips: drop _PAGE_FILE and pte_file()-related helpers We've replaced remap_file_pages(2) implementation with emulation. Nobody creates non-linear mapping anymore. Signed-off-by: Kirill A. Shutemov Cc: Ralf Baechle Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/mips/include/asm/pgtable-32.h | 36 ---------------------------- arch/mips/include/asm/pgtable-64.h | 9 ------- arch/mips/include/asm/pgtable-bits.h | 9 ------- arch/mips/include/asm/pgtable.h | 2 -- 4 files changed, 56 deletions(-) diff --git a/arch/mips/include/asm/pgtable-32.h b/arch/mips/include/asm/pgtable-32.h index 68984b612f9dfd..16aa9f23e17b1e 100644 --- a/arch/mips/include/asm/pgtable-32.h +++ b/arch/mips/include/asm/pgtable-32.h @@ -161,22 +161,6 @@ pfn_pte(unsigned long pfn, pgprot_t prot) #define __pte_to_swp_entry(pte) ((swp_entry_t) { pte_val(pte) }) #define __swp_entry_to_pte(x) ((pte_t) { (x).val }) -/* - * Encode and decode a nonlinear file mapping entry - */ -#define pte_to_pgoff(_pte) ((((_pte).pte >> 1 ) & 0x07) | \ - (((_pte).pte >> 2 ) & 0x38) | \ - (((_pte).pte >> 10) << 6 )) - -#define pgoff_to_pte(off) ((pte_t) { (((off) & 0x07) << 1 ) | \ - (((off) & 0x38) << 2 ) | \ - (((off) >> 6 ) << 10) | \ - _PAGE_FILE }) - -/* - * Bits 0, 4, 8, and 9 are taken, split up 28 bits of offset into this range: - */ -#define PTE_FILE_MAX_BITS 28 #else #if defined(CONFIG_PHYS_ADDR_T_64BIT) && defined(CONFIG_CPU_MIPS32) @@ -188,13 +172,6 @@ pfn_pte(unsigned long pfn, pgprot_t prot) #define __pte_to_swp_entry(pte) ((swp_entry_t) { (pte).pte_high }) #define __swp_entry_to_pte(x) ((pte_t) { 0, (x).val }) -/* - * Bits 0 and 1 of pte_high are taken, use the rest for the page offset... - */ -#define pte_to_pgoff(_pte) ((_pte).pte_high >> 2) -#define pgoff_to_pte(off) ((pte_t) { _PAGE_FILE, (off) << 2 }) - -#define PTE_FILE_MAX_BITS 30 #else /* * Constraints: @@ -209,19 +186,6 @@ pfn_pte(unsigned long pfn, pgprot_t prot) #define __pte_to_swp_entry(pte) ((swp_entry_t) { pte_val(pte) }) #define __swp_entry_to_pte(x) ((pte_t) { (x).val }) -/* - * Encode and decode a nonlinear file mapping entry - */ -#define pte_to_pgoff(_pte) ((((_pte).pte >> 1) & 0x7) | \ - (((_pte).pte >> 2) & 0x8) | \ - (((_pte).pte >> 8) << 4)) - -#define pgoff_to_pte(off) ((pte_t) { (((off) & 0x7) << 1) | \ - (((off) & 0x8) << 2) | \ - (((off) >> 4) << 8) | \ - _PAGE_FILE }) - -#define PTE_FILE_MAX_BITS 28 #endif /* defined(CONFIG_PHYS_ADDR_T_64BIT) && defined(CONFIG_CPU_MIPS32) */ #endif /* defined(CONFIG_CPU_R3000) || defined(CONFIG_CPU_TX39XX) */ diff --git a/arch/mips/include/asm/pgtable-64.h b/arch/mips/include/asm/pgtable-64.h index e1c49a96807d68..1659bb91ae2114 100644 --- a/arch/mips/include/asm/pgtable-64.h +++ b/arch/mips/include/asm/pgtable-64.h @@ -291,13 +291,4 @@ static inline pte_t mk_swap_pte(unsigned long type, unsigned long offset) #define __pte_to_swp_entry(pte) ((swp_entry_t) { pte_val(pte) }) #define __swp_entry_to_pte(x) ((pte_t) { (x).val }) -/* - * Bits 0, 4, 6, and 7 are taken. Let's leave bits 1, 2, 3, and 5 alone to - * make things easier, and only use the upper 56 bits for the page offset... - */ -#define PTE_FILE_MAX_BITS 56 - -#define pte_to_pgoff(_pte) ((_pte).pte >> 8) -#define pgoff_to_pte(off) ((pte_t) { ((off) << 8) | _PAGE_FILE }) - #endif /* _ASM_PGTABLE_64_H */ diff --git a/arch/mips/include/asm/pgtable-bits.h b/arch/mips/include/asm/pgtable-bits.h index ca11f14f40a3fe..fc807aa5ec8d75 100644 --- a/arch/mips/include/asm/pgtable-bits.h +++ b/arch/mips/include/asm/pgtable-bits.h @@ -48,8 +48,6 @@ /* * The following bits are implemented in software - * - * _PAGE_FILE semantics: set:pagecache unset:swap */ #define _PAGE_PRESENT_SHIFT (_CACHE_SHIFT + 3) #define _PAGE_PRESENT (1 << _PAGE_PRESENT_SHIFT) @@ -64,7 +62,6 @@ #define _PAGE_SILENT_READ _PAGE_VALID #define _PAGE_SILENT_WRITE _PAGE_DIRTY -#define _PAGE_FILE _PAGE_MODIFIED #define _PFN_SHIFT (PAGE_SHIFT - 12 + _CACHE_SHIFT + 3) @@ -72,8 +69,6 @@ /* * The following are implemented by software - * - * _PAGE_FILE semantics: set:pagecache unset:swap */ #define _PAGE_PRESENT_SHIFT 0 #define _PAGE_PRESENT (1 << _PAGE_PRESENT_SHIFT) @@ -85,8 +80,6 @@ #define _PAGE_ACCESSED (1 << _PAGE_ACCESSED_SHIFT) #define _PAGE_MODIFIED_SHIFT 4 #define _PAGE_MODIFIED (1 << _PAGE_MODIFIED_SHIFT) -#define _PAGE_FILE_SHIFT 4 -#define _PAGE_FILE (1 << _PAGE_FILE_SHIFT) /* * And these are the hardware TLB bits @@ -116,7 +109,6 @@ * The following bits are implemented in software * * _PAGE_READ / _PAGE_READ_SHIFT should be unused if cpu_has_rixi. - * _PAGE_FILE semantics: set:pagecache unset:swap */ #define _PAGE_PRESENT_SHIFT (0) #define _PAGE_PRESENT (1 << _PAGE_PRESENT_SHIFT) @@ -128,7 +120,6 @@ #define _PAGE_ACCESSED (1 << _PAGE_ACCESSED_SHIFT) #define _PAGE_MODIFIED_SHIFT (_PAGE_ACCESSED_SHIFT + 1) #define _PAGE_MODIFIED (1 << _PAGE_MODIFIED_SHIFT) -#define _PAGE_FILE (_PAGE_MODIFIED) #ifdef CONFIG_MIPS_HUGE_TLB_SUPPORT /* huge tlb page */ diff --git a/arch/mips/include/asm/pgtable.h b/arch/mips/include/asm/pgtable.h index 62a6ba383d4fdf..583ff42154794d 100644 --- a/arch/mips/include/asm/pgtable.h +++ b/arch/mips/include/asm/pgtable.h @@ -231,7 +231,6 @@ extern pgd_t swapper_pg_dir[]; static inline int pte_write(pte_t pte) { return pte.pte_low & _PAGE_WRITE; } static inline int pte_dirty(pte_t pte) { return pte.pte_low & _PAGE_MODIFIED; } static inline int pte_young(pte_t pte) { return pte.pte_low & _PAGE_ACCESSED; } -static inline int pte_file(pte_t pte) { return pte.pte_low & _PAGE_FILE; } static inline pte_t pte_wrprotect(pte_t pte) { @@ -287,7 +286,6 @@ static inline pte_t pte_mkyoung(pte_t pte) static inline int pte_write(pte_t pte) { return pte_val(pte) & _PAGE_WRITE; } static inline int pte_dirty(pte_t pte) { return pte_val(pte) & _PAGE_MODIFIED; } static inline int pte_young(pte_t pte) { return pte_val(pte) & _PAGE_ACCESSED; } -static inline int pte_file(pte_t pte) { return pte_val(pte) & _PAGE_FILE; } static inline pte_t pte_wrprotect(pte_t pte) { From 6bf63a8ccb1dccd6ab81bc8bc46863493629cdb8 Mon Sep 17 00:00:00 2001 From: "Kirill A. Shutemov" Date: Tue, 10 Feb 2015 14:10:53 -0800 Subject: [PATCH 58/78] mn10300: drop _PAGE_FILE and pte_file()-related helpers We've replaced remap_file_pages(2) implementation with emulation. Nobody creates non-linear mapping anymore. This patch also increases the number of bits availble for swap offset. Signed-off-by: Kirill A. Shutemov Cc: David Howells Cc: Koichi Yasutake Cc: Hugh Dickins Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/mn10300/include/asm/pgtable.h | 17 +++-------------- 1 file changed, 3 insertions(+), 14 deletions(-) diff --git a/arch/mn10300/include/asm/pgtable.h b/arch/mn10300/include/asm/pgtable.h index 2ddaa67e79834a..629181ae111e5d 100644 --- a/arch/mn10300/include/asm/pgtable.h +++ b/arch/mn10300/include/asm/pgtable.h @@ -134,7 +134,6 @@ extern pte_t kernel_vmalloc_ptes[(VMALLOC_END - VMALLOC_START) / PAGE_SIZE]; #define _PAGE_NX 0 /* no-execute bit */ /* If _PAGE_VALID is clear, we use these: */ -#define _PAGE_FILE xPTEL2_C /* set:pagecache unset:swap */ #define _PAGE_PROTNONE 0x000 /* If not present */ #define __PAGE_PROT_UWAUX 0x010 @@ -241,11 +240,6 @@ static inline int pte_young(pte_t pte) { return pte_val(pte) & _PAGE_ACCESSED; } static inline int pte_write(pte_t pte) { return pte_val(pte) & __PAGE_PROT_WRITE; } static inline int pte_special(pte_t pte){ return 0; } -/* - * The following only works if pte_present() is not true. - */ -static inline int pte_file(pte_t pte) { return pte_val(pte) & _PAGE_FILE; } - static inline pte_t pte_rdprotect(pte_t pte) { pte_val(pte) &= ~(__PAGE_PROT_USER|__PAGE_PROT_UWAUX); return pte; @@ -338,16 +332,11 @@ static inline int pte_exec_kernel(pte_t pte) return 1; } -#define PTE_FILE_MAX_BITS 30 - -#define pte_to_pgoff(pte) (pte_val(pte) >> 2) -#define pgoff_to_pte(off) __pte((off) << 2 | _PAGE_FILE) - /* Encode and de-code a swap entry */ -#define __swp_type(x) (((x).val >> 2) & 0x3f) -#define __swp_offset(x) ((x).val >> 8) +#define __swp_type(x) (((x).val >> 1) & 0x3f) +#define __swp_offset(x) ((x).val >> 7) #define __swp_entry(type, offset) \ - ((swp_entry_t) { ((type) << 2) | ((offset) << 8) }) + ((swp_entry_t) { ((type) << 1) | ((offset) << 7) }) #define __pte_to_swp_entry(pte) ((swp_entry_t) { pte_val(pte) }) #define __swp_entry_to_pte(x) __pte((x).val) From 3ee802ead2de2346dd110e233d14602b2de26b3c Mon Sep 17 00:00:00 2001 From: "Kirill A. Shutemov" Date: Tue, 10 Feb 2015 14:10:56 -0800 Subject: [PATCH 59/78] nios2: drop _PAGE_FILE and pte_file()-related helpers We've replaced remap_file_pages(2) implementation with emulation. Nobody creates non-linear mapping anymore. Signed-off-by: Kirill A. Shutemov Cc: Ley Foon Tan Reviewed-by: Tobias Klauser Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/nios2/include/asm/pgtable-bits.h | 1 - arch/nios2/include/asm/pgtable.h | 10 +--------- 2 files changed, 1 insertion(+), 10 deletions(-) diff --git a/arch/nios2/include/asm/pgtable-bits.h b/arch/nios2/include/asm/pgtable-bits.h index ce9e7069aa969e..bfddff383e89a7 100644 --- a/arch/nios2/include/asm/pgtable-bits.h +++ b/arch/nios2/include/asm/pgtable-bits.h @@ -30,6 +30,5 @@ #define _PAGE_PRESENT (1<<25) /* PTE contains a translation */ #define _PAGE_ACCESSED (1<<26) /* page referenced */ #define _PAGE_DIRTY (1<<27) /* dirty page */ -#define _PAGE_FILE (1<<28) /* PTE used for file mapping or swap */ #endif /* _ASM_NIOS2_PGTABLE_BITS_H */ diff --git a/arch/nios2/include/asm/pgtable.h b/arch/nios2/include/asm/pgtable.h index ccbaffd47671d9..7b292e3a3138cc 100644 --- a/arch/nios2/include/asm/pgtable.h +++ b/arch/nios2/include/asm/pgtable.h @@ -112,8 +112,6 @@ static inline int pte_dirty(pte_t pte) \ { return pte_val(pte) & _PAGE_DIRTY; } static inline int pte_young(pte_t pte) \ { return pte_val(pte) & _PAGE_ACCESSED; } -static inline int pte_file(pte_t pte) \ - { return pte_val(pte) & _PAGE_FILE; } static inline int pte_special(pte_t pte) { return 0; } #define pgprot_noncached pgprot_noncached @@ -272,8 +270,7 @@ static inline void pte_clear(struct mm_struct *mm, __FILE__, __LINE__, pgd_val(e)) /* - * Encode and decode a swap entry (must be !pte_none(pte) && !pte_present(pte) - * && !pte_file(pte)): + * Encode and decode a swap entry (must be !pte_none(pte) && !pte_present(pte): * * 31 30 29 28 27 26 25 24 23 22 21 20 19 18 ... 1 0 * 0 0 0 0 type. 0 0 0 0 0 0 offset......... @@ -290,11 +287,6 @@ static inline void pte_clear(struct mm_struct *mm, #define __swp_entry_to_pte(swp) ((pte_t) { (swp).val }) #define __pte_to_swp_entry(pte) ((swp_entry_t) { pte_val(pte) }) -/* Encode and decode a nonlinear file mapping entry */ -#define PTE_FILE_MAX_BITS 25 -#define pte_to_pgoff(pte) (pte_val(pte) & 0x1ffffff) -#define pgoff_to_pte(off) __pte(((off) & 0x1ffffff) | _PAGE_FILE) - #define kern_addr_valid(addr) (1) #include From 3824e3cf7e865b2ff0b71de23b16e332fe6a853a Mon Sep 17 00:00:00 2001 From: "Kirill A. Shutemov" Date: Tue, 10 Feb 2015 14:10:58 -0800 Subject: [PATCH 60/78] openrisc: drop _PAGE_FILE and pte_file()-related helpers We've replaced remap_file_pages(2) implementation with emulation. Nobody creates non-linear mapping anymore. Signed-off-by: Kirill A. Shutemov Cc: Jonas Bonn Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/openrisc/include/asm/pgtable.h | 8 -------- arch/openrisc/kernel/head.S | 5 ----- 2 files changed, 13 deletions(-) diff --git a/arch/openrisc/include/asm/pgtable.h b/arch/openrisc/include/asm/pgtable.h index 37bf6a3ef8f4f5..18994ccb1185db 100644 --- a/arch/openrisc/include/asm/pgtable.h +++ b/arch/openrisc/include/asm/pgtable.h @@ -125,7 +125,6 @@ extern void paging_init(void); #define _PAGE_CC 0x001 /* software: pte contains a translation */ #define _PAGE_CI 0x002 /* cache inhibit */ #define _PAGE_WBC 0x004 /* write back cache */ -#define _PAGE_FILE 0x004 /* set: pagecache, unset: swap (when !PRESENT) */ #define _PAGE_WOM 0x008 /* weakly ordered memory */ #define _PAGE_A 0x010 /* accessed */ @@ -240,7 +239,6 @@ static inline int pte_write(pte_t pte) { return pte_val(pte) & _PAGE_WRITE; } static inline int pte_exec(pte_t pte) { return pte_val(pte) & _PAGE_EXEC; } static inline int pte_dirty(pte_t pte) { return pte_val(pte) & _PAGE_DIRTY; } static inline int pte_young(pte_t pte) { return pte_val(pte) & _PAGE_ACCESSED; } -static inline int pte_file(pte_t pte) { return pte_val(pte) & _PAGE_FILE; } static inline int pte_special(pte_t pte) { return 0; } static inline pte_t pte_mkspecial(pte_t pte) { return pte; } @@ -438,12 +436,6 @@ static inline void update_mmu_cache(struct vm_area_struct *vma, #define __pte_to_swp_entry(pte) ((swp_entry_t) { pte_val(pte) }) #define __swp_entry_to_pte(x) ((pte_t) { (x).val }) -/* Encode and decode a nonlinear file mapping entry */ - -#define PTE_FILE_MAX_BITS 26 -#define pte_to_pgoff(x) (pte_val(x) >> 6) -#define pgoff_to_pte(x) __pte(((x) << 6) | _PAGE_FILE) - #define kern_addr_valid(addr) (1) #include diff --git a/arch/openrisc/kernel/head.S b/arch/openrisc/kernel/head.S index 1d3c9c28ac25ba..f14793306b03f3 100644 --- a/arch/openrisc/kernel/head.S +++ b/arch/openrisc/kernel/head.S @@ -754,11 +754,6 @@ _dc_enable: /* ===============================================[ page table masks ]=== */ -/* bit 4 is used in hardware as write back cache bit. we never use this bit - * explicitly, so we can reuse it as _PAGE_FILE bit and mask it out when - * writing into hardware pte's - */ - #define DTLB_UP_CONVERT_MASK 0x3fa #define ITLB_UP_CONVERT_MASK 0x3a From 8d55da810f1fabcf1d4c0bbc46205e5f2c0fa84b Mon Sep 17 00:00:00 2001 From: "Kirill A. Shutemov" Date: Tue, 10 Feb 2015 14:11:01 -0800 Subject: [PATCH 61/78] parisc: drop _PAGE_FILE and pte_file()-related helpers We've replaced remap_file_pages(2) implementation with emulation. Nobody creates non-linear mapping anymore. Signed-off-by: Kirill A. Shutemov Cc: "James E.J. Bottomley" Cc: Helge Deller Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/parisc/include/asm/pgtable.h | 10 ---------- 1 file changed, 10 deletions(-) diff --git a/arch/parisc/include/asm/pgtable.h b/arch/parisc/include/asm/pgtable.h index 22b89d1edba7a9..1d49a4a7749b44 100644 --- a/arch/parisc/include/asm/pgtable.h +++ b/arch/parisc/include/asm/pgtable.h @@ -146,7 +146,6 @@ extern void purge_tlb_entries(struct mm_struct *, unsigned long); #define _PAGE_GATEWAY_BIT 28 /* (0x008) privilege promotion allowed */ #define _PAGE_DMB_BIT 27 /* (0x010) Data Memory Break enable (B bit) */ #define _PAGE_DIRTY_BIT 26 /* (0x020) Page Dirty (D bit) */ -#define _PAGE_FILE_BIT _PAGE_DIRTY_BIT /* overload this bit */ #define _PAGE_REFTRAP_BIT 25 /* (0x040) Page Ref. Trap enable (T bit) */ #define _PAGE_NO_CACHE_BIT 24 /* (0x080) Uncached Page (U bit) */ #define _PAGE_ACCESSED_BIT 23 /* (0x100) Software: Page Accessed */ @@ -167,13 +166,6 @@ extern void purge_tlb_entries(struct mm_struct *, unsigned long); /* PFN_PTE_SHIFT defines the shift of a PTE value to access the PFN field */ #define PFN_PTE_SHIFT 12 - -/* this is how many bits may be used by the file functions */ -#define PTE_FILE_MAX_BITS (BITS_PER_LONG - PTE_SHIFT) - -#define pte_to_pgoff(pte) (pte_val(pte) >> PTE_SHIFT) -#define pgoff_to_pte(off) ((pte_t) { ((off) << PTE_SHIFT) | _PAGE_FILE }) - #define _PAGE_READ (1 << xlate_pabit(_PAGE_READ_BIT)) #define _PAGE_WRITE (1 << xlate_pabit(_PAGE_WRITE_BIT)) #define _PAGE_RW (_PAGE_READ | _PAGE_WRITE) @@ -186,7 +178,6 @@ extern void purge_tlb_entries(struct mm_struct *, unsigned long); #define _PAGE_ACCESSED (1 << xlate_pabit(_PAGE_ACCESSED_BIT)) #define _PAGE_PRESENT (1 << xlate_pabit(_PAGE_PRESENT_BIT)) #define _PAGE_USER (1 << xlate_pabit(_PAGE_USER_BIT)) -#define _PAGE_FILE (1 << xlate_pabit(_PAGE_FILE_BIT)) #define _PAGE_TABLE (_PAGE_PRESENT | _PAGE_READ | _PAGE_WRITE | _PAGE_DIRTY | _PAGE_ACCESSED) #define _PAGE_CHG_MASK (PAGE_MASK | _PAGE_ACCESSED | _PAGE_DIRTY) @@ -344,7 +335,6 @@ static inline void pgd_clear(pgd_t * pgdp) { } static inline int pte_dirty(pte_t pte) { return pte_val(pte) & _PAGE_DIRTY; } static inline int pte_young(pte_t pte) { return pte_val(pte) & _PAGE_ACCESSED; } static inline int pte_write(pte_t pte) { return pte_val(pte) & _PAGE_WRITE; } -static inline int pte_file(pte_t pte) { return pte_val(pte) & _PAGE_FILE; } static inline int pte_special(pte_t pte) { return 0; } static inline pte_t pte_mkclean(pte_t pte) { pte_val(pte) &= ~_PAGE_DIRTY; return pte; } From 6e76d4b20bf6b514408ab5bd07f4a76723259b64 Mon Sep 17 00:00:00 2001 From: "Kirill A. Shutemov" Date: Tue, 10 Feb 2015 14:11:04 -0800 Subject: [PATCH 62/78] s390: drop pte_file()-related helpers We've replaced remap_file_pages(2) implementation with emulation. Nobody creates non-linear mapping anymore. Signed-off-by: Kirill A. Shutemov Acked-by: Martin Schwidefsky Cc: Heiko Carstens Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/s390/include/asm/pgtable.h | 29 ++++------------------------- 1 file changed, 4 insertions(+), 25 deletions(-) diff --git a/arch/s390/include/asm/pgtable.h b/arch/s390/include/asm/pgtable.h index 5e102422c9abd4..ffb1d8ce97aeac 100644 --- a/arch/s390/include/asm/pgtable.h +++ b/arch/s390/include/asm/pgtable.h @@ -249,10 +249,10 @@ static inline int is_module_addr(void *addr) _PAGE_YOUNG) /* - * handle_pte_fault uses pte_present, pte_none and pte_file to find out the - * pte type WITHOUT holding the page table lock. The _PAGE_PRESENT bit - * is used to distinguish present from not-present ptes. It is changed only - * with the page table lock held. + * handle_pte_fault uses pte_present and pte_none to find out the pte type + * WITHOUT holding the page table lock. The _PAGE_PRESENT bit is used to + * distinguish present from not-present ptes. It is changed only with the page + * table lock held. * * The following table gives the different possible bit combinations for * the pte hardware and software bits in the last 12 bits of a pte: @@ -279,7 +279,6 @@ static inline int is_module_addr(void *addr) * * pte_present is true for the bit pattern .xx...xxxxx1, (pte & 0x001) == 0x001 * pte_none is true for the bit pattern .10...xxxx00, (pte & 0x603) == 0x400 - * pte_file is true for the bit pattern .11...xxxxx0, (pte & 0x601) == 0x600 * pte_swap is true for the bit pattern .10...xxxx10, (pte & 0x603) == 0x402 */ @@ -671,13 +670,6 @@ static inline int pte_swap(pte_t pte) == (_PAGE_INVALID | _PAGE_TYPE); } -static inline int pte_file(pte_t pte) -{ - /* Bit pattern: (pte & 0x601) == 0x600 */ - return (pte_val(pte) & (_PAGE_INVALID | _PAGE_PROTECT | _PAGE_PRESENT)) - == (_PAGE_INVALID | _PAGE_PROTECT); -} - static inline int pte_special(pte_t pte) { return (pte_val(pte) & _PAGE_SPECIAL); @@ -1756,19 +1748,6 @@ static inline pte_t mk_swap_pte(unsigned long type, unsigned long offset) #define __pte_to_swp_entry(pte) ((swp_entry_t) { pte_val(pte) }) #define __swp_entry_to_pte(x) ((pte_t) { (x).val }) -#ifndef CONFIG_64BIT -# define PTE_FILE_MAX_BITS 26 -#else /* CONFIG_64BIT */ -# define PTE_FILE_MAX_BITS 59 -#endif /* CONFIG_64BIT */ - -#define pte_to_pgoff(__pte) \ - ((((__pte).pte >> 12) << 7) + (((__pte).pte >> 1) & 0x7f)) - -#define pgoff_to_pte(__off) \ - ((pte_t) { ((((__off) & 0x7f) << 1) + (((__off) >> 7) << 12)) \ - | _PAGE_INVALID | _PAGE_PROTECT }) - #endif /* !__ASSEMBLY__ */ #define kern_addr_valid(addr) (1) From 917e401ea75478d4f4575bc8b0ef3d14ecf9ef69 Mon Sep 17 00:00:00 2001 From: "Kirill A. Shutemov" Date: Tue, 10 Feb 2015 14:11:06 -0800 Subject: [PATCH 63/78] score: drop _PAGE_FILE and pte_file()-related helpers We've replaced remap_file_pages(2) implementation with emulation. Nobody creates non-linear mapping anymore. This patch also increase number of bits availble for swap offset. Signed-off-by: Kirill A. Shutemov Cc: Chen Liqin Cc: Lennox Wu Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/score/include/asm/pgtable-bits.h | 1 - arch/score/include/asm/pgtable.h | 18 ++---------------- 2 files changed, 2 insertions(+), 17 deletions(-) diff --git a/arch/score/include/asm/pgtable-bits.h b/arch/score/include/asm/pgtable-bits.h index 7d65a96a82e56f..0e5c6f466520ab 100644 --- a/arch/score/include/asm/pgtable-bits.h +++ b/arch/score/include/asm/pgtable-bits.h @@ -6,7 +6,6 @@ #define _PAGE_WRITE (1<<7) /* implemented in software */ #define _PAGE_PRESENT (1<<9) /* implemented in software */ #define _PAGE_MODIFIED (1<<10) /* implemented in software */ -#define _PAGE_FILE (1<<10) #define _PAGE_GLOBAL (1<<0) #define _PAGE_VALID (1<<1) diff --git a/arch/score/include/asm/pgtable.h b/arch/score/include/asm/pgtable.h index db96ad9afc03ed..5170ffdea64350 100644 --- a/arch/score/include/asm/pgtable.h +++ b/arch/score/include/asm/pgtable.h @@ -90,15 +90,6 @@ static inline void pmd_clear(pmd_t *pmdp) ((pte_t *)page_address(pmd_page(*(dir))) + __pte_offset(address)) #define pte_unmap(pte) ((void)(pte)) -/* - * Bits 9(_PAGE_PRESENT) and 10(_PAGE_FILE)are taken, - * split up 30 bits of offset into this range: - */ -#define PTE_FILE_MAX_BITS 30 -#define pte_to_pgoff(_pte) \ - (((_pte).pte & 0x1ff) | (((_pte).pte >> 11) << 9)) -#define pgoff_to_pte(off) \ - ((pte_t) {((off) & 0x1ff) | (((off) >> 9) << 11) | _PAGE_FILE}) #define __pte_to_swp_entry(pte) \ ((swp_entry_t) { pte_val(pte)}) #define __swp_entry_to_pte(x) ((pte_t) {(x).val}) @@ -169,8 +160,8 @@ static inline pgprot_t pgprot_noncached(pgprot_t _prot) } #define __swp_type(x) ((x).val & 0x1f) -#define __swp_offset(x) ((x).val >> 11) -#define __swp_entry(type, offset) ((swp_entry_t){(type) | ((offset) << 11)}) +#define __swp_offset(x) ((x).val >> 10) +#define __swp_entry(type, offset) ((swp_entry_t){(type) | ((offset) << 10)}) extern unsigned long empty_zero_page; extern unsigned long zero_page_mask; @@ -198,11 +189,6 @@ static inline int pte_young(pte_t pte) return pte_val(pte) & _PAGE_ACCESSED; } -static inline int pte_file(pte_t pte) -{ - return pte_val(pte) & _PAGE_FILE; -} - #define pte_special(pte) (0) static inline pte_t pte_wrprotect(pte_t pte) From 8b70beac99466b6d164de9fe647b3567e6f17e3a Mon Sep 17 00:00:00 2001 From: "Kirill A. Shutemov" Date: Tue, 10 Feb 2015 14:11:09 -0800 Subject: [PATCH 64/78] sh: drop _PAGE_FILE and pte_file()-related helpers We've replaced remap_file_pages(2) implementation with emulation. Nobody creates non-linear mapping anymore. Signed-off-by: Kirill A. Shutemov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/sh/include/asm/pgtable_32.h | 30 ++++-------------------------- arch/sh/include/asm/pgtable_64.h | 9 +-------- 2 files changed, 5 insertions(+), 34 deletions(-) diff --git a/arch/sh/include/asm/pgtable_32.h b/arch/sh/include/asm/pgtable_32.h index 0bce3d81569ee2..c646e563abceca 100644 --- a/arch/sh/include/asm/pgtable_32.h +++ b/arch/sh/include/asm/pgtable_32.h @@ -26,8 +26,6 @@ * and timing control which (together with bit 0) are moved into the * old-style PTEA on the parts that support it. * - * XXX: Leave the _PAGE_FILE and _PAGE_WT overhaul for a rainy day. - * * SH-X2 MMUs and extended PTEs * * SH-X2 supports an extended mode TLB with split data arrays due to the @@ -51,7 +49,6 @@ #define _PAGE_PRESENT 0x100 /* V-bit : page is valid */ #define _PAGE_PROTNONE 0x200 /* software: if not present */ #define _PAGE_ACCESSED 0x400 /* software: page referenced */ -#define _PAGE_FILE _PAGE_WT /* software: pagecache or swap? */ #define _PAGE_SPECIAL 0x800 /* software: special page */ #define _PAGE_SZ_MASK (_PAGE_SZ0 | _PAGE_SZ1) @@ -105,14 +102,13 @@ static inline unsigned long copy_ptea_attributes(unsigned long x) /* Mask which drops unused bits from the PTEL value */ #if defined(CONFIG_CPU_SH3) #define _PAGE_CLEAR_FLAGS (_PAGE_PROTNONE | _PAGE_ACCESSED| \ - _PAGE_FILE | _PAGE_SZ1 | \ - _PAGE_HW_SHARED) + _PAGE_SZ1 | _PAGE_HW_SHARED) #elif defined(CONFIG_X2TLB) /* Get rid of the legacy PR/SZ bits when using extended mode */ #define _PAGE_CLEAR_FLAGS (_PAGE_PROTNONE | _PAGE_ACCESSED | \ - _PAGE_FILE | _PAGE_PR_MASK | _PAGE_SZ_MASK) + _PAGE_PR_MASK | _PAGE_SZ_MASK) #else -#define _PAGE_CLEAR_FLAGS (_PAGE_PROTNONE | _PAGE_ACCESSED | _PAGE_FILE) +#define _PAGE_CLEAR_FLAGS (_PAGE_PROTNONE | _PAGE_ACCESSED) #endif #define _PAGE_FLAGS_HARDWARE_MASK (phys_addr_mask() & ~(_PAGE_CLEAR_FLAGS)) @@ -343,7 +339,6 @@ static inline void set_pte(pte_t *ptep, pte_t pte) #define pte_not_present(pte) (!((pte).pte_low & _PAGE_PRESENT)) #define pte_dirty(pte) ((pte).pte_low & _PAGE_DIRTY) #define pte_young(pte) ((pte).pte_low & _PAGE_ACCESSED) -#define pte_file(pte) ((pte).pte_low & _PAGE_FILE) #define pte_special(pte) ((pte).pte_low & _PAGE_SPECIAL) #ifdef CONFIG_X2TLB @@ -445,7 +440,6 @@ static inline pte_t pte_modify(pte_t pte, pgprot_t newprot) * Encode and de-code a swap entry * * Constraints: - * _PAGE_FILE at bit 0 * _PAGE_PRESENT at bit 8 * _PAGE_PROTNONE at bit 9 * @@ -453,9 +447,7 @@ static inline pte_t pte_modify(pte_t pte, pgprot_t newprot) * swap offset into bits 10:30. For the 64-bit PTE case, we keep the * preserved bits in the low 32-bits and use the upper 32 as the swap * offset (along with a 5-bit type), following the same approach as x86 - * PAE. This keeps the logic quite simple, and allows for a full 32 - * PTE_FILE_MAX_BITS, as opposed to the 29-bits we're constrained with - * in the pte_low case. + * PAE. This keeps the logic quite simple. * * As is evident by the Alpha code, if we ever get a 64-bit unsigned * long (swp_entry_t) to match up with the 64-bit PTEs, this all becomes @@ -471,13 +463,6 @@ static inline pte_t pte_modify(pte_t pte, pgprot_t newprot) #define __pte_to_swp_entry(pte) ((swp_entry_t){ (pte).pte_high }) #define __swp_entry_to_pte(x) ((pte_t){ 0, (x).val }) -/* - * Encode and decode a nonlinear file mapping entry - */ -#define pte_to_pgoff(pte) ((pte).pte_high) -#define pgoff_to_pte(off) ((pte_t) { _PAGE_FILE, (off) }) - -#define PTE_FILE_MAX_BITS 32 #else #define __swp_type(x) ((x).val & 0xff) #define __swp_offset(x) ((x).val >> 10) @@ -485,13 +470,6 @@ static inline pte_t pte_modify(pte_t pte, pgprot_t newprot) #define __pte_to_swp_entry(pte) ((swp_entry_t) { pte_val(pte) >> 1 }) #define __swp_entry_to_pte(x) ((pte_t) { (x).val << 1 }) - -/* - * Encode and decode a nonlinear file mapping entry - */ -#define PTE_FILE_MAX_BITS 29 -#define pte_to_pgoff(pte) (pte_val(pte) >> 1) -#define pgoff_to_pte(off) ((pte_t) { ((off) << 1) | _PAGE_FILE }) #endif #endif /* __ASSEMBLY__ */ diff --git a/arch/sh/include/asm/pgtable_64.h b/arch/sh/include/asm/pgtable_64.h index dda8c82601b9c3..07424968df6210 100644 --- a/arch/sh/include/asm/pgtable_64.h +++ b/arch/sh/include/asm/pgtable_64.h @@ -107,7 +107,6 @@ static __inline__ void set_pte(pte_t *pteptr, pte_t pteval) #define _PAGE_DEVICE 0x001 /* CB0: if uncacheable, 1->device (i.e. no write-combining or reordering at bus level) */ #define _PAGE_CACHABLE 0x002 /* CB1: uncachable/cachable */ #define _PAGE_PRESENT 0x004 /* software: page referenced */ -#define _PAGE_FILE 0x004 /* software: only when !present */ #define _PAGE_SIZE0 0x008 /* SZ0-bit : size of page */ #define _PAGE_SIZE1 0x010 /* SZ1-bit : size of page */ #define _PAGE_SHARED 0x020 /* software: reflects PTEH's SH */ @@ -129,7 +128,7 @@ static __inline__ void set_pte(pte_t *pteptr, pte_t pteval) #define _PAGE_WIRED _PAGE_EXT(0x001) /* software: wire the tlb entry */ #define _PAGE_SPECIAL _PAGE_EXT(0x002) -#define _PAGE_CLEAR_FLAGS (_PAGE_PRESENT | _PAGE_FILE | _PAGE_SHARED | \ +#define _PAGE_CLEAR_FLAGS (_PAGE_PRESENT | _PAGE_SHARED | \ _PAGE_DIRTY | _PAGE_ACCESSED | _PAGE_WIRED) /* Mask which drops software flags */ @@ -260,7 +259,6 @@ static __inline__ void set_pte(pte_t *pteptr, pte_t pteval) */ static inline int pte_dirty(pte_t pte) { return pte_val(pte) & _PAGE_DIRTY; } static inline int pte_young(pte_t pte) { return pte_val(pte) & _PAGE_ACCESSED; } -static inline int pte_file(pte_t pte) { return pte_val(pte) & _PAGE_FILE; } static inline int pte_write(pte_t pte) { return pte_val(pte) & _PAGE_WRITE; } static inline int pte_special(pte_t pte){ return pte_val(pte) & _PAGE_SPECIAL; } @@ -304,11 +302,6 @@ static inline pte_t pte_modify(pte_t pte, pgprot_t newprot) #define __pte_to_swp_entry(pte) ((swp_entry_t) { pte_val(pte) }) #define __swp_entry_to_pte(x) ((pte_t) { (x).val }) -/* Encode and decode a nonlinear file mapping entry */ -#define PTE_FILE_MAX_BITS 29 -#define pte_to_pgoff(pte) (pte_val(pte)) -#define pgoff_to_pte(off) ((pte_t) { (off) | _PAGE_FILE }) - #endif /* !__ASSEMBLY__ */ #define pfn_pte(pfn, prot) __pte(((pfn) << PAGE_SHIFT) | pgprot_val(prot)) From 6a8c4820895cf1dd2a128aef67ce079ba6eded80 Mon Sep 17 00:00:00 2001 From: "Kirill A. Shutemov" Date: Tue, 10 Feb 2015 14:11:12 -0800 Subject: [PATCH 65/78] sparc: drop pte_file()-related helpers We've replaced remap_file_pages(2) implementation with emulation. Nobody creates non-linear mapping anymore. This patch also increase number of bits availble for swap offset. Signed-off-by: Kirill A. Shutemov Acked-by: David S. Miller Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/sparc/include/asm/pgtable_32.h | 24 ----------------- arch/sparc/include/asm/pgtable_64.h | 40 ----------------------------- arch/sparc/include/asm/pgtsrmmu.h | 14 ++++------ 3 files changed, 5 insertions(+), 73 deletions(-) diff --git a/arch/sparc/include/asm/pgtable_32.h b/arch/sparc/include/asm/pgtable_32.h index b9b91ae19fe125..b2f7dc46a7d142 100644 --- a/arch/sparc/include/asm/pgtable_32.h +++ b/arch/sparc/include/asm/pgtable_32.h @@ -221,14 +221,6 @@ static inline int pte_young(pte_t pte) return pte_val(pte) & SRMMU_REF; } -/* - * The following only work if pte_present() is not true. - */ -static inline int pte_file(pte_t pte) -{ - return pte_val(pte) & SRMMU_FILE; -} - static inline int pte_special(pte_t pte) { return 0; @@ -375,22 +367,6 @@ static inline swp_entry_t __swp_entry(unsigned long type, unsigned long offset) #define __pte_to_swp_entry(pte) ((swp_entry_t) { pte_val(pte) }) #define __swp_entry_to_pte(x) ((pte_t) { (x).val }) -/* file-offset-in-pte helpers */ -static inline unsigned long pte_to_pgoff(pte_t pte) -{ - return pte_val(pte) >> SRMMU_PTE_FILE_SHIFT; -} - -static inline pte_t pgoff_to_pte(unsigned long pgoff) -{ - return __pte((pgoff << SRMMU_PTE_FILE_SHIFT) | SRMMU_FILE); -} - -/* - * This is made a constant because mm/fremap.c required a constant. - */ -#define PTE_FILE_MAX_BITS 24 - static inline unsigned long __get_phys (unsigned long addr) { diff --git a/arch/sparc/include/asm/pgtable_64.h b/arch/sparc/include/asm/pgtable_64.h index 1ff9e786416883..2ac7873ad6fd80 100644 --- a/arch/sparc/include/asm/pgtable_64.h +++ b/arch/sparc/include/asm/pgtable_64.h @@ -137,7 +137,6 @@ bool kern_addr_valid(unsigned long addr); #define _PAGE_SOFT_4U _AC(0x0000000000001F80,UL) /* Software bits: */ #define _PAGE_EXEC_4U _AC(0x0000000000001000,UL) /* Executable SW bit */ #define _PAGE_MODIFIED_4U _AC(0x0000000000000800,UL) /* Modified (dirty) */ -#define _PAGE_FILE_4U _AC(0x0000000000000800,UL) /* Pagecache page */ #define _PAGE_ACCESSED_4U _AC(0x0000000000000400,UL) /* Accessed (ref'd) */ #define _PAGE_READ_4U _AC(0x0000000000000200,UL) /* Readable SW Bit */ #define _PAGE_WRITE_4U _AC(0x0000000000000100,UL) /* Writable SW Bit */ @@ -167,7 +166,6 @@ bool kern_addr_valid(unsigned long addr); #define _PAGE_EXEC_4V _AC(0x0000000000000080,UL) /* Executable Page */ #define _PAGE_W_4V _AC(0x0000000000000040,UL) /* Writable */ #define _PAGE_SOFT_4V _AC(0x0000000000000030,UL) /* Software bits */ -#define _PAGE_FILE_4V _AC(0x0000000000000020,UL) /* Pagecache page */ #define _PAGE_PRESENT_4V _AC(0x0000000000000010,UL) /* Present */ #define _PAGE_RESV_4V _AC(0x0000000000000008,UL) /* Reserved */ #define _PAGE_SZ16GB_4V _AC(0x0000000000000007,UL) /* 16GB Page */ @@ -332,22 +330,6 @@ static inline pmd_t pmd_modify(pmd_t pmd, pgprot_t newprot) } #endif -static inline pte_t pgoff_to_pte(unsigned long off) -{ - off <<= PAGE_SHIFT; - - __asm__ __volatile__( - "\n661: or %0, %2, %0\n" - " .section .sun4v_1insn_patch, \"ax\"\n" - " .word 661b\n" - " or %0, %3, %0\n" - " .previous\n" - : "=r" (off) - : "0" (off), "i" (_PAGE_FILE_4U), "i" (_PAGE_FILE_4V)); - - return __pte(off); -} - static inline pgprot_t pgprot_noncached(pgprot_t prot) { unsigned long val = pgprot_val(prot); @@ -609,22 +591,6 @@ static inline unsigned long pte_exec(pte_t pte) return (pte_val(pte) & mask); } -static inline unsigned long pte_file(pte_t pte) -{ - unsigned long val = pte_val(pte); - - __asm__ __volatile__( - "\n661: and %0, %2, %0\n" - " .section .sun4v_1insn_patch, \"ax\"\n" - " .word 661b\n" - " and %0, %3, %0\n" - " .previous\n" - : "=r" (val) - : "0" (val), "i" (_PAGE_FILE_4U), "i" (_PAGE_FILE_4V)); - - return val; -} - static inline unsigned long pte_present(pte_t pte) { unsigned long val = pte_val(pte); @@ -971,12 +937,6 @@ pgtable_t pgtable_trans_huge_withdraw(struct mm_struct *mm, pmd_t *pmdp); #define __pte_to_swp_entry(pte) ((swp_entry_t) { pte_val(pte) }) #define __swp_entry_to_pte(x) ((pte_t) { (x).val }) -/* File offset in PTE support. */ -unsigned long pte_file(pte_t); -#define pte_to_pgoff(pte) (pte_val(pte) >> PAGE_SHIFT) -pte_t pgoff_to_pte(unsigned long); -#define PTE_FILE_MAX_BITS (64UL - PAGE_SHIFT - 1UL) - int page_in_phys_avail(unsigned long paddr); /* diff --git a/arch/sparc/include/asm/pgtsrmmu.h b/arch/sparc/include/asm/pgtsrmmu.h index 79da17866fa899..ae51a111a8c7ad 100644 --- a/arch/sparc/include/asm/pgtsrmmu.h +++ b/arch/sparc/include/asm/pgtsrmmu.h @@ -80,10 +80,6 @@ #define SRMMU_PRIV 0x1c #define SRMMU_PRIV_RDONLY 0x18 -#define SRMMU_FILE 0x40 /* Implemented in software */ - -#define SRMMU_PTE_FILE_SHIFT 8 /* == 32-PTE_FILE_MAX_BITS */ - #define SRMMU_CHG_MASK (0xffffff00 | SRMMU_REF | SRMMU_DIRTY) /* SRMMU swap entry encoding @@ -94,13 +90,13 @@ * oooooooooooooooooootttttRRRRRRRR * fedcba9876543210fedcba9876543210 * - * The bottom 8 bits are reserved for protection and status bits, especially - * FILE and PRESENT. + * The bottom 7 bits are reserved for protection and status bits, especially + * PRESENT. */ #define SRMMU_SWP_TYPE_MASK 0x1f -#define SRMMU_SWP_TYPE_SHIFT SRMMU_PTE_FILE_SHIFT -#define SRMMU_SWP_OFF_MASK 0x7ffff -#define SRMMU_SWP_OFF_SHIFT (SRMMU_PTE_FILE_SHIFT + 5) +#define SRMMU_SWP_TYPE_SHIFT 7 +#define SRMMU_SWP_OFF_MASK 0xfffff +#define SRMMU_SWP_OFF_SHIFT (SRMMU_SWP_TYPE_SHIFT + 5) /* Some day I will implement true fine grained access bits for * user pages because the SRMMU gives us the capabilities to From eb12f4872a3845a8803f689646dea5b92a30aff7 Mon Sep 17 00:00:00 2001 From: "Kirill A. Shutemov" Date: Tue, 10 Feb 2015 14:11:14 -0800 Subject: [PATCH 66/78] tile: drop pte_file()-related helpers We've replaced remap_file_pages(2) implementation with emulation. Nobody creates non-linear mapping anymore. Signed-off-by: Kirill A. Shutemov Acked-by: Chris Metcalf Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/tile/include/asm/pgtable.h | 11 ----------- arch/tile/mm/homecache.c | 4 ---- 2 files changed, 15 deletions(-) diff --git a/arch/tile/include/asm/pgtable.h b/arch/tile/include/asm/pgtable.h index 5d1950788c69c3..bc75b6ef2e7933 100644 --- a/arch/tile/include/asm/pgtable.h +++ b/arch/tile/include/asm/pgtable.h @@ -284,17 +284,6 @@ static inline pte_t pfn_pte(unsigned long pfn, pgprot_t prot) extern void start_mm_caching(struct mm_struct *mm); extern void check_mm_caching(struct mm_struct *prev, struct mm_struct *next); -/* - * Support non-linear file mappings (see sys_remap_file_pages). - * This is defined by CLIENT1 set but CLIENT0 and _PAGE_PRESENT clear, and the - * file offset in the 32 high bits. - */ -#define _PAGE_FILE HV_PTE_CLIENT1 -#define PTE_FILE_MAX_BITS 32 -#define pte_file(pte) (hv_pte_get_client1(pte) && !hv_pte_get_client0(pte)) -#define pte_to_pgoff(pte) ((pte).val >> 32) -#define pgoff_to_pte(off) ((pte_t) { (((long long)(off)) << 32) | _PAGE_FILE }) - /* * Encode and de-code a swap entry (see ). * We put the swap file type+offset in the 32 high bits; diff --git a/arch/tile/mm/homecache.c b/arch/tile/mm/homecache.c index cd3387370ebbb5..0029b3fb651b34 100644 --- a/arch/tile/mm/homecache.c +++ b/arch/tile/mm/homecache.c @@ -263,10 +263,6 @@ static int pte_to_home(pte_t pte) /* Update the home of a PTE if necessary (can also be used for a pgprot_t). */ pte_t pte_set_home(pte_t pte, int home) { - /* Check for non-linear file mapping "PTEs" and pass them through. */ - if (pte_file(pte)) - return pte; - #if CHIP_HAS_MMIO() /* Check for MMIO mappings and pass them through. */ if (hv_pte_get_mode(pte) == HV_PTE_MODE_MMIO) From 3513006a5691ae3629eef9ddef0b71a47c40dfbc Mon Sep 17 00:00:00 2001 From: "Kirill A. Shutemov" Date: Tue, 10 Feb 2015 14:11:17 -0800 Subject: [PATCH 67/78] um: drop _PAGE_FILE and pte_file()-related helpers We've replaced remap_file_pages(2) implementation with emulation. Nobody creates non-linear mapping anymore. Signed-off-by: Kirill A. Shutemov Cc: Jeff Dike Cc: Richard Weinberger Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/um/include/asm/pgtable-2level.h | 9 --------- arch/um/include/asm/pgtable-3level.h | 20 -------------------- arch/um/include/asm/pgtable.h | 9 --------- 3 files changed, 38 deletions(-) diff --git a/arch/um/include/asm/pgtable-2level.h b/arch/um/include/asm/pgtable-2level.h index f534b73e753e74..7afe86035fa768 100644 --- a/arch/um/include/asm/pgtable-2level.h +++ b/arch/um/include/asm/pgtable-2level.h @@ -41,13 +41,4 @@ static inline void pgd_mkuptodate(pgd_t pgd) { } #define pfn_pte(pfn, prot) __pte(pfn_to_phys(pfn) | pgprot_val(prot)) #define pfn_pmd(pfn, prot) __pmd(pfn_to_phys(pfn) | pgprot_val(prot)) -/* - * Bits 0 through 4 are taken - */ -#define PTE_FILE_MAX_BITS 27 - -#define pte_to_pgoff(pte) (pte_val(pte) >> 5) - -#define pgoff_to_pte(off) ((pte_t) { ((off) << 5) + _PAGE_FILE }) - #endif diff --git a/arch/um/include/asm/pgtable-3level.h b/arch/um/include/asm/pgtable-3level.h index 0032f9212e74a2..344c559c0a1787 100644 --- a/arch/um/include/asm/pgtable-3level.h +++ b/arch/um/include/asm/pgtable-3level.h @@ -112,25 +112,5 @@ static inline pmd_t pfn_pmd(pfn_t page_nr, pgprot_t pgprot) return __pmd((page_nr << PAGE_SHIFT) | pgprot_val(pgprot)); } -/* - * Bits 0 through 3 are taken in the low part of the pte, - * put the 32 bits of offset into the high part. - */ -#define PTE_FILE_MAX_BITS 32 - -#ifdef CONFIG_64BIT - -#define pte_to_pgoff(p) ((p).pte >> 32) - -#define pgoff_to_pte(off) ((pte_t) { ((off) << 32) | _PAGE_FILE }) - -#else - -#define pte_to_pgoff(pte) ((pte).pte_high) - -#define pgoff_to_pte(off) ((pte_t) { _PAGE_FILE, (off) }) - -#endif - #endif diff --git a/arch/um/include/asm/pgtable.h b/arch/um/include/asm/pgtable.h index bf974f712af7e9..2324b624f19577 100644 --- a/arch/um/include/asm/pgtable.h +++ b/arch/um/include/asm/pgtable.h @@ -18,7 +18,6 @@ #define _PAGE_ACCESSED 0x080 #define _PAGE_DIRTY 0x100 /* If _PAGE_PRESENT is clear, we use these: */ -#define _PAGE_FILE 0x008 /* nonlinear file mapping, saved PTE; unset:swap */ #define _PAGE_PROTNONE 0x010 /* if the user mapped it with PROT_NONE; pte_present gives true */ @@ -151,14 +150,6 @@ static inline int pte_write(pte_t pte) !(pte_get_bits(pte, _PAGE_PROTNONE))); } -/* - * The following only works if pte_present() is not true. - */ -static inline int pte_file(pte_t pte) -{ - return pte_get_bits(pte, _PAGE_FILE); -} - static inline int pte_dirty(pte_t pte) { return pte_get_bits(pte, _PAGE_DIRTY); From 40171798fe11a6dc1d963058b097b2c4c9d34a9c Mon Sep 17 00:00:00 2001 From: "Kirill A. Shutemov" Date: Tue, 10 Feb 2015 14:11:20 -0800 Subject: [PATCH 68/78] unicore32: drop pte_file()-related helpers We've replaced remap_file_pages(2) implementation with emulation. Nobody creates non-linear mapping anymore. Signed-off-by: Kirill A. Shutemov Cc: Guan Xuetao Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/unicore32/include/asm/pgtable-hwdef.h | 1 - arch/unicore32/include/asm/pgtable.h | 14 -------------- 2 files changed, 15 deletions(-) diff --git a/arch/unicore32/include/asm/pgtable-hwdef.h b/arch/unicore32/include/asm/pgtable-hwdef.h index 7314e859cca0b6..e37fa471c2be88 100644 --- a/arch/unicore32/include/asm/pgtable-hwdef.h +++ b/arch/unicore32/include/asm/pgtable-hwdef.h @@ -44,7 +44,6 @@ #define PTE_TYPE_INVALID (3 << 0) #define PTE_PRESENT (1 << 2) -#define PTE_FILE (1 << 3) /* only when !PRESENT */ #define PTE_YOUNG (1 << 3) #define PTE_DIRTY (1 << 4) #define PTE_CACHEABLE (1 << 5) diff --git a/arch/unicore32/include/asm/pgtable.h b/arch/unicore32/include/asm/pgtable.h index ed6f7d000fba7b..818d0f5598e324 100644 --- a/arch/unicore32/include/asm/pgtable.h +++ b/arch/unicore32/include/asm/pgtable.h @@ -283,20 +283,6 @@ extern pgd_t swapper_pg_dir[PTRS_PER_PGD]; #define MAX_SWAPFILES_CHECK() \ BUILD_BUG_ON(MAX_SWAPFILES_SHIFT > __SWP_TYPE_BITS) -/* - * Encode and decode a file entry. File entries are stored in the Linux - * page tables as follows: - * - * 3 3 2 2 2 2 2 2 2 2 2 2 1 1 1 1 1 1 1 1 1 1 - * 1 0 9 8 7 6 5 4 3 2 1 0 9 8 7 6 5 4 3 2 1 0 9 8 7 6 5 4 3 2 1 0 - * <----------------------- offset ----------------------> 1 0 0 0 - */ -#define pte_file(pte) (pte_val(pte) & PTE_FILE) -#define pte_to_pgoff(x) (pte_val(x) >> 4) -#define pgoff_to_pte(x) __pte(((x) << 4) | PTE_FILE) - -#define PTE_FILE_MAX_BITS 28 - /* Needs to be defined here and not in linux/mm.h, as it is arch dependent */ /* FIXME: this is not correct */ #define kern_addr_valid(addr) (1) From 0a191362058391878cc2a4d4ccddcd8223eb4f79 Mon Sep 17 00:00:00 2001 From: "Kirill A. Shutemov" Date: Tue, 10 Feb 2015 14:11:22 -0800 Subject: [PATCH 69/78] x86: drop _PAGE_FILE and pte_file()-related helpers We've replaced remap_file_pages(2) implementation with emulation. Nobody creates non-linear mapping anymore. Signed-off-by: Kirill A. Shutemov Cc: Thomas Gleixner Cc: Ingo Molnar Cc: "H. Peter Anvin" Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/x86/include/asm/pgtable-2level.h | 38 +-------------------------- arch/x86/include/asm/pgtable-3level.h | 12 --------- arch/x86/include/asm/pgtable.h | 20 -------------- arch/x86/include/asm/pgtable_64.h | 6 +---- arch/x86/include/asm/pgtable_types.h | 3 --- 5 files changed, 2 insertions(+), 77 deletions(-) diff --git a/arch/x86/include/asm/pgtable-2level.h b/arch/x86/include/asm/pgtable-2level.h index 206a87fdd22dac..fd74a11959de0d 100644 --- a/arch/x86/include/asm/pgtable-2level.h +++ b/arch/x86/include/asm/pgtable-2level.h @@ -62,44 +62,8 @@ static inline unsigned long pte_bitop(unsigned long value, unsigned int rightshi return ((value >> rightshift) & mask) << leftshift; } -/* - * Bits _PAGE_BIT_PRESENT, _PAGE_BIT_FILE and _PAGE_BIT_PROTNONE are taken, - * split up the 29 bits of offset into this range. - */ -#define PTE_FILE_MAX_BITS 29 -#define PTE_FILE_SHIFT1 (_PAGE_BIT_PRESENT + 1) -#define PTE_FILE_SHIFT2 (_PAGE_BIT_FILE + 1) -#define PTE_FILE_SHIFT3 (_PAGE_BIT_PROTNONE + 1) -#define PTE_FILE_BITS1 (PTE_FILE_SHIFT2 - PTE_FILE_SHIFT1 - 1) -#define PTE_FILE_BITS2 (PTE_FILE_SHIFT3 - PTE_FILE_SHIFT2 - 1) - -#define PTE_FILE_MASK1 ((1U << PTE_FILE_BITS1) - 1) -#define PTE_FILE_MASK2 ((1U << PTE_FILE_BITS2) - 1) - -#define PTE_FILE_LSHIFT2 (PTE_FILE_BITS1) -#define PTE_FILE_LSHIFT3 (PTE_FILE_BITS1 + PTE_FILE_BITS2) - -static __always_inline pgoff_t pte_to_pgoff(pte_t pte) -{ - return (pgoff_t) - (pte_bitop(pte.pte_low, PTE_FILE_SHIFT1, PTE_FILE_MASK1, 0) + - pte_bitop(pte.pte_low, PTE_FILE_SHIFT2, PTE_FILE_MASK2, PTE_FILE_LSHIFT2) + - pte_bitop(pte.pte_low, PTE_FILE_SHIFT3, -1UL, PTE_FILE_LSHIFT3)); -} - -static __always_inline pte_t pgoff_to_pte(pgoff_t off) -{ - return (pte_t){ - .pte_low = - pte_bitop(off, 0, PTE_FILE_MASK1, PTE_FILE_SHIFT1) + - pte_bitop(off, PTE_FILE_LSHIFT2, PTE_FILE_MASK2, PTE_FILE_SHIFT2) + - pte_bitop(off, PTE_FILE_LSHIFT3, -1UL, PTE_FILE_SHIFT3) + - _PAGE_FILE, - }; -} - /* Encode and de-code a swap entry */ -#define SWP_TYPE_BITS (_PAGE_BIT_FILE - _PAGE_BIT_PRESENT - 1) +#define SWP_TYPE_BITS 5 #define SWP_OFFSET_SHIFT (_PAGE_BIT_PROTNONE + 1) #define MAX_SWAPFILES_CHECK() BUILD_BUG_ON(MAX_SWAPFILES_SHIFT > SWP_TYPE_BITS) diff --git a/arch/x86/include/asm/pgtable-3level.h b/arch/x86/include/asm/pgtable-3level.h index 81bb91b49a88fa..cdaa58c9b39ed3 100644 --- a/arch/x86/include/asm/pgtable-3level.h +++ b/arch/x86/include/asm/pgtable-3level.h @@ -176,18 +176,6 @@ static inline pmd_t native_pmdp_get_and_clear(pmd_t *pmdp) #define native_pmdp_get_and_clear(xp) native_local_pmdp_get_and_clear(xp) #endif -/* - * Bits 0, 6 and 7 are taken in the low part of the pte, - * put the 32 bits of offset into the high part. - * - * For soft-dirty tracking 11 bit is taken from - * the low part of pte as well. - */ -#define pte_to_pgoff(pte) ((pte).pte_high) -#define pgoff_to_pte(off) \ - ((pte_t) { { .pte_low = _PAGE_FILE, .pte_high = (off) } }) -#define PTE_FILE_MAX_BITS 32 - /* Encode and de-code a swap entry */ #define MAX_SWAPFILES_CHECK() BUILD_BUG_ON(MAX_SWAPFILES_SHIFT > 5) #define __swp_type(x) (((x).val) & 0x1f) diff --git a/arch/x86/include/asm/pgtable.h b/arch/x86/include/asm/pgtable.h index e8a5454acc9922..0fe03f834fb123 100644 --- a/arch/x86/include/asm/pgtable.h +++ b/arch/x86/include/asm/pgtable.h @@ -115,11 +115,6 @@ static inline int pte_write(pte_t pte) return pte_flags(pte) & _PAGE_RW; } -static inline int pte_file(pte_t pte) -{ - return pte_flags(pte) & _PAGE_FILE; -} - static inline int pte_huge(pte_t pte) { return pte_flags(pte) & _PAGE_PSE; @@ -329,21 +324,6 @@ static inline pmd_t pmd_mksoft_dirty(pmd_t pmd) return pmd_set_flags(pmd, _PAGE_SOFT_DIRTY); } -static inline pte_t pte_file_clear_soft_dirty(pte_t pte) -{ - return pte_clear_flags(pte, _PAGE_SOFT_DIRTY); -} - -static inline pte_t pte_file_mksoft_dirty(pte_t pte) -{ - return pte_set_flags(pte, _PAGE_SOFT_DIRTY); -} - -static inline int pte_file_soft_dirty(pte_t pte) -{ - return pte_flags(pte) & _PAGE_SOFT_DIRTY; -} - #endif /* CONFIG_HAVE_ARCH_SOFT_DIRTY */ /* diff --git a/arch/x86/include/asm/pgtable_64.h b/arch/x86/include/asm/pgtable_64.h index 4572b2f302379e..e227970f983ee0 100644 --- a/arch/x86/include/asm/pgtable_64.h +++ b/arch/x86/include/asm/pgtable_64.h @@ -133,10 +133,6 @@ static inline int pgd_large(pgd_t pgd) { return 0; } /* PUD - Level3 access */ /* PMD - Level 2 access */ -#define pte_to_pgoff(pte) ((pte_val((pte)) & PHYSICAL_PAGE_MASK) >> PAGE_SHIFT) -#define pgoff_to_pte(off) ((pte_t) { .pte = ((off) << PAGE_SHIFT) | \ - _PAGE_FILE }) -#define PTE_FILE_MAX_BITS __PHYSICAL_MASK_SHIFT /* PTE - Level 1 access. */ @@ -145,7 +141,7 @@ static inline int pgd_large(pgd_t pgd) { return 0; } #define pte_unmap(pte) ((void)(pte))/* NOP */ /* Encode and de-code a swap entry */ -#define SWP_TYPE_BITS (_PAGE_BIT_FILE - _PAGE_BIT_PRESENT - 1) +#define SWP_TYPE_BITS 5 #ifdef CONFIG_NUMA_BALANCING /* Automatic NUMA balancing needs to be distinguishable from swap entries */ #define SWP_OFFSET_SHIFT (_PAGE_BIT_PROTNONE + 2) diff --git a/arch/x86/include/asm/pgtable_types.h b/arch/x86/include/asm/pgtable_types.h index 25bcd4a89517af..5185a4f599ec03 100644 --- a/arch/x86/include/asm/pgtable_types.h +++ b/arch/x86/include/asm/pgtable_types.h @@ -38,8 +38,6 @@ /* If _PAGE_BIT_PRESENT is clear, we use these: */ /* - if the user mapped it with PROT_NONE; pte_present gives true */ #define _PAGE_BIT_PROTNONE _PAGE_BIT_GLOBAL -/* - set: nonlinear file mapping, saved PTE; unset:swap */ -#define _PAGE_BIT_FILE _PAGE_BIT_DIRTY #define _PAGE_PRESENT (_AT(pteval_t, 1) << _PAGE_BIT_PRESENT) #define _PAGE_RW (_AT(pteval_t, 1) << _PAGE_BIT_RW) @@ -114,7 +112,6 @@ #define _PAGE_NX (_AT(pteval_t, 0)) #endif -#define _PAGE_FILE (_AT(pteval_t, 1) << _PAGE_BIT_FILE) #define _PAGE_PROTNONE (_AT(pteval_t, 1) << _PAGE_BIT_PROTNONE) #define _PAGE_TABLE (_PAGE_PRESENT | _PAGE_RW | _PAGE_USER | \ From d9ecee281b8f89da6d3203be62802eda991e37cc Mon Sep 17 00:00:00 2001 From: "Kirill A. Shutemov" Date: Tue, 10 Feb 2015 14:11:25 -0800 Subject: [PATCH 70/78] xtensa: drop _PAGE_FILE and pte_file()-related helpers We've replaced remap_file_pages(2) implementation with emulation. Nobody creates non-linear mapping anymore. Signed-off-by: Kirill A. Shutemov Acked-by: Max Filippov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/xtensa/include/asm/pgtable.h | 10 ---------- 1 file changed, 10 deletions(-) diff --git a/arch/xtensa/include/asm/pgtable.h b/arch/xtensa/include/asm/pgtable.h index 872bf0194e6d25..01b80dce9d65d2 100644 --- a/arch/xtensa/include/asm/pgtable.h +++ b/arch/xtensa/include/asm/pgtable.h @@ -89,8 +89,6 @@ * (PAGE_NONE)| PPN | 0 | 00 | ADW | 01 | 11 | 11 | * +-----------------------------------------+ * swap | index | type | 01 | 11 | 00 | - * +- - - - - - - - - - - - - - - - - - - - -+ - * file | file offset | 01 | 11 | 10 | * +-----------------------------------------+ * * For T1050 hardware and earlier the layout differs for present and (PAGE_NONE) @@ -111,7 +109,6 @@ * index swap offset / PAGE_SIZE (bit 11-31: 21 bits -> 8 GB) * (note that the index is always non-zero) * type swap type (5 bits -> 32 types) - * file offset 26-bit offset into the file, in increments of PAGE_SIZE * * Notes: * - (PROT_NONE) is a special case of 'present' but causes an exception for @@ -144,7 +141,6 @@ #define _PAGE_HW_VALID 0x00 #define _PAGE_NONE 0x0f #endif -#define _PAGE_FILE (1<<1) /* file mapped page, only if !present */ #define _PAGE_USER (1<<4) /* user access (ring=1) */ @@ -260,7 +256,6 @@ static inline void pgtable_cache_init(void) { } static inline int pte_write(pte_t pte) { return pte_val(pte) & _PAGE_WRITABLE; } static inline int pte_dirty(pte_t pte) { return pte_val(pte) & _PAGE_DIRTY; } static inline int pte_young(pte_t pte) { return pte_val(pte) & _PAGE_ACCESSED; } -static inline int pte_file(pte_t pte) { return pte_val(pte) & _PAGE_FILE; } static inline int pte_special(pte_t pte) { return 0; } static inline pte_t pte_wrprotect(pte_t pte) @@ -390,11 +385,6 @@ ptep_set_wrprotect(struct mm_struct *mm, unsigned long addr, pte_t *ptep) #define __pte_to_swp_entry(pte) ((swp_entry_t) { pte_val(pte) }) #define __swp_entry_to_pte(x) ((pte_t) { (x).val }) -#define PTE_FILE_MAX_BITS 26 -#define pte_to_pgoff(pte) (pte_val(pte) >> 6) -#define pgoff_to_pte(off) \ - ((pte_t) { ((off) << 6) | _PAGE_CA_INVALID | _PAGE_FILE | _PAGE_USER }) - #endif /* !defined (__ASSEMBLY__) */ From 74ec67511d36f9c731065b1dae7d9638a3b639d3 Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Tue, 10 Feb 2015 14:11:28 -0800 Subject: [PATCH 71/78] mm: memory: remove ->vm_file check on shared writable vmas Shared anonymous mmaps are implemented with shmem files, so all VMAs with shared writable semantics also have an underlying backing file. Signed-off-by: Johannes Weiner Reviewed-by: Jan Kara Acked-by: Kirill A. Shutemov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memory.c | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/mm/memory.c b/mm/memory.c index 9aa09217fe2091..0e9b3261065587 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -2127,9 +2127,7 @@ static int do_wp_page(struct mm_struct *mm, struct vm_area_struct *vma, balance_dirty_pages_ratelimited(mapping); } - /* file_update_time outside page_lock */ - if (vma->vm_file) - file_update_time(vma->vm_file); + file_update_time(vma->vm_file); } put_page(dirty_page); if (page_mkwrite) { @@ -2971,8 +2969,7 @@ static int do_shared_fault(struct mm_struct *mm, struct vm_area_struct *vma, balance_dirty_pages_ratelimited(mapping); } - /* file_update_time outside page_lock */ - if (vma->vm_file && !vma->vm_ops->page_mkwrite) + if (!vma->vm_ops->page_mkwrite) file_update_time(vma->vm_file); return ret; From f38b4b310d402055702c63b0989dbcd16adf9537 Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Tue, 10 Feb 2015 14:11:30 -0800 Subject: [PATCH 72/78] mm: memory: merge shared-writable dirtying branches in do_wp_page() Whether there is a vm_ops->page_mkwrite or not, the page dirtying is pretty much the same. Make sure the page references are the same in both cases, then merge the two branches. It's tempting to go even further and page-lock the !page_mkwrite case, to get it in line with everybody else setting the page table and thus further simplify the model. But that's not quite compelling enough to justify dropping the pte lock, then relocking and verifying the entry for filesystems without ->page_mkwrite, which notably includes tmpfs. Leave it for now and lock the page late in the !page_mkwrite case. Signed-off-by: Johannes Weiner Acked-by: Kirill A. Shutemov Reviewed-by: Jan Kara Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memory.c | 48 +++++++++++++++++------------------------------- 1 file changed, 17 insertions(+), 31 deletions(-) diff --git a/mm/memory.c b/mm/memory.c index 0e9b3261065587..988d3099a25d0c 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -2005,7 +2005,7 @@ static int do_wp_page(struct mm_struct *mm, struct vm_area_struct *vma, pte_t entry; int ret = 0; int page_mkwrite = 0; - struct page *dirty_page = NULL; + bool dirty_shared = false; unsigned long mmun_start = 0; /* For mmu_notifiers */ unsigned long mmun_end = 0; /* For mmu_notifiers */ struct mem_cgroup *memcg; @@ -2056,6 +2056,7 @@ static int do_wp_page(struct mm_struct *mm, struct vm_area_struct *vma, unlock_page(old_page); } else if (unlikely((vma->vm_flags & (VM_WRITE|VM_SHARED)) == (VM_WRITE|VM_SHARED))) { + page_cache_get(old_page); /* * Only catch write-faults on shared writable pages, * read-only shared pages can get COWed by @@ -2063,7 +2064,7 @@ static int do_wp_page(struct mm_struct *mm, struct vm_area_struct *vma, */ if (vma->vm_ops && vma->vm_ops->page_mkwrite) { int tmp; - page_cache_get(old_page); + pte_unmap_unlock(page_table, ptl); tmp = do_page_mkwrite(vma, old_page, address); if (unlikely(!tmp || (tmp & @@ -2083,11 +2084,10 @@ static int do_wp_page(struct mm_struct *mm, struct vm_area_struct *vma, unlock_page(old_page); goto unlock; } - page_mkwrite = 1; } - dirty_page = old_page; - get_page(dirty_page); + + dirty_shared = true; reuse: /* @@ -2106,43 +2106,29 @@ static int do_wp_page(struct mm_struct *mm, struct vm_area_struct *vma, pte_unmap_unlock(page_table, ptl); ret |= VM_FAULT_WRITE; - if (!dirty_page) - return ret; - - if (!page_mkwrite) { + if (dirty_shared) { struct address_space *mapping; int dirtied; - lock_page(dirty_page); - dirtied = set_page_dirty(dirty_page); - VM_BUG_ON_PAGE(PageAnon(dirty_page), dirty_page); - mapping = dirty_page->mapping; - unlock_page(dirty_page); + if (!page_mkwrite) + lock_page(old_page); - if (dirtied && mapping) { - /* - * Some device drivers do not set page.mapping - * but still dirty their pages - */ - balance_dirty_pages_ratelimited(mapping); - } + dirtied = set_page_dirty(old_page); + VM_BUG_ON_PAGE(PageAnon(old_page), old_page); + mapping = old_page->mapping; + unlock_page(old_page); + page_cache_release(old_page); - file_update_time(vma->vm_file); - } - put_page(dirty_page); - if (page_mkwrite) { - struct address_space *mapping = dirty_page->mapping; - - set_page_dirty(dirty_page); - unlock_page(dirty_page); - page_cache_release(dirty_page); - if (mapping) { + if ((dirtied || page_mkwrite) && mapping) { /* * Some device drivers do not set page.mapping * but still dirty their pages */ balance_dirty_pages_ratelimited(mapping); } + + if (!page_mkwrite) + file_update_time(vma->vm_file); } return ret; From 3cd7645de624939c38f5124b4ac15f8b35a1a8b7 Mon Sep 17 00:00:00 2001 From: Andrey Ryabinin Date: Tue, 10 Feb 2015 14:11:33 -0800 Subject: [PATCH 73/78] mm, hugetlb: remove unnecessary lower bound on sysctl handlers"? Commit ed4d4902ebdd ("mm, hugetlb: remove hugetlb_zero and hugetlb_infinity") replaced 'unsigned long hugetlb_zero' with 'int zero' leading to out-of-bounds access in proc_doulongvec_minmax(). Use '.extra1 = NULL' instead of '.extra1 = &zero'. Passing NULL is equivalent to passing minimal value, which is 0 for unsigned types. Fixes: ed4d4902ebdd ("mm, hugetlb: remove hugetlb_zero and hugetlb_infinity") Signed-off-by: Andrey Ryabinin Reported-by: Dmitry Vyukov Suggested-by: Manfred Spraul Acked-by: David Rientjes Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/sysctl.c | 3 --- 1 file changed, 3 deletions(-) diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 137c7f69b2642f..88ea2d6e003140 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -1248,7 +1248,6 @@ static struct ctl_table vm_table[] = { .maxlen = sizeof(unsigned long), .mode = 0644, .proc_handler = hugetlb_sysctl_handler, - .extra1 = &zero, }, #ifdef CONFIG_NUMA { @@ -1257,7 +1256,6 @@ static struct ctl_table vm_table[] = { .maxlen = sizeof(unsigned long), .mode = 0644, .proc_handler = &hugetlb_mempolicy_sysctl_handler, - .extra1 = &zero, }, #endif { @@ -1280,7 +1278,6 @@ static struct ctl_table vm_table[] = { .maxlen = sizeof(unsigned long), .mode = 0644, .proc_handler = hugetlb_overcommit_handler, - .extra1 = &zero, }, #endif { From 753162cd849c45580fb5aaa7f3597c81e74e391c Mon Sep 17 00:00:00 2001 From: Andrey Ryabinin Date: Tue, 10 Feb 2015 14:11:36 -0800 Subject: [PATCH 74/78] mm: hugetlb: fix type of hugetlb_treat_as_movable variable hugetlb_treat_as_movable declared as unsigned long, but proc_dointvec() used for parsing it: static struct ctl_table vm_table[] = { ... { .procname = "hugepages_treat_as_movable", .data = &hugepages_treat_as_movable, .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec, }, This seems harmless, but it's better to use int type here. Signed-off-by: Andrey Ryabinin Cc: Dmitry Vyukov Cc: Manfred Spraul Acked-by: David Rientjes Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/hugetlb.h | 2 +- mm/hugetlb.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h index 431b7fc605c9bc..7d785635992079 100644 --- a/include/linux/hugetlb.h +++ b/include/linux/hugetlb.h @@ -86,7 +86,7 @@ void free_huge_page(struct page *page); pte_t *huge_pmd_share(struct mm_struct *mm, unsigned long addr, pud_t *pud); #endif -extern unsigned long hugepages_treat_as_movable; +extern int hugepages_treat_as_movable; extern int sysctl_hugetlb_shm_group; extern struct list_head huge_boot_pages; diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 85032de5e20f88..be0e5d0db5ec86 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -35,7 +35,7 @@ #include #include "internal.h" -unsigned long hugepages_treat_as_movable; +int hugepages_treat_as_movable; int hugetlb_max_hstate __read_mostly; unsigned int default_hstate_idx; From 4c5018ce06c6be292d4ed96cecf2c8dda361a923 Mon Sep 17 00:00:00 2001 From: Weijie Yang Date: Tue, 10 Feb 2015 14:11:39 -0800 Subject: [PATCH 75/78] mm/page_alloc.c: place zone_id check before VM_BUG_ON_PAGE check If the freeing page and its buddy page are not at the same zone, the current holding zone->lock for the freeing page cann't prevent buddy page getting allocated, this could trigger VM_BUG_ON_PAGE in page_is_buddy() at a very tiny chance, such as: cpu 0: cpu 1: hold zone_1 lock check page and it buddy PageBuddy(buddy) is true hold zone_2 lock page_order(buddy) == order is true alloc buddy trigger VM_BUG_ON_PAGE(page_count(buddy) != 0) zone_1->lock prevents the freeing page getting allocated zone_2->lock prevents the buddy page getting allocated they are not the same zone->lock. If we can't remove the zone_id check statement, it's better handle this rare race. This patch fixes this by placing the zone_id check before the VM_BUG_ON_PAGE check. Signed-off-by: Weijie Yang Acked-by: Mel Gorman Cc: Johannes Weiner Cc: Rik van Riel Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_alloc.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 8e20f9c2fa5ab7..f121050e8530b8 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -552,17 +552,15 @@ static inline int page_is_buddy(struct page *page, struct page *buddy, return 0; if (page_is_guard(buddy) && page_order(buddy) == order) { - VM_BUG_ON_PAGE(page_count(buddy) != 0, buddy); - if (page_zone_id(page) != page_zone_id(buddy)) return 0; + VM_BUG_ON_PAGE(page_count(buddy) != 0, buddy); + return 1; } if (PageBuddy(buddy) && page_order(buddy) == order) { - VM_BUG_ON_PAGE(page_count(buddy) != 0, buddy); - /* * zone check is done late to avoid uselessly * calculating zone/node ids for pages that could @@ -571,6 +569,8 @@ static inline int page_is_buddy(struct page *page, struct page *buddy, if (page_zone_id(page) != page_zone_id(buddy)) return 0; + VM_BUG_ON_PAGE(page_count(buddy) != 0, buddy); + return 1; } return 0; From dbf22eb6d8675fc173154d9f1bd1bd0fda53a001 Mon Sep 17 00:00:00 2001 From: Vladimir Davydov Date: Tue, 10 Feb 2015 14:11:41 -0800 Subject: [PATCH 76/78] memcg: zap __memcg_{charge,uncharge}_slab They are simple wrappers around memcg_{charge,uncharge}_kmem, so let's zap them and call these functions directly. Signed-off-by: Vladimir Davydov Cc: Johannes Weiner Cc: Michal Hocko Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/memcontrol.h | 5 +++-- mm/memcontrol.c | 21 +++------------------ mm/slab.h | 4 ++-- 3 files changed, 8 insertions(+), 22 deletions(-) diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index 7c95af8d552cff..18ccb2988979cf 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -403,8 +403,9 @@ void memcg_update_array_size(int num_groups); struct kmem_cache *__memcg_kmem_get_cache(struct kmem_cache *cachep); void __memcg_kmem_put_cache(struct kmem_cache *cachep); -int __memcg_charge_slab(struct kmem_cache *cachep, gfp_t gfp, int order); -void __memcg_uncharge_slab(struct kmem_cache *cachep, int order); +int memcg_charge_kmem(struct mem_cgroup *memcg, gfp_t gfp, + unsigned long nr_pages); +void memcg_uncharge_kmem(struct mem_cgroup *memcg, unsigned long nr_pages); int __memcg_cleanup_cache_params(struct kmem_cache *s); diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 8b58701b9647b9..e229e3ad615cc2 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -2495,8 +2495,8 @@ static struct kmem_cache *memcg_params_to_cache(struct memcg_cache_params *p) return cache_from_memcg_idx(cachep, memcg_cache_id(p->memcg)); } -static int memcg_charge_kmem(struct mem_cgroup *memcg, gfp_t gfp, - unsigned long nr_pages) +int memcg_charge_kmem(struct mem_cgroup *memcg, gfp_t gfp, + unsigned long nr_pages) { struct page_counter *counter; int ret = 0; @@ -2533,8 +2533,7 @@ static int memcg_charge_kmem(struct mem_cgroup *memcg, gfp_t gfp, return ret; } -static void memcg_uncharge_kmem(struct mem_cgroup *memcg, - unsigned long nr_pages) +void memcg_uncharge_kmem(struct mem_cgroup *memcg, unsigned long nr_pages) { page_counter_uncharge(&memcg->memory, nr_pages); if (do_swap_account) @@ -2767,20 +2766,6 @@ static void memcg_schedule_register_cache(struct mem_cgroup *memcg, current->memcg_kmem_skip_account = 0; } -int __memcg_charge_slab(struct kmem_cache *cachep, gfp_t gfp, int order) -{ - unsigned int nr_pages = 1 << order; - - return memcg_charge_kmem(cachep->memcg_params->memcg, gfp, nr_pages); -} - -void __memcg_uncharge_slab(struct kmem_cache *cachep, int order) -{ - unsigned int nr_pages = 1 << order; - - memcg_uncharge_kmem(cachep->memcg_params->memcg, nr_pages); -} - /* * Return the kmem_cache we're supposed to use for a slab allocation. * We try to use the current memcg's version of the cache. diff --git a/mm/slab.h b/mm/slab.h index 1cf4005482dd1d..90430d6f665e62 100644 --- a/mm/slab.h +++ b/mm/slab.h @@ -235,7 +235,7 @@ static __always_inline int memcg_charge_slab(struct kmem_cache *s, return 0; if (is_root_cache(s)) return 0; - return __memcg_charge_slab(s, gfp, order); + return memcg_charge_kmem(s->memcg_params->memcg, gfp, 1 << order); } static __always_inline void memcg_uncharge_slab(struct kmem_cache *s, int order) @@ -244,7 +244,7 @@ static __always_inline void memcg_uncharge_slab(struct kmem_cache *s, int order) return; if (is_root_cache(s)) return; - __memcg_uncharge_slab(s, order); + memcg_uncharge_kmem(s->memcg_params->memcg, 1 << order); } #else static inline bool is_root_cache(struct kmem_cache *s) From 3e0350a36414a73c5c2d1e354f8c0ab4ace1296d Mon Sep 17 00:00:00 2001 From: Vladimir Davydov Date: Tue, 10 Feb 2015 14:11:44 -0800 Subject: [PATCH 77/78] memcg: zap memcg_name argument of memcg_create_kmem_cache Instead of passing the name of the memory cgroup which the cache is created for in the memcg_name_argument, let's obtain it immediately in memcg_create_kmem_cache. Signed-off-by: Vladimir Davydov Cc: Johannes Weiner Cc: Michal Hocko Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/slab.h | 3 +-- mm/memcontrol.c | 5 +---- mm/slab_common.c | 9 +++++---- 3 files changed, 7 insertions(+), 10 deletions(-) diff --git a/include/linux/slab.h b/include/linux/slab.h index 9a139b63706973..eca9ed303a1b2b 100644 --- a/include/linux/slab.h +++ b/include/linux/slab.h @@ -117,8 +117,7 @@ struct kmem_cache *kmem_cache_create(const char *, size_t, size_t, void (*)(void *)); #ifdef CONFIG_MEMCG_KMEM struct kmem_cache *memcg_create_kmem_cache(struct mem_cgroup *, - struct kmem_cache *, - const char *); + struct kmem_cache *); #endif void kmem_cache_destroy(struct kmem_cache *); int kmem_cache_shrink(struct kmem_cache *); diff --git a/mm/memcontrol.c b/mm/memcontrol.c index e229e3ad615cc2..baf7eb27e3aeef 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -2607,8 +2607,6 @@ void memcg_update_array_size(int num) static void memcg_register_cache(struct mem_cgroup *memcg, struct kmem_cache *root_cache) { - static char memcg_name_buf[NAME_MAX + 1]; /* protected by - memcg_slab_mutex */ struct kmem_cache *cachep; int id; @@ -2624,8 +2622,7 @@ static void memcg_register_cache(struct mem_cgroup *memcg, if (cache_from_memcg_idx(root_cache, id)) return; - cgroup_name(memcg->css.cgroup, memcg_name_buf, NAME_MAX + 1); - cachep = memcg_create_kmem_cache(memcg, root_cache, memcg_name_buf); + cachep = memcg_create_kmem_cache(memcg, root_cache); /* * If we could not create a memcg cache, do not complain, because * that's not critical at all as we can always proceed with the root diff --git a/mm/slab_common.c b/mm/slab_common.c index 67f182c10f24d2..1b782a2d3b3d1a 100644 --- a/mm/slab_common.c +++ b/mm/slab_common.c @@ -430,16 +430,15 @@ EXPORT_SYMBOL(kmem_cache_create); * memcg_create_kmem_cache - Create a cache for a memory cgroup. * @memcg: The memory cgroup the new cache is for. * @root_cache: The parent of the new cache. - * @memcg_name: The name of the memory cgroup (used for naming the new cache). * * This function attempts to create a kmem cache that will serve allocation * requests going from @memcg to @root_cache. The new cache inherits properties * from its parent. */ struct kmem_cache *memcg_create_kmem_cache(struct mem_cgroup *memcg, - struct kmem_cache *root_cache, - const char *memcg_name) + struct kmem_cache *root_cache) { + static char memcg_name_buf[NAME_MAX + 1]; /* protected by slab_mutex */ struct kmem_cache *s = NULL; char *cache_name; @@ -448,8 +447,10 @@ struct kmem_cache *memcg_create_kmem_cache(struct mem_cgroup *memcg, mutex_lock(&slab_mutex); + cgroup_name(mem_cgroup_css(memcg)->cgroup, + memcg_name_buf, sizeof(memcg_name_buf)); cache_name = kasprintf(GFP_KERNEL, "%s(%d:%s)", root_cache->name, - memcg_cache_id(memcg), memcg_name); + memcg_cache_id(memcg), memcg_name_buf); if (!cache_name) goto out_unlock; From d5b3cf7139b8770af4ed8bb36a1ab9d290ac39e9 Mon Sep 17 00:00:00 2001 From: Vladimir Davydov Date: Tue, 10 Feb 2015 14:11:47 -0800 Subject: [PATCH 78/78] memcg: zap memcg_slab_caches and memcg_slab_mutex mem_cgroup->memcg_slab_caches is a list of kmem caches corresponding to the given cgroup. Currently, it is only used on css free in order to destroy all caches corresponding to the memory cgroup being freed. The list is protected by memcg_slab_mutex. The mutex is also used to protect kmem_cache->memcg_params->memcg_caches arrays and synchronizes kmem_cache_destroy vs memcg_unregister_all_caches. However, we can perfectly get on without these two. To destroy all caches corresponding to a memory cgroup, we can walk over the global list of kmem caches, slab_caches, and we can do all the synchronization stuff using the slab_mutex instead of the memcg_slab_mutex. This patch therefore gets rid of the memcg_slab_caches and memcg_slab_mutex. Apart from this nice cleanup, it also: - assures that rcu_barrier() is called once at max when a root cache is destroyed or a memory cgroup is freed, no matter how many caches have SLAB_DESTROY_BY_RCU flag set; - fixes the race between kmem_cache_destroy and kmem_cache_create that exists, because memcg_cleanup_cache_params, which is called from kmem_cache_destroy after checking that kmem_cache->refcount=0, releases the slab_mutex, which gives kmem_cache_create a chance to make an alias to a cache doomed to be destroyed. Signed-off-by: Vladimir Davydov Cc: Johannes Weiner Cc: Michal Hocko Acked-by: Christoph Lameter Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/memcontrol.h | 2 - include/linux/slab.h | 6 +- mm/memcontrol.c | 156 ++++--------------------------------- mm/slab_common.c | 142 +++++++++++++++++++++++---------- 4 files changed, 120 insertions(+), 186 deletions(-) diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index 18ccb2988979cf..fb212e1d700d29 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -407,8 +407,6 @@ int memcg_charge_kmem(struct mem_cgroup *memcg, gfp_t gfp, unsigned long nr_pages); void memcg_uncharge_kmem(struct mem_cgroup *memcg, unsigned long nr_pages); -int __memcg_cleanup_cache_params(struct kmem_cache *s); - /** * memcg_kmem_newpage_charge: verify if a new kmem allocation is allowed. * @gfp: the gfp allocation flags. diff --git a/include/linux/slab.h b/include/linux/slab.h index eca9ed303a1b2b..2e3b448cfa2d8b 100644 --- a/include/linux/slab.h +++ b/include/linux/slab.h @@ -116,8 +116,8 @@ struct kmem_cache *kmem_cache_create(const char *, size_t, size_t, unsigned long, void (*)(void *)); #ifdef CONFIG_MEMCG_KMEM -struct kmem_cache *memcg_create_kmem_cache(struct mem_cgroup *, - struct kmem_cache *); +void memcg_create_kmem_cache(struct mem_cgroup *, struct kmem_cache *); +void memcg_destroy_kmem_caches(struct mem_cgroup *); #endif void kmem_cache_destroy(struct kmem_cache *); int kmem_cache_shrink(struct kmem_cache *); @@ -490,7 +490,6 @@ static __always_inline void *kmalloc_node(size_t size, gfp_t flags, int node) * Child caches will hold extra metadata needed for its operation. Fields are: * * @memcg: pointer to the memcg this cache belongs to - * @list: list_head for the list of all caches in this memcg * @root_cache: pointer to the global, root cache, this cache was derived from */ struct memcg_cache_params { @@ -502,7 +501,6 @@ struct memcg_cache_params { }; struct { struct mem_cgroup *memcg; - struct list_head list; struct kmem_cache *root_cache; }; }; diff --git a/mm/memcontrol.c b/mm/memcontrol.c index baf7eb27e3aeef..f3f8a4f52a0ca3 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -343,9 +343,6 @@ struct mem_cgroup { struct cg_proto tcp_mem; #endif #if defined(CONFIG_MEMCG_KMEM) - /* analogous to slab_common's slab_caches list, but per-memcg; - * protected by memcg_slab_mutex */ - struct list_head memcg_slab_caches; /* Index in the kmem_cache->memcg_params->memcg_caches array */ int kmemcg_id; #endif @@ -2476,25 +2473,6 @@ static void commit_charge(struct page *page, struct mem_cgroup *memcg, } #ifdef CONFIG_MEMCG_KMEM -/* - * The memcg_slab_mutex is held whenever a per memcg kmem cache is created or - * destroyed. It protects memcg_caches arrays and memcg_slab_caches lists. - */ -static DEFINE_MUTEX(memcg_slab_mutex); - -/* - * This is a bit cumbersome, but it is rarely used and avoids a backpointer - * in the memcg_cache_params struct. - */ -static struct kmem_cache *memcg_params_to_cache(struct memcg_cache_params *p) -{ - struct kmem_cache *cachep; - - VM_BUG_ON(p->is_root_cache); - cachep = p->root_cache; - return cache_from_memcg_idx(cachep, memcg_cache_id(p->memcg)); -} - int memcg_charge_kmem(struct mem_cgroup *memcg, gfp_t gfp, unsigned long nr_pages) { @@ -2578,10 +2556,7 @@ static int memcg_alloc_cache_id(void) else if (size > MEMCG_CACHES_MAX_SIZE) size = MEMCG_CACHES_MAX_SIZE; - mutex_lock(&memcg_slab_mutex); err = memcg_update_all_caches(size); - mutex_unlock(&memcg_slab_mutex); - if (err) { ida_simple_remove(&kmem_limited_groups, id); return err; @@ -2604,120 +2579,20 @@ void memcg_update_array_size(int num) memcg_limited_groups_array_size = num; } -static void memcg_register_cache(struct mem_cgroup *memcg, - struct kmem_cache *root_cache) -{ - struct kmem_cache *cachep; - int id; - - lockdep_assert_held(&memcg_slab_mutex); - - id = memcg_cache_id(memcg); - - /* - * Since per-memcg caches are created asynchronously on first - * allocation (see memcg_kmem_get_cache()), several threads can try to - * create the same cache, but only one of them may succeed. - */ - if (cache_from_memcg_idx(root_cache, id)) - return; - - cachep = memcg_create_kmem_cache(memcg, root_cache); - /* - * If we could not create a memcg cache, do not complain, because - * that's not critical at all as we can always proceed with the root - * cache. - */ - if (!cachep) - return; - - list_add(&cachep->memcg_params->list, &memcg->memcg_slab_caches); - - /* - * Since readers won't lock (see cache_from_memcg_idx()), we need a - * barrier here to ensure nobody will see the kmem_cache partially - * initialized. - */ - smp_wmb(); - - BUG_ON(root_cache->memcg_params->memcg_caches[id]); - root_cache->memcg_params->memcg_caches[id] = cachep; -} - -static void memcg_unregister_cache(struct kmem_cache *cachep) -{ - struct kmem_cache *root_cache; - struct mem_cgroup *memcg; - int id; - - lockdep_assert_held(&memcg_slab_mutex); - - BUG_ON(is_root_cache(cachep)); - - root_cache = cachep->memcg_params->root_cache; - memcg = cachep->memcg_params->memcg; - id = memcg_cache_id(memcg); - - BUG_ON(root_cache->memcg_params->memcg_caches[id] != cachep); - root_cache->memcg_params->memcg_caches[id] = NULL; - - list_del(&cachep->memcg_params->list); - - kmem_cache_destroy(cachep); -} - -int __memcg_cleanup_cache_params(struct kmem_cache *s) -{ - struct kmem_cache *c; - int i, failed = 0; - - mutex_lock(&memcg_slab_mutex); - for_each_memcg_cache_index(i) { - c = cache_from_memcg_idx(s, i); - if (!c) - continue; - - memcg_unregister_cache(c); - - if (cache_from_memcg_idx(s, i)) - failed++; - } - mutex_unlock(&memcg_slab_mutex); - return failed; -} - -static void memcg_unregister_all_caches(struct mem_cgroup *memcg) -{ - struct kmem_cache *cachep; - struct memcg_cache_params *params, *tmp; - - if (!memcg_kmem_is_active(memcg)) - return; - - mutex_lock(&memcg_slab_mutex); - list_for_each_entry_safe(params, tmp, &memcg->memcg_slab_caches, list) { - cachep = memcg_params_to_cache(params); - memcg_unregister_cache(cachep); - } - mutex_unlock(&memcg_slab_mutex); -} - -struct memcg_register_cache_work { +struct memcg_kmem_cache_create_work { struct mem_cgroup *memcg; struct kmem_cache *cachep; struct work_struct work; }; -static void memcg_register_cache_func(struct work_struct *w) +static void memcg_kmem_cache_create_func(struct work_struct *w) { - struct memcg_register_cache_work *cw = - container_of(w, struct memcg_register_cache_work, work); + struct memcg_kmem_cache_create_work *cw = + container_of(w, struct memcg_kmem_cache_create_work, work); struct mem_cgroup *memcg = cw->memcg; struct kmem_cache *cachep = cw->cachep; - mutex_lock(&memcg_slab_mutex); - memcg_register_cache(memcg, cachep); - mutex_unlock(&memcg_slab_mutex); + memcg_create_kmem_cache(memcg, cachep); css_put(&memcg->css); kfree(cw); @@ -2726,10 +2601,10 @@ static void memcg_register_cache_func(struct work_struct *w) /* * Enqueue the creation of a per-memcg kmem_cache. */ -static void __memcg_schedule_register_cache(struct mem_cgroup *memcg, - struct kmem_cache *cachep) +static void __memcg_schedule_kmem_cache_create(struct mem_cgroup *memcg, + struct kmem_cache *cachep) { - struct memcg_register_cache_work *cw; + struct memcg_kmem_cache_create_work *cw; cw = kmalloc(sizeof(*cw), GFP_NOWAIT); if (!cw) @@ -2739,18 +2614,18 @@ static void __memcg_schedule_register_cache(struct mem_cgroup *memcg, cw->memcg = memcg; cw->cachep = cachep; + INIT_WORK(&cw->work, memcg_kmem_cache_create_func); - INIT_WORK(&cw->work, memcg_register_cache_func); schedule_work(&cw->work); } -static void memcg_schedule_register_cache(struct mem_cgroup *memcg, - struct kmem_cache *cachep) +static void memcg_schedule_kmem_cache_create(struct mem_cgroup *memcg, + struct kmem_cache *cachep) { /* * We need to stop accounting when we kmalloc, because if the * corresponding kmalloc cache is not yet created, the first allocation - * in __memcg_schedule_register_cache will recurse. + * in __memcg_schedule_kmem_cache_create will recurse. * * However, it is better to enclose the whole function. Depending on * the debugging options enabled, INIT_WORK(), for instance, can @@ -2759,7 +2634,7 @@ static void memcg_schedule_register_cache(struct mem_cgroup *memcg, * the safest choice is to do it like this, wrapping the whole function. */ current->memcg_kmem_skip_account = 1; - __memcg_schedule_register_cache(memcg, cachep); + __memcg_schedule_kmem_cache_create(memcg, cachep); current->memcg_kmem_skip_account = 0; } @@ -2807,7 +2682,7 @@ struct kmem_cache *__memcg_kmem_get_cache(struct kmem_cache *cachep) * could happen with the slab_mutex held. So it's better to * defer everything. */ - memcg_schedule_register_cache(memcg, cachep); + memcg_schedule_kmem_cache_create(memcg, cachep); out: css_put(&memcg->css); return cachep; @@ -4136,7 +4011,7 @@ static int memcg_init_kmem(struct mem_cgroup *memcg, struct cgroup_subsys *ss) static void memcg_destroy_kmem(struct mem_cgroup *memcg) { - memcg_unregister_all_caches(memcg); + memcg_destroy_kmem_caches(memcg); mem_cgroup_sockets_destroy(memcg); } #else @@ -4664,7 +4539,6 @@ mem_cgroup_css_alloc(struct cgroup_subsys_state *parent_css) spin_lock_init(&memcg->event_list_lock); #ifdef CONFIG_MEMCG_KMEM memcg->kmemcg_id = -1; - INIT_LIST_HEAD(&memcg->memcg_slab_caches); #endif return &memcg->css; diff --git a/mm/slab_common.c b/mm/slab_common.c index 1b782a2d3b3d1a..6e1e4cf65836cb 100644 --- a/mm/slab_common.c +++ b/mm/slab_common.c @@ -425,6 +425,49 @@ kmem_cache_create(const char *name, size_t size, size_t align, } EXPORT_SYMBOL(kmem_cache_create); +static int do_kmem_cache_shutdown(struct kmem_cache *s, + struct list_head *release, bool *need_rcu_barrier) +{ + if (__kmem_cache_shutdown(s) != 0) { + printk(KERN_ERR "kmem_cache_destroy %s: " + "Slab cache still has objects\n", s->name); + dump_stack(); + return -EBUSY; + } + + if (s->flags & SLAB_DESTROY_BY_RCU) + *need_rcu_barrier = true; + +#ifdef CONFIG_MEMCG_KMEM + if (!is_root_cache(s)) { + struct kmem_cache *root_cache = s->memcg_params->root_cache; + int memcg_id = memcg_cache_id(s->memcg_params->memcg); + + BUG_ON(root_cache->memcg_params->memcg_caches[memcg_id] != s); + root_cache->memcg_params->memcg_caches[memcg_id] = NULL; + } +#endif + list_move(&s->list, release); + return 0; +} + +static void do_kmem_cache_release(struct list_head *release, + bool need_rcu_barrier) +{ + struct kmem_cache *s, *s2; + + if (need_rcu_barrier) + rcu_barrier(); + + list_for_each_entry_safe(s, s2, release, list) { +#ifdef SLAB_SUPPORTS_SYSFS + sysfs_slab_remove(s); +#else + slab_kmem_cache_release(s); +#endif + } +} + #ifdef CONFIG_MEMCG_KMEM /* * memcg_create_kmem_cache - Create a cache for a memory cgroup. @@ -435,10 +478,11 @@ EXPORT_SYMBOL(kmem_cache_create); * requests going from @memcg to @root_cache. The new cache inherits properties * from its parent. */ -struct kmem_cache *memcg_create_kmem_cache(struct mem_cgroup *memcg, - struct kmem_cache *root_cache) +void memcg_create_kmem_cache(struct mem_cgroup *memcg, + struct kmem_cache *root_cache) { static char memcg_name_buf[NAME_MAX + 1]; /* protected by slab_mutex */ + int memcg_id = memcg_cache_id(memcg); struct kmem_cache *s = NULL; char *cache_name; @@ -447,6 +491,14 @@ struct kmem_cache *memcg_create_kmem_cache(struct mem_cgroup *memcg, mutex_lock(&slab_mutex); + /* + * Since per-memcg caches are created asynchronously on first + * allocation (see memcg_kmem_get_cache()), several threads can try to + * create the same cache, but only one of them may succeed. + */ + if (cache_from_memcg_idx(root_cache, memcg_id)) + goto out_unlock; + cgroup_name(mem_cgroup_css(memcg)->cgroup, memcg_name_buf, sizeof(memcg_name_buf)); cache_name = kasprintf(GFP_KERNEL, "%s(%d:%s)", root_cache->name, @@ -458,49 +510,73 @@ struct kmem_cache *memcg_create_kmem_cache(struct mem_cgroup *memcg, root_cache->size, root_cache->align, root_cache->flags, root_cache->ctor, memcg, root_cache); + /* + * If we could not create a memcg cache, do not complain, because + * that's not critical at all as we can always proceed with the root + * cache. + */ if (IS_ERR(s)) { kfree(cache_name); - s = NULL; + goto out_unlock; } + /* + * Since readers won't lock (see cache_from_memcg_idx()), we need a + * barrier here to ensure nobody will see the kmem_cache partially + * initialized. + */ + smp_wmb(); + root_cache->memcg_params->memcg_caches[memcg_id] = s; + out_unlock: mutex_unlock(&slab_mutex); put_online_mems(); put_online_cpus(); - - return s; } -static int memcg_cleanup_cache_params(struct kmem_cache *s) +void memcg_destroy_kmem_caches(struct mem_cgroup *memcg) { - int rc; + LIST_HEAD(release); + bool need_rcu_barrier = false; + struct kmem_cache *s, *s2; - if (!s->memcg_params || - !s->memcg_params->is_root_cache) - return 0; + get_online_cpus(); + get_online_mems(); - mutex_unlock(&slab_mutex); - rc = __memcg_cleanup_cache_params(s); mutex_lock(&slab_mutex); + list_for_each_entry_safe(s, s2, &slab_caches, list) { + if (is_root_cache(s) || s->memcg_params->memcg != memcg) + continue; + /* + * The cgroup is about to be freed and therefore has no charges + * left. Hence, all its caches must be empty by now. + */ + BUG_ON(do_kmem_cache_shutdown(s, &release, &need_rcu_barrier)); + } + mutex_unlock(&slab_mutex); - return rc; -} -#else -static int memcg_cleanup_cache_params(struct kmem_cache *s) -{ - return 0; + put_online_mems(); + put_online_cpus(); + + do_kmem_cache_release(&release, need_rcu_barrier); } #endif /* CONFIG_MEMCG_KMEM */ void slab_kmem_cache_release(struct kmem_cache *s) { + memcg_free_cache_params(s); kfree(s->name); kmem_cache_free(kmem_cache, s); } void kmem_cache_destroy(struct kmem_cache *s) { + int i; + LIST_HEAD(release); + bool need_rcu_barrier = false; + bool busy = false; + get_online_cpus(); get_online_mems(); @@ -510,35 +586,23 @@ void kmem_cache_destroy(struct kmem_cache *s) if (s->refcount) goto out_unlock; - if (memcg_cleanup_cache_params(s) != 0) - goto out_unlock; + for_each_memcg_cache_index(i) { + struct kmem_cache *c = cache_from_memcg_idx(s, i); - if (__kmem_cache_shutdown(s) != 0) { - printk(KERN_ERR "kmem_cache_destroy %s: " - "Slab cache still has objects\n", s->name); - dump_stack(); - goto out_unlock; + if (c && do_kmem_cache_shutdown(c, &release, &need_rcu_barrier)) + busy = true; } - list_del(&s->list); - - mutex_unlock(&slab_mutex); - if (s->flags & SLAB_DESTROY_BY_RCU) - rcu_barrier(); - - memcg_free_cache_params(s); -#ifdef SLAB_SUPPORTS_SYSFS - sysfs_slab_remove(s); -#else - slab_kmem_cache_release(s); -#endif - goto out; + if (!busy) + do_kmem_cache_shutdown(s, &release, &need_rcu_barrier); out_unlock: mutex_unlock(&slab_mutex); -out: + put_online_mems(); put_online_cpus(); + + do_kmem_cache_release(&release, need_rcu_barrier); } EXPORT_SYMBOL(kmem_cache_destroy);