From 46df55b51734fc98be75b2148a463069a65685be Mon Sep 17 00:00:00 2001 From: Zhang Yunkai Date: Thu, 29 Apr 2021 22:53:18 -0700 Subject: [PATCH 001/175] arch/ia64/kernel/head.S: remove duplicate include 'linux/pgtable.h' included in 'arch/ia64/kernel/head.S' is duplicated. Link: https://lkml.kernel.org/r/20210303084549.179346-1-zhang.yunkai@zte.com.cn Signed-off-by: Zhang Yunkai Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/ia64/kernel/head.S | 1 - 1 file changed, 1 deletion(-) diff --git a/arch/ia64/kernel/head.S b/arch/ia64/kernel/head.S index 30f1ef760136dc..81b64d1adad5f8 100644 --- a/arch/ia64/kernel/head.S +++ b/arch/ia64/kernel/head.S @@ -33,7 +33,6 @@ #include #include #include -#include #include #ifdef CONFIG_HOTPLUG_CPU From 3eac094b93e757a297c2807bec41503fe8241d17 Mon Sep 17 00:00:00 2001 From: Bhaskar Chowdhury Date: Thu, 29 Apr 2021 22:53:21 -0700 Subject: [PATCH 002/175] arch/ia64/kernel/fsys.S: fix typos Mundane spelling fixes. Link: https://lkml.kernel.org/r/20210311061058.29492-1-unixbhaskar@gmail.com Signed-off-by: Bhaskar Chowdhury Acked-by: Randy Dunlap Cc: John Paul Adrian Glaubitz Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/ia64/kernel/fsys.S | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/arch/ia64/kernel/fsys.S b/arch/ia64/kernel/fsys.S index 0750a716adc72a..2094f324901982 100644 --- a/arch/ia64/kernel/fsys.S +++ b/arch/ia64/kernel/fsys.S @@ -172,7 +172,7 @@ ENTRY(fsys_gettimeofday) // r25 = itc_lastcycle value // r26 = address clocksource cycle_last // r27 = (not used) - // r28 = sequence number at the beginning of critcal section + // r28 = sequence number at the beginning of critical section // r29 = address of itc_jitter // r30 = time processing flags / memory address // r31 = pointer to result @@ -432,7 +432,7 @@ GLOBAL_ENTRY(fsys_bubble_down) * - r29: psr * * We used to clear some PSR bits here but that requires slow - * serialization. Fortuntely, that isn't really necessary. + * serialization. Fortunately, that isn't really necessary. * The rationale is as follows: we used to clear bits * ~PSR_PRESERVED_BITS in PSR.L. Since * PSR_PRESERVED_BITS==PSR.{UP,MFL,MFH,PK,DT,PP,SP,RT,IC}, we From 8b30c6256d2bddc080ac13f39363d4efbb0b292e Mon Sep 17 00:00:00 2001 From: Bhaskar Chowdhury Date: Thu, 29 Apr 2021 22:53:24 -0700 Subject: [PATCH 003/175] arch/ia64/include/asm/pgtable.h: minor typo fixes s/migraton/migration/ Link: https://lkml.kernel.org/r/20210313045519.9310-1-unixbhaskar@gmail.com Signed-off-by: Bhaskar Chowdhury Acked-by: Randy Dunlap Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/ia64/include/asm/pgtable.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/ia64/include/asm/pgtable.h b/arch/ia64/include/asm/pgtable.h index 9b4efe89e62d87..00a76ed7e01847 100644 --- a/arch/ia64/include/asm/pgtable.h +++ b/arch/ia64/include/asm/pgtable.h @@ -328,7 +328,7 @@ extern void __ia64_sync_icache_dcache(pte_t pteval); static inline void set_pte(pte_t *ptep, pte_t pteval) { /* page is present && page is user && page is executable - * && (page swapin or new page or page migraton + * && (page swapin or new page or page migration * || copy_on_write with page copying.) */ if (pte_present_exec_user(pteval) && From b22a8f7b4bde4e4ab73b64908ffd5d90ecdcdbfd Mon Sep 17 00:00:00 2001 From: Valentin Schneider Date: Thu, 29 Apr 2021 22:53:27 -0700 Subject: [PATCH 004/175] ia64: ensure proper NUMA distance and possible map initialization John Paul reported a warning about bogus NUMA distance values spurred by commit: 620a6dc40754 ("sched/topology: Make sched_init_numa() use a set for the deduplicating sort") In this case, the afflicted machine comes up with a reported 256 possible nodes, all of which are 0 distance away from one another. This was previously silently ignored, but is now caught by the aforementioned commit. The culprit is ia64's node_possible_map which remains unchanged from its initialization value of NODE_MASK_ALL. In John's case, the machine doesn't have any SRAT nor SLIT table, but AIUI the possible map remains untouched regardless of what ACPI tables end up being parsed. Thus, !online && possible nodes remain with a bogus distance of 0 (distances \in [0, 9] are "reserved and have no meaning" as per the ACPI spec). Follow x86 / drivers/base/arch_numa's example and set the possible map to the parsed map, which in this case seems to be the online map. Link: http://lore.kernel.org/r/255d6b5d-194e-eb0e-ecdd-97477a534441@physik.fu-berlin.de Link: https://lkml.kernel.org/r/20210318130617.896309-1-valentin.schneider@arm.com Fixes: 620a6dc40754 ("sched/topology: Make sched_init_numa() use a set for the deduplicating sort") Signed-off-by: Valentin Schneider Reported-by: John Paul Adrian Glaubitz Tested-by: John Paul Adrian Glaubitz Tested-by: Sergei Trofimovich Cc: "Peter Zijlstra (Intel)" Cc: Ingo Molnar Cc: Vincent Guittot Cc: Dietmar Eggemann Cc: Anatoly Pugachev Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/ia64/kernel/acpi.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/arch/ia64/kernel/acpi.c b/arch/ia64/kernel/acpi.c index a5636524af7693..e2af6b172200ed 100644 --- a/arch/ia64/kernel/acpi.c +++ b/arch/ia64/kernel/acpi.c @@ -446,7 +446,8 @@ void __init acpi_numa_fixup(void) if (srat_num_cpus == 0) { node_set_online(0); node_cpuid[0].phys_id = hard_smp_processor_id(); - return; + slit_distance(0, 0) = LOCAL_DISTANCE; + goto out; } /* @@ -489,7 +490,7 @@ void __init acpi_numa_fixup(void) for (j = 0; j < MAX_NUMNODES; j++) slit_distance(i, j) = i == j ? LOCAL_DISTANCE : REMOTE_DISTANCE; - return; + goto out; } memset(numa_slit, -1, sizeof(numa_slit)); @@ -514,6 +515,8 @@ void __init acpi_numa_fixup(void) printk("\n"); } #endif +out: + node_possible_map = node_online_map; } #endif /* CONFIG_ACPI_NUMA */ From d732f47db10f292657356b3be1fb479777e2117c Mon Sep 17 00:00:00 2001 From: Sergei Trofimovich Date: Thu, 29 Apr 2021 22:53:30 -0700 Subject: [PATCH 005/175] ia64: drop unused IA64_FW_EMU ifdef It's a remnant of deleted hpsim emulation target removed in fc5bad037 ("ia64: remove the hpsim platform"). Link: https://lkml.kernel.org/r/20210323224009.240625-1-slyfox@gentoo.org Signed-off-by: Sergei Trofimovich Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/ia64/kernel/head.S | 5 ----- 1 file changed, 5 deletions(-) diff --git a/arch/ia64/kernel/head.S b/arch/ia64/kernel/head.S index 81b64d1adad5f8..f22469f1c1fccc 100644 --- a/arch/ia64/kernel/head.S +++ b/arch/ia64/kernel/head.S @@ -404,11 +404,6 @@ start_ap: // This is executed by the bootstrap processor (bsp) only: -#ifdef CONFIG_IA64_FW_EMU - // initialize PAL & SAL emulator: - br.call.sptk.many rp=sys_fw_init -.ret1: -#endif br.call.sptk.many rp=start_kernel .ret2: addl r3=@ltoff(halt_msg),gp ;; From 6d073dad9754c28ab23409f794b3e1ece37d0609 Mon Sep 17 00:00:00 2001 From: Sergei Trofimovich Date: Thu, 29 Apr 2021 22:53:33 -0700 Subject: [PATCH 006/175] ia64: simplify code flow around swiotlb init Before the change CONFIG_INTEL_IOMMU && !CONFIG_SWIOTLB && !CONFIG_FLATMEM could skip `set_max_mapnr(max_low_pfn);` if iommu is not present on system. Link: https://lkml.kernel.org/r/20210328202439.403601-1-slyfox@gentoo.org Signed-off-by: Sergei Trofimovich Cc: John Paul Adrian Glaubitz Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/ia64/mm/init.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/arch/ia64/mm/init.c b/arch/ia64/mm/init.c index 16d0d7d2265784..a63585db94fe83 100644 --- a/arch/ia64/mm/init.c +++ b/arch/ia64/mm/init.c @@ -644,13 +644,16 @@ mem_init (void) * _before_ any drivers that may need the PCI DMA interface are * initialized or bootmem has been freed. */ + do { #ifdef CONFIG_INTEL_IOMMU - detect_intel_iommu(); - if (!iommu_detected) + detect_intel_iommu(); + if (iommu_detected) + break; #endif #ifdef CONFIG_SWIOTLB swiotlb_init(1); #endif + } while (0); #ifdef CONFIG_FLATMEM BUG_ON(!mem_map); From 454534366c6faf286f5dac8db011d461e9c82320 Mon Sep 17 00:00:00 2001 From: Bhaskar Chowdhury Date: Thu, 29 Apr 2021 22:53:36 -0700 Subject: [PATCH 007/175] ia64: trivial spelling fixes s/seralize/serialize/ .....three different places Link: https://lkml.kernel.org/r/YFY+9uwvNLeb/3Ab@Gentoo Signed-off-by: Bhaskar Chowdhury Acked-by: Randy Dunlap Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/ia64/kernel/pal.S | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/arch/ia64/kernel/pal.S b/arch/ia64/kernel/pal.S index d3e22c018b68ac..06d01a070aae24 100644 --- a/arch/ia64/kernel/pal.S +++ b/arch/ia64/kernel/pal.S @@ -86,7 +86,7 @@ GLOBAL_ENTRY(ia64_pal_call_static) mov ar.pfs = loc1 mov rp = loc0 ;; - srlz.d // seralize restoration of psr.l + srlz.d // serialize restoration of psr.l br.ret.sptk.many b0 END(ia64_pal_call_static) EXPORT_SYMBOL(ia64_pal_call_static) @@ -194,7 +194,7 @@ GLOBAL_ENTRY(ia64_pal_call_phys_static) mov rp = loc0 ;; mov ar.rsc=loc4 // restore RSE configuration - srlz.d // seralize restoration of psr.l + srlz.d // serialize restoration of psr.l br.ret.sptk.many b0 END(ia64_pal_call_phys_static) EXPORT_SYMBOL(ia64_pal_call_phys_static) @@ -252,7 +252,7 @@ GLOBAL_ENTRY(ia64_pal_call_phys_stacked) mov rp = loc0 ;; mov ar.rsc=loc4 // restore RSE configuration - srlz.d // seralize restoration of psr.l + srlz.d // serialize restoration of psr.l br.ret.sptk.many b0 END(ia64_pal_call_phys_stacked) EXPORT_SYMBOL(ia64_pal_call_phys_stacked) From e3db00b79d74caaf84cd9e1d4927979abfd0d7c9 Mon Sep 17 00:00:00 2001 From: Sergei Trofimovich Date: Thu, 29 Apr 2021 22:53:39 -0700 Subject: [PATCH 008/175] ia64: fix EFI_DEBUG build When enabled local debugging via `#define EFI_DEBUG 1` noticed build failure: arch/ia64/kernel/efi.c:564:8: error: 'i' undeclared (first use in this function) While at it fixed benign string format mismatches visible only when EFI_DEBUG is enabled: arch/ia64/kernel/efi.c:589:11: warning: format '%lx' expects argument of type 'long unsigned int', but argument 5 has type 'u64' {aka 'long long unsigned int'} [-Wformat=] Link: https://lkml.kernel.org/r/20210328212246.685601-1-slyfox@gentoo.org Fixes: 14fb42090943559 ("efi: Merge EFI system table revision and vendor checks") Signed-off-by: Sergei Trofimovich Cc: Ard Biesheuvel Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/ia64/kernel/efi.c | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/arch/ia64/kernel/efi.c b/arch/ia64/kernel/efi.c index c5fe21de46a81d..31149e41f9be09 100644 --- a/arch/ia64/kernel/efi.c +++ b/arch/ia64/kernel/efi.c @@ -415,10 +415,10 @@ efi_get_pal_addr (void) mask = ~((1 << IA64_GRANULE_SHIFT) - 1); printk(KERN_INFO "CPU %d: mapping PAL code " - "[0x%lx-0x%lx) into [0x%lx-0x%lx)\n", - smp_processor_id(), md->phys_addr, - md->phys_addr + efi_md_size(md), - vaddr & mask, (vaddr & mask) + IA64_GRANULE_SIZE); + "[0x%llx-0x%llx) into [0x%llx-0x%llx)\n", + smp_processor_id(), md->phys_addr, + md->phys_addr + efi_md_size(md), + vaddr & mask, (vaddr & mask) + IA64_GRANULE_SIZE); #endif return __va(md->phys_addr); } @@ -560,6 +560,7 @@ efi_init (void) { efi_memory_desc_t *md; void *p; + unsigned int i; for (i = 0, p = efi_map_start; p < efi_map_end; ++i, p += efi_desc_size) @@ -586,7 +587,7 @@ efi_init (void) } printk("mem%02d: %s " - "range=[0x%016lx-0x%016lx) (%4lu%s)\n", + "range=[0x%016llx-0x%016llx) (%4lu%s)\n", i, efi_md_typeattr_format(buf, sizeof(buf), md), md->phys_addr, md->phys_addr + efi_md_size(md), size, unit); From 5f28bdee7084dc560a3b3154a3345bfd73135ea4 Mon Sep 17 00:00:00 2001 From: Sergei Trofimovich Date: Thu, 29 Apr 2021 22:53:42 -0700 Subject: [PATCH 009/175] ia64: mca: always make IA64_MCA_DEBUG an expression At least ia64_mca_log_sal_error_record() expects some statement: static void ia64_mca_log_sal_error_record(int sal_info_type) { ... if (irq_safe) IA64_MCA_DEBUG("CPU %d: SAL log contains %s error record ", smp_processor_id(), sal_info_type < ARRAY_SIZE(rec_name) ? rec_name[sal_info_type] : "UNKNOWN"); ... } Instead of fixing all callers the change expicitly makes IA64_MCA_DEBUG a non-empty expression. Link: https://lkml.kernel.org/r/20210328215549.830420-1-slyfox@gentoo.org Signed-off-by: Sergei Trofimovich Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/ia64/kernel/mca.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/arch/ia64/kernel/mca.c b/arch/ia64/kernel/mca.c index adf6521525f4bd..cdbac4b52f3092 100644 --- a/arch/ia64/kernel/mca.c +++ b/arch/ia64/kernel/mca.c @@ -109,9 +109,9 @@ #include "irq.h" #if defined(IA64_MCA_DEBUG_INFO) -# define IA64_MCA_DEBUG(fmt...) printk(fmt) +# define IA64_MCA_DEBUG(fmt...) printk(fmt) #else -# define IA64_MCA_DEBUG(fmt...) +# define IA64_MCA_DEBUG(fmt...) do {} while (0) #endif #define NOTIFY_INIT(event, regs, arg, spin) \ From 9187592b96385e5060dfb2b182aa9ec93d5c0332 Mon Sep 17 00:00:00 2001 From: Sergei Trofimovich Date: Thu, 29 Apr 2021 22:53:45 -0700 Subject: [PATCH 010/175] ia64: drop marked broken DISCONTIGMEM and VIRTUAL_MEM_MAP DISCONTIGMEM was marked BROKEN in 5.11. Let's remove it. Booted SPARSEMEM successfully on rx3600. Link: https://lkml.kernel.org/r/20210404193440.2615358-1-slyfox@gentoo.org Signed-off-by: Sergei Trofimovich Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/ia64/Kconfig | 23 ---- arch/ia64/configs/bigsur_defconfig | 1 - arch/ia64/include/asm/meminit.h | 11 -- arch/ia64/include/asm/page.h | 25 +--- arch/ia64/include/asm/pgtable.h | 5 - arch/ia64/kernel/Makefile | 2 +- arch/ia64/kernel/ia64_ksyms.c | 12 -- arch/ia64/kernel/machine_kexec.c | 2 +- arch/ia64/mm/Makefile | 1 - arch/ia64/mm/contig.c | 4 - arch/ia64/mm/discontig.c | 21 --- arch/ia64/mm/fault.c | 15 -- arch/ia64/mm/init.c | 213 ----------------------------- 13 files changed, 4 insertions(+), 331 deletions(-) delete mode 100644 arch/ia64/kernel/ia64_ksyms.c diff --git a/arch/ia64/Kconfig b/arch/ia64/Kconfig index 2ad7a8d29fcc18..81e2b893b1e7ec 100644 --- a/arch/ia64/Kconfig +++ b/arch/ia64/Kconfig @@ -286,15 +286,6 @@ config FORCE_CPEI_RETARGET config ARCH_SELECT_MEMORY_MODEL def_bool y -config ARCH_DISCONTIGMEM_ENABLE - def_bool y - depends on BROKEN - help - Say Y to support efficient handling of discontiguous physical memory, - for architectures which are either NUMA (Non-Uniform Memory Access) - or have huge holes in the physical address space for other reasons. - See for more. - config ARCH_FLATMEM_ENABLE def_bool y @@ -325,22 +316,8 @@ config NODES_SHIFT MAX_NUMNODES will be 2^(This value). If in doubt, use the default. -# VIRTUAL_MEM_MAP and FLAT_NODE_MEM_MAP are functionally equivalent. -# VIRTUAL_MEM_MAP has been retained for historical reasons. -config VIRTUAL_MEM_MAP - bool "Virtual mem map" - depends on !SPARSEMEM && !FLATMEM - default y - help - Say Y to compile the kernel with support for a virtual mem map. - This code also only takes effect if a memory hole of greater than - 1 Gb is found during boot. You must turn this option on if you - require the DISCONTIGMEM option for your machine. If you are - unsure, say Y. - config HOLES_IN_ZONE bool - default y if VIRTUAL_MEM_MAP config HAVE_ARCH_NODEDATA_EXTENSION def_bool y diff --git a/arch/ia64/configs/bigsur_defconfig b/arch/ia64/configs/bigsur_defconfig index c409756b539629..0341a67cc1bf2c 100644 --- a/arch/ia64/configs/bigsur_defconfig +++ b/arch/ia64/configs/bigsur_defconfig @@ -9,7 +9,6 @@ CONFIG_SGI_PARTITION=y CONFIG_SMP=y CONFIG_NR_CPUS=2 CONFIG_PREEMPT=y -# CONFIG_VIRTUAL_MEM_MAP is not set CONFIG_IA64_PALINFO=y CONFIG_EFI_VARS=y CONFIG_BINFMT_MISC=m diff --git a/arch/ia64/include/asm/meminit.h b/arch/ia64/include/asm/meminit.h index e789c0818edbcf..6c47a239fc26df 100644 --- a/arch/ia64/include/asm/meminit.h +++ b/arch/ia64/include/asm/meminit.h @@ -58,15 +58,4 @@ extern int reserve_elfcorehdr(u64 *start, u64 *end); extern int register_active_ranges(u64 start, u64 len, int nid); -#ifdef CONFIG_VIRTUAL_MEM_MAP - extern unsigned long VMALLOC_END; - extern struct page *vmem_map; - extern int create_mem_map_page_table(u64 start, u64 end, void *arg); - extern int vmemmap_find_next_valid_pfn(int, int); -#else -static inline int vmemmap_find_next_valid_pfn(int node, int i) -{ - return i + 1; -} -#endif #endif /* meminit_h */ diff --git a/arch/ia64/include/asm/page.h b/arch/ia64/include/asm/page.h index b69a5499d75b8a..f4dc81fa714626 100644 --- a/arch/ia64/include/asm/page.h +++ b/arch/ia64/include/asm/page.h @@ -95,31 +95,10 @@ do { \ #define virt_addr_valid(kaddr) pfn_valid(__pa(kaddr) >> PAGE_SHIFT) -#ifdef CONFIG_VIRTUAL_MEM_MAP -extern int ia64_pfn_valid (unsigned long pfn); -#else -# define ia64_pfn_valid(pfn) 1 -#endif - -#ifdef CONFIG_VIRTUAL_MEM_MAP -extern struct page *vmem_map; -#ifdef CONFIG_DISCONTIGMEM -# define page_to_pfn(page) ((unsigned long) (page - vmem_map)) -# define pfn_to_page(pfn) (vmem_map + (pfn)) -# define __pfn_to_phys(pfn) PFN_PHYS(pfn) -#else -# include -#endif -#else -# include -#endif +#include #ifdef CONFIG_FLATMEM -# define pfn_valid(pfn) (((pfn) < max_mapnr) && ia64_pfn_valid(pfn)) -#elif defined(CONFIG_DISCONTIGMEM) -extern unsigned long min_low_pfn; -extern unsigned long max_low_pfn; -# define pfn_valid(pfn) (((pfn) >= min_low_pfn) && ((pfn) < max_low_pfn) && ia64_pfn_valid(pfn)) +# define pfn_valid(pfn) ((pfn) < max_mapnr) #endif #define page_to_phys(page) (page_to_pfn(page) << PAGE_SHIFT) diff --git a/arch/ia64/include/asm/pgtable.h b/arch/ia64/include/asm/pgtable.h index 00a76ed7e01847..d765fd948faec3 100644 --- a/arch/ia64/include/asm/pgtable.h +++ b/arch/ia64/include/asm/pgtable.h @@ -223,10 +223,6 @@ ia64_phys_addr_valid (unsigned long addr) #define VMALLOC_START (RGN_BASE(RGN_GATE) + 0x200000000UL) -#ifdef CONFIG_VIRTUAL_MEM_MAP -# define VMALLOC_END_INIT (RGN_BASE(RGN_GATE) + (1UL << (4*PAGE_SHIFT - 9))) -extern unsigned long VMALLOC_END; -#else #if defined(CONFIG_SPARSEMEM) && defined(CONFIG_SPARSEMEM_VMEMMAP) /* SPARSEMEM_VMEMMAP uses half of vmalloc... */ # define VMALLOC_END (RGN_BASE(RGN_GATE) + (1UL << (4*PAGE_SHIFT - 10))) @@ -234,7 +230,6 @@ extern unsigned long VMALLOC_END; #else # define VMALLOC_END (RGN_BASE(RGN_GATE) + (1UL << (4*PAGE_SHIFT - 9))) #endif -#endif /* fs/proc/kcore.c */ #define kc_vaddr_to_offset(v) ((v) - RGN_BASE(RGN_GATE)) diff --git a/arch/ia64/kernel/Makefile b/arch/ia64/kernel/Makefile index 78717819131cc2..08d4a2ba06520f 100644 --- a/arch/ia64/kernel/Makefile +++ b/arch/ia64/kernel/Makefile @@ -9,7 +9,7 @@ endif extra-y := head.o vmlinux.lds -obj-y := entry.o efi.o efi_stub.o gate-data.o fsys.o ia64_ksyms.o irq.o irq_ia64.o \ +obj-y := entry.o efi.o efi_stub.o gate-data.o fsys.o irq.o irq_ia64.o \ irq_lsapic.o ivt.o pal.o patch.o process.o ptrace.o sal.o \ salinfo.o setup.o signal.o sys_ia64.o time.o traps.o unaligned.o \ unwind.o mca.o mca_asm.o topology.o dma-mapping.o iosapic.o acpi.o \ diff --git a/arch/ia64/kernel/ia64_ksyms.c b/arch/ia64/kernel/ia64_ksyms.c deleted file mode 100644 index f8150ee74f2974..00000000000000 --- a/arch/ia64/kernel/ia64_ksyms.c +++ /dev/null @@ -1,12 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -/* - * Architecture-specific kernel symbols - */ - -#if defined(CONFIG_VIRTUAL_MEM_MAP) || defined(CONFIG_DISCONTIGMEM) -#include -#include -#include -EXPORT_SYMBOL(min_low_pfn); /* defined by bootmem.c, but not exported by generic code */ -EXPORT_SYMBOL(max_low_pfn); /* defined by bootmem.c, but not exported by generic code */ -#endif diff --git a/arch/ia64/kernel/machine_kexec.c b/arch/ia64/kernel/machine_kexec.c index af310dc8a356b9..4db9ca144fa5ee 100644 --- a/arch/ia64/kernel/machine_kexec.c +++ b/arch/ia64/kernel/machine_kexec.c @@ -143,7 +143,7 @@ void machine_kexec(struct kimage *image) void arch_crash_save_vmcoreinfo(void) { -#if defined(CONFIG_DISCONTIGMEM) || defined(CONFIG_SPARSEMEM) +#if defined(CONFIG_SPARSEMEM) VMCOREINFO_SYMBOL(pgdat_list); VMCOREINFO_LENGTH(pgdat_list, MAX_NUMNODES); #endif diff --git a/arch/ia64/mm/Makefile b/arch/ia64/mm/Makefile index 99a35039b54850..c03f63c62ac447 100644 --- a/arch/ia64/mm/Makefile +++ b/arch/ia64/mm/Makefile @@ -7,6 +7,5 @@ obj-y := init.o fault.o tlb.o extable.o ioremap.o obj-$(CONFIG_HUGETLB_PAGE) += hugetlbpage.o obj-$(CONFIG_NUMA) += numa.o -obj-$(CONFIG_DISCONTIGMEM) += discontig.o obj-$(CONFIG_SPARSEMEM) += discontig.o obj-$(CONFIG_FLATMEM) += contig.o diff --git a/arch/ia64/mm/contig.c b/arch/ia64/mm/contig.c index 62fe80a16f426b..42e025cfbd088c 100644 --- a/arch/ia64/mm/contig.c +++ b/arch/ia64/mm/contig.c @@ -153,11 +153,7 @@ find_memory (void) efi_memmap_walk(find_max_min_low_pfn, NULL); max_pfn = max_low_pfn; -#ifdef CONFIG_VIRTUAL_MEM_MAP - efi_memmap_walk(filter_memory, register_active_ranges); -#else memblock_add_node(0, PFN_PHYS(max_low_pfn), 0); -#endif find_initrd(); diff --git a/arch/ia64/mm/discontig.c b/arch/ia64/mm/discontig.c index c310b4c99fb306..791d4176e4a6bb 100644 --- a/arch/ia64/mm/discontig.c +++ b/arch/ia64/mm/discontig.c @@ -585,25 +585,6 @@ void call_pernode_memory(unsigned long start, unsigned long len, void *arg) } } -static void __init virtual_map_init(void) -{ -#ifdef CONFIG_VIRTUAL_MEM_MAP - int node; - - VMALLOC_END -= PAGE_ALIGN(ALIGN(max_low_pfn, MAX_ORDER_NR_PAGES) * - sizeof(struct page)); - vmem_map = (struct page *) VMALLOC_END; - efi_memmap_walk(create_mem_map_page_table, NULL); - printk("Virtual mem_map starts at 0x%p\n", vmem_map); - - for_each_online_node(node) { - unsigned long pfn_offset = mem_data[node].min_pfn; - - NODE_DATA(node)->node_mem_map = vmem_map + pfn_offset; - } -#endif -} - /** * paging_init - setup page tables * @@ -619,8 +600,6 @@ void __init paging_init(void) sparse_init(); - virtual_map_init(); - memset(max_zone_pfns, 0, sizeof(max_zone_pfns)); max_zone_pfns[ZONE_DMA32] = max_dma; max_zone_pfns[ZONE_NORMAL] = max_low_pfn; diff --git a/arch/ia64/mm/fault.c b/arch/ia64/mm/fault.c index cd9766d2b6e0e0..02de2e70c5874d 100644 --- a/arch/ia64/mm/fault.c +++ b/arch/ia64/mm/fault.c @@ -84,18 +84,6 @@ ia64_do_page_fault (unsigned long address, unsigned long isr, struct pt_regs *re if (faulthandler_disabled() || !mm) goto no_context; -#ifdef CONFIG_VIRTUAL_MEM_MAP - /* - * If fault is in region 5 and we are in the kernel, we may already - * have the mmap_lock (pfn_valid macro is called during mmap). There - * is no vma for region 5 addr's anyway, so skip getting the semaphore - * and go directly to the exception handling code. - */ - - if ((REGION_NUMBER(address) == 5) && !user_mode(regs)) - goto bad_area_no_up; -#endif - /* * This is to handle the kprobes on user space access instructions */ @@ -213,9 +201,6 @@ ia64_do_page_fault (unsigned long address, unsigned long isr, struct pt_regs *re bad_area: mmap_read_unlock(mm); -#ifdef CONFIG_VIRTUAL_MEM_MAP - bad_area_no_up: -#endif if ((isr & IA64_ISR_SP) || ((isr & IA64_ISR_NA) && (isr & IA64_ISR_CODE_MASK) == IA64_ISR_CODE_LFETCH)) { diff --git a/arch/ia64/mm/init.c b/arch/ia64/mm/init.c index a63585db94fe83..97a13eda81bfea 100644 --- a/arch/ia64/mm/init.c +++ b/arch/ia64/mm/init.c @@ -43,13 +43,6 @@ extern void ia64_tlb_init (void); unsigned long MAX_DMA_ADDRESS = PAGE_OFFSET + 0x100000000UL; -#ifdef CONFIG_VIRTUAL_MEM_MAP -unsigned long VMALLOC_END = VMALLOC_END_INIT; -EXPORT_SYMBOL(VMALLOC_END); -struct page *vmem_map; -EXPORT_SYMBOL(vmem_map); -#endif - struct page *zero_page_memmap_ptr; /* map entry for zero page */ EXPORT_SYMBOL(zero_page_memmap_ptr); @@ -373,212 +366,6 @@ void ia64_mmu_init(void *my_cpu_data) #endif } -#ifdef CONFIG_VIRTUAL_MEM_MAP -int vmemmap_find_next_valid_pfn(int node, int i) -{ - unsigned long end_address, hole_next_pfn; - unsigned long stop_address; - pg_data_t *pgdat = NODE_DATA(node); - - end_address = (unsigned long) &vmem_map[pgdat->node_start_pfn + i]; - end_address = PAGE_ALIGN(end_address); - stop_address = (unsigned long) &vmem_map[pgdat_end_pfn(pgdat)]; - - do { - pgd_t *pgd; - p4d_t *p4d; - pud_t *pud; - pmd_t *pmd; - pte_t *pte; - - pgd = pgd_offset_k(end_address); - if (pgd_none(*pgd)) { - end_address += PGDIR_SIZE; - continue; - } - - p4d = p4d_offset(pgd, end_address); - if (p4d_none(*p4d)) { - end_address += P4D_SIZE; - continue; - } - - pud = pud_offset(p4d, end_address); - if (pud_none(*pud)) { - end_address += PUD_SIZE; - continue; - } - - pmd = pmd_offset(pud, end_address); - if (pmd_none(*pmd)) { - end_address += PMD_SIZE; - continue; - } - - pte = pte_offset_kernel(pmd, end_address); -retry_pte: - if (pte_none(*pte)) { - end_address += PAGE_SIZE; - pte++; - if ((end_address < stop_address) && - (end_address != ALIGN(end_address, 1UL << PMD_SHIFT))) - goto retry_pte; - continue; - } - /* Found next valid vmem_map page */ - break; - } while (end_address < stop_address); - - end_address = min(end_address, stop_address); - end_address = end_address - (unsigned long) vmem_map + sizeof(struct page) - 1; - hole_next_pfn = end_address / sizeof(struct page); - return hole_next_pfn - pgdat->node_start_pfn; -} - -int __init create_mem_map_page_table(u64 start, u64 end, void *arg) -{ - unsigned long address, start_page, end_page; - struct page *map_start, *map_end; - int node; - pgd_t *pgd; - p4d_t *p4d; - pud_t *pud; - pmd_t *pmd; - pte_t *pte; - - map_start = vmem_map + (__pa(start) >> PAGE_SHIFT); - map_end = vmem_map + (__pa(end) >> PAGE_SHIFT); - - start_page = (unsigned long) map_start & PAGE_MASK; - end_page = PAGE_ALIGN((unsigned long) map_end); - node = paddr_to_nid(__pa(start)); - - for (address = start_page; address < end_page; address += PAGE_SIZE) { - pgd = pgd_offset_k(address); - if (pgd_none(*pgd)) { - p4d = memblock_alloc_node(PAGE_SIZE, PAGE_SIZE, node); - if (!p4d) - goto err_alloc; - pgd_populate(&init_mm, pgd, p4d); - } - p4d = p4d_offset(pgd, address); - - if (p4d_none(*p4d)) { - pud = memblock_alloc_node(PAGE_SIZE, PAGE_SIZE, node); - if (!pud) - goto err_alloc; - p4d_populate(&init_mm, p4d, pud); - } - pud = pud_offset(p4d, address); - - if (pud_none(*pud)) { - pmd = memblock_alloc_node(PAGE_SIZE, PAGE_SIZE, node); - if (!pmd) - goto err_alloc; - pud_populate(&init_mm, pud, pmd); - } - pmd = pmd_offset(pud, address); - - if (pmd_none(*pmd)) { - pte = memblock_alloc_node(PAGE_SIZE, PAGE_SIZE, node); - if (!pte) - goto err_alloc; - pmd_populate_kernel(&init_mm, pmd, pte); - } - pte = pte_offset_kernel(pmd, address); - - if (pte_none(*pte)) { - void *page = memblock_alloc_node(PAGE_SIZE, PAGE_SIZE, - node); - if (!page) - goto err_alloc; - set_pte(pte, pfn_pte(__pa(page) >> PAGE_SHIFT, - PAGE_KERNEL)); - } - } - return 0; - -err_alloc: - panic("%s: Failed to allocate %lu bytes align=0x%lx nid=%d\n", - __func__, PAGE_SIZE, PAGE_SIZE, node); - return -ENOMEM; -} - -struct memmap_init_callback_data { - struct page *start; - struct page *end; - int nid; - unsigned long zone; -}; - -static int __meminit -virtual_memmap_init(u64 start, u64 end, void *arg) -{ - struct memmap_init_callback_data *args; - struct page *map_start, *map_end; - - args = (struct memmap_init_callback_data *) arg; - map_start = vmem_map + (__pa(start) >> PAGE_SHIFT); - map_end = vmem_map + (__pa(end) >> PAGE_SHIFT); - - if (map_start < args->start) - map_start = args->start; - if (map_end > args->end) - map_end = args->end; - - /* - * We have to initialize "out of bounds" struct page elements that fit completely - * on the same pages that were allocated for the "in bounds" elements because they - * may be referenced later (and found to be "reserved"). - */ - map_start -= ((unsigned long) map_start & (PAGE_SIZE - 1)) / sizeof(struct page); - map_end += ((PAGE_ALIGN((unsigned long) map_end) - (unsigned long) map_end) - / sizeof(struct page)); - - if (map_start < map_end) - memmap_init_range((unsigned long)(map_end - map_start), - args->nid, args->zone, page_to_pfn(map_start), page_to_pfn(map_end), - MEMINIT_EARLY, NULL, MIGRATE_MOVABLE); - return 0; -} - -void __meminit memmap_init_zone(struct zone *zone) -{ - int nid = zone_to_nid(zone), zone_id = zone_idx(zone); - unsigned long start_pfn = zone->zone_start_pfn; - unsigned long size = zone->spanned_pages; - - if (!vmem_map) { - memmap_init_range(size, nid, zone_id, start_pfn, start_pfn + size, - MEMINIT_EARLY, NULL, MIGRATE_MOVABLE); - } else { - struct page *start; - struct memmap_init_callback_data args; - - start = pfn_to_page(start_pfn); - args.start = start; - args.end = start + size; - args.nid = nid; - args.zone = zone_id; - - efi_memmap_walk(virtual_memmap_init, &args); - } -} - -int -ia64_pfn_valid (unsigned long pfn) -{ - char byte; - struct page *pg = pfn_to_page(pfn); - - return (__get_user(byte, (char __user *) pg) == 0) - && ((((u64)pg & PAGE_MASK) == (((u64)(pg + 1) - 1) & PAGE_MASK)) - || (__get_user(byte, (char __user *) (pg + 1) - 1) == 0)); -} -EXPORT_SYMBOL(ia64_pfn_valid); - -#endif /* CONFIG_VIRTUAL_MEM_MAP */ - int __init register_active_ranges(u64 start, u64 len, int nid) { u64 end = start + len; From 99e729bd40fb3272fa4b0140839d5e957b58588a Mon Sep 17 00:00:00 2001 From: Sergei Trofimovich Date: Thu, 29 Apr 2021 22:53:48 -0700 Subject: [PATCH 011/175] ia64: module: fix symbolizer crash on fdescr Noticed failure as a crash on ia64 when tried to symbolize all backtraces collected by page_owner=on: $ cat /sys/kernel/debug/page_owner CPU: 1 PID: 2074 Comm: cat Not tainted 5.12.0-rc4 #226 Hardware name: hp server rx3600, BIOS 04.03 04/08/2008 ip is at dereference_module_function_descriptor+0x41/0x100 Crash happens at dereference_module_function_descriptor() due to use-after-free when dereferencing ".opd" section header. All section headers are already freed after module is laoded successfully. To keep symbolizer working the change stores ".opd" address and size after module is relocated to a new place and before section headers are discarded. To make similar errors less obscure module_finalize() now zeroes out all variables relevant to module loading only. Link: https://lkml.kernel.org/r/20210403074803.3309096-1-slyfox@gentoo.org Signed-off-by: Sergei Trofimovich Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/ia64/include/asm/module.h | 6 +++++- arch/ia64/kernel/module.c | 29 +++++++++++++++++++++++++---- 2 files changed, 30 insertions(+), 5 deletions(-) diff --git a/arch/ia64/include/asm/module.h b/arch/ia64/include/asm/module.h index 5a29652e6defcb..7271b9c5fc7605 100644 --- a/arch/ia64/include/asm/module.h +++ b/arch/ia64/include/asm/module.h @@ -14,16 +14,20 @@ struct elf64_shdr; /* forward declration */ struct mod_arch_specific { + /* Used only at module load time. */ struct elf64_shdr *core_plt; /* core PLT section */ struct elf64_shdr *init_plt; /* init PLT section */ struct elf64_shdr *got; /* global offset table */ struct elf64_shdr *opd; /* official procedure descriptors */ struct elf64_shdr *unwind; /* unwind-table section */ unsigned long gp; /* global-pointer for module */ + unsigned int next_got_entry; /* index of next available got entry */ + /* Used at module run and cleanup time. */ void *core_unw_table; /* core unwind-table cookie returned by unwinder */ void *init_unw_table; /* init unwind-table cookie returned by unwinder */ - unsigned int next_got_entry; /* index of next available got entry */ + void *opd_addr; /* symbolize uses .opd to get to actual function */ + unsigned long opd_size; }; #define ARCH_SHF_SMALL SHF_IA_64_SHORT diff --git a/arch/ia64/kernel/module.c b/arch/ia64/kernel/module.c index 00a496cb346f6e..2cba53c1da82e9 100644 --- a/arch/ia64/kernel/module.c +++ b/arch/ia64/kernel/module.c @@ -905,9 +905,31 @@ register_unwind_table (struct module *mod) int module_finalize (const Elf_Ehdr *hdr, const Elf_Shdr *sechdrs, struct module *mod) { + struct mod_arch_specific *mas = &mod->arch; + DEBUGP("%s: init: entry=%p\n", __func__, mod->init); - if (mod->arch.unwind) + if (mas->unwind) register_unwind_table(mod); + + /* + * ".opd" was already relocated to the final destination. Store + * it's address for use in symbolizer. + */ + mas->opd_addr = (void *)mas->opd->sh_addr; + mas->opd_size = mas->opd->sh_size; + + /* + * Module relocation was already done at this point. Section + * headers are about to be deleted. Wipe out load-time context. + */ + mas->core_plt = NULL; + mas->init_plt = NULL; + mas->got = NULL; + mas->opd = NULL; + mas->unwind = NULL; + mas->gp = 0; + mas->next_got_entry = 0; + return 0; } @@ -926,10 +948,9 @@ module_arch_cleanup (struct module *mod) void *dereference_module_function_descriptor(struct module *mod, void *ptr) { - Elf64_Shdr *opd = mod->arch.opd; + struct mod_arch_specific *mas = &mod->arch; - if (ptr < (void *)opd->sh_addr || - ptr >= (void *)(opd->sh_addr + opd->sh_size)) + if (ptr < mas->opd_addr || ptr >= mas->opd_addr + mas->opd_size) return ptr; return dereference_function_descriptor(ptr); From d991bb1c8da842a2a0b9dc83b1005e655783f861 Mon Sep 17 00:00:00 2001 From: Luc Van Oostenryck Date: Thu, 29 Apr 2021 22:53:50 -0700 Subject: [PATCH 012/175] include/linux/compiler-gcc.h: sparse can do constant folding of __builtin_bswap*() Sparse can do constant folding of __builtin_bswap*() since 2017. Also, a much recent version of Sparse is needed anyway, see commit 6ec4476ac825 ("Raise gcc version requirement to 4.9"). So, remove the comment about sparse not being yet able to constant fold __builtin_bswap*() and remove the corresponding test of __CHECKER__. Link: https://lkml.kernel.org/r/20210226092236.99369-1-luc.vanoostenryck@gmail.com Signed-off-by: Luc Van Oostenryck Acked-by: Arnd Bergmann Acked-by: Miguel Ojeda Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/compiler-gcc.h | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) diff --git a/include/linux/compiler-gcc.h b/include/linux/compiler-gcc.h index 48750243db4c8e..5d97ef738a575e 100644 --- a/include/linux/compiler-gcc.h +++ b/include/linux/compiler-gcc.h @@ -90,15 +90,11 @@ */ #define asm_volatile_goto(x...) do { asm goto(x); asm (""); } while (0) -/* - * sparse (__CHECKER__) pretends to be gcc, but can't do constant - * folding in __builtin_bswap*() (yet), so don't set these for it. - */ -#if defined(CONFIG_ARCH_USE_BUILTIN_BSWAP) && !defined(__CHECKER__) +#if defined(CONFIG_ARCH_USE_BUILTIN_BSWAP) #define __HAVE_BUILTIN_BSWAP32__ #define __HAVE_BUILTIN_BSWAP64__ #define __HAVE_BUILTIN_BSWAP16__ -#endif /* CONFIG_ARCH_USE_BUILTIN_BSWAP && !__CHECKER__ */ +#endif /* CONFIG_ARCH_USE_BUILTIN_BSWAP */ #if GCC_VERSION >= 70000 #define KASAN_ABI_VERSION 5 From 439baedad52d3242ec1d2ed728bc195fd5577c05 Mon Sep 17 00:00:00 2001 From: Tom Saeger Date: Thu, 29 Apr 2021 22:53:53 -0700 Subject: [PATCH 013/175] scripts/spelling.txt: add entries for recent discoveries Add a few entries for recent spelling fixes found. Opportunistically de-dupe: exeeds||exceeds Link: https://lore.kernel.org/lkml/31acb3239b7ab8989db0c9951e8740050aef0205.1616727528.git.tom.saeger@oracle.com/ Link: https://lore.kernel.org/lkml/fa193b3c9e346ff3fc157b54802c29b25f79c402.1615597995.git.tom.saeger@oracle.com/ Link: https://lkml.kernel.org/r/4a594a9e1536b1d9e5ba57f684c1e41457dd383b.1616861645.git.tom.saeger@oracle.com Signed-off-by: Tom Saeger Cc: Jens Axboe Cc: Colin Ian King Cc: Joe Perches Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- scripts/spelling.txt | 26 +++++++++++++++++++++++++- 1 file changed, 25 insertions(+), 1 deletion(-) diff --git a/scripts/spelling.txt b/scripts/spelling.txt index 2e3ba91a50720d..7beb4262f71973 100644 --- a/scripts/spelling.txt +++ b/scripts/spelling.txt @@ -84,6 +84,7 @@ againt||against agaist||against aggreataon||aggregation aggreation||aggregation +ajust||adjust albumns||albums alegorical||allegorical algined||aligned @@ -161,10 +162,13 @@ asign||assign asser||assert assertation||assertion assertting||asserting +assgined||assigned assiged||assigned assigment||assignment assigments||assignments assistent||assistant +assocaited||associated +assocating||associating assocation||association associcated||associated assotiated||associated @@ -177,9 +181,11 @@ asynchnous||asynchronous asynchromous||asynchronous asymetric||asymmetric asymmeric||asymmetric +atleast||at least atomatically||automatically atomicly||atomically atempt||attempt +atrributes||attributes attachement||attachment attatch||attach attched||attached @@ -315,6 +321,7 @@ comminucation||communication commited||committed commiting||committing committ||commit +commnunication||communication commoditiy||commodity comsume||consume comsumer||consumer @@ -349,6 +356,7 @@ condtion||condition conected||connected conector||connector configration||configuration +configred||configured configuartion||configuration configuation||configuration configued||configured @@ -402,6 +410,7 @@ cunter||counter curently||currently cylic||cyclic dafault||default +deactive||deactivate deafult||default deamon||daemon debouce||debounce @@ -417,6 +426,7 @@ deffered||deferred defferred||deferred definate||definite definately||definitely +definiation||definition defintion||definition defintions||definitions defualt||default @@ -571,8 +581,9 @@ errror||error estbalishment||establishment etsablishment||establishment etsbalishment||establishment +evalute||evaluate +evalutes||evaluates evalution||evaluation -exeeds||exceeds excecutable||executable exceded||exceeded exceds||exceeds @@ -696,6 +707,7 @@ hardare||hardware harware||hardware havind||having heirarchically||hierarchically +heirarchy||hierarchy helpfull||helpful heterogenous||heterogeneous hexdecimal||hexadecimal @@ -796,6 +808,7 @@ interanl||internal interchangable||interchangeable interferring||interfering interger||integer +intergrated||integrated intermittant||intermittent internel||internal interoprability||interoperability @@ -808,6 +821,7 @@ interrup||interrupt interrups||interrupts interruptted||interrupted interupted||interrupted +intiailized||initialized intial||initial intialisation||initialisation intialised||initialised @@ -1091,11 +1105,14 @@ preemptable||preemptible prefered||preferred prefferably||preferably prefitler||prefilter +preform||perform premption||preemption prepaired||prepared preperation||preparation preprare||prepare pressre||pressure +presuambly||presumably +previosuly||previously primative||primitive princliple||principle priorty||priority @@ -1265,6 +1282,7 @@ scarch||search schdule||schedule seach||search searchs||searches +secion||section secquence||sequence secund||second segement||segment @@ -1312,6 +1330,8 @@ singed||signed sleeped||slept sliped||slipped softwares||software +soley||solely +souce||source speach||speech specfic||specific specfield||specified @@ -1320,7 +1340,9 @@ specifc||specific specifed||specified specificatin||specification specificaton||specification +specificed||specified specifing||specifying +specifiy||specify specifiying||specifying speficied||specified speicify||specify @@ -1436,6 +1458,7 @@ timout||timeout tmis||this toogle||toggle torerable||tolerable +traget||target traking||tracking tramsmitted||transmitted tramsmit||transmit @@ -1558,6 +1581,7 @@ wiil||will wirte||write withing||within wnat||want +wont||won't workarould||workaround writeing||writing writting||writing From 21917bded72cf33bdf02a153f7b477ab186a52ee Mon Sep 17 00:00:00 2001 From: Wan Jiabing Date: Thu, 29 Apr 2021 22:53:56 -0700 Subject: [PATCH 014/175] scripts: a new script for checking duplicate struct declaration checkdeclares: find struct declared more than once. Inspired by checkincludes.pl. This script checks for duplicate struct declares. Note that this will not take into consideration macros, so you should run this only if you know you do have real dups and do not have them under #ifdef's. You could also just review the results. [akpm@linux-foundation.org: fix usage message, grammar] Link: https://lkml.kernel.org/r/20210401110943.1010796-1-wanjiabing@vivo.com Signed-off-by: Wan Jiabing Cc: Masahiro Yamada Cc: Joe Perches Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- scripts/checkdeclares.pl | 53 ++++++++++++++++++++++++++++++++++++++++ 1 file changed, 53 insertions(+) create mode 100644 scripts/checkdeclares.pl diff --git a/scripts/checkdeclares.pl b/scripts/checkdeclares.pl new file mode 100644 index 00000000000000..f6d551c84fc644 --- /dev/null +++ b/scripts/checkdeclares.pl @@ -0,0 +1,53 @@ +#!/usr/bin/env perl +# SPDX-License-Identifier: GPL-2.0 +# +# checkdeclares: find struct declared more than once +# +# Copyright 2021 Wan Jiabing +# Inspired by checkincludes.pl +# +# This script checks for duplicate struct declares. +# Note that this will not take into consideration macros so +# you should run this only if you know you do have real dups +# and do not have them under #ifdef's. +# You could also just review the results. + +use strict; + +sub usage { + print "Usage: checkdeclares.pl file1.h ...\n"; + print "Warns of struct declaration duplicates\n"; + exit 1; +} + +if ($#ARGV < 0) { + usage(); +} + +my $dup_counter = 0; + +foreach my $file (@ARGV) { + open(my $f, '<', $file) + or die "Cannot open $file: $!.\n"; + + my %declaredstructs = (); + + while (<$f>) { + if (m/^\s*struct\s*(\w*);$/o) { + ++$declaredstructs{$1}; + } + } + + close($f); + + foreach my $structname (keys %declaredstructs) { + if ($declaredstructs{$structname} > 1) { + print "$file: struct $structname is declared more than once.\n"; + ++$dup_counter; + } + } +} + +if ($dup_counter == 0) { + print "No duplicate struct declares found.\n"; +} From 91a8528e8a28c258a96ec8af4a30238f7c11ff81 Mon Sep 17 00:00:00 2001 From: Zhang Yunkai Date: Thu, 29 Apr 2021 22:53:59 -0700 Subject: [PATCH 015/175] arch/sh/include/asm/tlb.h: remove duplicate include 'asm-generic/tlb.h' included in 'asm/tlb.h' is duplicated. Link: https://lkml.kernel.org/r/20210304132020.196811-1-zhang.yunkai@zte.com.cn Signed-off-by: Zhang Yunkai Cc: Yoshinori Sato Cc: Rich Felker Cc: Zhang Yunkai Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/sh/include/asm/tlb.h | 10 ++-------- 1 file changed, 2 insertions(+), 8 deletions(-) diff --git a/arch/sh/include/asm/tlb.h b/arch/sh/include/asm/tlb.h index 360f713d009b02..aeb8915e925496 100644 --- a/arch/sh/include/asm/tlb.h +++ b/arch/sh/include/asm/tlb.h @@ -4,12 +4,11 @@ #ifndef __ASSEMBLY__ #include +#include #ifdef CONFIG_MMU #include -#include - #if defined(CONFIG_CPU_SH4) extern void tlb_wire_entry(struct vm_area_struct *, unsigned long, pte_t); extern void tlb_unwire_entry(void); @@ -24,12 +23,7 @@ static inline void tlb_unwire_entry(void) { BUG(); } -#endif - -#else /* CONFIG_MMU */ - -#include - +#endif /* CONFIG_CPU_SH4 */ #endif /* CONFIG_MMU */ #endif /* __ASSEMBLY__ */ #endif /* __ASM_SH_TLB_H */ From 1634852df7f0cc1223e454de2d1ad2786e0aa9f3 Mon Sep 17 00:00:00 2001 From: Yang Li Date: Thu, 29 Apr 2021 22:54:02 -0700 Subject: [PATCH 016/175] ocfs2: replace DEFINE_SIMPLE_ATTRIBUTE with DEFINE_DEBUGFS_ATTRIBUTE Fix the following coccicheck warning: fs/ocfs2/blockcheck.c:232:0-23: WARNING: blockcheck_fops should be defined with DEFINE_DEBUGFS_ATTRIBUTE Link: https://lkml.kernel.org/r/1614155230-57292-1-git-send-email-yang.lee@linux.alibaba.com Signed-off-by: Yang Li Reported-by: Abaci Robot Acked-by: Joseph Qi Cc: Mark Fasheh Cc: Joel Becker Cc: Junxiao Bi Cc: Changwei Ge Cc: Gang He Cc: Jun Piao Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/ocfs2/blockcheck.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/ocfs2/blockcheck.c b/fs/ocfs2/blockcheck.c index 6e07ddb0e3c01e..dabfef9c2bc0b9 100644 --- a/fs/ocfs2/blockcheck.c +++ b/fs/ocfs2/blockcheck.c @@ -229,7 +229,7 @@ static int blockcheck_u64_get(void *data, u64 *val) *val = *(u64 *)data; return 0; } -DEFINE_SIMPLE_ATTRIBUTE(blockcheck_fops, blockcheck_u64_get, NULL, "%llu\n"); +DEFINE_DEBUGFS_ATTRIBUTE(blockcheck_fops, blockcheck_u64_get, NULL, "%llu\n"); static void ocfs2_blockcheck_debug_remove(struct ocfs2_blockcheck_stats *stats) { From f9630ec9d9e6c31e5c17dda4cbca53c504604cce Mon Sep 17 00:00:00 2001 From: Joseph Qi Date: Thu, 29 Apr 2021 22:54:05 -0700 Subject: [PATCH 017/175] ocfs2: map flags directly in flags_to_o2dlm() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Use macro map_flag() is tricky and coccicheck outputs the following warning: fs/ocfs2/stack_o2cb.c:69:5-16: Unneeded variable: "o2dlm_flags" So map flags directly in flags_to_o2dlm() to make coccicheck happy. And remove BUG_ON() here as well to simplify code since it runs well a long time. Link: https://lkml.kernel.org/r/1616138664-35935-1-git-send-email-joseph.qi@linux.alibaba.com Signed-off-by: Joseph Qi Reviewed-by: Wengang Wang Cc: Mark Fasheh Cc: Joel Becker Cc: Junxiao Bi Cc: Changwei Ge Cc: Gang He Cc: Jun Piao Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/ocfs2/stack_o2cb.c | 36 ++++++++++++++++++------------------ 1 file changed, 18 insertions(+), 18 deletions(-) diff --git a/fs/ocfs2/stack_o2cb.c b/fs/ocfs2/stack_o2cb.c index dbf8b5735808b7..f70012038383df 100644 --- a/fs/ocfs2/stack_o2cb.c +++ b/fs/ocfs2/stack_o2cb.c @@ -59,31 +59,31 @@ static inline int mode_to_o2dlm(int mode) return mode; } -#define map_flag(_generic, _o2dlm) \ - if (flags & (_generic)) { \ - flags &= ~(_generic); \ - o2dlm_flags |= (_o2dlm); \ - } static int flags_to_o2dlm(u32 flags) { int o2dlm_flags = 0; - map_flag(DLM_LKF_NOQUEUE, LKM_NOQUEUE); - map_flag(DLM_LKF_CANCEL, LKM_CANCEL); - map_flag(DLM_LKF_CONVERT, LKM_CONVERT); - map_flag(DLM_LKF_VALBLK, LKM_VALBLK); - map_flag(DLM_LKF_IVVALBLK, LKM_INVVALBLK); - map_flag(DLM_LKF_ORPHAN, LKM_ORPHAN); - map_flag(DLM_LKF_FORCEUNLOCK, LKM_FORCE); - map_flag(DLM_LKF_TIMEOUT, LKM_TIMEOUT); - map_flag(DLM_LKF_LOCAL, LKM_LOCAL); - - /* map_flag() should have cleared every flag passed in */ - BUG_ON(flags != 0); + if (flags & DLM_LKF_NOQUEUE) + o2dlm_flags |= LKM_NOQUEUE; + if (flags & DLM_LKF_CANCEL) + o2dlm_flags |= LKM_CANCEL; + if (flags & DLM_LKF_CONVERT) + o2dlm_flags |= LKM_CONVERT; + if (flags & DLM_LKF_VALBLK) + o2dlm_flags |= LKM_VALBLK; + if (flags & DLM_LKF_IVVALBLK) + o2dlm_flags |= LKM_INVVALBLK; + if (flags & DLM_LKF_ORPHAN) + o2dlm_flags |= LKM_ORPHAN; + if (flags & DLM_LKF_FORCEUNLOCK) + o2dlm_flags |= LKM_FORCE; + if (flags & DLM_LKF_TIMEOUT) + o2dlm_flags |= LKM_TIMEOUT; + if (flags & DLM_LKF_LOCAL) + o2dlm_flags |= LKM_LOCAL; return o2dlm_flags; } -#undef map_flag /* * Map an o2dlm status to standard errno values. From f13604a2b9ffb5bcd8ecfb505804adb890080078 Mon Sep 17 00:00:00 2001 From: Bhaskar Chowdhury Date: Thu, 29 Apr 2021 22:54:08 -0700 Subject: [PATCH 018/175] ocfs2: fix a typo s/cluter/cluster/ Link: https://lkml.kernel.org/r/20210324072931.5056-1-unixbhaskar@gmail.com Signed-off-by: Bhaskar Chowdhury Acked-by: Joseph Qi Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/ocfs2/stackglue.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/ocfs2/stackglue.c b/fs/ocfs2/stackglue.c index a191094694c619..8d33ebc6b6fc32 100644 --- a/fs/ocfs2/stackglue.c +++ b/fs/ocfs2/stackglue.c @@ -731,7 +731,7 @@ static void __exit ocfs2_stack_glue_exit(void) } MODULE_AUTHOR("Oracle"); -MODULE_DESCRIPTION("ocfs2 cluter stack glue layer"); +MODULE_DESCRIPTION("ocfs2 cluster stack glue layer"); MODULE_LICENSE("GPL"); module_init(ocfs2_stack_glue_init); module_exit(ocfs2_stack_glue_exit); From ccf33ec4a7326066b544cdc6c6628a89a658dec8 Mon Sep 17 00:00:00 2001 From: Jiapeng Chong Date: Thu, 29 Apr 2021 22:54:11 -0700 Subject: [PATCH 019/175] ocfs2/dlm: remove unused function Fix the following clang warning: fs/ocfs2/dlm/dlmrecovery.c:129:20: warning: unused function 'dlm_reset_recovery' [-Wunused-function]. Link: https://lkml.kernel.org/r/1618382761-5784-1-git-send-email-jiapeng.chong@linux.alibaba.com Signed-off-by: Jiapeng Chong Reported-by: Abaci Robot Reviewed-by: Wengang Wang Acked-by: Joseph Qi Cc: Mark Fasheh Cc: Joel Becker Cc: Junxiao Bi Cc: Changwei Ge Cc: Gang He Cc: Jun Piao Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/ocfs2/dlm/dlmrecovery.c | 7 ------- 1 file changed, 7 deletions(-) diff --git a/fs/ocfs2/dlm/dlmrecovery.c b/fs/ocfs2/dlm/dlmrecovery.c index 4b566e88582f43..afc51736686c93 100644 --- a/fs/ocfs2/dlm/dlmrecovery.c +++ b/fs/ocfs2/dlm/dlmrecovery.c @@ -126,13 +126,6 @@ static inline void __dlm_reset_recovery(struct dlm_ctxt *dlm) dlm_set_reco_master(dlm, O2NM_INVALID_NODE_NUM); } -static inline void dlm_reset_recovery(struct dlm_ctxt *dlm) -{ - spin_lock(&dlm->spinlock); - __dlm_reset_recovery(dlm); - spin_unlock(&dlm->spinlock); -} - /* Worker function used during recovery. */ void dlm_dispatch_work(struct work_struct *work) { From 926ee00ea24320052b46745ef4b00d91c05bd03d Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Thu, 29 Apr 2021 22:54:15 -0700 Subject: [PATCH 020/175] kfifo: fix ternary sign extension bugs The intent with this code was to return negative error codes but instead it returns positives. The problem is how type promotion works with ternary operations. These functions return long, "ret" is an int and "copied" is a u32. The negative error code is first cast to u32 so it becomes a high positive and then cast to long where it's still a positive. We could fix this by declaring "ret" as a ssize_t but let's just get rid of the ternaries instead. Link: https://lkml.kernel.org/r/YIE+/cK1tBzSuQPU@mwanda Fixes: 5bf2b19320ec ("kfifo: add example files to the kernel sample directory") Signed-off-by: Dan Carpenter Cc: Stefani Seibold Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- samples/kfifo/bytestream-example.c | 8 ++++++-- samples/kfifo/inttype-example.c | 8 ++++++-- samples/kfifo/record-example.c | 8 ++++++-- 3 files changed, 18 insertions(+), 6 deletions(-) diff --git a/samples/kfifo/bytestream-example.c b/samples/kfifo/bytestream-example.c index c406f03ee5519f..5a90aa52787755 100644 --- a/samples/kfifo/bytestream-example.c +++ b/samples/kfifo/bytestream-example.c @@ -122,8 +122,10 @@ static ssize_t fifo_write(struct file *file, const char __user *buf, ret = kfifo_from_user(&test, buf, count, &copied); mutex_unlock(&write_lock); + if (ret) + return ret; - return ret ? ret : copied; + return copied; } static ssize_t fifo_read(struct file *file, char __user *buf, @@ -138,8 +140,10 @@ static ssize_t fifo_read(struct file *file, char __user *buf, ret = kfifo_to_user(&test, buf, count, &copied); mutex_unlock(&read_lock); + if (ret) + return ret; - return ret ? ret : copied; + return copied; } static const struct proc_ops fifo_proc_ops = { diff --git a/samples/kfifo/inttype-example.c b/samples/kfifo/inttype-example.c index 78977fc4a23f74..e5403d8c971a5b 100644 --- a/samples/kfifo/inttype-example.c +++ b/samples/kfifo/inttype-example.c @@ -115,8 +115,10 @@ static ssize_t fifo_write(struct file *file, const char __user *buf, ret = kfifo_from_user(&test, buf, count, &copied); mutex_unlock(&write_lock); + if (ret) + return ret; - return ret ? ret : copied; + return copied; } static ssize_t fifo_read(struct file *file, char __user *buf, @@ -131,8 +133,10 @@ static ssize_t fifo_read(struct file *file, char __user *buf, ret = kfifo_to_user(&test, buf, count, &copied); mutex_unlock(&read_lock); + if (ret) + return ret; - return ret ? ret : copied; + return copied; } static const struct proc_ops fifo_proc_ops = { diff --git a/samples/kfifo/record-example.c b/samples/kfifo/record-example.c index c507998a2617cc..f64f3d62d6c2a4 100644 --- a/samples/kfifo/record-example.c +++ b/samples/kfifo/record-example.c @@ -129,8 +129,10 @@ static ssize_t fifo_write(struct file *file, const char __user *buf, ret = kfifo_from_user(&test, buf, count, &copied); mutex_unlock(&write_lock); + if (ret) + return ret; - return ret ? ret : copied; + return copied; } static ssize_t fifo_read(struct file *file, char __user *buf, @@ -145,8 +147,10 @@ static ssize_t fifo_read(struct file *file, char __user *buf, ret = kfifo_to_user(&test, buf, count, &copied); mutex_unlock(&read_lock); + if (ret) + return ret; - return ret ? ret : copied; + return copied; } static const struct proc_ops fifo_proc_ops = { From 21ae3ad1632cbe6f5e998222ffc5668aff36b79c Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Thu, 29 Apr 2021 22:54:17 -0700 Subject: [PATCH 021/175] vfs: fs_parser: clean up kernel-doc warnings Fix kernel-doc notation function arguments to eliminate two kernel-doc warnings: fs_parser.c:322: warning: Excess function parameter 'name' description in 'validate_constant_table' fs_parser.c:367: warning: Function parameter or member 'name' not described in 'fs_validate_description' Link: https://lkml.kernel.org/r/20210407033743.9701-1-rdunlap@infradead.org Signed-off-by: Randy Dunlap Cc: Alexander Viro Cc: David Howells Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/fs_parser.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/fs_parser.c b/fs/fs_parser.c index 68b0148f4bb819..980d44fd3a3639 100644 --- a/fs/fs_parser.c +++ b/fs/fs_parser.c @@ -310,7 +310,6 @@ EXPORT_SYMBOL(fs_param_is_path); #ifdef CONFIG_VALIDATE_FS_PARSER /** * validate_constant_table - Validate a constant table - * @name: Name to use in reporting * @tbl: The constant table to validate. * @tbl_size: The size of the table. * @low: The lowest permissible value. @@ -360,6 +359,7 @@ bool validate_constant_table(const struct constant_table *tbl, size_t tbl_size, /** * fs_validate_description - Validate a parameter description + * @name: The parameter name to search for. * @desc: The parameter description to validate. */ bool fs_validate_description(const char *name, From 7c0012f522c802d25be102bafe54f333168e6119 Mon Sep 17 00:00:00 2001 From: Petr Mladek Date: Thu, 29 Apr 2021 22:54:20 -0700 Subject: [PATCH 022/175] watchdog: rename __touch_watchdog() to a better descriptive name Patch series "watchdog/softlockup: Report overall time and some cleanup", v2. I dug deep into the softlockup watchdog history when time permitted this year. And reworked the patchset that fixed timestamps and cleaned up the code[2]. I split it into very small steps and did even more code clean up. The result looks quite strightforward and I am pretty confident with the changes. [1] v2: https://lore.kernel.org/r/20201210160038.31441-1-pmladek@suse.com [2] v1: https://lore.kernel.org/r/20191024114928.15377-1-pmladek@suse.com This patch (of 6): There are many touch_*watchdog() functions. They are called in situations where the watchdog could report false positives or create unnecessary noise. For example, when CPU is entering idle mode, a virtual machine is stopped, or a lot of messages are printed in the atomic context. These functions set SOFTLOCKUP_RESET instead of a real timestamp. It allows to call them even in a context where jiffies might be outdated. For example, in an atomic context. The real timestamp is set by __touch_watchdog() that is called from the watchdog timer callback. Rename this callback to update_touch_ts(). It better describes the effect and clearly distinguish is from the other touch_*watchdog() functions. Another motivation is that two timestamps are going to be used. One will be used for the total softlockup time. The other will be used to measure time since the last report. The new function name will help to distinguish which timestamp is being updated. Link: https://lkml.kernel.org/r/20210311122130.6788-1-pmladek@suse.com Link: https://lkml.kernel.org/r/20210311122130.6788-2-pmladek@suse.com Signed-off-by: Petr Mladek Cc: Thomas Gleixner Cc: Ingo Molnar Cc: Peter Zijlstra Cc: Laurence Oberman Cc: Vincent Whitchurch Cc: Michal Hocko Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/watchdog.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/kernel/watchdog.c b/kernel/watchdog.c index 107bc38b19450a..8440e62bfec47d 100644 --- a/kernel/watchdog.c +++ b/kernel/watchdog.c @@ -236,7 +236,7 @@ static void set_sample_period(void) } /* Commands for resetting the watchdog */ -static void __touch_watchdog(void) +static void update_touch_ts(void) { __this_cpu_write(watchdog_touch_ts, get_timestamp()); } @@ -332,7 +332,7 @@ static DEFINE_PER_CPU(struct cpu_stop_work, softlockup_stop_work); */ static int softlockup_fn(void *data) { - __touch_watchdog(); + update_touch_ts(); complete(this_cpu_ptr(&softlockup_completion)); return 0; @@ -375,7 +375,7 @@ static enum hrtimer_restart watchdog_timer_fn(struct hrtimer *hrtimer) /* Clear the guest paused flag on watchdog reset */ kvm_check_and_clear_guest_paused(); - __touch_watchdog(); + update_touch_ts(); return HRTIMER_RESTART; } @@ -461,7 +461,7 @@ static void watchdog_enable(unsigned int cpu) HRTIMER_MODE_REL_PINNED_HARD); /* Initialize timestamp */ - __touch_watchdog(); + update_touch_ts(); /* Enable the perf event */ if (watchdog_enabled & NMI_WATCHDOG_ENABLED) watchdog_nmi_enable(cpu); From c9ad17c991492f4390f42598f6ab0531f87eed07 Mon Sep 17 00:00:00 2001 From: Petr Mladek Date: Thu, 29 Apr 2021 22:54:23 -0700 Subject: [PATCH 023/175] watchdog: explicitly update timestamp when reporting softlockup The softlockup situation might stay for a long time or even forever. When it happens, the softlockup debug messages are printed in regular intervals defined by get_softlockup_thresh(). There is a mystery. The repeated message is printed after the full interval that is defined by get_softlockup_thresh(). But the timer callback is called more often as defined by sample_period. The code looks like the soflockup should get reported in every sample_period when it was once behind the thresh. It works only by chance. The watchdog is touched when printing the stall report, for example, in printk_stack_address(). Make the behavior clear and predictable by explicitly updating the timestamp in watchdog_timer_fn() when the report gets printed. Link: https://lkml.kernel.org/r/20210311122130.6788-3-pmladek@suse.com Signed-off-by: Petr Mladek Cc: Ingo Molnar Cc: Laurence Oberman Cc: Michal Hocko Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: Vincent Whitchurch Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/watchdog.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/kernel/watchdog.c b/kernel/watchdog.c index 8440e62bfec47d..8efd2a8d9f1036 100644 --- a/kernel/watchdog.c +++ b/kernel/watchdog.c @@ -410,6 +410,9 @@ static enum hrtimer_restart watchdog_timer_fn(struct hrtimer *hrtimer) } } + /* Start period for the next softlockup warning. */ + update_touch_ts(); + pr_emerg("BUG: soft lockup - CPU#%d stuck for %us! [%s:%d]\n", smp_processor_id(), duration, current->comm, task_pid_nr(current)); From fef06efc2ebaa94c8aee299b863e870467dbab8d Mon Sep 17 00:00:00 2001 From: Petr Mladek Date: Thu, 29 Apr 2021 22:54:26 -0700 Subject: [PATCH 024/175] watchdog/softlockup: report the overall time of softlockups The softlockup detector currently shows the time spent since the last report. As a result it is not clear whether a CPU is infinitely hogged by a single task or if it is a repeated event. The situation can be simulated with a simply busy loop: while (true) cpu_relax(); The softlockup detector produces: [ 168.277520] watchdog: BUG: soft lockup - CPU#1 stuck for 22s! [cat:4865] [ 196.277604] watchdog: BUG: soft lockup - CPU#1 stuck for 22s! [cat:4865] [ 236.277522] watchdog: BUG: soft lockup - CPU#1 stuck for 23s! [cat:4865] But it should be, something like: [ 480.372418] watchdog: BUG: soft lockup - CPU#2 stuck for 26s! [cat:4943] [ 508.372359] watchdog: BUG: soft lockup - CPU#2 stuck for 52s! [cat:4943] [ 548.372359] watchdog: BUG: soft lockup - CPU#2 stuck for 89s! [cat:4943] [ 576.372351] watchdog: BUG: soft lockup - CPU#2 stuck for 115s! [cat:4943] For the better output, add an additional timestamp of the last report. Only this timestamp is reset when the watchdog is intentionally touched from slow code paths or when printing the report. Link: https://lkml.kernel.org/r/20210311122130.6788-4-pmladek@suse.com Signed-off-by: Petr Mladek Cc: Ingo Molnar Cc: Laurence Oberman Cc: Michal Hocko Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: Vincent Whitchurch Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/watchdog.c | 40 ++++++++++++++++++++++++++++------------ 1 file changed, 28 insertions(+), 12 deletions(-) diff --git a/kernel/watchdog.c b/kernel/watchdog.c index 8efd2a8d9f1036..6bc5113d3d7498 100644 --- a/kernel/watchdog.c +++ b/kernel/watchdog.c @@ -154,7 +154,11 @@ static void lockup_detector_update_enable(void) #ifdef CONFIG_SOFTLOCKUP_DETECTOR -#define SOFTLOCKUP_RESET ULONG_MAX +/* + * Delay the soflockup report when running a known slow code. + * It does _not_ affect the timestamp of the last successdul reschedule. + */ +#define SOFTLOCKUP_DELAY_REPORT ULONG_MAX #ifdef CONFIG_SMP int __read_mostly sysctl_softlockup_all_cpu_backtrace; @@ -169,7 +173,10 @@ unsigned int __read_mostly softlockup_panic = static bool softlockup_initialized __read_mostly; static u64 __read_mostly sample_period; +/* Timestamp taken after the last successful reschedule. */ static DEFINE_PER_CPU(unsigned long, watchdog_touch_ts); +/* Timestamp of the last softlockup report. */ +static DEFINE_PER_CPU(unsigned long, watchdog_report_ts); static DEFINE_PER_CPU(struct hrtimer, watchdog_hrtimer); static DEFINE_PER_CPU(bool, softlockup_touch_sync); static DEFINE_PER_CPU(bool, soft_watchdog_warn); @@ -235,10 +242,16 @@ static void set_sample_period(void) watchdog_update_hrtimer_threshold(sample_period); } +static void update_report_ts(void) +{ + __this_cpu_write(watchdog_report_ts, get_timestamp()); +} + /* Commands for resetting the watchdog */ static void update_touch_ts(void) { __this_cpu_write(watchdog_touch_ts, get_timestamp()); + update_report_ts(); } /** @@ -252,10 +265,10 @@ static void update_touch_ts(void) notrace void touch_softlockup_watchdog_sched(void) { /* - * Preemption can be enabled. It doesn't matter which CPU's timestamp - * gets zeroed here, so use the raw_ operation. + * Preemption can be enabled. It doesn't matter which CPU's watchdog + * report period gets restarted here, so use the raw_ operation. */ - raw_cpu_write(watchdog_touch_ts, SOFTLOCKUP_RESET); + raw_cpu_write(watchdog_report_ts, SOFTLOCKUP_DELAY_REPORT); } notrace void touch_softlockup_watchdog(void) @@ -279,7 +292,7 @@ void touch_all_softlockup_watchdogs(void) * the softlockup check. */ for_each_cpu(cpu, &watchdog_allowed_mask) { - per_cpu(watchdog_touch_ts, cpu) = SOFTLOCKUP_RESET; + per_cpu(watchdog_report_ts, cpu) = SOFTLOCKUP_DELAY_REPORT; wq_watchdog_touch(cpu); } } @@ -287,16 +300,16 @@ void touch_all_softlockup_watchdogs(void) void touch_softlockup_watchdog_sync(void) { __this_cpu_write(softlockup_touch_sync, true); - __this_cpu_write(watchdog_touch_ts, SOFTLOCKUP_RESET); + __this_cpu_write(watchdog_report_ts, SOFTLOCKUP_DELAY_REPORT); } -static int is_softlockup(unsigned long touch_ts) +static int is_softlockup(unsigned long touch_ts, unsigned long period_ts) { unsigned long now = get_timestamp(); if ((watchdog_enabled & SOFT_WATCHDOG_ENABLED) && watchdog_thresh){ /* Warn about unreasonable delays. */ - if (time_after(now, touch_ts + get_softlockup_thresh())) + if (time_after(now, period_ts + get_softlockup_thresh())) return now - touch_ts; } return 0; @@ -342,6 +355,7 @@ static int softlockup_fn(void *data) static enum hrtimer_restart watchdog_timer_fn(struct hrtimer *hrtimer) { unsigned long touch_ts = __this_cpu_read(watchdog_touch_ts); + unsigned long period_ts = __this_cpu_read(watchdog_report_ts); struct pt_regs *regs = get_irq_regs(); int duration; int softlockup_all_cpu_backtrace = sysctl_softlockup_all_cpu_backtrace; @@ -363,7 +377,8 @@ static enum hrtimer_restart watchdog_timer_fn(struct hrtimer *hrtimer) /* .. and repeat */ hrtimer_forward_now(hrtimer, ns_to_ktime(sample_period)); - if (touch_ts == SOFTLOCKUP_RESET) { + /* Reset the interval when touched externally by a known slow code. */ + if (period_ts == SOFTLOCKUP_DELAY_REPORT) { if (unlikely(__this_cpu_read(softlockup_touch_sync))) { /* * If the time stamp was touched atomically @@ -375,7 +390,8 @@ static enum hrtimer_restart watchdog_timer_fn(struct hrtimer *hrtimer) /* Clear the guest paused flag on watchdog reset */ kvm_check_and_clear_guest_paused(); - update_touch_ts(); + update_report_ts(); + return HRTIMER_RESTART; } @@ -385,7 +401,7 @@ static enum hrtimer_restart watchdog_timer_fn(struct hrtimer *hrtimer) * indicate it is getting cpu time. If it hasn't then * this is a good indication some task is hogging the cpu */ - duration = is_softlockup(touch_ts); + duration = is_softlockup(touch_ts, period_ts); if (unlikely(duration)) { /* * If a virtual machine is stopped by the host it can look to @@ -411,7 +427,7 @@ static enum hrtimer_restart watchdog_timer_fn(struct hrtimer *hrtimer) } /* Start period for the next softlockup warning. */ - update_touch_ts(); + update_report_ts(); pr_emerg("BUG: soft lockup - CPU#%d stuck for %us! [%s:%d]\n", smp_processor_id(), duration, From 1bc503cb4a2638fb1c57801a7796aca57845ce63 Mon Sep 17 00:00:00 2001 From: Petr Mladek Date: Thu, 29 Apr 2021 22:54:30 -0700 Subject: [PATCH 025/175] watchdog/softlockup: remove logic that tried to prevent repeated reports The softlockup detector does some gymnastic with the variable soft_watchdog_warn. It was added by the commit 58687acba59266735ad ("lockup_detector: Combine nmi_watchdog and softlockup detector"). The purpose is not completely clear. There are the following clues. They describe the situation how it looked after the above mentioned commit: 1. The variable was checked with a comment "only warn once". 2. The variable was set when softlockup was reported. It was cleared only when the CPU was not longer in the softlockup state. 3. watchdog_touch_ts was not explicitly updated when the softlockup was reported. Without this variable, the report would normally be printed again during every following watchdog_timer_fn() invocation. The logic has got even more tangled up by the commit ed235875e2ca98 ("kernel/watchdog.c: print traces for all cpus on lockup detection"). After this commit, soft_watchdog_warn is set only when softlockup_all_cpu_backtrace is enabled. But multiple reports from all CPUs are prevented by a new variable soft_lockup_nmi_warn. Conclusion: The variable probably never worked as intended. In each case, it has not worked last many years because the softlockup was reported repeatedly after the full period defined by watchdog_thresh. The reason is that watchdog gets touched in many known slow paths, for example, in printk_stack_address(). This code is called also when printing the softlockup report. It means that the watchdog timestamp gets updated after each report. Solution: Simply remove the logic. People want the periodic report anyway. Link: https://lkml.kernel.org/r/20210311122130.6788-5-pmladek@suse.com Signed-off-by: Petr Mladek Cc: Ingo Molnar Cc: Laurence Oberman Cc: Michal Hocko Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: Vincent Whitchurch Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/watchdog.c | 14 ++------------ 1 file changed, 2 insertions(+), 12 deletions(-) diff --git a/kernel/watchdog.c b/kernel/watchdog.c index 6bc5113d3d7498..b8d4182d14ccb4 100644 --- a/kernel/watchdog.c +++ b/kernel/watchdog.c @@ -179,7 +179,6 @@ static DEFINE_PER_CPU(unsigned long, watchdog_touch_ts); static DEFINE_PER_CPU(unsigned long, watchdog_report_ts); static DEFINE_PER_CPU(struct hrtimer, watchdog_hrtimer); static DEFINE_PER_CPU(bool, softlockup_touch_sync); -static DEFINE_PER_CPU(bool, soft_watchdog_warn); static DEFINE_PER_CPU(unsigned long, hrtimer_interrupts); static DEFINE_PER_CPU(unsigned long, hrtimer_interrupts_saved); static unsigned long soft_lockup_nmi_warn; @@ -411,19 +410,12 @@ static enum hrtimer_restart watchdog_timer_fn(struct hrtimer *hrtimer) if (kvm_check_and_clear_guest_paused()) return HRTIMER_RESTART; - /* only warn once */ - if (__this_cpu_read(soft_watchdog_warn) == true) - return HRTIMER_RESTART; - if (softlockup_all_cpu_backtrace) { /* Prevent multiple soft-lockup reports if one cpu is already * engaged in dumping cpu back traces */ - if (test_and_set_bit(0, &soft_lockup_nmi_warn)) { - /* Someone else will report us. Let's give up */ - __this_cpu_write(soft_watchdog_warn, true); + if (test_and_set_bit(0, &soft_lockup_nmi_warn)) return HRTIMER_RESTART; - } } /* Start period for the next softlockup warning. */ @@ -453,9 +445,7 @@ static enum hrtimer_restart watchdog_timer_fn(struct hrtimer *hrtimer) add_taint(TAINT_SOFTLOCKUP, LOCKDEP_STILL_OK); if (softlockup_panic) panic("softlockup: hung tasks"); - __this_cpu_write(soft_watchdog_warn, true); - } else - __this_cpu_write(soft_watchdog_warn, false); + } return HRTIMER_RESTART; } From 9f113bf760ca90d709f8f89a733d10abb1f04a83 Mon Sep 17 00:00:00 2001 From: Petr Mladek Date: Thu, 29 Apr 2021 22:54:33 -0700 Subject: [PATCH 026/175] watchdog: fix barriers when printing backtraces from all CPUs Any parallel softlockup reports are skipped when one CPU is already printing backtraces from all CPUs. The exclusive rights are synchronized using one bit in soft_lockup_nmi_warn. There is also one memory barrier that does not make much sense. Use two barriers on the right location to prevent mixing two reports. [pmladek@suse.com: use bit lock operations to prevent multiple soft-lockup reports] Link: https://lkml.kernel.org/r/YFSVsLGVWMXTvlbk@alley Link: https://lkml.kernel.org/r/20210311122130.6788-6-pmladek@suse.com Signed-off-by: Petr Mladek Acked-by: Peter Zijlstra (Intel) Cc: Ingo Molnar Cc: Laurence Oberman Cc: Michal Hocko Cc: Thomas Gleixner Cc: Vincent Whitchurch Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/watchdog.c | 17 ++++++----------- 1 file changed, 6 insertions(+), 11 deletions(-) diff --git a/kernel/watchdog.c b/kernel/watchdog.c index b8d4182d14ccb4..8cf0678378d297 100644 --- a/kernel/watchdog.c +++ b/kernel/watchdog.c @@ -410,11 +410,12 @@ static enum hrtimer_restart watchdog_timer_fn(struct hrtimer *hrtimer) if (kvm_check_and_clear_guest_paused()) return HRTIMER_RESTART; + /* + * Prevent multiple soft-lockup reports if one cpu is already + * engaged in dumping all cpu back traces. + */ if (softlockup_all_cpu_backtrace) { - /* Prevent multiple soft-lockup reports if one cpu is already - * engaged in dumping cpu back traces - */ - if (test_and_set_bit(0, &soft_lockup_nmi_warn)) + if (test_and_set_bit_lock(0, &soft_lockup_nmi_warn)) return HRTIMER_RESTART; } @@ -432,14 +433,8 @@ static enum hrtimer_restart watchdog_timer_fn(struct hrtimer *hrtimer) dump_stack(); if (softlockup_all_cpu_backtrace) { - /* Avoid generating two back traces for current - * given that one is already made above - */ trigger_allbutself_cpu_backtrace(); - - clear_bit(0, &soft_lockup_nmi_warn); - /* Barrier to sync with other cpus */ - smp_mb__after_atomic(); + clear_bit_unlock(0, &soft_lockup_nmi_warn); } add_taint(TAINT_SOFTLOCKUP, LOCKDEP_STILL_OK); From 9bf3bc949f8aeefeacea4b1198db833b722a8e27 Mon Sep 17 00:00:00 2001 From: Petr Mladek Date: Thu, 29 Apr 2021 22:54:36 -0700 Subject: [PATCH 027/175] watchdog: cleanup handling of false positives Commit d6ad3e286d2c ("softlockup: Add sched_clock_tick() to avoid kernel warning on kgdb resume") introduced touch_softlockup_watchdog_sync(). It solved a problem when the watchdog was touched in an atomic context, the timer callback was proceed right after releasing interrupts, and the local clock has not been updated yet. In this case, sched_clock_tick() was called in watchdog_timer_fn() before updating the timer. So far so good. Later commit 5d1c0f4a80a6 ("watchdog: add check for suspended vm in softlockup detector") added two kvm_check_and_clear_guest_paused() calls. They touch the watchdog when the guest has been sleeping. The code makes my head spin around. Scenario 1: + guest did sleep: + PVCLOCK_GUEST_STOPPED is set + 1st watchdog_timer_fn() invocation: + the watchdog is not touched yet + is_softlockup() returns too big delay + kvm_check_and_clear_guest_paused(): + clear PVCLOCK_GUEST_STOPPED + call touch_softlockup_watchdog_sync() + set SOFTLOCKUP_DELAY_REPORT + set softlockup_touch_sync + return from the timer callback + 2nd watchdog_timer_fn() invocation: + call sched_clock_tick() even though it is not needed. The timer callback was invoked again only because the clock has already been updated in the meantime. + call kvm_check_and_clear_guest_paused() that does nothing because PVCLOCK_GUEST_STOPPED has been cleared already. + call update_report_ts() and return. This is fine. Except that sched_clock_tick() might allow to set it already during the 1st invocation. Scenario 2: + guest did sleep + 1st watchdog_timer_fn() invocation + same as in 1st scenario + guest did sleep again: + set PVCLOCK_GUEST_STOPPED again + 2nd watchdog_timer_fn() invocation + SOFTLOCKUP_DELAY_REPORT is set from 1st invocation + call sched_clock_tick() + call kvm_check_and_clear_guest_paused() + clear PVCLOCK_GUEST_STOPPED + call touch_softlockup_watchdog_sync() + set SOFTLOCKUP_DELAY_REPORT + set softlockup_touch_sync + call update_report_ts() (set real timestamp immediately) + return from the timer callback + 3rd watchdog_timer_fn() invocation + timestamp is set from 2nd invocation + softlockup_touch_sync is set but not checked because the real timestamp is already set Make the code more straightforward: 1. Always call kvm_check_and_clear_guest_paused() at the very beginning to handle PVCLOCK_GUEST_STOPPED. It touches the watchdog when the quest did sleep. 2. Handle the situation when the watchdog has been touched (SOFTLOCKUP_DELAY_REPORT is set). Call sched_clock_tick() when touch_*sync() variant was used. It makes sure that the timestamp will be up to date even when it has been touched in atomic context or quest did sleep. As a result, kvm_check_and_clear_guest_paused() is called on a single location. And the right timestamp is always set when returning from the timer callback. Link: https://lkml.kernel.org/r/20210311122130.6788-7-pmladek@suse.com Signed-off-by: Petr Mladek Cc: Ingo Molnar Cc: Laurence Oberman Cc: Michal Hocko Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: Vincent Whitchurch Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/watchdog.c | 20 ++++++++------------ 1 file changed, 8 insertions(+), 12 deletions(-) diff --git a/kernel/watchdog.c b/kernel/watchdog.c index 8cf0678378d297..7c397907d0e950 100644 --- a/kernel/watchdog.c +++ b/kernel/watchdog.c @@ -376,7 +376,14 @@ static enum hrtimer_restart watchdog_timer_fn(struct hrtimer *hrtimer) /* .. and repeat */ hrtimer_forward_now(hrtimer, ns_to_ktime(sample_period)); - /* Reset the interval when touched externally by a known slow code. */ + /* + * If a virtual machine is stopped by the host it can look to + * the watchdog like a soft lockup. Check to see if the host + * stopped the vm before we process the timestamps. + */ + kvm_check_and_clear_guest_paused(); + + /* Reset the interval when touched by known problematic code. */ if (period_ts == SOFTLOCKUP_DELAY_REPORT) { if (unlikely(__this_cpu_read(softlockup_touch_sync))) { /* @@ -387,10 +394,7 @@ static enum hrtimer_restart watchdog_timer_fn(struct hrtimer *hrtimer) sched_clock_tick(); } - /* Clear the guest paused flag on watchdog reset */ - kvm_check_and_clear_guest_paused(); update_report_ts(); - return HRTIMER_RESTART; } @@ -402,14 +406,6 @@ static enum hrtimer_restart watchdog_timer_fn(struct hrtimer *hrtimer) */ duration = is_softlockup(touch_ts, period_ts); if (unlikely(duration)) { - /* - * If a virtual machine is stopped by the host it can look to - * the watchdog like a soft lockup, check to see if the host - * stopped the vm before we issue the warning - */ - if (kvm_check_and_clear_guest_paused()) - return HRTIMER_RESTART; - /* * Prevent multiple soft-lockup reports if one cpu is already * engaged in dumping all cpu back traces. From 82edd9d52e6dda7cd12047969ae8d357652e2e57 Mon Sep 17 00:00:00 2001 From: Rafael Aquini Date: Thu, 29 Apr 2021 22:54:39 -0700 Subject: [PATCH 028/175] mm/slab_common: provide "slab_merge" option for !IS_ENABLED(CONFIG_SLAB_MERGE_DEFAULT) builds This is a minor addition to the allocator setup options to provide a simple way to on demand enable back cache merging for builds that by default run with CONFIG_SLAB_MERGE_DEFAULT not set. Link: https://lkml.kernel.org/r/20210319194506.200159-1-aquini@redhat.com Signed-off-by: Rafael Aquini Cc: Christoph Lameter Cc: Pekka Enberg Cc: David Rientjes Cc: Joonsoo Kim Cc: Vlastimil Babka Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- Documentation/admin-guide/kernel-parameters.txt | 7 +++++++ mm/slab_common.c | 8 ++++++++ 2 files changed, 15 insertions(+) diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt index 5c949c12ed0725..1c0a3cf6fcc90e 100644 --- a/Documentation/admin-guide/kernel-parameters.txt +++ b/Documentation/admin-guide/kernel-parameters.txt @@ -4996,6 +4996,10 @@ slram= [HW,MTD] + slab_merge [MM] + Enable merging of slabs with similar size when the + kernel is built without CONFIG_SLAB_MERGE_DEFAULT. + slab_nomerge [MM] Disable merging of slabs with similar size. May be necessary if there is some reason to distinguish @@ -5043,6 +5047,9 @@ lower than slub_max_order. For more information see Documentation/vm/slub.rst. + slub_merge [MM, SLUB] + Same with slab_merge. + slub_nomerge [MM, SLUB] Same with slab_nomerge. This is supported for legacy. See slab_nomerge for more information. diff --git a/mm/slab_common.c b/mm/slab_common.c index 4c6107e39f9a99..f8833d3e5d47e1 100644 --- a/mm/slab_common.c +++ b/mm/slab_common.c @@ -71,11 +71,19 @@ static int __init setup_slab_nomerge(char *str) return 1; } +static int __init setup_slab_merge(char *str) +{ + slab_nomerge = false; + return 1; +} + #ifdef CONFIG_SLUB __setup_param("slub_nomerge", slub_nomerge, setup_slab_nomerge, 0); +__setup_param("slub_merge", slub_merge, setup_slab_merge, 0); #endif __setup("slab_nomerge", setup_slab_nomerge); +__setup("slab_merge", setup_slab_merge); /* * Determine the size of a slab object From 1f0723a4c0df36cbdffc6fac82cd3c5d57e06d66 Mon Sep 17 00:00:00 2001 From: Vlastimil Babka Date: Thu, 29 Apr 2021 22:54:42 -0700 Subject: [PATCH 029/175] mm, slub: enable slub_debug static key when creating cache with explicit debug flags Commit ca0cab65ea2b ("mm, slub: introduce static key for slub_debug()") introduced a static key to optimize the case where no debugging is enabled for any cache. The static key is enabled when slub_debug boot parameter is passed, or CONFIG_SLUB_DEBUG_ON enabled. However, some caches might be created with one or more debugging flags explicitly passed to kmem_cache_create(), and the commit missed this. Thus the debugging functionality would not be actually performed for these caches unless the static key gets enabled by boot param or config. This patch fixes it by checking for debugging flags passed to kmem_cache_create() and enabling the static key accordingly. Note such explicit debugging flags should not be used outside of debugging and testing as they will now enable the static key globally. btrfs_init_cachep() creates a cache with SLAB_RED_ZONE but that's a mistake that's being corrected [1]. rcu_torture_stats() creates a cache with SLAB_STORE_USER, but that is a testing module so it's OK and will start working as intended after this patch. Also note that in case of backports to kernels before v5.12 that don't have 59450bbc12be ("mm, slab, slub: stop taking cpu hotplug lock"), static_branch_enable_cpuslocked() should be used. [1] https://lore.kernel.org/linux-btrfs/20210315141824.26099-1-dsterba@suse.com/ Link: https://lkml.kernel.org/r/20210315153415.24404-1-vbabka@suse.cz Fixes: ca0cab65ea2b ("mm, slub: introduce static key for slub_debug()") Signed-off-by: Vlastimil Babka Reported-by: Oliver Glitta Acked-by: David Rientjes Cc: Christoph Lameter Cc: Pekka Enberg Cc: Joonsoo Kim Cc: "Paul E. McKenney" Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/slub.c | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/mm/slub.c b/mm/slub.c index 722f95e1ea0b16..b1f31f7b81bba9 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -3828,6 +3828,15 @@ static int calculate_sizes(struct kmem_cache *s, int forced_order) static int kmem_cache_open(struct kmem_cache *s, slab_flags_t flags) { +#ifdef CONFIG_SLUB_DEBUG + /* + * If no slub_debug was enabled globally, the static key is not yet + * enabled by setup_slub_debug(). Enable it if the cache is being + * created with any of the debugging flags passed explicitly. + */ + if (flags & SLAB_DEBUG_FLAGS) + static_branch_enable(&slub_debug_enabled); +#endif s->flags = kmem_cache_flags(s->size, flags, s->name); #ifdef CONFIG_SLAB_FREELIST_HARDENED s->random = get_random_long(); From dc84207d00bef4a5d826e68bc0a310327b464fcf Mon Sep 17 00:00:00 2001 From: Bhaskar Chowdhury Date: Thu, 29 Apr 2021 22:54:51 -0700 Subject: [PATCH 030/175] mm/slub.c: trivial typo fixes s/operatios/operations/ s/Mininum/Minimum/ s/mininum/minimum/ ......two different places. Link: https://lkml.kernel.org/r/20210325044940.14516-1-unixbhaskar@gmail.com Signed-off-by: Bhaskar Chowdhury Acked-by: Randy Dunlap Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/slub.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/mm/slub.c b/mm/slub.c index b1f31f7b81bba9..a178c738fc92a1 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -3,7 +3,7 @@ * SLUB: A slab allocator that limits cache line use instead of queuing * objects in per cpu and per node lists. * - * The allocator synchronizes using per slab locks or atomic operatios + * The allocator synchronizes using per slab locks or atomic operations * and only uses a centralized lock to manage a pool of partial slabs. * * (C) 2007 SGI, Christoph Lameter @@ -160,7 +160,7 @@ static inline bool kmem_cache_has_cpu_partial(struct kmem_cache *s) #undef SLUB_DEBUG_CMPXCHG /* - * Mininum number of partial slabs. These will be left on the partial + * Minimum number of partial slabs. These will be left on the partial * lists even if they are empty. kmem_cache_shrink may reclaim them. */ #define MIN_PARTIAL 5 @@ -833,7 +833,7 @@ static int check_bytes_and_report(struct kmem_cache *s, struct page *page, * * A. Free pointer (if we cannot overwrite object on free) * B. Tracking data for SLAB_STORE_USER - * C. Padding to reach required alignment boundary or at mininum + * C. Padding to reach required alignment boundary or at minimum * one word if debugging is on to be able to detect writes * before the word boundary. * @@ -3422,7 +3422,7 @@ static unsigned int slub_min_objects; * * Higher order allocations also allow the placement of more objects in a * slab and thereby reduce object handling overhead. If the user has - * requested a higher mininum order then we start with that one instead of + * requested a higher minimum order then we start with that one instead of * the smallest order which will fit the object. */ static inline unsigned int slab_order(unsigned int size, From 0b5121ef85102edc936b199fb239a1f8cce48018 Mon Sep 17 00:00:00 2001 From: Bhaskar Chowdhury Date: Thu, 29 Apr 2021 22:54:54 -0700 Subject: [PATCH 031/175] mm/kmemleak.c: fix a typo s/interruptable/interruptible/ Link: https://lkml.kernel.org/r/20210319214140.23304-1-unixbhaskar@gmail.com Signed-off-by: Bhaskar Chowdhury Acked-by: Randy Dunlap Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/kmemleak.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mm/kmemleak.c b/mm/kmemleak.c index fe6e3ae8e8c671..92a2d48858086c 100644 --- a/mm/kmemleak.c +++ b/mm/kmemleak.c @@ -1203,7 +1203,7 @@ static void update_refs(struct kmemleak_object *object) } /* - * Memory scanning is a long process and it needs to be interruptable. This + * Memory scanning is a long process and it needs to be interruptible. This * function checks whether such interrupt condition occurred. */ static int scan_should_stop(void) From 866b485262173a2b873386162b2ddcfbcb542b4a Mon Sep 17 00:00:00 2001 From: Georgi Djakov Date: Thu, 29 Apr 2021 22:54:57 -0700 Subject: [PATCH 032/175] mm/page_owner: record the timestamp of all pages during free Collect the time when each allocation is freed, to help with memory analysis with kdump/ramdump. Add the timestamp also in the page_owner debugfs file and print it in dump_page(). Having another timestamp when we free the page helps for debugging page migration issues. For example both alloc and free timestamps being the same can gave hints that there is an issue with migrating memory, as opposed to a page just being dropped during migration. Link: https://lkml.kernel.org/r/20210203175905.12267-1-georgi.djakov@linaro.org Signed-off-by: Georgi Djakov Acked-by: Vlastimil Babka Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- Documentation/vm/page_owner.rst | 2 +- mm/page_owner.c | 12 ++++++++---- 2 files changed, 9 insertions(+), 5 deletions(-) diff --git a/Documentation/vm/page_owner.rst b/Documentation/vm/page_owner.rst index 4e67c2e9bbed3f..2175465c9bf2a6 100644 --- a/Documentation/vm/page_owner.rst +++ b/Documentation/vm/page_owner.rst @@ -47,7 +47,7 @@ size change due to this facility. text data bss dec hex filename 48800 2445 644 51889 cab1 mm/page_alloc.o - 6574 108 29 6711 1a37 mm/page_owner.o + 6662 108 29 6799 1a8f mm/page_owner.o 1025 8 8 1041 411 mm/page_ext.o Although, roughly, 8 KB code is added in total, page_alloc.o increase by diff --git a/mm/page_owner.c b/mm/page_owner.c index d15c7c4994f550..9ceae81ba7b892 100644 --- a/mm/page_owner.c +++ b/mm/page_owner.c @@ -27,6 +27,7 @@ struct page_owner { depot_stack_handle_t handle; depot_stack_handle_t free_handle; u64 ts_nsec; + u64 free_ts_nsec; pid_t pid; }; @@ -148,6 +149,7 @@ void __reset_page_owner(struct page *page, unsigned int order) struct page_ext *page_ext; depot_stack_handle_t handle = 0; struct page_owner *page_owner; + u64 free_ts_nsec = local_clock(); handle = save_stack(GFP_NOWAIT | __GFP_NOWARN); @@ -158,6 +160,7 @@ void __reset_page_owner(struct page *page, unsigned int order) __clear_bit(PAGE_EXT_OWNER_ALLOCATED, &page_ext->flags); page_owner = get_page_owner(page_ext); page_owner->free_handle = handle; + page_owner->free_ts_nsec = free_ts_nsec; page_ext = page_ext_next(page_ext); } } @@ -243,6 +246,7 @@ void __copy_page_owner(struct page *oldpage, struct page *newpage) new_page_owner->handle = old_page_owner->handle; new_page_owner->pid = old_page_owner->pid; new_page_owner->ts_nsec = old_page_owner->ts_nsec; + new_page_owner->free_ts_nsec = old_page_owner->ts_nsec; /* * We don't clear the bit on the oldpage as it's going to be freed @@ -356,10 +360,10 @@ print_page_owner(char __user *buf, size_t count, unsigned long pfn, return -ENOMEM; ret = snprintf(kbuf, count, - "Page allocated via order %u, mask %#x(%pGg), pid %d, ts %llu ns\n", + "Page allocated via order %u, mask %#x(%pGg), pid %d, ts %llu ns, free_ts %llu ns\n", page_owner->order, page_owner->gfp_mask, &page_owner->gfp_mask, page_owner->pid, - page_owner->ts_nsec); + page_owner->ts_nsec, page_owner->free_ts_nsec); if (ret >= count) goto err; @@ -435,9 +439,9 @@ void __dump_page_owner(struct page *page) else pr_alert("page_owner tracks the page as freed\n"); - pr_alert("page last allocated via order %u, migratetype %s, gfp_mask %#x(%pGg), pid %d, ts %llu\n", + pr_alert("page last allocated via order %u, migratetype %s, gfp_mask %#x(%pGg), pid %d, ts %llu, free_ts %llu\n", page_owner->order, migratetype_names[mt], gfp_mask, &gfp_mask, - page_owner->pid, page_owner->ts_nsec); + page_owner->pid, page_owner->ts_nsec, page_owner->free_ts_nsec); handle = READ_ONCE(page_owner->handle); if (!handle) { From 64ea78d2fdee1f68983ae3bec23f5d2bce71dc5a Mon Sep 17 00:00:00 2001 From: zhongjiang-ali Date: Thu, 29 Apr 2021 22:55:00 -0700 Subject: [PATCH 033/175] mm, page_owner: remove unused parameter in __set_page_owner_handle Since commit 5556cfe8d994 ("mm, page_owner: fix off-by-one error in __set_page_owner_handle()") introduced, the parameter 'page' will not used, hence it need to be removed. Link: https://lkml.kernel.org/r/1616602022-43545-1-git-send-email-zhongjiang-ali@linux.alibaba.com Signed-off-by: zhongjiang-ali Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_owner.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/mm/page_owner.c b/mm/page_owner.c index 9ceae81ba7b892..b29a049b734aab 100644 --- a/mm/page_owner.c +++ b/mm/page_owner.c @@ -165,9 +165,9 @@ void __reset_page_owner(struct page *page, unsigned int order) } } -static inline void __set_page_owner_handle(struct page *page, - struct page_ext *page_ext, depot_stack_handle_t handle, - unsigned int order, gfp_t gfp_mask) +static inline void __set_page_owner_handle(struct page_ext *page_ext, + depot_stack_handle_t handle, + unsigned int order, gfp_t gfp_mask) { struct page_owner *page_owner; int i; @@ -197,7 +197,7 @@ noinline void __set_page_owner(struct page *page, unsigned int order, return; handle = save_stack(gfp_mask); - __set_page_owner_handle(page, page_ext, handle, order, gfp_mask); + __set_page_owner_handle(page_ext, handle, order, gfp_mask); } void __set_page_owner_migrate_reason(struct page *page, int reason) @@ -616,7 +616,7 @@ static void init_pages_in_zone(pg_data_t *pgdat, struct zone *zone) continue; /* Found early allocated page */ - __set_page_owner_handle(page, page_ext, early_handle, + __set_page_owner_handle(page_ext, early_handle, 0, 0); count++; } From fab765c210130113ede5f8754c6a158fa0e4f960 Mon Sep 17 00:00:00 2001 From: Sergei Trofimovich Date: Thu, 29 Apr 2021 22:55:02 -0700 Subject: [PATCH 034/175] mm: page_owner: fetch backtrace only for tracked pages Very minor optimization. Link: https://lkml.kernel.org/r/20210401212445.3534721-1-slyfox@gentoo.org Signed-off-by: Sergei Trofimovich Acked-by: Vlastimil Babka Cc: Joonsoo Kim Cc: Thomas Gleixner Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_owner.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/mm/page_owner.c b/mm/page_owner.c index b29a049b734aab..590c2c89af744c 100644 --- a/mm/page_owner.c +++ b/mm/page_owner.c @@ -147,15 +147,15 @@ void __reset_page_owner(struct page *page, unsigned int order) { int i; struct page_ext *page_ext; - depot_stack_handle_t handle = 0; + depot_stack_handle_t handle; struct page_owner *page_owner; u64 free_ts_nsec = local_clock(); - handle = save_stack(GFP_NOWAIT | __GFP_NOWARN); - page_ext = lookup_page_ext(page); if (unlikely(!page_ext)) return; + + handle = save_stack(GFP_NOWAIT | __GFP_NOWARN); for (i = 0; i < (1 << order); i++) { __clear_bit(PAGE_EXT_OWNER_ALLOCATED, &page_ext->flags); page_owner = get_page_owner(page_ext); From 608b5d668c8ea6734594a401c9adab4093ad9847 Mon Sep 17 00:00:00 2001 From: Sergei Trofimovich Date: Thu, 29 Apr 2021 22:55:05 -0700 Subject: [PATCH 035/175] mm: page_owner: use kstrtobool() to parse bool option I tried to use page_owner=1 for a while noticed too late it had no effect as opposed to similar init_on_alloc=1 (these work). Let's make them consistent. The change decreses binary size slightly: text data bss dec hex filename 12408 321 17 12746 31ca mm/page_owner.o.before 12320 321 17 12658 3172 mm/page_owner.o.after Link: https://lkml.kernel.org/r/20210401210909.3532086-1-slyfox@gentoo.org Signed-off-by: Sergei Trofimovich Reviewed-by: Matthew Wilcox (Oracle) Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_owner.c | 8 +------- 1 file changed, 1 insertion(+), 7 deletions(-) diff --git a/mm/page_owner.c b/mm/page_owner.c index 590c2c89af744c..5c941ca9861d12 100644 --- a/mm/page_owner.c +++ b/mm/page_owner.c @@ -42,13 +42,7 @@ static void init_early_allocated_pages(void); static int __init early_page_owner_param(char *buf) { - if (!buf) - return -EINVAL; - - if (strcmp(buf, "on") == 0) - page_owner_enabled = true; - - return 0; + return kstrtobool(buf, &page_owner_enabled); } early_param("page_owner", early_page_owner_param); From 8e9b16c47680f6e7d6e5864a37f313f905a91cf5 Mon Sep 17 00:00:00 2001 From: Sergei Trofimovich Date: Thu, 29 Apr 2021 22:55:08 -0700 Subject: [PATCH 036/175] mm: page_owner: detect page_owner recursion via task_struct Before the change page_owner recursion was detected via fetching backtrace and inspecting it for current instruction pointer. It has a few problems: - it is slightly slow as it requires extra backtrace and a linear stack scan of the result - it is too late to check if backtrace fetching required memory allocation itself (ia64's unwinder requires it). To simplify recursion tracking let's use page_owner recursion flag in 'struct task_struct'. The change make page_owner=on work on ia64 by avoiding infinite recursion in: kmalloc() -> __set_page_owner() -> save_stack() -> unwind() [ia64-specific] -> build_script() -> kmalloc() -> __set_page_owner() [we short-circuit here] -> save_stack() -> unwind() [recursion] Link: https://lkml.kernel.org/r/20210402115342.1463781-1-slyfox@gentoo.org Signed-off-by: Sergei Trofimovich Reviewed-by: Andrew Morton Acked-by: Vlastimil Babka Cc: Ingo Molnar Cc: Peter Zijlstra Cc: Juri Lelli Cc: Vincent Guittot Cc: Dietmar Eggemann Cc: Steven Rostedt Cc: Ben Segall Cc: Mel Gorman Cc: Daniel Bristot de Oliveira Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/sched.h | 4 ++++ mm/page_owner.c | 32 ++++++++++---------------------- 2 files changed, 14 insertions(+), 22 deletions(-) diff --git a/include/linux/sched.h b/include/linux/sched.h index d7d07aa0facfa6..9c25c8e670303b 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -841,6 +841,10 @@ struct task_struct { /* Stalled due to lack of memory */ unsigned in_memstall:1; #endif +#ifdef CONFIG_PAGE_OWNER + /* Used by page_owner=on to detect recursion in page tracking. */ + unsigned in_page_owner:1; +#endif unsigned long atomic_flags; /* Flags requiring atomic access. */ diff --git a/mm/page_owner.c b/mm/page_owner.c index 5c941ca9861d12..9661d5320a07c1 100644 --- a/mm/page_owner.c +++ b/mm/page_owner.c @@ -98,42 +98,30 @@ static inline struct page_owner *get_page_owner(struct page_ext *page_ext) return (void *)page_ext + page_owner_ops.offset; } -static inline bool check_recursive_alloc(unsigned long *entries, - unsigned int nr_entries, - unsigned long ip) -{ - unsigned int i; - - for (i = 0; i < nr_entries; i++) { - if (entries[i] == ip) - return true; - } - return false; -} - static noinline depot_stack_handle_t save_stack(gfp_t flags) { unsigned long entries[PAGE_OWNER_STACK_DEPTH]; depot_stack_handle_t handle; unsigned int nr_entries; - nr_entries = stack_trace_save(entries, ARRAY_SIZE(entries), 2); - /* - * We need to check recursion here because our request to - * stackdepot could trigger memory allocation to save new - * entry. New memory allocation would reach here and call - * stack_depot_save_entries() again if we don't catch it. There is - * still not enough memory in stackdepot so it would try to - * allocate memory again and loop forever. + * Avoid recursion. + * + * Sometimes page metadata allocation tracking requires more + * memory to be allocated: + * - when new stack trace is saved to stack depot + * - when backtrace itself is calculated (ia64) */ - if (check_recursive_alloc(entries, nr_entries, _RET_IP_)) + if (current->in_page_owner) return dummy_handle; + current->in_page_owner = 1; + nr_entries = stack_trace_save(entries, ARRAY_SIZE(entries), 2); handle = stack_depot_save(entries, nr_entries, flags); if (!handle) handle = failure_handle; + current->in_page_owner = 0; return handle; } From f58bd538e6a2deb2bcdfe527d9ed45643348a4e6 Mon Sep 17 00:00:00 2001 From: Sergei Trofimovich Date: Thu, 29 Apr 2021 22:55:12 -0700 Subject: [PATCH 037/175] mm: page_poison: print page info when corruption is caught When page_poison detects page corruption it's useful to see who freed a page recently to have a guess where write-after-free corruption happens. After this change corruption report has extra page data. Example report from real corruption (includes only page_pwner part): pagealloc: memory corruption e00000014cd61d10: 11 00 00 00 00 00 00 00 30 1d d2 ff ff 0f 00 60 ........0......` e00000014cd61d20: b0 1d d2 ff ff 0f 00 60 90 fe 1c 00 08 00 00 20 .......`....... ... CPU: 1 PID: 220402 Comm: cc1plus Not tainted 5.12.0-rc5-00107-g9720c6f59ecf #245 Hardware name: hp server rx3600, BIOS 04.03 04/08/2008 ... Call Trace: [] show_stack+0x90/0xc0 [] dump_stack+0x150/0x1c0 [] __kernel_unpoison_pages+0x410/0x440 [] get_page_from_freelist+0x1460/0x2ca0 [] __alloc_pages_nodemask+0x3c0/0x660 [] alloc_pages_vma+0xb0/0x500 [] __handle_mm_fault+0x1230/0x1fe0 [] handle_mm_fault+0x310/0x4e0 [] ia64_do_page_fault+0x1f0/0xb80 [] ia64_leave_kernel+0x0/0x270 page_owner tracks the page as freed page allocated via order 0, migratetype Movable, gfp_mask 0x100dca(GFP_HIGHUSER_MOVABLE|__GFP_ZERO), pid 37, ts 8173444098740 __reset_page_owner+0x40/0x200 free_pcp_prepare+0x4d0/0x600 free_unref_page+0x20/0x1c0 __put_page+0x110/0x1a0 migrate_pages+0x16d0/0x1dc0 compact_zone+0xfc0/0x1aa0 proactive_compact_node+0xd0/0x1e0 kcompactd+0x550/0x600 kthread+0x2c0/0x2e0 call_payload+0x50/0x80 Here we can see that page was freed by page migration but something managed to write to it afterwards. [slyfox@gentoo.org: s/dump_page_owner/dump_page/, per Vlastimil] Link: https://lkml.kernel.org/r/20210407230800.1086854-1-slyfox@gentoo.org Link: https://lkml.kernel.org/r/20210404141735.2152984-1-slyfox@gentoo.org Signed-off-by: Sergei Trofimovich Acked-by: Vlastimil Babka Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_poison.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/mm/page_poison.c b/mm/page_poison.c index 655dc589560439..98438985e1ed9c 100644 --- a/mm/page_poison.c +++ b/mm/page_poison.c @@ -2,6 +2,7 @@ #include #include #include +#include #include #include #include @@ -45,7 +46,7 @@ static bool single_bit_flip(unsigned char a, unsigned char b) return error && !(error & (error - 1)); } -static void check_poison_mem(unsigned char *mem, size_t bytes) +static void check_poison_mem(struct page *page, unsigned char *mem, size_t bytes) { static DEFINE_RATELIMIT_STATE(ratelimit, 5 * HZ, 10); unsigned char *start; @@ -70,6 +71,7 @@ static void check_poison_mem(unsigned char *mem, size_t bytes) print_hex_dump(KERN_ERR, "", DUMP_PREFIX_ADDRESS, 16, 1, start, end - start + 1, 1); dump_stack(); + dump_page(page, "pagealloc: corrupted page details"); } static void unpoison_page(struct page *page) @@ -83,7 +85,7 @@ static void unpoison_page(struct page *page) * that is freed to buddy. Thus no extra check is done to * see if a page was poisoned. */ - check_poison_mem(kasan_reset_tag(addr), PAGE_SIZE); + check_poison_mem(page, kasan_reset_tag(addr), PAGE_SIZE); kasan_enable_current(); kunmap_atomic(addr); } From dce44566192ec0b38597fdfd435013c2d54653ff Mon Sep 17 00:00:00 2001 From: Anshuman Khandual Date: Thu, 29 Apr 2021 22:55:15 -0700 Subject: [PATCH 038/175] mm/memtest: add ARCH_USE_MEMTEST early_memtest() does not get called from all architectures. Hence enabling CONFIG_MEMTEST and providing a valid memtest=[1..N] kernel command line option might not trigger the memory pattern tests as would be expected in normal circumstances. This situation is misleading. The change here prevents the above mentioned problem after introducing a new config option ARCH_USE_MEMTEST that should be subscribed on platforms that call early_memtest(), in order to enable the config CONFIG_MEMTEST. Conversely CONFIG_MEMTEST cannot be enabled on platforms where it would not be tested anyway. Link: https://lkml.kernel.org/r/1617269193-22294-1-git-send-email-anshuman.khandual@arm.com Signed-off-by: Anshuman Khandual Acked-by: Catalin Marinas (arm64) Reviewed-by: Max Filippov Cc: Russell King Cc: Will Deacon Cc: Thomas Bogendoerfer Cc: Michael Ellerman Cc: Benjamin Herrenschmidt Cc: Paul Mackerras Cc: Thomas Gleixner Cc: Ingo Molnar Cc: Chris Zankel Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/arm/Kconfig | 1 + arch/arm64/Kconfig | 1 + arch/mips/Kconfig | 1 + arch/powerpc/Kconfig | 1 + arch/x86/Kconfig | 1 + arch/xtensa/Kconfig | 1 + lib/Kconfig.debug | 9 ++++++++- 7 files changed, 14 insertions(+), 1 deletion(-) diff --git a/arch/arm/Kconfig b/arch/arm/Kconfig index e6e08d8a45fcec..085c830d344bdf 100644 --- a/arch/arm/Kconfig +++ b/arch/arm/Kconfig @@ -33,6 +33,7 @@ config ARM select ARCH_SUPPORTS_ATOMIC_RMW select ARCH_USE_BUILTIN_BSWAP select ARCH_USE_CMPXCHG_LOCKREF + select ARCH_USE_MEMTEST select ARCH_WANT_DEFAULT_TOPDOWN_MMAP_LAYOUT if MMU select ARCH_WANT_IPC_PARSE_VERSION select ARCH_WANT_LD_ORPHAN_WARN diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig index 570fa52eb6f0ac..7f2a8009133738 100644 --- a/arch/arm64/Kconfig +++ b/arch/arm64/Kconfig @@ -67,6 +67,7 @@ config ARM64 select ARCH_KEEP_MEMBLOCK select ARCH_USE_CMPXCHG_LOCKREF select ARCH_USE_GNU_PROPERTY + select ARCH_USE_MEMTEST select ARCH_USE_QUEUED_RWLOCKS select ARCH_USE_QUEUED_SPINLOCKS select ARCH_USE_SYM_ANNOTATIONS diff --git a/arch/mips/Kconfig b/arch/mips/Kconfig index 702648f60e41d9..49a3c9cd1cb296 100644 --- a/arch/mips/Kconfig +++ b/arch/mips/Kconfig @@ -16,6 +16,7 @@ config MIPS select ARCH_SUPPORTS_UPROBES select ARCH_USE_BUILTIN_BSWAP select ARCH_USE_CMPXCHG_LOCKREF if 64BIT + select ARCH_USE_MEMTEST select ARCH_USE_QUEUED_RWLOCKS select ARCH_USE_QUEUED_SPINLOCKS select ARCH_WANT_DEFAULT_TOPDOWN_MMAP_LAYOUT if MMU diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig index 3b34c44832e07a..2d060c00259584 100644 --- a/arch/powerpc/Kconfig +++ b/arch/powerpc/Kconfig @@ -149,6 +149,7 @@ config PPC select ARCH_SUPPORTS_DEBUG_PAGEALLOC if PPC32 || PPC_BOOK3S_64 select ARCH_USE_BUILTIN_BSWAP select ARCH_USE_CMPXCHG_LOCKREF if PPC64 + select ARCH_USE_MEMTEST select ARCH_USE_QUEUED_RWLOCKS if PPC_QUEUED_SPINLOCKS select ARCH_USE_QUEUED_SPINLOCKS if PPC_QUEUED_SPINLOCKS select ARCH_WANT_IPC_PARSE_VERSION diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index 0fc82237414ddf..dac15f646f797e 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -100,6 +100,7 @@ config X86 select ARCH_SUPPORTS_LTO_CLANG if X86_64 select ARCH_SUPPORTS_LTO_CLANG_THIN if X86_64 select ARCH_USE_BUILTIN_BSWAP + select ARCH_USE_MEMTEST select ARCH_USE_QUEUED_RWLOCKS select ARCH_USE_QUEUED_SPINLOCKS select ARCH_USE_SYM_ANNOTATIONS diff --git a/arch/xtensa/Kconfig b/arch/xtensa/Kconfig index 9ad6b7b82707cc..524413aabbc403 100644 --- a/arch/xtensa/Kconfig +++ b/arch/xtensa/Kconfig @@ -7,6 +7,7 @@ config XTENSA select ARCH_HAS_SYNC_DMA_FOR_CPU if MMU select ARCH_HAS_SYNC_DMA_FOR_DEVICE if MMU select ARCH_HAS_DMA_SET_UNCACHED if MMU + select ARCH_USE_MEMTEST select ARCH_USE_QUEUED_RWLOCKS select ARCH_USE_QUEUED_SPINLOCKS select ARCH_WANT_FRAME_POINTERS diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug index b38bbb5bb00ef7..678c13967580ec 100644 --- a/lib/Kconfig.debug +++ b/lib/Kconfig.debug @@ -2573,11 +2573,18 @@ config TEST_FPU endif # RUNTIME_TESTING_MENU +config ARCH_USE_MEMTEST + bool + help + An architecture should select this when it uses early_memtest() + during boot process. + config MEMTEST bool "Memtest" + depends on ARCH_USE_MEMTEST help This option adds a kernel parameter 'memtest', which allows memtest - to be set. + to be set and executed. memtest=0, mean disabled; -- default memtest=1, mean do 1 test pattern; ... From 63135aa3866db99fd923b716c5ff2e468879624a Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Thu, 29 Apr 2021 22:55:18 -0700 Subject: [PATCH 039/175] mm: provide filemap_range_needs_writeback() helper Patch series "Improve IOCB_NOWAIT O_DIRECT reads", v3. An internal workload complained because it was using too much CPU, and when I took a look, we had a lot of io_uring workers going to town. For an async buffered read like workload, I am normally expecting _zero_ offloads to a worker thread, but this one had tons of them. I'd drop caches and things would look good again, but then a minute later we'd regress back to using workers. Turns out that every minute something was reading parts of the device, which would add page cache for that inode. I put patches like these in for our kernel, and the problem was solved. Don't -EAGAIN IOCB_NOWAIT dio reads just because we have page cache entries for the given range. This causes unnecessary work from the callers side, when the IO could have been issued totally fine without blocking on writeback when there is none. This patch (of 3): For O_DIRECT reads/writes, we check if we need to issue a call to filemap_write_and_wait_range() to issue and/or wait for writeback for any page in the given range. The existing mechanism just checks for a page in the range, which is suboptimal for IOCB_NOWAIT as we'll fallback to the slow path (and needing retry) if there's just a clean page cache page in the range. Provide filemap_range_needs_writeback() which tries a little harder to check if we actually need to issue and/or wait for writeback in the range. Link: https://lkml.kernel.org/r/20210224164455.1096727-1-axboe@kernel.dk Link: https://lkml.kernel.org/r/20210224164455.1096727-2-axboe@kernel.dk Signed-off-by: Jens Axboe Reviewed-by: Matthew Wilcox (Oracle) Reviewed-by: Jan Kara Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/fs.h | 2 ++ mm/filemap.c | 43 +++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 45 insertions(+) diff --git a/include/linux/fs.h b/include/linux/fs.h index bf4e90d3ab186c..12766edee81fb5 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -2878,6 +2878,8 @@ static inline int filemap_fdatawait(struct address_space *mapping) extern bool filemap_range_has_page(struct address_space *, loff_t lstart, loff_t lend); +extern bool filemap_range_needs_writeback(struct address_space *, + loff_t lstart, loff_t lend); extern int filemap_write_and_wait_range(struct address_space *mapping, loff_t lstart, loff_t lend); extern int __filemap_fdatawrite_range(struct address_space *mapping, diff --git a/mm/filemap.c b/mm/filemap.c index 151090fdcf291a..1750742b5689f2 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -635,6 +635,49 @@ static bool mapping_needs_writeback(struct address_space *mapping) return mapping->nrpages; } +/** + * filemap_range_needs_writeback - check if range potentially needs writeback + * @mapping: address space within which to check + * @start_byte: offset in bytes where the range starts + * @end_byte: offset in bytes where the range ends (inclusive) + * + * Find at least one page in the range supplied, usually used to check if + * direct writing in this range will trigger a writeback. Used by O_DIRECT + * read/write with IOCB_NOWAIT, to see if the caller needs to do + * filemap_write_and_wait_range() before proceeding. + * + * Return: %true if the caller should do filemap_write_and_wait_range() before + * doing O_DIRECT to a page in this range, %false otherwise. + */ +bool filemap_range_needs_writeback(struct address_space *mapping, + loff_t start_byte, loff_t end_byte) +{ + XA_STATE(xas, &mapping->i_pages, start_byte >> PAGE_SHIFT); + pgoff_t max = end_byte >> PAGE_SHIFT; + struct page *page; + + if (!mapping_needs_writeback(mapping)) + return false; + if (!mapping_tagged(mapping, PAGECACHE_TAG_DIRTY) && + !mapping_tagged(mapping, PAGECACHE_TAG_WRITEBACK)) + return false; + if (end_byte < start_byte) + return false; + + rcu_read_lock(); + xas_for_each(&xas, page, max) { + if (xas_retry(&xas, page)) + continue; + if (xa_is_value(page)) + continue; + if (PageDirty(page) || PageLocked(page) || PageWriteback(page)) + break; + } + rcu_read_unlock(); + return page != NULL; +} +EXPORT_SYMBOL_GPL(filemap_range_needs_writeback); + /** * filemap_write_and_wait_range - write out & wait on a file range * @mapping: the address_space for the pages From 7a60d6d7b34ebf9290d495e8bb4cd57c784ffb22 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Thu, 29 Apr 2021 22:55:21 -0700 Subject: [PATCH 040/175] mm: use filemap_range_needs_writeback() for O_DIRECT reads For the generic page cache read helper, use the better variant of checking for the need to call filemap_write_and_wait_range() when doing O_DIRECT reads. This avoids falling back to the slow path for IOCB_NOWAIT, if there are no pages to wait for (or write out). Link: https://lkml.kernel.org/r/20210224164455.1096727-3-axboe@kernel.dk Signed-off-by: Jens Axboe Reviewed-by: Matthew Wilcox (Oracle) Reviewed-by: Jan Kara Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/filemap.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/mm/filemap.c b/mm/filemap.c index 1750742b5689f2..16aea94b7e671c 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -2681,8 +2681,8 @@ generic_file_read_iter(struct kiocb *iocb, struct iov_iter *iter) size = i_size_read(inode); if (iocb->ki_flags & IOCB_NOWAIT) { - if (filemap_range_has_page(mapping, iocb->ki_pos, - iocb->ki_pos + count - 1)) + if (filemap_range_needs_writeback(mapping, iocb->ki_pos, + iocb->ki_pos + count - 1)) return -EAGAIN; } else { retval = filemap_write_and_wait_range(mapping, From 985b71db17506c668e5a9bd9fc700c95640dc191 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Thu, 29 Apr 2021 22:55:24 -0700 Subject: [PATCH 041/175] iomap: use filemap_range_needs_writeback() for O_DIRECT reads For reads, use the better variant of checking for the need to call filemap_write_and_wait_range() when doing O_DIRECT. This avoids falling back to the slow path for IOCB_NOWAIT, if there are no pages to wait for (or write out). Link: https://lkml.kernel.org/r/20210224164455.1096727-4-axboe@kernel.dk Signed-off-by: Jens Axboe Reviewed-by: Jan Kara Cc: Matthew Wilcox (Oracle) Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/iomap/direct-io.c | 24 ++++++++++++++++-------- 1 file changed, 16 insertions(+), 8 deletions(-) diff --git a/fs/iomap/direct-io.c b/fs/iomap/direct-io.c index bdd0d89bbf0a34..9398b8c31323b3 100644 --- a/fs/iomap/direct-io.c +++ b/fs/iomap/direct-io.c @@ -487,12 +487,28 @@ __iomap_dio_rw(struct kiocb *iocb, struct iov_iter *iter, if (pos >= dio->i_size) goto out_free_dio; + if (iocb->ki_flags & IOCB_NOWAIT) { + if (filemap_range_needs_writeback(mapping, pos, end)) { + ret = -EAGAIN; + goto out_free_dio; + } + iomap_flags |= IOMAP_NOWAIT; + } + if (iter_is_iovec(iter)) dio->flags |= IOMAP_DIO_DIRTY; } else { iomap_flags |= IOMAP_WRITE; dio->flags |= IOMAP_DIO_WRITE; + if (iocb->ki_flags & IOCB_NOWAIT) { + if (filemap_range_has_page(mapping, pos, end)) { + ret = -EAGAIN; + goto out_free_dio; + } + iomap_flags |= IOMAP_NOWAIT; + } + /* for data sync or sync, we need sync completion processing */ if (iocb->ki_flags & IOCB_DSYNC) dio->flags |= IOMAP_DIO_NEED_SYNC; @@ -507,14 +523,6 @@ __iomap_dio_rw(struct kiocb *iocb, struct iov_iter *iter, dio->flags |= IOMAP_DIO_WRITE_FUA; } - if (iocb->ki_flags & IOCB_NOWAIT) { - if (filemap_range_has_page(mapping, pos, end)) { - ret = -EAGAIN; - goto out_free_dio; - } - iomap_flags |= IOMAP_NOWAIT; - } - if (dio_flags & IOMAP_DIO_OVERWRITE_ONLY) { ret = -EAGAIN; if (pos >= dio->i_size || pos + count > dio->i_size) From d31fa86a27b3ecdc32bf19326c4d3bba854542e2 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Thu, 29 Apr 2021 22:55:26 -0700 Subject: [PATCH 042/175] mm/filemap: use filemap_read_page in filemap_fault After splitting generic_file_buffered_read() into smaller parts, it turns out we can reuse one of the parts in filemap_fault(). This fixes an oversight -- waiting for the I/O to complete is now interruptible by a fatal signal. And it saves us a few bytes of text in an unlikely path. $ ./scripts/bloat-o-meter before.o after.o add/remove: 0/0 grow/shrink: 0/1 up/down: 0/-207 (-207) Function old new delta filemap_fault 2187 1980 -207 Total: Before=37491, After=37284, chg -0.55% Link: https://lkml.kernel.org/r/20210226140011.2883498-1-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Reviewed-by: Andrew Morton Cc: Kent Overstreet Cc: Christoph Hellwig Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/filemap.c | 10 +--------- 1 file changed, 1 insertion(+), 9 deletions(-) diff --git a/mm/filemap.c b/mm/filemap.c index 16aea94b7e671c..84e7f3c7d66753 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -2980,7 +2980,6 @@ vm_fault_t filemap_fault(struct vm_fault *vmf) struct file *file = vmf->vma->vm_file; struct file *fpin = NULL; struct address_space *mapping = file->f_mapping; - struct file_ra_state *ra = &file->f_ra; struct inode *inode = mapping->host; pgoff_t offset = vmf->pgoff; pgoff_t max_off; @@ -3067,14 +3066,8 @@ vm_fault_t filemap_fault(struct vm_fault *vmf) * because there really aren't any performance issues here * and we need to check for errors. */ - ClearPageError(page); fpin = maybe_unlock_mmap_for_io(vmf, fpin); - error = mapping->a_ops->readpage(file, page); - if (!error) { - wait_on_page_locked(page); - if (!PageUptodate(page)) - error = -EIO; - } + error = filemap_read_page(file, mapping, page); if (fpin) goto out_retry; put_page(page); @@ -3082,7 +3075,6 @@ vm_fault_t filemap_fault(struct vm_fault *vmf) if (!error || error == AOP_TRUNCATED_PAGE) goto retry_find; - shrink_readahead_size_eio(ra); return VM_FAULT_SIGBUS; out_retry: From 79e3094c53c56d0d4da23f578de271e7602ba5ed Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Thu, 29 Apr 2021 22:55:29 -0700 Subject: [PATCH 043/175] mm/filemap: drop check for truncated page after I/O If the I/O completed successfully, the page will remain Uptodate, even if it is subsequently truncated. If the I/O completed with an error, this check would cause us to retry the I/O if the page were truncated before we woke up. There is no need to retry the I/O; the I/O to fill the page failed, so we can legitimately just return -EIO. This code was originally added by commit 56f0d5fe6851 ("[PATCH] readpage-vs-invalidate fix") in 2005 (this commit ID is from the linux-fullhistory tree; it is also commit ba1f08f14b52 in tglx-history). At the time, truncate_complete_page() called ClearPageUptodate(), and so this was fixing a real bug. In 2008, commit 84209e02de48 ("mm: dont clear PG_uptodate on truncate/invalidate") removed the call to ClearPageUptodate, and this check has been unnecessary ever since. It doesn't do any real harm, but there's no need to keep it. Link: https://lkml.kernel.org/r/20210303222547.1056428-1-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Acked-by: William Kucharski Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/filemap.c | 2 -- 1 file changed, 2 deletions(-) diff --git a/mm/filemap.c b/mm/filemap.c index 84e7f3c7d66753..9620696f880cda 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -2348,8 +2348,6 @@ static int filemap_read_page(struct file *file, struct address_space *mapping, return error; if (PageUptodate(page)) return 0; - if (!page->mapping) /* page truncated */ - return AOP_TRUNCATED_PAGE; shrink_readahead_size_eio(&file->f_ra); return -EIO; } From 1c824a680b1b67ad43c0908f11a70bcf37af56d5 Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Thu, 29 Apr 2021 22:55:32 -0700 Subject: [PATCH 044/175] mm: page-writeback: simplify memcg handling in test_clear_page_writeback() Page writeback doesn't hold a page reference, which allows truncate to free a page the second PageWriteback is cleared. This used to require special attention in test_clear_page_writeback(), where we had to be careful not to rely on the unstable page->memcg binding and look up all the necessary information before clearing the writeback flag. Since commit 073861ed77b6 ("mm: fix VM_BUG_ON(PageTail) and BUG_ON(PageWriteback)") test_clear_page_writeback() is called with an explicit reference on the page, and this dance is no longer needed. Use unlock_page_memcg() and dec_lruvec_page_state() directly. This removes the last user of the lock_page_memcg() return value, change it to void. Touch up the comments in there as well. This also removes the last extern user of __unlock_page_memcg(), make it static. Further, it removes the last user of dec_lruvec_state(), delete it, along with a few other unused helpers. Link: https://lkml.kernel.org/r/YCQbYAWg4nvBFL6h@cmpxchg.org Signed-off-by: Johannes Weiner Acked-by: Hugh Dickins Reviewed-by: Shakeel Butt Acked-by: Michal Hocko Cc: Roman Gushchin Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/memcontrol.h | 10 ++-------- include/linux/vmstat.h | 24 +++--------------------- mm/memcontrol.c | 36 +++++++++++------------------------- mm/page-writeback.c | 9 +++------ 4 files changed, 19 insertions(+), 60 deletions(-) diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index 0c04d39a796765..ae448d955a87cf 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -867,8 +867,7 @@ void mem_cgroup_print_oom_group(struct mem_cgroup *memcg); extern bool cgroup_memory_noswap; #endif -struct mem_cgroup *lock_page_memcg(struct page *page); -void __unlock_page_memcg(struct mem_cgroup *memcg); +void lock_page_memcg(struct page *page); void unlock_page_memcg(struct page *page); /* @@ -1289,12 +1288,7 @@ mem_cgroup_print_oom_meminfo(struct mem_cgroup *memcg) { } -static inline struct mem_cgroup *lock_page_memcg(struct page *page) -{ - return NULL; -} - -static inline void __unlock_page_memcg(struct mem_cgroup *memcg) +static inline void lock_page_memcg(struct page *page) { } diff --git a/include/linux/vmstat.h b/include/linux/vmstat.h index 506d625163a11c..3299cd69e4ca38 100644 --- a/include/linux/vmstat.h +++ b/include/linux/vmstat.h @@ -512,16 +512,10 @@ static inline void mod_lruvec_page_state(struct page *page, #endif /* CONFIG_MEMCG */ -static inline void __inc_lruvec_state(struct lruvec *lruvec, - enum node_stat_item idx) -{ - __mod_lruvec_state(lruvec, idx, 1); -} - -static inline void __dec_lruvec_state(struct lruvec *lruvec, - enum node_stat_item idx) +static inline void inc_lruvec_state(struct lruvec *lruvec, + enum node_stat_item idx) { - __mod_lruvec_state(lruvec, idx, -1); + mod_lruvec_state(lruvec, idx, 1); } static inline void __inc_lruvec_page_state(struct page *page, @@ -536,18 +530,6 @@ static inline void __dec_lruvec_page_state(struct page *page, __mod_lruvec_page_state(page, idx, -1); } -static inline void inc_lruvec_state(struct lruvec *lruvec, - enum node_stat_item idx) -{ - mod_lruvec_state(lruvec, idx, 1); -} - -static inline void dec_lruvec_state(struct lruvec *lruvec, - enum node_stat_item idx) -{ - mod_lruvec_state(lruvec, idx, -1); -} - static inline void inc_lruvec_page_state(struct page *page, enum node_stat_item idx) { diff --git a/mm/memcontrol.c b/mm/memcontrol.c index e064ac0d850ac1..06caac775abb3d 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -2118,11 +2118,10 @@ void mem_cgroup_print_oom_group(struct mem_cgroup *memcg) * This function protects unlocked LRU pages from being moved to * another cgroup. * - * It ensures lifetime of the returned memcg. Caller is responsible - * for the lifetime of the page; __unlock_page_memcg() is available - * when @page might get freed inside the locked section. + * It ensures lifetime of the locked memcg. Caller is responsible + * for the lifetime of the page. */ -struct mem_cgroup *lock_page_memcg(struct page *page) +void lock_page_memcg(struct page *page) { struct page *head = compound_head(page); /* rmap on tail pages */ struct mem_cgroup *memcg; @@ -2132,21 +2131,15 @@ struct mem_cgroup *lock_page_memcg(struct page *page) * The RCU lock is held throughout the transaction. The fast * path can get away without acquiring the memcg->move_lock * because page moving starts with an RCU grace period. - * - * The RCU lock also protects the memcg from being freed when - * the page state that is going to change is the only thing - * preventing the page itself from being freed. E.g. writeback - * doesn't hold a page reference and relies on PG_writeback to - * keep off truncation, migration and so forth. */ rcu_read_lock(); if (mem_cgroup_disabled()) - return NULL; + return; again: memcg = page_memcg(head); if (unlikely(!memcg)) - return NULL; + return; #ifdef CONFIG_PROVE_LOCKING local_irq_save(flags); @@ -2155,7 +2148,7 @@ struct mem_cgroup *lock_page_memcg(struct page *page) #endif if (atomic_read(&memcg->moving_account) <= 0) - return memcg; + return; spin_lock_irqsave(&memcg->move_lock, flags); if (memcg != page_memcg(head)) { @@ -2164,24 +2157,17 @@ struct mem_cgroup *lock_page_memcg(struct page *page) } /* - * When charge migration first begins, we can have locked and - * unlocked page stat updates happening concurrently. Track - * the task who has the lock for unlock_page_memcg(). + * When charge migration first begins, we can have multiple + * critical sections holding the fast-path RCU lock and one + * holding the slowpath move_lock. Track the task who has the + * move_lock for unlock_page_memcg(). */ memcg->move_lock_task = current; memcg->move_lock_flags = flags; - - return memcg; } EXPORT_SYMBOL(lock_page_memcg); -/** - * __unlock_page_memcg - unlock and unpin a memcg - * @memcg: the memcg - * - * Unlock and unpin a memcg returned by lock_page_memcg(). - */ -void __unlock_page_memcg(struct mem_cgroup *memcg) +static void __unlock_page_memcg(struct mem_cgroup *memcg) { if (memcg && memcg->move_lock_task == current) { unsigned long flags = memcg->move_lock_flags; diff --git a/mm/page-writeback.c b/mm/page-writeback.c index 9e35b636a39375..5e761fb62800e7 100644 --- a/mm/page-writeback.c +++ b/mm/page-writeback.c @@ -2722,12 +2722,9 @@ EXPORT_SYMBOL(clear_page_dirty_for_io); int test_clear_page_writeback(struct page *page) { struct address_space *mapping = page_mapping(page); - struct mem_cgroup *memcg; - struct lruvec *lruvec; int ret; - memcg = lock_page_memcg(page); - lruvec = mem_cgroup_page_lruvec(page, page_pgdat(page)); + lock_page_memcg(page); if (mapping && mapping_use_writeback_tags(mapping)) { struct inode *inode = mapping->host; struct backing_dev_info *bdi = inode_to_bdi(inode); @@ -2755,11 +2752,11 @@ int test_clear_page_writeback(struct page *page) ret = TestClearPageWriteback(page); } if (ret) { - dec_lruvec_state(lruvec, NR_WRITEBACK); + dec_lruvec_page_state(page, NR_WRITEBACK); dec_zone_page_state(page, NR_ZONE_WRITE_PENDING); inc_node_page_state(page, NR_WRITTEN); } - __unlock_page_memcg(memcg); + unlock_page_memcg(page); return ret; } From 842ca547f706b1e05ccf3026a0ab15d24772a188 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Thu, 29 Apr 2021 22:55:35 -0700 Subject: [PATCH 045/175] mm: move page_mapping_file to pagemap.h page_mapping_file() is only used by some architectures, and then it is usually only used in one place. Make it a static inline function so other architectures don't have to carry this dead code. Link: https://lkml.kernel.org/r/20210317123011.350118-1-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Reviewed-by: David Hildenbrand Acked-by: Mike Rapoport Cc: Huang Ying Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/arm/mm/copypage-v4mc.c | 1 + arch/arm/mm/copypage-v6.c | 1 + arch/arm/mm/copypage-xscale.c | 1 + arch/csky/abiv1/cacheflush.c | 1 + arch/mips/mm/cache.c | 1 + arch/nios2/mm/cacheflush.c | 1 + arch/sh/mm/cache-sh4.c | 1 + arch/sh/mm/cache-sh7705.c | 1 + arch/sparc/mm/tlb.c | 1 + include/linux/mm.h | 1 - include/linux/pagemap.h | 10 ++++++++++ mm/util.c | 10 ---------- 12 files changed, 19 insertions(+), 11 deletions(-) diff --git a/arch/arm/mm/copypage-v4mc.c b/arch/arm/mm/copypage-v4mc.c index 44f7292ec27be7..f1da3b439b968a 100644 --- a/arch/arm/mm/copypage-v4mc.c +++ b/arch/arm/mm/copypage-v4mc.c @@ -13,6 +13,7 @@ #include #include #include +#include #include #include diff --git a/arch/arm/mm/copypage-v6.c b/arch/arm/mm/copypage-v6.c index 6a769a6c314ef3..d8a115de5507ae 100644 --- a/arch/arm/mm/copypage-v6.c +++ b/arch/arm/mm/copypage-v6.c @@ -8,6 +8,7 @@ #include #include #include +#include #include #include diff --git a/arch/arm/mm/copypage-xscale.c b/arch/arm/mm/copypage-xscale.c index eb5d338657d10e..bcb485620a05f0 100644 --- a/arch/arm/mm/copypage-xscale.c +++ b/arch/arm/mm/copypage-xscale.c @@ -13,6 +13,7 @@ #include #include #include +#include #include #include diff --git a/arch/csky/abiv1/cacheflush.c b/arch/csky/abiv1/cacheflush.c index 9f1fe80cc84736..07ff17ea33deee 100644 --- a/arch/csky/abiv1/cacheflush.c +++ b/arch/csky/abiv1/cacheflush.c @@ -4,6 +4,7 @@ #include #include #include +#include #include #include #include diff --git a/arch/mips/mm/cache.c b/arch/mips/mm/cache.c index 7719d632df8dfa..a7bf0c80371cd0 100644 --- a/arch/mips/mm/cache.c +++ b/arch/mips/mm/cache.c @@ -15,6 +15,7 @@ #include #include #include +#include #include #include diff --git a/arch/nios2/mm/cacheflush.c b/arch/nios2/mm/cacheflush.c index 65de1bd6a7604a..6aa9257c3ede42 100644 --- a/arch/nios2/mm/cacheflush.c +++ b/arch/nios2/mm/cacheflush.c @@ -11,6 +11,7 @@ #include #include #include +#include #include #include diff --git a/arch/sh/mm/cache-sh4.c b/arch/sh/mm/cache-sh4.c index ddfa9685f1ef74..72c2e1b46c0838 100644 --- a/arch/sh/mm/cache-sh4.c +++ b/arch/sh/mm/cache-sh4.c @@ -16,6 +16,7 @@ #include #include #include +#include #include #include #include diff --git a/arch/sh/mm/cache-sh7705.c b/arch/sh/mm/cache-sh7705.c index 4c67b3d887750c..9b63a53a5e46fe 100644 --- a/arch/sh/mm/cache-sh7705.c +++ b/arch/sh/mm/cache-sh7705.c @@ -13,6 +13,7 @@ #include #include #include +#include #include #include #include diff --git a/arch/sparc/mm/tlb.c b/arch/sparc/mm/tlb.c index 20ee1473933322..9a725547578e87 100644 --- a/arch/sparc/mm/tlb.c +++ b/arch/sparc/mm/tlb.c @@ -9,6 +9,7 @@ #include #include #include +#include #include #include diff --git a/include/linux/mm.h b/include/linux/mm.h index 21115933b9b8e4..2e5c207e702cb4 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -1629,7 +1629,6 @@ static inline pgoff_t page_index(struct page *page) bool page_mapped(struct page *page); struct address_space *page_mapping(struct page *page); -struct address_space *page_mapping_file(struct page *page); /* * Return true only if the page has been allocated with diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h index 4686f9ab0636e5..469fa7ffcf9634 100644 --- a/include/linux/pagemap.h +++ b/include/linux/pagemap.h @@ -157,6 +157,16 @@ static inline void filemap_nr_thps_dec(struct address_space *mapping) void release_pages(struct page **pages, int nr); +/* + * For file cache pages, return the address_space, otherwise return NULL + */ +static inline struct address_space *page_mapping_file(struct page *page) +{ + if (unlikely(PageSwapCache(page))) + return NULL; + return page_mapping(page); +} + /* * speculatively take a reference to a page. * If the page is free (_refcount == 0), then _refcount is untouched, and 0 diff --git a/mm/util.c b/mm/util.c index c37e24d5fa43e2..083c5c417cfc79 100644 --- a/mm/util.c +++ b/mm/util.c @@ -711,16 +711,6 @@ struct address_space *page_mapping(struct page *page) } EXPORT_SYMBOL(page_mapping); -/* - * For file cache pages, return the address_space, otherwise return NULL - */ -struct address_space *page_mapping_file(struct page *page) -{ - if (unlikely(PageSwapCache(page))) - return NULL; - return page_mapping(page); -} - /* Slow path of page_mapcount() for compound pages */ int __page_mapcount(struct page *page) { From 4b17f030fdc821ca58218489e3b7fd8381707849 Mon Sep 17 00:00:00 2001 From: Rui Sun Date: Thu, 29 Apr 2021 22:55:38 -0700 Subject: [PATCH 046/175] mm/filemap: update stale comment Commit a6de4b4873e1 ("mm: convert find_get_entry to return the head page") uses @index instead of @offset, but the comment is stale, update it. Link: https://lkml.kernel.org/r/1617948260-50724-1-git-send-email-zhangshaokun@hisilicon.com Signed-off-by: Rui Sun Signed-off-by: Shaokun Zhang Cc: Matthew Wilcox (Oracle) Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/filemap.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mm/filemap.c b/mm/filemap.c index 9620696f880cda..5be57ba01d3399 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -1767,7 +1767,7 @@ EXPORT_SYMBOL(page_cache_prev_miss); * @mapping: the address_space to search * @index: The page cache index. * - * Looks up the page cache slot at @mapping & @offset. If there is a + * Looks up the page cache slot at @mapping & @index. If there is a * page cache page, the head page is returned with an increased refcount. * * If the slot holds a shadow entry of a previously evicted page, or a From f6899bc03cbadc6e308d98252c4a832b5fd45b87 Mon Sep 17 00:00:00 2001 From: Nikita Ermakov Date: Thu, 29 Apr 2021 22:55:41 -0700 Subject: [PATCH 047/175] mm/msync: exit early when the flags is an MS_ASYNC and start < vm_start If an unmapped region was found and the flag is MS_ASYNC (without MS_INVALIDATE) there is nothing to do and the result would be always -ENOMEM, so return immediately. Link: https://lkml.kernel.org/r/20201025092901.56399-1-sh1r4s3@mail.si-head.nl Signed-off-by: Nikita Ermakov Cc: Vlastimil Babka Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/msync.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/mm/msync.c b/mm/msync.c index 69c6d202953187..137d1c104f3e94 100644 --- a/mm/msync.c +++ b/mm/msync.c @@ -55,7 +55,9 @@ SYSCALL_DEFINE3(msync, unsigned long, start, size_t, len, int, flags) goto out; /* * If the interval [start,end) covers some unmapped address ranges, - * just ignore them, but return -ENOMEM at the end. + * just ignore them, but return -ENOMEM at the end. Besides, if the + * flag is MS_ASYNC (w/o MS_INVALIDATE) the result would be -ENOMEM + * anyway and there is nothing left to do, so return immediately. */ mmap_read_lock(mm); vma = find_vma(mm, start); @@ -69,6 +71,8 @@ SYSCALL_DEFINE3(msync, unsigned long, start, size_t, len, int, flags) goto out_unlock; /* Here start < vma->vm_end. */ if (start < vma->vm_start) { + if (flags == MS_ASYNC) + goto out_unlock; start = vma->vm_start; if (start >= end) goto out_unlock; From 8745d7f6346ca107256b3990bd5cd71039818739 Mon Sep 17 00:00:00 2001 From: Joao Martins Date: Thu, 29 Apr 2021 22:55:44 -0700 Subject: [PATCH 048/175] mm/gup: add compound page list iterator Patch series "mm/gup: page unpining improvements", v4. This series improves page unpinning, with an eye on improving MR deregistration for big swaths of memory (which is bound by the page unpining), particularly: 1) Decrement the head page by @ntails and thus reducing a lot the number of atomic operations per compound page. This is done by comparing individual tail pages heads, and counting number of consecutive tails on which they match heads and based on that update head page refcount. Should have a visible improvement in all page (un)pinners which use compound pages 2) Introducing a new API for unpinning page ranges (to avoid the trick in the previous item and be based on math), and use that in RDMA ib_mem_release (used for mr deregistration). Performance improvements: unpin_user_pages() for hugetlbfs and THP improves ~3x (through gup_test) and RDMA MR dereg improves ~4.5x with the new API. See patches 2 and 4 for those. This patch (of 4): Add a helper that iterates over head pages in a list of pages. It essentially counts the tails until the next page to process has a different head that the current. This is going to be used by unpin_user_pages() family of functions, to batch the head page refcount updates once for all passed consecutive tail pages. Link: https://lkml.kernel.org/r/20210212130843.13865-1-joao.m.martins@oracle.com Link: https://lkml.kernel.org/r/20210212130843.13865-2-joao.m.martins@oracle.com Signed-off-by: Joao Martins Suggested-by: Jason Gunthorpe Reviewed-by: John Hubbard Reviewed-by: Jason Gunthorpe Cc: Doug Ledford Cc: Matthew Wilcox Cc: Christoph Hellwig Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/gup.c | 26 ++++++++++++++++++++++++++ 1 file changed, 26 insertions(+) diff --git a/mm/gup.c b/mm/gup.c index ef7d2da9f03ff1..5856f68467b309 100644 --- a/mm/gup.c +++ b/mm/gup.c @@ -213,6 +213,32 @@ void unpin_user_page(struct page *page) } EXPORT_SYMBOL(unpin_user_page); +static inline void compound_next(unsigned long i, unsigned long npages, + struct page **list, struct page **head, + unsigned int *ntails) +{ + struct page *page; + unsigned int nr; + + if (i >= npages) + return; + + page = compound_head(list[i]); + for (nr = i + 1; nr < npages; nr++) { + if (compound_head(list[nr]) != page) + break; + } + + *head = page; + *ntails = nr - i; +} + +#define for_each_compound_head(__i, __list, __npages, __head, __ntails) \ + for (__i = 0, \ + compound_next(__i, __npages, __list, &(__head), &(__ntails)); \ + __i < __npages; __i += __ntails, \ + compound_next(__i, __npages, __list, &(__head), &(__ntails))) + /** * unpin_user_pages_dirty_lock() - release and optionally dirty gup-pinned pages * @pages: array of pages to be maybe marked dirty, and definitely released. From 31b912de1316644040ca9a0fb9b514ffa462c20c Mon Sep 17 00:00:00 2001 From: Joao Martins Date: Thu, 29 Apr 2021 22:55:47 -0700 Subject: [PATCH 049/175] mm/gup: decrement head page once for group of subpages Rather than decrementing the head page refcount one by one, we walk the page array and checking which belong to the same compound_head. Later on we decrement the calculated amount of references in a single write to the head page. To that end switch to for_each_compound_head() does most of the work. set_page_dirty() needs no adjustment as it's a nop for non-dirty head pages and it doesn't operate on tail pages. This considerably improves unpinning of pages with THP and hugetlbfs: - THP gup_test -t -m 16384 -r 10 [-L|-a] -S -n 512 -w PIN_LONGTERM_BENCHMARK (put values): ~87.6k us -> ~23.2k us - 16G with 1G huge page size gup_test -f /mnt/huge/file -m 16384 -r 10 [-L|-a] -S -n 512 -w PIN_LONGTERM_BENCHMARK: (put values): ~87.6k us -> ~27.5k us Link: https://lkml.kernel.org/r/20210212130843.13865-3-joao.m.martins@oracle.com Signed-off-by: Joao Martins Reviewed-by: John Hubbard Reviewed-by: Jason Gunthorpe Cc: Christoph Hellwig Cc: Doug Ledford Cc: Matthew Wilcox Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/gup.c | 29 +++++++++++------------------ 1 file changed, 11 insertions(+), 18 deletions(-) diff --git a/mm/gup.c b/mm/gup.c index 5856f68467b309..de1b75ef44da60 100644 --- a/mm/gup.c +++ b/mm/gup.c @@ -265,20 +265,15 @@ void unpin_user_pages_dirty_lock(struct page **pages, unsigned long npages, bool make_dirty) { unsigned long index; - - /* - * TODO: this can be optimized for huge pages: if a series of pages is - * physically contiguous and part of the same compound page, then a - * single operation to the head page should suffice. - */ + struct page *head; + unsigned int ntails; if (!make_dirty) { unpin_user_pages(pages, npages); return; } - for (index = 0; index < npages; index++) { - struct page *page = compound_head(pages[index]); + for_each_compound_head(index, pages, npages, head, ntails) { /* * Checking PageDirty at this point may race with * clear_page_dirty_for_io(), but that's OK. Two key @@ -299,9 +294,9 @@ void unpin_user_pages_dirty_lock(struct page **pages, unsigned long npages, * written back, so it gets written back again in the * next writeback cycle. This is harmless. */ - if (!PageDirty(page)) - set_page_dirty_lock(page); - unpin_user_page(page); + if (!PageDirty(head)) + set_page_dirty_lock(head); + put_compound_head(head, ntails, FOLL_PIN); } } EXPORT_SYMBOL(unpin_user_pages_dirty_lock); @@ -318,6 +313,8 @@ EXPORT_SYMBOL(unpin_user_pages_dirty_lock); void unpin_user_pages(struct page **pages, unsigned long npages) { unsigned long index; + struct page *head; + unsigned int ntails; /* * If this WARN_ON() fires, then the system *might* be leaking pages (by @@ -326,13 +323,9 @@ void unpin_user_pages(struct page **pages, unsigned long npages) */ if (WARN_ON(IS_ERR_VALUE(npages))) return; - /* - * TODO: this can be optimized for huge pages: if a series of pages is - * physically contiguous and part of the same compound page, then a - * single operation to the head page should suffice. - */ - for (index = 0; index < npages; index++) - unpin_user_page(pages[index]); + + for_each_compound_head(index, pages, npages, head, ntails) + put_compound_head(head, ntails, FOLL_PIN); } EXPORT_SYMBOL(unpin_user_pages); From 458a4f788f8602e5701b3d8c2fb6b021310a7301 Mon Sep 17 00:00:00 2001 From: Joao Martins Date: Thu, 29 Apr 2021 22:55:50 -0700 Subject: [PATCH 050/175] mm/gup: add a range variant of unpin_user_pages_dirty_lock() Add an unpin_user_page_range_dirty_lock() API which takes a starting page and how many consecutive pages we want to unpin and optionally dirty. To that end, define another iterator for_each_compound_range() that operates in page ranges as opposed to page array. For users (like RDMA mr_dereg) where each sg represents a contiguous set of pages, we're able to more efficiently unpin pages without having to supply an array of pages much of what happens today with unpin_user_pages(). Link: https://lkml.kernel.org/r/20210212130843.13865-4-joao.m.martins@oracle.com Suggested-by: Jason Gunthorpe Signed-off-by: Joao Martins Reviewed-by: Jason Gunthorpe Reviewed-by: John Hubbard Cc: Christoph Hellwig Cc: Doug Ledford Cc: Matthew Wilcox Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mm.h | 2 ++ mm/gup.c | 62 ++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 64 insertions(+) diff --git a/include/linux/mm.h b/include/linux/mm.h index 2e5c207e702cb4..702c2a7379d6bc 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -1265,6 +1265,8 @@ static inline void put_page(struct page *page) void unpin_user_page(struct page *page); void unpin_user_pages_dirty_lock(struct page **pages, unsigned long npages, bool make_dirty); +void unpin_user_page_range_dirty_lock(struct page *page, unsigned long npages, + bool make_dirty); void unpin_user_pages(struct page **pages, unsigned long npages); /** diff --git a/mm/gup.c b/mm/gup.c index de1b75ef44da60..66522ae28d0917 100644 --- a/mm/gup.c +++ b/mm/gup.c @@ -213,6 +213,32 @@ void unpin_user_page(struct page *page) } EXPORT_SYMBOL(unpin_user_page); +static inline void compound_range_next(unsigned long i, unsigned long npages, + struct page **list, struct page **head, + unsigned int *ntails) +{ + struct page *next, *page; + unsigned int nr = 1; + + if (i >= npages) + return; + + next = *list + i; + page = compound_head(next); + if (PageCompound(page) && compound_order(page) >= 1) + nr = min_t(unsigned int, + page + compound_nr(page) - next, npages - i); + + *head = page; + *ntails = nr; +} + +#define for_each_compound_range(__i, __list, __npages, __head, __ntails) \ + for (__i = 0, \ + compound_range_next(__i, __npages, __list, &(__head), &(__ntails)); \ + __i < __npages; __i += __ntails, \ + compound_range_next(__i, __npages, __list, &(__head), &(__ntails))) + static inline void compound_next(unsigned long i, unsigned long npages, struct page **list, struct page **head, unsigned int *ntails) @@ -301,6 +327,42 @@ void unpin_user_pages_dirty_lock(struct page **pages, unsigned long npages, } EXPORT_SYMBOL(unpin_user_pages_dirty_lock); +/** + * unpin_user_page_range_dirty_lock() - release and optionally dirty + * gup-pinned page range + * + * @page: the starting page of a range maybe marked dirty, and definitely released. + * @npages: number of consecutive pages to release. + * @make_dirty: whether to mark the pages dirty + * + * "gup-pinned page range" refers to a range of pages that has had one of the + * pin_user_pages() variants called on that page. + * + * For the page ranges defined by [page .. page+npages], make that range (or + * its head pages, if a compound page) dirty, if @make_dirty is true, and if the + * page range was previously listed as clean. + * + * set_page_dirty_lock() is used internally. If instead, set_page_dirty() is + * required, then the caller should a) verify that this is really correct, + * because _lock() is usually required, and b) hand code it: + * set_page_dirty_lock(), unpin_user_page(). + * + */ +void unpin_user_page_range_dirty_lock(struct page *page, unsigned long npages, + bool make_dirty) +{ + unsigned long index; + struct page *head; + unsigned int ntails; + + for_each_compound_range(index, &page, npages, head, ntails) { + if (make_dirty && !PageDirty(head)) + set_page_dirty_lock(head); + put_compound_head(head, ntails, FOLL_PIN); + } +} +EXPORT_SYMBOL(unpin_user_page_range_dirty_lock); + /** * unpin_user_pages() - release an array of gup-pinned pages. * @pages: array of pages to be marked dirty and released. From 1d4b0166e36334c3f32686a336bb25dd904fce2b Mon Sep 17 00:00:00 2001 From: Joao Martins Date: Thu, 29 Apr 2021 22:55:53 -0700 Subject: [PATCH 051/175] RDMA/umem: batch page unpin in __ib_umem_release() Use the newly added unpin_user_page_range_dirty_lock() for more quickly unpinning a consecutive range of pages represented as compound pages. This will also calculate number of pages to unpin (for the tail pages which matching head page) and thus batch the refcount update. Running a test program which calls memory range reg/unreg on a region 1G in size and measures cost of both operations together (in a guest using rxe) with THP and hugetlbfs: Before: 590 rounds in 5.003 sec: 8480.335 usec / round 6898 rounds in 60.001 sec: 8698.367 usec / round After: 2688 rounds in 5.002 sec: 1860.786 usec / round 32517 rounds in 60.001 sec: 1845.225 usec / round Link: https://lkml.kernel.org/r/20210212130843.13865-5-joao.m.martins@oracle.com Signed-off-by: Joao Martins Acked-by: Jason Gunthorpe Cc: Christoph Hellwig Cc: Doug Ledford Cc: John Hubbard Cc: Matthew Wilcox Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/infiniband/core/umem.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/drivers/infiniband/core/umem.c b/drivers/infiniband/core/umem.c index 2dde99a9ba0789..9b607013e2a249 100644 --- a/drivers/infiniband/core/umem.c +++ b/drivers/infiniband/core/umem.c @@ -47,17 +47,17 @@ static void __ib_umem_release(struct ib_device *dev, struct ib_umem *umem, int dirty) { - struct sg_page_iter sg_iter; - struct page *page; + bool make_dirty = umem->writable && dirty; + struct scatterlist *sg; + unsigned int i; if (umem->nmap > 0) ib_dma_unmap_sg(dev, umem->sg_head.sgl, umem->sg_nents, DMA_BIDIRECTIONAL); - for_each_sg_page(umem->sg_head.sgl, &sg_iter, umem->sg_nents, 0) { - page = sg_page_iter_page(&sg_iter); - unpin_user_pages_dirty_lock(&page, 1, umem->writable && dirty); - } + for_each_sg(umem->sg_head.sgl, sg, umem->sg_nents, i) + unpin_user_page_range_dirty_lock(sg_page(sg), + DIV_ROUND_UP(sg->length, PAGE_SIZE), make_dirty); sg_free_table(&umem->sg_head); } From 4066c119483af8e86a75447fd35be1d2553d370f Mon Sep 17 00:00:00 2001 From: Yang Shi Date: Thu, 29 Apr 2021 22:55:56 -0700 Subject: [PATCH 052/175] mm: gup: remove FOLL_SPLIT Since commit 5a52c9df62b4 ("uprobe: use FOLL_SPLIT_PMD instead of FOLL_SPLIT") and commit ba925fa35057 ("s390/gmap: improve THP splitting") FOLL_SPLIT has not been used anymore. Remove the dead code. Link: https://lkml.kernel.org/r/20210330203900.9222-1-shy828301@gmail.com Signed-off-by: Yang Shi Reviewed-by: John Hubbard Reviewed-by: Jason Gunthorpe Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- Documentation/vm/transhuge.rst | 5 ----- include/linux/mm.h | 1 - mm/gup.c | 28 ++-------------------------- 3 files changed, 2 insertions(+), 32 deletions(-) diff --git a/Documentation/vm/transhuge.rst b/Documentation/vm/transhuge.rst index 0ed23e59abe513..216db1d67d04eb 100644 --- a/Documentation/vm/transhuge.rst +++ b/Documentation/vm/transhuge.rst @@ -53,11 +53,6 @@ prevent the page from being split by anyone. of handling GUP on hugetlbfs will also work fine on transparent hugepage backed mappings. -In case you can't handle compound pages if they're returned by -follow_page, the FOLL_SPLIT bit can be specified as a parameter to -follow_page, so that it will split the hugepages before returning -them. - Graceful fallback ================= diff --git a/include/linux/mm.h b/include/linux/mm.h index 702c2a7379d6bc..64be3baf861a98 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -2791,7 +2791,6 @@ struct page *follow_page(struct vm_area_struct *vma, unsigned long address, #define FOLL_NOWAIT 0x20 /* if a disk transfer is needed, start the IO * and return without waiting upon it */ #define FOLL_POPULATE 0x40 /* fault in page */ -#define FOLL_SPLIT 0x80 /* don't return transhuge pages, split them */ #define FOLL_HWPOISON 0x100 /* check page is hwpoisoned */ #define FOLL_NUMA 0x200 /* force NUMA hinting page fault */ #define FOLL_MIGRATION 0x400 /* wait for page to replace migration entry */ diff --git a/mm/gup.c b/mm/gup.c index 66522ae28d0917..71e546e279fcac 100644 --- a/mm/gup.c +++ b/mm/gup.c @@ -516,18 +516,6 @@ static struct page *follow_page_pte(struct vm_area_struct *vma, } } - if (flags & FOLL_SPLIT && PageTransCompound(page)) { - get_page(page); - pte_unmap_unlock(ptep, ptl); - lock_page(page); - ret = split_huge_page(page); - unlock_page(page); - put_page(page); - if (ret) - return ERR_PTR(ret); - goto retry; - } - /* try_grab_page() does nothing unless FOLL_GET or FOLL_PIN is set. */ if (unlikely(!try_grab_page(page, flags))) { page = ERR_PTR(-ENOMEM); @@ -672,7 +660,7 @@ static struct page *follow_pmd_mask(struct vm_area_struct *vma, spin_unlock(ptl); return follow_page_pte(vma, address, pmd, flags, &ctx->pgmap); } - if (flags & (FOLL_SPLIT | FOLL_SPLIT_PMD)) { + if (flags & FOLL_SPLIT_PMD) { int ret; page = pmd_page(*pmd); if (is_huge_zero_page(page)) { @@ -681,19 +669,7 @@ static struct page *follow_pmd_mask(struct vm_area_struct *vma, split_huge_pmd(vma, pmd, address); if (pmd_trans_unstable(pmd)) ret = -EBUSY; - } else if (flags & FOLL_SPLIT) { - if (unlikely(!try_get_page(page))) { - spin_unlock(ptl); - return ERR_PTR(-ENOMEM); - } - spin_unlock(ptl); - lock_page(page); - ret = split_huge_page(page); - unlock_page(page); - put_page(page); - if (pmd_none(*pmd)) - return no_page_table(vma, flags); - } else { /* flags & FOLL_SPLIT_PMD */ + } else { spin_unlock(ptl); split_huge_pmd(vma, pmd, address); ret = pte_alloc(mm, pmd) ? -ENOMEM : 0; From 2840d498e30ce53a3a7cb482a5445efd892c7697 Mon Sep 17 00:00:00 2001 From: Zhiyuan Dai Date: Thu, 29 Apr 2021 22:55:59 -0700 Subject: [PATCH 053/175] mm/memremap.c: fix improper SPDX comment style Replace /* */ comment with //, fix SPDX comment style. see: Documentation/process/license-rules.rst Link: https://lkml.kernel.org/r/1614223348-15516-1-git-send-email-daizhiyuan@phytium.com.cn Signed-off-by: Zhiyuan Dai Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memremap.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mm/memremap.c b/mm/memremap.c index 7aa7d6e80ee508..15a074ffb8d73d 100644 --- a/mm/memremap.c +++ b/mm/memremap.c @@ -1,4 +1,4 @@ -/* SPDX-License-Identifier: GPL-2.0 */ +// SPDX-License-Identifier: GPL-2.0 /* Copyright(c) 2015 Intel Corporation. All rights reserved. */ #include #include From 27faca83a7e955e4e0b831d75a8a9a840fe9bae4 Mon Sep 17 00:00:00 2001 From: Muchun Song Date: Thu, 29 Apr 2021 22:56:02 -0700 Subject: [PATCH 054/175] mm: memcontrol: fix kernel stack account For simplification commit 991e7673859e ("mm: memcontrol: account kernel stack per node") changed the per zone vmalloc backed stack pages accounting to per node. By doing that we have lost a certain precision because those pages might live in different NUMA nodes. In the end NR_KERNEL_STACK_KB exported to the userspace might be over estimated on some nodes while underestimated on others. But this is not a real world problem, just a problem found by reading the code. So there is no actual data to showing how much impact it has on users. This doesn't impose any real problem to correctnes of the kernel behavior as the counter is not used for any internal processing but it can cause some confusion to the userspace. Address the problem by accounting each vmalloc backing page to its own node. Link: https://lkml.kernel.org/r/20210303151843.81156-1-songmuchun@bytedance.com Signed-off-by: Muchun Song Reviewed-by: Shakeel Butt Acked-by: Michal Hocko Acked-by: Johannes Weiner Acked-by: Roman Gushchin Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/fork.c | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) diff --git a/kernel/fork.c b/kernel/fork.c index 0a5d28fe99904b..771e0ea90499d0 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -380,14 +380,17 @@ static void account_kernel_stack(struct task_struct *tsk, int account) void *stack = task_stack_page(tsk); struct vm_struct *vm = task_stack_vm_area(tsk); + if (vm) { + int i; - /* All stack pages are in the same node. */ - if (vm) - mod_lruvec_page_state(vm->pages[0], NR_KERNEL_STACK_KB, - account * (THREAD_SIZE / 1024)); - else + for (i = 0; i < THREAD_SIZE / PAGE_SIZE; i++) + mod_lruvec_page_state(vm->pages[i], NR_KERNEL_STACK_KB, + account * (PAGE_SIZE / 1024)); + } else { + /* All stack pages are in the same node. */ mod_lruvec_kmem_state(stack, NR_KERNEL_STACK_KB, account * (THREAD_SIZE / 1024)); + } } static int memcg_charge_kernel_stack(struct task_struct *tsk) From a47920306c72acaa6ab935c174476ec1d2c7284d Mon Sep 17 00:00:00 2001 From: Shakeel Butt Date: Thu, 29 Apr 2021 22:56:05 -0700 Subject: [PATCH 055/175] memcg: cleanup root memcg checks Replace the implicit checking of root memcg with explicit root memcg checking i.e. !css->parent with mem_cgroup_is_root(). Link: https://lkml.kernel.org/r/20210223205625.2792891-1-shakeelb@google.com Signed-off-by: Shakeel Butt Acked-by: Michal Hocko Acked-by: Johannes Weiner Cc: Roman Gushchin Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memcontrol.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 06caac775abb3d..c5ae6a878b8e33 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -4123,7 +4123,7 @@ static int mem_cgroup_swappiness_write(struct cgroup_subsys_state *css, if (val > 100) return -EINVAL; - if (css->parent) + if (!mem_cgroup_is_root(memcg)) memcg->swappiness = val; else vm_swappiness = val; @@ -4473,7 +4473,7 @@ static int mem_cgroup_oom_control_write(struct cgroup_subsys_state *css, struct mem_cgroup *memcg = mem_cgroup_from_css(css); /* cannot set to root cgroup and only 0 and 1 are allowed */ - if (!css->parent || !((val == 0) || (val == 1))) + if (mem_cgroup_is_root(memcg) || !((val == 0) || (val == 1))) return -EINVAL; memcg->oom_kill_disable = val; From 3d0cbb9816935ea3846eb2c0d3c07cd31697267e Mon Sep 17 00:00:00 2001 From: Shakeel Butt Date: Thu, 29 Apr 2021 22:56:08 -0700 Subject: [PATCH 056/175] memcg: enable memcg oom-kill for __GFP_NOFAIL In the era of async memcg oom-killer, the commit a0d8b00a3381 ("mm: memcg: do not declare OOM from __GFP_NOFAIL allocations") added the code to skip memcg oom-killer for __GFP_NOFAIL allocations. The reason was that the __GFP_NOFAIL callers will not enter aync oom synchronization path and will keep the task marked as in memcg oom. At that time the tasks marked in memcg oom can bypass the memcg limits and the oom synchronization would have happened later in the later userspace triggered page fault. Thus letting the task marked as under memcg oom bypass the memcg limit for arbitrary time. With the synchronous memcg oom-killer (commit 29ef680ae7c21 ("memcg, oom: move out_of_memory back to the charge path")) and not letting the task marked under memcg oom to bypass the memcg limits (commit 1f14c1ac19aa4 ("mm: memcg: do not allow task about to OOM kill to bypass the limit")), we can again allow __GFP_NOFAIL allocations to trigger memcg oom-kill. This will make memcg oom behavior closer to page allocator oom behavior. Link: https://lkml.kernel.org/r/20210223204337.2785120-1-shakeelb@google.com Signed-off-by: Shakeel Butt Acked-by: Michal Hocko Acked-by: Johannes Weiner Acked-by: David Rientjes Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memcontrol.c | 3 --- 1 file changed, 3 deletions(-) diff --git a/mm/memcontrol.c b/mm/memcontrol.c index c5ae6a878b8e33..e01602134759a8 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -2779,9 +2779,6 @@ static int try_charge(struct mem_cgroup *memcg, gfp_t gfp_mask, if (gfp_mask & __GFP_RETRY_MAYFAIL) goto nomem; - if (gfp_mask & __GFP_NOFAIL) - goto force; - if (fatal_signal_pending(current)) goto force; From a3d4c05a447486b90298a8c964916c8f4fcb903f Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Thu, 29 Apr 2021 22:56:11 -0700 Subject: [PATCH 057/175] mm: memcontrol: fix cpuhotplug statistics flushing MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Patch series "mm: memcontrol: switch to rstat", v3. This series converts memcg stats tracking to the streamlined rstat infrastructure provided by the cgroup core code. rstat is already used by the CPU controller and the IO controller. This change is motivated by recent accuracy problems in memcg's custom stats code, as well as the benefits of sharing common infra with other controllers. The current memcg implementation does batched tree aggregation on the write side: local stat changes are cached in per-cpu counters, which are then propagated upward in batches when a threshold (32 pages) is exceeded. This is cheap, but the error introduced by the lazy upward propagation adds up: 32 pages times CPUs times cgroups in the subtree. We've had complaints from service owners that the stats do not reliably track and react to allocation behavior as expected, sometimes swallowing the results of entire test applications. The original memcg stat implementation used to do tree aggregation exclusively on the read side: local stats would only ever be tracked in per-cpu counters, and a memory.stat read would iterate the entire subtree and sum those counters up. This didn't keep up with the times: - Cgroup trees are much bigger now. We switched to lazily-freed cgroups, where deleted groups would hang around until their remaining page cache has been reclaimed. This can result in large subtrees that are expensive to walk, while most of the groups are idle and their statistics don't change much anymore. - Automated monitoring increased. With the proliferation of userspace oom killing, proactive reclaim, and higher-resolution logging of workload trends in general, top-level stat files are polled at least once a second in many deployments. - The lifetime of cgroups got shorter. Where most cgroup setups in the past would have a few large policy-oriented cgroups for everything running on the system, newer cgroup deployments tend to create one group per application - which gets deleted again as the processes exit. An aggregation scheme that doesn't retain child data inside the parents loses event history of the subtree. Rstat addresses all three of those concerns through intelligent, persistent read-side aggregation. As statistics change at the local level, rstat tracks - on a per-cpu basis - only those parts of a subtree that have changes pending and require aggregation. The actual aggregation occurs on the colder read side - which can now skip over (potentially large) numbers of recently idle cgroups. === The test_kmem cgroup selftest is currently failing due to excessive cumulative vmstat drift from 100 subgroups: ok 1 test_kmem_basic memory.current = 8810496 slab + anon + file + kernel_stack = 17074568 slab = 6101384 anon = 946176 file = 0 kernel_stack = 10027008 not ok 2 test_kmem_memcg_deletion ok 3 test_kmem_proc_kpagecgroup ok 4 test_kmem_kernel_stacks ok 5 test_kmem_dead_cgroups ok 6 test_percpu_basic As you can see, memory.stat items far exceed memory.current. The kernel stack alone is bigger than all of charged memory. That's because the memory of the test has been uncharged from memory.current, but the negative vmstat deltas are still sitting in the percpu caches. The test at this time isn't even counting percpu, pagetables etc. yet, which would further contribute to the error. The last patch in the series updates the test to include them - as well as reduces the vmstat tolerances in general to only expect page_counter batching. With all patches applied, the (now more stringent) test succeeds: ok 1 test_kmem_basic ok 2 test_kmem_memcg_deletion ok 3 test_kmem_proc_kpagecgroup ok 4 test_kmem_kernel_stacks ok 5 test_kmem_dead_cgroups ok 6 test_percpu_basic === A kernel build test confirms that overhead is comparable. Two kernels are built simultaneously in a nested tree with several idle siblings: root - kernelbuild - one - two - three - four - build-a (defconfig, make -j16) `- build-b (defconfig, make -j16) `- idle-1 `- ... `- idle-9 During the builds, kernelbuild/memory.stat is read once a second. A perf diff shows that the changes in cycle distribution is minimal. Top 10 kernel symbols: 0.09% +0.08% [kernel.kallsyms] [k] __mod_memcg_lruvec_state 0.00% +0.06% [kernel.kallsyms] [k] cgroup_rstat_updated 0.08% -0.05% [kernel.kallsyms] [k] __mod_memcg_state.part.0 0.16% -0.04% [kernel.kallsyms] [k] release_pages 0.00% +0.03% [kernel.kallsyms] [k] __count_memcg_events 0.01% +0.03% [kernel.kallsyms] [k] mem_cgroup_charge_statistics.constprop.0 0.10% -0.02% [kernel.kallsyms] [k] get_mem_cgroup_from_mm 0.05% -0.02% [kernel.kallsyms] [k] mem_cgroup_update_lru_size 0.57% +0.01% [kernel.kallsyms] [k] asm_exc_page_fault === The on-demand aggregated stats are now fully accurate: $ grep -e nr_inactive_file /proc/vmstat | awk '{print($1,$2*4096)}'; \ grep -e inactive_file /sys/fs/cgroup/memory.stat vanilla: patched: nr_inactive_file 1574105088 nr_inactive_file 1027801088 inactive_file 1577410560 inactive_file 1027801088 === This patch (of 8): The memcg hotunplug callback erroneously flushes counts on the local CPU, not the counts of the CPU going away; those counts will be lost. Flush the CPU that is actually going away. Also simplify the code a bit by using mod_memcg_state() and count_memcg_events() instead of open-coding the upward flush - this is comparable to how vmstat.c handles hotunplug flushing. Link: https://lkml.kernel.org/r/20210209163304.77088-1-hannes@cmpxchg.org Link: https://lkml.kernel.org/r/20210209163304.77088-2-hannes@cmpxchg.org Fixes: a983b5ebee572 ("mm: memcontrol: fix excessive complexity in memory.stat reporting") Signed-off-by: Johannes Weiner Reviewed-by: Shakeel Butt Reviewed-by: Roman Gushchin Reviewed-by: Michal Koutný Acked-by: Michal Hocko Cc: Tejun Heo Cc: Roman Gushchin Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memcontrol.c | 35 +++++++++++++++++++++-------------- 1 file changed, 21 insertions(+), 14 deletions(-) diff --git a/mm/memcontrol.c b/mm/memcontrol.c index e01602134759a8..5be0e8834a5a31 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -2370,45 +2370,52 @@ static void drain_all_stock(struct mem_cgroup *root_memcg) static int memcg_hotplug_cpu_dead(unsigned int cpu) { struct memcg_stock_pcp *stock; - struct mem_cgroup *memcg, *mi; + struct mem_cgroup *memcg; stock = &per_cpu(memcg_stock, cpu); drain_stock(stock); for_each_mem_cgroup(memcg) { + struct memcg_vmstats_percpu *statc; int i; + statc = per_cpu_ptr(memcg->vmstats_percpu, cpu); + for (i = 0; i < MEMCG_NR_STAT; i++) { int nid; - long x; - x = this_cpu_xchg(memcg->vmstats_percpu->stat[i], 0); - if (x) - for (mi = memcg; mi; mi = parent_mem_cgroup(mi)) - atomic_long_add(x, &memcg->vmstats[i]); + if (statc->stat[i]) { + mod_memcg_state(memcg, i, statc->stat[i]); + statc->stat[i] = 0; + } if (i >= NR_VM_NODE_STAT_ITEMS) continue; for_each_node(nid) { + struct batched_lruvec_stat *lstatc; struct mem_cgroup_per_node *pn; + long x; pn = mem_cgroup_nodeinfo(memcg, nid); - x = this_cpu_xchg(pn->lruvec_stat_cpu->count[i], 0); - if (x) + lstatc = per_cpu_ptr(pn->lruvec_stat_cpu, cpu); + + x = lstatc->count[i]; + lstatc->count[i] = 0; + + if (x) { do { atomic_long_add(x, &pn->lruvec_stat[i]); } while ((pn = parent_nodeinfo(pn, nid))); + } } } for (i = 0; i < NR_VM_EVENT_ITEMS; i++) { - long x; - - x = this_cpu_xchg(memcg->vmstats_percpu->events[i], 0); - if (x) - for (mi = memcg; mi; mi = parent_mem_cgroup(mi)) - atomic_long_add(x, &memcg->vmevents[i]); + if (statc->events[i]) { + count_memcg_events(memcg, i, statc->events[i]); + statc->events[i] = 0; + } } } From a3747b53b1771a787fea71d86a2fc39aea337685 Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Thu, 29 Apr 2021 22:56:14 -0700 Subject: [PATCH 058/175] mm: memcontrol: kill mem_cgroup_nodeinfo() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit No need to encapsulate a simple struct member access. Link: https://lkml.kernel.org/r/20210209163304.77088-3-hannes@cmpxchg.org Signed-off-by: Johannes Weiner Reviewed-by: Shakeel Butt Reviewed-by: Roman Gushchin Acked-by: Michal Hocko Reviewed-by: Michal Koutný Cc: Tejun Heo Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/memcontrol.h | 8 +------- mm/memcontrol.c | 21 +++++++++++---------- 2 files changed, 12 insertions(+), 17 deletions(-) diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index ae448d955a87cf..149dc2fd97a6a3 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -602,12 +602,6 @@ void mem_cgroup_uncharge_list(struct list_head *page_list); void mem_cgroup_migrate(struct page *oldpage, struct page *newpage); -static struct mem_cgroup_per_node * -mem_cgroup_nodeinfo(struct mem_cgroup *memcg, int nid) -{ - return memcg->nodeinfo[nid]; -} - /** * mem_cgroup_lruvec - get the lru list vector for a memcg & node * @memcg: memcg of the wanted lruvec @@ -631,7 +625,7 @@ static inline struct lruvec *mem_cgroup_lruvec(struct mem_cgroup *memcg, if (!memcg) memcg = root_mem_cgroup; - mz = mem_cgroup_nodeinfo(memcg, pgdat->node_id); + mz = memcg->nodeinfo[pgdat->node_id]; lruvec = &mz->lruvec; out: /* diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 5be0e8834a5a31..b4a73c41312ed0 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -414,13 +414,14 @@ static int memcg_expand_one_shrinker_map(struct mem_cgroup *memcg, int size, int old_size) { struct memcg_shrinker_map *new, *old; + struct mem_cgroup_per_node *pn; int nid; lockdep_assert_held(&memcg_shrinker_map_mutex); for_each_node(nid) { - old = rcu_dereference_protected( - mem_cgroup_nodeinfo(memcg, nid)->shrinker_map, true); + pn = memcg->nodeinfo[nid]; + old = rcu_dereference_protected(pn->shrinker_map, true); /* Not yet online memcg */ if (!old) return 0; @@ -433,7 +434,7 @@ static int memcg_expand_one_shrinker_map(struct mem_cgroup *memcg, memset(new->map, (int)0xff, old_size); memset((void *)new->map + old_size, 0, size - old_size); - rcu_assign_pointer(memcg->nodeinfo[nid]->shrinker_map, new); + rcu_assign_pointer(pn->shrinker_map, new); call_rcu(&old->rcu, memcg_free_shrinker_map_rcu); } @@ -450,7 +451,7 @@ static void memcg_free_shrinker_maps(struct mem_cgroup *memcg) return; for_each_node(nid) { - pn = mem_cgroup_nodeinfo(memcg, nid); + pn = memcg->nodeinfo[nid]; map = rcu_dereference_protected(pn->shrinker_map, true); kvfree(map); rcu_assign_pointer(pn->shrinker_map, NULL); @@ -713,7 +714,7 @@ static void mem_cgroup_remove_from_trees(struct mem_cgroup *memcg) int nid; for_each_node(nid) { - mz = mem_cgroup_nodeinfo(memcg, nid); + mz = memcg->nodeinfo[nid]; mctz = soft_limit_tree_node(nid); if (mctz) mem_cgroup_remove_exceeded(mz, mctz); @@ -796,7 +797,7 @@ parent_nodeinfo(struct mem_cgroup_per_node *pn, int nid) parent = parent_mem_cgroup(pn->memcg); if (!parent) return NULL; - return mem_cgroup_nodeinfo(parent, nid); + return parent->nodeinfo[nid]; } void __mod_memcg_lruvec_state(struct lruvec *lruvec, enum node_stat_item idx, @@ -1136,7 +1137,7 @@ struct mem_cgroup *mem_cgroup_iter(struct mem_cgroup *root, if (reclaim) { struct mem_cgroup_per_node *mz; - mz = mem_cgroup_nodeinfo(root, reclaim->pgdat->node_id); + mz = root->nodeinfo[reclaim->pgdat->node_id]; iter = &mz->iter; if (prev && reclaim->generation != iter->generation) @@ -1238,7 +1239,7 @@ static void __invalidate_reclaim_iterators(struct mem_cgroup *from, int nid; for_each_node(nid) { - mz = mem_cgroup_nodeinfo(from, nid); + mz = from->nodeinfo[nid]; iter = &mz->iter; cmpxchg(&iter->position, dead_memcg, NULL); } @@ -2397,7 +2398,7 @@ static int memcg_hotplug_cpu_dead(unsigned int cpu) struct mem_cgroup_per_node *pn; long x; - pn = mem_cgroup_nodeinfo(memcg, nid); + pn = memcg->nodeinfo[nid]; lstatc = per_cpu_ptr(pn->lruvec_stat_cpu, cpu); x = lstatc->count[i]; @@ -4098,7 +4099,7 @@ static int memcg_stat_show(struct seq_file *m, void *v) unsigned long file_cost = 0; for_each_online_pgdat(pgdat) { - mz = mem_cgroup_nodeinfo(memcg, pgdat->node_id); + mz = memcg->nodeinfo[pgdat->node_id]; anon_cost += mz->lruvec.anon_cost; file_cost += mz->lruvec.file_cost; From a18e6e6e150a98b9ce3e9acabeff407e7b6ba0c0 Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Thu, 29 Apr 2021 22:56:17 -0700 Subject: [PATCH 059/175] mm: memcontrol: privatize memcg_page_state query functions MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit There are no users outside of the memory controller itself. The rest of the kernel cares either about node or lruvec stats. Link: https://lkml.kernel.org/r/20210209163304.77088-4-hannes@cmpxchg.org Signed-off-by: Johannes Weiner Reviewed-by: Shakeel Butt Reviewed-by: Roman Gushchin Acked-by: Michal Hocko Reviewed-by: Michal Koutný Cc: Tejun Heo Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/memcontrol.h | 44 -------------------------------------- mm/memcontrol.c | 32 +++++++++++++++++++++++++++ 2 files changed, 32 insertions(+), 44 deletions(-) diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index 149dc2fd97a6a3..9a02aabd0000ef 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -864,39 +864,6 @@ extern bool cgroup_memory_noswap; void lock_page_memcg(struct page *page); void unlock_page_memcg(struct page *page); -/* - * idx can be of type enum memcg_stat_item or node_stat_item. - * Keep in sync with memcg_exact_page_state(). - */ -static inline unsigned long memcg_page_state(struct mem_cgroup *memcg, int idx) -{ - long x = atomic_long_read(&memcg->vmstats[idx]); -#ifdef CONFIG_SMP - if (x < 0) - x = 0; -#endif - return x; -} - -/* - * idx can be of type enum memcg_stat_item or node_stat_item. - * Keep in sync with memcg_exact_page_state(). - */ -static inline unsigned long memcg_page_state_local(struct mem_cgroup *memcg, - int idx) -{ - long x = 0; - int cpu; - - for_each_possible_cpu(cpu) - x += per_cpu(memcg->vmstats_local->stat[idx], cpu); -#ifdef CONFIG_SMP - if (x < 0) - x = 0; -#endif - return x; -} - void __mod_memcg_state(struct mem_cgroup *memcg, int idx, int val); /* idx can be of type enum memcg_stat_item or node_stat_item */ @@ -1322,17 +1289,6 @@ static inline void mem_cgroup_print_oom_group(struct mem_cgroup *memcg) { } -static inline unsigned long memcg_page_state(struct mem_cgroup *memcg, int idx) -{ - return 0; -} - -static inline unsigned long memcg_page_state_local(struct mem_cgroup *memcg, - int idx) -{ - return 0; -} - static inline void __mod_memcg_state(struct mem_cgroup *memcg, int idx, int nr) diff --git a/mm/memcontrol.c b/mm/memcontrol.c index b4a73c41312ed0..56d1c6e58c3b3b 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -789,6 +789,38 @@ void __mod_memcg_state(struct mem_cgroup *memcg, int idx, int val) __this_cpu_write(memcg->vmstats_percpu->stat[idx], x); } +/* + * idx can be of type enum memcg_stat_item or node_stat_item. + * Keep in sync with memcg_exact_page_state(). + */ +static unsigned long memcg_page_state(struct mem_cgroup *memcg, int idx) +{ + long x = atomic_long_read(&memcg->vmstats[idx]); +#ifdef CONFIG_SMP + if (x < 0) + x = 0; +#endif + return x; +} + +/* + * idx can be of type enum memcg_stat_item or node_stat_item. + * Keep in sync with memcg_exact_page_state(). + */ +static unsigned long memcg_page_state_local(struct mem_cgroup *memcg, int idx) +{ + long x = 0; + int cpu; + + for_each_possible_cpu(cpu) + x += per_cpu(memcg->vmstats_local->stat[idx], cpu); +#ifdef CONFIG_SMP + if (x < 0) + x = 0; +#endif + return x; +} + static struct mem_cgroup_per_node * parent_nodeinfo(struct mem_cgroup_per_node *pn, int nid) { From a7df69b81aac5bdeb5c5aef9addd680ce22feebf Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Thu, 29 Apr 2021 22:56:20 -0700 Subject: [PATCH 060/175] cgroup: rstat: support cgroup1 MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Rstat currently only supports the default hierarchy in cgroup2. In order to replace memcg's private stats infrastructure - used in both cgroup1 and cgroup2 - with rstat, the latter needs to support cgroup1. The initialization and destruction callbacks for regular cgroups are already in place. Remove the cgroup_on_dfl() guards to handle cgroup1. The initialization of the root cgroup is currently hardcoded to only handle cgrp_dfl_root.cgrp. Move those callbacks to cgroup_setup_root() and cgroup_destroy_root() to handle the default root as well as the various cgroup1 roots we may set up during mounting. The linking of css to cgroups happens in code shared between cgroup1 and cgroup2 as well. Simply remove the cgroup_on_dfl() guard. Linkage of the root css to the root cgroup is a bit trickier: per default, the root css of a subsystem controller belongs to the default hierarchy (i.e. the cgroup2 root). When a controller is mounted in its cgroup1 version, the root css is stolen and moved to the cgroup1 root; on unmount, the css moves back to the default hierarchy. Annotate rebind_subsystems() to move the root css linkage along between roots. Link: https://lkml.kernel.org/r/20210209163304.77088-5-hannes@cmpxchg.org Signed-off-by: Johannes Weiner Reviewed-by: Roman Gushchin Reviewed-by: Shakeel Butt Acked-by: Tejun Heo Reviewed-by: Michal Koutný Cc: Michal Hocko Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/cgroup/cgroup.c | 34 +++++++++++++++++++++------------- kernel/cgroup/rstat.c | 2 -- 2 files changed, 21 insertions(+), 15 deletions(-) diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c index 9153b20e5cc656..e049edd6677609 100644 --- a/kernel/cgroup/cgroup.c +++ b/kernel/cgroup/cgroup.c @@ -1339,6 +1339,7 @@ static void cgroup_destroy_root(struct cgroup_root *root) mutex_unlock(&cgroup_mutex); + cgroup_rstat_exit(cgrp); kernfs_destroy_root(root->kf_root); cgroup_free_root(root); } @@ -1751,6 +1752,12 @@ int rebind_subsystems(struct cgroup_root *dst_root, u16 ss_mask) &dcgrp->e_csets[ss->id]); spin_unlock_irq(&css_set_lock); + if (ss->css_rstat_flush) { + list_del_rcu(&css->rstat_css_node); + list_add_rcu(&css->rstat_css_node, + &dcgrp->rstat_css_list); + } + /* default hierarchy doesn't enable controllers by default */ dst_root->subsys_mask |= 1 << ssid; if (dst_root == &cgrp_dfl_root) { @@ -1971,10 +1978,14 @@ int cgroup_setup_root(struct cgroup_root *root, u16 ss_mask) if (ret) goto destroy_root; - ret = rebind_subsystems(root, ss_mask); + ret = cgroup_rstat_init(root_cgrp); if (ret) goto destroy_root; + ret = rebind_subsystems(root, ss_mask); + if (ret) + goto exit_stats; + ret = cgroup_bpf_inherit(root_cgrp); WARN_ON_ONCE(ret); @@ -2006,6 +2017,8 @@ int cgroup_setup_root(struct cgroup_root *root, u16 ss_mask) ret = 0; goto out; +exit_stats: + cgroup_rstat_exit(root_cgrp); destroy_root: kernfs_destroy_root(root->kf_root); root->kf_root = NULL; @@ -4934,8 +4947,7 @@ static void css_free_rwork_fn(struct work_struct *work) cgroup_put(cgroup_parent(cgrp)); kernfs_put(cgrp->kn); psi_cgroup_free(cgrp); - if (cgroup_on_dfl(cgrp)) - cgroup_rstat_exit(cgrp); + cgroup_rstat_exit(cgrp); kfree(cgrp); } else { /* @@ -4976,8 +4988,7 @@ static void css_release_work_fn(struct work_struct *work) /* cgroup release path */ TRACE_CGROUP_PATH(release, cgrp); - if (cgroup_on_dfl(cgrp)) - cgroup_rstat_flush(cgrp); + cgroup_rstat_flush(cgrp); spin_lock_irq(&css_set_lock); for (tcgrp = cgroup_parent(cgrp); tcgrp; @@ -5034,7 +5045,7 @@ static void init_and_link_css(struct cgroup_subsys_state *css, css_get(css->parent); } - if (cgroup_on_dfl(cgrp) && ss->css_rstat_flush) + if (ss->css_rstat_flush) list_add_rcu(&css->rstat_css_node, &cgrp->rstat_css_list); BUG_ON(cgroup_css(cgrp, ss)); @@ -5159,11 +5170,9 @@ static struct cgroup *cgroup_create(struct cgroup *parent, const char *name, if (ret) goto out_free_cgrp; - if (cgroup_on_dfl(parent)) { - ret = cgroup_rstat_init(cgrp); - if (ret) - goto out_cancel_ref; - } + ret = cgroup_rstat_init(cgrp); + if (ret) + goto out_cancel_ref; /* create the directory */ kn = kernfs_create_dir(parent->kn, name, mode, cgrp); @@ -5250,8 +5259,7 @@ static struct cgroup *cgroup_create(struct cgroup *parent, const char *name, out_kernfs_remove: kernfs_remove(cgrp->kn); out_stat_exit: - if (cgroup_on_dfl(parent)) - cgroup_rstat_exit(cgrp); + cgroup_rstat_exit(cgrp); out_cancel_ref: percpu_ref_exit(&cgrp->self.refcnt); out_free_cgrp: diff --git a/kernel/cgroup/rstat.c b/kernel/cgroup/rstat.c index d51175cedfca4f..faa767a870baf2 100644 --- a/kernel/cgroup/rstat.c +++ b/kernel/cgroup/rstat.c @@ -285,8 +285,6 @@ void __init cgroup_rstat_boot(void) for_each_possible_cpu(cpu) raw_spin_lock_init(per_cpu_ptr(&cgroup_rstat_cpu_lock, cpu)); - - BUG_ON(cgroup_rstat_init(&cgrp_dfl_root.cgrp)); } /* From dc26532aed0ab25c0801a34640d1f3b9b9098a48 Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Thu, 29 Apr 2021 22:56:23 -0700 Subject: [PATCH 061/175] cgroup: rstat: punt root-level optimization to individual controllers MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Current users of the rstat code can source root-level statistics from the native counters of their respective subsystem, allowing them to forego aggregation at the root level. This optimization is currently implemented inside the generic rstat code, which doesn't track the root cgroup and doesn't invoke the subsystem flush callbacks on it. However, the memory controller cannot do this optimization, because cgroup1 breaks out memory specifically for the local level, including at the root level. In preparation for the memory controller switching to rstat, move the optimization from rstat core to the controllers. Afterwards, rstat will always track the root cgroup for changes and invoke the subsystem callbacks on it; and it's up to the subsystem to special-case and skip aggregation of the root cgroup if it can source this information through other, cheaper means. This is the case for the io controller and the cgroup base stats. In their respective flush callbacks, check whether the parent is the root cgroup, and if so, skip the unnecessary upward propagation. The extra cost of tracking the root cgroup is negligible: on stat changes, we actually remove a branch that checks for the root. The queueing for a flush touches only per-cpu data, and only the first stat change since a flush requires a (per-cpu) lock. Link: https://lkml.kernel.org/r/20210209163304.77088-6-hannes@cmpxchg.org Signed-off-by: Johannes Weiner Acked-by: Tejun Heo Cc: Michal Hocko Cc: Michal Koutný Cc: Roman Gushchin Cc: Shakeel Butt Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- block/blk-cgroup.c | 17 +++++++----- kernel/cgroup/rstat.c | 61 +++++++++++++++++++++++++------------------ 2 files changed, 47 insertions(+), 31 deletions(-) diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c index a317c03d40f6e1..582d2f18717ee3 100644 --- a/block/blk-cgroup.c +++ b/block/blk-cgroup.c @@ -764,6 +764,10 @@ static void blkcg_rstat_flush(struct cgroup_subsys_state *css, int cpu) struct blkcg *blkcg = css_to_blkcg(css); struct blkcg_gq *blkg; + /* Root-level stats are sourced from system-wide IO stats */ + if (!cgroup_parent(css->cgroup)) + return; + rcu_read_lock(); hlist_for_each_entry_rcu(blkg, &blkcg->blkg_list, blkcg_node) { @@ -786,8 +790,8 @@ static void blkcg_rstat_flush(struct cgroup_subsys_state *css, int cpu) blkg_iostat_add(&bisc->last, &delta); u64_stats_update_end(&blkg->iostat.sync); - /* propagate global delta to parent */ - if (parent) { + /* propagate global delta to parent (unless that's root) */ + if (parent && parent->parent) { u64_stats_update_begin(&parent->iostat.sync); blkg_iostat_set(&delta, &blkg->iostat.cur); blkg_iostat_sub(&delta, &blkg->iostat.last); @@ -801,10 +805,11 @@ static void blkcg_rstat_flush(struct cgroup_subsys_state *css, int cpu) } /* - * The rstat algorithms intentionally don't handle the root cgroup to avoid - * incurring overhead when no cgroups are defined. For that reason, - * cgroup_rstat_flush in blkcg_print_stat does not actually fill out the - * iostat in the root cgroup's blkcg_gq. + * We source root cgroup stats from the system-wide stats to avoid + * tracking the same information twice and incurring overhead when no + * cgroups are defined. For that reason, cgroup_rstat_flush in + * blkcg_print_stat does not actually fill out the iostat in the root + * cgroup's blkcg_gq. * * However, we would like to re-use the printing code between the root and * non-root cgroups to the extent possible. For that reason, we simulate diff --git a/kernel/cgroup/rstat.c b/kernel/cgroup/rstat.c index faa767a870baf2..3a3fd2993a6500 100644 --- a/kernel/cgroup/rstat.c +++ b/kernel/cgroup/rstat.c @@ -25,13 +25,8 @@ static struct cgroup_rstat_cpu *cgroup_rstat_cpu(struct cgroup *cgrp, int cpu) void cgroup_rstat_updated(struct cgroup *cgrp, int cpu) { raw_spinlock_t *cpu_lock = per_cpu_ptr(&cgroup_rstat_cpu_lock, cpu); - struct cgroup *parent; unsigned long flags; - /* nothing to do for root */ - if (!cgroup_parent(cgrp)) - return; - /* * Speculative already-on-list test. This may race leading to * temporary inaccuracies, which is fine. @@ -46,10 +41,10 @@ void cgroup_rstat_updated(struct cgroup *cgrp, int cpu) raw_spin_lock_irqsave(cpu_lock, flags); /* put @cgrp and all ancestors on the corresponding updated lists */ - for (parent = cgroup_parent(cgrp); parent; - cgrp = parent, parent = cgroup_parent(cgrp)) { + while (true) { struct cgroup_rstat_cpu *rstatc = cgroup_rstat_cpu(cgrp, cpu); - struct cgroup_rstat_cpu *prstatc = cgroup_rstat_cpu(parent, cpu); + struct cgroup *parent = cgroup_parent(cgrp); + struct cgroup_rstat_cpu *prstatc; /* * Both additions and removals are bottom-up. If a cgroup @@ -58,8 +53,17 @@ void cgroup_rstat_updated(struct cgroup *cgrp, int cpu) if (rstatc->updated_next) break; + /* Root has no parent to link it to, but mark it busy */ + if (!parent) { + rstatc->updated_next = cgrp; + break; + } + + prstatc = cgroup_rstat_cpu(parent, cpu); rstatc->updated_next = prstatc->updated_children; prstatc->updated_children = cgrp; + + cgrp = parent; } raw_spin_unlock_irqrestore(cpu_lock, flags); @@ -113,23 +117,26 @@ static struct cgroup *cgroup_rstat_cpu_pop_updated(struct cgroup *pos, */ if (rstatc->updated_next) { struct cgroup *parent = cgroup_parent(pos); - struct cgroup_rstat_cpu *prstatc = cgroup_rstat_cpu(parent, cpu); - struct cgroup_rstat_cpu *nrstatc; - struct cgroup **nextp; - - nextp = &prstatc->updated_children; - while (true) { - nrstatc = cgroup_rstat_cpu(*nextp, cpu); - if (*nextp == pos) - break; - - WARN_ON_ONCE(*nextp == parent); - nextp = &nrstatc->updated_next; + + if (parent) { + struct cgroup_rstat_cpu *prstatc; + struct cgroup **nextp; + + prstatc = cgroup_rstat_cpu(parent, cpu); + nextp = &prstatc->updated_children; + while (true) { + struct cgroup_rstat_cpu *nrstatc; + + nrstatc = cgroup_rstat_cpu(*nextp, cpu); + if (*nextp == pos) + break; + WARN_ON_ONCE(*nextp == parent); + nextp = &nrstatc->updated_next; + } + *nextp = rstatc->updated_next; } - *nextp = rstatc->updated_next; rstatc->updated_next = NULL; - return pos; } @@ -309,11 +316,15 @@ static void cgroup_base_stat_sub(struct cgroup_base_stat *dst_bstat, static void cgroup_base_stat_flush(struct cgroup *cgrp, int cpu) { - struct cgroup *parent = cgroup_parent(cgrp); struct cgroup_rstat_cpu *rstatc = cgroup_rstat_cpu(cgrp, cpu); + struct cgroup *parent = cgroup_parent(cgrp); struct cgroup_base_stat cur, delta; unsigned seq; + /* Root-level stats are sourced from system-wide CPU stats */ + if (!parent) + return; + /* fetch the current per-cpu values */ do { seq = __u64_stats_fetch_begin(&rstatc->bsync); @@ -326,8 +337,8 @@ static void cgroup_base_stat_flush(struct cgroup *cgrp, int cpu) cgroup_base_stat_add(&cgrp->bstat, &delta); cgroup_base_stat_add(&rstatc->last_bstat, &delta); - /* propagate global delta to parent */ - if (parent) { + /* propagate global delta to parent (unless that's root) */ + if (cgroup_parent(parent)) { delta = cgrp->bstat; cgroup_base_stat_sub(&delta, &cgrp->last_bstat); cgroup_base_stat_add(&parent->bstat, &delta); From 2d146aa3aa842d7f5065802556b4f9a2c6e8ef12 Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Thu, 29 Apr 2021 22:56:26 -0700 Subject: [PATCH 062/175] mm: memcontrol: switch to rstat MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Replace the memory controller's custom hierarchical stats code with the generic rstat infrastructure provided by the cgroup core. The current implementation does batched upward propagation from the write side (i.e. as stats change). The per-cpu batches introduce an error, which is multiplied by the number of subgroups in a tree. In systems with many CPUs and sizable cgroup trees, the error can be large enough to confuse users (e.g. 32 batch pages * 32 CPUs * 32 subgroups results in an error of up to 128M per stat item). This can entirely swallow allocation bursts inside a workload that the user is expecting to see reflected in the statistics. In the past, we've done read-side aggregation, where a memory.stat read would have to walk the entire subtree and add up per-cpu counts. This became problematic with lazily-freed cgroups: we could have large subtrees where most cgroups were entirely idle. Hence the switch to change-driven upward propagation. Unfortunately, it needed to trade accuracy for speed due to the write side being so hot. Rstat combines the best of both worlds: from the write side, it cheaply maintains a queue of cgroups that have pending changes, so that the read side can do selective tree aggregation. This way the reported stats will always be precise and recent as can be, while the aggregation can skip over potentially large numbers of idle cgroups. The way rstat works is that it implements a tree for tracking cgroups with pending local changes, as well as a flush function that walks the tree upwards. The controller then drives this by 1) telling rstat when a local cgroup stat changes (e.g. mod_memcg_state) and 2) when a flush is required to get uptodate hierarchy stats for a given subtree (e.g. when memory.stat is read). The controller also provides a flush callback that is called during the rstat flush walk for each cgroup and aggregates its local per-cpu counters and propagates them upwards. This adds a second vmstats to struct mem_cgroup (MEMCG_NR_STAT + NR_VM_EVENT_ITEMS) to track pending subtree deltas during upward aggregation. It removes 3 words from the per-cpu data. It eliminates memcg_exact_page_state(), since memcg_page_state() is now exact. [akpm@linux-foundation.org: merge fix] [hannes@cmpxchg.org: fix a sleep in atomic section problem] Link: https://lkml.kernel.org/r/20210315234100.64307-1-hannes@cmpxchg.org Link: https://lkml.kernel.org/r/20210209163304.77088-7-hannes@cmpxchg.org Signed-off-by: Johannes Weiner Reviewed-by: Roman Gushchin Acked-by: Michal Hocko Reviewed-by: Shakeel Butt Reviewed-by: Michal Koutný Acked-by: Balbir Singh Cc: Tejun Heo Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/memcontrol.h | 67 +++++++----- mm/memcontrol.c | 218 +++++++++++++++---------------------- 2 files changed, 127 insertions(+), 158 deletions(-) diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index 9a02aabd0000ef..74910ce9a3f9fa 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -76,10 +76,27 @@ enum mem_cgroup_events_target { }; struct memcg_vmstats_percpu { - long stat[MEMCG_NR_STAT]; - unsigned long events[NR_VM_EVENT_ITEMS]; - unsigned long nr_page_events; - unsigned long targets[MEM_CGROUP_NTARGETS]; + /* Local (CPU and cgroup) page state & events */ + long state[MEMCG_NR_STAT]; + unsigned long events[NR_VM_EVENT_ITEMS]; + + /* Delta calculation for lockless upward propagation */ + long state_prev[MEMCG_NR_STAT]; + unsigned long events_prev[NR_VM_EVENT_ITEMS]; + + /* Cgroup1: threshold notifications & softlimit tree updates */ + unsigned long nr_page_events; + unsigned long targets[MEM_CGROUP_NTARGETS]; +}; + +struct memcg_vmstats { + /* Aggregated (CPU and subtree) page state & events */ + long state[MEMCG_NR_STAT]; + unsigned long events[NR_VM_EVENT_ITEMS]; + + /* Pending child counts during tree propagation */ + long state_pending[MEMCG_NR_STAT]; + unsigned long events_pending[NR_VM_EVENT_ITEMS]; }; struct mem_cgroup_reclaim_iter { @@ -287,8 +304,8 @@ struct mem_cgroup { MEMCG_PADDING(_pad1_); - atomic_long_t vmstats[MEMCG_NR_STAT]; - atomic_long_t vmevents[NR_VM_EVENT_ITEMS]; + /* memory.stat */ + struct memcg_vmstats vmstats; /* memory.events */ atomic_long_t memory_events[MEMCG_NR_MEMORY_EVENTS]; @@ -315,10 +332,6 @@ struct mem_cgroup { atomic_t moving_account; struct task_struct *move_lock_task; - /* Legacy local VM stats and events */ - struct memcg_vmstats_percpu __percpu *vmstats_local; - - /* Subtree VM stats and events (batched updates) */ struct memcg_vmstats_percpu __percpu *vmstats_percpu; #ifdef CONFIG_CGROUP_WRITEBACK @@ -939,10 +952,6 @@ static inline void mod_memcg_lruvec_state(struct lruvec *lruvec, local_irq_restore(flags); } -unsigned long mem_cgroup_soft_limit_reclaim(pg_data_t *pgdat, int order, - gfp_t gfp_mask, - unsigned long *total_scanned); - void __count_memcg_events(struct mem_cgroup *memcg, enum vm_event_item idx, unsigned long count); @@ -1023,6 +1032,10 @@ static inline void memcg_memory_event_mm(struct mm_struct *mm, void split_page_memcg(struct page *head, unsigned int nr); +unsigned long mem_cgroup_soft_limit_reclaim(pg_data_t *pgdat, int order, + gfp_t gfp_mask, + unsigned long *total_scanned); + #else /* CONFIG_MEMCG */ #define MEM_CGROUP_ID_SHIFT 0 @@ -1131,6 +1144,10 @@ static inline bool lruvec_holds_page_lru_lock(struct page *page, return lruvec == &pgdat->__lruvec; } +static inline void lruvec_memcg_debug(struct lruvec *lruvec, struct page *page) +{ +} + static inline struct mem_cgroup *parent_mem_cgroup(struct mem_cgroup *memcg) { return NULL; @@ -1334,18 +1351,6 @@ static inline void mod_lruvec_kmem_state(void *p, enum node_stat_item idx, mod_node_page_state(page_pgdat(page), idx, val); } -static inline -unsigned long mem_cgroup_soft_limit_reclaim(pg_data_t *pgdat, int order, - gfp_t gfp_mask, - unsigned long *total_scanned) -{ - return 0; -} - -static inline void split_page_memcg(struct page *head, unsigned int nr) -{ -} - static inline void count_memcg_events(struct mem_cgroup *memcg, enum vm_event_item idx, unsigned long count) @@ -1368,8 +1373,16 @@ void count_memcg_event_mm(struct mm_struct *mm, enum vm_event_item idx) { } -static inline void lruvec_memcg_debug(struct lruvec *lruvec, struct page *page) +static inline void split_page_memcg(struct page *head, unsigned int nr) +{ +} + +static inline +unsigned long mem_cgroup_soft_limit_reclaim(pg_data_t *pgdat, int order, + gfp_t gfp_mask, + unsigned long *total_scanned) { + return 0; } #endif /* CONFIG_MEMCG */ diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 56d1c6e58c3b3b..b323588223acc5 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -765,37 +765,17 @@ mem_cgroup_largest_soft_limit_node(struct mem_cgroup_tree_per_node *mctz) */ void __mod_memcg_state(struct mem_cgroup *memcg, int idx, int val) { - long x, threshold = MEMCG_CHARGE_BATCH; - if (mem_cgroup_disabled()) return; - if (memcg_stat_item_in_bytes(idx)) - threshold <<= PAGE_SHIFT; - - x = val + __this_cpu_read(memcg->vmstats_percpu->stat[idx]); - if (unlikely(abs(x) > threshold)) { - struct mem_cgroup *mi; - - /* - * Batch local counters to keep them in sync with - * the hierarchical ones. - */ - __this_cpu_add(memcg->vmstats_local->stat[idx], x); - for (mi = memcg; mi; mi = parent_mem_cgroup(mi)) - atomic_long_add(x, &mi->vmstats[idx]); - x = 0; - } - __this_cpu_write(memcg->vmstats_percpu->stat[idx], x); + __this_cpu_add(memcg->vmstats_percpu->state[idx], val); + cgroup_rstat_updated(memcg->css.cgroup, smp_processor_id()); } -/* - * idx can be of type enum memcg_stat_item or node_stat_item. - * Keep in sync with memcg_exact_page_state(). - */ +/* idx can be of type enum memcg_stat_item or node_stat_item. */ static unsigned long memcg_page_state(struct mem_cgroup *memcg, int idx) { - long x = atomic_long_read(&memcg->vmstats[idx]); + long x = READ_ONCE(memcg->vmstats.state[idx]); #ifdef CONFIG_SMP if (x < 0) x = 0; @@ -803,17 +783,14 @@ static unsigned long memcg_page_state(struct mem_cgroup *memcg, int idx) return x; } -/* - * idx can be of type enum memcg_stat_item or node_stat_item. - * Keep in sync with memcg_exact_page_state(). - */ +/* idx can be of type enum memcg_stat_item or node_stat_item. */ static unsigned long memcg_page_state_local(struct mem_cgroup *memcg, int idx) { long x = 0; int cpu; for_each_possible_cpu(cpu) - x += per_cpu(memcg->vmstats_local->stat[idx], cpu); + x += per_cpu(memcg->vmstats_percpu->state[idx], cpu); #ifdef CONFIG_SMP if (x < 0) x = 0; @@ -936,30 +913,16 @@ void __mod_lruvec_kmem_state(void *p, enum node_stat_item idx, int val) void __count_memcg_events(struct mem_cgroup *memcg, enum vm_event_item idx, unsigned long count) { - unsigned long x; - if (mem_cgroup_disabled()) return; - x = count + __this_cpu_read(memcg->vmstats_percpu->events[idx]); - if (unlikely(x > MEMCG_CHARGE_BATCH)) { - struct mem_cgroup *mi; - - /* - * Batch local counters to keep them in sync with - * the hierarchical ones. - */ - __this_cpu_add(memcg->vmstats_local->events[idx], x); - for (mi = memcg; mi; mi = parent_mem_cgroup(mi)) - atomic_long_add(x, &mi->vmevents[idx]); - x = 0; - } - __this_cpu_write(memcg->vmstats_percpu->events[idx], x); + __this_cpu_add(memcg->vmstats_percpu->events[idx], count); + cgroup_rstat_updated(memcg->css.cgroup, smp_processor_id()); } static unsigned long memcg_events(struct mem_cgroup *memcg, int event) { - return atomic_long_read(&memcg->vmevents[event]); + return READ_ONCE(memcg->vmstats.events[event]); } static unsigned long memcg_events_local(struct mem_cgroup *memcg, int event) @@ -968,7 +931,7 @@ static unsigned long memcg_events_local(struct mem_cgroup *memcg, int event) int cpu; for_each_possible_cpu(cpu) - x += per_cpu(memcg->vmstats_local->events[event], cpu); + x += per_cpu(memcg->vmstats_percpu->events[event], cpu); return x; } @@ -1604,6 +1567,7 @@ static char *memory_stat_format(struct mem_cgroup *memcg) * * Current memory state: */ + cgroup_rstat_flush(memcg->css.cgroup); for (i = 0; i < ARRAY_SIZE(memory_stats); i++) { u64 size; @@ -2409,22 +2373,11 @@ static int memcg_hotplug_cpu_dead(unsigned int cpu) drain_stock(stock); for_each_mem_cgroup(memcg) { - struct memcg_vmstats_percpu *statc; int i; - statc = per_cpu_ptr(memcg->vmstats_percpu, cpu); - - for (i = 0; i < MEMCG_NR_STAT; i++) { + for (i = 0; i < NR_VM_NODE_STAT_ITEMS; i++) { int nid; - if (statc->stat[i]) { - mod_memcg_state(memcg, i, statc->stat[i]); - statc->stat[i] = 0; - } - - if (i >= NR_VM_NODE_STAT_ITEMS) - continue; - for_each_node(nid) { struct batched_lruvec_stat *lstatc; struct mem_cgroup_per_node *pn; @@ -2443,13 +2396,6 @@ static int memcg_hotplug_cpu_dead(unsigned int cpu) } } } - - for (i = 0; i < NR_VM_EVENT_ITEMS; i++) { - if (statc->events[i]) { - count_memcg_events(memcg, i, statc->events[i]); - statc->events[i] = 0; - } - } } return 0; @@ -3572,6 +3518,7 @@ static unsigned long mem_cgroup_usage(struct mem_cgroup *memcg, bool swap) unsigned long val; if (mem_cgroup_is_root(memcg)) { + cgroup_rstat_flush(memcg->css.cgroup); val = memcg_page_state(memcg, NR_FILE_PAGES) + memcg_page_state(memcg, NR_ANON_MAPPED); if (swap) @@ -3636,26 +3583,15 @@ static u64 mem_cgroup_read_u64(struct cgroup_subsys_state *css, } } -static void memcg_flush_percpu_vmstats(struct mem_cgroup *memcg) +static void memcg_flush_lruvec_page_state(struct mem_cgroup *memcg) { - unsigned long stat[MEMCG_NR_STAT] = {0}; - struct mem_cgroup *mi; - int node, cpu, i; - - for_each_online_cpu(cpu) - for (i = 0; i < MEMCG_NR_STAT; i++) - stat[i] += per_cpu(memcg->vmstats_percpu->stat[i], cpu); - - for (mi = memcg; mi; mi = parent_mem_cgroup(mi)) - for (i = 0; i < MEMCG_NR_STAT; i++) - atomic_long_add(stat[i], &mi->vmstats[i]); + int node; for_each_node(node) { struct mem_cgroup_per_node *pn = memcg->nodeinfo[node]; + unsigned long stat[NR_VM_NODE_STAT_ITEMS] = { 0 }; struct mem_cgroup_per_node *pi; - - for (i = 0; i < NR_VM_NODE_STAT_ITEMS; i++) - stat[i] = 0; + int cpu, i; for_each_online_cpu(cpu) for (i = 0; i < NR_VM_NODE_STAT_ITEMS; i++) @@ -3668,25 +3604,6 @@ static void memcg_flush_percpu_vmstats(struct mem_cgroup *memcg) } } -static void memcg_flush_percpu_vmevents(struct mem_cgroup *memcg) -{ - unsigned long events[NR_VM_EVENT_ITEMS]; - struct mem_cgroup *mi; - int cpu, i; - - for (i = 0; i < NR_VM_EVENT_ITEMS; i++) - events[i] = 0; - - for_each_online_cpu(cpu) - for (i = 0; i < NR_VM_EVENT_ITEMS; i++) - events[i] += per_cpu(memcg->vmstats_percpu->events[i], - cpu); - - for (mi = memcg; mi; mi = parent_mem_cgroup(mi)) - for (i = 0; i < NR_VM_EVENT_ITEMS; i++) - atomic_long_add(events[i], &mi->vmevents[i]); -} - #ifdef CONFIG_MEMCG_KMEM static int memcg_online_kmem(struct mem_cgroup *memcg) { @@ -4003,6 +3920,8 @@ static int memcg_numa_stat_show(struct seq_file *m, void *v) int nid; struct mem_cgroup *memcg = mem_cgroup_from_seq(m); + cgroup_rstat_flush(memcg->css.cgroup); + for (stat = stats; stat < stats + ARRAY_SIZE(stats); stat++) { seq_printf(m, "%s=%lu", stat->name, mem_cgroup_nr_lru_pages(memcg, stat->lru_mask, @@ -4073,6 +3992,8 @@ static int memcg_stat_show(struct seq_file *m, void *v) BUILD_BUG_ON(ARRAY_SIZE(memcg1_stat_names) != ARRAY_SIZE(memcg1_stats)); + cgroup_rstat_flush(memcg->css.cgroup); + for (i = 0; i < ARRAY_SIZE(memcg1_stats); i++) { unsigned long nr; @@ -4549,22 +4470,6 @@ struct wb_domain *mem_cgroup_wb_domain(struct bdi_writeback *wb) return &memcg->cgwb_domain; } -/* - * idx can be of type enum memcg_stat_item or node_stat_item. - * Keep in sync with memcg_exact_page(). - */ -static unsigned long memcg_exact_page_state(struct mem_cgroup *memcg, int idx) -{ - long x = atomic_long_read(&memcg->vmstats[idx]); - int cpu; - - for_each_online_cpu(cpu) - x += per_cpu_ptr(memcg->vmstats_percpu, cpu)->stat[idx]; - if (x < 0) - x = 0; - return x; -} - /** * mem_cgroup_wb_stats - retrieve writeback related stats from its memcg * @wb: bdi_writeback in question @@ -4590,13 +4495,14 @@ void mem_cgroup_wb_stats(struct bdi_writeback *wb, unsigned long *pfilepages, struct mem_cgroup *memcg = mem_cgroup_from_css(wb->memcg_css); struct mem_cgroup *parent; - *pdirty = memcg_exact_page_state(memcg, NR_FILE_DIRTY); + cgroup_rstat_flush_irqsafe(memcg->css.cgroup); - *pwriteback = memcg_exact_page_state(memcg, NR_WRITEBACK); - *pfilepages = memcg_exact_page_state(memcg, NR_INACTIVE_FILE) + - memcg_exact_page_state(memcg, NR_ACTIVE_FILE); - *pheadroom = PAGE_COUNTER_MAX; + *pdirty = memcg_page_state(memcg, NR_FILE_DIRTY); + *pwriteback = memcg_page_state(memcg, NR_WRITEBACK); + *pfilepages = memcg_page_state(memcg, NR_INACTIVE_FILE) + + memcg_page_state(memcg, NR_ACTIVE_FILE); + *pheadroom = PAGE_COUNTER_MAX; while ((parent = parent_mem_cgroup(memcg))) { unsigned long ceiling = min(READ_ONCE(memcg->memory.max), READ_ONCE(memcg->memory.high)); @@ -5228,7 +5134,6 @@ static void __mem_cgroup_free(struct mem_cgroup *memcg) for_each_node(node) free_mem_cgroup_per_node_info(memcg, node); free_percpu(memcg->vmstats_percpu); - free_percpu(memcg->vmstats_local); kfree(memcg); } @@ -5236,11 +5141,10 @@ static void mem_cgroup_free(struct mem_cgroup *memcg) { memcg_wb_domain_exit(memcg); /* - * Flush percpu vmstats and vmevents to guarantee the value correctness - * on parent's and all ancestor levels. + * Flush percpu lruvec stats to guarantee the value + * correctness on parent's and all ancestor levels. */ - memcg_flush_percpu_vmstats(memcg); - memcg_flush_percpu_vmevents(memcg); + memcg_flush_lruvec_page_state(memcg); __mem_cgroup_free(memcg); } @@ -5267,11 +5171,6 @@ static struct mem_cgroup *mem_cgroup_alloc(void) goto fail; } - memcg->vmstats_local = alloc_percpu_gfp(struct memcg_vmstats_percpu, - GFP_KERNEL_ACCOUNT); - if (!memcg->vmstats_local) - goto fail; - memcg->vmstats_percpu = alloc_percpu_gfp(struct memcg_vmstats_percpu, GFP_KERNEL_ACCOUNT); if (!memcg->vmstats_percpu) @@ -5471,6 +5370,62 @@ static void mem_cgroup_css_reset(struct cgroup_subsys_state *css) memcg_wb_domain_size_changed(memcg); } +static void mem_cgroup_css_rstat_flush(struct cgroup_subsys_state *css, int cpu) +{ + struct mem_cgroup *memcg = mem_cgroup_from_css(css); + struct mem_cgroup *parent = parent_mem_cgroup(memcg); + struct memcg_vmstats_percpu *statc; + long delta, v; + int i; + + statc = per_cpu_ptr(memcg->vmstats_percpu, cpu); + + for (i = 0; i < MEMCG_NR_STAT; i++) { + /* + * Collect the aggregated propagation counts of groups + * below us. We're in a per-cpu loop here and this is + * a global counter, so the first cycle will get them. + */ + delta = memcg->vmstats.state_pending[i]; + if (delta) + memcg->vmstats.state_pending[i] = 0; + + /* Add CPU changes on this level since the last flush */ + v = READ_ONCE(statc->state[i]); + if (v != statc->state_prev[i]) { + delta += v - statc->state_prev[i]; + statc->state_prev[i] = v; + } + + if (!delta) + continue; + + /* Aggregate counts on this level and propagate upwards */ + memcg->vmstats.state[i] += delta; + if (parent) + parent->vmstats.state_pending[i] += delta; + } + + for (i = 0; i < NR_VM_EVENT_ITEMS; i++) { + delta = memcg->vmstats.events_pending[i]; + if (delta) + memcg->vmstats.events_pending[i] = 0; + + v = READ_ONCE(statc->events[i]); + if (v != statc->events_prev[i]) { + delta += v - statc->events_prev[i]; + statc->events_prev[i] = v; + } + + if (!delta) + continue; + + memcg->vmstats.events[i] += delta; + if (parent) + parent->vmstats.events_pending[i] += delta; + } +} + #ifdef CONFIG_MMU /* Handlers for move charge at task migration. */ static int mem_cgroup_do_precharge(unsigned long count) @@ -6524,6 +6479,7 @@ struct cgroup_subsys memory_cgrp_subsys = { .css_released = mem_cgroup_css_released, .css_free = mem_cgroup_css_free, .css_reset = mem_cgroup_css_reset, + .css_rstat_flush = mem_cgroup_css_rstat_flush, .can_attach = mem_cgroup_can_attach, .cancel_attach = mem_cgroup_cancel_attach, .post_attach = mem_cgroup_move_task, From 2cd21c89800c2203331e5564df2155757ded2e86 Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Thu, 29 Apr 2021 22:56:29 -0700 Subject: [PATCH 063/175] mm: memcontrol: consolidate lruvec stat flushing MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit There are two functions to flush the per-cpu data of an lruvec into the rest of the cgroup tree: when the cgroup is being freed, and when a CPU disappears during hotplug. The difference is whether all CPUs or just one is being collected, but the rest of the flushing code is the same. Merge them into one function and share the common code. Link: https://lkml.kernel.org/r/20210209163304.77088-8-hannes@cmpxchg.org Signed-off-by: Johannes Weiner Reviewed-by: Shakeel Butt Acked-by: Michal Hocko Acked-by: Roman Gushchin Cc: Michal Koutný Cc: Tejun Heo Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memcontrol.c | 74 +++++++++++++++++++------------------------------ 1 file changed, 28 insertions(+), 46 deletions(-) diff --git a/mm/memcontrol.c b/mm/memcontrol.c index b323588223acc5..5582e1531b43e4 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -2364,39 +2364,39 @@ static void drain_all_stock(struct mem_cgroup *root_memcg) mutex_unlock(&percpu_charge_mutex); } -static int memcg_hotplug_cpu_dead(unsigned int cpu) +static void memcg_flush_lruvec_page_state(struct mem_cgroup *memcg, int cpu) { - struct memcg_stock_pcp *stock; - struct mem_cgroup *memcg; - - stock = &per_cpu(memcg_stock, cpu); - drain_stock(stock); + int nid; - for_each_mem_cgroup(memcg) { + for_each_node(nid) { + struct mem_cgroup_per_node *pn = memcg->nodeinfo[nid]; + unsigned long stat[NR_VM_NODE_STAT_ITEMS]; + struct batched_lruvec_stat *lstatc; int i; + lstatc = per_cpu_ptr(pn->lruvec_stat_cpu, cpu); for (i = 0; i < NR_VM_NODE_STAT_ITEMS; i++) { - int nid; + stat[i] = lstatc->count[i]; + lstatc->count[i] = 0; + } - for_each_node(nid) { - struct batched_lruvec_stat *lstatc; - struct mem_cgroup_per_node *pn; - long x; + do { + for (i = 0; i < NR_VM_NODE_STAT_ITEMS; i++) + atomic_long_add(stat[i], &pn->lruvec_stat[i]); + } while ((pn = parent_nodeinfo(pn, nid))); + } +} - pn = memcg->nodeinfo[nid]; - lstatc = per_cpu_ptr(pn->lruvec_stat_cpu, cpu); +static int memcg_hotplug_cpu_dead(unsigned int cpu) +{ + struct memcg_stock_pcp *stock; + struct mem_cgroup *memcg; - x = lstatc->count[i]; - lstatc->count[i] = 0; + stock = &per_cpu(memcg_stock, cpu); + drain_stock(stock); - if (x) { - do { - atomic_long_add(x, &pn->lruvec_stat[i]); - } while ((pn = parent_nodeinfo(pn, nid))); - } - } - } - } + for_each_mem_cgroup(memcg) + memcg_flush_lruvec_page_state(memcg, cpu); return 0; } @@ -3583,27 +3583,6 @@ static u64 mem_cgroup_read_u64(struct cgroup_subsys_state *css, } } -static void memcg_flush_lruvec_page_state(struct mem_cgroup *memcg) -{ - int node; - - for_each_node(node) { - struct mem_cgroup_per_node *pn = memcg->nodeinfo[node]; - unsigned long stat[NR_VM_NODE_STAT_ITEMS] = { 0 }; - struct mem_cgroup_per_node *pi; - int cpu, i; - - for_each_online_cpu(cpu) - for (i = 0; i < NR_VM_NODE_STAT_ITEMS; i++) - stat[i] += per_cpu( - pn->lruvec_stat_cpu->count[i], cpu); - - for (pi = pn; pi; pi = parent_nodeinfo(pi, node)) - for (i = 0; i < NR_VM_NODE_STAT_ITEMS; i++) - atomic_long_add(stat[i], &pi->lruvec_stat[i]); - } -} - #ifdef CONFIG_MEMCG_KMEM static int memcg_online_kmem(struct mem_cgroup *memcg) { @@ -5139,12 +5118,15 @@ static void __mem_cgroup_free(struct mem_cgroup *memcg) static void mem_cgroup_free(struct mem_cgroup *memcg) { + int cpu; + memcg_wb_domain_exit(memcg); /* * Flush percpu lruvec stats to guarantee the value * correctness on parent's and all ancestor levels. */ - memcg_flush_lruvec_page_state(memcg); + for_each_online_cpu(cpu) + memcg_flush_lruvec_page_state(memcg, cpu); __mem_cgroup_free(memcg); } From 4bbcc5a41c5449f6a67edb3fbc2dccae9c6724db Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Thu, 29 Apr 2021 22:56:33 -0700 Subject: [PATCH 064/175] kselftests: cgroup: update kmem test for new vmstat implementation MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit With memcg having switched to rstat, memory.stat output is precise. Update the cgroup selftest to reflect the expectations and error tolerances of the new implementation. Also add newly tracked types of memory to the memory.stat side of the equation, since they're included in memory.current and could throw false positives. Link: https://lkml.kernel.org/r/20210209163304.77088-9-hannes@cmpxchg.org Signed-off-by: Johannes Weiner Reviewed-by: Shakeel Butt Reviewed-by: Michal Koutný Acked-by: Roman Gushchin Cc: Michal Hocko Cc: Tejun Heo Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- tools/testing/selftests/cgroup/test_kmem.c | 22 ++++++++++++++-------- 1 file changed, 14 insertions(+), 8 deletions(-) diff --git a/tools/testing/selftests/cgroup/test_kmem.c b/tools/testing/selftests/cgroup/test_kmem.c index 0941aa16157e3a..22b31ebb351392 100644 --- a/tools/testing/selftests/cgroup/test_kmem.c +++ b/tools/testing/selftests/cgroup/test_kmem.c @@ -19,12 +19,12 @@ /* - * Memory cgroup charging and vmstat data aggregation is performed using - * percpu batches 32 pages big (look at MEMCG_CHARGE_BATCH). So the maximum - * discrepancy between charge and vmstat entries is number of cpus multiplied - * by 32 pages multiplied by 2. + * Memory cgroup charging is performed using percpu batches 32 pages + * big (look at MEMCG_CHARGE_BATCH), whereas memory.stat is exact. So + * the maximum discrepancy between charge and vmstat entries is number + * of cpus multiplied by 32 pages. */ -#define MAX_VMSTAT_ERROR (4096 * 32 * 2 * get_nprocs()) +#define MAX_VMSTAT_ERROR (4096 * 32 * get_nprocs()) static int alloc_dcache(const char *cgroup, void *arg) @@ -162,7 +162,7 @@ static int cg_run_in_subcgroups(const char *parent, */ static int test_kmem_memcg_deletion(const char *root) { - long current, slab, anon, file, kernel_stack, sum; + long current, slab, anon, file, kernel_stack, pagetables, percpu, sock, sum; int ret = KSFT_FAIL; char *parent; @@ -184,11 +184,14 @@ static int test_kmem_memcg_deletion(const char *root) anon = cg_read_key_long(parent, "memory.stat", "anon "); file = cg_read_key_long(parent, "memory.stat", "file "); kernel_stack = cg_read_key_long(parent, "memory.stat", "kernel_stack "); + pagetables = cg_read_key_long(parent, "memory.stat", "pagetables "); + percpu = cg_read_key_long(parent, "memory.stat", "percpu "); + sock = cg_read_key_long(parent, "memory.stat", "sock "); if (current < 0 || slab < 0 || anon < 0 || file < 0 || - kernel_stack < 0) + kernel_stack < 0 || pagetables < 0 || percpu < 0 || sock < 0) goto cleanup; - sum = slab + anon + file + kernel_stack; + sum = slab + anon + file + kernel_stack + pagetables + percpu + sock; if (abs(sum - current) < MAX_VMSTAT_ERROR) { ret = KSFT_PASS; } else { @@ -198,6 +201,9 @@ static int test_kmem_memcg_deletion(const char *root) printf("anon = %ld\n", anon); printf("file = %ld\n", file); printf("kernel_stack = %ld\n", kernel_stack); + printf("pagetables = %ld\n", pagetables); + printf("percpu = %ld\n", percpu); + printf("sock = %ld\n", sock); } cleanup: From 0add0c77a9bd0ce7cd3b53894fb08154881402a4 Mon Sep 17 00:00:00 2001 From: Shakeel Butt Date: Thu, 29 Apr 2021 22:56:36 -0700 Subject: [PATCH 065/175] memcg: charge before adding to swapcache on swapin Currently the kernel adds the page, allocated for swapin, to the swapcache before charging the page. This is fine but now we want a per-memcg swapcache stat which is essential for folks who wants to transparently migrate from cgroup v1's memsw to cgroup v2's memory and swap counters. In addition charging a page before exposing it to other parts of the kernel is a step in the right direction. To correctly maintain the per-memcg swapcache stat, this patch has adopted to charge the page before adding it to swapcache. One challenge in this option is the failure case of add_to_swap_cache() on which we need to undo the mem_cgroup_charge(). Specifically undoing mem_cgroup_uncharge_swap() is not simple. To resolve the issue, this patch decouples the charging for swapin pages from mem_cgroup_charge(). Two new functions are introduced, mem_cgroup_swapin_charge_page() for just charging the swapin page and mem_cgroup_swapin_uncharge_swap() for uncharging the swap slot once the page has been successfully added to the swapcache. [shakeelb@google.com: set page->private before calling swap_readpage] Link: https://lkml.kernel.org/r/20210318015959.2986837-1-shakeelb@google.com Link: https://lkml.kernel.org/r/20210305212639.775498-1-shakeelb@google.com Signed-off-by: Shakeel Butt Acked-by: Roman Gushchin Acked-by: Johannes Weiner Acked-by: Hugh Dickins Tested-by: Heiko Carstens Cc: Michal Hocko Cc: Stephen Rothwell Cc: Christian Borntraeger Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/memcontrol.h | 13 +++++ mm/memcontrol.c | 117 +++++++++++++++++++++++-------------- mm/memory.c | 16 +++-- mm/swap_state.c | 13 ++--- 4 files changed, 100 insertions(+), 59 deletions(-) diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index 74910ce9a3f9fa..e946c96daa32bc 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -609,6 +609,9 @@ static inline bool mem_cgroup_below_min(struct mem_cgroup *memcg) } int mem_cgroup_charge(struct page *page, struct mm_struct *mm, gfp_t gfp_mask); +int mem_cgroup_swapin_charge_page(struct page *page, struct mm_struct *mm, + gfp_t gfp, swp_entry_t entry); +void mem_cgroup_swapin_uncharge_swap(swp_entry_t entry); void mem_cgroup_uncharge(struct page *page); void mem_cgroup_uncharge_list(struct list_head *page_list); @@ -1112,6 +1115,16 @@ static inline int mem_cgroup_charge(struct page *page, struct mm_struct *mm, return 0; } +static inline int mem_cgroup_swapin_charge_page(struct page *page, + struct mm_struct *mm, gfp_t gfp, swp_entry_t entry) +{ + return 0; +} + +static inline void mem_cgroup_swapin_uncharge_swap(swp_entry_t entry) +{ +} + static inline void mem_cgroup_uncharge(struct page *page) { } diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 5582e1531b43e4..6d0863db3e41fc 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -6644,6 +6644,27 @@ void mem_cgroup_calculate_protection(struct mem_cgroup *root, atomic_long_read(&parent->memory.children_low_usage))); } +static int __mem_cgroup_charge(struct page *page, struct mem_cgroup *memcg, + gfp_t gfp) +{ + unsigned int nr_pages = thp_nr_pages(page); + int ret; + + ret = try_charge(memcg, gfp, nr_pages); + if (ret) + goto out; + + css_get(&memcg->css); + commit_charge(page, memcg); + + local_irq_disable(); + mem_cgroup_charge_statistics(memcg, page, nr_pages); + memcg_check_events(memcg, page); + local_irq_enable(); +out: + return ret; +} + /** * mem_cgroup_charge - charge a newly allocated page to a cgroup * @page: page to charge @@ -6653,55 +6674,71 @@ void mem_cgroup_calculate_protection(struct mem_cgroup *root, * Try to charge @page to the memcg that @mm belongs to, reclaiming * pages according to @gfp_mask if necessary. * + * Do not use this for pages allocated for swapin. + * * Returns 0 on success. Otherwise, an error code is returned. */ int mem_cgroup_charge(struct page *page, struct mm_struct *mm, gfp_t gfp_mask) { - unsigned int nr_pages = thp_nr_pages(page); - struct mem_cgroup *memcg = NULL; - int ret = 0; + struct mem_cgroup *memcg; + int ret; if (mem_cgroup_disabled()) - goto out; + return 0; - if (PageSwapCache(page)) { - swp_entry_t ent = { .val = page_private(page), }; - unsigned short id; + memcg = get_mem_cgroup_from_mm(mm); + ret = __mem_cgroup_charge(page, memcg, gfp_mask); + css_put(&memcg->css); - /* - * Every swap fault against a single page tries to charge the - * page, bail as early as possible. shmem_unuse() encounters - * already charged pages, too. page and memcg binding is - * protected by the page lock, which serializes swap cache - * removal, which in turn serializes uncharging. - */ - VM_BUG_ON_PAGE(!PageLocked(page), page); - if (page_memcg(compound_head(page))) - goto out; + return ret; +} - id = lookup_swap_cgroup_id(ent); - rcu_read_lock(); - memcg = mem_cgroup_from_id(id); - if (memcg && !css_tryget_online(&memcg->css)) - memcg = NULL; - rcu_read_unlock(); - } +/** + * mem_cgroup_swapin_charge_page - charge a newly allocated page for swapin + * @page: page to charge + * @mm: mm context of the victim + * @gfp: reclaim mode + * @entry: swap entry for which the page is allocated + * + * This function charges a page allocated for swapin. Please call this before + * adding the page to the swapcache. + * + * Returns 0 on success. Otherwise, an error code is returned. + */ +int mem_cgroup_swapin_charge_page(struct page *page, struct mm_struct *mm, + gfp_t gfp, swp_entry_t entry) +{ + struct mem_cgroup *memcg; + unsigned short id; + int ret; - if (!memcg) + if (mem_cgroup_disabled()) + return 0; + + id = lookup_swap_cgroup_id(entry); + rcu_read_lock(); + memcg = mem_cgroup_from_id(id); + if (!memcg || !css_tryget_online(&memcg->css)) memcg = get_mem_cgroup_from_mm(mm); + rcu_read_unlock(); - ret = try_charge(memcg, gfp_mask, nr_pages); - if (ret) - goto out_put; + ret = __mem_cgroup_charge(page, memcg, gfp); - css_get(&memcg->css); - commit_charge(page, memcg); - - local_irq_disable(); - mem_cgroup_charge_statistics(memcg, page, nr_pages); - memcg_check_events(memcg, page); - local_irq_enable(); + css_put(&memcg->css); + return ret; +} +/* + * mem_cgroup_swapin_uncharge_swap - uncharge swap slot + * @entry: swap entry for which the page is charged + * + * Call this function after successfully adding the charged page to swapcache. + * + * Note: This function assumes the page for which swap slot is being uncharged + * is order 0 page. + */ +void mem_cgroup_swapin_uncharge_swap(swp_entry_t entry) +{ /* * Cgroup1's unified memory+swap counter has been charged with the * new swapcache page, finish the transfer by uncharging the swap @@ -6714,20 +6751,14 @@ int mem_cgroup_charge(struct page *page, struct mm_struct *mm, gfp_t gfp_mask) * correspond 1:1 to page and swap slot lifetimes: we charge the * page to memory here, and uncharge swap when the slot is freed. */ - if (do_memsw_account() && PageSwapCache(page)) { - swp_entry_t entry = { .val = page_private(page) }; + if (!mem_cgroup_disabled() && do_memsw_account()) { /* * The swap entry might not get freed for a long time, * let's not wait for it. The page already received a * memory+swap charge, drop the swap entry duplicate. */ - mem_cgroup_uncharge_swap(entry, nr_pages); + mem_cgroup_uncharge_swap(entry, 1); } - -out_put: - css_put(&memcg->css); -out: - return ret; } struct uncharge_gather { diff --git a/mm/memory.c b/mm/memory.c index 550405fc3b5e63..3196fa5e7f7d78 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -3309,28 +3309,26 @@ vm_fault_t do_swap_page(struct vm_fault *vmf) page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma, vmf->address); if (page) { - int err; - __SetPageLocked(page); __SetPageSwapBacked(page); - set_page_private(page, entry.val); - /* Tell memcg to use swap ownership records */ - SetPageSwapCache(page); - err = mem_cgroup_charge(page, vma->vm_mm, - GFP_KERNEL); - ClearPageSwapCache(page); - if (err) { + if (mem_cgroup_swapin_charge_page(page, + vma->vm_mm, GFP_KERNEL, entry)) { ret = VM_FAULT_OOM; goto out_page; } + mem_cgroup_swapin_uncharge_swap(entry); shadow = get_shadow_from_swap_cache(entry); if (shadow) workingset_refault(page, shadow); lru_cache_add(page); + + /* To provide entry to swap_readpage() */ + set_page_private(page, entry.val); swap_readpage(page, true); + set_page_private(page, 0); } } else { page = swapin_readahead(entry, GFP_HIGHUSER_MOVABLE, diff --git a/mm/swap_state.c b/mm/swap_state.c index 3cdee7b11da9b6..fb7efa08fe5774 100644 --- a/mm/swap_state.c +++ b/mm/swap_state.c @@ -497,16 +497,14 @@ struct page *__read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask, __SetPageLocked(page); __SetPageSwapBacked(page); - /* May fail (-ENOMEM) if XArray node allocation failed. */ - if (add_to_swap_cache(page, entry, gfp_mask & GFP_RECLAIM_MASK, &shadow)) { - put_swap_page(page, entry); + if (mem_cgroup_swapin_charge_page(page, NULL, gfp_mask, entry)) goto fail_unlock; - } - if (mem_cgroup_charge(page, NULL, gfp_mask)) { - delete_from_swap_cache(page); + /* May fail (-ENOMEM) if XArray node allocation failed. */ + if (add_to_swap_cache(page, entry, gfp_mask & GFP_RECLAIM_MASK, &shadow)) goto fail_unlock; - } + + mem_cgroup_swapin_uncharge_swap(entry); if (shadow) workingset_refault(page, shadow); @@ -517,6 +515,7 @@ struct page *__read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask, return page; fail_unlock: + put_swap_page(page, entry); unlock_page(page); put_page(page); return NULL; From 9f38f03ae8d5f57371b71aa6b4275765b65454fd Mon Sep 17 00:00:00 2001 From: Muchun Song Date: Thu, 29 Apr 2021 22:56:39 -0700 Subject: [PATCH 066/175] mm: memcontrol: slab: fix obtain a reference to a freeing memcg Patch series "Use obj_cgroup APIs to charge kmem pages", v5. Since Roman's series "The new cgroup slab memory controller" applied. All slab objects are charged with the new APIs of obj_cgroup. The new APIs introduce a struct obj_cgroup to charge slab objects. It prevents long-living objects from pinning the original memory cgroup in the memory. But there are still some corner objects (e.g. allocations larger than order-1 page on SLUB) which are not charged with the new APIs. Those objects (include the pages which are allocated from buddy allocator directly) are charged as kmem pages which still hold a reference to the memory cgroup. E.g. We know that the kernel stack is charged as kmem pages because the size of the kernel stack can be greater than 2 pages (e.g. 16KB on x86_64 or arm64). If we create a thread (suppose the thread stack is charged to memory cgroup A) and then move it from memory cgroup A to memory cgroup B. Because the kernel stack of the thread hold a reference to the memory cgroup A. The thread can pin the memory cgroup A in the memory even if we remove the cgroup A. If we want to see this scenario by using the following script. We can see that the system has added 500 dying cgroups (This is not a real world issue, just a script to show that the large kmallocs are charged as kmem pages which can pin the memory cgroup in the memory). #!/bin/bash cat /proc/cgroups | grep memory cd /sys/fs/cgroup/memory echo 1 > memory.move_charge_at_immigrate for i in range{1..500} do mkdir kmem_test echo $$ > kmem_test/cgroup.procs sleep 3600 & echo $$ > cgroup.procs echo `cat kmem_test/cgroup.procs` > cgroup.procs rmdir kmem_test done cat /proc/cgroups | grep memory This patchset aims to make those kmem pages to drop the reference to memory cgroup by using the APIs of obj_cgroup. Finally, we can see that the number of the dying cgroups will not increase if we run the above test script. This patch (of 7): The rcu_read_lock/unlock only can guarantee that the memcg will not be freed, but it cannot guarantee the success of css_get (which is in the refill_stock when cached memcg changed) to memcg. rcu_read_lock() memcg = obj_cgroup_memcg(old) __memcg_kmem_uncharge(memcg) refill_stock(memcg) if (stock->cached != memcg) // css_get can change the ref counter from 0 back to 1. css_get(&memcg->css) rcu_read_unlock() This fix is very like the commit: eefbfa7fd678 ("mm: memcg/slab: fix use after free in obj_cgroup_charge") Fix this by holding a reference to the memcg which is passed to the __memcg_kmem_uncharge() before calling __memcg_kmem_uncharge(). Link: https://lkml.kernel.org/r/20210319163821.20704-1-songmuchun@bytedance.com Link: https://lkml.kernel.org/r/20210319163821.20704-2-songmuchun@bytedance.com Fixes: 3de7d4f25a74 ("mm: memcg/slab: optimize objcg stock draining") Signed-off-by: Muchun Song Reviewed-by: Shakeel Butt Acked-by: Roman Gushchin Acked-by: Johannes Weiner Cc: Michal Hocko Cc: Vladimir Davydov Cc: Xiongchun Duan Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memcontrol.c | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 6d0863db3e41fc..cf7a10d2fbd1be 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -3150,9 +3150,17 @@ static void drain_obj_stock(struct memcg_stock_pcp *stock) unsigned int nr_bytes = stock->nr_bytes & (PAGE_SIZE - 1); if (nr_pages) { + struct mem_cgroup *memcg; + rcu_read_lock(); - __memcg_kmem_uncharge(obj_cgroup_memcg(old), nr_pages); +retry: + memcg = obj_cgroup_memcg(old); + if (unlikely(!css_tryget(&memcg->css))) + goto retry; rcu_read_unlock(); + + __memcg_kmem_uncharge(memcg, nr_pages); + css_put(&memcg->css); } /* From e74d225910ec3a9999f06934afa068b6a30babf8 Mon Sep 17 00:00:00 2001 From: Muchun Song Date: Thu, 29 Apr 2021 22:56:42 -0700 Subject: [PATCH 067/175] mm: memcontrol: introduce obj_cgroup_{un}charge_pages We know that the unit of slab object charging is bytes, the unit of kmem page charging is PAGE_SIZE. If we want to reuse obj_cgroup APIs to charge the kmem pages, we should pass PAGE_SIZE (as third parameter) to obj_cgroup_charge(). Because the size is already PAGE_SIZE, we can skip touch the objcg stock. And obj_cgroup_{un}charge_pages() are introduced to charge in units of page level. In the latter patch, we also can reuse those two helpers to charge or uncharge a number of kernel pages to a object cgroup. This is just a code movement without any functional changes. Link: https://lkml.kernel.org/r/20210319163821.20704-3-songmuchun@bytedance.com Signed-off-by: Muchun Song Acked-by: Roman Gushchin Acked-by: Johannes Weiner Reviewed-by: Shakeel Butt Cc: Michal Hocko Cc: Vladimir Davydov Cc: Xiongchun Duan Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memcontrol.c | 63 +++++++++++++++++++++++++++++++------------------ 1 file changed, 40 insertions(+), 23 deletions(-) diff --git a/mm/memcontrol.c b/mm/memcontrol.c index cf7a10d2fbd1be..0f77608e89ca40 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -2874,6 +2874,20 @@ static void commit_charge(struct page *page, struct mem_cgroup *memcg) page->memcg_data = (unsigned long)memcg; } +static struct mem_cgroup *get_mem_cgroup_from_objcg(struct obj_cgroup *objcg) +{ + struct mem_cgroup *memcg; + + rcu_read_lock(); +retry: + memcg = obj_cgroup_memcg(objcg); + if (unlikely(!css_tryget(&memcg->css))) + goto retry; + rcu_read_unlock(); + + return memcg; +} + #ifdef CONFIG_MEMCG_KMEM int memcg_alloc_page_obj_cgroups(struct page *page, struct kmem_cache *s, gfp_t gfp, bool new_page) @@ -3025,6 +3039,29 @@ static void memcg_free_cache_id(int id) ida_simple_remove(&memcg_cache_ida, id); } +static void obj_cgroup_uncharge_pages(struct obj_cgroup *objcg, + unsigned int nr_pages) +{ + struct mem_cgroup *memcg; + + memcg = get_mem_cgroup_from_objcg(objcg); + __memcg_kmem_uncharge(memcg, nr_pages); + css_put(&memcg->css); +} + +static int obj_cgroup_charge_pages(struct obj_cgroup *objcg, gfp_t gfp, + unsigned int nr_pages) +{ + struct mem_cgroup *memcg; + int ret; + + memcg = get_mem_cgroup_from_objcg(objcg); + ret = __memcg_kmem_charge(memcg, gfp, nr_pages); + css_put(&memcg->css); + + return ret; +} + /** * __memcg_kmem_charge: charge a number of kernel pages to a memcg * @memcg: memory cgroup to charge @@ -3149,19 +3186,8 @@ static void drain_obj_stock(struct memcg_stock_pcp *stock) unsigned int nr_pages = stock->nr_bytes >> PAGE_SHIFT; unsigned int nr_bytes = stock->nr_bytes & (PAGE_SIZE - 1); - if (nr_pages) { - struct mem_cgroup *memcg; - - rcu_read_lock(); -retry: - memcg = obj_cgroup_memcg(old); - if (unlikely(!css_tryget(&memcg->css))) - goto retry; - rcu_read_unlock(); - - __memcg_kmem_uncharge(memcg, nr_pages); - css_put(&memcg->css); - } + if (nr_pages) + obj_cgroup_uncharge_pages(old, nr_pages); /* * The leftover is flushed to the centralized per-memcg value. @@ -3219,7 +3245,6 @@ static void refill_obj_stock(struct obj_cgroup *objcg, unsigned int nr_bytes) int obj_cgroup_charge(struct obj_cgroup *objcg, gfp_t gfp, size_t size) { - struct mem_cgroup *memcg; unsigned int nr_pages, nr_bytes; int ret; @@ -3236,24 +3261,16 @@ int obj_cgroup_charge(struct obj_cgroup *objcg, gfp_t gfp, size_t size) * refill_obj_stock(), called from this function or * independently later. */ - rcu_read_lock(); -retry: - memcg = obj_cgroup_memcg(objcg); - if (unlikely(!css_tryget(&memcg->css))) - goto retry; - rcu_read_unlock(); - nr_pages = size >> PAGE_SHIFT; nr_bytes = size & (PAGE_SIZE - 1); if (nr_bytes) nr_pages += 1; - ret = __memcg_kmem_charge(memcg, gfp, nr_pages); + ret = obj_cgroup_charge_pages(objcg, gfp, nr_pages); if (!ret && nr_bytes) refill_obj_stock(objcg, PAGE_SIZE - nr_bytes); - css_put(&memcg->css); return ret; } From 48060834f2277374bb68c04c62de8b57e769f701 Mon Sep 17 00:00:00 2001 From: Muchun Song Date: Thu, 29 Apr 2021 22:56:45 -0700 Subject: [PATCH 068/175] mm: memcontrol: directly access page->memcg_data in mm/page_alloc.c page_memcg() is not suitable for use by page_expected_state() and page_bad_reason(). Because it can BUG_ON() for the slab pages when CONFIG_DEBUG_VM is enabled. As neither lru, nor kmem, nor slab page should have anything left in there by the time the page is freed, what we care about is whether the value of page->memcg_data is 0. So just directly access page->memcg_data here. Link: https://lkml.kernel.org/r/20210319163821.20704-4-songmuchun@bytedance.com Signed-off-by: Muchun Song Acked-by: Johannes Weiner Reviewed-by: Shakeel Butt Cc: Michal Hocko Cc: Roman Gushchin Cc: Vladimir Davydov Cc: Xiongchun Duan Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_alloc.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/mm/page_alloc.c b/mm/page_alloc.c index e2f19bf948dbe6..56a8103580d603 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -1103,7 +1103,7 @@ static inline bool page_expected_state(struct page *page, if (unlikely((unsigned long)page->mapping | page_ref_count(page) | #ifdef CONFIG_MEMCG - (unsigned long)page_memcg(page) | + page->memcg_data | #endif (page->flags & check_flags))) return false; @@ -1128,7 +1128,7 @@ static const char *page_bad_reason(struct page *page, unsigned long flags) bad_reason = "PAGE_FLAGS_CHECK_AT_FREE flag(s) set"; } #ifdef CONFIG_MEMCG - if (unlikely(page_memcg(page))) + if (unlikely(page->memcg_data)) bad_reason = "page still charged to cgroup"; #endif return bad_reason; From 7ab345a8973017c89a1be87b6c8722d1fee1fd95 Mon Sep 17 00:00:00 2001 From: Muchun Song Date: Thu, 29 Apr 2021 22:56:48 -0700 Subject: [PATCH 069/175] mm: memcontrol: change ug->dummy_page only if memcg changed Just like assignment to ug->memcg, we only need to update ug->dummy_page if memcg changed. So move it to there. This is a very small optimization. Link: https://lkml.kernel.org/r/20210319163821.20704-5-songmuchun@bytedance.com Signed-off-by: Muchun Song Acked-by: Johannes Weiner Reviewed-by: Shakeel Butt Cc: Michal Hocko Cc: Roman Gushchin Cc: Vladimir Davydov Cc: Xiongchun Duan Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memcontrol.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 0f77608e89ca40..85fd2d86d23f79 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -6843,6 +6843,7 @@ static void uncharge_page(struct page *page, struct uncharge_gather *ug) uncharge_gather_clear(ug); } ug->memcg = page_memcg(page); + ug->dummy_page = page; /* pairs with css_put in uncharge_batch */ css_get(&ug->memcg->css); @@ -6856,7 +6857,6 @@ static void uncharge_page(struct page *page, struct uncharge_gather *ug) else ug->pgpgout++; - ug->dummy_page = page; page->memcg_data = 0; css_put(&ug->memcg->css); } From b4e0b68fbd9d1fd7e31cbe8adca3ad6cf556e2ee Mon Sep 17 00:00:00 2001 From: Muchun Song Date: Thu, 29 Apr 2021 22:56:52 -0700 Subject: [PATCH 070/175] mm: memcontrol: use obj_cgroup APIs to charge kmem pages Since Roman's series "The new cgroup slab memory controller" applied. All slab objects are charged via the new APIs of obj_cgroup. The new APIs introduce a struct obj_cgroup to charge slab objects. It prevents long-living objects from pinning the original memory cgroup in the memory. But there are still some corner objects (e.g. allocations larger than order-1 page on SLUB) which are not charged via the new APIs. Those objects (include the pages which are allocated from buddy allocator directly) are charged as kmem pages which still hold a reference to the memory cgroup. We want to reuse the obj_cgroup APIs to charge the kmem pages. If we do that, we should store an object cgroup pointer to page->memcg_data for the kmem pages. Finally, page->memcg_data will have 3 different meanings. 1) For the slab pages, page->memcg_data points to an object cgroups vector. 2) For the kmem pages (exclude the slab pages), page->memcg_data points to an object cgroup. 3) For the user pages (e.g. the LRU pages), page->memcg_data points to a memory cgroup. We do not change the behavior of page_memcg() and page_memcg_rcu(). They are also suitable for LRU pages and kmem pages. Why? Because memory allocations pinning memcgs for a long time - it exists at a larger scale and is causing recurring problems in the real world: page cache doesn't get reclaimed for a long time, or is used by the second, third, fourth, ... instance of the same job that was restarted into a new cgroup every time. Unreclaimable dying cgroups pile up, waste memory, and make page reclaim very inefficient. We can convert LRU pages and most other raw memcg pins to the objcg direction to fix this problem, and then the page->memcg will always point to an object cgroup pointer. At that time, LRU pages and kmem pages will be treated the same. The implementation of page_memcg() will remove the kmem page check. This patch aims to charge the kmem pages by using the new APIs of obj_cgroup. Finally, the page->memcg_data of the kmem page points to an object cgroup. We can use the __page_objcg() to get the object cgroup associated with a kmem page. Or we can use page_memcg() to get the memory cgroup associated with a kmem page, but caller must ensure that the returned memcg won't be released (e.g. acquire the rcu_read_lock or css_set_lock). Link: https://lkml.kernel.org/r/20210401030141.37061-1-songmuchun@bytedance.com Link: https://lkml.kernel.org/r/20210319163821.20704-6-songmuchun@bytedance.com Signed-off-by: Muchun Song Acked-by: Johannes Weiner Reviewed-by: Shakeel Butt Acked-by: Roman Gushchin Reviewed-by: Miaohe Lin Cc: Michal Hocko Cc: Vladimir Davydov Cc: Xiongchun Duan Cc: Christian Borntraeger [songmuchun@bytedance.com: fix forget to obtain the ref to objcg in split_page_memcg] Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/memcontrol.h | 120 ++++++++++++++++++++++++++++++------- mm/memcontrol.c | 116 +++++++++++++++++------------------ 2 files changed, 155 insertions(+), 81 deletions(-) diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index e946c96daa32bc..78ca34c935aba6 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -371,6 +371,62 @@ enum page_memcg_data_flags { #define MEMCG_DATA_FLAGS_MASK (__NR_MEMCG_DATA_FLAGS - 1) +static inline bool PageMemcgKmem(struct page *page); + +/* + * After the initialization objcg->memcg is always pointing at + * a valid memcg, but can be atomically swapped to the parent memcg. + * + * The caller must ensure that the returned memcg won't be released: + * e.g. acquire the rcu_read_lock or css_set_lock. + */ +static inline struct mem_cgroup *obj_cgroup_memcg(struct obj_cgroup *objcg) +{ + return READ_ONCE(objcg->memcg); +} + +/* + * __page_memcg - get the memory cgroup associated with a non-kmem page + * @page: a pointer to the page struct + * + * Returns a pointer to the memory cgroup associated with the page, + * or NULL. This function assumes that the page is known to have a + * proper memory cgroup pointer. It's not safe to call this function + * against some type of pages, e.g. slab pages or ex-slab pages or + * kmem pages. + */ +static inline struct mem_cgroup *__page_memcg(struct page *page) +{ + unsigned long memcg_data = page->memcg_data; + + VM_BUG_ON_PAGE(PageSlab(page), page); + VM_BUG_ON_PAGE(memcg_data & MEMCG_DATA_OBJCGS, page); + VM_BUG_ON_PAGE(memcg_data & MEMCG_DATA_KMEM, page); + + return (struct mem_cgroup *)(memcg_data & ~MEMCG_DATA_FLAGS_MASK); +} + +/* + * __page_objcg - get the object cgroup associated with a kmem page + * @page: a pointer to the page struct + * + * Returns a pointer to the object cgroup associated with the page, + * or NULL. This function assumes that the page is known to have a + * proper object cgroup pointer. It's not safe to call this function + * against some type of pages, e.g. slab pages or ex-slab pages or + * LRU pages. + */ +static inline struct obj_cgroup *__page_objcg(struct page *page) +{ + unsigned long memcg_data = page->memcg_data; + + VM_BUG_ON_PAGE(PageSlab(page), page); + VM_BUG_ON_PAGE(memcg_data & MEMCG_DATA_OBJCGS, page); + VM_BUG_ON_PAGE(!(memcg_data & MEMCG_DATA_KMEM), page); + + return (struct obj_cgroup *)(memcg_data & ~MEMCG_DATA_FLAGS_MASK); +} + /* * page_memcg - get the memory cgroup associated with a page * @page: a pointer to the page struct @@ -380,20 +436,23 @@ enum page_memcg_data_flags { * proper memory cgroup pointer. It's not safe to call this function * against some type of pages, e.g. slab pages or ex-slab pages. * - * Any of the following ensures page and memcg binding stability: + * For a non-kmem page any of the following ensures page and memcg binding + * stability: + * * - the page lock * - LRU isolation * - lock_page_memcg() * - exclusive reference + * + * For a kmem page a caller should hold an rcu read lock to protect memcg + * associated with a kmem page from being released. */ static inline struct mem_cgroup *page_memcg(struct page *page) { - unsigned long memcg_data = page->memcg_data; - - VM_BUG_ON_PAGE(PageSlab(page), page); - VM_BUG_ON_PAGE(memcg_data & MEMCG_DATA_OBJCGS, page); - - return (struct mem_cgroup *)(memcg_data & ~MEMCG_DATA_FLAGS_MASK); + if (PageMemcgKmem(page)) + return obj_cgroup_memcg(__page_objcg(page)); + else + return __page_memcg(page); } /* @@ -407,11 +466,19 @@ static inline struct mem_cgroup *page_memcg(struct page *page) */ static inline struct mem_cgroup *page_memcg_rcu(struct page *page) { + unsigned long memcg_data = READ_ONCE(page->memcg_data); + VM_BUG_ON_PAGE(PageSlab(page), page); WARN_ON_ONCE(!rcu_read_lock_held()); - return (struct mem_cgroup *)(READ_ONCE(page->memcg_data) & - ~MEMCG_DATA_FLAGS_MASK); + if (memcg_data & MEMCG_DATA_KMEM) { + struct obj_cgroup *objcg; + + objcg = (void *)(memcg_data & ~MEMCG_DATA_FLAGS_MASK); + return obj_cgroup_memcg(objcg); + } + + return (struct mem_cgroup *)(memcg_data & ~MEMCG_DATA_FLAGS_MASK); } /* @@ -419,15 +486,21 @@ static inline struct mem_cgroup *page_memcg_rcu(struct page *page) * @page: a pointer to the page struct * * Returns a pointer to the memory cgroup associated with the page, - * or NULL. This function unlike page_memcg() can take any page + * or NULL. This function unlike page_memcg() can take any page * as an argument. It has to be used in cases when it's not known if a page - * has an associated memory cgroup pointer or an object cgroups vector. + * has an associated memory cgroup pointer or an object cgroups vector or + * an object cgroup. + * + * For a non-kmem page any of the following ensures page and memcg binding + * stability: * - * Any of the following ensures page and memcg binding stability: * - the page lock * - LRU isolation * - lock_page_memcg() * - exclusive reference + * + * For a kmem page a caller should hold an rcu read lock to protect memcg + * associated with a kmem page from being released. */ static inline struct mem_cgroup *page_memcg_check(struct page *page) { @@ -440,6 +513,13 @@ static inline struct mem_cgroup *page_memcg_check(struct page *page) if (memcg_data & MEMCG_DATA_OBJCGS) return NULL; + if (memcg_data & MEMCG_DATA_KMEM) { + struct obj_cgroup *objcg; + + objcg = (void *)(memcg_data & ~MEMCG_DATA_FLAGS_MASK); + return obj_cgroup_memcg(objcg); + } + return (struct mem_cgroup *)(memcg_data & ~MEMCG_DATA_FLAGS_MASK); } @@ -718,21 +798,15 @@ static inline void obj_cgroup_get(struct obj_cgroup *objcg) percpu_ref_get(&objcg->refcnt); } -static inline void obj_cgroup_put(struct obj_cgroup *objcg) +static inline void obj_cgroup_get_many(struct obj_cgroup *objcg, + unsigned long nr) { - percpu_ref_put(&objcg->refcnt); + percpu_ref_get_many(&objcg->refcnt, nr); } -/* - * After the initialization objcg->memcg is always pointing at - * a valid memcg, but can be atomically swapped to the parent memcg. - * - * The caller must ensure that the returned memcg won't be released: - * e.g. acquire the rcu_read_lock or css_set_lock. - */ -static inline struct mem_cgroup *obj_cgroup_memcg(struct obj_cgroup *objcg) +static inline void obj_cgroup_put(struct obj_cgroup *objcg) { - return READ_ONCE(objcg->memcg); + percpu_ref_put(&objcg->refcnt); } static inline void mem_cgroup_put(struct mem_cgroup *memcg) diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 85fd2d86d23f79..f3ddf94c9485d3 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -865,18 +865,22 @@ void __mod_lruvec_page_state(struct page *page, enum node_stat_item idx, int val) { struct page *head = compound_head(page); /* rmap on tail pages */ - struct mem_cgroup *memcg = page_memcg(head); + struct mem_cgroup *memcg; pg_data_t *pgdat = page_pgdat(page); struct lruvec *lruvec; + rcu_read_lock(); + memcg = page_memcg(head); /* Untracked pages have no memcg, no lruvec. Update only the node */ if (!memcg) { + rcu_read_unlock(); __mod_node_page_state(pgdat, idx, val); return; } lruvec = mem_cgroup_lruvec(memcg, pgdat); __mod_lruvec_state(lruvec, idx, val); + rcu_read_unlock(); } EXPORT_SYMBOL(__mod_lruvec_page_state); @@ -1051,20 +1055,6 @@ static __always_inline struct mem_cgroup *active_memcg(void) return current->active_memcg; } -static __always_inline struct mem_cgroup *get_active_memcg(void) -{ - struct mem_cgroup *memcg; - - rcu_read_lock(); - memcg = active_memcg(); - /* remote memcg must hold a ref. */ - if (memcg && WARN_ON_ONCE(!css_tryget(&memcg->css))) - memcg = root_mem_cgroup; - rcu_read_unlock(); - - return memcg; -} - static __always_inline bool memcg_kmem_bypass(void) { /* Allow remote memcg charging from any context. */ @@ -1078,20 +1068,6 @@ static __always_inline bool memcg_kmem_bypass(void) return false; } -/** - * If active memcg is set, do not fallback to current->mm->memcg. - */ -static __always_inline struct mem_cgroup *get_mem_cgroup_from_current(void) -{ - if (memcg_kmem_bypass()) - return NULL; - - if (unlikely(active_memcg())) - return get_active_memcg(); - - return get_mem_cgroup_from_mm(current->mm); -} - /** * mem_cgroup_iter - iterate over memory cgroup hierarchy * @root: hierarchy root @@ -3121,18 +3097,18 @@ static void __memcg_kmem_uncharge(struct mem_cgroup *memcg, unsigned int nr_page */ int __memcg_kmem_charge_page(struct page *page, gfp_t gfp, int order) { - struct mem_cgroup *memcg; + struct obj_cgroup *objcg; int ret = 0; - memcg = get_mem_cgroup_from_current(); - if (memcg && !mem_cgroup_is_root(memcg)) { - ret = __memcg_kmem_charge(memcg, gfp, 1 << order); + objcg = get_obj_cgroup_from_current(); + if (objcg) { + ret = obj_cgroup_charge_pages(objcg, gfp, 1 << order); if (!ret) { - page->memcg_data = (unsigned long)memcg | + page->memcg_data = (unsigned long)objcg | MEMCG_DATA_KMEM; return 0; } - css_put(&memcg->css); + obj_cgroup_put(objcg); } return ret; } @@ -3144,16 +3120,16 @@ int __memcg_kmem_charge_page(struct page *page, gfp_t gfp, int order) */ void __memcg_kmem_uncharge_page(struct page *page, int order) { - struct mem_cgroup *memcg = page_memcg(page); + struct obj_cgroup *objcg; unsigned int nr_pages = 1 << order; - if (!memcg) + if (!PageMemcgKmem(page)) return; - VM_BUG_ON_PAGE(mem_cgroup_is_root(memcg), page); - __memcg_kmem_uncharge(memcg, nr_pages); + objcg = __page_objcg(page); + obj_cgroup_uncharge_pages(objcg, nr_pages); page->memcg_data = 0; - css_put(&memcg->css); + obj_cgroup_put(objcg); } static bool consume_obj_stock(struct obj_cgroup *objcg, unsigned int nr_bytes) @@ -3294,7 +3270,11 @@ void split_page_memcg(struct page *head, unsigned int nr) for (i = 1; i < nr; i++) head[i].memcg_data = head->memcg_data; - css_get_many(&memcg->css, nr - 1); + + if (PageMemcgKmem(head)) + obj_cgroup_get_many(__page_objcg(head), nr - 1); + else + css_get_many(&memcg->css, nr - 1); } #ifdef CONFIG_MEMCG_SWAP @@ -6788,7 +6768,7 @@ void mem_cgroup_swapin_uncharge_swap(swp_entry_t entry) struct uncharge_gather { struct mem_cgroup *memcg; - unsigned long nr_pages; + unsigned long nr_memory; unsigned long pgpgout; unsigned long nr_kmem; struct page *dummy_page; @@ -6803,10 +6783,10 @@ static void uncharge_batch(const struct uncharge_gather *ug) { unsigned long flags; - if (!mem_cgroup_is_root(ug->memcg)) { - page_counter_uncharge(&ug->memcg->memory, ug->nr_pages); + if (ug->nr_memory) { + page_counter_uncharge(&ug->memcg->memory, ug->nr_memory); if (do_memsw_account()) - page_counter_uncharge(&ug->memcg->memsw, ug->nr_pages); + page_counter_uncharge(&ug->memcg->memsw, ug->nr_memory); if (!cgroup_subsys_on_dfl(memory_cgrp_subsys) && ug->nr_kmem) page_counter_uncharge(&ug->memcg->kmem, ug->nr_kmem); memcg_oom_recover(ug->memcg); @@ -6814,7 +6794,7 @@ static void uncharge_batch(const struct uncharge_gather *ug) local_irq_save(flags); __count_memcg_events(ug->memcg, PGPGOUT, ug->pgpgout); - __this_cpu_add(ug->memcg->vmstats_percpu->nr_page_events, ug->nr_pages); + __this_cpu_add(ug->memcg->vmstats_percpu->nr_page_events, ug->nr_memory); memcg_check_events(ug->memcg, ug->dummy_page); local_irq_restore(flags); @@ -6825,40 +6805,60 @@ static void uncharge_batch(const struct uncharge_gather *ug) static void uncharge_page(struct page *page, struct uncharge_gather *ug) { unsigned long nr_pages; + struct mem_cgroup *memcg; + struct obj_cgroup *objcg; VM_BUG_ON_PAGE(PageLRU(page), page); - if (!page_memcg(page)) - return; - /* * Nobody should be changing or seriously looking at - * page_memcg(page) at this point, we have fully + * page memcg or objcg at this point, we have fully * exclusive access to the page. */ + if (PageMemcgKmem(page)) { + objcg = __page_objcg(page); + /* + * This get matches the put at the end of the function and + * kmem pages do not hold memcg references anymore. + */ + memcg = get_mem_cgroup_from_objcg(objcg); + } else { + memcg = __page_memcg(page); + } - if (ug->memcg != page_memcg(page)) { + if (!memcg) + return; + + if (ug->memcg != memcg) { if (ug->memcg) { uncharge_batch(ug); uncharge_gather_clear(ug); } - ug->memcg = page_memcg(page); + ug->memcg = memcg; ug->dummy_page = page; /* pairs with css_put in uncharge_batch */ - css_get(&ug->memcg->css); + css_get(&memcg->css); } nr_pages = compound_nr(page); - ug->nr_pages += nr_pages; - if (PageMemcgKmem(page)) + if (PageMemcgKmem(page)) { + ug->nr_memory += nr_pages; ug->nr_kmem += nr_pages; - else + + page->memcg_data = 0; + obj_cgroup_put(objcg); + } else { + /* LRU pages aren't accounted at the root level */ + if (!mem_cgroup_is_root(memcg)) + ug->nr_memory += nr_pages; ug->pgpgout++; - page->memcg_data = 0; - css_put(&ug->memcg->css); + page->memcg_data = 0; + } + + css_put(&memcg->css); } /** From f1286fae540697e0b4713a8262f4aab5cf65f1c5 Mon Sep 17 00:00:00 2001 From: Muchun Song Date: Thu, 29 Apr 2021 22:56:55 -0700 Subject: [PATCH 071/175] mm: memcontrol: inline __memcg_kmem_{un}charge() into obj_cgroup_{un}charge_pages() There is only one user of __memcg_kmem_charge(), so manually inline __memcg_kmem_charge() to obj_cgroup_charge_pages(). Similarly manually inline __memcg_kmem_uncharge() into obj_cgroup_uncharge_pages() and call obj_cgroup_uncharge_pages() in obj_cgroup_release(). This is just code cleanup without any functionality changes. Link: https://lkml.kernel.org/r/20210319163821.20704-7-songmuchun@bytedance.com Signed-off-by: Muchun Song Reviewed-by: Shakeel Butt Acked-by: Roman Gushchin Cc: Johannes Weiner Cc: Michal Hocko Cc: Vladimir Davydov Cc: Xiongchun Duan Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memcontrol.c | 63 ++++++++++++++++++++----------------------------- 1 file changed, 25 insertions(+), 38 deletions(-) diff --git a/mm/memcontrol.c b/mm/memcontrol.c index f3ddf94c9485d3..c100265dc393cc 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -255,10 +255,8 @@ struct cgroup_subsys_state *vmpressure_to_css(struct vmpressure *vmpr) #ifdef CONFIG_MEMCG_KMEM extern spinlock_t css_set_lock; -static int __memcg_kmem_charge(struct mem_cgroup *memcg, gfp_t gfp, - unsigned int nr_pages); -static void __memcg_kmem_uncharge(struct mem_cgroup *memcg, - unsigned int nr_pages); +static void obj_cgroup_uncharge_pages(struct obj_cgroup *objcg, + unsigned int nr_pages); static void obj_cgroup_release(struct percpu_ref *ref) { @@ -295,7 +293,7 @@ static void obj_cgroup_release(struct percpu_ref *ref) spin_lock_irqsave(&css_set_lock, flags); memcg = obj_cgroup_memcg(objcg); if (nr_pages) - __memcg_kmem_uncharge(memcg, nr_pages); + obj_cgroup_uncharge_pages(objcg, nr_pages); list_del(&objcg->list); mem_cgroup_put(memcg); spin_unlock_irqrestore(&css_set_lock, flags); @@ -3015,46 +3013,45 @@ static void memcg_free_cache_id(int id) ida_simple_remove(&memcg_cache_ida, id); } +/* + * obj_cgroup_uncharge_pages: uncharge a number of kernel pages from a objcg + * @objcg: object cgroup to uncharge + * @nr_pages: number of pages to uncharge + */ static void obj_cgroup_uncharge_pages(struct obj_cgroup *objcg, unsigned int nr_pages) { struct mem_cgroup *memcg; memcg = get_mem_cgroup_from_objcg(objcg); - __memcg_kmem_uncharge(memcg, nr_pages); - css_put(&memcg->css); -} -static int obj_cgroup_charge_pages(struct obj_cgroup *objcg, gfp_t gfp, - unsigned int nr_pages) -{ - struct mem_cgroup *memcg; - int ret; + if (!cgroup_subsys_on_dfl(memory_cgrp_subsys)) + page_counter_uncharge(&memcg->kmem, nr_pages); + refill_stock(memcg, nr_pages); - memcg = get_mem_cgroup_from_objcg(objcg); - ret = __memcg_kmem_charge(memcg, gfp, nr_pages); css_put(&memcg->css); - - return ret; } -/** - * __memcg_kmem_charge: charge a number of kernel pages to a memcg - * @memcg: memory cgroup to charge +/* + * obj_cgroup_charge_pages: charge a number of kernel pages to a objcg + * @objcg: object cgroup to charge * @gfp: reclaim mode * @nr_pages: number of pages to charge * * Returns 0 on success, an error code on failure. */ -static int __memcg_kmem_charge(struct mem_cgroup *memcg, gfp_t gfp, - unsigned int nr_pages) +static int obj_cgroup_charge_pages(struct obj_cgroup *objcg, gfp_t gfp, + unsigned int nr_pages) { struct page_counter *counter; + struct mem_cgroup *memcg; int ret; + memcg = get_mem_cgroup_from_objcg(objcg); + ret = try_charge(memcg, gfp, nr_pages); if (ret) - return ret; + goto out; if (!cgroup_subsys_on_dfl(memory_cgrp_subsys) && !page_counter_try_charge(&memcg->kmem, nr_pages, &counter)) { @@ -3066,25 +3063,15 @@ static int __memcg_kmem_charge(struct mem_cgroup *memcg, gfp_t gfp, */ if (gfp & __GFP_NOFAIL) { page_counter_charge(&memcg->kmem, nr_pages); - return 0; + goto out; } cancel_charge(memcg, nr_pages); - return -ENOMEM; + ret = -ENOMEM; } - return 0; -} - -/** - * __memcg_kmem_uncharge: uncharge a number of kernel pages from a memcg - * @memcg: memcg to uncharge - * @nr_pages: number of pages to uncharge - */ -static void __memcg_kmem_uncharge(struct mem_cgroup *memcg, unsigned int nr_pages) -{ - if (!cgroup_subsys_on_dfl(memory_cgrp_subsys)) - page_counter_uncharge(&memcg->kmem, nr_pages); +out: + css_put(&memcg->css); - refill_stock(memcg, nr_pages); + return ret; } /** From bd290e1e75d8a8b2d87031b63db56ae165677870 Mon Sep 17 00:00:00 2001 From: Muchun Song Date: Thu, 29 Apr 2021 22:56:58 -0700 Subject: [PATCH 072/175] mm: memcontrol: move PageMemcgKmem to the scope of CONFIG_MEMCG_KMEM The page only can be marked as kmem when CONFIG_MEMCG_KMEM is enabled. So move PageMemcgKmem() to the scope of the CONFIG_MEMCG_KMEM. As a bonus, on !CONFIG_MEMCG_KMEM build some code can be compiled out. Link: https://lkml.kernel.org/r/20210319163821.20704-8-songmuchun@bytedance.com Signed-off-by: Muchun Song Acked-by: Roman Gushchin Reviewed-by: Shakeel Butt Acked-by: Johannes Weiner Cc: Michal Hocko Cc: Vladimir Davydov Cc: Xiongchun Duan Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/memcontrol.h | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index 78ca34c935aba6..bcf92b99f00172 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -523,6 +523,7 @@ static inline struct mem_cgroup *page_memcg_check(struct page *page) return (struct mem_cgroup *)(memcg_data & ~MEMCG_DATA_FLAGS_MASK); } +#ifdef CONFIG_MEMCG_KMEM /* * PageMemcgKmem - check if the page has MemcgKmem flag set * @page: a pointer to the page struct @@ -537,7 +538,6 @@ static inline bool PageMemcgKmem(struct page *page) return page->memcg_data & MEMCG_DATA_KMEM; } -#ifdef CONFIG_MEMCG_KMEM /* * page_objcgs - get the object cgroups vector associated with a page * @page: a pointer to the page struct @@ -579,6 +579,11 @@ static inline struct obj_cgroup **page_objcgs_check(struct page *page) } #else +static inline bool PageMemcgKmem(struct page *page) +{ + return false; +} + static inline struct obj_cgroup **page_objcgs(struct page *page) { return NULL; From a10e995749a6c65833edd201c55665e5d44d14fc Mon Sep 17 00:00:00 2001 From: Wan Jiabing Date: Thu, 29 Apr 2021 22:57:01 -0700 Subject: [PATCH 073/175] linux/memcontrol.h: remove duplicate struct declaration struct mem_cgroup is declared twice. One has been declared at forward struct declaration. Remove the duplicate. Link: https://lkml.kernel.org/r/20210330020246.2265371-1-wanjiabing@vivo.com Signed-off-by: Wan Jiabing Reviewed-by: Muchun Song Acked-by: Michal Hocko Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/memcontrol.h | 2 -- 1 file changed, 2 deletions(-) diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index bcf92b99f00172..5904716f29ba4f 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -1123,8 +1123,6 @@ unsigned long mem_cgroup_soft_limit_reclaim(pg_data_t *pgdat, int order, #define MEM_CGROUP_ID_SHIFT 0 #define MEM_CGROUP_ID_MAX 0 -struct mem_cgroup; - static inline struct mem_cgroup *page_memcg(struct page *page) { return NULL; From 9317d0fffeb4c3929069cfc7377cfa2a7cd36d1d Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Thu, 29 Apr 2021 22:57:04 -0700 Subject: [PATCH 074/175] mm: page_counter: mitigate consequences of a page_counter underflow When the unsigned page_counter underflows, even just by a few pages, a cgroup will not be able to run anything afterwards and trigger the OOM killer in a loop. Underflows shouldn't happen, but when they do in practice, we may just be off by a small amount that doesn't interfere with the normal operation - consequences don't need to be that dire. Reset the page_counter to 0 upon underflow. We'll issue a warning that the accounting will be off and then try to keep limping along. [ We used to do this with the original res_counter, where it was a more straight-forward correction inside the spinlock section. I didn't carry it forward into the lockless page counters for simplicity, but it turns out this is quite useful in practice. ] Link: https://lkml.kernel.org/r/20210408143155.2679744-1-hannes@cmpxchg.org Signed-off-by: Johannes Weiner Acked-by: Michal Hocko Acked-by: Chris Down Reviewed-by: Shakeel Butt Cc: Hugh Dickins Cc: Roman Gushchin Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_counter.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/mm/page_counter.c b/mm/page_counter.c index c6860f51b6c608..7d83641eb86b72 100644 --- a/mm/page_counter.c +++ b/mm/page_counter.c @@ -52,9 +52,13 @@ void page_counter_cancel(struct page_counter *counter, unsigned long nr_pages) long new; new = atomic_long_sub_return(nr_pages, &counter->usage); - propagate_protected_usage(counter, new); /* More uncharges than charges? */ - WARN_ON_ONCE(new < 0); + if (WARN_ONCE(new < 0, "page_counter underflow: %ld nr_pages=%lu\n", + new, nr_pages)) { + new = 0; + atomic_long_set(&counter->usage, new); + } + propagate_protected_usage(counter, new); } /** From bf90ac198e30d242a12fc550d35b335e462a7632 Mon Sep 17 00:00:00 2001 From: Wang Qing Date: Thu, 29 Apr 2021 22:57:07 -0700 Subject: [PATCH 075/175] mm/memory.c: do_numa_page(): delete bool "migrated" Smatch gives the warning: do_numa_page() warn: assigning (-11) to unsigned variable 'migrated' Link: https://lkml.kernel.org/r/1614603421-2681-1-git-send-email-wangqing@vivo.com Signed-off-by: Wang Qing Reviewed-by: Andrew Morton Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memory.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/mm/memory.c b/mm/memory.c index 3196fa5e7f7d78..f95d12032b3404 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -4098,7 +4098,6 @@ static vm_fault_t do_numa_page(struct vm_fault *vmf) int page_nid = NUMA_NO_NODE; int last_cpupid; int target_nid; - bool migrated = false; pte_t pte, old_pte; bool was_writable = pte_savedwrite(vmf->orig_pte); int flags = 0; @@ -4168,8 +4167,7 @@ static vm_fault_t do_numa_page(struct vm_fault *vmf) } /* Migrate to the requested node */ - migrated = migrate_misplaced_page(page, vma, target_nid); - if (migrated) { + if (migrate_misplaced_page(page, vma, target_nid)) { page_nid = target_nid; flags |= TNF_MIGRATED; } else From 0c1dcb052452ed667719b20ca35837bcf9ca4375 Mon Sep 17 00:00:00 2001 From: Zhiyuan Dai Date: Thu, 29 Apr 2021 22:57:09 -0700 Subject: [PATCH 076/175] mm/interval_tree: add comments to improve code readability Add a comment explaining the value of the ISSTATIC parameter, Inform the reader that this is not a coding style issue. Link: https://lkml.kernel.org/r/1613964695-17614-1-git-send-email-daizhiyuan@phytium.com.cn Signed-off-by: Zhiyuan Dai Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/interval_tree.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mm/interval_tree.c b/mm/interval_tree.c index 11c75fb0758424..32e390c42c53f2 100644 --- a/mm/interval_tree.c +++ b/mm/interval_tree.c @@ -22,7 +22,7 @@ static inline unsigned long vma_last_pgoff(struct vm_area_struct *v) INTERVAL_TREE_DEFINE(struct vm_area_struct, shared.rb, unsigned long, shared.rb_subtree_last, - vma_start_pgoff, vma_last_pgoff,, vma_interval_tree) + vma_start_pgoff, vma_last_pgoff, /* empty */, vma_interval_tree) /* Insert node immediately after prev in the interval tree */ void vma_interval_tree_insert_after(struct vm_area_struct *node, From 8e2df191ae7029010db386efd31be87d4d01cea6 Mon Sep 17 00:00:00 2001 From: Oscar Salvador Date: Thu, 29 Apr 2021 22:57:12 -0700 Subject: [PATCH 077/175] x86/vmemmap: drop handling of 4K unaligned vmemmap range Patch series "Cleanup and fixups for vmemmap handling", v6. This series contains cleanups to remove dead code that handles unaligned cases for 4K and 1GB pages (patch#1 and patch#2) when removing the vemmmap range, and a fix (patch#3) to handle the case when two vmemmap ranges intersect the same PMD. This patch (of 4): remove_pte_table() is prepared to handle the case where either the start or the end of the range is not PAGE aligned. This cannot actually happen: __populate_section_memmap enforces the range to be PMD aligned, so as long as the size of the struct page remains multiple of 8, the vmemmap range will be aligned to PAGE_SIZE. Drop the dead code and place a VM_BUG_ON in vmemmap_{populate,free} to catch nasty cases. Note that the VM_BUG_ON is placed in there because vmemmap_{populate,free= } is the gate of all removing and freeing page tables logic. Link: https://lkml.kernel.org/r/20210309214050.4674-1-osalvador@suse.de Link: https://lkml.kernel.org/r/20210309214050.4674-2-osalvador@suse.de Signed-off-by: Oscar Salvador Suggested-by: David Hildenbrand Reviewed-by: David Hildenbrand Acked-by: Dave Hansen Cc: Andy Lutomirski Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: Ingo Molnar Cc: Borislav Petkov Cc: "H . Peter Anvin" Cc: Michal Hocko Cc: Zi Yan Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/x86/mm/init_64.c | 48 ++++++++++++------------------------------- 1 file changed, 13 insertions(+), 35 deletions(-) diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c index 55247451ba85bd..ff312a87e58d88 100644 --- a/arch/x86/mm/init_64.c +++ b/arch/x86/mm/init_64.c @@ -962,7 +962,6 @@ remove_pte_table(pte_t *pte_start, unsigned long addr, unsigned long end, { unsigned long next, pages = 0; pte_t *pte; - void *page_addr; phys_addr_t phys_addr; pte = pte_start + pte_index(addr); @@ -983,42 +982,15 @@ remove_pte_table(pte_t *pte_start, unsigned long addr, unsigned long end, if (phys_addr < (phys_addr_t)0x40000000) return; - if (PAGE_ALIGNED(addr) && PAGE_ALIGNED(next)) { - /* - * Do not free direct mapping pages since they were - * freed when offlining, or simply not in use. - */ - if (!direct) - free_pagetable(pte_page(*pte), 0); - - spin_lock(&init_mm.page_table_lock); - pte_clear(&init_mm, addr, pte); - spin_unlock(&init_mm.page_table_lock); + if (!direct) + free_pagetable(pte_page(*pte), 0); - /* For non-direct mapping, pages means nothing. */ - pages++; - } else { - /* - * If we are here, we are freeing vmemmap pages since - * direct mapped memory ranges to be freed are aligned. - * - * If we are not removing the whole page, it means - * other page structs in this page are being used and - * we cannot remove them. So fill the unused page_structs - * with 0xFD, and remove the page when it is wholly - * filled with 0xFD. - */ - memset((void *)addr, PAGE_INUSE, next - addr); - - page_addr = page_address(pte_page(*pte)); - if (!memchr_inv(page_addr, PAGE_INUSE, PAGE_SIZE)) { - free_pagetable(pte_page(*pte), 0); + spin_lock(&init_mm.page_table_lock); + pte_clear(&init_mm, addr, pte); + spin_unlock(&init_mm.page_table_lock); - spin_lock(&init_mm.page_table_lock); - pte_clear(&init_mm, addr, pte); - spin_unlock(&init_mm.page_table_lock); - } - } + /* For non-direct mapping, pages means nothing. */ + pages++; } /* Call free_pte_table() in remove_pmd_table(). */ @@ -1197,6 +1169,9 @@ remove_pagetable(unsigned long start, unsigned long end, bool direct, void __ref vmemmap_free(unsigned long start, unsigned long end, struct vmem_altmap *altmap) { + VM_BUG_ON(!IS_ALIGNED(start, PAGE_SIZE)); + VM_BUG_ON(!IS_ALIGNED(end, PAGE_SIZE)); + remove_pagetable(start, end, false, altmap); } @@ -1556,6 +1531,9 @@ int __meminit vmemmap_populate(unsigned long start, unsigned long end, int node, { int err; + VM_BUG_ON(!IS_ALIGNED(start, PAGE_SIZE)); + VM_BUG_ON(!IS_ALIGNED(end, PAGE_SIZE)); + if (end - start < PAGES_PER_SECTION * sizeof(struct page)) err = vmemmap_populate_basepages(start, end, node, NULL); else if (boot_cpu_has(X86_FEATURE_PSE)) From 69ccfe74e16bcb61e4817f78ead31b973c36339c Mon Sep 17 00:00:00 2001 From: Oscar Salvador Date: Thu, 29 Apr 2021 22:57:16 -0700 Subject: [PATCH 078/175] x86/vmemmap: drop handling of 1GB vmemmap ranges There is no code to allocate 1GB pages when mapping the vmemmap range as this might waste some memory and requires more complexity which is not really worth. Drop the dead code both for the aligned and unaligned cases and leave only the direct map handling. Link: https://lkml.kernel.org/r/20210309214050.4674-3-osalvador@suse.de Signed-off-by: Oscar Salvador Suggested-by: David Hildenbrand Reviewed-by: David Hildenbrand Acked-by: Dave Hansen Cc: Andy Lutomirski Cc: Borislav Petkov Cc: "H . Peter Anvin" Cc: Ingo Molnar Cc: Michal Hocko Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: Zi Yan Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/x86/mm/init_64.c | 35 +++++++---------------------------- 1 file changed, 7 insertions(+), 28 deletions(-) diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c index ff312a87e58d88..af217ff6da577b 100644 --- a/arch/x86/mm/init_64.c +++ b/arch/x86/mm/init_64.c @@ -1062,7 +1062,6 @@ remove_pud_table(pud_t *pud_start, unsigned long addr, unsigned long end, unsigned long next, pages = 0; pmd_t *pmd_base; pud_t *pud; - void *page_addr; pud = pud_start + pud_index(addr); for (; addr < end; addr = next, pud++) { @@ -1071,33 +1070,13 @@ remove_pud_table(pud_t *pud_start, unsigned long addr, unsigned long end, if (!pud_present(*pud)) continue; - if (pud_large(*pud)) { - if (IS_ALIGNED(addr, PUD_SIZE) && - IS_ALIGNED(next, PUD_SIZE)) { - if (!direct) - free_pagetable(pud_page(*pud), - get_order(PUD_SIZE)); - - spin_lock(&init_mm.page_table_lock); - pud_clear(pud); - spin_unlock(&init_mm.page_table_lock); - pages++; - } else { - /* If here, we are freeing vmemmap pages. */ - memset((void *)addr, PAGE_INUSE, next - addr); - - page_addr = page_address(pud_page(*pud)); - if (!memchr_inv(page_addr, PAGE_INUSE, - PUD_SIZE)) { - free_pagetable(pud_page(*pud), - get_order(PUD_SIZE)); - - spin_lock(&init_mm.page_table_lock); - pud_clear(pud); - spin_unlock(&init_mm.page_table_lock); - } - } - + if (pud_large(*pud) && + IS_ALIGNED(addr, PUD_SIZE) && + IS_ALIGNED(next, PUD_SIZE)) { + spin_lock(&init_mm.page_table_lock); + pud_clear(pud); + spin_unlock(&init_mm.page_table_lock); + pages++; continue; } From 8d400913c231bd1da74067255816453f96cd35b0 Mon Sep 17 00:00:00 2001 From: Oscar Salvador Date: Thu, 29 Apr 2021 22:57:19 -0700 Subject: [PATCH 079/175] x86/vmemmap: handle unpopulated sub-pmd ranges When sizeof(struct page) is not a power of 2, sections do not span a PMD anymore and so when populating them some parts of the PMD will remain unused. Because of this, PMDs will be left behind when depopulating sections since remove_pmd_table() thinks that those unused parts are still in use. Fix this by marking the unused parts with PAGE_UNUSED, so memchr_inv() will do the right thing and will let us free the PMD when the last user of it is gone. This patch is based on a similar patch by David Hildenbrand: https://lore.kernel.org/linux-mm/20200722094558.9828-9-david@redhat.com/ [osalvador@suse.de: go back to the ifdef version] Link: https://lkml.kernel.org/r/YGy++mSft7K4u+88@localhost.localdomain Link: https://lkml.kernel.org/r/20210309214050.4674-4-osalvador@suse.de Signed-off-by: Oscar Salvador Reviewed-by: David Hildenbrand Acked-by: Dave Hansen Cc: Andy Lutomirski Cc: Borislav Petkov Cc: "H . Peter Anvin" Cc: Ingo Molnar Cc: Michal Hocko Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: Zi Yan Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/x86/mm/init_64.c | 68 ++++++++++++++++++++++++++++++++++--------- 1 file changed, 55 insertions(+), 13 deletions(-) diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c index af217ff6da577b..fa7948042bff0a 100644 --- a/arch/x86/mm/init_64.c +++ b/arch/x86/mm/init_64.c @@ -826,6 +826,51 @@ void __init paging_init(void) zone_sizes_init(); } +#ifdef CONFIG_SPARSEMEM_VMEMMAP +#define PAGE_UNUSED 0xFD + +/* Returns true if the PMD is completely unused and thus it can be freed */ +static bool __meminit vmemmap_pmd_is_unused(unsigned long addr, unsigned long end) +{ + unsigned long start = ALIGN_DOWN(addr, PMD_SIZE); + + memset((void *)addr, PAGE_UNUSED, end - addr); + + return !memchr_inv((void *)start, PAGE_UNUSED, PMD_SIZE); +} + +static void __meminit vmemmap_use_sub_pmd(unsigned long start) +{ + /* + * As we expect to add in the same granularity as we remove, it's + * sufficient to mark only some piece used to block the memmap page from + * getting removed when removing some other adjacent memmap (just in + * case the first memmap never gets initialized e.g., because the memory + * block never gets onlined). + */ + memset((void *)start, 0, sizeof(struct page)); +} + +static void __meminit vmemmap_use_new_sub_pmd(unsigned long start, unsigned long end) +{ + /* + * Could be our memmap page is filled with PAGE_UNUSED already from a + * previous remove. Make sure to reset it. + */ + vmemmap_use_sub_pmd(start); + + /* + * Mark with PAGE_UNUSED the unused parts of the new memmap range + */ + if (!IS_ALIGNED(start, PMD_SIZE)) + memset((void *)start, PAGE_UNUSED, + start - ALIGN_DOWN(start, PMD_SIZE)); + if (!IS_ALIGNED(end, PMD_SIZE)) + memset((void *)end, PAGE_UNUSED, + ALIGN(end, PMD_SIZE) - end); +} +#endif + /* * Memory hotplug specific functions */ @@ -871,8 +916,6 @@ int arch_add_memory(int nid, u64 start, u64 size, return add_pages(nid, start_pfn, nr_pages, params); } -#define PAGE_INUSE 0xFD - static void __meminit free_pagetable(struct page *page, int order) { unsigned long magic; @@ -1006,7 +1049,6 @@ remove_pmd_table(pmd_t *pmd_start, unsigned long addr, unsigned long end, unsigned long next, pages = 0; pte_t *pte_base; pmd_t *pmd; - void *page_addr; pmd = pmd_start + pmd_index(addr); for (; addr < end; addr = next, pmd++) { @@ -1026,22 +1068,16 @@ remove_pmd_table(pmd_t *pmd_start, unsigned long addr, unsigned long end, pmd_clear(pmd); spin_unlock(&init_mm.page_table_lock); pages++; - } else { - /* If here, we are freeing vmemmap pages. */ - memset((void *)addr, PAGE_INUSE, next - addr); - - page_addr = page_address(pmd_page(*pmd)); - if (!memchr_inv(page_addr, PAGE_INUSE, - PMD_SIZE)) { + } +#ifdef CONFIG_SPARSEMEM_VMEMMAP + else if (vmemmap_pmd_is_unused(addr, next)) { free_hugepage_table(pmd_page(*pmd), altmap); - spin_lock(&init_mm.page_table_lock); pmd_clear(pmd); spin_unlock(&init_mm.page_table_lock); - } } - +#endif continue; } @@ -1492,11 +1528,17 @@ static int __meminit vmemmap_populate_hugepages(unsigned long start, addr_end = addr + PMD_SIZE; p_end = p + PMD_SIZE; + + if (!IS_ALIGNED(addr, PMD_SIZE) || + !IS_ALIGNED(next, PMD_SIZE)) + vmemmap_use_new_sub_pmd(addr, next); + continue; } else if (altmap) return -ENOMEM; /* no fallback */ } else if (pmd_large(*pmd)) { vmemmap_verify((pte_t *)pmd, node, addr, next); + vmemmap_use_sub_pmd(addr); continue; } if (vmemmap_populate_basepages(addr, next, node, NULL)) From faf1c0008a33d4ac6336f63a358641cf86926fc0 Mon Sep 17 00:00:00 2001 From: Oscar Salvador Date: Thu, 29 Apr 2021 22:57:22 -0700 Subject: [PATCH 080/175] x86/vmemmap: optimize for consecutive sections in partial populated PMDs We can optimize in the case we are adding consecutive sections, so no memset(PAGE_UNUSED) is needed. In that case, let us keep track where the unused range of the previous memory range begins, so we can compare it with start of the range to be added. If they are equal, we know sections are added consecutively. For that purpose, let us introduce 'unused_pmd_start', which always holds the beginning of the unused memory range. In the case a section does not contiguously follow the previous one, we know we can memset [unused_pmd_start, PMD_BOUNDARY) with PAGE_UNUSE. This patch is based on a similar patch by David Hildenbrand: https://lore.kernel.org/linux-mm/20200722094558.9828-10-david@redhat.com/ Link: https://lkml.kernel.org/r/20210309214050.4674-5-osalvador@suse.de Signed-off-by: Oscar Salvador Acked-by: Dave Hansen Cc: Andy Lutomirski Cc: Borislav Petkov Cc: "H . Peter Anvin" Cc: Ingo Molnar Cc: Michal Hocko Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: Zi Yan Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/x86/mm/init_64.c | 67 +++++++++++++++++++++++++++++++++++++++---- 1 file changed, 61 insertions(+), 6 deletions(-) diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c index fa7948042bff0a..58ae2f746b3ee0 100644 --- a/arch/x86/mm/init_64.c +++ b/arch/x86/mm/init_64.c @@ -829,17 +829,42 @@ void __init paging_init(void) #ifdef CONFIG_SPARSEMEM_VMEMMAP #define PAGE_UNUSED 0xFD +/* + * The unused vmemmap range, which was not yet memset(PAGE_UNUSED), ranges + * from unused_pmd_start to next PMD_SIZE boundary. + */ +static unsigned long unused_pmd_start __meminitdata; + +static void __meminit vmemmap_flush_unused_pmd(void) +{ + if (!unused_pmd_start) + return; + /* + * Clears (unused_pmd_start, PMD_END] + */ + memset((void *)unused_pmd_start, PAGE_UNUSED, + ALIGN(unused_pmd_start, PMD_SIZE) - unused_pmd_start); + unused_pmd_start = 0; +} + +#ifdef CONFIG_MEMORY_HOTPLUG /* Returns true if the PMD is completely unused and thus it can be freed */ static bool __meminit vmemmap_pmd_is_unused(unsigned long addr, unsigned long end) { unsigned long start = ALIGN_DOWN(addr, PMD_SIZE); + /* + * Flush the unused range cache to ensure that memchr_inv() will work + * for the whole range. + */ + vmemmap_flush_unused_pmd(); memset((void *)addr, PAGE_UNUSED, end - addr); return !memchr_inv((void *)start, PAGE_UNUSED, PMD_SIZE); } +#endif -static void __meminit vmemmap_use_sub_pmd(unsigned long start) +static void __meminit __vmemmap_use_sub_pmd(unsigned long start) { /* * As we expect to add in the same granularity as we remove, it's @@ -851,23 +876,53 @@ static void __meminit vmemmap_use_sub_pmd(unsigned long start) memset((void *)start, 0, sizeof(struct page)); } +static void __meminit vmemmap_use_sub_pmd(unsigned long start, unsigned long end) +{ + /* + * We only optimize if the new used range directly follows the + * previously unused range (esp., when populating consecutive sections). + */ + if (unused_pmd_start == start) { + if (likely(IS_ALIGNED(end, PMD_SIZE))) + unused_pmd_start = 0; + else + unused_pmd_start = end; + return; + } + + /* + * If the range does not contiguously follows previous one, make sure + * to mark the unused range of the previous one so it can be removed. + */ + vmemmap_flush_unused_pmd(); + __vmemmap_use_sub_pmd(start); +} + + static void __meminit vmemmap_use_new_sub_pmd(unsigned long start, unsigned long end) { + vmemmap_flush_unused_pmd(); + /* * Could be our memmap page is filled with PAGE_UNUSED already from a * previous remove. Make sure to reset it. */ - vmemmap_use_sub_pmd(start); + __vmemmap_use_sub_pmd(start); /* * Mark with PAGE_UNUSED the unused parts of the new memmap range */ if (!IS_ALIGNED(start, PMD_SIZE)) memset((void *)start, PAGE_UNUSED, - start - ALIGN_DOWN(start, PMD_SIZE)); + start - ALIGN_DOWN(start, PMD_SIZE)); + + /* + * We want to avoid memset(PAGE_UNUSED) when populating the vmemmap of + * consecutive sections. Remember for the last added PMD where the + * unused range begins. + */ if (!IS_ALIGNED(end, PMD_SIZE)) - memset((void *)end, PAGE_UNUSED, - ALIGN(end, PMD_SIZE) - end); + unused_pmd_start = end; } #endif @@ -1538,7 +1593,7 @@ static int __meminit vmemmap_populate_hugepages(unsigned long start, return -ENOMEM; /* no fallback */ } else if (pmd_large(*pmd)) { vmemmap_verify((pte_t *)pmd, node, addr, next); - vmemmap_use_sub_pmd(addr); + vmemmap_use_sub_pmd(addr, next); continue; } if (vmemmap_populate_basepages(addr, next, node, NULL)) From f9001107820c647f65b57fb9c1ca2c0908b5fede Mon Sep 17 00:00:00 2001 From: Ovidiu Panait Date: Thu, 29 Apr 2021 22:57:26 -0700 Subject: [PATCH 081/175] mm, tracing: improve rss_stat tracepoint message Adjust the rss_stat tracepoint to print the name of the resident page type that got updated (e.g. MM_ANONPAGES/MM_FILEPAGES), rather than the numeric index corresponding to it (the __entry->member value): Before this patch: ------------------ rss_stat: mm_id=1216113068 curr=0 member=1 size=28672B rss_stat: mm_id=1216113068 curr=0 member=1 size=0B rss_stat: mm_id=534402304 curr=1 member=0 size=188416B rss_stat: mm_id=534402304 curr=1 member=1 size=40960B After this patch: ----------------- rss_stat: mm_id=1726253524 curr=1 type=MM_ANONPAGES size=40960B rss_stat: mm_id=1726253524 curr=1 type=MM_FILEPAGES size=663552B rss_stat: mm_id=1726253524 curr=1 type=MM_ANONPAGES size=65536B rss_stat: mm_id=1726253524 curr=1 type=MM_FILEPAGES size=647168B Use TRACE_DEFINE_ENUM()/__print_symbolic() logic to map the enum values to the strings they represent, so that userspace tools can also parse the raw data correctly. Link: https://lkml.kernel.org/r/20210310162305.4862-1-ovidiu.panait@windriver.com Signed-off-by: Ovidiu Panait Suggested-by: Steven Rostedt (VMware) Reviewed-by: Steven Rostedt (VMware) Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/trace/events/kmem.h | 24 ++++++++++++++++++++++-- 1 file changed, 22 insertions(+), 2 deletions(-) diff --git a/include/trace/events/kmem.h b/include/trace/events/kmem.h index 3a60b6b6db32fa..829a75692cc0b5 100644 --- a/include/trace/events/kmem.h +++ b/include/trace/events/kmem.h @@ -343,6 +343,26 @@ static unsigned int __maybe_unused mm_ptr_to_hash(const void *ptr) #define __PTR_TO_HASHVAL #endif +#define TRACE_MM_PAGES \ + EM(MM_FILEPAGES) \ + EM(MM_ANONPAGES) \ + EM(MM_SWAPENTS) \ + EMe(MM_SHMEMPAGES) + +#undef EM +#undef EMe + +#define EM(a) TRACE_DEFINE_ENUM(a); +#define EMe(a) TRACE_DEFINE_ENUM(a); + +TRACE_MM_PAGES + +#undef EM +#undef EMe + +#define EM(a) { a, #a }, +#define EMe(a) { a, #a } + TRACE_EVENT(rss_stat, TP_PROTO(struct mm_struct *mm, @@ -365,10 +385,10 @@ TRACE_EVENT(rss_stat, __entry->size = (count << PAGE_SHIFT); ), - TP_printk("mm_id=%u curr=%d member=%d size=%ldB", + TP_printk("mm_id=%u curr=%d type=%s size=%ldB", __entry->mm_id, __entry->curr, - __entry->member, + __print_symbolic(__entry->member, TRACE_MM_PAGES), __entry->size) ); #endif /* _TRACE_KMEM_H */ From 74ffa5a3e68504dd289135b1cf0422c19ffb3f2e Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 29 Apr 2021 22:57:29 -0700 Subject: [PATCH 082/175] mm: add remap_pfn_range_notrack Patch series "add remap_pfn_range_notrack instead of reinventing it in i915", v2. i915 has some reason to want to avoid the track_pfn_remap overhead in remap_pfn_range. Add a function to the core VM to do just that rather than reinventing the functionality poorly in the driver. Note that the remap_io_sg path does get exercises when using Xorg on my Thinkpad X1, so this should be considered lightly tested, I've not managed to hit the remap_io_mapping path at all. This patch (of 4): Add a version of remap_pfn_range that does not call track_pfn_range. This will be used to fix horrible abuses of VM internals in the i915 driver. Link: https://lkml.kernel.org/r/20210326055505.1424432-1-hch@lst.de Link: https://lkml.kernel.org/r/20210326055505.1424432-2-hch@lst.de Signed-off-by: Christoph Hellwig Acked-by: Daniel Vetter Cc: Jani Nikula Cc: Joonas Lahtinen Cc: Rodrigo Vivi Cc: Chris Wilson Cc: Peter Zijlstra Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mm.h | 2 ++ mm/memory.c | 51 ++++++++++++++++++++++++++++------------------ 2 files changed, 33 insertions(+), 20 deletions(-) diff --git a/include/linux/mm.h b/include/linux/mm.h index 64be3baf861a98..a8335cecf706d0 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -2732,6 +2732,8 @@ unsigned long change_prot_numa(struct vm_area_struct *vma, struct vm_area_struct *find_extend_vma(struct mm_struct *, unsigned long addr); int remap_pfn_range(struct vm_area_struct *, unsigned long addr, unsigned long pfn, unsigned long size, pgprot_t); +int remap_pfn_range_notrack(struct vm_area_struct *vma, unsigned long addr, + unsigned long pfn, unsigned long size, pgprot_t prot); int vm_insert_page(struct vm_area_struct *, unsigned long addr, struct page *); int vm_insert_pages(struct vm_area_struct *vma, unsigned long addr, struct page **pages, unsigned long *num); diff --git a/mm/memory.c b/mm/memory.c index f95d12032b3404..8dc69471be6cb9 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -2260,26 +2260,17 @@ static inline int remap_p4d_range(struct mm_struct *mm, pgd_t *pgd, return 0; } -/** - * remap_pfn_range - remap kernel memory to userspace - * @vma: user vma to map to - * @addr: target page aligned user address to start at - * @pfn: page frame number of kernel physical memory address - * @size: size of mapping area - * @prot: page protection flags for this mapping - * - * Note: this is only safe if the mm semaphore is held when called. - * - * Return: %0 on success, negative error code otherwise. +/* + * Variant of remap_pfn_range that does not call track_pfn_remap. The caller + * must have pre-validated the caching bits of the pgprot_t. */ -int remap_pfn_range(struct vm_area_struct *vma, unsigned long addr, - unsigned long pfn, unsigned long size, pgprot_t prot) +int remap_pfn_range_notrack(struct vm_area_struct *vma, unsigned long addr, + unsigned long pfn, unsigned long size, pgprot_t prot) { pgd_t *pgd; unsigned long next; unsigned long end = addr + PAGE_ALIGN(size); struct mm_struct *mm = vma->vm_mm; - unsigned long remap_pfn = pfn; int err; if (WARN_ON_ONCE(!PAGE_ALIGNED(addr))) @@ -2309,10 +2300,6 @@ int remap_pfn_range(struct vm_area_struct *vma, unsigned long addr, vma->vm_pgoff = pfn; } - err = track_pfn_remap(vma, &prot, remap_pfn, addr, PAGE_ALIGN(size)); - if (err) - return -EINVAL; - vma->vm_flags |= VM_IO | VM_PFNMAP | VM_DONTEXPAND | VM_DONTDUMP; BUG_ON(addr >= end); @@ -2324,12 +2311,36 @@ int remap_pfn_range(struct vm_area_struct *vma, unsigned long addr, err = remap_p4d_range(mm, pgd, addr, next, pfn + (addr >> PAGE_SHIFT), prot); if (err) - break; + return err; } while (pgd++, addr = next, addr != end); + return 0; +} + +/** + * remap_pfn_range - remap kernel memory to userspace + * @vma: user vma to map to + * @addr: target page aligned user address to start at + * @pfn: page frame number of kernel physical memory address + * @size: size of mapping area + * @prot: page protection flags for this mapping + * + * Note: this is only safe if the mm semaphore is held when called. + * + * Return: %0 on success, negative error code otherwise. + */ +int remap_pfn_range(struct vm_area_struct *vma, unsigned long addr, + unsigned long pfn, unsigned long size, pgprot_t prot) +{ + int err; + + err = track_pfn_remap(vma, &prot, pfn, addr, PAGE_ALIGN(size)); if (err) - untrack_pfn(vma, remap_pfn, PAGE_ALIGN(size)); + return -EINVAL; + err = remap_pfn_range_notrack(vma, addr, pfn, size, prot); + if (err) + untrack_pfn(vma, pfn, PAGE_ALIGN(size)); return err; } EXPORT_SYMBOL(remap_pfn_range); From 1fbaf8fc12a0136c7e62e7ad6fe886fe1749912c Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 29 Apr 2021 22:57:32 -0700 Subject: [PATCH 083/175] mm: add a io_mapping_map_user helper Add a helper that calls remap_pfn_range for an struct io_mapping, relying on the pgprot pre-validation done when creating the mapping instead of doing it at runtime. Link: https://lkml.kernel.org/r/20210326055505.1424432-3-hch@lst.de Signed-off-by: Christoph Hellwig Cc: Chris Wilson Cc: Daniel Vetter Cc: Jani Nikula Cc: Joonas Lahtinen Cc: Peter Zijlstra Cc: Rodrigo Vivi Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/io-mapping.h | 3 +++ mm/Kconfig | 3 +++ mm/Makefile | 1 + mm/io-mapping.c | 29 +++++++++++++++++++++++++++++ 4 files changed, 36 insertions(+) create mode 100644 mm/io-mapping.c diff --git a/include/linux/io-mapping.h b/include/linux/io-mapping.h index c093e81310a9b3..e9743cfd858527 100644 --- a/include/linux/io-mapping.h +++ b/include/linux/io-mapping.h @@ -220,3 +220,6 @@ io_mapping_free(struct io_mapping *iomap) } #endif /* _LINUX_IO_MAPPING_H */ + +int io_mapping_map_user(struct io_mapping *iomap, struct vm_area_struct *vma, + unsigned long addr, unsigned long pfn, unsigned long size); diff --git a/mm/Kconfig b/mm/Kconfig index d0808a23e54bc8..8a27c7c914280f 100644 --- a/mm/Kconfig +++ b/mm/Kconfig @@ -871,4 +871,7 @@ config MAPPING_DIRTY_HELPERS config KMAP_LOCAL bool +# struct io_mapping based helper. Selected by drivers that need them +config IO_MAPPING + bool endmenu diff --git a/mm/Makefile b/mm/Makefile index 72227b24a61688..c0135e385984bb 100644 --- a/mm/Makefile +++ b/mm/Makefile @@ -120,3 +120,4 @@ obj-$(CONFIG_MEMFD_CREATE) += memfd.o obj-$(CONFIG_MAPPING_DIRTY_HELPERS) += mapping_dirty_helpers.o obj-$(CONFIG_PTDUMP_CORE) += ptdump.o obj-$(CONFIG_PAGE_REPORTING) += page_reporting.o +obj-$(CONFIG_IO_MAPPING) += io-mapping.o diff --git a/mm/io-mapping.c b/mm/io-mapping.c new file mode 100644 index 00000000000000..01b3627999304e --- /dev/null +++ b/mm/io-mapping.c @@ -0,0 +1,29 @@ +// SPDX-License-Identifier: GPL-2.0-only + +#include +#include + +/** + * io_mapping_map_user - remap an I/O mapping to userspace + * @iomap: the source io_mapping + * @vma: user vma to map to + * @addr: target user address to start at + * @pfn: physical address of kernel memory + * @size: size of map area + * + * Note: this is only safe if the mm semaphore is held when called. + */ +int io_mapping_map_user(struct io_mapping *iomap, struct vm_area_struct *vma, + unsigned long addr, unsigned long pfn, unsigned long size) +{ + vm_flags_t expected_flags = VM_PFNMAP | VM_DONTEXPAND | VM_DONTDUMP; + + if (WARN_ON_ONCE((vma->vm_flags & expected_flags) != expected_flags)) + return -EINVAL; + + /* We rely on prevalidation of the io-mapping to skip track_pfn(). */ + return remap_pfn_range_notrack(vma, addr, pfn, size, + __pgprot((pgprot_val(iomap->prot) & _PAGE_CACHE_MASK) | + (pgprot_val(vma->vm_page_prot) & ~_PAGE_CACHE_MASK))); +} +EXPORT_SYMBOL_GPL(io_mapping_map_user); From b739f125e4ebd73d10ed30a856574e13649119ed Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 29 Apr 2021 22:57:35 -0700 Subject: [PATCH 084/175] i915: use io_mapping_map_user Replace the home-grown remap_io_mapping that abuses apply_to_page_range with the proper io_mapping_map_user interface. Link: https://lkml.kernel.org/r/20210326055505.1424432-4-hch@lst.de Signed-off-by: Christoph Hellwig Cc: Chris Wilson Cc: Daniel Vetter Cc: Jani Nikula Cc: Joonas Lahtinen Cc: Peter Zijlstra Cc: Rodrigo Vivi Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/gpu/drm/i915/Kconfig | 1 + drivers/gpu/drm/i915/gem/i915_gem_mman.c | 9 +++-- drivers/gpu/drm/i915/i915_drv.h | 3 -- drivers/gpu/drm/i915/i915_mm.c | 44 ------------------------ 4 files changed, 5 insertions(+), 52 deletions(-) diff --git a/drivers/gpu/drm/i915/Kconfig b/drivers/gpu/drm/i915/Kconfig index 369695df8b1516..69f57ca9c68d73 100644 --- a/drivers/gpu/drm/i915/Kconfig +++ b/drivers/gpu/drm/i915/Kconfig @@ -20,6 +20,7 @@ config DRM_I915 select INPUT if ACPI select ACPI_VIDEO if ACPI select ACPI_BUTTON if ACPI + select IO_MAPPING select SYNC_FILE select IOSF_MBI select CRC32 diff --git a/drivers/gpu/drm/i915/gem/i915_gem_mman.c b/drivers/gpu/drm/i915/gem/i915_gem_mman.c index 2561a2f1e54ffb..23f6b00e08e211 100644 --- a/drivers/gpu/drm/i915/gem/i915_gem_mman.c +++ b/drivers/gpu/drm/i915/gem/i915_gem_mman.c @@ -367,11 +367,10 @@ static vm_fault_t vm_fault_gtt(struct vm_fault *vmf) goto err_unpin; /* Finally, remap it using the new GTT offset */ - ret = remap_io_mapping(area, - area->vm_start + (vma->ggtt_view.partial.offset << PAGE_SHIFT), - (ggtt->gmadr.start + vma->node.start) >> PAGE_SHIFT, - min_t(u64, vma->size, area->vm_end - area->vm_start), - &ggtt->iomap); + ret = io_mapping_map_user(&ggtt->iomap, area, area->vm_start + + (vma->ggtt_view.partial.offset << PAGE_SHIFT), + (ggtt->gmadr.start + vma->node.start) >> PAGE_SHIFT, + min_t(u64, vma->size, area->vm_end - area->vm_start)); if (ret) goto err_fence; diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h index 69e43bf91a1532..9ec9277539ec14 100644 --- a/drivers/gpu/drm/i915/i915_drv.h +++ b/drivers/gpu/drm/i915/i915_drv.h @@ -1905,9 +1905,6 @@ int i915_reg_read_ioctl(struct drm_device *dev, void *data, struct drm_file *file); /* i915_mm.c */ -int remap_io_mapping(struct vm_area_struct *vma, - unsigned long addr, unsigned long pfn, unsigned long size, - struct io_mapping *iomap); int remap_io_sg(struct vm_area_struct *vma, unsigned long addr, unsigned long size, struct scatterlist *sgl, resource_size_t iobase); diff --git a/drivers/gpu/drm/i915/i915_mm.c b/drivers/gpu/drm/i915/i915_mm.c index 666808cb3a3260..9a777b0ff59b05 100644 --- a/drivers/gpu/drm/i915/i915_mm.c +++ b/drivers/gpu/drm/i915/i915_mm.c @@ -37,17 +37,6 @@ struct remap_pfn { resource_size_t iobase; }; -static int remap_pfn(pte_t *pte, unsigned long addr, void *data) -{ - struct remap_pfn *r = data; - - /* Special PTE are not associated with any struct page */ - set_pte_at(r->mm, addr, pte, pte_mkspecial(pfn_pte(r->pfn, r->prot))); - r->pfn++; - - return 0; -} - #define use_dma(io) ((io) != -1) static inline unsigned long sgt_pfn(const struct remap_pfn *r) @@ -77,40 +66,7 @@ static int remap_sg(pte_t *pte, unsigned long addr, void *data) return 0; } -/** - * remap_io_mapping - remap an IO mapping to userspace - * @vma: user vma to map to - * @addr: target user address to start at - * @pfn: physical address of kernel memory - * @size: size of map area - * @iomap: the source io_mapping - * - * Note: this is only safe if the mm semaphore is held when called. - */ -int remap_io_mapping(struct vm_area_struct *vma, - unsigned long addr, unsigned long pfn, unsigned long size, - struct io_mapping *iomap) -{ - struct remap_pfn r; - int err; - #define EXPECTED_FLAGS (VM_PFNMAP | VM_DONTEXPAND | VM_DONTDUMP) - GEM_BUG_ON((vma->vm_flags & EXPECTED_FLAGS) != EXPECTED_FLAGS); - - /* We rely on prevalidation of the io-mapping to skip track_pfn(). */ - r.mm = vma->vm_mm; - r.pfn = pfn; - r.prot = __pgprot((pgprot_val(iomap->prot) & _PAGE_CACHE_MASK) | - (pgprot_val(vma->vm_page_prot) & ~_PAGE_CACHE_MASK)); - - err = apply_to_page_range(r.mm, addr, size, remap_pfn, &r); - if (unlikely(err)) { - zap_vma_ptes(vma, addr, (r.pfn - pfn) << PAGE_SHIFT); - return err; - } - - return 0; -} /** * remap_io_sg - remap an IO mapping to userspace From b12d691ea5e01db42ccf3b4207e57cb3ce7cfe91 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 29 Apr 2021 22:57:38 -0700 Subject: [PATCH 085/175] i915: fix remap_io_sg to verify the pgprot remap_io_sg claims that the pgprot is pre-verified using an io_mapping, but actually does not get passed an io_mapping and just uses the pgprot in the VMA. Remove the apply_to_page_range abuse and just loop over remap_pfn_range for each segment. Note: this could use io_mapping_map_user by passing an iomap to remap_io_sg if the maintainers can verify that the pgprot in the iomap in the only caller is indeed the desired one here. Link: https://lkml.kernel.org/r/20210326055505.1424432-5-hch@lst.de Signed-off-by: Christoph Hellwig Cc: Chris Wilson Cc: Daniel Vetter Cc: Jani Nikula Cc: Joonas Lahtinen Cc: Peter Zijlstra Cc: Rodrigo Vivi Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/gpu/drm/i915/i915_mm.c | 73 +++++++++++----------------------- 1 file changed, 23 insertions(+), 50 deletions(-) diff --git a/drivers/gpu/drm/i915/i915_mm.c b/drivers/gpu/drm/i915/i915_mm.c index 9a777b0ff59b05..4c8cd08c672d2d 100644 --- a/drivers/gpu/drm/i915/i915_mm.c +++ b/drivers/gpu/drm/i915/i915_mm.c @@ -28,46 +28,10 @@ #include "i915_drv.h" -struct remap_pfn { - struct mm_struct *mm; - unsigned long pfn; - pgprot_t prot; - - struct sgt_iter sgt; - resource_size_t iobase; -}; +#define EXPECTED_FLAGS (VM_PFNMAP | VM_DONTEXPAND | VM_DONTDUMP) #define use_dma(io) ((io) != -1) -static inline unsigned long sgt_pfn(const struct remap_pfn *r) -{ - if (use_dma(r->iobase)) - return (r->sgt.dma + r->sgt.curr + r->iobase) >> PAGE_SHIFT; - else - return r->sgt.pfn + (r->sgt.curr >> PAGE_SHIFT); -} - -static int remap_sg(pte_t *pte, unsigned long addr, void *data) -{ - struct remap_pfn *r = data; - - if (GEM_WARN_ON(!r->sgt.sgp)) - return -EINVAL; - - /* Special PTE are not associated with any struct page */ - set_pte_at(r->mm, addr, pte, - pte_mkspecial(pfn_pte(sgt_pfn(r), r->prot))); - r->pfn++; /* track insertions in case we need to unwind later */ - - r->sgt.curr += PAGE_SIZE; - if (r->sgt.curr >= r->sgt.max) - r->sgt = __sgt_iter(__sg_next(r->sgt.sgp), use_dma(r->iobase)); - - return 0; -} - -#define EXPECTED_FLAGS (VM_PFNMAP | VM_DONTEXPAND | VM_DONTDUMP) - /** * remap_io_sg - remap an IO mapping to userspace * @vma: user vma to map to @@ -82,12 +46,7 @@ int remap_io_sg(struct vm_area_struct *vma, unsigned long addr, unsigned long size, struct scatterlist *sgl, resource_size_t iobase) { - struct remap_pfn r = { - .mm = vma->vm_mm, - .prot = vma->vm_page_prot, - .sgt = __sgt_iter(sgl, use_dma(iobase)), - .iobase = iobase, - }; + unsigned long pfn, len, remapped = 0; int err; /* We rely on prevalidation of the io-mapping to skip track_pfn(). */ @@ -96,11 +55,25 @@ int remap_io_sg(struct vm_area_struct *vma, if (!use_dma(iobase)) flush_cache_range(vma, addr, size); - err = apply_to_page_range(r.mm, addr, size, remap_sg, &r); - if (unlikely(err)) { - zap_vma_ptes(vma, addr, r.pfn << PAGE_SHIFT); - return err; - } - - return 0; + do { + if (use_dma(iobase)) { + if (!sg_dma_len(sgl)) + break; + pfn = (sg_dma_address(sgl) + iobase) >> PAGE_SHIFT; + len = sg_dma_len(sgl); + } else { + pfn = page_to_pfn(sg_page(sgl)); + len = sgl->length; + } + + err = remap_pfn_range(vma, addr + remapped, pfn, len, + vma->vm_page_prot); + if (err) + break; + remapped += len; + } while ((sgl = __sg_next(sgl))); + + if (err) + zap_vma_ptes(vma, addr, remapped); + return err; } From b99a342d4f11a5455d999b12f5fee42ab6acaf8c Mon Sep 17 00:00:00 2001 From: Huang Ying Date: Thu, 29 Apr 2021 22:57:41 -0700 Subject: [PATCH 086/175] NUMA balancing: reduce TLB flush via delaying mapping on hint page fault With NUMA balancing, in hint page fault handler, the faulting page will be migrated to the accessing node if necessary. During the migration, TLB will be shot down on all CPUs that the process has run on recently. Because in the hint page fault handler, the PTE will be made accessible before the migration is tried. The overhead of TLB shooting down can be high, so it's better to be avoided if possible. In fact, if we delay mapping the page until migration, that can be avoided. This is what this patch doing. For the multiple threads applications, it's possible that a page is accessed by multiple threads almost at the same time. In the original implementation, because the first thread will install the accessible PTE before migrating the page, the other threads may access the page directly before the page is made inaccessible again during migration. While with the patch, the second thread will go through the page fault handler too. And because of the PageLRU() checking in the following code path, migrate_misplaced_page() numamigrate_isolate_page() isolate_lru_page() the migrate_misplaced_page() will return 0, and the PTE will be made accessible in the second thread. This will introduce a little more overhead. But we think the possibility for a page to be accessed by the multiple threads at the same time is low, and the overhead difference isn't too large. If this becomes a problem in some workloads, we need to consider how to reduce the overhead. To test the patch, we run a test case as follows on a 2-socket Intel server (1 NUMA node per socket) with 128GB DRAM (64GB per socket). 1. Run a memory eater on NUMA node 1 to use 40GB memory before running pmbench. 2. Run pmbench (normal accessing pattern) with 8 processes, and 8 threads per process, so there are 64 threads in total. The working-set size of each process is 8960MB, so the total working-set size is 8 * 8960MB = 70GB. The CPU of all pmbench processes is bound to node 1. The pmbench processes will access some DRAM on node 0. 3. After the pmbench processes run for 10 seconds, kill the memory eater. Now, some pages will be migrated from node 0 to node 1 via NUMA balancing. Test results show that, with the patch, the pmbench throughput (page accesses/s) increases 5.5%. The number of the TLB shootdowns interrupts reduces 98% (from ~4.7e7 to ~9.7e5) with about 9.2e6 pages (35.8GB) migrated. From the perf profile, it can be found that the CPU cycles spent by try_to_unmap() and its callees reduces from 6.02% to 0.47%. That is, the CPU cycles spent by TLB shooting down decreases greatly. Link: https://lkml.kernel.org/r/20210408132236.1175607-1-ying.huang@intel.com Signed-off-by: "Huang, Ying" Reviewed-by: Mel Gorman Cc: Peter Zijlstra Cc: Peter Xu Cc: Johannes Weiner Cc: Vlastimil Babka Cc: "Matthew Wilcox" Cc: Will Deacon Cc: Michel Lespinasse Cc: Arjun Roy Cc: "Kirill A. Shutemov" Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memory.c | 54 +++++++++++++++++++++++++++++++---------------------- 1 file changed, 32 insertions(+), 22 deletions(-) diff --git a/mm/memory.c b/mm/memory.c index 8dc69471be6cb9..a62f06750e93d6 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -4125,29 +4125,17 @@ static vm_fault_t do_numa_page(struct vm_fault *vmf) goto out; } - /* - * Make it present again, Depending on how arch implementes non - * accessible ptes, some can allow access by kernel mode. - */ - old_pte = ptep_modify_prot_start(vma, vmf->address, vmf->pte); + /* Get the normal PTE */ + old_pte = ptep_get(vmf->pte); pte = pte_modify(old_pte, vma->vm_page_prot); - pte = pte_mkyoung(pte); - if (was_writable) - pte = pte_mkwrite(pte); - ptep_modify_prot_commit(vma, vmf->address, vmf->pte, old_pte, pte); - update_mmu_cache(vma, vmf->address, vmf->pte); page = vm_normal_page(vma, vmf->address, pte); - if (!page) { - pte_unmap_unlock(vmf->pte, vmf->ptl); - return 0; - } + if (!page) + goto out_map; /* TODO: handle PTE-mapped THP */ - if (PageCompound(page)) { - pte_unmap_unlock(vmf->pte, vmf->ptl); - return 0; - } + if (PageCompound(page)) + goto out_map; /* * Avoid grouping on RO pages in general. RO pages shouldn't hurt as @@ -4157,7 +4145,7 @@ static vm_fault_t do_numa_page(struct vm_fault *vmf) * pte_dirty has unpredictable behaviour between PTE scan updates, * background writeback, dirty balancing and application behaviour. */ - if (!pte_write(pte)) + if (!was_writable) flags |= TNF_NO_GROUP; /* @@ -4171,23 +4159,45 @@ static vm_fault_t do_numa_page(struct vm_fault *vmf) page_nid = page_to_nid(page); target_nid = numa_migrate_prep(page, vma, vmf->address, page_nid, &flags); - pte_unmap_unlock(vmf->pte, vmf->ptl); if (target_nid == NUMA_NO_NODE) { put_page(page); - goto out; + goto out_map; } + pte_unmap_unlock(vmf->pte, vmf->ptl); /* Migrate to the requested node */ if (migrate_misplaced_page(page, vma, target_nid)) { page_nid = target_nid; flags |= TNF_MIGRATED; - } else + } else { flags |= TNF_MIGRATE_FAIL; + vmf->pte = pte_offset_map(vmf->pmd, vmf->address); + spin_lock(vmf->ptl); + if (unlikely(!pte_same(*vmf->pte, vmf->orig_pte))) { + pte_unmap_unlock(vmf->pte, vmf->ptl); + goto out; + } + goto out_map; + } out: if (page_nid != NUMA_NO_NODE) task_numa_fault(last_cpupid, page_nid, 1, flags); return 0; +out_map: + /* + * Make it present again, depending on how arch implements + * non-accessible ptes, some can allow access by kernel mode. + */ + old_pte = ptep_modify_prot_start(vma, vmf->address, vmf->pte); + pte = pte_modify(old_pte, vma->vm_page_prot); + pte = pte_mkyoung(pte); + if (was_writable) + pte = pte_mkwrite(pte); + ptep_modify_prot_commit(vma, vmf->address, vmf->pte, old_pte, pte); + update_mmu_cache(vma, vmf->address, vmf->pte); + pte_unmap_unlock(vmf->pte, vmf->ptl); + goto out; } static inline vm_fault_t create_huge_pmd(struct vm_fault *vmf) From a4609387859f0281951f5e476d9f76d7fb9ab321 Mon Sep 17 00:00:00 2001 From: Brian Geffon Date: Thu, 29 Apr 2021 22:57:45 -0700 Subject: [PATCH 087/175] mm: extend MREMAP_DONTUNMAP to non-anonymous mappings Patch series "mm: Extend MREMAP_DONTUNMAP to non-anonymous mappings", v5. This patch (of 3): Currently MREMAP_DONTUNMAP only accepts private anonymous mappings. This restriction was placed initially for simplicity and not because there exists a technical reason to do so. This change will widen the support to include any mappings which are not VM_DONTEXPAND or VM_PFNMAP. The primary use case is to support MREMAP_DONTUNMAP on mappings which may have been created from a memfd. This change will result in mremap(MREMAP_DONTUNMAP) returning -EINVAL if VM_DONTEXPAND or VM_PFNMAP mappings are specified. Lokesh Gidra who works on the Android JVM, provided an explanation of how such a feature will improve Android JVM garbage collection: "Android is developing a new garbage collector (GC), based on userfaultfd. The garbage collector will use userfaultfd (uffd) on the java heap during compaction. On accessing any uncompacted page, the application threads will find it missing, at which point the thread will create the compacted page and then use UFFDIO_COPY ioctl to get it mapped and then resume execution. Before starting this compaction, in a stop-the-world pause the heap will be mremap(MREMAP_DONTUNMAP) so that the java heap is ready to receive UFFD_EVENT_PAGEFAULT events after resuming execution. To speedup mremap operations, pagetable movement was optimized by moving PUD entries instead of PTE entries [1]. It was necessary as mremap of even modest sized memory ranges also took several milliseconds, and stopping the application for that long isn't acceptable in response-time sensitive cases. With UFFDIO_CONTINUE feature [2], it will be even more efficient to implement this GC, particularly the 'non-moveable' portions of the heap. It will also help in reducing the need to copy (UFFDIO_COPY) the pages. However, for this to work, the java heap has to be on a 'shared' vma. Currently MREMAP_DONTUNMAP only supports private anonymous mappings, this patch will enable using UFFDIO_CONTINUE for the new userfaultfd-based heap compaction." [1] https://lore.kernel.org/linux-mm/20201215030730.NC3CU98e4%25akpm@linux-foundation.org/ [2] https://lore.kernel.org/linux-mm/20210302000133.272579-1-axelrasmussen@google.com/ Link: https://lkml.kernel.org/r/20210323182520.2712101-1-bgeffon@google.com Signed-off-by: Brian Geffon Acked-by: Hugh Dickins Tested-by: Lokesh Gidra Reviewed-by: Dmitry Safonov <0x7f454c46@gmail.com> Cc: Alejandro Colomar Cc: Andrea Arcangeli Cc: Andy Lutomirski Cc: "Kirill A . Shutemov" Cc: Michael Kerrisk Cc: "Michael S . Tsirkin" Cc: Mike Rapoport Cc: Minchan Kim Cc: Peter Xu Cc: Sonny Rao Cc: Vlastimil Babka Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/mremap.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/mm/mremap.c b/mm/mremap.c index ec8f840399ed4e..db5b8b28c2dd03 100644 --- a/mm/mremap.c +++ b/mm/mremap.c @@ -653,8 +653,8 @@ static struct vm_area_struct *vma_to_resize(unsigned long addr, return ERR_PTR(-EINVAL); } - if (flags & MREMAP_DONTUNMAP && (!vma_is_anonymous(vma) || - vma->vm_flags & VM_SHARED)) + if ((flags & MREMAP_DONTUNMAP) && + (vma->vm_flags & (VM_DONTEXPAND | VM_PFNMAP))) return ERR_PTR(-EINVAL); if (is_vm_hugetlb_page(vma)) From 14d071134c740cfe61c09fc506fd3ab052beea10 Mon Sep 17 00:00:00 2001 From: Brian Geffon Date: Thu, 29 Apr 2021 22:57:48 -0700 Subject: [PATCH 088/175] Revert "mremap: don't allow MREMAP_DONTUNMAP on special_mappings and aio" This reverts commit cd544fd1dc9293c6702fab6effa63dac1cc67e99. As discussed in [1] this commit was a no-op because the mapping type was checked in vma_to_resize before move_vma is ever called. This meant that vm_ops->mremap() would never be called on such mappings. Furthermore, we've since expanded support of MREMAP_DONTUNMAP to non-anonymous mappings, and these special mappings are still protected by the existing check of !VM_DONTEXPAND and !VM_PFNMAP which will result in a -EINVAL. 1. https://lkml.org/lkml/2020/12/28/2340 Link: https://lkml.kernel.org/r/20210323182520.2712101-2-bgeffon@google.com Signed-off-by: Brian Geffon Acked-by: Hugh Dickins Reviewed-by: Dmitry Safonov <0x7f454c46@gmail.com> Cc: Alejandro Colomar Cc: Andrea Arcangeli Cc: Andy Lutomirski Cc: Axel Rasmussen Cc: "Kirill A . Shutemov" Cc: Lokesh Gidra Cc: Michael Kerrisk Cc: "Michael S . Tsirkin" Cc: Mike Rapoport Cc: Minchan Kim Cc: Peter Xu Cc: Sonny Rao Cc: Vlastimil Babka Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/x86/kernel/cpu/resctrl/pseudo_lock.c | 2 +- fs/aio.c | 5 +---- include/linux/mm.h | 2 +- mm/mmap.c | 6 +----- mm/mremap.c | 2 +- 5 files changed, 5 insertions(+), 12 deletions(-) diff --git a/arch/x86/kernel/cpu/resctrl/pseudo_lock.c b/arch/x86/kernel/cpu/resctrl/pseudo_lock.c index 935af2ac6b1af1..05a89e33fde288 100644 --- a/arch/x86/kernel/cpu/resctrl/pseudo_lock.c +++ b/arch/x86/kernel/cpu/resctrl/pseudo_lock.c @@ -1458,7 +1458,7 @@ static int pseudo_lock_dev_release(struct inode *inode, struct file *filp) return 0; } -static int pseudo_lock_dev_mremap(struct vm_area_struct *area, unsigned long flags) +static int pseudo_lock_dev_mremap(struct vm_area_struct *area) { /* Not supported */ return -EINVAL; diff --git a/fs/aio.c b/fs/aio.c index 1f32da13d39ee6..76ce0cc3ee4ec8 100644 --- a/fs/aio.c +++ b/fs/aio.c @@ -323,16 +323,13 @@ static void aio_free_ring(struct kioctx *ctx) } } -static int aio_ring_mremap(struct vm_area_struct *vma, unsigned long flags) +static int aio_ring_mremap(struct vm_area_struct *vma) { struct file *file = vma->vm_file; struct mm_struct *mm = vma->vm_mm; struct kioctx_table *table; int i, res = -EINVAL; - if (flags & MREMAP_DONTUNMAP) - return -EINVAL; - spin_lock(&mm->ioctx_lock); rcu_read_lock(); table = rcu_dereference(mm->ioctx_table); diff --git a/include/linux/mm.h b/include/linux/mm.h index a8335cecf706d0..93097dbd96044a 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -580,7 +580,7 @@ struct vm_operations_struct { void (*close)(struct vm_area_struct * area); /* Called any time before splitting to check if it's allowed */ int (*may_split)(struct vm_area_struct *area, unsigned long addr); - int (*mremap)(struct vm_area_struct *area, unsigned long flags); + int (*mremap)(struct vm_area_struct *area); /* * Called by mprotect() to make driver-specific permission * checks before mprotect() is finalised. The VMA must not diff --git a/mm/mmap.c b/mm/mmap.c index 1d96a21acb2f3b..347ef9b83bb529 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -3409,14 +3409,10 @@ static const char *special_mapping_name(struct vm_area_struct *vma) return ((struct vm_special_mapping *)vma->vm_private_data)->name; } -static int special_mapping_mremap(struct vm_area_struct *new_vma, - unsigned long flags) +static int special_mapping_mremap(struct vm_area_struct *new_vma) { struct vm_special_mapping *sm = new_vma->vm_private_data; - if (flags & MREMAP_DONTUNMAP) - return -EINVAL; - if (WARN_ON_ONCE(current->mm != new_vma->vm_mm)) return -EFAULT; diff --git a/mm/mremap.c b/mm/mremap.c index db5b8b28c2dd03..d22629ff8f3c07 100644 --- a/mm/mremap.c +++ b/mm/mremap.c @@ -545,7 +545,7 @@ static unsigned long move_vma(struct vm_area_struct *vma, if (moved_len < old_len) { err = -ENOMEM; } else if (vma->vm_ops && vma->vm_ops->mremap) { - err = vma->vm_ops->mremap(new_vma, flags); + err = vma->vm_ops->mremap(new_vma); } if (unlikely(err)) { From 8593100444e93861fb5c867bf8cc104543259714 Mon Sep 17 00:00:00 2001 From: Brian Geffon Date: Thu, 29 Apr 2021 22:57:52 -0700 Subject: [PATCH 089/175] selftests: add a MREMAP_DONTUNMAP selftest for shmem This test extends the current mremap tests to validate that the MREMAP_DONTUNMAP operation can be performed on shmem mappings. Link: https://lkml.kernel.org/r/20210323182520.2712101-3-bgeffon@google.com Signed-off-by: Brian Geffon Cc: Axel Rasmussen Cc: Lokesh Gidra Cc: Mike Rapoport Cc: Peter Xu Cc: Hugh Dickins Cc: "Michael S . Tsirkin" Cc: Brian Geffon Cc: Andy Lutomirski Cc: Vlastimil Babka Cc: Andrea Arcangeli Cc: Sonny Rao Cc: Minchan Kim Cc: "Kirill A . Shutemov" Cc: Dmitry Safonov Cc: Michael Kerrisk Cc: Alejandro Colomar Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- tools/testing/selftests/vm/mremap_dontunmap.c | 52 +++++++++++++++++++ 1 file changed, 52 insertions(+) diff --git a/tools/testing/selftests/vm/mremap_dontunmap.c b/tools/testing/selftests/vm/mremap_dontunmap.c index 3a7b5ef0b0c6fc..f01dc4a85b0bec 100644 --- a/tools/testing/selftests/vm/mremap_dontunmap.c +++ b/tools/testing/selftests/vm/mremap_dontunmap.c @@ -127,6 +127,57 @@ static void mremap_dontunmap_simple() "unable to unmap source mapping"); } +// This test validates that MREMAP_DONTUNMAP on a shared mapping works as expected. +static void mremap_dontunmap_simple_shmem() +{ + unsigned long num_pages = 5; + + int mem_fd = memfd_create("memfd", MFD_CLOEXEC); + BUG_ON(mem_fd < 0, "memfd_create"); + + BUG_ON(ftruncate(mem_fd, num_pages * page_size) < 0, + "ftruncate"); + + void *source_mapping = + mmap(NULL, num_pages * page_size, PROT_READ | PROT_WRITE, + MAP_FILE | MAP_SHARED, mem_fd, 0); + BUG_ON(source_mapping == MAP_FAILED, "mmap"); + + BUG_ON(close(mem_fd) < 0, "close"); + + memset(source_mapping, 'a', num_pages * page_size); + + // Try to just move the whole mapping anywhere (not fixed). + void *dest_mapping = + mremap(source_mapping, num_pages * page_size, num_pages * page_size, + MREMAP_DONTUNMAP | MREMAP_MAYMOVE, NULL); + if (dest_mapping == MAP_FAILED && errno == EINVAL) { + // Old kernel which doesn't support MREMAP_DONTUNMAP on shmem. + BUG_ON(munmap(source_mapping, num_pages * page_size) == -1, + "unable to unmap source mapping"); + return; + } + + BUG_ON(dest_mapping == MAP_FAILED, "mremap"); + + // Validate that the pages have been moved, we know they were moved if + // the dest_mapping contains a's. + BUG_ON(check_region_contains_byte + (dest_mapping, num_pages * page_size, 'a') != 0, + "pages did not migrate"); + + // Because the region is backed by shmem, we will actually see the same + // memory at the source location still. + BUG_ON(check_region_contains_byte + (source_mapping, num_pages * page_size, 'a') != 0, + "source should have no ptes"); + + BUG_ON(munmap(dest_mapping, num_pages * page_size) == -1, + "unable to unmap destination mapping"); + BUG_ON(munmap(source_mapping, num_pages * page_size) == -1, + "unable to unmap source mapping"); +} + // This test validates MREMAP_DONTUNMAP will move page tables to a specific // destination using MREMAP_FIXED, also while validating that the source // remains intact. @@ -300,6 +351,7 @@ int main(void) BUG_ON(page_buffer == MAP_FAILED, "unable to mmap a page."); mremap_dontunmap_simple(); + mremap_dontunmap_simple_shmem(); mremap_dontunmap_simple_fixed(); mremap_dontunmap_partial_mapping(); mremap_dontunmap_partial_mapping_overwrite(); From 943f229e9608104c11bf9a230883dbd121323532 Mon Sep 17 00:00:00 2001 From: Zhiyuan Dai Date: Thu, 29 Apr 2021 22:57:55 -0700 Subject: [PATCH 090/175] mm/dmapool: switch from strlcpy to strscpy strlcpy is marked as deprecated in Documentation/process/deprecated.rst, and there is no functional difference when the caller expects truncation (when not checking the return value). strscpy is relatively better as it also avoids scanning the whole source string. Link: https://lkml.kernel.org/r/1613962050-14188-1-git-send-email-daizhiyuan@phytium.com.cn Signed-off-by: Zhiyuan Dai Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/dmapool.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mm/dmapool.c b/mm/dmapool.c index f3791532fef22a..16483f86360e1c 100644 --- a/mm/dmapool.c +++ b/mm/dmapool.c @@ -157,7 +157,7 @@ struct dma_pool *dma_pool_create(const char *name, struct device *dev, if (!retval) return retval; - strlcpy(retval->name, name, sizeof(retval->name)); + strscpy(retval->name, name, sizeof(retval->name)); retval->dev = dev; From 2284f47fe9fe2ed2ef619e5474e155cfeeebd569 Mon Sep 17 00:00:00 2001 From: Wang Wensheng Date: Thu, 29 Apr 2021 22:57:58 -0700 Subject: [PATCH 091/175] mm/sparse: add the missing sparse_buffer_fini() in error branch sparse_buffer_init() and sparse_buffer_fini() should appear in pair, or a WARN issue would be through the next time sparse_buffer_init() runs. Add the missing sparse_buffer_fini() in error branch. Link: https://lkml.kernel.org/r/20210325113155.118574-1-wangwensheng4@huawei.com Fixes: 85c77f791390 ("mm/sparse: add new sparse_init_nid() and sparse_init()") Signed-off-by: Wang Wensheng Reviewed-by: David Hildenbrand Reviewed-by: Oscar Salvador Cc: Pavel Tatashin Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/sparse.c | 1 + 1 file changed, 1 insertion(+) diff --git a/mm/sparse.c b/mm/sparse.c index 7bd23f9d6cef64..33406ea2ecc448 100644 --- a/mm/sparse.c +++ b/mm/sparse.c @@ -547,6 +547,7 @@ static void __init sparse_init_nid(int nid, unsigned long pnum_begin, pr_err("%s: node[%d] memory map backing failed. Some memory will not be available.", __func__, nid); pnum_begin = pnum; + sparse_buffer_fini(); goto failed; } check_usemap_section_nr(nid, usage); From 8c2acfe8c1df1c8baacbeee4c519683ae3f3d722 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 29 Apr 2021 22:58:01 -0700 Subject: [PATCH 092/175] samples/vfio-mdev/mdpy: use remap_vmalloc_range Patch series "remap_vmalloc_range cleanups". This series removes an open coded instance of remap_vmalloc_range and removes the unused remap_vmalloc_range_partial export. This patch (of 2): Use remap_vmalloc_range instead of open coding it using remap_vmalloc_range_partial. Link: https://lkml.kernel.org/r/20210301082235.932968-1-hch@lst.de Link: https://lkml.kernel.org/r/20210301082235.932968-2-hch@lst.de Signed-off-by: Christoph Hellwig Cc: Kirti Wankhede Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- samples/vfio-mdev/mdpy.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/samples/vfio-mdev/mdpy.c b/samples/vfio-mdev/mdpy.c index 885b88ea20e234..f0c0e7209719a3 100644 --- a/samples/vfio-mdev/mdpy.c +++ b/samples/vfio-mdev/mdpy.c @@ -406,9 +406,7 @@ static int mdpy_mmap(struct mdev_device *mdev, struct vm_area_struct *vma) if ((vma->vm_flags & VM_SHARED) == 0) return -EINVAL; - return remap_vmalloc_range_partial(vma, vma->vm_start, - mdev_state->memblk, 0, - vma->vm_end - vma->vm_start); + return remap_vmalloc_range(vma, mdev_state->memblk, 0); } static int mdpy_get_region_info(struct mdev_device *mdev, From 0f71d7e14c2129c5b99aec6961a55b331f9dbaf1 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 29 Apr 2021 22:58:04 -0700 Subject: [PATCH 093/175] mm: unexport remap_vmalloc_range_partial remap_vmalloc_range_partial is only used to implement remap_vmalloc_range and by procfs. Unexport it. Link: https://lkml.kernel.org/r/20210301082235.932968-3-hch@lst.de Signed-off-by: Christoph Hellwig Cc: Kirti Wankhede Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/vmalloc.c | 1 - 1 file changed, 1 deletion(-) diff --git a/mm/vmalloc.c b/mm/vmalloc.c index d5f2a84e488ad8..735b501ac41236 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c @@ -3072,7 +3072,6 @@ int remap_vmalloc_range_partial(struct vm_area_struct *vma, unsigned long uaddr, return 0; } -EXPORT_SYMBOL(remap_vmalloc_range_partial); /** * remap_vmalloc_range - map vmalloc pages to userspace From f608788cd2d6cae27d1a3d2253544ca76b353764 Mon Sep 17 00:00:00 2001 From: Serapheim Dimitropoulos Date: Thu, 29 Apr 2021 22:58:07 -0700 Subject: [PATCH 094/175] mm/vmalloc: use rb_tree instead of list for vread() lookups vread() has been linearly searching vmap_area_list for looking up vmalloc areas to read from. These same areas are also tracked by a rb_tree (vmap_area_root) which offers logarithmic lookup. This patch modifies vread() to use the rb_tree structure instead of the list and the speedup for heavy /proc/kcore readers can be pretty significant. Below are the wall clock measurements of a Python application that leverages the drgn debugging library to read and interpret data read from /proc/kcore. Before the patch: ----- $ time sudo sdb -e 'dbuf | head 3000 | wc' (unsigned long)3000 real 0m22.446s user 0m2.321s sys 0m20.690s ----- With the patch: ----- $ time sudo sdb -e 'dbuf | head 3000 | wc' (unsigned long)3000 real 0m2.104s user 0m2.043s sys 0m0.921s ----- Link: https://lkml.kernel.org/r/20210209190253.108763-1-serapheim@delphix.com Signed-off-by: Serapheim Dimitropoulos Reviewed-by: Uladzislau Rezki (Sony) Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/vmalloc.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/mm/vmalloc.c b/mm/vmalloc.c index 735b501ac41236..8c20670e246dc8 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c @@ -2894,7 +2894,10 @@ long vread(char *buf, char *addr, unsigned long count) count = -(unsigned long) addr; spin_lock(&vmap_area_lock); - list_for_each_entry(va, &vmap_area_list, list) { + va = __find_vmap_area((unsigned long)addr); + if (!va) + goto finished; + list_for_each_entry_from(va, &vmap_area_list, list) { if (!count) break; From 972472c7466b50efed4539694007951a3fc7b95c Mon Sep 17 00:00:00 2001 From: Nicholas Piggin Date: Thu, 29 Apr 2021 22:58:10 -0700 Subject: [PATCH 095/175] ARM: mm: add missing pud_page define to 2-level page tables Patch series "huge vmalloc mappings", v13. The kernel virtual mapping layer grew support for mapping memory with > PAGE_SIZE ptes with commit 0ddab1d2ed66 ("lib/ioremap.c: add huge I/O map capability interfaces"), and implemented support for using those huge page mappings with ioremap. According to the submission, the use-case is mapping very large non-volatile memory devices, which could be GB or TB: https://lore.kernel.org/lkml/1425404664-19675-1-git-send-email-toshi.kani@hp.com/ The benefit is said to be in the overhead of maintaining the mapping, perhaps both in memory overhead and setup / teardown time. Memory overhead for the mapping with a 4kB page and 8 byte page table is 2GB per TB of mapping, down to 4MB / TB with 2MB pages. The same huge page vmap infrastructure can be quite easily adapted and used for mapping vmalloc memory pages without more complexity for arch or core vmap code. However unlike ioremap, vmalloc page table overhead is not a real problem, so the advantage to justify this is performance. Several of the most structures in the kernel (e.g., vfs and network hash tables) are allocated with vmalloc on NUMA machines, in order to distribute access bandwidth over the machine. Mapping these with larger pages can improve TLB usage significantly, for example this reduces TLB misses by nearly 30x on a `git diff` workload on a 2-node POWER9 (59,800 -> 2,100) and reduces CPU cycles by 0.54%, due to vfs hashes being allocated with 2MB pages. [ Other numbers? - The difference is even larger in a guest due to more costly TLB misses. - Eric Dumazet was keen on the network hash performance possibilities. - Other archs? Ding was doing x86 testing. ] The kernel module allocator also uses vmalloc to map module images even on non-NUMA, which can result in high iTLB pressure on highly modular distro type of kernels. This series does not implement huge mappings for modules yet, but it's a step along the way. Rick Edgecombe was looking at that IIRC. The per-cpu allocator similarly might be able to take advantage of this. Also on the todo list. The disadvantages of this I can see are: * Memory fragmentation can waste some physical memory because it will attempt to allocate larger pages to fit the required size, rounding up (once the requested size is >= 2MB). - I don't see it being a big problem in practice unless some user crops up that allocates thousands of 2.5MB ranges. We can tewak heuristics a bit there if needed to reduce peak waste. * Less granular mappings can make the NUMA distribution less balanced. - Similar to the above. - Could also allocate all major system hashes with one allocation up-front and spread them all across the one block, which should help overall NUMA distribution and reduce fragmentation waste. * Callers might expect something about the underlying allocated pages. - Tried to keep the apperance of base PAGE_SIZE pages throughout the APIs and exposed data structures. - Added a VM_NO_HUGE_VMAP flag to hammer troublesome cases with. - Finally, added a nohugevmalloc boot option to turn it off (independent of nohugeiomap). This patch (of 14): ARM uses its own PMD folding scheme which is missing pud_page which should just pass through to pmd_page. Move this from the 3-level page table to common header. Link: https://lkml.kernel.org/r/20210317062402.533919-2-npiggin@gmail.com Signed-off-by: Nicholas Piggin Cc: Russell King Cc: Ding Tianhong Cc: Uladzislau Rezki (Sony) Cc: Borislav Petkov Cc: Catalin Marinas Cc: Christoph Hellwig Cc: "H. Peter Anvin" Cc: Ingo Molnar Cc: Miaohe Lin Cc: Michael Ellerman Cc: Thomas Gleixner Cc: Will Deacon Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/arm/include/asm/pgtable-3level.h | 2 -- arch/arm/include/asm/pgtable.h | 3 +++ 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/arch/arm/include/asm/pgtable-3level.h b/arch/arm/include/asm/pgtable-3level.h index 2b85d175e99969..d4edab51a77c07 100644 --- a/arch/arm/include/asm/pgtable-3level.h +++ b/arch/arm/include/asm/pgtable-3level.h @@ -186,8 +186,6 @@ static inline pte_t pte_mkspecial(pte_t pte) #define pmd_write(pmd) (pmd_isclear((pmd), L_PMD_SECT_RDONLY)) #define pmd_dirty(pmd) (pmd_isset((pmd), L_PMD_SECT_DIRTY)) -#define pud_page(pud) pmd_page(__pmd(pud_val(pud))) -#define pud_write(pud) pmd_write(__pmd(pud_val(pud))) #define pmd_hugewillfault(pmd) (!pmd_young(pmd) || !pmd_write(pmd)) #define pmd_thp_or_huge(pmd) (pmd_huge(pmd) || pmd_trans_huge(pmd)) diff --git a/arch/arm/include/asm/pgtable.h b/arch/arm/include/asm/pgtable.h index c02f24400369bc..d63a5bb6bd0c14 100644 --- a/arch/arm/include/asm/pgtable.h +++ b/arch/arm/include/asm/pgtable.h @@ -166,6 +166,9 @@ extern struct page *empty_zero_page; extern pgd_t swapper_pg_dir[PTRS_PER_PGD]; +#define pud_page(pud) pmd_page(__pmd(pud_val(pud))) +#define pud_write(pud) pmd_write(__pmd(pud_val(pud))) + #define pmd_none(pmd) (!pmd_val(pmd)) static inline pte_t *pmd_page_vaddr(pmd_t pmd) From c0eb315ad9719e41ce44708455cc69df7ac9f3f8 Mon Sep 17 00:00:00 2001 From: Nicholas Piggin Date: Thu, 29 Apr 2021 22:58:13 -0700 Subject: [PATCH 096/175] mm/vmalloc: fix HUGE_VMAP regression by enabling huge pages in vmalloc_to_page vmalloc_to_page returns NULL for addresses mapped by larger pages[*]. Whether or not a vmap is huge depends on the architecture details, alignments, boot options, etc., which the caller can not be expected to know. Therefore HUGE_VMAP is a regression for vmalloc_to_page. This change teaches vmalloc_to_page about larger pages, and returns the struct page that corresponds to the offset within the large page. This makes the API agnostic to mapping implementation details. [*] As explained by commit 029c54b095995 ("mm/vmalloc.c: huge-vmap: fail gracefully on unexpected huge vmap mappings") [npiggin@gmail.com: sparc32: add stub pud_page define for walking huge vmalloc page tables] Link: https://lkml.kernel.org/r/20210324232825.1157363-1-npiggin@gmail.com Link: https://lkml.kernel.org/r/20210317062402.533919-3-npiggin@gmail.com Signed-off-by: Nicholas Piggin Reviewed-by: Miaohe Lin Reviewed-by: Christoph Hellwig Cc: Borislav Petkov Cc: Catalin Marinas Cc: Ding Tianhong Cc: "H. Peter Anvin" Cc: Ingo Molnar Cc: Michael Ellerman Cc: Russell King Cc: Thomas Gleixner Cc: Uladzislau Rezki (Sony) Cc: Will Deacon Cc: Stephen Rothwell Cc: David S. Miller Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/sparc/include/asm/pgtable_32.h | 3 +++ mm/vmalloc.c | 41 ++++++++++++++++++----------- 2 files changed, 29 insertions(+), 15 deletions(-) diff --git a/arch/sparc/include/asm/pgtable_32.h b/arch/sparc/include/asm/pgtable_32.h index 632cdb959542c0..a5cf79c149fef5 100644 --- a/arch/sparc/include/asm/pgtable_32.h +++ b/arch/sparc/include/asm/pgtable_32.h @@ -321,6 +321,9 @@ static inline pte_t pte_modify(pte_t pte, pgprot_t newprot) pgprot_val(newprot)); } +/* only used by the huge vmap code, should never be called */ +#define pud_page(pud) NULL + struct seq_file; void mmu_info(struct seq_file *m); diff --git a/mm/vmalloc.c b/mm/vmalloc.c index 8c20670e246dc8..ee865b6e88ebf6 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c @@ -34,7 +34,7 @@ #include #include #include - +#include #include #include #include @@ -343,7 +343,9 @@ int is_vmalloc_or_module_addr(const void *x) } /* - * Walk a vmap address to the struct page it maps. + * Walk a vmap address to the struct page it maps. Huge vmap mappings will + * return the tail page that corresponds to the base page address, which + * matches small vmap mappings. */ struct page *vmalloc_to_page(const void *vmalloc_addr) { @@ -363,25 +365,33 @@ struct page *vmalloc_to_page(const void *vmalloc_addr) if (pgd_none(*pgd)) return NULL; + if (WARN_ON_ONCE(pgd_leaf(*pgd))) + return NULL; /* XXX: no allowance for huge pgd */ + if (WARN_ON_ONCE(pgd_bad(*pgd))) + return NULL; + p4d = p4d_offset(pgd, addr); if (p4d_none(*p4d)) return NULL; - pud = pud_offset(p4d, addr); + if (p4d_leaf(*p4d)) + return p4d_page(*p4d) + ((addr & ~P4D_MASK) >> PAGE_SHIFT); + if (WARN_ON_ONCE(p4d_bad(*p4d))) + return NULL; - /* - * Don't dereference bad PUD or PMD (below) entries. This will also - * identify huge mappings, which we may encounter on architectures - * that define CONFIG_HAVE_ARCH_HUGE_VMAP=y. Such regions will be - * identified as vmalloc addresses by is_vmalloc_addr(), but are - * not [unambiguously] associated with a struct page, so there is - * no correct value to return for them. - */ - WARN_ON_ONCE(pud_bad(*pud)); - if (pud_none(*pud) || pud_bad(*pud)) + pud = pud_offset(p4d, addr); + if (pud_none(*pud)) + return NULL; + if (pud_leaf(*pud)) + return pud_page(*pud) + ((addr & ~PUD_MASK) >> PAGE_SHIFT); + if (WARN_ON_ONCE(pud_bad(*pud))) return NULL; + pmd = pmd_offset(pud, addr); - WARN_ON_ONCE(pmd_bad(*pmd)); - if (pmd_none(*pmd) || pmd_bad(*pmd)) + if (pmd_none(*pmd)) + return NULL; + if (pmd_leaf(*pmd)) + return pmd_page(*pmd) + ((addr & ~PMD_MASK) >> PAGE_SHIFT); + if (WARN_ON_ONCE(pmd_bad(*pmd))) return NULL; ptep = pte_offset_map(pmd, addr); @@ -389,6 +399,7 @@ struct page *vmalloc_to_page(const void *vmalloc_addr) if (pte_present(pte)) page = pte_page(pte); pte_unmap(ptep); + return page; } EXPORT_SYMBOL(vmalloc_to_page); From 0c95cba4925509c13fce6278456a0badb9e49775 Mon Sep 17 00:00:00 2001 From: Nicholas Piggin Date: Thu, 29 Apr 2021 22:58:16 -0700 Subject: [PATCH 097/175] mm: apply_to_pte_range warn and fail if a large pte is encountered apply_to_pte_range might mistake a large pte for bad, or treat it as a page table, resulting in a crash or corruption. Add a test to warn and return error if large entries are found. Link: https://lkml.kernel.org/r/20210317062402.533919-4-npiggin@gmail.com Signed-off-by: Nicholas Piggin Reviewed-by: Miaohe Lin Reviewed-by: Christoph Hellwig Cc: Borislav Petkov Cc: Catalin Marinas Cc: Ding Tianhong Cc: "H. Peter Anvin" Cc: Ingo Molnar Cc: Michael Ellerman Cc: Russell King Cc: Thomas Gleixner Cc: Uladzislau Rezki (Sony) Cc: Will Deacon Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memory.c | 66 +++++++++++++++++++++++++++++++++++++++-------------- 1 file changed, 49 insertions(+), 17 deletions(-) diff --git a/mm/memory.c b/mm/memory.c index a62f06750e93d6..cbdc2cd9cedb97 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -2457,13 +2457,21 @@ static int apply_to_pmd_range(struct mm_struct *mm, pud_t *pud, } do { next = pmd_addr_end(addr, end); - if (create || !pmd_none_or_clear_bad(pmd)) { - err = apply_to_pte_range(mm, pmd, addr, next, fn, data, - create, mask); - if (err) - break; + if (pmd_none(*pmd) && !create) + continue; + if (WARN_ON_ONCE(pmd_leaf(*pmd))) + return -EINVAL; + if (!pmd_none(*pmd) && WARN_ON_ONCE(pmd_bad(*pmd))) { + if (!create) + continue; + pmd_clear_bad(pmd); } + err = apply_to_pte_range(mm, pmd, addr, next, + fn, data, create, mask); + if (err) + break; } while (pmd++, addr = next, addr != end); + return err; } @@ -2485,13 +2493,21 @@ static int apply_to_pud_range(struct mm_struct *mm, p4d_t *p4d, } do { next = pud_addr_end(addr, end); - if (create || !pud_none_or_clear_bad(pud)) { - err = apply_to_pmd_range(mm, pud, addr, next, fn, data, - create, mask); - if (err) - break; + if (pud_none(*pud) && !create) + continue; + if (WARN_ON_ONCE(pud_leaf(*pud))) + return -EINVAL; + if (!pud_none(*pud) && WARN_ON_ONCE(pud_bad(*pud))) { + if (!create) + continue; + pud_clear_bad(pud); } + err = apply_to_pmd_range(mm, pud, addr, next, + fn, data, create, mask); + if (err) + break; } while (pud++, addr = next, addr != end); + return err; } @@ -2513,13 +2529,21 @@ static int apply_to_p4d_range(struct mm_struct *mm, pgd_t *pgd, } do { next = p4d_addr_end(addr, end); - if (create || !p4d_none_or_clear_bad(p4d)) { - err = apply_to_pud_range(mm, p4d, addr, next, fn, data, - create, mask); - if (err) - break; + if (p4d_none(*p4d) && !create) + continue; + if (WARN_ON_ONCE(p4d_leaf(*p4d))) + return -EINVAL; + if (!p4d_none(*p4d) && WARN_ON_ONCE(p4d_bad(*p4d))) { + if (!create) + continue; + p4d_clear_bad(p4d); } + err = apply_to_pud_range(mm, p4d, addr, next, + fn, data, create, mask); + if (err) + break; } while (p4d++, addr = next, addr != end); + return err; } @@ -2539,9 +2563,17 @@ static int __apply_to_page_range(struct mm_struct *mm, unsigned long addr, pgd = pgd_offset(mm, addr); do { next = pgd_addr_end(addr, end); - if (!create && pgd_none_or_clear_bad(pgd)) + if (pgd_none(*pgd) && !create) continue; - err = apply_to_p4d_range(mm, pgd, addr, next, fn, data, create, &mask); + if (WARN_ON_ONCE(pgd_leaf(*pgd))) + return -EINVAL; + if (!pgd_none(*pgd) && WARN_ON_ONCE(pgd_bad(*pgd))) { + if (!create) + continue; + pgd_clear_bad(pgd); + } + err = apply_to_p4d_range(mm, pgd, addr, next, + fn, data, create, &mask); if (err) break; } while (pgd++, addr = next, addr != end); From 0a264884046f1ab0c906a61fd838002ecf9ef408 Mon Sep 17 00:00:00 2001 From: Nicholas Piggin Date: Thu, 29 Apr 2021 22:58:19 -0700 Subject: [PATCH 098/175] mm/vmalloc: rename vmap_*_range vmap_pages_*_range The vmalloc mapper operates on a struct page * array rather than a linear physical address, re-name it to make this distinction clear. Link: https://lkml.kernel.org/r/20210317062402.533919-5-npiggin@gmail.com Signed-off-by: Nicholas Piggin Reviewed-by: Miaohe Lin Reviewed-by: Christoph Hellwig Cc: Borislav Petkov Cc: Catalin Marinas Cc: Ding Tianhong Cc: "H. Peter Anvin" Cc: Ingo Molnar Cc: Michael Ellerman Cc: Russell King Cc: Thomas Gleixner Cc: Uladzislau Rezki (Sony) Cc: Will Deacon Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/vmalloc.c | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/mm/vmalloc.c b/mm/vmalloc.c index ee865b6e88ebf6..ce4066b469559c 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c @@ -189,7 +189,7 @@ void unmap_kernel_range_noflush(unsigned long start, unsigned long size) arch_sync_kernel_mappings(start, end); } -static int vmap_pte_range(pmd_t *pmd, unsigned long addr, +static int vmap_pages_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end, pgprot_t prot, struct page **pages, int *nr, pgtbl_mod_mask *mask) { @@ -217,7 +217,7 @@ static int vmap_pte_range(pmd_t *pmd, unsigned long addr, return 0; } -static int vmap_pmd_range(pud_t *pud, unsigned long addr, +static int vmap_pages_pmd_range(pud_t *pud, unsigned long addr, unsigned long end, pgprot_t prot, struct page **pages, int *nr, pgtbl_mod_mask *mask) { @@ -229,13 +229,13 @@ static int vmap_pmd_range(pud_t *pud, unsigned long addr, return -ENOMEM; do { next = pmd_addr_end(addr, end); - if (vmap_pte_range(pmd, addr, next, prot, pages, nr, mask)) + if (vmap_pages_pte_range(pmd, addr, next, prot, pages, nr, mask)) return -ENOMEM; } while (pmd++, addr = next, addr != end); return 0; } -static int vmap_pud_range(p4d_t *p4d, unsigned long addr, +static int vmap_pages_pud_range(p4d_t *p4d, unsigned long addr, unsigned long end, pgprot_t prot, struct page **pages, int *nr, pgtbl_mod_mask *mask) { @@ -247,13 +247,13 @@ static int vmap_pud_range(p4d_t *p4d, unsigned long addr, return -ENOMEM; do { next = pud_addr_end(addr, end); - if (vmap_pmd_range(pud, addr, next, prot, pages, nr, mask)) + if (vmap_pages_pmd_range(pud, addr, next, prot, pages, nr, mask)) return -ENOMEM; } while (pud++, addr = next, addr != end); return 0; } -static int vmap_p4d_range(pgd_t *pgd, unsigned long addr, +static int vmap_pages_p4d_range(pgd_t *pgd, unsigned long addr, unsigned long end, pgprot_t prot, struct page **pages, int *nr, pgtbl_mod_mask *mask) { @@ -265,7 +265,7 @@ static int vmap_p4d_range(pgd_t *pgd, unsigned long addr, return -ENOMEM; do { next = p4d_addr_end(addr, end); - if (vmap_pud_range(p4d, addr, next, prot, pages, nr, mask)) + if (vmap_pages_pud_range(p4d, addr, next, prot, pages, nr, mask)) return -ENOMEM; } while (p4d++, addr = next, addr != end); return 0; @@ -306,7 +306,7 @@ int map_kernel_range_noflush(unsigned long addr, unsigned long size, next = pgd_addr_end(addr, end); if (pgd_bad(*pgd)) mask |= PGTBL_PGD_MODIFIED; - err = vmap_p4d_range(pgd, addr, next, prot, pages, &nr, &mask); + err = vmap_pages_p4d_range(pgd, addr, next, prot, pages, &nr, &mask); if (err) return err; } while (pgd++, addr = next, addr != end); From 95f0ddf081af3a77433090d9deaf3f76f5648336 Mon Sep 17 00:00:00 2001 From: Nicholas Piggin Date: Thu, 29 Apr 2021 22:58:23 -0700 Subject: [PATCH 099/175] mm/ioremap: rename ioremap_*_range to vmap_*_range This will be used as a generic kernel virtual mapping function, so re-name it in preparation. Link: https://lkml.kernel.org/r/20210317062402.533919-6-npiggin@gmail.com Signed-off-by: Nicholas Piggin Reviewed-by: Miaohe Lin Reviewed-by: Christoph Hellwig Cc: Borislav Petkov Cc: Catalin Marinas Cc: Ding Tianhong Cc: "H. Peter Anvin" Cc: Ingo Molnar Cc: Michael Ellerman Cc: Russell King Cc: Thomas Gleixner Cc: Uladzislau Rezki (Sony) Cc: Will Deacon Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/ioremap.c | 64 +++++++++++++++++++++++++++------------------------- 1 file changed, 33 insertions(+), 31 deletions(-) diff --git a/mm/ioremap.c b/mm/ioremap.c index 5fa1ab41d15260..3f4d36f9745a78 100644 --- a/mm/ioremap.c +++ b/mm/ioremap.c @@ -61,9 +61,9 @@ static inline int ioremap_pud_enabled(void) { return 0; } static inline int ioremap_pmd_enabled(void) { return 0; } #endif /* CONFIG_HAVE_ARCH_HUGE_VMAP */ -static int ioremap_pte_range(pmd_t *pmd, unsigned long addr, - unsigned long end, phys_addr_t phys_addr, pgprot_t prot, - pgtbl_mod_mask *mask) +static int vmap_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end, + phys_addr_t phys_addr, pgprot_t prot, + pgtbl_mod_mask *mask) { pte_t *pte; u64 pfn; @@ -81,9 +81,8 @@ static int ioremap_pte_range(pmd_t *pmd, unsigned long addr, return 0; } -static int ioremap_try_huge_pmd(pmd_t *pmd, unsigned long addr, - unsigned long end, phys_addr_t phys_addr, - pgprot_t prot) +static int vmap_try_huge_pmd(pmd_t *pmd, unsigned long addr, unsigned long end, + phys_addr_t phys_addr, pgprot_t prot) { if (!ioremap_pmd_enabled()) return 0; @@ -103,9 +102,9 @@ static int ioremap_try_huge_pmd(pmd_t *pmd, unsigned long addr, return pmd_set_huge(pmd, phys_addr, prot); } -static inline int ioremap_pmd_range(pud_t *pud, unsigned long addr, - unsigned long end, phys_addr_t phys_addr, pgprot_t prot, - pgtbl_mod_mask *mask) +static int vmap_pmd_range(pud_t *pud, unsigned long addr, unsigned long end, + phys_addr_t phys_addr, pgprot_t prot, + pgtbl_mod_mask *mask) { pmd_t *pmd; unsigned long next; @@ -116,20 +115,19 @@ static inline int ioremap_pmd_range(pud_t *pud, unsigned long addr, do { next = pmd_addr_end(addr, end); - if (ioremap_try_huge_pmd(pmd, addr, next, phys_addr, prot)) { + if (vmap_try_huge_pmd(pmd, addr, next, phys_addr, prot)) { *mask |= PGTBL_PMD_MODIFIED; continue; } - if (ioremap_pte_range(pmd, addr, next, phys_addr, prot, mask)) + if (vmap_pte_range(pmd, addr, next, phys_addr, prot, mask)) return -ENOMEM; } while (pmd++, phys_addr += (next - addr), addr = next, addr != end); return 0; } -static int ioremap_try_huge_pud(pud_t *pud, unsigned long addr, - unsigned long end, phys_addr_t phys_addr, - pgprot_t prot) +static int vmap_try_huge_pud(pud_t *pud, unsigned long addr, unsigned long end, + phys_addr_t phys_addr, pgprot_t prot) { if (!ioremap_pud_enabled()) return 0; @@ -149,9 +147,9 @@ static int ioremap_try_huge_pud(pud_t *pud, unsigned long addr, return pud_set_huge(pud, phys_addr, prot); } -static inline int ioremap_pud_range(p4d_t *p4d, unsigned long addr, - unsigned long end, phys_addr_t phys_addr, pgprot_t prot, - pgtbl_mod_mask *mask) +static int vmap_pud_range(p4d_t *p4d, unsigned long addr, unsigned long end, + phys_addr_t phys_addr, pgprot_t prot, + pgtbl_mod_mask *mask) { pud_t *pud; unsigned long next; @@ -162,20 +160,19 @@ static inline int ioremap_pud_range(p4d_t *p4d, unsigned long addr, do { next = pud_addr_end(addr, end); - if (ioremap_try_huge_pud(pud, addr, next, phys_addr, prot)) { + if (vmap_try_huge_pud(pud, addr, next, phys_addr, prot)) { *mask |= PGTBL_PUD_MODIFIED; continue; } - if (ioremap_pmd_range(pud, addr, next, phys_addr, prot, mask)) + if (vmap_pmd_range(pud, addr, next, phys_addr, prot, mask)) return -ENOMEM; } while (pud++, phys_addr += (next - addr), addr = next, addr != end); return 0; } -static int ioremap_try_huge_p4d(p4d_t *p4d, unsigned long addr, - unsigned long end, phys_addr_t phys_addr, - pgprot_t prot) +static int vmap_try_huge_p4d(p4d_t *p4d, unsigned long addr, unsigned long end, + phys_addr_t phys_addr, pgprot_t prot) { if (!ioremap_p4d_enabled()) return 0; @@ -195,9 +192,9 @@ static int ioremap_try_huge_p4d(p4d_t *p4d, unsigned long addr, return p4d_set_huge(p4d, phys_addr, prot); } -static inline int ioremap_p4d_range(pgd_t *pgd, unsigned long addr, - unsigned long end, phys_addr_t phys_addr, pgprot_t prot, - pgtbl_mod_mask *mask) +static int vmap_p4d_range(pgd_t *pgd, unsigned long addr, unsigned long end, + phys_addr_t phys_addr, pgprot_t prot, + pgtbl_mod_mask *mask) { p4d_t *p4d; unsigned long next; @@ -208,19 +205,19 @@ static inline int ioremap_p4d_range(pgd_t *pgd, unsigned long addr, do { next = p4d_addr_end(addr, end); - if (ioremap_try_huge_p4d(p4d, addr, next, phys_addr, prot)) { + if (vmap_try_huge_p4d(p4d, addr, next, phys_addr, prot)) { *mask |= PGTBL_P4D_MODIFIED; continue; } - if (ioremap_pud_range(p4d, addr, next, phys_addr, prot, mask)) + if (vmap_pud_range(p4d, addr, next, phys_addr, prot, mask)) return -ENOMEM; } while (p4d++, phys_addr += (next - addr), addr = next, addr != end); return 0; } -int ioremap_page_range(unsigned long addr, - unsigned long end, phys_addr_t phys_addr, pgprot_t prot) +static int vmap_range(unsigned long addr, unsigned long end, + phys_addr_t phys_addr, pgprot_t prot) { pgd_t *pgd; unsigned long start; @@ -235,8 +232,7 @@ int ioremap_page_range(unsigned long addr, pgd = pgd_offset_k(addr); do { next = pgd_addr_end(addr, end); - err = ioremap_p4d_range(pgd, addr, next, phys_addr, prot, - &mask); + err = vmap_p4d_range(pgd, addr, next, phys_addr, prot, &mask); if (err) break; } while (pgd++, phys_addr += (next - addr), addr = next, addr != end); @@ -249,6 +245,12 @@ int ioremap_page_range(unsigned long addr, return err; } +int ioremap_page_range(unsigned long addr, + unsigned long end, phys_addr_t phys_addr, pgprot_t prot) +{ + return vmap_range(addr, end, phys_addr, prot); +} + #ifdef CONFIG_GENERIC_IOREMAP void __iomem *ioremap_prot(phys_addr_t addr, size_t size, unsigned long prot) { From bbc180a5adb05ee8053fab7a0c0bd56c5964240e Mon Sep 17 00:00:00 2001 From: Nicholas Piggin Date: Thu, 29 Apr 2021 22:58:26 -0700 Subject: [PATCH 100/175] mm: HUGE_VMAP arch support cleanup This changes the awkward approach where architectures provide init functions to determine which levels they can provide large mappings for, to one where the arch is queried for each call. This removes code and indirection, and allows constant-folding of dead code for unsupported levels. This also adds a prot argument to the arch query. This is unused currently but could help with some architectures (e.g., some powerpc processors can't map uncacheable memory with large pages). Link: https://lkml.kernel.org/r/20210317062402.533919-7-npiggin@gmail.com Signed-off-by: Nicholas Piggin Reviewed-by: Ding Tianhong Acked-by: Catalin Marinas [arm64] Cc: Will Deacon Cc: Thomas Gleixner Cc: Ingo Molnar Cc: Borislav Petkov Cc: "H. Peter Anvin" Cc: Christoph Hellwig Cc: Miaohe Lin Cc: Michael Ellerman Cc: Russell King Cc: Uladzislau Rezki (Sony) Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/arm64/include/asm/vmalloc.h | 8 ++ arch/arm64/mm/mmu.c | 10 +-- arch/powerpc/include/asm/vmalloc.h | 8 ++ arch/powerpc/mm/book3s64/radix_pgtable.c | 8 +- arch/x86/include/asm/vmalloc.h | 7 ++ arch/x86/mm/ioremap.c | 12 +-- include/linux/io.h | 9 --- include/linux/vmalloc.h | 6 ++ init/main.c | 1 - mm/debug_vm_pgtable.c | 4 +- mm/ioremap.c | 94 ++++++++++-------------- 11 files changed, 87 insertions(+), 80 deletions(-) diff --git a/arch/arm64/include/asm/vmalloc.h b/arch/arm64/include/asm/vmalloc.h index 2ca708ab9b20b0..597b40405319d8 100644 --- a/arch/arm64/include/asm/vmalloc.h +++ b/arch/arm64/include/asm/vmalloc.h @@ -1,4 +1,12 @@ #ifndef _ASM_ARM64_VMALLOC_H #define _ASM_ARM64_VMALLOC_H +#include + +#ifdef CONFIG_HAVE_ARCH_HUGE_VMAP +bool arch_vmap_p4d_supported(pgprot_t prot); +bool arch_vmap_pud_supported(pgprot_t prot); +bool arch_vmap_pmd_supported(pgprot_t prot); +#endif + #endif /* _ASM_ARM64_VMALLOC_H */ diff --git a/arch/arm64/mm/mmu.c b/arch/arm64/mm/mmu.c index d563335ad43f5e..8436e0755361ab 100644 --- a/arch/arm64/mm/mmu.c +++ b/arch/arm64/mm/mmu.c @@ -1339,12 +1339,12 @@ void *__init fixmap_remap_fdt(phys_addr_t dt_phys, int *size, pgprot_t prot) return dt_virt; } -int __init arch_ioremap_p4d_supported(void) +bool arch_vmap_p4d_supported(pgprot_t prot) { - return 0; + return false; } -int __init arch_ioremap_pud_supported(void) +bool arch_vmap_pud_supported(pgprot_t prot) { /* * Only 4k granule supports level 1 block mappings. @@ -1354,9 +1354,9 @@ int __init arch_ioremap_pud_supported(void) !IS_ENABLED(CONFIG_PTDUMP_DEBUGFS); } -int __init arch_ioremap_pmd_supported(void) +bool arch_vmap_pmd_supported(pgprot_t prot) { - /* See arch_ioremap_pud_supported() */ + /* See arch_vmap_pud_supported() */ return !IS_ENABLED(CONFIG_PTDUMP_DEBUGFS); } diff --git a/arch/powerpc/include/asm/vmalloc.h b/arch/powerpc/include/asm/vmalloc.h index b992dfaaa16181..105abb73f075cf 100644 --- a/arch/powerpc/include/asm/vmalloc.h +++ b/arch/powerpc/include/asm/vmalloc.h @@ -1,4 +1,12 @@ #ifndef _ASM_POWERPC_VMALLOC_H #define _ASM_POWERPC_VMALLOC_H +#include + +#ifdef CONFIG_HAVE_ARCH_HUGE_VMAP +bool arch_vmap_p4d_supported(pgprot_t prot); +bool arch_vmap_pud_supported(pgprot_t prot); +bool arch_vmap_pmd_supported(pgprot_t prot); +#endif + #endif /* _ASM_POWERPC_VMALLOC_H */ diff --git a/arch/powerpc/mm/book3s64/radix_pgtable.c b/arch/powerpc/mm/book3s64/radix_pgtable.c index 98f0b243c1ab21..743807fc210f3a 100644 --- a/arch/powerpc/mm/book3s64/radix_pgtable.c +++ b/arch/powerpc/mm/book3s64/radix_pgtable.c @@ -1082,13 +1082,13 @@ void radix__ptep_modify_prot_commit(struct vm_area_struct *vma, set_pte_at(mm, addr, ptep, pte); } -int __init arch_ioremap_pud_supported(void) +bool arch_vmap_pud_supported(pgprot_t prot) { /* HPT does not cope with large pages in the vmalloc area */ return radix_enabled(); } -int __init arch_ioremap_pmd_supported(void) +bool arch_vmap_pmd_supported(pgprot_t prot) { return radix_enabled(); } @@ -1182,7 +1182,7 @@ int pmd_free_pte_page(pmd_t *pmd, unsigned long addr) return 1; } -int __init arch_ioremap_p4d_supported(void) +bool arch_vmap_p4d_supported(pgprot_t prot) { - return 0; + return false; } diff --git a/arch/x86/include/asm/vmalloc.h b/arch/x86/include/asm/vmalloc.h index 29837740b52092..094ea2b565f37c 100644 --- a/arch/x86/include/asm/vmalloc.h +++ b/arch/x86/include/asm/vmalloc.h @@ -1,6 +1,13 @@ #ifndef _ASM_X86_VMALLOC_H #define _ASM_X86_VMALLOC_H +#include #include +#ifdef CONFIG_HAVE_ARCH_HUGE_VMAP +bool arch_vmap_p4d_supported(pgprot_t prot); +bool arch_vmap_pud_supported(pgprot_t prot); +bool arch_vmap_pmd_supported(pgprot_t prot); +#endif + #endif /* _ASM_X86_VMALLOC_H */ diff --git a/arch/x86/mm/ioremap.c b/arch/x86/mm/ioremap.c index 9e5ccc56f8e077..fbaf0c44798606 100644 --- a/arch/x86/mm/ioremap.c +++ b/arch/x86/mm/ioremap.c @@ -481,24 +481,26 @@ void iounmap(volatile void __iomem *addr) } EXPORT_SYMBOL(iounmap); -int __init arch_ioremap_p4d_supported(void) +#ifdef CONFIG_HAVE_ARCH_HUGE_VMAP +bool arch_vmap_p4d_supported(pgprot_t prot) { - return 0; + return false; } -int __init arch_ioremap_pud_supported(void) +bool arch_vmap_pud_supported(pgprot_t prot) { #ifdef CONFIG_X86_64 return boot_cpu_has(X86_FEATURE_GBPAGES); #else - return 0; + return false; #endif } -int __init arch_ioremap_pmd_supported(void) +bool arch_vmap_pmd_supported(pgprot_t prot) { return boot_cpu_has(X86_FEATURE_PSE); } +#endif /* * Convert a physical pointer to a virtual kernel pointer for /dev/mem diff --git a/include/linux/io.h b/include/linux/io.h index 61ff7d6278b694..9595151d800d67 100644 --- a/include/linux/io.h +++ b/include/linux/io.h @@ -31,15 +31,6 @@ static inline int ioremap_page_range(unsigned long addr, unsigned long end, } #endif -#ifdef CONFIG_HAVE_ARCH_HUGE_VMAP -void __init ioremap_huge_init(void); -int arch_ioremap_p4d_supported(void); -int arch_ioremap_pud_supported(void); -int arch_ioremap_pmd_supported(void); -#else -static inline void ioremap_huge_init(void) { } -#endif - /* * Managed iomap interface */ diff --git a/include/linux/vmalloc.h b/include/linux/vmalloc.h index 3de7be6dd17cd9..358c51c702c045 100644 --- a/include/linux/vmalloc.h +++ b/include/linux/vmalloc.h @@ -78,6 +78,12 @@ struct vmap_area { }; }; +#ifndef CONFIG_HAVE_ARCH_HUGE_VMAP +static inline bool arch_vmap_p4d_supported(pgprot_t prot) { return false; } +static inline bool arch_vmap_pud_supported(pgprot_t prot) { return false; } +static inline bool arch_vmap_pmd_supported(pgprot_t prot) { return false; } +#endif + /* * Highlevel APIs for driver use */ diff --git a/init/main.c b/init/main.c index f498aac26e8cbe..ae96c79ad2d3b6 100644 --- a/init/main.c +++ b/init/main.c @@ -837,7 +837,6 @@ static void __init mm_init(void) pgtable_init(); debug_objects_mem_init(); vmalloc_init(); - ioremap_huge_init(); /* Should be run before the first non-init thread is created */ init_espfix_bsp(); /* Should be run after espfix64 is set up. */ diff --git a/mm/debug_vm_pgtable.c b/mm/debug_vm_pgtable.c index a9bd6ce1ba02b3..05efe98a9ac2c8 100644 --- a/mm/debug_vm_pgtable.c +++ b/mm/debug_vm_pgtable.c @@ -247,7 +247,7 @@ static void __init pmd_huge_tests(pmd_t *pmdp, unsigned long pfn, pgprot_t prot) { pmd_t pmd; - if (!arch_ioremap_pmd_supported()) + if (!arch_vmap_pmd_supported(prot)) return; pr_debug("Validating PMD huge\n"); @@ -385,7 +385,7 @@ static void __init pud_huge_tests(pud_t *pudp, unsigned long pfn, pgprot_t prot) { pud_t pud; - if (!arch_ioremap_pud_supported()) + if (!arch_vmap_pud_supported(prot)) return; pr_debug("Validating PUD huge\n"); diff --git a/mm/ioremap.c b/mm/ioremap.c index 3f4d36f9745a78..3264d0203785d6 100644 --- a/mm/ioremap.c +++ b/mm/ioremap.c @@ -16,49 +16,16 @@ #include "pgalloc-track.h" #ifdef CONFIG_HAVE_ARCH_HUGE_VMAP -static int __read_mostly ioremap_p4d_capable; -static int __read_mostly ioremap_pud_capable; -static int __read_mostly ioremap_pmd_capable; -static int __read_mostly ioremap_huge_disabled; +static bool __ro_after_init iomap_max_page_shift = PAGE_SHIFT; static int __init set_nohugeiomap(char *str) { - ioremap_huge_disabled = 1; + iomap_max_page_shift = P4D_SHIFT; return 0; } early_param("nohugeiomap", set_nohugeiomap); - -void __init ioremap_huge_init(void) -{ - if (!ioremap_huge_disabled) { - if (arch_ioremap_p4d_supported()) - ioremap_p4d_capable = 1; - if (arch_ioremap_pud_supported()) - ioremap_pud_capable = 1; - if (arch_ioremap_pmd_supported()) - ioremap_pmd_capable = 1; - } -} - -static inline int ioremap_p4d_enabled(void) -{ - return ioremap_p4d_capable; -} - -static inline int ioremap_pud_enabled(void) -{ - return ioremap_pud_capable; -} - -static inline int ioremap_pmd_enabled(void) -{ - return ioremap_pmd_capable; -} - -#else /* !CONFIG_HAVE_ARCH_HUGE_VMAP */ -static inline int ioremap_p4d_enabled(void) { return 0; } -static inline int ioremap_pud_enabled(void) { return 0; } -static inline int ioremap_pmd_enabled(void) { return 0; } +#else /* CONFIG_HAVE_ARCH_HUGE_VMAP */ +static const bool iomap_max_page_shift = PAGE_SHIFT; #endif /* CONFIG_HAVE_ARCH_HUGE_VMAP */ static int vmap_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end, @@ -82,9 +49,13 @@ static int vmap_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end, } static int vmap_try_huge_pmd(pmd_t *pmd, unsigned long addr, unsigned long end, - phys_addr_t phys_addr, pgprot_t prot) + phys_addr_t phys_addr, pgprot_t prot, + unsigned int max_page_shift) { - if (!ioremap_pmd_enabled()) + if (max_page_shift < PMD_SHIFT) + return 0; + + if (!arch_vmap_pmd_supported(prot)) return 0; if ((end - addr) != PMD_SIZE) @@ -104,7 +75,7 @@ static int vmap_try_huge_pmd(pmd_t *pmd, unsigned long addr, unsigned long end, static int vmap_pmd_range(pud_t *pud, unsigned long addr, unsigned long end, phys_addr_t phys_addr, pgprot_t prot, - pgtbl_mod_mask *mask) + unsigned int max_page_shift, pgtbl_mod_mask *mask) { pmd_t *pmd; unsigned long next; @@ -115,7 +86,8 @@ static int vmap_pmd_range(pud_t *pud, unsigned long addr, unsigned long end, do { next = pmd_addr_end(addr, end); - if (vmap_try_huge_pmd(pmd, addr, next, phys_addr, prot)) { + if (vmap_try_huge_pmd(pmd, addr, next, phys_addr, prot, + max_page_shift)) { *mask |= PGTBL_PMD_MODIFIED; continue; } @@ -127,9 +99,13 @@ static int vmap_pmd_range(pud_t *pud, unsigned long addr, unsigned long end, } static int vmap_try_huge_pud(pud_t *pud, unsigned long addr, unsigned long end, - phys_addr_t phys_addr, pgprot_t prot) + phys_addr_t phys_addr, pgprot_t prot, + unsigned int max_page_shift) { - if (!ioremap_pud_enabled()) + if (max_page_shift < PUD_SHIFT) + return 0; + + if (!arch_vmap_pud_supported(prot)) return 0; if ((end - addr) != PUD_SIZE) @@ -149,7 +125,7 @@ static int vmap_try_huge_pud(pud_t *pud, unsigned long addr, unsigned long end, static int vmap_pud_range(p4d_t *p4d, unsigned long addr, unsigned long end, phys_addr_t phys_addr, pgprot_t prot, - pgtbl_mod_mask *mask) + unsigned int max_page_shift, pgtbl_mod_mask *mask) { pud_t *pud; unsigned long next; @@ -160,21 +136,27 @@ static int vmap_pud_range(p4d_t *p4d, unsigned long addr, unsigned long end, do { next = pud_addr_end(addr, end); - if (vmap_try_huge_pud(pud, addr, next, phys_addr, prot)) { + if (vmap_try_huge_pud(pud, addr, next, phys_addr, prot, + max_page_shift)) { *mask |= PGTBL_PUD_MODIFIED; continue; } - if (vmap_pmd_range(pud, addr, next, phys_addr, prot, mask)) + if (vmap_pmd_range(pud, addr, next, phys_addr, prot, + max_page_shift, mask)) return -ENOMEM; } while (pud++, phys_addr += (next - addr), addr = next, addr != end); return 0; } static int vmap_try_huge_p4d(p4d_t *p4d, unsigned long addr, unsigned long end, - phys_addr_t phys_addr, pgprot_t prot) + phys_addr_t phys_addr, pgprot_t prot, + unsigned int max_page_shift) { - if (!ioremap_p4d_enabled()) + if (max_page_shift < P4D_SHIFT) + return 0; + + if (!arch_vmap_p4d_supported(prot)) return 0; if ((end - addr) != P4D_SIZE) @@ -194,7 +176,7 @@ static int vmap_try_huge_p4d(p4d_t *p4d, unsigned long addr, unsigned long end, static int vmap_p4d_range(pgd_t *pgd, unsigned long addr, unsigned long end, phys_addr_t phys_addr, pgprot_t prot, - pgtbl_mod_mask *mask) + unsigned int max_page_shift, pgtbl_mod_mask *mask) { p4d_t *p4d; unsigned long next; @@ -205,19 +187,22 @@ static int vmap_p4d_range(pgd_t *pgd, unsigned long addr, unsigned long end, do { next = p4d_addr_end(addr, end); - if (vmap_try_huge_p4d(p4d, addr, next, phys_addr, prot)) { + if (vmap_try_huge_p4d(p4d, addr, next, phys_addr, prot, + max_page_shift)) { *mask |= PGTBL_P4D_MODIFIED; continue; } - if (vmap_pud_range(p4d, addr, next, phys_addr, prot, mask)) + if (vmap_pud_range(p4d, addr, next, phys_addr, prot, + max_page_shift, mask)) return -ENOMEM; } while (p4d++, phys_addr += (next - addr), addr = next, addr != end); return 0; } static int vmap_range(unsigned long addr, unsigned long end, - phys_addr_t phys_addr, pgprot_t prot) + phys_addr_t phys_addr, pgprot_t prot, + unsigned int max_page_shift) { pgd_t *pgd; unsigned long start; @@ -232,7 +217,8 @@ static int vmap_range(unsigned long addr, unsigned long end, pgd = pgd_offset_k(addr); do { next = pgd_addr_end(addr, end); - err = vmap_p4d_range(pgd, addr, next, phys_addr, prot, &mask); + err = vmap_p4d_range(pgd, addr, next, phys_addr, prot, + max_page_shift, &mask); if (err) break; } while (pgd++, phys_addr += (next - addr), addr = next, addr != end); @@ -248,7 +234,7 @@ static int vmap_range(unsigned long addr, unsigned long end, int ioremap_page_range(unsigned long addr, unsigned long end, phys_addr_t phys_addr, pgprot_t prot) { - return vmap_range(addr, end, phys_addr, prot); + return vmap_range(addr, end, phys_addr, prot, iomap_max_page_shift); } #ifdef CONFIG_GENERIC_IOREMAP From 8309c9d717024660185fab3c96705a9d7ed0d842 Mon Sep 17 00:00:00 2001 From: Nicholas Piggin Date: Thu, 29 Apr 2021 22:58:30 -0700 Subject: [PATCH 101/175] powerpc: inline huge vmap supported functions This allows unsupported levels to be constant folded away, and so p4d_free_pud_page can be removed because it's no longer linked to. Link: https://lkml.kernel.org/r/20210317062402.533919-8-npiggin@gmail.com Signed-off-by: Nicholas Piggin Acked-by: Michael Ellerman Cc: Borislav Petkov Cc: Catalin Marinas Cc: Christoph Hellwig Cc: Ding Tianhong Cc: "H. Peter Anvin" Cc: Ingo Molnar Cc: Miaohe Lin Cc: Russell King Cc: Thomas Gleixner Cc: Uladzislau Rezki (Sony) Cc: Will Deacon Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/powerpc/include/asm/vmalloc.h | 19 ++++++++++++++++--- arch/powerpc/mm/book3s64/radix_pgtable.c | 21 --------------------- 2 files changed, 16 insertions(+), 24 deletions(-) diff --git a/arch/powerpc/include/asm/vmalloc.h b/arch/powerpc/include/asm/vmalloc.h index 105abb73f075cf..3f0c153befb01d 100644 --- a/arch/powerpc/include/asm/vmalloc.h +++ b/arch/powerpc/include/asm/vmalloc.h @@ -1,12 +1,25 @@ #ifndef _ASM_POWERPC_VMALLOC_H #define _ASM_POWERPC_VMALLOC_H +#include #include #ifdef CONFIG_HAVE_ARCH_HUGE_VMAP -bool arch_vmap_p4d_supported(pgprot_t prot); -bool arch_vmap_pud_supported(pgprot_t prot); -bool arch_vmap_pmd_supported(pgprot_t prot); +static inline bool arch_vmap_p4d_supported(pgprot_t prot) +{ + return false; +} + +static inline bool arch_vmap_pud_supported(pgprot_t prot) +{ + /* HPT does not cope with large pages in the vmalloc area */ + return radix_enabled(); +} + +static inline bool arch_vmap_pmd_supported(pgprot_t prot) +{ + return radix_enabled(); +} #endif #endif /* _ASM_POWERPC_VMALLOC_H */ diff --git a/arch/powerpc/mm/book3s64/radix_pgtable.c b/arch/powerpc/mm/book3s64/radix_pgtable.c index 743807fc210f3a..8da62afccee5a9 100644 --- a/arch/powerpc/mm/book3s64/radix_pgtable.c +++ b/arch/powerpc/mm/book3s64/radix_pgtable.c @@ -1082,22 +1082,6 @@ void radix__ptep_modify_prot_commit(struct vm_area_struct *vma, set_pte_at(mm, addr, ptep, pte); } -bool arch_vmap_pud_supported(pgprot_t prot) -{ - /* HPT does not cope with large pages in the vmalloc area */ - return radix_enabled(); -} - -bool arch_vmap_pmd_supported(pgprot_t prot) -{ - return radix_enabled(); -} - -int p4d_free_pud_page(p4d_t *p4d, unsigned long addr) -{ - return 0; -} - int pud_set_huge(pud_t *pud, phys_addr_t addr, pgprot_t prot) { pte_t *ptep = (pte_t *)pud; @@ -1181,8 +1165,3 @@ int pmd_free_pte_page(pmd_t *pmd, unsigned long addr) return 1; } - -bool arch_vmap_p4d_supported(pgprot_t prot) -{ - return false; -} From 168a6333142bfa6dfb1f114110465760828bc6a3 Mon Sep 17 00:00:00 2001 From: Nicholas Piggin Date: Thu, 29 Apr 2021 22:58:33 -0700 Subject: [PATCH 102/175] arm64: inline huge vmap supported functions This allows unsupported levels to be constant folded away, and so p4d_free_pud_page can be removed because it's no longer linked to. Link: https://lkml.kernel.org/r/20210317062402.533919-9-npiggin@gmail.com Signed-off-by: Nicholas Piggin Acked-by: Catalin Marinas Cc: Will Deacon Cc: Borislav Petkov Cc: Christoph Hellwig Cc: Ding Tianhong Cc: "H. Peter Anvin" Cc: Ingo Molnar Cc: Miaohe Lin Cc: Michael Ellerman Cc: Russell King Cc: Thomas Gleixner Cc: Uladzislau Rezki (Sony) Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/arm64/include/asm/vmalloc.h | 23 ++++++++++++++++++++--- arch/arm64/mm/mmu.c | 26 -------------------------- 2 files changed, 20 insertions(+), 29 deletions(-) diff --git a/arch/arm64/include/asm/vmalloc.h b/arch/arm64/include/asm/vmalloc.h index 597b40405319d8..fc9a12d6cc1ac6 100644 --- a/arch/arm64/include/asm/vmalloc.h +++ b/arch/arm64/include/asm/vmalloc.h @@ -4,9 +4,26 @@ #include #ifdef CONFIG_HAVE_ARCH_HUGE_VMAP -bool arch_vmap_p4d_supported(pgprot_t prot); -bool arch_vmap_pud_supported(pgprot_t prot); -bool arch_vmap_pmd_supported(pgprot_t prot); +static inline bool arch_vmap_p4d_supported(pgprot_t prot) +{ + return false; +} + +static inline bool arch_vmap_pud_supported(pgprot_t prot) +{ + /* + * Only 4k granule supports level 1 block mappings. + * SW table walks can't handle removal of intermediate entries. + */ + return IS_ENABLED(CONFIG_ARM64_4K_PAGES) && + !IS_ENABLED(CONFIG_PTDUMP_DEBUGFS); +} + +static inline bool arch_vmap_pmd_supported(pgprot_t prot) +{ + /* See arch_vmap_pud_supported() */ + return !IS_ENABLED(CONFIG_PTDUMP_DEBUGFS); +} #endif #endif /* _ASM_ARM64_VMALLOC_H */ diff --git a/arch/arm64/mm/mmu.c b/arch/arm64/mm/mmu.c index 8436e0755361ab..70fa3cdbe841eb 100644 --- a/arch/arm64/mm/mmu.c +++ b/arch/arm64/mm/mmu.c @@ -1339,27 +1339,6 @@ void *__init fixmap_remap_fdt(phys_addr_t dt_phys, int *size, pgprot_t prot) return dt_virt; } -bool arch_vmap_p4d_supported(pgprot_t prot) -{ - return false; -} - -bool arch_vmap_pud_supported(pgprot_t prot) -{ - /* - * Only 4k granule supports level 1 block mappings. - * SW table walks can't handle removal of intermediate entries. - */ - return IS_ENABLED(CONFIG_ARM64_4K_PAGES) && - !IS_ENABLED(CONFIG_PTDUMP_DEBUGFS); -} - -bool arch_vmap_pmd_supported(pgprot_t prot) -{ - /* See arch_vmap_pud_supported() */ - return !IS_ENABLED(CONFIG_PTDUMP_DEBUGFS); -} - int pud_set_huge(pud_t *pudp, phys_addr_t phys, pgprot_t prot) { pud_t new_pud = pfn_pud(__phys_to_pfn(phys), mk_pud_sect_prot(prot)); @@ -1451,11 +1430,6 @@ int pud_free_pmd_page(pud_t *pudp, unsigned long addr) return 1; } -int p4d_free_pud_page(p4d_t *p4d, unsigned long addr) -{ - return 0; /* Don't attempt a block mapping */ -} - #ifdef CONFIG_MEMORY_HOTPLUG static void __remove_pgd_mapping(pgd_t *pgdir, unsigned long start, u64 size) { From 97dc2a1548ab0dc320ce3618b73b3f9dc732b6ee Mon Sep 17 00:00:00 2001 From: Nicholas Piggin Date: Thu, 29 Apr 2021 22:58:36 -0700 Subject: [PATCH 103/175] x86: inline huge vmap supported functions This allows unsupported levels to be constant folded away, and so p4d_free_pud_page can be removed because it's no longer linked to. Link: https://lkml.kernel.org/r/20210317062402.533919-10-npiggin@gmail.com Signed-off-by: Nicholas Piggin Cc: Thomas Gleixner Cc: Ingo Molnar Cc: Borislav Petkov Cc: "H. Peter Anvin" Cc: Catalin Marinas Cc: Christoph Hellwig Cc: Ding Tianhong Cc: Miaohe Lin Cc: Michael Ellerman Cc: Russell King Cc: Uladzislau Rezki (Sony) Cc: Will Deacon Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/x86/include/asm/vmalloc.h | 22 +++++++++++++++++++--- arch/x86/mm/ioremap.c | 21 --------------------- arch/x86/mm/pgtable.c | 13 ------------- 3 files changed, 19 insertions(+), 37 deletions(-) diff --git a/arch/x86/include/asm/vmalloc.h b/arch/x86/include/asm/vmalloc.h index 094ea2b565f37c..e714b00fc0cac9 100644 --- a/arch/x86/include/asm/vmalloc.h +++ b/arch/x86/include/asm/vmalloc.h @@ -1,13 +1,29 @@ #ifndef _ASM_X86_VMALLOC_H #define _ASM_X86_VMALLOC_H +#include #include #include #ifdef CONFIG_HAVE_ARCH_HUGE_VMAP -bool arch_vmap_p4d_supported(pgprot_t prot); -bool arch_vmap_pud_supported(pgprot_t prot); -bool arch_vmap_pmd_supported(pgprot_t prot); +static inline bool arch_vmap_p4d_supported(pgprot_t prot) +{ + return false; +} + +static inline bool arch_vmap_pud_supported(pgprot_t prot) +{ +#ifdef CONFIG_X86_64 + return boot_cpu_has(X86_FEATURE_GBPAGES); +#else + return false; +#endif +} + +static inline bool arch_vmap_pmd_supported(pgprot_t prot) +{ + return boot_cpu_has(X86_FEATURE_PSE); +} #endif #endif /* _ASM_X86_VMALLOC_H */ diff --git a/arch/x86/mm/ioremap.c b/arch/x86/mm/ioremap.c index fbaf0c44798606..12c686c65ea996 100644 --- a/arch/x86/mm/ioremap.c +++ b/arch/x86/mm/ioremap.c @@ -481,27 +481,6 @@ void iounmap(volatile void __iomem *addr) } EXPORT_SYMBOL(iounmap); -#ifdef CONFIG_HAVE_ARCH_HUGE_VMAP -bool arch_vmap_p4d_supported(pgprot_t prot) -{ - return false; -} - -bool arch_vmap_pud_supported(pgprot_t prot) -{ -#ifdef CONFIG_X86_64 - return boot_cpu_has(X86_FEATURE_GBPAGES); -#else - return false; -#endif -} - -bool arch_vmap_pmd_supported(pgprot_t prot) -{ - return boot_cpu_has(X86_FEATURE_PSE); -} -#endif - /* * Convert a physical pointer to a virtual kernel pointer for /dev/mem * access diff --git a/arch/x86/mm/pgtable.c b/arch/x86/mm/pgtable.c index f6a9e2e3664259..d27cf69e811d59 100644 --- a/arch/x86/mm/pgtable.c +++ b/arch/x86/mm/pgtable.c @@ -780,14 +780,6 @@ int pmd_clear_huge(pmd_t *pmd) return 0; } -/* - * Until we support 512GB pages, skip them in the vmap area. - */ -int p4d_free_pud_page(p4d_t *p4d, unsigned long addr) -{ - return 0; -} - #ifdef CONFIG_X86_64 /** * pud_free_pmd_page - Clear pud entry and free pmd page. @@ -861,11 +853,6 @@ int pmd_free_pte_page(pmd_t *pmd, unsigned long addr) #else /* !CONFIG_X86_64 */ -int pud_free_pmd_page(pud_t *pud, unsigned long addr) -{ - return pud_none(*pud); -} - /* * Disable free page handling on x86-PAE. This assures that ioremap() * does not update sync'd pmd entries. See vmalloc_sync_one(). From 6f680e70b6ff58c9670769534196800233685d55 Mon Sep 17 00:00:00 2001 From: Nicholas Piggin Date: Thu, 29 Apr 2021 22:58:39 -0700 Subject: [PATCH 104/175] mm/vmalloc: provide fallback arch huge vmap support functions If an architecture doesn't support a particular page table level as a huge vmap page size then allow it to skip defining the support query function. Link: https://lkml.kernel.org/r/20210317062402.533919-11-npiggin@gmail.com Signed-off-by: Nicholas Piggin Suggested-by: Christoph Hellwig Cc: Borislav Petkov Cc: Catalin Marinas Cc: Ding Tianhong Cc: "H. Peter Anvin" Cc: Ingo Molnar Cc: Miaohe Lin Cc: Michael Ellerman Cc: Russell King Cc: Thomas Gleixner Cc: Uladzislau Rezki (Sony) Cc: Will Deacon Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/arm64/include/asm/vmalloc.h | 7 +++---- arch/powerpc/include/asm/vmalloc.h | 7 +++---- arch/x86/include/asm/vmalloc.h | 13 +++++-------- include/linux/vmalloc.h | 24 ++++++++++++++++++++---- 4 files changed, 31 insertions(+), 20 deletions(-) diff --git a/arch/arm64/include/asm/vmalloc.h b/arch/arm64/include/asm/vmalloc.h index fc9a12d6cc1ac6..7a22aeea9bb586 100644 --- a/arch/arm64/include/asm/vmalloc.h +++ b/arch/arm64/include/asm/vmalloc.h @@ -4,11 +4,8 @@ #include #ifdef CONFIG_HAVE_ARCH_HUGE_VMAP -static inline bool arch_vmap_p4d_supported(pgprot_t prot) -{ - return false; -} +#define arch_vmap_pud_supported arch_vmap_pud_supported static inline bool arch_vmap_pud_supported(pgprot_t prot) { /* @@ -19,11 +16,13 @@ static inline bool arch_vmap_pud_supported(pgprot_t prot) !IS_ENABLED(CONFIG_PTDUMP_DEBUGFS); } +#define arch_vmap_pmd_supported arch_vmap_pmd_supported static inline bool arch_vmap_pmd_supported(pgprot_t prot) { /* See arch_vmap_pud_supported() */ return !IS_ENABLED(CONFIG_PTDUMP_DEBUGFS); } + #endif #endif /* _ASM_ARM64_VMALLOC_H */ diff --git a/arch/powerpc/include/asm/vmalloc.h b/arch/powerpc/include/asm/vmalloc.h index 3f0c153befb01d..4c69ece52a31e5 100644 --- a/arch/powerpc/include/asm/vmalloc.h +++ b/arch/powerpc/include/asm/vmalloc.h @@ -5,21 +5,20 @@ #include #ifdef CONFIG_HAVE_ARCH_HUGE_VMAP -static inline bool arch_vmap_p4d_supported(pgprot_t prot) -{ - return false; -} +#define arch_vmap_pud_supported arch_vmap_pud_supported static inline bool arch_vmap_pud_supported(pgprot_t prot) { /* HPT does not cope with large pages in the vmalloc area */ return radix_enabled(); } +#define arch_vmap_pmd_supported arch_vmap_pmd_supported static inline bool arch_vmap_pmd_supported(pgprot_t prot) { return radix_enabled(); } + #endif #endif /* _ASM_POWERPC_VMALLOC_H */ diff --git a/arch/x86/include/asm/vmalloc.h b/arch/x86/include/asm/vmalloc.h index e714b00fc0cac9..49ce331f3ac6b0 100644 --- a/arch/x86/include/asm/vmalloc.h +++ b/arch/x86/include/asm/vmalloc.h @@ -6,24 +6,21 @@ #include #ifdef CONFIG_HAVE_ARCH_HUGE_VMAP -static inline bool arch_vmap_p4d_supported(pgprot_t prot) -{ - return false; -} +#ifdef CONFIG_X86_64 +#define arch_vmap_pud_supported arch_vmap_pud_supported static inline bool arch_vmap_pud_supported(pgprot_t prot) { -#ifdef CONFIG_X86_64 return boot_cpu_has(X86_FEATURE_GBPAGES); -#else - return false; -#endif } +#endif +#define arch_vmap_pmd_supported arch_vmap_pmd_supported static inline bool arch_vmap_pmd_supported(pgprot_t prot) { return boot_cpu_has(X86_FEATURE_PSE); } + #endif #endif /* _ASM_X86_VMALLOC_H */ diff --git a/include/linux/vmalloc.h b/include/linux/vmalloc.h index 358c51c702c045..eb5630be67833a 100644 --- a/include/linux/vmalloc.h +++ b/include/linux/vmalloc.h @@ -78,10 +78,26 @@ struct vmap_area { }; }; -#ifndef CONFIG_HAVE_ARCH_HUGE_VMAP -static inline bool arch_vmap_p4d_supported(pgprot_t prot) { return false; } -static inline bool arch_vmap_pud_supported(pgprot_t prot) { return false; } -static inline bool arch_vmap_pmd_supported(pgprot_t prot) { return false; } +/* archs that select HAVE_ARCH_HUGE_VMAP should override one or more of these */ +#ifndef arch_vmap_p4d_supported +static inline bool arch_vmap_p4d_supported(pgprot_t prot) +{ + return false; +} +#endif + +#ifndef arch_vmap_pud_supported +static inline bool arch_vmap_pud_supported(pgprot_t prot) +{ + return false; +} +#endif + +#ifndef arch_vmap_pmd_supported +static inline bool arch_vmap_pmd_supported(pgprot_t prot) +{ + return false; +} #endif /* From 5e9e3d777b99aabe2f91f793a52e870a02642160 Mon Sep 17 00:00:00 2001 From: Nicholas Piggin Date: Thu, 29 Apr 2021 22:58:43 -0700 Subject: [PATCH 105/175] mm: move vmap_range from mm/ioremap.c to mm/vmalloc.c This is a generic kernel virtual memory mapper, not specific to ioremap. Code is unchanged other than making vmap_range non-static. Link: https://lkml.kernel.org/r/20210317062402.533919-12-npiggin@gmail.com Signed-off-by: Nicholas Piggin Reviewed-by: Christoph Hellwig Cc: Borislav Petkov Cc: Catalin Marinas Cc: Ding Tianhong Cc: "H. Peter Anvin" Cc: Ingo Molnar Cc: Miaohe Lin Cc: Michael Ellerman Cc: Russell King Cc: Thomas Gleixner Cc: Uladzislau Rezki (Sony) Cc: Will Deacon Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/vmalloc.h | 3 + mm/ioremap.c | 203 ---------------------------------------- mm/vmalloc.c | 202 +++++++++++++++++++++++++++++++++++++++ 3 files changed, 205 insertions(+), 203 deletions(-) diff --git a/include/linux/vmalloc.h b/include/linux/vmalloc.h index eb5630be67833a..ae9eb07d30d47a 100644 --- a/include/linux/vmalloc.h +++ b/include/linux/vmalloc.h @@ -189,6 +189,9 @@ extern struct vm_struct *remove_vm_area(const void *addr); extern struct vm_struct *find_vm_area(const void *addr); #ifdef CONFIG_MMU +int vmap_range(unsigned long addr, unsigned long end, + phys_addr_t phys_addr, pgprot_t prot, + unsigned int max_page_shift); extern int map_kernel_range_noflush(unsigned long start, unsigned long size, pgprot_t prot, struct page **pages); int map_kernel_range(unsigned long start, unsigned long size, pgprot_t prot, diff --git a/mm/ioremap.c b/mm/ioremap.c index 3264d0203785d6..d1dcc7e744acfa 100644 --- a/mm/ioremap.c +++ b/mm/ioremap.c @@ -28,209 +28,6 @@ early_param("nohugeiomap", set_nohugeiomap); static const bool iomap_max_page_shift = PAGE_SHIFT; #endif /* CONFIG_HAVE_ARCH_HUGE_VMAP */ -static int vmap_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end, - phys_addr_t phys_addr, pgprot_t prot, - pgtbl_mod_mask *mask) -{ - pte_t *pte; - u64 pfn; - - pfn = phys_addr >> PAGE_SHIFT; - pte = pte_alloc_kernel_track(pmd, addr, mask); - if (!pte) - return -ENOMEM; - do { - BUG_ON(!pte_none(*pte)); - set_pte_at(&init_mm, addr, pte, pfn_pte(pfn, prot)); - pfn++; - } while (pte++, addr += PAGE_SIZE, addr != end); - *mask |= PGTBL_PTE_MODIFIED; - return 0; -} - -static int vmap_try_huge_pmd(pmd_t *pmd, unsigned long addr, unsigned long end, - phys_addr_t phys_addr, pgprot_t prot, - unsigned int max_page_shift) -{ - if (max_page_shift < PMD_SHIFT) - return 0; - - if (!arch_vmap_pmd_supported(prot)) - return 0; - - if ((end - addr) != PMD_SIZE) - return 0; - - if (!IS_ALIGNED(addr, PMD_SIZE)) - return 0; - - if (!IS_ALIGNED(phys_addr, PMD_SIZE)) - return 0; - - if (pmd_present(*pmd) && !pmd_free_pte_page(pmd, addr)) - return 0; - - return pmd_set_huge(pmd, phys_addr, prot); -} - -static int vmap_pmd_range(pud_t *pud, unsigned long addr, unsigned long end, - phys_addr_t phys_addr, pgprot_t prot, - unsigned int max_page_shift, pgtbl_mod_mask *mask) -{ - pmd_t *pmd; - unsigned long next; - - pmd = pmd_alloc_track(&init_mm, pud, addr, mask); - if (!pmd) - return -ENOMEM; - do { - next = pmd_addr_end(addr, end); - - if (vmap_try_huge_pmd(pmd, addr, next, phys_addr, prot, - max_page_shift)) { - *mask |= PGTBL_PMD_MODIFIED; - continue; - } - - if (vmap_pte_range(pmd, addr, next, phys_addr, prot, mask)) - return -ENOMEM; - } while (pmd++, phys_addr += (next - addr), addr = next, addr != end); - return 0; -} - -static int vmap_try_huge_pud(pud_t *pud, unsigned long addr, unsigned long end, - phys_addr_t phys_addr, pgprot_t prot, - unsigned int max_page_shift) -{ - if (max_page_shift < PUD_SHIFT) - return 0; - - if (!arch_vmap_pud_supported(prot)) - return 0; - - if ((end - addr) != PUD_SIZE) - return 0; - - if (!IS_ALIGNED(addr, PUD_SIZE)) - return 0; - - if (!IS_ALIGNED(phys_addr, PUD_SIZE)) - return 0; - - if (pud_present(*pud) && !pud_free_pmd_page(pud, addr)) - return 0; - - return pud_set_huge(pud, phys_addr, prot); -} - -static int vmap_pud_range(p4d_t *p4d, unsigned long addr, unsigned long end, - phys_addr_t phys_addr, pgprot_t prot, - unsigned int max_page_shift, pgtbl_mod_mask *mask) -{ - pud_t *pud; - unsigned long next; - - pud = pud_alloc_track(&init_mm, p4d, addr, mask); - if (!pud) - return -ENOMEM; - do { - next = pud_addr_end(addr, end); - - if (vmap_try_huge_pud(pud, addr, next, phys_addr, prot, - max_page_shift)) { - *mask |= PGTBL_PUD_MODIFIED; - continue; - } - - if (vmap_pmd_range(pud, addr, next, phys_addr, prot, - max_page_shift, mask)) - return -ENOMEM; - } while (pud++, phys_addr += (next - addr), addr = next, addr != end); - return 0; -} - -static int vmap_try_huge_p4d(p4d_t *p4d, unsigned long addr, unsigned long end, - phys_addr_t phys_addr, pgprot_t prot, - unsigned int max_page_shift) -{ - if (max_page_shift < P4D_SHIFT) - return 0; - - if (!arch_vmap_p4d_supported(prot)) - return 0; - - if ((end - addr) != P4D_SIZE) - return 0; - - if (!IS_ALIGNED(addr, P4D_SIZE)) - return 0; - - if (!IS_ALIGNED(phys_addr, P4D_SIZE)) - return 0; - - if (p4d_present(*p4d) && !p4d_free_pud_page(p4d, addr)) - return 0; - - return p4d_set_huge(p4d, phys_addr, prot); -} - -static int vmap_p4d_range(pgd_t *pgd, unsigned long addr, unsigned long end, - phys_addr_t phys_addr, pgprot_t prot, - unsigned int max_page_shift, pgtbl_mod_mask *mask) -{ - p4d_t *p4d; - unsigned long next; - - p4d = p4d_alloc_track(&init_mm, pgd, addr, mask); - if (!p4d) - return -ENOMEM; - do { - next = p4d_addr_end(addr, end); - - if (vmap_try_huge_p4d(p4d, addr, next, phys_addr, prot, - max_page_shift)) { - *mask |= PGTBL_P4D_MODIFIED; - continue; - } - - if (vmap_pud_range(p4d, addr, next, phys_addr, prot, - max_page_shift, mask)) - return -ENOMEM; - } while (p4d++, phys_addr += (next - addr), addr = next, addr != end); - return 0; -} - -static int vmap_range(unsigned long addr, unsigned long end, - phys_addr_t phys_addr, pgprot_t prot, - unsigned int max_page_shift) -{ - pgd_t *pgd; - unsigned long start; - unsigned long next; - int err; - pgtbl_mod_mask mask = 0; - - might_sleep(); - BUG_ON(addr >= end); - - start = addr; - pgd = pgd_offset_k(addr); - do { - next = pgd_addr_end(addr, end); - err = vmap_p4d_range(pgd, addr, next, phys_addr, prot, - max_page_shift, &mask); - if (err) - break; - } while (pgd++, phys_addr += (next - addr), addr = next, addr != end); - - flush_cache_vmap(start, end); - - if (mask & ARCH_PAGE_TABLE_SYNC_MASK) - arch_sync_kernel_mappings(start, end); - - return err; -} - int ioremap_page_range(unsigned long addr, unsigned long end, phys_addr_t phys_addr, pgprot_t prot) { diff --git a/mm/vmalloc.c b/mm/vmalloc.c index ce4066b469559c..5c81717f7e0e0a 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c @@ -68,6 +68,208 @@ static void free_work(struct work_struct *w) } /*** Page table manipulation functions ***/ +static int vmap_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end, + phys_addr_t phys_addr, pgprot_t prot, + pgtbl_mod_mask *mask) +{ + pte_t *pte; + u64 pfn; + + pfn = phys_addr >> PAGE_SHIFT; + pte = pte_alloc_kernel_track(pmd, addr, mask); + if (!pte) + return -ENOMEM; + do { + BUG_ON(!pte_none(*pte)); + set_pte_at(&init_mm, addr, pte, pfn_pte(pfn, prot)); + pfn++; + } while (pte++, addr += PAGE_SIZE, addr != end); + *mask |= PGTBL_PTE_MODIFIED; + return 0; +} + +static int vmap_try_huge_pmd(pmd_t *pmd, unsigned long addr, unsigned long end, + phys_addr_t phys_addr, pgprot_t prot, + unsigned int max_page_shift) +{ + if (max_page_shift < PMD_SHIFT) + return 0; + + if (!arch_vmap_pmd_supported(prot)) + return 0; + + if ((end - addr) != PMD_SIZE) + return 0; + + if (!IS_ALIGNED(addr, PMD_SIZE)) + return 0; + + if (!IS_ALIGNED(phys_addr, PMD_SIZE)) + return 0; + + if (pmd_present(*pmd) && !pmd_free_pte_page(pmd, addr)) + return 0; + + return pmd_set_huge(pmd, phys_addr, prot); +} + +static int vmap_pmd_range(pud_t *pud, unsigned long addr, unsigned long end, + phys_addr_t phys_addr, pgprot_t prot, + unsigned int max_page_shift, pgtbl_mod_mask *mask) +{ + pmd_t *pmd; + unsigned long next; + + pmd = pmd_alloc_track(&init_mm, pud, addr, mask); + if (!pmd) + return -ENOMEM; + do { + next = pmd_addr_end(addr, end); + + if (vmap_try_huge_pmd(pmd, addr, next, phys_addr, prot, + max_page_shift)) { + *mask |= PGTBL_PMD_MODIFIED; + continue; + } + + if (vmap_pte_range(pmd, addr, next, phys_addr, prot, mask)) + return -ENOMEM; + } while (pmd++, phys_addr += (next - addr), addr = next, addr != end); + return 0; +} + +static int vmap_try_huge_pud(pud_t *pud, unsigned long addr, unsigned long end, + phys_addr_t phys_addr, pgprot_t prot, + unsigned int max_page_shift) +{ + if (max_page_shift < PUD_SHIFT) + return 0; + + if (!arch_vmap_pud_supported(prot)) + return 0; + + if ((end - addr) != PUD_SIZE) + return 0; + + if (!IS_ALIGNED(addr, PUD_SIZE)) + return 0; + + if (!IS_ALIGNED(phys_addr, PUD_SIZE)) + return 0; + + if (pud_present(*pud) && !pud_free_pmd_page(pud, addr)) + return 0; + + return pud_set_huge(pud, phys_addr, prot); +} + +static int vmap_pud_range(p4d_t *p4d, unsigned long addr, unsigned long end, + phys_addr_t phys_addr, pgprot_t prot, + unsigned int max_page_shift, pgtbl_mod_mask *mask) +{ + pud_t *pud; + unsigned long next; + + pud = pud_alloc_track(&init_mm, p4d, addr, mask); + if (!pud) + return -ENOMEM; + do { + next = pud_addr_end(addr, end); + + if (vmap_try_huge_pud(pud, addr, next, phys_addr, prot, + max_page_shift)) { + *mask |= PGTBL_PUD_MODIFIED; + continue; + } + + if (vmap_pmd_range(pud, addr, next, phys_addr, prot, + max_page_shift, mask)) + return -ENOMEM; + } while (pud++, phys_addr += (next - addr), addr = next, addr != end); + return 0; +} + +static int vmap_try_huge_p4d(p4d_t *p4d, unsigned long addr, unsigned long end, + phys_addr_t phys_addr, pgprot_t prot, + unsigned int max_page_shift) +{ + if (max_page_shift < P4D_SHIFT) + return 0; + + if (!arch_vmap_p4d_supported(prot)) + return 0; + + if ((end - addr) != P4D_SIZE) + return 0; + + if (!IS_ALIGNED(addr, P4D_SIZE)) + return 0; + + if (!IS_ALIGNED(phys_addr, P4D_SIZE)) + return 0; + + if (p4d_present(*p4d) && !p4d_free_pud_page(p4d, addr)) + return 0; + + return p4d_set_huge(p4d, phys_addr, prot); +} + +static int vmap_p4d_range(pgd_t *pgd, unsigned long addr, unsigned long end, + phys_addr_t phys_addr, pgprot_t prot, + unsigned int max_page_shift, pgtbl_mod_mask *mask) +{ + p4d_t *p4d; + unsigned long next; + + p4d = p4d_alloc_track(&init_mm, pgd, addr, mask); + if (!p4d) + return -ENOMEM; + do { + next = p4d_addr_end(addr, end); + + if (vmap_try_huge_p4d(p4d, addr, next, phys_addr, prot, + max_page_shift)) { + *mask |= PGTBL_P4D_MODIFIED; + continue; + } + + if (vmap_pud_range(p4d, addr, next, phys_addr, prot, + max_page_shift, mask)) + return -ENOMEM; + } while (p4d++, phys_addr += (next - addr), addr = next, addr != end); + return 0; +} + +int vmap_range(unsigned long addr, unsigned long end, + phys_addr_t phys_addr, pgprot_t prot, + unsigned int max_page_shift) +{ + pgd_t *pgd; + unsigned long start; + unsigned long next; + int err; + pgtbl_mod_mask mask = 0; + + might_sleep(); + BUG_ON(addr >= end); + + start = addr; + pgd = pgd_offset_k(addr); + do { + next = pgd_addr_end(addr, end); + err = vmap_p4d_range(pgd, addr, next, phys_addr, prot, + max_page_shift, &mask); + if (err) + break; + } while (pgd++, phys_addr += (next - addr), addr = next, addr != end); + + flush_cache_vmap(start, end); + + if (mask & ARCH_PAGE_TABLE_SYNC_MASK) + arch_sync_kernel_mappings(start, end); + + return err; +} static void vunmap_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end, pgtbl_mod_mask *mask) From 5d87510de15f31d1b26cffced7bc4d504539a2c7 Mon Sep 17 00:00:00 2001 From: Nicholas Piggin Date: Thu, 29 Apr 2021 22:58:46 -0700 Subject: [PATCH 106/175] mm/vmalloc: add vmap_range_noflush variant As a side-effect, the order of flush_cache_vmap() and arch_sync_kernel_mappings() calls are switched, but that now matches the other callers in this file. Link: https://lkml.kernel.org/r/20210317062402.533919-13-npiggin@gmail.com Signed-off-by: Nicholas Piggin Reviewed-by: Christoph Hellwig Cc: Borislav Petkov Cc: Catalin Marinas Cc: Ding Tianhong Cc: "H. Peter Anvin" Cc: Ingo Molnar Cc: Miaohe Lin Cc: Michael Ellerman Cc: Russell King Cc: Thomas Gleixner Cc: Uladzislau Rezki (Sony) Cc: Will Deacon Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/vmalloc.c | 16 +++++++++++++--- 1 file changed, 13 insertions(+), 3 deletions(-) diff --git a/mm/vmalloc.c b/mm/vmalloc.c index 5c81717f7e0e0a..41c1dbdd2677f4 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c @@ -240,7 +240,7 @@ static int vmap_p4d_range(pgd_t *pgd, unsigned long addr, unsigned long end, return 0; } -int vmap_range(unsigned long addr, unsigned long end, +static int vmap_range_noflush(unsigned long addr, unsigned long end, phys_addr_t phys_addr, pgprot_t prot, unsigned int max_page_shift) { @@ -263,14 +263,24 @@ int vmap_range(unsigned long addr, unsigned long end, break; } while (pgd++, phys_addr += (next - addr), addr = next, addr != end); - flush_cache_vmap(start, end); - if (mask & ARCH_PAGE_TABLE_SYNC_MASK) arch_sync_kernel_mappings(start, end); return err; } +int vmap_range(unsigned long addr, unsigned long end, + phys_addr_t phys_addr, pgprot_t prot, + unsigned int max_page_shift) +{ + int err; + + err = vmap_range_noflush(addr, end, phys_addr, prot, max_page_shift); + flush_cache_vmap(addr, end); + + return err; +} + static void vunmap_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end, pgtbl_mod_mask *mask) { From 121e6f3258fe393e22c36f61a319be8a4f2c05ae Mon Sep 17 00:00:00 2001 From: Nicholas Piggin Date: Thu, 29 Apr 2021 22:58:49 -0700 Subject: [PATCH 107/175] mm/vmalloc: hugepage vmalloc mappings Support huge page vmalloc mappings. Config option HAVE_ARCH_HUGE_VMALLOC enables support on architectures that define HAVE_ARCH_HUGE_VMAP and supports PMD sized vmap mappings. vmalloc will attempt to allocate PMD-sized pages if allocating PMD size or larger, and fall back to small pages if that was unsuccessful. Architectures must ensure that any arch specific vmalloc allocations that require PAGE_SIZE mappings (e.g., module allocations vs strict module rwx) use the VM_NOHUGE flag to inhibit larger mappings. This can result in more internal fragmentation and memory overhead for a given allocation, an option nohugevmalloc is added to disable at boot. [colin.king@canonical.com: fix read of uninitialized pointer area] Link: https://lkml.kernel.org/r/20210318155955.18220-1-colin.king@canonical.com Link: https://lkml.kernel.org/r/20210317062402.533919-14-npiggin@gmail.com Signed-off-by: Nicholas Piggin Cc: Borislav Petkov Cc: Catalin Marinas Cc: Christoph Hellwig Cc: Ding Tianhong Cc: "H. Peter Anvin" Cc: Ingo Molnar Cc: Miaohe Lin Cc: Michael Ellerman Cc: Russell King Cc: Thomas Gleixner Cc: Uladzislau Rezki (Sony) Cc: Will Deacon Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/Kconfig | 11 ++ include/linux/vmalloc.h | 21 ++++ mm/page_alloc.c | 5 +- mm/vmalloc.c | 220 +++++++++++++++++++++++++++++++--------- 4 files changed, 209 insertions(+), 48 deletions(-) diff --git a/arch/Kconfig b/arch/Kconfig index 5e8f6680d4bf5c..bf27159be4d9af 100644 --- a/arch/Kconfig +++ b/arch/Kconfig @@ -829,6 +829,17 @@ config HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD config HAVE_ARCH_HUGE_VMAP bool +# +# Archs that select this would be capable of PMD-sized vmaps (i.e., +# arch_vmap_pmd_supported() returns true), and they must make no assumptions +# that vmalloc memory is mapped with PAGE_SIZE ptes. The VM_NO_HUGE_VMAP flag +# can be used to prohibit arch-specific allocations from using hugepages to +# help with this (e.g., modules may require it). +# +config HAVE_ARCH_HUGE_VMALLOC + depends on HAVE_ARCH_HUGE_VMAP + bool + config ARCH_WANT_HUGE_PMD_SHARE bool diff --git a/include/linux/vmalloc.h b/include/linux/vmalloc.h index ae9eb07d30d47a..b4c82f2d40dc3c 100644 --- a/include/linux/vmalloc.h +++ b/include/linux/vmalloc.h @@ -26,6 +26,7 @@ struct notifier_block; /* in notifier.h */ #define VM_KASAN 0x00000080 /* has allocated kasan shadow memory */ #define VM_FLUSH_RESET_PERMS 0x00000100 /* reset direct map and flush TLB on unmap, can't be freed in atomic context */ #define VM_MAP_PUT_PAGES 0x00000200 /* put pages and free array in vfree */ +#define VM_NO_HUGE_VMAP 0x00000400 /* force PAGE_SIZE pte mapping */ /* * VM_KASAN is used slighly differently depending on CONFIG_KASAN_VMALLOC. @@ -54,6 +55,9 @@ struct vm_struct { unsigned long size; unsigned long flags; struct page **pages; +#ifdef CONFIG_HAVE_ARCH_HUGE_VMALLOC + unsigned int page_order; +#endif unsigned int nr_pages; phys_addr_t phys_addr; const void *caller; @@ -188,6 +192,22 @@ void free_vm_area(struct vm_struct *area); extern struct vm_struct *remove_vm_area(const void *addr); extern struct vm_struct *find_vm_area(const void *addr); +static inline bool is_vm_area_hugepages(const void *addr) +{ + /* + * This may not 100% tell if the area is mapped with > PAGE_SIZE + * page table entries, if for some reason the architecture indicates + * larger sizes are available but decides not to use them, nothing + * prevents that. This only indicates the size of the physical page + * allocated in the vmalloc layer. + */ +#ifdef CONFIG_HAVE_ARCH_HUGE_VMALLOC + return find_vm_area(addr)->page_order > 0; +#else + return false; +#endif +} + #ifdef CONFIG_MMU int vmap_range(unsigned long addr, unsigned long end, phys_addr_t phys_addr, pgprot_t prot, @@ -205,6 +225,7 @@ static inline void set_vm_flush_reset_perms(void *addr) if (vm) vm->flags |= VM_FLUSH_RESET_PERMS; } + #else static inline int map_kernel_range_noflush(unsigned long start, unsigned long size, diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 56a8103580d603..39ff5c604cef3a 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -72,6 +72,7 @@ #include #include #include +#include #include #include @@ -8222,6 +8223,7 @@ void *__init alloc_large_system_hash(const char *tablename, void *table = NULL; gfp_t gfp_flags; bool virt; + bool huge; /* allow the kernel cmdline to have a say */ if (!numentries) { @@ -8289,6 +8291,7 @@ void *__init alloc_large_system_hash(const char *tablename, } else if (get_order(size) >= MAX_ORDER || hashdist) { table = __vmalloc(size, gfp_flags); virt = true; + huge = is_vm_area_hugepages(table); } else { /* * If bucketsize is not a power-of-two, we may free @@ -8305,7 +8308,7 @@ void *__init alloc_large_system_hash(const char *tablename, pr_info("%s hash table entries: %ld (order: %d, %lu bytes, %s)\n", tablename, 1UL << log2qty, ilog2(size) - PAGE_SHIFT, size, - virt ? "vmalloc" : "linear"); + virt ? (huge ? "vmalloc hugepage" : "vmalloc") : "linear"); if (_hash_shift) *_hash_shift = log2qty; diff --git a/mm/vmalloc.c b/mm/vmalloc.c index 41c1dbdd2677f4..59c815eb7e741f 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c @@ -42,6 +42,19 @@ #include "internal.h" #include "pgalloc-track.h" +#ifdef CONFIG_HAVE_ARCH_HUGE_VMALLOC +static bool __ro_after_init vmap_allow_huge = true; + +static int __init set_nohugevmalloc(char *str) +{ + vmap_allow_huge = false; + return 0; +} +early_param("nohugevmalloc", set_nohugevmalloc); +#else /* CONFIG_HAVE_ARCH_HUGE_VMALLOC */ +static const bool vmap_allow_huge = false; +#endif /* CONFIG_HAVE_ARCH_HUGE_VMALLOC */ + bool is_vmalloc_addr(const void *x) { unsigned long addr = (unsigned long)x; @@ -483,31 +496,12 @@ static int vmap_pages_p4d_range(pgd_t *pgd, unsigned long addr, return 0; } -/** - * map_kernel_range_noflush - map kernel VM area with the specified pages - * @addr: start of the VM area to map - * @size: size of the VM area to map - * @prot: page protection flags to use - * @pages: pages to map - * - * Map PFN_UP(@size) pages at @addr. The VM area @addr and @size specify should - * have been allocated using get_vm_area() and its friends. - * - * NOTE: - * This function does NOT do any cache flushing. The caller is responsible for - * calling flush_cache_vmap() on to-be-mapped areas before calling this - * function. - * - * RETURNS: - * 0 on success, -errno on failure. - */ -int map_kernel_range_noflush(unsigned long addr, unsigned long size, - pgprot_t prot, struct page **pages) +static int vmap_small_pages_range_noflush(unsigned long addr, unsigned long end, + pgprot_t prot, struct page **pages) { unsigned long start = addr; - unsigned long end = addr + size; - unsigned long next; pgd_t *pgd; + unsigned long next; int err = 0; int nr = 0; pgtbl_mod_mask mask = 0; @@ -529,6 +523,66 @@ int map_kernel_range_noflush(unsigned long addr, unsigned long size, return 0; } +static int vmap_pages_range_noflush(unsigned long addr, unsigned long end, + pgprot_t prot, struct page **pages, unsigned int page_shift) +{ + unsigned int i, nr = (end - addr) >> PAGE_SHIFT; + + WARN_ON(page_shift < PAGE_SHIFT); + + if (!IS_ENABLED(CONFIG_HAVE_ARCH_HUGE_VMALLOC) || + page_shift == PAGE_SHIFT) + return vmap_small_pages_range_noflush(addr, end, prot, pages); + + for (i = 0; i < nr; i += 1U << (page_shift - PAGE_SHIFT)) { + int err; + + err = vmap_range_noflush(addr, addr + (1UL << page_shift), + __pa(page_address(pages[i])), prot, + page_shift); + if (err) + return err; + + addr += 1UL << page_shift; + } + + return 0; +} + +static int vmap_pages_range(unsigned long addr, unsigned long end, + pgprot_t prot, struct page **pages, unsigned int page_shift) +{ + int err; + + err = vmap_pages_range_noflush(addr, end, prot, pages, page_shift); + flush_cache_vmap(addr, end); + return err; +} + +/** + * map_kernel_range_noflush - map kernel VM area with the specified pages + * @addr: start of the VM area to map + * @size: size of the VM area to map + * @prot: page protection flags to use + * @pages: pages to map + * + * Map PFN_UP(@size) pages at @addr. The VM area @addr and @size specify should + * have been allocated using get_vm_area() and its friends. + * + * NOTE: + * This function does NOT do any cache flushing. The caller is responsible for + * calling flush_cache_vmap() on to-be-mapped areas before calling this + * function. + * + * RETURNS: + * 0 on success, -errno on failure. + */ +int map_kernel_range_noflush(unsigned long addr, unsigned long size, + pgprot_t prot, struct page **pages) +{ + return vmap_pages_range_noflush(addr, addr + size, prot, pages, PAGE_SHIFT); +} + int map_kernel_range(unsigned long start, unsigned long size, pgprot_t prot, struct page **pages) { @@ -2112,6 +2166,24 @@ EXPORT_SYMBOL(vm_map_ram); static struct vm_struct *vmlist __initdata; +static inline unsigned int vm_area_page_order(struct vm_struct *vm) +{ +#ifdef CONFIG_HAVE_ARCH_HUGE_VMALLOC + return vm->page_order; +#else + return 0; +#endif +} + +static inline void set_vm_area_page_order(struct vm_struct *vm, unsigned int order) +{ +#ifdef CONFIG_HAVE_ARCH_HUGE_VMALLOC + vm->page_order = order; +#else + BUG_ON(order != 0); +#endif +} + /** * vm_area_add_early - add vmap area early during boot * @vm: vm_struct to add @@ -2422,6 +2494,7 @@ static inline void set_area_direct_map(const struct vm_struct *area, { int i; + /* HUGE_VMALLOC passes small pages to set_direct_map */ for (i = 0; i < area->nr_pages; i++) if (page_address(area->pages[i])) set_direct_map(area->pages[i]); @@ -2431,6 +2504,7 @@ static inline void set_area_direct_map(const struct vm_struct *area, static void vm_remove_mappings(struct vm_struct *area, int deallocate_pages) { unsigned long start = ULONG_MAX, end = 0; + unsigned int page_order = vm_area_page_order(area); int flush_reset = area->flags & VM_FLUSH_RESET_PERMS; int flush_dmap = 0; int i; @@ -2455,11 +2529,14 @@ static void vm_remove_mappings(struct vm_struct *area, int deallocate_pages) * map. Find the start and end range of the direct mappings to make sure * the vm_unmap_aliases() flush includes the direct map. */ - for (i = 0; i < area->nr_pages; i++) { + for (i = 0; i < area->nr_pages; i += 1U << page_order) { unsigned long addr = (unsigned long)page_address(area->pages[i]); if (addr) { + unsigned long page_size; + + page_size = PAGE_SIZE << page_order; start = min(addr, start); - end = max(addr + PAGE_SIZE, end); + end = max(addr + page_size, end); flush_dmap = 1; } } @@ -2500,13 +2577,14 @@ static void __vunmap(const void *addr, int deallocate_pages) vm_remove_mappings(area, deallocate_pages); if (deallocate_pages) { + unsigned int page_order = vm_area_page_order(area); int i; - for (i = 0; i < area->nr_pages; i++) { + for (i = 0; i < area->nr_pages; i += 1U << page_order) { struct page *page = area->pages[i]; BUG_ON(!page); - __free_pages(page, 0); + __free_pages(page, page_order); } atomic_long_sub(area->nr_pages, &nr_vmalloc_pages); @@ -2697,15 +2775,19 @@ EXPORT_SYMBOL_GPL(vmap_pfn); #endif /* CONFIG_VMAP_PFN */ static void *__vmalloc_area_node(struct vm_struct *area, gfp_t gfp_mask, - pgprot_t prot, int node) + pgprot_t prot, unsigned int page_shift, + int node) { const gfp_t nested_gfp = (gfp_mask & GFP_RECLAIM_MASK) | __GFP_ZERO; - unsigned int nr_pages = get_vm_area_size(area) >> PAGE_SHIFT; + unsigned long addr = (unsigned long)area->addr; + unsigned long size = get_vm_area_size(area); unsigned long array_size; - unsigned int i; + unsigned int nr_small_pages = size >> PAGE_SHIFT; + unsigned int page_order; struct page **pages; + unsigned int i; - array_size = (unsigned long)nr_pages * sizeof(struct page *); + array_size = (unsigned long)nr_small_pages * sizeof(struct page *); gfp_mask |= __GFP_NOWARN; if (!(gfp_mask & (GFP_DMA | GFP_DMA32))) gfp_mask |= __GFP_HIGHMEM; @@ -2724,30 +2806,38 @@ static void *__vmalloc_area_node(struct vm_struct *area, gfp_t gfp_mask, } area->pages = pages; - area->nr_pages = nr_pages; + area->nr_pages = nr_small_pages; + set_vm_area_page_order(area, page_shift - PAGE_SHIFT); - for (i = 0; i < area->nr_pages; i++) { - struct page *page; + page_order = vm_area_page_order(area); - if (node == NUMA_NO_NODE) - page = alloc_page(gfp_mask); - else - page = alloc_pages_node(node, gfp_mask, 0); + /* + * Careful, we allocate and map page_order pages, but tracking is done + * per PAGE_SIZE page so as to keep the vm_struct APIs independent of + * the physical/mapped size. + */ + for (i = 0; i < area->nr_pages; i += 1U << page_order) { + struct page *page; + int p; + /* Compound pages required for remap_vmalloc_page */ + page = alloc_pages_node(node, gfp_mask | __GFP_COMP, page_order); if (unlikely(!page)) { /* Successfully allocated i pages, free them in __vfree() */ area->nr_pages = i; atomic_long_add(area->nr_pages, &nr_vmalloc_pages); goto fail; } - area->pages[i] = page; + + for (p = 0; p < (1U << page_order); p++) + area->pages[i + p] = page + p; + if (gfpflags_allow_blocking(gfp_mask)) cond_resched(); } atomic_long_add(area->nr_pages, &nr_vmalloc_pages); - if (map_kernel_range((unsigned long)area->addr, get_vm_area_size(area), - prot, pages) < 0) + if (vmap_pages_range(addr, addr + size, prot, pages, page_shift) < 0) goto fail; return area->addr; @@ -2755,7 +2845,7 @@ static void *__vmalloc_area_node(struct vm_struct *area, gfp_t gfp_mask, fail: warn_alloc(gfp_mask, NULL, "vmalloc: allocation failure, allocated %ld of %ld bytes", - (area->nr_pages*PAGE_SIZE), area->size); + (area->nr_pages*PAGE_SIZE), size); __vfree(area->addr); return NULL; } @@ -2786,19 +2876,45 @@ void *__vmalloc_node_range(unsigned long size, unsigned long align, struct vm_struct *area; void *addr; unsigned long real_size = size; + unsigned long real_align = align; + unsigned int shift = PAGE_SHIFT; - size = PAGE_ALIGN(size); - if (!size || (size >> PAGE_SHIFT) > totalram_pages()) + if (!size || (size >> PAGE_SHIFT) > totalram_pages()) { + area = NULL; goto fail; + } + + if (vmap_allow_huge && !(vm_flags & VM_NO_HUGE_VMAP) && + arch_vmap_pmd_supported(prot)) { + unsigned long size_per_node; - area = __get_vm_area_node(real_size, align, VM_ALLOC | VM_UNINITIALIZED | + /* + * Try huge pages. Only try for PAGE_KERNEL allocations, + * others like modules don't yet expect huge pages in + * their allocations due to apply_to_page_range not + * supporting them. + */ + + size_per_node = size; + if (node == NUMA_NO_NODE) + size_per_node /= num_online_nodes(); + if (size_per_node >= PMD_SIZE) { + shift = PMD_SHIFT; + align = max(real_align, 1UL << shift); + size = ALIGN(real_size, 1UL << shift); + } + } + +again: + size = PAGE_ALIGN(size); + area = __get_vm_area_node(size, align, VM_ALLOC | VM_UNINITIALIZED | vm_flags, start, end, node, gfp_mask, caller); if (!area) goto fail; - addr = __vmalloc_area_node(area, gfp_mask, prot, node); + addr = __vmalloc_area_node(area, gfp_mask, prot, shift, node); if (!addr) - return NULL; + goto fail; /* * In this function, newly allocated vm_struct has VM_UNINITIALIZED @@ -2812,8 +2928,18 @@ void *__vmalloc_node_range(unsigned long size, unsigned long align, return addr; fail: - warn_alloc(gfp_mask, NULL, + if (shift > PAGE_SHIFT) { + shift = PAGE_SHIFT; + align = real_align; + size = real_size; + goto again; + } + + if (!area) { + /* Warn for area allocation, page allocations already warn */ + warn_alloc(gfp_mask, NULL, "vmalloc: allocation failure: %lu bytes", real_size); + } return NULL; } From b67177ecd956333029dbc1a4971a857fee0ccbb1 Mon Sep 17 00:00:00 2001 From: Nicholas Piggin Date: Thu, 29 Apr 2021 22:58:53 -0700 Subject: [PATCH 108/175] mm/vmalloc: remove map_kernel_range MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Patch series "mm/vmalloc: cleanup after hugepage series", v2. Christoph pointed out some overdue cleanups required after the huge vmalloc series, and I had another failure error message improvement as well. This patch (of 5): This is a shim around vmap_pages_range, get rid of it. Move the main API comment from the _noflush variant to the normal variant, and make _noflush internal to mm/. Link: https://lkml.kernel.org/r/20210322021806.892164-1-npiggin@gmail.com Link: https://lkml.kernel.org/r/20210322021806.892164-2-npiggin@gmail.com Signed-off-by: Nicholas Piggin Reviewed-by: Christoph Hellwig Cc: Uladzislau Rezki Cc: Cédric Le Goater Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- Documentation/core-api/cachetlb.rst | 2 +- include/linux/vmalloc.h | 11 ----- mm/internal.h | 6 +++ mm/percpu-vm.c | 5 ++- mm/vmalloc.c | 65 +++++++++++++---------------- 5 files changed, 38 insertions(+), 51 deletions(-) diff --git a/Documentation/core-api/cachetlb.rst b/Documentation/core-api/cachetlb.rst index a1582cc79f0fdb..756f7bcf819110 100644 --- a/Documentation/core-api/cachetlb.rst +++ b/Documentation/core-api/cachetlb.rst @@ -213,7 +213,7 @@ Here are the routines, one by one: there will be no entries in the cache for the kernel address space for virtual addresses in the range 'start' to 'end-1'. - The first of these two routines is invoked after map_kernel_range() + The first of these two routines is invoked after vmap_range() has installed the page table entries. The second is invoked before unmap_kernel_range() deletes the page table entries. diff --git a/include/linux/vmalloc.h b/include/linux/vmalloc.h index b4c82f2d40dc3c..fb3b9989a4c563 100644 --- a/include/linux/vmalloc.h +++ b/include/linux/vmalloc.h @@ -212,10 +212,6 @@ static inline bool is_vm_area_hugepages(const void *addr) int vmap_range(unsigned long addr, unsigned long end, phys_addr_t phys_addr, pgprot_t prot, unsigned int max_page_shift); -extern int map_kernel_range_noflush(unsigned long start, unsigned long size, - pgprot_t prot, struct page **pages); -int map_kernel_range(unsigned long start, unsigned long size, pgprot_t prot, - struct page **pages); extern void unmap_kernel_range_noflush(unsigned long addr, unsigned long size); extern void unmap_kernel_range(unsigned long addr, unsigned long size); static inline void set_vm_flush_reset_perms(void *addr) @@ -227,13 +223,6 @@ static inline void set_vm_flush_reset_perms(void *addr) } #else -static inline int -map_kernel_range_noflush(unsigned long start, unsigned long size, - pgprot_t prot, struct page **pages) -{ - return size >> PAGE_SHIFT; -} -#define map_kernel_range map_kernel_range_noflush static inline void unmap_kernel_range_noflush(unsigned long addr, unsigned long size) { diff --git a/mm/internal.h b/mm/internal.h index bbe900f9f095ac..58c3757c52d95e 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -637,4 +637,10 @@ struct migration_target_control { gfp_t gfp_mask; }; +/* + * mm/vmalloc.c + */ +int vmap_pages_range_noflush(unsigned long addr, unsigned long end, + pgprot_t prot, struct page **pages, unsigned int page_shift); + #endif /* __MM_INTERNAL_H */ diff --git a/mm/percpu-vm.c b/mm/percpu-vm.c index e46f7a6917f93b..88a53eb68a9401 100644 --- a/mm/percpu-vm.c +++ b/mm/percpu-vm.c @@ -8,6 +8,7 @@ * Chunks are mapped into vmalloc areas and populated page by page. * This is the default chunk allocator. */ +#include "internal.h" static struct page *pcpu_chunk_page(struct pcpu_chunk *chunk, unsigned int cpu, int page_idx) @@ -192,8 +193,8 @@ static void pcpu_post_unmap_tlb_flush(struct pcpu_chunk *chunk, static int __pcpu_map_pages(unsigned long addr, struct page **pages, int nr_pages) { - return map_kernel_range_noflush(addr, nr_pages << PAGE_SHIFT, - PAGE_KERNEL, pages); + return vmap_pages_range_noflush(addr, addr + (nr_pages << PAGE_SHIFT), + PAGE_KERNEL, pages, PAGE_SHIFT); } /** diff --git a/mm/vmalloc.c b/mm/vmalloc.c index 59c815eb7e741f..527781a3a0febb 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c @@ -523,7 +523,16 @@ static int vmap_small_pages_range_noflush(unsigned long addr, unsigned long end, return 0; } -static int vmap_pages_range_noflush(unsigned long addr, unsigned long end, +/* + * vmap_pages_range_noflush is similar to vmap_pages_range, but does not + * flush caches. + * + * The caller is responsible for calling flush_cache_vmap() after this + * function returns successfully and before the addresses are accessed. + * + * This is an internal function only. Do not use outside mm/. + */ +int vmap_pages_range_noflush(unsigned long addr, unsigned long end, pgprot_t prot, struct page **pages, unsigned int page_shift) { unsigned int i, nr = (end - addr) >> PAGE_SHIFT; @@ -549,48 +558,26 @@ static int vmap_pages_range_noflush(unsigned long addr, unsigned long end, return 0; } -static int vmap_pages_range(unsigned long addr, unsigned long end, - pgprot_t prot, struct page **pages, unsigned int page_shift) -{ - int err; - - err = vmap_pages_range_noflush(addr, end, prot, pages, page_shift); - flush_cache_vmap(addr, end); - return err; -} - /** - * map_kernel_range_noflush - map kernel VM area with the specified pages + * vmap_pages_range - map pages to a kernel virtual address * @addr: start of the VM area to map - * @size: size of the VM area to map + * @end: end of the VM area to map (non-inclusive) * @prot: page protection flags to use - * @pages: pages to map - * - * Map PFN_UP(@size) pages at @addr. The VM area @addr and @size specify should - * have been allocated using get_vm_area() and its friends. - * - * NOTE: - * This function does NOT do any cache flushing. The caller is responsible for - * calling flush_cache_vmap() on to-be-mapped areas before calling this - * function. + * @pages: pages to map (always PAGE_SIZE pages) + * @page_shift: maximum shift that the pages may be mapped with, @pages must + * be aligned and contiguous up to at least this shift. * * RETURNS: * 0 on success, -errno on failure. */ -int map_kernel_range_noflush(unsigned long addr, unsigned long size, - pgprot_t prot, struct page **pages) -{ - return vmap_pages_range_noflush(addr, addr + size, prot, pages, PAGE_SHIFT); -} - -int map_kernel_range(unsigned long start, unsigned long size, pgprot_t prot, - struct page **pages) +static int vmap_pages_range(unsigned long addr, unsigned long end, + pgprot_t prot, struct page **pages, unsigned int page_shift) { - int ret; + int err; - ret = map_kernel_range_noflush(start, size, prot, pages); - flush_cache_vmap(start, start + size); - return ret; + err = vmap_pages_range_noflush(addr, end, prot, pages, page_shift); + flush_cache_vmap(addr, end); + return err; } int is_vmalloc_or_module_addr(const void *x) @@ -2156,10 +2143,12 @@ void *vm_map_ram(struct page **pages, unsigned int count, int node) kasan_unpoison_vmalloc(mem, size); - if (map_kernel_range(addr, size, PAGE_KERNEL, pages) < 0) { + if (vmap_pages_range(addr, addr + size, PAGE_KERNEL, + pages, PAGE_SHIFT) < 0) { vm_unmap_ram(mem, count); return NULL; } + return mem; } EXPORT_SYMBOL(vm_map_ram); @@ -2703,6 +2692,7 @@ void *vmap(struct page **pages, unsigned int count, unsigned long flags, pgprot_t prot) { struct vm_struct *area; + unsigned long addr; unsigned long size; /* In bytes */ might_sleep(); @@ -2715,8 +2705,9 @@ void *vmap(struct page **pages, unsigned int count, if (!area) return NULL; - if (map_kernel_range((unsigned long)area->addr, size, pgprot_nx(prot), - pages) < 0) { + addr = (unsigned long)area->addr; + if (vmap_pages_range(addr, addr + size, pgprot_nx(prot), + pages, PAGE_SHIFT) < 0) { vunmap(area->addr); return NULL; } From e82b9b3086b93857b1b46341714751b123a4d08b Mon Sep 17 00:00:00 2001 From: Nicholas Piggin Date: Thu, 29 Apr 2021 22:58:55 -0700 Subject: [PATCH 109/175] kernel/dma: remove unnecessary unmap_kernel_range MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit vunmap will remove ptes. Link: https://lkml.kernel.org/r/20210322021806.892164-3-npiggin@gmail.com Signed-off-by: Nicholas Piggin Reviewed-by: Christoph Hellwig Cc: Cédric Le Goater Cc: Uladzislau Rezki Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/dma/remap.c | 1 - 1 file changed, 1 deletion(-) diff --git a/kernel/dma/remap.c b/kernel/dma/remap.c index 905c3fa005f107..b4526668072e77 100644 --- a/kernel/dma/remap.c +++ b/kernel/dma/remap.c @@ -66,6 +66,5 @@ void dma_common_free_remap(void *cpu_addr, size_t size) return; } - unmap_kernel_range((unsigned long)cpu_addr, PAGE_ALIGN(size)); vunmap(cpu_addr); } From 94f88d7b901c28210d196f38168a548950dfc607 Mon Sep 17 00:00:00 2001 From: Nicholas Piggin Date: Thu, 29 Apr 2021 22:58:58 -0700 Subject: [PATCH 110/175] powerpc/xive: remove unnecessary unmap_kernel_range MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit iounmap will remove ptes. Link: https://lkml.kernel.org/r/20210322021806.892164-4-npiggin@gmail.com Signed-off-by: Nicholas Piggin Reviewed-by: Christoph Hellwig Acked-by: Cédric Le Goater Cc: Uladzislau Rezki Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/powerpc/sysdev/xive/common.c | 4 ---- 1 file changed, 4 deletions(-) diff --git a/arch/powerpc/sysdev/xive/common.c b/arch/powerpc/sysdev/xive/common.c index 595310e056f4de..d6c2069cc82893 100644 --- a/arch/powerpc/sysdev/xive/common.c +++ b/arch/powerpc/sysdev/xive/common.c @@ -959,16 +959,12 @@ EXPORT_SYMBOL_GPL(is_xive_irq); void xive_cleanup_irq_data(struct xive_irq_data *xd) { if (xd->eoi_mmio) { - unmap_kernel_range((unsigned long)xd->eoi_mmio, - 1u << xd->esb_shift); iounmap(xd->eoi_mmio); if (xd->eoi_mmio == xd->trig_mmio) xd->trig_mmio = NULL; xd->eoi_mmio = NULL; } if (xd->trig_mmio) { - unmap_kernel_range((unsigned long)xd->trig_mmio, - 1u << xd->esb_shift); iounmap(xd->trig_mmio); xd->trig_mmio = NULL; } From 4ad0ae8c64ac8f81a3651bca11be7c3cb086df80 Mon Sep 17 00:00:00 2001 From: Nicholas Piggin Date: Thu, 29 Apr 2021 22:59:01 -0700 Subject: [PATCH 111/175] mm/vmalloc: remove unmap_kernel_range MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This is a shim around vunmap_range, get rid of it. Move the main API comment from the _noflush variant to the normal variant, and make _noflush internal to mm/. [npiggin@gmail.com: fix nommu builds and a comment bug per sfr] Link: https://lkml.kernel.org/r/1617292598.m6g0knx24s.astroid@bobo.none [akpm@linux-foundation.org: move vunmap_range_noflush() stub inside !CONFIG_MMU, not !CONFIG_NUMA] [npiggin@gmail.com: fix nommu builds] Link: https://lkml.kernel.org/r/1617292497.o1uhq5ipxp.astroid@bobo.none Link: https://lkml.kernel.org/r/20210322021806.892164-5-npiggin@gmail.com Signed-off-by: Nicholas Piggin Reviewed-by: Christoph Hellwig Cc: Cédric Le Goater Cc: Uladzislau Rezki Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- Documentation/core-api/cachetlb.rst | 2 +- arch/arm64/mm/init.c | 2 +- arch/powerpc/kernel/isa-bridge.c | 4 +- arch/powerpc/kernel/pci_64.c | 2 +- arch/powerpc/mm/ioremap.c | 2 +- drivers/pci/pci.c | 2 +- include/linux/vmalloc.h | 8 +--- mm/internal.h | 15 +++++++- mm/percpu-vm.c | 2 +- mm/vmalloc.c | 59 ++++++++++++++--------------- 10 files changed, 51 insertions(+), 47 deletions(-) diff --git a/Documentation/core-api/cachetlb.rst b/Documentation/core-api/cachetlb.rst index 756f7bcf819110..fe4290e2672969 100644 --- a/Documentation/core-api/cachetlb.rst +++ b/Documentation/core-api/cachetlb.rst @@ -215,7 +215,7 @@ Here are the routines, one by one: The first of these two routines is invoked after vmap_range() has installed the page table entries. The second is invoked - before unmap_kernel_range() deletes the page table entries. + before vunmap_range() deletes the page table entries. There exists another whole class of cpu cache issues which currently require a whole different set of interfaces to handle properly. diff --git a/arch/arm64/mm/init.c b/arch/arm64/mm/init.c index 3685e12aba9b62..470f92e6a5426c 100644 --- a/arch/arm64/mm/init.c +++ b/arch/arm64/mm/init.c @@ -521,7 +521,7 @@ void free_initmem(void) * prevents the region from being reused for kernel modules, which * is not supported by kallsyms. */ - unmap_kernel_range((u64)__init_begin, (u64)(__init_end - __init_begin)); + vunmap_range((u64)__init_begin, (u64)__init_end); } void dump_mem_limit(void) diff --git a/arch/powerpc/kernel/isa-bridge.c b/arch/powerpc/kernel/isa-bridge.c index 2257d24e6a2664..39c625737c092f 100644 --- a/arch/powerpc/kernel/isa-bridge.c +++ b/arch/powerpc/kernel/isa-bridge.c @@ -48,7 +48,7 @@ static void remap_isa_base(phys_addr_t pa, unsigned long size) if (slab_is_available()) { if (ioremap_page_range(ISA_IO_BASE, ISA_IO_BASE + size, pa, pgprot_noncached(PAGE_KERNEL))) - unmap_kernel_range(ISA_IO_BASE, size); + vunmap_range(ISA_IO_BASE, ISA_IO_BASE + size); } else { early_ioremap_range(ISA_IO_BASE, pa, size, pgprot_noncached(PAGE_KERNEL)); @@ -311,7 +311,7 @@ static void isa_bridge_remove(void) isa_bridge_pcidev = NULL; /* Unmap the ISA area */ - unmap_kernel_range(ISA_IO_BASE, 0x10000); + vunmap_range(ISA_IO_BASE, ISA_IO_BASE + 0x10000); } /** diff --git a/arch/powerpc/kernel/pci_64.c b/arch/powerpc/kernel/pci_64.c index 9312e6eda7ff2b..3fb7e572abedfa 100644 --- a/arch/powerpc/kernel/pci_64.c +++ b/arch/powerpc/kernel/pci_64.c @@ -140,7 +140,7 @@ void __iomem *ioremap_phb(phys_addr_t paddr, unsigned long size) addr = (unsigned long)area->addr; if (ioremap_page_range(addr, addr + size, paddr, pgprot_noncached(PAGE_KERNEL))) { - unmap_kernel_range(addr, size); + vunmap_range(addr, addr + size); return NULL; } diff --git a/arch/powerpc/mm/ioremap.c b/arch/powerpc/mm/ioremap.c index b1a0aebe8c48da..57342154d2b055 100644 --- a/arch/powerpc/mm/ioremap.c +++ b/arch/powerpc/mm/ioremap.c @@ -93,7 +93,7 @@ void __iomem *do_ioremap(phys_addr_t pa, phys_addr_t offset, unsigned long size, if (!ret) return (void __iomem *)area->addr + offset; - unmap_kernel_range(va, size); + vunmap_range(va, va + size); free_vm_area(area); return NULL; diff --git a/drivers/pci/pci.c b/drivers/pci/pci.c index e4d4e399004b48..f4c26e6118eaa6 100644 --- a/drivers/pci/pci.c +++ b/drivers/pci/pci.c @@ -4102,7 +4102,7 @@ void pci_unmap_iospace(struct resource *res) #if defined(PCI_IOBASE) && defined(CONFIG_MMU) unsigned long vaddr = (unsigned long)PCI_IOBASE + res->start; - unmap_kernel_range(vaddr, resource_size(res)); + vunmap_range(vaddr, vaddr + resource_size(res)); #endif } EXPORT_SYMBOL(pci_unmap_iospace); diff --git a/include/linux/vmalloc.h b/include/linux/vmalloc.h index fb3b9989a4c563..394d03cc0e9266 100644 --- a/include/linux/vmalloc.h +++ b/include/linux/vmalloc.h @@ -212,8 +212,7 @@ static inline bool is_vm_area_hugepages(const void *addr) int vmap_range(unsigned long addr, unsigned long end, phys_addr_t phys_addr, pgprot_t prot, unsigned int max_page_shift); -extern void unmap_kernel_range_noflush(unsigned long addr, unsigned long size); -extern void unmap_kernel_range(unsigned long addr, unsigned long size); +void vunmap_range(unsigned long addr, unsigned long end); static inline void set_vm_flush_reset_perms(void *addr) { struct vm_struct *vm = find_vm_area(addr); @@ -223,11 +222,6 @@ static inline void set_vm_flush_reset_perms(void *addr) } #else -static inline void -unmap_kernel_range_noflush(unsigned long addr, unsigned long size) -{ -} -#define unmap_kernel_range unmap_kernel_range_noflush static inline void set_vm_flush_reset_perms(void *addr) { } diff --git a/mm/internal.h b/mm/internal.h index 58c3757c52d95e..42e30e71554ab6 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -446,7 +446,9 @@ static inline struct file *maybe_unlock_mmap_for_io(struct vm_fault *vmf, static inline void clear_page_mlock(struct page *page) { } static inline void mlock_vma_page(struct page *page) { } static inline void mlock_migrate_page(struct page *new, struct page *old) { } - +static inline void vunmap_range_noflush(unsigned long start, unsigned long end) +{ +} #endif /* !CONFIG_MMU */ /* @@ -640,7 +642,18 @@ struct migration_target_control { /* * mm/vmalloc.c */ +#ifdef CONFIG_MMU int vmap_pages_range_noflush(unsigned long addr, unsigned long end, pgprot_t prot, struct page **pages, unsigned int page_shift); +#else +static inline +int vmap_pages_range_noflush(unsigned long addr, unsigned long end, + pgprot_t prot, struct page **pages, unsigned int page_shift) +{ + return -EINVAL; +} +#endif + +void vunmap_range_noflush(unsigned long start, unsigned long end); #endif /* __MM_INTERNAL_H */ diff --git a/mm/percpu-vm.c b/mm/percpu-vm.c index 88a53eb68a9401..8d3844bc0c7cf8 100644 --- a/mm/percpu-vm.c +++ b/mm/percpu-vm.c @@ -134,7 +134,7 @@ static void pcpu_pre_unmap_flush(struct pcpu_chunk *chunk, static void __pcpu_unmap_pages(unsigned long addr, int nr_pages) { - unmap_kernel_range_noflush(addr, nr_pages << PAGE_SHIFT); + vunmap_range_noflush(addr, addr + (nr_pages << PAGE_SHIFT)); } /** diff --git a/mm/vmalloc.c b/mm/vmalloc.c index 527781a3a0febb..f7a53c19e84bb4 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c @@ -378,22 +378,20 @@ static void vunmap_p4d_range(pgd_t *pgd, unsigned long addr, unsigned long end, } while (p4d++, addr = next, addr != end); } -/** - * unmap_kernel_range_noflush - unmap kernel VM area - * @start: start of the VM area to unmap - * @size: size of the VM area to unmap +/* + * vunmap_range_noflush is similar to vunmap_range, but does not + * flush caches or TLBs. * - * Unmap PFN_UP(@size) pages at @addr. The VM area @addr and @size specify - * should have been allocated using get_vm_area() and its friends. + * The caller is responsible for calling flush_cache_vmap() before calling + * this function, and flush_tlb_kernel_range after it has returned + * successfully (and before the addresses are expected to cause a page fault + * or be re-mapped for something else, if TLB flushes are being delayed or + * coalesced). * - * NOTE: - * This function does NOT do any cache flushing. The caller is responsible - * for calling flush_cache_vunmap() on to-be-mapped areas before calling this - * function and flush_tlb_kernel_range() after. + * This is an internal function only. Do not use outside mm/. */ -void unmap_kernel_range_noflush(unsigned long start, unsigned long size) +void vunmap_range_noflush(unsigned long start, unsigned long end) { - unsigned long end = start + size; unsigned long next; pgd_t *pgd; unsigned long addr = start; @@ -414,6 +412,22 @@ void unmap_kernel_range_noflush(unsigned long start, unsigned long size) arch_sync_kernel_mappings(start, end); } +/** + * vunmap_range - unmap kernel virtual addresses + * @addr: start of the VM area to unmap + * @end: end of the VM area to unmap (non-inclusive) + * + * Clears any present PTEs in the virtual address range, flushes TLBs and + * caches. Any subsequent access to the address before it has been re-mapped + * is a kernel bug. + */ +void vunmap_range(unsigned long addr, unsigned long end) +{ + flush_cache_vunmap(addr, end); + vunmap_range_noflush(addr, end); + flush_tlb_kernel_range(addr, end); +} + static int vmap_pages_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end, pgprot_t prot, struct page **pages, int *nr, pgtbl_mod_mask *mask) @@ -1712,7 +1726,7 @@ static void free_vmap_area_noflush(struct vmap_area *va) static void free_unmap_vmap_area(struct vmap_area *va) { flush_cache_vunmap(va->va_start, va->va_end); - unmap_kernel_range_noflush(va->va_start, va->va_end - va->va_start); + vunmap_range_noflush(va->va_start, va->va_end); if (debug_pagealloc_enabled_static()) flush_tlb_kernel_range(va->va_start, va->va_end); @@ -1990,7 +2004,7 @@ static void vb_free(unsigned long addr, unsigned long size) offset = (addr & (VMAP_BLOCK_SIZE - 1)) >> PAGE_SHIFT; vb = xa_load(&vmap_blocks, addr_to_vb_idx(addr)); - unmap_kernel_range_noflush(addr, size); + vunmap_range_noflush(addr, addr + size); if (debug_pagealloc_enabled_static()) flush_tlb_kernel_range(addr, addr + size); @@ -2307,23 +2321,6 @@ void __init vmalloc_init(void) vmap_initialized = true; } -/** - * unmap_kernel_range - unmap kernel VM area and flush cache and TLB - * @addr: start of the VM area to unmap - * @size: size of the VM area to unmap - * - * Similar to unmap_kernel_range_noflush() but flushes vcache before - * the unmapping and tlb after. - */ -void unmap_kernel_range(unsigned long addr, unsigned long size) -{ - unsigned long end = addr + size; - - flush_cache_vunmap(addr, end); - unmap_kernel_range_noflush(addr, size); - flush_tlb_kernel_range(addr, end); -} - static inline void setup_vmalloc_vm_locked(struct vm_struct *vm, struct vmap_area *va, unsigned long flags, const void *caller) { From d70bec8cc95ad32f6b7e3e6fad72acdd3a5418e9 Mon Sep 17 00:00:00 2001 From: Nicholas Piggin Date: Thu, 29 Apr 2021 22:59:04 -0700 Subject: [PATCH 112/175] mm/vmalloc: improve allocation failure error messages MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit There are several reasons why a vmalloc can fail, virtual space exhausted, page array allocation failure, page allocation failure, and kernel page table allocation failure. Add distinct warning messages for the main causes of failure, with some added information like page order or allocation size where applicable. [urezki@gmail.com: print correct vmalloc allocation size] Link: https://lkml.kernel.org/r/20210329193214.GA28602@pc638.lan Link: https://lkml.kernel.org/r/20210322021806.892164-6-npiggin@gmail.com Signed-off-by: Nicholas Piggin Signed-off-by: Uladzislau Rezki (Sony) Reviewed-by: Christoph Hellwig Cc: Cédric Le Goater Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/vmalloc.c | 40 +++++++++++++++++++++++++++------------- 1 file changed, 27 insertions(+), 13 deletions(-) diff --git a/mm/vmalloc.c b/mm/vmalloc.c index f7a53c19e84bb4..612a3790cfd4db 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c @@ -2790,6 +2790,10 @@ static void *__vmalloc_area_node(struct vm_struct *area, gfp_t gfp_mask, if (!pages) { free_vm_area(area); + warn_alloc(gfp_mask, NULL, + "vmalloc size %lu allocation failure: " + "page array size %lu allocation failed", + nr_small_pages * PAGE_SIZE, array_size); return NULL; } @@ -2814,6 +2818,10 @@ static void *__vmalloc_area_node(struct vm_struct *area, gfp_t gfp_mask, /* Successfully allocated i pages, free them in __vfree() */ area->nr_pages = i; atomic_long_add(area->nr_pages, &nr_vmalloc_pages); + warn_alloc(gfp_mask, NULL, + "vmalloc size %lu allocation failure: " + "page order %u allocation failed", + area->nr_pages * PAGE_SIZE, page_order); goto fail; } @@ -2825,15 +2833,17 @@ static void *__vmalloc_area_node(struct vm_struct *area, gfp_t gfp_mask, } atomic_long_add(area->nr_pages, &nr_vmalloc_pages); - if (vmap_pages_range(addr, addr + size, prot, pages, page_shift) < 0) + if (vmap_pages_range(addr, addr + size, prot, pages, page_shift) < 0) { + warn_alloc(gfp_mask, NULL, + "vmalloc size %lu allocation failure: " + "failed to map pages", + area->nr_pages * PAGE_SIZE); goto fail; + } return area->addr; fail: - warn_alloc(gfp_mask, NULL, - "vmalloc: allocation failure, allocated %ld of %ld bytes", - (area->nr_pages*PAGE_SIZE), size); __vfree(area->addr); return NULL; } @@ -2867,9 +2877,14 @@ void *__vmalloc_node_range(unsigned long size, unsigned long align, unsigned long real_align = align; unsigned int shift = PAGE_SHIFT; - if (!size || (size >> PAGE_SHIFT) > totalram_pages()) { - area = NULL; - goto fail; + if (WARN_ON_ONCE(!size)) + return NULL; + + if ((size >> PAGE_SHIFT) > totalram_pages()) { + warn_alloc(gfp_mask, NULL, + "vmalloc size %lu allocation failure: " + "exceeds total pages", real_size); + return NULL; } if (vmap_allow_huge && !(vm_flags & VM_NO_HUGE_VMAP) && @@ -2897,8 +2912,12 @@ void *__vmalloc_node_range(unsigned long size, unsigned long align, size = PAGE_ALIGN(size); area = __get_vm_area_node(size, align, VM_ALLOC | VM_UNINITIALIZED | vm_flags, start, end, node, gfp_mask, caller); - if (!area) + if (!area) { + warn_alloc(gfp_mask, NULL, + "vmalloc size %lu allocation failure: " + "vm_struct allocation failed", real_size); goto fail; + } addr = __vmalloc_area_node(area, gfp_mask, prot, shift, node); if (!addr) @@ -2923,11 +2942,6 @@ void *__vmalloc_node_range(unsigned long size, unsigned long align, goto again; } - if (!area) { - /* Warn for area allocation, page allocations already warn */ - warn_alloc(gfp_mask, NULL, - "vmalloc: allocation failure: %lu bytes", real_size); - } return NULL; } From ad216c0316ad6391d90f4de0a7f59396b2925a06 Mon Sep 17 00:00:00 2001 From: Vijayanand Jitta Date: Thu, 29 Apr 2021 22:59:07 -0700 Subject: [PATCH 113/175] mm: vmalloc: prevent use after free in _vm_unmap_aliases A potential use after free can occur in _vm_unmap_aliases where an already freed vmap_area could be accessed, Consider the following scenario: Process 1 Process 2 __vm_unmap_aliases __vm_unmap_aliases purge_fragmented_blocks_allcpus rcu_read_lock() rcu_read_lock() list_del_rcu(&vb->free_list) list_for_each_entry_rcu(vb .. ) __purge_vmap_area_lazy kmem_cache_free(va) va_start = vb->va->va_start Here Process 1 is in purge path and it does list_del_rcu on vmap_block and later frees the vmap_area, since Process 2 was holding the rcu lock at this time vmap_block will still be present in and Process 2 accesse it and thereby it tries to access vmap_area of that vmap_block which was already freed by Process 1 and this results in use after free. Fix this by adding a check for vb->dirty before accessing vmap_area structure since vb->dirty will be set to VMAP_BBMAP_BITS in purge path checking for this will prevent the use after free. Link: https://lkml.kernel.org/r/1616062105-23263-1-git-send-email-vjitta@codeaurora.org Signed-off-by: Vijayanand Jitta Reviewed-by: Uladzislau Rezki (Sony) Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/vmalloc.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mm/vmalloc.c b/mm/vmalloc.c index 612a3790cfd4db..51874a341ab086 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c @@ -2040,7 +2040,7 @@ static void _vm_unmap_aliases(unsigned long start, unsigned long end, int flush) rcu_read_lock(); list_for_each_entry_rcu(vb, &vbq->free, free_list) { spin_lock(&vb->lock); - if (vb->dirty) { + if (vb->dirty && vb->dirty != VMAP_BBMAP_BITS) { unsigned long va_start = vb->va->va_start; unsigned long s, e; From a803315858bf8c6863f719f9fb251576fdf68a8c Mon Sep 17 00:00:00 2001 From: "Uladzislau Rezki (Sony)" Date: Thu, 29 Apr 2021 22:59:10 -0700 Subject: [PATCH 114/175] lib/test_vmalloc.c: remove two kvfree_rcu() tests Remove two test cases related to kvfree_rcu() and SLAB. Those are considered as redundant now, because similar test functionality has recently been introduced in the "rcuscale" RCU test-suite. Link: https://lkml.kernel.org/r/20210402202237.20334-1-urezki@gmail.com Signed-off-by: Uladzislau Rezki (Sony) Cc: Hillf Danton Cc: Michal Hocko Cc: Matthew Wilcox Cc: Oleksiy Avramchenko Cc: Steven Rostedt Cc: Shuah Khan Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- lib/test_vmalloc.c | 40 ---------------------------------------- 1 file changed, 40 deletions(-) diff --git a/lib/test_vmalloc.c b/lib/test_vmalloc.c index 5cf2fe9aab9e01..4eb6abdaa74e20 100644 --- a/lib/test_vmalloc.c +++ b/lib/test_vmalloc.c @@ -47,8 +47,6 @@ __param(int, run_test_mask, INT_MAX, "\t\tid: 128, name: pcpu_alloc_test\n" "\t\tid: 256, name: kvfree_rcu_1_arg_vmalloc_test\n" "\t\tid: 512, name: kvfree_rcu_2_arg_vmalloc_test\n" - "\t\tid: 1024, name: kvfree_rcu_1_arg_slab_test\n" - "\t\tid: 2048, name: kvfree_rcu_2_arg_slab_test\n" /* Add a new test case description here. */ ); @@ -363,42 +361,6 @@ kvfree_rcu_2_arg_vmalloc_test(void) return 0; } -static int -kvfree_rcu_1_arg_slab_test(void) -{ - struct test_kvfree_rcu *p; - int i; - - for (i = 0; i < test_loop_count; i++) { - p = kmalloc(sizeof(*p), GFP_KERNEL); - if (!p) - return -1; - - p->array[0] = 'a'; - kvfree_rcu(p); - } - - return 0; -} - -static int -kvfree_rcu_2_arg_slab_test(void) -{ - struct test_kvfree_rcu *p; - int i; - - for (i = 0; i < test_loop_count; i++) { - p = kmalloc(sizeof(*p), GFP_KERNEL); - if (!p) - return -1; - - p->array[0] = 'a'; - kvfree_rcu(p, rcu); - } - - return 0; -} - struct test_case_desc { const char *test_name; int (*test_func)(void); @@ -415,8 +377,6 @@ static struct test_case_desc test_case_array[] = { { "pcpu_alloc_test", pcpu_alloc_test }, { "kvfree_rcu_1_arg_vmalloc_test", kvfree_rcu_1_arg_vmalloc_test }, { "kvfree_rcu_2_arg_vmalloc_test", kvfree_rcu_2_arg_vmalloc_test }, - { "kvfree_rcu_1_arg_slab_test", kvfree_rcu_1_arg_slab_test }, - { "kvfree_rcu_2_arg_slab_test", kvfree_rcu_2_arg_slab_test }, /* Add a new test case here. */ }; From 80f4759964cc70ca8e3c793afbecbdc235ce7272 Mon Sep 17 00:00:00 2001 From: "Uladzislau Rezki (Sony)" Date: Thu, 29 Apr 2021 22:59:13 -0700 Subject: [PATCH 115/175] lib/test_vmalloc.c: add a new 'nr_threads' parameter By using this parameter we can specify how many workers are created to perform vmalloc tests. By default it is one CPU. The maximum value is set to 1024. As a result of this change a 'single_cpu_test' one becomes obsolete, therefore it is no longer needed. [urezki@gmail.com: extend max value of nr_threads parameter] Link: https://lkml.kernel.org/r/20210406124536.19658-1-urezki@gmail.com Link: https://lkml.kernel.org/r/20210402202237.20334-2-urezki@gmail.com Signed-off-by: Uladzislau Rezki (Sony) Cc: Hillf Danton Cc: Matthew Wilcox Cc: Michal Hocko Cc: Oleksiy Avramchenko Cc: Shuah Khan Cc: Steven Rostedt Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- lib/test_vmalloc.c | 88 +++++++++++++++++++++------------------------- 1 file changed, 40 insertions(+), 48 deletions(-) diff --git a/lib/test_vmalloc.c b/lib/test_vmalloc.c index 4eb6abdaa74e20..01e9543de5664c 100644 --- a/lib/test_vmalloc.c +++ b/lib/test_vmalloc.c @@ -23,8 +23,8 @@ module_param(name, type, 0444); \ MODULE_PARM_DESC(name, msg) \ -__param(bool, single_cpu_test, false, - "Use single first online CPU to run tests"); +__param(int, nr_threads, 0, + "Number of workers to perform tests(min: 1 max: USHRT_MAX)"); __param(bool, sequential_test_order, false, "Use sequential stress tests order"); @@ -50,13 +50,6 @@ __param(int, run_test_mask, INT_MAX, /* Add a new test case description here. */ ); -/* - * Depends on single_cpu_test parameter. If it is true, then - * use first online CPU to trigger a test on, otherwise go with - * all online CPUs. - */ -static cpumask_t cpus_run_test_mask = CPU_MASK_NONE; - /* * Read write semaphore for synchronization of setup * phase that is done in main thread and workers. @@ -386,16 +379,13 @@ struct test_case_data { u64 time; }; -/* Split it to get rid of: WARNING: line over 80 characters */ -static struct test_case_data - per_cpu_test_data[NR_CPUS][ARRAY_SIZE(test_case_array)]; - static struct test_driver { struct task_struct *task; + struct test_case_data data[ARRAY_SIZE(test_case_array)]; + unsigned long start; unsigned long stop; - int cpu; -} per_cpu_test_driver[NR_CPUS]; +} *tdriver; static void shuffle_array(int *arr, int n) { @@ -423,9 +413,6 @@ static int test_func(void *private) ktime_t kt; u64 delta; - if (set_cpus_allowed_ptr(current, cpumask_of(t->cpu)) < 0) - pr_err("Failed to set affinity to %d CPU\n", t->cpu); - for (i = 0; i < ARRAY_SIZE(test_case_array); i++) random_array[i] = i; @@ -450,9 +437,9 @@ static int test_func(void *private) kt = ktime_get(); for (j = 0; j < test_repeat_count; j++) { if (!test_case_array[index].test_func()) - per_cpu_test_data[t->cpu][index].test_passed++; + t->data[index].test_passed++; else - per_cpu_test_data[t->cpu][index].test_failed++; + t->data[index].test_failed++; } /* @@ -461,7 +448,7 @@ static int test_func(void *private) delta = (u64) ktime_us_delta(ktime_get(), kt); do_div(delta, (u32) test_repeat_count); - per_cpu_test_data[t->cpu][index].time = delta; + t->data[index].time = delta; } t->stop = get_cycles(); @@ -477,53 +464,56 @@ static int test_func(void *private) return 0; } -static void +static int init_test_configurtion(void) { /* - * Reset all data of all CPUs. + * A maximum number of workers is defined as hard-coded + * value and set to USHRT_MAX. We add such gap just in + * case and for potential heavy stressing. */ - memset(per_cpu_test_data, 0, sizeof(per_cpu_test_data)); + nr_threads = clamp(nr_threads, 1, (int) USHRT_MAX); - if (single_cpu_test) - cpumask_set_cpu(cpumask_first(cpu_online_mask), - &cpus_run_test_mask); - else - cpumask_and(&cpus_run_test_mask, cpu_online_mask, - cpu_online_mask); + /* Allocate the space for test instances. */ + tdriver = kvcalloc(nr_threads, sizeof(*tdriver), GFP_KERNEL); + if (tdriver == NULL) + return -1; if (test_repeat_count <= 0) test_repeat_count = 1; if (test_loop_count <= 0) test_loop_count = 1; + + return 0; } static void do_concurrent_test(void) { - int cpu, ret; + int i, ret; /* * Set some basic configurations plus sanity check. */ - init_test_configurtion(); + ret = init_test_configurtion(); + if (ret < 0) + return; /* * Put on hold all workers. */ down_write(&prepare_for_test_rwsem); - for_each_cpu(cpu, &cpus_run_test_mask) { - struct test_driver *t = &per_cpu_test_driver[cpu]; + for (i = 0; i < nr_threads; i++) { + struct test_driver *t = &tdriver[i]; - t->cpu = cpu; - t->task = kthread_run(test_func, t, "vmalloc_test/%d", cpu); + t->task = kthread_run(test_func, t, "vmalloc_test/%d", i); if (!IS_ERR(t->task)) /* Success. */ atomic_inc(&test_n_undone); else - pr_err("Failed to start kthread for %d CPU\n", cpu); + pr_err("Failed to start %d kthread\n", i); } /* @@ -541,29 +531,31 @@ static void do_concurrent_test(void) ret = wait_for_completion_timeout(&test_all_done_comp, HZ); } while (!ret); - for_each_cpu(cpu, &cpus_run_test_mask) { - struct test_driver *t = &per_cpu_test_driver[cpu]; - int i; + for (i = 0; i < nr_threads; i++) { + struct test_driver *t = &tdriver[i]; + int j; if (!IS_ERR(t->task)) kthread_stop(t->task); - for (i = 0; i < ARRAY_SIZE(test_case_array); i++) { - if (!((run_test_mask & (1 << i)) >> i)) + for (j = 0; j < ARRAY_SIZE(test_case_array); j++) { + if (!((run_test_mask & (1 << j)) >> j)) continue; pr_info( "Summary: %s passed: %d failed: %d repeat: %d loops: %d avg: %llu usec\n", - test_case_array[i].test_name, - per_cpu_test_data[cpu][i].test_passed, - per_cpu_test_data[cpu][i].test_failed, + test_case_array[j].test_name, + t->data[j].test_passed, + t->data[j].test_failed, test_repeat_count, test_loop_count, - per_cpu_test_data[cpu][i].time); + t->data[j].time); } - pr_info("All test took CPU%d=%lu cycles\n", - cpu, t->stop - t->start); + pr_info("All test took worker%d=%lu cycles\n", + i, t->stop - t->start); } + + kvfree(tdriver); } static int vmalloc_test_init(void) From 7bc4ca3ea956669b4e14ee03108c6623a136edfa Mon Sep 17 00:00:00 2001 From: "Uladzislau Rezki (Sony)" Date: Thu, 29 Apr 2021 22:59:16 -0700 Subject: [PATCH 116/175] vm/test_vmalloc.sh: adapt for updated driver interface A 'single_cpu_test' parameter is odd and it does not exist anymore. Instead there was introduced a 'nr_threads' one. If it is not set it behaves as the former parameter. That is why update a "stress mode" according to this change specifying number of workers which are equal to number of CPUs. Also update an output of help message based on a new interface. Link: https://lkml.kernel.org/r/20210402202237.20334-3-urezki@gmail.com Signed-off-by: Uladzislau Rezki (Sony) Cc: Shuah Khan Cc: Hillf Danton Cc: Matthew Wilcox Cc: Michal Hocko Cc: Oleksiy Avramchenko Cc: Steven Rostedt Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- tools/testing/selftests/vm/test_vmalloc.sh | 21 +++++++++++---------- 1 file changed, 11 insertions(+), 10 deletions(-) diff --git a/tools/testing/selftests/vm/test_vmalloc.sh b/tools/testing/selftests/vm/test_vmalloc.sh index 06d2bb109f06d6..d73b846736f1c2 100755 --- a/tools/testing/selftests/vm/test_vmalloc.sh +++ b/tools/testing/selftests/vm/test_vmalloc.sh @@ -11,6 +11,7 @@ TEST_NAME="vmalloc" DRIVER="test_${TEST_NAME}" +NUM_CPUS=`grep -c ^processor /proc/cpuinfo` # 1 if fails exitcode=1 @@ -22,9 +23,9 @@ ksft_skip=4 # Static templates for performance, stressing and smoke tests. # Also it is possible to pass any supported parameters manualy. # -PERF_PARAM="single_cpu_test=1 sequential_test_order=1 test_repeat_count=3" -SMOKE_PARAM="single_cpu_test=1 test_loop_count=10000 test_repeat_count=10" -STRESS_PARAM="test_repeat_count=20" +PERF_PARAM="sequential_test_order=1 test_repeat_count=3" +SMOKE_PARAM="test_loop_count=10000 test_repeat_count=10" +STRESS_PARAM="nr_threads=$NUM_CPUS test_repeat_count=20" check_test_requirements() { @@ -58,8 +59,8 @@ run_perfformance_check() run_stability_check() { - echo "Run stability tests. In order to stress vmalloc subsystem we run" - echo "all available test cases on all available CPUs simultaneously." + echo "Run stability tests. In order to stress vmalloc subsystem all" + echo "available test cases are run by NUM_CPUS workers simultaneously." echo "It will take time, so be patient." modprobe $DRIVER $STRESS_PARAM > /dev/null 2>&1 @@ -92,17 +93,17 @@ usage() echo "# Shows help message" echo "./${DRIVER}.sh" echo - echo "# Runs 1 test(id_1), repeats it 5 times on all online CPUs" - echo "./${DRIVER}.sh run_test_mask=1 test_repeat_count=5" + echo "# Runs 1 test(id_1), repeats it 5 times by NUM_CPUS workers" + echo "./${DRIVER}.sh nr_threads=$NUM_CPUS run_test_mask=1 test_repeat_count=5" echo echo -n "# Runs 4 tests(id_1|id_2|id_4|id_16) on one CPU with " echo "sequential order" - echo -n "./${DRIVER}.sh single_cpu_test=1 sequential_test_order=1 " + echo -n "./${DRIVER}.sh sequential_test_order=1 " echo "run_test_mask=23" echo - echo -n "# Runs all tests on all online CPUs, shuffled order, repeats " + echo -n "# Runs all tests by NUM_CPUS workers, shuffled order, repeats " echo "20 times" - echo "./${DRIVER}.sh test_repeat_count=20" + echo "./${DRIVER}.sh nr_threads=$NUM_CPUS test_repeat_count=20" echo echo "# Performance analysis" echo "./${DRIVER}.sh performance" From 187f8cc456f83e4745e326f3026a83a97e7814a1 Mon Sep 17 00:00:00 2001 From: "Uladzislau Rezki (Sony)" Date: Thu, 29 Apr 2021 22:59:19 -0700 Subject: [PATCH 117/175] mm/vmalloc: refactor the preloading loagic Instead of keeping open-coded style, move the code related to preloading into a separate function. Therefore introduce the preload_this_cpu_lock() routine that prelaods a current CPU with one extra vmap_area object. There is no functional change as a result of this patch. Link: https://lkml.kernel.org/r/20210402202237.20334-4-urezki@gmail.com Signed-off-by: Uladzislau Rezki (Sony) Cc: Hillf Danton Cc: Matthew Wilcox Cc: Michal Hocko Cc: Oleksiy Avramchenko Cc: Shuah Khan Cc: Steven Rostedt Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/vmalloc.c | 60 +++++++++++++++++++++++----------------------------- 1 file changed, 27 insertions(+), 33 deletions(-) diff --git a/mm/vmalloc.c b/mm/vmalloc.c index 51874a341ab086..83a83f672c09b8 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c @@ -1430,6 +1430,29 @@ static void free_vmap_area(struct vmap_area *va) spin_unlock(&free_vmap_area_lock); } +static inline void +preload_this_cpu_lock(spinlock_t *lock, gfp_t gfp_mask, int node) +{ + struct vmap_area *va = NULL; + + /* + * Preload this CPU with one extra vmap_area object. It is used + * when fit type of free area is NE_FIT_TYPE. It guarantees that + * a CPU that does an allocation is preloaded. + * + * We do it in non-atomic context, thus it allows us to use more + * permissive allocation masks to be more stable under low memory + * condition and high memory pressure. + */ + if (!this_cpu_read(ne_fit_preload_node)) + va = kmem_cache_alloc_node(vmap_area_cachep, gfp_mask, node); + + spin_lock(lock); + + if (va && __this_cpu_cmpxchg(ne_fit_preload_node, NULL, va)) + kmem_cache_free(vmap_area_cachep, va); +} + /* * Allocate a region of KVA of the specified size and alignment, within the * vstart and vend. @@ -1439,7 +1462,7 @@ static struct vmap_area *alloc_vmap_area(unsigned long size, unsigned long vstart, unsigned long vend, int node, gfp_t gfp_mask) { - struct vmap_area *va, *pva; + struct vmap_area *va; unsigned long addr; int purged = 0; int ret; @@ -1465,43 +1488,14 @@ static struct vmap_area *alloc_vmap_area(unsigned long size, kmemleak_scan_area(&va->rb_node, SIZE_MAX, gfp_mask); retry: - /* - * Preload this CPU with one extra vmap_area object. It is used - * when fit type of free area is NE_FIT_TYPE. Please note, it - * does not guarantee that an allocation occurs on a CPU that - * is preloaded, instead we minimize the case when it is not. - * It can happen because of cpu migration, because there is a - * race until the below spinlock is taken. - * - * The preload is done in non-atomic context, thus it allows us - * to use more permissive allocation masks to be more stable under - * low memory condition and high memory pressure. In rare case, - * if not preloaded, GFP_NOWAIT is used. - * - * Set "pva" to NULL here, because of "retry" path. - */ - pva = NULL; - - if (!this_cpu_read(ne_fit_preload_node)) - /* - * Even if it fails we do not really care about that. - * Just proceed as it is. If needed "overflow" path - * will refill the cache we allocate from. - */ - pva = kmem_cache_alloc_node(vmap_area_cachep, gfp_mask, node); - - spin_lock(&free_vmap_area_lock); - - if (pva && __this_cpu_cmpxchg(ne_fit_preload_node, NULL, pva)) - kmem_cache_free(vmap_area_cachep, pva); + preload_this_cpu_lock(&free_vmap_area_lock, gfp_mask, node); + addr = __alloc_vmap_area(size, align, vstart, vend); + spin_unlock(&free_vmap_area_lock); /* * If an allocation fails, the "vend" address is * returned. Therefore trigger the overflow path. */ - addr = __alloc_vmap_area(size, align, vstart, vend); - spin_unlock(&free_vmap_area_lock); - if (unlikely(addr == vend)) goto overflow; From 299420ba358c023ea70d7bab5f61c7744596f30f Mon Sep 17 00:00:00 2001 From: "Uladzislau Rezki (Sony)" Date: Thu, 29 Apr 2021 22:59:22 -0700 Subject: [PATCH 118/175] mm/vmalloc: remove an empty line Link: https://lkml.kernel.org/r/20210402202237.20334-5-urezki@gmail.com Signed-off-by: Uladzislau Rezki (Sony) Cc: Hillf Danton Cc: Matthew Wilcox Cc: Michal Hocko Cc: Oleksiy Avramchenko Cc: Shuah Khan Cc: Steven Rostedt Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/vmalloc.c | 1 - 1 file changed, 1 deletion(-) diff --git a/mm/vmalloc.c b/mm/vmalloc.c index 83a83f672c09b8..d33894d7b27a1d 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c @@ -1503,7 +1503,6 @@ static struct vmap_area *alloc_vmap_area(unsigned long size, va->va_end = addr + size; va->vm = NULL; - spin_lock(&vmap_area_lock); insert_vmap_area(va, &vmap_area_root, &vmap_area_list); spin_unlock(&vmap_area_lock); From 78f4841e34763079be0661744c1ca997be64eb56 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Thu, 29 Apr 2021 22:59:25 -0700 Subject: [PATCH 119/175] mm/doc: fix fault_flag_allow_retry_first kerneldoc make htmldocs reports: include/linux/mm.h:496: warning: Function parameter or member 'flags' not described in 'fault_flag_allow_retry_first' Add a description. Link: https://lkml.kernel.org/r/20210322195022.2143603-1-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Cc: John Hubbard Cc: Mike Rapoport Cc: Jonathan Corbet Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mm.h | 1 + 1 file changed, 1 insertion(+) diff --git a/include/linux/mm.h b/include/linux/mm.h index 93097dbd96044a..382c33e7d90671 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -485,6 +485,7 @@ extern pgprot_t protection_map[16]; /** * fault_flag_allow_retry_first - check ALLOW_RETRY the first time + * @flags: Fault flags. * * This is mostly used for places where we want to try to avoid taking * the mmap_lock for too long a time when waiting for another condition From 136dfc9949f84089217f84e6478471dabbf14ba7 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Thu, 29 Apr 2021 22:59:28 -0700 Subject: [PATCH 120/175] mm/doc: fix page_maybe_dma_pinned kerneldoc make htmldocs reports: include/linux/mm.h:1341: warning: Excess function parameter 'Return' description in 'page_maybe_dma_pinned' Fix a few other formatting nits while I'm editing this description. Link: https://lkml.kernel.org/r/20210322195022.2143603-2-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Acked-by: Mike Rapoport Reviewed-by: John Hubbard Cc: Jonathan Corbet Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mm.h | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/include/linux/mm.h b/include/linux/mm.h index 382c33e7d90671..5acda87619351e 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -1271,10 +1271,11 @@ void unpin_user_page_range_dirty_lock(struct page *page, unsigned long npages, void unpin_user_pages(struct page **pages, unsigned long npages); /** - * page_maybe_dma_pinned() - report if a page is pinned for DMA. + * page_maybe_dma_pinned - Report if a page is pinned for DMA. + * @page: The page. * * This function checks if a page has been pinned via a call to - * pin_user_pages*(). + * a function in the pin_user_pages() family. * * For non-huge pages, the return value is partially fuzzy: false is not fuzzy, * because it means "definitely not pinned for DMA", but true means "probably @@ -1292,9 +1293,8 @@ void unpin_user_pages(struct page **pages, unsigned long npages); * * For more information, please see Documentation/core-api/pin_user_pages.rst. * - * @page: pointer to page to be queried. - * @Return: True, if it is likely that the page has been "dma-pinned". - * False, if the page is definitely not dma-pinned. + * Return: True, if it is likely that the page has been "dma-pinned". + * False, if the page is definitely not dma-pinned. */ static inline bool page_maybe_dma_pinned(struct page *page) { From da2f5eb3d344503c4d851bdf1ae2379167074413 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Thu, 29 Apr 2021 22:59:31 -0700 Subject: [PATCH 121/175] mm/doc: turn fault flags into an enum The kernel-doc script complains about include/linux/mm.h:425: warning: wrong kernel-doc identifier on line: * Fault flag definitions. I don't know how to document a series of #defines, so turn these definitions into an enum and document that instead. Link: https://lkml.kernel.org/r/20210322195022.2143603-3-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Acked-by: Mike Rapoport Cc: John Hubbard Cc: Jonathan Corbet Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mm.h | 29 +++++++++++++++-------------- 1 file changed, 15 insertions(+), 14 deletions(-) diff --git a/include/linux/mm.h b/include/linux/mm.h index 5acda87619351e..64b2e3a0b94df8 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -432,8 +432,7 @@ extern unsigned int kobjsize(const void *objp); extern pgprot_t protection_map[16]; /** - * Fault flag definitions. - * + * enum fault_flag - Fault flag definitions. * @FAULT_FLAG_WRITE: Fault was a write fault. * @FAULT_FLAG_MKWRITE: Fault was mkwrite of existing PTE. * @FAULT_FLAG_ALLOW_RETRY: Allow to retry the fault if blocked. @@ -464,16 +463,18 @@ extern pgprot_t protection_map[16]; * signals before a retry to make sure the continuous page faults can still be * interrupted if necessary. */ -#define FAULT_FLAG_WRITE 0x01 -#define FAULT_FLAG_MKWRITE 0x02 -#define FAULT_FLAG_ALLOW_RETRY 0x04 -#define FAULT_FLAG_RETRY_NOWAIT 0x08 -#define FAULT_FLAG_KILLABLE 0x10 -#define FAULT_FLAG_TRIED 0x20 -#define FAULT_FLAG_USER 0x40 -#define FAULT_FLAG_REMOTE 0x80 -#define FAULT_FLAG_INSTRUCTION 0x100 -#define FAULT_FLAG_INTERRUPTIBLE 0x200 +enum fault_flag { + FAULT_FLAG_WRITE = 1 << 0, + FAULT_FLAG_MKWRITE = 1 << 1, + FAULT_FLAG_ALLOW_RETRY = 1 << 2, + FAULT_FLAG_RETRY_NOWAIT = 1 << 3, + FAULT_FLAG_KILLABLE = 1 << 4, + FAULT_FLAG_TRIED = 1 << 5, + FAULT_FLAG_USER = 1 << 6, + FAULT_FLAG_REMOTE = 1 << 7, + FAULT_FLAG_INSTRUCTION = 1 << 8, + FAULT_FLAG_INTERRUPTIBLE = 1 << 9, +}; /* * The default fault flags that should be used by most of the @@ -496,7 +497,7 @@ extern pgprot_t protection_map[16]; * Return: true if the page fault allows retry and this is the first * attempt of the fault handling; false otherwise. */ -static inline bool fault_flag_allow_retry_first(unsigned int flags) +static inline bool fault_flag_allow_retry_first(enum fault_flag flags) { return (flags & FAULT_FLAG_ALLOW_RETRY) && (!(flags & FAULT_FLAG_TRIED)); @@ -531,7 +532,7 @@ struct vm_fault { pgoff_t pgoff; /* Logical page offset based on vma */ unsigned long address; /* Faulting virtual address */ }; - unsigned int flags; /* FAULT_FLAG_xxx flags + enum fault_flag flags; /* FAULT_FLAG_xxx flags * XXX: should really be 'const' */ pmd_t *pmd; /* Pointer to pmd entry matching * the 'address' */ From a87132a229918fbc9d3cdacc61d3c8ae04e497ce Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Thu, 29 Apr 2021 22:59:34 -0700 Subject: [PATCH 122/175] mm/doc: add mm.h and mm_types.h to the mm-api document kerneldoc in include/linux/mm.h and include/linux/mm_types.h wasn't being included in the html build. Link: https://lkml.kernel.org/r/20210322195022.2143603-4-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Acked-by: Mike Rapoport Cc: John Hubbard Cc: Jonathan Corbet Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- Documentation/core-api/mm-api.rst | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/Documentation/core-api/mm-api.rst b/Documentation/core-api/mm-api.rst index 201b5423303bbb..f1dc5f58feca26 100644 --- a/Documentation/core-api/mm-api.rst +++ b/Documentation/core-api/mm-api.rst @@ -92,3 +92,7 @@ More Memory Management Functions :export: .. kernel-doc:: mm/page_alloc.c +.. kernel-doc:: include/linux/mm_types.h + :internal: +.. kernel-doc:: include/linux/mm.h + :internal: From a3ddd79a17ee1ad43cf0200f158c30515da7b09c Mon Sep 17 00:00:00 2001 From: Lukas Bulwahn Date: Thu, 29 Apr 2021 22:59:37 -0700 Subject: [PATCH 123/175] MAINTAINERS: assign pagewalk.h to MEMORY MANAGEMENT Patch series "kernel-doc and MAINTAINERS clean-up". Roughly 900 warnings of about 21.000 kernel-doc warnings in the kernel tree warn with 'cannot understand function prototype:', i.e., the kernel-doc parser cannot parse the function's signature. The majority, about 600 cases of those, are just struct definitions following the kernel-doc description. Further, spot-check investigations suggest that the authors of the specific kernel-doc descriptions simply were not aware that the general format for a kernel-doc description for a structure requires to prefix the struct name with the keyword 'struct', as in 'struct struct_name - Brief description.'. Details on kernel-doc are at the Link below. Without the struct keyword, kernel-doc does not check if the kernel-doc description fits to the actual struct definition in the source code. Fortunately, in roughly a quarter of these cases, the kernel-doc description is actually complete wrt. its corresponding struct definition. So, the trivial change adding the struct keyword will allow us to keep the kernel-doc descriptions more consistent for future changes, by checking for new kernel-doc warnings. Also, some of the files in ./include/ are not assigned to a specific MAINTAINERS section and hence have no dedicated maintainer. So, if needed, the files in ./include/ are also assigned to the fitting MAINTAINERS section, as I need to identify whom to send the clean-up patch anyway. Here is the change from this kernel-doc janitorial work in the ./include/ directory for MEMORY MANAGEMENT. This patch (of 2): Commit a520110e4a15 ("mm: split out a new pagewalk.h header from mm.h") adds a new file in ./include/linux, but misses to update MAINTAINERS accordingly. Hence, ./scripts/get_maintainers.pl include/linux/pagewalk.h points only to lkml as general fallback for all files, whereas the original include/linux/mm.h clearly marks this file part of MEMORY MANAGEMENT. Assign include/linux/pagewalk.h to MEMORY MANAGEMENT. Link: https://lkml.kernel.org/r/20210322122542.15072-1-lukas.bulwahn@gmail.com Link: https://lkml.kernel.org/r/20210322122542.15072-2-lukas.bulwahn@gmail.com Signed-off-by: Lukas Bulwahn Cc: Joe Perches Cc: Ralf Ramsauer Cc: Jonathan Corbet Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- MAINTAINERS | 1 + 1 file changed, 1 insertion(+) diff --git a/MAINTAINERS b/MAINTAINERS index b2c3af92d57c35..591897e035258b 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -11744,6 +11744,7 @@ F: include/linux/gfp.h F: include/linux/memory_hotplug.h F: include/linux/mm.h F: include/linux/mmzone.h +F: include/linux/pagewalk.h F: include/linux/vmalloc.h F: mm/ From 91ab1a41191ef2d4c6e123951a0f0c3876bd9376 Mon Sep 17 00:00:00 2001 From: Lukas Bulwahn Date: Thu, 29 Apr 2021 22:59:40 -0700 Subject: [PATCH 124/175] pagewalk: prefix struct kernel-doc descriptions The script './scripts/kernel-doc -none ./include/linux/pagewalk.h' reports: include/linux/pagewalk.h:37: warning: cannot understand function prototype: 'struct mm_walk_ops ' include/linux/pagewalk.h:85: warning: cannot understand function prototype: 'struct mm_walk ' A kernel-doc description for a structure requires to prefix the struct name with the keyword 'struct'. So, do that such that no further kernel-doc warnings are reported for this file. Link: https://lkml.kernel.org/r/20210322122542.15072-3-lukas.bulwahn@gmail.com Signed-off-by: Lukas Bulwahn Cc: Joe Perches Cc: Jonathan Corbet Cc: Ralf Ramsauer Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/pagewalk.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/include/linux/pagewalk.h b/include/linux/pagewalk.h index b1cb6b753abb53..ac7b38ad59036d 100644 --- a/include/linux/pagewalk.h +++ b/include/linux/pagewalk.h @@ -7,7 +7,7 @@ struct mm_walk; /** - * mm_walk_ops - callbacks for walk_page_range + * struct mm_walk_ops - callbacks for walk_page_range * @pgd_entry: if set, called for each non-empty PGD (top-level) entry * @p4d_entry: if set, called for each non-empty P4D entry * @pud_entry: if set, called for each non-empty PUD entry @@ -71,7 +71,7 @@ enum page_walk_action { }; /** - * mm_walk - walk_page_range data + * struct mm_walk - walk_page_range data * @ops: operation to call during the walk * @mm: mm_struct representing the target process of page table walk * @pgd: pointer to PGD; only valid with no_vma (otherwise set to NULL) From f76e0c41c0ac7f6ae614dd50ce3e983b974b87c1 Mon Sep 17 00:00:00 2001 From: Zhiyuan Dai Date: Thu, 29 Apr 2021 22:59:43 -0700 Subject: [PATCH 125/175] mm/kasan: switch from strlcpy to strscpy strlcpy is marked as deprecated in Documentation/process/deprecated.rst, and there is no functional difference when the caller expects truncation (when not checking the return value). strscpy is relatively better as it also avoids scanning the whole source string. Link: https://lkml.kernel.org/r/1613970647-23272-1-git-send-email-daizhiyuan@phytium.com.cn Signed-off-by: Zhiyuan Dai Acked-by: Alexander Potapenko Reviewed-by: Andrey Konovalov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/kasan/report_generic.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mm/kasan/report_generic.c b/mm/kasan/report_generic.c index de732bc341c5c7..139615ef326b99 100644 --- a/mm/kasan/report_generic.c +++ b/mm/kasan/report_generic.c @@ -148,7 +148,7 @@ static bool __must_check tokenize_frame_descr(const char **frame_descr, } /* Copy token (+ 1 byte for '\0'). */ - strlcpy(token, *frame_descr, tok_len + 1); + strscpy(token, *frame_descr, tok_len + 1); } /* Advance frame_descr past separator. */ From bfcfe37136d718f5f5846f51df9ff22d13752a5b Mon Sep 17 00:00:00 2001 From: Peter Collingbourne Date: Thu, 29 Apr 2021 22:59:46 -0700 Subject: [PATCH 126/175] kasan: fix kasan_byte_accessible() to be consistent with actual checks We can sometimes end up with kasan_byte_accessible() being called on non-slab memory. For example ksize() and krealloc() may end up calling it on KFENCE allocated memory. In this case the memory will be tagged with KASAN_SHADOW_INIT, which a subsequent patch ("kasan: initialize shadow to TAG_INVALID for SW_TAGS") will set to the same value as KASAN_TAG_INVALID, causing kasan_byte_accessible() to fail when called on non-slab memory. This highlighted the fact that the check in kasan_byte_accessible() was inconsistent with checks as implemented for loads and stores (kasan_check_range() in SW tags mode and hardware-implemented checks in HW tags mode). kasan_check_range() does not have a check for KASAN_TAG_INVALID, and instead has a comparison against KASAN_SHADOW_START. In HW tags mode, we do not have either, but we do set TCR_EL1.TCMA which corresponds with the comparison against KASAN_TAG_KERNEL. Therefore, update kasan_byte_accessible() for both SW and HW tags modes to correspond with the respective checks on loads and stores. Link: https://linux-review.googlesource.com/id/Ic6d40803c57dcc6331bd97fbb9a60b0d38a65a36 Link: https://lkml.kernel.org/r/20210405220647.1965262-1-pcc@google.com Signed-off-by: Peter Collingbourne Reviewed-by: Andrey Konovalov Reviewed-by: Marco Elver Cc: Dmitry Vyukov Cc: Alexander Potapenko Cc: Peter Collingbourne Cc: Evgenii Stepanov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/kasan/kasan.h | 3 +-- mm/kasan/sw_tags.c | 10 +++++++--- 2 files changed, 8 insertions(+), 5 deletions(-) diff --git a/mm/kasan/kasan.h b/mm/kasan/kasan.h index c1581e8a9b8e00..552f8381d98862 100644 --- a/mm/kasan/kasan.h +++ b/mm/kasan/kasan.h @@ -396,8 +396,7 @@ static inline bool kasan_byte_accessible(const void *addr) u8 ptr_tag = get_tag(addr); u8 mem_tag = hw_get_mem_tag((void *)addr); - return (mem_tag != KASAN_TAG_INVALID) && - (ptr_tag == KASAN_TAG_KERNEL || ptr_tag == mem_tag); + return ptr_tag == KASAN_TAG_KERNEL || ptr_tag == mem_tag; } #else /* CONFIG_KASAN_HW_TAGS */ diff --git a/mm/kasan/sw_tags.c b/mm/kasan/sw_tags.c index 94c2d33be3332c..00ae8913fc747e 100644 --- a/mm/kasan/sw_tags.c +++ b/mm/kasan/sw_tags.c @@ -121,10 +121,14 @@ bool kasan_check_range(unsigned long addr, size_t size, bool write, bool kasan_byte_accessible(const void *addr) { u8 tag = get_tag(addr); - u8 shadow_byte = READ_ONCE(*(u8 *)kasan_mem_to_shadow(kasan_reset_tag(addr))); + void *untagged_addr = kasan_reset_tag(addr); + u8 shadow_byte; - return (shadow_byte != KASAN_TAG_INVALID) && - (tag == KASAN_TAG_KERNEL || tag == shadow_byte); + if (untagged_addr < kasan_shadow_to_mem((void *)KASAN_SHADOW_START)) + return false; + + shadow_byte = READ_ONCE(*(u8 *)kasan_mem_to_shadow(untagged_addr)); + return tag == KASAN_TAG_KERNEL || tag == shadow_byte; } #define DEFINE_HWASAN_LOAD_STORE(size) \ From a064cb00d359bc464df6fd2ab6dfb8dc4b31e361 Mon Sep 17 00:00:00 2001 From: Andrey Konovalov Date: Thu, 29 Apr 2021 22:59:49 -0700 Subject: [PATCH 127/175] kasan: initialize shadow to TAG_INVALID for SW_TAGS Currently, KASAN_SW_TAGS uses 0xFF as the default tag value for unallocated memory. The underlying idea is that since that memory hasn't been allocated yet, it's only supposed to be dereferenced through a pointer with the native 0xFF tag. While this is a good idea in terms on consistency, practically it doesn't bring any benefit. Since the 0xFF pointer tag is a match-all tag, it doesn't matter what tag the accessed memory has. No accesses through 0xFF-tagged pointers are considered buggy by KASAN. This patch changes the default tag value for unallocated memory to 0xFE, which is the tag KASAN uses for inaccessible memory. This doesn't affect accesses through 0xFF-tagged pointer to this memory, but this allows KASAN to detect wild and large out-of-bounds invalid memory accesses through otherwise-tagged pointers. This is a prepatory patch for the next one, which changes the tag-based KASAN modes to not poison the boot memory. Link: https://lkml.kernel.org/r/c8e93571c18b3528aac5eb33ade213bf133d10ad.1613692950.git.andreyknvl@google.com Signed-off-by: Andrey Konovalov Cc: Alexander Potapenko Cc: Andrey Ryabinin Cc: Branislav Rankov Cc: Catalin Marinas Cc: Christoph Hellwig Cc: Dmitry Vyukov Cc: Evgenii Stepanov Cc: Kevin Brodsky Cc: Marco Elver Cc: Peter Collingbourne Cc: Vincenzo Frascino Cc: Will Deacon Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/kasan.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/include/linux/kasan.h b/include/linux/kasan.h index d53ea3c047bcde..9f5faefd1744f9 100644 --- a/include/linux/kasan.h +++ b/include/linux/kasan.h @@ -30,7 +30,8 @@ struct kunit_kasan_expectation { /* Software KASAN implementations use shadow memory. */ #ifdef CONFIG_KASAN_SW_TAGS -#define KASAN_SHADOW_INIT 0xFF +/* This matches KASAN_TAG_INVALID. */ +#define KASAN_SHADOW_INIT 0xFE #else #define KASAN_SHADOW_INIT 0 #endif From 2c3356809802037de8ecd24538361dba151812fc Mon Sep 17 00:00:00 2001 From: Andrey Konovalov Date: Thu, 29 Apr 2021 22:59:52 -0700 Subject: [PATCH 128/175] mm, kasan: don't poison boot memory with tag-based modes During boot, all non-reserved memblock memory is exposed to page_alloc via memblock_free_pages->__free_pages_core(). This results in kasan_free_pages() being called, which poisons that memory. Poisoning all that memory lengthens boot time. The most noticeable effect is observed with the HW_TAGS mode. A boot-time impact may potentially also affect systems with large amount of RAM. This patch changes the tag-based modes to not poison the memory during the memblock->page_alloc transition. An exception is made for KASAN_GENERIC. Since it marks all new memory as accessible, not poisoning the memory released from memblock will lead to KASAN missing invalid boot-time accesses to that memory. With KASAN_SW_TAGS, as it uses the invalid 0xFE tag as the default tag for all memory, it won't miss bad boot-time accesses even if the poisoning of memblock memory is removed. With KASAN_HW_TAGS, the default memory tags values are unspecified. Therefore, if memblock poisoning is removed, this KASAN mode will miss the mentioned type of boot-time bugs with a 1/16 probability. This is taken as an acceptable trafe-off. Internally, the poisoning is removed as follows. __free_pages_core() is used when exposing fresh memory during system boot and when onlining memory during hotplug. This patch adds a new FPI_SKIP_KASAN_POISON flag and passes it to __free_pages_ok() through free_pages_prepare() from __free_pages_core(). If FPI_SKIP_KASAN_POISON is set, kasan_free_pages() is not called. All memory allocated normally when the boot is over keeps getting poisoned as usual. Link: https://lkml.kernel.org/r/a0570dc1e3a8f39a55aa343a1fc08cd5c2d4cad6.1613692950.git.andreyknvl@google.com Signed-off-by: Andrey Konovalov Reviewed-by: Catalin Marinas Cc: Catalin Marinas Cc: Will Deacon Cc: Vincenzo Frascino Cc: Dmitry Vyukov Cc: Andrey Ryabinin Cc: Alexander Potapenko Cc: Marco Elver Cc: Peter Collingbourne Cc: Evgenii Stepanov Cc: Branislav Rankov Cc: Kevin Brodsky Cc: Christoph Hellwig Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_alloc.c | 45 ++++++++++++++++++++++++++++++++++----------- 1 file changed, 34 insertions(+), 11 deletions(-) diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 39ff5c604cef3a..342cabf912a7ea 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -109,6 +109,17 @@ typedef int __bitwise fpi_t; */ #define FPI_TO_TAIL ((__force fpi_t)BIT(1)) +/* + * Don't poison memory with KASAN (only for the tag-based modes). + * During boot, all non-reserved memblock memory is exposed to page_alloc. + * Poisoning all that memory lengthens boot time, especially on systems with + * large amount of RAM. This flag is used to skip that poisoning. + * This is only done for the tag-based KASAN modes, as those are able to + * detect memory corruptions with the memory tags assigned by default. + * All memory allocated normally after boot gets poisoned as usual. + */ +#define FPI_SKIP_KASAN_POISON ((__force fpi_t)BIT(2)) + /* prevent >1 _updater_ of zone percpu pageset ->high and ->batch fields */ static DEFINE_MUTEX(pcp_batch_high_lock); #define MIN_PERCPU_PAGELIST_FRACTION (8) @@ -385,10 +396,15 @@ static DEFINE_STATIC_KEY_TRUE(deferred_pages); * on-demand allocation and then freed again before the deferred pages * initialization is done, but this is not likely to happen. */ -static inline void kasan_free_nondeferred_pages(struct page *page, int order) +static inline void kasan_free_nondeferred_pages(struct page *page, int order, + fpi_t fpi_flags) { - if (!static_branch_unlikely(&deferred_pages)) - kasan_free_pages(page, order); + if (static_branch_unlikely(&deferred_pages)) + return; + if (!IS_ENABLED(CONFIG_KASAN_GENERIC) && + (fpi_flags & FPI_SKIP_KASAN_POISON)) + return; + kasan_free_pages(page, order); } /* Returns true if the struct page for the pfn is uninitialised */ @@ -439,7 +455,14 @@ defer_init(int nid, unsigned long pfn, unsigned long end_pfn) return false; } #else -#define kasan_free_nondeferred_pages(p, o) kasan_free_pages(p, o) +static inline void kasan_free_nondeferred_pages(struct page *page, int order, + fpi_t fpi_flags) +{ + if (!IS_ENABLED(CONFIG_KASAN_GENERIC) && + (fpi_flags & FPI_SKIP_KASAN_POISON)) + return; + kasan_free_pages(page, order); +} static inline bool early_page_uninitialised(unsigned long pfn) { @@ -1217,7 +1240,7 @@ static void kernel_init_free_pages(struct page *page, int numpages) } static __always_inline bool free_pages_prepare(struct page *page, - unsigned int order, bool check_free) + unsigned int order, bool check_free, fpi_t fpi_flags) { int bad = 0; @@ -1286,7 +1309,7 @@ static __always_inline bool free_pages_prepare(struct page *page, * With hardware tag-based KASAN, memory tags must be set before the * page becomes unavailable via debug_pagealloc or arch_free_page. */ - kasan_free_nondeferred_pages(page, order); + kasan_free_nondeferred_pages(page, order, fpi_flags); /* * arch_free_page() can make the page's contents inaccessible. s390 @@ -1308,7 +1331,7 @@ static __always_inline bool free_pages_prepare(struct page *page, */ static bool free_pcp_prepare(struct page *page) { - return free_pages_prepare(page, 0, true); + return free_pages_prepare(page, 0, true, FPI_NONE); } static bool bulkfree_pcp_prepare(struct page *page) @@ -1328,9 +1351,9 @@ static bool bulkfree_pcp_prepare(struct page *page) static bool free_pcp_prepare(struct page *page) { if (debug_pagealloc_enabled_static()) - return free_pages_prepare(page, 0, true); + return free_pages_prepare(page, 0, true, FPI_NONE); else - return free_pages_prepare(page, 0, false); + return free_pages_prepare(page, 0, false, FPI_NONE); } static bool bulkfree_pcp_prepare(struct page *page) @@ -1538,7 +1561,7 @@ static void __free_pages_ok(struct page *page, unsigned int order, int migratetype; unsigned long pfn = page_to_pfn(page); - if (!free_pages_prepare(page, order, true)) + if (!free_pages_prepare(page, order, true, fpi_flags)) return; migratetype = get_pfnblock_migratetype(page, pfn); @@ -1575,7 +1598,7 @@ void __free_pages_core(struct page *page, unsigned int order) * Bypass PCP and place fresh pages right to the tail, primarily * relevant for memory onlining. */ - __free_pages_ok(page, order, FPI_TO_TAIL); + __free_pages_ok(page, order, FPI_TO_TAIL | FPI_SKIP_KASAN_POISON); } #ifdef CONFIG_NEED_MULTIPLE_NODES From d9b6f90794ba2a2f47d1646cda343924b092b3c2 Mon Sep 17 00:00:00 2001 From: Andrey Konovalov Date: Thu, 29 Apr 2021 22:59:55 -0700 Subject: [PATCH 129/175] arm64: kasan: allow to init memory when setting tags Patch series "kasan: integrate with init_on_alloc/free", v3. This patch series integrates HW_TAGS KASAN with init_on_alloc/free by initializing memory via the same arm64 instruction that sets memory tags. This is expected to improve HW_TAGS KASAN performance when init_on_alloc/free is enabled. The exact perfomance numbers are unknown as MTE-enabled hardware doesn't exist yet. This patch (of 5): This change adds an argument to mte_set_mem_tag_range() that allows to enable memory initialization when settinh the allocation tags. The implementation uses stzg instruction instead of stg when this argument indicates to initialize memory. Combining setting allocation tags with memory initialization will improve HW_TAGS KASAN performance when init_on_alloc/free is enabled. This change doesn't integrate memory initialization with KASAN, this is done is subsequent patches in this series. Link: https://lkml.kernel.org/r/cover.1615296150.git.andreyknvl@google.com Link: https://lkml.kernel.org/r/d04ae90cc36be3fe246ea8025e5085495681c3d7.1615296150.git.andreyknvl@google.com Signed-off-by: Andrey Konovalov Acked-by: Marco Elver Cc: Christoph Lameter Cc: Pekka Enberg Cc: David Rientjes Cc: Joonsoo Kim Cc: Vlastimil Babka Cc: Catalin Marinas Cc: Will Deacon Cc: Vincenzo Frascino Cc: Dmitry Vyukov Cc: Andrey Ryabinin Cc: Alexander Potapenko Cc: Peter Collingbourne Cc: Evgenii Stepanov Cc: Branislav Rankov Cc: Kevin Brodsky Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/arm64/include/asm/memory.h | 4 +-- arch/arm64/include/asm/mte-kasan.h | 39 +++++++++++++++++++----------- mm/kasan/kasan.h | 9 ++++--- 3 files changed, 32 insertions(+), 20 deletions(-) diff --git a/arch/arm64/include/asm/memory.h b/arch/arm64/include/asm/memory.h index e6c7417bfb9290..6d9915d066fa8c 100644 --- a/arch/arm64/include/asm/memory.h +++ b/arch/arm64/include/asm/memory.h @@ -250,8 +250,8 @@ static inline const void *__tag_set(const void *addr, u8 tag) #define arch_init_tags(max_tag) mte_init_tags(max_tag) #define arch_get_random_tag() mte_get_random_tag() #define arch_get_mem_tag(addr) mte_get_mem_tag(addr) -#define arch_set_mem_tag_range(addr, size, tag) \ - mte_set_mem_tag_range((addr), (size), (tag)) +#define arch_set_mem_tag_range(addr, size, tag, init) \ + mte_set_mem_tag_range((addr), (size), (tag), (init)) #endif /* CONFIG_KASAN_HW_TAGS */ /* diff --git a/arch/arm64/include/asm/mte-kasan.h b/arch/arm64/include/asm/mte-kasan.h index 4acf8bf41cade6..ddd4d17cf9a07b 100644 --- a/arch/arm64/include/asm/mte-kasan.h +++ b/arch/arm64/include/asm/mte-kasan.h @@ -53,7 +53,8 @@ static inline u8 mte_get_random_tag(void) * Note: The address must be non-NULL and MTE_GRANULE_SIZE aligned and * size must be non-zero and MTE_GRANULE_SIZE aligned. */ -static inline void mte_set_mem_tag_range(void *addr, size_t size, u8 tag) +static inline void mte_set_mem_tag_range(void *addr, size_t size, + u8 tag, bool init) { u64 curr, end; @@ -63,18 +64,27 @@ static inline void mte_set_mem_tag_range(void *addr, size_t size, u8 tag) curr = (u64)__tag_set(addr, tag); end = curr + size; - do { - /* - * 'asm volatile' is required to prevent the compiler to move - * the statement outside of the loop. - */ - asm volatile(__MTE_PREAMBLE "stg %0, [%0]" - : - : "r" (curr) - : "memory"); - - curr += MTE_GRANULE_SIZE; - } while (curr != end); + /* + * 'asm volatile' is required to prevent the compiler to move + * the statement outside of the loop. + */ + if (init) { + do { + asm volatile(__MTE_PREAMBLE "stzg %0, [%0]" + : + : "r" (curr) + : "memory"); + curr += MTE_GRANULE_SIZE; + } while (curr != end); + } else { + do { + asm volatile(__MTE_PREAMBLE "stg %0, [%0]" + : + : "r" (curr) + : "memory"); + curr += MTE_GRANULE_SIZE; + } while (curr != end); + } } void mte_enable_kernel_sync(void); @@ -101,7 +111,8 @@ static inline u8 mte_get_random_tag(void) return 0xFF; } -static inline void mte_set_mem_tag_range(void *addr, size_t size, u8 tag) +static inline void mte_set_mem_tag_range(void *addr, size_t size, + u8 tag, bool init) { } diff --git a/mm/kasan/kasan.h b/mm/kasan/kasan.h index 552f8381d98862..0583033a7df617 100644 --- a/mm/kasan/kasan.h +++ b/mm/kasan/kasan.h @@ -314,7 +314,7 @@ static inline const void *arch_kasan_set_tag(const void *addr, u8 tag) #define arch_get_mem_tag(addr) (0xFF) #endif #ifndef arch_set_mem_tag_range -#define arch_set_mem_tag_range(addr, size, tag) ((void *)(addr)) +#define arch_set_mem_tag_range(addr, size, tag, init) ((void *)(addr)) #endif #define hw_enable_tagging_sync() arch_enable_tagging_sync() @@ -324,7 +324,8 @@ static inline const void *arch_kasan_set_tag(const void *addr, u8 tag) #define hw_force_async_tag_fault() arch_force_async_tag_fault() #define hw_get_random_tag() arch_get_random_tag() #define hw_get_mem_tag(addr) arch_get_mem_tag(addr) -#define hw_set_mem_tag_range(addr, size, tag) arch_set_mem_tag_range((addr), (size), (tag)) +#define hw_set_mem_tag_range(addr, size, tag, init) \ + arch_set_mem_tag_range((addr), (size), (tag), (init)) #else /* CONFIG_KASAN_HW_TAGS */ @@ -371,7 +372,7 @@ static inline void kasan_poison(const void *addr, size_t size, u8 value) if (WARN_ON(size & KASAN_GRANULE_MASK)) return; - hw_set_mem_tag_range((void *)addr, size, value); + hw_set_mem_tag_range((void *)addr, size, value, false); } static inline void kasan_unpoison(const void *addr, size_t size) @@ -388,7 +389,7 @@ static inline void kasan_unpoison(const void *addr, size_t size) return; size = round_up(size, KASAN_GRANULE_SIZE); - hw_set_mem_tag_range((void *)addr, size, tag); + hw_set_mem_tag_range((void *)addr, size, tag, false); } static inline bool kasan_byte_accessible(const void *addr) From aa5c219c60ccb75b50c16329885b65c275172e4a Mon Sep 17 00:00:00 2001 From: Andrey Konovalov Date: Thu, 29 Apr 2021 22:59:59 -0700 Subject: [PATCH 130/175] kasan: init memory in kasan_(un)poison for HW_TAGS This change adds an argument to kasan_poison() and kasan_unpoison() that allows initializing memory along with setting the tags for HW_TAGS. Combining setting allocation tags with memory initialization will improve HW_TAGS KASAN performance when init_on_alloc/free is enabled. This change doesn't integrate memory initialization with KASAN, this is done is subsequent patches in this series. Link: https://lkml.kernel.org/r/3054314039fa64510947e674180d675cab1b4c41.1615296150.git.andreyknvl@google.com Signed-off-by: Andrey Konovalov Reviewed-by: Marco Elver Cc: Alexander Potapenko Cc: Andrey Ryabinin Cc: Branislav Rankov Cc: Catalin Marinas Cc: Christoph Lameter Cc: David Rientjes Cc: Dmitry Vyukov Cc: Evgenii Stepanov Cc: Joonsoo Kim Cc: Kevin Brodsky Cc: Pekka Enberg Cc: Peter Collingbourne Cc: Vincenzo Frascino Cc: Vlastimil Babka Cc: Will Deacon Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- lib/test_kasan.c | 4 ++-- mm/kasan/common.c | 28 ++++++++++++++-------------- mm/kasan/generic.c | 12 ++++++------ mm/kasan/kasan.h | 14 ++++++++------ mm/kasan/shadow.c | 10 +++++----- mm/kasan/sw_tags.c | 2 +- 6 files changed, 36 insertions(+), 34 deletions(-) diff --git a/lib/test_kasan.c b/lib/test_kasan.c index 785e724ce0d888..0882d6c17e62ea 100644 --- a/lib/test_kasan.c +++ b/lib/test_kasan.c @@ -1049,14 +1049,14 @@ static void match_all_mem_tag(struct kunit *test) continue; /* Mark the first memory granule with the chosen memory tag. */ - kasan_poison(ptr, KASAN_GRANULE_SIZE, (u8)tag); + kasan_poison(ptr, KASAN_GRANULE_SIZE, (u8)tag, false); /* This access must cause a KASAN report. */ KUNIT_EXPECT_KASAN_FAIL(test, *ptr = 0); } /* Recover the memory tag and free. */ - kasan_poison(ptr, KASAN_GRANULE_SIZE, get_tag(ptr)); + kasan_poison(ptr, KASAN_GRANULE_SIZE, get_tag(ptr), false); kfree(ptr); } diff --git a/mm/kasan/common.c b/mm/kasan/common.c index 7b53291dafa175..b3d8eb154f59fe 100644 --- a/mm/kasan/common.c +++ b/mm/kasan/common.c @@ -60,7 +60,7 @@ void kasan_disable_current(void) void __kasan_unpoison_range(const void *address, size_t size) { - kasan_unpoison(address, size); + kasan_unpoison(address, size, false); } #ifdef CONFIG_KASAN_STACK @@ -69,7 +69,7 @@ void kasan_unpoison_task_stack(struct task_struct *task) { void *base = task_stack_page(task); - kasan_unpoison(base, THREAD_SIZE); + kasan_unpoison(base, THREAD_SIZE, false); } /* Unpoison the stack for the current task beyond a watermark sp value. */ @@ -82,7 +82,7 @@ asmlinkage void kasan_unpoison_task_stack_below(const void *watermark) */ void *base = (void *)((unsigned long)watermark & ~(THREAD_SIZE - 1)); - kasan_unpoison(base, watermark - base); + kasan_unpoison(base, watermark - base, false); } #endif /* CONFIG_KASAN_STACK */ @@ -108,14 +108,14 @@ void __kasan_alloc_pages(struct page *page, unsigned int order) tag = kasan_random_tag(); for (i = 0; i < (1 << order); i++) page_kasan_tag_set(page + i, tag); - kasan_unpoison(page_address(page), PAGE_SIZE << order); + kasan_unpoison(page_address(page), PAGE_SIZE << order, false); } void __kasan_free_pages(struct page *page, unsigned int order) { if (likely(!PageHighMem(page))) kasan_poison(page_address(page), PAGE_SIZE << order, - KASAN_FREE_PAGE); + KASAN_FREE_PAGE, false); } /* @@ -251,18 +251,18 @@ void __kasan_poison_slab(struct page *page) for (i = 0; i < compound_nr(page); i++) page_kasan_tag_reset(page + i); kasan_poison(page_address(page), page_size(page), - KASAN_KMALLOC_REDZONE); + KASAN_KMALLOC_REDZONE, false); } void __kasan_unpoison_object_data(struct kmem_cache *cache, void *object) { - kasan_unpoison(object, cache->object_size); + kasan_unpoison(object, cache->object_size, false); } void __kasan_poison_object_data(struct kmem_cache *cache, void *object) { kasan_poison(object, round_up(cache->object_size, KASAN_GRANULE_SIZE), - KASAN_KMALLOC_REDZONE); + KASAN_KMALLOC_REDZONE, false); } /* @@ -351,7 +351,7 @@ static inline bool ____kasan_slab_free(struct kmem_cache *cache, } kasan_poison(object, round_up(cache->object_size, KASAN_GRANULE_SIZE), - KASAN_KMALLOC_FREE); + KASAN_KMALLOC_FREE, false); if ((IS_ENABLED(CONFIG_KASAN_GENERIC) && !quarantine)) return false; @@ -407,7 +407,7 @@ void __kasan_slab_free_mempool(void *ptr, unsigned long ip) if (unlikely(!PageSlab(page))) { if (____kasan_kfree_large(ptr, ip)) return; - kasan_poison(ptr, page_size(page), KASAN_FREE_PAGE); + kasan_poison(ptr, page_size(page), KASAN_FREE_PAGE, false); } else { ____kasan_slab_free(page->slab_cache, ptr, ip, false); } @@ -453,7 +453,7 @@ void * __must_check __kasan_slab_alloc(struct kmem_cache *cache, * Unpoison the whole object. * For kmalloc() allocations, kasan_kmalloc() will do precise poisoning. */ - kasan_unpoison(tagged_object, cache->object_size); + kasan_unpoison(tagged_object, cache->object_size, false); /* Save alloc info (if possible) for non-kmalloc() allocations. */ if (kasan_stack_collection_enabled()) @@ -496,7 +496,7 @@ static inline void *____kasan_kmalloc(struct kmem_cache *cache, redzone_end = round_up((unsigned long)(object + cache->object_size), KASAN_GRANULE_SIZE); kasan_poison((void *)redzone_start, redzone_end - redzone_start, - KASAN_KMALLOC_REDZONE); + KASAN_KMALLOC_REDZONE, false); /* * Save alloc info (if possible) for kmalloc() allocations. @@ -546,7 +546,7 @@ void * __must_check __kasan_kmalloc_large(const void *ptr, size_t size, KASAN_GRANULE_SIZE); redzone_end = (unsigned long)ptr + page_size(virt_to_page(ptr)); kasan_poison((void *)redzone_start, redzone_end - redzone_start, - KASAN_PAGE_REDZONE); + KASAN_PAGE_REDZONE, false); return (void *)ptr; } @@ -563,7 +563,7 @@ void * __must_check __kasan_krealloc(const void *object, size_t size, gfp_t flag * Part of it might already have been unpoisoned, but it's unknown * how big that part is. */ - kasan_unpoison(object, size); + kasan_unpoison(object, size, false); page = virt_to_head_page(object); diff --git a/mm/kasan/generic.c b/mm/kasan/generic.c index 2e55e0f82f39c3..53cbf28859b5a8 100644 --- a/mm/kasan/generic.c +++ b/mm/kasan/generic.c @@ -208,11 +208,11 @@ static void register_global(struct kasan_global *global) { size_t aligned_size = round_up(global->size, KASAN_GRANULE_SIZE); - kasan_unpoison(global->beg, global->size); + kasan_unpoison(global->beg, global->size, false); kasan_poison(global->beg + aligned_size, global->size_with_redzone - aligned_size, - KASAN_GLOBAL_REDZONE); + KASAN_GLOBAL_REDZONE, false); } void __asan_register_globals(struct kasan_global *globals, size_t size) @@ -292,11 +292,11 @@ void __asan_alloca_poison(unsigned long addr, size_t size) WARN_ON(!IS_ALIGNED(addr, KASAN_ALLOCA_REDZONE_SIZE)); kasan_unpoison((const void *)(addr + rounded_down_size), - size - rounded_down_size); + size - rounded_down_size, false); kasan_poison(left_redzone, KASAN_ALLOCA_REDZONE_SIZE, - KASAN_ALLOCA_LEFT); + KASAN_ALLOCA_LEFT, false); kasan_poison(right_redzone, padding_size + KASAN_ALLOCA_REDZONE_SIZE, - KASAN_ALLOCA_RIGHT); + KASAN_ALLOCA_RIGHT, false); } EXPORT_SYMBOL(__asan_alloca_poison); @@ -306,7 +306,7 @@ void __asan_allocas_unpoison(const void *stack_top, const void *stack_bottom) if (unlikely(!stack_top || stack_top > stack_bottom)) return; - kasan_unpoison(stack_top, stack_bottom - stack_top); + kasan_unpoison(stack_top, stack_bottom - stack_top, false); } EXPORT_SYMBOL(__asan_allocas_unpoison); diff --git a/mm/kasan/kasan.h b/mm/kasan/kasan.h index 0583033a7df617..071c456a357953 100644 --- a/mm/kasan/kasan.h +++ b/mm/kasan/kasan.h @@ -359,7 +359,7 @@ static inline u8 kasan_random_tag(void) { return 0; } #ifdef CONFIG_KASAN_HW_TAGS -static inline void kasan_poison(const void *addr, size_t size, u8 value) +static inline void kasan_poison(const void *addr, size_t size, u8 value, bool init) { addr = kasan_reset_tag(addr); @@ -372,10 +372,10 @@ static inline void kasan_poison(const void *addr, size_t size, u8 value) if (WARN_ON(size & KASAN_GRANULE_MASK)) return; - hw_set_mem_tag_range((void *)addr, size, value, false); + hw_set_mem_tag_range((void *)addr, size, value, init); } -static inline void kasan_unpoison(const void *addr, size_t size) +static inline void kasan_unpoison(const void *addr, size_t size, bool init) { u8 tag = get_tag(addr); @@ -389,7 +389,7 @@ static inline void kasan_unpoison(const void *addr, size_t size) return; size = round_up(size, KASAN_GRANULE_SIZE); - hw_set_mem_tag_range((void *)addr, size, tag, false); + hw_set_mem_tag_range((void *)addr, size, tag, init); } static inline bool kasan_byte_accessible(const void *addr) @@ -407,22 +407,24 @@ static inline bool kasan_byte_accessible(const void *addr) * @addr - range start address, must be aligned to KASAN_GRANULE_SIZE * @size - range size, must be aligned to KASAN_GRANULE_SIZE * @value - value that's written to metadata for the range + * @init - whether to initialize the memory range (only for hardware tag-based) * * The size gets aligned to KASAN_GRANULE_SIZE before marking the range. */ -void kasan_poison(const void *addr, size_t size, u8 value); +void kasan_poison(const void *addr, size_t size, u8 value, bool init); /** * kasan_unpoison - mark the memory range as accessible * @addr - range start address, must be aligned to KASAN_GRANULE_SIZE * @size - range size, can be unaligned + * @init - whether to initialize the memory range (only for hardware tag-based) * * For the tag-based modes, the @size gets aligned to KASAN_GRANULE_SIZE before * marking the range. * For the generic mode, the last granule of the memory range gets partially * unpoisoned based on the @size. */ -void kasan_unpoison(const void *addr, size_t size); +void kasan_unpoison(const void *addr, size_t size, bool init); bool kasan_byte_accessible(const void *addr); diff --git a/mm/kasan/shadow.c b/mm/kasan/shadow.c index 63f43443f5d772..727ad46291731f 100644 --- a/mm/kasan/shadow.c +++ b/mm/kasan/shadow.c @@ -69,7 +69,7 @@ void *memcpy(void *dest, const void *src, size_t len) return __memcpy(dest, src, len); } -void kasan_poison(const void *addr, size_t size, u8 value) +void kasan_poison(const void *addr, size_t size, u8 value, bool init) { void *shadow_start, *shadow_end; @@ -106,7 +106,7 @@ void kasan_poison_last_granule(const void *addr, size_t size) } #endif -void kasan_unpoison(const void *addr, size_t size) +void kasan_unpoison(const void *addr, size_t size, bool init) { u8 tag = get_tag(addr); @@ -129,7 +129,7 @@ void kasan_unpoison(const void *addr, size_t size) return; /* Unpoison all granules that cover the object. */ - kasan_poison(addr, round_up(size, KASAN_GRANULE_SIZE), tag); + kasan_poison(addr, round_up(size, KASAN_GRANULE_SIZE), tag, false); /* Partially poison the last granule for the generic mode. */ if (IS_ENABLED(CONFIG_KASAN_GENERIC)) @@ -344,7 +344,7 @@ void kasan_poison_vmalloc(const void *start, unsigned long size) return; size = round_up(size, KASAN_GRANULE_SIZE); - kasan_poison(start, size, KASAN_VMALLOC_INVALID); + kasan_poison(start, size, KASAN_VMALLOC_INVALID, false); } void kasan_unpoison_vmalloc(const void *start, unsigned long size) @@ -352,7 +352,7 @@ void kasan_unpoison_vmalloc(const void *start, unsigned long size) if (!is_vmalloc_or_module_addr(start)) return; - kasan_unpoison(start, size); + kasan_unpoison(start, size, false); } static int kasan_depopulate_vmalloc_pte(pte_t *ptep, unsigned long addr, diff --git a/mm/kasan/sw_tags.c b/mm/kasan/sw_tags.c index 00ae8913fc747e..9df8e7f69e8709 100644 --- a/mm/kasan/sw_tags.c +++ b/mm/kasan/sw_tags.c @@ -163,7 +163,7 @@ EXPORT_SYMBOL(__hwasan_storeN_noabort); void __hwasan_tag_memory(unsigned long addr, u8 tag, unsigned long size) { - kasan_poison((void *)addr, size, tag); + kasan_poison((void *)addr, size, tag, false); } EXPORT_SYMBOL(__hwasan_tag_memory); From 1bb5eab30d68c1a3d9dbc822e1895e6c06dbe748 Mon Sep 17 00:00:00 2001 From: Andrey Konovalov Date: Thu, 29 Apr 2021 23:00:02 -0700 Subject: [PATCH 131/175] kasan, mm: integrate page_alloc init with HW_TAGS This change uses the previously added memory initialization feature of HW_TAGS KASAN routines for page_alloc memory when init_on_alloc/free is enabled. With this change, kernel_init_free_pages() is no longer called when both HW_TAGS KASAN and init_on_alloc/free are enabled. Instead, memory is initialized in KASAN runtime. To avoid discrepancies with which memory gets initialized that can be caused by future changes, both KASAN and kernel_init_free_pages() hooks are put together and a warning comment is added. This patch changes the order in which memory initialization and page poisoning hooks are called. This doesn't lead to any side-effects, as whenever page poisoning is enabled, memory initialization gets disabled. Combining setting allocation tags with memory initialization improves HW_TAGS KASAN performance when init_on_alloc/free is enabled. [andreyknvl@google.com: fix for "integrate page_alloc init with HW_TAGS"] Link: https://lkml.kernel.org/r/65b6028dea2e9a6e8e2cb779b5115c09457363fc.1617122211.git.andreyknvl@google.com Link: https://lkml.kernel.org/r/e77f0d5b1b20658ef0b8288625c74c2b3690e725.1615296150.git.andreyknvl@google.com Signed-off-by: Andrey Konovalov Reviewed-by: Marco Elver Tested-by: Vlastimil Babka Reviewed-by: Sergei Trofimovich Cc: Alexander Potapenko Cc: Andrey Ryabinin Cc: Branislav Rankov Cc: Catalin Marinas Cc: Christoph Lameter Cc: David Rientjes Cc: Dmitry Vyukov Cc: Evgenii Stepanov Cc: Joonsoo Kim Cc: Kevin Brodsky Cc: Pekka Enberg Cc: Peter Collingbourne Cc: Vincenzo Frascino Cc: Will Deacon Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/kasan.h | 30 ++++++++++++++++++++++-------- mm/kasan/common.c | 8 ++++---- mm/mempool.c | 4 ++-- mm/page_alloc.c | 41 +++++++++++++++++++++++++++++++---------- 4 files changed, 59 insertions(+), 24 deletions(-) diff --git a/include/linux/kasan.h b/include/linux/kasan.h index 9f5faefd1744f9..30aa2bee84002b 100644 --- a/include/linux/kasan.h +++ b/include/linux/kasan.h @@ -96,6 +96,11 @@ static __always_inline bool kasan_enabled(void) return static_branch_likely(&kasan_flag_enabled); } +static inline bool kasan_has_integrated_init(void) +{ + return kasan_enabled(); +} + #else /* CONFIG_KASAN_HW_TAGS */ static inline bool kasan_enabled(void) @@ -103,6 +108,11 @@ static inline bool kasan_enabled(void) return true; } +static inline bool kasan_has_integrated_init(void) +{ + return false; +} + #endif /* CONFIG_KASAN_HW_TAGS */ slab_flags_t __kasan_never_merge(void); @@ -120,20 +130,20 @@ static __always_inline void kasan_unpoison_range(const void *addr, size_t size) __kasan_unpoison_range(addr, size); } -void __kasan_alloc_pages(struct page *page, unsigned int order); +void __kasan_alloc_pages(struct page *page, unsigned int order, bool init); static __always_inline void kasan_alloc_pages(struct page *page, - unsigned int order) + unsigned int order, bool init) { if (kasan_enabled()) - __kasan_alloc_pages(page, order); + __kasan_alloc_pages(page, order, init); } -void __kasan_free_pages(struct page *page, unsigned int order); +void __kasan_free_pages(struct page *page, unsigned int order, bool init); static __always_inline void kasan_free_pages(struct page *page, - unsigned int order) + unsigned int order, bool init) { if (kasan_enabled()) - __kasan_free_pages(page, order); + __kasan_free_pages(page, order, init); } void __kasan_cache_create(struct kmem_cache *cache, unsigned int *size, @@ -277,13 +287,17 @@ static inline bool kasan_enabled(void) { return false; } +static inline bool kasan_has_integrated_init(void) +{ + return false; +} static inline slab_flags_t kasan_never_merge(void) { return 0; } static inline void kasan_unpoison_range(const void *address, size_t size) {} -static inline void kasan_alloc_pages(struct page *page, unsigned int order) {} -static inline void kasan_free_pages(struct page *page, unsigned int order) {} +static inline void kasan_alloc_pages(struct page *page, unsigned int order, bool init) {} +static inline void kasan_free_pages(struct page *page, unsigned int order, bool init) {} static inline void kasan_cache_create(struct kmem_cache *cache, unsigned int *size, slab_flags_t *flags) {} diff --git a/mm/kasan/common.c b/mm/kasan/common.c index b3d8eb154f59fe..efe58e58cc930f 100644 --- a/mm/kasan/common.c +++ b/mm/kasan/common.c @@ -97,7 +97,7 @@ slab_flags_t __kasan_never_merge(void) return 0; } -void __kasan_alloc_pages(struct page *page, unsigned int order) +void __kasan_alloc_pages(struct page *page, unsigned int order, bool init) { u8 tag; unsigned long i; @@ -108,14 +108,14 @@ void __kasan_alloc_pages(struct page *page, unsigned int order) tag = kasan_random_tag(); for (i = 0; i < (1 << order); i++) page_kasan_tag_set(page + i, tag); - kasan_unpoison(page_address(page), PAGE_SIZE << order, false); + kasan_unpoison(page_address(page), PAGE_SIZE << order, init); } -void __kasan_free_pages(struct page *page, unsigned int order) +void __kasan_free_pages(struct page *page, unsigned int order, bool init) { if (likely(!PageHighMem(page))) kasan_poison(page_address(page), PAGE_SIZE << order, - KASAN_FREE_PAGE, false); + KASAN_FREE_PAGE, init); } /* diff --git a/mm/mempool.c b/mm/mempool.c index 79959fac27d7b5..fe19d290a30152 100644 --- a/mm/mempool.c +++ b/mm/mempool.c @@ -106,7 +106,7 @@ static __always_inline void kasan_poison_element(mempool_t *pool, void *element) if (pool->alloc == mempool_alloc_slab || pool->alloc == mempool_kmalloc) kasan_slab_free_mempool(element); else if (pool->alloc == mempool_alloc_pages) - kasan_free_pages(element, (unsigned long)pool->pool_data); + kasan_free_pages(element, (unsigned long)pool->pool_data, false); } static void kasan_unpoison_element(mempool_t *pool, void *element) @@ -114,7 +114,7 @@ static void kasan_unpoison_element(mempool_t *pool, void *element) if (pool->alloc == mempool_alloc_slab || pool->alloc == mempool_kmalloc) kasan_unpoison_range(element, __ksize(element)); else if (pool->alloc == mempool_alloc_pages) - kasan_alloc_pages(element, (unsigned long)pool->pool_data); + kasan_alloc_pages(element, (unsigned long)pool->pool_data, false); } static __always_inline void add_element(mempool_t *pool, void *element) diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 342cabf912a7ea..6314de5387f516 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -397,14 +397,14 @@ static DEFINE_STATIC_KEY_TRUE(deferred_pages); * initialization is done, but this is not likely to happen. */ static inline void kasan_free_nondeferred_pages(struct page *page, int order, - fpi_t fpi_flags) + bool init, fpi_t fpi_flags) { if (static_branch_unlikely(&deferred_pages)) return; if (!IS_ENABLED(CONFIG_KASAN_GENERIC) && (fpi_flags & FPI_SKIP_KASAN_POISON)) return; - kasan_free_pages(page, order); + kasan_free_pages(page, order, init); } /* Returns true if the struct page for the pfn is uninitialised */ @@ -456,12 +456,12 @@ defer_init(int nid, unsigned long pfn, unsigned long end_pfn) } #else static inline void kasan_free_nondeferred_pages(struct page *page, int order, - fpi_t fpi_flags) + bool init, fpi_t fpi_flags) { if (!IS_ENABLED(CONFIG_KASAN_GENERIC) && (fpi_flags & FPI_SKIP_KASAN_POISON)) return; - kasan_free_pages(page, order); + kasan_free_pages(page, order, init); } static inline bool early_page_uninitialised(unsigned long pfn) @@ -1243,6 +1243,7 @@ static __always_inline bool free_pages_prepare(struct page *page, unsigned int order, bool check_free, fpi_t fpi_flags) { int bad = 0; + bool init; VM_BUG_ON_PAGE(PageTail(page), page); @@ -1300,16 +1301,21 @@ static __always_inline bool free_pages_prepare(struct page *page, debug_check_no_obj_freed(page_address(page), PAGE_SIZE << order); } - if (want_init_on_free()) - kernel_init_free_pages(page, 1 << order); kernel_poison_pages(page, 1 << order); /* + * As memory initialization might be integrated into KASAN, + * kasan_free_pages and kernel_init_free_pages must be + * kept together to avoid discrepancies in behavior. + * * With hardware tag-based KASAN, memory tags must be set before the * page becomes unavailable via debug_pagealloc or arch_free_page. */ - kasan_free_nondeferred_pages(page, order, fpi_flags); + init = want_init_on_free(); + if (init && !kasan_has_integrated_init()) + kernel_init_free_pages(page, 1 << order); + kasan_free_nondeferred_pages(page, order, init, fpi_flags); /* * arch_free_page() can make the page's contents inaccessible. s390 @@ -2316,17 +2322,32 @@ static bool check_new_pages(struct page *page, unsigned int order) inline void post_alloc_hook(struct page *page, unsigned int order, gfp_t gfp_flags) { + bool init; + set_page_private(page, 0); set_page_refcounted(page); arch_alloc_page(page, order); debug_pagealloc_map_pages(page, 1 << order); - kasan_alloc_pages(page, order); + + /* + * Page unpoisoning must happen before memory initialization. + * Otherwise, the poison pattern will be overwritten for __GFP_ZERO + * allocations and the page unpoisoning code will complain. + */ kernel_unpoison_pages(page, 1 << order); - set_page_owner(page, order, gfp_flags); - if (!want_init_on_free() && want_init_on_alloc(gfp_flags)) + /* + * As memory initialization might be integrated into KASAN, + * kasan_alloc_pages and kernel_init_free_pages must be + * kept together to avoid discrepancies in behavior. + */ + init = !want_init_on_free() && want_init_on_alloc(gfp_flags); + kasan_alloc_pages(page, order, init); + if (init && !kasan_has_integrated_init()) kernel_init_free_pages(page, 1 << order); + + set_page_owner(page, order, gfp_flags); } static void prep_new_page(struct page *page, unsigned int order, gfp_t gfp_flags, From da844b787245194cfd69f0f1d2fb1dd3640a8a6d Mon Sep 17 00:00:00 2001 From: Andrey Konovalov Date: Thu, 29 Apr 2021 23:00:06 -0700 Subject: [PATCH 132/175] kasan, mm: integrate slab init_on_alloc with HW_TAGS This change uses the previously added memory initialization feature of HW_TAGS KASAN routines for slab memory when init_on_alloc is enabled. With this change, memory initialization memset() is no longer called when both HW_TAGS KASAN and init_on_alloc are enabled. Instead, memory is initialized in KASAN runtime. The memory initialization memset() is moved into slab_post_alloc_hook() that currently directly follows the initialization loop. A new argument is added to slab_post_alloc_hook() that indicates whether to initialize the memory or not. To avoid discrepancies with which memory gets initialized that can be caused by future changes, both KASAN hook and initialization memset() are put together and a warning comment is added. Combining setting allocation tags with memory initialization improves HW_TAGS KASAN performance when init_on_alloc is enabled. Link: https://lkml.kernel.org/r/c1292aeb5d519da221ec74a0684a949b027d7720.1615296150.git.andreyknvl@google.com Signed-off-by: Andrey Konovalov Reviewed-by: Marco Elver Cc: Alexander Potapenko Cc: Andrey Ryabinin Cc: Branislav Rankov Cc: Catalin Marinas Cc: Christoph Lameter Cc: David Rientjes Cc: Dmitry Vyukov Cc: Evgenii Stepanov Cc: Joonsoo Kim Cc: Kevin Brodsky Cc: Pekka Enberg Cc: Peter Collingbourne Cc: Vincenzo Frascino Cc: Vlastimil Babka Cc: Will Deacon Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/kasan.h | 8 ++++---- mm/kasan/common.c | 4 ++-- mm/slab.c | 28 +++++++++++++--------------- mm/slab.h | 17 +++++++++++++---- mm/slub.c | 27 +++++++++++---------------- 5 files changed, 43 insertions(+), 41 deletions(-) diff --git a/include/linux/kasan.h b/include/linux/kasan.h index 30aa2bee84002b..629aee484b6c69 100644 --- a/include/linux/kasan.h +++ b/include/linux/kasan.h @@ -226,12 +226,12 @@ static __always_inline void kasan_slab_free_mempool(void *ptr) } void * __must_check __kasan_slab_alloc(struct kmem_cache *s, - void *object, gfp_t flags); + void *object, gfp_t flags, bool init); static __always_inline void * __must_check kasan_slab_alloc( - struct kmem_cache *s, void *object, gfp_t flags) + struct kmem_cache *s, void *object, gfp_t flags, bool init) { if (kasan_enabled()) - return __kasan_slab_alloc(s, object, flags); + return __kasan_slab_alloc(s, object, flags, init); return object; } @@ -320,7 +320,7 @@ static inline bool kasan_slab_free(struct kmem_cache *s, void *object) static inline void kasan_kfree_large(void *ptr) {} static inline void kasan_slab_free_mempool(void *ptr) {} static inline void *kasan_slab_alloc(struct kmem_cache *s, void *object, - gfp_t flags) + gfp_t flags, bool init) { return object; } diff --git a/mm/kasan/common.c b/mm/kasan/common.c index efe58e58cc930f..ac0d4ed9c92175 100644 --- a/mm/kasan/common.c +++ b/mm/kasan/common.c @@ -428,7 +428,7 @@ static void set_alloc_info(struct kmem_cache *cache, void *object, } void * __must_check __kasan_slab_alloc(struct kmem_cache *cache, - void *object, gfp_t flags) + void *object, gfp_t flags, bool init) { u8 tag; void *tagged_object; @@ -453,7 +453,7 @@ void * __must_check __kasan_slab_alloc(struct kmem_cache *cache, * Unpoison the whole object. * For kmalloc() allocations, kasan_kmalloc() will do precise poisoning. */ - kasan_unpoison(tagged_object, cache->object_size, false); + kasan_unpoison(tagged_object, cache->object_size, init); /* Save alloc info (if possible) for non-kmalloc() allocations. */ if (kasan_stack_collection_enabled()) diff --git a/mm/slab.c b/mm/slab.c index 4e212cda8693f7..84f183e9b31a28 100644 --- a/mm/slab.c +++ b/mm/slab.c @@ -3216,6 +3216,7 @@ slab_alloc_node(struct kmem_cache *cachep, gfp_t flags, int nodeid, size_t orig_ void *ptr; int slab_node = numa_mem_id(); struct obj_cgroup *objcg = NULL; + bool init = false; flags &= gfp_allowed_mask; cachep = slab_pre_alloc_hook(cachep, &objcg, 1, flags); @@ -3254,12 +3255,10 @@ slab_alloc_node(struct kmem_cache *cachep, gfp_t flags, int nodeid, size_t orig_ out: local_irq_restore(save_flags); ptr = cache_alloc_debugcheck_after(cachep, flags, ptr, caller); - - if (unlikely(slab_want_init_on_alloc(flags, cachep)) && ptr) - memset(ptr, 0, cachep->object_size); + init = slab_want_init_on_alloc(flags, cachep); out_hooks: - slab_post_alloc_hook(cachep, objcg, flags, 1, &ptr); + slab_post_alloc_hook(cachep, objcg, flags, 1, &ptr, init); return ptr; } @@ -3301,6 +3300,7 @@ slab_alloc(struct kmem_cache *cachep, gfp_t flags, size_t orig_size, unsigned lo unsigned long save_flags; void *objp; struct obj_cgroup *objcg = NULL; + bool init = false; flags &= gfp_allowed_mask; cachep = slab_pre_alloc_hook(cachep, &objcg, 1, flags); @@ -3317,12 +3317,10 @@ slab_alloc(struct kmem_cache *cachep, gfp_t flags, size_t orig_size, unsigned lo local_irq_restore(save_flags); objp = cache_alloc_debugcheck_after(cachep, flags, objp, caller); prefetchw(objp); - - if (unlikely(slab_want_init_on_alloc(flags, cachep)) && objp) - memset(objp, 0, cachep->object_size); + init = slab_want_init_on_alloc(flags, cachep); out: - slab_post_alloc_hook(cachep, objcg, flags, 1, &objp); + slab_post_alloc_hook(cachep, objcg, flags, 1, &objp, init); return objp; } @@ -3542,18 +3540,18 @@ int kmem_cache_alloc_bulk(struct kmem_cache *s, gfp_t flags, size_t size, cache_alloc_debugcheck_after_bulk(s, flags, size, p, _RET_IP_); - /* Clear memory outside IRQ disabled section */ - if (unlikely(slab_want_init_on_alloc(flags, s))) - for (i = 0; i < size; i++) - memset(p[i], 0, s->object_size); - - slab_post_alloc_hook(s, objcg, flags, size, p); + /* + * memcg and kmem_cache debug support and memory initialization. + * Done outside of the IRQ disabled section. + */ + slab_post_alloc_hook(s, objcg, flags, size, p, + slab_want_init_on_alloc(flags, s)); /* FIXME: Trace call missing. Christoph would like a bulk variant */ return size; error: local_irq_enable(); cache_alloc_debugcheck_after_bulk(s, flags, i, p, _RET_IP_); - slab_post_alloc_hook(s, objcg, flags, i, p); + slab_post_alloc_hook(s, objcg, flags, i, p, false); __kmem_cache_free_bulk(s, i, p); return 0; } diff --git a/mm/slab.h b/mm/slab.h index c30ed35b3d5d6f..18c1927cd196ce 100644 --- a/mm/slab.h +++ b/mm/slab.h @@ -506,15 +506,24 @@ static inline struct kmem_cache *slab_pre_alloc_hook(struct kmem_cache *s, } static inline void slab_post_alloc_hook(struct kmem_cache *s, - struct obj_cgroup *objcg, - gfp_t flags, size_t size, void **p) + struct obj_cgroup *objcg, gfp_t flags, + size_t size, void **p, bool init) { size_t i; flags &= gfp_allowed_mask; + + /* + * As memory initialization might be integrated into KASAN, + * kasan_slab_alloc and initialization memset must be + * kept together to avoid discrepancies in behavior. + * + * As p[i] might get tagged, memset and kmemleak hook come after KASAN. + */ for (i = 0; i < size; i++) { - p[i] = kasan_slab_alloc(s, p[i], flags); - /* As p[i] might get tagged, call kmemleak hook after KASAN. */ + p[i] = kasan_slab_alloc(s, p[i], flags, init); + if (p[i] && init && !kasan_has_integrated_init()) + memset(p[i], 0, s->object_size); kmemleak_alloc_recursive(p[i], s->object_size, 1, s->flags, flags); } diff --git a/mm/slub.c b/mm/slub.c index a178c738fc92a1..5cf35250f20c32 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -2823,6 +2823,7 @@ static __always_inline void *slab_alloc_node(struct kmem_cache *s, struct page *page; unsigned long tid; struct obj_cgroup *objcg = NULL; + bool init = false; s = slab_pre_alloc_hook(s, &objcg, 1, gfpflags); if (!s) @@ -2900,12 +2901,10 @@ static __always_inline void *slab_alloc_node(struct kmem_cache *s, } maybe_wipe_obj_freeptr(s, object); - - if (unlikely(slab_want_init_on_alloc(gfpflags, s)) && object) - memset(kasan_reset_tag(object), 0, s->object_size); + init = slab_want_init_on_alloc(gfpflags, s); out: - slab_post_alloc_hook(s, objcg, gfpflags, 1, &object); + slab_post_alloc_hook(s, objcg, gfpflags, 1, &object, init); return object; } @@ -3357,20 +3356,16 @@ int kmem_cache_alloc_bulk(struct kmem_cache *s, gfp_t flags, size_t size, c->tid = next_tid(c->tid); local_irq_enable(); - /* Clear memory outside IRQ disabled fastpath loop */ - if (unlikely(slab_want_init_on_alloc(flags, s))) { - int j; - - for (j = 0; j < i; j++) - memset(kasan_reset_tag(p[j]), 0, s->object_size); - } - - /* memcg and kmem_cache debug support */ - slab_post_alloc_hook(s, objcg, flags, size, p); + /* + * memcg and kmem_cache debug support and memory initialization. + * Done outside of the IRQ disabled fastpath loop. + */ + slab_post_alloc_hook(s, objcg, flags, size, p, + slab_want_init_on_alloc(flags, s)); return i; error: local_irq_enable(); - slab_post_alloc_hook(s, objcg, flags, i, p); + slab_post_alloc_hook(s, objcg, flags, i, p, false); __kmem_cache_free_bulk(s, i, p); return 0; } @@ -3580,7 +3575,7 @@ static void early_kmem_cache_node_alloc(int node) init_object(kmem_cache_node, n, SLUB_RED_ACTIVE); init_tracking(kmem_cache_node, n); #endif - n = kasan_slab_alloc(kmem_cache_node, n, GFP_KERNEL); + n = kasan_slab_alloc(kmem_cache_node, n, GFP_KERNEL, false); page->freelist = get_freepointer(kmem_cache_node, n); page->inuse = 1; page->frozen = 0; From d57a964e09c22441e9fb497d1d7a5c1983a5d1fb Mon Sep 17 00:00:00 2001 From: Andrey Konovalov Date: Thu, 29 Apr 2021 23:00:09 -0700 Subject: [PATCH 133/175] kasan, mm: integrate slab init_on_free with HW_TAGS This change uses the previously added memory initialization feature of HW_TAGS KASAN routines for slab memory when init_on_free is enabled. With this change, memory initialization memset() is no longer called when both HW_TAGS KASAN and init_on_free are enabled. Instead, memory is initialized in KASAN runtime. For SLUB, the memory initialization memset() is moved into slab_free_hook() that currently directly follows the initialization loop. A new argument is added to slab_free_hook() that indicates whether to initialize the memory or not. To avoid discrepancies with which memory gets initialized that can be caused by future changes, both KASAN hook and initialization memset() are put together and a warning comment is added. Combining setting allocation tags with memory initialization improves HW_TAGS KASAN performance when init_on_free is enabled. Link: https://lkml.kernel.org/r/190fd15c1886654afdec0d19ebebd5ade665b601.1615296150.git.andreyknvl@google.com Signed-off-by: Andrey Konovalov Reviewed-by: Marco Elver Cc: Alexander Potapenko Cc: Andrey Ryabinin Cc: Branislav Rankov Cc: Catalin Marinas Cc: Christoph Lameter Cc: David Rientjes Cc: Dmitry Vyukov Cc: Evgenii Stepanov Cc: Joonsoo Kim Cc: Kevin Brodsky Cc: Pekka Enberg Cc: Peter Collingbourne Cc: Vincenzo Frascino Cc: Vlastimil Babka Cc: Will Deacon Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/kasan.h | 10 ++++++---- mm/kasan/common.c | 13 +++++++------ mm/slab.c | 15 +++++++++++---- mm/slub.c | 43 ++++++++++++++++++++++++------------------- 4 files changed, 48 insertions(+), 33 deletions(-) diff --git a/include/linux/kasan.h b/include/linux/kasan.h index 629aee484b6c69..b1678a61e6a763 100644 --- a/include/linux/kasan.h +++ b/include/linux/kasan.h @@ -203,11 +203,13 @@ static __always_inline void * __must_check kasan_init_slab_obj( return (void *)object; } -bool __kasan_slab_free(struct kmem_cache *s, void *object, unsigned long ip); -static __always_inline bool kasan_slab_free(struct kmem_cache *s, void *object) +bool __kasan_slab_free(struct kmem_cache *s, void *object, + unsigned long ip, bool init); +static __always_inline bool kasan_slab_free(struct kmem_cache *s, + void *object, bool init) { if (kasan_enabled()) - return __kasan_slab_free(s, object, _RET_IP_); + return __kasan_slab_free(s, object, _RET_IP_, init); return false; } @@ -313,7 +315,7 @@ static inline void *kasan_init_slab_obj(struct kmem_cache *cache, { return (void *)object; } -static inline bool kasan_slab_free(struct kmem_cache *s, void *object) +static inline bool kasan_slab_free(struct kmem_cache *s, void *object, bool init) { return false; } diff --git a/mm/kasan/common.c b/mm/kasan/common.c index ac0d4ed9c92175..6bb87f2acd4eb5 100644 --- a/mm/kasan/common.c +++ b/mm/kasan/common.c @@ -322,8 +322,8 @@ void * __must_check __kasan_init_slab_obj(struct kmem_cache *cache, return (void *)object; } -static inline bool ____kasan_slab_free(struct kmem_cache *cache, - void *object, unsigned long ip, bool quarantine) +static inline bool ____kasan_slab_free(struct kmem_cache *cache, void *object, + unsigned long ip, bool quarantine, bool init) { u8 tag; void *tagged_object; @@ -351,7 +351,7 @@ static inline bool ____kasan_slab_free(struct kmem_cache *cache, } kasan_poison(object, round_up(cache->object_size, KASAN_GRANULE_SIZE), - KASAN_KMALLOC_FREE, false); + KASAN_KMALLOC_FREE, init); if ((IS_ENABLED(CONFIG_KASAN_GENERIC) && !quarantine)) return false; @@ -362,9 +362,10 @@ static inline bool ____kasan_slab_free(struct kmem_cache *cache, return kasan_quarantine_put(cache, object); } -bool __kasan_slab_free(struct kmem_cache *cache, void *object, unsigned long ip) +bool __kasan_slab_free(struct kmem_cache *cache, void *object, + unsigned long ip, bool init) { - return ____kasan_slab_free(cache, object, ip, true); + return ____kasan_slab_free(cache, object, ip, true, init); } static inline bool ____kasan_kfree_large(void *ptr, unsigned long ip) @@ -409,7 +410,7 @@ void __kasan_slab_free_mempool(void *ptr, unsigned long ip) return; kasan_poison(ptr, page_size(page), KASAN_FREE_PAGE, false); } else { - ____kasan_slab_free(page->slab_cache, ptr, ip, false); + ____kasan_slab_free(page->slab_cache, ptr, ip, false, false); } } diff --git a/mm/slab.c b/mm/slab.c index 84f183e9b31a28..df45c437b394b3 100644 --- a/mm/slab.c +++ b/mm/slab.c @@ -3425,17 +3425,24 @@ static void cache_flusharray(struct kmem_cache *cachep, struct array_cache *ac) static __always_inline void __cache_free(struct kmem_cache *cachep, void *objp, unsigned long caller) { + bool init; + if (is_kfence_address(objp)) { kmemleak_free_recursive(objp, cachep->flags); __kfence_free(objp); return; } - if (unlikely(slab_want_init_on_free(cachep))) + /* + * As memory initialization might be integrated into KASAN, + * kasan_slab_free and initialization memset must be + * kept together to avoid discrepancies in behavior. + */ + init = slab_want_init_on_free(cachep); + if (init && !kasan_has_integrated_init()) memset(objp, 0, cachep->object_size); - - /* Put the object into the quarantine, don't touch it for now. */ - if (kasan_slab_free(cachep, objp)) + /* KASAN might put objp into memory quarantine, delaying its reuse. */ + if (kasan_slab_free(cachep, objp, init)) return; /* Use KCSAN to help debug racy use-after-free. */ diff --git a/mm/slub.c b/mm/slub.c index 5cf35250f20c32..68123b21e65fb4 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -1533,7 +1533,8 @@ static __always_inline void kfree_hook(void *x) kasan_kfree_large(x); } -static __always_inline bool slab_free_hook(struct kmem_cache *s, void *x) +static __always_inline bool slab_free_hook(struct kmem_cache *s, + void *x, bool init) { kmemleak_free_recursive(x, s->flags); @@ -1559,8 +1560,25 @@ static __always_inline bool slab_free_hook(struct kmem_cache *s, void *x) __kcsan_check_access(x, s->object_size, KCSAN_ACCESS_WRITE | KCSAN_ACCESS_ASSERT); - /* KASAN might put x into memory quarantine, delaying its reuse */ - return kasan_slab_free(s, x); + /* + * As memory initialization might be integrated into KASAN, + * kasan_slab_free and initialization memset's must be + * kept together to avoid discrepancies in behavior. + * + * The initialization memset's clear the object and the metadata, + * but don't touch the SLAB redzone. + */ + if (init) { + int rsize; + + if (!kasan_has_integrated_init()) + memset(kasan_reset_tag(x), 0, s->object_size); + rsize = (s->flags & SLAB_RED_ZONE) ? s->red_left_pad : 0; + memset((char *)kasan_reset_tag(x) + s->inuse, 0, + s->size - s->inuse - rsize); + } + /* KASAN might put x into memory quarantine, delaying its reuse. */ + return kasan_slab_free(s, x, init); } static inline bool slab_free_freelist_hook(struct kmem_cache *s, @@ -1570,10 +1588,9 @@ static inline bool slab_free_freelist_hook(struct kmem_cache *s, void *object; void *next = *head; void *old_tail = *tail ? *tail : *head; - int rsize; if (is_kfence_address(next)) { - slab_free_hook(s, next); + slab_free_hook(s, next, false); return true; } @@ -1585,20 +1602,8 @@ static inline bool slab_free_freelist_hook(struct kmem_cache *s, object = next; next = get_freepointer(s, object); - if (slab_want_init_on_free(s)) { - /* - * Clear the object and the metadata, but don't touch - * the redzone. - */ - memset(kasan_reset_tag(object), 0, s->object_size); - rsize = (s->flags & SLAB_RED_ZONE) ? s->red_left_pad - : 0; - memset((char *)kasan_reset_tag(object) + s->inuse, 0, - s->size - s->inuse - rsize); - - } /* If object's reuse doesn't have to be delayed */ - if (!slab_free_hook(s, object)) { + if (!slab_free_hook(s, object, slab_want_init_on_free(s))) { /* Move object to the new freelist */ set_freepointer(s, object, *head); *head = object; @@ -3236,7 +3241,7 @@ int build_detached_freelist(struct kmem_cache *s, size_t size, } if (is_kfence_address(object)) { - slab_free_hook(df->s, object); + slab_free_hook(df->s, object, false); __kfence_free(object); p[size] = NULL; /* mark object processed */ return size; From 96d7d1415ae8beb3f6ec62107a97ae73db611213 Mon Sep 17 00:00:00 2001 From: Andrey Konovalov Date: Thu, 29 Apr 2021 23:00:12 -0700 Subject: [PATCH 134/175] kasan: docs: clean up sections Update KASAN documentation: - Give some sections clearer names. - Remove unneeded subsections in the "Tests" section. - Move the "For developers" section and split into subsections. Link: https://lkml.kernel.org/r/c2bbb56eaea80ad484f0ee85bb71959a3a63f1d7.1615559068.git.andreyknvl@google.com Signed-off-by: Andrey Konovalov Reviewed-by: Marco Elver Cc: Alexander Potapenko Cc: Andrey Ryabinin Cc: Dmitry Vyukov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- Documentation/dev-tools/kasan.rst | 54 +++++++++++++++---------------- 1 file changed, 27 insertions(+), 27 deletions(-) diff --git a/Documentation/dev-tools/kasan.rst b/Documentation/dev-tools/kasan.rst index 6f6ab3ed7b793a..2e8dfb47eb49aa 100644 --- a/Documentation/dev-tools/kasan.rst +++ b/Documentation/dev-tools/kasan.rst @@ -177,24 +177,6 @@ particular KASAN features. report or also panic the kernel (default: ``report``). Note, that tag checking gets disabled after the first reported bug. -For developers -~~~~~~~~~~~~~~ - -Software KASAN modes use compiler instrumentation to insert validity checks. -Such instrumentation might be incompatible with some part of the kernel, and -therefore needs to be disabled. To disable instrumentation for specific files -or directories, add a line similar to the following to the respective kernel -Makefile: - -- For a single file (e.g. main.o):: - - KASAN_SANITIZE_main.o := n - -- For all files in one directory:: - - KASAN_SANITIZE := n - - Implementation details ---------------------- @@ -308,8 +290,8 @@ support MTE (but supports TBI). Hardware tag-based KASAN only reports the first found bug. After that MTE tag checking gets disabled. -What memory accesses are sanitised by KASAN? --------------------------------------------- +Shadow memory +------------- The kernel maps memory in a number of different parts of the address space. This poses something of a problem for KASAN, which requires @@ -320,8 +302,8 @@ The range of kernel virtual addresses is large: there is not enough real memory to support a real shadow region for every address that could be accessed by the kernel. -By default -~~~~~~~~~~ +Default behaviour +~~~~~~~~~~~~~~~~~ By default, architectures only map real memory over the shadow region for the linear mapping (and potentially other small areas). For all @@ -371,8 +353,29 @@ unmapped. This will require changes in arch-specific code. This allows ``VMAP_STACK`` support on x86, and can simplify support of architectures that do not have a fixed module region. -CONFIG_KASAN_KUNIT_TEST and CONFIG_KASAN_MODULE_TEST ----------------------------------------------------- +For developers +-------------- + +Ignoring accesses +~~~~~~~~~~~~~~~~~ + +Software KASAN modes use compiler instrumentation to insert validity checks. +Such instrumentation might be incompatible with some part of the kernel, and +therefore needs to be disabled. To disable instrumentation for specific files +or directories, add a line similar to the following to the respective kernel +Makefile: + +- For a single file (e.g. main.o):: + + KASAN_SANITIZE_main.o := n + +- For all files in one directory:: + + KASAN_SANITIZE := n + + +Tests +~~~~~ KASAN tests consist of two parts: @@ -418,21 +421,18 @@ Or, if one of the tests failed:: There are a few ways to run KUnit-compatible KASAN tests. 1. Loadable module -~~~~~~~~~~~~~~~~~~ With ``CONFIG_KUNIT`` enabled, ``CONFIG_KASAN_KUNIT_TEST`` can be built as a loadable module and run on any architecture that supports KASAN by loading the module with insmod or modprobe. The module is called ``test_kasan``. 2. Built-In -~~~~~~~~~~~ With ``CONFIG_KUNIT`` built-in, ``CONFIG_KASAN_KUNIT_TEST`` can be built-in on any architecure that supports KASAN. These and any other KUnit tests enabled will run and print the results at boot as a late-init call. 3. Using kunit_tool -~~~~~~~~~~~~~~~~~~~ With ``CONFIG_KUNIT`` and ``CONFIG_KASAN_KUNIT_TEST`` built-in, it's also possible use ``kunit_tool`` to see the results of these and other KUnit tests From 3cbc37dcdca273485f8ef909fab2c41e8fb5d3b9 Mon Sep 17 00:00:00 2001 From: Andrey Konovalov Date: Thu, 29 Apr 2021 23:00:15 -0700 Subject: [PATCH 135/175] kasan: docs: update overview section Update the "Overview" section in KASAN documentation: - Outline main use cases for each mode. - Mention that HW_TAGS mode need compiler support too. - Move the part about SLUB/SLAB support from "Usage" to "Overview". - Punctuation, readability, and other minor clean-ups. Link: https://lkml.kernel.org/r/1486fba8514de3d7db2f47df2192db59228b0a7b.1615559068.git.andreyknvl@google.com Signed-off-by: Andrey Konovalov Reviewed-by: Marco Elver Cc: Alexander Potapenko Cc: Andrey Ryabinin Cc: Dmitry Vyukov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- Documentation/dev-tools/kasan.rst | 27 +++++++++++++++++++-------- 1 file changed, 19 insertions(+), 8 deletions(-) diff --git a/Documentation/dev-tools/kasan.rst b/Documentation/dev-tools/kasan.rst index 2e8dfb47eb49aa..703597a5c77083 100644 --- a/Documentation/dev-tools/kasan.rst +++ b/Documentation/dev-tools/kasan.rst @@ -11,17 +11,31 @@ designed to find out-of-bound and use-after-free bugs. KASAN has three modes: 2. software tag-based KASAN (similar to userspace HWASan), 3. hardware tag-based KASAN (based on hardware memory tagging). -Software KASAN modes (1 and 2) use compile-time instrumentation to insert -validity checks before every memory access, and therefore require a compiler +Generic KASAN is mainly used for debugging due to a large memory overhead. +Software tag-based KASAN can be used for dogfood testing as it has a lower +memory overhead that allows using it with real workloads. Hardware tag-based +KASAN comes with low memory and performance overheads and, therefore, can be +used in production. Either as an in-field memory bug detector or as a security +mitigation. + +Software KASAN modes (#1 and #2) use compile-time instrumentation to insert +validity checks before every memory access and, therefore, require a compiler version that supports that. -Generic KASAN is supported in both GCC and Clang. With GCC it requires version +Generic KASAN is supported in GCC and Clang. With GCC, it requires version 8.3.0 or later. Any supported Clang version is compatible, but detection of out-of-bounds accesses for global variables is only supported since Clang 11. -Tag-based KASAN is only supported in Clang. +Software tag-based KASAN mode is only supported in Clang. -Currently generic KASAN is supported for the x86_64, arm, arm64, xtensa, s390 +The hardware KASAN mode (#3) relies on hardware to perform the checks but +still requires a compiler version that supports memory tagging instructions. +This mode is supported in GCC 10+ and Clang 11+. + +Both software KASAN modes work with SLUB and SLAB memory allocators, +while the hardware tag-based KASAN currently only supports SLUB. + +Currently, generic KASAN is supported for the x86_64, arm, arm64, xtensa, s390, and riscv architectures, and tag-based KASAN modes are supported only for arm64. Usage @@ -39,9 +53,6 @@ For software modes, you also need to choose between CONFIG_KASAN_OUTLINE and CONFIG_KASAN_INLINE. Outline and inline are compiler instrumentation types. The former produces smaller binary while the latter is 1.1 - 2 times faster. -Both software KASAN modes work with both SLUB and SLAB memory allocators, -while the hardware tag-based KASAN currently only support SLUB. - For better error reports that include stack traces, enable CONFIG_STACKTRACE. To augment reports with last allocation and freeing stack of the physical page, From 86e6f08dd28d6723a19b8a072b6db45cf6a9e4d3 Mon Sep 17 00:00:00 2001 From: Andrey Konovalov Date: Thu, 29 Apr 2021 23:00:18 -0700 Subject: [PATCH 136/175] kasan: docs: update usage section Update the "Usage" section in KASAN documentation: - Add inline code snippet markers. - Reword the part about stack traces for clarity. - Other minor clean-ups. Link: https://lkml.kernel.org/r/48427809cd4b8b5d6bc00926cbe87e2b5081df17.1615559068.git.andreyknvl@google.com Signed-off-by: Andrey Konovalov Reviewed-by: Marco Elver Cc: Alexander Potapenko Cc: Andrey Ryabinin Cc: Dmitry Vyukov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- Documentation/dev-tools/kasan.rst | 23 +++++++++++------------ 1 file changed, 11 insertions(+), 12 deletions(-) diff --git a/Documentation/dev-tools/kasan.rst b/Documentation/dev-tools/kasan.rst index 703597a5c77083..092773adbaf197 100644 --- a/Documentation/dev-tools/kasan.rst +++ b/Documentation/dev-tools/kasan.rst @@ -41,22 +41,21 @@ and riscv architectures, and tag-based KASAN modes are supported only for arm64. Usage ----- -To enable KASAN configure kernel with:: +To enable KASAN, configure the kernel with:: - CONFIG_KASAN = y + CONFIG_KASAN=y -and choose between CONFIG_KASAN_GENERIC (to enable generic KASAN), -CONFIG_KASAN_SW_TAGS (to enable software tag-based KASAN), and -CONFIG_KASAN_HW_TAGS (to enable hardware tag-based KASAN). +and choose between ``CONFIG_KASAN_GENERIC`` (to enable generic KASAN), +``CONFIG_KASAN_SW_TAGS`` (to enable software tag-based KASAN), and +``CONFIG_KASAN_HW_TAGS`` (to enable hardware tag-based KASAN). -For software modes, you also need to choose between CONFIG_KASAN_OUTLINE and -CONFIG_KASAN_INLINE. Outline and inline are compiler instrumentation types. -The former produces smaller binary while the latter is 1.1 - 2 times faster. +For software modes, also choose between ``CONFIG_KASAN_OUTLINE`` and +``CONFIG_KASAN_INLINE``. Outline and inline are compiler instrumentation types. +The former produces a smaller binary while the latter is 1.1-2 times faster. -For better error reports that include stack traces, enable CONFIG_STACKTRACE. - -To augment reports with last allocation and freeing stack of the physical page, -it is recommended to enable also CONFIG_PAGE_OWNER and boot with page_owner=on. +To include alloc and free stack traces of affected slab objects into reports, +enable ``CONFIG_STACKTRACE``. To include alloc and free stack traces of affected +physical pages, enable ``CONFIG_PAGE_OWNER`` and boot with ``page_owner=on``. Error reports ~~~~~~~~~~~~~ From 836f79a2660533c8302f1154168018d9d76458af Mon Sep 17 00:00:00 2001 From: Andrey Konovalov Date: Thu, 29 Apr 2021 23:00:21 -0700 Subject: [PATCH 137/175] kasan: docs: update error reports section Update the "Error reports" section in KASAN documentation: - Mention that bug titles are best-effort. - Move and reword the part about auxiliary stacks from "Implementation details". - Punctuation, readability, and other minor clean-ups. Link: https://lkml.kernel.org/r/3531e8fe6972cf39d1954e3643237b19eb21227e.1615559068.git.andreyknvl@google.com Signed-off-by: Andrey Konovalov Reviewed-by: Marco Elver Cc: Alexander Potapenko Cc: Andrey Ryabinin Cc: Dmitry Vyukov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- Documentation/dev-tools/kasan.rst | 46 +++++++++++++++++-------------- 1 file changed, 26 insertions(+), 20 deletions(-) diff --git a/Documentation/dev-tools/kasan.rst b/Documentation/dev-tools/kasan.rst index 092773adbaf197..3020ee3d7a6391 100644 --- a/Documentation/dev-tools/kasan.rst +++ b/Documentation/dev-tools/kasan.rst @@ -60,7 +60,7 @@ physical pages, enable ``CONFIG_PAGE_OWNER`` and boot with ``page_owner=on``. Error reports ~~~~~~~~~~~~~ -A typical out-of-bounds access generic KASAN report looks like this:: +A typical KASAN report looks like this:: ================================================================== BUG: KASAN: slab-out-of-bounds in kmalloc_oob_right+0xa8/0xbc [test_kasan] @@ -133,33 +133,43 @@ A typical out-of-bounds access generic KASAN report looks like this:: ffff8801f44ec400: fb fb fb fb fb fb fb fb fc fc fc fc fc fc fc fc ================================================================== -The header of the report provides a short summary of what kind of bug happened -and what kind of access caused it. It's followed by a stack trace of the bad -access, a stack trace of where the accessed memory was allocated (in case bad -access happens on a slab object), and a stack trace of where the object was -freed (in case of a use-after-free bug report). Next comes a description of -the accessed slab object and information about the accessed memory page. +The report header summarizes what kind of bug happened and what kind of access +caused it. It is followed by a stack trace of the bad access, a stack trace of +where the accessed memory was allocated (in case a slab object was accessed), +and a stack trace of where the object was freed (in case of a use-after-free +bug report). Next comes a description of the accessed slab object and the +information about the accessed memory page. -In the last section the report shows memory state around the accessed address. -Internally KASAN tracks memory state separately for each memory granule, which +In the end, the report shows the memory state around the accessed address. +Internally, KASAN tracks memory state separately for each memory granule, which is either 8 or 16 aligned bytes depending on KASAN mode. Each number in the memory state section of the report shows the state of one of the memory granules that surround the accessed address. -For generic KASAN the size of each memory granule is 8. The state of each +For generic KASAN, the size of each memory granule is 8. The state of each granule is encoded in one shadow byte. Those 8 bytes can be accessible, -partially accessible, freed or be a part of a redzone. KASAN uses the following -encoding for each shadow byte: 0 means that all 8 bytes of the corresponding +partially accessible, freed, or be a part of a redzone. KASAN uses the following +encoding for each shadow byte: 00 means that all 8 bytes of the corresponding memory region are accessible; number N (1 <= N <= 7) means that the first N bytes are accessible, and other (8 - N) bytes are not; any negative value indicates that the entire 8-byte word is inaccessible. KASAN uses different negative values to distinguish between different kinds of inaccessible memory like redzones or freed memory (see mm/kasan/kasan.h). -In the report above the arrows point to the shadow byte 03, which means that -the accessed address is partially accessible. For tag-based KASAN modes this -last report section shows the memory tags around the accessed address -(see the `Implementation details`_ section). +In the report above, the arrow points to the shadow byte ``03``, which means +that the accessed address is partially accessible. + +For tag-based KASAN modes, this last report section shows the memory tags around +the accessed address (see the `Implementation details`_ section). + +Note that KASAN bug titles (like ``slab-out-of-bounds`` or ``use-after-free``) +are best-effort: KASAN prints the most probable bug type based on the limited +information it has. The actual type of the bug might be different. + +Generic KASAN also reports up to two auxiliary call stack traces. These stack +traces point to places in code that interacted with the object but that are not +directly present in the bad access stack trace. Currently, this includes +call_rcu() and workqueue queuing. Boot parameters ~~~~~~~~~~~~~~~ @@ -223,10 +233,6 @@ function calls GCC directly inserts the code to check the shadow memory. This option significantly enlarges kernel but it gives x1.1-x2 performance boost over outline instrumented kernel. -Generic KASAN also reports the last 2 call stacks to creation of work that -potentially has access to an object. Call stacks for the following are shown: -call_rcu() and workqueue queuing. - Generic KASAN is the only mode that delays the reuse of freed object via quarantine (see mm/kasan/quarantine.c for implementation). From f359074768bf406b64d62560e88ff9820b600220 Mon Sep 17 00:00:00 2001 From: Andrey Konovalov Date: Thu, 29 Apr 2021 23:00:24 -0700 Subject: [PATCH 138/175] kasan: docs: update boot parameters section Update the "Boot parameters" section in KASAN documentation: - Mention panic_on_warn. - Mention kasan_multi_shot and its interaction with panic_on_warn. - Clarify kasan.fault=panic interaction with panic_on_warn. - A readability clean-up. Link: https://lkml.kernel.org/r/01364952f15789948f0627d6733b5cdf5209f83a.1615559068.git.andreyknvl@google.com Signed-off-by: Andrey Konovalov Reviewed-by: Marco Elver Cc: Alexander Potapenko Cc: Andrey Ryabinin Cc: Dmitry Vyukov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- Documentation/dev-tools/kasan.rst | 14 ++++++++++---- 1 file changed, 10 insertions(+), 4 deletions(-) diff --git a/Documentation/dev-tools/kasan.rst b/Documentation/dev-tools/kasan.rst index 3020ee3d7a6391..37af493c7f8e40 100644 --- a/Documentation/dev-tools/kasan.rst +++ b/Documentation/dev-tools/kasan.rst @@ -174,10 +174,16 @@ call_rcu() and workqueue queuing. Boot parameters ~~~~~~~~~~~~~~~ +KASAN is affected by the generic ``panic_on_warn`` command line parameter. +When it is enabled, KASAN panics the kernel after printing a bug report. + +By default, KASAN prints a bug report only for the first invalid memory access. +With ``kasan_multi_shot``, KASAN prints a report on every invalid access. This +effectively disables ``panic_on_warn`` for KASAN reports. + Hardware tag-based KASAN mode (see the section about various modes below) is intended for use in production as a security mitigation. Therefore, it supports -boot parameters that allow to disable KASAN competely or otherwise control -particular KASAN features. +boot parameters that allow disabling KASAN or controlling its features. - ``kasan=off`` or ``=on`` controls whether KASAN is enabled (default: ``on``). @@ -194,8 +200,8 @@ particular KASAN features. traces collection (default: ``on``). - ``kasan.fault=report`` or ``=panic`` controls whether to only print a KASAN - report or also panic the kernel (default: ``report``). Note, that tag - checking gets disabled after the first reported bug. + report or also panic the kernel (default: ``report``). The panic happens even + if ``kasan_multi_shot`` is enabled. Implementation details ---------------------- From b8191d7d57e86eda934ef82081c294e6a184b000 Mon Sep 17 00:00:00 2001 From: Andrey Konovalov Date: Thu, 29 Apr 2021 23:00:27 -0700 Subject: [PATCH 139/175] kasan: docs: update GENERIC implementation details section Update the "Implementation details" section for generic KASAN: - Don't mention kmemcheck, it's not present in the kernel anymore. - Don't mention GCC as the only supported compiler. - Update kasan_mem_to_shadow() definition to match actual code. - Punctuation, readability, and other minor clean-ups. Link: https://lkml.kernel.org/r/f2f35fdab701f8c709f63d328f98aec2982c8acc.1615559068.git.andreyknvl@google.com Signed-off-by: Andrey Konovalov Reviewed-by: Marco Elver Cc: Alexander Potapenko Cc: Andrey Ryabinin Cc: Dmitry Vyukov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- Documentation/dev-tools/kasan.rst | 27 +++++++++++++-------------- 1 file changed, 13 insertions(+), 14 deletions(-) diff --git a/Documentation/dev-tools/kasan.rst b/Documentation/dev-tools/kasan.rst index 37af493c7f8e40..027878f9229153 100644 --- a/Documentation/dev-tools/kasan.rst +++ b/Documentation/dev-tools/kasan.rst @@ -209,12 +209,11 @@ Implementation details Generic KASAN ~~~~~~~~~~~~~ -From a high level perspective, KASAN's approach to memory error detection is -similar to that of kmemcheck: use shadow memory to record whether each byte of -memory is safe to access, and use compile-time instrumentation to insert checks -of shadow memory on each memory access. +Software KASAN modes use shadow memory to record whether each byte of memory is +safe to access and use compile-time instrumentation to insert shadow memory +checks before each memory access. -Generic KASAN dedicates 1/8th of kernel memory to its shadow memory (e.g. 16TB +Generic KASAN dedicates 1/8th of kernel memory to its shadow memory (16TB to cover 128TB on x86_64) and uses direct mapping with a scale and offset to translate a memory address to its corresponding shadow address. @@ -223,23 +222,23 @@ address:: static inline void *kasan_mem_to_shadow(const void *addr) { - return ((unsigned long)addr >> KASAN_SHADOW_SCALE_SHIFT) + return (void *)((unsigned long)addr >> KASAN_SHADOW_SCALE_SHIFT) + KASAN_SHADOW_OFFSET; } where ``KASAN_SHADOW_SCALE_SHIFT = 3``. Compile-time instrumentation is used to insert memory access checks. Compiler -inserts function calls (__asan_load*(addr), __asan_store*(addr)) before each -memory access of size 1, 2, 4, 8 or 16. These functions check whether memory -access is valid or not by checking corresponding shadow memory. +inserts function calls (``__asan_load*(addr)``, ``__asan_store*(addr)``) before +each memory access of size 1, 2, 4, 8, or 16. These functions check whether +memory accesses are valid or not by checking corresponding shadow memory. -GCC 5.0 has possibility to perform inline instrumentation. Instead of making -function calls GCC directly inserts the code to check the shadow memory. -This option significantly enlarges kernel but it gives x1.1-x2 performance -boost over outline instrumented kernel. +With inline instrumentation, instead of making function calls, the compiler +directly inserts the code to check shadow memory. This option significantly +enlarges the kernel, but it gives an x1.1-x2 performance boost over the +outline-instrumented kernel. -Generic KASAN is the only mode that delays the reuse of freed object via +Generic KASAN is the only mode that delays the reuse of freed objects via quarantine (see mm/kasan/quarantine.c for implementation). Software tag-based KASAN From a6c18d4e763873e900b8932211a3f66589f943a2 Mon Sep 17 00:00:00 2001 From: Andrey Konovalov Date: Thu, 29 Apr 2021 23:00:30 -0700 Subject: [PATCH 140/175] kasan: docs: update SW_TAGS implementation details section Update the "Implementation details" section for SW_TAGS KASAN: - Clarify the introduction sentence. - Punctuation, readability, and other minor clean-ups. Link: https://lkml.kernel.org/r/69b9b2e49d8cf789358fa24558be3fc0ce4ee32c.1615559068.git.andreyknvl@google.com Signed-off-by: Andrey Konovalov Reviewed-by: Marco Elver Cc: Alexander Potapenko Cc: Andrey Ryabinin Cc: Dmitry Vyukov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- Documentation/dev-tools/kasan.rst | 39 +++++++++++++++---------------- 1 file changed, 19 insertions(+), 20 deletions(-) diff --git a/Documentation/dev-tools/kasan.rst b/Documentation/dev-tools/kasan.rst index 027878f9229153..69f384b0c2d3b6 100644 --- a/Documentation/dev-tools/kasan.rst +++ b/Documentation/dev-tools/kasan.rst @@ -244,38 +244,37 @@ quarantine (see mm/kasan/quarantine.c for implementation). Software tag-based KASAN ~~~~~~~~~~~~~~~~~~~~~~~~ -Software tag-based KASAN requires software memory tagging support in the form -of HWASan-like compiler instrumentation (see HWASan documentation for details). - -Software tag-based KASAN is currently only implemented for arm64 architecture. +Software tag-based KASAN uses a software memory tagging approach to checking +access validity. It is currently only implemented for the arm64 architecture. Software tag-based KASAN uses the Top Byte Ignore (TBI) feature of arm64 CPUs -to store a pointer tag in the top byte of kernel pointers. Like generic KASAN -it uses shadow memory to store memory tags associated with each 16-byte memory -cell (therefore it dedicates 1/16th of the kernel memory for shadow memory). +to store a pointer tag in the top byte of kernel pointers. It uses shadow memory +to store memory tags associated with each 16-byte memory cell (therefore, it +dedicates 1/16th of the kernel memory for shadow memory). -On each memory allocation software tag-based KASAN generates a random tag, tags -the allocated memory with this tag, and embeds this tag into the returned +On each memory allocation, software tag-based KASAN generates a random tag, tags +the allocated memory with this tag, and embeds the same tag into the returned pointer. Software tag-based KASAN uses compile-time instrumentation to insert checks -before each memory access. These checks make sure that tag of the memory that -is being accessed is equal to tag of the pointer that is used to access this -memory. In case of a tag mismatch software tag-based KASAN prints a bug report. +before each memory access. These checks make sure that the tag of the memory +that is being accessed is equal to the tag of the pointer that is used to access +this memory. In case of a tag mismatch, software tag-based KASAN prints a bug +report. -Software tag-based KASAN also has two instrumentation modes (outline, that -emits callbacks to check memory accesses; and inline, that performs the shadow +Software tag-based KASAN also has two instrumentation modes (outline, which +emits callbacks to check memory accesses; and inline, which performs the shadow memory checks inline). With outline instrumentation mode, a bug report is -simply printed from the function that performs the access check. With inline -instrumentation a brk instruction is emitted by the compiler, and a dedicated -brk handler is used to print bug reports. +printed from the function that performs the access check. With inline +instrumentation, a ``brk`` instruction is emitted by the compiler, and a +dedicated ``brk`` handler is used to print bug reports. Software tag-based KASAN uses 0xFF as a match-all pointer tag (accesses through -pointers with 0xFF pointer tag aren't checked). The value 0xFE is currently +pointers with the 0xFF pointer tag are not checked). The value 0xFE is currently reserved to tag freed memory regions. -Software tag-based KASAN currently only supports tagging of -kmem_cache_alloc/kmalloc and page_alloc memory. +Software tag-based KASAN currently only supports tagging of slab and page_alloc +memory. Hardware tag-based KASAN ~~~~~~~~~~~~~~~~~~~~~~~~ From bb48675e5aa4f48f5767fb915c73f44f86a81e98 Mon Sep 17 00:00:00 2001 From: Andrey Konovalov Date: Thu, 29 Apr 2021 23:00:33 -0700 Subject: [PATCH 141/175] kasan: docs: update HW_TAGS implementation details section Update the "Implementation details" section for HW_TAGS KASAN: - Punctuation, readability, and other minor clean-ups. Link: https://lkml.kernel.org/r/ee2caf4c138cc1fd239822c2abefd5af6c057744.1615559068.git.andreyknvl@google.com Signed-off-by: Andrey Konovalov Reviewed-by: Marco Elver Cc: Alexander Potapenko Cc: Andrey Ryabinin Cc: Dmitry Vyukov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- Documentation/dev-tools/kasan.rst | 26 +++++++++++++------------- 1 file changed, 13 insertions(+), 13 deletions(-) diff --git a/Documentation/dev-tools/kasan.rst b/Documentation/dev-tools/kasan.rst index 69f384b0c2d3b6..f0dd0eaa9cde54 100644 --- a/Documentation/dev-tools/kasan.rst +++ b/Documentation/dev-tools/kasan.rst @@ -279,35 +279,35 @@ memory. Hardware tag-based KASAN ~~~~~~~~~~~~~~~~~~~~~~~~ -Hardware tag-based KASAN is similar to the software mode in concept, but uses +Hardware tag-based KASAN is similar to the software mode in concept but uses hardware memory tagging support instead of compiler instrumentation and shadow memory. Hardware tag-based KASAN is currently only implemented for arm64 architecture and based on both arm64 Memory Tagging Extension (MTE) introduced in ARMv8.5 -Instruction Set Architecture, and Top Byte Ignore (TBI). +Instruction Set Architecture and Top Byte Ignore (TBI). Special arm64 instructions are used to assign memory tags for each allocation. Same tags are assigned to pointers to those allocations. On every memory -access, hardware makes sure that tag of the memory that is being accessed is -equal to tag of the pointer that is used to access this memory. In case of a -tag mismatch a fault is generated and a report is printed. +access, hardware makes sure that the tag of the memory that is being accessed is +equal to the tag of the pointer that is used to access this memory. In case of a +tag mismatch, a fault is generated, and a report is printed. Hardware tag-based KASAN uses 0xFF as a match-all pointer tag (accesses through -pointers with 0xFF pointer tag aren't checked). The value 0xFE is currently +pointers with the 0xFF pointer tag are not checked). The value 0xFE is currently reserved to tag freed memory regions. -Hardware tag-based KASAN currently only supports tagging of -kmem_cache_alloc/kmalloc and page_alloc memory. +Hardware tag-based KASAN currently only supports tagging of slab and page_alloc +memory. -If the hardware doesn't support MTE (pre ARMv8.5), hardware tag-based KASAN -won't be enabled. In this case all boot parameters are ignored. +If the hardware does not support MTE (pre ARMv8.5), hardware tag-based KASAN +will not be enabled. In this case, all KASAN boot parameters are ignored. -Note, that enabling CONFIG_KASAN_HW_TAGS always results in in-kernel TBI being -enabled. Even when kasan.mode=off is provided, or when the hardware doesn't +Note that enabling CONFIG_KASAN_HW_TAGS always results in in-kernel TBI being +enabled. Even when ``kasan.mode=off`` is provided or when the hardware does not support MTE (but supports TBI). -Hardware tag-based KASAN only reports the first found bug. After that MTE tag +Hardware tag-based KASAN only reports the first found bug. After that, MTE tag checking gets disabled. Shadow memory From 67ca1c0b74463a7b961bb34c213b37be0deb0ab6 Mon Sep 17 00:00:00 2001 From: Andrey Konovalov Date: Thu, 29 Apr 2021 23:00:36 -0700 Subject: [PATCH 142/175] kasan: docs: update shadow memory section Update the "Shadow memory" section in KASAN documentation: - Rearrange the introduction paragraph do it doesn't give a "KASAN has an issue" impression. - Update the list of architectures with vmalloc support. - Punctuation, readability, and other minor clean-ups. Link: https://lkml.kernel.org/r/00f8c38b0fd5290a3f4dced04eaba41383e67e14.1615559068.git.andreyknvl@google.com Signed-off-by: Andrey Konovalov Reviewed-by: Marco Elver Cc: Alexander Potapenko Cc: Andrey Ryabinin Cc: Dmitry Vyukov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- Documentation/dev-tools/kasan.rst | 31 ++++++++++++++----------------- 1 file changed, 14 insertions(+), 17 deletions(-) diff --git a/Documentation/dev-tools/kasan.rst b/Documentation/dev-tools/kasan.rst index f0dd0eaa9cde54..894fc8eb374d16 100644 --- a/Documentation/dev-tools/kasan.rst +++ b/Documentation/dev-tools/kasan.rst @@ -313,14 +313,11 @@ checking gets disabled. Shadow memory ------------- -The kernel maps memory in a number of different parts of the address -space. This poses something of a problem for KASAN, which requires -that all addresses accessed by instrumented code have a valid shadow -region. - -The range of kernel virtual addresses is large: there is not enough -real memory to support a real shadow region for every address that -could be accessed by the kernel. +The kernel maps memory in several different parts of the address space. +The range of kernel virtual addresses is large: there is not enough real +memory to support a real shadow region for every address that could be +accessed by the kernel. Therefore, KASAN only maps real shadow for certain +parts of the address space. Default behaviour ~~~~~~~~~~~~~~~~~ @@ -332,10 +329,9 @@ page is mapped over the shadow area. This read-only shadow page declares all memory accesses as permitted. This presents a problem for modules: they do not live in the linear -mapping, but in a dedicated module space. By hooking in to the module -allocator, KASAN can temporarily map real shadow memory to cover -them. This allows detection of invalid accesses to module globals, for -example. +mapping but in a dedicated module space. By hooking into the module +allocator, KASAN temporarily maps real shadow memory to cover them. +This allows detection of invalid accesses to module globals, for example. This also creates an incompatibility with ``VMAP_STACK``: if the stack lives in vmalloc space, it will be shadowed by the read-only page, and @@ -346,9 +342,10 @@ CONFIG_KASAN_VMALLOC ~~~~~~~~~~~~~~~~~~~~ With ``CONFIG_KASAN_VMALLOC``, KASAN can cover vmalloc space at the -cost of greater memory usage. Currently this is only supported on x86. +cost of greater memory usage. Currently, this is supported on x86, +riscv, s390, and powerpc. -This works by hooking into vmalloc and vmap, and dynamically +This works by hooking into vmalloc and vmap and dynamically allocating real shadow memory to back the mappings. Most mappings in vmalloc space are small, requiring less than a full @@ -367,10 +364,10 @@ memory. To avoid the difficulties around swapping mappings around, KASAN expects that the part of the shadow region that covers the vmalloc space will -not be covered by the early shadow page, but will be left -unmapped. This will require changes in arch-specific code. +not be covered by the early shadow page but will be left unmapped. +This will require changes in arch-specific code. -This allows ``VMAP_STACK`` support on x86, and can simplify support of +This allows ``VMAP_STACK`` support on x86 and can simplify support of architectures that do not have a fixed module region. For developers From fe547fca0c10b0319881287ca17ca9d7dc1b4757 Mon Sep 17 00:00:00 2001 From: Andrey Konovalov Date: Thu, 29 Apr 2021 23:00:39 -0700 Subject: [PATCH 143/175] kasan: docs: update ignoring accesses section Update the "Ignoring accesses" section in KASAN documentation: - Mention __no_sanitize_address/noinstr. - Mention kasan_disable/enable_current(). - Mention kasan_reset_tag()/page_kasan_tag_reset(). - Readability and punctuation clean-ups. Link: https://lkml.kernel.org/r/4531ba5f3eca61f6aade863c136778cc8c807a64.1615559068.git.andreyknvl@google.com Signed-off-by: Andrey Konovalov Reviewed-by: Marco Elver Cc: Alexander Potapenko Cc: Andrey Ryabinin Cc: Dmitry Vyukov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- Documentation/dev-tools/kasan.rst | 34 +++++++++++++++++++++++++++---- 1 file changed, 30 insertions(+), 4 deletions(-) diff --git a/Documentation/dev-tools/kasan.rst b/Documentation/dev-tools/kasan.rst index 894fc8eb374d16..0d2cb74ee05d0a 100644 --- a/Documentation/dev-tools/kasan.rst +++ b/Documentation/dev-tools/kasan.rst @@ -377,12 +377,18 @@ Ignoring accesses ~~~~~~~~~~~~~~~~~ Software KASAN modes use compiler instrumentation to insert validity checks. -Such instrumentation might be incompatible with some part of the kernel, and -therefore needs to be disabled. To disable instrumentation for specific files -or directories, add a line similar to the following to the respective kernel +Such instrumentation might be incompatible with some parts of the kernel, and +therefore needs to be disabled. + +Other parts of the kernel might access metadata for allocated objects. +Normally, KASAN detects and reports such accesses, but in some cases (e.g., +in memory allocators), these accesses are valid. + +For software KASAN modes, to disable instrumentation for a specific file or +directory, add a ``KASAN_SANITIZE`` annotation to the respective kernel Makefile: -- For a single file (e.g. main.o):: +- For a single file (e.g., main.o):: KASAN_SANITIZE_main.o := n @@ -390,6 +396,26 @@ Makefile: KASAN_SANITIZE := n +For software KASAN modes, to disable instrumentation on a per-function basis, +use the KASAN-specific ``__no_sanitize_address`` function attribute or the +generic ``noinstr`` one. + +Note that disabling compiler instrumentation (either on a per-file or a +per-function basis) makes KASAN ignore the accesses that happen directly in +that code for software KASAN modes. It does not help when the accesses happen +indirectly (through calls to instrumented functions) or with the hardware +tag-based mode that does not use compiler instrumentation. + +For software KASAN modes, to disable KASAN reports in a part of the kernel code +for the current task, annotate this part of the code with a +``kasan_disable_current()``/``kasan_enable_current()`` section. This also +disables the reports for indirect accesses that happen through function calls. + +For tag-based KASAN modes (include the hardware one), to disable access +checking, use ``kasan_reset_tag()`` or ``page_kasan_tag_reset()``. Note that +temporarily disabling access checking via ``page_kasan_tag_reset()`` requires +saving and restoring the per-page KASAN tag via +``page_kasan_tag``/``page_kasan_tag_set``. Tests ~~~~~ From fc23c074ef5ab47c2fb0975f70329da93850c6d0 Mon Sep 17 00:00:00 2001 From: Andrey Konovalov Date: Thu, 29 Apr 2021 23:00:42 -0700 Subject: [PATCH 144/175] kasan: docs: update tests section Update the "Tests" section in KASAN documentation: - Add an introductory sentence. - Add proper indentation for the list of ways to run KUnit tests. - Punctuation, readability, and other minor clean-ups. Link: https://lkml.kernel.org/r/fb08845e25c8847ffda271fa19cda2621c04a65b.1615559068.git.andreyknvl@google.com Signed-off-by: Andrey Konovalov Reviewed-by: Marco Elver Cc: Alexander Potapenko Cc: Andrey Ryabinin Cc: Dmitry Vyukov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- Documentation/dev-tools/kasan.rst | 32 +++++++++++++++---------------- 1 file changed, 15 insertions(+), 17 deletions(-) diff --git a/Documentation/dev-tools/kasan.rst b/Documentation/dev-tools/kasan.rst index 0d2cb74ee05d0a..d3f335ffc7517d 100644 --- a/Documentation/dev-tools/kasan.rst +++ b/Documentation/dev-tools/kasan.rst @@ -420,19 +420,20 @@ saving and restoring the per-page KASAN tag via Tests ~~~~~ -KASAN tests consist of two parts: +There are KASAN tests that allow verifying that KASAN works and can detect +certain types of memory corruptions. The tests consist of two parts: 1. Tests that are integrated with the KUnit Test Framework. Enabled with ``CONFIG_KASAN_KUNIT_TEST``. These tests can be run and partially verified -automatically in a few different ways, see the instructions below. +automatically in a few different ways; see the instructions below. 2. Tests that are currently incompatible with KUnit. Enabled with ``CONFIG_KASAN_MODULE_TEST`` and can only be run as a module. These tests can -only be verified manually, by loading the kernel module and inspecting the +only be verified manually by loading the kernel module and inspecting the kernel log for KASAN reports. -Each KUnit-compatible KASAN test prints a KASAN report if an error is detected. -Then the test prints its number and status. +Each KUnit-compatible KASAN test prints one of multiple KASAN reports if an +error is detected. Then the test prints its number and status. When a test passes:: @@ -460,27 +461,24 @@ Or, if one of the tests failed:: not ok 1 - kasan - There are a few ways to run KUnit-compatible KASAN tests. 1. Loadable module -With ``CONFIG_KUNIT`` enabled, ``CONFIG_KASAN_KUNIT_TEST`` can be built as -a loadable module and run on any architecture that supports KASAN by loading -the module with insmod or modprobe. The module is called ``test_kasan``. + With ``CONFIG_KUNIT`` enabled, KASAN-KUnit tests can be built as a loadable + module and run by loading ``test_kasan.ko`` with ``insmod`` or ``modprobe``. 2. Built-In -With ``CONFIG_KUNIT`` built-in, ``CONFIG_KASAN_KUNIT_TEST`` can be built-in -on any architecure that supports KASAN. These and any other KUnit tests enabled -will run and print the results at boot as a late-init call. + With ``CONFIG_KUNIT`` built-in, KASAN-KUnit tests can be built-in as well. + In this case, the tests will run at boot as a late-init call. 3. Using kunit_tool -With ``CONFIG_KUNIT`` and ``CONFIG_KASAN_KUNIT_TEST`` built-in, it's also -possible use ``kunit_tool`` to see the results of these and other KUnit tests -in a more readable way. This will not print the KASAN reports of the tests that -passed. Use `KUnit documentation `_ -for more up-to-date information on ``kunit_tool``. + With ``CONFIG_KUNIT`` and ``CONFIG_KASAN_KUNIT_TEST`` built-in, it is also + possible to use ``kunit_tool`` to see the results of KUnit tests in a more + readable way. This will not print the KASAN reports of the tests that passed. + See `KUnit documentation `_ + for more up-to-date information on ``kunit_tool``. .. _KUnit: https://www.kernel.org/doc/html/latest/dev-tools/kunit/index.html From 23f61f0fe106da8c9f6a883965439ecc2838f116 Mon Sep 17 00:00:00 2001 From: Walter Wu Date: Thu, 29 Apr 2021 23:00:45 -0700 Subject: [PATCH 145/175] kasan: record task_work_add() call stack Why record task_work_add() call stack? Syzbot reports many use-after-free issues for task_work, see [1]. After seeing the free stack and the current auxiliary stack, we think they are useless, we don't know where the work was registered. This work may be the free call stack, so we miss the root cause and don't solve the use-after-free. Add the task_work_add() call stack into the KASAN auxiliary stack in order to improve KASAN reports. It helps programmers solve use-after-free issues. [1]: https://groups.google.com/g/syzkaller-bugs/search?q=kasan%20use-after-free%20task_work_run Link: https://lkml.kernel.org/r/20210316024410.19967-1-walter-zh.wu@mediatek.com Signed-off-by: Walter Wu Suggested-by: Dmitry Vyukov Reviewed-by: Dmitry Vyukov Reviewed-by: Jens Axboe Acked-by: Oleg Nesterov Acked-by: Andrey Konovalov Cc: Andrey Ryabinin Cc: Alexander Potapenko Cc: Andrew Morton Cc: Matthias Brugger Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/task_work.c | 3 +++ mm/kasan/kasan.h | 2 +- 2 files changed, 4 insertions(+), 1 deletion(-) diff --git a/kernel/task_work.c b/kernel/task_work.c index e9316198c64bf5..1698fbe6f0e134 100644 --- a/kernel/task_work.c +++ b/kernel/task_work.c @@ -34,6 +34,9 @@ int task_work_add(struct task_struct *task, struct callback_head *work, { struct callback_head *head; + /* record the work call stack in order to print it in KASAN reports */ + kasan_record_aux_stack(work); + do { head = READ_ONCE(task->task_works); if (unlikely(head == &work_exited)) diff --git a/mm/kasan/kasan.h b/mm/kasan/kasan.h index 071c456a357953..3820ca54743b54 100644 --- a/mm/kasan/kasan.h +++ b/mm/kasan/kasan.h @@ -163,7 +163,7 @@ struct kasan_alloc_meta { struct kasan_track alloc_track; #ifdef CONFIG_KASAN_GENERIC /* - * call_rcu() call stack is stored into struct kasan_alloc_meta. + * The auxiliary stack is stored into struct kasan_alloc_meta. * The free stack is stored into struct kasan_free_meta. */ depot_stack_handle_t aux_stack[2]; From 99734b535d9bf8d5826be8f8f3719dfc586c3452 Mon Sep 17 00:00:00 2001 From: Andrey Konovalov Date: Thu, 29 Apr 2021 23:00:49 -0700 Subject: [PATCH 146/175] kasan: detect false-positives in tests Currently, KASAN-KUnit tests can check that a particular annotated part of code causes a KASAN report. However, they do not check that no unwanted reports happen between the annotated parts. This patch implements these checks. It is done by setting report_data.report_found to false in kasan_test_init() and at the end of KUNIT_EXPECT_KASAN_FAIL() and then checking that it remains false at the beginning of KUNIT_EXPECT_KASAN_FAIL() and in kasan_test_exit(). kunit_add_named_resource() call is moved to kasan_test_init(), and the value of fail_data.report_expected is kept as false in between KUNIT_EXPECT_KASAN_FAIL() annotations for consistency. Link: https://lkml.kernel.org/r/48079c52cc329fbc52f4386996598d58022fb872.1617207873.git.andreyknvl@google.com Signed-off-by: Andrey Konovalov Reviewed-by: Marco Elver Cc: Alexander Potapenko Cc: Dmitry Vyukov Cc: Andrey Ryabinin Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- lib/test_kasan.c | 55 +++++++++++++++++++++++++----------------------- 1 file changed, 29 insertions(+), 26 deletions(-) diff --git a/lib/test_kasan.c b/lib/test_kasan.c index 0882d6c17e62ea..dc05cfc2d12f07 100644 --- a/lib/test_kasan.c +++ b/lib/test_kasan.c @@ -54,6 +54,10 @@ static int kasan_test_init(struct kunit *test) multishot = kasan_save_enable_multi_shot(); kasan_set_tagging_report_once(false); + fail_data.report_found = false; + fail_data.report_expected = false; + kunit_add_named_resource(test, NULL, NULL, &resource, + "kasan_data", &fail_data); return 0; } @@ -61,6 +65,7 @@ static void kasan_test_exit(struct kunit *test) { kasan_set_tagging_report_once(true); kasan_restore_multi_shot(multishot); + KUNIT_EXPECT_FALSE(test, fail_data.report_found); } /** @@ -78,33 +83,31 @@ static void kasan_test_exit(struct kunit *test) * fields, it can reorder or optimize away the accesses to those fields. * Use READ/WRITE_ONCE() for the accesses and compiler barriers around the * expression to prevent that. + * + * In between KUNIT_EXPECT_KASAN_FAIL checks, fail_data.report_found is kept as + * false. This allows detecting KASAN reports that happen outside of the checks + * by asserting !fail_data.report_found at the start of KUNIT_EXPECT_KASAN_FAIL + * and in kasan_test_exit. */ -#define KUNIT_EXPECT_KASAN_FAIL(test, expression) do { \ - if (IS_ENABLED(CONFIG_KASAN_HW_TAGS) && \ - !kasan_async_mode_enabled()) \ - migrate_disable(); \ - WRITE_ONCE(fail_data.report_expected, true); \ - WRITE_ONCE(fail_data.report_found, false); \ - kunit_add_named_resource(test, \ - NULL, \ - NULL, \ - &resource, \ - "kasan_data", &fail_data); \ - barrier(); \ - expression; \ - barrier(); \ - if (kasan_async_mode_enabled()) \ - kasan_force_async_fault(); \ - barrier(); \ - KUNIT_EXPECT_EQ(test, \ - READ_ONCE(fail_data.report_expected), \ - READ_ONCE(fail_data.report_found)); \ - if (IS_ENABLED(CONFIG_KASAN_HW_TAGS) && \ - !kasan_async_mode_enabled()) { \ - if (READ_ONCE(fail_data.report_found)) \ - kasan_enable_tagging_sync(); \ - migrate_enable(); \ - } \ +#define KUNIT_EXPECT_KASAN_FAIL(test, expression) do { \ + if (IS_ENABLED(CONFIG_KASAN_HW_TAGS) && \ + !kasan_async_mode_enabled()) \ + migrate_disable(); \ + KUNIT_EXPECT_FALSE(test, READ_ONCE(fail_data.report_found)); \ + WRITE_ONCE(fail_data.report_expected, true); \ + barrier(); \ + expression; \ + barrier(); \ + KUNIT_EXPECT_EQ(test, \ + READ_ONCE(fail_data.report_expected), \ + READ_ONCE(fail_data.report_found)); \ + if (IS_ENABLED(CONFIG_KASAN_HW_TAGS)) { \ + if (READ_ONCE(fail_data.report_found)) \ + kasan_enable_tagging_sync(); \ + migrate_enable(); \ + } \ + WRITE_ONCE(fail_data.report_found, false); \ + WRITE_ONCE(fail_data.report_expected, false); \ } while (0) #define KASAN_TEST_NEEDS_CONFIG_ON(test, config) do { \ From e2b5bcf9f5baec35c67ebe05c7713ae6fa9ef61f Mon Sep 17 00:00:00 2001 From: Zqiang Date: Thu, 29 Apr 2021 23:00:52 -0700 Subject: [PATCH 147/175] irq_work: record irq_work_queue() call stack Add the irq_work_queue() call stack into the KASAN auxiliary stack in order to improve KASAN reports. this will let us know where the irq work be queued. Link: https://lkml.kernel.org/r/20210331063202.28770-1-qiang.zhang@windriver.com Signed-off-by: Zqiang Reviewed-by: Dmitry Vyukov Acked-by: Andrey Konovalov Cc: Andrey Ryabinin Cc: Alexander Potapenko Cc: Matthias Brugger Cc: Oleg Nesterov Cc: Walter Wu Cc: Frederic Weisbecker Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/irq_work.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/kernel/irq_work.c b/kernel/irq_work.c index e8da1e71583a14..23a7a0ba1388a2 100644 --- a/kernel/irq_work.c +++ b/kernel/irq_work.c @@ -19,7 +19,7 @@ #include #include #include - +#include static DEFINE_PER_CPU(struct llist_head, raised_list); static DEFINE_PER_CPU(struct llist_head, lazy_list); @@ -70,6 +70,9 @@ bool irq_work_queue(struct irq_work *work) if (!irq_work_claim(work)) return false; + /*record irq_work call stack in order to print it in KASAN reports*/ + kasan_record_aux_stack(work); + /* Queue the entry and raise the IPI if needed. */ preempt_disable(); __irq_work_queue_local(work); @@ -98,6 +101,8 @@ bool irq_work_queue_on(struct irq_work *work, int cpu) if (!irq_work_claim(work)) return false; + kasan_record_aux_stack(work); + preempt_disable(); if (cpu != smp_processor_id()) { /* Arch remote IPI send/receive backend aren't NMI safe */ From 1f9d03c5e999ed5a57fa4d8aec9fdf67a6234b80 Mon Sep 17 00:00:00 2001 From: Kefeng Wang Date: Thu, 29 Apr 2021 23:00:55 -0700 Subject: [PATCH 148/175] mm: move mem_init_print_info() into mm_init() mem_init_print_info() is called in mem_init() on each architecture, and pass NULL argument, so using void argument and move it into mm_init(). Link: https://lkml.kernel.org/r/20210317015210.33641-1-wangkefeng.wang@huawei.com Signed-off-by: Kefeng Wang Acked-by: Dave Hansen [x86] Reviewed-by: Christophe Leroy [powerpc] Acked-by: David Hildenbrand Tested-by: Anatoly Pugachev [sparc64] Acked-by: Russell King [arm] Acked-by: Mike Rapoport Cc: Catalin Marinas Cc: Richard Henderson Cc: Guo Ren Cc: Yoshinori Sato Cc: Huacai Chen Cc: Jonas Bonn Cc: Palmer Dabbelt Cc: Heiko Carstens Cc: "David S. Miller" Cc: "Peter Zijlstra" Cc: Ingo Molnar Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/alpha/mm/init.c | 1 - arch/arc/mm/init.c | 1 - arch/arm/mm/init.c | 2 -- arch/arm64/mm/init.c | 2 -- arch/csky/mm/init.c | 1 - arch/h8300/mm/init.c | 2 -- arch/hexagon/mm/init.c | 1 - arch/ia64/mm/init.c | 1 - arch/m68k/mm/init.c | 1 - arch/microblaze/mm/init.c | 1 - arch/mips/loongson64/numa.c | 1 - arch/mips/mm/init.c | 1 - arch/mips/sgi-ip27/ip27-memory.c | 1 - arch/nds32/mm/init.c | 1 - arch/nios2/mm/init.c | 1 - arch/openrisc/mm/init.c | 2 -- arch/parisc/mm/init.c | 2 -- arch/powerpc/mm/mem.c | 1 - arch/riscv/mm/init.c | 1 - arch/s390/mm/init.c | 2 -- arch/sh/mm/init.c | 1 - arch/sparc/mm/init_32.c | 2 -- arch/sparc/mm/init_64.c | 1 - arch/um/kernel/mem.c | 1 - arch/x86/mm/init_32.c | 2 -- arch/x86/mm/init_64.c | 2 -- arch/xtensa/mm/init.c | 1 - include/linux/mm.h | 2 +- init/main.c | 1 + mm/page_alloc.c | 10 +++++----- 30 files changed, 7 insertions(+), 42 deletions(-) diff --git a/arch/alpha/mm/init.c b/arch/alpha/mm/init.c index 3c42b3147fd6f0..a97650a618f1b6 100644 --- a/arch/alpha/mm/init.c +++ b/arch/alpha/mm/init.c @@ -282,5 +282,4 @@ mem_init(void) set_max_mapnr(max_low_pfn); high_memory = (void *) __va(max_low_pfn * PAGE_SIZE); memblock_free_all(); - mem_init_print_info(NULL); } diff --git a/arch/arc/mm/init.c b/arch/arc/mm/init.c index ce07e697916c8f..33832e36bdb7d5 100644 --- a/arch/arc/mm/init.c +++ b/arch/arc/mm/init.c @@ -194,7 +194,6 @@ void __init mem_init(void) { memblock_free_all(); highmem_init(); - mem_init_print_info(NULL); } #ifdef CONFIG_HIGHMEM diff --git a/arch/arm/mm/init.c b/arch/arm/mm/init.c index 828a2561b22958..7022b7b5c40042 100644 --- a/arch/arm/mm/init.c +++ b/arch/arm/mm/init.c @@ -316,8 +316,6 @@ void __init mem_init(void) free_highpages(); - mem_init_print_info(NULL); - /* * Check boundaries twice: Some fundamental inconsistencies can * be detected at build time already. diff --git a/arch/arm64/mm/init.c b/arch/arm64/mm/init.c index 470f92e6a5426c..ef031511ce29fe 100644 --- a/arch/arm64/mm/init.c +++ b/arch/arm64/mm/init.c @@ -491,8 +491,6 @@ void __init mem_init(void) /* this will put all unused low memory onto the freelists */ memblock_free_all(); - mem_init_print_info(NULL); - /* * Check boundaries twice: Some fundamental inconsistencies can be * detected at build time already. diff --git a/arch/csky/mm/init.c b/arch/csky/mm/init.c index 894050a8ce093b..bf2004aa811a0c 100644 --- a/arch/csky/mm/init.c +++ b/arch/csky/mm/init.c @@ -107,7 +107,6 @@ void __init mem_init(void) free_highmem_page(page); } #endif - mem_init_print_info(NULL); } void free_initmem(void) diff --git a/arch/h8300/mm/init.c b/arch/h8300/mm/init.c index 1f3b345d68b97b..f7bf4693e3b24d 100644 --- a/arch/h8300/mm/init.c +++ b/arch/h8300/mm/init.c @@ -98,6 +98,4 @@ void __init mem_init(void) /* this will put all low memory onto the freelists */ memblock_free_all(); - - mem_init_print_info(NULL); } diff --git a/arch/hexagon/mm/init.c b/arch/hexagon/mm/init.c index f2e6c868e477c8..f01e91e10d95d2 100644 --- a/arch/hexagon/mm/init.c +++ b/arch/hexagon/mm/init.c @@ -55,7 +55,6 @@ void __init mem_init(void) { /* No idea where this is actually declared. Seems to evade LXR. */ memblock_free_all(); - mem_init_print_info(NULL); /* * To-Do: someone somewhere should wipe out the bootmem map diff --git a/arch/ia64/mm/init.c b/arch/ia64/mm/init.c index 97a13eda81bfea..064a967a7b6e3c 100644 --- a/arch/ia64/mm/init.c +++ b/arch/ia64/mm/init.c @@ -449,7 +449,6 @@ mem_init (void) set_max_mapnr(max_low_pfn); high_memory = __va(max_low_pfn * PAGE_SIZE); memblock_free_all(); - mem_init_print_info(NULL); /* * For fsyscall entrpoints with no light-weight handler, use the ordinary diff --git a/arch/m68k/mm/init.c b/arch/m68k/mm/init.c index 14c1e541451cad..1759ab875d4728 100644 --- a/arch/m68k/mm/init.c +++ b/arch/m68k/mm/init.c @@ -153,5 +153,4 @@ void __init mem_init(void) /* this will put all memory onto the freelists */ memblock_free_all(); init_pointer_tables(); - mem_init_print_info(NULL); } diff --git a/arch/microblaze/mm/init.c b/arch/microblaze/mm/init.c index 05cf1fb3f5ffa5..ab55c70380a5cc 100644 --- a/arch/microblaze/mm/init.c +++ b/arch/microblaze/mm/init.c @@ -131,7 +131,6 @@ void __init mem_init(void) highmem_setup(); #endif - mem_init_print_info(NULL); mem_init_done = 1; } diff --git a/arch/mips/loongson64/numa.c b/arch/mips/loongson64/numa.c index 8315c871c4352d..fa9b4a487a479d 100644 --- a/arch/mips/loongson64/numa.c +++ b/arch/mips/loongson64/numa.c @@ -178,7 +178,6 @@ void __init mem_init(void) high_memory = (void *) __va(get_num_physpages() << PAGE_SHIFT); memblock_free_all(); setup_zero_pages(); /* This comes from node 0 */ - mem_init_print_info(NULL); } /* All PCI device belongs to logical Node-0 */ diff --git a/arch/mips/mm/init.c b/arch/mips/mm/init.c index 5cb73bf74a8b6c..c36358758969fa 100644 --- a/arch/mips/mm/init.c +++ b/arch/mips/mm/init.c @@ -467,7 +467,6 @@ void __init mem_init(void) memblock_free_all(); setup_zero_pages(); /* Setup zeroed pages. */ mem_init_free_highmem(); - mem_init_print_info(NULL); #ifdef CONFIG_64BIT if ((unsigned long) &_text > (unsigned long) CKSEG0) diff --git a/arch/mips/sgi-ip27/ip27-memory.c b/arch/mips/sgi-ip27/ip27-memory.c index 87bb6945ec25dc..6173684b5aaa04 100644 --- a/arch/mips/sgi-ip27/ip27-memory.c +++ b/arch/mips/sgi-ip27/ip27-memory.c @@ -420,5 +420,4 @@ void __init mem_init(void) high_memory = (void *) __va(get_num_physpages() << PAGE_SHIFT); memblock_free_all(); setup_zero_pages(); /* This comes from node 0 */ - mem_init_print_info(NULL); } diff --git a/arch/nds32/mm/init.c b/arch/nds32/mm/init.c index fa86f7b2f4166d..f63f839738c460 100644 --- a/arch/nds32/mm/init.c +++ b/arch/nds32/mm/init.c @@ -191,7 +191,6 @@ void __init mem_init(void) /* this will put all low memory onto the freelists */ memblock_free_all(); - mem_init_print_info(NULL); pr_info("virtual kernel memory layout:\n" " fixmap : 0x%08lx - 0x%08lx (%4ld kB)\n" diff --git a/arch/nios2/mm/init.c b/arch/nios2/mm/init.c index 61862dbb0e3267..613fcaa5988a9a 100644 --- a/arch/nios2/mm/init.c +++ b/arch/nios2/mm/init.c @@ -71,7 +71,6 @@ void __init mem_init(void) /* this will put all memory onto the freelists */ memblock_free_all(); - mem_init_print_info(NULL); } void __init mmu_init(void) diff --git a/arch/openrisc/mm/init.c b/arch/openrisc/mm/init.c index bf9b2310fc9364..d5641198b90ce7 100644 --- a/arch/openrisc/mm/init.c +++ b/arch/openrisc/mm/init.c @@ -211,8 +211,6 @@ void __init mem_init(void) /* this will put all low memory onto the freelists */ memblock_free_all(); - mem_init_print_info(NULL); - printk("mem_init_done ...........................................\n"); mem_init_done = 1; return; diff --git a/arch/parisc/mm/init.c b/arch/parisc/mm/init.c index 9ca4e4ff689598..591a4e93941537 100644 --- a/arch/parisc/mm/init.c +++ b/arch/parisc/mm/init.c @@ -573,8 +573,6 @@ void __init mem_init(void) #endif parisc_vmalloc_start = SET_MAP_OFFSET(MAP_START); - mem_init_print_info(NULL); - #if 0 /* * Do not expose the virtual kernel memory layout to userspace. diff --git a/arch/powerpc/mm/mem.c b/arch/powerpc/mm/mem.c index 4e8ce6d8523237..7e11c4cb08b89c 100644 --- a/arch/powerpc/mm/mem.c +++ b/arch/powerpc/mm/mem.c @@ -312,7 +312,6 @@ void __init mem_init(void) (mfspr(SPRN_TLB1CFG) & TLBnCFG_N_ENTRY) - 1; #endif - mem_init_print_info(NULL); #ifdef CONFIG_PPC32 pr_info("Kernel virtual memory layout:\n"); #ifdef CONFIG_KASAN diff --git a/arch/riscv/mm/init.c b/arch/riscv/mm/init.c index 067583ab1bd7f2..92e39cfa5227ae 100644 --- a/arch/riscv/mm/init.c +++ b/arch/riscv/mm/init.c @@ -102,7 +102,6 @@ void __init mem_init(void) high_memory = (void *)(__va(PFN_PHYS(max_low_pfn))); memblock_free_all(); - mem_init_print_info(NULL); print_vm_layout(); } diff --git a/arch/s390/mm/init.c b/arch/s390/mm/init.c index 0e76b2127dc6b8..8ac710de1ab1b8 100644 --- a/arch/s390/mm/init.c +++ b/arch/s390/mm/init.c @@ -209,8 +209,6 @@ void __init mem_init(void) setup_zero_pages(); /* Setup zeroed pages. */ cmma_init_nodat(); - - mem_init_print_info(NULL); } void free_initmem(void) diff --git a/arch/sh/mm/init.c b/arch/sh/mm/init.c index 0db6919af8d32c..168d7d4dd73598 100644 --- a/arch/sh/mm/init.c +++ b/arch/sh/mm/init.c @@ -359,7 +359,6 @@ void __init mem_init(void) vsyscall_init(); - mem_init_print_info(NULL); pr_info("virtual kernel memory layout:\n" " fixmap : 0x%08lx - 0x%08lx (%4ld kB)\n" " vmalloc : 0x%08lx - 0x%08lx (%4ld MB)\n" diff --git a/arch/sparc/mm/init_32.c b/arch/sparc/mm/init_32.c index 6139c5700ccc9c..1e9f577f084daf 100644 --- a/arch/sparc/mm/init_32.c +++ b/arch/sparc/mm/init_32.c @@ -292,8 +292,6 @@ void __init mem_init(void) map_high_region(start_pfn, end_pfn); } - - mem_init_print_info(NULL); } void sparc_flush_page_to_ram(struct page *page) diff --git a/arch/sparc/mm/init_64.c b/arch/sparc/mm/init_64.c index 182bb7bdaa0a15..e454f179cf5df5 100644 --- a/arch/sparc/mm/init_64.c +++ b/arch/sparc/mm/init_64.c @@ -2520,7 +2520,6 @@ void __init mem_init(void) } mark_page_reserved(mem_map_zero); - mem_init_print_info(NULL); if (tlb_type == cheetah || tlb_type == cheetah_plus) cheetah_ecache_flush_init(); diff --git a/arch/um/kernel/mem.c b/arch/um/kernel/mem.c index 9242dc91d75193..9019ff5905b136 100644 --- a/arch/um/kernel/mem.c +++ b/arch/um/kernel/mem.c @@ -54,7 +54,6 @@ void __init mem_init(void) memblock_free_all(); max_low_pfn = totalram_pages(); max_pfn = max_low_pfn; - mem_init_print_info(NULL); kmalloc_ok = 1; } diff --git a/arch/x86/mm/init_32.c b/arch/x86/mm/init_32.c index da31c2635ee437..21ffb03f6c727c 100644 --- a/arch/x86/mm/init_32.c +++ b/arch/x86/mm/init_32.c @@ -755,8 +755,6 @@ void __init mem_init(void) after_bootmem = 1; x86_init.hyper.init_after_bootmem(); - mem_init_print_info(NULL); - /* * Check boundaries twice: Some fundamental inconsistencies can * be detected at build time already. diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c index 58ae2f746b3ee0..e527d829e1ed78 100644 --- a/arch/x86/mm/init_64.c +++ b/arch/x86/mm/init_64.c @@ -1351,8 +1351,6 @@ void __init mem_init(void) kclist_add(&kcore_vsyscall, (void *)VSYSCALL_ADDR, PAGE_SIZE, KCORE_USER); preallocate_vmalloc_pages(); - - mem_init_print_info(NULL); } #ifdef CONFIG_DEFERRED_STRUCT_PAGE_INIT diff --git a/arch/xtensa/mm/init.c b/arch/xtensa/mm/init.c index 2daeba9e454ea8..6a32b2cf271856 100644 --- a/arch/xtensa/mm/init.c +++ b/arch/xtensa/mm/init.c @@ -119,7 +119,6 @@ void __init mem_init(void) memblock_free_all(); - mem_init_print_info(NULL); pr_info("virtual kernel memory layout:\n" #ifdef CONFIG_KASAN " kasan : 0x%08lx - 0x%08lx (%5lu MB)\n" diff --git a/include/linux/mm.h b/include/linux/mm.h index 64b2e3a0b94df8..011f43605807f8 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -2360,7 +2360,7 @@ extern unsigned long free_reserved_area(void *start, void *end, int poison, const char *s); extern void adjust_managed_page_count(struct page *page, long count); -extern void mem_init_print_info(const char *str); +extern void mem_init_print_info(void); extern void reserve_bootmem_region(phys_addr_t start, phys_addr_t end); diff --git a/init/main.c b/init/main.c index ae96c79ad2d3b6..dd11bfd10eadea 100644 --- a/init/main.c +++ b/init/main.c @@ -830,6 +830,7 @@ static void __init mm_init(void) report_meminit(); stack_depot_init(); mem_init(); + mem_init_print_info(); /* page_owner must be initialized after buddy is ready */ page_ext_init_flatmem_late(); kmem_cache_init(); diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 6314de5387f516..b55073d0e84a36 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -7734,7 +7734,7 @@ unsigned long free_reserved_area(void *start, void *end, int poison, const char return pages; } -void __init mem_init_print_info(const char *str) +void __init mem_init_print_info(void) { unsigned long physpages, codesize, datasize, rosize, bss_size; unsigned long init_code_size, init_data_size; @@ -7773,17 +7773,17 @@ void __init mem_init_print_info(const char *str) #ifdef CONFIG_HIGHMEM ", %luK highmem" #endif - "%s%s)\n", + ")\n", nr_free_pages() << (PAGE_SHIFT - 10), physpages << (PAGE_SHIFT - 10), codesize >> 10, datasize >> 10, rosize >> 10, (init_data_size + init_code_size) >> 10, bss_size >> 10, (physpages - totalram_pages() - totalcma_pages) << (PAGE_SHIFT - 10), - totalcma_pages << (PAGE_SHIFT - 10), + totalcma_pages << (PAGE_SHIFT - 10) #ifdef CONFIG_HIGHMEM - totalhigh_pages() << (PAGE_SHIFT - 10), + , totalhigh_pages() << (PAGE_SHIFT - 10) #endif - str ? ", " : "", str ? str : ""); + ); } /** From 77febec206262bd80c4176f2281a7970cfe69536 Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Thu, 29 Apr 2021 23:00:58 -0700 Subject: [PATCH 149/175] mm/page_alloc: drop pr_info_ratelimited() in alloc_contig_range() The information that some PFNs are busy is: a) not helpful for ordinary users: we don't even know *who* called alloc_contig_range(). This is certainly not worth a pr_info.*(). b) not really helpful for debugging: we don't have any details *why* these PFNs are busy, and that is what we usually care about. c) not complete: there are other cases where we fail alloc_contig_range() using different paths that are not getting recorded. For example, we reach this path once we succeeded in isolating pageblocks, but failed to migrate some pages - which can happen easily on ZONE_NORMAL (i.e., has_unmovable_pages() is racy) but also on ZONE_MOVABLE i.e., we would have to retry longer to migrate). For example via virtio-mem when unplugging memory, we can create quite some noise (especially with ZONE_NORMAL) that is not of interest to users - it's expected that some allocations may fail as memory is busy. Let's just drop that pr_info_ratelimit() and rather implement a dynamic debugging mechanism in the future that can give us a better reason why alloc_contig_range() failed on specific pages. Link: https://lkml.kernel.org/r/20210301150945.77012-1-david@redhat.com Signed-off-by: David Hildenbrand Reviewed-by: Zi Yan Acked-by: Michal Hocko Reviewed-by: Oscar Salvador Acked-by: Minchan Kim Cc: Vlastimil Babka Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_alloc.c | 2 -- 1 file changed, 2 deletions(-) diff --git a/mm/page_alloc.c b/mm/page_alloc.c index b55073d0e84a36..e59d5450d891a8 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -8676,8 +8676,6 @@ int alloc_contig_range(unsigned long start, unsigned long end, /* Make sure the range is really isolated. */ if (test_pages_isolated(outer_start, end, 0)) { - pr_info_ratelimited("%s: [%lx, %lx) PFNs busy\n", - __func__, outer_start, end); ret = -EBUSY; goto done; } From cef4c7d29d776643e86b600e5ea823f047445d0b Mon Sep 17 00:00:00 2001 From: Minchan Kim Date: Thu, 29 Apr 2021 23:01:01 -0700 Subject: [PATCH 150/175] mm: remove lru_add_drain_all in alloc_contig_range __alloc_contig_migrate_range already has lru_add_drain_all call via migrate_prep. It's necessary to move LRU taget pages into LRU list to be able to isolated. However, lru_add_drain_all call after __alloc_contig_migrate_range is pointless since it has changed source page freeing from putback_lru_pages to put_page[1]. This patch removes it. [1] c6c919eb90e0, ("mm: use put_page() to free page instead of putback_lru_page()" Link: https://lkml.kernel.org/r/20210303204512.2863087-1-minchan@kernel.org Signed-off-by: Minchan Kim Reviewed-by: Oscar Salvador Acked-by: Vlastimil Babka Acked-by: Michal Hocko Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_alloc.c | 2 -- 1 file changed, 2 deletions(-) diff --git a/mm/page_alloc.c b/mm/page_alloc.c index e59d5450d891a8..c73557bb8c58a8 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -8649,8 +8649,6 @@ int alloc_contig_range(unsigned long start, unsigned long end, * isolated thus they won't get removed from buddy. */ - lru_add_drain_all(); - order = 0; outer_start = start; while (!PageBuddy(pfn_to_page(outer_start))) { From f73c6c8805ed0762d99122d5332fcf42b0c8fbb8 Mon Sep 17 00:00:00 2001 From: Yu Zhao Date: Thu, 29 Apr 2021 23:01:04 -0700 Subject: [PATCH 151/175] include/linux/page-flags-layout.h: correctly determine LAST_CPUPID_WIDTH The naming convention used in include/linux/page-flags-layout.h: *_SHIFT: the number of bits trying to allocate *_WIDTH: the number of bits successfully allocated So when it comes to LAST_CPUPID_WIDTH, we need to check whether all previous *_WIDTH and LAST_CPUPID_SHIFT can fit into page flags. This means we need to use NODES_WIDTH, not NODES_SHIFT. Link: https://lkml.kernel.org/r/20210303071609.797782-1-yuzhao@google.com Signed-off-by: Yu Zhao Cc: Matthew Wilcox Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/page-flags-layout.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/linux/page-flags-layout.h b/include/linux/page-flags-layout.h index 7d4ec26d8a3ed2..295c2c687d2cec 100644 --- a/include/linux/page-flags-layout.h +++ b/include/linux/page-flags-layout.h @@ -83,7 +83,7 @@ #define KASAN_TAG_WIDTH 0 #endif -#if SECTIONS_WIDTH+ZONES_WIDTH+NODES_SHIFT+LAST_CPUPID_SHIFT+KASAN_TAG_WIDTH \ +#if SECTIONS_WIDTH+ZONES_WIDTH+NODES_WIDTH+LAST_CPUPID_SHIFT+KASAN_TAG_WIDTH \ <= BITS_PER_LONG - NR_PAGEFLAGS #define LAST_CPUPID_WIDTH LAST_CPUPID_SHIFT #else From 1587db62d8c0dbd943752f657b452213e1c4d8d4 Mon Sep 17 00:00:00 2001 From: Yu Zhao Date: Thu, 29 Apr 2021 23:01:07 -0700 Subject: [PATCH 152/175] include/linux/page-flags-layout.h: cleanups Tidy things up and delete comments stating the obvious with typos or making no sense. Link: https://lkml.kernel.org/r/20210303071609.797782-2-yuzhao@google.com Signed-off-by: Yu Zhao Cc: Matthew Wilcox Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/page-flags-layout.h | 62 +++++++++++++++---------------- mm/mm_init.c | 4 -- 2 files changed, 29 insertions(+), 37 deletions(-) diff --git a/include/linux/page-flags-layout.h b/include/linux/page-flags-layout.h index 295c2c687d2cec..ef1e3e736e1483 100644 --- a/include/linux/page-flags-layout.h +++ b/include/linux/page-flags-layout.h @@ -21,16 +21,17 @@ #elif MAX_NR_ZONES <= 8 #define ZONES_SHIFT 3 #else -#error ZONES_SHIFT -- too many zones configured adjust calculation +#error ZONES_SHIFT "Too many zones configured" #endif +#define ZONES_WIDTH ZONES_SHIFT + #ifdef CONFIG_SPARSEMEM #include - -/* SECTION_SHIFT #bits space required to store a section # */ #define SECTIONS_SHIFT (MAX_PHYSMEM_BITS - SECTION_SIZE_BITS) - -#endif /* CONFIG_SPARSEMEM */ +#else +#define SECTIONS_SHIFT 0 +#endif #ifndef BUILD_VDSO32_64 /* @@ -54,17 +55,28 @@ #define SECTIONS_WIDTH 0 #endif -#define ZONES_WIDTH ZONES_SHIFT - -#if SECTIONS_WIDTH+ZONES_WIDTH+NODES_SHIFT <= BITS_PER_LONG - NR_PAGEFLAGS +#if ZONES_WIDTH + SECTIONS_WIDTH + NODES_SHIFT <= BITS_PER_LONG - NR_PAGEFLAGS #define NODES_WIDTH NODES_SHIFT -#else -#ifdef CONFIG_SPARSEMEM_VMEMMAP +#elif defined(CONFIG_SPARSEMEM_VMEMMAP) #error "Vmemmap: No space for nodes field in page flags" -#endif +#else #define NODES_WIDTH 0 #endif +/* + * Note that this #define MUST have a value so that it can be tested with + * the IS_ENABLED() macro. + */ +#if NODES_SHIFT != 0 && NODES_WIDTH == 0 +#define NODE_NOT_IN_PAGE_FLAGS 1 +#endif + +#if defined(CONFIG_KASAN_SW_TAGS) || defined(CONFIG_KASAN_HW_TAGS) +#define KASAN_TAG_WIDTH 8 +#else +#define KASAN_TAG_WIDTH 0 +#endif + #ifdef CONFIG_NUMA_BALANCING #define LAST__PID_SHIFT 8 #define LAST__PID_MASK ((1 << LAST__PID_SHIFT)-1) @@ -77,36 +89,20 @@ #define LAST_CPUPID_SHIFT 0 #endif -#if defined(CONFIG_KASAN_SW_TAGS) || defined(CONFIG_KASAN_HW_TAGS) -#define KASAN_TAG_WIDTH 8 -#else -#define KASAN_TAG_WIDTH 0 -#endif - -#if SECTIONS_WIDTH+ZONES_WIDTH+NODES_WIDTH+LAST_CPUPID_SHIFT+KASAN_TAG_WIDTH \ +#if ZONES_WIDTH + SECTIONS_WIDTH + NODES_WIDTH + KASAN_TAG_WIDTH + LAST_CPUPID_SHIFT \ <= BITS_PER_LONG - NR_PAGEFLAGS #define LAST_CPUPID_WIDTH LAST_CPUPID_SHIFT #else #define LAST_CPUPID_WIDTH 0 #endif -#if SECTIONS_WIDTH+NODES_WIDTH+ZONES_WIDTH+LAST_CPUPID_WIDTH+KASAN_TAG_WIDTH \ - > BITS_PER_LONG - NR_PAGEFLAGS -#error "Not enough bits in page flags" -#endif - -/* - * We are going to use the flags for the page to node mapping if its in - * there. This includes the case where there is no node, so it is implicit. - * Note that this #define MUST have a value so that it can be tested with - * the IS_ENABLED() macro. - */ -#if !(NODES_WIDTH > 0 || NODES_SHIFT == 0) -#define NODE_NOT_IN_PAGE_FLAGS 1 +#if LAST_CPUPID_SHIFT != 0 && LAST_CPUPID_WIDTH == 0 +#define LAST_CPUPID_NOT_IN_PAGE_FLAGS #endif -#if defined(CONFIG_NUMA_BALANCING) && LAST_CPUPID_WIDTH == 0 -#define LAST_CPUPID_NOT_IN_PAGE_FLAGS +#if ZONES_WIDTH + SECTIONS_WIDTH + NODES_WIDTH + KASAN_TAG_WIDTH + LAST_CPUPID_WIDTH \ + > BITS_PER_LONG - NR_PAGEFLAGS +#error "Not enough bits in page flags" #endif #endif diff --git a/mm/mm_init.c b/mm/mm_init.c index 8e02e865cc65af..9ddaf0e1b0ab95 100644 --- a/mm/mm_init.c +++ b/mm/mm_init.c @@ -19,10 +19,6 @@ #ifdef CONFIG_DEBUG_MEMORY_INIT int __meminitdata mminit_loglevel; -#ifndef SECTIONS_SHIFT -#define SECTIONS_SHIFT 0 -#endif - /* The zonelists are simply reported, validation is manual. */ void __init mminit_verify_zonelist(void) { From 8e6a930bb3ea6aa4b623eececc25465d09ee7b13 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Thu, 29 Apr 2021 23:01:10 -0700 Subject: [PATCH 153/175] mm/page_alloc: rename alloc_mask to alloc_gfp Patch series "Rationalise __alloc_pages wrappers", v3. I was poking around the __alloc_pages variants trying to understand why they each exist, and couldn't really find a good justification for keeping __alloc_pages and __alloc_pages_nodemask as separate functions. That led to getting rid of alloc_pages_current() and then I noticed the documentation was bad, and then I noticed the mempolicy documentation wasn't included. Anyway, this is all cleanups & doc fixes. This patch (of 7): We have two masks involved -- the nodemask and the gfp mask, so alloc_mask is an unclear name. Link: https://lkml.kernel.org/r/20210225150642.2582252-2-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Acked-by: Vlastimil Babka Cc: Michal Hocko Cc: Mike Rapoport Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_alloc.c | 19 ++++++++++--------- 1 file changed, 10 insertions(+), 9 deletions(-) diff --git a/mm/page_alloc.c b/mm/page_alloc.c index c73557bb8c58a8..5932a95830dd20 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -4966,7 +4966,7 @@ __alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order, static inline bool prepare_alloc_pages(gfp_t gfp_mask, unsigned int order, int preferred_nid, nodemask_t *nodemask, - struct alloc_context *ac, gfp_t *alloc_mask, + struct alloc_context *ac, gfp_t *alloc_gfp, unsigned int *alloc_flags) { ac->highest_zoneidx = gfp_zone(gfp_mask); @@ -4975,7 +4975,7 @@ static inline bool prepare_alloc_pages(gfp_t gfp_mask, unsigned int order, ac->migratetype = gfp_migratetype(gfp_mask); if (cpusets_enabled()) { - *alloc_mask |= __GFP_HARDWALL; + *alloc_gfp |= __GFP_HARDWALL; /* * When we are in the interrupt context, it is irrelevant * to the current task context. It means that any node ok. @@ -5019,7 +5019,7 @@ __alloc_pages_nodemask(gfp_t gfp_mask, unsigned int order, int preferred_nid, { struct page *page; unsigned int alloc_flags = ALLOC_WMARK_LOW; - gfp_t alloc_mask; /* The gfp_t that was actually used for allocation */ + gfp_t alloc_gfp; /* The gfp_t that was actually used for allocation */ struct alloc_context ac = { }; /* @@ -5032,8 +5032,9 @@ __alloc_pages_nodemask(gfp_t gfp_mask, unsigned int order, int preferred_nid, } gfp_mask &= gfp_allowed_mask; - alloc_mask = gfp_mask; - if (!prepare_alloc_pages(gfp_mask, order, preferred_nid, nodemask, &ac, &alloc_mask, &alloc_flags)) + alloc_gfp = gfp_mask; + if (!prepare_alloc_pages(gfp_mask, order, preferred_nid, nodemask, &ac, + &alloc_gfp, &alloc_flags)) return NULL; /* @@ -5043,7 +5044,7 @@ __alloc_pages_nodemask(gfp_t gfp_mask, unsigned int order, int preferred_nid, alloc_flags |= alloc_flags_nofragment(ac.preferred_zoneref->zone, gfp_mask); /* First allocation attempt */ - page = get_page_from_freelist(alloc_mask, order, alloc_flags, &ac); + page = get_page_from_freelist(alloc_gfp, order, alloc_flags, &ac); if (likely(page)) goto out; @@ -5053,7 +5054,7 @@ __alloc_pages_nodemask(gfp_t gfp_mask, unsigned int order, int preferred_nid, * from a particular context which has been marked by * memalloc_no{fs,io}_{save,restore}. */ - alloc_mask = current_gfp_context(gfp_mask); + alloc_gfp = current_gfp_context(gfp_mask); ac.spread_dirty_pages = false; /* @@ -5062,7 +5063,7 @@ __alloc_pages_nodemask(gfp_t gfp_mask, unsigned int order, int preferred_nid, */ ac.nodemask = nodemask; - page = __alloc_pages_slowpath(alloc_mask, order, &ac); + page = __alloc_pages_slowpath(alloc_gfp, order, &ac); out: if (memcg_kmem_enabled() && (gfp_mask & __GFP_ACCOUNT) && page && @@ -5071,7 +5072,7 @@ __alloc_pages_nodemask(gfp_t gfp_mask, unsigned int order, int preferred_nid, page = NULL; } - trace_mm_page_alloc(page, order, alloc_mask, ac.migratetype); + trace_mm_page_alloc(page, order, alloc_gfp, ac.migratetype); return page; } From 6e5e0f286eb0ecf12afaa3e73c321bc5bf599abb Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Thu, 29 Apr 2021 23:01:13 -0700 Subject: [PATCH 154/175] mm/page_alloc: rename gfp_mask to gfp Shorten some overly-long lines by renaming this identifier. Link: https://lkml.kernel.org/r/20210225150642.2582252-3-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Acked-by: Vlastimil Babka Cc: Michal Hocko Cc: Mike Rapoport Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_alloc.c | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 5932a95830dd20..c565ebad02eee7 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -5014,7 +5014,7 @@ static inline bool prepare_alloc_pages(gfp_t gfp_mask, unsigned int order, * This is the 'heart' of the zoned buddy allocator. */ struct page * -__alloc_pages_nodemask(gfp_t gfp_mask, unsigned int order, int preferred_nid, +__alloc_pages_nodemask(gfp_t gfp, unsigned int order, int preferred_nid, nodemask_t *nodemask) { struct page *page; @@ -5027,13 +5027,13 @@ __alloc_pages_nodemask(gfp_t gfp_mask, unsigned int order, int preferred_nid, * so bail out early if the request is out of bound. */ if (unlikely(order >= MAX_ORDER)) { - WARN_ON_ONCE(!(gfp_mask & __GFP_NOWARN)); + WARN_ON_ONCE(!(gfp & __GFP_NOWARN)); return NULL; } - gfp_mask &= gfp_allowed_mask; - alloc_gfp = gfp_mask; - if (!prepare_alloc_pages(gfp_mask, order, preferred_nid, nodemask, &ac, + gfp &= gfp_allowed_mask; + alloc_gfp = gfp; + if (!prepare_alloc_pages(gfp, order, preferred_nid, nodemask, &ac, &alloc_gfp, &alloc_flags)) return NULL; @@ -5041,7 +5041,7 @@ __alloc_pages_nodemask(gfp_t gfp_mask, unsigned int order, int preferred_nid, * Forbid the first pass from falling back to types that fragment * memory until all local zones are considered. */ - alloc_flags |= alloc_flags_nofragment(ac.preferred_zoneref->zone, gfp_mask); + alloc_flags |= alloc_flags_nofragment(ac.preferred_zoneref->zone, gfp); /* First allocation attempt */ page = get_page_from_freelist(alloc_gfp, order, alloc_flags, &ac); @@ -5054,7 +5054,7 @@ __alloc_pages_nodemask(gfp_t gfp_mask, unsigned int order, int preferred_nid, * from a particular context which has been marked by * memalloc_no{fs,io}_{save,restore}. */ - alloc_gfp = current_gfp_context(gfp_mask); + alloc_gfp = current_gfp_context(gfp); ac.spread_dirty_pages = false; /* @@ -5066,8 +5066,8 @@ __alloc_pages_nodemask(gfp_t gfp_mask, unsigned int order, int preferred_nid, page = __alloc_pages_slowpath(alloc_gfp, order, &ac); out: - if (memcg_kmem_enabled() && (gfp_mask & __GFP_ACCOUNT) && page && - unlikely(__memcg_kmem_charge_page(page, gfp_mask, order) != 0)) { + if (memcg_kmem_enabled() && (gfp & __GFP_ACCOUNT) && page && + unlikely(__memcg_kmem_charge_page(page, gfp, order) != 0)) { __free_pages(page, order); page = NULL; } From 84172f4bb752424415756351a40f8da5714e1554 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Thu, 29 Apr 2021 23:01:15 -0700 Subject: [PATCH 155/175] mm/page_alloc: combine __alloc_pages and __alloc_pages_nodemask There are only two callers of __alloc_pages() so prune the thicket of alloc_page variants by combining the two functions together. Current callers of __alloc_pages() simply add an extra 'NULL' parameter and current callers of __alloc_pages_nodemask() call __alloc_pages() instead. Link: https://lkml.kernel.org/r/20210225150642.2582252-4-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Acked-by: Vlastimil Babka Acked-by: Michal Hocko Cc: Mike Rapoport Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- Documentation/admin-guide/mm/transhuge.rst | 2 +- include/linux/gfp.h | 13 +++---------- mm/hugetlb.c | 2 +- mm/internal.h | 4 ++-- mm/mempolicy.c | 6 +++--- mm/migrate.c | 2 +- mm/page_alloc.c | 5 ++--- 7 files changed, 13 insertions(+), 21 deletions(-) diff --git a/Documentation/admin-guide/mm/transhuge.rst b/Documentation/admin-guide/mm/transhuge.rst index 3b8a336511a480..c9c37f16eef881 100644 --- a/Documentation/admin-guide/mm/transhuge.rst +++ b/Documentation/admin-guide/mm/transhuge.rst @@ -402,7 +402,7 @@ compact_fail but failed. It is possible to establish how long the stalls were using the function -tracer to record how long was spent in __alloc_pages_nodemask and +tracer to record how long was spent in __alloc_pages() and using the mm_page_alloc tracepoint to identify which allocations were for huge pages. diff --git a/include/linux/gfp.h b/include/linux/gfp.h index 8572a1474e16fb..f39de931bdf91d 100644 --- a/include/linux/gfp.h +++ b/include/linux/gfp.h @@ -515,15 +515,8 @@ static inline int arch_make_page_accessible(struct page *page) } #endif -struct page * -__alloc_pages_nodemask(gfp_t gfp_mask, unsigned int order, int preferred_nid, - nodemask_t *nodemask); - -static inline struct page * -__alloc_pages(gfp_t gfp_mask, unsigned int order, int preferred_nid) -{ - return __alloc_pages_nodemask(gfp_mask, order, preferred_nid, NULL); -} +struct page *__alloc_pages(gfp_t gfp, unsigned int order, int preferred_nid, + nodemask_t *nodemask); /* * Allocate pages, preferring the node given as nid. The node must be valid and @@ -535,7 +528,7 @@ __alloc_pages_node(int nid, gfp_t gfp_mask, unsigned int order) VM_BUG_ON(nid < 0 || nid >= MAX_NUMNODES); VM_WARN_ON((gfp_mask & __GFP_THISNODE) && !node_online(nid)); - return __alloc_pages(gfp_mask, order, nid); + return __alloc_pages(gfp_mask, order, nid, NULL); } /* diff --git a/mm/hugetlb.c b/mm/hugetlb.c index a86a58ef132d55..6c72433bec1e1d 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -1616,7 +1616,7 @@ static struct page *alloc_buddy_huge_page(struct hstate *h, gfp_mask |= __GFP_RETRY_MAYFAIL; if (nid == NUMA_NO_NODE) nid = numa_mem_id(); - page = __alloc_pages_nodemask(gfp_mask, order, nid, nmask); + page = __alloc_pages(gfp_mask, order, nid, nmask); if (page) __count_vm_event(HTLB_BUDDY_PGALLOC); else diff --git a/mm/internal.h b/mm/internal.h index 42e30e71554ab6..ef5f336f59bd11 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -145,10 +145,10 @@ extern pmd_t *mm_find_pmd(struct mm_struct *mm, unsigned long address); * family of functions. * * nodemask, migratetype and highest_zoneidx are initialized only once in - * __alloc_pages_nodemask() and then never change. + * __alloc_pages() and then never change. * * zonelist, preferred_zone and highest_zoneidx are set first in - * __alloc_pages_nodemask() for the fast path, and might be later changed + * __alloc_pages() for the fast path, and might be later changed * in __alloc_pages_slowpath(). All other functions pass the whole structure * by a const pointer. */ diff --git a/mm/mempolicy.c b/mm/mempolicy.c index ab51132547b898..5f0d2029873626 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -2140,7 +2140,7 @@ static struct page *alloc_page_interleave(gfp_t gfp, unsigned order, { struct page *page; - page = __alloc_pages(gfp, order, nid); + page = __alloc_pages(gfp, order, nid, NULL); /* skip NUMA_INTERLEAVE_HIT counter update if numa stats is disabled */ if (!static_branch_likely(&vm_numa_stat_key)) return page; @@ -2237,7 +2237,7 @@ alloc_pages_vma(gfp_t gfp, int order, struct vm_area_struct *vma, nmask = policy_nodemask(gfp, pol); preferred_nid = policy_node(gfp, pol, node); - page = __alloc_pages_nodemask(gfp, order, preferred_nid, nmask); + page = __alloc_pages(gfp, order, preferred_nid, nmask); mpol_cond_put(pol); out: return page; @@ -2274,7 +2274,7 @@ struct page *alloc_pages_current(gfp_t gfp, unsigned order) if (pol->mode == MPOL_INTERLEAVE) page = alloc_page_interleave(gfp, order, interleave_nodes(pol)); else - page = __alloc_pages_nodemask(gfp, order, + page = __alloc_pages(gfp, order, policy_node(gfp, pol, numa_node_id()), policy_nodemask(gfp, pol)); diff --git a/mm/migrate.c b/mm/migrate.c index 62b81d5257aaa7..47df0df8f21ad2 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -1617,7 +1617,7 @@ struct page *alloc_migration_target(struct page *page, unsigned long private) if (is_highmem_idx(zidx) || zidx == ZONE_MOVABLE) gfp_mask |= __GFP_HIGHMEM; - new_page = __alloc_pages_nodemask(gfp_mask, order, nid, mtc->nmask); + new_page = __alloc_pages(gfp_mask, order, nid, mtc->nmask); if (new_page && PageTransHuge(new_page)) prep_transhuge_page(new_page); diff --git a/mm/page_alloc.c b/mm/page_alloc.c index c565ebad02eee7..fce4b9180bdb62 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -5013,8 +5013,7 @@ static inline bool prepare_alloc_pages(gfp_t gfp_mask, unsigned int order, /* * This is the 'heart' of the zoned buddy allocator. */ -struct page * -__alloc_pages_nodemask(gfp_t gfp, unsigned int order, int preferred_nid, +struct page *__alloc_pages(gfp_t gfp, unsigned int order, int preferred_nid, nodemask_t *nodemask) { struct page *page; @@ -5076,7 +5075,7 @@ __alloc_pages_nodemask(gfp_t gfp, unsigned int order, int preferred_nid, return page; } -EXPORT_SYMBOL(__alloc_pages_nodemask); +EXPORT_SYMBOL(__alloc_pages); /* * Common helper functions. Never use with __GFP_HIGHMEM because the returned From d7f946d0faf90014547ee5d090e9d05018278c7a Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Thu, 29 Apr 2021 23:01:18 -0700 Subject: [PATCH 156/175] mm/mempolicy: rename alloc_pages_current to alloc_pages When CONFIG_NUMA is enabled, alloc_pages() is a wrapper around alloc_pages_current(). This is pointless, just implement alloc_pages() directly. Link: https://lkml.kernel.org/r/20210225150642.2582252-5-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Acked-by: Vlastimil Babka Acked-by: Michal Hocko Cc: Mike Rapoport Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/gfp.h | 8 +------- mm/mempolicy.c | 6 +++--- 2 files changed, 4 insertions(+), 10 deletions(-) diff --git a/include/linux/gfp.h b/include/linux/gfp.h index f39de931bdf91d..0a88f84b08f41f 100644 --- a/include/linux/gfp.h +++ b/include/linux/gfp.h @@ -546,13 +546,7 @@ static inline struct page *alloc_pages_node(int nid, gfp_t gfp_mask, } #ifdef CONFIG_NUMA -extern struct page *alloc_pages_current(gfp_t gfp_mask, unsigned order); - -static inline struct page * -alloc_pages(gfp_t gfp_mask, unsigned int order) -{ - return alloc_pages_current(gfp_mask, order); -} +struct page *alloc_pages(gfp_t gfp, unsigned int order); extern struct page *alloc_pages_vma(gfp_t gfp_mask, int order, struct vm_area_struct *vma, unsigned long addr, int node, bool hugepage); diff --git a/mm/mempolicy.c b/mm/mempolicy.c index 5f0d2029873626..c71532b7e3f8c4 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -2245,7 +2245,7 @@ alloc_pages_vma(gfp_t gfp, int order, struct vm_area_struct *vma, EXPORT_SYMBOL(alloc_pages_vma); /** - * alloc_pages_current - Allocate pages. + * alloc_pages - Allocate pages. * * @gfp: * %GFP_USER user allocation, @@ -2259,7 +2259,7 @@ EXPORT_SYMBOL(alloc_pages_vma); * interrupt context and apply the current process NUMA policy. * Returns NULL when no page can be allocated. */ -struct page *alloc_pages_current(gfp_t gfp, unsigned order) +struct page *alloc_pages(gfp_t gfp, unsigned order) { struct mempolicy *pol = &default_policy; struct page *page; @@ -2280,7 +2280,7 @@ struct page *alloc_pages_current(gfp_t gfp, unsigned order) return page; } -EXPORT_SYMBOL(alloc_pages_current); +EXPORT_SYMBOL(alloc_pages); int vma_dup_policy(struct vm_area_struct *src, struct vm_area_struct *dst) { From 6421ec764a62c51f810c5dc40cd45eeb15801ad9 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Thu, 29 Apr 2021 23:01:21 -0700 Subject: [PATCH 157/175] mm/mempolicy: rewrite alloc_pages documentation Document alloc_pages() for both NUMA and non-NUMA cases as kernel-doc doesn't care. Link: https://lkml.kernel.org/r/20210225150642.2582252-6-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Acked-by: Mike Rapoport Acked-by: Michal Hocko Acked-by: Vlastimil Babka Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/mempolicy.c | 21 ++++++++++----------- 1 file changed, 10 insertions(+), 11 deletions(-) diff --git a/mm/mempolicy.c b/mm/mempolicy.c index c71532b7e3f8c4..24c2100fccba2d 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -2245,19 +2245,18 @@ alloc_pages_vma(gfp_t gfp, int order, struct vm_area_struct *vma, EXPORT_SYMBOL(alloc_pages_vma); /** - * alloc_pages - Allocate pages. + * alloc_pages - Allocate pages. + * @gfp: GFP flags. + * @order: Power of two of number of pages to allocate. * - * @gfp: - * %GFP_USER user allocation, - * %GFP_KERNEL kernel allocation, - * %GFP_HIGHMEM highmem allocation, - * %GFP_FS don't call back into a file system. - * %GFP_ATOMIC don't sleep. - * @order: Power of two of allocation size in pages. 0 is a single page. + * Allocate 1 << @order contiguous pages. The physical address of the + * first page is naturally aligned (eg an order-3 allocation will be aligned + * to a multiple of 8 * PAGE_SIZE bytes). The NUMA policy of the current + * process is honoured when in process context. * - * Allocate a page from the kernel page pool. When not in - * interrupt context and apply the current process NUMA policy. - * Returns NULL when no page can be allocated. + * Context: Can be called from any context, providing the appropriate GFP + * flags are used. + * Return: The page on success or NULL if allocation fails. */ struct page *alloc_pages(gfp_t gfp, unsigned order) { From eb35073960510762dee417574589b7a8971c68ab Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Thu, 29 Apr 2021 23:01:24 -0700 Subject: [PATCH 158/175] mm/mempolicy: rewrite alloc_pages_vma documentation The current formatting doesn't quite work with kernel-doc. Link: https://lkml.kernel.org/r/20210225150642.2582252-7-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Acked-by: Mike Rapoport Acked-by: Vlastimil Babka Cc: Michal Hocko Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/mempolicy.c | 34 +++++++++++++--------------------- 1 file changed, 13 insertions(+), 21 deletions(-) diff --git a/mm/mempolicy.c b/mm/mempolicy.c index 24c2100fccba2d..6d0fe85d4f8d71 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -2153,30 +2153,22 @@ static struct page *alloc_page_interleave(gfp_t gfp, unsigned order, } /** - * alloc_pages_vma - Allocate a page for a VMA. - * - * @gfp: - * %GFP_USER user allocation. - * %GFP_KERNEL kernel allocations, - * %GFP_HIGHMEM highmem/user allocations, - * %GFP_FS allocation should not call back into a file system. - * %GFP_ATOMIC don't sleep. + * alloc_pages_vma - Allocate a page for a VMA. + * @gfp: GFP flags. + * @order: Order of the GFP allocation. + * @vma: Pointer to VMA or NULL if not available. + * @addr: Virtual address of the allocation. Must be inside @vma. + * @node: Which node to prefer for allocation (modulo policy). + * @hugepage: For hugepages try only the preferred node if possible. * - * @order:Order of the GFP allocation. - * @vma: Pointer to VMA or NULL if not available. - * @addr: Virtual Address of the allocation. Must be inside the VMA. - * @node: Which node to prefer for allocation (modulo policy). - * @hugepage: for hugepages try only the preferred node if possible + * Allocate a page for a specific address in @vma, using the appropriate + * NUMA policy. When @vma is not NULL the caller must hold the mmap_lock + * of the mm_struct of the VMA to prevent it from going away. Should be + * used for all allocations for pages that will be mapped into user space. * - * This function allocates a page from the kernel page pool and applies - * a NUMA policy associated with the VMA or the current process. - * When VMA is not NULL caller must read-lock the mmap_lock of the - * mm_struct of the VMA to prevent it from going away. Should be used for - * all allocations for pages that will be mapped into user space. Returns - * NULL when no page can be allocated. + * Return: The page on success or NULL if allocation fails. */ -struct page * -alloc_pages_vma(gfp_t gfp, int order, struct vm_area_struct *vma, +struct page *alloc_pages_vma(gfp_t gfp, int order, struct vm_area_struct *vma, unsigned long addr, int node, bool hugepage) { struct mempolicy *pol; From 5f076944f06988391a6dbd458fc6485a71088e57 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Thu, 29 Apr 2021 23:01:27 -0700 Subject: [PATCH 159/175] mm/mempolicy: fix mpol_misplaced kernel-doc Sphinx interprets the Return section as a list and complains about it. Turn it into a sentence and move it to the end of the kernel-doc to fit the kernel-doc style. Link: https://lkml.kernel.org/r/20210225150642.2582252-8-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Acked-by: Mike Rapoport Acked-by: Vlastimil Babka Cc: Michal Hocko Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- Documentation/core-api/mm-api.rst | 1 + mm/mempolicy.c | 11 ++++------- 2 files changed, 5 insertions(+), 7 deletions(-) diff --git a/Documentation/core-api/mm-api.rst b/Documentation/core-api/mm-api.rst index f1dc5f58feca26..34f46df91a8b37 100644 --- a/Documentation/core-api/mm-api.rst +++ b/Documentation/core-api/mm-api.rst @@ -92,6 +92,7 @@ More Memory Management Functions :export: .. kernel-doc:: mm/page_alloc.c +.. kernel-doc:: mm/mempolicy.c .. kernel-doc:: include/linux/mm_types.h :internal: .. kernel-doc:: include/linux/mm.h diff --git a/mm/mempolicy.c b/mm/mempolicy.c index 6d0fe85d4f8d71..cd0295567a042c 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -2448,14 +2448,11 @@ static void sp_free(struct sp_node *n) * @addr: virtual address where page mapped * * Lookup current policy node id for vma,addr and "compare to" page's - * node id. - * - * Returns: - * -1 - not misplaced, page is in the right node - * node - node id where the page should be - * - * Policy determination "mimics" alloc_page_vma(). + * node id. Policy determination "mimics" alloc_page_vma(). * Called from fault path where we know the vma and faulting address. + * + * Return: -1 if the page is in a node that is valid for this policy, or a + * suitable node ID to allocate a replacement page from. */ int mpol_misplaced(struct page *page, struct vm_area_struct *vma, unsigned long addr) { From a1394bddf9b60e96d075d94b71a8857696598186 Mon Sep 17 00:00:00 2001 From: Minchan Kim Date: Thu, 29 Apr 2021 23:01:30 -0700 Subject: [PATCH 160/175] mm: page_alloc: dump migrate-failed pages Currently, debugging CMA allocation failures is quite limited. The most common source of these failures seems to be page migration which doesn't provide any useful information on the reason of the failure by itself. alloc_contig_range can report those failures as it holds a list of migrate-failed pages. The information logged by dump_page() has already proven helpful for debugging allocation issues, like identifying long-term pinnings on ZONE_MOVABLE or MIGRATE_CMA. Let's use the dynamic debugging infrastructure, such that we avoid flooding the logs and creating a lot of noise on frequent alloc_contig_range() calls. This information is helpful for debugging only. There are two ifdefery conditions to support common dyndbg options: - CONFIG_DYNAMIC_DEBUG_CORE && DYNAMIC_DEBUG_MODULE It aims for supporting the feature with only specific file with adding ccflags. - CONFIG_DYNAMIC_DEBUG It aims for supporting the feature with system wide globally. A simple example to enable the feature: Admin could enable the dump like this(by default, disabled) echo "func alloc_contig_dump_pages +p" > control Admin could disable it. echo "func alloc_contig_dump_pages =_" > control Detail goes Documentation/admin-guide/dynamic-debug-howto.rst A concern is utility functions in dump_page use inconsistent loglevels. In the future, we might want to make the loglevels used inside dump_page() consistent and eventually rework the way we log the information here. See [1]. [1] https://lore.kernel.org/linux-mm/YEh4doXvyuRl5BDB@google.com/ Link: https://lkml.kernel.org/r/20210311194042.825152-1-minchan@kernel.org Signed-off-by: Minchan Kim Reviewed-by: David Hildenbrand Cc: John Dias Cc: Michal Hocko Cc: David Hildenbrand Cc: Jason Baron Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_alloc.c | 22 ++++++++++++++++++++++ 1 file changed, 22 insertions(+) diff --git a/mm/page_alloc.c b/mm/page_alloc.c index fce4b9180bdb62..4a9f3950ef0f1e 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -8497,6 +8497,27 @@ static unsigned long pfn_max_align_up(unsigned long pfn) pageblock_nr_pages)); } +#if defined(CONFIG_DYNAMIC_DEBUG) || \ + (defined(CONFIG_DYNAMIC_DEBUG_CORE) && defined(DYNAMIC_DEBUG_MODULE)) +/* Usage: See admin-guide/dynamic-debug-howto.rst */ +static void alloc_contig_dump_pages(struct list_head *page_list) +{ + DEFINE_DYNAMIC_DEBUG_METADATA(descriptor, "migrate failure"); + + if (DYNAMIC_DEBUG_BRANCH(descriptor)) { + struct page *page; + + dump_stack(); + list_for_each_entry(page, page_list, lru) + dump_page(page, "migration failure"); + } +} +#else +static inline void alloc_contig_dump_pages(struct list_head *page_list) +{ +} +#endif + /* [start, end) must belong to a single zone. */ static int __alloc_contig_migrate_range(struct compact_control *cc, unsigned long start, unsigned long end) @@ -8540,6 +8561,7 @@ static int __alloc_contig_migrate_range(struct compact_control *cc, NULL, (unsigned long)&mtc, cc->mode, MR_CONTIG_RANGE); } if (ret < 0) { + alloc_contig_dump_pages(&cc->migratepages); putback_movable_pages(&cc->migratepages); return ret; } From d68d015a7e5e3d45624960420e32bd52a937447a Mon Sep 17 00:00:00 2001 From: Geert Uytterhoeven Date: Thu, 29 Apr 2021 23:01:33 -0700 Subject: [PATCH 161/175] mm/Kconfig: remove default DISCONTIGMEM_MANUAL Commit 214496cb18700fd7 ("ia64: make SPARSEMEM default and disable DISCONTIGMEM") removed the last enabler of ARCH_DISCONTIGMEM_DEFAULT, hence the memory model can no longer default to DISCONTIGMEM_MANUAL. Link: https://lkml.kernel.org/r/20210312141208.3465520-1-geert@linux-m68k.org Signed-off-by: Geert Uytterhoeven Reviewed-by: Mike Rapoport Reviewed-by: David Hildenbrand Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/Kconfig | 1 - 1 file changed, 1 deletion(-) diff --git a/mm/Kconfig b/mm/Kconfig index 8a27c7c914280f..3636da27c38573 100644 --- a/mm/Kconfig +++ b/mm/Kconfig @@ -9,7 +9,6 @@ config SELECT_MEMORY_MODEL choice prompt "Memory model" depends on SELECT_MEMORY_MODEL - default DISCONTIGMEM_MANUAL if ARCH_DISCONTIGMEM_DEFAULT default SPARSEMEM_MANUAL if ARCH_SPARSEMEM_DEFAULT default FLATMEM_MANUAL help From 39ddb991fc45abcdcddbec7fcdfe28795d0133d7 Mon Sep 17 00:00:00 2001 From: Kefeng Wang Date: Thu, 29 Apr 2021 23:01:36 -0700 Subject: [PATCH 162/175] mm, page_alloc: avoid page_to_pfn() in move_freepages() The start_pfn and end_pfn are already available in move_freepages_block(), there is no need to go back and forth between page and pfn in move_freepages and move_freepages_block, and pfn_valid_within() should validate pfn first before touching the page. Link: https://lkml.kernel.org/r/20210323131215.934472-1-liushixin2@huawei.com Signed-off-by: Kefeng Wang Signed-off-by: Liu Shixin Reviewed-by: Matthew Wilcox (Oracle) Acked-by: Vlastimil Babka Cc: Stephen Rothwell Cc: Michal Hocko Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_alloc.c | 28 +++++++++++++--------------- 1 file changed, 13 insertions(+), 15 deletions(-) diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 4a9f3950ef0f1e..3bced8e0dd9570 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -2431,19 +2431,21 @@ static inline struct page *__rmqueue_cma_fallback(struct zone *zone, * boundary. If alignment is required, use move_freepages_block() */ static int move_freepages(struct zone *zone, - struct page *start_page, struct page *end_page, + unsigned long start_pfn, unsigned long end_pfn, int migratetype, int *num_movable) { struct page *page; + unsigned long pfn; unsigned int order; int pages_moved = 0; - for (page = start_page; page <= end_page;) { - if (!pfn_valid_within(page_to_pfn(page))) { - page++; + for (pfn = start_pfn; pfn <= end_pfn;) { + if (!pfn_valid_within(pfn)) { + pfn++; continue; } + page = pfn_to_page(pfn); if (!PageBuddy(page)) { /* * We assume that pages that could be isolated for @@ -2453,8 +2455,7 @@ static int move_freepages(struct zone *zone, if (num_movable && (PageLRU(page) || __PageMovable(page))) (*num_movable)++; - - page++; + pfn++; continue; } @@ -2464,7 +2465,7 @@ static int move_freepages(struct zone *zone, order = buddy_order(page); move_to_free_list(page, zone, order, migratetype); - page += 1 << order; + pfn += 1 << order; pages_moved += 1 << order; } @@ -2474,25 +2475,22 @@ static int move_freepages(struct zone *zone, int move_freepages_block(struct zone *zone, struct page *page, int migratetype, int *num_movable) { - unsigned long start_pfn, end_pfn; - struct page *start_page, *end_page; + unsigned long start_pfn, end_pfn, pfn; if (num_movable) *num_movable = 0; - start_pfn = page_to_pfn(page); - start_pfn = start_pfn & ~(pageblock_nr_pages-1); - start_page = pfn_to_page(start_pfn); - end_page = start_page + pageblock_nr_pages - 1; + pfn = page_to_pfn(page); + start_pfn = pfn & ~(pageblock_nr_pages - 1); end_pfn = start_pfn + pageblock_nr_pages - 1; /* Do not cross zone boundaries */ if (!zone_spans_pfn(zone, start_pfn)) - start_page = page; + start_pfn = pfn; if (!zone_spans_pfn(zone, end_pfn)) return 0; - return move_freepages(zone, start_page, end_page, migratetype, + return move_freepages(zone, start_pfn, end_pfn, migratetype, num_movable); } From 8f709dbdf9ff13da19d3154b3248e063364a53d5 Mon Sep 17 00:00:00 2001 From: zhouchuangao Date: Thu, 29 Apr 2021 23:01:39 -0700 Subject: [PATCH 163/175] mm/page_alloc: duplicate include linux/vmalloc.h linux/vmalloc.h is repeatedly in the file page_alloc.c Link: https://lkml.kernel.org/r/1616468751-80656-1-git-send-email-zhouchuangao@vivo.com Signed-off-by: zhouchuangao Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_alloc.c | 2 -- 1 file changed, 2 deletions(-) diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 3bced8e0dd9570..8b24b69d55ef3c 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -72,8 +72,6 @@ #include #include #include -#include - #include #include #include From cb66bede617581309883432e9a633e8cade2a36e Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Thu, 29 Apr 2021 23:01:42 -0700 Subject: [PATCH 164/175] mm/page_alloc: rename alloced to allocated Patch series "Introduce a bulk order-0 page allocator with two in-tree users", v6. This series introduces a bulk order-0 page allocator with sunrpc and the network page pool being the first users. The implementation is not efficient as semantics needed to be ironed out first. If no other semantic changes are needed, it can be made more efficient. Despite that, this is a performance-related for users that require multiple pages for an operation without multiple round-trips to the page allocator. Quoting the last patch for the high-speed networking use-case Kernel XDP stats CPU pps Delta Baseline XDP-RX CPU total 3,771,046 n/a List XDP-RX CPU total 3,940,242 +4.49% Array XDP-RX CPU total 4,249,224 +12.68% Via the SUNRPC traces of svc_alloc_arg() Single page: 25.007 us per call over 532,571 calls Bulk list: 6.258 us per call over 517,034 calls Bulk array: 4.590 us per call over 517,442 calls Both potential users in this series are corner cases (NFS and high-speed networks) so it is unlikely that most users will see any benefit in the short term. Other potential other users are batch allocations for page cache readahead, fault around and SLUB allocations when high-order pages are unavailable. It's unknown how much benefit would be seen by converting multiple page allocation calls to a single batch or what difference it may make to headline performance. Light testing of my own running dbench over NFS passed. Chuck and Jesper conducted their own tests and details are included in the changelogs. Patch 1 renames a variable name that is particularly unpopular Patch 2 adds a bulk page allocator Patch 3 adds an array-based version of the bulk allocator Patches 4-5 adds micro-optimisations to the implementation Patches 6-7 SUNRPC user Patches 8-9 Network page_pool user This patch (of 9): Review feedback of the bulk allocator twice found problems with "alloced" being a counter for pages allocated. The naming was based on the API name "alloc" and was based on the idea that verbal communication about malloc tends to use the fake word "malloced" instead of the fake word mallocated. To be consistent, this preparation patch renames alloced to allocated in rmqueue_bulk so the bulk allocator and per-cpu allocator use similar names when the bulk allocator is introduced. Link: https://lkml.kernel.org/r/20210325114228.27719-1-mgorman@techsingularity.net Link: https://lkml.kernel.org/r/20210325114228.27719-2-mgorman@techsingularity.net Signed-off-by: Mel Gorman Reviewed-by: Matthew Wilcox (Oracle) Reviewed-by: Alexander Lobakin Acked-by: Vlastimil Babka Cc: Chuck Lever Cc: Jesper Dangaard Brouer Cc: Christoph Hellwig Cc: Alexander Duyck Cc: Ilias Apalodimas Cc: David Miller Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_alloc.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 8b24b69d55ef3c..fd283d9b6a4da0 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -2949,7 +2949,7 @@ static int rmqueue_bulk(struct zone *zone, unsigned int order, unsigned long count, struct list_head *list, int migratetype, unsigned int alloc_flags) { - int i, alloced = 0; + int i, allocated = 0; spin_lock(&zone->lock); for (i = 0; i < count; ++i) { @@ -2972,7 +2972,7 @@ static int rmqueue_bulk(struct zone *zone, unsigned int order, * pages are ordered properly. */ list_add_tail(&page->lru, list); - alloced++; + allocated++; if (is_migrate_cma(get_pcppage_migratetype(page))) __mod_zone_page_state(zone, NR_FREE_CMA_PAGES, -(1 << order)); @@ -2981,12 +2981,12 @@ static int rmqueue_bulk(struct zone *zone, unsigned int order, /* * i pages were removed from the buddy list even if some leak due * to check_pcp_refill failing so adjust NR_FREE_PAGES based - * on i. Do not confuse with 'alloced' which is the number of + * on i. Do not confuse with 'allocated' which is the number of * pages added to the pcp list. */ __mod_zone_page_state(zone, NR_FREE_PAGES, -(i << order)); spin_unlock(&zone->lock); - return alloced; + return allocated; } #ifdef CONFIG_NUMA From 387ba26fb1cb9be9e35dc14a6d97188e916eda05 Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Thu, 29 Apr 2021 23:01:45 -0700 Subject: [PATCH 165/175] mm/page_alloc: add a bulk page allocator This patch adds a new page allocator interface via alloc_pages_bulk, and __alloc_pages_bulk_nodemask. A caller requests a number of pages to be allocated and added to a list. The API is not guaranteed to return the requested number of pages and may fail if the preferred allocation zone has limited free memory, the cpuset changes during the allocation or page debugging decides to fail an allocation. It's up to the caller to request more pages in batch if necessary. Note that this implementation is not very efficient and could be improved but it would require refactoring. The intent is to make it available early to determine what semantics are required by different callers. Once the full semantics are nailed down, it can be refactored. [mgorman@techsingularity.net: fix alloc_pages_bulk() return type, per Matthew] Link: https://lkml.kernel.org/r/20210325123713.GQ3697@techsingularity.net [mgorman@techsingularity.net: fix uninit var warning] Link: https://lkml.kernel.org/r/20210330114847.GX3697@techsingularity.net [mgorman@techsingularity.net: fix comment, per Vlastimil] Link: https://lkml.kernel.org/r/20210412110255.GV3697@techsingularity.net Link: https://lkml.kernel.org/r/20210325114228.27719-3-mgorman@techsingularity.net Signed-off-by: Mel Gorman Acked-by: Vlastimil Babka Reviewed-by: Alexander Lobakin Tested-by: Colin Ian King Cc: Alexander Duyck Cc: Christoph Hellwig Cc: Chuck Lever Cc: David Miller Cc: Ilias Apalodimas Cc: Jesper Dangaard Brouer Cc: Matthew Wilcox (Oracle) Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/gfp.h | 11 +++++ mm/page_alloc.c | 118 ++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 129 insertions(+) diff --git a/include/linux/gfp.h b/include/linux/gfp.h index 0a88f84b08f41f..a2be8f4174a941 100644 --- a/include/linux/gfp.h +++ b/include/linux/gfp.h @@ -518,6 +518,17 @@ static inline int arch_make_page_accessible(struct page *page) struct page *__alloc_pages(gfp_t gfp, unsigned int order, int preferred_nid, nodemask_t *nodemask); +unsigned long __alloc_pages_bulk(gfp_t gfp, int preferred_nid, + nodemask_t *nodemask, int nr_pages, + struct list_head *list); + +/* Bulk allocate order-0 pages */ +static inline unsigned long +alloc_pages_bulk(gfp_t gfp, unsigned long nr_pages, struct list_head *list) +{ + return __alloc_pages_bulk(gfp, numa_mem_id(), NULL, nr_pages, list); +} + /* * Allocate pages, preferring the node given as nid. The node must be valid and * online. For more general interface, see alloc_pages_node(). diff --git a/mm/page_alloc.c b/mm/page_alloc.c index fd283d9b6a4da0..d8209d48a54303 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -5006,6 +5006,124 @@ static inline bool prepare_alloc_pages(gfp_t gfp_mask, unsigned int order, return true; } +/* + * __alloc_pages_bulk - Allocate a number of order-0 pages to a list + * @gfp: GFP flags for the allocation + * @preferred_nid: The preferred NUMA node ID to allocate from + * @nodemask: Set of nodes to allocate from, may be NULL + * @nr_pages: The number of pages desired on the list + * @page_list: List to store the allocated pages + * + * This is a batched version of the page allocator that attempts to + * allocate nr_pages quickly and add them to a list. + * + * Returns the number of pages on the list. + */ +unsigned long __alloc_pages_bulk(gfp_t gfp, int preferred_nid, + nodemask_t *nodemask, int nr_pages, + struct list_head *page_list) +{ + struct page *page; + unsigned long flags; + struct zone *zone; + struct zoneref *z; + struct per_cpu_pages *pcp; + struct list_head *pcp_list; + struct alloc_context ac; + gfp_t alloc_gfp; + unsigned int alloc_flags = ALLOC_WMARK_LOW; + int allocated = 0; + + if (WARN_ON_ONCE(nr_pages <= 0)) + return 0; + + /* Use the single page allocator for one page. */ + if (nr_pages == 1) + goto failed; + + /* May set ALLOC_NOFRAGMENT, fragmentation will return 1 page. */ + gfp &= gfp_allowed_mask; + alloc_gfp = gfp; + if (!prepare_alloc_pages(gfp, 0, preferred_nid, nodemask, &ac, &alloc_gfp, &alloc_flags)) + return 0; + gfp = alloc_gfp; + + /* Find an allowed local zone that meets the low watermark. */ + for_each_zone_zonelist_nodemask(zone, z, ac.zonelist, ac.highest_zoneidx, ac.nodemask) { + unsigned long mark; + + if (cpusets_enabled() && (alloc_flags & ALLOC_CPUSET) && + !__cpuset_zone_allowed(zone, gfp)) { + continue; + } + + if (nr_online_nodes > 1 && zone != ac.preferred_zoneref->zone && + zone_to_nid(zone) != zone_to_nid(ac.preferred_zoneref->zone)) { + goto failed; + } + + mark = wmark_pages(zone, alloc_flags & ALLOC_WMARK_MASK) + nr_pages; + if (zone_watermark_fast(zone, 0, mark, + zonelist_zone_idx(ac.preferred_zoneref), + alloc_flags, gfp)) { + break; + } + } + + /* + * If there are no allowed local zones that meets the watermarks then + * try to allocate a single page and reclaim if necessary. + */ + if (!zone) + goto failed; + + /* Attempt the batch allocation */ + local_irq_save(flags); + pcp = &this_cpu_ptr(zone->pageset)->pcp; + pcp_list = &pcp->lists[ac.migratetype]; + + while (allocated < nr_pages) { + page = __rmqueue_pcplist(zone, ac.migratetype, alloc_flags, + pcp, pcp_list); + if (!page) { + /* Try and get at least one page */ + if (!allocated) + goto failed_irq; + break; + } + + /* + * Ideally this would be batched but the best way to do + * that cheaply is to first convert zone_statistics to + * be inaccurate per-cpu counter like vm_events to avoid + * a RMW cycle then do the accounting with IRQs enabled. + */ + __count_zid_vm_events(PGALLOC, zone_idx(zone), 1); + zone_statistics(ac.preferred_zoneref->zone, zone); + + prep_new_page(page, 0, gfp, 0); + list_add(&page->lru, page_list); + allocated++; + } + + local_irq_restore(flags); + + return allocated; + +failed_irq: + local_irq_restore(flags); + +failed: + page = __alloc_pages(gfp, 0, preferred_nid, nodemask); + if (page) { + list_add(&page->lru, page_list); + allocated = 1; + } + + return allocated; +} +EXPORT_SYMBOL_GPL(__alloc_pages_bulk); + /* * This is the 'heart' of the zoned buddy allocator. */ From 0f87d9d30f21390dd71114f30e63038980e6eb3f Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Thu, 29 Apr 2021 23:01:48 -0700 Subject: [PATCH 166/175] mm/page_alloc: add an array-based interface to the bulk page allocator The proposed callers for the bulk allocator store pages from the bulk allocator in an array. This patch adds an array-based interface to the API to avoid multiple list iterations. The page list interface is preserved to avoid requiring all users of the bulk API to allocate and manage enough storage to store the pages. [akpm@linux-foundation.org: remove now unused local `allocated'] Link: https://lkml.kernel.org/r/20210325114228.27719-4-mgorman@techsingularity.net Signed-off-by: Mel Gorman Reviewed-by: Alexander Lobakin Acked-by: Vlastimil Babka Cc: Alexander Duyck Cc: Christoph Hellwig Cc: Chuck Lever Cc: David Miller Cc: Ilias Apalodimas Cc: Jesper Dangaard Brouer Cc: Matthew Wilcox (Oracle) Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/gfp.h | 13 +++++++--- mm/page_alloc.c | 60 +++++++++++++++++++++++++++++++++------------ 2 files changed, 54 insertions(+), 19 deletions(-) diff --git a/include/linux/gfp.h b/include/linux/gfp.h index a2be8f4174a941..26f4d907254a4f 100644 --- a/include/linux/gfp.h +++ b/include/linux/gfp.h @@ -520,13 +520,20 @@ struct page *__alloc_pages(gfp_t gfp, unsigned int order, int preferred_nid, unsigned long __alloc_pages_bulk(gfp_t gfp, int preferred_nid, nodemask_t *nodemask, int nr_pages, - struct list_head *list); + struct list_head *page_list, + struct page **page_array); /* Bulk allocate order-0 pages */ static inline unsigned long -alloc_pages_bulk(gfp_t gfp, unsigned long nr_pages, struct list_head *list) +alloc_pages_bulk_list(gfp_t gfp, unsigned long nr_pages, struct list_head *list) { - return __alloc_pages_bulk(gfp, numa_mem_id(), NULL, nr_pages, list); + return __alloc_pages_bulk(gfp, numa_mem_id(), NULL, nr_pages, list, NULL); +} + +static inline unsigned long +alloc_pages_bulk_array(gfp_t gfp, unsigned long nr_pages, struct page **page_array) +{ + return __alloc_pages_bulk(gfp, numa_mem_id(), NULL, nr_pages, NULL, page_array); } /* diff --git a/mm/page_alloc.c b/mm/page_alloc.c index d8209d48a54303..e240704b5d399b 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -5007,21 +5007,29 @@ static inline bool prepare_alloc_pages(gfp_t gfp_mask, unsigned int order, } /* - * __alloc_pages_bulk - Allocate a number of order-0 pages to a list + * __alloc_pages_bulk - Allocate a number of order-0 pages to a list or array * @gfp: GFP flags for the allocation * @preferred_nid: The preferred NUMA node ID to allocate from * @nodemask: Set of nodes to allocate from, may be NULL - * @nr_pages: The number of pages desired on the list - * @page_list: List to store the allocated pages + * @nr_pages: The number of pages desired on the list or array + * @page_list: Optional list to store the allocated pages + * @page_array: Optional array to store the pages * * This is a batched version of the page allocator that attempts to - * allocate nr_pages quickly and add them to a list. + * allocate nr_pages quickly. Pages are added to page_list if page_list + * is not NULL, otherwise it is assumed that the page_array is valid. * - * Returns the number of pages on the list. + * For lists, nr_pages is the number of pages that should be allocated. + * + * For arrays, only NULL elements are populated with pages and nr_pages + * is the maximum number of pages that will be stored in the array. + * + * Returns the number of pages on the list or array. */ unsigned long __alloc_pages_bulk(gfp_t gfp, int preferred_nid, nodemask_t *nodemask, int nr_pages, - struct list_head *page_list) + struct list_head *page_list, + struct page **page_array) { struct page *page; unsigned long flags; @@ -5032,13 +5040,20 @@ unsigned long __alloc_pages_bulk(gfp_t gfp, int preferred_nid, struct alloc_context ac; gfp_t alloc_gfp; unsigned int alloc_flags = ALLOC_WMARK_LOW; - int allocated = 0; + int nr_populated = 0; if (WARN_ON_ONCE(nr_pages <= 0)) return 0; + /* + * Skip populated array elements to determine if any pages need + * to be allocated before disabling IRQs. + */ + while (page_array && page_array[nr_populated] && nr_populated < nr_pages) + nr_populated++; + /* Use the single page allocator for one page. */ - if (nr_pages == 1) + if (nr_pages - nr_populated == 1) goto failed; /* May set ALLOC_NOFRAGMENT, fragmentation will return 1 page. */ @@ -5082,12 +5097,19 @@ unsigned long __alloc_pages_bulk(gfp_t gfp, int preferred_nid, pcp = &this_cpu_ptr(zone->pageset)->pcp; pcp_list = &pcp->lists[ac.migratetype]; - while (allocated < nr_pages) { + while (nr_populated < nr_pages) { + + /* Skip existing pages */ + if (page_array && page_array[nr_populated]) { + nr_populated++; + continue; + } + page = __rmqueue_pcplist(zone, ac.migratetype, alloc_flags, pcp, pcp_list); if (!page) { /* Try and get at least one page */ - if (!allocated) + if (!nr_populated) goto failed_irq; break; } @@ -5102,13 +5124,16 @@ unsigned long __alloc_pages_bulk(gfp_t gfp, int preferred_nid, zone_statistics(ac.preferred_zoneref->zone, zone); prep_new_page(page, 0, gfp, 0); - list_add(&page->lru, page_list); - allocated++; + if (page_list) + list_add(&page->lru, page_list); + else + page_array[nr_populated] = page; + nr_populated++; } local_irq_restore(flags); - return allocated; + return nr_populated; failed_irq: local_irq_restore(flags); @@ -5116,11 +5141,14 @@ unsigned long __alloc_pages_bulk(gfp_t gfp, int preferred_nid, failed: page = __alloc_pages(gfp, 0, preferred_nid, nodemask); if (page) { - list_add(&page->lru, page_list); - allocated = 1; + if (page_list) + list_add(&page->lru, page_list); + else + page_array[nr_populated] = page; + nr_populated++; } - return allocated; + return nr_populated; } EXPORT_SYMBOL_GPL(__alloc_pages_bulk); From ce76f9a1d9a21c2633dcd2a5605f923286e16e1d Mon Sep 17 00:00:00 2001 From: Jesper Dangaard Brouer Date: Thu, 29 Apr 2021 23:01:51 -0700 Subject: [PATCH 167/175] mm/page_alloc: optimize code layout for __alloc_pages_bulk Looking at perf-report and ASM-code for __alloc_pages_bulk() it is clear that the code activated is suboptimal. The compiler guesses wrong and places unlikely code at the beginning. Due to the use of WARN_ON_ONCE() macro the UD2 asm instruction is added to the code, which confuse the I-cache prefetcher in the CPU. [mgorman@techsingularity.net: minor changes and rebasing] Link: https://lkml.kernel.org/r/20210325114228.27719-5-mgorman@techsingularity.net Signed-off-by: Jesper Dangaard Brouer Signed-off-by: Mel Gorman Reviewed-by: Alexander Lobakin Acked-By: Vlastimil Babka Cc: Alexander Duyck Cc: Christoph Hellwig Cc: Chuck Lever Cc: David Miller Cc: Ilias Apalodimas Cc: Matthew Wilcox (Oracle) Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_alloc.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/mm/page_alloc.c b/mm/page_alloc.c index e240704b5d399b..3f2291695b19ec 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -5042,7 +5042,7 @@ unsigned long __alloc_pages_bulk(gfp_t gfp, int preferred_nid, unsigned int alloc_flags = ALLOC_WMARK_LOW; int nr_populated = 0; - if (WARN_ON_ONCE(nr_pages <= 0)) + if (unlikely(nr_pages <= 0)) return 0; /* @@ -5089,7 +5089,7 @@ unsigned long __alloc_pages_bulk(gfp_t gfp, int preferred_nid, * If there are no allowed local zones that meets the watermarks then * try to allocate a single page and reclaim if necessary. */ - if (!zone) + if (unlikely(!zone)) goto failed; /* Attempt the batch allocation */ @@ -5107,7 +5107,7 @@ unsigned long __alloc_pages_bulk(gfp_t gfp, int preferred_nid, page = __rmqueue_pcplist(zone, ac.migratetype, alloc_flags, pcp, pcp_list); - if (!page) { + if (unlikely(!page)) { /* Try and get at least one page */ if (!nr_populated) goto failed_irq; From 3b822017b636bf4261a644c16b01eb3900f2a9a0 Mon Sep 17 00:00:00 2001 From: Jesper Dangaard Brouer Date: Thu, 29 Apr 2021 23:01:55 -0700 Subject: [PATCH 168/175] mm/page_alloc: inline __rmqueue_pcplist When __alloc_pages_bulk() got introduced two callers of __rmqueue_pcplist exist and the compiler chooses to not inline this function. ./scripts/bloat-o-meter vmlinux-before vmlinux-inline__rmqueue_pcplist add/remove: 0/1 grow/shrink: 2/0 up/down: 164/-125 (39) Function old new delta rmqueue 2197 2296 +99 __alloc_pages_bulk 1921 1986 +65 __rmqueue_pcplist 125 - -125 Total: Before=19374127, After=19374166, chg +0.00% modprobe page_bench04_bulk loops=$((10**7)) Type:time_bulk_page_alloc_free_array - Per elem: 106 cycles(tsc) 29.595 ns (step:64) - (measurement period time:0.295955434 sec time_interval:295955434) - (invoke count:10000000 tsc_interval:1065447105) Before: - Per elem: 110 cycles(tsc) 30.633 ns (step:64) Link: https://lkml.kernel.org/r/20210325114228.27719-6-mgorman@techsingularity.net Signed-off-by: Jesper Dangaard Brouer Signed-off-by: Mel Gorman Reviewed-by: Alexander Lobakin Acked-by: Vlastimil Babka Cc: Alexander Duyck Cc: Christoph Hellwig Cc: Chuck Lever Cc: David Miller Cc: Ilias Apalodimas Cc: Matthew Wilcox (Oracle) Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_alloc.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 3f2291695b19ec..91455f0d22c12e 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -3456,7 +3456,8 @@ static inline void zone_statistics(struct zone *preferred_zone, struct zone *z) } /* Remove page from the per-cpu list, caller must protect the list */ -static struct page *__rmqueue_pcplist(struct zone *zone, int migratetype, +static inline +struct page *__rmqueue_pcplist(struct zone *zone, int migratetype, unsigned int alloc_flags, struct per_cpu_pages *pcp, struct list_head *list) From ab8362645fba90fa44ec1991ad05544e307dd02f Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Thu, 29 Apr 2021 23:01:58 -0700 Subject: [PATCH 169/175] SUNRPC: set rq_page_end differently Patch series "SUNRPC consumer for the bulk page allocator" This patch set and the measurements below are based on yesterday's bulk allocator series: git://git.kernel.org/pub/scm/linux/kernel/git/mel/linux.git mm-bulk-rebase-v5r9 The patches change SUNRPC to invoke the array-based bulk allocator instead of alloc_page(). The micro-benchmark results are promising. I ran a mixture of 256KB reads and writes over NFSv3. The server's kernel is built with KASAN enabled, so the comparison is exaggerated but I believe it is still valid. I instrumented svc_recv() to measure the latency of each call to svc_alloc_arg() and report it via a trace point. The following results are averages across the trace events. Single page: 25.007 us per call over 532,571 calls Bulk list: 6.258 us per call over 517,034 calls Bulk array: 4.590 us per call over 517,442 calls This patch (of 2) Refactor: I'm about to use the loop variable @i for something else. As far as the "i++" is concerned, that is a post-increment. The value of @i is not used subsequently, so the increment operator is unnecessary and can be removed. Also note that nfsd_read_actor() was renamed nfsd_splice_actor() by commit cf8208d0eabd ("sendfile: convert nfsd to splice_direct_to_actor()"). Link: https://lkml.kernel.org/r/20210325114228.27719-7-mgorman@techsingularity.net Signed-off-by: Chuck Lever Signed-off-by: Mel Gorman Reviewed-by: Alexander Lobakin Cc: Alexander Duyck Cc: Christoph Hellwig Cc: David Miller Cc: Ilias Apalodimas Cc: Jesper Dangaard Brouer Cc: Matthew Wilcox (Oracle) Cc: Vlastimil Babka Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- net/sunrpc/svc_xprt.c | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/net/sunrpc/svc_xprt.c b/net/sunrpc/svc_xprt.c index 42565f0c7d5a71..63eb6b9a4973dd 100644 --- a/net/sunrpc/svc_xprt.c +++ b/net/sunrpc/svc_xprt.c @@ -661,7 +661,7 @@ static void svc_check_conn_limits(struct svc_serv *serv) static int svc_alloc_arg(struct svc_rqst *rqstp) { struct svc_serv *serv = rqstp->rq_server; - struct xdr_buf *arg; + struct xdr_buf *arg = &rqstp->rq_arg; int pages; int i; @@ -686,11 +686,10 @@ static int svc_alloc_arg(struct svc_rqst *rqstp) } rqstp->rq_pages[i] = p; } - rqstp->rq_page_end = &rqstp->rq_pages[i]; - rqstp->rq_pages[i++] = NULL; /* this might be seen in nfs_read_actor */ + rqstp->rq_page_end = &rqstp->rq_pages[pages]; + rqstp->rq_pages[pages] = NULL; /* this might be seen in nfsd_splice_actor() */ /* Make arg->head point to first page and arg->pages point to rest */ - arg = &rqstp->rq_arg; arg->head[0].iov_base = page_address(rqstp->rq_pages[0]); arg->head[0].iov_len = PAGE_SIZE; arg->pages = rqstp->rq_pages + 1; From f6e70aab9dfe0c2f79cf7dbcb1e80fa71dc60b09 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Thu, 29 Apr 2021 23:02:01 -0700 Subject: [PATCH 170/175] SUNRPC: refresh rq_pages using a bulk page allocator Reduce the rate at which nfsd threads hammer on the page allocator. This improves throughput scalability by enabling the threads to run more independently of each other. [mgorman: Update interpretation of alloc_pages_bulk return value] Link: https://lkml.kernel.org/r/20210325114228.27719-8-mgorman@techsingularity.net Signed-off-by: Chuck Lever Signed-off-by: Mel Gorman Reviewed-by: Alexander Lobakin Cc: Alexander Duyck Cc: Christoph Hellwig Cc: David Miller Cc: Ilias Apalodimas Cc: Jesper Dangaard Brouer Cc: Matthew Wilcox (Oracle) Cc: Vlastimil Babka Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- net/sunrpc/svc_xprt.c | 31 +++++++++++++++---------------- 1 file changed, 15 insertions(+), 16 deletions(-) diff --git a/net/sunrpc/svc_xprt.c b/net/sunrpc/svc_xprt.c index 63eb6b9a4973dd..d66a8e44a1aeb5 100644 --- a/net/sunrpc/svc_xprt.c +++ b/net/sunrpc/svc_xprt.c @@ -662,30 +662,29 @@ static int svc_alloc_arg(struct svc_rqst *rqstp) { struct svc_serv *serv = rqstp->rq_server; struct xdr_buf *arg = &rqstp->rq_arg; - int pages; - int i; + unsigned long pages, filled; - /* now allocate needed pages. If we get a failure, sleep briefly */ pages = (serv->sv_max_mesg + 2 * PAGE_SIZE) >> PAGE_SHIFT; if (pages > RPCSVC_MAXPAGES) { - pr_warn_once("svc: warning: pages=%u > RPCSVC_MAXPAGES=%lu\n", + pr_warn_once("svc: warning: pages=%lu > RPCSVC_MAXPAGES=%lu\n", pages, RPCSVC_MAXPAGES); /* use as many pages as possible */ pages = RPCSVC_MAXPAGES; } - for (i = 0; i < pages ; i++) - while (rqstp->rq_pages[i] == NULL) { - struct page *p = alloc_page(GFP_KERNEL); - if (!p) { - set_current_state(TASK_INTERRUPTIBLE); - if (signalled() || kthread_should_stop()) { - set_current_state(TASK_RUNNING); - return -EINTR; - } - schedule_timeout(msecs_to_jiffies(500)); - } - rqstp->rq_pages[i] = p; + + for (;;) { + filled = alloc_pages_bulk_array(GFP_KERNEL, pages, + rqstp->rq_pages); + if (filled == pages) + break; + + set_current_state(TASK_INTERRUPTIBLE); + if (signalled() || kthread_should_stop()) { + set_current_state(TASK_RUNNING); + return -EINTR; } + schedule_timeout(msecs_to_jiffies(500)); + } rqstp->rq_page_end = &rqstp->rq_pages[pages]; rqstp->rq_pages[pages] = NULL; /* this might be seen in nfsd_splice_actor() */ From dfa59717b97d4203e6b44ee82874d4f758d93542 Mon Sep 17 00:00:00 2001 From: Jesper Dangaard Brouer Date: Thu, 29 Apr 2021 23:02:04 -0700 Subject: [PATCH 171/175] net: page_pool: refactor dma_map into own function page_pool_dma_map In preparation for next patch, move the dma mapping into its own function, as this will make it easier to follow the changes. [ilias.apalodimas: make page_pool_dma_map return boolean] Link: https://lkml.kernel.org/r/20210325114228.27719-9-mgorman@techsingularity.net Signed-off-by: Jesper Dangaard Brouer Signed-off-by: Mel Gorman Reviewed-by: Ilias Apalodimas Reviewed-by: Alexander Lobakin Cc: Alexander Duyck Cc: Christoph Hellwig Cc: Chuck Lever Cc: David Miller Cc: Matthew Wilcox (Oracle) Cc: Vlastimil Babka Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- net/core/page_pool.c | 45 +++++++++++++++++++++++++------------------- 1 file changed, 26 insertions(+), 19 deletions(-) diff --git a/net/core/page_pool.c b/net/core/page_pool.c index ad8b0707af04b8..40e1b2beaa6c4e 100644 --- a/net/core/page_pool.c +++ b/net/core/page_pool.c @@ -180,14 +180,37 @@ static void page_pool_dma_sync_for_device(struct page_pool *pool, pool->p.dma_dir); } +static bool page_pool_dma_map(struct page_pool *pool, struct page *page) +{ + dma_addr_t dma; + + /* Setup DMA mapping: use 'struct page' area for storing DMA-addr + * since dma_addr_t can be either 32 or 64 bits and does not always fit + * into page private data (i.e 32bit cpu with 64bit DMA caps) + * This mapping is kept for lifetime of page, until leaving pool. + */ + dma = dma_map_page_attrs(pool->p.dev, page, 0, + (PAGE_SIZE << pool->p.order), + pool->p.dma_dir, DMA_ATTR_SKIP_CPU_SYNC); + if (dma_mapping_error(pool->p.dev, dma)) + return false; + + page->dma_addr = dma; + + if (pool->p.flags & PP_FLAG_DMA_SYNC_DEV) + page_pool_dma_sync_for_device(pool, page, pool->p.max_len); + + return true; +} + /* slow path */ noinline static struct page *__page_pool_alloc_pages_slow(struct page_pool *pool, gfp_t _gfp) { + unsigned int pp_flags = pool->p.flags; struct page *page; gfp_t gfp = _gfp; - dma_addr_t dma; /* We could always set __GFP_COMP, and avoid this branch, as * prep_new_page() can handle order-0 with __GFP_COMP. @@ -211,30 +234,14 @@ static struct page *__page_pool_alloc_pages_slow(struct page_pool *pool, if (!page) return NULL; - if (!(pool->p.flags & PP_FLAG_DMA_MAP)) - goto skip_dma_map; - - /* Setup DMA mapping: use 'struct page' area for storing DMA-addr - * since dma_addr_t can be either 32 or 64 bits and does not always fit - * into page private data (i.e 32bit cpu with 64bit DMA caps) - * This mapping is kept for lifetime of page, until leaving pool. - */ - dma = dma_map_page_attrs(pool->p.dev, page, 0, - (PAGE_SIZE << pool->p.order), - pool->p.dma_dir, DMA_ATTR_SKIP_CPU_SYNC); - if (dma_mapping_error(pool->p.dev, dma)) { + if ((pp_flags & PP_FLAG_DMA_MAP) && + unlikely(!page_pool_dma_map(pool, page))) { put_page(page); return NULL; } - page->dma_addr = dma; - if (pool->p.flags & PP_FLAG_DMA_SYNC_DEV) - page_pool_dma_sync_for_device(pool, page, pool->p.max_len); - -skip_dma_map: /* Track how many pages are held 'in-flight' */ pool->pages_state_hold_cnt++; - trace_page_pool_state_hold(pool, page, pool->pages_state_hold_cnt); /* When page just alloc'ed is should/must have refcnt 1. */ From be5dba25b4b27f262626ddc9079d4858a75462fd Mon Sep 17 00:00:00 2001 From: Jesper Dangaard Brouer Date: Thu, 29 Apr 2021 23:02:07 -0700 Subject: [PATCH 172/175] net: page_pool: use alloc_pages_bulk in refill code path There are cases where the page_pool need to refill with pages from the page allocator. Some workloads cause the page_pool to release pages instead of recycling these pages. For these workload it can improve performance to bulk alloc pages from the page-allocator to refill the alloc cache. For XDP-redirect workload with 100G mlx5 driver (that use page_pool) redirecting xdp_frame packets into a veth, that does XDP_PASS to create an SKB from the xdp_frame, which then cannot return the page to the page_pool. Performance results under GitHub xdp-project[1]: [1] https://github.com/xdp-project/xdp-project/blob/master/areas/mem/page_pool06_alloc_pages_bulk.org Mel: The patch "net: page_pool: convert to use alloc_pages_bulk_array variant" was squashed with this patch. From the test page, the array variant was superior with one of the test results as follows. Kernel XDP stats CPU pps Delta Baseline XDP-RX CPU total 3,771,046 n/a List XDP-RX CPU total 3,940,242 +4.49% Array XDP-RX CPU total 4,249,224 +12.68% Link: https://lkml.kernel.org/r/20210325114228.27719-10-mgorman@techsingularity.net Signed-off-by: Jesper Dangaard Brouer Signed-off-by: Mel Gorman Reviewed-by: Alexander Lobakin Cc: Alexander Duyck Cc: Christoph Hellwig Cc: Chuck Lever Cc: David Miller Cc: Ilias Apalodimas Cc: Matthew Wilcox (Oracle) Cc: Vlastimil Babka Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/net/page_pool.h | 2 +- net/core/page_pool.c | 82 ++++++++++++++++++++++++++++------------- 2 files changed, 57 insertions(+), 27 deletions(-) diff --git a/include/net/page_pool.h b/include/net/page_pool.h index b5b1953053468e..6d517a37c18bf9 100644 --- a/include/net/page_pool.h +++ b/include/net/page_pool.h @@ -65,7 +65,7 @@ #define PP_ALLOC_CACHE_REFILL 64 struct pp_alloc_cache { u32 count; - void *cache[PP_ALLOC_CACHE_SIZE]; + struct page *cache[PP_ALLOC_CACHE_SIZE]; }; struct page_pool_params { diff --git a/net/core/page_pool.c b/net/core/page_pool.c index 40e1b2beaa6c4e..9ec1aa9640adeb 100644 --- a/net/core/page_pool.c +++ b/net/core/page_pool.c @@ -203,38 +203,17 @@ static bool page_pool_dma_map(struct page_pool *pool, struct page *page) return true; } -/* slow path */ -noinline -static struct page *__page_pool_alloc_pages_slow(struct page_pool *pool, - gfp_t _gfp) +static struct page *__page_pool_alloc_page_order(struct page_pool *pool, + gfp_t gfp) { - unsigned int pp_flags = pool->p.flags; struct page *page; - gfp_t gfp = _gfp; - - /* We could always set __GFP_COMP, and avoid this branch, as - * prep_new_page() can handle order-0 with __GFP_COMP. - */ - if (pool->p.order) - gfp |= __GFP_COMP; - - /* FUTURE development: - * - * Current slow-path essentially falls back to single page - * allocations, which doesn't improve performance. This code - * need bulk allocation support from the page allocator code. - */ - /* Cache was empty, do real allocation */ -#ifdef CONFIG_NUMA + gfp |= __GFP_COMP; page = alloc_pages_node(pool->p.nid, gfp, pool->p.order); -#else - page = alloc_pages(gfp, pool->p.order); -#endif - if (!page) + if (unlikely(!page)) return NULL; - if ((pp_flags & PP_FLAG_DMA_MAP) && + if ((pool->p.flags & PP_FLAG_DMA_MAP) && unlikely(!page_pool_dma_map(pool, page))) { put_page(page); return NULL; @@ -243,6 +222,57 @@ static struct page *__page_pool_alloc_pages_slow(struct page_pool *pool, /* Track how many pages are held 'in-flight' */ pool->pages_state_hold_cnt++; trace_page_pool_state_hold(pool, page, pool->pages_state_hold_cnt); + return page; +} + +/* slow path */ +noinline +static struct page *__page_pool_alloc_pages_slow(struct page_pool *pool, + gfp_t gfp) +{ + const int bulk = PP_ALLOC_CACHE_REFILL; + unsigned int pp_flags = pool->p.flags; + unsigned int pp_order = pool->p.order; + struct page *page; + int i, nr_pages; + + /* Don't support bulk alloc for high-order pages */ + if (unlikely(pp_order)) + return __page_pool_alloc_page_order(pool, gfp); + + /* Unnecessary as alloc cache is empty, but guarantees zero count */ + if (unlikely(pool->alloc.count > 0)) + return pool->alloc.cache[--pool->alloc.count]; + + /* Mark empty alloc.cache slots "empty" for alloc_pages_bulk_array */ + memset(&pool->alloc.cache, 0, sizeof(void *) * bulk); + + nr_pages = alloc_pages_bulk_array(gfp, bulk, pool->alloc.cache); + if (unlikely(!nr_pages)) + return NULL; + + /* Pages have been filled into alloc.cache array, but count is zero and + * page element have not been (possibly) DMA mapped. + */ + for (i = 0; i < nr_pages; i++) { + page = pool->alloc.cache[i]; + if ((pp_flags & PP_FLAG_DMA_MAP) && + unlikely(!page_pool_dma_map(pool, page))) { + put_page(page); + continue; + } + pool->alloc.cache[pool->alloc.count++] = page; + /* Track how many pages are held 'in-flight' */ + pool->pages_state_hold_cnt++; + trace_page_pool_state_hold(pool, page, + pool->pages_state_hold_cnt); + } + + /* Return last page */ + if (likely(pool->alloc.count > 0)) + page = pool->alloc.cache[--pool->alloc.count]; + else + page = NULL; /* When page just alloc'ed is should/must have refcnt 1. */ return page; From 9df65f522536719682bccd24245ff94db956256c Mon Sep 17 00:00:00 2001 From: Sergei Trofimovich Date: Thu, 29 Apr 2021 23:02:11 -0700 Subject: [PATCH 173/175] mm: page_alloc: ignore init_on_free=1 for debug_pagealloc=1 On !ARCH_SUPPORTS_DEBUG_PAGEALLOC (like ia64) debug_pagealloc=1 implies page_poison=on: if (page_poisoning_enabled() || (!IS_ENABLED(CONFIG_ARCH_SUPPORTS_DEBUG_PAGEALLOC) && debug_pagealloc_enabled())) static_branch_enable(&_page_poisoning_enabled); page_poison=on needs to override init_on_free=1. Before the change it did not work as expected for the following case: - have PAGE_POISONING=y - have page_poison unset - have !ARCH_SUPPORTS_DEBUG_PAGEALLOC arch (like ia64) - have init_on_free=1 - have debug_pagealloc=1 That way we get both keys enabled: - static_branch_enable(&init_on_free); - static_branch_enable(&_page_poisoning_enabled); which leads to poisoned pages returned for __GFP_ZERO pages. After the change we execute only: - static_branch_enable(&_page_poisoning_enabled); and ignore init_on_free=1. Link: https://lkml.kernel.org/r/20210329222555.3077928-1-slyfox@gentoo.org Link: https://lkml.org/lkml/2021/3/26/443 Fixes: 8db26a3d4735 ("mm, page_poison: use static key more efficiently") Signed-off-by: Sergei Trofimovich Acked-by: Vlastimil Babka Reviewed-by: David Hildenbrand Cc: Andrey Konovalov Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_alloc.c | 30 +++++++++++++++++------------- 1 file changed, 17 insertions(+), 13 deletions(-) diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 91455f0d22c12e..6b208b1843bf0e 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -786,32 +786,36 @@ static inline void clear_page_guard(struct zone *zone, struct page *page, */ void init_mem_debugging_and_hardening(void) { + bool page_poisoning_requested = false; + +#ifdef CONFIG_PAGE_POISONING + /* + * Page poisoning is debug page alloc for some arches. If + * either of those options are enabled, enable poisoning. + */ + if (page_poisoning_enabled() || + (!IS_ENABLED(CONFIG_ARCH_SUPPORTS_DEBUG_PAGEALLOC) && + debug_pagealloc_enabled())) { + static_branch_enable(&_page_poisoning_enabled); + page_poisoning_requested = true; + } +#endif + if (_init_on_alloc_enabled_early) { - if (page_poisoning_enabled()) + if (page_poisoning_requested) pr_info("mem auto-init: CONFIG_PAGE_POISONING is on, " "will take precedence over init_on_alloc\n"); else static_branch_enable(&init_on_alloc); } if (_init_on_free_enabled_early) { - if (page_poisoning_enabled()) + if (page_poisoning_requested) pr_info("mem auto-init: CONFIG_PAGE_POISONING is on, " "will take precedence over init_on_free\n"); else static_branch_enable(&init_on_free); } -#ifdef CONFIG_PAGE_POISONING - /* - * Page poisoning is debug page alloc for some arches. If - * either of those options are enabled, enable poisoning. - */ - if (page_poisoning_enabled() || - (!IS_ENABLED(CONFIG_ARCH_SUPPORTS_DEBUG_PAGEALLOC) && - debug_pagealloc_enabled())) - static_branch_enable(&_page_poisoning_enabled); -#endif - #ifdef CONFIG_DEBUG_PAGEALLOC if (!debug_pagealloc_enabled()) return; From 198fba4137a1803a9cb93992b56c2ecba1aa83a3 Mon Sep 17 00:00:00 2001 From: Mike Rapoport Date: Thu, 29 Apr 2021 23:02:16 -0700 Subject: [PATCH 174/175] mm/mmzone.h: fix existing kernel-doc comments and link them to core-api There are a couple of kernel-doc comments in include/linux/mmzone.h but they have minor formatting issues that would cause kernel-doc warnings. Fix the formatting of those comments, add missing Return: descriptions and link include/linux/mmzone.h to Documentation/core-api/mm-api.rst Link: https://lkml.kernel.org/r/20210426141927.1314326-2-rppt@kernel.org Signed-off-by: Mike Rapoport Reviewed-by: Matthew Wilcox (Oracle) Reviewed-by: Anshuman Khandual Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- Documentation/core-api/mm-api.rst | 1 + include/linux/mmzone.h | 43 +++++++++++++++++-------------- 2 files changed, 25 insertions(+), 19 deletions(-) diff --git a/Documentation/core-api/mm-api.rst b/Documentation/core-api/mm-api.rst index 34f46df91a8b37..a42f9baddfbf8c 100644 --- a/Documentation/core-api/mm-api.rst +++ b/Documentation/core-api/mm-api.rst @@ -97,3 +97,4 @@ More Memory Management Functions :internal: .. kernel-doc:: include/linux/mm.h :internal: +.. kernel-doc:: include/linux/mmzone.h diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index 47946cec758489..3b220574104826 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -993,7 +993,8 @@ static inline int is_highmem_idx(enum zone_type idx) * is_highmem - helper function to quickly check if a struct zone is a * highmem zone or not. This is an attempt to keep references * to ZONE_{DMA/NORMAL/HIGHMEM/etc} in general code to a minimum. - * @zone - pointer to struct zone variable + * @zone: pointer to struct zone variable + * Return: 1 for a highmem zone, 0 otherwise */ static inline int is_highmem(struct zone *zone) { @@ -1044,7 +1045,7 @@ extern struct zone *next_zone(struct zone *zone); /** * for_each_online_pgdat - helper macro to iterate over all online nodes - * @pgdat - pointer to a pg_data_t variable + * @pgdat: pointer to a pg_data_t variable */ #define for_each_online_pgdat(pgdat) \ for (pgdat = first_online_pgdat(); \ @@ -1052,7 +1053,7 @@ extern struct zone *next_zone(struct zone *zone); pgdat = next_online_pgdat(pgdat)) /** * for_each_zone - helper macro to iterate over all memory zones - * @zone - pointer to struct zone variable + * @zone: pointer to struct zone variable * * The user only needs to declare the zone variable, for_each_zone * fills it in. @@ -1091,15 +1092,18 @@ struct zoneref *__next_zones_zonelist(struct zoneref *z, /** * next_zones_zonelist - Returns the next zone at or below highest_zoneidx within the allowed nodemask using a cursor within a zonelist as a starting point - * @z - The cursor used as a starting point for the search - * @highest_zoneidx - The zone index of the highest zone to return - * @nodes - An optional nodemask to filter the zonelist with + * @z: The cursor used as a starting point for the search + * @highest_zoneidx: The zone index of the highest zone to return + * @nodes: An optional nodemask to filter the zonelist with * * This function returns the next zone at or below a given zone index that is * within the allowed nodemask using a cursor as the starting point for the * search. The zoneref returned is a cursor that represents the current zone * being examined. It should be advanced by one before calling * next_zones_zonelist again. + * + * Return: the next zone at or below highest_zoneidx within the allowed + * nodemask using a cursor within a zonelist as a starting point */ static __always_inline struct zoneref *next_zones_zonelist(struct zoneref *z, enum zone_type highest_zoneidx, @@ -1112,10 +1116,9 @@ static __always_inline struct zoneref *next_zones_zonelist(struct zoneref *z, /** * first_zones_zonelist - Returns the first zone at or below highest_zoneidx within the allowed nodemask in a zonelist - * @zonelist - The zonelist to search for a suitable zone - * @highest_zoneidx - The zone index of the highest zone to return - * @nodes - An optional nodemask to filter the zonelist with - * @return - Zoneref pointer for the first suitable zone found (see below) + * @zonelist: The zonelist to search for a suitable zone + * @highest_zoneidx: The zone index of the highest zone to return + * @nodes: An optional nodemask to filter the zonelist with * * This function returns the first zone at or below a given zone index that is * within the allowed nodemask. The zoneref returned is a cursor that can be @@ -1125,6 +1128,8 @@ static __always_inline struct zoneref *next_zones_zonelist(struct zoneref *z, * When no eligible zone is found, zoneref->zone is NULL (zoneref itself is * never NULL). This may happen either genuinely, or due to concurrent nodemask * update due to cpuset modification. + * + * Return: Zoneref pointer for the first suitable zone found */ static inline struct zoneref *first_zones_zonelist(struct zonelist *zonelist, enum zone_type highest_zoneidx, @@ -1136,11 +1141,11 @@ static inline struct zoneref *first_zones_zonelist(struct zonelist *zonelist, /** * for_each_zone_zonelist_nodemask - helper macro to iterate over valid zones in a zonelist at or below a given zone index and within a nodemask - * @zone - The current zone in the iterator - * @z - The current pointer within zonelist->_zonerefs being iterated - * @zlist - The zonelist being iterated - * @highidx - The zone index of the highest zone to return - * @nodemask - Nodemask allowed by the allocator + * @zone: The current zone in the iterator + * @z: The current pointer within zonelist->_zonerefs being iterated + * @zlist: The zonelist being iterated + * @highidx: The zone index of the highest zone to return + * @nodemask: Nodemask allowed by the allocator * * This iterator iterates though all zones at or below a given zone index and * within a given nodemask @@ -1160,10 +1165,10 @@ static inline struct zoneref *first_zones_zonelist(struct zonelist *zonelist, /** * for_each_zone_zonelist - helper macro to iterate over valid zones in a zonelist at or below a given zone index - * @zone - The current zone in the iterator - * @z - The current pointer within zonelist->zones being iterated - * @zlist - The zonelist being iterated - * @highidx - The zone index of the highest zone to return + * @zone: The current zone in the iterator + * @z: The current pointer within zonelist->zones being iterated + * @zlist: The zonelist being iterated + * @highidx: The zone index of the highest zone to return * * This iterator iterates though all zones at or below a given zone index. */ From 4d75136be8bf3ae01b0bc3e725b2cdc921e103bd Mon Sep 17 00:00:00 2001 From: Jane Chu Date: Thu, 29 Apr 2021 23:02:19 -0700 Subject: [PATCH 175/175] mm/memory-failure: unnecessary amount of unmapping It appears that unmap_mapping_range() actually takes a 'size' as its third argument rather than a location, the current calling fashion causes unnecessary amount of unmapping to occur. Link: https://lkml.kernel.org/r/20210420002821.2749748-1-jane.chu@oracle.com Fixes: 6100e34b2526e ("mm, memory_failure: Teach memory_failure() about dev_pagemap pages") Signed-off-by: Jane Chu Reviewed-by: Dan Williams Reviewed-by: Naoya Horiguchi Cc: Dave Jiang Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memory-failure.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mm/memory-failure.c b/mm/memory-failure.c index 24210c9bd84348..bd3945446d47e9 100644 --- a/mm/memory-failure.c +++ b/mm/memory-failure.c @@ -1368,7 +1368,7 @@ static int memory_failure_dev_pagemap(unsigned long pfn, int flags, * communicated in siginfo, see kill_proc() */ start = (page->index << PAGE_SHIFT) & ~(size - 1); - unmap_mapping_range(page->mapping, start, start + size, 0); + unmap_mapping_range(page->mapping, start, size, 0); } kill_procs(&tokill, flags & MF_MUST_KILL, !unmap_success, pfn, flags); rc = 0;