Skip to content

Commit

Permalink
mm: Add support for unaccepted memory
Browse files Browse the repository at this point in the history
UEFI Specification version 2.9 introduces the concept of memory
acceptance. Some Virtual Machine platforms, such as Intel TDX or AMD
SEV-SNP, require memory to be accepted before it can be used by the
guest. Accepting happens via a protocol specific to the Virtual Machine
platform.

There are several ways the kernel can deal with unaccepted memory:

 1. Accept all the memory during boot. It is easy to implement and it
    doesn't have runtime cost once the system is booted. The downside is
    very long boot time.

    Acceptance can be parallelized across multiple CPUs to keep it
    manageable (i.e. via DEFERRED_STRUCT_PAGE_INIT), but it tends to
    saturate memory bandwidth and does not scale beyond a certain point.

 2. Accept a block of memory on the first use. It requires more
    infrastructure and changes in page allocator to make it work, but
    it provides good boot time.

    On-demand memory acceptance means latency spikes every time the kernel
    steps onto a new memory block. The spikes will go away once the
    workload's data set size stabilizes or all memory gets accepted.

 3. Accept all memory in the background. Introduce a thread (or multiple)
    that gets memory accepted proactively. It will minimize the time the
    system experiences latency spikes on memory allocation while keeping
    boot time low.

    This approach cannot function on its own. It is an extension of #2:
    background memory acceptance requires functional scheduler, but the
    page allocator may need to tap into unaccepted memory before that.

    The downside of the approach is that these threads also steal CPU
    cycles and memory bandwidth from the user's workload and may hurt
    user experience.

Implement #1 and #2 for now. #2 is the default. Some workloads may want
to use #1 with accept_memory=eager on the kernel command line. #3 can be
implemented later based on user demand.

Support of unaccepted memory requires a few changes in core-mm code:

  - memblock accepts memory on allocation. It serves early boot memory
    allocations and doesn't limit them to pre-accepted pool of memory.

  - page allocator accepts memory on the first allocation of the page.
    When kernel runs out of accepted memory, it accepts memory until the
    high watermark is reached. It helps to minimize fragmentation.

EFI code will provide two helpers if the platform supports unaccepted
memory:

 - accept_memory() makes a range of physical addresses accepted.

 - range_contains_unaccepted_memory() checks whether anything within the
   range of physical addresses requires acceptance.

Signed-off-by: Kirill A. Shutemov <[email protected]>
Signed-off-by: Borislav Petkov (AMD) <[email protected]>
Reviewed-by: Vlastimil Babka <[email protected]>
Acked-by: Mike Rapoport <[email protected]>	# memblock
Link: https://lore.kernel.org/r/[email protected]
  • Loading branch information
kiryl authored and bp3tk0v committed Jun 6, 2023
1 parent 9561de3 commit dcdfdd4
Show file tree
Hide file tree
Showing 8 changed files with 231 additions and 0 deletions.
7 changes: 7 additions & 0 deletions drivers/base/node.c
Original file line number Diff line number Diff line change
Expand Up @@ -448,6 +448,9 @@ static ssize_t node_read_meminfo(struct device *dev,
"Node %d ShmemPmdMapped: %8lu kB\n"
"Node %d FileHugePages: %8lu kB\n"
"Node %d FilePmdMapped: %8lu kB\n"
#endif
#ifdef CONFIG_UNACCEPTED_MEMORY
"Node %d Unaccepted: %8lu kB\n"
#endif
,
nid, K(node_page_state(pgdat, NR_FILE_DIRTY)),
Expand Down Expand Up @@ -477,6 +480,10 @@ static ssize_t node_read_meminfo(struct device *dev,
nid, K(node_page_state(pgdat, NR_SHMEM_PMDMAPPED)),
nid, K(node_page_state(pgdat, NR_FILE_THPS)),
nid, K(node_page_state(pgdat, NR_FILE_PMDMAPPED))
#endif
#ifdef CONFIG_UNACCEPTED_MEMORY
,
nid, K(sum_zone_node_page_state(nid, NR_UNACCEPTED))
#endif
);
len += hugetlb_report_node_meminfo(buf, len, nid);
Expand Down
5 changes: 5 additions & 0 deletions fs/proc/meminfo.c
Original file line number Diff line number Diff line change
Expand Up @@ -168,6 +168,11 @@ static int meminfo_proc_show(struct seq_file *m, void *v)
global_zone_page_state(NR_FREE_CMA_PAGES));
#endif

#ifdef CONFIG_UNACCEPTED_MEMORY
show_val_kb(m, "Unaccepted: ",
global_zone_page_state(NR_UNACCEPTED));
#endif

hugetlb_report_meminfo(m);

arch_report_meminfo(m);
Expand Down
19 changes: 19 additions & 0 deletions include/linux/mm.h
Original file line number Diff line number Diff line change
Expand Up @@ -3816,4 +3816,23 @@ madvise_set_anon_name(struct mm_struct *mm, unsigned long start,
}
#endif

#ifdef CONFIG_UNACCEPTED_MEMORY

/*
 * Report whether anything within the physical address range requires
 * acceptance before it can be used. The callers visible in this change
 * pass @end as start + size.
 */
bool range_contains_unaccepted_memory(phys_addr_t start, phys_addr_t end);
/*
 * Make the given range of physical addresses accepted. The callers visible
 * in this change pass @end as start + size.
 */
void accept_memory(phys_addr_t start, phys_addr_t end);

#else

/*
 * Stub used when CONFIG_UNACCEPTED_MEMORY is not set: no physical range is
 * ever reported as containing unaccepted memory.
 */
static inline bool
range_contains_unaccepted_memory(phys_addr_t start, phys_addr_t end)
{
	return false;
}

/*
 * Stub used when CONFIG_UNACCEPTED_MEMORY is not set: there is nothing to
 * accept, so the call is a no-op.
 */
static inline void accept_memory(phys_addr_t start, phys_addr_t end)
{
	/* Intentionally empty. */
}

#endif

#endif /* _LINUX_MM_H */
8 changes: 8 additions & 0 deletions include/linux/mmzone.h
Original file line number Diff line number Diff line change
Expand Up @@ -143,6 +143,9 @@ enum zone_stat_item {
NR_ZSPAGES, /* allocated in zsmalloc */
#endif
NR_FREE_CMA_PAGES,
#ifdef CONFIG_UNACCEPTED_MEMORY
NR_UNACCEPTED,
#endif
NR_VM_ZONE_STAT_ITEMS };

enum node_stat_item {
Expand Down Expand Up @@ -910,6 +913,11 @@ struct zone {
/* free areas of different sizes */
struct free_area free_area[MAX_ORDER + 1];

#ifdef CONFIG_UNACCEPTED_MEMORY
/* Pages to be accepted. All pages on the list are MAX_ORDER */
struct list_head unaccepted_pages;
#endif

/* zone flags, see below */
unsigned long flags;

Expand Down
9 changes: 9 additions & 0 deletions mm/memblock.c
Original file line number Diff line number Diff line change
Expand Up @@ -1436,6 +1436,15 @@ phys_addr_t __init memblock_alloc_range_nid(phys_addr_t size,
*/
kmemleak_alloc_phys(found, size, 0);

/*
* Some Virtual Machine platforms, such as Intel TDX or AMD SEV-SNP,
* require memory to be accepted before it can be used by the
* guest.
*
* Accept the memory of the allocated buffer.
*/
accept_memory(found, found + size);

return found;
}

Expand Down
7 changes: 7 additions & 0 deletions mm/mm_init.c
Original file line number Diff line number Diff line change
Expand Up @@ -1375,6 +1375,10 @@ static void __meminit zone_init_free_lists(struct zone *zone)
INIT_LIST_HEAD(&zone->free_area[order].free_list[t]);
zone->free_area[order].nr_free = 0;
}

#ifdef CONFIG_UNACCEPTED_MEMORY
INIT_LIST_HEAD(&zone->unaccepted_pages);
#endif
}

void __meminit init_currently_empty_zone(struct zone *zone,
Expand Down Expand Up @@ -1960,6 +1964,9 @@ static void __init deferred_free_range(unsigned long pfn,
return;
}

/* Accept chunks smaller than MAX_ORDER upfront */
accept_memory(PFN_PHYS(pfn), PFN_PHYS(pfn + nr_pages));

for (i = 0; i < nr_pages; i++, page++, pfn++) {
if (pageblock_aligned(pfn))
set_pageblock_migratetype(page, MIGRATE_MOVABLE);
Expand Down
173 changes: 173 additions & 0 deletions mm/page_alloc.c
Original file line number Diff line number Diff line change
Expand Up @@ -387,6 +387,12 @@ EXPORT_SYMBOL(nr_node_ids);
EXPORT_SYMBOL(nr_online_nodes);
#endif

/*
 * Forward declarations of the unaccepted-memory helpers. They are defined
 * near the end of this file under CONFIG_UNACCEPTED_MEMORY, with stub
 * variants in the #else branch.
 */
static bool page_contains_unaccepted(struct page *page, unsigned int order);
static void accept_page(struct page *page, unsigned int order);
static bool try_to_accept_memory(struct zone *zone, unsigned int order);
static inline bool has_unaccepted_memory(void);
static bool __free_unaccepted(struct page *page);

int page_group_by_mobility_disabled __read_mostly;

#ifdef CONFIG_DEFERRED_STRUCT_PAGE_INIT
Expand Down Expand Up @@ -1481,6 +1487,13 @@ void __free_pages_core(struct page *page, unsigned int order)

atomic_long_add(nr_pages, &page_zone(page)->managed_pages);

if (page_contains_unaccepted(page, order)) {
if (order == MAX_ORDER && __free_unaccepted(page))
return;

accept_page(page, order);
}

/*
* Bypass PCP and place fresh pages right to the tail, primarily
* relevant for memory onlining.
Expand Down Expand Up @@ -3159,6 +3172,9 @@ static inline long __zone_watermark_unusable_free(struct zone *z,
if (!(alloc_flags & ALLOC_CMA))
unusable_free += zone_page_state(z, NR_FREE_CMA_PAGES);
#endif
#ifdef CONFIG_UNACCEPTED_MEMORY
unusable_free += zone_page_state(z, NR_UNACCEPTED);
#endif

return unusable_free;
}
Expand Down Expand Up @@ -3458,6 +3474,11 @@ get_page_from_freelist(gfp_t gfp_mask, unsigned int order, int alloc_flags,
gfp_mask)) {
int ret;

if (has_unaccepted_memory()) {
if (try_to_accept_memory(zone, order))
goto try_this_zone;
}

#ifdef CONFIG_DEFERRED_STRUCT_PAGE_INIT
/*
* Watermark failed for this zone, but see if we can
Expand Down Expand Up @@ -3510,6 +3531,11 @@ get_page_from_freelist(gfp_t gfp_mask, unsigned int order, int alloc_flags,

return page;
} else {
if (has_unaccepted_memory()) {
if (try_to_accept_memory(zone, order))
goto try_this_zone;
}

#ifdef CONFIG_DEFERRED_STRUCT_PAGE_INIT
/* Try again if zone has deferred pages */
if (deferred_pages_enabled()) {
Expand Down Expand Up @@ -7215,3 +7241,150 @@ bool has_managed_dma(void)
return false;
}
#endif /* CONFIG_ZONE_DMA */

#ifdef CONFIG_UNACCEPTED_MEMORY

/* Counts number of zones with unaccepted pages. */
static DEFINE_STATIC_KEY_FALSE(zones_with_unaccepted_pages);

/*
 * Acceptance policy, selected with the accept_memory= early parameter.
 * When true ("lazy", the default) __free_unaccepted() queues the page on
 * the zone's unaccepted_pages list; when false ("eager") it declines and
 * returns false to its caller.
 */
static bool lazy_accept = true;

/*
 * Parse the "accept_memory=" early parameter.
 *
 * @p: value given after '='; NULL when the option is passed without a value.
 *
 * "lazy" sets lazy_accept to true and "eager" sets it to false.
 *
 * Returns 0 when the value was recognised, -EINVAL when it is missing or
 * unknown (lazy_accept is left unchanged in that case).
 */
static int __init accept_memory_parse(char *p)
{
	/* A bare "accept_memory" carries no value; never pass NULL to strcmp(). */
	if (!p)
		return -EINVAL;

	if (!strcmp(p, "lazy")) {
		lazy_accept = true;
		return 0;
	}

	if (!strcmp(p, "eager")) {
		lazy_accept = false;
		return 0;
	}

	return -EINVAL;
}
early_param("accept_memory", accept_memory_parse);

/*
 * Tell whether the block of (1 << order) pages starting at @page overlaps
 * memory that range_contains_unaccepted_memory() reports as unaccepted.
 */
static bool page_contains_unaccepted(struct page *page, unsigned int order)
{
	phys_addr_t base = page_to_phys(page);

	return range_contains_unaccepted_memory(base,
						base + (PAGE_SIZE << order));
}

/*
 * Accept the physical memory backing the block of (1 << order) pages that
 * starts at @page.
 */
static void accept_page(struct page *page, unsigned int order)
{
	phys_addr_t first = page_to_phys(page);
	phys_addr_t limit = first + (PAGE_SIZE << order);

	accept_memory(first, limit);
}

/*
 * Take one MAX_ORDER chunk off @zone's unaccepted_pages list, accept it and
 * hand it to __free_pages_ok().
 *
 * Returns true if a chunk was accepted, false if the list was empty.
 */
static bool try_to_accept_memory_one(struct zone *zone)
{
	struct page *chunk;
	unsigned long irq_flags;
	bool drained = false;

	/* Unlocked peek: skip taking the lock when there is nothing queued. */
	if (list_empty(&zone->unaccepted_pages))
		return false;

	spin_lock_irqsave(&zone->lock, irq_flags);
	chunk = list_first_entry_or_null(&zone->unaccepted_pages,
					 struct page, lru);
	if (chunk) {
		list_del(&chunk->lru);
		/* Remember whether this was the zone's last queued chunk. */
		drained = list_empty(&zone->unaccepted_pages);

		/* Undo the accounting done by __free_unaccepted(). */
		__mod_zone_freepage_state(zone, -MAX_ORDER_NR_PAGES,
					  MIGRATE_MOVABLE);
		__mod_zone_page_state(zone, NR_UNACCEPTED,
				      -MAX_ORDER_NR_PAGES);
	}
	spin_unlock_irqrestore(&zone->lock, irq_flags);

	/* The list was emptied between the unlocked peek and taking the lock. */
	if (!chunk)
		return false;

	/* Acceptance itself runs without zone->lock held. */
	accept_page(chunk, MAX_ORDER);

	__free_pages_ok(chunk, MAX_ORDER, FPI_TO_TAIL);

	/*
	 * NOTE(review): the static key is updated after zone->lock has been
	 * dropped, so it is not serialized against the increment in
	 * __free_unaccepted() -- confirm this cannot race.
	 */
	if (drained)
		static_branch_dec(&zones_with_unaccepted_pages);

	return true;
}

/*
 * Accept MAX_ORDER chunks from @zone until the shortfall to the high
 * watermark is covered or the zone has no unaccepted chunks left.
 *
 * @zone:  zone whose unaccepted_pages list is consumed
 * @order: allocation order, forwarded to __zone_watermark_unusable_free()
 *
 * Returns true if at least one chunk was accepted, false otherwise.
 */
static bool try_to_accept_memory(struct zone *zone, unsigned int order)
{
	long to_accept;
	bool ret = false;

	/* How much to accept to get to high watermark? */
	to_accept = high_wmark_pages(zone) -
		    (zone_page_state(zone, NR_FREE_PAGES) -
		    __zone_watermark_unusable_free(zone, order, 0));

	/*
	 * Accept at least one chunk, even when to_accept is already <= 0:
	 * the do/while body runs before the shortfall is tested.
	 */
	do {
		if (!try_to_accept_memory_one(zone))
			break;
		ret = true;
		to_accept -= MAX_ORDER_NR_PAGES;
	} while (to_accept > 0);

	return ret;
}

/*
 * Report whether the zones_with_unaccepted_pages static key is currently
 * enabled, i.e. at least one zone has queued unaccepted pages.
 */
static inline bool has_unaccepted_memory(void)
{
	if (static_branch_unlikely(&zones_with_unaccepted_pages))
		return true;

	return false;
}

/*
 * Queue a MAX_ORDER @page on its zone's unaccepted_pages list instead of
 * accepting it now.
 *
 * Returns false without touching anything when lazy acceptance is disabled;
 * returns true once the page has been queued and accounted.
 */
static bool __free_unaccepted(struct page *page)
{
	struct zone *zone = page_zone(page);
	unsigned long irq_flags;
	bool was_empty = false;

	if (!lazy_accept)
		return false;

	spin_lock_irqsave(&zone->lock, irq_flags);
	/* Note whether this is the first unaccepted chunk for the zone. */
	was_empty = list_empty(&zone->unaccepted_pages);
	list_add_tail(&page->lru, &zone->unaccepted_pages);
	/* The chunk is counted both as free and as unaccepted. */
	__mod_zone_freepage_state(zone, MAX_ORDER_NR_PAGES, MIGRATE_MOVABLE);
	__mod_zone_page_state(zone, NR_UNACCEPTED, MAX_ORDER_NR_PAGES);
	spin_unlock_irqrestore(&zone->lock, irq_flags);

	/* The zone just gained its first unaccepted chunk: bump the key. */
	if (was_empty)
		static_branch_inc(&zones_with_unaccepted_pages);

	return true;
}

#else

/*
 * Stub for builds without CONFIG_UNACCEPTED_MEMORY: no page ever contains
 * unaccepted memory.
 */
static bool page_contains_unaccepted(struct page *page, unsigned int order)
{
	return false;
}

/*
 * Stub for builds without CONFIG_UNACCEPTED_MEMORY: nothing needs to be
 * accepted, so this does nothing.
 */
static void accept_page(struct page *page, unsigned int order)
{
	/* Intentionally empty. */
}

/*
 * Stub for builds without CONFIG_UNACCEPTED_MEMORY: there is never anything
 * to accept, so report that no memory was accepted.
 */
static bool try_to_accept_memory(struct zone *zone, unsigned int order)
{
	return false;
}

/*
 * Stub for builds without CONFIG_UNACCEPTED_MEMORY: the system never has
 * unaccepted memory.
 */
static inline bool has_unaccepted_memory(void)
{
	return false;
}

/*
 * Variant for builds without CONFIG_UNACCEPTED_MEMORY. The only caller
 * visible in this change, __free_pages_core(), reaches __free_unaccepted()
 * solely when page_contains_unaccepted() returned true, and the stub of
 * that helper in this #else branch always returns false.
 */
static bool __free_unaccepted(struct page *page)
{
/* Presumably fails the build if this call is ever emitted -- TODO confirm BUILD_BUG() semantics. */
BUILD_BUG();
return false;
}

#endif /* CONFIG_UNACCEPTED_MEMORY */
3 changes: 3 additions & 0 deletions mm/vmstat.c
Original file line number Diff line number Diff line change
Expand Up @@ -1180,6 +1180,9 @@ const char * const vmstat_text[] = {
"nr_zspages",
#endif
"nr_free_cma",
#ifdef CONFIG_UNACCEPTED_MEMORY
"nr_unaccepted",
#endif

/* enum numa_stat_item counters */
#ifdef CONFIG_NUMA
Expand Down

0 comments on commit dcdfdd4

Please sign in to comment.